Introduction: Evaluating LLM outputs is challenging because there’s often no single “correct” answer. Traditional metrics like BLEU and ROUGE fall short for open-ended generation. This guide covers modern evaluation approaches: automated metrics for specific tasks, LLM-as-judge for quality assessment, human evaluation frameworks, A/B testing in production, and building comprehensive evaluation pipelines. These techniques help you measure what matters—whether your LLM application actually solves user problems—and make data-driven decisions about model selection and prompt optimization.
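To make the BLEU/ROUGE limitation concrete, here is a minimal sketch of a unigram-F1 overlap score (plain Python; `unigram_f1` is an illustrative helper, not a library implementation). A fully correct paraphrase lands around 0.33 simply because it reuses few of the reference's exact tokens, which is why lexical-overlap metrics are a poor fit for open-ended generation.

```python
from collections import Counter

def unigram_f1(prediction: str, reference: str) -> float:
    """Token-overlap F1: a stand-in for BLEU/ROUGE-style lexical metrics."""
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    # Count tokens appearing in both, respecting multiplicity
    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)

reference = "the meeting was postponed because the manager was ill"
paraphrase = "they delayed the meeting since their boss was sick"
print(f"{unigram_f1(paraphrase, reference):.2f}")  # ~0.33 despite equivalent meaning
```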

Automated Metrics
```python
from dataclasses import dataclass
import json
import re


@dataclass
class EvalMetric:
    name: str
    score: float
    details: dict | None = None


def exact_match(prediction: str, reference: str) -> EvalMetric:
    """Check if prediction exactly matches reference."""
    # Normalize whitespace and case
    pred_norm = " ".join(prediction.lower().split())
    ref_norm = " ".join(reference.lower().split())
    match = pred_norm == ref_norm
    return EvalMetric(
        name="exact_match",
        score=1.0 if match else 0.0,
        details={"prediction": pred_norm, "reference": ref_norm}
    )


def contains_answer(prediction: str, reference: str) -> EvalMetric:
    """Check if prediction contains the reference answer."""
    pred_lower = prediction.lower()
    ref_lower = reference.lower()
    contains = ref_lower in pred_lower
    return EvalMetric(
        name="contains_answer",
        score=1.0 if contains else 0.0
    )


def json_validity(prediction: str) -> EvalMetric:
    """Check if prediction contains valid JSON."""
    # Try to extract a JSON object from the response
    json_match = re.search(r'\{.*\}', prediction, re.DOTALL)
    if not json_match:
        return EvalMetric(name="json_validity", score=0.0, details={"error": "No JSON found"})
    try:
        parsed = json.loads(json_match.group())
        return EvalMetric(
            name="json_validity",
            score=1.0,
            details={"parsed": parsed}
        )
    except json.JSONDecodeError as e:
        return EvalMetric(
            name="json_validity",
            score=0.0,
            details={"error": str(e)}
        )


def schema_compliance(prediction: str, required_fields: list[str]) -> EvalMetric:
    """Check if JSON output has the required fields."""
    try:
        json_match = re.search(r'\{.*\}', prediction, re.DOTALL)
        if not json_match:
            return EvalMetric(name="schema_compliance", score=0.0)
        parsed = json.loads(json_match.group())
        present = [f for f in required_fields if f in parsed]
        score = len(present) / len(required_fields) if required_fields else 1.0
        return EvalMetric(
            name="schema_compliance",
            score=score,
            details={
                "required": required_fields,
                "present": present,
                "missing": [f for f in required_fields if f not in parsed]
            }
        )
    except json.JSONDecodeError:
        return EvalMetric(name="schema_compliance", score=0.0)


def response_length(prediction: str, min_words: int = 10, max_words: int = 500) -> EvalMetric:
    """Check if response length is within bounds."""
    word_count = len(prediction.split())
    if min_words <= word_count <= max_words:
        score = 1.0
    elif word_count < min_words:
        score = word_count / min_words
    else:
        score = max(0, 1 - (word_count - max_words) / max_words)
    return EvalMetric(
        name="response_length",
        score=score,
        details={"word_count": word_count, "min": min_words, "max": max_words}
    )


# Usage
prediction = '{"name": "John", "age": 30}'
print(json_validity(prediction))
print(schema_compliance(prediction, ["name", "age", "email"]))
```
LLM-as-Judge
```python
from openai import OpenAI
import json

client = OpenAI()


def llm_judge_quality(
    question: str,
    response: str,
    criteria: list[str] | None = None
) -> EvalMetric:
    """Use an LLM to judge response quality."""
    if criteria is None:
        criteria = ["relevance", "accuracy", "completeness", "clarity"]
    criteria_text = "\n".join([f"- {c}" for c in criteria])
    prompt = f"""Evaluate this response on the following criteria:
{criteria_text}
Question: {question}
Response: {response}
For each criterion, provide a score from 1-5 and a brief justification.
Then provide an overall score from 1-5.
Return JSON:
{{
  "criteria_scores": {{"criterion": {{"score": 1-5, "reason": "..."}}}},
  "overall_score": 1-5,
  "overall_reason": "..."
}}"""
    result = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    data = json.loads(result.choices[0].message.content)
    return EvalMetric(
        name="llm_judge_quality",
        score=data["overall_score"] / 5.0,  # Normalize to 0-1
        details=data
    )


def llm_judge_comparison(
    question: str,
    response_a: str,
    response_b: str
) -> dict:
    """Compare two responses using an LLM."""
    prompt = f"""Compare these two responses to the question.
Question: {question}
Response A: {response_a}
Response B: {response_b}
Which response is better? Consider:
- Accuracy and correctness
- Completeness
- Clarity and helpfulness
- Conciseness
Return JSON:
{{
  "winner": "A" or "B" or "tie",
  "confidence": 0.0-1.0,
  "reasoning": "explanation"
}}"""
    result = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    return json.loads(result.choices[0].message.content)


def llm_judge_factuality(
    response: str,
    context: str | None = None
) -> EvalMetric:
    """Check if a response contains factual errors."""
    prompt = f"""Analyze this response for factual accuracy.
{f"Context (ground truth): {context}" if context else ""}
Response to evaluate: {response}
Identify any factual errors, unsupported claims, or hallucinations.
Return JSON:
{{
  "is_factual": true/false,
  "errors": ["list of errors found"],
  "unsupported_claims": ["claims without evidence"],
  "confidence": 0.0-1.0
}}"""
    result = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    data = json.loads(result.choices[0].message.content)
    return EvalMetric(
        name="factuality",
        score=1.0 if data["is_factual"] else 0.0,
        details=data
    )


# Usage
question = "What is the capital of France?"
response = "The capital of France is Paris, which is also the largest city in the country."
quality = llm_judge_quality(question, response)
print(f"Quality score: {quality.score:.2f}")
print(f"Details: {quality.details}")
```
Evaluation Dataset Management
```python
from dataclasses import dataclass, field
from typing import Optional
import json


@dataclass
class EvalExample:
    id: str
    input: str
    expected_output: Optional[str] = None
    context: Optional[str] = None
    metadata: dict = field(default_factory=dict)
    tags: list[str] = field(default_factory=list)


@dataclass
class EvalResult:
    example_id: str
    model: str
    output: str
    metrics: list[EvalMetric]
    latency_ms: float
    timestamp: str


class EvalDataset:
    """Manage evaluation datasets."""

    def __init__(self, name: str):
        self.name = name
        self.examples: list[EvalExample] = []

    def add_example(self, example: EvalExample):
        """Add an evaluation example."""
        self.examples.append(example)

    def add_examples_from_jsonl(self, filepath: str):
        """Load examples from a JSONL file."""
        with open(filepath) as f:
            for line in f:
                data = json.loads(line)
                self.examples.append(EvalExample(**data))

    def filter_by_tag(self, tag: str) -> list[EvalExample]:
        """Get examples with a specific tag."""
        return [e for e in self.examples if tag in e.tags]

    def save(self, filepath: str):
        """Save dataset to a JSONL file."""
        with open(filepath, 'w') as f:
            for example in self.examples:
                f.write(json.dumps({
                    "id": example.id,
                    "input": example.input,
                    "expected_output": example.expected_output,
                    "context": example.context,
                    "metadata": example.metadata,
                    "tags": example.tags
                }) + '\n')

    def get_statistics(self) -> dict:
        """Get dataset statistics."""
        return {
            "total_examples": len(self.examples),
            "with_expected_output": sum(1 for e in self.examples if e.expected_output),
            "with_context": sum(1 for e in self.examples if e.context),
            "tags": list(set(tag for e in self.examples for tag in e.tags)),
            "avg_input_length": sum(len(e.input) for e in self.examples) / len(self.examples) if self.examples else 0
        }


# Create an evaluation dataset
dataset = EvalDataset("qa_benchmark")
dataset.add_example(EvalExample(
    id="q1",
    input="What is the capital of France?",
    expected_output="Paris",
    tags=["geography", "factual"]
))
dataset.add_example(EvalExample(
    id="q2",
    input="Explain quantum computing in simple terms.",
    expected_output=None,  # Open-ended, use LLM judge
    tags=["explanation", "technical"]
))
print(dataset.get_statistics())
```
Evaluation Pipeline
```python
from typing import Callable
import time
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor


class EvalPipeline:
    """Run evaluations across models and datasets."""

    def __init__(self):
        self.metrics: list[Callable] = []
        self.results: list[EvalResult] = []

    def add_metric(self, metric_fn: Callable):
        """Add a metric function."""
        self.metrics.append(metric_fn)

    def evaluate_single(
        self,
        model_fn: Callable[[str], str],
        model_name: str,
        example: EvalExample
    ) -> EvalResult:
        """Evaluate a single example."""
        start = time.time()
        output = model_fn(example.input)
        latency = (time.time() - start) * 1000

        # Run all metrics
        metric_results = []
        for metric_fn in self.metrics:
            try:
                # Different metrics need different inputs
                if metric_fn.__name__ in ["exact_match", "contains_answer"]:
                    if example.expected_output:
                        result = metric_fn(output, example.expected_output)
                        metric_results.append(result)
                elif metric_fn.__name__ == "llm_judge_quality":
                    result = metric_fn(example.input, output)
                    metric_results.append(result)
                elif metric_fn.__name__ == "llm_judge_factuality":
                    result = metric_fn(output, example.context)
                    metric_results.append(result)
                else:
                    result = metric_fn(output)
                    metric_results.append(result)
            except Exception as e:
                metric_results.append(EvalMetric(
                    name=metric_fn.__name__,
                    score=0.0,
                    details={"error": str(e)}
                ))

        return EvalResult(
            example_id=example.id,
            model=model_name,
            output=output,
            metrics=metric_results,
            latency_ms=latency,
            timestamp=datetime.now().isoformat()
        )

    def evaluate_dataset(
        self,
        model_fn: Callable[[str], str],
        model_name: str,
        dataset: EvalDataset,
        parallel: bool = False,
        max_workers: int = 5
    ) -> list[EvalResult]:
        """Evaluate an entire dataset."""
        results = []
        if parallel:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [
                    executor.submit(self.evaluate_single, model_fn, model_name, example)
                    for example in dataset.examples
                ]
                results = [f.result() for f in futures]
        else:
            for example in dataset.examples:
                result = self.evaluate_single(model_fn, model_name, example)
                results.append(result)
        self.results.extend(results)
        return results

    def compare_models(
        self,
        model_fns: dict[str, Callable],
        dataset: EvalDataset
    ) -> dict:
        """Compare multiple models on the same dataset."""
        comparison = {}
        for model_name, model_fn in model_fns.items():
            results = self.evaluate_dataset(model_fn, model_name, dataset)
            # Aggregate metric scores across examples
            metric_scores = {}
            for result in results:
                for metric in result.metrics:
                    if metric.name not in metric_scores:
                        metric_scores[metric.name] = []
                    metric_scores[metric.name].append(metric.score)
            comparison[model_name] = {
                "avg_scores": {
                    name: sum(scores) / len(scores)
                    for name, scores in metric_scores.items()
                },
                "avg_latency_ms": sum(r.latency_ms for r in results) / len(results),
                "total_examples": len(results)
            }
        return comparison

    def generate_report(self) -> str:
        """Generate a Markdown evaluation report."""
        report = "# Evaluation Report\n\n"
        # Group results by model
        by_model = {}
        for result in self.results:
            if result.model not in by_model:
                by_model[result.model] = []
            by_model[result.model].append(result)

        for model, results in by_model.items():
            report += f"## {model}\n\n"
            report += f"Examples evaluated: {len(results)}\n"
            report += f"Avg latency: {sum(r.latency_ms for r in results) / len(results):.1f}ms\n\n"
            # Aggregate metric scores
            metric_scores = {}
            for result in results:
                for metric in result.metrics:
                    if metric.name not in metric_scores:
                        metric_scores[metric.name] = []
                    metric_scores[metric.name].append(metric.score)
            report += "### Metrics\n\n"
            for name, scores in metric_scores.items():
                avg = sum(scores) / len(scores)
                report += f"- {name}: {avg:.3f}\n"
            report += "\n"
        return report


# Usage
pipeline = EvalPipeline()
pipeline.add_metric(exact_match)
pipeline.add_metric(contains_answer)
pipeline.add_metric(response_length)


def gpt4_model(prompt: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content


results = pipeline.evaluate_dataset(gpt4_model, "gpt-4o", dataset)
print(pipeline.generate_report())
```
A/B Testing in Production
```python
import hashlib
import time
from collections import defaultdict
from datetime import datetime
from typing import Callable


class ABTest:
    """A/B testing for LLM variants."""

    def __init__(self, name: str, variants: dict[str, Callable]):
        self.name = name
        self.variants = variants
        self.results = defaultdict(list)
        self.assignments = {}  # user_id -> variant

    def get_variant(self, user_id: str) -> str:
        """Get a consistent variant for a user."""
        if user_id in self.assignments:
            return self.assignments[user_id]
        # Deterministic assignment based on user_id
        hash_val = int(hashlib.md5(f"{self.name}:{user_id}".encode()).hexdigest(), 16)
        variant_names = list(self.variants.keys())
        variant = variant_names[hash_val % len(variant_names)]
        self.assignments[user_id] = variant
        return variant

    def run(self, user_id: str, input_text: str) -> tuple[str, str]:
        """Run the appropriate variant for a user."""
        variant = self.get_variant(user_id)
        model_fn = self.variants[variant]
        start = time.time()
        output = model_fn(input_text)
        latency = (time.time() - start) * 1000
        self.results[variant].append({
            "user_id": user_id,
            "input": input_text,
            "output": output,
            "latency_ms": latency,
            "timestamp": datetime.now().isoformat()
        })
        return variant, output

    def record_feedback(self, user_id: str, feedback: dict):
        """Record user feedback for the user's variant."""
        variant = self.assignments.get(user_id)
        if variant:
            # Attach feedback to the most recent result for this user
            for result in reversed(self.results[variant]):
                if result["user_id"] == user_id:
                    result["feedback"] = feedback
                    break

    def get_statistics(self) -> dict:
        """Get A/B test statistics."""
        stats = {}
        for variant, results in self.results.items():
            feedbacks = [r["feedback"] for r in results if "feedback" in r]
            stats[variant] = {
                "total_requests": len(results),
                "avg_latency_ms": sum(r["latency_ms"] for r in results) / len(results) if results else 0,
                "feedback_count": len(feedbacks),
                "avg_rating": sum(f.get("rating", 0) for f in feedbacks) / len(feedbacks) if feedbacks else None,
                "thumbs_up_rate": sum(1 for f in feedbacks if f.get("thumbs_up")) / len(feedbacks) if feedbacks else None
            }
        return stats


# Usage
def variant_a(prompt: str) -> str:
    return client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    ).choices[0].message.content


def variant_b(prompt: str) -> str:
    return client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    ).choices[0].message.content


ab_test = ABTest("model_comparison", {
    "gpt-4o-mini": variant_a,
    "gpt-4o": variant_b
})

# Simulate usage
variant, response = ab_test.run("user123", "Explain machine learning")
ab_test.record_feedback("user123", {"rating": 5, "thumbs_up": True})
print(ab_test.get_statistics())
```
Production Evaluation Service
```python
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional

app = FastAPI()


class EvalRequest(BaseModel):
    input: str
    output: str
    expected: Optional[str] = None
    context: Optional[str] = None
    metrics: list[str] = ["quality", "factuality"]


class EvalResponse(BaseModel):
    scores: dict[str, float]
    details: dict


@app.post("/evaluate", response_model=EvalResponse)
async def evaluate(request: EvalRequest):
    """Evaluate a single response."""
    scores = {}
    details = {}
    if "quality" in request.metrics:
        result = llm_judge_quality(request.input, request.output)
        scores["quality"] = result.score
        details["quality"] = result.details
    if "factuality" in request.metrics:
        result = llm_judge_factuality(request.output, request.context)
        scores["factuality"] = result.score
        details["factuality"] = result.details
    if "exact_match" in request.metrics and request.expected:
        result = exact_match(request.output, request.expected)
        scores["exact_match"] = result.score
    return EvalResponse(scores=scores, details=details)


@app.post("/compare")
async def compare_responses(
    question: str,
    response_a: str,
    response_b: str
):
    """Compare two responses."""
    result = llm_judge_comparison(question, response_a, response_b)
    return result


@app.get("/ab-test/{test_name}/stats")
async def get_ab_stats(test_name: str):
    """Get A/B test statistics."""
    # In production, load from a database
    return {"message": "Load from database"}
```
References
- HELM Benchmark: https://crfm.stanford.edu/helm/
- LangSmith Evaluation: https://docs.smith.langchain.com/evaluation
- OpenAI Evals: https://github.com/openai/evals
- LMSYS Chatbot Arena: https://chat.lmsys.org/
Conclusion
Effective LLM evaluation requires multiple approaches. Use automated metrics for objective, repeatable measurements on tasks with clear answers. Employ LLM-as-judge for nuanced quality assessment of open-ended responses. Build evaluation datasets that represent your actual use cases and edge cases. Run A/B tests in production to measure real user satisfaction. Combine these approaches in evaluation pipelines that run automatically on model updates or prompt changes. The goal is not perfect metrics but actionable insights that help you improve your LLM application over time.
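As one way to run evaluations automatically on model or prompt changes, here is a minimal sketch of a CI regression gate. The baseline file path, tolerance, and `check_regression` helper are hypothetical; it only assumes the output shape of `EvalPipeline.compare_models` defined above.

```python
import json

# Hypothetical regression gate: compare current aggregate scores against a
# stored baseline and flag any metric that drops by more than a tolerance.
BASELINE_PATH = "eval_baseline.json"   # assumed file containing {"metric_name": score}
TOLERANCE = 0.02

def check_regression(comparison: dict, model_name: str) -> list[str]:
    """Return a list of regression messages; an empty list means the run passes."""
    with open(BASELINE_PATH) as f:
        baseline = json.load(f)
    current = comparison[model_name]["avg_scores"]
    failures = []
    for metric, base_score in baseline.items():
        new_score = current.get(metric, 0.0)
        if new_score < base_score - TOLERANCE:
            failures.append(f"{metric}: {new_score:.3f} < baseline {base_score:.3f}")
    return failures

# Example CI usage (after running the pipeline defined earlier):
# comparison = pipeline.compare_models({"gpt-4o": gpt4_model}, dataset)
# failures = check_regression(comparison, "gpt-4o")
# if failures:
#     raise SystemExit("Eval regression:\n" + "\n".join(failures))
```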