LLM Evaluation: Metrics, Benchmarks, and A/B Testing

Introduction

Evaluating LLM outputs is challenging because there’s often no single “correct” answer. Traditional metrics like BLEU and ROUGE fall short for open-ended generation. This guide covers modern evaluation approaches: automated metrics for specific tasks, LLM-as-judge for quality assessment, human evaluation frameworks, A/B testing in production, and building comprehensive evaluation pipelines. These techniques help you measure what matters (whether your LLM application actually solves user problems) and make data-driven decisions about model selection and prompt optimization.
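
To make the BLEU/ROUGE limitation concrete, here is a toy unigram-recall scorer in the spirit of ROUGE-1 (a simplification for illustration, not the real metric): two equally correct answers can receive wildly different scores when judged by token overlap alone.

def unigram_recall(prediction: str, reference: str) -> float:
    """Toy ROUGE-1-style recall: fraction of reference words found in the prediction."""
    ref_tokens = reference.lower().split()
    pred_tokens = set(prediction.lower().split())
    if not ref_tokens:
        return 0.0
    return sum(1 for t in ref_tokens if t in pred_tokens) / len(ref_tokens)

reference = "Paris is the capital of France"
print(unigram_recall("The capital of France is Paris", reference))  # 1.0
print(unigram_recall("That would be Paris", reference))             # ~0.17, yet equally correct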


Automated Metrics

from dataclasses import dataclass
from typing import Callable
import re

@dataclass
class EvalMetric:
    name: str
    score: float
    details: dict | None = None

def exact_match(prediction: str, reference: str) -> EvalMetric:
    """Check if prediction exactly matches reference."""
    
    # Normalize whitespace and case
    pred_norm = " ".join(prediction.lower().split())
    ref_norm = " ".join(reference.lower().split())
    
    match = pred_norm == ref_norm
    
    return EvalMetric(
        name="exact_match",
        score=1.0 if match else 0.0,
        details={"prediction": pred_norm, "reference": ref_norm}
    )

def contains_answer(prediction: str, reference: str) -> EvalMetric:
    """Check if prediction contains the reference answer."""
    
    pred_lower = prediction.lower()
    ref_lower = reference.lower()
    
    contains = ref_lower in pred_lower
    
    return EvalMetric(
        name="contains_answer",
        score=1.0 if contains else 0.0
    )

def json_validity(prediction: str) -> EvalMetric:
    """Check if prediction is valid JSON."""
    
    import json
    
    # Try to extract JSON from response
    json_match = re.search(r'\{.*\}', prediction, re.DOTALL)
    
    if not json_match:
        return EvalMetric(name="json_validity", score=0.0, details={"error": "No JSON found"})
    
    try:
        parsed = json.loads(json_match.group())
        return EvalMetric(
            name="json_validity",
            score=1.0,
            details={"parsed": parsed}
        )
    except json.JSONDecodeError as e:
        return EvalMetric(
            name="json_validity",
            score=0.0,
            details={"error": str(e)}
        )

def schema_compliance(prediction: str, required_fields: list[str]) -> EvalMetric:
    """Check if JSON output has required fields."""
    
    import json
    
    try:
        json_match = re.search(r'\{.*\}', prediction, re.DOTALL)
        if not json_match:
            return EvalMetric(name="schema_compliance", score=0.0)
        
        parsed = json.loads(json_match.group())
        
        present = [f for f in required_fields if f in parsed]
        score = len(present) / len(required_fields) if required_fields else 1.0
        
        return EvalMetric(
            name="schema_compliance",
            score=score,
            details={
                "required": required_fields,
                "present": present,
                "missing": [f for f in required_fields if f not in parsed]
            }
        )
    except Exception as e:
        return EvalMetric(name="schema_compliance", score=0.0, details={"error": str(e)})

def response_length(prediction: str, min_words: int = 10, max_words: int = 500) -> EvalMetric:
    """Check if response length is within bounds."""
    
    word_count = len(prediction.split())
    
    if min_words <= word_count <= max_words:
        score = 1.0
    elif word_count < min_words:
        score = word_count / min_words
    else:
        score = max(0, 1 - (word_count - max_words) / max_words)
    
    return EvalMetric(
        name="response_length",
        score=score,
        details={"word_count": word_count, "min": min_words, "max": max_words}
    )

# Usage
prediction = '{"name": "John", "age": 30}'
print(json_validity(prediction))
print(schema_compliance(prediction, ["name", "age", "email"]))
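
Individual checks like these can also be folded into a single weighted score for quick pass/fail dashboards. A minimal sketch built on the functions above (the weights are arbitrary placeholders, not recommendations):

def composite_score(prediction: str, reference: str) -> float:
    """Weighted blend of the automated metrics defined above (illustrative weights)."""
    weighted = [
        (exact_match(prediction, reference), 0.5),
        (contains_answer(prediction, reference), 0.3),
        (response_length(prediction, min_words=1, max_words=200), 0.2),
    ]
    return sum(metric.score * weight for metric, weight in weighted)

print(composite_score("The capital of France is Paris.", "Paris"))  # 0.5: contains + length, no exact match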

LLM-as-Judge

from openai import OpenAI
import json

client = OpenAI()

def llm_judge_quality(
    question: str,
    response: str,
    criteria: list[str] | None = None
) -> EvalMetric:
    """Use LLM to judge response quality."""
    
    if criteria is None:
        criteria = ["relevance", "accuracy", "completeness", "clarity"]
    
    criteria_text = "\n".join([f"- {c}" for c in criteria])
    
    prompt = f"""Evaluate this response on the following criteria:
{criteria_text}

Question: {question}

Response: {response}

For each criterion, provide a score from 1-5 and brief justification.
Then provide an overall score from 1-5.

Return JSON:
{{
    "criteria_scores": {{"criterion": {{"score": 1-5, "reason": "..."}}}},
    "overall_score": 1-5,
    "overall_reason": "..."
}}"""
    
    result = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    
    data = json.loads(result.choices[0].message.content)
    
    return EvalMetric(
        name="llm_judge_quality",
        score=data["overall_score"] / 5.0,  # Normalize to 0-1
        details=data
    )

def llm_judge_comparison(
    question: str,
    response_a: str,
    response_b: str
) -> dict:
    """Compare two responses using LLM."""
    
    prompt = f"""Compare these two responses to the question.

Question: {question}

Response A: {response_a}

Response B: {response_b}

Which response is better? Consider:
- Accuracy and correctness
- Completeness
- Clarity and helpfulness
- Conciseness

Return JSON:
{{
    "winner": "A" or "B" or "tie",
    "confidence": 0.0-1.0,
    "reasoning": "explanation"
}}"""
    
    result = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    
    return json.loads(result.choices[0].message.content)

def llm_judge_factuality(
    response: str,
    context: str | None = None
) -> EvalMetric:
    """Check if response contains factual errors."""
    
    prompt = f"""Analyze this response for factual accuracy.

{f"Context (ground truth): {context}" if context else ""}

Response to evaluate: {response}

Identify any factual errors, unsupported claims, or hallucinations.

Return JSON:
{{
    "is_factual": true/false,
    "errors": ["list of errors found"],
    "unsupported_claims": ["claims without evidence"],
    "confidence": 0.0-1.0
}}"""
    
    result = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    
    data = json.loads(result.choices[0].message.content)
    
    return EvalMetric(
        name="factuality",
        score=1.0 if data["is_factual"] else 0.0,
        details=data
    )

# Usage
question = "What is the capital of France?"
response = "The capital of France is Paris, which is also the largest city in the country."

quality = llm_judge_quality(question, response)
print(f"Quality score: {quality.score:.2f}")
print(f"Details: {quality.details}")

Evaluation Dataset Management

from dataclasses import dataclass, field
from typing import Optional, Any
import json
from pathlib import Path

@dataclass
class EvalExample:
    id: str
    input: str
    expected_output: Optional[str] = None
    context: Optional[str] = None
    metadata: dict = field(default_factory=dict)
    tags: list[str] = field(default_factory=list)

@dataclass
class EvalResult:
    example_id: str
    model: str
    output: str
    metrics: list[EvalMetric]
    latency_ms: float
    timestamp: str

class EvalDataset:
    """Manage evaluation datasets."""
    
    def __init__(self, name: str):
        self.name = name
        self.examples: list[EvalExample] = []
    
    def add_example(self, example: EvalExample):
        """Add an evaluation example."""
        self.examples.append(example)
    
    def add_examples_from_jsonl(self, filepath: str):
        """Load examples from JSONL file."""
        
        with open(filepath) as f:
            for line in f:
                data = json.loads(line)
                self.examples.append(EvalExample(**data))
    
    def filter_by_tag(self, tag: str) -> list[EvalExample]:
        """Get examples with a specific tag."""
        return [e for e in self.examples if tag in e.tags]
    
    def save(self, filepath: str):
        """Save dataset to file."""
        
        with open(filepath, 'w') as f:
            for example in self.examples:
                f.write(json.dumps({
                    "id": example.id,
                    "input": example.input,
                    "expected_output": example.expected_output,
                    "context": example.context,
                    "metadata": example.metadata,
                    "tags": example.tags
                }) + '\n')
    
    def get_statistics(self) -> dict:
        """Get dataset statistics."""
        
        return {
            "total_examples": len(self.examples),
            "with_expected_output": sum(1 for e in self.examples if e.expected_output),
            "with_context": sum(1 for e in self.examples if e.context),
            "tags": list(set(tag for e in self.examples for tag in e.tags)),
            "avg_input_length": sum(len(e.input) for e in self.examples) / len(self.examples) if self.examples else 0
        }

# Create evaluation dataset
dataset = EvalDataset("qa_benchmark")

dataset.add_example(EvalExample(
    id="q1",
    input="What is the capital of France?",
    expected_output="Paris",
    tags=["geography", "factual"]
))

dataset.add_example(EvalExample(
    id="q2",
    input="Explain quantum computing in simple terms.",
    expected_output=None,  # Open-ended, use LLM judge
    tags=["explanation", "technical"]
))

print(dataset.get_statistics())
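
Datasets persist as JSONL, which makes them easy to version alongside your prompts. A quick roundtrip (the file path is illustrative):

dataset.save("qa_benchmark.jsonl")

reloaded = EvalDataset("qa_benchmark")
reloaded.add_examples_from_jsonl("qa_benchmark.jsonl")
print(reloaded.get_statistics())
print(len(reloaded.filter_by_tag("factual")), "factual examples")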

Evaluation Pipeline

from typing import Callable
import time
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor

class EvalPipeline:
    """Run evaluations across models and datasets."""
    
    def __init__(self):
        self.metrics: list[Callable] = []
        self.results: list[EvalResult] = []
    
    def add_metric(self, metric_fn: Callable):
        """Add a metric function."""
        self.metrics.append(metric_fn)
    
    def evaluate_single(
        self,
        model_fn: Callable[[str], str],
        model_name: str,
        example: EvalExample
    ) -> EvalResult:
        """Evaluate a single example."""
        
        start = time.time()
        output = model_fn(example.input)
        latency = (time.time() - start) * 1000
        
        # Run all metrics
        metric_results = []
        for metric_fn in self.metrics:
            try:
                # Different metrics need different inputs
                if metric_fn.__name__ in ["exact_match", "contains_answer"]:
                    if example.expected_output:
                        result = metric_fn(output, example.expected_output)
                        metric_results.append(result)
                elif metric_fn.__name__ == "llm_judge_quality":
                    result = metric_fn(example.input, output)
                    metric_results.append(result)
                elif metric_fn.__name__ == "llm_judge_factuality":
                    result = metric_fn(output, example.context)
                    metric_results.append(result)
                else:
                    result = metric_fn(output)
                    metric_results.append(result)
            except Exception as e:
                metric_results.append(EvalMetric(
                    name=metric_fn.__name__,
                    score=0.0,
                    details={"error": str(e)}
                ))
        
        return EvalResult(
            example_id=example.id,
            model=model_name,
            output=output,
            metrics=metric_results,
            latency_ms=latency,
            timestamp=datetime.now().isoformat()
        )
    
    def evaluate_dataset(
        self,
        model_fn: Callable[[str], str],
        model_name: str,
        dataset: EvalDataset,
        parallel: bool = False,
        max_workers: int = 5
    ) -> list[EvalResult]:
        """Evaluate entire dataset."""
        
        results = []
        
        if parallel:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [
                    executor.submit(self.evaluate_single, model_fn, model_name, example)
                    for example in dataset.examples
                ]
                results = [f.result() for f in futures]
        else:
            for example in dataset.examples:
                result = self.evaluate_single(model_fn, model_name, example)
                results.append(result)
        
        self.results.extend(results)
        return results
    
    def compare_models(
        self,
        model_fns: dict[str, Callable],
        dataset: EvalDataset
    ) -> dict:
        """Compare multiple models on same dataset."""
        
        comparison = {}
        
        for model_name, model_fn in model_fns.items():
            results = self.evaluate_dataset(model_fn, model_name, dataset)
            
            # Aggregate metrics
            metric_scores = {}
            for result in results:
                for metric in result.metrics:
                    if metric.name not in metric_scores:
                        metric_scores[metric.name] = []
                    metric_scores[metric.name].append(metric.score)
            
            comparison[model_name] = {
                "avg_scores": {
                    name: sum(scores) / len(scores)
                    for name, scores in metric_scores.items()
                },
                "avg_latency_ms": sum(r.latency_ms for r in results) / len(results),
                "total_examples": len(results)
            }
        
        return comparison
    
    def generate_report(self) -> str:
        """Generate evaluation report."""
        
        report = "# Evaluation Report\n\n"
        
        # Group by model
        by_model = {}
        for result in self.results:
            if result.model not in by_model:
                by_model[result.model] = []
            by_model[result.model].append(result)
        
        for model, results in by_model.items():
            report += f"## {model}\n\n"
            report += f"Examples evaluated: {len(results)}\n"
            report += f"Avg latency: {sum(r.latency_ms for r in results) / len(results):.1f}ms\n\n"
            
            # Aggregate metrics
            metric_scores = {}
            for result in results:
                for metric in result.metrics:
                    if metric.name not in metric_scores:
                        metric_scores[metric.name] = []
                    metric_scores[metric.name].append(metric.score)
            
            report += "### Metrics\n\n"
            for name, scores in metric_scores.items():
                avg = sum(scores) / len(scores)
                report += f"- {name}: {avg:.3f}\n"
            
            report += "\n"
        
        return report

# Usage
pipeline = EvalPipeline()
pipeline.add_metric(exact_match)
pipeline.add_metric(contains_answer)
pipeline.add_metric(response_length)

def gpt4_model(prompt: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

results = pipeline.evaluate_dataset(gpt4_model, "gpt-4o", dataset)
print(pipeline.generate_report())
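
The same pipeline can also compare several model functions head to head via compare_models. A sketch assuming a second, cheaper model function:

def gpt4o_mini_model(prompt: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

comparison = pipeline.compare_models(
    {"gpt-4o": gpt4_model, "gpt-4o-mini": gpt4o_mini_model},
    dataset
)
for model_name, stats in comparison.items():
    print(model_name, stats["avg_scores"], f"{stats['avg_latency_ms']:.0f}ms")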

A/B Testing in Production

import random
from collections import defaultdict
import hashlib

class ABTest:
    """A/B testing for LLM variants."""
    
    def __init__(self, name: str, variants: dict[str, Callable]):
        self.name = name
        self.variants = variants
        self.results = defaultdict(list)
        self.assignments = {}  # user_id -> variant
    
    def get_variant(self, user_id: str) -> str:
        """Get consistent variant for user."""
        
        if user_id in self.assignments:
            return self.assignments[user_id]
        
        # Deterministic assignment based on user_id
        hash_val = int(hashlib.md5(f"{self.name}:{user_id}".encode()).hexdigest(), 16)
        variant_names = list(self.variants.keys())
        variant = variant_names[hash_val % len(variant_names)]
        
        self.assignments[user_id] = variant
        return variant
    
    def run(self, user_id: str, input_text: str) -> tuple[str, str]:
        """Run the appropriate variant for user."""
        
        variant = self.get_variant(user_id)
        model_fn = self.variants[variant]
        
        start = time.time()
        output = model_fn(input_text)
        latency = (time.time() - start) * 1000
        
        self.results[variant].append({
            "user_id": user_id,
            "input": input_text,
            "output": output,
            "latency_ms": latency,
            "timestamp": datetime.now().isoformat()
        })
        
        return variant, output
    
    def record_feedback(self, user_id: str, feedback: dict):
        """Record user feedback for their variant."""
        
        variant = self.assignments.get(user_id)
        if variant:
            # Find the most recent result for this user
            for result in reversed(self.results[variant]):
                if result["user_id"] == user_id:
                    result["feedback"] = feedback
                    break
    
    def get_statistics(self) -> dict:
        """Get A/B test statistics."""
        
        stats = {}
        
        for variant, results in self.results.items():
            feedbacks = [r.get("feedback", {}) for r in results if "feedback" in r]
            
            stats[variant] = {
                "total_requests": len(results),
                "avg_latency_ms": sum(r["latency_ms"] for r in results) / len(results) if results else 0,
                "feedback_count": len(feedbacks),
                "avg_rating": sum(f.get("rating", 0) for f in feedbacks) / len(feedbacks) if feedbacks else None,
                "thumbs_up_rate": sum(1 for f in feedbacks if f.get("thumbs_up")) / len(feedbacks) if feedbacks else None
            }
        
        return stats

# Usage
def variant_a(prompt: str) -> str:
    return client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    ).choices[0].message.content

def variant_b(prompt: str) -> str:
    return client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    ).choices[0].message.content

ab_test = ABTest("model_comparison", {
    "gpt-4o-mini": variant_a,
    "gpt-4o": variant_b
})

# Simulate usage
variant, response = ab_test.run("user123", "Explain machine learning")
ab_test.record_feedback("user123", {"rating": 5, "thumbs_up": True})

print(ab_test.get_statistics())
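
Before declaring a winner, check that the difference in feedback is unlikely to be noise. A minimal two-proportion z-test on thumbs-up rates (a pure-math sketch; in practice you would use a stats library and pick your significance threshold up front):

import math

def thumbs_up_significance(stats_a: dict, stats_b: dict) -> float:
    """Approximate two-proportion z statistic; |z| > 1.96 suggests significance at the 5% level."""
    n_a, n_b = stats_a["feedback_count"], stats_b["feedback_count"]
    p_a, p_b = stats_a["thumbs_up_rate"], stats_b["thumbs_up_rate"]
    if not n_a or not n_b or p_a is None or p_b is None:
        return 0.0
    p_pool = (p_a * n_a + p_b * n_b) / (n_a + n_b)
    se = math.sqrt(p_pool * (1 - p_pool) * (1 / n_a + 1 / n_b))
    return (p_a - p_b) / se if se else 0.0

stats = ab_test.get_statistics()
if "gpt-4o-mini" in stats and "gpt-4o" in stats:
    z = thumbs_up_significance(stats["gpt-4o-mini"], stats["gpt-4o"])
    print(f"z = {z:.2f}")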

Production Evaluation Service

from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional

app = FastAPI()

class EvalRequest(BaseModel):
    input: str
    output: str
    expected: Optional[str] = None
    context: Optional[str] = None
    metrics: list[str] = ["quality", "factuality"]

class EvalResponse(BaseModel):
    scores: dict[str, float]
    details: dict

@app.post("/evaluate", response_model=EvalResponse)
async def evaluate(request: EvalRequest):
    """Evaluate a single response."""
    
    scores = {}
    details = {}
    
    if "quality" in request.metrics:
        result = llm_judge_quality(request.input, request.output)
        scores["quality"] = result.score
        details["quality"] = result.details
    
    if "factuality" in request.metrics:
        result = llm_judge_factuality(request.output, request.context)
        scores["factuality"] = result.score
        details["factuality"] = result.details
    
    if "exact_match" in request.metrics and request.expected:
        result = exact_match(request.output, request.expected)
        scores["exact_match"] = result.score
    
    return EvalResponse(scores=scores, details=details)

@app.post("/compare")
async def compare_responses(
    question: str,
    response_a: str,
    response_b: str
):
    """Compare two responses."""
    
    result = llm_judge_comparison(question, response_a, response_b)
    return result

@app.get("/ab-test/{test_name}/stats")
async def get_ab_stats(test_name: str):
    """Get A/B test statistics."""
    
    # In production, load from database
    return {"message": "Load from database"}

Conclusion

Effective LLM evaluation requires multiple approaches. Use automated metrics for objective, repeatable measurements on tasks with clear answers. Employ LLM-as-judge for nuanced quality assessment of open-ended responses. Build evaluation datasets that represent your actual use cases and edge cases. Run A/B tests in production to measure real user satisfaction. Combine these approaches in evaluation pipelines that run automatically on model updates or prompt changes. The goal is not perfect metrics but actionable insights that help you improve your LLM application over time.
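
As a closing sketch, here is one way those pipeline runs can gate a deployment: rerun the evaluation on a fixed dataset and fail the job when any average metric drops below a chosen threshold (thresholds and metric names below are illustrative, typically derived from a known-good baseline run).

def regression_gate(results: list[EvalResult], thresholds: dict[str, float]) -> bool:
    """Return False if any average metric falls below its threshold (illustrative CI gate)."""
    scores: dict[str, list[float]] = {}
    for result in results:
        for metric in result.metrics:
            scores.setdefault(metric.name, []).append(metric.score)

    passed = True
    for name, threshold in thresholds.items():
        values = scores.get(name, [0.0])
        avg = sum(values) / len(values)
        print(f"{name}: {avg:.3f} (threshold {threshold})")
        if avg < threshold:
            passed = False
    return passed

if not regression_gate(results, {"contains_answer": 0.8, "response_length": 0.9}):
    raise SystemExit("Evaluation regression detected")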

