LLM Evaluation: Metrics, Benchmarks, and Testing Strategies That Actually Work

Introduction: How do you know if your LLM application is actually working? Evaluation is one of the most challenging aspects of building AI systems—unlike traditional software where tests pass or fail, LLM outputs exist on a spectrum of quality. This guide covers the essential metrics, benchmarks, and tools for evaluating LLMs, from automated metrics like BLEU and ROUGE to modern approaches using LLM-as-judge. Whether you’re comparing foundation models, testing RAG systems, or monitoring production quality, understanding evaluation is crucial for building reliable AI applications.

[Figure: LLM Evaluation Framework (metrics, benchmarks, and tools)]

Traditional NLP Metrics

# pip install rouge-score nltk

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# BLEU Score - measures n-gram overlap with reference
def calculate_bleu(reference: str, candidate: str) -> dict:
    """Calculate BLEU score for translation/generation quality."""
    reference_tokens = reference.lower().split()
    candidate_tokens = candidate.lower().split()
    
    # Smoothing for short sentences
    smoothie = SmoothingFunction().method1
    
    scores = {
        "bleu_1": sentence_bleu([reference_tokens], candidate_tokens, 
                                weights=(1, 0, 0, 0), smoothing_function=smoothie),
        "bleu_2": sentence_bleu([reference_tokens], candidate_tokens, 
                                weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie),
        "bleu_4": sentence_bleu([reference_tokens], candidate_tokens, 
                                weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
    }
    return scores

# ROUGE Score - recall-oriented metric for summarization
def calculate_rouge(reference: str, candidate: str) -> dict:
    """Calculate ROUGE scores for summarization quality."""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    
    return {
        "rouge1_f": scores['rouge1'].fmeasure,
        "rouge2_f": scores['rouge2'].fmeasure,
        "rougeL_f": scores['rougeL'].fmeasure
    }

# Example usage
reference = "The quick brown fox jumps over the lazy dog."
candidate = "A fast brown fox leaps over a sleepy dog."

bleu_scores = calculate_bleu(reference, candidate)
rouge_scores = calculate_rouge(reference, candidate)

print(f"BLEU-4: {bleu_scores['bleu_4']:.3f}")
print(f"ROUGE-L: {rouge_scores['rougeL_f']:.3f}")

LLM-as-Judge Evaluation

from openai import OpenAI
import json

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def llm_judge(
    question: str,
    response: str,
    criteria: list[str] = ["accuracy", "relevance", "coherence", "helpfulness"]
) -> dict:
    """Use GPT-4 as a judge to evaluate response quality."""
    
    criteria_text = "\n".join([f"- {c}" for c in criteria])
    
    prompt = f"""You are an expert evaluator. Rate the following response on a scale of 1-5 for each criterion.

Question: {question}

Response: {response}

Criteria to evaluate:
{criteria_text}

Provide your evaluation as JSON with the following format:
{{
    "scores": {{"criterion": score, ...}},
    "reasoning": "Brief explanation of scores",
    "overall_score": average_score
}}

Be strict but fair. A score of 3 is average, 5 is exceptional."""

    # Use a different name so we don't shadow the `response` argument above
    completion = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    
    return json.loads(completion.choices[0].message.content)

# Pairwise comparison
def compare_responses(question: str, response_a: str, response_b: str) -> dict:
    """Compare two responses and determine which is better."""
    
    prompt = f"""Compare these two responses to the same question. Which is better?

Question: {question}

Response A: {response_a}

Response B: {response_b}

Evaluate based on accuracy, helpfulness, and clarity.
Return JSON: {{"winner": "A" or "B" or "tie", "reasoning": "explanation"}}"""

    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    
    return json.loads(response.choices[0].message.content)

# Example
result = llm_judge(
    question="What is machine learning?",
    response="Machine learning is a subset of AI where computers learn from data without being explicitly programmed."
)
print(f"Overall Score: {result['overall_score']}/5")

RAG Evaluation with RAGAS

# pip install ragas datasets
# Note: the metric imports below follow the ragas 0.1.x API; newer releases
# expose class-based metrics, so adjust to your installed version. RAGAS calls
# an LLM under the hood, so an OpenAI API key (or another configured LLM) is required.

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
from datasets import Dataset

# Prepare evaluation data
eval_data = {
    "question": [
        "What is the capital of France?",
        "Who wrote Romeo and Juliet?"
    ],
    "answer": [
        "The capital of France is Paris, a major European city.",
        "Romeo and Juliet was written by William Shakespeare."
    ],
    "contexts": [
        ["Paris is the capital and largest city of France.", 
         "France is a country in Western Europe."],
        ["William Shakespeare wrote many famous plays.",
         "Romeo and Juliet is a tragedy written in the 1590s."]
    ],
    "ground_truth": [
        "Paris",
        "William Shakespeare"
    ]
}

dataset = Dataset.from_dict(eval_data)

# Run evaluation
results = evaluate(
    dataset,
    metrics=[
        faithfulness,      # Is the answer grounded in context?
        answer_relevancy,  # Is the answer relevant to the question?
        context_precision, # Are retrieved contexts relevant?
        context_recall     # Do contexts contain the answer?
    ]
)

print(results)

# Custom RAG evaluation without RAGAS
def evaluate_rag_response(
    question: str,
    answer: str,
    contexts: list[str],
    ground_truth: str | None = None
) -> dict:
    """Evaluate RAG response quality."""
    
    prompt = f"""Evaluate this RAG (Retrieval-Augmented Generation) response.

Question: {question}
Retrieved Contexts: {contexts}
Generated Answer: {answer}
{f'Ground Truth: {ground_truth}' if ground_truth else ''}

Rate 1-5 on:
1. Faithfulness: Is the answer supported by the contexts?
2. Relevance: Does the answer address the question?
3. Context Quality: Are the retrieved contexts useful?
4. Completeness: Does the answer fully address the question?

Return JSON with scores and brief reasoning."""

    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    
    return json.loads(response.choices[0].message.content)
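
A quick call against the first example from the RAGAS dataset above; the scores come from a judge model, so expect some run-to-run variation:

# Illustrative call reusing the toy RAG example from earlier
rag_scores = evaluate_rag_response(
    question="What is the capital of France?",
    answer="The capital of France is Paris, a major European city.",
    contexts=["Paris is the capital and largest city of France.",
              "France is a country in Western Europe."],
    ground_truth="Paris"
)
print(rag_scores)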

Unit Testing LLMs with DeepEval

# pip install deepeval
# DeepEval's metrics use an LLM judge under the hood (OpenAI by default),
# so an API key is expected in the environment.

from deepeval import assert_test
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
    ToxicityMetric
)
from deepeval.test_case import LLMTestCase

# Define test cases
def test_answer_relevancy():
    """Test that answers are relevant to questions."""
    
    test_case = LLMTestCase(
        input="What are the benefits of exercise?",
        actual_output="Regular exercise improves cardiovascular health, boosts mood, and helps maintain healthy weight.",
        retrieval_context=["Exercise has many health benefits including improved heart health."]
    )
    
    metric = AnswerRelevancyMetric(threshold=0.7)
    assert_test(test_case, [metric])

def test_no_hallucination():
    """Test that responses don't hallucinate facts."""
    
    test_case = LLMTestCase(
        input="What is Python?",
        actual_output="Python is a programming language created by Guido van Rossum in 1991.",
        context=["Python is a high-level programming language.", 
                 "Guido van Rossum created Python in 1991."]
    )
    
    metric = HallucinationMetric(threshold=0.5)
    assert_test(test_case, [metric])

def test_no_toxicity():
    """Test that responses are not toxic."""
    
    test_case = LLMTestCase(
        input="Tell me about competitive sports.",
        actual_output="Competitive sports teach valuable lessons about teamwork, perseverance, and fair play."
    )
    
    metric = ToxicityMetric(threshold=0.5)
    assert_test(test_case, [metric])

# Run with pytest
# pytest test_llm.py -v

# Batch evaluation
from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset(test_cases=[
    LLMTestCase(input="Q1", actual_output="A1"),
    LLMTestCase(input="Q2", actual_output="A2"),
])

# The batch API has shifted across deepeval releases; recent versions also
# expose `from deepeval import evaluate`, which takes test cases and metrics
# directly. Check the docs for your installed version.
results = dataset.evaluate([AnswerRelevancyMetric()])

Standard Benchmarks

Benchmark  | What It Tests         | Tasks                                | Top Scores
MMLU       | Knowledge breadth     | 57 subjects (STEM, humanities, etc.) | GPT-4: 86.4%
HumanEval  | Code generation       | 164 Python problems                  | GPT-4: 67%
GSM8K      | Math reasoning        | 8.5K grade school problems           | GPT-4: 92%
HellaSwag  | Commonsense reasoning | Sentence completion                  | GPT-4: 95.3%
TruthfulQA | Truthfulness          | 817 questions                        | Claude 3: 89%
MBPP       | Basic Python          | 974 problems                         | GPT-4: 80%
ARC        | Science reasoning     | 7.7K questions                       | GPT-4: 96.3%
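
Published leaderboard numbers like these come from standardized harnesses (for example, EleutherAI's lm-evaluation-harness). If you just want a rough sense of how your own model wrapper performs on one of these benchmarks, a minimal sketch like the one below can help. It assumes the GSM8K test split from the Hugging Face Hub and a hypothetical `model_fn` callable of your own that takes a prompt string and returns a completion string, and it only checks whether the gold final number appears in the output, which is far looser than the official evaluation protocol.

# Rough GSM8K spot-check: a loose sketch, not the official evaluation protocol.
# Assumes `pip install datasets` and a model_fn(prompt) -> str callable of your own.
from datasets import load_dataset

def rough_gsm8k_accuracy(model_fn, n_samples: int = 20) -> float:
    """Check whether the gold final answer (after '####') appears in the model output."""
    data = load_dataset("gsm8k", "main", split=f"test[:{n_samples}]")
    correct = 0
    for row in data:
        gold = row["answer"].split("####")[-1].strip()
        prediction = model_fn(row["question"] + "\nGive the final numeric answer.")
        if gold in prediction:
            correct += 1
    return correct / n_samples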

Building an Evaluation Pipeline

import time
from dataclasses import dataclass
from typing import Callable
import pandas as pd

@dataclass
class EvalCase:
    id: str
    input: str
    expected: str | None = None
    context: list[str] | None = None
    metadata: dict | None = None

@dataclass
class EvalResult:
    case_id: str
    output: str
    scores: dict
    latency_ms: float
    tokens_used: int

class LLMEvaluator:
    """Comprehensive LLM evaluation pipeline."""
    
    def __init__(self, model_fn: Callable, judge_model: str = "gpt-4-turbo-preview"):
        self.model_fn = model_fn
        self.judge_model = judge_model
        self.results = []
    
    def evaluate_case(self, case: EvalCase) -> EvalResult:
        """Evaluate a single test case."""
        start = time.time()
        output, tokens = self.model_fn(case.input)
        latency = (time.time() - start) * 1000
        
        # Calculate scores
        scores = {}
        
        # Exact match (if expected provided)
        if case.expected:
            scores["exact_match"] = 1.0 if output.strip() == case.expected.strip() else 0.0
            scores["contains_answer"] = 1.0 if case.expected.lower() in output.lower() else 0.0
        
        # LLM judge scores
        judge_result = self._llm_judge(case.input, output, case.context)
        scores.update(judge_result)
        
        return EvalResult(
            case_id=case.id,
            output=output,
            scores=scores,
            latency_ms=latency,
            tokens_used=tokens
        )
    
    def _llm_judge(self, question: str, answer: str, context: list | None = None) -> dict:
        """Get LLM judge scores (stub: wire in the llm_judge function from earlier)."""
        # Fixed scores keep the example self-contained; replace with a real judge call.
        return {"relevance": 4.0, "coherence": 4.5, "accuracy": 4.0}
    
    def run_evaluation(self, cases: list[EvalCase]) -> pd.DataFrame:
        """Run evaluation on all cases."""
        results = []
        
        for case in cases:
            result = self.evaluate_case(case)
            self.results.append(result)
            results.append({
                "id": result.case_id,
                "output": result.output[:100] + "...",
                **result.scores,
                "latency_ms": result.latency_ms,
                "tokens": result.tokens_used
            })
        
        df = pd.DataFrame(results)
        return df
    
    def summary(self) -> dict:
        """Generate evaluation summary."""
        if not self.results:
            return {}
        
        all_scores = {}
        for result in self.results:
            for metric, score in result.scores.items():
                if metric not in all_scores:
                    all_scores[metric] = []
                all_scores[metric].append(score)
        
        return {
            "total_cases": len(self.results),
            "avg_latency_ms": sum(r.latency_ms for r in self.results) / len(self.results),
            "avg_tokens": sum(r.tokens_used for r in self.results) / len(self.results),
            "metrics": {k: sum(v)/len(v) for k, v in all_scores.items()}
        }

# Usage
def my_model(prompt: str) -> tuple[str, int]:
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content, response.usage.total_tokens

evaluator = LLMEvaluator(my_model)
cases = [
    EvalCase(id="1", input="What is 2+2?", expected="4"),
    EvalCase(id="2", input="Capital of Japan?", expected="Tokyo"),
]

results_df = evaluator.run_evaluation(cases)
print(evaluator.summary())
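
Once this pipeline runs in CI, the summary can double as a regression gate. A minimal sketch, where the threshold values are illustrative assumptions rather than recommendations:

# Minimal regression gate: thresholds below are illustrative, tune them for your app
summary = evaluator.summary()
assert summary["metrics"]["relevance"] >= 3.5, "Relevance score regressed"
assert summary["avg_latency_ms"] < 5000, "Latency regressed"

# Persist results so runs can be compared over time
results_df.to_csv("eval_results.csv", index=False)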


Conclusion

LLM evaluation is both an art and a science. Traditional metrics like BLEU and ROUGE provide quick, reproducible scores but miss nuances that matter for real applications. LLM-as-judge approaches offer more sophisticated evaluation but introduce their own biases and costs. The best strategy combines multiple approaches: use automated metrics for fast iteration, LLM judges for nuanced quality assessment, and human evaluation for critical decisions. For RAG systems, tools like RAGAS provide specialized metrics for retrieval and generation quality. Build evaluation into your development workflow from the start—it’s much harder to add later. Remember that the goal isn’t perfect scores on benchmarks, but building systems that reliably help users accomplish their goals.

