## Introduction

How do you know if your LLM application is actually working? Evaluation is one of the most challenging aspects of building AI systems: unlike traditional software, where tests pass or fail, LLM outputs exist on a spectrum of quality. This guide covers the essential metrics, benchmarks, and tools for evaluating LLMs, from automated metrics like BLEU and ROUGE to modern approaches using LLM-as-judge. Whether you’re comparing foundation models, testing RAG systems, or monitoring production quality, understanding evaluation is crucial for building reliable AI applications.

## Traditional NLP Metrics

```python
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


# BLEU Score - measures n-gram overlap with reference
def calculate_bleu(reference: str, candidate: str) -> dict:
    """Calculate BLEU score for translation/generation quality."""
    reference_tokens = reference.lower().split()
    candidate_tokens = candidate.lower().split()
    # Smoothing for short sentences
    smoothie = SmoothingFunction().method1
    scores = {
        "bleu_1": sentence_bleu([reference_tokens], candidate_tokens,
                                weights=(1, 0, 0, 0), smoothing_function=smoothie),
        "bleu_2": sentence_bleu([reference_tokens], candidate_tokens,
                                weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie),
        "bleu_4": sentence_bleu([reference_tokens], candidate_tokens,
                                weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie),
    }
    return scores


# ROUGE Score - recall-oriented metric for summarization
def calculate_rouge(reference: str, candidate: str) -> dict:
    """Calculate ROUGE scores for summarization quality."""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return {
        "rouge1_f": scores['rouge1'].fmeasure,
        "rouge2_f": scores['rouge2'].fmeasure,
        "rougeL_f": scores['rougeL'].fmeasure,
    }


# Example usage
reference = "The quick brown fox jumps over the lazy dog."
candidate = "A fast brown fox leaps over a sleepy dog."
bleu_scores = calculate_bleu(reference, candidate)
rouge_scores = calculate_rouge(reference, candidate)
print(f"BLEU-4: {bleu_scores['bleu_4']:.3f}")
print(f"ROUGE-L: {rouge_scores['rougeL_f']:.3f}")
```
## LLM-as-Judge Evaluation

```python
import json

from openai import OpenAI

client = OpenAI()


def llm_judge(
    question: str,
    response: str,
    criteria: list[str] = ["accuracy", "relevance", "coherence", "helpfulness"]
) -> dict:
    """Use GPT-4 as a judge to evaluate response quality."""
    criteria_text = "\n".join([f"- {c}" for c in criteria])
    prompt = f"""You are an expert evaluator. Rate the following response on a scale of 1-5 for each criterion.
Question: {question}
Response: {response}
Criteria to evaluate:
{criteria_text}
Provide your evaluation as JSON with the following format:
{{
"scores": {{"criterion": score, ...}},
"reasoning": "Brief explanation of scores",
"overall_score": average_score
}}
Be strict but fair. A score of 3 is average, 5 is exceptional."""
    # Named `completion` so it doesn't shadow the `response` argument above
    completion = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    return json.loads(completion.choices[0].message.content)


# Pairwise comparison
def compare_responses(question: str, response_a: str, response_b: str) -> dict:
    """Compare two responses and determine which is better."""
    prompt = f"""Compare these two responses to the same question. Which is better?
Question: {question}
Response A: {response_a}
Response B: {response_b}
Evaluate based on accuracy, helpfulness, and clarity.
Return JSON: {{"winner": "A" or "B" or "tie", "reasoning": "explanation"}}"""
    completion = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    return json.loads(completion.choices[0].message.content)


# Example
result = llm_judge(
    question="What is machine learning?",
    response="Machine learning is a subset of AI where computers learn from data without being explicitly programmed."
)
print(f"Overall Score: {result['overall_score']}/5")
```
## RAG Evaluation with RAGAS

```python
# pip install ragas
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
from datasets import Dataset

# Prepare evaluation data
eval_data = {
    "question": [
        "What is the capital of France?",
        "Who wrote Romeo and Juliet?"
    ],
    "answer": [
        "The capital of France is Paris, a major European city.",
        "Romeo and Juliet was written by William Shakespeare."
    ],
    "contexts": [
        ["Paris is the capital and largest city of France.",
         "France is a country in Western Europe."],
        ["William Shakespeare wrote many famous plays.",
         "Romeo and Juliet is a tragedy written in the 1590s."]
    ],
    "ground_truth": [
        "Paris",
        "William Shakespeare"
    ]
}

dataset = Dataset.from_dict(eval_data)

# Run evaluation
results = evaluate(
    dataset,
    metrics=[
        faithfulness,        # Is the answer grounded in context?
        answer_relevancy,    # Is the answer relevant to the question?
        context_precision,   # Are retrieved contexts relevant?
        context_recall       # Do contexts contain the answer?
    ]
)
print(results)


# Custom RAG evaluation without RAGAS
def evaluate_rag_response(
    question: str,
    answer: str,
    contexts: list[str],
    ground_truth: str | None = None
) -> dict:
    """Evaluate RAG response quality (reuses the OpenAI client from the judge section)."""
    prompt = f"""Evaluate this RAG (Retrieval-Augmented Generation) response.
Question: {question}
Retrieved Contexts: {contexts}
Generated Answer: {answer}
{f'Ground Truth: {ground_truth}' if ground_truth else ''}
Rate 1-5 on:
1. Faithfulness: Is the answer supported by the contexts?
2. Relevance: Does the answer address the question?
3. Context Quality: Are the retrieved contexts useful?
4. Completeness: Does the answer fully address the question?
Return JSON with scores and brief reasoning."""
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    return json.loads(response.choices[0].message.content)
```
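For illustration, here is how the custom evaluator might be called on the first row of the `eval_data` dictionary defined above. This is just a usage sketch; the exact keys in the returned JSON depend on how the judge model formats its answer.

```python
# Run the custom judge on the first RAGAS example above
rag_scores = evaluate_rag_response(
    question=eval_data["question"][0],
    answer=eval_data["answer"][0],
    contexts=eval_data["contexts"][0],
    ground_truth=eval_data["ground_truth"][0],
)
print(rag_scores)
```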
## Unit Testing LLMs with DeepEval

```python
# pip install deepeval
from deepeval import assert_test
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
    ToxicityMetric
)
from deepeval.test_case import LLMTestCase


# Define test cases
def test_answer_relevancy():
    """Test that answers are relevant to questions."""
    test_case = LLMTestCase(
        input="What are the benefits of exercise?",
        actual_output="Regular exercise improves cardiovascular health, boosts mood, and helps maintain healthy weight.",
        retrieval_context=["Exercise has many health benefits including improved heart health."]
    )
    metric = AnswerRelevancyMetric(threshold=0.7)
    assert_test(test_case, [metric])


def test_no_hallucination():
    """Test that responses don't hallucinate facts."""
    test_case = LLMTestCase(
        input="What is Python?",
        actual_output="Python is a programming language created by Guido van Rossum in 1991.",
        context=["Python is a high-level programming language.",
                 "Guido van Rossum created Python in 1991."]
    )
    metric = HallucinationMetric(threshold=0.5)
    assert_test(test_case, [metric])


def test_no_toxicity():
    """Test that responses are not toxic."""
    test_case = LLMTestCase(
        input="Tell me about competitive sports.",
        actual_output="Competitive sports teach valuable lessons about teamwork, perseverance, and fair play."
    )
    metric = ToxicityMetric(threshold=0.5)
    assert_test(test_case, [metric])


# Run with pytest
# pytest test_llm.py -v

# Batch evaluation
from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset(test_cases=[
    LLMTestCase(input="Q1", actual_output="A1"),
    LLMTestCase(input="Q2", actual_output="A2"),
])
results = dataset.evaluate([AnswerRelevancyMetric()])
```
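To keep batch runs inside pytest, the same test cases can also be parametrized so each one reports as its own passing or failing test. A minimal sketch, using the same placeholder cases as above:

```python
import pytest
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Placeholder cases; in practice, load these from your evaluation dataset
CASES = [
    LLMTestCase(input="Q1", actual_output="A1"),
    LLMTestCase(input="Q2", actual_output="A2"),
]


@pytest.mark.parametrize("test_case", CASES)
def test_answer_relevancy_batch(test_case: LLMTestCase):
    assert_test(test_case, [AnswerRelevancyMetric(threshold=0.7)])
```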
## Standard Benchmarks

| Benchmark | What It Tests | Tasks | Top Scores |
|---|---|---|---|
| MMLU | Knowledge breadth | 57 subjects (STEM, humanities, etc.) | GPT-4: 86.4% |
| HumanEval | Code generation | 164 Python problems | GPT-4: 67% |
| GSM8K | Math reasoning | 8.5K grade school problems | GPT-4: 92% |
| HellaSwag | Commonsense reasoning | Sentence completion | GPT-4: 95.3% |
| TruthfulQA | Truthfulness | 817 questions | Claude 3: 89% |
| MBPP | Basic Python | 974 problems | GPT-4: 80% |
| ARC | Science reasoning | 7.7K questions | GPT-4: 96.3% |
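Most of these benchmarks (MMLU, HellaSwag, ARC) boil down to multiple-choice accuracy: prompt the model with the question and options, parse out a letter, and compare it to the gold answer. Below is a rough sketch of that loop, reusing the OpenAI `client` from earlier; the `ask_model` helper and the item format are hypothetical stand-ins for a real benchmark harness, which would typically also handle few-shot prompts and more robust answer parsing.

```python
def ask_model(question: str, choices: list[str]) -> str:
    """Hypothetical helper: ask the model and return its chosen letter (A-D)."""
    options = "\n".join(f"{letter}. {text}" for letter, text in zip("ABCD", choices))
    prompt = f"{question}\n{options}\nAnswer with a single letter."
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content.strip()[:1].upper()


def multiple_choice_accuracy(items: list[dict]) -> float:
    """items use a hypothetical format: {"question": ..., "choices": [...], "answer": "A"}."""
    correct = sum(
        ask_model(item["question"], item["choices"]) == item["answer"]
        for item in items
    )
    return correct / len(items)
```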
## Building an Evaluation Pipeline

```python
import json
import time
from dataclasses import dataclass
from typing import Callable

import pandas as pd


@dataclass
class EvalCase:
    id: str
    input: str
    expected: str | None = None
    context: list[str] | None = None
    metadata: dict | None = None


@dataclass
class EvalResult:
    case_id: str
    output: str
    scores: dict
    latency_ms: float
    tokens_used: int


class LLMEvaluator:
    """Comprehensive LLM evaluation pipeline."""

    def __init__(self, model_fn: Callable, judge_model: str = "gpt-4-turbo-preview"):
        self.model_fn = model_fn
        self.judge_model = judge_model
        self.results = []

    def evaluate_case(self, case: EvalCase) -> EvalResult:
        """Evaluate a single test case."""
        start = time.time()
        output, tokens = self.model_fn(case.input)
        latency = (time.time() - start) * 1000

        # Calculate scores
        scores = {}

        # Exact match (if expected provided)
        if case.expected:
            scores["exact_match"] = 1.0 if output.strip() == case.expected.strip() else 0.0
            scores["contains_answer"] = 1.0 if case.expected.lower() in output.lower() else 0.0

        # LLM judge scores
        judge_result = self._llm_judge(case.input, output, case.context)
        scores.update(judge_result)

        return EvalResult(
            case_id=case.id,
            output=output,
            scores=scores,
            latency_ms=latency,
            tokens_used=tokens
        )

    def _llm_judge(self, question: str, answer: str, context: list | None = None) -> dict:
        """Get LLM judge scores."""
        # Placeholder: plug in the llm_judge implementation from earlier
        return {"relevance": 4.0, "coherence": 4.5, "accuracy": 4.0}

    def run_evaluation(self, cases: list[EvalCase]) -> pd.DataFrame:
        """Run evaluation on all cases."""
        results = []
        for case in cases:
            result = self.evaluate_case(case)
            self.results.append(result)
            results.append({
                "id": result.case_id,
                "output": result.output[:100] + "...",
                **result.scores,
                "latency_ms": result.latency_ms,
                "tokens": result.tokens_used
            })
        df = pd.DataFrame(results)
        return df

    def summary(self) -> dict:
        """Generate evaluation summary."""
        if not self.results:
            return {}
        all_scores = {}
        for result in self.results:
            for metric, score in result.scores.items():
                if metric not in all_scores:
                    all_scores[metric] = []
                all_scores[metric].append(score)
        return {
            "total_cases": len(self.results),
            "avg_latency_ms": sum(r.latency_ms for r in self.results) / len(self.results),
            "avg_tokens": sum(r.tokens_used for r in self.results) / len(self.results),
            "metrics": {k: sum(v) / len(v) for k, v in all_scores.items()}
        }


# Usage
def my_model(prompt: str) -> tuple[str, int]:
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content, response.usage.total_tokens


evaluator = LLMEvaluator(my_model)

cases = [
    EvalCase(id="1", input="What is 2+2?", expected="4"),
    EvalCase(id="2", input="Capital of Japan?", expected="Tokyo"),
]

results_df = evaluator.run_evaluation(cases)
print(evaluator.summary())
```
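The same pipeline makes head-to-head comparison straightforward: wrap a second model function, run it over the identical cases, and compare the summaries. A sketch, using `gpt-3.5-turbo` as an assumed baseline and writing both runs to CSV for later inspection:

```python
def baseline_model(prompt: str) -> tuple[str, int]:
    """Assumed baseline: a cheaper model evaluated on the same cases."""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content, response.usage.total_tokens


baseline_evaluator = LLMEvaluator(baseline_model)
baseline_df = baseline_evaluator.run_evaluation(cases)

# Persist both runs so regressions are easy to spot over time
results_df.to_csv("eval_gpt4.csv", index=False)
baseline_df.to_csv("eval_baseline.csv", index=False)

print("GPT-4:", evaluator.summary())
print("Baseline:", baseline_evaluator.summary())
```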
## References

- RAGAS: https://docs.ragas.io/
- DeepEval: https://docs.confident-ai.com/
- LangSmith: https://docs.smith.langchain.com/
- MMLU Benchmark: https://github.com/hendrycks/test
- HumanEval: https://github.com/openai/human-eval
## Conclusion

LLM evaluation is both an art and a science. Traditional metrics like BLEU and ROUGE provide quick, reproducible scores but miss nuances that matter for real applications. LLM-as-judge approaches offer more sophisticated evaluation but introduce their own biases and costs. The best strategy combines multiple approaches: use automated metrics for fast iteration, LLM judges for nuanced quality assessment, and human evaluation for critical decisions. For RAG systems, tools like RAGAS provide specialized metrics for retrieval and generation quality. Build evaluation into your development workflow from the start; it’s much harder to add later. Remember that the goal isn’t perfect scores on benchmarks, but building systems that reliably help users accomplish their goals.