!pip install -q langchain-openai

Evaluating and Debugging AI Agents
Trajectory scoring, tool-call accuracy, LLM-as-judge, pass@k, and debugging strategies
Table of Contents
- Setup
- Answer Quality: Exact, Fuzzy & Semantic Match
- LLM-as-Judge
- Trajectory Scoring
- Tool-Call Accuracy
- Pass@k Metric
import os
# os.environ["OPENAI_API_KEY"] = "your-key"

2. Answer Quality: Exact, Fuzzy & Semantic Match
from difflib import SequenceMatcher
def exact_match(predicted: str, reference: str) -> bool:
    """Return True when the two answers are identical after trimming and lowercasing."""
    normalized_pred = predicted.strip().lower()
    normalized_ref = reference.strip().lower()
    return normalized_pred == normalized_ref
def fuzzy_match(predicted: str, reference: str, threshold: float = 0.8) -> float:
    """Return a difflib similarity ratio in [0, 1] between lowercased strings.

    NOTE(review): `threshold` is currently unused — the raw ratio is
    returned rather than a pass/fail decision; confirm whether callers
    expect thresholding here.
    """
    matcher = SequenceMatcher(None, predicted.lower(), reference.lower())
    return matcher.ratio()
def f1_token_score(predicted: str, reference: str) -> float:
    """Token-level F1 score (SQuAD-style).

    Fix: the previous version used `set` overlap, which deduplicates
    tokens and over-credits answers containing repeated words. The
    official SQuAD evaluation counts token overlap as a multiset
    (Counter intersection), which this now does. Results are unchanged
    for inputs with no repeated tokens.

    Args:
        predicted: The model's answer string.
        reference: The gold answer string.

    Returns:
        Harmonic mean of token precision and recall in [0, 1];
        0.0 when either side has no tokens or there is no overlap.
    """
    from collections import Counter  # local import: rest of file has per-cell imports

    pred_tokens = predicted.lower().split()
    ref_tokens = reference.lower().split()
    if not pred_tokens or not ref_tokens:
        return 0.0
    # Multiset overlap: each occurrence of a shared token counts once.
    common = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if common == 0:
        return 0.0
    precision = common / len(pred_tokens)
    recall = common / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)
# Demo: same fact phrased two ways — exact match fails, but the
# similarity-based metrics score high.
pred = "The capital of France is Paris."
ref = "Paris is the capital of France."
print(f"Exact match: {exact_match(pred, ref)}")
print(f"Fuzzy match: {fuzzy_match(pred, ref):.2f}")
print(f"Token F1: {f1_token_score(pred, ref):.2f}")

3. LLM-as-Judge
Use an LLM to evaluate answer quality with structured scoring.
from langchain_openai import ChatOpenAI
import json
# Shared judge model; temperature=0 for (near-)deterministic scoring.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
def llm_judge_answer(question: str, predicted: str, reference: str) -> dict:
    """LLM-based evaluation with structured output.

    Asks the judge model for a JSON verdict and best-effort parses the
    first `{...}` span out of the reply (tolerating surrounding prose or
    markdown fences). On parse failure, the raw reply text is returned
    under "raw_response" instead of raising.
    """
    system_prompt = """You are an evaluation judge. Score the predicted answer vs the reference.
Return JSON with:
- correctness: 0-5 (factual accuracy)
- completeness: 0-5 (coverage of key points)
- relevance: 0-5 (stays on topic)
- reasoning: brief explanation"""
    user_prompt = f"Question: {question}\nPredicted: {predicted}\nReference: {reference}"
    reply = llm.invoke([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ])
    text = reply.content
    # Locate the outermost brace pair; anything before/after is ignored.
    opening = text.find("{")
    closing = text.rfind("}") + 1
    if opening >= 0 and closing > opening:
        try:
            return json.loads(text[opening:closing])
        except json.JSONDecodeError:
            pass
    return {"raw_response": reply.content}
# Demo: judge a paraphrased definition of RAG against a reference answer.
result = llm_judge_answer(
    question="What is RAG?",
    predicted="RAG stands for Retrieval-Augmented Generation. It combines document retrieval with LLM generation.",
    reference="Retrieval-Augmented Generation (RAG) is a technique that enhances LLM outputs by retrieving relevant documents from a knowledge base before generating a response.",
)
print("LLM Judge Result:")
# Each scored dimension (or "raw_response" if JSON parsing failed).
for k, v in result.items():
    print(f"  {k}: {v}")

4. Trajectory Scoring
Evaluate the process, not just the final answer — was the sequence of steps efficient and correct?
from dataclasses import dataclass
from typing import Optional
@dataclass
class TrajectoryStep:
    """A single step in an agent's execution trace."""
    action: str  # e.g., "search", "calculate", "respond"
    input_text: str  # what was fed into the action (query, expression, ...)
    output_text: str  # what the action produced
    is_correct: Optional[bool] = None  # optional per-step label; None = unlabeled
    is_necessary: Optional[bool] = None  # optional per-step label; None = unlabeled
@dataclass
class StepScore:
    """Per-step quality scores, each in [0, 1].

    NOTE(review): defined but not referenced elsewhere in this file —
    confirm whether it is used by other cells or can be removed.
    """
    correctness: float  # 0-1
    necessity: float  # 0-1 (was this step needed?)
    efficiency: float  # 0-1 (was this the best action?)
def score_trajectory(steps: list[TrajectoryStep], expected_steps: list[str]) -> dict:
    """Score an agent trajectory against expected behavior.

    Compares the sequence of actions actually taken with the expected
    action list on four axes: tool choice (precision/recall over unique
    action names), step-count efficiency, and ordering.
    """
    actual_actions = [step.action for step in steps]
    unique_actual = set(actual_actions)
    unique_expected = set(expected_steps)
    overlap = len(unique_actual & unique_expected)

    # Did it pick the right tools? (set-level, so duplicates count once)
    action_precision = overlap / max(len(unique_actual), 1)
    action_recall = overlap / max(len(unique_expected), 1)

    # Fewer or equal steps caps at 1.0; extra steps shrink the ratio.
    efficiency = min(1.0, len(expected_steps) / max(len(actual_actions), 1))

    # Ordering: each expected action contributes based on how far its
    # first actual occurrence drifts from its expected position.
    denom = max(len(actual_actions), 1)
    order_total = 0.0
    for expected_pos, action in enumerate(expected_steps):
        if action in actual_actions:
            drift = abs(expected_pos - actual_actions.index(action))
            order_total += 1.0 - drift / denom
    order_score = order_total / max(len(expected_steps), 1)

    return {
        "action_precision": round(action_precision, 2),
        "action_recall": round(action_recall, 2),
        "efficiency": round(efficiency, 2),
        "order_score": round(order_score, 2),
        "total_steps": len(steps),
        "expected_steps": len(expected_steps),
        "actual_actions": actual_actions,
    }
# Demo: a trajectory with one redundant search step — efficiency and
# order_score are penalized while precision/recall stay perfect.
steps = [
    TrajectoryStep("search", "RAG architectures", "Found 5 documents"),
    TrajectoryStep("search", "RAG performance", "Found 3 papers"),  # Extra search
    TrajectoryStep("summarize", "Combine findings", "RAG improves accuracy..."),
    TrajectoryStep("respond", "Generate answer", "RAG is a technique that..."),
]
expected = ["search", "summarize", "respond"]
scores = score_trajectory(steps, expected)
print("Trajectory Scores:")
for k, v in scores.items():
    print(f"  {k}: {v}")

5. Tool-Call Accuracy
Measure whether the agent called the right tools with correct arguments.
def evaluate_tool_calls(
    actual_calls: list[dict],
    expected_calls: list[dict],
) -> dict:
    """Compare actual vs expected tool calls.

    Each call: {"tool": "name", "args": {"key": "value"}}

    Fix: the empty-`expected_calls` branch previously returned a dict
    with inconsistent keys ("precision"/"recall") and omitted
    "argument_accuracy", "actual_tools", and "expected_tools". Both
    branches now return the same schema.

    Returns:
        Dict with "tool_precision", "tool_recall", "argument_accuracy"
        (all rounded where computed), plus the raw tool-name lists.
    """
    actual_tools = [call["tool"] for call in actual_calls]
    expected_tools = [call["tool"] for call in expected_calls]

    if not expected_calls:
        # Nothing was expected: any call made is a false positive.
        return {
            "tool_precision": 1.0 if not actual_calls else 0.0,
            "tool_recall": 1.0,
            "argument_accuracy": 1.0,
            "actual_tools": actual_tools,
            "expected_tools": expected_tools,
        }

    # Tool-level precision/recall by name membership (each actual call is
    # counted correct if its name appears anywhere in the expected list).
    correct_tools = sum(1 for name in actual_tools if name in expected_tools)
    tool_precision = correct_tools / max(len(actual_tools), 1)
    tool_recall = correct_tools / max(len(expected_tools), 1)

    # Argument accuracy: for each expected call, compare against the first
    # actual call with the same tool name. NOTE(review): assumes argument
    # values are hashable (they are put into sets) — confirm for callers
    # passing nested dict/list args.
    arg_scores = []
    for expected in expected_calls:
        candidates = [a for a in actual_calls if a["tool"] == expected["tool"]]
        if not candidates:
            continue  # missing tool already penalized via recall
        expected_args = set(expected.get("args", {}).items())
        found_args = set(candidates[0].get("args", {}).items())
        if expected_args:
            arg_scores.append(len(expected_args & found_args) / len(expected_args))
        else:
            arg_scores.append(1.0)
    arg_accuracy = sum(arg_scores) / max(len(arg_scores), 1)

    return {
        "tool_precision": round(tool_precision, 2),
        "tool_recall": round(tool_recall, 2),
        "argument_accuracy": round(arg_accuracy, 2),
        "actual_tools": actual_tools,
        "expected_tools": expected_tools,
    }
# Demo: the agent called "calculate" where "summarize" was expected —
# precision and recall both drop to 0.5; args for the matched "search"
# call are exact, so argument accuracy stays 1.0.
actual = [
    {"tool": "search", "args": {"query": "RAG architecture", "top_k": 5}},
    {"tool": "calculate", "args": {"expression": "0.85 * 100"}},
]
expected = [
    {"tool": "search", "args": {"query": "RAG architecture", "top_k": 5}},
    {"tool": "summarize", "args": {"text": "..."}},
]
result = evaluate_tool_calls(actual, expected)
print("Tool-Call Evaluation:")
for k, v in result.items():
    print(f"  {k}: {v}")

6. Pass@k Metric
Probability that at least one of k samples is correct (from Codex/HumanEval).
import math
from functools import reduce
def compute_pass_at_k(n: int, c: int, k: int) -> float:
    """Compute the unbiased pass@k metric (Codex/HumanEval).

    pass@k = 1 - C(n-c, k) / C(n, k): the probability that at least one
    of k samples drawn without replacement from n generations is correct.

    Args:
        n: Total number of samples generated.
        c: Number of correct samples among them.
        k: Number of samples to consider.

    Returns:
        Probability at least one of k samples is correct.
    """
    if n - c < k:
        # Fewer incorrect samples than draws: a correct one is guaranteed.
        return 1.0
    # math.comb does exact integer arithmetic, so this is both simpler and
    # more accurate than the previous log-space product of factorials.
    return 1.0 - math.comb(n - c, k) / math.comb(n, k)
# Demo: pass@k rises with k when 5 of 20 samples are correct.
print("pass@k for n=20 samples, c=5 correct:")
for k in (1, 3, 5, 10):
    print(f"  pass@{k}: {compute_pass_at_k(n=20, c=5, k=k):.3f}")

# Demo: pass@1 is simply the fraction of correct samples.
print("\npass@1 for varying correctness (n=10):")
for c in range(11):
    score = compute_pass_at_k(n=10, c=c, k=1)
    bar = "█" * int(score * 30)
    print(f"  c={c:2d}: {score:.3f} {bar}")