Evaluating and Debugging AI Agents

Trajectory scoring, tool-call accuracy, LLM-as-judge, pass@k, and debugging strategies

Open In Colab

📖 Read the full article


Table of Contents

  1. Setup
  2. Answer Quality: Exact, Fuzzy & Semantic Match
  3. LLM-as-Judge
  4. Trajectory Scoring
  5. Tool-Call Accuracy
  6. Pass@k Metric

1. Setup
# Install the LangChain OpenAI integration (notebook shell command).
!pip install -q langchain-openai
import os
# Provide your OpenAI API key before running the LLM-as-judge section below:
# os.environ["OPENAI_API_KEY"] = "your-key"

2. Answer Quality: Exact, Fuzzy & Semantic Match

from difflib import SequenceMatcher


def exact_match(predicted: str, reference: str) -> bool:
    """Return True when both answers are identical after stripping and lowercasing."""
    left = predicted.strip().lower()
    right = reference.strip().lower()
    return left == right


def fuzzy_match(predicted: str, reference: str, threshold: float = 0.8) -> float:
    """Case-insensitive similarity ratio between two strings.

    Uses ``difflib.SequenceMatcher``, whose ``ratio()`` is in [0.0, 1.0].

    Args:
        predicted: Candidate answer.
        reference: Gold answer.
        threshold: Retained for API compatibility; NOT applied inside this
            function. Compare the returned ratio against it at the call
            site, e.g. ``fuzzy_match(p, r) >= threshold``.

    Returns:
        Similarity ratio in [0.0, 1.0].
    """
    return SequenceMatcher(None, predicted.lower(), reference.lower()).ratio()


def f1_token_score(predicted: str, reference: str) -> float:
    """Token-level F1 score (SQuAD-style).

    Tokens are lowercased, whitespace-split words; duplicates are ignored
    because both sides are reduced to sets.
    """
    predicted_tokens = set(predicted.lower().split())
    reference_tokens = set(reference.lower().split())

    # No tokens on either side means no credit can be given.
    if not predicted_tokens or not reference_tokens:
        return 0.0

    overlap = len(predicted_tokens & reference_tokens)
    precision = overlap / len(predicted_tokens)
    recall = overlap / len(reference_tokens)

    # Guard against 0/0 when there is no overlap at all.
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)


# Demo: word order differs, so exact match fails while token F1 is perfect.
sample_prediction = "The capital of France is Paris."
sample_reference = "Paris is the capital of France."

print(f"Exact match: {exact_match(sample_prediction, sample_reference)}")
print(f"Fuzzy match: {fuzzy_match(sample_prediction, sample_reference):.2f}")
print(f"Token F1: {f1_token_score(sample_prediction, sample_reference):.2f}")

3. LLM-as-Judge

Use an LLM to evaluate answer quality with structured scoring.

from langchain_openai import ChatOpenAI
import json

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


def _extract_json(text: str):
    """Best-effort extraction of the outermost {...} JSON object in *text*.

    Returns the parsed dict, or None when no parseable object is found.
    """
    start = text.find("{")
    end = text.rfind("}") + 1
    if start < 0 or end <= start:
        return None
    try:
        return json.loads(text[start:end])
    except json.JSONDecodeError:
        return None


def llm_judge_answer(question: str, predicted: str, reference: str) -> dict:
    """LLM-based evaluation with structured output.

    Args:
        question: The question being answered.
        predicted: The model's answer under evaluation.
        reference: The gold/reference answer.

    Returns:
        The judge's parsed JSON scores (correctness, completeness,
        relevance, reasoning), or ``{"raw_response": ...}`` when the
        reply does not contain valid JSON.
    """
    response = llm.invoke([{
        "role": "system",
        "content": """You are an evaluation judge. Score the predicted answer vs the reference.
Return JSON with:
- correctness: 0-5 (factual accuracy)
- completeness: 0-5 (coverage of key points)
- relevance: 0-5 (stays on topic)
- reasoning: brief explanation""",
    }, {
        "role": "user",
        "content": f"Question: {question}\nPredicted: {predicted}\nReference: {reference}",
    }])

    parsed = _extract_json(response.content)
    if parsed is not None:
        return parsed
    # Fall back to the raw text so the caller can inspect what went wrong.
    return {"raw_response": response.content}


# Demo: judge a correct-but-terser answer against a fuller reference.
judge_result = llm_judge_answer(
    question="What is RAG?",
    predicted="RAG stands for Retrieval-Augmented Generation. It combines document retrieval with LLM generation.",
    reference="Retrieval-Augmented Generation (RAG) is a technique that enhances LLM outputs by retrieving relevant documents from a knowledge base before generating a response.",
)

print("LLM Judge Result:")
for key, value in judge_result.items():
    print(f"  {key}: {value}")

4. Trajectory Scoring

Evaluate the process, not just the final answer — was the sequence of steps efficient and correct?

from dataclasses import dataclass
from typing import Optional


@dataclass
class TrajectoryStep:
    """A single step in an agent's execution trace."""
    action: str  # e.g., "search", "calculate", "respond"
    input_text: str  # what was fed into the action
    output_text: str  # what the action produced
    # NOTE(review): the two labels below are not read by score_trajectory;
    # presumably reserved for manual/LLM step labeling — None = unlabeled.
    is_correct: Optional[bool] = None
    is_necessary: Optional[bool] = None


@dataclass
class StepScore:
    """Per-step quality scores, each in [0, 1].

    NOTE(review): not consumed anywhere in this file — appears to be a
    schema for per-step grading alongside the trajectory-level metrics.
    """
    correctness: float  # 0-1
    necessity: float  # 0-1 (was this step needed?)
    efficiency: float  # 0-1 (was this the best action?)


def score_trajectory(steps: list[TrajectoryStep], expected_steps: list[str]) -> dict:
    """Score an agent trajectory against expected behavior.

    Compares the actions the agent actually took with the expected ones
    along three axes: tool choice (precision/recall over distinct actions),
    step count (efficiency), and ordering.
    """
    performed = [step.action for step in steps]
    n_performed = max(len(performed), 1)

    # Tool choice: distinct-action overlap (did it pick the right tools?).
    chosen = set(performed)
    wanted = set(expected_steps)
    action_precision = len(chosen & wanted) / max(len(chosen), 1)
    action_recall = len(chosen & wanted) / max(len(wanted), 1)

    # Efficiency: capped at 1.0; extra steps beyond expected are penalized.
    efficiency = min(1.0, len(expected_steps) / n_performed)

    # Ordering: each expected action that occurred scores by how far its
    # first occurrence drifted from its expected position.
    drift_scores = [
        1.0 - abs(position - performed.index(action)) / n_performed
        for position, action in enumerate(expected_steps)
        if action in performed
    ]
    order_score = sum(drift_scores) / max(len(expected_steps), 1)

    return {
        "action_precision": round(action_precision, 2),
        "action_recall": round(action_recall, 2),
        "efficiency": round(efficiency, 2),
        "order_score": round(order_score, 2),
        "total_steps": len(steps),
        "expected_steps": len(expected_steps),
        "actual_actions": performed,
    }


# Demo: trajectory with one redundant search before summarize/respond.
demo_trace = [
    TrajectoryStep("search", "RAG architectures", "Found 5 documents"),
    TrajectoryStep("search", "RAG performance", "Found 3 papers"),  # redundant
    TrajectoryStep("summarize", "Combine findings", "RAG improves accuracy..."),
    TrajectoryStep("respond", "Generate answer", "RAG is a technique that..."),
]

expected_actions = ["search", "summarize", "respond"]

trajectory_scores = score_trajectory(demo_trace, expected_actions)
print("Trajectory Scores:")
for metric, value in trajectory_scores.items():
    print(f"  {metric}: {value}")

5. Tool-Call Accuracy

Measure whether the agent called the right tools with correct arguments.

def evaluate_tool_calls(
    actual_calls: list[dict],
    expected_calls: list[dict],
) -> dict:
    """Compare actual vs expected tool calls.

    Each call: {"tool": "name", "args": {"key": "value"}}

    Returns:
        Dict with ``tool_precision``, ``tool_recall``, ``argument_accuracy``
        (all rounded to 2 decimals on the main path), plus the actual and
        expected tool-name lists for inspection. The same keys are returned
        when ``expected_calls`` is empty, so callers can treat both paths
        uniformly.
    """
    from collections import Counter

    actual_tools = [c["tool"] for c in actual_calls]
    expected_tools = [c["tool"] for c in expected_calls]

    if not expected_calls:
        # Nothing was expected: any call is a false positive.
        return {
            "tool_precision": 1.0 if not actual_calls else 0.0,
            "tool_recall": 1.0,
            "argument_accuracy": 1.0,
            "actual_tools": actual_tools,
            "expected_tools": expected_tools,
        }

    # Multiset intersection: a tool repeated in actual_calls only counts as
    # many times as it appears in expected_calls. This keeps recall <= 1.0
    # and stops duplicate calls from inflating precision.
    correct_tools = sum((Counter(actual_tools) & Counter(expected_tools)).values())
    tool_precision = correct_tools / max(len(actual_tools), 1)
    tool_recall = correct_tools / max(len(expected_tools), 1)

    # Argument accuracy for expected calls that have a same-named actual call.
    arg_scores = []
    for exp in expected_calls:
        matches = [a for a in actual_calls if a["tool"] == exp["tool"]]
        if matches:
            candidate = matches[0]  # first same-named call; no arg-based pairing
            exp_args = set(exp.get("args", {}).items())
            act_args = set(candidate.get("args", {}).items())
            arg_scores.append(
                len(exp_args & act_args) / len(exp_args) if exp_args else 1.0
            )

    arg_accuracy = sum(arg_scores) / max(len(arg_scores), 1)

    return {
        "tool_precision": round(tool_precision, 2),
        "tool_recall": round(tool_recall, 2),
        "argument_accuracy": round(arg_accuracy, 2),
        "actual_tools": actual_tools,
        "expected_tools": expected_tools,
    }


# Demo: agent got the search right but called calculate instead of summarize.
observed_calls = [
    {"tool": "search", "args": {"query": "RAG architecture", "top_k": 5}},
    {"tool": "calculate", "args": {"expression": "0.85 * 100"}},
]

reference_calls = [
    {"tool": "search", "args": {"query": "RAG architecture", "top_k": 5}},
    {"tool": "summarize", "args": {"text": "..."}},
]

tool_call_report = evaluate_tool_calls(observed_calls, reference_calls)
print("Tool-Call Evaluation:")
for metric, value in tool_call_report.items():
    print(f"  {metric}: {value}")

6. Pass@k Metric

Probability that at least one of k samples is correct (from Codex/HumanEval).

import math
from functools import reduce


def compute_pass_at_k(n: int, c: int, k: int) -> float:
    """Compute the unbiased pass@k estimator (Chen et al., HumanEval).

    pass@k = 1 - C(n - c, k) / C(n, k): the probability that at least one
    of k samples drawn without replacement from the n generations is correct.

    Args:
        n: Total number of samples generated
        c: Number of correct samples
        k: Number of samples to consider

    Returns:
        Probability at least one of k samples is correct.
    """
    if n - c < k:
        # Fewer than k incorrect samples exist, so every k-subset
        # necessarily contains at least one correct sample.
        return 1.0

    # math.comb uses exact integer arithmetic, so the ratio carries none of
    # the rounding error of a log-space product of k terms.
    return 1.0 - math.comb(n - c, k) / math.comb(n, k)


# Demo: pass@k rises with k for a fixed pool of correct samples.
print("pass@k for n=20 samples, c=5 correct:")
for sample_budget in (1, 3, 5, 10):
    rate = compute_pass_at_k(n=20, c=5, k=sample_budget)
    print(f"  pass@{sample_budget}: {rate:.3f}")

# Demo: pass@1 grows linearly with the fraction of correct samples.
print("\npass@1 for varying correctness (n=10):")
for num_correct in range(11):
    rate = compute_pass_at_k(n=10, c=num_correct, k=1)
    histogram_bar = "█" * int(rate * 30)
    print(f"  c={num_correct:2d}: {rate:.3f} {histogram_bar}")