Evaluating RAG Systems

Metrics, frameworks, and automated evaluation for retrieval quality, generation faithfulness, and end-to-end RAG performance with RAGAS, DeepEval, and LangSmith

Open In Colab

πŸ“– Read the full article


Table of Contents

  1. Setup & Installation
  2. RAG Evaluation Taxonomy
  3. Build a Sample RAG Pipeline
  4. Retrieval Metrics
  5. Generation Metrics
  6. RAGAS Framework
  7. DeepEval Framework
  8. Custom Evaluation with LLM-as-Judge

1. Setup & Installation

# Install the evaluation stack (IPython shell magic; needs network access).
!pip install -q ragas deepeval langchain langchain-openai langchain-community faiss-cpu langchain-text-splitters
import os
# Every OpenAI-backed cell below requires this key in the environment.
# os.environ["OPENAI_API_KEY"] = "your-api-key-here"  # Uncomment and set

2. RAG Evaluation Taxonomy

RAG evaluation decomposes into three families: retrieval metrics, generation metrics, and end-to-end metrics.

# Taxonomy of RAG evaluation metrics, grouped by the pipeline stage they probe.
evaluation_taxonomy = {
    "Retrieval Metrics": {
        "Context Precision": "Are the top-ranked retrieved docs relevant?",
        "Context Recall": "Does the retrieved context cover the ground truth?",
        "Recall@k": "Fraction of relevant docs in top-k",
        "MRR": "Reciprocal rank of first relevant document",
    },
    "Generation Metrics": {
        "Faithfulness": "Is the answer grounded in retrieved context?",
        "Answer Relevancy": "Does the answer address the question?",
        "Hallucination Rate": "Fraction of claims not supported by context",
    },
    "End-to-End Metrics": {
        "Answer Correctness": "Is the answer factually correct?",
        "Semantic Similarity": "Does the answer mean the same as reference?",
    },
}

# Render the taxonomy as an indented summary (one category per section).
summary_lines = []
for category, metric_map in evaluation_taxonomy.items():
    summary_lines.append(f"\n{category}:")
    summary_lines.extend(f"  {name}: {description}" for name, description in metric_map.items())
print("\n".join(summary_lines))

3. Build a Sample RAG Pipeline

Create a simple RAG pipeline for evaluation testing.

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Sample knowledge base: three docs about RAG (relevant to the test queries
# used below) plus two distractors (RLHF, fine-tuning) so retrieval is
# non-trivial and ranking quality is measurable.
knowledge_base = [
    Document(page_content="RAG (Retrieval-Augmented Generation) grounds LLM responses in factual documents, reducing hallucinations. It retrieves relevant context before generation."),
    Document(page_content="RAG enables real-time knowledge updates without retraining the LLM. New documents can be added to the vector store at any time."),
    Document(page_content="The key components of RAG are: document loading, chunking, embedding, vector storage, retrieval, and generation."),
    Document(page_content="RLHF aligns LLMs with human preferences using a reward model and reinforcement learning."),
    Document(page_content="Fine-tuning adapts a pre-trained model to specific tasks using labeled training data."),
]

# Embed and index the docs; makes OpenAI API calls, so OPENAI_API_KEY must be set.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = FAISS.from_documents(knowledge_base, embeddings)
# Top-3 similarity search per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# temperature=0 keeps generations deterministic, which matters for repeatable evaluation.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
prompt = ChatPromptTemplate.from_template(
    "Answer the question based only on the context.\n\nContext:\n{context}\n\nQuestion: {question}"
)

def format_docs(docs):
    """Concatenate retrieved documents into one context string.

    Each document's ``page_content`` is separated by a blank line so the
    LLM can see document boundaries.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# LCEL pipeline: retrieve docs -> format into context string -> fill prompt
# -> LLM -> plain string answer. The question passes through unchanged.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt | llm | StrOutputParser()
)

# Smoke-test the pipeline end to end (makes an OpenAI API call).
answer = rag_chain.invoke("What are the benefits of RAG?")
print(f"Answer: {answer}")

4. Retrieval Metrics

Measuring whether the retriever finds the right documents.

# Manual retrieval evaluation
def compute_recall_at_k(retrieved_docs, relevant_doc_ids, k):
    """Compute Recall@k: fraction of relevant docs found in the top-k retrieved.

    Keeps the original simplification that documents are identified by their
    *position* in ``retrieved_docs`` (the doc at index i has id i), so
    ``relevant_doc_ids`` is expected to hold positional ids.

    Args:
        retrieved_docs: Ranked list of retrieved documents.
        relevant_doc_ids: Iterable of ids (positions) of the relevant docs.
        k: Rank cutoff.

    Returns:
        A float in [0, 1]; 0.0 when there are no relevant docs.
    """
    relevant = set(relevant_doc_ids)
    if not relevant:
        return 0.0
    # Fix: cap k at the number of docs actually retrieved. The original used
    # set(range(k)) unconditionally, which over-counted whenever
    # k > len(retrieved_docs) (it "found" docs that were never retrieved).
    retrieved_ids = set(range(min(k, len(retrieved_docs))))
    return len(retrieved_ids & relevant) / len(relevant)


def compute_mrr(retrieved_docs, relevant_doc_contents):
    """Return the reciprocal rank of the first relevant retrieved document.

    Ranks are 1-indexed: a hit at the top of the list scores 1.0, at
    position 2 scores 0.5, and so on. Returns 0.0 when no retrieved
    document's ``page_content`` appears in ``relevant_doc_contents``.
    """
    for rank, doc in enumerate(retrieved_docs, start=1):
        if doc.page_content in relevant_doc_contents:
            return 1.0 / rank
    return 0.0


# Exercise the retriever on a sample query and report ranking quality.
query = "What are the benefits of RAG?"
retrieved = retriever.invoke(query)

# Ground truth: the two knowledge-base documents that answer this question.
relevant_contents = [
    "RAG (Retrieval-Augmented Generation) grounds LLM responses in factual documents, reducing hallucinations. It retrieves relevant context before generation.",
    "RAG enables real-time knowledge updates without retraining the LLM. New documents can be added to the vector store at any time.",
]

mrr = compute_mrr(retrieved, relevant_contents)
print(f"Query: {query}")
print(f"MRR: {mrr:.2f}")
print("\nRetrieved documents:")
for rank, doc in enumerate(retrieved, start=1):
    marker = '[RELEVANT]' if doc.page_content in relevant_contents else '[         ]'
    print(f"  {rank}. {marker} {doc.page_content[:80]}...")

5. Generation Metrics

Measuring faithfulness (grounding) and answer relevancy.

# Manual faithfulness check: decompose answer into claims, verify each
def check_faithfulness_manual(answer, context, llm):
    """Ask an LLM judge to verify, claim by claim, that the answer is grounded.

    Returns the judge's raw text response: one CLAIM/SUPPORTED pair per
    factual claim found in ``answer``.
    """
    judge_prompt = ChatPromptTemplate.from_template(
        "Given the context and answer below, list each factual claim in the answer "
        "and whether it is supported by the context.\n\n"
        "Context:\n{context}\n\nAnswer:\n{answer}\n\n"
        "For each claim, respond with:\n"
        "CLAIM: <claim>\nSUPPORTED: yes/no\n"
    )
    judge_chain = judge_prompt | llm | StrOutputParser()
    return judge_chain.invoke({"context": context, "answer": answer})


# Run the full loop once: retrieve context, generate an answer, then judge it.
query = "What are the benefits of RAG?"
context_docs = retriever.invoke(query)
context = "\n".join(d.page_content for d in context_docs)
answer = rag_chain.invoke(query)

print(f"Answer: {answer}\n")
faithfulness_report = check_faithfulness_manual(answer, context, llm)
print("Faithfulness Report:")
print(faithfulness_report)

6. RAGAS Framework

The most widely-used open-source RAG evaluation framework.

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas import EvaluationDataset, SingleTurnSample

# One hand-labelled sample: the user question, the system's response, the
# contexts that were fed to the LLM, and a reference (ground-truth) answer
# needed by reference-based metrics such as context_recall.
rag_sample = SingleTurnSample(
    user_input="What are the benefits of RAG?",
    response="RAG reduces hallucinations by grounding answers in retrieved documents and enables real-time knowledge updates.",
    retrieved_contexts=[
        "RAG grounds LLM responses in factual documents, reducing hallucinations.",
        "RAG enables real-time knowledge updates without retraining.",
    ],
    reference="RAG reduces hallucinations and enables real-time knowledge updates.",
)
samples = [rag_sample]
dataset = EvaluationDataset(samples=samples)

# Score the dataset on both retrieval- and generation-side RAGAS metrics.
results = evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)

print("RAGAS Evaluation Results:")
print(results)

7. DeepEval Framework

Comprehensive LLM evaluation with 50+ metrics and Pytest integration.

from deepeval import evaluate as deepeval_evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
)

# A single test case pairing the model's output with its retrieval context.
test_case = LLMTestCase(
    input="What are the benefits of RAG?",
    actual_output="RAG reduces hallucinations by grounding answers in retrieved documents.",
    retrieval_context=[
        "RAG grounds LLM responses in factual documents, reducing hallucinations.",
        "RAG enables real-time knowledge updates without retraining.",
    ],
)

# Both metrics use an LLM judge; a score below the threshold fails the case.
judge_model = "gpt-4o"
metrics = [
    AnswerRelevancyMetric(threshold=0.7, model=judge_model),
    FaithfulnessMetric(threshold=0.7, model=judge_model),
]

# Run evaluation (prints a pass/fail report per metric).
deepeval_evaluate(test_cases=[test_case], metrics=metrics)

8. Custom Evaluation with LLM-as-Judge

Build a custom evaluation pipeline when frameworks don’t fit your needs.

from pydantic import BaseModel, Field


class EvalResult(BaseModel):
    """Structured verdict returned by the LLM judge for one RAG output."""

    faithfulness_score: float = Field(description="Score from 0 to 1")
    relevancy_score: float = Field(description="Score from 0 to 1")
    reasoning: str = Field(description="Explanation of scores")


# Judge LLM with structured output: responses are parsed directly into
# EvalResult, so scores come back as machine-readable floats, not free text.
# temperature=0 keeps the judge deterministic.
eval_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0).with_structured_output(EvalResult)

eval_prompt = ChatPromptTemplate.from_template(
    "Evaluate the RAG system output:\n\n"
    "Question: {question}\n"
    "Retrieved Context: {context}\n"
    "Generated Answer: {answer}\n\n"
    "Score faithfulness (is the answer grounded in context?) from 0-1.\n"
    "Score relevancy (does the answer address the question?) from 0-1.\n"
    "Provide brief reasoning."
)

# question/context/answer in -> EvalResult out.
eval_chain = eval_prompt | eval_llm

# Evaluate multiple queries
eval_queries = [
    "What are the benefits of RAG?",
    "What are the key components of RAG?",
    "How does fine-tuning work?",
]

print("Custom LLM-as-Judge Evaluation:")
print("=" * 60)
for q in eval_queries:
    context_docs = retriever.invoke(q)
    context = "\n".join(doc.page_content for doc in context_docs)
    answer = rag_chain.invoke(q)
    
    result = eval_chain.invoke({
        "question": q, "context": context, "answer": answer
    })
    
    print(f"\nQ: {q}")
    print(f"A: {answer}")
    print(f"Faithfulness: {result.faithfulness_score:.2f}")
    print(f"Relevancy: {result.relevancy_score:.2f}")
    print(f"Reasoning: {result.reasoning}")