!pip install -q ragas deepeval langchain langchain-openai langchain-community faiss-cpu langchain-text-splitters
Evaluating RAG Systems
Metrics, frameworks, and automated evaluation for retrieval quality, generation faithfulness, and end-to-end RAG performance with RAGAS, DeepEval, and LangSmith
Table of Contents
1. Setup & Installation
import os
# os.environ["OPENAI_API_KEY"] = "your-api-key-here" # Uncomment and set
2. RAG Evaluation Taxonomy
RAG evaluation decomposes into retrieval metrics and generation metrics.
# Taxonomy of RAG evaluation metrics, grouped by which pipeline stage they
# measure: the retriever, the generator, or the system end-to-end.
evaluation_taxonomy = {
    "Retrieval Metrics": {
        "Context Precision": "Are the top-ranked retrieved docs relevant?",
        "Context Recall": "Does the retrieved context cover the ground truth?",
        "Recall@k": "Fraction of relevant docs in top-k",
        "MRR": "Reciprocal rank of first relevant document",
    },
    "Generation Metrics": {
        "Faithfulness": "Is the answer grounded in retrieved context?",
        "Answer Relevancy": "Does the answer address the question?",
        "Hallucination Rate": "Fraction of claims not supported by context",
    },
    "End-to-End Metrics": {
        "Answer Correctness": "Is the answer factually correct?",
        "Semantic Similarity": "Does the answer mean the same as reference?",
    },
}
# Print the taxonomy, one category per block.
for category, metrics in evaluation_taxonomy.items():
    print(f"\n{category}:")
    for name, description in metrics.items():
        print(f"  {name}: {description}")
3. Build a Sample RAG Pipeline
Create a simple RAG pipeline for evaluation testing.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# Sample knowledge base: three documents about RAG (relevant to the test
# queries below) plus two distractors (RLHF, fine-tuning) so the retriever
# has something off-topic to rank against.
knowledge_base = [
    Document(page_content="RAG (Retrieval-Augmented Generation) grounds LLM responses in factual documents, reducing hallucinations. It retrieves relevant context before generation."),
    Document(page_content="RAG enables real-time knowledge updates without retraining the LLM. New documents can be added to the vector store at any time."),
    Document(page_content="The key components of RAG are: document loading, chunking, embedding, vector storage, retrieval, and generation."),
    Document(page_content="RLHF aligns LLMs with human preferences using a reward model and reinforcement learning."),
    Document(page_content="Fine-tuning adapts a pre-trained model to specific tasks using labeled training data."),
]
# Embed and index the corpus; the retriever returns the top-3 docs per query.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = FAISS.from_documents(knowledge_base, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
# temperature=0 for deterministic generation during evaluation runs.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
prompt = ChatPromptTemplate.from_template(
    "Answer the question based only on the context.\n\nContext:\n{context}\n\nQuestion: {question}"
)
def format_docs(docs):
    """Join the page contents of *docs* into a single context string,
    with a blank line between documents."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
# LCEL pipeline: retrieve docs -> join into a context string, pass the raw
# question through, fill the prompt, call the LLM, and return a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt | llm | StrOutputParser()
)
# Test
answer = rag_chain.invoke("What are the benefits of RAG?")
print(f"Answer: {answer}")
4. Retrieval Metrics
Measuring whether the retriever finds the right documents.
# Manual retrieval evaluation
def compute_recall_at_k(retrieved_docs, relevant_doc_ids, k):
    """Compute Recall@k: fraction of relevant docs found in top-k.

    Uses simplified positional matching: the positions 0..top_k-1 of
    `retrieved_docs` are treated as the retrieved ids and intersected with
    `relevant_doc_ids`.

    Args:
        retrieved_docs: Ranked list of retrieved documents.
        relevant_doc_ids: Collection of relevant positional ids.
        k: Cutoff rank.

    Returns:
        float in [0, 1]; 0.0 when `relevant_doc_ids` is empty.
    """
    if not relevant_doc_ids:
        return 0.0
    # Bug fix: cap at the number of docs actually retrieved. The original
    # used set(range(k)) unconditionally, so when fewer than k docs came
    # back it counted never-retrieved positions as hits, inflating recall.
    top_k = min(k, len(retrieved_docs))
    retrieved_ids = set(range(top_k))  # simplified: use positions
    relevant_found = len(retrieved_ids.intersection(relevant_doc_ids))
    return relevant_found / len(relevant_doc_ids)
def compute_mrr(retrieved_docs, relevant_doc_contents):
    """Return the reciprocal rank (1-based) of the first retrieved document
    whose page_content appears in *relevant_doc_contents*, or 0.0 if none
    of the retrieved documents are relevant."""
    reciprocal_ranks = (
        1.0 / rank
        for rank, doc in enumerate(retrieved_docs, start=1)
        if doc.page_content in relevant_doc_contents
    )
    return next(reciprocal_ranks, 0.0)
# Test retrieval quality
query = "What are the benefits of RAG?"
retrieved = retriever.invoke(query)
# Ground-truth relevant documents for this query, matched by exact
# page_content string (must be byte-identical to the knowledge-base docs).
relevant_contents = [
    "RAG (Retrieval-Augmented Generation) grounds LLM responses in factual documents, reducing hallucinations. It retrieves relevant context before generation.",
    "RAG enables real-time knowledge updates without retraining the LLM. New documents can be added to the vector store at any time.",
]
mrr = compute_mrr(retrieved, relevant_contents)
print(f"Query: {query}")
print(f"MRR: {mrr:.2f}")
print(f"\nRetrieved documents:")
# List each retrieved doc, flagging the ones in the ground-truth set.
for i, doc in enumerate(retrieved):
    is_relevant = doc.page_content in relevant_contents
    print(f"  {i+1}. {'[RELEVANT]' if is_relevant else '[ ]'} {doc.page_content[:80]}...")
5. Generation Metrics
Measuring faithfulness (grounding) and answer relevancy.
# Manual faithfulness check: decompose answer into claims, verify each
def check_faithfulness_manual(answer, context, llm):
    """Ask *llm* to decompose *answer* into factual claims and label each
    claim as supported or unsupported by *context*.

    Returns the judge's raw text output (CLAIM/SUPPORTED lines).
    """
    template = (
        "Given the context and answer below, list each factual claim in the answer "
        "and whether it is supported by the context.\n\n"
        "Context:\n{context}\n\nAnswer:\n{answer}\n\n"
        "For each claim, respond with:\n"
        "CLAIM: <claim>\nSUPPORTED: yes/no\n"
    )
    judge_prompt = ChatPromptTemplate.from_template(template)
    judge_chain = judge_prompt | llm | StrOutputParser()
    return judge_chain.invoke({"context": context, "answer": answer})
# Test: run the faithfulness judge on a live RAG answer for one query.
query = "What are the benefits of RAG?"
context_docs = retriever.invoke(query)
context = "\n".join(doc.page_content for doc in context_docs)
answer = rag_chain.invoke(query)
print(f"Answer: {answer}\n")
faithfulness_report = check_faithfulness_manual(answer, context, llm)
print("Faithfulness Report:")
print(faithfulness_report)
6. RAGAS Framework
The most widely-used open-source RAG evaluation framework.
from ragas import evaluate
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_precision,
context_recall,
)
from ragas import EvaluationDataset, SingleTurnSample
# Create evaluation samples
# A SingleTurnSample bundles everything RAGAS needs for one interaction:
# the question, the generated response, the retrieved contexts, and a
# ground-truth reference answer.
samples = [
    SingleTurnSample(
        user_input="What are the benefits of RAG?",
        response="RAG reduces hallucinations by grounding answers in retrieved documents and enables real-time knowledge updates.",
        retrieved_contexts=[
            "RAG grounds LLM responses in factual documents, reducing hallucinations.",
            "RAG enables real-time knowledge updates without retraining.",
        ],
        reference="RAG reduces hallucinations and enables real-time knowledge updates.",
    )
]
dataset = EvaluationDataset(samples=samples)
# Evaluate
# Two retrieval metrics (context precision/recall) plus two generation
# metrics (faithfulness, answer relevancy).
results = evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)
print("RAGAS Evaluation Results:")
print(results)
7. DeepEval Framework
Comprehensive LLM evaluation with 50+ metrics and Pytest integration.
from deepeval import evaluate as deepeval_evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
AnswerRelevancyMetric,
FaithfulnessMetric,
)
# Define test case
# LLMTestCase mirrors one RAG interaction: the query, the model's output,
# and the contexts the retriever supplied.
test_case = LLMTestCase(
    input="What are the benefits of RAG?",
    actual_output="RAG reduces hallucinations by grounding answers in retrieved documents.",
    retrieval_context=[
        "RAG grounds LLM responses in factual documents, reducing hallucinations.",
        "RAG enables real-time knowledge updates without retraining.",
    ],
)
# Define metrics
# Each metric scores the test case with gpt-4o as judge and passes/fails
# against its 0.7 threshold.
metrics = [
    AnswerRelevancyMetric(threshold=0.7, model="gpt-4o"),
    FaithfulnessMetric(threshold=0.7, model="gpt-4o"),
]
# Run evaluation
deepeval_evaluate(test_cases=[test_case], metrics=metrics)
8. Custom Evaluation with LLM-as-Judge
Build a custom evaluation pipeline when frameworks don't fit your needs.
from pydantic import BaseModel, Field
# Structured judge output: two scores (expected in [0, 1] per the field
# descriptions) plus a free-text rationale.
class EvalResult(BaseModel):
    faithfulness_score: float = Field(description="Score from 0 to 1")
    relevancy_score: float = Field(description="Score from 0 to 1")
    reasoning: str = Field(description="Explanation of scores")
# Judge LLM constrained to emit an EvalResult via structured output;
# temperature=0 for repeatable scores.
eval_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0).with_structured_output(EvalResult)
eval_prompt = ChatPromptTemplate.from_template(
    "Evaluate the RAG system output:\n\n"
    "Question: {question}\n"
    "Retrieved Context: {context}\n"
    "Generated Answer: {answer}\n\n"
    "Score faithfulness (is the answer grounded in context?) from 0-1.\n"
    "Score relevancy (does the answer address the question?) from 0-1.\n"
    "Provide brief reasoning."
)
eval_chain = eval_prompt | eval_llm
# Evaluate multiple queries
# The fine-tuning query matches one of the distractor docs in the
# knowledge base, so it exercises the judge on a non-RAG topic too.
eval_queries = [
    "What are the benefits of RAG?",
    "What are the key components of RAG?",
    "How does fine-tuning work?",
]
print("Custom LLM-as-Judge Evaluation:")
print("=" * 60)
for q in eval_queries:
    # Retrieve, generate, then judge the (question, context, answer) triple.
    context_docs = retriever.invoke(q)
    context = "\n".join(doc.page_content for doc in context_docs)
    answer = rag_chain.invoke(q)
    result = eval_chain.invoke({
        "question": q, "context": context, "answer": answer
    })
    print(f"\nQ: {q}")
    print(f"A: {answer}")
    print(f"Faithfulness: {result.faithfulness_score:.2f}")
    print(f"Relevancy: {result.relevancy_score:.2f}")
    print(f"Reasoning: {result.reasoning}")