Building a RAG Pipeline from Scratch

From document ingestion to answer generation: chunking strategies, embedding models, vector stores, retrieval, and LLM synthesis with LlamaIndex and LangChain

Table of Contents

  1. Setup & Installation
  2. Document Loading
  3. Chunking Strategies
  4. Embedding Models
  5. Vector Stores and Indexing
  6. Retrieval Strategies
  7. LLM Generation
  8. End-to-End RAG Pipeline

1. Setup & Installation

!pip install -q langchain langchain-openai langchain-community langchain-chroma langchain-text-splitters langchain-experimental llama-index llama-index-core llama-index-embeddings-openai llama-index-embeddings-huggingface llama-index-vector-stores-chroma faiss-cpu chromadb sentence-transformers
import os
# os.environ["OPENAI_API_KEY"] = "your-api-key-here"  # Uncomment and set
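If you'd rather not hard-code the key, a small sketch that prompts for it at runtime (assumes an interactive session):

import getpass
# Prompt for the key only if it isn't already set in the environment
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")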

2. Document Loading

Getting data into a structured Document format. The demo builds Documents in memory; a loader sketch for files on disk follows the LangChain example below.

# LlamaIndex: SimpleDirectoryReader
from llama_index.core import SimpleDirectoryReader, Document

# For demo, create sample documents in memory
sample_texts = [
    "The Transformer architecture was introduced by Vaswani et al. in 2017. "
    "It relies entirely on attention mechanisms, dispensing with recurrence and convolutions. "
    "The model uses self-attention to compute representations of input and output sequences.",
    
    "BERT (Bidirectional Encoder Representations from Transformers) was introduced by Google in 2018. "
    "It pre-trains deep bidirectional representations by jointly conditioning on both left and right context. "
    "BERT achieved state-of-the-art results on eleven NLP tasks.",
    
    "GPT (Generative Pre-trained Transformer) uses a decoder-only architecture for language modeling. "
    "GPT-2 demonstrated that language models can perform downstream tasks without explicit fine-tuning. "
    "GPT-3 showed remarkable few-shot learning abilities with 175 billion parameters.",
]

documents = [Document(text=t, metadata={"source": f"doc_{i}"}) for i, t in enumerate(sample_texts)]
print(f"Loaded {len(documents)} documents")
print(f"First doc metadata: {documents[0].metadata}")
# LangChain: Document objects
from langchain_core.documents import Document as LCDocument

lc_documents = [
    LCDocument(page_content=t, metadata={"source": f"doc_{i}"})
    for i, t in enumerate(sample_texts)
]
print(f"LangChain: Loaded {len(lc_documents)} documents")

3. Chunking Strategies

Splitting documents into retrieval-friendly chunks: recursive character splitting in LangChain and sentence-aware splitting in LlamaIndex.

# LangChain: RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
)

lc_chunks = text_splitter.split_documents(lc_documents)
print(f"Split {len(lc_documents)} documents into {len(lc_chunks)} chunks")
for i, chunk in enumerate(lc_chunks):
    print(f"  Chunk {i}: {len(chunk.page_content)} chars")
# LlamaIndex: SentenceSplitter
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=50,
)

nodes = splitter.get_nodes_from_documents(documents)
print(f"Created {len(nodes)} nodes")
for i, node in enumerate(nodes):
    print(f"  Node {i}: {len(node.text)} chars")

4. Embedding Models

Converting text chunks into dense vectors for semantic search.

# LangChain: OpenAI Embeddings
from langchain_openai import OpenAIEmbeddings

lc_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed a single query
query_vector = lc_embeddings.embed_query("What is the Transformer architecture?")
print(f"Embedding dimensions: {len(query_vector)}")
print(f"First 5 values: {query_vector[:5]}")
# LlamaIndex: OpenAI Embeddings
from llama_index.embeddings.openai import OpenAIEmbedding

li_embed_model = OpenAIEmbedding(model="text-embedding-3-small")

query_embedding = li_embed_model.get_query_embedding("What is the Transformer architecture?")
print(f"LlamaIndex embedding dims: {len(query_embedding)}")

5. Vector Stores and Indexing

Storing embedded chunks for fast similarity search.

# LangChain: FAISS vector store
from langchain_community.vectorstores import FAISS

lc_vectorstore = FAISS.from_documents(
    documents=lc_chunks,
    embedding=lc_embeddings,
)
print(f"FAISS index built with {len(lc_chunks)} chunks")
# LlamaIndex: VectorStoreIndex
from llama_index.core import VectorStoreIndex, Settings

Settings.embed_model = li_embed_model

li_index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
)
print("LlamaIndex VectorStoreIndex built")

6. Retrieval Strategies

Finding the most relevant chunks for a given query.

# LangChain: Basic similarity search
query = "What is the Transformer architecture?"

results = lc_vectorstore.similarity_search(query, k=2)
print(f"LangChain retrieved {len(results)} chunks:\n")
for i, doc in enumerate(results):
    print(f"--- Result {i} ---")
    print(doc.page_content[:200])
    print()
# LangChain: Similarity search with scores (FAISS returns L2 distances; lower = more similar)
results_with_scores = lc_vectorstore.similarity_search_with_score(query, k=3)
for doc, score in results_with_scores:
    print(f"Score: {score:.4f} | {doc.page_content[:80]}...")
# LlamaIndex: Retrieval
li_retriever = li_index.as_retriever(similarity_top_k=2)
li_results = li_retriever.retrieve(query)
print(f"LlamaIndex retrieved {len(li_results)} nodes:\n")
for i, node in enumerate(li_results):
    print(f"--- Result {i} (score: {node.score:.4f}) ---")
    print(node.text[:200])
    print()
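Plain top-k can return near-duplicate chunks. Maximal marginal relevance (MMR) trades relevance off against diversity, and LangChain exposes it via search_type="mmr". A sketch on the store built above:

mmr_retriever = lc_vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 2, "fetch_k": 4},  # fetch 4 candidates, keep the 2 most diverse
)
mmr_results = mmr_retriever.invoke(query)
for doc in mmr_results:
    print(doc.page_content[:80], "...")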

7. LLM Generation

Synthesizing answers from retrieved context using an LLM.

# LangChain: LCEL retrieval chain (retriever -> prompt -> llm -> parser)
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
retriever = lc_vectorstore.as_retriever(search_kwargs={"k": 3})

prompt = ChatPromptTemplate.from_template(
    "Answer the question based on the context below.\n\n"
    "Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

answer = chain.invoke("What is the Transformer architecture?")
print(answer)
# LlamaIndex: Query Engine
from llama_index.llms.openai import OpenAI

Settings.llm = OpenAI(model="gpt-4o-mini")

li_query_engine = li_index.as_query_engine(similarity_top_k=3)
li_response = li_query_engine.query("What is the Transformer architecture?")
print(str(li_response))
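LlamaIndex responses also carry the retrieved evidence, which is handy for citing sources alongside the answer:

# Inspect which chunks the answer was synthesized from
for sn in li_response.source_nodes:
    print(f"{sn.node.metadata.get('source')}: score {sn.score:.4f}")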

8. End-to-End RAG Pipeline

Complete pipeline from documents to answers.

# LangChain end-to-end pipeline
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LCDocument
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# 1. Load (sample_texts was defined in section 2)
docs = [LCDocument(page_content=t) for t in sample_texts]

# 2. Chunk
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
chunks = splitter.split_documents(docs)

# 3. Embed + Index
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = FAISS.from_documents(chunks, embeddings)

# 4. Retrieve + Generate
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

prompt = ChatPromptTemplate.from_template(
    "Answer the question based only on the following context:\n\n"
    "{context}\n\nQuestion: {question}"
)

# Helper re-defined here so this cell stands alone
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Ask questions
questions = [
    "What is the Transformer architecture?",
    "How does BERT differ from GPT?",
    "What are the key capabilities of GPT-3?",
]

for q in questions:
    print(f"Q: {q}")
    print(f"A: {rag_chain.invoke(q)}\n")