Building a RAG Pipeline from Scratch
From document ingestion to answer generation: chunking strategies, embedding models, vector stores, retrieval, and LLM synthesis with LlamaIndex and LangChain
Table of Contents

1. Setup & Installation
2. Document Loading
3. Chunking Strategies
4. Embedding Models
5. Vector Stores and Indexing
6. Retrieval Strategies
7. LLM Generation
8. End-to-End RAG Pipeline
1. Setup & Installation

!pip install -q langchain langchain-openai langchain-community langchain-chroma langchain-text-splitters langchain-experimental llama-index llama-index-core llama-index-embeddings-openai llama-index-embeddings-huggingface llama-index-vector-stores-chroma faiss-cpu chromadb sentence-transformers
import os
# os.environ["OPENAI_API_KEY"] = "your-api-key-here"  # Uncomment and set
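If you'd rather not paste the key into the notebook itself, a minimal alternative is to prompt for it at runtime with the standard-library getpass module (a sketch; it only prompts when the variable isn't already set):

import getpass

# Prompt for the key at runtime instead of hardcoding it
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")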
2. Document Loading

Getting data into a structured Document format using LlamaIndex and LangChain loaders.
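In a real project the corpus would come from disk; a minimal sketch with LlamaIndex's SimpleDirectoryReader (the data/ directory is a hypothetical path, so the code only runs if it exists):

import os
from llama_index.core import SimpleDirectoryReader

# Hypothetical path: point input_dir at a folder of .txt/.pdf/.md files
if os.path.isdir("data/"):
    disk_documents = SimpleDirectoryReader(input_dir="data/").load_data()
    print(f"Loaded {len(disk_documents)} documents from disk")

To keep this notebook self-contained, the cells below build a small corpus in memory instead.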
# LlamaIndex: Document objects
from llama_index.core import Document
# For demo, create sample documents in memory
sample_texts = [
    "The Transformer architecture was introduced by Vaswani et al. in 2017. "
    "It relies entirely on attention mechanisms, dispensing with recurrence and convolutions. "
    "The model uses self-attention to compute representations of input and output sequences.",
    "BERT (Bidirectional Encoder Representations from Transformers) was introduced by Google in 2018. "
    "It pre-trains deep bidirectional representations by jointly conditioning on both left and right context. "
    "BERT achieved state-of-the-art results on eleven NLP tasks.",
    "GPT (Generative Pre-trained Transformer) uses a decoder-only architecture for language modeling. "
    "GPT-2 demonstrated that language models can perform downstream tasks without explicit fine-tuning. "
    "GPT-3 showed remarkable few-shot learning abilities with 175 billion parameters.",
]
documents = [Document(text=t, metadata={"source": f"doc_{i}"}) for i, t in enumerate(sample_texts)]
print(f"Loaded {len(documents)} documents")
print(f"First doc metadata: {documents[0].metadata}")# LangChain: Document objects
from langchain_core.documents import Document as LCDocument
lc_documents = [
    LCDocument(page_content=t, metadata={"source": f"doc_{i}"})
    for i, t in enumerate(sample_texts)
]
print(f"LangChain: Loaded {len(lc_documents)} documents")3. Chunking Strategies
Splitting documents into retrieval-friendly chunks with LangChain's recursive character splitter and LlamaIndex's sentence-aware splitter.
# LangChain: RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
)
lc_chunks = text_splitter.split_documents(lc_documents)
print(f"Split {len(lc_documents)} documents into {len(lc_chunks)} chunks")
for i, chunk in enumerate(lc_chunks):
    print(f"  Chunk {i}: {len(chunk.page_content)} chars")

# LlamaIndex: SentenceSplitter
from llama_index.core.node_parser import SentenceSplitter
splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=50,
)
nodes = splitter.get_nodes_from_documents(documents)
print(f"Created {len(nodes)} nodes")
for i, node in enumerate(nodes):
    print(f"  Node {i}: {len(node.text)} chars")
4. Embedding Models

Converting text chunks into dense vectors for semantic search.
# LangChain: OpenAI Embeddings
from langchain_openai import OpenAIEmbeddings
lc_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Embed a single query
query_vector = lc_embeddings.embed_query("What is the Transformer architecture?")
print(f"Embedding dimensions: {len(query_vector)}")
print(f"First 5 values: {query_vector[:5]}")# LlamaIndex: OpenAI Embeddings
from llama_index.embeddings.openai import OpenAIEmbedding
li_embed_model = OpenAIEmbedding(model="text-embedding-3-small")
query_embedding = li_embed_model.get_query_embedding("What is the Transformer architecture?")
print(f"LlamaIndex embedding dims: {len(query_embedding)}")5. Vector Stores and Indexing
5. Vector Stores and Indexing

Storing embedded chunks for fast similarity search.
# LangChain: FAISS vector store
from langchain_community.vectorstores import FAISS
lc_vectorstore = FAISS.from_documents(
    documents=lc_chunks,
    embedding=lc_embeddings,
)
print(f"FAISS index built with {len(lc_chunks)} chunks")# LlamaIndex: VectorStoreIndex
from llama_index.core import VectorStoreIndex, Settings
Settings.embed_model = li_embed_model
li_index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
)
print("LlamaIndex VectorStoreIndex built")6. Retrieval Strategies
6. Retrieval Strategies

Finding the most relevant chunks for a given query.
# LangChain: Basic similarity search
query = "What is the Transformer architecture?"
results = lc_vectorstore.similarity_search(query, k=2)
print(f"LangChain retrieved {len(results)} chunks:\n")
for i, doc in enumerate(results):
    print(f"--- Result {i} ---")
    print(doc.page_content[:200])
    print()

# LangChain: Similarity search with scores
results_with_scores = lc_vectorstore.similarity_search_with_score(query, k=3)
# FAISS returns L2 distances by default, so lower scores mean closer matches
for doc, score in results_with_scores:
    print(f"Score: {score:.4f} — {doc.page_content[:80]}...")

# LlamaIndex: Retrieval
li_retriever = li_index.as_retriever(similarity_top_k=2)
li_results = li_retriever.retrieve(query)
print(f"LlamaIndex retrieved {len(li_results)} nodes:\n")
for i, node in enumerate(li_results):
    print(f"--- Result {i} (score: {node.score:.4f}) ---")
    print(node.text[:200])
    print()
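Plain top-k retrieval can return near-duplicate chunks from the same document. Maximal marginal relevance (MMR) trades a little raw similarity for diversity; a sketch using FAISS's built-in support (the k and fetch_k values are illustrative):

# MMR: fetch a candidate pool, then keep results that balance
# relevance to the query against diversity among themselves
mmr_results = lc_vectorstore.max_marginal_relevance_search(query, k=2, fetch_k=5)
for doc in mmr_results:
    print(doc.page_content[:80], "...")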
7. LLM Generation

Synthesizing answers from retrieved context using an LLM.
# LangChain: RetrievalQA chain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
retriever = lc_vectorstore.as_retriever(search_kwargs={"k": 3})
prompt = ChatPromptTemplate.from_template(
    "Answer the question based on the context below.\n\n"
    "Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
)
def format_docs(docs):
    # Concatenate retrieved chunks into a single context string
    return "\n\n".join(doc.page_content for doc in docs)
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
answer = chain.invoke("What is the Transformer architecture?")
print(answer)

# LlamaIndex: Query Engine
from llama_index.llms.openai import OpenAI
Settings.llm = OpenAI(model="gpt-4o-mini")
li_query_engine = li_index.as_query_engine(similarity_top_k=3)
li_response = li_query_engine.query("What is the Transformer architecture?")
print(str(li_response))
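LlamaIndex responses keep the retrieved chunks in source_nodes, which is handy for attribution:

# Show which chunks the query engine grounded its answer on
for sn in li_response.source_nodes:
    src = sn.node.metadata.get("source", "unknown")
    print(f"{src} (score: {sn.score:.4f}): {sn.node.text[:80]}...")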
8. End-to-End RAG Pipeline

Complete pipeline from documents to answers.
# LangChain end-to-end pipeline
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LCDocument
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# 1. Load
docs = [LCDocument(page_content=t) for t in sample_texts]
# 2. Chunk
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
chunks = splitter.split_documents(docs)
# 3. Embed + Index
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = FAISS.from_documents(chunks, embeddings)
# 4. Retrieve + Generate
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
prompt = ChatPromptTemplate.from_template(
    "Answer the question based only on the following context:\n\n"
    "{context}\n\nQuestion: {question}"
)
def format_docs(docs):
    # Re-defined here so this cell runs on its own
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
# Ask questions
questions = [
    "What is the Transformer architecture?",
    "How does BERT differ from GPT?",
    "What are the key capabilities of GPT-3?",
]
for q in questions:
    print(f"Q: {q}")
    print(f"A: {rag_chain.invoke(q)}\n")