!pip install -q langchain langchain-openai langchain-community faiss-cpu unstructured Pillow

Retrieval over Images, Tables, and PDFs
Indexing and retrieving from complex documents with vision-language models, multi-vector retrieval, and LlamaParse
Table of Contents
1. Setup & Installation
2. Document Complexity Spectrum
3. Intelligent Document Parsing
4. Multi-Vector Retrieval
5. Table Extraction and Retrieval
6. Vision-Based Retrieval
7. End-to-End Multimodal RAG Pipeline
8. Choosing the Right Strategy

1. Setup & Installation
import os
# os.environ["OPENAI_API_KEY"] = "your-api-key-here" # Uncomment and set
# os.environ["LLAMA_CLOUD_API_KEY"] = "your-key-here" # For LlamaParse2. Document Complexity Spectrum
2. Document Complexity Spectrum
Documents range from simple text to complex multimodal content. Each level requires a different parsing and retrieval strategy.
complexity_levels = {
"Level 1: Text Only": {
"Examples": "Articles, emails, plain text docs",
"Strategy": "Standard text splitting + embedding",
"Tools": "RecursiveCharacterTextSplitter, any embedding model",
},
"Level 2: Text + Tables": {
"Examples": "Financial reports, academic papers with data",
"Strategy": "Separate table extraction, LLM summarization for embedding",
"Tools": "Unstructured, LlamaParse, Camelot",
},
"Level 3: Text + Tables + Images": {
"Examples": "Technical manuals, slide decks, medical reports",
"Strategy": "Multi-vector retrieval: summaries for index, raw content for generation",
"Tools": "GPT-4V, LlamaParse Premium, Unstructured",
},
"Level 4: Scanned / Handwritten": {
"Examples": "Scanned contracts, handwritten notes, old forms",
"Strategy": "OCR + vision models for extraction, then standard pipeline",
"Tools": "Tesseract, Azure Document Intelligence, ColPali",
},
}
print("Document Complexity Spectrum:")
print("=" * 60)
for level, details in complexity_levels.items():
print(f"\n{level}:")
for key, value in details.items():
print(f" {key}: {value}")3. Intelligent Document Parsing
Use Unstructured or LlamaParse to extract structured elements from complex documents.
# Unstructured: partition by element type
# from unstructured.partition.pdf import partition_pdf
#
# elements = partition_pdf(
# filename="sample.pdf",
# strategy="hi_res",
# extract_images_in_pdf=True,
# extract_image_block_output_dir="./extracted_images",
# )
#
# tables = [el for el in elements if el.category == "Table"]
# texts = [el for el in elements if el.category == "NarrativeText"]
# images = [el for el in elements if el.category == "Image"]
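LlamaParse is a hosted alternative to Unstructured; a minimal sketch of the basic call, commented out like the block above because it needs a LLAMA_CLOUD_API_KEY, the llama-parse package, and a real PDF:

# LlamaParse: hosted parsing with markdown output
# from llama_parse import LlamaParse
#
# parser = LlamaParse(result_type="markdown")  # "text" is also available
# llama_docs = parser.load_data("sample.pdf")  # returns a list of parsed documents
# markdown_text = llama_docs[0].text           # tables are rendered as markdown tables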
# Simulating parsed elements for demonstration
parsed_elements = {
"NarrativeText": [
"The quarterly revenue increased by 15% year-over-year, driven by strong performance in the cloud services division.",
"Operating expenses were managed tightly, resulting in improved margins across all business segments.",
],
"Table": [
"| Quarter | Revenue ($M) | Growth (%) |\n|---------|-------------|-----------|\n| Q1 2024 | 52.3 | 12% |\n| Q2 2024 | 58.7 | 15% |\n| Q3 2024 | 61.2 | 14% |\n| Q4 2024 | 67.8 | 18% |",
],
"Image": [
"[Revenue Growth Chart - bar chart showing quarterly growth]",
],
}
print("Parsed Document Elements:")
for element_type, elements in parsed_elements.items():
print(f"\n{element_type} ({len(elements)} elements):")
for el in elements:
preview = el[:100] + "..." if len(el) > 100 else el
print(f" - {preview}")# LlamaParse tiers
llamaparse_tiers = {
"Free Tier": {
"Pages/day": "1,000",
"Features": "Basic text + table extraction",
"Use case": "Prototyping, small documents",
},
"Standard ($0.003/page)": {
"Pages/day": "Unlimited",
"Features": "Advanced table detection, markdown output",
"Use case": "Production text + table documents",
},
"Premium ($0.01/page)": {
"Pages/day": "Unlimited",
"Features": "Multimodal: images, charts, handwriting",
"Use case": "Complex documents with visual content",
},
}
print("LlamaParse Tiers:")
for tier, details in llamaparse_tiers.items():
print(f"\n{tier}:")
for key, value in details.items():
print(f" {key}: {value}")4. Multi-Vector Retrieval
The key insight: embed summaries for retrieval, but return raw content for generation.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import uuid
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Raw content store (docstore)
docstore = {}
# Summarize elements for embedding
summarize_prompt = ChatPromptTemplate.from_template(
"Summarize this {element_type} in 1-2 concise sentences for search indexing:\n\n{content}"
)
summarize_chain = summarize_prompt | llm | StrOutputParser()
summary_docs = []
# Process text elements
for text in parsed_elements["NarrativeText"]:
doc_id = str(uuid.uuid4())
docstore[doc_id] = {"type": "text", "content": text}
summary = summarize_chain.invoke({"element_type": "text passage", "content": text})
summary_docs.append(Document(page_content=summary, metadata={"doc_id": doc_id, "type": "text"}))
# Process table elements
for table in parsed_elements["Table"]:
doc_id = str(uuid.uuid4())
docstore[doc_id] = {"type": "table", "content": table}
summary = summarize_chain.invoke({"element_type": "table", "content": table})
summary_docs.append(Document(page_content=summary, metadata={"doc_id": doc_id, "type": "table"}))
# Process image elements
for image_desc in parsed_elements["Image"]:
doc_id = str(uuid.uuid4())
docstore[doc_id] = {"type": "image", "content": image_desc}
summary_docs.append(Document(page_content=image_desc, metadata={"doc_id": doc_id, "type": "image"}))
# Build vector index from summaries
vectorstore = FAISS.from_documents(summary_docs, embeddings)
print(f"Multi-vector index: {len(summary_docs)} summaries → {len(docstore)} raw docs")
for doc in summary_docs:
print(f" [{doc.metadata['type']}] {doc.page_content[:80]}...")# Retrieve: search summaries, return raw content
def multi_vector_retrieve(query, k=3):
results = vectorstore.similarity_search(query, k=k)
raw_contents = []
for doc in results:
doc_id = doc.metadata["doc_id"]
raw = docstore[doc_id]
raw_contents.append(raw)
print(f" Retrieved [{raw['type']}]: {raw['content'][:80]}...")
return raw_contents
print("Query: 'What was the quarterly revenue growth?'")
results = multi_vector_retrieve("What was the quarterly revenue growth?")
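LangChain also ships a built-in MultiVectorRetriever that implements the same summaries-in, raw-content-out pattern; a minimal sketch reusing the vectorstore and docstore built above (it returns whatever objects were stored, here the raw-content dicts):

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore

# Key the raw content by the same doc_ids stored in each summary's metadata
store = InMemoryStore()
store.mset(list(docstore.items()))

retriever = MultiVectorRetriever(
    vectorstore=vectorstore,  # similarity search runs over the summary embeddings
    docstore=store,           # matched doc_ids are swapped for their raw content
    id_key="doc_id",
)
raw_hits = retriever.invoke("What was the quarterly revenue growth?")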
5. Table Extraction and Retrieval
Tables need special handling: extract structure, summarize for embedding, retain raw data for generation.
# Strategy: Summarize tables with LLM, embed summaries, return raw table
table_data = """| Model | Parameters | MMLU | HellaSwag | TruthfulQA |
|-------|-----------|------|-----------|------------|
| GPT-4 | ~1.7T | 86.4 | 95.3 | 59.0 |
| Claude 3.5 | Unknown | 88.7 | 89.0 | 62.4 |
| Llama 3.1 70B | 70B | 82.0 | 87.5 | 54.2 |
| Mixtral 8x7B | 46.7B | 70.6 | 84.4 | 46.8 |"""
# Summarize table for embedding
table_summary = summarize_chain.invoke({"element_type": "table", "content": table_data})
print(f"Table summary (for embedding):\n {table_summary}")
# Generate answer using raw table
answer_prompt = ChatPromptTemplate.from_template(
"Use the table below to answer the question.\n\n"
"Table:\n{table}\n\nQuestion: {question}\nAnswer:"
)
answer_chain = answer_prompt | llm | StrOutputParser()
answer = answer_chain.invoke({
"table": table_data,
"question": "Which model has the highest MMLU score?",
})
print(f"\nQ: Which model has the highest MMLU score?")
print(f"A: {answer}")6. Vision-Based Retrieval
6. Vision-Based Retrieval
Use vision-language models to understand images and charts in documents.
# Vision-based retrieval approaches
vision_approaches = {
"GPT-4V Summarization": {
"How": "Send image to GPT-4V, get text description, embed description",
"Pros": "Detailed understanding, works with charts/diagrams",
"Cons": "Expensive, requires API call per image",
"Cost": "~$0.01-0.03 per image",
},
"ColPali / ColQwen": {
"How": "Embed entire page as image, late-interaction matching with queries",
"Pros": "No OCR/parsing needed, preserves layout",
"Cons": "New approach, limited tooling",
"Cost": "GPU inference cost only",
},
"CLIP Embeddings": {
"How": "Embed images and text in same space, cross-modal search",
"Pros": "Fast, no LLM call needed",
"Cons": "Limited understanding of complex charts",
"Cost": "Low (local inference)",
},
}
print("Vision-Based Retrieval Approaches:")
print("=" * 60)
for approach, details in vision_approaches.items():
print(f"\n{approach}:")
for key, value in details.items():
print(f" {key}: {value}")# GPT-4V image summarization pattern
# GPT-4V image summarization pattern
# import base64
# from langchain_openai import ChatOpenAI
# from langchain_core.messages import HumanMessage
#
# vision_llm = ChatOpenAI(model="gpt-4o", max_tokens=300)
#
# def summarize_image(image_path):
# with open(image_path, "rb") as f:
# image_data = base64.b64encode(f.read()).decode()
#
# message = HumanMessage(content=[
# {"type": "text", "text": "Describe this image for search indexing. Focus on key data, trends, and labels."},
# {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}},
# ])
#
# return vision_llm.invoke([message]).content
# Demonstrate the concept
print("GPT-4V Image Summarization Pattern:")
print(" 1. Read image as base64")
print(" 2. Send to GPT-4V with prompt: 'Describe this image for search indexing'")
print(" 3. Embed the text summary in vector store")
print(" 4. At retrieval: match query to summary, return original image + summary")7. End-to-End Multimodal RAG Pipeline
7. End-to-End Multimodal RAG Pipeline
Combine document parsing, multi-vector retrieval, and multimodal generation.
class MultimodalRAGPipeline:
def __init__(self, llm, embeddings):
self.llm = llm
self.embeddings = embeddings
self.docstore = {}
self.summary_docs = []
self.vectorstore = None
def ingest(self, elements: dict):
"""Process parsed document elements."""
summarize = ChatPromptTemplate.from_template(
"Summarize this {type} concisely:\n{content}"
) | self.llm | StrOutputParser()
for element_type, items in elements.items():
for item in items:
doc_id = str(uuid.uuid4())
self.docstore[doc_id] = {"type": element_type, "content": item}
summary = summarize.invoke({"type": element_type, "content": item})
self.summary_docs.append(
Document(page_content=summary, metadata={"doc_id": doc_id, "type": element_type})
)
self.vectorstore = FAISS.from_documents(self.summary_docs, self.embeddings)
print(f"Ingested {len(self.summary_docs)} elements")
def query(self, question: str, k: int = 3) -> str:
"""Retrieve and generate answer."""
results = self.vectorstore.similarity_search(question, k=k)
context_parts = []
for doc in results:
raw = self.docstore[doc.metadata["doc_id"]]
context_parts.append(f"[{raw['type']}]\n{raw['content']}")
context = "\n\n---\n\n".join(context_parts)
answer_prompt = ChatPromptTemplate.from_template(
"Answer using the context (which may include text, tables, and image descriptions).\n\n"
"Context:\n{context}\n\nQuestion: {question}"
)
chain = answer_prompt | self.llm | StrOutputParser()
return chain.invoke({"context": context, "question": question})
# Build and test pipeline
pipeline = MultimodalRAGPipeline(llm, embeddings)
pipeline.ingest(parsed_elements)
questions = [
"What was the revenue growth trend?",
"How did operating expenses perform?",
]
for q in questions:
answer = pipeline.query(q)
print(f"\nQ: {q}")
print(f"A: {answer}")8. Choosing the Right Strategy
8. Choosing the Right Strategy
decision_guide = {
"Text-only documents": {
"Parser": "Basic text splitter",
"Retrieval": "Standard vector search",
"Complexity": "Low",
},
"Documents with tables": {
"Parser": "LlamaParse Standard / Unstructured",
"Retrieval": "Multi-vector (table summaries → raw tables)",
"Complexity": "Medium",
},
"Documents with images & charts": {
"Parser": "LlamaParse Premium / GPT-4V",
"Retrieval": "Multi-vector with vision summaries",
"Complexity": "High",
},
"Scanned PDFs / handwriting": {
"Parser": "Azure Doc Intelligence / ColPali",
"Retrieval": "Vision-native embeddings (ColPali) or OCR + standard",
"Complexity": "Very High",
},
}
print("Strategy Decision Guide:")
print("=" * 60)
for doc_type, strategy in decision_guide.items():
print(f"\n{doc_type}:")
for key, value in strategy.items():
print(f" {key}: {value}")
print("\n" + "=" * 60)
print("Key Principle: Embed summaries for retrieval, return raw content for generation.")
print("This decouples what you search from what you generate with.")