Retrieval over Images, Tables, and PDFs

Indexing and retrieving from complex documents with vision-language models, multi-vector retrieval, and LlamaParse



Table of Contents

  1. Setup & Installation
  2. Document Complexity Spectrum
  3. Intelligent Document Parsing
  4. Multi-Vector Retrieval
  5. Table Extraction and Retrieval
  6. Vision-Based Retrieval
  7. End-to-End Multimodal RAG Pipeline
  8. Choosing the Right Strategy

1. Setup & Installation

!pip install -q langchain langchain-openai langchain-community faiss-cpu unstructured Pillow
import os
# os.environ["OPENAI_API_KEY"] = "your-api-key-here"  # Uncomment and set
# os.environ["LLAMA_CLOUD_API_KEY"] = "your-key-here"  # For LlamaParse

2. Document Complexity Spectrum

Documents range from plain text to complex multimodal content, and each level calls for a different parsing and retrieval strategy.

complexity_levels = {
    "Level 1: Text Only": {
        "Examples": "Articles, emails, plain text docs",
        "Strategy": "Standard text splitting + embedding",
        "Tools": "RecursiveCharacterTextSplitter, any embedding model",
    },
    "Level 2: Text + Tables": {
        "Examples": "Financial reports, academic papers with data",
        "Strategy": "Separate table extraction, LLM summarization for embedding",
        "Tools": "Unstructured, LlamaParse, Camelot",
    },
    "Level 3: Text + Tables + Images": {
        "Examples": "Technical manuals, slide decks, medical reports",
        "Strategy": "Multi-vector retrieval: summaries for index, raw content for generation",
        "Tools": "GPT-4V, LlamaParse Premium, Unstructured",
    },
    "Level 4: Scanned / Handwritten": {
        "Examples": "Scanned contracts, handwritten notes, old forms",
        "Strategy": "OCR + vision models for extraction, then standard pipeline",
        "Tools": "Tesseract, Azure Document Intelligence, ColPali",
    },
}

print("Document Complexity Spectrum:")
print("=" * 60)
for level, details in complexity_levels.items():
    print(f"\n{level}:")
    for key, value in details.items():
        print(f"  {key}: {value}")

3. Intelligent Document Parsing

Use Unstructured or LlamaParse to extract structured elements from complex documents.

# Unstructured: partition by element type
# from unstructured.partition.pdf import partition_pdf
#
# elements = partition_pdf(
#     filename="sample.pdf",
#     strategy="hi_res",
#     extract_images_in_pdf=True,
#     extract_image_block_output_dir="./extracted_images",
# )
#
# tables = [el for el in elements if el.category == "Table"]
# texts = [el for el in elements if el.category == "NarrativeText"]
# images = [el for el in elements if el.category == "Image"]

# Simulating parsed elements for demonstration
parsed_elements = {
    "NarrativeText": [
        "The quarterly revenue increased by 15% year-over-year, driven by strong performance in the cloud services division.",
        "Operating expenses were managed tightly, resulting in improved margins across all business segments.",
    ],
    "Table": [
        "| Quarter | Revenue ($M) | Growth (%) |\n|---------|-------------|-----------|\n| Q1 2024 | 52.3 | 12% |\n| Q2 2024 | 58.7 | 15% |\n| Q3 2024 | 61.2 | 14% |\n| Q4 2024 | 67.8 | 18% |",
    ],
    "Image": [
        "[Revenue Growth Chart - bar chart showing quarterly growth]",
    ],
}

print("Parsed Document Elements:")
for element_type, elements in parsed_elements.items():
    print(f"\n{element_type} ({len(elements)} elements):")
    for el in elements:
        preview = el[:100] + "..." if len(el) > 100 else el
        print(f"  - {preview}")
# LlamaParse tiers
llamaparse_tiers = {
    "Free Tier": {
        "Pages/day": "1,000",
        "Features": "Basic text + table extraction",
        "Use case": "Prototyping, small documents",
    },
    "Standard ($0.003/page)": {
        "Pages/day": "Unlimited",
        "Features": "Advanced table detection, markdown output",
        "Use case": "Production text + table documents",
    },
    "Premium ($0.01/page)": {
        "Pages/day": "Unlimited",
        "Features": "Multimodal: images, charts, handwriting",
        "Use case": "Complex documents with visual content",
    },
}

print("LlamaParse Tiers:")
for tier, details in llamaparse_tiers.items():
    print(f"\n{tier}:")
    for key, value in details.items():
        print(f"  {key}: {value}")

4. Multi-Vector Retrieval

The key insight: embed summaries for retrieval, but return raw content for generation.

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import uuid

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Raw content store (docstore)
docstore = {}

# Summarize elements for embedding
summarize_prompt = ChatPromptTemplate.from_template(
    "Summarize this {element_type} in 1-2 concise sentences for search indexing:\n\n{content}"
)
summarize_chain = summarize_prompt | llm | StrOutputParser()

summary_docs = []

# Process text elements
for text in parsed_elements["NarrativeText"]:
    doc_id = str(uuid.uuid4())
    docstore[doc_id] = {"type": "text", "content": text}
    summary = summarize_chain.invoke({"element_type": "text passage", "content": text})
    summary_docs.append(Document(page_content=summary, metadata={"doc_id": doc_id, "type": "text"}))

# Process table elements
for table in parsed_elements["Table"]:
    doc_id = str(uuid.uuid4())
    docstore[doc_id] = {"type": "table", "content": table}
    summary = summarize_chain.invoke({"element_type": "table", "content": table})
    summary_docs.append(Document(page_content=summary, metadata={"doc_id": doc_id, "type": "table"}))

# Process image elements (descriptions are already short, so embed them directly instead of summarizing)
for image_desc in parsed_elements["Image"]:
    doc_id = str(uuid.uuid4())
    docstore[doc_id] = {"type": "image", "content": image_desc}
    summary_docs.append(Document(page_content=image_desc, metadata={"doc_id": doc_id, "type": "image"}))

# Build vector index from summaries
vectorstore = FAISS.from_documents(summary_docs, embeddings)

print(f"Multi-vector index: {len(summary_docs)} summaries → {len(docstore)} raw docs")
for doc in summary_docs:
    print(f"  [{doc.metadata['type']}] {doc.page_content[:80]}...")
# Retrieve: search summaries, return raw content
def multi_vector_retrieve(query, k=3):
    results = vectorstore.similarity_search(query, k=k)
    raw_contents = []
    for doc in results:
        doc_id = doc.metadata["doc_id"]
        raw = docstore[doc_id]
        raw_contents.append(raw)
        print(f"  Retrieved [{raw['type']}]: {raw['content'][:80]}...")
    return raw_contents

print("Query: 'What was the quarterly revenue growth?'")
results = multi_vector_retrieve("What was the quarterly revenue growth?")
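
LangChain also packages this summary-index / raw-docstore pattern as MultiVectorRetriever. A hedged sketch wiring the same data through it, assuming the installed langchain version exposes MultiVectorRetriever and InMemoryStore at these import paths:

# Equivalent pattern with LangChain's built-in MultiVectorRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore

store = InMemoryStore()
# Store raw content as Documents keyed by the same doc_id used in the summary metadata
store.mset([
    (doc_id, Document(page_content=raw["content"], metadata={"type": raw["type"]}))
    for doc_id, raw in docstore.items()
])

mv_retriever = MultiVectorRetriever(
    vectorstore=vectorstore,   # indexes the summaries
    docstore=store,            # holds the raw elements
    id_key="doc_id",           # metadata key linking summary -> raw doc
)

for doc in mv_retriever.invoke("What was the quarterly revenue growth?"):
    print(f"  [{doc.metadata['type']}] {doc.page_content[:80]}...")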

5. Table Extraction and Retrieval

Tables need special handling: extract structure, summarize for embedding, retain raw data for generation.

# Strategy: Summarize tables with LLM, embed summaries, return raw table
table_data = """| Model | Parameters | MMLU | HellaSwag | TruthfulQA |
|-------|-----------|------|-----------|------------|
| GPT-4 | ~1.7T | 86.4 | 95.3 | 59.0 |
| Claude 3.5 | Unknown | 88.7 | 89.0 | 62.4 |
| Llama 3.1 70B | 70B | 82.0 | 87.5 | 54.2 |
| Mixtral 8x7B | 46.7B | 70.6 | 84.4 | 46.8 |"""

# Summarize table for embedding
table_summary = summarize_chain.invoke({"element_type": "table", "content": table_data})
print(f"Table summary (for embedding):\n  {table_summary}")

# Generate answer using raw table
answer_prompt = ChatPromptTemplate.from_template(
    "Use the table below to answer the question.\n\n"
    "Table:\n{table}\n\nQuestion: {question}\nAnswer:"
)
answer_chain = answer_prompt | llm | StrOutputParser()

answer = answer_chain.invoke({
    "table": table_data,
    "question": "Which model has the highest MMLU score?",
})
print(f"\nQ: Which model has the highest MMLU score?")
print(f"A: {answer}")

6. Vision-Based Retrieval

Use vision-language models to understand images and charts in documents.

# Vision-based retrieval approaches
vision_approaches = {
    "GPT-4V Summarization": {
        "How": "Send image to GPT-4V, get text description, embed description",
        "Pros": "Detailed understanding, works with charts/diagrams",
        "Cons": "Expensive, requires API call per image",
        "Cost": "~$0.01-0.03 per image",
    },
    "ColPali / ColQwen": {
        "How": "Embed entire page as image, late-interaction matching with queries",
        "Pros": "No OCR/parsing needed, preserves layout",
        "Cons": "New approach, limited tooling",
        "Cost": "GPU inference cost only",
    },
    "CLIP Embeddings": {
        "How": "Embed images and text in same space, cross-modal search",
        "Pros": "Fast, no LLM call needed",
        "Cons": "Limited understanding of complex charts",
        "Cost": "Low (local inference)",
    },
}

print("Vision-Based Retrieval Approaches:")
print("=" * 60)
for approach, details in vision_approaches.items():
    print(f"\n{approach}:")
    for key, value in details.items():
        print(f"  {key}: {value}")
# GPT-4V image summarization pattern
# import base64
# from langchain_openai import ChatOpenAI
# from langchain_core.messages import HumanMessage
#
# vision_llm = ChatOpenAI(model="gpt-4o", max_tokens=300)
#
# def summarize_image(image_path):
#     with open(image_path, "rb") as f:
#         image_data = base64.b64encode(f.read()).decode()
#     
#     message = HumanMessage(content=[
#         {"type": "text", "text": "Describe this image for search indexing. Focus on key data, trends, and labels."},
#         {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}},
#     ])
#     
#     return vision_llm.invoke([message]).content

# Demonstrate the concept
print("GPT-4V Image Summarization Pattern:")
print("  1. Read image as base64")
print("  2. Send to GPT-4V with prompt: 'Describe this image for search indexing'")
print("  3. Embed the text summary in vector store")
print("  4. At retrieval: match query to summary, return original image + summary")

7. End-to-End Multimodal RAG Pipeline

Combine document parsing, multi-vector retrieval, and multimodal generation.

class MultimodalRAGPipeline:
    def __init__(self, llm, embeddings):
        self.llm = llm
        self.embeddings = embeddings
        self.docstore = {}
        self.summary_docs = []
        self.vectorstore = None

    def ingest(self, elements: dict):
        """Process parsed document elements."""
        summarize = ChatPromptTemplate.from_template(
            "Summarize this {type} concisely:\n{content}"
        ) | self.llm | StrOutputParser()

        for element_type, items in elements.items():
            for item in items:
                doc_id = str(uuid.uuid4())
                self.docstore[doc_id] = {"type": element_type, "content": item}
                summary = summarize.invoke({"type": element_type, "content": item})
                self.summary_docs.append(
                    Document(page_content=summary, metadata={"doc_id": doc_id, "type": element_type})
                )

        self.vectorstore = FAISS.from_documents(self.summary_docs, self.embeddings)
        print(f"Ingested {len(self.summary_docs)} elements")

    def query(self, question: str, k: int = 3) -> str:
        """Retrieve and generate answer."""
        results = self.vectorstore.similarity_search(question, k=k)

        context_parts = []
        for doc in results:
            raw = self.docstore[doc.metadata["doc_id"]]
            context_parts.append(f"[{raw['type']}]\n{raw['content']}")

        context = "\n\n---\n\n".join(context_parts)

        answer_prompt = ChatPromptTemplate.from_template(
            "Answer using the context (which may include text, tables, and image descriptions).\n\n"
            "Context:\n{context}\n\nQuestion: {question}"
        )
        chain = answer_prompt | self.llm | StrOutputParser()
        return chain.invoke({"context": context, "question": question})


# Build and test pipeline
pipeline = MultimodalRAGPipeline(llm, embeddings)
pipeline.ingest(parsed_elements)

questions = [
    "What was the revenue growth trend?",
    "How did operating expenses perform?",
]

for q in questions:
    answer = pipeline.query(q)
    print(f"\nQ: {q}")
    print(f"A: {answer}")

8. Choosing the Right Strategy

decision_guide = {
    "Text-only documents": {
        "Parser": "Basic text splitter",
        "Retrieval": "Standard vector search",
        "Complexity": "Low",
    },
    "Documents with tables": {
        "Parser": "LlamaParse Standard / Unstructured",
        "Retrieval": "Multi-vector (table summaries → raw tables)",
        "Complexity": "Medium",
    },
    "Documents with images & charts": {
        "Parser": "LlamaParse Premium / GPT-4V",
        "Retrieval": "Multi-vector with vision summaries",
        "Complexity": "High",
    },
    "Scanned PDFs / handwriting": {
        "Parser": "Azure Doc Intelligence / ColPali",
        "Retrieval": "Vision-native embeddings (ColPali) or OCR + standard",
        "Complexity": "Very High",
    },
}

print("Strategy Decision Guide:")
print("=" * 60)
for doc_type, strategy in decision_guide.items():
    print(f"\n{doc_type}:")
    for key, value in strategy.items():
        print(f"  {key}: {value}")

print("\n" + "=" * 60)
print("Key Principle: Embed summaries for retrieval, return raw content for generation.")
print("This decouples what you search from what you generate with.")