Deep Research Agents: From RAG to Autonomous Investigation

Iterative retrieval, self-reflection, source triangulation, and report generation

Open In Colab

📖 Read the full article


Table of Contents

  1. Setup
  2. Single-Shot RAG Baseline
  3. Iterative Research Loop
  4. Self-Reflection
  5. Source Triangulation
  6. Report Generation Pipeline
# 1. Setup — install notebook dependencies (quietly) and configure credentials.
!pip install -q langchain-openai langgraph
import os
# Uncomment and set before running any cell that calls the LLM.
# os.environ["OPENAI_API_KEY"] = "your-key"

2. Single-Shot RAG Baseline

Standard retrieve-then-generate — works for factoid questions but fails for complex research.

from langchain_openai import ChatOpenAI

# Shared LLM client used by every function below.
# temperature=0 keeps outputs as close to deterministic as the API allows.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


def single_shot_rag(question: str) -> str:
    """Answer *question* with one retrieve-then-generate pass.

    Baseline with no iteration or reflection: a fixed (simulated)
    retrieval is stuffed into the system prompt and the model answers
    from that context alone.
    """
    # Simulated retrieval — hard-coded snippets stand in for a vector store.
    snippets = (
        "RAG systems combine retrieval with generation for knowledge-grounded answers.",
        "Vector databases like FAISS and Pinecone store document embeddings.",
        "Chunking strategy significantly impacts retrieval quality.",
    )
    context = "\n".join(f"- {doc}" for doc in snippets)

    system_msg = {
        "role": "system",
        "content": f"Answer based on context only.\n\nContext:\n{context}",
    }
    user_msg = {"role": "user", "content": question}
    reply = llm.invoke([system_msg, user_msg])
    return reply.content


# Demo: ask the single-pass baseline a question that really needs research.
demo_question = "What are the tradeoffs between different RAG architectures?"
answer = single_shot_rag(demo_question)
preview = answer[:300]
print(f"Single-shot answer:\n{preview}...")

3. Iterative Research Loop

Multiple rounds of retrieval, each guided by gaps identified in previous rounds.

from dataclasses import dataclass, field


@dataclass
class ResearchState:
    """Accumulated state for one run of the iterative research loop."""

    question: str  # the original research question (set once, never mutated)
    findings: list[str] = field(default_factory=list)  # one retrieval result per round
    gaps: list[str] = field(default_factory=list)  # gaps listed by the latest gap analysis
    queries_used: list[str] = field(default_factory=list)  # search queries issued so far
    iteration: int = 0  # 1-based number of the most recent round


def iterative_research(question: str, max_rounds: int = 3) -> ResearchState:
    """Run up to *max_rounds* of retrieve -> gap-analysis, stopping early on DONE.

    Each round turns the gaps from the previous round into a fresh search
    query, records a (simulated) retrieval result, then asks the model
    whether anything is still missing.
    """
    state = ResearchState(question=question)

    for round_no in range(1, max_rounds + 1):
        state.iteration = round_no

        # Build the next search query — gap-driven after the first round.
        if not state.gaps:
            query_prompt = f"Generate a search query for: {question}"
        else:
            query_prompt = (
                f"Original: {question}\n"
                f"Gaps: {', '.join(state.gaps)}\n"
                "Generate a focused search query."
            )

        query = llm.invoke([{"role": "user", "content": query_prompt}]).content.strip()
        state.queries_used.append(query)

        # Simulated retrieval — record a placeholder finding for this round.
        state.findings.append(f"[Round {round_no}] Retrieved info about: {query[:50]}")
        print(f"🔍 Round {round_no}: Query='{query[:60]}...'")

        # Gap analysis: the model either lists remaining gaps or says DONE.
        gap_messages = [
            {
                "role": "system",
                "content": "List 1-2 information gaps remaining. If sufficient, say DONE.",
            },
            {
                "role": "user",
                "content": f"Question: {question}\nFindings: {'; '.join(state.findings)}",
            },
        ]
        verdict = llm.invoke(gap_messages).content

        if "DONE" in verdict.upper():
            print(f"✅ Research complete after {round_no} rounds")
            break

        state.gaps = [line.strip() for line in verdict.split("\n") if line.strip()]
        print(f"  Gaps: {state.gaps[:2]}")

    return state


# Demo: run the full iterative loop on a comparison-style question.
research = iterative_research(
    "Compare retrieval-augmented generation approaches for scientific literature"
)
print(f"\nTotal findings: {len(research.findings)}")
print(f"Queries used: {research.queries_used}")

4. Self-Reflection

The agent critiques its own findings to identify hallucinations, weak evidence, and bias.

def reflect_on_findings(question: str, findings: list[str]) -> dict:
    """Have the model critique *findings* against *question*.

    Prints the first 500 characters of the critique and returns the full
    text under the "reflection" key.
    """
    numbered = [f"{idx + 1}. {item}" for idx, item in enumerate(findings)]
    findings_text = "\n".join(numbered)

    critic_brief = """You are a research critic. Evaluate findings for:
1. Completeness - do they fully answer the question?
2. Evidence quality - are claims well-supported?
3. Potential hallucinations - any unsupported assertions?
4. Missing perspectives - any important viewpoints missing?

Rate overall confidence 0-100 and list specific issues."""

    critique = llm.invoke([
        {"role": "system", "content": critic_brief},
        {"role": "user", "content": f"Question: {question}\n\nFindings:\n{findings_text}"},
    ]).content

    print("🪞 Self-reflection:")
    print(critique[:500])
    return {"reflection": critique}


# Demo: deliberately over-confident findings (the percentage claims are
# unsourced) so the critic has concrete issues to flag.
fake_findings = [
    "RAG improves factual accuracy by grounding generation in retrieved documents.",
    "Iterative RAG with 3 rounds achieves 15% higher recall than single-shot.",
    "Self-reflection reduces hallucination rate by 40% in research tasks.",
]

reflect_on_findings("How can we improve RAG for research tasks?", fake_findings)

5. Source Triangulation

Cross-validate claims against multiple independent sources — simulated here with per-claim LLM support ratings.

def triangulate_claims(claims: list[str]) -> list[dict]:
    """Score each claim independently with an LLM judge.

    Triangulation is simulated: a single model call per claim rates its
    support level. Returns one dict per claim containing the claim text
    and the first 200 characters of the evaluation.
    """
    # Judge instructions are identical for every claim — build them once.
    judge_brief = """Evaluate this claim. For each, provide:
- support_level: strong / moderate / weak / contradicted
- reasoning: brief explanation
- confidence: 0.0-1.0
Respond in a structured way."""

    evaluations: list[dict] = []
    for claim in claims:
        reply = llm.invoke([
            {"role": "system", "content": judge_brief},
            {"role": "user", "content": f"Claim: {claim}"},
        ])
        evaluations.append({
            "claim": claim,
            "evaluation": reply.content[:200],
        })
    return evaluations


# Demo: a mix of defensible claims and absolute ("always") ones that
# should come back weakly supported or contradicted.
claims = [
    "Transformer models always outperform RNNs on sequence tasks.",
    "RAG reduces hallucination compared to closed-book generation.",
    "Fine-tuning is always better than in-context learning.",
]

results = triangulate_claims(claims)
for entry in results:
    print(f"\n📌 {entry['claim']}")
    print(f"   {entry['evaluation']}")

6. Report Generation Pipeline

Combine all research steps into a structured report.

def generate_report(question: str, findings: list[str], reflection: str) -> str:
    """Compose a structured research report from findings plus the self-critique.

    Returns the report text produced by the model.
    """
    bulleted = "\n".join(f"- {item}" for item in findings)

    report_spec = """Generate a structured research report with:
1. Executive Summary (2-3 sentences)
2. Key Findings (bullet points)
3. Analysis (2-3 paragraphs)
4. Limitations & Gaps
5. Recommendations

Be concise and cite findings where possible."""

    user_payload = (
        f"Question: {question}\n\n"
        f"Findings:\n{bulleted}\n\n"
        f"Self-reflection:\n{reflection}"
    )

    reply = llm.invoke([
        {"role": "system", "content": report_spec},
        {"role": "user", "content": user_payload},
    ])
    return reply.content


# Full pipeline demo: research -> reflect -> report.
question = "What are the best practices for building production RAG systems?"

# 1. Research (two rounds keeps the demo quick)
state = iterative_research(question, max_rounds=2)

# 2. Reflect on what was found
reflection = reflect_on_findings(question, state.findings)

# 3. Assemble the final report
report = generate_report(question, state.findings, reflection["reflection"])

banner = "=" * 60
print("\n" + banner)
print("📄 RESEARCH REPORT")
print(banner)
print(report)