!pip install -q langchain langchain-openai langchain-community llama-index llama-index-core llama-index-embeddings-openai faiss-cpu

GraphRAG: Knowledge Graphs Meet Retrieval-Augmented Generation
Building and querying knowledge graphs for RAG with Neo4j, LlamaIndex, and Microsoft GraphRAG — from entity extraction to community summarization
Table of Contents
1. Setup & Installation
import os
# os.environ["OPENAI_API_KEY"] = "your-api-key-here"  # Uncomment and set

2. Why Vector-Only RAG Fails
Vector search finds similar text, not connected concepts. It fails on aggregation, multi-hop reasoning, and global summarization.
# Catalogue of query classes where pure vector retrieval breaks down:
# each entry pairs an example question with the reason k-NN search fails on it.
limitations = {
    "Aggregation": {
        "example": "How many open tickets are assigned to Team A?",
        "why_fails": "Cannot count or group — returns k nearest chunks regardless",
    },
    "Multi-hop reasoning": {
        "example": "Which services will break if Database goes down?",
        "why_fails": "Requires traversing dependency chains across entities",
    },
    "Global summarization": {
        "example": "What are the main themes in this dataset?",
        "why_fails": "Answer spans the entire corpus, not any single chunk",
    },
    "Relationship queries": {
        "example": "Who collaborated with Author X on topic Y?",
        "why_fails": "Relationships aren't encoded in flat embeddings",
    },
}

print("Why Vector-Only RAG Fails:")
print("=" * 60)
for name, info in limitations.items():
    example, reason = info["example"], info["why_fails"]
    print(f"\n{name}:")
    print(f" Query: {example}")
    print(f" Fails because: {reason}")
# 3. Knowledge Graph Fundamentals
A knowledge graph represents information as entities (nodes) and relationships (edges). The atomic unit is a triple: (subject, predicate, object).
# Simple in-memory knowledge graph
class SimpleKnowledgeGraph:
    """Minimal in-memory triple store.

    Holds (subject, predicate, object) triples in insertion order and
    supports subject/object lookups plus a bounded breadth-first traversal.
    """

    def __init__(self):
        self.triples = []      # (subject, predicate, object) tuples, insertion order
        self.entities = set()  # every name seen as a subject or an object

    def add_triple(self, subject, predicate, obj):
        """Record one triple and register both endpoints as entities."""
        self.triples.append((subject, predicate, obj))
        self.entities.add(subject)
        self.entities.add(obj)

    def query_subject(self, subject):
        """Return [(predicate, object), ...] for every triple with this subject."""
        return [(p, o) for s, p, o in self.triples if s == subject]

    def query_object(self, obj):
        """Return [(subject, predicate), ...] for every triple with this object."""
        return [(s, p) for s, p, o in self.triples if o == obj]

    def traverse(self, start, max_hops=2):
        """BFS traversal from `start`, following edges in both directions.

        Returns a list of (subject, predicate, object, depth) tuples, where
        depth is the hop count at which the edge was discovered.  An edge may
        appear once per visited endpoint (so up to twice), matching the
        original behavior.
        """
        # Local import keeps the file's top-level deps untouched; deque gives
        # O(1) popleft where list.pop(0) was O(n) per dequeue.
        from collections import deque

        visited = set()
        queue = deque([(start, 0)])
        results = []
        while queue:
            entity, depth = queue.popleft()
            if entity in visited or depth > max_hops:
                continue
            visited.add(entity)
            # Linear scan over all triples: fine for a demo-sized graph.
            for s, p, o in self.triples:
                if s == entity:
                    results.append((s, p, o, depth))
                    queue.append((o, depth + 1))
                elif o == entity:
                    results.append((s, p, o, depth))
                    queue.append((s, depth + 1))
        return results
# Build a sample knowledge graph describing the GraphRAG ecosystem.
kg = SimpleKnowledgeGraph()
_seed_triples = [
    ("GraphRAG", "USES", "Knowledge Graph"),
    ("GraphRAG", "PUBLISHED_BY", "Microsoft"),
    ("GraphRAG", "USES", "Leiden Algorithm"),
    ("Knowledge Graph", "STORED_IN", "Neo4j"),
    ("Neo4j", "IS_A", "Graph Database"),
    ("Microsoft", "PUBLISHED", "GraphRAG Paper"),
    ("Leiden Algorithm", "PERFORMS", "Community Detection"),
    ("LlamaIndex", "PROVIDES", "PropertyGraphIndex"),
    ("PropertyGraphIndex", "USES", "Knowledge Graph"),
]
for subj, pred, obj in _seed_triples:
    kg.add_triple(subj, pred, obj)

print(f"Knowledge Graph: {len(kg.entities)} entities, {len(kg.triples)} triples")
print(f"\nAll triples:")
for subj, pred, obj in kg.triples:
    print(f" ({subj}) --[{pred}]--> ({obj})")

# Query the graph: outgoing edges from one entity, then a 2-hop neighborhood.
print("Query: What does GraphRAG use?")
results = kg.query_subject("GraphRAG")
for predicate, obj in results:
    print(f" GraphRAG --[{predicate}]--> {obj}")

print("\nTraversal from GraphRAG (2 hops):")
traversal = kg.traverse("GraphRAG", max_hops=2)
for subj, pred, obj, depth in traversal:
    indent = " " * (depth + 1)
    print(f"{indent}({subj}) --[{pred}]--> ({obj}) [hop {depth}]")
# 4. Entity and Relationship Extraction
Use an LLM to extract structured triples from unstructured text.
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Deterministic decoding (temperature=0) keeps extraction runs repeatable.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Ask the LLM for one "(e1) --[REL]--> (e2)" line per relationship it finds.
_extraction_template = (
    "Extract all entities and relationships from the text below.\n\n"
    "For each relationship, output one line in the format:\n"
    "(entity1) --[RELATIONSHIP]--> (entity2)\n\n"
    "Text: {text}\n\n"
    "Extracted triples:"
)
EXTRACTION_PROMPT = ChatPromptTemplate.from_template(_extraction_template)

# Pipeline: prompt -> LLM -> plain-string output.
extract_chain = EXTRACTION_PROMPT | llm | StrOutputParser()

sample_text = """
Microsoft Research published the GraphRAG paper in 2024, authored by Darren Edge
and others. GraphRAG uses the Leiden algorithm for community detection on knowledge
graphs. The system extracts entities and relationships from documents using GPT-4,
then builds a hierarchical community structure. Community summaries enable answering
global questions that standard RAG cannot handle.
"""

extracted = extract_chain.invoke({"text": sample_text})
print("Extracted triples:")
print(extracted)
# 5. Microsoft GraphRAG Pipeline
Microsoft’s GraphRAG builds a hierarchical community structure and uses community summaries for global questions.
# Microsoft GraphRAG workflow (conceptual overview): index-time steps 1-5
# build the graph and its community summaries; step 6 lists the query modes.
graphrag_pipeline = [
    "1. Source Documents → Split into TextUnits",
    "2. LLM extracts Entities & Relationships from each TextUnit",
    "3. Build Entity Knowledge Graph",
    "4. Leiden Hierarchical Clustering → Community detection",
    "5. Generate Community Summaries (bottom-up)",
    "6. Query Phase:",
    " - Global Search: Map-reduce over community summaries",
    " - Local Search: Find entity → traverse neighbors",
    " - DRIFT Search: Entity matching + community context",
]

print("Microsoft GraphRAG Pipeline:")
print("=" * 50)
for stage in graphrag_pipeline:
    print(f" {stage}")

# CLI commands for running the official implementation end to end.
print("\nTo use Microsoft GraphRAG:")
_cli_commands = [
    "pip install graphrag",
    "graphrag init --root ./my_project",
    "graphrag prompt-tune --root ./my_project",
    "graphrag index --root ./my_project",
    "graphrag query --root ./my_project --method global --query 'What are the main themes?'",
]
for command in _cli_commands:
    print(f" {command}")
# 6. LlamaIndex PropertyGraphIndex
LlamaIndex extracts a knowledge graph directly from documents and supports both graph traversal and vector retrieval.
from llama_index.core import SimpleDirectoryReader, PropertyGraphIndex, Document, Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Shared defaults: deterministic LLM for extraction, small embedding model.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.0)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Sample corpus: three short passages about Paul Graham's companies.
_texts = [
    "Paul Graham started Viaweb in 1995. Viaweb was acquired by Yahoo in 1998. "
    "Yahoo renamed Viaweb to Yahoo Store.",
    "Paul Graham co-founded Y Combinator in 2005 with Jessica Livingston. "
    "Y Combinator has funded over 4000 startups including Airbnb and Stripe.",
    "Interleaf built impressive document publishing technology in the 1990s "
    "but was eventually overtaken by cheaper desktop publishing software.",
]
documents = [Document(text=t) for t in _texts]

# Building the index triggers LLM-driven entity/relationship extraction.
index = PropertyGraphIndex.from_documents(
    documents,
    show_progress=True,
)
print("PropertyGraphIndex built successfully")

# Query the graph two ways: raw triples first, then a full RAG answer.
retriever = index.as_retriever(include_text=False)
nodes = retriever.retrieve("What happened at Viaweb?")
print("Graph triples:")
for node in nodes:
    print(f" {node.text}")

query_engine = index.as_query_engine(include_text=True)
response = query_engine.query("What happened at Viaweb?")
print(f"\nAnswer: {response}")
# 7. Neo4j + LangChain: Cypher Queries
LangChain’s GraphCypherQAChain translates natural language to Cypher queries for Neo4j.
Note: This section requires a running Neo4j instance. The code below shows the pattern.
# Neo4j + LangChain integration pattern
# Requires: pip install langchain-neo4j neo4j
# from langchain_neo4j import Neo4jGraph
# from langchain.chains import GraphCypherQAChain
# from langchain_openai import ChatOpenAI
#
# graph = Neo4jGraph(
# url="neo4j+s://your-instance.databases.neo4j.io",
# username="neo4j",
# password="your-password",
# )
#
# cypher_chain = GraphCypherQAChain.from_llm(
# cypher_llm=ChatOpenAI(temperature=0, model="gpt-4o"),
# qa_llm=ChatOpenAI(temperature=0, model="gpt-4o-mini"),
# graph=graph,
# verbose=True,
# )
#
# # Aggregation query (impossible with vector search)
# cypher_chain.run("How many open tickets are there?")
# # LLM generates: MATCH (t:Task {status: 'Open'}) RETURN count(*)
#
# # Graph traversal
# cypher_chain.run("Which services depend on Database?")
# # LLM generates: MATCH (s)-[:DEPENDS_ON]->(:Service {name: 'Database'}) RETURN s.name
print("Neo4j + LangChain pattern shown above (requires running Neo4j instance).")
print("\nKey capabilities:")
print(" - Natural language → Cypher query translation")
print(" - Aggregation queries (COUNT, SUM, GROUP BY)")
print(" - Multi-hop graph traversal")
print(" - Relationship analysis")8. Hybrid Agent: Vector + Graph
Combine both retrieval modes: route semantic questions to vector search and structural questions to graph queries.
from pydantic import BaseModel, Field

class QueryRoute(BaseModel):
    """Structured routing decision: which retrieval strategy fits the query."""
    strategy: str = Field(
        description="One of: 'vector' for semantic similarity, 'graph' for structural/relational, 'both' for combined"
    )
    reasoning: str = Field(description="Why this strategy was chosen")

# Constrain the LLM's output to a QueryRoute instance.
router = llm.with_structured_output(QueryRoute)

route_prompt = ChatPromptTemplate.from_template(
    "Given the query, decide the best retrieval strategy:\n"
    "- 'vector': semantic similarity questions (what, how, explain)\n"
    "- 'graph': structural/relational questions (who, which, how many, depends on)\n"
    "- 'both': questions needing both semantic and structural context\n\n"
    "Query: {query}"
)
route_chain = route_prompt | router

# A mix of semantic, relational, and aggregation-style questions.
test_queries = [
    "What is GraphRAG?",
    "Which companies funded by Y Combinator went public?",
    "How many startups has Y Combinator funded?",
    "Explain how community detection works in GraphRAG",
    "What is the relationship between Paul Graham and Y Combinator?",
]

print("Query Routing Results:")
print("=" * 60)
for question in test_queries:
    decision = route_chain.invoke({"query": question})
    print(f"\nQ: {question}")
    print(f" Strategy: {decision.strategy}")
    print(f" Reason: {decision.reasoning}")