Basic RAG Architecture
The fundamental RAG pattern consists of three main steps:

- Document Loading: Load and prepare documents
- Vector Storage: Embed and store document chunks
- Query & Generation: Retrieve relevant context and generate answers
Complete RAG Implementation
Here's a full working example of a basic RAG system:

import os

import streamlit as st
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Initialize embedding model
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001"
)

# Initialize vector database
db = Chroma(
    collection_name="pharma_database",
    embedding_function=embedding_model,
    persist_directory="./pharma_db"
)

def format_docs(docs):
    """Format a document list into a single string."""
    return "\n\n".join(doc.page_content for doc in docs)

def add_documents_to_db(uploaded_files):
    """Process and add PDF files to the vector database."""
    os.makedirs("./temp", exist_ok=True)
    for uploaded_file in uploaded_files:
        # Save uploaded file temporarily
        temp_path = f"./temp/{uploaded_file.name}"
        with open(temp_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        # Load PDF
        loader = PyPDFLoader(temp_path)
        documents = loader.load()

        # Extract metadata and content
        doc_metadata = [doc.metadata for doc in documents]
        doc_content = [doc.page_content for doc in documents]

        # Split into token-based chunks
        text_splitter = SentenceTransformersTokenTextSplitter(
            model_name="sentence-transformers/all-mpnet-base-v2",
            tokens_per_chunk=100,
            chunk_overlap=50
        )
        chunks = text_splitter.create_documents(doc_content, doc_metadata)

        # Add to database
        db.add_documents(chunks)

def run_rag_chain(query):
    """Execute the RAG chain for a given query."""
    # Create retriever with similarity search
    retriever = db.as_retriever(
        search_type="similarity",
        search_kwargs={'k': 5}
    )

    # Define prompt template
    prompt_template = ChatPromptTemplate.from_template("""
You are a knowledgeable assistant.
Answer the question based only on the following context:

{context}

Question: {question}

Provide a detailed and accurate answer based on the context.
Don't make up information not present in the context.
""")

    # Initialize LLM
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro",
        temperature=0.7
    )

    # Create RAG chain
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | llm
        | StrOutputParser()
    )

    # Execute chain
    return rag_chain.invoke(query)

# Streamlit UI
st.title("📚 Basic RAG System")

# Query input
query = st.text_area(
    "Enter your question:",
    placeholder="What would you like to know?"
)

if st.button("Submit"):
    if query:
        with st.spinner("Searching and generating answer..."):
            result = run_rag_chain(query)
            st.write(result)
    else:
        st.warning("Please enter a question")

# Sidebar for document upload
with st.sidebar:
    st.header("Document Upload")
    uploaded_files = st.file_uploader(
        "Upload PDF documents",
        type=["pdf"],
        accept_multiple_files=True
    )
    if st.button("Process Documents"):
        if uploaded_files:
            with st.spinner("Processing documents..."):
                add_documents_to_db(uploaded_files)
            st.success("Documents added successfully!")
        else:
            st.warning("Please upload files first")
Document Loading Strategies
PDF Loader

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("document.pdf")
documents = loader.load()

# Each Document has:
# - page_content: the text content
# - metadata: {'source': 'document.pdf', 'page': 0}

Web Loader

from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://example.com/article")
# Configure the request rate before loading
loader.requests_per_second = 1
documents = loader.load()

Text Loader

from langchain_community.document_loaders import TextLoader

loader = TextLoader("document.txt", encoding="utf-8")
documents = loader.load()

Multiple Files

from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader(
    "./documents",
    glob="**/*.pdf",
    loader_cls=PyPDFLoader
)
documents = loader.load()
Text Splitting Techniques
1. Recursive Character Text Splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,        # Maximum chunk size (characters)
    chunk_overlap=100,     # Overlap between chunks
    length_function=len,   # Function to measure chunk size
    separators=["\n\n", "\n", " ", ""]  # Split hierarchy, coarsest first
)
chunks = text_splitter.split_documents(documents)
Why overlap? Chunk overlap ensures that context isn’t lost at chunk boundaries, improving retrieval quality.
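To see the effect concretely, here is a minimal sketch (toy sentence and deliberately tiny sizes, chosen only for illustration) showing that consecutive chunks share text at the boundary:

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Tiny chunks so the overlap is easy to see
demo_splitter = RecursiveCharacterTextSplitter(
    chunk_size=20,
    chunk_overlap=8,
    separators=[" "]
)
for chunk in demo_splitter.split_text("The dose was increased after week four of the trial"):
    print(repr(chunk))
# Consecutive chunks repeat a few words, so a fact that spans a
# boundary still appears whole in at least one chunk.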
2. Token-Based Splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split by tokens (useful for respecting LLM context limits)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,
    chunk_overlap=100,
    encoding_name="cl100k_base"  # GPT-4 tokenizer
)
chunks = text_splitter.split_documents(documents)
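As an optional sanity check, you can count tokens with the same encoding the splitter used; sizes should sit at or under the 500-token budget (a single unbreakable span can occasionally exceed it):

import tiktoken

# Measure each chunk with the tokenizer the splitter was built from
encoding = tiktoken.get_encoding("cl100k_base")
for chunk in chunks:
    print(len(encoding.encode(chunk.page_content)))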
3. Sentence-Transformer Token Splitting

This splitter chunks text using the embedding model's own tokenizer, so chunks never exceed what the sentence-transformer can encode:

from langchain_text_splitters import SentenceTransformersTokenTextSplitter

text_splitter = SentenceTransformersTokenTextSplitter(
    model_name="sentence-transformers/all-mpnet-base-v2",
    tokens_per_chunk=100,  # Chunk size in model tokens
    chunk_overlap=50
)
chunks = text_splitter.split_documents(documents)
Search Strategies
Similarity Search (Most Common)
# Get top-k most similar documents
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 5}
)
docs = retriever.invoke(query)
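The retriever returns Document objects, so you can inspect both provenance and content; a small illustrative loop:

# Peek at what came back: source metadata plus a content preview
for doc in docs:
    print(doc.metadata.get("source"), "->", doc.page_content[:80])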
Similarity with Score Threshold
# Only return documents above a similarity threshold
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        'score_threshold': 0.7,  # 0-1 scale; higher = stricter
        'k': 10
    }
)
docs = retriever.invoke(query)
MMR (Maximal Marginal Relevance)
# Balance relevance with diversity
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={
        'k': 5,
        'fetch_k': 20,      # Initial candidate pool
        'lambda_mult': 0.5  # 0 = max diversity, 1 = max relevance
    }
)
docs = retriever.invoke(query)
Prompt Engineering for RAG
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("""
Answer the question based on the following context:

{context}

Question: {question}

Answer:
""")
RAG Chain Patterns
1. Simple Chain
from langchain_core.runnables import RunnablePassthrough

# Linear chain: retrieve -> format -> prompt -> llm -> parse
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

response = rag_chain.invoke("What are the key findings?")
2. Chain with History
from operator import itemgetter
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory store of per-session chat histories
chat_history_store = {}

def get_session_history(session_id):
    """Return (and lazily create) the history for a session."""
    if session_id not in chat_history_store:
        chat_history_store[session_id] = ChatMessageHistory()
    return chat_history_store[session_id]

# The prompt needs a placeholder where past turns are injected
prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer based only on this context:\n\n{context}"),
    MessagesPlaceholder("history"),
    ("human", "{question}")
])

rag_chain = (
    {"context": itemgetter("question") | retriever | format_docs,
     "question": itemgetter("question"),
     "history": itemgetter("history")}
    | prompt
    | llm
)

chain_with_history = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="question",
    history_messages_key="history"
)

response = chain_with_history.invoke(
    {"question": "Tell me more"},
    config={"configurable": {"session_id": "user123"}}
)
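A follow-up call with the same session_id reuses the accumulated history:

# Second turn in the same session; "that" resolves via stored history
followup = chain_with_history.invoke(
    {"question": "Summarize that in one sentence"},
    config={"configurable": {"session_id": "user123"}}
)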
3. Multi-Query Chain
from langchain.retrievers.multi_query import MultiQueryRetriever

# Generate multiple query variations and merge the retrieved results
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(),
    llm=llm
)

rag_chain = (
    {"context": multi_query_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
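To see the query variations the LLM generates, you can enable the retriever's logger, a useful debugging aid:

import logging

# Surface the generated query variants on stdout
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)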
Metadata Filtering
# Add metadata when creating documents
from langchain_core.documents import Document

docs = [
    Document(
        page_content="Content here",
        metadata={
            "source": "paper.pdf",
            "page": 1,
            "category": "research",
            "date": "2024-01-01"
        }
    )
]
vectorstore.add_documents(docs)

# Query with metadata filters
retriever = vectorstore.as_retriever(
    search_kwargs={
        'k': 5,
        'filter': {'category': 'research'}  # Only retrieve research docs
    }
)
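Filter syntax is store-specific. With Chroma, for example, compound conditions use its where-clause operators; a sketch under that assumption:

# Chroma-style compound filter: research docs from page 1 onward
retriever = vectorstore.as_retriever(
    search_kwargs={
        'k': 5,
        'filter': {
            '$and': [
                {'category': {'$eq': 'research'}},
                {'page': {'$gte': 1}}
            ]
        }
    }
)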
Error Handling
import logging
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10)
)
def rag_query_with_retry(query: str) -> str:
    """Execute the RAG query, letting tenacity retry transient failures."""
    if retriever is None:
        return "System not initialized. Please upload documents first."
    return rag_chain.invoke(query)

# Note: catching exceptions inside the @retry-decorated function would
# swallow them before tenacity could retry, so the try/except lives here.
def safe_rag_query(query: str) -> str:
    """Surface a friendly message once all retries are exhausted."""
    try:
        return rag_query_with_retry(query)
    except Exception as e:
        logging.error(f"RAG query error: {e}")
        return f"Error processing query: {e}"
Performance Optimization
Batch Processing
# Add documents in batches rather than one giant call
batch_size = 100
for i in range(0, len(chunks), batch_size):
    vectorstore.add_documents(chunks[i:i + batch_size])

# Batch retrieval: retrievers are Runnables, so .batch() works
queries = ["query1", "query2", "query3"]
results = retriever.batch(queries)
Caching
from langchain_core.caches import InMemoryCache
from langchain_core.globals import set_llm_cache

# Cache LLM responses in memory
set_llm_cache(InMemoryCache())

# An identical prompt will reuse the cached result
response1 = rag_chain.invoke(query)
response2 = rag_chain.invoke(query)  # Served from cache
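For a cache that survives restarts, langchain_community also provides a SQLite-backed cache:

from langchain_community.cache import SQLiteCache
from langchain_core.globals import set_llm_cache

# Persist cached responses to disk instead of memory
set_llm_cache(SQLiteCache(database_path=".langchain.db"))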
Async Operations
import asyncio

async def async_rag_query(query: str):
    """Run the RAG chain without blocking the event loop."""
    return await rag_chain.ainvoke(query)

async def run_all(queries):
    # Run multiple queries concurrently
    return await asyncio.gather(*[async_rag_query(q) for q in queries])

results = asyncio.run(run_all(["query1", "query2", "query3"]))
Common Pitfalls
Chunk Size Issues: Too small = loss of context; too large = irrelevant information. Solution: Start with 500-1000 characters and adjust based on your documents.
No Overlap: Chunks without overlap can lose critical context at boundaries. Solution: Use 10-20% overlap (e.g., 100 characters for 500-character chunks).
Wrong Search Type: Using similarity when you need diversity, or vice versa. Solution: Use MMR for diverse results and plain similarity for focused retrieval.
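Putting the first two guidelines together, a reasonable starting configuration (illustrative values, not tuned for any particular corpus):

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Mid-range chunk size with ~15% overlap as a starting point
starter_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=120
)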
Next Steps
- Agentic RAG: Add reasoning and tool usage to your RAG system
- Advanced Techniques: Learn about hybrid search and corrective RAG
