Retrieval-Augmented Generation (RAG) lets you build AI systems that answer questions about your own documents — contracts, documentation, codebases, or any text corpus. This guide shows you how to build a complete document Q&A system on your VPS using LangChain, a vector database, and a local or remote LLM.
How RAG Works
RAG combines document retrieval with LLM generation in three stages:
- Ingestion: Documents are split into chunks, converted to embeddings (numerical vectors), and stored in a vector database
- Retrieval: When a user asks a question, the system finds the most relevant document chunks using semantic similarity
- Generation: The retrieved chunks are fed as context to the LLM along with the question, producing a grounded answer
System Architecture
User Question
│
▼
┌─────────────┐ ┌──────────────┐
│ Embedding │────▶│ Vector Store │
│ Model │ │ (ChromaDB) │
└─────────────┘ └──────┬───────┘
│ Top-K chunks
▼
┌──────────────┐
│ LLM │
│ (Ollama/API) │
└──────┬───────┘
│
▼
Grounded Answer
Installation
# Create project directory
mkdir -p /opt/doc-qa && cd /opt/doc-qa

# Create Python virtual environment (isolates project dependencies from system Python)
python3 -m venv venv
source venv/bin/activate

# Install dependencies:
#  - langchain* packages: RAG orchestration and integrations
#  - chromadb / sentence-transformers: vector store and embedding backend
#  - unstructured / pypdf / python-docx: document loaders
#  - fastapi / uvicorn / python-multipart: REST API layer
pip install langchain langchain-community langchain-chroma \
    langchain-ollama langchain-huggingface \
    chromadb sentence-transformers \
    unstructured pypdf python-docx \
    fastapi uvicorn python-multipart
Set Up Ollama for Local LLM
# Install Ollama if not already installed
# NOTE(review): piping curl to sh executes a remote script with your privileges —
# download and inspect it first if that matters in your environment.
curl -fsSL https://ollama.ai/install.sh | sh

# Pull models for RAG
ollama pull llama3.1:8b      # For generation
ollama pull nomic-embed-text # For embeddings
Building the RAG Pipeline
Document Ingestion
# ingest.py
from langchain_community.document_loaders import (
    DirectoryLoader, PyPDFLoader,
    UnstructuredWordDocumentLoader,
    TextLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import os  # NOTE(review): unused in this script — kept for compatibility

# Configure embedding model.
# Must match the embedding model used at query time (query.py): vectors
# produced by different models are not comparable.
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    model_kwargs={"trust_remote_code": True},      # model repo ships custom code
    encode_kwargs={"normalize_embeddings": True}   # unit-length vectors
)
# Load documents from a directory
def load_documents(docs_dir: str):
loaders = {
"**/*.pdf": PyPDFLoader,
"**/*.txt": TextLoader,
"**/*.md": TextLoader,
"**/*.docx": UnstructuredWordDocumentLoader,
}
documents = []
for glob_pattern, loader_cls in loaders.items():
loader = DirectoryLoader(
docs_dir,
glob=glob_pattern,
loader_cls=loader_cls,
show_progress=True
)
documents.extend(loader.load())
return documents
# Split documents into chunks
def split_documents(documents):
splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
length_function=len,
separators=["\n\n", "\n", ". ", " ", ""]
)
return splitter.split_documents(documents)
# Create vector store
def create_vectorstore(chunks, persist_dir="/opt/doc-qa/vectordb"):
vectorstore = Chroma.from_documents(
documents=chunks,
embedding=embeddings,
persist_directory=persist_dir,
collection_name="documents"
)
print(f"Indexed {len(chunks)} chunks into vector store")
return vectorstore
if __name__ == "__main__":
    # Full ingestion pipeline: load -> chunk -> embed & persist.
    all_docs = load_documents("/opt/doc-qa/documents")
    create_vectorstore(split_documents(all_docs))
Query Engine
# query.py
from langchain_ollama import ChatOllama
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Embedding model — must be the same model used during ingestion (ingest.py)
# so query vectors live in the same space as the indexed chunks.
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    model_kwargs={"trust_remote_code": True}
)

# Open the Chroma collection persisted by ingest.py.
vectorstore = Chroma(
    persist_directory="/opt/doc-qa/vectordb",
    embedding_function=embeddings,
    collection_name="documents"
)

# Local LLM served by Ollama.
llm = ChatOllama(
    model="llama3.1:8b",
    temperature=0.1,  # low temperature keeps answers close to the context
    num_ctx=8192      # context window (tokens) for prompt + retrieved chunks
)
# Prompt instructing the model to answer only from retrieved context,
# refuse when the context is insufficient, and cite its sources.
PROMPT_TEMPLATE = """Use the following context to answer the question.
If the answer is not in the context, say "I don't have enough information to answer that."
Always cite which document section your answer comes from.
Context:
{context}
Question: {question}
Answer:"""

prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["context", "question"]
)
# Retrieval-augmented QA chain: retrieve the 5 most relevant-yet-diverse
# chunks via MMR (from 20 candidates), "stuff" them all into one prompt,
# and generate the answer.
# NOTE(review): RetrievalQA is a legacy LangChain chain — newer releases
# recommend create_retrieval_chain / LCEL; verify against the installed version.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 5, "fetch_k": 20}
    ),
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True  # expose retrieved chunks for citation
)
def ask(question: str):
    """Run the QA chain for *question*.

    Returns a dict with the generated "answer" and a "sources" list holding
    a 200-character preview and the origin path of each retrieved chunk.
    """
    outcome = qa_chain.invoke({"query": question})
    sources = []
    for doc in outcome["source_documents"]:
        sources.append({
            "content": doc.page_content[:200],
            "source": doc.metadata.get("source", "unknown"),
        })
    return {"answer": outcome["result"], "sources": sources}
REST API with FastAPI
# api.py
from fastapi import FastAPI, UploadFile, File
from query import ask
# BUG FIX: load_documents / split_documents / create_vectorstore are defined
# in ingest.py, not query.py — importing them from query raised ImportError
# as soon as the API process started.
from ingest import create_vectorstore, load_documents, split_documents
import shutil, os

app = FastAPI(title="Document Q&A API")
@app.post("/ask")
async def ask_question(question: str):
    """Answer *question* against the indexed documents via the QA chain."""
    return ask(question)
@app.post("/upload")
async def upload_document(file: UploadFile = File(...)):
    """Save an uploaded file into the documents directory.

    SECURITY FIX: the client-supplied filename is reduced to its basename so
    a crafted name such as "../../etc/cron.d/evil" cannot write outside the
    upload directory (path traversal).
    """
    upload_dir = "/opt/doc-qa/documents"
    os.makedirs(upload_dir, exist_ok=True)
    safe_name = os.path.basename(file.filename or "")
    if not safe_name:
        return {"status": "error", "detail": "missing filename"}
    file_path = os.path.join(upload_dir, safe_name)
    with open(file_path, "wb") as f:
        shutil.copyfileobj(file.file, f)
    return {"status": "uploaded", "filename": safe_name}
@app.post("/reindex")
async def reindex():
    """Rebuild the vector store from everything in the documents directory."""
    chunk_list = split_documents(load_documents("/opt/doc-qa/documents"))
    create_vectorstore(chunk_list)
    return {"status": "reindexed", "chunks": len(chunk_list)}

# Run: uvicorn api:app --host 0.0.0.0 --port 8000
Chunking Strategies
The quality of your RAG system depends heavily on how you chunk documents:
- RecursiveCharacterTextSplitter: Best general-purpose splitter, respects document structure
- Chunk size 800-1200 tokens: Sweet spot for most use cases (note: the example splitter above passes length_function=len, so its chunk_size of 1000 is measured in characters, not tokens)
- Overlap 150-250 tokens: Ensures context isn't lost at chunk boundaries
- Semantic chunking: Use sentence embeddings to split at topic boundaries for better results
Production Considerations
- Use MMR retrieval (Maximal Marginal Relevance) to get diverse, non-redundant results
- Implement caching for repeated queries to reduce inference costs
- Add metadata filtering to let users scope queries to specific document types or dates
- Monitor retrieval quality by logging queries, retrieved chunks, and user feedback
- Scale ChromaDB with client-server mode for larger document collections