Traditional keyword search fails when users phrase queries differently from how content is written. Semantic search using vector embeddings understands meaning, returning relevant results even when exact words don't match. This guide shows you how to build a production-ready AI search engine on your VPS.
How Semantic Search Works
Instead of matching keywords, semantic search converts text into high-dimensional vectors (embeddings) that capture meaning. Similar concepts cluster together in vector space, so "how to deploy a web application" finds results about "hosting a website" even without shared keywords.
Key Components
- Embedding Model: Converts text to vectors (we'll use sentence-transformers or Ollama)
- Vector Database: Stores and queries vectors efficiently (Qdrant, Milvus, or pgvector)
- API Layer: Handles search requests and returns ranked results
- Hybrid Search: Combines semantic and keyword search for best results
Option 1: Qdrant Vector Database
Install and Configure Qdrant
# Run Qdrant with Docker.
# Port 6333 is the HTTP/REST API, 6334 is gRPC. The host volume keeps
# vector data on disk so it survives container removal, and
# --restart unless-stopped brings Qdrant back after host reboots.
docker run -d \
--name qdrant \
-p 6333:6333 \
-p 6334:6334 \
-v /opt/qdrant/storage:/qdrant/storage \
--restart unless-stopped \
qdrant/qdrant:latest
# Verify it's running — the health endpoint should answer HTTP 200.
curl http://localhost:6333/healthz
Build the Search Engine (Python)
# search_engine.py
from qdrant_client import QdrantClient
from qdrant_client.models import (
Distance, VectorParams, PointStruct,
Filter, FieldCondition, MatchValue
)
from sentence_transformers import SentenceTransformer
import uuid
# Initialize shared, module-level resources once at import time.
client = QdrantClient(host="localhost", port=6333)  # REST port exposed by the Qdrant container
model = SentenceTransformer("all-MiniLM-L6-v2")  # compact embedding model: 384-dim output, fast on CPU
COLLECTION = "documents"  # Qdrant collection name used by every function below
VECTOR_SIZE = 384  # must match the embedding model's output dimension
# Create collection
def create_collection():
    """(Re)create the search collection with a cosine-distance vector index.

    Any existing collection of the same name is dropped first, so all
    indexed data is lost — call this once at setup time, not on every
    application start.
    """
    # recreate_collection() is deprecated (and removed in recent
    # qdrant-client releases); exists -> delete -> create is the
    # supported equivalent with identical effect.
    if client.collection_exists(collection_name=COLLECTION):
        client.delete_collection(collection_name=COLLECTION)
    client.create_collection(
        collection_name=COLLECTION,
        vectors_config=VectorParams(
            size=VECTOR_SIZE,
            distance=Distance.COSINE,
        ),
    )
# Index documents
def index_documents(documents: list[dict]):
    """Embed and index a batch of documents into the collection.

    Args:
        documents: dicts shaped like
            {"id": "...", "title": "...", "content": "...",
             "category": "...", "url": "..."}
            ("category" and "url" are optional).
    """
    points = []
    for doc in documents:
        # Embed title + content together so both contribute to relevance.
        text = f"{doc['title']}. {doc['content']}"
        embedding = model.encode(text).tolist()
        points.append(PointStruct(
            id=str(uuid.uuid4()),  # Qdrant point IDs must be UUIDs or unsigned ints
            vector=embedding,
            payload={
                "doc_id": doc["id"],
                "title": doc["title"],
                # Store only a snippet; the full text stays in the source system.
                "content": doc["content"][:500],
                "category": doc.get("category", ""),
                "url": doc.get("url", "")
            }
        ))
    # BUG FIX: client.upsert() does not accept a batch_size argument
    # (it would raise TypeError). upload_points() batches internally.
    client.upload_points(
        collection_name=COLLECTION,
        points=points,
        batch_size=100
    )
    print(f"Indexed {len(points)} documents")
# Search
def search(query: str, category: str | None = None, limit: int = 10):
    """Run a semantic search over the collection.

    Args:
        query: free-text search query.
        category: optional exact-match filter on the `category` payload field.
        limit: maximum number of hits to return.

    Returns:
        List of result dicts (title, content, category, url, score),
        ordered by descending similarity score.
    """
    query_vector = model.encode(query).tolist()

    # Optional exact-match filter on the category payload field.
    query_filter = None
    if category:
        query_filter = Filter(
            must=[FieldCondition(
                key="category",
                match=MatchValue(value=category)
            )]
        )

    results = client.search(
        collection_name=COLLECTION,
        query_vector=query_vector,
        query_filter=query_filter,
        limit=limit,
        score_threshold=0.3  # drop hits with negligible semantic similarity
    )

    # Use .get with defaults so points indexed without optional payload
    # fields don't raise KeyError.
    return [
        {
            "title": hit.payload.get("title", ""),
            "content": hit.payload.get("content", ""),
            "category": hit.payload.get("category", ""),
            "url": hit.payload.get("url", ""),
            "score": round(hit.score, 4)
        }
        for hit in results
    ]
Option 2: PostgreSQL with pgvector
If you already run PostgreSQL, pgvector adds vector search without a separate database:
# Install the pgvector extension package for PostgreSQL 16
# (adjust the version suffix to match your installed PostgreSQL major version).
sudo apt install postgresql-16-pgvector
# Enable the extension in the target database (one-time, per database).
psql -d mydb -c "CREATE EXTENSION vector;"
# Create table with vector column
-- Documents table for semantic search. The vector dimension (384)
-- must match the embedding model's output size (e.g. all-MiniLM-L6-v2).
CREATE TABLE search_documents (
id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
content TEXT NOT NULL,
category VARCHAR(100),
url TEXT,
embedding vector(384),  -- pgvector column holding the document embedding
created_at TIMESTAMP DEFAULT NOW()
);
-- HNSW index for fast approximate nearest-neighbor search with cosine
-- distance. m and ef_construction trade build time/size for recall.
CREATE INDEX ON search_documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
-- Search query. $1 is the query embedding as a vector literal.
-- FIX: the pgvector cosine-distance operator <=> was missing
-- ("embedding $1::vector" is not valid SQL). Cosine similarity is
-- 1 - cosine distance, so filter on it and order by distance ascending.
SELECT id, title, content, category,
       1 - (embedding <=> $1::vector) AS similarity
FROM search_documents
WHERE 1 - (embedding <=> $1::vector) > 0.3
ORDER BY embedding <=> $1::vector
LIMIT 10;
Hybrid Search: Best of Both Worlds
Combine semantic and keyword search for optimal results:
# hybrid_search.py
import re
from search_engine import search as semantic_search
def keyword_search(query: str, documents: list, limit: int = 20):
    """Rank documents by naive term-overlap with the query.

    Tokenizes the query and each document (title + content) into
    lowercase word sets, scores each document by the fraction of query
    terms it contains, drops documents with no overlap, and returns the
    top `limit` hits with a "keyword_score" field added.
    """
    terms = set(re.findall(r'\w+', query.lower()))
    hits = []
    for entry in documents:
        text = f"{entry['title']} {entry['content']}".lower()
        shared = terms & set(re.findall(r'\w+', text))
        if shared:
            hits.append({**entry, "keyword_score": len(shared) / len(terms)})
    hits.sort(key=lambda h: h["keyword_score"], reverse=True)
    return hits[:limit]
def hybrid_search(query: str, category: str | None = None,
                  limit: int = 10, semantic_weight: float = 0.7,
                  documents: list | None = None):
    """Fuse semantic and keyword rankings with Reciprocal Rank Fusion.

    Args:
        query: free-text search query.
        category: optional category filter passed to the semantic search.
        limit: number of fused results to return.
        semantic_weight: weight of the semantic ranking; the keyword
            ranking gets 1 - semantic_weight.
        documents: optional corpus for the keyword leg. When omitted,
            only the semantic ranking contributes (previous behavior).

    Returns:
        List of (title, rrf_score) tuples, best first.
    """
    k = 60  # standard RRF constant; dampens the dominance of top ranks
    rrf_scores: dict[str, float] = {}

    def accumulate(ranked, weight):
        # Fold one ranked list into the fused scores: each hit's
        # contribution decays with rank, scaled by the list's weight.
        for rank, result in enumerate(ranked):
            key = result["title"]
            rrf_scores[key] = rrf_scores.get(key, 0) + weight / (k + rank + 1)

    accumulate(semantic_search(query, category, limit=20), semantic_weight)
    if documents:
        # BUG FIX: the keyword leg was a TODO comment and never ran, so
        # the "hybrid" search was purely semantic. Fuse it in whenever a
        # corpus is provided.
        accumulate(keyword_search(query, documents, limit=20),
                   1.0 - semantic_weight)

    sorted_results = sorted(rrf_scores.items(),
                            key=lambda x: x[1], reverse=True)
    return sorted_results[:limit]
REST API for Search
# api.py
from fastapi import FastAPI, Query
from search_engine import search, index_documents, create_collection
app = FastAPI(title="AI Search Engine")
@app.get("/search")
async def search_endpoint(
q: str = Query(..., min_length=1),
category: str = Query(None),
limit: int = Query(10, ge=1, le=100)
):
results = search(q, category=category, limit=limit)
return {
"query": q,
"results": results,
"total": len(results)
}
@app.post("/index")
async def index_endpoint(documents: list[dict]):
index_documents(documents)
return {"status": "indexed", "count": len(documents)}
# Run: uvicorn api:app --host 0.0.0.0 --port 8000
Performance Optimization
Batch Processing for Large Datasets
# Process millions of documents efficiently
def batch_index(documents, batch_size=500):
    """Index a large document set in fixed-size batches.

    Each batch of title+content texts is encoded in a single model call
    (much faster than per-document encoding), turned into points, and
    upserted into the collection; progress is printed per batch.
    """
    for start in range(0, len(documents), batch_size):
        chunk = documents[start:start + batch_size]
        # One encode call per chunk; the model batches internally too.
        corpus = [f"{item['title']}. {item['content']}" for item in chunk]
        vectors = model.encode(corpus, batch_size=32,
                               show_progress_bar=True)
        chunk_points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vec.tolist(),
                payload=item
            )
            for item, vec in zip(chunk, vectors)
        ]
        client.upsert(COLLECTION, chunk_points)
        print(f"Indexed batch {start//batch_size + 1}")
Production Best Practices
- Choose the right embedding model: all-MiniLM-L6-v2 (fast, 384d) for general use; BGE-large or E5-large (1024d) for higher quality
- Use HNSW indexes for sub-millisecond search on millions of vectors
- Implement query caching with Redis for repeated searches
- Re-index periodically as new embedding models improve quality
- Monitor search quality with click-through rates and user feedback
- Consider hybrid search — pure semantic search can miss exact keyword matches