Advanced Retrieval Strategies for RAG: From Dense to Hybrid Search

Introduction

Retrieval is the foundation of RAG systems: the quality of the retrieved documents sets a ceiling on the quality of generation. Different retrieval strategies excel in different scenarios: dense retrieval captures semantic similarity, sparse retrieval handles exact keyword matches, and hybrid approaches combine both. This guide covers advanced retrieval techniques: embedding-based dense retrieval, BM25 and sparse methods, hybrid search strategies, query expansion and transformation, multi-stage retrieval with reranking, and optimizing retrieval for your specific use case. These patterns help you build RAG systems that reliably surface the right information.

Figure: the four retrieval strategies covered in this guide (dense embeddings, sparse BM25, hybrid fusion, and two-stage reranking).

Dense Retrieval with Embeddings

from openai import OpenAI
import numpy as np
from typing import Optional

client = OpenAI()

class DenseRetriever:
    """Dense retrieval using embeddings."""
    
    def __init__(self, model: str = "text-embedding-3-small"):
        self.model = model
        self.documents: list[dict] = []
        self.embeddings: Optional[np.ndarray] = None
    
    def embed(self, texts: list[str]) -> np.ndarray:
        """Get embeddings for texts."""
        
        response = client.embeddings.create(
            model=self.model,
            input=texts
        )
        
        return np.array([e.embedding for e in response.data])
    
    def add_documents(self, documents: list[dict]):
        """Add documents to the index."""
        
        texts = [doc["content"] for doc in documents]
        new_embeddings = self.embed(texts)
        
        if self.embeddings is None:
            self.embeddings = new_embeddings
        else:
            self.embeddings = np.vstack([self.embeddings, new_embeddings])
        
        self.documents.extend(documents)
    
    def search(
        self,
        query: str,
        k: int = 5,
        threshold: float = 0.0
    ) -> list[dict]:
        """Search for similar documents."""
        
        query_embedding = self.embed([query])[0]
        
        # Cosine similarity
        similarities = np.dot(self.embeddings, query_embedding) / (
            np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
        )
        
        # Get top-k indices
        top_indices = np.argsort(similarities)[::-1][:k]
        
        results = []
        for idx in top_indices:
            score = similarities[idx]
            if score >= threshold:
                results.append({
                    **self.documents[idx],
                    "score": float(score)
                })
        
        return results

# Usage
retriever = DenseRetriever()

documents = [
    {"id": "1", "content": "Python is a programming language known for its simplicity."},
    {"id": "2", "content": "Machine learning models can learn patterns from data."},
    {"id": "3", "content": "Neural networks are inspired by biological neurons."},
]

retriever.add_documents(documents)

results = retriever.search("What is deep learning?", k=2)
for r in results:
    print(f"Score: {r['score']:.3f} - {r['content'][:50]}...")

Sparse Retrieval with BM25

# pip install rank_bm25

from rank_bm25 import BM25Okapi
import numpy as np
import re
from typing import Callable, Optional

class SparseRetriever:
    """Sparse retrieval using BM25."""
    
    def __init__(self, tokenizer: Optional[Callable[[str], list[str]]] = None):
        self.tokenizer = tokenizer or self._default_tokenizer
        self.documents: list[dict] = []
        self.bm25: Optional[BM25Okapi] = None
        self.corpus: list[list[str]] = []
    
    def _default_tokenizer(self, text: str) -> list[str]:
        """Simple tokenizer."""
        # Lowercase and split on non-alphanumeric
        text = text.lower()
        tokens = re.findall(r'\b\w+\b', text)
        return tokens
    
    def add_documents(self, documents: list[dict]):
        """Add documents to the index."""
        
        for doc in documents:
            tokens = self.tokenizer(doc["content"])
            self.corpus.append(tokens)
            self.documents.append(doc)
        
        # Rebuild BM25 index
        self.bm25 = BM25Okapi(self.corpus)
    
    def search(self, query: str, k: int = 5) -> list[dict]:
        """Search using BM25."""
        
        query_tokens = self.tokenizer(query)
        scores = self.bm25.get_scores(query_tokens)
        
        # Get top-k indices
        top_indices = np.argsort(scores)[::-1][:k]
        
        results = []
        for idx in top_indices:
            if scores[idx] > 0:
                results.append({
                    **self.documents[idx],
                    "score": float(scores[idx])
                })
        
        return results

# Usage
sparse_retriever = SparseRetriever()
sparse_retriever.add_documents(documents)

results = sparse_retriever.search("programming language", k=2)
for r in results:
    print(f"Score: {r['score']:.3f} - {r['content'][:50]}...")

Hybrid Search

class HybridRetriever:
    """Combine dense and sparse retrieval."""
    
    def __init__(
        self,
        dense_weight: float = 0.5,
        sparse_weight: float = 0.5
    ):
        self.dense = DenseRetriever()
        self.sparse = SparseRetriever()
        self.dense_weight = dense_weight
        self.sparse_weight = sparse_weight
    
    def add_documents(self, documents: list[dict]):
        """Add documents to both indexes."""
        self.dense.add_documents(documents)
        self.sparse.add_documents(documents)
    
    def _normalize_scores(self, results: list[dict]) -> dict[str, float]:
        """Normalize scores to 0-1 range."""
        
        if not results:
            return {}
        
        scores = [r["score"] for r in results]
        min_score = min(scores)
        max_score = max(scores)
        range_score = max_score - min_score if max_score != min_score else 1
        
        return {
            r["id"]: (r["score"] - min_score) / range_score
            for r in results
        }
    
    def search(
        self,
        query: str,
        k: int = 5,
        dense_k: int = 20,
        sparse_k: int = 20
    ) -> list[dict]:
        """Hybrid search combining dense and sparse."""
        
        # Get results from both
        dense_results = self.dense.search(query, k=dense_k)
        sparse_results = self.sparse.search(query, k=sparse_k)
        
        # Normalize scores
        dense_scores = self._normalize_scores(dense_results)
        sparse_scores = self._normalize_scores(sparse_results)
        
        # Combine scores
        all_ids = set(dense_scores.keys()) | set(sparse_scores.keys())
        combined_scores = {}
        
        for doc_id in all_ids:
            dense_score = dense_scores.get(doc_id, 0) * self.dense_weight
            sparse_score = sparse_scores.get(doc_id, 0) * self.sparse_weight
            combined_scores[doc_id] = dense_score + sparse_score
        
        # Sort by combined score
        sorted_ids = sorted(combined_scores.keys(), key=lambda x: combined_scores[x], reverse=True)
        
        # Build results
        doc_map = {d["id"]: d for d in self.dense.documents}
        results = []
        
        for doc_id in sorted_ids[:k]:
            doc = doc_map[doc_id]
            results.append({
                **doc,
                "score": combined_scores[doc_id],
                "dense_score": dense_scores.get(doc_id, 0),
                "sparse_score": sparse_scores.get(doc_id, 0)
            })
        
        return results

# Usage
hybrid = HybridRetriever(dense_weight=0.7, sparse_weight=0.3)
hybrid.add_documents(documents)

results = hybrid.search("neural network programming", k=3)
for r in results:
    print(f"Combined: {r['score']:.3f} (dense: {r['dense_score']:.3f}, sparse: {r['sparse_score']:.3f})")

Query Expansion and Transformation

import json

class QueryTransformer:
    """Transform queries for better retrieval."""
    
    def expand_query(self, query: str) -> list[str]:
        """Expand query with synonyms and related terms."""
        
        prompt = f"""Generate 3 alternative phrasings of this search query.
Keep the same intent but use different words.

Query: {query}

Return JSON: {{"alternatives": ["query1", "query2", "query3"]}}"""
        
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        return [query] + data.get("alternatives", [])
    
    def decompose_query(self, query: str) -> list[str]:
        """Break complex query into sub-queries."""
        
        prompt = f"""Break this complex query into simpler sub-queries.
Each sub-query should retrieve part of the information needed.

Query: {query}

Return JSON: {{"sub_queries": ["query1", "query2", ...]}}"""
        
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        return data.get("sub_queries", [query])
    
    def hypothetical_document(self, query: str) -> str:
        """Generate hypothetical document that would answer the query (HyDE)."""
        
        prompt = f"""Write a short paragraph that would be a perfect answer to this question.
Write as if you're creating a document that contains the answer.

Question: {query}

Write 2-3 sentences that directly answer the question."""
        
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}]
        )
        
        return response.choices[0].message.content

class EnhancedRetriever:
    """Retriever with query transformation."""
    
    def __init__(self, base_retriever):
        self.retriever = base_retriever
        self.transformer = QueryTransformer()
    
    def search_with_expansion(self, query: str, k: int = 5) -> list[dict]:
        """Search with query expansion."""
        
        queries = self.transformer.expand_query(query)
        
        all_results = {}
        for q in queries:
            results = self.retriever.search(q, k=k)
            for r in results:
                if r["id"] not in all_results or r["score"] > all_results[r["id"]]["score"]:
                    all_results[r["id"]] = r
        
        # Sort by score
        sorted_results = sorted(all_results.values(), key=lambda x: x["score"], reverse=True)
        return sorted_results[:k]
    
    def search_with_hyde(self, query: str, k: int = 5) -> list[dict]:
        """Search using HyDE (Hypothetical Document Embeddings)."""
        
        # Generate hypothetical document
        hypo_doc = self.transformer.hypothetical_document(query)
        
        # Search using the hypothetical document as query
        return self.retriever.search(hypo_doc, k=k)

# Usage
enhanced = EnhancedRetriever(retriever)

# Query expansion
results = enhanced.search_with_expansion("How do neural networks learn?", k=3)

# HyDE
results = enhanced.search_with_hyde("What is backpropagation?", k=3)
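
decompose_query is defined above but not shown in use; a sketch of one way to merge sub-query results, deduplicating by id and keeping each document's best score:

# Sketch: retrieve per sub-query, deduplicate by id, keep the best score.
def search_with_decomposition(enhanced: EnhancedRetriever, query: str, k: int = 5) -> list[dict]:
    merged: dict[str, dict] = {}
    for sub_query in enhanced.transformer.decompose_query(query):
        for r in enhanced.retriever.search(sub_query, k=k):
            if r["id"] not in merged or r["score"] > merged[r["id"]]["score"]:
                merged[r["id"]] = r
    return sorted(merged.values(), key=lambda x: x["score"], reverse=True)[:k]

results = search_with_decomposition(enhanced, "How do neural networks learn from data?", k=3)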

Reranking

# pip install sentence-transformers

from sentence_transformers import CrossEncoder
from typing import Optional

class Reranker:
    """Rerank retrieved documents for better relevance."""
    
    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.model = CrossEncoder(model_name)
    
    def rerank(
        self,
        query: str,
        documents: list[dict],
        k: Optional[int] = None
    ) -> list[dict]:
        """Rerank documents using cross-encoder."""
        
        if not documents:
            return []
        
        # Prepare pairs
        pairs = [(query, doc["content"]) for doc in documents]
        
        # Get scores
        scores = self.model.predict(pairs)
        
        # Add scores to documents
        for doc, score in zip(documents, scores):
            doc["rerank_score"] = float(score)
        
        # Sort by rerank score
        sorted_docs = sorted(documents, key=lambda x: x["rerank_score"], reverse=True)
        
        if k is not None:
            return sorted_docs[:k]
        return sorted_docs

class LLMReranker:
    """Rerank using LLM for more nuanced relevance."""
    
    def rerank(
        self,
        query: str,
        documents: list[dict],
        k: int = 5
    ) -> list[dict]:
        """Rerank using LLM."""
        
        if len(documents) <= k:
            return documents
        
        # Format documents for LLM
        doc_texts = "\n\n".join([
            f"[{i}] {doc['content'][:500]}"
            for i, doc in enumerate(documents)
        ])
        
        prompt = f"""Rank these documents by relevance to the query.
Return the indices of the top {k} most relevant documents in order.

Query: {query}

Documents:
{doc_texts}

Return JSON: {{"ranking": [index1, index2, ...]}}"""
        
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        ranking = data.get("ranking", list(range(k)))
        
        return [documents[i] for i in ranking if i < len(documents)]

# Two-stage retrieval
class TwoStageRetriever:
    """Retrieve then rerank for better results."""
    
    def __init__(self, retriever, reranker):
        self.retriever = retriever
        self.reranker = reranker
    
    def search(
        self,
        query: str,
        k: int = 5,
        retrieve_k: int = 20
    ) -> list[dict]:
        """Two-stage retrieval: retrieve many, rerank to top-k."""
        
        # Stage 1: Retrieve candidates
        candidates = self.retriever.search(query, k=retrieve_k)
        
        # Stage 2: Rerank
        reranked = self.reranker.rerank(query, candidates, k=k)
        
        return reranked

# Usage
reranker = Reranker()
two_stage = TwoStageRetriever(hybrid, reranker)

results = two_stage.search("machine learning basics", k=3, retrieve_k=10)
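
The LLMReranker drops into the same two-stage pipeline; it is slower and costlier per query than the cross-encoder, so it fits low-volume, high-stakes searches better:

# Swap in the LLM-based reranker; same two-stage interface.
llm_two_stage = TwoStageRetriever(hybrid, LLMReranker())
results = llm_two_stage.search("machine learning basics", k=3, retrieve_k=10)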

Contextual Retrieval

class ContextualRetriever:
    """Add context to chunks before embedding."""
    
    def __init__(self, retriever):
        self.retriever = retriever
    
    def add_context_to_chunk(
        self,
        chunk: str,
        document_context: str
    ) -> str:
        """Add document context to chunk for better embeddings."""
        
        prompt = f"""Generate a brief context for this chunk based on the full document.
The context should help understand what this chunk is about.

Document context: {document_context[:1000]}

Chunk: {chunk}

Write 1-2 sentences of context to prepend to the chunk."""
        
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100
        )
        
        context = response.choices[0].message.content
        return f"{context}\n\n{chunk}"
    
    def add_documents_with_context(
        self,
        documents: list[dict],
        chunk_size: int = 500
    ):
        """Chunk documents and add context."""
        
        contextualized_chunks = []
        
        for doc in documents:
            content = doc["content"]
            doc_context = content[:500]  # Use beginning as context
            
            # Simple chunking
            chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]
            
            for i, chunk in enumerate(chunks):
                contextualized = self.add_context_to_chunk(chunk, doc_context)
                contextualized_chunks.append({
                    "id": f"{doc['id']}_chunk_{i}",
                    "content": contextualized,
                    "original_content": chunk,
                    "source_doc": doc["id"]
                })
        
        self.retriever.add_documents(contextualized_chunks)

# Usage
contextual = ContextualRetriever(DenseRetriever())
contextual.add_documents_with_context(documents)
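
Search runs over the contextualized text, but at generation time you typically want the original chunk; a sketch using the original_content and source_doc fields stored above:

# Search the contextualized index, then hand the original chunks to the LLM.
hits = contextual.retriever.search("What are neural networks?", k=3)
for hit in hits:
    print(f"{hit['source_doc']} ({hit['score']:.3f}): {hit['original_content'][:60]}...")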

Production Retrieval Service

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class SearchRequest(BaseModel):
    query: str
    k: int = 5
    strategy: str = "hybrid"  # dense, sparse, hybrid
    use_reranking: bool = True
    use_query_expansion: bool = False

class SearchResult(BaseModel):
    id: str
    content: str
    score: float
    metadata: dict = {}

class SearchResponse(BaseModel):
    results: list[SearchResult]
    query_variants: list[str] = []

# Initialize retrievers
dense_retriever = DenseRetriever()
sparse_retriever = SparseRetriever()
hybrid_retriever = HybridRetriever()
reranker = Reranker()
transformer = QueryTransformer()

@app.post("/search", response_model=SearchResponse)
async def search(request: SearchRequest):
    """Search documents with configurable strategy."""
    
    query_variants = [request.query]
    
    # Query expansion
    if request.use_query_expansion:
        query_variants = transformer.expand_query(request.query)
    
    # Select retriever
    if request.strategy == "dense":
        retriever = dense_retriever
    elif request.strategy == "sparse":
        retriever = sparse_retriever
    else:
        retriever = hybrid_retriever
    
    # Retrieve
    retrieve_k = request.k * 4 if request.use_reranking else request.k
    
    all_results = {}
    for query in query_variants:
        results = retriever.search(query, k=retrieve_k)
        for r in results:
            if r["id"] not in all_results or r["score"] > all_results[r["id"]]["score"]:
                all_results[r["id"]] = r
    
    candidates = list(all_results.values())
    
    # Rerank
    if request.use_reranking and candidates:
        candidates = reranker.rerank(request.query, candidates, k=request.k)
    else:
        candidates = sorted(candidates, key=lambda x: x["score"], reverse=True)[:request.k]
    
    return SearchResponse(
        results=[
            SearchResult(
                id=r["id"],
                content=r["content"],
                score=r.get("rerank_score", r["score"]),
                metadata=r.get("metadata", {})
            )
            for r in candidates
        ],
        query_variants=query_variants
    )

@app.post("/index")
async def index_documents(documents: list[dict]):
    """Add documents to all indexes."""
    
    dense_retriever.add_documents(documents)
    sparse_retriever.add_documents(documents)
    hybrid_retriever.add_documents(documents)
    
    return {"indexed": len(documents)}

Conclusion

Effective retrieval requires matching the strategy to your data and queries. Dense retrieval excels at semantic similarity but may miss exact keyword matches. Sparse retrieval handles specific terms well but lacks semantic understanding. Hybrid search combines both for robust results. Query expansion and HyDE improve recall for ambiguous queries. Two-stage retrieval with reranking provides the best precision by using fast retrieval for candidates and accurate reranking for final selection. Start with hybrid search and reranking as a strong baseline, then optimize based on your specific failure cases.

