Embedding Model Selection: Choosing the Right Model for Your RAG System

Introduction

Choosing the right embedding model is critical for RAG systems, semantic search, and similarity applications. The wrong choice leads to poor retrieval quality, high costs, or unacceptable latency. OpenAI's text-embedding-3-small is cheap and fast but may miss nuanced similarities. Cohere's embed-v3 excels at multilingual content. Open-source models like BGE and E5 offer privacy and cost control. This guide covers practical embedding model selection: understanding model characteristics, benchmarking for your specific use case, balancing quality versus cost versus latency, and building evaluation pipelines that help you make data-driven decisions.

Embedding Model Selection: Benchmark Evaluation, Cost Analysis, Latency Testing

Embedding Model Registry

from dataclasses import dataclass
from typing import Optional
from enum import Enum

class EmbeddingProvider(Enum):
    """Embedding model providers."""
    
    OPENAI = "openai"
    COHERE = "cohere"
    VOYAGE = "voyage"
    HUGGINGFACE = "huggingface"
    LOCAL = "local"

@dataclass
class EmbeddingModelSpec:
    """Specification for an embedding model."""
    
    name: str
    provider: EmbeddingProvider
    dimensions: int
    max_tokens: int
    cost_per_1m_tokens: float
    supports_batching: bool = True
    supports_truncation: bool = True
    multilingual: bool = False
    description: str = ""

class EmbeddingModelRegistry:
    """Registry of available embedding models."""
    
    def __init__(self):
        self.models: dict[str, EmbeddingModelSpec] = {}
        self._register_defaults()
    
    def _register_defaults(self):
        """Register default models."""
        
        # OpenAI models
        self.register(EmbeddingModelSpec(
            name="text-embedding-3-small",
            provider=EmbeddingProvider.OPENAI,
            dimensions=1536,
            max_tokens=8191,
            cost_per_1m_tokens=0.02,
            multilingual=True,
            description="Fast, cheap, good for most use cases"
        ))
        
        self.register(EmbeddingModelSpec(
            name="text-embedding-3-large",
            provider=EmbeddingProvider.OPENAI,
            dimensions=3072,
            max_tokens=8191,
            cost_per_1m_tokens=0.13,
            multilingual=True,
            description="Higher quality, better for complex retrieval"
        ))
        
        self.register(EmbeddingModelSpec(
            name="text-embedding-ada-002",
            provider=EmbeddingProvider.OPENAI,
            dimensions=1536,
            max_tokens=8191,
            cost_per_1m_tokens=0.10,
            multilingual=False,
            description="Legacy model, still widely used"
        ))
        
        # Cohere models
        self.register(EmbeddingModelSpec(
            name="embed-english-v3.0",
            provider=EmbeddingProvider.COHERE,
            dimensions=1024,
            max_tokens=512,
            cost_per_1m_tokens=0.10,
            multilingual=False,
            description="Optimized for English retrieval"
        ))
        
        self.register(EmbeddingModelSpec(
            name="embed-multilingual-v3.0",
            provider=EmbeddingProvider.COHERE,
            dimensions=1024,
            max_tokens=512,
            cost_per_1m_tokens=0.10,
            multilingual=True,
            description="100+ languages, excellent cross-lingual"
        ))
        
        # Voyage models
        self.register(EmbeddingModelSpec(
            name="voyage-large-2",
            provider=EmbeddingProvider.VOYAGE,
            dimensions=1536,
            max_tokens=16000,
            cost_per_1m_tokens=0.12,
            multilingual=False,
            description="Long context, high quality"
        ))
        
        self.register(EmbeddingModelSpec(
            name="voyage-code-2",
            provider=EmbeddingProvider.VOYAGE,
            dimensions=1536,
            max_tokens=16000,
            cost_per_1m_tokens=0.12,
            multilingual=False,
            description="Optimized for code retrieval"
        ))
        
        # Open source models
        self.register(EmbeddingModelSpec(
            name="BAAI/bge-large-en-v1.5",
            provider=EmbeddingProvider.HUGGINGFACE,
            dimensions=1024,
            max_tokens=512,
            cost_per_1m_tokens=0.0,  # Self-hosted
            multilingual=False,
            description="Top open-source English model"
        ))
        
        self.register(EmbeddingModelSpec(
            name="intfloat/e5-large-v2",
            provider=EmbeddingProvider.HUGGINGFACE,
            dimensions=1024,
            max_tokens=512,
            cost_per_1m_tokens=0.0,
            multilingual=False,
            description="Excellent retrieval performance"
        ))
        
        self.register(EmbeddingModelSpec(
            name="sentence-transformers/all-MiniLM-L6-v2",
            provider=EmbeddingProvider.HUGGINGFACE,
            dimensions=384,
            max_tokens=256,
            cost_per_1m_tokens=0.0,
            multilingual=False,
            description="Fast, lightweight, good baseline"
        ))
    
    def register(self, spec: EmbeddingModelSpec) -> None:
        """Register a model."""
        self.models[spec.name] = spec
    
    def get(self, name: str) -> Optional[EmbeddingModelSpec]:
        """Get model specification."""
        return self.models.get(name)
    
    def list_by_provider(self, provider: EmbeddingProvider) -> list[EmbeddingModelSpec]:
        """List models by provider."""
        return [m for m in self.models.values() if m.provider == provider]
    
    def list_multilingual(self) -> list[EmbeddingModelSpec]:
        """List multilingual models."""
        return [m for m in self.models.values() if m.multilingual]
    
    def list_by_cost(self, max_cost: float) -> list[EmbeddingModelSpec]:
        """List models under cost threshold."""
        return [m for m in self.models.values() if m.cost_per_1m_tokens <= max_cost]
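
A few quick queries against the registry show how the filters compose (a sketch using only the defaults registered above):

registry = EmbeddingModelRegistry()

# All multilingual options
for spec in registry.list_multilingual():
    print(spec.name, spec.dimensions, spec.cost_per_1m_tokens)

# Self-hostable candidates (zero marginal cost per token)
for spec in registry.list_by_provider(EmbeddingProvider.HUGGINGFACE):
    print(spec.name, spec.description)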

Unified Embedding Client

from dataclasses import dataclass
from typing import Any
import time

@dataclass
class EmbeddingResult:
    """Result of embedding generation."""
    
    embeddings: list[list[float]]
    model: str
    tokens_used: int
    latency_ms: float

class UnifiedEmbeddingClient:
    """Unified client for multiple embedding providers."""
    
    def __init__(
        self,
        openai_client: Any = None,
        cohere_client: Any = None,
        voyage_client: Any = None,
        local_model: Any = None
    ):
        self.openai = openai_client
        self.cohere = cohere_client
        self.voyage = voyage_client
        self.local = local_model
        self.registry = EmbeddingModelRegistry()
    
    async def embed(
        self,
        texts: list[str],
        model: str
    ) -> EmbeddingResult:
        """Generate embeddings using specified model."""
        
        start = time.perf_counter()
        
        spec = self.registry.get(model)
        if not spec:
            raise ValueError(f"Unknown model: {model}")
        
        if spec.provider == EmbeddingProvider.OPENAI:
            result = await self._embed_openai(texts, model)
        elif spec.provider == EmbeddingProvider.COHERE:
            result = await self._embed_cohere(texts, model)
        elif spec.provider == EmbeddingProvider.VOYAGE:
            result = await self._embed_voyage(texts, model)
        elif spec.provider == EmbeddingProvider.HUGGINGFACE:
            result = self._embed_local(texts, model)
        else:
            raise ValueError(f"Unsupported provider: {spec.provider}")
        
        result.latency_ms = (time.perf_counter() - start) * 1000
        
        return result
    
    async def _embed_openai(self, texts: list[str], model: str) -> EmbeddingResult:
        """Generate embeddings using OpenAI."""
        
        response = await self.openai.embeddings.create(
            model=model,
            input=texts
        )
        
        embeddings = [d.embedding for d in response.data]
        
        return EmbeddingResult(
            embeddings=embeddings,
            model=model,
            tokens_used=response.usage.total_tokens,
            latency_ms=0
        )
    
    async def _embed_cohere(self, texts: list[str], model: str) -> EmbeddingResult:
        """Generate embeddings using Cohere."""
        
        response = await self.cohere.embed(
            texts=texts,
            model=model,
            input_type="search_document"
        )
        
        return EmbeddingResult(
            embeddings=response.embeddings,
            model=model,
            tokens_used=len(texts) * 100,  # Rough estimate; use billed token counts from response metadata if your SDK exposes them
            latency_ms=0
        )
    
    async def _embed_voyage(self, texts: list[str], model: str) -> EmbeddingResult:
        """Generate embeddings using Voyage."""
        
        response = await self.voyage.embed(
            texts=texts,
            model=model
        )
        
        return EmbeddingResult(
            embeddings=response.embeddings,
            model=model,
            tokens_used=response.total_tokens,
            latency_ms=0
        )
    
    def _embed_local(self, texts: list[str], model: str) -> EmbeddingResult:
        """Generate embeddings using a local model."""
        
        # Assumes a sentence-transformers style model exposing .encode()
        embeddings = self.local.encode(texts).tolist()
        
        return EmbeddingResult(
            embeddings=embeddings,
            model=model,
            tokens_used=sum(len(t.split()) for t in texts),
            latency_ms=0
        )

Embedding Benchmarking

from dataclasses import dataclass
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

@dataclass
class BenchmarkQuery:
    """A benchmark query with expected results."""
    
    query: str
    relevant_docs: list[str]
    irrelevant_docs: list[str]

@dataclass
class BenchmarkResult:
    """Result of model benchmarking."""
    
    model: str
    precision_at_k: dict[int, float]
    recall_at_k: dict[int, float]
    mrr: float
    ndcg: float
    avg_latency_ms: float
    cost_per_query: float

class EmbeddingBenchmark:
    """Benchmark embedding models for retrieval quality."""
    
    def __init__(self, client: UnifiedEmbeddingClient):
        self.client = client
    
    async def benchmark_model(
        self,
        model: str,
        queries: list[BenchmarkQuery],
        k_values: list[int] | None = None
    ) -> BenchmarkResult:
        """Benchmark a model on retrieval tasks."""
        
        k_values = k_values or [1, 3, 5, 10]  # Avoid a mutable default argument
        precision_scores = {k: [] for k in k_values}
        recall_scores = {k: [] for k in k_values}
        mrr_scores = []
        ndcg_scores = []
        latencies = []
        
        for query in queries:
            if not query.relevant_docs:
                continue  # Precision, recall, and MRR need at least one relevant doc
            
            # Embed query
            query_result = await self.client.embed([query.query], model)
            query_embedding = np.array(query_result.embeddings[0])
            latencies.append(query_result.latency_ms)
            
            # Embed all documents; relevant docs come first, so an index < num_relevant marks a relevant hit
            all_docs = query.relevant_docs + query.irrelevant_docs
            doc_result = await self.client.embed(all_docs, model)
            doc_embeddings = np.array(doc_result.embeddings)
            latencies.append(doc_result.latency_ms)
            
            # Calculate similarities
            similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
            
            # Rank documents
            ranked_indices = np.argsort(similarities)[::-1]
            
            # Calculate metrics
            num_relevant = len(query.relevant_docs)
            
            for k in k_values:
                top_k = ranked_indices[:k]
                relevant_in_top_k = sum(1 for i in top_k if i < num_relevant)
                
                precision_scores[k].append(relevant_in_top_k / k)
                recall_scores[k].append(relevant_in_top_k / num_relevant)
            
            # MRR
            for rank, idx in enumerate(ranked_indices, 1):
                if idx < num_relevant:
                    mrr_scores.append(1.0 / rank)
                    break
            else:
                mrr_scores.append(0.0)
            
            # NDCG
            ndcg = self._calculate_ndcg(ranked_indices, num_relevant, max(k_values))
            ndcg_scores.append(ndcg)
        
        # Estimate per-query cost (word count as a rough token proxy; excludes document embedding cost)
        spec = self.client.registry.get(model)
        avg_tokens = sum(len(q.query.split()) for q in queries) / len(queries)
        cost_per_query = (avg_tokens / 1_000_000) * spec.cost_per_1m_tokens if spec else 0.0
        
        return BenchmarkResult(
            model=model,
            precision_at_k={k: np.mean(scores) for k, scores in precision_scores.items()},
            recall_at_k={k: np.mean(scores) for k, scores in recall_scores.items()},
            mrr=np.mean(mrr_scores),
            ndcg=np.mean(ndcg_scores),
            avg_latency_ms=np.mean(latencies),
            cost_per_query=cost_per_query
        )
    
    def _calculate_ndcg(
        self,
        ranked_indices: np.ndarray,
        num_relevant: int,
        k: int
    ) -> float:
        """Calculate NDCG@k."""
        
        dcg = 0.0
        for i, idx in enumerate(ranked_indices[:k]):
            if idx < num_relevant:
                dcg += 1.0 / np.log2(i + 2)
        
        # Ideal DCG
        idcg = sum(1.0 / np.log2(i + 2) for i in range(min(k, num_relevant)))
        
        return dcg / idcg if idcg > 0 else 0.0
    
    async def compare_models(
        self,
        models: list[str],
        queries: list[BenchmarkQuery]
    ) -> dict[str, BenchmarkResult]:
        """Compare multiple models."""
        
        results = {}
        for model in models:
            results[model] = await self.benchmark_model(model, queries)
        
        return results
    
    def rank_models(
        self,
        results: dict[str, BenchmarkResult],
        weights: dict[str, float] | None = None
    ) -> list[tuple[str, float]]:
        """Rank models by weighted score."""
        
        if weights is None:
            weights = {
                "mrr": 0.3,
                "ndcg": 0.3,
                "latency": 0.2,
                "cost": 0.2
            }
        
        # Normalize metrics
        mrr_values = [r.mrr for r in results.values()]
        ndcg_values = [r.ndcg for r in results.values()]
        latency_values = [r.avg_latency_ms for r in results.values()]
        cost_values = [r.cost_per_query for r in results.values()]
        
        scores = []
        for model, result in results.items():
            score = 0.0
            
            # Higher is better for MRR and NDCG
            if max(mrr_values) > 0:
                score += weights["mrr"] * (result.mrr / max(mrr_values))
            if max(ndcg_values) > 0:
                score += weights["ndcg"] * (result.ndcg / max(ndcg_values))
            
            # Lower is better for latency and cost
            if max(latency_values) > 0:
                score += weights["latency"] * (1 - result.avg_latency_ms / max(latency_values))
            if max(cost_values) > 0:
                score += weights["cost"] * (1 - result.cost_per_query / max(cost_values))
            else:
                score += weights["cost"]  # All candidates are free; give full cost credit
            
            scores.append((model, score))
        
        return sorted(scores, key=lambda x: x[1], reverse=True)
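
Putting it together, a minimal evaluation run might look like this (a sketch: it assumes the UnifiedEmbeddingClient from above is already wired up, and that the await calls run inside an async context such as a notebook or asyncio.run):

queries = [
    BenchmarkQuery(
        query="How do I reset my password?",
        relevant_docs=["Open Settings and choose Security to reset your password."],
        irrelevant_docs=[
            "Our refund policy allows returns within 30 days.",
            "The API rate limit is 100 requests per minute."
        ]
    ),
    # ... more hand-labeled queries from your own domain
]

benchmark = EmbeddingBenchmark(client)
results = await benchmark.compare_models(
    ["text-embedding-3-small", "text-embedding-3-large"], queries
)
for model, score in benchmark.rank_models(results):
    print(f"{model}: {score:.3f} (MRR {results[model].mrr:.3f})")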

Task-Based Model Selection

from dataclasses import dataclass
from typing import Optional
from enum import Enum

class EmbeddingTask(Enum):
    """Types of embedding tasks."""
    
    SEMANTIC_SEARCH = "semantic_search"
    DOCUMENT_RETRIEVAL = "document_retrieval"
    CODE_SEARCH = "code_search"
    MULTILINGUAL = "multilingual"
    CLUSTERING = "clustering"
    CLASSIFICATION = "classification"

@dataclass
class TaskRequirements:
    """Requirements for an embedding task."""
    
    task: EmbeddingTask
    max_latency_ms: float = 100
    max_cost_per_1m: float = 0.5
    min_dimensions: int = 256
    requires_multilingual: bool = False
    requires_long_context: bool = False
    privacy_required: bool = False

class TaskBasedSelector:
    """Select embedding model based on task requirements."""
    
    def __init__(self, registry: EmbeddingModelRegistry):
        self.registry = registry
        
        # Task to model recommendations
        self.task_recommendations = {
            EmbeddingTask.SEMANTIC_SEARCH: [
                "text-embedding-3-small",
                "BAAI/bge-large-en-v1.5",
                "intfloat/e5-large-v2"
            ],
            EmbeddingTask.DOCUMENT_RETRIEVAL: [
                "text-embedding-3-large",
                "voyage-large-2",
                "embed-english-v3.0"
            ],
            EmbeddingTask.CODE_SEARCH: [
                "voyage-code-2",
                "text-embedding-3-large"
            ],
            EmbeddingTask.MULTILINGUAL: [
                "embed-multilingual-v3.0",
                "text-embedding-3-small",
                "text-embedding-3-large"
            ],
            EmbeddingTask.CLUSTERING: [
                "text-embedding-3-small",
                "sentence-transformers/all-MiniLM-L6-v2"
            ],
            EmbeddingTask.CLASSIFICATION: [
                "text-embedding-3-small",
                "BAAI/bge-large-en-v1.5"
            ]
        }
    
    def select(self, requirements: TaskRequirements) -> list[EmbeddingModelSpec]:
        """Select models matching requirements."""
        
        candidates = []
        
        # Get task recommendations
        recommended = self.task_recommendations.get(requirements.task, [])
        
        for model_name in recommended:
            spec = self.registry.get(model_name)
            if not spec:
                continue
            
            # Check requirements (note: max_latency_ms is not filtered here,
            # since latency depends on deployment; verify it via benchmarking)
            if spec.cost_per_1m_tokens > requirements.max_cost_per_1m:
                continue
            
            if spec.dimensions < requirements.min_dimensions:
                continue
            
            if requirements.requires_multilingual and not spec.multilingual:
                continue
            
            if requirements.requires_long_context and spec.max_tokens < 4000:
                continue
            
            if requirements.privacy_required and spec.provider not in (EmbeddingProvider.HUGGINGFACE, EmbeddingProvider.LOCAL):
                continue
            
            candidates.append(spec)
        
        # Sort by cost (cheapest first)
        candidates.sort(key=lambda x: x.cost_per_1m_tokens)
        
        return candidates
    
    def recommend(self, requirements: TaskRequirements) -> Optional[EmbeddingModelSpec]:
        """Get top recommendation."""
        
        candidates = self.select(requirements)
        return candidates[0] if candidates else None
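
For example, a privacy-constrained semantic search deployment narrows quickly to the self-hosted candidates:

selector = TaskBasedSelector(EmbeddingModelRegistry())

requirements = TaskRequirements(
    task=EmbeddingTask.SEMANTIC_SEARCH,
    privacy_required=True
)

pick = selector.recommend(requirements)
print(pick.name if pick else "No candidate meets the constraints")
# -> BAAI/bge-large-en-v1.5 (text-embedding-3-small is filtered out by the privacy requirement)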

class AdaptiveModelSelector:
    """Dynamically select model based on input characteristics."""
    
    def __init__(
        self,
        registry: EmbeddingModelRegistry,
        default_model: str = "text-embedding-3-small"
    ):
        self.registry = registry
        self.default_model = default_model
    
    def select_for_text(self, text: str) -> str:
        """Select model based on text characteristics."""
        
        # Check language
        if self._is_multilingual(text):
            return "embed-multilingual-v3.0"
        
        # Check if code
        if self._is_code(text):
            return "voyage-code-2"
        
        # Check length
        word_count = len(text.split())
        if word_count > 1000:
            return "voyage-large-2"  # Long context
        
        return self.default_model
    
    def _is_multilingual(self, text: str) -> bool:
        """Heuristic: treat >10% non-ASCII characters as non-English text."""
        
        if not text:
            return False
        non_ascii = sum(1 for c in text if ord(c) > 127)
        return non_ascii / len(text) > 0.1
    
    def _is_code(self, text: str) -> bool:
        """Check if text appears to be code."""
        
        code_indicators = [
            "def ", "class ", "import ", "function",
            "const ", "let ", "var ", "return ",
            "if (", "for (", "while (", "=>",
            "{", "}", "[]", "()"
        ]
        
        indicator_count = sum(1 for ind in code_indicators if ind in text)
        return indicator_count >= 3
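
Routing a few sample inputs through the adaptive selector illustrates the heuristics:

adaptive = AdaptiveModelSelector(EmbeddingModelRegistry())

print(adaptive.select_for_text("def parse(tokens): return [t.strip() for t in tokens]"))
# -> voyage-code-2 (hits three code indicators: "def ", "return ", "()")

print(adaptive.select_for_text("¿Dónde está la estación de tren más cercana?"))
# -> embed-multilingual-v3.0 (over 10% non-ASCII characters)

print(adaptive.select_for_text("A short English sentence."))
# -> text-embedding-3-small (the default)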

Production Embedding Service

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional

app = FastAPI()

# Initialize components
registry = EmbeddingModelRegistry()
embedding_client = None  # Initialize with actual clients
benchmark = None  # Initialize with client
task_selector = TaskBasedSelector(registry)
adaptive_selector = AdaptiveModelSelector(registry)

class EmbedRequest(BaseModel):
    texts: list[str]
    model: Optional[str] = None

class SelectModelRequest(BaseModel):
    task: str
    max_latency_ms: float = 100
    max_cost_per_1m: float = 0.5
    requires_multilingual: bool = False
    requires_long_context: bool = False
    privacy_required: bool = False

class BenchmarkRequest(BaseModel):
    models: list[str]
    queries: list[dict]

@app.post("/v1/embeddings")
async def create_embeddings(request: EmbedRequest):
    """Generate embeddings."""
    
    if not request.texts:
        raise HTTPException(status_code=400, detail="texts must not be empty")
    
    # Auto-select model if not specified
    model = request.model
    if not model:
        # Use adaptive selection based on first text
        model = adaptive_selector.select_for_text(request.texts[0])
    
    result = await embedding_client.embed(request.texts, model)
    
    return {
        "embeddings": result.embeddings,
        "model": result.model,
        "tokens_used": result.tokens_used,
        "latency_ms": result.latency_ms
    }

@app.post("/v1/models/select")
async def select_model(request: SelectModelRequest):
    """Select best model for task."""
    
    try:
        task = EmbeddingTask(request.task)
    except ValueError:
        raise HTTPException(status_code=400, detail=f"Unknown task: {request.task}")
    
    requirements = TaskRequirements(
        task=task,
        max_latency_ms=request.max_latency_ms,
        max_cost_per_1m=request.max_cost_per_1m,
        requires_multilingual=request.requires_multilingual,
        requires_long_context=request.requires_long_context,
        privacy_required=request.privacy_required
    )
    
    candidates = task_selector.select(requirements)
    
    return {
        "recommended": candidates[0].name if candidates else None,
        "candidates": [
            {
                "name": c.name,
                "provider": c.provider.value,
                "dimensions": c.dimensions,
                "cost_per_1m": c.cost_per_1m_tokens,
                "description": c.description
            }
            for c in candidates
        ]
    }

@app.get("/v1/models")
async def list_models(
    provider: Optional[str] = None,
    multilingual: Optional[bool] = None,
    max_cost: Optional[float] = None
):
    """List available models."""
    
    models = list(registry.models.values())
    
    if provider:
        try:
            p = EmbeddingProvider(provider)
        except ValueError:
            raise HTTPException(status_code=400, detail=f"Unknown provider: {provider}")
        models = [m for m in models if m.provider == p]
    
    if multilingual is not None:
        models = [m for m in models if m.multilingual == multilingual]
    
    if max_cost is not None:
        models = [m for m in models if m.cost_per_1m_tokens <= max_cost]
    
    return {
        "models": [
            {
                "name": m.name,
                "provider": m.provider.value,
                "dimensions": m.dimensions,
                "max_tokens": m.max_tokens,
                "cost_per_1m": m.cost_per_1m_tokens,
                "multilingual": m.multilingual,
                "description": m.description
            }
            for m in models
        ]
    }

@app.get("/v1/models/{model_name}")
async def get_model(model_name: str):
    """Get model details."""
    
    spec = registry.get(model_name)
    if not spec:
        raise HTTPException(status_code=404, detail="Model not found")
    
    return {
        "name": spec.name,
        "provider": spec.provider.value,
        "dimensions": spec.dimensions,
        "max_tokens": spec.max_tokens,
        "cost_per_1m": spec.cost_per_1m_tokens,
        "multilingual": spec.multilingual,
        "description": spec.description
    }

@app.post("/v1/benchmark")
async def run_benchmark(request: BenchmarkRequest):
    """Benchmark models."""
    
    queries = [
        BenchmarkQuery(
            query=q["query"],
            relevant_docs=q["relevant_docs"],
            irrelevant_docs=q.get("irrelevant_docs", [])
        )
        for q in request.queries
    ]
    
    results = await benchmark.compare_models(request.models, queries)
    rankings = benchmark.rank_models(results)
    
    return {
        "results": {
            model: {
                "mrr": r.mrr,
                "ndcg": r.ndcg,
                "precision_at_5": r.precision_at_k.get(5, 0),
                "recall_at_5": r.recall_at_k.get(5, 0),
                "avg_latency_ms": r.avg_latency_ms,
                "cost_per_query": r.cost_per_query
            }
            for model, r in results.items()
        },
        "rankings": [{"model": m, "score": s} for m, s in rankings]
    }

@app.get("/health")
async def health():
    return {"status": "healthy"}

Conclusion

Embedding model selection requires balancing quality, cost, and latency for your specific use case. Start with a model registry that captures key specifications: dimensions, token limits, pricing, and capabilities. Build a unified client that abstracts provider differences, making it easy to switch models. Benchmark models on your actual data: public benchmarks like MTEB provide guidance, but your domain-specific queries matter most. Use task-based selection to narrow candidates: semantic search, code retrieval, and multilingual applications have different optimal models. Consider adaptive selection that examines input characteristics to route to appropriate models dynamically.

For production systems, text-embedding-3-small offers excellent cost-performance for most English use cases. Cohere's embed-multilingual-v3.0 excels for international applications. voyage-code-2 is purpose-built for code search. Open-source models like BGE and E5 provide privacy and cost control for self-hosted deployments. The key is measuring what matters for your application and making data-driven decisions rather than defaulting to the most popular option.

