Introduction
LLMs have no inherent memory; each request starts fresh. An effective memory system enables conversations that span sessions, personalization based on user history, and agents that learn from past interactions. Memory architectures range from simple conversation buffers to vector-based long-term storage with semantic retrieval. This guide covers practical memory patterns: conversation buffers, sliding windows, summary-based compression, vector store memory, and hybrid systems that combine these approaches for strong recall without wasting context.

Conversation Buffer Memory
from dataclasses import dataclass, field
from typing import Optional
from datetime import datetime
from collections import deque
@dataclass
class Message:
"""A single message in conversation."""
role: str
content: str
timestamp: datetime = field(default_factory=datetime.now)
metadata: dict = field(default_factory=dict)
class ConversationBuffer:
"""Simple buffer that stores all messages."""
def __init__(self, max_messages: int = 100):
self.messages: list[Message] = []
self.max_messages = max_messages
def add(self, role: str, content: str, **metadata):
"""Add message to buffer."""
message = Message(
role=role,
content=content,
metadata=metadata
)
self.messages.append(message)
# Trim if over limit
if len(self.messages) > self.max_messages:
self.messages = self.messages[-self.max_messages:]
def get_messages(self) -> list[dict]:
"""Get messages for API call."""
return [
{"role": m.role, "content": m.content}
for m in self.messages
]
def clear(self):
"""Clear all messages."""
self.messages = []
def get_context_string(self) -> str:
"""Get conversation as string."""
return "\n".join([
f"{m.role}: {m.content}"
for m in self.messages
])
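A minimal usage sketch for ConversationBuffer in a chat loop; it assumes an OpenAI client with OPENAI_API_KEY set and the gpt-4o-mini model, both of which you can swap for your own setup.
# Hypothetical usage sketch: ConversationBuffer driving a simple chat loop
from openai import OpenAI
client = OpenAI()
buffer = ConversationBuffer(max_messages=50)
def chat(user_input: str) -> str:
    buffer.add("user", user_input)
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumed model choice
        messages=buffer.get_messages()
    )
    reply = response.choices[0].message.content
    buffer.add("assistant", reply)
    return reply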
class SlidingWindowBuffer:
"""Buffer with sliding window based on token count."""
def __init__(
self,
max_tokens: int = 4000,
counter = None
):
self.max_tokens = max_tokens
self.messages: deque = deque()
self.current_tokens = 0
# Token counter (use tiktoken)
if counter is None:
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")
self.count_tokens = lambda x: len(enc.encode(x))
else:
self.count_tokens = counter
def add(self, role: str, content: str):
"""Add message and slide window if needed."""
tokens = self.count_tokens(content) + 10 # Overhead
self.messages.append({
"role": role,
"content": content,
"tokens": tokens
})
self.current_tokens += tokens
# Slide window
while self.current_tokens > self.max_tokens and len(self.messages) > 1:
removed = self.messages.popleft()
self.current_tokens -= removed["tokens"]
def get_messages(self) -> list[dict]:
"""Get messages for API call."""
return [
{"role": m["role"], "content": m["content"]}
for m in self.messages
]
# Token-aware buffer with priority
class PriorityBuffer:
"""Buffer that keeps important messages longer."""
def __init__(self, max_tokens: int = 4000):
self.max_tokens = max_tokens
self.system_messages: list[dict] = []
self.important_messages: list[dict] = []
self.regular_messages: deque = deque()
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")
self.count_tokens = lambda x: len(enc.encode(x))
def add(
self,
role: str,
content: str,
important: bool = False
):
"""Add message with priority."""
message = {"role": role, "content": content}
if role == "system":
self.system_messages.append(message)
elif important:
self.important_messages.append(message)
else:
self.regular_messages.append(message)
self._trim()
def _trim(self):
"""Trim regular messages to fit budget."""
# Calculate fixed tokens
fixed_tokens = sum(
self.count_tokens(m["content"]) + 10
for m in self.system_messages + self.important_messages
)
available = self.max_tokens - fixed_tokens
# Trim regular messages
current = sum(
self.count_tokens(m["content"]) + 10
for m in self.regular_messages
)
while current > available and self.regular_messages:
removed = self.regular_messages.popleft()
current -= self.count_tokens(removed["content"]) + 10
def get_messages(self) -> list[dict]:
"""Get all messages in order."""
return (
self.system_messages +
list(self.regular_messages) +
self.important_messages
)
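Below is an illustrative sketch of how PriorityBuffer keeps pinned context alive; the system prompt, the pinned preference, and the token budget are made-up values, not part of the class above.
# Illustrative PriorityBuffer usage: pinned facts survive window trimming
buffer = PriorityBuffer(max_tokens=4000)
buffer.add("system", "You are a travel-planning assistant.")
buffer.add("user", "I'm vegetarian and I never take red-eye flights.", important=True)
buffer.add("user", "What should I do in Lisbon over a weekend?")
buffer.add("assistant", "Spend Saturday wandering Alfama, then take the tram out to Belem...")
# As the budget fills, regular turns are dropped first; system and important messages remain
messages = buffer.get_messages()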
Summary Memory
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class ConversationSummary:
"""Summary of conversation history."""
summary: str
message_count: int
last_updated: datetime
key_points: list[str] = field(default_factory=list)
class SummaryMemory:
"""Memory that summarizes old messages."""
def __init__(
self,
client,
max_messages_before_summary: int = 10,
summary_model: str = "gpt-4o-mini"
):
self.client = client
self.max_messages = max_messages_before_summary
self.summary_model = summary_model
self.current_summary: Optional[ConversationSummary] = None
self.recent_messages: list[Message] = []
def add(self, role: str, content: str):
"""Add message, summarize if needed."""
self.recent_messages.append(Message(role=role, content=content))
if len(self.recent_messages) >= self.max_messages:
self._summarize()
def _summarize(self):
"""Summarize recent messages."""
# Build conversation text
conversation = "\n".join([
f"{m.role}: {m.content}"
for m in self.recent_messages
])
# Include existing summary
context = ""
if self.current_summary:
context = f"Previous summary: {self.current_summary.summary}\n\n"
prompt = f"""{context}Summarize this conversation concisely, preserving key information:
{conversation}
Provide:
1. A brief summary (2-3 sentences)
2. Key points as a list"""
response = self.client.chat.completions.create(
model=self.summary_model,
messages=[{"role": "user", "content": prompt}]
)
summary_text = response.choices[0].message.content
# Update summary
self.current_summary = ConversationSummary(
summary=summary_text,
message_count=(
(self.current_summary.message_count if self.current_summary else 0) +
len(self.recent_messages)
),
last_updated=datetime.now()
)
# Keep the last two messages for continuity (they are also covered by the new summary)
self.recent_messages = self.recent_messages[-2:]
def get_context(self) -> str:
"""Get memory context for prompt."""
parts = []
if self.current_summary:
parts.append(f"Conversation summary:\n{self.current_summary.summary}")
if self.recent_messages:
recent = "\n".join([
f"{m.role}: {m.content}"
for m in self.recent_messages
])
parts.append(f"Recent messages:\n{recent}")
return "\n\n".join(parts)
def get_messages(self) -> list[dict]:
"""Get messages for API call."""
messages = []
if self.current_summary:
messages.append({
"role": "system",
"content": f"Conversation history summary: {self.current_summary.summary}"
})
for m in self.recent_messages:
messages.append({"role": m.role, "content": m.content})
return messages
# Progressive summarization
class ProgressiveSummary:
"""Summarize in layers for long conversations."""
def __init__(self, client):
self.client = client
self.summaries: list[str] = [] # Oldest to newest
self.current_chunk: list[Message] = []
self.chunk_size = 10
def add(self, role: str, content: str):
"""Add message."""
self.current_chunk.append(Message(role=role, content=content))
if len(self.current_chunk) >= self.chunk_size:
self._summarize_chunk()
def _summarize_chunk(self):
"""Summarize current chunk."""
conversation = "\n".join([
f"{m.role}: {m.content}"
for m in self.current_chunk
])
response = self.client.chat.completions.create(
model="gpt-4o-mini",
messages=[{
"role": "user",
"content": f"Summarize this conversation in 2-3 sentences:\n\n{conversation}"
}]
)
self.summaries.append(response.choices[0].message.content)
self.current_chunk = []
# Consolidate old summaries if too many
if len(self.summaries) > 5:
self._consolidate_summaries()
def _consolidate_summaries(self):
"""Consolidate multiple summaries into one."""
old_summaries = self.summaries[:-2]
combined = "\n".join([
f"- {s}" for s in old_summaries
])
response = self.client.chat.completions.create(
model="gpt-4o-mini",
messages=[{
"role": "user",
"content": f"Consolidate these summaries into one:\n\n{combined}"
}]
)
self.summaries = [
response.choices[0].message.content
] + self.summaries[-2:]
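A rough sketch of SummaryMemory in a loop, assuming an OpenAI client; once the message count hits the threshold, older turns are folded into the rolling summary and get_messages() returns that summary as a system message plus the retained recent turns.
# Hypothetical SummaryMemory usage: older turns collapse into a rolling summary
from openai import OpenAI
client = OpenAI()
memory = SummaryMemory(client, max_messages_before_summary=10)
for turn in range(12):  # enough turns to trigger summarization more than once
    memory.add("user", f"Question {turn}")
    memory.add("assistant", f"Answer {turn}")
messages = memory.get_messages()  # summary system message + last few raw turns
if memory.current_summary:
    print(memory.current_summary.summary)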
Vector Store Memory
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
import numpy as np
@dataclass
class MemoryEntry:
"""A memory entry with embedding."""
id: str
content: str
embedding: list[float]
metadata: dict
timestamp: datetime
class VectorMemory:
"""Long-term memory using vector similarity."""
def __init__(
self,
embedding_client,
embedding_model: str = "text-embedding-3-small"
):
self.embedding_client = embedding_client
self.embedding_model = embedding_model
self.memories: list[MemoryEntry] = []
def _embed(self, text: str) -> list[float]:
"""Get embedding for text."""
response = self.embedding_client.embeddings.create(
model=self.embedding_model,
input=text
)
return response.data[0].embedding
def add(
self,
content: str,
metadata: dict = None
) -> str:
"""Add memory entry."""
import uuid
entry = MemoryEntry(
id=str(uuid.uuid4()),
content=content,
embedding=self._embed(content),
metadata=metadata or {},
timestamp=datetime.now()
)
self.memories.append(entry)
return entry.id
def search(
self,
query: str,
top_k: int = 5,
threshold: float = 0.7
) -> list[MemoryEntry]:
"""Search memories by similarity."""
query_embedding = self._embed(query)
# Calculate similarities
similarities = []
for memory in self.memories:
sim = self._cosine_similarity(query_embedding, memory.embedding)
if sim >= threshold:
similarities.append((memory, sim))
# Sort by similarity
similarities.sort(key=lambda x: x[1], reverse=True)
return [m for m, _ in similarities[:top_k]]
def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
"""Calculate cosine similarity."""
a = np.array(a)
b = np.array(b)
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
def get_relevant_context(
self,
query: str,
max_tokens: int = 2000
) -> str:
"""Get relevant memories as context string."""
memories = self.search(query)
context_parts = []
current_tokens = 0
for memory in memories:
# Rough token estimate
tokens = len(memory.content.split()) * 1.5
if current_tokens + tokens > max_tokens:
break
context_parts.append(memory.content)
current_tokens += tokens
return "\n\n".join(context_parts)
# Conversation-aware vector memory
class ConversationVectorMemory:
"""Vector memory that stores conversation turns."""
def __init__(self, embedding_client):
self.vector_memory = VectorMemory(embedding_client)
self.conversation_id: Optional[str] = None
def start_conversation(self, conversation_id: str = None):
"""Start new conversation."""
import uuid
self.conversation_id = conversation_id or str(uuid.uuid4())
def add_turn(self, user_message: str, assistant_response: str):
"""Add conversation turn to memory."""
# Store as combined turn
content = f"User: {user_message}\nAssistant: {assistant_response}"
self.vector_memory.add(
content=content,
metadata={
"conversation_id": self.conversation_id,
"user_message": user_message,
"assistant_response": assistant_response
}
)
def recall(self, query: str, top_k: int = 3) -> list[dict]:
"""Recall relevant past conversations."""
memories = self.vector_memory.search(query, top_k=top_k)
return [
{
"user": m.metadata.get("user_message", ""),
"assistant": m.metadata.get("assistant_response", ""),
"conversation_id": m.metadata.get("conversation_id", "")
}
for m in memories
]
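A small sketch of cross-session recall with ConversationVectorMemory, assuming an OpenAI client for embeddings; the stored turn and the later query are invented, and whether a memory comes back depends on the 0.7 similarity threshold.
# Hypothetical recall flow: store one turn, retrieve it later by semantic similarity
from openai import OpenAI
client = OpenAI()
memory = ConversationVectorMemory(client)
memory.start_conversation()
memory.add_turn(
    "My dog Biscuit is allergic to chicken.",
    "Noted - I'll avoid recommending chicken-based food for Biscuit."
)
# Later, possibly in a different session, recall turns relevant to the new query
past = memory.recall("what food should I buy for my dog?", top_k=3)
for turn in past:
    print(turn["user"], "->", turn["assistant"])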
Hybrid Memory System
from dataclasses import dataclass
from typing import Optional
@dataclass
class MemoryConfig:
"""Configuration for hybrid memory."""
buffer_max_messages: int = 10
summary_threshold: int = 20
vector_search_k: int = 5
max_context_tokens: int = 4000
class HybridMemory:
"""Combines buffer, summary, and vector memory."""
def __init__(
self,
client,
embedding_client,
config: MemoryConfig = None
):
self.client = client
self.config = config or MemoryConfig()
# Short-term: recent messages
self.buffer = SlidingWindowBuffer(
max_tokens=self.config.max_context_tokens // 2
)
# Medium-term: summaries
self.summary = SummaryMemory(
client,
max_messages_before_summary=self.config.summary_threshold
)
# Long-term: vector store
self.vector = VectorMemory(embedding_client)
# Track all messages for vector storage
self.message_count = 0
def add(self, role: str, content: str):
"""Add message to all memory systems."""
# Add to buffer
self.buffer.add(role, content)
# Add to summary system
self.summary.add(role, content)
# Add to vector store (every few messages)
self.message_count += 1
if self.message_count % 5 == 0:
# Store recent context
recent = self.buffer.get_messages()[-5:]
context = "\n".join([
f"{m['role']}: {m['content']}"
for m in recent
])
self.vector.add(context)
def get_context(self, current_query: str) -> dict:
"""Get combined context from all memory systems."""
# Recent messages from buffer
recent_messages = self.buffer.get_messages()
# Summary of older conversation
summary_context = self.summary.get_context()
# Relevant memories from vector store
relevant_memories = self.vector.get_relevant_context(
current_query,
max_tokens=500
)
return {
"recent_messages": recent_messages,
"summary": summary_context,
"relevant_memories": relevant_memories
}
def build_messages(
self,
system_prompt: str,
current_query: str
) -> list[dict]:
"""Build messages list for API call."""
context = self.get_context(current_query)
messages = []
# System prompt with memory context
memory_context = ""
if context["summary"]:
memory_context += f"\n\nConversation history:\n{context['summary']}"
if context["relevant_memories"]:
memory_context += f"\n\nRelevant past context:\n{context['relevant_memories']}"
messages.append({
"role": "system",
"content": system_prompt + memory_context
})
# Recent messages
messages.extend(context["recent_messages"])
return messages
# Entity memory
class EntityMemory:
"""Track entities mentioned in conversation."""
def __init__(self, client):
self.client = client
self.entities: dict[str, dict] = {}
def extract_entities(self, text: str) -> list[dict]:
"""Extract entities from text using LLM."""
response = self.client.chat.completions.create(
model="gpt-4o-mini",
messages=[{
"role": "user",
"content": f"""Extract entities from this text. Return JSON array:
[{{"name": "entity name", "type": "person/place/org/concept", "info": "relevant info"}}]
Text: {text}"""
}],
response_format={"type": "json_object"}
)
import json
result = json.loads(response.choices[0].message.content)
return result.get("entities", [])
def update(self, text: str):
"""Update entity memory from text."""
entities = self.extract_entities(text)
for entity in entities:
name = entity.get("name", "").lower()
if name in self.entities:
# Merge info
existing = self.entities[name]
existing["info"] = f"{existing['info']}; {entity.get('info', '')}"
else:
self.entities[name] = entity
def get_entity_context(self) -> str:
"""Get entity context for prompt."""
if not self.entities:
return ""
lines = ["Known entities:"]
for name, info in self.entities.items():
lines.append(f"- {name} ({info['type']}): {info['info']}")
return "\n".join(lines)
Production Memory Service
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
import uuid
app = FastAPI()
# Initialize memory systems
from openai import OpenAI
client = OpenAI()
# Session-based memory storage
sessions: dict[str, HybridMemory] = {}
def get_or_create_session(session_id: str) -> HybridMemory:
"""Get or create memory for session."""
if session_id not in sessions:
sessions[session_id] = HybridMemory(client, client)
return sessions[session_id]
class ChatRequest(BaseModel):
session_id: Optional[str] = None
message: str
system_prompt: Optional[str] = "You are a helpful assistant."
class MemoryRequest(BaseModel):
session_id: str
content: str
metadata: Optional[dict] = None
@app.post("/v1/chat")
async def chat_with_memory(request: ChatRequest):
"""Chat endpoint with memory."""
# Get or create session
session_id = request.session_id or str(uuid.uuid4())
memory = get_or_create_session(session_id)
# Build messages with memory context
messages = memory.build_messages(
request.system_prompt,
request.message
)
# Add current message
messages.append({"role": "user", "content": request.message})
# Get response
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=messages
)
assistant_message = response.choices[0].message.content
# Update memory
memory.add("user", request.message)
memory.add("assistant", assistant_message)
return {
"session_id": session_id,
"response": assistant_message,
"memory_stats": {
"buffer_size": len(memory.buffer.messages),
"vector_count": len(memory.vector.memories)
}
}
@app.post("/v1/memory/add")
async def add_memory(request: MemoryRequest):
"""Add explicit memory entry."""
memory = get_or_create_session(request.session_id)
memory_id = memory.vector.add(
content=request.content,
metadata=request.metadata
)
return {"memory_id": memory_id}
@app.get("/v1/memory/search")
async def search_memory(session_id: str, query: str, top_k: int = 5):
"""Search session memory."""
if session_id not in sessions:
raise HTTPException(status_code=404, detail="Session not found")
memory = sessions[session_id]
results = memory.vector.search(query, top_k=top_k)
return {
"results": [
{"content": r.content, "metadata": r.metadata}
for r in results
]
}
@app.delete("/v1/session/{session_id}")
async def delete_session(session_id: str):
"""Delete session and its memory."""
if session_id in sessions:
del sessions[session_id]
return {"deleted": True}
@app.get("/health")
async def health():
return {
"status": "healthy",
"active_sessions": len(sessions)
}
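The sessions dict above lives in process memory, so it is lost on restart and grows without bound. One possible cleanup policy is sketched below; last_seen, touch_session, and the one-hour TTL are assumptions rather than part of the service, and a real deployment would more likely persist sessions in Redis or a database.
# Hypothetical cleanup policy: evict sessions idle longer than SESSION_TTL seconds
import time
SESSION_TTL = 60 * 60  # assumed one-hour idle limit
last_seen: dict[str, float] = {}  # session_id -> last activity timestamp
def touch_session(session_id: str):
    last_seen[session_id] = time.time()
def cleanup_sessions():
    cutoff = time.time() - SESSION_TTL
    for session_id, seen in list(last_seen.items()):
        if seen < cutoff:
            sessions.pop(session_id, None)
            del last_seen[session_id]
The chat endpoint would call touch_session after resolving the session, and cleanup_sessions could run on a background scheduler or at the start of each request.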
Conclusion
Effective memory systems transform stateless LLMs into contextually aware assistants. Use conversation buffers for immediate context—sliding windows with token limits prevent context overflow. Implement summary memory to compress older conversations while preserving key information. Vector store memory enables semantic retrieval of relevant past interactions across sessions. Hybrid systems combining all three approaches provide the best results: recent context from buffers, compressed history from summaries, and relevant long-term memories from vector search. For production, implement session management, memory persistence, and cleanup policies. The goal is giving the model enough context to be helpful without overwhelming it with irrelevant history.