Exact-match caching misses queries that are semantically identical but worded differently: "What is the capital of France?" and "Which city is the capital of France?" produce different SHA-256 hashes and both hit the API. A semantic cache stores the embedding of each query alongside the cached response, then uses cosine similarity to find existing answers for new queries that are close enough.
The cheapest way to generate embeddings is OpenAI's text-embedding-3-small at $0.02/1M tokens — a 50-token query costs $0.000001. Even at 1M queries/month the embedding cost is $1. The text-embedding-3-large model costs $0.13/1M tokens and produces better representations for nuanced queries; for most FAQ-style caching, text-embedding-3-small is sufficient. See our Embedding Cost Calculator 2026 for a full breakdown by volume.
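Redis setup for vector search — run once at startup. The server must provide the Search and JSON capabilities (RediSearch and RedisJSON, e.g. via Redis Stack), because the index is built over JSON documents: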
```python
import numpy as np
import redis
from redis.commands.search.field import VectorField, TextField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
# Vector length declared for the indexed "embedding" field; it has to match
# the length of the vectors produced by the embedding model.
DIM = 1536 # text-embedding-3-small output dimension
# Name of the search index that create_index() builds and lookups query.
INDEX_NAME = "semantic_cache"
def create_index(r: redis.Redis) -> None:
    """Create the semantic-cache search index if it does not exist yet.

    The index covers JSON documents whose key starts with ``scache:`` and
    exposes the prompt and response as text fields plus an HNSW vector field
    (FLOAT32, ``DIM`` dimensions, cosine distance) over ``$.embedding``.

    Args:
        r: Redis client connected to a server with search and JSON support.

    Raises:
        redis.exceptions.RedisError: If probing for the index fails for a
            reason other than the index being absent (for example a
            connection error), or if index creation itself fails.
    """
    try:
        r.ft(INDEX_NAME).info()
    except redis.exceptions.ResponseError:
        # The server answered FT.INFO with an error reply, which is how a
        # missing index is reported. Connection/timeout errors are NOT caught
        # here, so an unreachable server is no longer mistaken for
        # "index missing".
        index_exists = False
    else:
        index_exists = True

    if index_exists:
        return

    schema = (
        TextField("$.prompt", as_name="prompt"),
        TextField("$.response", as_name="response"),
        VectorField(
            "$.embedding",
            "HNSW",
            {"TYPE": "FLOAT32", "DIM": DIM, "DISTANCE_METRIC": "COSINE"},
            as_name="embedding",
        ),
    )
    # Created outside the except block so a failure here is reported on its
    # own rather than chained onto the "index not found" probe error.
    r.ft(INDEX_NAME).create_index(
        schema,
        definition=IndexDefinition(
            prefix=["scache:"], index_type=IndexType.JSON
        ),
    )
```
Semantic cache lookup and write:
```python
import json
import os
import time
import uuid
from functools import lru_cache

from openai import OpenAI
from redis.commands.search.query import Query
# Shared OpenAI client used for both the embedding and chat-completion requests.
client = OpenAI()
# Minimum similarity (1 - cosine distance) a stored entry needs to count as a hit.
SIMILARITY_THRESHOLD = 0.92 # cosine similarity — tune per workload
# Expiry, in seconds, applied to every cached entry when it is written.
TTL_SECONDS = 43200 # 12 hours for semantic cache
@lru_cache(maxsize=256)
def _embed_cached(text: str) -> tuple[float, ...]:
    """Request the embedding for *text* once and memoize it per distinct text."""
    resp = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    # Kept as an immutable tuple so a caller mutating its copy cannot corrupt
    # the memoized value.
    return tuple(resp.data[0].embedding)


def embed(text: str) -> list[float]:
    """Return the text-embedding-3-small vector for *text*.

    On a cache miss the chat helper embeds the same prompt twice: once for the
    lookup and once more for the store. Results are therefore memoized in a
    small bounded LRU so the second call does not trigger another API request.
    Failed requests raise and are not memoized.

    Args:
        text: The text to embed.

    Returns:
        A fresh list of floats (safe for the caller to modify).
    """
    return list(_embed_cached(text))
def semantic_cache_lookup(r: redis.Redis, prompt: str):
    """Find the stored answer whose prompt is nearest to *prompt*.

    Embeds the prompt, runs a 1-nearest-neighbour vector query against the
    cache index and converts the reported cosine distance into a similarity.

    Args:
        r: Redis client holding the semantic-cache index.
        prompt: The incoming user query.

    Returns:
        ``(response, similarity)`` when the nearest entry reaches
        ``SIMILARITY_THRESHOLD``; ``(None, None)`` when the index returns no
        match or the nearest entry is not similar enough.
    """
    # The vector is sent as raw float32 bytes, matching the FLOAT32 index field.
    query_blob = np.array(embed(prompt), dtype=np.float32).tobytes()

    knn_query = Query("(*)=>[KNN 1 @embedding $vec AS score]")
    knn_query = knn_query.sort_by("score")
    knn_query = knn_query.return_fields("prompt", "response", "score")
    knn_query = knn_query.dialect(2)

    hits = r.ft(INDEX_NAME).search(knn_query, query_params={"vec": query_blob})
    if hits.total == 0:
        return None, None

    nearest = hits.docs[0]
    # "score" is a cosine distance; flip it into a similarity.
    similarity = 1 - float(nearest.score)
    is_hit = similarity >= SIMILARITY_THRESHOLD
    return (nearest.response, similarity) if is_hit else (None, None)
def semantic_cache_store(r: redis.Redis, prompt: str, response: str) -> None:
    """Embed *prompt* and store it with *response* as an expiring JSON entry.

    The entry is written under a ``scache:``-prefixed key (the prefix the
    search index watches) and expires after ``TTL_SECONDS``.

    Args:
        r: Redis client holding the semantic-cache index.
        prompt: The user query the response answers.
        response: The model output to cache.
    """
    document = {
        "prompt": prompt,
        "response": response,
        "embedding": embed(prompt),
        "ts": int(time.time()),
    }
    # A millisecond timestamp is not unique: two writes landing in the same
    # millisecond would share a key and the later JSON.SET would silently
    # replace the earlier entry. A random UUID avoids that; the write time is
    # still available in the "ts" field.
    key = f"scache:{uuid.uuid4().hex}"

    # Send JSON.SET and EXPIRE as one MULTI/EXEC transaction so a failure
    # between the two commands cannot leave an entry behind without a TTL.
    pipe = r.pipeline(transaction=True)
    pipe.json().set(key, "$", document)
    pipe.expire(key, TTL_SECONDS)
    pipe.execute()
def semantic_cached_chat(r: redis.Redis, prompt: str, model: str = "gpt-5") -> str:
    """Answer *prompt*, serving a semantically similar cached answer if any.

    On a hit the cached response is returned without calling the chat API.
    On a miss the chat API is called and a non-empty answer is written to the
    cache before being returned.

    Note: the cache is looked up by prompt only, so an answer produced with
    one ``model`` may be served to a later call that names a different one.

    Args:
        r: Redis client holding the semantic-cache index.
        prompt: The user query.
        model: Chat model used when the cache misses.

    Returns:
        The cached or freshly generated message content.
    """
    cached, similarity = semantic_cache_lookup(r, prompt)
    if cached:
        print(f"Semantic cache hit (similarity={similarity:.3f})")
        return cached

    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(model=model, messages=messages)
    content = response.choices[0].message.content

    # Only cache answers that carry text. A missing or empty content value
    # could never be served (the hit check above is a truthiness test), so
    # storing it would just add a dead entry on every such call.
    if content:
        semantic_cache_store(r, prompt, content)
    return content
```
The `SIMILARITY_THRESHOLD` of 0.92 is a good starting point. Too low and you return wrong answers; too high and you miss legitimate cache hits. Tune it by logging similarity scores for a few thousand real queries and plotting the distribution — most apps converge on 0.90-0.95. For safety-critical or factual applications, prefer exact-match caching or set the threshold higher (0.97+).