Prompt caching сохраняет повторяющиеся части промпта (общий префикс) между запросами. До 90% экономии на входных токенах, прочитанных из кеша.
import hashlib
import os
import time

import anthropic
import openai

# Client configuration.
# API keys are read from the environment when the variables are set, so real
# credentials never have to be written into this file. The original
# placeholder literals remain as the fallback, which keeps the behavior
# unchanged when the variables are absent.
anthropic_client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY", "sk-ant-..."),
)
openai_client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "sk-..."),
)
# Shared reviewer instructions; this is the block marked for caching below.
SYSTEM_PROMPT = """You are a senior code reviewer. Always check for: 1. Security vulnerabilities (OWASP Top 10) 2. Performance bottlenecks 3. Type safety issues 4. Error handling completeness 5. Test coverage gaps"""

# SHA-256 hex digest of the prompt text. The request below never references
# it; it is kept as a module-level name in case other code reads it.
cache_key = hashlib.sha256(SYSTEM_PROMPT.encode()).hexdigest()

# NOTE(review): `code_to_review` is not defined anywhere in the visible
# source; it has to be bound to the code under review before this call runs.
response = anthropic_client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    system=[
        {
            "type": "text",
            "text": SYSTEM_PROMPT,
            # Cache breakpoint: everything up to and including this block is
            # the prefix the API is asked to cache.
            "cache_control": {"type": "ephemeral"},
        }
    ],
    messages=[{"role": "user", "content": code_to_review}],
)

# Check whether the cache was hit.
# Only `cache_read_input_tokens` indicates a hit. A non-zero
# `cache_creation_input_tokens` means the prefix was *written* to the cache on
# this request (i.e. a miss), so it must not be counted as a hit. Either
# counter may be None, hence the `or 0`.
# NOTE(review): prefixes shorter than the model's minimum cacheable length
# (documented as 1024 tokens for this model family) are not cached at all, so
# with a prompt this short both counters are expected to stay 0 — confirm
# against the current Anthropic documentation.
cache_hit = response.usage.cache_read_input_tokens or 0
print(f"Cache hit: {cache_hit}, Input tokens: {response.usage.input_tokens}")
# OpenAI caches automatically once the prompt prefix exceeds ~1024 tokens;
# nothing in the request opts in to it.
# The schema file is read inside a context manager so the handle is closed
# deterministically, with an explicit encoding instead of the locale default.
with open("/opt/data/schema.md", encoding="utf-8") as schema_file:
    long_context = "You are an expert data analyst. " + schema_file.read()

responses = []
for query in ["Total revenue Q1?", "Top customers?", "Churn rate?"]:
    # The system message is identical on every iteration, so it forms the
    # shared prefix; only the user question varies.
    r = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": long_context},
            {"role": "user", "content": query},
        ],
    )
    usage = r.usage
    # `usage`, `prompt_tokens_details` and `cached_tokens` are each treated as
    # optional: a missing value is reported as 0 cached tokens rather than
    # printing "None" or raising AttributeError.
    details = usage.prompt_tokens_details if usage else None
    cached = (details.cached_tokens or 0) if details else 0
    print(f"Query: {query} | Cached: {cached} tokens (50% discount)")
    responses.append(r)
def benchmark_cache(system_prompt, queries, use_cache=True):
    """Send each query with the same system prompt and total the token usage.

    Args:
        system_prompt: Text sent as the single system block of every request.
        queries: Iterable of user messages; one request is made per item.
        use_cache: When true, the system block carries an ephemeral
            ``cache_control`` marker.

    Returns:
        A dict with the original keys ``input_tokens`` (sum of
        ``usage.input_tokens``), ``elapsed`` (seconds spent in the request
        loop) and ``cached`` (the ``use_cache`` flag), plus
        ``cache_read_input_tokens`` and ``cache_creation_input_tokens``
        (sums of the corresponding usage counters, with None counted as 0).
    """
    # The system block does not depend on the query, so build it once.
    system_block = [{"type": "text", "text": system_prompt}]
    if use_cache:
        system_block[0]["cache_control"] = {"type": "ephemeral"}

    total_input = 0
    total_cache_read = 0
    total_cache_creation = 0
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    for q in queries:
        resp = anthropic_client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=256,
            system=system_block,
            messages=[{"role": "user", "content": q}],
        )
        usage = resp.usage
        total_input += usage.input_tokens
        # Tokens served from or written to the cache are reported in their
        # own counters, not in input_tokens; track them so the comparison
        # shows where the prompt tokens actually went.
        total_cache_read += usage.cache_read_input_tokens or 0
        total_cache_creation += usage.cache_creation_input_tokens or 0
    elapsed = time.perf_counter() - start

    return {
        "input_tokens": total_input,
        "elapsed": elapsed,
        "cached": use_cache,
        "cache_read_input_tokens": total_cache_read,
        "cache_creation_input_tokens": total_cache_creation,
    }


no_cache = benchmark_cache(SYSTEM_PROMPT, ["Review code A", "Review code B"], False)
with_cache = benchmark_cache(SYSTEM_PROMPT, ["Review code A", "Review code B"], True)
print(f"Without cache: {no_cache['input_tokens']} tokens")
# Report the measured cache counters instead of a hard-coded savings figure.
print(
    f"With cache: {with_cache['input_tokens']} tokens "
    f"(cache read: {with_cache['cache_read_input_tokens']}, "
    f"cache write: {with_cache['cache_creation_input_tokens']})"
)
class CachedRAG:
    """Question answering helper that marks its shared context for caching.

    Both the system instruction and the joined document text are sent with an
    ephemeral ``cache_control`` marker, so they form the prefix that repeated
    calls to :meth:`query` have in common.
    """

    def __init__(
        self,
        client,
        system_text,
        *,
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
    ):
        """Store the client and build the cache-marked system block.

        Args:
            client: Object exposing ``messages.create(...)``.
            system_text: Instruction text sent as the system block.
            model: Model identifier passed to every request.
            max_tokens: Response token limit passed to every request.
        """
        self.client = client
        self.model = model
        self.max_tokens = max_tokens
        self.system = [
            {
                "type": "text",
                "text": system_text,
                "cache_control": {"type": "ephemeral"},
            }
        ]

    def query(self, question, documents=None):
        """Ask ``question``, optionally preceded by a cached document context.

        Args:
            question: User question, sent as the final user message.
            documents: Optional iterable of document strings (a single string
                is treated as one document). They are joined with spaces and
                sent as a cache-marked user message before the question.

        Returns:
            Whatever ``client.messages.create`` returns.
        """
        # A bare string would otherwise be joined character by character.
        if isinstance(documents, str):
            documents = [documents]

        ctx = " ".join(documents) if documents else ""

        messages = []
        # Skip the context turn when there is no actual text: an empty text
        # block carries nothing to cache and is presumably rejected by the API.
        if ctx.strip():
            messages.append(
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": ctx,
                            "cache_control": {"type": "ephemeral"},
                        }
                    ],
                }
            )
        messages.append({"role": "user", "content": question})

        return self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            system=self.system,
            messages=messages,
        )


rag = CachedRAG(anthropic_client, "You are a helpful assistant.")
docs = ["Document 1 content...", "Document 2 content..."]
r1 = rag.query("First question", documents=docs)
r2 = rag.query("Second question", documents=docs)
# NOTE(review): the figure below is a fixed message, not a measurement; the
# responses' usage counters are never inspected here.
print("System prompt cached across all queries — save ~90% on each!")