Prompt Caching: сокращаем затраты на API в 10 раз

Prompt Caching: экономия на API в 10 раз

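Prompt caching позволяет провайдеру переиспользовать повторяющееся начало (префикс) промпта вместо его повторной обработки при каждом запросе. Экономия — до 90% стоимости закешированных входных токенов.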

📊 Средний ⏱ 9 мин

# 1. УСТАНОВКА И НАСТРОЙКА

import anthropic
import openai
import time
import hashlib

# Client configuration.
# Both SDKs read their key from the environment when `api_key` is omitted
# (ANTHROPIC_API_KEY and OPENAI_API_KEY respectively), which keeps secrets
# out of the source file and out of version control.
anthropic_client = anthropic.Anthropic()
openai_client = openai.OpenAI()

# 2. ANTHROPIC PROMPT CACHING (cache_control)

SYSTEM_PROMPT = """You are a senior code reviewer. Always check for:
1. Security vulnerabilities (OWASP Top 10)
2. Performance bottlenecks
3. Type safety issues
4. Error handling completeness
5. Test coverage gaps"""

# Local fingerprint of the cached prefix; it is not sent with the request below.
cache_key = hashlib.sha256(SYSTEM_PROMPT.encode()).hexdigest()

# Sample input so the snippet runs on its own; replace with the code to review.
code_to_review = "def add(a, b):\n    return a + b\n"

# NOTE: Anthropic only caches a prefix that reaches the model's minimum
# cacheable length (1024 tokens for Claude Sonnet 4 per the prompt-caching
# docs). A system prompt as short as the one above is processed normally and
# both cache counters stay at 0 -- use a longer prefix to see real cache hits.
response = anthropic_client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    system=[
        {
            "type": "text",
            "text": SYSTEM_PROMPT,
            # Marks the end of the prefix that should be cached.
            "cache_control": {"type": "ephemeral"},
        }
    ],
    messages=[{"role": "user", "content": code_to_review}],
)

# Check whether the cache was actually used.
# `cache_creation_input_tokens` counts tokens WRITTEN to the cache (a miss that
# primes it); only `cache_read_input_tokens` indicates a hit.
usage = response.usage
cache_written = usage.cache_creation_input_tokens or 0
cache_read = usage.cache_read_input_tokens or 0
cache_hit = cache_read > 0
print(
    f"Cache hit: {cache_hit}, Cache read: {cache_read}, "
    f"Cache written: {cache_written}, Input tokens: {usage.input_tokens}"
)

# 3. OPENAI PROMPT CACHING (автоматический)

# OpenAI caches automatically once the prompt prefix is at least 1024 tokens long.
# The schema file is read with an explicit encoding and closed right after use.
with open("/opt/data/schema.md", encoding="utf-8") as schema_file:
    long_context = "You are an expert data analyst.\n" + schema_file.read()

responses = []
for query in ["Total revenue Q1?", "Top customers?", "Churn rate?"]:
    # The system message is identical for every query, so it forms the shared
    # prefix that later requests can reuse.
    r = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": long_context},
            {"role": "user", "content": query},
        ],
    )
    usage = r.usage
    # Both `usage` and its details object are optional in the response model.
    details = usage.prompt_tokens_details if usage else None
    cached = (details.cached_tokens or 0) if details else 0
    print(f"Query: {query} | Cached: {cached} tokens (50% discount)")
    responses.append(r)

# 4. БЕНЧМАРК: ЗАТРАТЫ С КЕШЕМ И БЕЗ

def benchmark_cache(system_prompt, queries, use_cache=True) -> dict:
    """Send every query with the same system prompt and total the token usage.

    Args:
        system_prompt: Text sent as the single system block of each request.
        queries: Iterable of user messages; one request is made per item.
        use_cache: When true, the system block carries an ephemeral
            ``cache_control`` marker.

    Returns:
        A dict with the original keys ``input_tokens`` (sum of
        ``usage.input_tokens``), ``elapsed`` (seconds) and ``cached`` (the
        ``use_cache`` flag), plus ``cache_read_tokens`` and
        ``cache_creation_tokens`` summed over all requests.
    """
    # The system block does not depend on the query, so build it once.
    system_entry = {"type": "text", "text": system_prompt}
    if use_cache:
        system_entry["cache_control"] = {"type": "ephemeral"}
    system_block = [system_entry]

    total_input = 0
    total_cache_read = 0
    total_cache_creation = 0
    # perf_counter is monotonic, so wall-clock adjustments cannot skew timing.
    start = time.perf_counter()
    for q in queries:
        resp = anthropic_client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=256,
            system=system_block,
            messages=[{"role": "user", "content": q}],
        )
        usage = resp.usage
        # Per Anthropic's docs, `input_tokens` excludes tokens read from or
        # written to the cache, so the cache counters are tracked separately.
        total_input += usage.input_tokens
        total_cache_read += usage.cache_read_input_tokens or 0
        total_cache_creation += usage.cache_creation_input_tokens or 0
    elapsed = time.perf_counter() - start
    return {
        "input_tokens": total_input,
        "elapsed": elapsed,
        "cached": use_cache,
        "cache_read_tokens": total_cache_read,
        "cache_creation_tokens": total_cache_creation,
    }

# Run the same queries with and without cache markers and compare the totals.
BENCHMARK_QUERIES = ["Review code A", "Review code B"]
no_cache = benchmark_cache(SYSTEM_PROMPT, BENCHMARK_QUERIES, False)
with_cache = benchmark_cache(SYSTEM_PROMPT, BENCHMARK_QUERIES, True)

baseline_tokens = no_cache["input_tokens"]
cached_run_tokens = with_cache["input_tokens"]
# Report the measured reduction instead of a fixed figure; guard against a
# zero baseline so the comparison never divides by zero.
savings = 1 - cached_run_tokens / baseline_tokens if baseline_tokens else 0.0
print(f"Without cache: {baseline_tokens} tokens")
print(f"With cache:    {cached_run_tokens} tokens ({savings:.0%} savings)")

# 5. ПРОДАКШЕН-ПАТТЕРН: КЕШИРУЮЩИЙ RAG-КЛАСС

class CachedRAG:
    """Question answering helper that marks its stable prompt parts as cacheable.

    The system instruction and, when supplied, the joined documents are sent
    with an ephemeral ``cache_control`` marker on every request.

    Args:
        client: Object exposing ``messages.create(...)`` (an Anthropic client).
        system_text: Instruction text sent as the single system block.
        model: Model identifier passed to every request.
        max_tokens: Response token limit passed to every request.
    """

    def __init__(self, client, system_text,
                 model="claude-sonnet-4-20250514", max_tokens=1024):
        self.client = client
        self.model = model
        self.max_tokens = max_tokens
        self.system = [{"type": "text", "text": system_text,
                        "cache_control": {"type": "ephemeral"}}]

    def query(self, question, documents=None):
        """Ask ``question``, optionally preceded by a cacheable document context.

        Args:
            question: User question, sent as the final user message.
            documents: Optional sequence of document strings. They are joined
                with a blank line and sent as a separate user message that
                carries a cache marker. An empty or missing value sends the
                question alone.

        Returns:
            Whatever ``client.messages.create`` returns.
        """
        messages = []
        if documents:
            # Join in a fixed order with a fixed separator so repeated calls
            # with the same documents produce the same text.
            ctx = "\n\n".join(documents)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "text", "text": ctx,
                     "cache_control": {"type": "ephemeral"}}
                ]
            })
        messages.append({"role": "user", "content": question})
        return self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            system=self.system,
            messages=messages,
        )

# Ask two questions over the same documents; the second request repeats the
# system block and document context of the first.
rag = CachedRAG(anthropic_client, "You are a helpful assistant.")
docs = ["Document 1 content...", "Document 2 content..."]
r1 = rag.query("First question", documents=docs)
r2 = rag.query("Second question", documents=docs)

# Report what the API measured rather than assuming the cache was used.
reused_tokens = r2.usage.cache_read_input_tokens or 0
if reused_tokens:
    print(f"Cached prefix reused on the second query: {reused_tokens} tokens read from cache")
else:
    # Placeholder-sized prompts like the ones above are likely too short to cache.
    print("No cache hit on the second query - the prefix is probably below the minimum cacheable length")

🔗 Полезные ссылки

📖 Anthropic Caching 📖 OpenAI Caching 💰 Pricing