Files
lijiaoqiao/llm-gateway-competitors/litellm-wheel-src/litellm/router_utils/prompt_caching_cache.py

260 lines
9.7 KiB
Python
Raw Normal View History

"""
Wrapper around router cache. Meant to store model id when prompt caching supported prompt is called.
"""
import hashlib
import json
from typing import TYPE_CHECKING, Any, List, Optional, Union, cast
from typing_extensions import TypedDict
from litellm.caching.caching import DualCache
from litellm.caching.in_memory_cache import InMemoryCache
from litellm.types.llms.openai import AllMessageValues, ChatCompletionToolParam
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
from litellm.router import Router
litellm_router = Router
Span = Union[_Span, Any]
else:
Span = Any
litellm_router = Any
class PromptCachingCacheValue(TypedDict):
    """Value stored in the prompt-caching cache: the deployment/model id recorded when a prompt-caching-supported prompt was called."""

    # Router deployment id; written by add_model_id / async_add_model_id and
    # read back by get_model_id / async_get_model_id.
    model_id: str
class PromptCachingCache:
    """
    Wrapper around the router cache that remembers which deployment (model id)
    served a prompt containing ``cache_control`` blocks.

    The cache key is derived only from the prompt's *cacheable prefix*
    (everything up to and including the last ``{"type": "ephemeral"}``
    cache_control marker) plus the tools, so follow-up requests that share the
    same prefix — but differ in trailing user messages — map to the same key
    and can be routed to the same deployment for provider-side cache hits.
    """

    # Default TTL (seconds) for stored model-id entries — store for 5 minutes.
    # Previously a magic number duplicated in the sync and async writers.
    DEFAULT_TTL_SECONDS: int = 300

    def __init__(self, cache: "DualCache"):
        """
        Args:
            cache: the router's DualCache used to persist model-id entries.
        """
        self.cache = cache
        self.in_memory_cache = InMemoryCache()

    @staticmethod
    def serialize_object(obj: Any) -> Any:
        """
        Serialize Pydantic objects, dicts, or lists into a stable,
        hash-friendly representation; fall back to ``str()``.

        - Objects exposing ``.dict()`` (Pydantic v1-style) use that method.
        - Dicts become canonical JSON (sorted keys, compact separators) so key
          order never changes the result.
        - Lists are serialized element-wise (recursively).
        - int/float/bool pass through unchanged.

        NOTE(review): the dict branch does NOT recurse into values before
        ``json.dumps``, so a dict holding non-JSON-serializable values raises
        ``TypeError``. Left as-is deliberately — changing serialization would
        change every existing cache key.
        """
        if hasattr(obj, "dict"):
            # Pydantic model: use its dict() representation.
            return obj.dict()
        elif isinstance(obj, dict):
            # Canonical JSON -> byte-stable serialization of the dict.
            return json.dumps(obj, sort_keys=True, separators=(",", ":"))
        elif isinstance(obj, list):
            # Handle each list element individually.
            return [PromptCachingCache.serialize_object(item) for item in obj]
        elif isinstance(obj, (int, float, bool)):
            return obj  # Keep primitive types as-is
        return str(obj)

    @staticmethod
    def extract_cacheable_prefix(
        messages: List["AllMessageValues"],
    ) -> List["AllMessageValues"]:
        """
        Extract the cacheable prefix from ``messages``.

        The cacheable prefix is everything UP TO AND INCLUDING the LAST
        content block (across all messages) that carries
        ``cache_control={"type": "ephemeral"}`` — including all earlier blocks
        even when they have no cache_control of their own.

        Args:
            messages: list of messages to extract the cacheable prefix from.

        Returns:
            The prefix as a (possibly truncated) list of messages, or ``[]``
            when no ephemeral cache_control marker is present.
        """
        if not messages:
            return messages

        # Locate the last ephemeral marker. content idx is None when the
        # marker sits at message level (string content), meaning the entire
        # message content is cacheable rather than a specific block.
        last_cacheable_message_idx = None
        last_cacheable_content_idx = None
        for msg_idx, message in enumerate(messages):
            content = message.get("content")

            # Message-level marker, a sibling of string content:
            # {"role": "user", "content": "...", "cache_control": {"type": "ephemeral"}}
            message_level_cache_control = message.get("cache_control")
            if (
                message_level_cache_control is not None
                and isinstance(message_level_cache_control, dict)
                and message_level_cache_control.get("type") == "ephemeral"
            ):
                last_cacheable_message_idx = msg_idx
                last_cacheable_content_idx = None

            # Block-level markers inside list-shaped content.
            if not isinstance(content, list):
                continue
            for content_idx, content_block in enumerate(content):
                if isinstance(content_block, dict):
                    cache_control = content_block.get("cache_control")
                    if (
                        cache_control is not None
                        and isinstance(cache_control, dict)
                        and cache_control.get("type") == "ephemeral"
                    ):
                        last_cacheable_message_idx = msg_idx
                        last_cacheable_content_idx = content_idx

        # No cacheable block found -> no cacheable prefix.
        if last_cacheable_message_idx is None:
            return []

        # Build the prefix: every message before the last cacheable one in
        # full, then that message truncated at the marked content block.
        cacheable_prefix: List["AllMessageValues"] = []
        for msg_idx, message in enumerate(messages):
            if msg_idx < last_cacheable_message_idx:
                cacheable_prefix.append(message)
            elif msg_idx == last_cacheable_message_idx:
                content = message.get("content")
                if isinstance(content, list) and last_cacheable_content_idx is not None:
                    # Copy the message keeping only blocks up to and including
                    # the last cacheable one.
                    message_copy = cast(
                        "AllMessageValues",
                        {
                            **message,
                            "content": content[: last_cacheable_content_idx + 1],
                        },
                    )
                    cacheable_prefix.append(message_copy)
                else:
                    # String content / message-level marker: keep whole message.
                    cacheable_prefix.append(message)
            else:
                # Everything after the last cacheable block is excluded.
                break
        return cacheable_prefix

    @staticmethod
    def get_prompt_caching_cache_key(
        messages: Optional[List["AllMessageValues"]],
        tools: Optional[List["ChatCompletionToolParam"]],
    ) -> Optional[str]:
        """
        Build a stable cache key from the cacheable prefix of ``messages``
        plus ``tools``.

        Returns:
            ``"deployment:<sha256 hex>:prompt_caching"``, or ``None`` when
            both inputs are ``None`` or the messages contain no cacheable
            prefix.
        """
        if messages is None and tools is None:
            return None

        # Only include messages up to the last cache_control block, so
        # requests differing only in trailing user turns share a key.
        cacheable_messages = None
        if messages is not None:
            cacheable_messages = PromptCachingCache.extract_cacheable_prefix(messages)
            # No cacheable prefix found -> can't cache.
            if not cacheable_messages:
                return None

        # Use serialize_object for consistent and stable serialization.
        data_to_hash: dict = {}
        if cacheable_messages is not None:
            data_to_hash["messages"] = PromptCachingCache.serialize_object(
                cacheable_messages
            )
        if tools is not None:
            data_to_hash["tools"] = PromptCachingCache.serialize_object(tools)

        # Canonical JSON of the combined data, then SHA-256 for a stable key.
        data_to_hash_str = json.dumps(
            data_to_hash,
            sort_keys=True,
            separators=(",", ":"),
        )
        hashed_data = hashlib.sha256(data_to_hash_str.encode()).hexdigest()
        return f"deployment:{hashed_data}:prompt_caching"

    def add_model_id(
        self,
        model_id: str,
        messages: Optional[List["AllMessageValues"]],
        tools: Optional[List["ChatCompletionToolParam"]],
        ttl: int = DEFAULT_TTL_SECONDS,
    ) -> None:
        """
        Record that ``model_id`` served this cacheable prompt (sync).

        Args:
            ttl: seconds to keep the entry; defaults to DEFAULT_TTL_SECONDS.
        """
        cache_key = PromptCachingCache.get_prompt_caching_cache_key(messages, tools)
        # No cacheable prefix (or no inputs) -> no key -> nothing to store.
        if cache_key is None:
            return None
        self.cache.set_cache(
            cache_key, PromptCachingCacheValue(model_id=model_id), ttl=ttl
        )
        return None

    async def async_add_model_id(
        self,
        model_id: str,
        messages: Optional[List["AllMessageValues"]],
        tools: Optional[List["ChatCompletionToolParam"]],
        ttl: int = DEFAULT_TTL_SECONDS,
    ) -> None:
        """
        Record that ``model_id`` served this cacheable prompt (async).

        Args:
            ttl: seconds to keep the entry; defaults to DEFAULT_TTL_SECONDS.
        """
        cache_key = PromptCachingCache.get_prompt_caching_cache_key(messages, tools)
        # No cacheable prefix (or no inputs) -> no key -> nothing to store.
        if cache_key is None:
            return None
        await self.cache.async_set_cache(
            cache_key,
            PromptCachingCacheValue(model_id=model_id),
            ttl=ttl,  # default: store for 5 minutes
        )
        return None

    async def async_get_model_id(
        self,
        messages: Optional[List["AllMessageValues"]],
        tools: Optional[List["ChatCompletionToolParam"]],
    ) -> Optional["PromptCachingCacheValue"]:
        """
        Get the model ID recorded for this prompt's cacheable prefix (async).

        The key depends only on the cacheable prefix (up to the last
        cache_control block), so requests with the same prefix but different
        trailing user messages resolve to the same entry.
        """
        cache_key = PromptCachingCache.get_prompt_caching_cache_key(messages, tools)
        if cache_key is None:
            return None
        return await self.cache.async_get_cache(key=cache_key)

    def get_model_id(
        self,
        messages: Optional[List["AllMessageValues"]],
        tools: Optional[List["ChatCompletionToolParam"]],
    ) -> Optional["PromptCachingCacheValue"]:
        """Get the model ID recorded for this prompt's cacheable prefix (sync)."""
        cache_key = PromptCachingCache.get_prompt_caching_cache_key(messages, tools)
        # No cacheable prefix -> nothing could have been stored.
        if cache_key is None:
            return None
        return self.cache.get_cache(cache_key)