133 lines
4.3 KiB
Python
133 lines
4.3 KiB
Python
"""
|
|
Helper util for handling anthropic-specific cost calculation
|
|
- e.g.: prompt caching
|
|
"""
|
|
|
|
from typing import TYPE_CHECKING, Optional, Tuple
|
|
|
|
from litellm.litellm_core_utils.llm_cost_calc.utils import (
|
|
_get_token_base_cost,
|
|
_parse_prompt_tokens_details,
|
|
calculate_cache_writing_cost,
|
|
generic_cost_per_token,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from litellm.types.utils import ModelInfo, Usage
|
|
import litellm
|
|
|
|
|
|
def _compute_cache_only_cost(model_info: "ModelInfo", usage: "Usage") -> float:
    """
    Compute the cache read + cache write share of the prompt cost.

    Callers use this to keep cache charges out of geo/speed multiplier
    scaling: the old explicit ``fast/`` model entries multiplied only the
    regular input/output token costs and carried unchanged cache rates.

    Returns ``0.0`` when ``usage.prompt_tokens_details`` is ``None``.
    """
    if usage.prompt_tokens_details is None:
        return 0.0

    details = _parse_prompt_tokens_details(usage)

    # The first two base-cost entries are not needed here; only the three
    # cache rates are used.
    (
        _unused_first,
        _unused_second,
        write_rate,
        write_rate_above_1hr,
        read_rate,
    ) = _get_token_base_cost(model_info=model_info, usage=usage)

    read_cost = float(details["cache_hit_tokens"]) * read_rate

    written_tokens = details["cache_creation_tokens"]
    written_token_details = details["cache_creation_token_details"]

    # No cache writes reported in either form: only the read cost applies.
    if not written_tokens and written_token_details is None:
        return read_cost

    return read_cost + calculate_cache_writing_cost(
        cache_creation_tokens=written_tokens,
        cache_creation_token_details=written_token_details,
        cache_creation_cost_above_1hr=write_rate_above_1hr,
        cache_creation_cost=write_rate,
    )
|
|
|
|
|
|
def cost_per_token(model: str, usage: "Usage") -> Tuple[float, float]:
    """
    Calculates the prompt and completion cost in USD for an Anthropic request.

    The base cost comes from ``generic_cost_per_token``. If the model info
    carries a ``provider_specific_entry`` mapping, its geo and/or ``fast``
    multipliers are then applied to the regular token costs. The cache
    read/write share of the prompt cost is excluded from that scaling.

    Applying the multipliers is best-effort: if anything in that step fails,
    the unscaled base cost is returned and the failure is logged at debug
    level instead of being silently discarded.

    Input:
        - model: str, the model name without provider prefix
        - usage: LiteLLM Usage block, containing anthropic caching information

    Returns:
        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
    """
    import logging

    prompt_cost, completion_cost = generic_cost_per_token(
        model=model, usage=usage, custom_llm_provider="anthropic"
    )

    # Apply provider_specific_entry multipliers for geo/speed routing
    try:
        model_info = litellm.get_model_info(
            model=model, custom_llm_provider="anthropic"
        )
        provider_specific_entry: dict = model_info.get("provider_specific_entry") or {}

        multiplier = 1.0

        # Both attributes are optional on the usage object.
        inference_geo = getattr(usage, "inference_geo", None)
        if inference_geo and inference_geo.lower() not in (
            "global",
            "not_available",
        ):
            multiplier *= provider_specific_entry.get(inference_geo.lower(), 1.0)
        if getattr(usage, "speed", None) == "fast":
            multiplier *= provider_specific_entry.get("fast", 1.0)

        if multiplier != 1.0:
            cache_cost = _compute_cache_only_cost(model_info=model_info, usage=usage)
            # Assign both values in one statement so a failure can never
            # leave the prompt cost scaled and the completion cost unscaled.
            prompt_cost, completion_cost = (
                (prompt_cost - cache_cost) * multiplier + cache_cost,
                completion_cost * multiplier,
            )
    except Exception:
        # Deliberate best-effort: fall back to the base cost, but leave a
        # trace so a wrong multiplier configuration can be diagnosed.
        logging.getLogger(__name__).debug(
            "Failed to apply anthropic geo/speed cost multipliers for model=%s; "
            "using base cost",
            model,
            exc_info=True,
        )

    return prompt_cost, completion_cost
|
|
|
|
|
|
def get_cost_for_anthropic_web_search(
    model_info: Optional["ModelInfo"] = None,
    usage: Optional["Usage"] = None,
) -> float:
    """
    Return the cost of the web search tool calls recorded in ``usage``.

    The cost is the model's ``search_context_size_medium`` price (from
    ``search_context_cost_per_query``) multiplied by
    ``usage.server_tool_use.web_search_requests``. Returns ``0.0`` when the
    model info, the usage, the web search request count, or the price is
    missing, or when the price is zero.
    """
    from litellm.types.utils import SearchContextCostPerQuery

    # Nothing to price without both the model info and a usage object.
    if model_info is None or usage is None:
        return 0.0

    server_tool_use = usage.server_tool_use
    if server_tool_use is None:
        return 0.0

    request_count = server_tool_use.web_search_requests
    if request_count is None:
        return 0.0

    # Fall back to an empty pricing entry when the model declares none.
    pricing: SearchContextCostPerQuery = (
        model_info.get("search_context_cost_per_query") or SearchContextCostPerQuery()
    )
    price_per_request = pricing.get("search_context_size_medium", 0.0)
    if price_per_request is None or price_per_request == 0.0:
        return 0.0

    return price_per_request * request_count
|