llm-gateway-competitors/litellm-wheel-src/litellm/integrations/cloudzero/transform.py

# Copyright 2025 CloudZero
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# CHANGELOG: 2025-01-19 - Updated CBF transformation for daily spend tables and proper CloudZero mapping (erik.peterson)
# CHANGELOG: 2025-01-19 - Migrated from pandas to polars for data transformation (erik.peterson)
# CHANGELOG: 2025-01-19 - Initial CBF transformation module (erik.peterson)

"""Transform LiteLLM data to CloudZero AnyCost CBF format."""

from datetime import datetime
from typing import Any, Optional

import polars as pl

from ...types.integrations.cloudzero import CBFRecord
from .cz_resource_names import CZEntityType, CZRNGenerator


class CBFTransformer:
    """Transform LiteLLM usage data to CloudZero Billing Format (CBF).

    Each input row (a LiteLLM daily-spend record) becomes one CBF record whose
    ``resource/id`` is the CZRN produced by ``CZRNGenerator``. Rows that cannot
    be converted are dropped and counted instead of aborting the whole batch.
    """

    def __init__(self):
        """Initialize transformer with CZRN generator."""
        self.czrn_generator = CZRNGenerator()

    def transform(self, data: "pl.DataFrame") -> "pl.DataFrame":
        """Transform LiteLLM data to CBF format.

        Rows whose ``successful_requests`` is not greater than zero are
        filtered out first (only when that column exists). Every remaining row
        is converted with ``_create_cbf_record``; a row whose conversion raises
        is skipped and counted. A summary of dropped/converted rows is printed
        through ``rich``.

        Args:
            data: LiteLLM spend rows.

        Returns:
            A DataFrame built from the successfully converted CBF records;
            an empty DataFrame when ``data`` is empty.
        """
        if data.is_empty():
            return pl.DataFrame()

        # Filter out records with zero successful_requests first
        original_count = len(data)
        if "successful_requests" in data.columns:
            filtered_data = data.filter(pl.col("successful_requests") > 0)
        else:
            filtered_data = data
        filtered_count = len(filtered_data)
        zero_requests_dropped = original_count - filtered_count

        # Function-scope import, mirroring the rich import below.
        import logging

        logger = logging.getLogger(__name__)

        cbf_data = []
        czrn_dropped_count = 0
        for row in filtered_data.iter_rows(named=True):
            try:
                cbf_data.append(self._create_cbf_record(row))
            except Exception as exc:
                # Best-effort: one bad row must not abort the batch. Keep the
                # reason available for debugging instead of discarding it.
                czrn_dropped_count += 1
                logger.debug("Dropping row during CBF transformation: %s", exc)

        # Print summary of dropped records if any
        from rich.console import Console

        console = Console()

        if zero_requests_dropped > 0:
            console.print(
                f"[yellow]⚠️  Dropped {zero_requests_dropped:,} of {original_count:,} records with zero successful_requests[/yellow]"
            )

        if czrn_dropped_count > 0:
            console.print(
                f"[yellow]⚠️  Dropped {czrn_dropped_count:,} of {filtered_count:,} filtered records due to invalid CZRNs[/yellow]"
            )

        if cbf_data:
            console.print(
                f"[green]✓ Successfully transformed {len(cbf_data):,} records[/green]"
            )

        return pl.DataFrame(cbf_data)

    @staticmethod
    def _text(value: Any, default: str = "") -> str:
        """Return ``str(value)``, or ``default`` when ``value`` is ``None``.

        Prevents null cells from being emitted as the literal string "None".
        """
        return default if value is None else str(value)

    @staticmethod
    def _int_or_zero(value: Any) -> int:
        """Return ``int(value)``, treating ``None`` (a null cell) as 0."""
        return 0 if value is None else int(value)

    @staticmethod
    def _float_or_zero(value: Any) -> float:
        """Return ``float(value)``, treating ``None`` (a null cell) as 0.0."""
        return 0.0 if value is None else float(value)

    def _create_cbf_record(self, row: dict[str, Any]) -> "CBFRecord":
        """Create a single CBF record from LiteLLM daily spend row.

        Args:
            row: One spend row as a column-name -> value mapping. Missing keys
                and ``None`` values are treated the same way (empty string for
                text fields, zero for numeric fields).

        Returns:
            A ``CBFRecord`` holding the CBF columns plus ``resource/tag:*``
            entries for every non-empty dimension.

        Raises:
            Exception: whatever the CZRN generator raises, or
                ``ValueError``/``TypeError`` when a numeric field cannot be
                converted. The caller treats any exception as "drop this row".
        """
        # Parse date (daily spend tables use date strings like '2025-04-19')
        usage_date = self._parse_date(row.get("date"))

        # Calculate total tokens
        prompt_tokens = self._int_or_zero(row.get("prompt_tokens"))
        completion_tokens = self._int_or_zero(row.get("completion_tokens"))
        total_tokens = prompt_tokens + completion_tokens

        # Create CloudZero Resource Name (CZRN) as resource_id
        resource_id = self.czrn_generator.create_from_litellm_data(row)

        # Text fields shared between the dimensions and the CBF columns
        model = self._text(row.get("model"))
        model_group = self._text(row.get("model_group"))
        llm_provider = self._text(row.get("custom_llm_provider"))
        # First 8 chars for identification
        api_key_hash = self._text(row.get("api_key"))[:8]

        # Handle team information with fallbacks
        team_id = row.get("team_id")
        team_alias = row.get("team_alias")
        user_email = row.get("user_email")

        # Use team_alias if available, otherwise team_id, otherwise 'unknown'
        if team_alias:
            entity_id = str(team_alias)
        elif team_id:
            entity_id = str(team_id)
        else:
            entity_id = "unknown"

        # Get alias fields if they exist
        api_key_alias = row.get("api_key_alias")
        organization_alias = row.get("organization_alias")
        project_alias = row.get("project_alias")
        user_alias = row.get("user_alias")

        dimensions = {
            "entity_type": CZEntityType.TEAM.value,
            "entity_id": entity_id,
            "team_alias": str(team_alias) if team_alias else "unknown",
            "model": model,
            "model_group": model_group,
            "provider": llm_provider,
            "api_key_prefix": api_key_hash,
            "api_key_alias": self._text(api_key_alias),
            "user_email": str(user_email) if user_email else "",
            "api_requests": self._text(row.get("api_requests"), "0"),
            "successful_requests": self._text(row.get("successful_requests"), "0"),
            "failed_requests": self._text(row.get("failed_requests"), "0"),
            "cache_creation_tokens": self._text(
                row.get("cache_creation_input_tokens"), "0"
            ),
            "cache_read_tokens": self._text(row.get("cache_read_input_tokens"), "0"),
            "organization_alias": str(organization_alias) if organization_alias else "",
            "project_alias": str(project_alias) if project_alias else "",
            "user_alias": str(user_alias) if user_alias else "",
        }

        # Extract CZRN components to populate corresponding CBF columns.
        # Only provider, region and cloud-local-id are used below; the
        # six-way unpack still checks that six components came back.
        (
            _service_type,
            provider,
            region,
            _owner_account_id,
            _resource_type,
            cloud_local_id,
        ) = self.czrn_generator.extract_components(resource_id)

        # Build resource/account as concat of api_key_alias and api_key_prefix
        resource_account = (
            f"{api_key_alias}|{api_key_hash}" if api_key_alias else api_key_hash
        )

        # CloudZero CBF format with proper column names
        cbf_record = {
            # Required CBF fields
            "time/usage_start": usage_date.isoformat() if usage_date else None,
            "cost/cost": self._float_or_zero(row.get("spend")),  # billed cost
            "resource/id": resource_id,  # CZRN (CloudZero Resource Name)
            # Usage metrics for token consumption
            "usage/amount": total_tokens,  # Numeric value of tokens consumed
            "usage/units": "tokens",  # Description of token units
            # CBF fields - updated per LIT-1907
            "resource/service": model_group,  # Send model_group
            "resource/account": resource_account,  # api_key_alias|api_key_prefix
            "resource/region": region,  # Maps to CZRN region
            "resource/usage_family": llm_provider,  # Send provider
            # Action field
            "action/operation": str(team_id) if team_id else "",  # Send team_id
            # Line item details
            "lineitem/type": "Usage",  # Standard usage line item
        }

        # Seed provider/model tags from the CZRN components. The dimension
        # loop below overwrites them when the row carries a non-empty
        # "provider"/"model" dimension.
        cbf_record["resource/tag:provider"] = provider
        cbf_record["resource/tag:model"] = cloud_local_id

        # Add resource tags for all dimensions (resource/tag:<key> format),
        # skipping empty and placeholder values.
        for key, value in dimensions.items():
            if value and value != "N/A" and value != "unknown":
                cbf_record[f"resource/tag:{key}"] = str(value)

        # Add token breakdown as resource tags for analysis
        # (excluding total_tokens per LIT-1907)
        if prompt_tokens > 0:
            cbf_record["resource/tag:prompt_tokens"] = str(prompt_tokens)
        if completion_tokens > 0:
            cbf_record["resource/tag:completion_tokens"] = str(completion_tokens)

        return CBFRecord(cbf_record)

    def _parse_date(self, date_str) -> Optional[datetime]:
        """Parse date value from daily spend tables (e.g., '2025-04-19').

        Args:
            date_str: ``None``, a ``datetime``, a ``date``, or a string.

        Returns:
            * ``datetime`` input unchanged;
            * a ``date`` promoted to midnight of that day;
            * a ``'YYYY-MM-DD'`` string as a naive midnight ``datetime``;
            * any other string via polars' datetime inference;
            * ``None`` for ``None``, unparseable strings, or other types.
        """
        # Function-scope import: the module only imports ``datetime``.
        from datetime import date

        if date_str is None:
            return None

        # datetime is a subclass of date, so it must be checked first.
        if isinstance(date_str, datetime):
            return date_str

        if isinstance(date_str, date):
            return datetime(date_str.year, date_str.month, date_str.day)

        if not isinstance(date_str, str):
            return None

        try:
            # Common case: plain day string -> midnight, no polars round-trip.
            return datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            pass

        try:
            # Fallback: let polars infer the datetime format
            return pl.Series([date_str]).str.to_datetime().item()
        except Exception:
            return None
chore: initial public snapshot for github upload 2026-03-26 20:06:14 +08:00			`# Copyright 2025 CloudZero`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`
			`# CHANGELOG: 2025-01-19 - Updated CBF transformation for daily spend tables and proper CloudZero mapping (erik.peterson)`
			`# CHANGELOG: 2025-01-19 - Migrated from pandas to polars for data transformation (erik.peterson)`
			`# CHANGELOG: 2025-01-19 - Initial CBF transformation module (erik.peterson)`

			`"""Transform LiteLLM data to CloudZero AnyCost CBF format."""`

			`from datetime import datetime`
			`from typing import Any, Optional`

			`import polars as pl`

			`from ...types.integrations.cloudzero import CBFRecord`
			`from .cz_resource_names import CZEntityType, CZRNGenerator`


			`class CBFTransformer:`
			`"""Transform LiteLLM usage data to CloudZero Billing Format (CBF)."""`

			`def __init__(self):`
			`"""Initialize transformer with CZRN generator."""`
			`self.czrn_generator = CZRNGenerator()`

			`def transform(self, data: pl.DataFrame) -> pl.DataFrame:`
			`"""Transform LiteLLM data to CBF format, dropping records with zero successful_requests or invalid CZRNs."""`
			`if data.is_empty():`
			`return pl.DataFrame()`

			`# Filter out records with zero successful_requests first`
			`original_count = len(data)`
			`if "successful_requests" in data.columns:`
			`filtered_data = data.filter(pl.col("successful_requests") > 0)`
			`zero_requests_dropped = original_count - len(filtered_data)`
			`else:`
			`filtered_data = data`
			`zero_requests_dropped = 0`

			`cbf_data = []`
			`czrn_dropped_count = 0`
			`filtered_count = len(filtered_data)`

			`for row in filtered_data.iter_rows(named=True):`
			`try:`
			`cbf_record = self._create_cbf_record(row)`
			`# Only include the record if CZRN generation was successful`
			`cbf_data.append(cbf_record)`
			`except Exception:`
			`# Skip records that fail CZRN generation`
			`czrn_dropped_count += 1`
			`continue`

			`# Print summary of dropped records if any`
			`from rich.console import Console`

			`console = Console()`

			`if zero_requests_dropped > 0:`
			`console.print(`
			`f"[yellow]⚠️ Dropped {zero_requests_dropped:,} of {original_count:,} records with zero successful_requests[/yellow]"`
			`)`

			`if czrn_dropped_count > 0:`
			`console.print(`
			`f"[yellow]⚠️ Dropped {czrn_dropped_count:,} of {filtered_count:,} filtered records due to invalid CZRNs[/yellow]"`
			`)`

			`if len(cbf_data) > 0:`
			`console.print(`
			`f"[green]✓ Successfully transformed {len(cbf_data):,} records[/green]"`
			`)`

			`return pl.DataFrame(cbf_data)`

			`def _create_cbf_record(self, row: dict[str, Any]) -> CBFRecord:`
			`"""Create a single CBF record from LiteLLM daily spend row."""`

			`# Parse date (daily spend tables use date strings like '2025-04-19')`
			`usage_date = self._parse_date(row.get("date"))`

			`# Calculate total tokens`
			`prompt_tokens = int(row.get("prompt_tokens", 0))`
			`completion_tokens = int(row.get("completion_tokens", 0))`
			`total_tokens = prompt_tokens + completion_tokens`

			`# Create CloudZero Resource Name (CZRN) as resource_id`
			`resource_id = self.czrn_generator.create_from_litellm_data(row)`

			`# Build dimensions for CloudZero`
			`model = str(row.get("model", ""))`
			`api_key_hash = str(row.get("api_key", ""))[`
			`:8`
			`] # First 8 chars for identification`

			`# Handle team information with fallbacks`
			`team_id = row.get("team_id")`
			`team_alias = row.get("team_alias")`
			`user_email = row.get("user_email")`

			`# Use team_alias if available, otherwise team_id, otherwise fallback to 'unknown'`
			`entity_id = (`
			`str(team_alias) if team_alias else (str(team_id) if team_id else "unknown")`
			`)`

			`# Get alias fields if they exist`
			`api_key_alias = row.get("api_key_alias")`
			`organization_alias = row.get("organization_alias")`
			`project_alias = row.get("project_alias")`
			`user_alias = row.get("user_alias")`

			`dimensions = {`
			`"entity_type": CZEntityType.TEAM.value,`
			`"entity_id": entity_id,`
			`"team_alias": str(team_alias) if team_alias else "unknown",`
			`"model": model,`
			`"model_group": str(row.get("model_group", "")),`
			`"provider": str(row.get("custom_llm_provider", "")),`
			`"api_key_prefix": api_key_hash,`
			`"api_key_alias": str(row.get("api_key_alias", "")),`
			`"user_email": str(user_email) if user_email else "",`
			`"api_requests": str(row.get("api_requests", 0)),`
			`"successful_requests": str(row.get("successful_requests", 0)),`
			`"failed_requests": str(row.get("failed_requests", 0)),`
			`"cache_creation_tokens": str(row.get("cache_creation_input_tokens", 0)),`
			`"cache_read_tokens": str(row.get("cache_read_input_tokens", 0)),`
			`"organization_alias": str(organization_alias) if organization_alias else "",`
			`"project_alias": str(project_alias) if project_alias else "",`
			`"user_alias": str(user_alias) if user_alias else "",`
			`}`

			`# Extract CZRN components to populate corresponding CBF columns`
			`czrn_components = self.czrn_generator.extract_components(resource_id)`
			`(`
			`service_type,`
			`provider,`
			`region,`
			`owner_account_id,`
			`resource_type,`
			`cloud_local_id,`
			`) = czrn_components`

			`# Build resource/account as concat of api_key_alias and api_key_prefix`
			`resource_account = (`
			`f"{api_key_alias}\|{api_key_hash}" if api_key_alias else api_key_hash`
			`)`

			`# CloudZero CBF format with proper column names`
			`cbf_record = {`
			`# Required CBF fields`
			`"time/usage_start": usage_date.isoformat()`
			`if usage_date`
			`else None, # Required: ISO-formatted UTC datetime`
			`"cost/cost": float(row.get("spend", 0.0)), # Required: billed cost`
			`"resource/id": resource_id, # CZRN (CloudZero Resource Name)`
			`# Usage metrics for token consumption`
			`"usage/amount": total_tokens, # Numeric value of tokens consumed`
			`"usage/units": "tokens", # Description of token units`
			`# CBF fields - updated per LIT-1907`
			`"resource/service": str(row.get("model_group", "")), # Send model_group`
			`"resource/account": resource_account, # Send api_key_alias\|api_key_prefix`
			`"resource/region": region, # Maps to CZRN region (cross-region)`
			`"resource/usage_family": str(`
			`row.get("custom_llm_provider", "")`
			`), # Send provider`
			`# Action field`
			`"action/operation": str(team_id) if team_id else "", # Send team_id`
			`# Line item details`
			`"lineitem/type": "Usage", # Standard usage line item`
			`}`

			`# Add CZRN components that don't have direct CBF column mappings as resource tags`
			`cbf_record["resource/tag:provider"] = provider # CZRN provider component`
			`cbf_record[`
			`"resource/tag:model"`
			`] = cloud_local_id # CZRN cloud-local-id component (model)`

			`# Add resource tags for all dimensions (using resource/tag:<key> format)`
			`for key, value in dimensions.items():`
			`if (`
			`value and value != "N/A" and value != "unknown"`
			`): # Only add meaningful tags`
			`cbf_record[f"resource/tag:{key}"] = str(value)`

			`# Add token breakdown as resource tags for analysis (excluding total_tokens per LIT-1907)`
			`if prompt_tokens > 0:`
			`cbf_record["resource/tag:prompt_tokens"] = str(prompt_tokens)`
			`if completion_tokens > 0:`
			`cbf_record["resource/tag:completion_tokens"] = str(completion_tokens)`

			`return CBFRecord(cbf_record)`

			`def _parse_date(self, date_str) -> Optional[datetime]:`
			`"""Parse date string from daily spend tables (e.g., '2025-04-19')."""`
			`if date_str is None:`
			`return None`

			`if isinstance(date_str, datetime):`
			`return date_str`

			`if isinstance(date_str, str):`
			`try:`
			`# Parse date string and set to midnight UTC for daily aggregation`
			`return pl.Series([date_str]).str.to_datetime("%Y-%m-%d").item()`
			`except Exception:`
			`try:`
			`# Fallback: try ISO format parsing`
			`return pl.Series([date_str]).str.to_datetime().item()`
			`except Exception:`
			`return None`

			`return None`