537 lines
17 KiB
Python
537 lines
17 KiB
Python
|
|
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
|
||
|
|
import base64
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
from httpx._types import RequestFiles
|
||
|
|
|
||
|
|
from litellm.types.videos.main import VideoCreateOptionalRequestParams, VideoObject
|
||
|
|
from litellm.types.router import GenericLiteLLMParams
|
||
|
|
from litellm.secret_managers.main import get_secret_str
|
||
|
|
from litellm.types.videos.utils import (
|
||
|
|
encode_video_id_with_provider,
|
||
|
|
extract_original_video_id,
|
||
|
|
)
|
||
|
|
from litellm.images.utils import ImageEditRequestUtils
|
||
|
|
import litellm
|
||
|
|
from litellm.types.llms.gemini import (
|
||
|
|
GeminiLongRunningOperationResponse,
|
||
|
|
GeminiVideoGenerationInstance,
|
||
|
|
GeminiVideoGenerationParameters,
|
||
|
|
GeminiVideoGenerationRequest,
|
||
|
|
)
|
||
|
|
from litellm.constants import DEFAULT_GOOGLE_VIDEO_DURATION_SECONDS
|
||
|
|
from litellm.llms.base_llm.videos.transformation import BaseVideoConfig
|
||
|
|
|
||
|
|
# Resolve logging/exception types lazily: static type checkers see the real
# classes, but at runtime they are aliased to `Any` so importing this module
# does not pull in the heavy logging/transformation modules (avoids import
# cycles and startup cost).
if TYPE_CHECKING:
    from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
    from ...base_llm.chat.transformation import BaseLLMException as _BaseLLMException

    LiteLLMLoggingObj = _LiteLLMLoggingObj
    BaseLLMException = _BaseLLMException
else:
    LiteLLMLoggingObj = Any
    BaseLLMException = Any
|
||
|
|
|
||
|
|
|
||
|
|
def _convert_image_to_gemini_format(image_file) -> Dict[str, str]:
    """
    Encode an image file as base64 together with its MIME type, in Gemini format.

    Args:
        image_file: File-like object opened in binary mode (e.g., open("path", "rb"))

    Returns:
        Dict with bytesBase64Encoded and mimeType
    """
    # MIME sniffing may consume bytes from the stream, so it runs first.
    detected_mime = ImageEditRequestUtils.get_image_content_type(image_file)

    # Rewind when the stream supports it so the full payload is read below.
    seek = getattr(image_file, "seek", None)
    if seek is not None:
        seek(0)

    raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")

    return {"bytesBase64Encoded": encoded, "mimeType": detected_mime}
|
||
|
|
|
||
|
|
|
||
|
|
class GeminiVideoConfig(BaseVideoConfig):
    """
    Configuration class for Gemini (Veo) video generation.

    Veo uses a long-running operation model:
    1. POST to :predictLongRunning returns operation name
    2. Poll operation until done=true
    3. Extract video URI from response
    4. Download video using file API
    """

    def __init__(self):
        super().__init__()

    def get_supported_openai_params(self, model: str) -> list:
        """
        Get the list of supported OpenAI parameters for Veo video generation.
        Veo supports minimal parameters compared to OpenAI.
        """
        return ["model", "prompt", "input_reference", "seconds", "size"]

    def map_openai_params(
        self,
        video_create_optional_params: VideoCreateOptionalRequestParams,
        model: str,
        drop_params: bool,
    ) -> Dict[str, Any]:
        """
        Map OpenAI-style parameters to Veo format.

        Mappings:
        - prompt → prompt
        - input_reference → image
        - size → aspectRatio (e.g., "1280x720" → "16:9")
        - seconds → durationSeconds (defaults to 4 seconds if not provided)

        All other params are passed through as-is to support Gemini-specific parameters.
        """
        mapped_params: Dict[str, Any] = {}

        # Get supported OpenAI params (exclude "model" and "prompt" which are handled separately)
        supported_openai_params = self.get_supported_openai_params(model)
        openai_params_to_map = {
            param
            for param in supported_openai_params
            if param not in {"model", "prompt"}
        }

        # Map input_reference to image
        if "input_reference" in video_create_optional_params:
            mapped_params["image"] = video_create_optional_params["input_reference"]

        # Map size to aspectRatio
        if "size" in video_create_optional_params:
            size = video_create_optional_params["size"]
            if size is not None:
                aspect_ratio = self._convert_size_to_aspect_ratio(size)
                if aspect_ratio:
                    mapped_params["aspectRatio"] = aspect_ratio

        # Map seconds to durationSeconds, default to 4 seconds (matching OpenAI)
        if "seconds" in video_create_optional_params:
            seconds = video_create_optional_params["seconds"]
            try:
                duration = int(seconds) if isinstance(seconds, str) else seconds
                if duration is not None:
                    mapped_params["durationSeconds"] = duration
            except (ValueError, TypeError):
                # If conversion fails, skip the param; the request-time default
                # (DEFAULT_GOOGLE_VIDEO_DURATION_SECONDS) is applied downstream.
                pass

        # Pass through any other params that weren't mapped (Gemini-specific params)
        for key, value in video_create_optional_params.items():
            if key not in openai_params_to_map and key not in mapped_params:
                mapped_params[key] = value

        return mapped_params

    def _convert_size_to_aspect_ratio(self, size: str) -> Optional[str]:
        """
        Convert OpenAI size format to Veo aspectRatio format.

        https://cloud.google.com/vertex-ai/generative-ai/docs/image/generate-videos

        Supported aspect ratios: 9:16 (portrait), 16:9 (landscape).
        Unknown sizes fall back to "16:9".
        """
        if not size:
            return None

        aspect_ratio_map = {
            "1280x720": "16:9",
            "1920x1080": "16:9",
            "720x1280": "9:16",
            "1080x1920": "9:16",
        }

        return aspect_ratio_map.get(size, "16:9")

    def validate_environment(
        self,
        headers: dict,
        model: str,
        api_key: Optional[str] = None,
        litellm_params: Optional[GenericLiteLLMParams] = None,
    ) -> dict:
        """
        Validate environment and add Gemini API key to headers.
        Gemini uses x-goog-api-key header for authentication.

        Raises:
            ValueError: if no API key can be resolved from any source.
        """
        # Use api_key from litellm_params if available, otherwise fall back to other sources
        if litellm_params and litellm_params.api_key:
            api_key = api_key or litellm_params.api_key

        api_key = (
            api_key
            or litellm.api_key
            or get_secret_str("GOOGLE_API_KEY")
            or get_secret_str("GEMINI_API_KEY")
        )

        if not api_key:
            raise ValueError(
                "GEMINI_API_KEY or GOOGLE_API_KEY is required for Veo video generation. "
                "Set it via environment variable or pass it as api_key parameter."
            )

        headers.update(
            {
                "x-goog-api-key": api_key,
                "Content-Type": "application/json",
            }
        )
        return headers

    def get_complete_url(
        self,
        model: str,
        api_base: Optional[str],
        litellm_params: dict,
    ) -> str:
        """
        Get the complete URL for Veo video generation.
        For video creation: returns full URL with :predictLongRunning
        For status/delete: returns base URL only
        """
        if api_base is None:
            api_base = (
                get_secret_str("GEMINI_API_BASE")
                or "https://generativelanguage.googleapis.com"
            )

        # No model → caller only needs the base URL (status/delete flows).
        if not model:
            return api_base.rstrip("/")

        model_name = model.replace("gemini/", "")
        url = f"{api_base.rstrip('/')}/v1beta/models/{model_name}:predictLongRunning"

        return url

    def transform_video_create_request(
        self,
        model: str,
        prompt: str,
        api_base: str,
        video_create_optional_request_params: Dict,
        litellm_params: GenericLiteLLMParams,
        headers: dict,
    ) -> Tuple[Dict, RequestFiles, str]:
        """
        Transform the video creation request for Veo API.

        Veo expects:
        {
            "instances": [
                {
                    "prompt": "A cat playing with a ball of yarn"
                }
            ],
            "parameters": {
                "aspectRatio": "16:9",
                "durationSeconds": 8,
                "resolution": "720p"
            }
        }
        """
        instance = GeminiVideoGenerationInstance(prompt=prompt)

        # Copy before mutating so the caller's dict is left untouched.
        params_copy = video_create_optional_request_params.copy()

        if "image" in params_copy and params_copy["image"] is not None:
            image_data = _convert_image_to_gemini_format(params_copy["image"])
            params_copy["image"] = image_data

        parameters = GeminiVideoGenerationParameters(**params_copy)

        request_body_obj = GeminiVideoGenerationRequest(
            instances=[instance], parameters=parameters
        )

        request_data = request_body_obj.model_dump(exclude_none=True)

        return request_data, [], api_base

    def transform_video_create_response(
        self,
        model: str,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
        custom_llm_provider: Optional[str] = None,
        request_data: Optional[Dict] = None,
    ) -> VideoObject:
        """
        Transform the Veo video creation response.

        Veo returns:
        {
            "name": "operations/generate_1234567890",
            "metadata": {...},
            "done": false,
            "error": {...}
        }

        We return this as a VideoObject with:
        - id: operation name (used for polling)
        - status: "processing"
        - usage: includes duration_seconds for cost calculation

        Raises:
            ValueError: if the response cannot be parsed or has no operation name.
        """
        response_data = raw_response.json()

        # Parse response using Pydantic model for type safety
        try:
            operation_response = GeminiLongRunningOperationResponse(**response_data)
        except Exception as e:
            # Chain the cause so the original validation error is not lost.
            raise ValueError(f"Failed to parse operation response: {e}") from e

        operation_name = operation_response.name
        if not operation_name:
            raise ValueError(f"No operation name in Veo response: {response_data}")

        if custom_llm_provider:
            video_id = encode_video_id_with_provider(
                operation_name, custom_llm_provider, model
            )
        else:
            video_id = operation_name

        video_obj = VideoObject(
            id=video_id,
            object="video",
            status="processing",
            model=model,
        )

        # Record duration for cost calculation; fall back to the Google default
        # when the request did not specify durationSeconds.
        usage_data: Dict[str, float] = {}
        if request_data:
            parameters = request_data.get("parameters", {})
            duration = (
                parameters.get("durationSeconds")
                or DEFAULT_GOOGLE_VIDEO_DURATION_SECONDS
            )
            if duration is not None:
                try:
                    usage_data["duration_seconds"] = float(duration)
                except (ValueError, TypeError):
                    pass

        video_obj.usage = usage_data
        return video_obj

    def transform_video_status_retrieve_request(
        self,
        video_id: str,
        api_base: str,
        litellm_params: GenericLiteLLMParams,
        headers: dict,
    ) -> Tuple[str, Dict]:
        """
        Transform the video status retrieve request for Veo API.

        Veo polls operations at:
        GET https://generativelanguage.googleapis.com/v1beta/{operation_name}
        """
        operation_name = extract_original_video_id(video_id)
        url = f"{api_base.rstrip('/')}/v1beta/{operation_name}"
        params: Dict[str, Any] = {}

        return url, params

    def transform_video_status_retrieve_response(
        self,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
        custom_llm_provider: Optional[str] = None,
    ) -> VideoObject:
        """
        Transform the Veo operation status response.

        Veo returns:
        {
            "name": "operations/generate_1234567890",
            "done": false  # or true when complete
        }

        When done=true:
        {
            "name": "operations/generate_1234567890",
            "done": true,
            "response": {
                "generateVideoResponse": {
                    "generatedSamples": [
                        {
                            "video": {
                                "uri": "files/abc123..."
                            }
                        }
                    ]
                }
            }
        }

        Raises:
            ValueError: if the response carries no operation name.
        """
        response_data = raw_response.json()
        # Parse response using Pydantic model for type safety
        operation_response = GeminiLongRunningOperationResponse(**response_data)

        operation_name = operation_response.name
        # Guard against a missing name (same check as the create path) so we
        # never encode a bogus/None video id.
        if not operation_name:
            raise ValueError(f"No operation name in Veo response: {response_data}")
        is_done = operation_response.done

        if custom_llm_provider:
            video_id = encode_video_id_with_provider(
                operation_name, custom_llm_provider, None
            )
        else:
            video_id = operation_name

        video_obj = VideoObject(
            id=video_id,
            object="video",
            status="processing" if not is_done else "completed",
        )
        return video_obj

    def transform_video_content_request(
        self,
        video_id: str,
        api_base: str,
        litellm_params: GenericLiteLLMParams,
        headers: dict,
        variant: Optional[str] = None,
    ) -> Tuple[str, Dict]:
        """
        Transform the video content request for Veo API.

        For Veo, we need to:
        1. Get operation status to extract video URI
        2. Return download URL for the video

        NOTE(review): this performs a blocking HTTP GET inside a transform;
        presumably acceptable for this call path — confirm against callers.

        Raises:
            ValueError: if generation is incomplete or no video URI is present.
        """
        operation_name = extract_original_video_id(video_id)

        status_url = f"{api_base.rstrip('/')}/v1beta/{operation_name}"
        client = litellm.module_level_client
        status_response = client.get(url=status_url, headers=headers)
        status_response.raise_for_status()
        response_data = status_response.json()

        operation_response = GeminiLongRunningOperationResponse(**response_data)

        if not operation_response.done:
            raise ValueError(
                "Video generation is not complete yet. "
                "Please check status with video_status() before downloading."
            )

        if not operation_response.response:
            raise ValueError("No response data in completed operation")

        # Raise a clear error instead of an opaque AttributeError/IndexError
        # when the completed operation carries no video sample.
        generate_video_response = operation_response.response.generateVideoResponse
        generated_samples = (
            generate_video_response.generatedSamples
            if generate_video_response is not None
            else None
        )
        if not generated_samples:
            raise ValueError(
                "No generated video samples found in completed Veo operation"
            )
        download_url = generated_samples[0].video.uri

        params: Dict[str, Any] = {}

        return download_url, params

    def transform_video_content_response(
        self,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
    ) -> bytes:
        """
        Transform the Veo video content download response.
        Returns the video bytes directly.
        """
        return raw_response.content

    def transform_video_remix_request(
        self,
        video_id: str,
        prompt: str,
        api_base: str,
        litellm_params: GenericLiteLLMParams,
        headers: dict,
        extra_body: Optional[Dict[str, Any]] = None,
    ) -> Tuple[str, Dict]:
        """
        Video remix is not supported by Veo API.
        """
        raise NotImplementedError(
            "Video remix is not supported by Google Veo. "
            "Please use video_generation() to create new videos."
        )

    def transform_video_remix_response(
        self,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
        custom_llm_provider: Optional[str] = None,
    ) -> VideoObject:
        """Video remix is not supported."""
        raise NotImplementedError("Video remix is not supported by Google Veo.")

    def transform_video_list_request(
        self,
        api_base: str,
        litellm_params: GenericLiteLLMParams,
        headers: dict,
        after: Optional[str] = None,
        limit: Optional[int] = None,
        order: Optional[str] = None,
        extra_query: Optional[Dict[str, Any]] = None,
    ) -> Tuple[str, Dict]:
        """
        Video list is not supported by Veo API.
        """
        raise NotImplementedError(
            "Video list is not supported by Google Veo. "
            "Use the operations endpoint directly if you need to list operations."
        )

    def transform_video_list_response(
        self,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
        custom_llm_provider: Optional[str] = None,
    ) -> Dict[str, str]:
        """Video list is not supported."""
        raise NotImplementedError("Video list is not supported by Google Veo.")

    def transform_video_delete_request(
        self,
        video_id: str,
        api_base: str,
        litellm_params: GenericLiteLLMParams,
        headers: dict,
    ) -> Tuple[str, Dict]:
        """
        Video delete is not supported by Veo API.
        """
        raise NotImplementedError(
            "Video delete is not supported by Google Veo. "
            "Videos are automatically cleaned up by Google."
        )

    def transform_video_delete_response(
        self,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
    ) -> VideoObject:
        """Video delete is not supported."""
        raise NotImplementedError("Video delete is not supported by Google Veo.")

    def get_error_class(
        self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
    ) -> BaseLLMException:
        """Return the provider-specific exception for a failed HTTP response."""
        from ..common_utils import GeminiError

        return GeminiError(
            status_code=status_code,
            message=error_message,
            headers=headers,
        )
|