"""
This file contains the transformation logic for Bedrock Nova Sonic realtime API.
Transforms between OpenAI Realtime API format and Bedrock Nova Sonic format.
"""

import json
import uuid as uuid_lib
from typing import Any, List, Optional, Union, cast

from litellm._logging import verbose_logger
from litellm._uuid import uuid
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.realtime.transformation import BaseRealtimeConfig
from litellm.types.llms.openai import (
OpenAIRealtimeContentPartDone,
OpenAIRealtimeDoneEvent,
OpenAIRealtimeEvents,
OpenAIRealtimeOutputItemDone,
OpenAIRealtimeResponseAudioDone,
OpenAIRealtimeResponseContentPartAdded,
OpenAIRealtimeResponseDelta,
OpenAIRealtimeResponseDoneObject,
OpenAIRealtimeResponseTextDone,
OpenAIRealtimeStreamResponseBaseObject,
OpenAIRealtimeStreamResponseOutputItemAdded,
OpenAIRealtimeStreamSession,
OpenAIRealtimeStreamSessionEvents,
)
from litellm.types.realtime import (
ALL_DELTA_TYPES,
RealtimeResponseTransformInput,
RealtimeResponseTypedDict,
)
from litellm.utils import get_empty_usage


class BedrockRealtimeConfig(BaseRealtimeConfig):
"""Configuration for Bedrock Nova Sonic realtime transformations."""
def __init__(self):
# Track session state
self.prompt_name = str(uuid_lib.uuid4())
self.content_name = str(uuid_lib.uuid4())
self.audio_content_name = str(uuid_lib.uuid4())
# Default configuration values
# Inference configuration
self.max_tokens = 1024
self.top_p = 0.9
self.temperature = 0.7
# Audio output configuration
self.output_sample_rate_hertz = 24000
self.output_sample_size_bits = 16
self.output_channel_count = 1
self.voice_id = "matthew"
self.output_encoding = "base64"
self.output_audio_type = "SPEECH"
self.output_media_type = "audio/lpcm"
# Audio input configuration
self.input_sample_rate_hertz = 16000
self.input_sample_size_bits = 16
self.input_channel_count = 1
self.input_encoding = "base64"
self.input_audio_type = "SPEECH"
self.input_media_type = "audio/lpcm"
# Text configuration
self.text_media_type = "text/plain"
def validate_environment(
self, headers: dict, model: str, api_key: Optional[str] = None
) -> dict:
"""Validate environment - no special validation needed for Bedrock."""
return headers
def get_complete_url(
self, api_base: Optional[str], model: str, api_key: Optional[str] = None
) -> str:
"""Get complete URL - handled by aws_sdk_bedrock_runtime."""
return api_base or ""
def requires_session_configuration(self) -> bool:
"""Bedrock requires session configuration."""
return True
def session_configuration_request(
self, model: str, tools: Optional[List[dict]] = None
) -> str:
"""
Create initial session configuration for Bedrock Nova Sonic.
Args:
model: Model ID
tools: Optional list of tool definitions
        Returns:
            JSON string bundling the session start and prompt start events.
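
        Example (illustrative sketch; the model ID shown is an assumption, not
        required by this method):
            >>> cfg = BedrockRealtimeConfig()
            >>> bundle = json.loads(cfg.session_configuration_request("amazon.nova-sonic-v1:0"))
            >>> sorted(bundle)
            ['prompt_start', 'session_start']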
"""
session_start = {
"event": {
"sessionStart": {
"inferenceConfiguration": {
"maxTokens": self.max_tokens,
"topP": self.top_p,
"temperature": self.temperature,
}
}
}
}
prompt_start_config = {
"promptName": self.prompt_name,
"textOutputConfiguration": {"mediaType": self.text_media_type},
"audioOutputConfiguration": {
"mediaType": self.output_media_type,
"sampleRateHertz": self.output_sample_rate_hertz,
"sampleSizeBits": self.output_sample_size_bits,
"channelCount": self.output_channel_count,
"voiceId": self.voice_id,
"encoding": self.output_encoding,
"audioType": self.output_audio_type,
},
}
# Add tool configuration if tools are provided
if tools:
prompt_start_config["toolUseOutputConfiguration"] = {
"mediaType": "application/json"
}
prompt_start_config["toolConfiguration"] = {
"tools": self._transform_tools_to_bedrock_format(tools)
}
prompt_start = {"event": {"promptStart": prompt_start_config}}
        # Bundle both events into one JSON string; this also serves as a marker
        # that the session configuration has been sent
return json.dumps(
{"session_start": session_start, "prompt_start": prompt_start}
)
def _transform_tools_to_bedrock_format(self, tools: List[dict]) -> List[dict]:
"""
Transform OpenAI tool format to Bedrock tool format.
Args:
tools: List of OpenAI format tools
Returns:
List of Bedrock format tools
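
        Example (derived from the mapping below):
            >>> BedrockRealtimeConfig()._transform_tools_to_bedrock_format([
            ...     {"type": "function", "function": {"name": "get_weather",
            ...      "description": "Look up weather", "parameters": {"type": "object"}}}
            ... ])
            [{'toolSpec': {'name': 'get_weather', 'description': 'Look up weather', 'inputSchema': {'json': '{"type": "object"}'}}}]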
"""
bedrock_tools = []
for tool in tools:
if tool.get("type") == "function":
function = tool.get("function", {})
bedrock_tool = {
"toolSpec": {
"name": function.get("name", ""),
"description": function.get("description", ""),
"inputSchema": {
"json": json.dumps(function.get("parameters", {}))
},
}
}
bedrock_tools.append(bedrock_tool)
return bedrock_tools
def _map_audio_format_to_sample_rate(
self, audio_format: str, is_output: bool = True
) -> int:
"""
Map OpenAI audio format to sample rate.
Args:
audio_format: OpenAI audio format (pcm16, g711_ulaw, g711_alaw)
is_output: Whether this is for output (True) or input (False)
Returns:
Sample rate in Hz
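
        Example:
            >>> cfg = BedrockRealtimeConfig()
            >>> cfg._map_audio_format_to_sample_rate("pcm16", is_output=False)
            16000
            >>> cfg._map_audio_format_to_sample_rate("g711_ulaw")
            8000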
"""
# OpenAI uses 24kHz for output and can vary for input
# Bedrock Nova Sonic uses 24kHz for output and 16kHz for input by default
if audio_format == "pcm16":
return 24000 if is_output else 16000
elif audio_format in ["g711_ulaw", "g711_alaw"]:
return 8000 # G.711 typically uses 8kHz
return 24000 if is_output else 16000
def transform_session_update_event(self, json_message: dict) -> List[str]:
"""
Transform session.update event to Bedrock session configuration.
Args:
json_message: OpenAI session.update message
Returns:
List of Bedrock format messages (JSON strings)
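
        Example (illustrative; the session payload mirrors the OpenAI Realtime API):
            >>> cfg = BedrockRealtimeConfig()
            >>> msgs = cfg.transform_session_update_event(
            ...     {"type": "session.update", "session": {"voice": "tiffany", "temperature": 0.5}}
            ... )
            >>> [list(json.loads(m)["event"])[0] for m in msgs]
            ['sessionStart', 'promptStart']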
"""
verbose_logger.debug("Handling session.update")
messages: List[str] = []
session_config = json_message.get("session", {})
# Update inference configuration from session if provided
if "max_response_output_tokens" in session_config:
self.max_tokens = session_config["max_response_output_tokens"]
if "temperature" in session_config:
self.temperature = session_config["temperature"]
# Update audio output configuration from session if provided
if "voice" in session_config:
self.voice_id = session_config["voice"]
if "output_audio_format" in session_config:
output_format = session_config["output_audio_format"]
self.output_sample_rate_hertz = self._map_audio_format_to_sample_rate(
output_format, is_output=True
)
# Update audio input configuration from session if provided
if "input_audio_format" in session_config:
input_format = session_config["input_audio_format"]
self.input_sample_rate_hertz = self._map_audio_format_to_sample_rate(
input_format, is_output=False
)
# Allow direct override of sample rates if provided (custom extension)
if "output_sample_rate_hertz" in session_config:
self.output_sample_rate_hertz = session_config["output_sample_rate_hertz"]
if "input_sample_rate_hertz" in session_config:
self.input_sample_rate_hertz = session_config["input_sample_rate_hertz"]
# Send session start
session_start = {
"event": {
"sessionStart": {
"inferenceConfiguration": {
"maxTokens": self.max_tokens,
"topP": self.top_p,
"temperature": self.temperature,
}
}
}
}
messages.append(json.dumps(session_start))
# Send prompt start
prompt_start_config = {
"promptName": self.prompt_name,
"textOutputConfiguration": {"mediaType": self.text_media_type},
"audioOutputConfiguration": {
"mediaType": self.output_media_type,
"sampleRateHertz": self.output_sample_rate_hertz,
"sampleSizeBits": self.output_sample_size_bits,
"channelCount": self.output_channel_count,
"voiceId": self.voice_id,
"encoding": self.output_encoding,
"audioType": self.output_audio_type,
},
}
# Add tool configuration if tools are provided
tools = session_config.get("tools")
if tools:
prompt_start_config["toolUseOutputConfiguration"] = {
"mediaType": "application/json"
}
prompt_start_config["toolConfiguration"] = {
"tools": self._transform_tools_to_bedrock_format(tools)
}
prompt_start = {"event": {"promptStart": prompt_start_config}}
messages.append(json.dumps(prompt_start))
# Send system prompt if provided
instructions = session_config.get("instructions")
if instructions:
text_content_name = str(uuid_lib.uuid4())
# Content start
text_content_start = {
"event": {
"contentStart": {
"promptName": self.prompt_name,
"contentName": text_content_name,
"type": "TEXT",
"interactive": False,
"role": "SYSTEM",
"textInputConfiguration": {"mediaType": self.text_media_type},
}
}
}
messages.append(json.dumps(text_content_start))
# Text input
text_input = {
"event": {
"textInput": {
"promptName": self.prompt_name,
"contentName": text_content_name,
"content": instructions,
}
}
}
messages.append(json.dumps(text_input))
# Content end
text_content_end = {
"event": {
"contentEnd": {
"promptName": self.prompt_name,
"contentName": text_content_name,
}
}
}
messages.append(json.dumps(text_content_end))
return messages
def transform_input_audio_buffer_append_event(
self, json_message: dict
) -> List[str]:
"""
Transform input_audio_buffer.append event to Bedrock audio input.
Args:
json_message: OpenAI input_audio_buffer.append message
Returns:
List of Bedrock format messages (JSON strings)
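
        Example (illustrative; ``audio`` is base64-encoded, per the OpenAI API):
            >>> cfg = BedrockRealtimeConfig()
            >>> first = cfg.transform_input_audio_buffer_append_event({"audio": "UklGRg=="})
            >>> [list(json.loads(m)["event"])[0] for m in first]
            ['contentStart', 'audioInput']
            >>> again = cfg.transform_input_audio_buffer_append_event({"audio": "UklGRg=="})
            >>> [list(json.loads(m)["event"])[0] for m in again]
            ['audioInput']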
"""
verbose_logger.debug("Handling input_audio_buffer.append")
messages: List[str] = []
# Check if we need to start audio content
if not hasattr(self, "_audio_content_started"):
audio_content_start = {
"event": {
"contentStart": {
"promptName": self.prompt_name,
"contentName": self.audio_content_name,
"type": "AUDIO",
"interactive": True,
"role": "USER",
"audioInputConfiguration": {
"mediaType": self.input_media_type,
"sampleRateHertz": self.input_sample_rate_hertz,
"sampleSizeBits": self.input_sample_size_bits,
"channelCount": self.input_channel_count,
"audioType": self.input_audio_type,
"encoding": self.input_encoding,
},
}
}
}
messages.append(json.dumps(audio_content_start))
self._audio_content_started = True
# Send audio chunk
audio_data = json_message.get("audio", "")
audio_event = {
"event": {
"audioInput": {
"promptName": self.prompt_name,
"contentName": self.audio_content_name,
"content": audio_data,
}
}
}
messages.append(json.dumps(audio_event))
return messages
def transform_input_audio_buffer_commit_event(
self, json_message: dict
) -> List[str]:
"""
Transform input_audio_buffer.commit event to Bedrock audio content end.
Args:
json_message: OpenAI input_audio_buffer.commit message
Returns:
List of Bedrock format messages (JSON strings)
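
        Example (no contentEnd is emitted unless audio input was started):
            >>> BedrockRealtimeConfig().transform_input_audio_buffer_commit_event({})
            []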
"""
verbose_logger.debug("Handling input_audio_buffer.commit")
messages: List[str] = []
if hasattr(self, "_audio_content_started"):
audio_content_end = {
"event": {
"contentEnd": {
"promptName": self.prompt_name,
"contentName": self.audio_content_name,
}
}
}
messages.append(json.dumps(audio_content_end))
delattr(self, "_audio_content_started")
return messages
def transform_conversation_item_create_event(self, json_message: dict) -> List[str]:
"""
Transform conversation.item.create event to Bedrock text input or tool result.
Args:
json_message: OpenAI conversation.item.create message
Returns:
List of Bedrock format messages (JSON strings)
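
        Example (illustrative):
            >>> msgs = BedrockRealtimeConfig().transform_conversation_item_create_event({
            ...     "item": {"type": "message",
            ...              "content": [{"type": "input_text", "text": "Hello"}]}
            ... })
            >>> [list(json.loads(m)["event"])[0] for m in msgs]
            ['contentStart', 'textInput', 'contentEnd']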
"""
verbose_logger.debug("Handling conversation.item.create")
messages: List[str] = []
item = json_message.get("item", {})
item_type = item.get("type")
# Handle tool result
if item_type == "function_call_output":
return self.transform_conversation_item_create_tool_result_event(
json_message
)
# Handle regular message
if item_type == "message":
content = item.get("content", [])
for content_part in content:
if content_part.get("type") == "input_text":
text_content_name = str(uuid_lib.uuid4())
# Content start
text_content_start = {
"event": {
"contentStart": {
"promptName": self.prompt_name,
"contentName": text_content_name,
"type": "TEXT",
"interactive": True,
"role": "USER",
"textInputConfiguration": {
"mediaType": self.text_media_type
},
}
}
}
messages.append(json.dumps(text_content_start))
# Text input
text_input = {
"event": {
"textInput": {
"promptName": self.prompt_name,
"contentName": text_content_name,
"content": content_part.get("text", ""),
}
}
}
messages.append(json.dumps(text_input))
# Content end
text_content_end = {
"event": {
"contentEnd": {
"promptName": self.prompt_name,
"contentName": text_content_name,
}
}
}
messages.append(json.dumps(text_content_end))
return messages
def transform_response_create_event(self, json_message: dict) -> List[str]:
"""
Transform response.create event to Bedrock format.
Args:
json_message: OpenAI response.create message
Returns:
List of Bedrock format messages (JSON strings)
"""
verbose_logger.debug("Handling response.create")
        # Bedrock starts generating automatically; no explicit trigger is needed
        return []
def transform_response_cancel_event(self, json_message: dict) -> List[str]:
"""
Transform response.cancel event to Bedrock format.
Args:
json_message: OpenAI response.cancel message
Returns:
List of Bedrock format messages (JSON strings)
"""
verbose_logger.debug("Handling response.cancel")
        # No interrupt is forwarded to Bedrock here; cancellation is currently a no-op
        return []
def transform_realtime_request(
self,
message: str,
model: str,
session_configuration_request: Optional[str] = None,
) -> List[str]:
"""
Transform OpenAI realtime request to Bedrock Nova Sonic format.
Args:
message: OpenAI format message (JSON string)
model: Model ID
session_configuration_request: Previous session config
Returns:
List of Bedrock format messages (JSON strings)
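
        Example (illustrative; response.create needs no Bedrock-side trigger, so
        nothing is forwarded):
            >>> cfg = BedrockRealtimeConfig()
            >>> cfg.transform_realtime_request('{"type": "response.create"}', "amazon.nova-sonic-v1:0")
            []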
"""
try:
json_message = json.loads(message)
except json.JSONDecodeError:
verbose_logger.warning(f"Invalid JSON message: {message[:200]}")
return []
message_type = json_message.get("type")
# Route to appropriate transformation method
if message_type == "session.update":
return self.transform_session_update_event(json_message)
elif message_type == "input_audio_buffer.append":
return self.transform_input_audio_buffer_append_event(json_message)
elif message_type == "input_audio_buffer.commit":
return self.transform_input_audio_buffer_commit_event(json_message)
elif message_type == "conversation.item.create":
return self.transform_conversation_item_create_event(json_message)
elif message_type == "response.create":
return self.transform_response_create_event(json_message)
elif message_type == "response.cancel":
return self.transform_response_cancel_event(json_message)
else:
verbose_logger.warning(f"Unknown message type: {message_type}")
return []
def transform_session_start_event(
self,
event: dict,
model: str,
logging_obj: LiteLLMLoggingObj,
) -> OpenAIRealtimeStreamSessionEvents:
"""
Transform Bedrock sessionStart event to OpenAI session.created.
Args:
event: Bedrock sessionStart event
model: Model ID
logging_obj: Logging object
Returns:
OpenAI session.created event
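
        Example (sketch; ``_Log`` is a hypothetical stand-in exposing the one
        attribute this method reads from the real logging object):
            >>> class _Log:
            ...     litellm_trace_id = "trace-123"
            >>> evt = BedrockRealtimeConfig().transform_session_start_event(
            ...     {"sessionStart": {}}, "amazon.nova-sonic-v1:0", _Log())
            >>> evt["type"], evt["session"]["id"]
            ('session.created', 'trace-123')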
"""
verbose_logger.debug("Handling sessionStart")
session = OpenAIRealtimeStreamSession(
id=logging_obj.litellm_trace_id,
modalities=["text", "audio"],
)
        if isinstance(model, str):
            session["model"] = model
return OpenAIRealtimeStreamSessionEvents(
type="session.created",
session=session,
event_id=str(uuid.uuid4()),
)
def transform_content_start_event(
self,
event: dict,
current_response_id: Optional[str],
current_output_item_id: Optional[str],
current_conversation_id: Optional[str],
) -> tuple[
List[OpenAIRealtimeEvents],
Optional[str],
Optional[str],
Optional[str],
Optional[ALL_DELTA_TYPES],
]:
"""
Transform Bedrock contentStart event to OpenAI response events.
Args:
event: Bedrock contentStart event
current_response_id: Current response ID
current_output_item_id: Current output item ID
current_conversation_id: Current conversation ID
Returns:
Tuple of (events, response_id, output_item_id, conversation_id, delta_type)
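
        Example (illustrative; IDs are minted on the first assistant content):
            >>> cfg = BedrockRealtimeConfig()
            >>> events, resp_id, item_id, conv_id, delta_type = cfg.transform_content_start_event(
            ...     {"contentStart": {"role": "ASSISTANT", "type": "TEXT"}}, None, None, None)
            >>> [e["type"] for e in events], delta_type
            (['response.created', 'response.output_item.added', 'response.content_part.added'], 'text')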
"""
content_start = event["contentStart"]
role = content_start.get("role")
if role != "ASSISTANT":
return (
[],
current_response_id,
current_output_item_id,
current_conversation_id,
None,
)
verbose_logger.debug("Handling ASSISTANT contentStart")
# Initialize IDs if needed
if not current_response_id:
current_response_id = f"resp_{uuid.uuid4()}"
if not current_output_item_id:
current_output_item_id = f"item_{uuid.uuid4()}"
if not current_conversation_id:
current_conversation_id = f"conv_{uuid.uuid4()}"
# Determine content type
content_type = content_start.get("type", "TEXT")
current_delta_type: ALL_DELTA_TYPES = (
"text" if content_type == "TEXT" else "audio"
)
returned_messages: List[OpenAIRealtimeEvents] = []
# Send response.created
response_created = OpenAIRealtimeStreamResponseBaseObject(
type="response.created",
event_id=f"event_{uuid.uuid4()}",
response={
"object": "realtime.response",
"id": current_response_id,
"status": "in_progress",
"output": [],
"conversation_id": current_conversation_id,
},
)
returned_messages.append(response_created)
# Send response.output_item.added
output_item_added = OpenAIRealtimeStreamResponseOutputItemAdded(
type="response.output_item.added",
response_id=current_response_id,
output_index=0,
item={
"id": current_output_item_id,
"object": "realtime.item",
"type": "message",
"status": "in_progress",
"role": "assistant",
"content": [],
},
)
returned_messages.append(output_item_added)
# Send response.content_part.added
content_part_added = OpenAIRealtimeResponseContentPartAdded(
type="response.content_part.added",
content_index=0,
output_index=0,
event_id=f"event_{uuid.uuid4()}",
item_id=current_output_item_id,
part=(
{"type": "text", "text": ""}
if current_delta_type == "text"
else {"type": "audio", "transcript": ""}
),
response_id=current_response_id,
)
returned_messages.append(content_part_added)
return (
returned_messages,
current_response_id,
current_output_item_id,
current_conversation_id,
current_delta_type,
)
def transform_text_output_event(
self,
event: dict,
current_output_item_id: Optional[str],
current_response_id: Optional[str],
current_delta_chunks: Optional[List[OpenAIRealtimeResponseDelta]],
) -> tuple[List[OpenAIRealtimeEvents], Optional[List[OpenAIRealtimeResponseDelta]]]:
"""
Transform Bedrock textOutput event to OpenAI response.text.delta.
Args:
event: Bedrock textOutput event
current_output_item_id: Current output item ID
current_response_id: Current response ID
current_delta_chunks: Current delta chunks
Returns:
Tuple of (events, updated_delta_chunks)
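
        Example (illustrative):
            >>> cfg = BedrockRealtimeConfig()
            >>> events, chunks = cfg.transform_text_output_event(
            ...     {"textOutput": {"content": "Hi"}}, "item_1", "resp_1", None)
            >>> events[0]["type"], events[0]["delta"], len(chunks)
            ('response.text.delta', 'Hi', 1)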
"""
verbose_logger.debug("Handling textOutput")
text_content = event["textOutput"].get("content", "")
if not current_output_item_id or not current_response_id:
return [], current_delta_chunks
text_delta = OpenAIRealtimeResponseDelta(
type="response.text.delta",
content_index=0,
event_id=f"event_{uuid.uuid4()}",
item_id=current_output_item_id,
output_index=0,
response_id=current_response_id,
delta=text_content,
)
# Track delta chunks
if current_delta_chunks is None:
current_delta_chunks = []
current_delta_chunks.append(text_delta)
return [text_delta], current_delta_chunks
def transform_audio_output_event(
self,
event: dict,
current_output_item_id: Optional[str],
current_response_id: Optional[str],
) -> List[OpenAIRealtimeEvents]:
"""
Transform Bedrock audioOutput event to OpenAI response.audio.delta.
Args:
event: Bedrock audioOutput event
current_output_item_id: Current output item ID
current_response_id: Current response ID
Returns:
List of OpenAI events
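
        Example (illustrative; ``content`` is base64 audio from Bedrock):
            >>> events = BedrockRealtimeConfig().transform_audio_output_event(
            ...     {"audioOutput": {"content": "AAAA"}}, "item_1", "resp_1")
            >>> events[0]["type"]
            'response.audio.delta'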
"""
verbose_logger.debug("Handling audioOutput")
audio_content = event["audioOutput"].get("content", "")
if not current_output_item_id or not current_response_id:
return []
audio_delta = OpenAIRealtimeResponseDelta(
type="response.audio.delta",
content_index=0,
event_id=f"event_{uuid.uuid4()}",
item_id=current_output_item_id,
output_index=0,
response_id=current_response_id,
delta=audio_content,
)
return [audio_delta]
def transform_content_end_event(
self,
event: dict,
current_output_item_id: Optional[str],
current_response_id: Optional[str],
current_delta_type: Optional[str],
current_delta_chunks: Optional[List[OpenAIRealtimeResponseDelta]],
) -> tuple[List[OpenAIRealtimeEvents], Optional[List[OpenAIRealtimeResponseDelta]]]:
"""
Transform Bedrock contentEnd event to OpenAI response done events.
Args:
event: Bedrock contentEnd event
current_output_item_id: Current output item ID
current_response_id: Current response ID
current_delta_type: Current delta type (text or audio)
current_delta_chunks: Current delta chunks
Returns:
Tuple of (events, reset_delta_chunks)
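
        Example (illustrative; continues from an accumulated text delta):
            >>> cfg = BedrockRealtimeConfig()
            >>> _, chunks = cfg.transform_text_output_event(
            ...     {"textOutput": {"content": "Hi"}}, "item_1", "resp_1", None)
            >>> events, reset = cfg.transform_content_end_event(
            ...     {"contentEnd": {}}, "item_1", "resp_1", "text", chunks)
            >>> [e["type"] for e in events], reset
            (['response.text.done', 'response.content_part.done', 'response.output_item.done'], None)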
"""
content_end = event["contentEnd"]
verbose_logger.debug(f"Handling contentEnd: {content_end}")
if not current_output_item_id or not current_response_id:
return [], current_delta_chunks
returned_messages: List[OpenAIRealtimeEvents] = []
# Send appropriate done event based on type
if current_delta_type == "text":
# Accumulate text
accumulated_text = ""
if current_delta_chunks:
accumulated_text = "".join(
[chunk.get("delta", "") for chunk in current_delta_chunks]
)
text_done = OpenAIRealtimeResponseTextDone(
type="response.text.done",
content_index=0,
event_id=f"event_{uuid.uuid4()}",
item_id=current_output_item_id,
output_index=0,
response_id=current_response_id,
text=accumulated_text,
)
returned_messages.append(text_done)
# Send content_part.done
content_part_done = OpenAIRealtimeContentPartDone(
type="response.content_part.done",
content_index=0,
event_id=f"event_{uuid.uuid4()}",
item_id=current_output_item_id,
output_index=0,
part={"type": "text", "text": accumulated_text},
response_id=current_response_id,
)
returned_messages.append(content_part_done)
elif current_delta_type == "audio":
audio_done = OpenAIRealtimeResponseAudioDone(
type="response.audio.done",
content_index=0,
event_id=f"event_{uuid.uuid4()}",
item_id=current_output_item_id,
output_index=0,
response_id=current_response_id,
)
returned_messages.append(audio_done)
# Send content_part.done
content_part_done = OpenAIRealtimeContentPartDone(
type="response.content_part.done",
content_index=0,
event_id=f"event_{uuid.uuid4()}",
item_id=current_output_item_id,
output_index=0,
part={"type": "audio", "transcript": ""},
response_id=current_response_id,
)
returned_messages.append(content_part_done)
# Send output_item.done
output_item_done = OpenAIRealtimeOutputItemDone(
type="response.output_item.done",
event_id=f"event_{uuid.uuid4()}",
output_index=0,
response_id=current_response_id,
item={
"id": current_output_item_id,
"object": "realtime.item",
"type": "message",
"status": "completed",
"role": "assistant",
"content": [],
},
)
returned_messages.append(output_item_done)
# Reset delta chunks
return returned_messages, None
def transform_prompt_end_event(
self,
event: dict,
current_response_id: Optional[str],
current_conversation_id: Optional[str],
) -> tuple[
List[OpenAIRealtimeEvents],
Optional[str],
Optional[str],
Optional[ALL_DELTA_TYPES],
]:
"""
Transform Bedrock promptEnd event to OpenAI response.done.
Args:
event: Bedrock promptEnd event
current_response_id: Current response ID
current_conversation_id: Current conversation ID
Returns:
Tuple of (events, reset_output_item_id, reset_response_id, reset_delta_type)
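
        Example (illustrative; the returned Nones reset state for the next response):
            >>> events, item_id, resp_id, delta_type = BedrockRealtimeConfig().transform_prompt_end_event(
            ...     {"promptEnd": {}}, "resp_1", "conv_1")
            >>> events[0]["type"], (item_id, resp_id, delta_type)
            ('response.done', (None, None, None))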
"""
verbose_logger.debug("Handling promptEnd")
if not current_response_id or not current_conversation_id:
return [], None, None, None
usage_obj = get_empty_usage()
response_done = OpenAIRealtimeDoneEvent(
type="response.done",
event_id=f"event_{uuid.uuid4()}",
response=OpenAIRealtimeResponseDoneObject(
object="realtime.response",
id=current_response_id,
status="completed",
output=[],
conversation_id=current_conversation_id,
usage={
"prompt_tokens": usage_obj.prompt_tokens,
"completion_tokens": usage_obj.completion_tokens,
"total_tokens": usage_obj.total_tokens,
},
),
)
# Reset state for next response
return [response_done], None, None, None
def transform_tool_use_event(
self,
event: dict,
current_output_item_id: Optional[str],
current_response_id: Optional[str],
) -> tuple[List[OpenAIRealtimeEvents], str, str]:
"""
Transform Bedrock toolUse event to OpenAI format.
Args:
event: Bedrock toolUse event
current_output_item_id: Current output item ID
current_response_id: Current response ID
Returns:
Tuple of (events, tool_call_id, tool_name) for tracking
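
        Example (illustrative):
            >>> events, call_id, name = BedrockRealtimeConfig().transform_tool_use_event(
            ...     {"toolUse": {"toolUseId": "t1", "toolName": "get_weather",
            ...                  "input": '{"city": "Boston"}'}}, "item_1", "resp_1")
            >>> events[0]["type"], call_id, name
            ('response.function_call_arguments.done', 't1', 'get_weather')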
"""
verbose_logger.debug("Handling toolUse")
tool_use = event["toolUse"]
if not current_output_item_id or not current_response_id:
return [], "", ""
# Parse the tool input
tool_input = {}
if "input" in tool_use:
try:
tool_input = (
json.loads(tool_use["input"])
if isinstance(tool_use["input"], str)
else tool_use["input"]
)
except json.JSONDecodeError:
tool_input = {}
tool_call_id = tool_use.get("toolUseId", "")
tool_name = tool_use.get("toolName", "")
        # Create a function call arguments done event
        # This is a custom event format that matches what clients expect
function_call_event: dict[str, Any] = {
"type": "response.function_call_arguments.done",
"event_id": f"event_{uuid.uuid4()}",
"response_id": current_response_id,
"item_id": current_output_item_id,
"output_index": 0,
"call_id": tool_call_id,
"name": tool_name,
"arguments": json.dumps(tool_input),
}
return (
[cast(OpenAIRealtimeEvents, function_call_event)],
tool_call_id,
tool_name,
)
def transform_conversation_item_create_tool_result_event(
self, json_message: dict
) -> List[str]:
"""
Transform conversation.item.create with tool result to Bedrock format.
Args:
json_message: OpenAI conversation.item.create message with tool result
Returns:
List of Bedrock format messages (JSON strings)
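
        Example (illustrative):
            >>> msgs = BedrockRealtimeConfig().transform_conversation_item_create_tool_result_event({
            ...     "item": {"type": "function_call_output", "call_id": "t1", "output": "72F"}})
            >>> [list(json.loads(m)["event"])[0] for m in msgs]
            ['contentStart', 'toolResult', 'contentEnd']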
"""
verbose_logger.debug("Handling conversation.item.create for tool result")
messages: List[str] = []
item = json_message.get("item", {})
if item.get("type") == "function_call_output":
tool_content_name = str(uuid_lib.uuid4())
call_id = item.get("call_id", "")
output = item.get("output", "")
# Content start for tool result
tool_content_start = {
"event": {
"contentStart": {
"promptName": self.prompt_name,
"contentName": tool_content_name,
"interactive": False,
"type": "TOOL",
"role": "TOOL",
"toolResultInputConfiguration": {
"toolUseId": call_id,
"type": "TEXT",
"textInputConfiguration": {"mediaType": "text/plain"},
},
}
}
}
messages.append(json.dumps(tool_content_start))
# Tool result
tool_result = {
"event": {
"toolResult": {
"promptName": self.prompt_name,
"contentName": tool_content_name,
"content": output
if isinstance(output, str)
else json.dumps(output),
}
}
}
messages.append(json.dumps(tool_result))
# Content end
tool_content_end = {
"event": {
"contentEnd": {
"promptName": self.prompt_name,
"contentName": tool_content_name,
}
}
}
messages.append(json.dumps(tool_content_end))
return messages
def transform_realtime_response(
self,
message: Union[str, bytes],
model: str,
logging_obj: LiteLLMLoggingObj,
realtime_response_transform_input: RealtimeResponseTransformInput,
) -> RealtimeResponseTypedDict:
"""
Transform Bedrock Nova Sonic response to OpenAI realtime format.
Args:
message: Bedrock format message (JSON string)
model: Model ID
logging_obj: Logging object
realtime_response_transform_input: Current state
Returns:
Transformed response with updated state
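
        Example (sketch; ``state`` mirrors RealtimeResponseTransformInput and
        ``_Log`` is a hypothetical stand-in for the logging object):
            >>> class _Log:
            ...     litellm_trace_id = "trace-123"
            >>> state = {"current_output_item_id": None, "current_response_id": None,
            ...          "current_delta_chunks": None, "current_conversation_id": None,
            ...          "current_item_chunks": None, "current_delta_type": None,
            ...          "session_configuration_request": None}
            >>> out = BedrockRealtimeConfig().transform_realtime_response(
            ...     '{"event": {"sessionStart": {}}}', "amazon.nova-sonic-v1:0", _Log(), state)
            >>> out["response"][0]["type"]
            'session.created'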
"""
try:
json_message = json.loads(message)
except json.JSONDecodeError:
message_preview = (
message[:200].decode("utf-8", errors="replace")
if isinstance(message, bytes)
else message[:200]
)
verbose_logger.warning(f"Invalid JSON message: {message_preview}")
return {
"response": [],
"current_output_item_id": realtime_response_transform_input.get(
"current_output_item_id"
),
"current_response_id": realtime_response_transform_input.get(
"current_response_id"
),
"current_delta_chunks": realtime_response_transform_input.get(
"current_delta_chunks"
),
"current_conversation_id": realtime_response_transform_input.get(
"current_conversation_id"
),
"current_item_chunks": realtime_response_transform_input.get(
"current_item_chunks"
),
"current_delta_type": realtime_response_transform_input.get(
"current_delta_type"
),
"session_configuration_request": realtime_response_transform_input.get(
"session_configuration_request"
),
}
# Extract state
current_output_item_id = realtime_response_transform_input.get(
"current_output_item_id"
)
current_response_id = realtime_response_transform_input.get(
"current_response_id"
)
current_conversation_id = realtime_response_transform_input.get(
"current_conversation_id"
)
current_delta_chunks = realtime_response_transform_input.get(
"current_delta_chunks"
)
current_delta_type = realtime_response_transform_input.get("current_delta_type")
session_configuration_request = realtime_response_transform_input.get(
"session_configuration_request"
)
returned_messages: List[OpenAIRealtimeEvents] = []
# Parse Bedrock event
event = json_message.get("event", {})
# Route to appropriate transformation method
if "sessionStart" in event:
session_created = self.transform_session_start_event(
event, model, logging_obj
)
returned_messages.append(session_created)
session_configuration_request = json.dumps({"configured": True})
elif "contentStart" in event:
(
events,
current_response_id,
current_output_item_id,
current_conversation_id,
current_delta_type,
) = self.transform_content_start_event(
event,
current_response_id,
current_output_item_id,
current_conversation_id,
)
returned_messages.extend(events)
elif "textOutput" in event:
events, current_delta_chunks = self.transform_text_output_event(
event,
current_output_item_id,
current_response_id,
current_delta_chunks,
)
returned_messages.extend(events)
elif "audioOutput" in event:
events = self.transform_audio_output_event(
event, current_output_item_id, current_response_id
)
returned_messages.extend(events)
elif "contentEnd" in event:
events, current_delta_chunks = self.transform_content_end_event(
event,
current_output_item_id,
current_response_id,
current_delta_type,
current_delta_chunks,
)
returned_messages.extend(events)
elif "toolUse" in event:
events, tool_call_id, tool_name = self.transform_tool_use_event(
event, current_output_item_id, current_response_id
)
returned_messages.extend(events)
            # The function_call event above already carries the tool info to the
            # client; log the identifiers for debugging
            verbose_logger.debug(f"Tool use event: {tool_name} (ID: {tool_call_id})")
elif "promptEnd" in event:
(
events,
current_output_item_id,
current_response_id,
current_delta_type,
) = self.transform_prompt_end_event(
event, current_response_id, current_conversation_id
)
returned_messages.extend(events)
return {
"response": returned_messages,
"current_output_item_id": current_output_item_id,
"current_response_id": current_response_id,
"current_delta_chunks": current_delta_chunks,
"current_conversation_id": current_conversation_id,
"current_item_chunks": realtime_response_transform_input.get(
"current_item_chunks"
),
"current_delta_type": current_delta_type,
"session_configuration_request": session_configuration_request,
}