chore: initial public snapshot for github upload
This commit is contained in:
@@ -0,0 +1,178 @@
|
||||
# OpenAI Text-to-Speech Guardrail Translation Handler
|
||||
|
||||
Handler for processing OpenAI's text-to-speech endpoint (`/v1/audio/speech`) with guardrails.
|
||||
|
||||
## Overview
|
||||
|
||||
This handler processes text-to-speech requests by:
|
||||
1. Extracting the input text from the request
|
||||
2. Applying guardrails to the input text
|
||||
3. Updating the request with the guardrailed text
|
||||
4. Returning the output unchanged (audio is binary, not text)
|
||||
|
||||
## Data Format
|
||||
|
||||
### Input Format
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "tts-1",
|
||||
"input": "The quick brown fox jumped over the lazy dog.",
|
||||
"voice": "alloy",
|
||||
"response_format": "mp3",
|
||||
"speed": 1.0
|
||||
}
|
||||
```
|
||||
|
||||
### Output Format
|
||||
|
||||
The output is binary audio data (MP3, WAV, etc.), not text, so it cannot be guardrailed.
|
||||
|
||||
## Usage
|
||||
|
||||
The handler is automatically discovered and applied when guardrails are used with the text-to-speech endpoint.
|
||||
|
||||
### Example: Using Guardrails with Text-to-Speech
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://localhost:4000/v1/audio/speech' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer your-api-key' \
|
||||
-d '{
|
||||
"model": "tts-1",
|
||||
"input": "The quick brown fox jumped over the lazy dog.",
|
||||
"voice": "alloy",
|
||||
"guardrails": ["content_moderation"]
|
||||
}' \
|
||||
--output speech.mp3
|
||||
```
|
||||
|
||||
The guardrail will be applied to the input text before the text-to-speech conversion.
|
||||
|
||||
### Example: PII Masking in TTS Input
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://localhost:4000/v1/audio/speech' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer your-api-key' \
|
||||
-d '{
|
||||
"model": "tts-1",
|
||||
"input": "Please call John Doe at john@example.com",
|
||||
"voice": "nova",
|
||||
"guardrails": ["mask_pii"]
|
||||
}' \
|
||||
--output speech.mp3
|
||||
```
|
||||
|
||||
The audio will say something like: "Please call [NAME_REDACTED] at [EMAIL_REDACTED]" (the exact placeholders depend on how the guardrail is configured).
|
||||
|
||||
### Example: Content Filtering Before TTS
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://localhost:4000/v1/audio/speech' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer your-api-key' \
|
||||
-d '{
|
||||
"model": "tts-1-hd",
|
||||
"input": "This is the text that will be spoken",
|
||||
"voice": "shimmer",
|
||||
"guardrails": ["content_filter"]
|
||||
}' \
|
||||
--output speech.mp3
|
||||
```
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### Input Processing
|
||||
|
||||
- **Field**: `input` (string)
|
||||
- **Processing**: Applies guardrail to input text
|
||||
- **Result**: Updated input text in request
|
||||
|
||||
### Output Processing
|
||||
|
||||
- **Processing**: Not applicable (audio is binary data)
|
||||
- **Result**: Response returned unchanged
|
||||
|
||||
## Use Cases
|
||||
|
||||
1. **PII Protection**: Remove personally identifiable information before converting to speech
|
||||
2. **Content Filtering**: Remove inappropriate content before TTS conversion
|
||||
3. **Compliance**: Ensure text meets requirements before voice synthesis
|
||||
4. **Text Sanitization**: Clean up text before audio generation
|
||||
|
||||
## Extension
|
||||
|
||||
Override these methods to customize behavior:
|
||||
|
||||
- `process_input_messages()`: Customize how input text is processed
|
||||
- `process_output_response()`: Currently a no-op, but can be overridden if needed
|
||||
|
||||
## Supported Call Types
|
||||
|
||||
- `CallTypes.speech` - Synchronous text-to-speech
|
||||
- `CallTypes.aspeech` - Asynchronous text-to-speech
|
||||
|
||||
## Notes
|
||||
|
||||
- Only the input text is processed by guardrails; a request whose `input` is missing or is not a string is passed through unchanged
|
||||
- Output processing is a no-op since audio cannot be text-guardrailed
|
||||
- Both sync and async call types use the same handler
|
||||
- Works with all TTS models (tts-1, tts-1-hd, etc.)
|
||||
- Works with all voice options
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Remove PII Before TTS
|
||||
|
||||
```python
|
||||
import litellm
|
||||
from pathlib import Path
|
||||
|
||||
speech_file_path = Path(__file__).parent / "speech.mp3"
|
||||
response = litellm.speech(
|
||||
model="tts-1",
|
||||
voice="alloy",
|
||||
input="Hi, this is John Doe calling from john@company.com",
|
||||
guardrails=["mask_pii"],
|
||||
)
|
||||
response.stream_to_file(speech_file_path)
|
||||
# Audio will have PII masked
|
||||
```
|
||||
|
||||
### Content Moderation Before TTS
|
||||
|
||||
```python
|
||||
import litellm
|
||||
from pathlib import Path
|
||||
|
||||
speech_file_path = Path(__file__).parent / "speech.mp3"
|
||||
response = litellm.speech(
|
||||
model="tts-1-hd",
|
||||
voice="nova",
|
||||
input="Your text here",
|
||||
guardrails=["content_moderation"],
|
||||
)
|
||||
response.stream_to_file(speech_file_path)
|
||||
```
|
||||
|
||||
### Async TTS with Guardrails
|
||||
|
||||
```python
|
||||
import litellm
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
async def generate_speech():
|
||||
speech_file_path = Path(__file__).parent / "speech.mp3"
|
||||
response = await litellm.aspeech(
|
||||
model="tts-1",
|
||||
voice="echo",
|
||||
input="Text to convert to speech",
|
||||
guardrails=["mask_pii"],
|
||||
)
|
||||
response.stream_to_file(speech_file_path)
|
||||
|
||||
asyncio.run(generate_speech())
|
||||
```
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
"""OpenAI Text-to-Speech handler for Unified Guardrails."""
|
||||
|
||||
from litellm.llms.openai.speech.guardrail_translation.handler import (
|
||||
OpenAITextToSpeechHandler,
|
||||
)
|
||||
from litellm.types.utils import CallTypes
|
||||
|
||||
# The synchronous and asynchronous speech call types are served by the same
# handler class, so the mapping is built from a single tuple of call types.
guardrail_translation_mappings = {
    call_type: OpenAITextToSpeechHandler
    for call_type in (CallTypes.speech, CallTypes.aspeech)
}

# Public names exported by this package.
__all__ = ["guardrail_translation_mappings", "OpenAITextToSpeechHandler"]
|
||||
@@ -0,0 +1,108 @@
|
||||
"""
|
||||
OpenAI Text-to-Speech Handler for Unified Guardrails
|
||||
|
||||
This module provides guardrail translation support for OpenAI's text-to-speech endpoint.
|
||||
The handler processes the 'input' text parameter (output is audio, so no text to guardrail).
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
from litellm._logging import verbose_proxy_logger
|
||||
from litellm.llms.base_llm.guardrail_translation.base_translation import BaseTranslation
|
||||
from litellm.types.utils import GenericGuardrailAPIInputs
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.integrations.custom_guardrail import CustomGuardrail
|
||||
from litellm.types.llms.openai import HttpxBinaryResponseContent
|
||||
|
||||
|
||||
class OpenAITextToSpeechHandler(BaseTranslation):
    """
    Guardrail translation handler for OpenAI text-to-speech requests.

    The request's ``input`` text is run through the guardrail before the
    call. The response is audio (binary), so the output hook hands it back
    untouched.
    """

    async def process_input_messages(
        self,
        data: dict,
        guardrail_to_apply: "CustomGuardrail",
        litellm_logging_obj: Optional[Any] = None,
    ) -> Any:
        """
        Run the guardrail over the request's ``input`` text.

        Args:
            data: Request data dictionary; the text is read from ``data["input"]``
                and the optional ``data["model"]`` is forwarded to the guardrail.
            guardrail_to_apply: Guardrail whose ``apply_guardrail`` is awaited.
            litellm_logging_obj: Optional logging object, passed to the
                guardrail as ``logging_obj``.

        Returns:
            The same ``data`` dictionary. When ``input`` is a string it is
            overwritten with the first text returned by the guardrail (or
            with the original text if the guardrail returned none). When
            ``input`` is missing or not a string, ``data`` is left as-is.
        """
        original_text = data.get("input")

        # Guard: nothing to check when the request carries no input.
        if original_text is None:
            verbose_proxy_logger.debug(
                "OpenAI Text-to-Speech: No input text found in request data"
            )
            return data

        # Guard: only plain strings are sent to the guardrail.
        if not isinstance(original_text, str):
            verbose_proxy_logger.debug(
                "OpenAI Text-to-Speech: Unexpected input type: %s. Expected string.",
                type(original_text),
            )
            return data

        guardrail_payload = GenericGuardrailAPIInputs(texts=[original_text])

        # Forward the requested model to the guardrail when one is set.
        requested_model = data.get("model")
        if requested_model:
            guardrail_payload["model"] = requested_model

        guardrail_result = await guardrail_to_apply.apply_guardrail(
            inputs=guardrail_payload,
            request_data=data,
            input_type="request",
            logging_obj=litellm_logging_obj,
        )

        # Fall back to the original text if the guardrail returned no texts.
        returned_texts = guardrail_result.get("texts", [])
        if returned_texts:
            sanitized_text = returned_texts[0]
        else:
            sanitized_text = original_text
        data["input"] = sanitized_text

        verbose_proxy_logger.debug(
            "OpenAI Text-to-Speech: Applied guardrail to input text. "
            "Original length: %d, New length: %d",
            len(original_text),
            len(sanitized_text),
        )
        return data

    async def process_output_response(
        self,
        response: "HttpxBinaryResponseContent",
        guardrail_to_apply: "CustomGuardrail",
        litellm_logging_obj: Optional[Any] = None,
        user_api_key_dict: Optional[Any] = None,
    ) -> Any:
        """
        Return the audio response as-is.

        The response is binary audio rather than text, so no guardrail is
        applied here; the method only emits a debug log line.

        Args:
            response: Binary audio response.
            guardrail_to_apply: The guardrail instance (unused).
            litellm_logging_obj: Optional logging object (unused).
            user_api_key_dict: User API key metadata (unused).

        Returns:
            The ``response`` object that was passed in, unmodified.
        """
        verbose_proxy_logger.debug(
            "OpenAI Text-to-Speech: Output processing not applicable "
            "(output is audio data, not text)"
        )
        return response
|
||||
Reference in New Issue
Block a user