chore: initial public snapshot for github upload
This commit is contained in:
@@ -0,0 +1,12 @@
|
||||
"""Azure AI OCR module."""
|
||||
from .common_utils import get_azure_ai_ocr_config
|
||||
from .document_intelligence.transformation import (
|
||||
AzureDocumentIntelligenceOCRConfig,
|
||||
)
|
||||
from .transformation import AzureAIOCRConfig
|
||||
|
||||
__all__ = [
|
||||
"AzureAIOCRConfig",
|
||||
"AzureDocumentIntelligenceOCRConfig",
|
||||
"get_azure_ai_ocr_config",
|
||||
]
|
||||
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Common utilities for Azure AI OCR providers.
|
||||
|
||||
This module provides routing logic to determine which OCR configuration to use
|
||||
based on the model name.
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from litellm._logging import verbose_logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.llms.base_llm.ocr.transformation import BaseOCRConfig
|
||||
|
||||
|
||||
def get_azure_ai_ocr_config(model: str) -> Optional["BaseOCRConfig"]:
    """
    Pick the Azure AI OCR configuration that matches a model name.

    Routing rule (substring match on ``model``):
        - contains "doc-intelligence" or "documentintelligence"
          -> ``AzureDocumentIntelligenceOCRConfig``
        - anything else -> ``AzureAIOCRConfig`` (the Mistral-style config)

    Args:
        model: The model name, e.g. "azure_ai/doc-intelligence/prebuilt-read"
            or "azure_ai/pixtral-12b-2409".

    Returns:
        A freshly constructed OCR configuration instance for the model.
    """
    # Imported inside the function, as in the rest of this module, so the
    # config classes are only loaded when routing is actually requested.
    from litellm.llms.azure_ai.ocr.document_intelligence.transformation import (
        AzureDocumentIntelligenceOCRConfig,
    )
    from litellm.llms.azure_ai.ocr.transformation import AzureAIOCRConfig

    document_intelligence_markers = ("doc-intelligence", "documentintelligence")
    is_document_intelligence = any(
        marker in model for marker in document_intelligence_markers
    )

    if not is_document_intelligence:
        # Everything else under azure_ai is treated as Mistral-based OCR.
        verbose_logger.debug(f"Routing {model} to Azure AI (Mistral) OCR config")
        return AzureAIOCRConfig()

    verbose_logger.debug(
        f"Routing {model} to Azure Document Intelligence OCR config"
    )
    return AzureDocumentIntelligenceOCRConfig()
|
||||
@@ -0,0 +1,4 @@
|
||||
"""Azure Document Intelligence OCR module."""
|
||||
from .transformation import AzureDocumentIntelligenceOCRConfig
|
||||
|
||||
__all__ = ["AzureDocumentIntelligenceOCRConfig"]
|
||||
@@ -0,0 +1,698 @@
|
||||
"""
|
||||
Azure Document Intelligence OCR transformation implementation.
|
||||
|
||||
Azure Document Intelligence (formerly Form Recognizer) provides advanced document analysis capabilities.
|
||||
This implementation transforms between Mistral OCR format and Azure Document Intelligence API v4.0.
|
||||
|
||||
Note: Azure Document Intelligence API is async - POST returns 202 Accepted with Operation-Location header.
|
||||
The operation location must be polled until the analysis completes.
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.constants import (
|
||||
AZURE_DOCUMENT_INTELLIGENCE_API_VERSION,
|
||||
AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI,
|
||||
AZURE_OPERATION_POLLING_TIMEOUT,
|
||||
)
|
||||
from litellm.llms.base_llm.ocr.transformation import (
|
||||
BaseOCRConfig,
|
||||
DocumentType,
|
||||
OCRPage,
|
||||
OCRPageDimensions,
|
||||
OCRRequestData,
|
||||
OCRResponse,
|
||||
OCRUsageInfo,
|
||||
)
|
||||
from litellm.secret_managers.main import get_secret_str
|
||||
|
||||
|
||||
class AzureDocumentIntelligenceOCRConfig(BaseOCRConfig):
|
||||
"""
|
||||
Azure Document Intelligence OCR transformation configuration.
|
||||
|
||||
Supports Azure Document Intelligence v4.0 (2024-11-30) API.
|
||||
Model route: azure_ai/doc-intelligence/<model>
|
||||
|
||||
Supported models:
|
||||
- prebuilt-layout: Extracts text with markdown, tables, and structure (closest to Mistral OCR)
|
||||
- prebuilt-read: Basic text extraction optimized for reading
|
||||
- prebuilt-document: General document analysis
|
||||
|
||||
Reference: https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
    """Initialise the config; no state beyond what the base class sets up."""
    super().__init__()
|
||||
|
||||
def get_supported_ocr_params(self, model: str) -> list:
    """
    Return the optional OCR parameters accepted for Azure Document Intelligence.

    The list is always empty: no optional parameter is declared as supported
    here, whatever ``model`` is.

    Args:
        model: Model name (not consulted).

    Returns:
        An empty list.
    """
    supported_params: list = []
    return supported_params
|
||||
|
||||
def validate_environment(
    self,
    headers: Dict,
    model: str,
    api_key: Optional[str] = None,
    api_base: Optional[str] = None,
    litellm_params: Optional[dict] = None,
    **kwargs,
) -> Dict:
    """
    Check credentials/endpoint and build the request headers.

    The key is sent in the ``Ocp-Apim-Subscription-Key`` header. Missing
    values are looked up via ``get_secret_str`` before giving up.

    Args:
        headers: Caller-supplied headers; these win over the defaults built here.
        model: Model name (not consulted).
        api_key: Subscription key; falls back to AZURE_DOCUMENT_INTELLIGENCE_API_KEY.
        api_base: Endpoint; falls back to AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT.
            Only its presence is validated here.
        litellm_params: Unused.

    Returns:
        Header dict containing the auth key, JSON content type and ``headers``.

    Raises:
        ValueError: If the key or the endpoint cannot be resolved.
    """
    resolved_key = (
        api_key
        if api_key is not None
        else get_secret_str("AZURE_DOCUMENT_INTELLIGENCE_API_KEY")
    )
    if resolved_key is None:
        raise ValueError(
            "Missing Azure Document Intelligence API Key - Set AZURE_DOCUMENT_INTELLIGENCE_API_KEY environment variable or pass api_key parameter"
        )

    resolved_endpoint = (
        api_base
        if api_base is not None
        else get_secret_str("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
    )
    if resolved_endpoint is None:
        raise ValueError(
            "Missing Azure Document Intelligence Endpoint - Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT environment variable or pass api_base parameter"
        )

    # Defaults first, then caller headers so explicit values take precedence.
    merged_headers = {
        "Ocp-Apim-Subscription-Key": resolved_key,
        "Content-Type": "application/json",
    }
    merged_headers.update(headers)
    return merged_headers
|
||||
|
||||
def get_complete_url(
    self,
    api_base: Optional[str],
    model: str,
    optional_params: dict,
    litellm_params: Optional[dict] = None,
    **kwargs,
) -> str:
    """
    Build the Azure Document Intelligence analyze URL.

    Shape: {endpoint}/documentintelligence/documentModels/{modelId}:analyze?api-version=<version>
    (the /documentintelligence/ path is used, not /formrecognizer/).

    Args:
        api_base: Endpoint, e.g. https://your-resource.cognitiveservices.azure.com.
            Falls back to AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT when None.
        model: Either a bare model ID ("prebuilt-layout") or a routed name
            ("azure_ai/doc-intelligence/prebuilt-layout"); only the segment
            after the last "/" is used.
        optional_params: Unused.
        litellm_params: Unused.

    Returns:
        The complete analyze URL.

    Raises:
        ValueError: If no endpoint can be resolved.
    """
    endpoint = (
        api_base
        if api_base is not None
        else get_secret_str("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
    )
    if endpoint is None:
        raise ValueError(
            "Missing Azure Document Intelligence Endpoint - Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT environment variable or pass api_base parameter"
        )

    # Strip trailing slashes so the path join below does not produce "//".
    endpoint = endpoint.rstrip("/")

    # Keep only the final path segment of the model name as the model ID.
    model_id = model.rsplit("/", 1)[-1]

    return f"{endpoint}/documentintelligence/documentModels/{model_id}:analyze?api-version={AZURE_DOCUMENT_INTELLIGENCE_API_VERSION}"
|
||||
|
||||
def _extract_base64_from_data_uri(self, data_uri: str) -> str:
|
||||
"""
|
||||
Extract base64 content from a data URI.
|
||||
|
||||
Args:
|
||||
data_uri: Data URI like "data:application/pdf;base64,..."
|
||||
|
||||
Returns:
|
||||
Base64 string without the data URI prefix
|
||||
"""
|
||||
# Match pattern: data:[<mediatype>][;base64],<data>
|
||||
match = re.match(r"data:([^;]+)(?:;base64)?,(.+)", data_uri)
|
||||
if match:
|
||||
return match.group(2)
|
||||
return data_uri
|
||||
|
||||
def transform_ocr_request(
    self,
    model: str,
    document: DocumentType,
    optional_params: dict,
    headers: dict,
    **kwargs,
) -> OCRRequestData:
    """
    Convert a Mistral-style OCR request into an Azure Document Intelligence body.

    Input (Mistral format):
        {"type": "document_url", "document_url": "https://example.com/doc.pdf"}
        or
        {"type": "image_url", "image_url": "https://example.com/img.png"}

    Output (Azure DI format):
        {"urlSource": "https://example.com/doc.pdf"}
        or, when the URL is a data URI,
        {"base64Source": "base64_encoded_content"}

    Args:
        model: Model name (only logged).
        document: Document dict from the user (Mistral format).
        optional_params: Already mapped optional parameters (not forwarded).
        headers: Request headers (unused here).

    Returns:
        OCRRequestData carrying the JSON body and no files.

    Raises:
        ValueError: If ``document`` is not a dict, its type is unsupported,
            or the URL is missing/empty.
    """
    verbose_logger.debug(
        f"Azure Document Intelligence transform_ocr_request - model: {model}"
    )

    if not isinstance(document, dict):
        raise ValueError(f"Expected document dict, got {type(document)}")

    # In the Mistral format the "type" value doubles as the key holding the URL.
    source_kind = document.get("type")
    if source_kind == "document_url":
        source = document.get("document_url", "")
    elif source_kind == "image_url":
        source = document.get("image_url", "")
    else:
        raise ValueError(
            f"Invalid document type: {source_kind}. Must be 'document_url' or 'image_url'"
        )

    if not source:
        raise ValueError("Document URL is required")

    request_body: Dict[str, Any] = {}
    if not source.startswith("data:"):
        # Plain URL: hand it to the service as-is.
        request_body["urlSource"] = source
        verbose_logger.debug("Using urlSource for Azure Document Intelligence")
    else:
        # Data URI: send only the payload part.
        request_body["base64Source"] = self._extract_base64_from_data_uri(source)
        verbose_logger.debug("Using base64Source for Azure Document Intelligence")

    # Mistral-specific options (pages, include_image_base64, ...) are not forwarded.
    return OCRRequestData(data=request_body, files=None)
|
||||
|
||||
def _extract_page_markdown(self, page_data: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Extract text from Azure DI page and format as markdown.
|
||||
|
||||
Azure DI provides text in 'lines' array. We concatenate them with newlines.
|
||||
|
||||
Args:
|
||||
page_data: Azure DI page object
|
||||
|
||||
Returns:
|
||||
Markdown-formatted text
|
||||
"""
|
||||
lines = page_data.get("lines", [])
|
||||
if not lines:
|
||||
return ""
|
||||
|
||||
# Extract text content from each line
|
||||
text_lines = [line.get("content", "") for line in lines]
|
||||
|
||||
# Join with newlines to preserve structure
|
||||
return "\n".join(text_lines)
|
||||
|
||||
def _convert_dimensions(
    self, width: float, height: float, unit: str
) -> OCRPageDimensions:
    """
    Express page dimensions in pixels.

    Values given in inches are multiplied by the configured DPI
    (AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI); any other unit is assumed to
    already be pixels and is only truncated to an integer.

    Args:
        width: Width in ``unit``.
        height: Height in ``unit``.
        unit: Unit of measurement (e.g., "inch").

    Returns:
        OCRPageDimensions with integer pixel values and the DPI used.
    """
    dpi = AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI
    # Scale factor: DPI for inches, identity for anything else.
    pixels_per_unit = dpi if unit == "inch" else 1
    return OCRPageDimensions(
        width=int(width * pixels_per_unit),
        height=int(height * pixels_per_unit),
        dpi=dpi,
    )
|
||||
|
||||
@staticmethod
|
||||
def _check_timeout(start_time: float, timeout_secs: int) -> None:
|
||||
"""
|
||||
Check if operation has timed out.
|
||||
|
||||
Args:
|
||||
start_time: Start time of the operation
|
||||
timeout_secs: Timeout duration in seconds
|
||||
|
||||
Raises:
|
||||
TimeoutError: If operation has exceeded timeout
|
||||
"""
|
||||
if time.time() - start_time > timeout_secs:
|
||||
raise TimeoutError(
|
||||
f"Azure Document Intelligence operation polling timed out after {timeout_secs} seconds"
|
||||
)
|
||||
|
||||
@staticmethod
def _get_retry_after(response: httpx.Response) -> int:
    """
    Get the polling delay from the response's ``retry-after`` header.

    The header is expected to hold a whole number of seconds. A missing
    header, a value that is not an integer (the HTTP spec also allows an
    HTTP-date there), or a negative value falls back to the default, so a
    surprising header can never abort the polling loop or reach ``sleep``
    with an invalid duration.

    Args:
        response: HTTP response from the operation endpoint.

    Returns:
        Delay in seconds before the next poll (default: 2).
    """
    default_retry_after = 2

    raw_value = response.headers.get("retry-after")
    if raw_value is None:
        retry_after = default_retry_after
    else:
        try:
            retry_after = int(raw_value)
        except (TypeError, ValueError):
            verbose_logger.debug(
                f"Ignoring non-integer retry-after header: {raw_value!r}"
            )
            retry_after = default_retry_after

    if retry_after < 0:
        # A negative delay is meaningless and would make sleep() raise.
        retry_after = default_retry_after

    verbose_logger.debug(f"Retry polling after: {retry_after} seconds")
    return retry_after
|
||||
|
||||
@staticmethod
def _check_operation_status(response: httpx.Response) -> str:
    """
    Classify the state of an Azure DI analyze operation.

    Only the JSON decoding is guarded by the ``try``; status handling runs
    outside it so that a failed or unknown status is reported with its own
    message instead of being mislabelled as a parse failure.

    Args:
        response: HTTP response from the operation endpoint.

    Returns:
        "succeeded" when the analysis finished, "running" when the service
        reports "running" or "notStarted".

    Raises:
        ValueError: If the body is not a JSON object, the operation failed,
            or the status is not one of the values above.
    """
    try:
        result = response.json()
    except Exception as e:
        raise ValueError(f"Failed to parse Azure DI operation response: {e}") from e

    if not isinstance(result, dict):
        raise ValueError(
            f"Failed to parse Azure DI operation response: expected a JSON object, got {type(result).__name__}"
        )

    status = result.get("status")
    verbose_logger.debug(f"Azure DI operation status: {status}")

    if status == "succeeded":
        return "succeeded"

    if status in ("running", "notStarted"):
        return "running"

    if status == "failed":
        # "error" may be absent or null; fall back to a generic message.
        error = result.get("error") or {}
        if isinstance(error, dict):
            error_msg = error.get("message", "Unknown error")
        else:
            error_msg = str(error)
        raise ValueError(
            f"Azure Document Intelligence analysis failed: {error_msg}"
        )

    raise ValueError(f"Unknown operation status: {status}")
|
||||
|
||||
def _poll_operation_sync(
    self,
    operation_url: str,
    headers: Dict[str, str],
    timeout_secs: int,
) -> httpx.Response:
    """
    Block until the Azure Document Intelligence operation finishes (sync).

    The analyze POST answers 202 with an Operation-Location header; that URL
    is fetched repeatedly until the operation reports success. Failure and
    timeout surface as exceptions from the helper checks.

    Args:
        operation_url: The Operation-Location URL to poll.
        headers: Request headers (including auth).
        timeout_secs: Total time budget in seconds.

    Returns:
        The response whose status is "succeeded".
    """
    from litellm.llms.custom_httpx.http_handler import _get_httpx_client

    http_client = _get_httpx_client()
    started_at = time.time()

    verbose_logger.debug(f"Polling Azure DI operation: {operation_url}")

    while True:
        # Budget check happens before every request.
        self._check_timeout(start_time=started_at, timeout_secs=timeout_secs)

        poll_response = http_client.get(url=operation_url, headers=headers)
        state = self._check_operation_status(response=poll_response)

        if state == "succeeded":
            return poll_response
        if state == "running":
            # Honour the delay hinted by the response before the next poll.
            time.sleep(self._get_retry_after(response=poll_response))
|
||||
|
||||
async def _poll_operation_async(
    self,
    operation_url: str,
    headers: Dict[str, str],
    timeout_secs: int,
) -> httpx.Response:
    """
    Await completion of the Azure Document Intelligence operation (async).

    Same loop as the sync variant, but the HTTP request and the wait between
    polls are awaited.

    Args:
        operation_url: The Operation-Location URL to poll.
        headers: Request headers (including auth).
        timeout_secs: Total time budget in seconds.

    Returns:
        The response whose status is "succeeded".
    """
    import litellm
    from litellm.llms.custom_httpx.http_handler import get_async_httpx_client

    http_client = get_async_httpx_client(llm_provider=litellm.LlmProviders.AZURE_AI)
    started_at = time.time()

    verbose_logger.debug(f"Polling Azure DI operation (async): {operation_url}")

    while True:
        # Budget check happens before every request.
        self._check_timeout(start_time=started_at, timeout_secs=timeout_secs)

        poll_response = await http_client.get(url=operation_url, headers=headers)
        state = self._check_operation_status(response=poll_response)

        if state == "succeeded":
            return poll_response
        if state == "running":
            # Honour the delay hinted by the response before the next poll.
            await asyncio.sleep(self._get_retry_after(response=poll_response))
|
||||
|
||||
def _resolve_polling_target(self, raw_response: httpx.Response) -> tuple:
    """
    Work out where and how to poll after a 202 Accepted response.

    Args:
        raw_response: The 202 response returned by the analyze POST.

    Returns:
        ``(operation_url, poll_headers)`` - the Operation-Location URL and a
        header dict carrying the subscription key of the original request.

    Raises:
        ValueError: If the Operation-Location header is missing.
    """
    operation_url = raw_response.headers.get("Operation-Location")
    if not operation_url:
        raise ValueError(
            "Azure Document Intelligence returned 202 but no Operation-Location header found"
        )

    # Polling must authenticate with the same key as the submit request.
    poll_headers = {
        "Ocp-Apim-Subscription-Key": raw_response.request.headers.get(
            "Ocp-Apim-Subscription-Key", ""
        )
    }
    return operation_url, poll_headers


def _build_ocr_response(
    self, model: str, raw_response: httpx.Response
) -> OCRResponse:
    """
    Convert a completed Azure DI analysis response into an OCRResponse.

    Shared by the sync and async transforms so both produce identical output.

    Args:
        model: Model name echoed into the response.
        raw_response: Response whose JSON body holds the finished analysis.

    Returns:
        OCRResponse in Mistral format.

    Raises:
        ValueError: If the body's status is not "succeeded".
    """
    response_json = raw_response.json()
    status = response_json.get("status")

    verbose_logger.debug(
        f"Azure Document Intelligence response status: {status}"
    )

    if status != "succeeded":
        raise ValueError(
            f"Azure Document Intelligence analysis failed with status: {status}"
        )

    # Tolerate "analyzeResult"/"pages" being absent or null.
    analyze_result = response_json.get("analyzeResult") or {}
    azure_pages = analyze_result.get("pages") or []

    mistral_pages = []
    for position, azure_page in enumerate(azure_pages):
        # Azure page numbers are 1-based; Mistral indexes are 0-based.
        # Fall back to list position when the page carries no number.
        page_number = azure_page.get("pageNumber")
        index = page_number - 1 if page_number is not None else position

        markdown = self._extract_page_markdown(azure_page)
        dimensions = self._convert_dimensions(
            width=azure_page.get("width", 8.5),
            height=azure_page.get("height", 11),
            unit=azure_page.get("unit", "inch"),
        )
        mistral_pages.append(
            OCRPage(index=index, markdown=markdown, dimensions=dimensions)
        )

    usage_info = OCRUsageInfo(
        pages_processed=len(mistral_pages), doc_size_bytes=None
    )

    return OCRResponse(
        pages=mistral_pages,
        model=model,
        usage_info=usage_info,
        object="ocr",
    )


def transform_ocr_response(
    self,
    model: str,
    raw_response: httpx.Response,
    logging_obj: Any,
    **kwargs,
) -> OCRResponse:
    """
    Transform Azure Document Intelligence response to Mistral OCR format.

    Handles async operation polling: if the response is 202 Accepted, polls
    Operation-Location (blocking) until the analysis completes.

    Azure DI response (after polling):
    {
        "status": "succeeded",
        "analyzeResult": {
            "content": "Full document text...",
            "pages": [
                {
                    "pageNumber": 1,
                    "width": 8.5,
                    "height": 11,
                    "unit": "inch",
                    "lines": [{"content": "text", "boundingBox": [...]}]
                }
            ]
        }
    }

    Mistral OCR format:
    {
        "pages": [
            {
                "index": 0,
                "markdown": "extracted text",
                "dimensions": {"width": 816, "height": 1056, "dpi": 96}
            }
        ],
        "model": "azure_ai/doc-intelligence/prebuilt-layout",
        "usage_info": {"pages_processed": 1},
        "object": "ocr"
    }

    Args:
        model: Model name
        raw_response: Raw HTTP response from Azure DI (may be 202 Accepted)
        logging_obj: Logging object (unused here)

    Returns:
        OCRResponse in Mistral format

    Raises:
        Exception: Any error from polling or parsing is logged and re-raised
            unchanged.
    """
    try:
        if raw_response.status_code == 202:
            verbose_logger.debug(
                "Azure DI returned 202 Accepted, polling operation..."
            )
            operation_url, poll_headers = self._resolve_polling_target(
                raw_response
            )
            # The polling budget is the module-level default.
            raw_response = self._poll_operation_sync(
                operation_url=operation_url,
                headers=poll_headers,
                timeout_secs=AZURE_OPERATION_POLLING_TIMEOUT,
            )

        return self._build_ocr_response(model=model, raw_response=raw_response)

    except Exception as e:
        verbose_logger.error(
            f"Error parsing Azure Document Intelligence response: {e}"
        )
        raise


async def async_transform_ocr_response(
    self,
    model: str,
    raw_response: httpx.Response,
    logging_obj: Any,
    **kwargs,
) -> OCRResponse:
    """
    Async transform Azure Document Intelligence response to Mistral OCR format.

    Handles async operation polling: if the response is 202 Accepted, polls
    Operation-Location until the analysis completes, awaiting between polls.
    The completed response is converted exactly as in the sync transform.

    Args:
        model: Model name
        raw_response: Raw HTTP response from Azure DI (may be 202 Accepted)
        logging_obj: Logging object (unused here)

    Returns:
        OCRResponse in Mistral format

    Raises:
        Exception: Any error from polling or parsing is logged and re-raised
            unchanged.
    """
    try:
        if raw_response.status_code == 202:
            verbose_logger.debug(
                "Azure DI returned 202 Accepted, polling operation (async)..."
            )
            operation_url, poll_headers = self._resolve_polling_target(
                raw_response
            )
            # The polling budget is the module-level default.
            raw_response = await self._poll_operation_async(
                operation_url=operation_url,
                headers=poll_headers,
                timeout_secs=AZURE_OPERATION_POLLING_TIMEOUT,
            )

        return self._build_ocr_response(model=model, raw_response=raw_response)

    except Exception as e:
        verbose_logger.error(
            f"Error parsing Azure Document Intelligence response (async): {e}"
        )
        raise
|
||||
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Azure AI OCR transformation implementation.
|
||||
"""
|
||||
from typing import Dict, Optional
|
||||
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.litellm_core_utils.prompt_templates.image_handling import (
|
||||
async_convert_url_to_base64,
|
||||
convert_url_to_base64,
|
||||
)
|
||||
from litellm.llms.base_llm.ocr.transformation import DocumentType, OCRRequestData
|
||||
from litellm.llms.mistral.ocr.transformation import MistralOCRConfig
|
||||
from litellm.secret_managers.main import get_secret_str
|
||||
|
||||
|
||||
class AzureAIOCRConfig(MistralOCRConfig):
|
||||
"""
|
||||
Azure AI OCR transformation configuration.
|
||||
|
||||
Azure AI uses Mistral's OCR API but with a different endpoint format.
|
||||
Inherits transformation logic from MistralOCRConfig since they use the same format.
|
||||
|
||||
Reference: Azure AI Foundry OCR documentation
|
||||
|
||||
Important: Azure AI only supports base64 data URIs (data:image/..., data:application/pdf;base64,...).
|
||||
Regular URLs are not supported.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
    """Initialise the config; all state comes from the parent class."""
    super().__init__()
|
||||
|
||||
def validate_environment(
    self,
    headers: Dict,
    model: str,
    api_key: Optional[str] = None,
    api_base: Optional[str] = None,
    litellm_params: Optional[dict] = None,
    **kwargs,
) -> Dict:
    """
    Check credentials/endpoint and build the request headers for Azure AI OCR.

    The key is sent as a Bearer token. Missing values are looked up via
    ``get_secret_str`` before giving up.

    Args:
        headers: Caller-supplied headers; these win over the defaults built here.
        model: Model name (not consulted).
        api_key: API key; falls back to AZURE_AI_API_KEY.
        api_base: API base; falls back to AZURE_AI_API_BASE. Only its
            presence is validated here.
        litellm_params: Unused.

    Returns:
        Header dict with Authorization, JSON content type and ``headers``.

    Raises:
        ValueError: If the key or the API base cannot be resolved.
    """
    resolved_key = (
        api_key if api_key is not None else get_secret_str("AZURE_AI_API_KEY")
    )
    if resolved_key is None:
        raise ValueError(
            "Missing Azure AI API Key - A call is being made to Azure AI but no key is set either in the environment variables or via params"
        )

    resolved_base = (
        api_base if api_base is not None else get_secret_str("AZURE_AI_API_BASE")
    )
    if resolved_base is None:
        raise ValueError(
            "Missing Azure AI API Base - Set AZURE_AI_API_BASE environment variable or pass api_base parameter"
        )

    # Defaults first, then caller headers so explicit values take precedence.
    merged_headers = {
        "Authorization": f"Bearer {resolved_key}",
        "Content-Type": "application/json",
    }
    merged_headers.update(headers)
    return merged_headers
|
||||
|
||||
def get_complete_url(
    self,
    api_base: Optional[str],
    model: str,
    optional_params: dict,
    litellm_params: Optional[dict] = None,
    **kwargs,
) -> str:
    """
    Get complete URL for Azure AI OCR endpoint.

    Azure AI endpoint format: https://<api_base>/providers/mistral/azure/ocr

    When ``api_base`` is not passed, the AZURE_AI_API_BASE secret is used,
    matching what ``validate_environment`` accepts and what the error
    message below tells the user to set.

    Args:
        api_base: Azure AI API base URL (falls back to AZURE_AI_API_BASE)
        model: Model name (not used in URL construction)
        optional_params: Optional parameters (unused)
        litellm_params: Unused.

    Returns:
        Complete URL for Azure AI OCR endpoint

    Raises:
        ValueError: If no API base can be resolved.
    """
    if api_base is None:
        api_base = get_secret_str("AZURE_AI_API_BASE")

    if api_base is None:
        raise ValueError(
            "Missing Azure AI API Base - Set AZURE_AI_API_BASE environment variable or pass api_base parameter"
        )

    # Strip trailing slashes so the path join does not produce "//".
    api_base = api_base.rstrip("/")

    return f"{api_base}/providers/mistral/azure/ocr"
|
||||
|
||||
def _convert_url_to_data_uri_sync(self, url: str) -> str:
    """
    Fetch ``url`` and return its content as a base64 data URI (blocking).

    Used because, per this module's notes, Azure AI OCR cannot fetch remote
    URLs itself, so documents must be inlined before the request is sent.

    Args:
        url: The URL to convert.

    Returns:
        Base64 data URI string, as produced by ``convert_url_to_base64``.
    """
    verbose_logger.debug(
        f"Azure AI OCR: Converting URL to base64 data URI (sync): {url}"
    )

    # The helper's result is used as-is; it is expected to already be a full
    # data URI like "data:image/jpeg;base64,...".
    encoded_uri = convert_url_to_base64(url=url)

    verbose_logger.debug(
        f"Azure AI OCR: Converted URL to data URI (length: {len(encoded_uri)})"
    )
    return encoded_uri
|
||||
|
||||
async def _convert_url_to_data_uri_async(self, url: str) -> str:
    """
    Fetch ``url`` and return its content as a base64 data URI (awaitable).

    Used because, per this module's notes, Azure AI OCR cannot fetch remote
    URLs itself, so documents must be inlined before the request is sent.

    Args:
        url: The URL to convert.

    Returns:
        Base64 data URI string, as produced by ``async_convert_url_to_base64``.
    """
    verbose_logger.debug(
        f"Azure AI OCR: Converting URL to base64 data URI (async): {url}"
    )

    # The helper's result is used as-is; it is expected to already be a full
    # data URI like "data:image/jpeg;base64,...".
    encoded_uri = await async_convert_url_to_base64(url=url)

    verbose_logger.debug(
        f"Azure AI OCR: Converted URL to data URI (length: {len(encoded_uri)})"
    )
    return encoded_uri
|
||||
|
||||
def transform_ocr_request(
    self,
    model: str,
    document: DocumentType,
    optional_params: dict,
    headers: dict,
    **kwargs,
) -> OCRRequestData:
    """
    Build the Azure AI OCR request, inlining remote URLs as data URIs (sync).

    For "document_url" and "image_url" documents, a non-empty URL that is
    not already a ``data:`` URI is fetched and replaced by its base64 data
    URI. The caller's dict is left untouched; a shallow copy is modified and
    handed to the parent class to build the final request.

    Args:
        model: Model name
        document: Document dict from user
        optional_params: Already mapped optional parameters
        headers: Request headers
        **kwargs: Additional arguments, forwarded to the parent transform

    Returns:
        OCRRequestData with JSON data

    Raises:
        ValueError: If ``document`` is not a dict.
    """
    verbose_logger.debug(
        f"Azure AI OCR transform_ocr_request (sync) - model: {model}"
    )

    if not isinstance(document, dict):
        raise ValueError(f"Expected document dict, got {type(document)}")

    request_document = document.copy()
    document_kind = document.get("type")

    # (URL field, log line) for each document type that may need inlining.
    inlinable_fields = (
        (
            "document_url",
            "Azure AI OCR: Converting document URL to base64 data URI (sync)",
        ),
        (
            "image_url",
            "Azure AI OCR: Converting image URL to base64 data URI (sync)",
        ),
    )
    for url_field, log_line in inlinable_fields:
        if document_kind != url_field:
            continue
        remote_url = document.get(url_field, "")
        # Skip empty values and anything that is already a data URI.
        if remote_url and not remote_url.startswith("data:"):
            verbose_logger.debug(log_line)
            request_document[url_field] = self._convert_url_to_data_uri_sync(
                url=remote_url
            )
        break

    # The parent class builds the actual request payload.
    return super().transform_ocr_request(
        model=model,
        document=request_document,
        optional_params=optional_params,
        headers=headers,
        **kwargs,
    )
|
||||
|
||||
async def async_transform_ocr_request(
    self,
    model: str,
    document: DocumentType,
    optional_params: dict,
    headers: dict,
    **kwargs,
) -> OCRRequestData:
    """
    Build the Azure AI OCR request, inlining remote URLs as data URIs (async).

    For "document_url" and "image_url" documents, a non-empty URL that is
    not already a ``data:`` URI is fetched (awaited) and replaced by its
    base64 data URI. The caller's dict is left untouched; a shallow copy is
    modified and handed to the parent's synchronous transform to build the
    final request.

    Args:
        model: Model name
        document: Document dict from user
        optional_params: Already mapped optional parameters
        headers: Request headers
        **kwargs: Additional arguments, forwarded to the parent transform

    Returns:
        OCRRequestData with JSON data

    Raises:
        ValueError: If ``document`` is not a dict.
    """
    verbose_logger.debug(
        f"Azure AI OCR async_transform_ocr_request - model: {model}"
    )

    if not isinstance(document, dict):
        raise ValueError(f"Expected document dict, got {type(document)}")

    request_document = document.copy()
    document_kind = document.get("type")

    # (URL field, log line) for each document type that may need inlining.
    inlinable_fields = (
        (
            "document_url",
            "Azure AI OCR: Converting document URL to base64 data URI (async)",
        ),
        (
            "image_url",
            "Azure AI OCR: Converting image URL to base64 data URI (async)",
        ),
    )
    for url_field, log_line in inlinable_fields:
        if document_kind != url_field:
            continue
        remote_url = document.get(url_field, "")
        # Skip empty values and anything that is already a data URI.
        if remote_url and not remote_url.startswith("data:"):
            verbose_logger.debug(log_line)
            request_document[url_field] = await self._convert_url_to_data_uri_async(
                url=remote_url
            )
        break

    # The parent class builds the actual request payload.
    return super().transform_ocr_request(
        model=model,
        document=request_document,
        optional_params=optional_params,
        headers=headers,
        **kwargs,
    )
|
||||
Reference in New Issue
Block a user