""" Main OCR function for LiteLLM. """ import asyncio import base64 import contextvars import mimetypes import os import re from functools import partial from io import IOBase from pathlib import Path from typing import Any, Coroutine, Dict, Optional, Union import httpx import litellm from litellm._logging import verbose_logger from litellm.constants import request_timeout from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.llms.base_llm.ocr.transformation import BaseOCRConfig, OCRResponse from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler from litellm.types.router import GenericLiteLLMParams from litellm.utils import ProviderConfigManager, client ####### ENVIRONMENT VARIABLES ################### base_llm_http_handler = BaseLLMHTTPHandler() ################################################# @client async def aocr( model: str, document: Dict[str, Any], api_key: Optional[str] = None, api_base: Optional[str] = None, timeout: Optional[Union[float, httpx.Timeout]] = None, custom_llm_provider: Optional[str] = None, extra_headers: Optional[Dict[str, Any]] = None, **kwargs, ) -> OCRResponse: """ Async OCR function. Args: model: Model name (e.g., "mistral/mistral-ocr-latest") document: Document to process in Mistral format: {"type": "document_url", "document_url": "https://..."} for PDFs/docs, {"type": "image_url", "image_url": "https://..."} for images, or {"type": "file", "file": } for local files api_key: Optional API key api_base: Optional API base URL timeout: Optional timeout custom_llm_provider: Optional custom LLM provider extra_headers: Optional extra headers **kwargs: Additional parameters (e.g., include_image_base64, pages, image_limit) Returns: OCRResponse in Mistral OCR format with pages, model, usage_info, etc. Example: ```python import litellm # OCR with PDF response = await litellm.aocr( model="mistral/mistral-ocr-latest", document={ "type": "document_url", "document_url": "https://arxiv.org/pdf/2201.04234" }, include_image_base64=True ) # OCR with image response = await litellm.aocr( model="mistral/mistral-ocr-latest", document={ "type": "image_url", "image_url": "https://example.com/image.png" } ) # OCR with base64 encoded PDF response = await litellm.aocr( model="mistral/mistral-ocr-latest", document={ "type": "document_url", "document_url": f"data:application/pdf;base64,{base64_pdf}" } ) # OCR with local file response = await litellm.aocr( model="mistral/mistral-ocr-latest", document={"type": "file", "file": "/path/to/document.pdf"} ) ``` """ local_vars = locals() try: loop = asyncio.get_event_loop() kwargs["aocr"] = True # Get custom llm provider if custom_llm_provider is None: _, custom_llm_provider, _, _ = litellm.get_llm_provider( model=model, api_base=api_base ) func = partial( ocr, model=model, document=document, api_key=api_key, api_base=api_base, timeout=timeout, custom_llm_provider=custom_llm_provider, extra_headers=extra_headers, **kwargs, ) ctx = contextvars.copy_context() func_with_context = partial(ctx.run, func) init_response = await loop.run_in_executor(None, func_with_context) if asyncio.iscoroutine(init_response): response = await init_response else: response = init_response if response is None: raise ValueError( f"Got an unexpected None response from the OCR API: {response}" ) return response except Exception as e: raise litellm.exception_type( model=model, custom_llm_provider=custom_llm_provider, original_exception=e, completion_kwargs=local_vars, extra_kwargs=kwargs, ) @client def ocr( model: str, document: Dict[str, Any], api_key: Optional[str] = None, api_base: Optional[str] = None, timeout: Optional[Union[float, httpx.Timeout]] = None, custom_llm_provider: Optional[str] = None, extra_headers: Optional[Dict[str, Any]] = None, **kwargs, ) -> Union[OCRResponse, Coroutine[Any, Any, OCRResponse]]: """ Synchronous OCR function. Args: model: Model name (e.g., "mistral/mistral-ocr-latest") document: Document to process in Mistral format: {"type": "document_url", "document_url": "https://..."} for PDFs/docs, {"type": "image_url", "image_url": "https://..."} for images, or {"type": "file", "file": } for local files api_key: Optional API key api_base: Optional API base URL timeout: Optional timeout custom_llm_provider: Optional custom LLM provider extra_headers: Optional extra headers **kwargs: Additional parameters (e.g., include_image_base64, pages, image_limit) Returns: OCRResponse in Mistral OCR format with pages, model, usage_info, etc. Example: ```python import litellm # OCR with PDF response = litellm.ocr( model="mistral/mistral-ocr-latest", document={ "type": "document_url", "document_url": "https://arxiv.org/pdf/2201.04234" }, include_image_base64=True ) # OCR with image response = litellm.ocr( model="mistral/mistral-ocr-latest", document={ "type": "image_url", "image_url": "https://example.com/image.png" } ) # OCR with base64 encoded PDF response = litellm.ocr( model="mistral/mistral-ocr-latest", document={ "type": "document_url", "document_url": f"data:application/pdf;base64,{base64_pdf}" } ) # OCR with local file response = litellm.ocr( model="mistral/mistral-ocr-latest", document={"type": "file", "file": "/path/to/document.pdf"} ) # Access pages for page in response.pages: print(f"Page {page.index}: {page.markdown}") ``` """ local_vars = locals() try: litellm_logging_obj: LiteLLMLoggingObj = kwargs.pop("litellm_logging_obj") # type: ignore litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None) _is_async = kwargs.pop("aocr", False) is True # Validate document parameter format if not isinstance(document, dict): raise ValueError( f"document must be a dict with 'type' and URL/file field, got {type(document)}" ) doc_type = document.get("type") # Handle file type: convert to document_url/image_url with base64 data URI if doc_type == "file": document = convert_file_document_to_url_document(document) doc_type = document.get("type") if doc_type not in ["document_url", "image_url"]: raise ValueError( f"Invalid document type: {doc_type}. " "Must be 'document_url', 'image_url', or 'file'" ) ( model, custom_llm_provider, dynamic_api_key, dynamic_api_base, ) = litellm.get_llm_provider( model=model, custom_llm_provider=custom_llm_provider, api_base=api_base, api_key=api_key, ) # Update with dynamic values if available if dynamic_api_key: api_key = dynamic_api_key if dynamic_api_base: api_base = dynamic_api_base # Get provider config ocr_provider_config: Optional[ BaseOCRConfig ] = ProviderConfigManager.get_provider_ocr_config( model=model, provider=litellm.LlmProviders(custom_llm_provider), ) if ocr_provider_config is None: raise ValueError( f"OCR is not supported for provider: {custom_llm_provider}" ) verbose_logger.debug( f"OCR call - model: {model}, provider: {custom_llm_provider}" ) # Get litellm params using GenericLiteLLMParams (same as responses API) litellm_params = GenericLiteLLMParams(**kwargs) # Extract OCR-specific parameters from kwargs supported_params = ocr_provider_config.get_supported_ocr_params(model=model) non_default_params = {} for param in supported_params: if param in kwargs: non_default_params[param] = kwargs.pop(param) # Map parameters to provider-specific format optional_params = ocr_provider_config.map_ocr_params( non_default_params=non_default_params, optional_params={}, model=model, ) verbose_logger.debug(f"OCR optional_params after mapping: {optional_params}") # Pre Call logging litellm_logging_obj.update_environment_variables( model=model, optional_params=optional_params, litellm_params={ "litellm_call_id": litellm_call_id, "api_base": api_base, }, custom_llm_provider=custom_llm_provider, ) # Call the handler - pass document dict directly response = base_llm_http_handler.ocr( model=model, document=document, # Pass the entire document dict optional_params=optional_params, timeout=timeout or request_timeout, logging_obj=litellm_logging_obj, api_key=api_key, api_base=api_base, custom_llm_provider=custom_llm_provider, aocr=_is_async, headers=extra_headers, provider_config=ocr_provider_config, litellm_params=dict(litellm_params), ) return response except Exception as e: raise litellm.exception_type( model=model, custom_llm_provider=custom_llm_provider, original_exception=e, completion_kwargs=local_vars, extra_kwargs=kwargs, ) ################################################# # Public utilities — used by the SDK and the proxy ################################################# _MIME_PATTERN = re.compile(r"^[\w.+-]+/[\w.+-]+$") _MIME_TYPE_MAP = { ".pdf": "application/pdf", ".png": "image/png", ".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".gif": "image/gif", ".webp": "image/webp", ".tiff": "image/tiff", ".tif": "image/tiff", ".bmp": "image/bmp", } def get_mime_type(file_path: str) -> str: """ Determine MIME type from file path extension. Falls back to mimetypes.guess_type, then to 'application/octet-stream'. """ ext = os.path.splitext(file_path)[1].lower() mime = _MIME_TYPE_MAP.get(ext) if mime: return mime guessed, _ = mimetypes.guess_type(file_path) return guessed or "application/octet-stream" def convert_file_document_to_url_document(document: Dict[str, Any]) -> Dict[str, str]: """ Convert a file-type document dict to a document_url-type document dict with an inline base64 data URI. Accepts document dicts like: {"type": "file", "file": "/path/to/document.pdf"} # file path string {"type": "file", "file": Path("/path/to/doc.pdf")} # pathlib.Path {"type": "file", "file": } # file-like object (BinaryIO) {"type": "file", "file": b"raw bytes"} # raw bytes Returns: {"type": "document_url", "document_url": "data:;base64,"} or {"type": "image_url", "image_url": "data:;base64,"} """ file_input = document.get("file") if file_input is None: raise ValueError( "document with type='file' must include a 'file' field containing " "a file path (str), pathlib.Path, file-like object, or bytes" ) file_bytes: bytes mime_type: str = "application/octet-stream" file_name: Optional[str] = None if isinstance(file_input, (str, Path)): file_path = str(file_input) if not os.path.isfile(file_path): raise FileNotFoundError(f"File not found: {file_path}") mime_type = get_mime_type(file_path) file_name = os.path.basename(file_path) with open(file_path, "rb") as f: file_bytes = f.read() elif isinstance(file_input, bytes): file_bytes = file_input elif isinstance(file_input, IOBase) or hasattr(file_input, "read"): if hasattr(file_input, "name"): file_name = getattr(file_input, "name", None) if file_name: mime_type = get_mime_type(file_name) file_bytes = file_input.read() if isinstance(file_bytes, str): file_bytes = file_bytes.encode("utf-8") else: raise ValueError( f"Unsupported file input type: {type(file_input)}. " "Expected str (file path), pathlib.Path, bytes, or a file-like object." ) if not file_bytes: raise ValueError("File is empty or could not be read") if "mime_type" in document: mime_type = document["mime_type"] if not _MIME_PATTERN.match(mime_type): raise ValueError(f"Invalid MIME type: {mime_type}") base64_data = base64.b64encode(file_bytes).decode("utf-8") data_uri = f"data:{mime_type};base64,{base64_data}" if mime_type.startswith("image/"): verbose_logger.debug( f"OCR file input: Converted file to image_url data URI " f"(mime={mime_type}, size={len(file_bytes)} bytes, name={file_name})" ) return {"type": "image_url", "image_url": data_uri} else: verbose_logger.debug( f"OCR file input: Converted file to document_url data URI " f"(mime={mime_type}, size={len(file_bytes)} bytes, name={file_name})" ) return {"type": "document_url", "document_url": data_uri}