lijiaoqiao/llm-gateway-competitors/litellm-wheel-src/litellm/llms/firecrawl/search/transformation.py

"""
Calls Firecrawl's /search endpoint to search the web.

Firecrawl API Reference: https://docs.firecrawl.dev/api-reference/endpoint/search
"""
from typing import Dict, List, Optional, TypedDict, Union

import httpx

from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.search.transformation import (
    BaseSearchConfig,
    SearchResponse,
    SearchResult,
)
from litellm.secret_managers.main import get_secret_str


class _FirecrawlSearchRequestRequired(TypedDict):
    """Mandatory portion of a Firecrawl Search API request body."""

    # The search query text; the only field that must always be present.
    query: str


class FirecrawlSearchRequest(_FirecrawlSearchRequestRequired, total=False):
    """
    Request body accepted by the Firecrawl Search API.

    Adds the optional settings below on top of the required ``query`` field.
    Reference: https://docs.firecrawl.dev/api-reference/endpoint/search
    """

    # Maximum number of results to return (default 5, max 100).
    limit: int
    # Sources to search ('web', 'images', 'news'); default ['web'].
    sources: List[str]
    # Categories to filter by (github, research, pdf).
    categories: List[Dict[str, str]]
    # Time-based search parameter.
    tbs: str
    # Location parameter for geo-targeting.
    location: str
    # ISO country code (default 'US').
    country: str
    # Timeout in milliseconds (default 60000).
    timeout: int
    # Exclude invalid URLs (default false).
    ignoreInvalidURLs: bool
    # Options for scraping the search results.
    scrapeOptions: Dict


class FirecrawlSearchConfig(BaseSearchConfig):
    """
    Search configuration that translates between LiteLLM's unified search
    interface and Firecrawl's /search endpoint.
    """

    # Default API root used when neither an explicit api_base nor the
    # FIRECRAWL_API_BASE secret is provided.
    FIRECRAWL_API_BASE = "https://api.firecrawl.dev/v2"

    @staticmethod
    def ui_friendly_name() -> str:
        """Return the display name for this provider."""
        return "Firecrawl"

    def validate_environment(
        self,
        headers: Dict,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        **kwargs,
    ) -> Dict:
        """
        Validate environment and return headers.

        Args:
            headers: Header dict to populate; it is updated in place.
            api_key: Explicit API key. Falls back to the `FIRECRAWL_API_KEY` secret.
            api_base: Unused here; accepted for interface compatibility.

        Returns:
            The same `headers` dict with `Authorization` and `Content-Type` set.

        Raises:
            ValueError: If no API key is passed and `FIRECRAWL_API_KEY` is not set.
        """
        api_key = api_key or get_secret_str("FIRECRAWL_API_KEY")
        if not api_key:
            raise ValueError(
                "FIRECRAWL_API_KEY is not set. Set `FIRECRAWL_API_KEY` environment variable."
            )
        headers["Authorization"] = f"Bearer {api_key}"
        headers["Content-Type"] = "application/json"
        return headers

    def get_complete_url(
        self,
        api_base: Optional[str],
        optional_params: dict,
        data: Optional[Union[Dict, List[Dict]]] = None,
        **kwargs,
    ) -> str:
        """
        Get complete URL for Search endpoint.

        Resolution order for the base: the `api_base` argument, then the
        `FIRECRAWL_API_BASE` secret, then the class default.

        Args:
            api_base: Optional base URL override.
            optional_params: Unused here; accepted for interface compatibility.
            data: Unused here; accepted for interface compatibility.

        Returns:
            The base URL guaranteed to end in "/search".
        """
        api_base = (
            api_base or get_secret_str("FIRECRAWL_API_BASE") or self.FIRECRAWL_API_BASE
        )

        # Strip trailing slashes first so ".../v2/" does not become
        # ".../v2//search" and ".../search/" is recognised as already complete.
        api_base = api_base.rstrip("/")

        # Append "/search" to the api base if it's not already there
        if not api_base.endswith("/search"):
            api_base = f"{api_base}/search"

        return api_base

    def transform_search_request(
        self,
        query: Union[str, List[str]],
        optional_params: dict,
        **kwargs,
    ) -> Dict:
        """
        Transform Search request to Firecrawl API format.

        Transforms Perplexity unified spec parameters:
        - query → query (same)
        - max_results → limit
        - search_domain_filter → (not directly supported, can use scrapeOptions)
        - country → country
        - max_tokens_per_page → (not applicable, ignored)

        All other Firecrawl-specific parameters are passed through as-is.

        Args:
            query: Search query (string or list of strings). Firecrawl only supports single string queries.
            optional_params: Optional parameters for the request

        Returns:
            Dict with typed request data following FirecrawlSearchRequest spec
        """
        if isinstance(query, list):
            # Firecrawl only supports single string queries, join with spaces
            query = " ".join(query)

        request_data: FirecrawlSearchRequest = {
            "query": query,
        }

        # Transform Perplexity unified spec parameters to Firecrawl format.
        # A value of None means "not specified", so it is not forwarded as a
        # JSON null.
        max_results = optional_params.get("max_results")
        if max_results is not None:
            request_data["limit"] = max_results

        country = optional_params.get("country")
        if country is not None:
            request_data["country"] = country

        # Convert to dict before dynamic key assignments
        result_data = dict(request_data)

        # Pass through all other parameters as-is. The unified-spec parameter
        # list is loop-invariant, so look it up once.
        unified_params = self.get_supported_perplexity_optional_params()
        for param, value in optional_params.items():
            if param not in unified_params and param not in result_data:
                result_data[param] = value

        # By default, request markdown content if not explicitly specified
        # Firecrawl doesn't return content unless explicitly requested via scrapeOptions
        if "scrapeOptions" not in result_data:
            result_data["scrapeOptions"] = {
                "formats": ["markdown"],
                "onlyMainContent": True,
            }

        return result_data

    @staticmethod
    def _build_search_result(
        result: Dict,
        fallback_snippet_key: str,
        date: Optional[str] = None,
    ) -> SearchResult:
        """
        Build one SearchResult from a single Firecrawl result entry.

        Args:
            result: One entry from the response's `web` or `news` list.
            fallback_snippet_key: Key used for the snippet when `markdown` is missing or empty.
            date: Date to attach to the result, if any.

        Returns:
            SearchResult whose title, url and snippet are always strings.
        """
        # `or ""` also covers keys that are present but null, which a plain
        # `.get(key, "")` default would let through as None.
        snippet = result.get("markdown") or result.get(fallback_snippet_key) or ""
        return SearchResult(
            title=result.get("title") or "",
            url=result.get("url") or "",
            snippet=snippet,
            date=date,
            last_updated=None,  # Firecrawl doesn't provide last_updated in response
        )

    def transform_search_response(
        self,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
        **kwargs,
    ) -> SearchResponse:
        """
        Transform Firecrawl API response to LiteLLM unified SearchResponse format.

        Firecrawl → LiteLLM mappings:
        - data.web[].title → SearchResult.title
        - data.web[].url → SearchResult.url
        - data.web[].markdown OR data.web[].description → SearchResult.snippet
        - data.news[].markdown OR data.news[].snippet → SearchResult.snippet
        - data.news[].date → SearchResult.date (web results get None)
        - No last_updated field in Firecrawl response (set to None)

        Web results are listed first, followed by news results. Other keys
        under `data` are not read.

        Args:
            raw_response: Raw httpx response from Firecrawl API
            logging_obj: Logging object for tracking (unused here)

        Returns:
            SearchResponse with standardized format
        """
        response_json = raw_response.json()

        # "data", "web" and "news" may be missing or explicitly null; treat
        # both cases as "no results" instead of failing on None.
        data = response_json.get("data") or {}

        # Process web results (primary source); they carry no date.
        results = [
            self._build_search_result(entry, "description")
            for entry in data.get("web") or []
        ]

        # Process news results if available (they have date field)
        results.extend(
            self._build_search_result(entry, "snippet", date=entry.get("date"))
            for entry in data.get("news") or []
        )

        return SearchResponse(
            results=results,
            object="search",
        )