219 lines
7.4 KiB
Python
219 lines
7.4 KiB
Python
"""
|
|
Calls Firecrawl's /search endpoint to search the web.
|
|
|
|
Firecrawl API Reference: https://docs.firecrawl.dev/api-reference/endpoint/search
|
|
"""
|
|
from typing import Dict, List, Optional, TypedDict, Union
|
|
|
|
import httpx
|
|
|
|
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
|
|
from litellm.llms.base_llm.search.transformation import (
|
|
BaseSearchConfig,
|
|
SearchResponse,
|
|
SearchResult,
|
|
)
|
|
from litellm.secret_managers.main import get_secret_str
|
|
|
|
|
|
class _FirecrawlSearchRequestRequired(TypedDict):
|
|
"""Required fields for Firecrawl Search API request."""
|
|
|
|
query: str # Required - search query
|
|
|
|
|
|
class FirecrawlSearchRequest(_FirecrawlSearchRequestRequired, total=False):
|
|
"""
|
|
Firecrawl Search API request format.
|
|
Based on: https://docs.firecrawl.dev/api-reference/endpoint/search
|
|
"""
|
|
|
|
limit: int # Optional - maximum number of results to return (default 5, max 100)
|
|
sources: List[
|
|
str
|
|
] # Optional - sources to search ('web', 'images', 'news'), default ['web']
|
|
categories: List[
|
|
Dict[str, str]
|
|
] # Optional - categories to filter by (github, research, pdf)
|
|
tbs: str # Optional - time-based search parameter
|
|
location: str # Optional - location parameter for geo-targeting
|
|
country: str # Optional - ISO country code (default 'US')
|
|
timeout: int # Optional - timeout in milliseconds (default 60000)
|
|
ignoreInvalidURLs: bool # Optional - exclude invalid URLs (default false)
|
|
scrapeOptions: Dict # Optional - options for scraping search results
|
|
|
|
|
|
class FirecrawlSearchConfig(BaseSearchConfig):
|
|
FIRECRAWL_API_BASE = "https://api.firecrawl.dev/v2"
|
|
|
|
@staticmethod
|
|
def ui_friendly_name() -> str:
|
|
return "Firecrawl"
|
|
|
|
def validate_environment(
|
|
self,
|
|
headers: Dict,
|
|
api_key: Optional[str] = None,
|
|
api_base: Optional[str] = None,
|
|
**kwargs,
|
|
) -> Dict:
|
|
"""
|
|
Validate environment and return headers.
|
|
"""
|
|
api_key = api_key or get_secret_str("FIRECRAWL_API_KEY")
|
|
if not api_key:
|
|
raise ValueError(
|
|
"FIRECRAWL_API_KEY is not set. Set `FIRECRAWL_API_KEY` environment variable."
|
|
)
|
|
headers["Authorization"] = f"Bearer {api_key}"
|
|
headers["Content-Type"] = "application/json"
|
|
return headers
|
|
|
|
def get_complete_url(
|
|
self,
|
|
api_base: Optional[str],
|
|
optional_params: dict,
|
|
data: Optional[Union[Dict, List[Dict]]] = None,
|
|
**kwargs,
|
|
) -> str:
|
|
"""
|
|
Get complete URL for Search endpoint.
|
|
"""
|
|
api_base = (
|
|
api_base or get_secret_str("FIRECRAWL_API_BASE") or self.FIRECRAWL_API_BASE
|
|
)
|
|
|
|
# Append "/search" to the api base if it's not already there
|
|
if not api_base.endswith("/search"):
|
|
api_base = f"{api_base}/search"
|
|
|
|
return api_base
|
|
|
|
def transform_search_request(
|
|
self,
|
|
query: Union[str, List[str]],
|
|
optional_params: dict,
|
|
**kwargs,
|
|
) -> Dict:
|
|
"""
|
|
Transform Search request to Firecrawl API format.
|
|
|
|
Transforms Perplexity unified spec parameters:
|
|
- query → query (same)
|
|
- max_results → limit
|
|
- search_domain_filter → (not directly supported, can use scrapeOptions)
|
|
- country → country
|
|
- max_tokens_per_page → (not applicable, ignored)
|
|
|
|
All other Firecrawl-specific parameters are passed through as-is.
|
|
|
|
Args:
|
|
query: Search query (string or list of strings). Firecrawl only supports single string queries.
|
|
optional_params: Optional parameters for the request
|
|
|
|
Returns:
|
|
Dict with typed request data following FirecrawlSearchRequest spec
|
|
"""
|
|
if isinstance(query, list):
|
|
# Firecrawl only supports single string queries, join with spaces
|
|
query = " ".join(query)
|
|
|
|
request_data: FirecrawlSearchRequest = {
|
|
"query": query,
|
|
}
|
|
|
|
# Transform Perplexity unified spec parameters to Firecrawl format
|
|
if "max_results" in optional_params:
|
|
request_data["limit"] = optional_params["max_results"]
|
|
|
|
if "country" in optional_params:
|
|
request_data["country"] = optional_params["country"]
|
|
|
|
# Convert to dict before dynamic key assignments
|
|
result_data = dict(request_data)
|
|
|
|
# pass through all other parameters as-is
|
|
for param, value in optional_params.items():
|
|
if (
|
|
param not in self.get_supported_perplexity_optional_params()
|
|
and param not in result_data
|
|
):
|
|
result_data[param] = value
|
|
|
|
# By default, request markdown content if not explicitly specified
|
|
# Firecrawl doesn't return content unless explicitly requested via scrapeOptions
|
|
if "scrapeOptions" not in result_data:
|
|
result_data["scrapeOptions"] = {
|
|
"formats": ["markdown"],
|
|
"onlyMainContent": True,
|
|
}
|
|
|
|
return result_data
|
|
|
|
def transform_search_response(
|
|
self,
|
|
raw_response: httpx.Response,
|
|
logging_obj: LiteLLMLoggingObj,
|
|
**kwargs,
|
|
) -> SearchResponse:
|
|
"""
|
|
Transform Firecrawl API response to LiteLLM unified SearchResponse format.
|
|
|
|
Firecrawl → LiteLLM mappings:
|
|
- data.web[].title → SearchResult.title
|
|
- data.web[].url → SearchResult.url
|
|
- data.web[].description OR data.web[].markdown → SearchResult.snippet
|
|
- No date field in web results (set to None)
|
|
- No last_updated field in Firecrawl response (set to None)
|
|
|
|
Note: Firecrawl v2 returns results organized by source type (web, images, news).
|
|
We primarily use web results for the unified format.
|
|
|
|
Args:
|
|
raw_response: Raw httpx response from Firecrawl API
|
|
logging_obj: Logging object for tracking
|
|
|
|
Returns:
|
|
SearchResponse with standardized format
|
|
"""
|
|
response_json = raw_response.json()
|
|
|
|
# Transform results to SearchResult objects
|
|
results = []
|
|
|
|
# Process web results (primary source)
|
|
data = response_json.get("data", {})
|
|
web_results = data.get("web", [])
|
|
|
|
for result in web_results:
|
|
# Use markdown if available, otherwise fall back to description
|
|
snippet = result.get("markdown") or result.get("description", "")
|
|
|
|
search_result = SearchResult(
|
|
title=result.get("title", ""),
|
|
url=result.get("url", ""),
|
|
snippet=snippet,
|
|
date=None, # Web results don't include date
|
|
last_updated=None, # Firecrawl doesn't provide last_updated in response
|
|
)
|
|
results.append(search_result)
|
|
|
|
# Process news results if available (they have date field)
|
|
news_results = data.get("news", [])
|
|
for result in news_results:
|
|
snippet = result.get("markdown") or result.get("snippet", "")
|
|
|
|
search_result = SearchResult(
|
|
title=result.get("title", ""),
|
|
url=result.get("url", ""),
|
|
snippet=snippet,
|
|
date=result.get("date"), # News results include date
|
|
last_updated=None,
|
|
)
|
|
results.append(search_result)
|
|
|
|
return SearchResponse(
|
|
results=results,
|
|
object="search",
|
|
)
|