stremio-sekai/scrapers/firecrawl_scraper.py

"""
Firecrawl scraper for deep web crawling and hierarchical content extraction.
"""
from typing import Dict, Any, Optional, List

from scrapers.base_scraper import BaseScraper
from utils.retry import retry_with_backoff
from config import FIRECRAWL_API_KEY


class FirecrawlScraper(BaseScraper):
"""
Scraper using Firecrawl for deep web content extraction.
Preferred for crawling deep web content or when data depth is critical.
"""

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        """
        Initialize the Firecrawl scraper.

        Args:
            api_key: Firecrawl API key (defaults to FIRECRAWL_API_KEY from config)
            **kwargs: Additional arguments forwarded to BaseScraper
        """
        super().__init__(**kwargs)
        self.api_key = api_key or FIRECRAWL_API_KEY
        if not self.api_key:
            self.logger.warning("Firecrawl API key not provided. Set FIRECRAWL_API_KEY in .env")

        # Import lazily so the module stays importable when firecrawl-py is absent.
        try:
            from firecrawl import FirecrawlApp
            self.client = FirecrawlApp(api_key=self.api_key) if self.api_key else None
        except ImportError:
            self.logger.error("Firecrawl library not installed. Install with: pip install firecrawl-py")
            self.client = None

    @retry_with_backoff(max_retries=3)
    def scrape(self, url: str, **kwargs) -> Dict[str, Any]:
        """
        Scrape a single URL using Firecrawl.

        Args:
            url: Target URL to scrape
            **kwargs: Additional parameters forwarded to Firecrawl

        Returns:
            Dictionary containing the scraped content and metadata
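
        Example (illustrative; keys mirror this method's return dict):
            >>> scraper = FirecrawlScraper()
            >>> page = scraper.scrape("https://example.com")  # doctest: +SKIP
            >>> page["success"]  # doctest: +SKIP
            True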
"""
if not self.client:
return {
"url": url,
"error": "Firecrawl client not initialized",
"success": False
}
self.logger.info(f"Scraping URL with Firecrawl: {url}")
self.rate_limiter.wait()
try:
result = self.client.scrape_url(url, params=kwargs)
return {
"url": url,
"content": result.get("content", ""),
"markdown": result.get("markdown", ""),
"metadata": result.get("metadata", {}),
"success": True
}
except Exception as e:
self.logger.error(f"Firecrawl scraping failed for {url}: {str(e)}")
return {
"url": url,
"error": str(e),
"success": False
}

    def crawl(
        self,
        url: str,
        max_depth: int = 2,
        max_pages: int = 10,
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Crawl a website hierarchically using Firecrawl.

        Args:
            url: Starting URL for the crawl
            max_depth: Maximum crawl depth
            max_pages: Maximum number of pages to crawl
            include_patterns: URL path patterns to include
            exclude_patterns: URL path patterns to exclude
            **kwargs: Additional Firecrawl crawl parameters (merged into the request)

        Returns:
            Dictionary containing all crawled pages and their content
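
        Example (illustrative):
            >>> scraper = FirecrawlScraper()
            >>> site = scraper.crawl("https://example.com", max_depth=1, max_pages=5)  # doctest: +SKIP
            >>> site["total_pages"] <= 5  # doctest: +SKIP
            True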
"""
if not self.client:
return {
"url": url,
"error": "Firecrawl client not initialized",
"success": False
}
self.logger.info(f"Starting crawl from {url} (max_depth={max_depth}, max_pages={max_pages})")
crawl_params = {
"maxDepth": max_depth,
"limit": max_pages
}
if include_patterns:
crawl_params["includePaths"] = include_patterns
if exclude_patterns:
crawl_params["excludePaths"] = exclude_patterns

        try:
            result = self.client.crawl_url(url, params=crawl_params)
            pages = result.get("data", [])
            return {
                "url": url,
                "pages": pages,
                "total_pages": len(pages),
                "success": True
            }
        except Exception as e:
            self.logger.error(f"Firecrawl crawling failed for {url}: {str(e)}")
            return {
                "url": url,
                "error": str(e),
                "success": False
            }
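

# ---------------------------------------------------------------------------
# Minimal usage sketch (a demonstration, not part of the scraper API). Assumes
# FIRECRAWL_API_KEY is set in .env and that BaseScraper needs no constructor
# arguments beyond those FirecrawlScraper forwards; https://example.com is a
# placeholder target.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    scraper = FirecrawlScraper()

    # Single-page scrape: check "success" before touching the content fields.
    page = scraper.scrape("https://example.com")
    if page["success"]:
        print(page["markdown"][:200])
    else:
        print(f"Scrape failed: {page['error']}")

    # Shallow crawl: cap depth and page count to keep API usage small.
    site = scraper.crawl("https://example.com", max_depth=1, max_pages=5)
    if site["success"]:
        print(f"Crawled {site['total_pages']} pages")
    else:
        print(f"Crawl failed: {site['error']}")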