"""
|
|
Firecrawl scraper for deep web crawling and hierarchical content extraction.
|
|
"""
|
|
from typing import Dict, Any, Optional, List
|
|
from scrapers.base_scraper import BaseScraper
|
|
from utils.retry import retry_with_backoff
|
|
from config import FIRECRAWL_API_KEY
|
|
|
|
|
|


class FirecrawlScraper(BaseScraper):
    """
    Scraper using Firecrawl for deep web content extraction.

    Preferred for crawling deep web content or when data depth is critical.
    """

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        """
        Initialize the Firecrawl scraper.

        Args:
            api_key: Firecrawl API key (defaults to FIRECRAWL_API_KEY from config)
            **kwargs: Additional arguments forwarded to BaseScraper
        """
        super().__init__(**kwargs)
        self.api_key = api_key or FIRECRAWL_API_KEY

        if not self.api_key:
            self.logger.warning("Firecrawl API key not provided. Set FIRECRAWL_API_KEY in .env")

        # Import lazily so this module still loads when firecrawl-py is absent.
        try:
            from firecrawl import FirecrawlApp

            self.client = FirecrawlApp(api_key=self.api_key) if self.api_key else None
        except ImportError:
            self.logger.error("Firecrawl library not installed. Install with: pip install firecrawl-py")
            self.client = None

    @retry_with_backoff(max_retries=3)
    def scrape(self, url: str, **kwargs) -> Dict[str, Any]:
        """
        Scrape a single URL using Firecrawl.

        Args:
            url: Target URL to scrape
            **kwargs: Additional parameters passed through to Firecrawl

        Returns:
            Dictionary containing the scraped content and metadata
        """
        if not self.client:
            return {
                "url": url,
                "error": "Firecrawl client not initialized",
                "success": False
            }

        self.logger.info(f"Scraping URL with Firecrawl: {url}")
        # Honor the shared rate limit before hitting the API.
        self.rate_limiter.wait()

        try:
            result = self.client.scrape_url(url, params=kwargs)

            return {
                "url": url,
                "content": result.get("content", ""),
                "markdown": result.get("markdown", ""),
                "metadata": result.get("metadata", {}),
                "success": True
            }
        except Exception as e:
            self.logger.error(f"Firecrawl scraping failed for {url}: {str(e)}")
            return {
                "url": url,
                "error": str(e),
                "success": False
            }

    def crawl(
        self,
        url: str,
        max_depth: int = 2,
        max_pages: int = 10,
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Crawl a website hierarchically using Firecrawl.

        Args:
            url: Starting URL for the crawl
            max_depth: Maximum crawl depth
            max_pages: Maximum number of pages to crawl
            include_patterns: URL path patterns to include
            exclude_patterns: URL path patterns to exclude
            **kwargs: Additional crawl parameters forwarded to Firecrawl

        Returns:
            Dictionary containing all crawled pages and their content
        """
        if not self.client:
            return {
                "url": url,
                "error": "Firecrawl client not initialized",
                "success": False
            }

        self.logger.info(f"Starting crawl from {url} (max_depth={max_depth}, max_pages={max_pages})")

        # Firecrawl expects camelCase keys for crawl options.
        crawl_params = {
            "maxDepth": max_depth,
            "limit": max_pages
        }

        if include_patterns:
            crawl_params["includePaths"] = include_patterns

        if exclude_patterns:
            crawl_params["excludePaths"] = exclude_patterns

        # Forward any extra options supplied by the caller; previously these
        # were accepted but silently dropped.
        crawl_params.update(kwargs)

        try:
            result = self.client.crawl_url(url, params=crawl_params)

            return {
                "url": url,
                "pages": result.get("data", []),
                "total_pages": len(result.get("data", [])),
                "success": True
            }
        except Exception as e:
            self.logger.error(f"Firecrawl crawling failed for {url}: {str(e)}")
            return {
                "url": url,
                "error": str(e),
                "success": False
            }
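

# Example usage: a minimal sketch, assuming FIRECRAWL_API_KEY is set in .env
# and that the scrapers, utils, and config modules above are importable. The
# URL is purely illustrative.
if __name__ == "__main__":
    scraper = FirecrawlScraper()

    # Single-page scrape: check the "success" flag before using the content.
    page = scraper.scrape("https://example.com")
    if page["success"]:
        print(page["markdown"][:500])
    else:
        print(f"Scrape failed: {page['error']}")

    # Hierarchical crawl: two levels deep, capped at five pages.
    site = scraper.crawl("https://example.com", max_depth=2, max_pages=5)
    if site["success"]:
        print(f"Crawled {site['total_pages']} pages")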