"""
Base scraper class with common functionality.
"""
|
|
from abc import ABC, abstractmethod
|
|
from typing import Any, Dict, Optional
|
|
from utils.logger import setup_logger
|
|
from utils.rate_limiter import RateLimiter
|
|
from config import RATE_LIMIT_DELAY
|
|
|
|
|
|
class BaseScraper(ABC):
    """
    Abstract base class for all scrapers.

    Provides common functionality (logging, rate limiting, data
    validation, context-manager lifecycle) and enforces interface
    consistency via the abstract ``scrape`` method.
    """

    def __init__(self, rate_limit: Optional[float] = None):
        """
        Initialize base scraper.

        Args:
            rate_limit: Delay between requests in seconds. Falls back to
                RATE_LIMIT_DELAY from config only when None, so an
                explicit 0.0 is respected.
        """
        self.logger = setup_logger(self.__class__.__name__)
        # `is None` check (not `or`): a caller-supplied 0.0 delay must
        # not be silently replaced by the config default.
        delay = RATE_LIMIT_DELAY if rate_limit is None else rate_limit
        self.rate_limiter = RateLimiter(min_delay=delay, max_delay=delay * 2)

    @abstractmethod
    def scrape(self, url: str, **kwargs) -> Dict[str, Any]:
        """
        Main scraping method to be implemented by subclasses.

        Args:
            url: Target URL to scrape
            **kwargs: Additional scraping parameters

        Returns:
            Dictionary containing scraped data
        """

    def validate_data(self, data: Dict[str, Any], required_fields: list) -> bool:
        """
        Validate that scraped data contains required fields.

        Args:
            data: Data to validate
            required_fields: List of required field names

        Returns:
            True if all required fields are present, False otherwise
            (missing fields are logged at warning level).
        """
        missing_fields = [field for field in required_fields if field not in data]
        if missing_fields:
            self.logger.warning(f"Missing required fields: {missing_fields}")
            return False
        return True

    def cleanup(self):
        """
        Cleanup method for releasing resources.

        Override in subclasses if needed; invoked automatically on
        context-manager exit.
        """

    def __enter__(self):
        """Context manager entry: returns self for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: always runs cleanup(), even on error."""
        self.cleanup()
|
|
|