# stremio-sekai/scrapers/base_scraper.py
"""
Base scraper class with common functionality.
"""

from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional

from utils.logger import setup_logger
from utils.rate_limiter import RateLimiter
from config import RATE_LIMIT_DELAY


class BaseScraper(ABC):
"""
Abstract base class for all scrapers.
Provides common functionality and enforces interface consistency.
"""
def __init__(self, rate_limit: Optional[float] = None):
"""
Initialize base scraper.
Args:
rate_limit: Delay between requests in seconds (default from config)
"""
self.logger = setup_logger(self.__class__.__name__)
self.rate_limiter = RateLimiter(
min_delay=rate_limit or RATE_LIMIT_DELAY,
max_delay=(rate_limit or RATE_LIMIT_DELAY) * 2
)

    @abstractmethod
    def scrape(self, url: str, **kwargs) -> Dict[str, Any]:
        """
        Main scraping method, implemented by subclasses.

        Args:
            url: Target URL to scrape.
            **kwargs: Additional scraping parameters.

        Returns:
            Dictionary containing the scraped data.
        """
        pass

    def validate_data(self, data: Dict[str, Any], required_fields: List[str]) -> bool:
        """
        Validate that scraped data contains all required fields.

        Args:
            data: Data to validate.
            required_fields: Names of fields that must be present.

        Returns:
            True if all required fields are present, False otherwise.
        """
        missing_fields = [field for field in required_fields if field not in data]
        if missing_fields:
            self.logger.warning(f"Missing required fields: {missing_fields}")
            return False
        return True

    def cleanup(self):
        """
        Clean up and release resources.

        The base implementation is a no-op; override in subclasses that
        acquire resources.
        """
        pass

    def __enter__(self):
        """Context manager entry; returns the scraper itself."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit; always runs cleanup()."""
        self.cleanup()
        # Falling through returns None, so any exception propagates to the caller.
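

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal concrete subclass showing how the pieces fit together: implement
# scrape(), check results with validate_data(), and let the context manager
# drive cleanup(). The `requests` dependency and the RateLimiter.wait() call
# are assumptions for this sketch; the real API in utils.rate_limiter may differ.
if __name__ == "__main__":
    import requests

    class TitleScraper(BaseScraper):
        """Toy scraper that fetches a page and pulls out its <title> text."""

        def scrape(self, url: str, **kwargs) -> Dict[str, Any]:
            self.rate_limiter.wait()  # assumed method name, not confirmed by the source
            response = requests.get(url, timeout=kwargs.get("timeout", 10))
            response.raise_for_status()
            text = response.text
            start = text.find("<title>")
            end = text.find("</title>", start)
            if start != -1 and end != -1:
                title = text[start + len("<title>"):end].strip()
            else:
                title = ""
            return {"url": url, "title": title}

    # cleanup() runs even if scrape() raises, courtesy of __exit__.
    with TitleScraper(rate_limit=1.0) as scraper:
        result = scraper.scrape("https://example.com")
        if scraper.validate_data(result, ["url", "title"]):
            print(result)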