stremio-sekai/scrapers/basic_scraper.py

"""
Basic scraper using requests and BeautifulSoup for static websites.
"""
import requests
from bs4 import BeautifulSoup
from typing import Dict, Any, Optional
from requests.exceptions import RequestException, Timeout
from scrapers.base_scraper import BaseScraper
from utils.retry import retry_with_backoff
from config import DEFAULT_HEADERS, TIMEOUT


class BasicScraper(BaseScraper):
    """
    Scraper for static websites using requests and BeautifulSoup.
    """

    def __init__(self, headers: Optional[Dict[str, str]] = None, **kwargs):
        """
        Initialize basic scraper.

        Args:
            headers: Custom HTTP headers (default from config)
            **kwargs: Additional arguments for BaseScraper
        """
        super().__init__(**kwargs)
        self.headers = headers or DEFAULT_HEADERS
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    @retry_with_backoff(
        max_retries=3,
        exceptions=(RequestException, Timeout)
    )
    def scrape(self, url: str, parser: str = "lxml", **kwargs) -> Dict[str, Any]:
        """
        Scrape a static website.

        Args:
            url: Target URL to scrape
            parser: HTML parser to use (default: lxml)
            **kwargs: Additional parameters for requests.get()

        Returns:
            Dictionary containing status, HTML content, and BeautifulSoup object
        """
        self.logger.info(f"Scraping URL: {url}")
        self.rate_limiter.wait()

        try:
            response = self.session.get(
                url,
                timeout=kwargs.get('timeout', TIMEOUT),
                **kwargs
            )
            response.raise_for_status()

            soup = BeautifulSoup(response.content, parser)

            return {
                "url": url,
                "status_code": response.status_code,
                "html": response.text,
                "soup": soup,
                "headers": dict(response.headers),
                "success": True
            }

        except RequestException as e:
            self.logger.error(f"Request failed for {url}: {str(e)}")
            return {
                "url": url,
                "error": str(e),
                "success": False
            }

    def extract_text(self, soup: BeautifulSoup, selector: str) -> list:
        """
        Extract text from elements matching a CSS selector.

        Args:
            soup: BeautifulSoup object
            selector: CSS selector

        Returns:
            List of text content from matched elements
        """
        elements = soup.select(selector)
        return [elem.get_text(strip=True) for elem in elements]

    def extract_links(self, soup: BeautifulSoup, base_url: str = "") -> list:
        """
        Extract all links from the page.

        Args:
            soup: BeautifulSoup object
            base_url: Base URL for resolving relative links

        Returns:
            List of absolute URLs
        """
        from urllib.parse import urljoin

        links = []
        for link in soup.find_all('a', href=True):
            absolute_url = urljoin(base_url, link['href'])
            links.append(absolute_url)

        return links

    def cleanup(self):
        """Close the requests session."""
        self.session.close()
        self.logger.info("Session closed")