"""
|
|
Selenium scraper for JavaScript-heavy and dynamic websites.
|
|
"""
|
|
from typing import Dict, Any, Optional
|
|
from selenium import webdriver
|
|
from selenium.webdriver.chrome.service import Service
|
|
from selenium.webdriver.chrome.options import Options
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.common.exceptions import (
|
|
TimeoutException,
|
|
NoSuchElementException,
|
|
WebDriverException
|
|
)
|
|
from webdriver_manager.chrome import ChromeDriverManager
|
|
from bs4 import BeautifulSoup
|
|
from scrapers.base_scraper import BaseScraper
|
|
from utils.retry import retry_with_backoff
|
|
from config import SELENIUM_HEADLESS, SELENIUM_IMPLICIT_WAIT, USER_AGENT


class SeleniumScraper(BaseScraper):
    """
    Scraper for dynamic websites using Selenium WebDriver.
    """

    def __init__(self, headless: bool = SELENIUM_HEADLESS, **kwargs):
        """
        Initialize Selenium scraper.

        Args:
            headless: Run browser in headless mode
            **kwargs: Additional arguments for BaseScraper
        """
        super().__init__(**kwargs)
        self.headless = headless
        self.driver = None
        self._initialize_driver()

    def _initialize_driver(self):
        """Initialize Chrome WebDriver with appropriate options."""
        chrome_options = Options()

        if self.headless:
            chrome_options.add_argument("--headless=new")

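        # The flags and experimental options below reduce common automation
        # fingerprints (navigator.webdriver, the "controlled by automated test
        # software" banner) and keep Chrome stable in containers
        # (--no-sandbox, --disable-dev-shm-usage).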
chrome_options.add_argument(f"user-agent={USER_AGENT}")
|
|
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
|
|
chrome_options.add_argument("--disable-dev-shm-usage")
|
|
chrome_options.add_argument("--no-sandbox")
|
|
chrome_options.add_argument("--disable-gpu")
|
|
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
|
chrome_options.add_experimental_option("useAutomationExtension", False)
|
|
|
|
try:
|
|
service = Service(ChromeDriverManager().install())
|
|
self.driver = webdriver.Chrome(service=service, options=chrome_options)
|
|
self.driver.implicitly_wait(SELENIUM_IMPLICIT_WAIT)
|
|
self.logger.info("Chrome WebDriver initialized successfully")
|
|
except WebDriverException as e:
|
|
self.logger.error(f"Failed to initialize WebDriver: {str(e)}")
|
|
raise
|
|
|
|
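    # Transient failures (slow pages, dropped driver sessions) are retried
    # with backoff via the shared helper imported from utils.retry.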
    @retry_with_backoff(
        max_retries=2,
        exceptions=(TimeoutException, WebDriverException)
    )
    def scrape(self, url: str, wait_for: Optional[str] = None, **kwargs) -> Dict[str, Any]:
        """
        Scrape a dynamic website using Selenium.

        Args:
            url: Target URL to scrape
            wait_for: CSS selector to wait for before returning
            **kwargs: Additional parameters

        Returns:
            Dictionary containing page source and BeautifulSoup object
        """
        self.logger.info(f"Scraping URL with Selenium: {url}")
        self.rate_limiter.wait()

        try:
            self.driver.get(url)

            # Wait for specific element if provided
            if wait_for:
                timeout = kwargs.get('timeout', 10)
                WebDriverWait(self.driver, timeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, wait_for))
                )

            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'lxml')

            return {
                "url": url,
                "html": page_source,
                "soup": soup,
                "title": self.driver.title,
                "current_url": self.driver.current_url,
                "success": True
            }

        except (TimeoutException, WebDriverException) as e:
            self.logger.error(f"Selenium scraping failed for {url}: {str(e)}")
            return {
                "url": url,
                "error": str(e),
                "success": False
            }

    def click_element(self, selector: str, by: str = By.CSS_SELECTOR, timeout: int = 10):
        """
        Click an element on the page.

        Args:
            selector: Element selector
            by: Selenium locator strategy (default: By.CSS_SELECTOR)
            timeout: Wait timeout in seconds
        """
        try:
            element = WebDriverWait(self.driver, timeout).until(
                EC.element_to_be_clickable((by, selector))
            )
            element.click()
            self.logger.info(f"Clicked element: {selector}")
        except (TimeoutException, NoSuchElementException) as e:
            self.logger.error(f"Failed to click element {selector}: {str(e)}")
            raise

    def fill_form(self, selector: str, text: str, by: str = By.CSS_SELECTOR):
        """
        Fill a form field with text.

        Args:
            selector: Element selector
            text: Text to input
            by: Selenium locator strategy (default: By.CSS_SELECTOR)
        """
        try:
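            # find_element honours the implicit wait configured in
            # _initialize_driver, so briefly delayed fields are still located.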
            element = self.driver.find_element(by, selector)
            element.clear()
            element.send_keys(text)
            self.logger.info(f"Filled form field: {selector}")
        except NoSuchElementException as e:
            self.logger.error(f"Form field not found {selector}: {str(e)}")
            raise

    def execute_script(self, script: str):
        """
        Execute JavaScript in the browser.

        Args:
            script: JavaScript code to execute

        Returns:
            Result of script execution
        """
        return self.driver.execute_script(script)

    def take_screenshot(self, filepath: str):
        """
        Take a screenshot of the current page.

        Args:
            filepath: Path to save the screenshot
        """
        self.driver.save_screenshot(filepath)
        self.logger.info(f"Screenshot saved to {filepath}")

    def cleanup(self):
        """Quit the WebDriver and cleanup resources."""
        if self.driver:
            self.driver.quit()
            self.logger.info("WebDriver closed")
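

# Usage sketch (not part of the original module): drives the scraper end to
# end, assuming the imports above resolve (scrapers/, utils/, config.py) and
# a local Chrome install. The URL and CSS selector below are placeholders
# used purely for illustration.
if __name__ == "__main__":
    scraper = SeleniumScraper(headless=True)
    try:
        # Wait for the <body> element before reading the rendered DOM.
        result = scraper.scrape("https://example.com", wait_for="body")
        if result["success"]:
            print(result["title"])
            print(len(result["html"]), "bytes of rendered HTML")
    finally:
        scraper.cleanup()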