stremio-sekai/scrapers/basic_scraper.py
2025-10-31 19:03:17 +01:00

115 lines
3.4 KiB
Python

"""
Basic scraper using requests and BeautifulSoup for static websites.
"""
import requests
from bs4 import BeautifulSoup
from typing import Dict, Any, Optional
from requests.exceptions import RequestException, Timeout
from scrapers.base_scraper import BaseScraper
from utils.retry import retry_with_backoff
from config import DEFAULT_HEADERS, TIMEOUT
class BasicScraper(BaseScraper):
"""
Scraper for static websites using requests and BeautifulSoup.
"""
def __init__(self, headers: Optional[Dict[str, str]] = None, **kwargs):
"""
Initialize basic scraper.
Args:
headers: Custom HTTP headers (default from config)
**kwargs: Additional arguments for BaseScraper
"""
super().__init__(**kwargs)
self.headers = headers or DEFAULT_HEADERS
self.session = requests.Session()
self.session.headers.update(self.headers)
@retry_with_backoff(
max_retries=3,
exceptions=(RequestException, Timeout)
)
def scrape(self, url: str, parser: str = "lxml", **kwargs) -> Dict[str, Any]:
"""
Scrape a static website.
Args:
url: Target URL to scrape
parser: HTML parser to use (default: lxml)
**kwargs: Additional parameters for requests.get()
Returns:
Dictionary containing status, HTML content, and BeautifulSoup object
"""
self.logger.info(f"Scraping URL: {url}")
self.rate_limiter.wait()
try:
response = self.session.get(
url,
timeout=kwargs.get('timeout', TIMEOUT),
**kwargs
)
response.raise_for_status()
soup = BeautifulSoup(response.content, parser)
return {
"url": url,
"status_code": response.status_code,
"html": response.text,
"soup": soup,
"headers": dict(response.headers),
"success": True
}
except RequestException as e:
self.logger.error(f"Request failed for {url}: {str(e)}")
return {
"url": url,
"error": str(e),
"success": False
}
def extract_text(self, soup: BeautifulSoup, selector: str) -> list:
"""
Extract text from elements matching a CSS selector.
Args:
soup: BeautifulSoup object
selector: CSS selector
Returns:
List of text content from matched elements
"""
elements = soup.select(selector)
return [elem.get_text(strip=True) for elem in elements]
def extract_links(self, soup: BeautifulSoup, base_url: str = "") -> list:
"""
Extract all links from the page.
Args:
soup: BeautifulSoup object
base_url: Base URL for resolving relative links
Returns:
List of absolute URLs
"""
from urllib.parse import urljoin
links = []
for link in soup.find_all('a', href=True):
absolute_url = urljoin(base_url, link['href'])
links.append(absolute_url)
return links
def cleanup(self):
"""Close the requests session."""
self.session.close()
self.logger.info("Session closed")