"""
|
|
Basic scraper using requests and BeautifulSoup for static websites.
|
|
"""
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from typing import Dict, Any, Optional
|
|
from requests.exceptions import RequestException, Timeout
|
|
from scrapers.base_scraper import BaseScraper
|
|
from utils.retry import retry_with_backoff
|
|
from config import DEFAULT_HEADERS, TIMEOUT
|
|
|
|
|
|
class BasicScraper(BaseScraper):
    """
    Scraper for static websites using requests and BeautifulSoup.
    """

    def __init__(self, headers: Optional[Dict[str, str]] = None, **kwargs):
        """
        Initialize basic scraper.

        Args:
            headers: Custom HTTP headers (default from config)
            **kwargs: Additional arguments for BaseScraper
        """
        super().__init__(**kwargs)
        self.headers = headers or DEFAULT_HEADERS
        # Reuse a single Session so TCP connections are pooled and the
        # custom headers are sent on every request.
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    @retry_with_backoff(
        max_retries=3,
        exceptions=(RequestException, Timeout)
    )
    def _fetch(self, url: str, **kwargs) -> requests.Response:
        """
        Perform the GET request for scrape().

        Kept as a separate method so failures propagate to the
        retry_with_backoff decorator; if the exception were caught inside
        the decorated function, no retry would ever fire.
        """
        # Honor a caller-supplied timeout, falling back to the configured
        # default. Using setdefault avoids passing 'timeout' twice when the
        # caller supplies one in kwargs.
        kwargs.setdefault("timeout", TIMEOUT)
        response = self.session.get(url, **kwargs)
        response.raise_for_status()
        return response

    def scrape(self, url: str, parser: str = "lxml", **kwargs) -> Dict[str, Any]:
        """
        Scrape a static website.

        Args:
            url: Target URL to scrape
            parser: HTML parser to use (default: lxml)
            **kwargs: Additional parameters for requests.get()

        Returns:
            Dictionary containing status, HTML content, and BeautifulSoup object
        """
        self.logger.info(f"Scraping URL: {url}")
        self.rate_limiter.wait()

        try:
            response = self._fetch(url, **kwargs)
            soup = BeautifulSoup(response.content, parser)

            return {
                "url": url,
                "status_code": response.status_code,
                "html": response.text,
                "soup": soup,
                "headers": dict(response.headers),
                "success": True,
            }

        except RequestException as e:
            self.logger.error(f"Request failed for {url}: {e}")
            return {
                "url": url,
                "error": str(e),
                "success": False,
            }

    def extract_text(self, soup: BeautifulSoup, selector: str) -> List[str]:
        """
        Extract text from elements matching a CSS selector.

        Args:
            soup: BeautifulSoup object
            selector: CSS selector

        Returns:
            List of text content from matched elements
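
        Example (illustrative sketch; assumes a constructed scraper instance):
            soup = BeautifulSoup("<p class='t'>Hi</p><p class='t'>Bye</p>", "lxml")
            scraper.extract_text(soup, "p.t")   # -> ['Hi', 'Bye']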
"""
|
|
elements = soup.select(selector)
|
|
return [elem.get_text(strip=True) for elem in elements]
|
|
|
|
    def extract_links(self, soup: BeautifulSoup, base_url: str = "") -> List[str]:
        """
        Extract all links from the page.

        Args:
            soup: BeautifulSoup object
            base_url: Base URL for resolving relative links

        Returns:
            List of absolute URLs
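
        Example (illustrative sketch):
            soup = BeautifulSoup('<a href="/about">About</a>', "lxml")
            scraper.extract_links(soup, base_url="https://example.com")
            # -> ['https://example.com/about']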
"""
|
|
from urllib.parse import urljoin
|
|
|
|
links = []
|
|
for link in soup.find_all('a', href=True):
|
|
absolute_url = urljoin(base_url, link['href'])
|
|
links.append(absolute_url)
|
|
|
|
return links
|
|
|
|
    def cleanup(self):
        """Close the requests session."""
        self.session.close()
        self.logger.info("Session closed")
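

if __name__ == "__main__":
    # Minimal usage sketch, assuming BaseScraper is default-constructible and
    # config supplies DEFAULT_HEADERS/TIMEOUT; "https://example.com" is a
    # placeholder target, not a tested endpoint.
    scraper = BasicScraper()
    try:
        result = scraper.scrape("https://example.com")
        if result["success"]:
            headings = scraper.extract_text(result["soup"], "h1")
            links = scraper.extract_links(result["soup"], base_url=result["url"])
            print(f"Headings: {headings}")
            print(f"Found {len(links)} links")
    finally:
        scraper.cleanup()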