stremio-sekai/main.py
2025-10-31 19:03:17 +01:00

130 lines
3.6 KiB
Python

"""
Main entry point for the web scraping project.
Example usage and demonstration of different scraping methods.
"""
import argparse
import logging
from typing import Optional

from scrapers import (
    BasicScraper,
    SeleniumScraper,
    JinaScraper,
    FirecrawlScraper,
    AgentQLScraper,
    MultionScraper
)
from data_processors.storage import DataStorage
from data_processors.validator import DataValidator
from utils.logger import setup_logger
logger = setup_logger(__name__)
def scrape_basic(url: str, output: Optional[str] = None):
    """Scrape a URL using basic HTTP requests.

    Args:
        url: Target URL to fetch.
        output: Optional file path; when given, the full result dict is
            saved there as JSON via DataStorage.

    Returns:
        The scraper's result dict on success, or None on failure.
    """
    logger.info(f"Starting basic scrape: {url}")
    with BasicScraper() as scraper:
        result = scraper.scrape(url)
        if result["success"]:
            logger.info(f"Successfully scraped {url}")
            if output:
                storage = DataStorage()
                storage.save_json(result, output)
                logger.info(f"Saved results to {output}")
            return result
        else:
            # Best-effort: log the scraper-reported error and signal failure
            # to the caller with None rather than raising.
            logger.error(f"Scraping failed: {result.get('error')}")
            return None
def scrape_dynamic(url: str, output: Optional[str] = None):
    """Scrape a URL using Selenium, for pages that require JavaScript.

    Args:
        url: Target URL to fetch.
        output: Optional file path; when given, the full result dict is
            saved there as JSON via DataStorage.

    Returns:
        The scraper's result dict on success, or None on failure.
    """
    logger.info(f"Starting Selenium scrape: {url}")
    # headless=True keeps the browser invisible — suitable for CLI/CI use.
    with SeleniumScraper(headless=True) as scraper:
        result = scraper.scrape(url)
        if result["success"]:
            logger.info(f"Successfully scraped {url}")
            if output:
                storage = DataStorage()
                storage.save_json(result, output)
                logger.info(f"Saved results to {output}")
            return result
        else:
            # Best-effort: log the scraper-reported error and signal failure
            # to the caller with None rather than raising.
            logger.error(f"Scraping failed: {result.get('error')}")
            return None
def scrape_jina(url: str, output: Optional[str] = None):
    """Scrape a URL using the Jina AI reader service.

    Args:
        url: Target URL to fetch.
        output: Optional file path; when given, only the markdown
            ``content`` field is saved there as plain text (unlike the
            other scrapers, which save the whole result as JSON).

    Returns:
        The scraper's result dict on success, or None on failure.
    """
    logger.info(f"Starting Jina scrape: {url}")
    with JinaScraper() as scraper:
        result = scraper.scrape(url, return_format="markdown")
        if result["success"]:
            logger.info(f"Successfully scraped {url}")
            if output:
                storage = DataStorage()
                storage.save_text(result["content"], output)
                logger.info(f"Saved results to {output}")
            return result
        else:
            # Best-effort: log the scraper-reported error and signal failure
            # to the caller with None rather than raising.
            logger.error(f"Scraping failed: {result.get('error')}")
            return None
def main():
    """Main entry point: parse CLI arguments and dispatch to a scraper.

    Exits normally even when the chosen method is unimplemented or the
    scrape fails; errors are reported via logging.
    """
    parser = argparse.ArgumentParser(
        description="Web Scraping Framework",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "url",
        help="Target URL to scrape"
    )
    parser.add_argument(
        "-m", "--method",
        choices=["basic", "selenium", "jina", "firecrawl", "agentql", "multion"],
        default="basic",
        help="Scraping method to use (default: basic)"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file path (optional)"
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )
    args = parser.parse_args()

    # Fix: --verbose was parsed but never acted upon. Raise the root
    # logger to DEBUG so all module loggers emit verbose output.
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Execute appropriate scraper
    if args.method == "basic":
        scrape_basic(args.url, args.output)
    elif args.method == "selenium":
        scrape_dynamic(args.url, args.output)
    elif args.method == "jina":
        scrape_jina(args.url, args.output)
    else:
        # Remaining choices are accepted by argparse but have no CLI
        # wiring yet; tell the user instead of failing silently.
        logger.warning(f"Method '{args.method}' not yet implemented in CLI")
        print("Please use: basic, selenium, or jina")
# Script entry point: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()