"""
|
|
Main entry point for the web scraping project.
|
|
Example usage and demonstration of different scraping methods.
|
|
"""
|
|
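
# Example invocations, derived from the argparse setup in main() below
# (example.com is a placeholder URL):
#   python main.py https://example.com
#   python main.py https://example.com -m selenium -o page.json
#   python main.py https://example.com -m jina -o page.md -v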

import argparse
import logging
from typing import Optional

from scrapers import (
    BasicScraper,
    SeleniumScraper,
    JinaScraper,
    FirecrawlScraper,
    AgentQLScraper,
    MultionScraper,
)
from data_processors.storage import DataStorage
from data_processors.validator import DataValidator
from utils.logger import setup_logger

logger = setup_logger(__name__)


def scrape_basic(url: str, output: Optional[str] = None):
    """Scrape using basic HTTP requests."""
    logger.info(f"Starting basic scrape: {url}")

    with BasicScraper() as scraper:
        result = scraper.scrape(url)

        if result["success"]:
            logger.info(f"Successfully scraped {url}")

            if output:
                storage = DataStorage()
                storage.save_json(result, output)
                logger.info(f"Saved results to {output}")

            return result
        else:
            logger.error(f"Scraping failed: {result.get('error')}")
            return None


def scrape_dynamic(url: str, output: Optional[str] = None):
    """Scrape using Selenium for dynamic content."""
    logger.info(f"Starting Selenium scrape: {url}")

    with SeleniumScraper(headless=True) as scraper:
        result = scraper.scrape(url)

        if result["success"]:
            logger.info(f"Successfully scraped {url}")

            if output:
                storage = DataStorage()
                storage.save_json(result, output)
                logger.info(f"Saved results to {output}")

            return result
        else:
            logger.error(f"Scraping failed: {result.get('error')}")
            return None


def scrape_jina(url: str, output: Optional[str] = None):
    """Scrape using Jina AI."""
    logger.info(f"Starting Jina scrape: {url}")

    with JinaScraper() as scraper:
        result = scraper.scrape(url, return_format="markdown")

        if result["success"]:
            logger.info(f"Successfully scraped {url}")

            if output:
                storage = DataStorage()
                storage.save_text(result["content"], output)
                logger.info(f"Saved results to {output}")

            return result
        else:
            logger.error(f"Scraping failed: {result.get('error')}")
            return None
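

# Sketch: the remaining scrapers could be wired up following the same pattern.
# This assumes FirecrawlScraper exposes the same context-manager and scrape()
# interface as the scrapers above (not verified here), and it is not yet
# hooked into the CLI dispatch in main().
def scrape_firecrawl(url: str, output: Optional[str] = None):
    """Scrape using Firecrawl (sketch; assumes the common scraper interface)."""
    logger.info(f"Starting Firecrawl scrape: {url}")

    with FirecrawlScraper() as scraper:
        result = scraper.scrape(url)

        if result["success"]:
            logger.info(f"Successfully scraped {url}")

            if output:
                storage = DataStorage()
                storage.save_json(result, output)
                logger.info(f"Saved results to {output}")

            return result
        else:
            logger.error(f"Scraping failed: {result.get('error')}")
            return None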


def main():
    """Main entry point with CLI argument parsing."""
    parser = argparse.ArgumentParser(
        description="Web Scraping Framework",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        "url",
        help="Target URL to scrape"
    )

    parser.add_argument(
        "-m", "--method",
        choices=["basic", "selenium", "jina", "firecrawl", "agentql", "multion"],
        default="basic",
        help="Scraping method to use (default: basic)"
    )

    parser.add_argument(
        "-o", "--output",
        help="Output file path (optional)"
    )

    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    if args.verbose:
        # The -v flag was parsed but never used; lower this module's log
        # threshold so debug records are emitted (assumes setup_logger's
        # handlers let DEBUG records through).
        logger.setLevel(logging.DEBUG)

    # Execute appropriate scraper
    if args.method == "basic":
        scrape_basic(args.url, args.output)
    elif args.method == "selenium":
        scrape_dynamic(args.url, args.output)
    elif args.method == "jina":
        scrape_jina(args.url, args.output)
    else:
        logger.warning(f"Method '{args.method}' not yet implemented in CLI")
        print("Please use: basic, selenium, or jina")
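

# One way to extend the dispatch as the remaining scrapers are implemented,
# assuming each handler keeps the (url, output) signature used above
# (a sketch, not part of the current CLI):
#
#     handlers = {
#         "basic": scrape_basic,
#         "selenium": scrape_dynamic,
#         "jina": scrape_jina,
#         # "firecrawl": scrape_firecrawl, ... once implemented
#     }
#     handler = handlers.get(args.method)
#     if handler is not None:
#         handler(args.url, args.output)
#     else:
#         logger.warning(f"Method '{args.method}' not yet implemented in CLI")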


if __name__ == "__main__":
    main()