57 lines
1.6 KiB
Python
57 lines
1.6 KiB
Python
"""
Configuration module for the web scraping project.

Loads environment variables and defines project-wide settings.
"""
|
|
import os
|
|
from pathlib import Path
|
|
from dotenv import load_dotenv
|
|
|
|
# Load environment variables.
# Reads a .env file (when python-dotenv finds one) into the process
# environment so the os.getenv() lookups in this module can see its values.
load_dotenv()
|
|
|
|
# Project Paths
# BASE_DIR is the directory that contains this file; the data, logs and cache
# directories are all resolved relative to it.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / "data"
LOGS_DIR = BASE_DIR / "logs"
CACHE_DIR = BASE_DIR / "cache"

# Create directories if they don't exist.
# NOTE: this runs at import time, so importing this module touches the
# filesystem. exist_ok=True keeps repeated imports from failing when the
# directories are already present.
DATA_DIR.mkdir(exist_ok=True)
LOGS_DIR.mkdir(exist_ok=True)
CACHE_DIR.mkdir(exist_ok=True)
|
|
|
|
# API Keys
# Each key is read from the environment and falls back to an empty string when
# the variable is unset, so a missing key is a falsy str rather than None.
JINA_API_KEY = os.getenv("JINA_API_KEY", "")
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "")
AGENTQL_API_KEY = os.getenv("AGENTQL_API_KEY", "")
MULTION_API_KEY = os.getenv("MULTION_API_KEY", "")
TWOCAPTCHA_API_KEY = os.getenv("TWOCAPTCHA_API_KEY", "")
|
|
|
|
# Scraping Configuration
def _env_number(name, default, cast):
    """Return environment variable *name* converted with *cast*.

    Args:
        name: Name of the environment variable to read.
        default: Value returned as-is when the variable is unset or blank
            (e.g. a ``NAME=`` line in a .env file).
        cast: Callable such as ``int`` or ``float`` applied to the raw string.

    Raises:
        ValueError: If the variable is set to something *cast* rejects. The
            message names the variable so the bad setting is easy to find.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        return cast(raw.strip())
    except ValueError as err:
        raise ValueError(
            f"Invalid value for environment variable {name}: {raw!r}"
        ) from err


RATE_LIMIT_DELAY = _env_number("RATE_LIMIT_DELAY", 2.0, float)
MAX_RETRIES = _env_number("MAX_RETRIES", 3, int)
TIMEOUT = _env_number("TIMEOUT", 30, int)

# Browser-like User-Agent used unless USER_AGENT is set in the environment.
USER_AGENT = os.getenv(
    "USER_AGENT",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
)
|
|
|
|
# Request Headers
# Default headers for outgoing requests; User-Agent comes from the USER_AGENT
# setting defined earlier in this module.
# NOTE(review): Accept-Encoding advertises "br" (Brotli). Confirm the HTTP
# client in use can decode Brotli responses, otherwise bodies may arrive
# undecoded.
DEFAULT_HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
|
|
|
|
# Selenium Configuration
# Hard-coded (not environment-driven) browser automation settings.
SELENIUM_HEADLESS = True
# Presumably seconds, as Selenium's implicit wait expects - confirm at the
# call site that consumes this value.
SELENIUM_IMPLICIT_WAIT = 10
|
|
|
|
# Cache Configuration
# Cache entry lifetime in seconds (60 s * 60 min = 1 hour).
CACHE_EXPIRATION = 60 * 60
|
|
|