""" Data storage utilities for saving scraped content. """ import json import csv from pathlib import Path from typing import Any, Dict, List, Optional from datetime import datetime from utils.logger import setup_logger from config import DATA_DIR logger = setup_logger(__name__) class DataStorage: """ Storage handler for scraped data supporting multiple formats. """ def __init__(self, output_dir: Optional[Path] = None): """ Initialize data storage. Args: output_dir: Directory for storing data (default: DATA_DIR from config) """ self.output_dir = output_dir or DATA_DIR self.output_dir.mkdir(exist_ok=True) self.logger = logger def save_json( self, data: Any, filename: str, indent: int = 2, append: bool = False ) -> Path: """ Save data as JSON file. Args: data: Data to save filename: Output filename indent: JSON indentation append: Append to existing file if True Returns: Path to saved file """ filepath = self.output_dir / filename try: if append and filepath.exists(): with open(filepath, 'r', encoding='utf-8') as f: existing_data = json.load(f) if isinstance(existing_data, list) and isinstance(data, list): data = existing_data + data else: self.logger.warning("Cannot append: data types don't match") with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, indent=indent, ensure_ascii=False) self.logger.info(f"Saved JSON data to {filepath}") return filepath except Exception as e: self.logger.error(f"Failed to save JSON: {str(e)}") raise def save_csv( self, data: List[Dict[str, Any]], filename: str, fieldnames: Optional[List[str]] = None, append: bool = False ) -> Path: """ Save data as CSV file. Args: data: List of dictionaries to save filename: Output filename fieldnames: CSV column names (auto-detected if None) append: Append to existing file if True Returns: Path to saved file """ filepath = self.output_dir / filename if not data: self.logger.warning("No data to save") return filepath try: if fieldnames is None: fieldnames = list(data[0].keys()) mode = 'a' if append and filepath.exists() else 'w' write_header = not (append and filepath.exists()) with open(filepath, mode, newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) if write_header: writer.writeheader() writer.writerows(data) self.logger.info(f"Saved CSV data to {filepath}") return filepath except Exception as e: self.logger.error(f"Failed to save CSV: {str(e)}") raise def save_text(self, content: str, filename: str, append: bool = False) -> Path: """ Save content as text file. Args: content: Text content to save filename: Output filename append: Append to existing file if True Returns: Path to saved file """ filepath = self.output_dir / filename try: mode = 'a' if append else 'w' with open(filepath, mode, encoding='utf-8') as f: f.write(content) if append: f.write('\n') self.logger.info(f"Saved text data to {filepath}") return filepath except Exception as e: self.logger.error(f"Failed to save text: {str(e)}") raise def create_timestamped_filename(self, base_name: str, extension: str) -> str: """ Create a filename with timestamp. Args: base_name: Base filename extension: File extension (without dot) Returns: Timestamped filename """ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') return f"{base_name}_{timestamp}.{extension}" def load_json(self, filename: str) -> Any: """ Load data from JSON file. Args: filename: Input filename Returns: Loaded data """ filepath = self.output_dir / filename try: with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) self.logger.info(f"Loaded JSON data from {filepath}") return data except Exception as e: self.logger.error(f"Failed to load JSON: {str(e)}") raise