stremio-sekai/data_processors/storage.py
2025-10-31 19:03:17 +01:00

184 lines
5.3 KiB
Python

"""
Data storage utilities for saving scraped content.
"""
import json
import csv
from pathlib import Path
from typing import Any, Dict, List, Optional
from datetime import datetime
from utils.logger import setup_logger
from config import DATA_DIR
# Module-level logger; DataStorage.__init__ stores a reference to it as `self.logger`.
logger = setup_logger(__name__)
class DataStorage:
    """
    Storage handler for scraped data supporting multiple formats.

    Every file is written beneath ``output_dir``. JSON, CSV and plain-text
    writers are provided, plus a JSON loader and a helper that builds
    timestamped filenames.
    """

    def __init__(self, output_dir: Optional[Path] = None):
        """
        Initialize data storage.

        Args:
            output_dir: Directory for storing data (default: DATA_DIR from config)
        """
        # Path() lets callers hand in a plain string as well as a Path.
        self.output_dir = Path(output_dir or DATA_DIR)
        # parents=True: a nested output directory must not fail just because
        # its parent has not been created yet.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logger

    def save_json(
        self,
        data: Any,
        filename: str,
        indent: int = 2,
        append: bool = False
    ) -> Path:
        """
        Save data as JSON file.

        When ``append`` is True and the file already exists, the existing
        content is loaded first. If both the existing content and ``data``
        are lists they are concatenated; otherwise a warning is logged and
        the file is overwritten with ``data``.

        Args:
            data: Data to save
            filename: Output filename
            indent: JSON indentation
            append: Append to existing file if True

        Returns:
            Path to saved file

        Raises:
            Exception: Any error raised while reading, serializing or
                writing is logged and re-raised unchanged.
        """
        filepath = self.output_dir / filename
        try:
            if append and filepath.exists():
                with open(filepath, 'r', encoding='utf-8') as f:
                    existing_data = json.load(f)
                if isinstance(existing_data, list) and isinstance(data, list):
                    data = existing_data + data
                else:
                    self.logger.warning("Cannot append: data types don't match")

            # Serialize BEFORE opening the file for writing: opening with 'w'
            # truncates, so a serialization error must not be able to wipe
            # out data that is already on disk.
            payload = json.dumps(data, indent=indent, ensure_ascii=False)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(payload)

            self.logger.info(f"Saved JSON data to {filepath}")
            return filepath
        except Exception as e:
            self.logger.error(f"Failed to save JSON: {e}")
            raise

    @staticmethod
    def _detect_fieldnames(rows: List[Dict[str, Any]]) -> List[str]:
        """Return the union of all row keys, in first-seen order."""
        seen: Dict[str, None] = {}
        for row in rows:
            seen.update(dict.fromkeys(row))
        return list(seen)

    @staticmethod
    def _read_csv_header(filepath: Path) -> List[str]:
        """Return the first CSV row of ``filepath`` (empty list if none)."""
        with open(filepath, 'r', newline='', encoding='utf-8') as f:
            return next(csv.reader(f), [])

    def save_csv(
        self,
        data: List[Dict[str, Any]],
        filename: str,
        fieldnames: Optional[List[str]] = None,
        append: bool = False
    ) -> Path:
        """
        Save data as CSV file.

        Column names are auto-detected from the keys of *all* rows when
        ``fieldnames`` is None; rows lacking a column get an empty cell.
        When appending to an existing, non-empty file without explicit
        ``fieldnames``, the file's header is reused if it covers every key
        of the new rows, so values land in the matching columns.

        Args:
            data: List of dictionaries to save
            filename: Output filename
            fieldnames: CSV column names (auto-detected if None)
            append: Append to existing file if True

        Returns:
            Path to saved file. If ``data`` is empty nothing is written and
            the path may not exist.

        Raises:
            Exception: Any error raised while writing is logged and
                re-raised unchanged.
        """
        filepath = self.output_dir / filename
        if not data:
            self.logger.warning("No data to save")
            return filepath

        try:
            # An existing but empty file has no header yet, so it is treated
            # like a fresh file and gets one.
            appending = (
                append
                and filepath.exists()
                and filepath.stat().st_size > 0
            )

            if fieldnames is None:
                fieldnames = self._detect_fieldnames(data)
                if appending:
                    header = self._read_csv_header(filepath)
                    if header and set(fieldnames) <= set(header):
                        # Follow the column order already in the file.
                        fieldnames = header
                    else:
                        self.logger.warning(
                            "CSV columns differ from existing header; "
                            "appended rows may be misaligned"
                        )

            mode = 'a' if appending else 'w'
            with open(filepath, mode, newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                if not appending:
                    writer.writeheader()
                writer.writerows(data)

            self.logger.info(f"Saved CSV data to {filepath}")
            return filepath
        except Exception as e:
            self.logger.error(f"Failed to save CSV: {e}")
            raise

    def save_text(self, content: str, filename: str, append: bool = False) -> Path:
        """
        Save content as text file.

        In append mode a newline is written after ``content``; in overwrite
        mode ``content`` is written exactly as given.

        Args:
            content: Text content to save
            filename: Output filename
            append: Append to existing file if True

        Returns:
            Path to saved file

        Raises:
            Exception: Any error raised while writing is logged and
                re-raised unchanged.
        """
        filepath = self.output_dir / filename
        try:
            mode = 'a' if append else 'w'
            with open(filepath, mode, encoding='utf-8') as f:
                f.write(content)
                if append:
                    f.write('\n')

            self.logger.info(f"Saved text data to {filepath}")
            return filepath
        except Exception as e:
            self.logger.error(f"Failed to save text: {e}")
            raise

    def create_timestamped_filename(self, base_name: str, extension: str) -> str:
        """
        Create a filename with timestamp.

        The timestamp is the current local time formatted as
        ``YYYYMMDD_HHMMSS``.

        Args:
            base_name: Base filename
            extension: File extension (a leading dot is tolerated)

        Returns:
            Timestamped filename
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Strip leading dots so ".json" does not yield "name_ts..json".
        suffix = extension.lstrip('.')
        return f"{base_name}_{timestamp}.{suffix}"

    def load_json(self, filename: str) -> Any:
        """
        Load data from JSON file.

        Args:
            filename: Input filename

        Returns:
            Loaded data

        Raises:
            Exception: Any error raised while opening or parsing the file
                is logged and re-raised unchanged.
        """
        filepath = self.output_dir / filename
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            self.logger.info(f"Loaded JSON data from {filepath}")
            return data
        except Exception as e:
            self.logger.error(f"Failed to load JSON: {e}")
            raise