184 lines
5.3 KiB
Python
184 lines
5.3 KiB
Python
"""
|
|
Data storage utilities for saving scraped content.
|
|
"""
|
|
import json
|
|
import csv
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
from datetime import datetime
|
|
from utils.logger import setup_logger
|
|
from config import DATA_DIR
|
|
|
|
# Module-level logger; each DataStorage instance keeps a reference to it as ``self.logger``.
logger = setup_logger(__name__)
|
|
|
|
|
|
class DataStorage:
    """
    Storage handler for scraped data supporting multiple formats.

    Files are read and written as UTF-8 beneath ``output_dir``. Every
    save/load method logs a failure and then re-raises the original
    exception, so callers decide how to recover.
    """

    def __init__(self, output_dir: Optional[Path] = None):
        """
        Initialize data storage.

        Args:
            output_dir: Directory for storing data (default: DATA_DIR from config)
        """
        # Path() also accepts a plain string, and parents=True lets a nested
        # directory be created in one go instead of failing on a missing parent.
        self.output_dir = Path(output_dir or DATA_DIR)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logger

    def save_json(
        self,
        data: Any,
        filename: str,
        indent: int = 2,
        append: bool = False
    ) -> Path:
        """
        Save data as JSON file.

        Args:
            data: Data to save
            filename: Output filename
            indent: JSON indentation
            append: Append to existing file if True. Appending only happens
                when both the stored value and ``data`` are lists; otherwise
                a warning is logged and the file is overwritten with ``data``.

        Returns:
            Path to saved file

        Raises:
            Exception: Any error raised while reading, serializing or
                writing is logged and re-raised unchanged.
        """
        filepath = self.output_dir / filename

        try:
            if append and filepath.exists():
                with open(filepath, 'r', encoding='utf-8') as f:
                    existing_data = json.load(f)

                if isinstance(existing_data, list) and isinstance(data, list):
                    data = existing_data + data
                else:
                    # NOTE(review): the file is still overwritten below in this
                    # case; kept as-is for backward compatibility.
                    self.logger.warning("Cannot append: data types don't match")

            # Serialize first: opening with 'w' truncates the file, so a
            # serialization error after that point would destroy existing data.
            payload = json.dumps(data, indent=indent, ensure_ascii=False)

            filepath.parent.mkdir(parents=True, exist_ok=True)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(payload)

            self.logger.info(f"Saved JSON data to {filepath}")
            return filepath

        except Exception as e:
            self.logger.error(f"Failed to save JSON: {str(e)}")
            raise

    @staticmethod
    def _read_csv_header(filepath: Path) -> List[str]:
        """Return the first row of an existing CSV file ([] if it has none)."""
        with open(filepath, 'r', newline='', encoding='utf-8') as f:
            return next(csv.reader(f), [])

    def save_csv(
        self,
        data: List[Dict[str, Any]],
        filename: str,
        fieldnames: Optional[List[str]] = None,
        append: bool = False
    ) -> Path:
        """
        Save data as CSV file.

        Args:
            data: List of dictionaries to save
            filename: Output filename
            fieldnames: CSV column names. When None they are auto-detected as
                the union of keys over all rows, in order of first appearance.
                When appending to a non-empty file, the existing header order
                is reused so new rows line up with the old ones.
            append: Append to existing file if True

        Returns:
            Path to saved file. With empty ``data`` nothing is written and
            the path is returned as-is.

        Raises:
            ValueError: If explicit ``fieldnames`` are given and a row
                contains a key that is not among them.
            Exception: Any other I/O error is logged and re-raised unchanged.
        """
        filepath = self.output_dir / filename

        if not data:
            self.logger.warning("No data to save")
            return filepath

        try:
            # An existing but empty file has no header yet, so treat it like a
            # fresh file and write one.
            appending = (
                append and filepath.exists() and filepath.stat().st_size > 0
            )

            if fieldnames is None:
                # Looking only at the first row would make DictWriter reject
                # later rows that carry additional keys.
                detected = list(dict.fromkeys(key for row in data for key in row))

                if appending:
                    header = self._read_csv_header(filepath)
                    extras = [key for key in detected if key not in header]
                    if extras:
                        self.logger.warning(
                            f"CSV columns not in existing header: {extras}"
                        )
                    # Known columns keep the file's order; unknown ones trail.
                    fieldnames = header + extras
                else:
                    fieldnames = detected

            mode = 'a' if appending else 'w'

            filepath.parent.mkdir(parents=True, exist_ok=True)
            with open(filepath, mode, newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)

                if not appending:
                    writer.writeheader()

                writer.writerows(data)

            self.logger.info(f"Saved CSV data to {filepath}")
            return filepath

        except Exception as e:
            self.logger.error(f"Failed to save CSV: {str(e)}")
            raise

    def save_text(self, content: str, filename: str, append: bool = False) -> Path:
        """
        Save content as text file.

        Args:
            content: Text content to save
            filename: Output filename
            append: Append to existing file if True. In append mode a newline
                is written after ``content`` so successive entries stay on
                separate lines.

        Returns:
            Path to saved file

        Raises:
            Exception: Any I/O error is logged and re-raised unchanged.
        """
        filepath = self.output_dir / filename

        try:
            mode = 'a' if append else 'w'

            filepath.parent.mkdir(parents=True, exist_ok=True)
            with open(filepath, mode, encoding='utf-8') as f:
                f.write(content)
                if append:
                    f.write('\n')

            self.logger.info(f"Saved text data to {filepath}")
            return filepath

        except Exception as e:
            self.logger.error(f"Failed to save text: {str(e)}")
            raise

    def create_timestamped_filename(self, base_name: str, extension: str) -> str:
        """
        Create a filename with timestamp.

        Args:
            base_name: Base filename
            extension: File extension; a leading dot is tolerated and ignored

        Returns:
            Timestamped filename of the form ``<base_name>_YYYYMMDD_HHMMSS.<extension>``
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Accept ".json" as well as "json" without producing "name..json".
        return f"{base_name}_{timestamp}.{extension.lstrip('.')}"

    def load_json(self, filename: str) -> Any:
        """
        Load data from JSON file.

        Args:
            filename: Input filename

        Returns:
            Loaded data

        Raises:
            Exception: Any error while opening or parsing the file is logged
                and re-raised unchanged.
        """
        filepath = self.output_dir / filename

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            self.logger.info(f"Loaded JSON data from {filepath}")
            return data

        except Exception as e:
            self.logger.error(f"Failed to load JSON: {str(e)}")
            raise
|
|
|