stremio-sekai/data_processors/validator.py
2025-10-31 19:03:17 +01:00

142 lines
4.2 KiB
Python

"""
Data validation utilities for scraped content.
"""
from typing import Any, Dict, List, Optional
import re
from datetime import datetime
from utils.logger import setup_logger
# Module-level logger, built by the project's setup_logger helper (utils.logger)
# from this module's __name__.
logger = setup_logger(__name__)
class DataValidator:
    """
    Validator for scraped data with various validation rules.

    Every method is a static method, so the class acts purely as a
    namespace and never needs to be instantiated.
    """

    # Patterns are compiled once and applied with ``fullmatch``. Anchoring
    # with ``$`` is not enough: ``$`` also matches just before a trailing
    # newline, which would accept values that still carry a line break.
    _EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    _URL_RE = re.compile(r'https?://[^\s/$.?#].[^\s]*')
    _PHONE_RE = re.compile(r'\+?1?\d{9,15}')
    _PHONE_SEPARATORS_RE = re.compile(r'[\s\-\(\)]')

    @staticmethod
    def validate_email(email: str) -> bool:
        """
        Validate email format.

        Args:
            email: Candidate email address

        Returns:
            True if the whole value looks like ``local@domain.tld``;
            False otherwise, including for non-string input such as None
        """
        if not isinstance(email, str):
            return False
        return DataValidator._EMAIL_RE.fullmatch(email) is not None

    @staticmethod
    def validate_url(url: str) -> bool:
        """
        Validate URL format.

        Only ``http://`` and ``https://`` URLs without embedded whitespace
        are accepted.

        Args:
            url: Candidate URL

        Returns:
            True if the whole value matches the URL pattern;
            False otherwise, including for non-string input such as None
        """
        if not isinstance(url, str):
            return False
        return DataValidator._URL_RE.fullmatch(url) is not None

    @staticmethod
    def validate_phone(phone: str) -> bool:
        """
        Validate phone number format.

        Whitespace, hyphens and parentheses are removed before matching.
        What remains must be an optional ``+``, an optional ``1`` and then
        9 to 15 digits. This is a basic check - adjust the pattern as needed.

        Args:
            phone: Candidate phone number

        Returns:
            True if the cleaned value matches the phone pattern;
            False otherwise, including for non-string input such as None
        """
        if not isinstance(phone, str):
            return False
        cleaned = DataValidator._PHONE_SEPARATORS_RE.sub('', phone)
        return DataValidator._PHONE_RE.fullmatch(cleaned) is not None

    @staticmethod
    def _is_empty(value: Any) -> bool:
        """Return True for None, blank strings and zero-length containers."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        try:
            return len(value) == 0
        except TypeError:
            # Unsized values (0, 0.0, False, ...) are real data, not "empty".
            return False

    @staticmethod
    def validate_required_fields(data: Dict[str, Any], required_fields: List[str]) -> Dict[str, Any]:
        """
        Validate that required fields are present and non-empty.

        A field counts as empty when its value is None, a string that is
        blank after stripping whitespace, or a zero-length container.
        Numeric zero and False are treated as present values.

        Args:
            data: Data dictionary to validate
            required_fields: List of required field names

        Returns:
            Dictionary with validation results: ``valid`` (bool),
            ``missing_fields`` (names absent from ``data``) and
            ``empty_fields`` (names present but empty), both in the order
            given by ``required_fields``
        """
        missing_fields: List[str] = []
        empty_fields: List[str] = []

        for field in required_fields:
            if field not in data:
                missing_fields.append(field)
            elif DataValidator._is_empty(data[field]):
                empty_fields.append(field)

        return {
            "valid": not missing_fields and not empty_fields,
            "missing_fields": missing_fields,
            "empty_fields": empty_fields
        }

    @staticmethod
    def _type_name(expected_type: Any) -> str:
        """Return a readable name for a type or a tuple of types."""
        if isinstance(expected_type, tuple):
            return " | ".join(DataValidator._type_name(item) for item in expected_type)
        return getattr(expected_type, "__name__", repr(expected_type))

    @staticmethod
    def validate_data_types(data: Dict[str, Any], type_schema: Dict[str, type]) -> Dict[str, Any]:
        """
        Validate data types against a schema.

        Fields listed in the schema but absent from ``data`` are skipped;
        use ``validate_required_fields`` to check presence. The check uses
        ``isinstance``, so subclasses satisfy the schema (a bool satisfies
        ``int``), and a tuple of types may be given to allow alternatives.

        Args:
            data: Data dictionary to validate
            type_schema: Dictionary mapping field names to expected types

        Returns:
            Dictionary with validation results: ``valid`` (bool) and
            ``type_errors``, a list of dictionaries with the keys
            ``field``, ``expected`` and ``actual``
        """
        type_errors = [
            {
                "field": field,
                "expected": DataValidator._type_name(expected_type),
                "actual": type(data[field]).__name__
            }
            for field, expected_type in type_schema.items()
            if field in data and not isinstance(data[field], expected_type)
        ]

        return {
            "valid": not type_errors,
            "type_errors": type_errors
        }

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Clean and normalize text content.

        Runs of whitespace (spaces, tabs, newlines) collapse to a single
        space and leading/trailing whitespace is removed. Punctuation and
        other special characters are kept as they are.

        Args:
            text: Raw text to clean. None yields an empty string; any other
                non-string value is converted with ``str()`` first

        Returns:
            Cleaned text
        """
        if text is None:
            # Avoid leaking the literal string "None" into cleaned output.
            return ""
        if not isinstance(text, str):
            text = str(text)
        # split() with no arguments already discards leading/trailing whitespace.
        return ' '.join(text.split())

    @staticmethod
    def _sanitize_value(value: Any) -> Any:
        """Clean a single value, recursing into nested dicts and lists."""
        if isinstance(value, str):
            return DataValidator.clean_text(value)
        if isinstance(value, dict):
            return DataValidator.sanitize_data(value)
        if isinstance(value, list):
            return [DataValidator._sanitize_value(item) for item in value]
        return value

    @staticmethod
    def sanitize_data(data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize all string fields in a data dictionary.

        Strings are cleaned with ``clean_text``. Nested dictionaries and
        lists are processed recursively at any depth, including
        dictionaries and lists inside lists. Values of other types are
        passed through unchanged. The input dictionary is not modified.

        Args:
            data: Data dictionary to sanitize

        Returns:
            Sanitized data dictionary
        """
        return {
            key: DataValidator._sanitize_value(value)
            for key, value in data.items()
        }