"""Data validation utilities for scraped content."""

import re
from collections.abc import Sized
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Union

from utils.logger import setup_logger

logger = setup_logger(__name__)

# Patterns are compiled once at import time and applied with ``fullmatch``.
# ``re.match`` combined with a trailing ``$`` also accepts a string that ends
# in a newline, which would let a value such as "user@example.com" followed by
# a line break pass validation.
_EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
_URL_RE = re.compile(r'https?://[^\s/$.?#].[^\s]*')
_PHONE_RE = re.compile(r'\+?1?\d{9,15}')
# Formatting characters tolerated (and removed) before a phone number is checked.
_PHONE_SEPARATORS_RE = re.compile(r'[\s\-\(\)]')

# A schema entry is anything ``isinstance`` accepts as its second argument.
_TypeSpec = Union[type, Tuple[type, ...]]


def _is_empty(value: Any) -> bool:
    """Return True when *value* carries no content.

    Empty means ``None``, a blank or whitespace-only string, or a sized
    container of length zero. Falsy scalars such as ``0``, ``0.0`` and
    ``False`` are real values and are NOT considered empty.
    """
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    if isinstance(value, Sized):
        return len(value) == 0
    return False


def _type_name(expected: Any) -> str:
    """Return a readable name for a type or a tuple of types."""
    if isinstance(expected, tuple):
        return " | ".join(_type_name(member) for member in expected)
    # Fall back to repr() for objects that have no __name__ attribute.
    return getattr(expected, "__name__", repr(expected))


class DataValidator:
    """
    Validator for scraped data with various validation rules.

    Every method is a static method, so the class acts as a namespace and
    never needs to be instantiated.
    """

    @staticmethod
    def validate_email(email: str) -> bool:
        """
        Validate email format.

        Args:
            email: Candidate email address

        Returns:
            True if the whole string matches the email pattern; False
            otherwise, including when ``email`` is not a string.
        """
        return isinstance(email, str) and _EMAIL_RE.fullmatch(email) is not None

    @staticmethod
    def validate_url(url: str) -> bool:
        """
        Validate URL format (``http`` and ``https`` schemes only).

        Args:
            url: Candidate URL

        Returns:
            True if the whole string matches the URL pattern; False
            otherwise, including when ``url`` is not a string.
        """
        return isinstance(url, str) and _URL_RE.fullmatch(url) is not None

    @staticmethod
    def validate_phone(phone: str) -> bool:
        """
        Validate phone number format.

        Whitespace, hyphens and parentheses are removed first; what remains
        must be an optional ``+``, an optional ``1`` and 9 to 15 digits.

        Args:
            phone: Candidate phone number

        Returns:
            True if the cleaned number matches the pattern; False otherwise,
            including when ``phone`` is not a string.
        """
        if not isinstance(phone, str):
            return False
        # Basic validation - adjust pattern as needed
        cleaned = _PHONE_SEPARATORS_RE.sub('', phone)
        return _PHONE_RE.fullmatch(cleaned) is not None

    @staticmethod
    def validate_required_fields(data: Dict[str, Any], required_fields: List[str]) -> Dict[str, Any]:
        """
        Validate that required fields are present and non-empty.

        A field is empty when its value is ``None``, a blank or
        whitespace-only string, or a zero-length container. Values such as
        ``0`` and ``False`` count as present.

        Args:
            data: Data dictionary to validate
            required_fields: List of required field names

        Returns:
            Dictionary with validation results: ``valid`` (bool),
            ``missing_fields`` (names absent from ``data``) and
            ``empty_fields`` (names present but empty).
        """
        missing_fields = [field for field in required_fields if field not in data]
        empty_fields = [
            field
            for field in required_fields
            if field in data and _is_empty(data[field])
        ]

        return {
            "valid": not missing_fields and not empty_fields,
            "missing_fields": missing_fields,
            "empty_fields": empty_fields,
        }

    @staticmethod
    def validate_data_types(data: Dict[str, Any], type_schema: Dict[str, _TypeSpec]) -> Dict[str, Any]:
        """
        Validate data types against a schema.

        Fields named in the schema but absent from ``data`` are skipped;
        use ``validate_required_fields`` to check presence. The check uses
        ``isinstance``, so subclasses pass (note that ``bool`` is a subclass
        of ``int``).

        Args:
            data: Data dictionary to validate
            type_schema: Dictionary mapping field names to the expected type,
                or to a tuple of acceptable types

        Returns:
            Dictionary with validation results: ``valid`` (bool) and
            ``type_errors``, a list of dictionaries with the keys ``field``,
            ``expected`` and ``actual``.
        """
        type_errors = [
            {
                "field": field,
                "expected": _type_name(expected_type),
                "actual": type(data[field]).__name__,
            }
            for field, expected_type in type_schema.items()
            if field in data and not isinstance(data[field], expected_type)
        ]

        return {
            "valid": not type_errors,
            "type_errors": type_errors,
        }

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Clean and normalize text content.

        Args:
            text: Raw text to clean

        Returns:
            The text with leading and trailing whitespace removed and every
            internal run of whitespace collapsed to a single space. A
            non-string argument is returned as ``str(text)`` without further
            cleaning.
        """
        if not isinstance(text, str):
            return str(text)

        # split() with no arguments drops leading/trailing whitespace and
        # splits on any whitespace run, so no separate strip() is needed.
        return ' '.join(text.split())

    @staticmethod
    def _sanitize_value(value: Any) -> Any:
        """Clean one value, recursing into nested dictionaries and lists."""
        if isinstance(value, str):
            return DataValidator.clean_text(value)
        if isinstance(value, dict):
            return DataValidator.sanitize_data(value)
        if isinstance(value, list):
            return [DataValidator._sanitize_value(item) for item in value]
        return value

    @staticmethod
    def sanitize_data(data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize all string fields in a data dictionary.

        String values are cleaned with ``clean_text``. Nested dictionaries
        and lists are processed recursively at any depth, including
        dictionaries and lists found inside lists. Keys and values of other
        types are left unchanged. The input is not modified.

        Args:
            data: Data dictionary to sanitize

        Returns:
            Sanitized data dictionary
        """
        return {
            key: DataValidator._sanitize_value(value)
            for key, value in data.items()
        }