142 lines
4.2 KiB
Python
142 lines
4.2 KiB
Python
"""
|
|
Data validation utilities for scraped content.
|
|
"""
|
|
from typing import Any, Dict, List, Optional
|
|
import re
|
|
from datetime import datetime
|
|
from utils.logger import setup_logger
|
|
|
|
# Module-level logger, obtained from the project's setup_logger helper and keyed by this module's name.
logger = setup_logger(__name__)
|
|
|
|
|
|
class DataValidator:
    """
    Validator for scraped data with various validation rules.

    Every method is static; the class only serves as a namespace and keeps
    no per-instance state.
    """

    # Patterns are compiled once and applied with ``fullmatch``: with
    # ``re.match`` a trailing ``$`` also matches just before a final newline,
    # which would let values such as "user@example.com\n" pass validation.
    _EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    _URL_RE = re.compile(r'https?://[^\s/$.?#].[^\s]*')
    _PHONE_RE = re.compile(r'\+?1?\d{9,15}')
    _PHONE_SEPARATORS_RE = re.compile(r'[\s\-\(\)]')

    @staticmethod
    def validate_email(email: str) -> bool:
        """
        Validate email format.

        Args:
            email: Candidate email address

        Returns:
            True if the whole value matches the email pattern; False
            otherwise, including when the value is not a string
        """
        if not isinstance(email, str):
            return False
        return DataValidator._EMAIL_RE.fullmatch(email) is not None

    @staticmethod
    def validate_url(url: str) -> bool:
        """
        Validate URL format (http and https only).

        Args:
            url: Candidate URL

        Returns:
            True if the whole value matches the URL pattern; False
            otherwise, including when the value is not a string
        """
        if not isinstance(url, str):
            return False
        return DataValidator._URL_RE.fullmatch(url) is not None

    @staticmethod
    def validate_phone(phone: str) -> bool:
        """
        Validate phone number format.

        Whitespace, hyphens and parentheses are stripped before matching.
        Basic validation - adjust the pattern as needed.

        Args:
            phone: Candidate phone number

        Returns:
            True if the stripped value is an optional "+", an optional "1"
            and 9 to 15 digits; False otherwise, including when the value
            is not a string
        """
        if not isinstance(phone, str):
            return False
        digits = DataValidator._PHONE_SEPARATORS_RE.sub('', phone)
        return DataValidator._PHONE_RE.fullmatch(digits) is not None

    @staticmethod
    def _is_blank(value: Any) -> bool:
        """Return True for None, whitespace-only strings and empty built-in containers."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        if isinstance(value, (bytes, bytearray, list, tuple, dict, set, frozenset)):
            return len(value) == 0
        # Falsy scalars such as 0, 0.0 and False are real values, not "empty".
        return False

    @staticmethod
    def validate_required_fields(data: Dict[str, Any], required_fields: List[str]) -> Dict[str, Any]:
        """
        Validate that required fields are present and non-empty.

        A field counts as empty when its value is None, a blank string or
        an empty built-in container. Numeric zero and False are accepted.

        Args:
            data: Data dictionary to validate
            required_fields: List of required field names

        Returns:
            Dictionary with validation results: "valid" (bool),
            "missing_fields" (list) and "empty_fields" (list)
        """
        missing_fields = [name for name in required_fields if name not in data]
        empty_fields = [
            name
            for name in required_fields
            if name in data and DataValidator._is_blank(data[name])
        ]

        return {
            "valid": not missing_fields and not empty_fields,
            "missing_fields": missing_fields,
            "empty_fields": empty_fields
        }

    @staticmethod
    def _type_name(expected: Any) -> str:
        """Return a readable name for a type or a (possibly nested) tuple of types."""
        if isinstance(expected, tuple):
            return " | ".join(DataValidator._type_name(item) for item in expected)
        return getattr(expected, "__name__", repr(expected))

    @staticmethod
    def validate_data_types(data: Dict[str, Any], type_schema: Dict[str, type]) -> Dict[str, Any]:
        """
        Validate data types against a schema.

        Fields listed in the schema but absent from the data are skipped.
        A schema entry may be a single type or a tuple of types, exactly as
        accepted by ``isinstance``.

        Args:
            data: Data dictionary to validate
            type_schema: Dictionary mapping field names to expected types

        Returns:
            Dictionary with validation results: "valid" (bool) and
            "type_errors" (list of dicts with "field", "expected", "actual")
        """
        type_errors = [
            {
                "field": field,
                "expected": DataValidator._type_name(expected_type),
                "actual": type(data[field]).__name__
            }
            for field, expected_type in type_schema.items()
            if field in data and not isinstance(data[field], expected_type)
        ]

        return {
            "valid": not type_errors,
            "type_errors": type_errors
        }

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Clean and normalize text content.

        Runs of whitespace are collapsed to single spaces and leading and
        trailing whitespace is removed. Special characters are preserved.

        Args:
            text: Raw text to clean; None yields an empty string and other
                non-string values are converted with ``str`` first

        Returns:
            Cleaned text
        """
        if text is None:
            # Avoid leaking the literal string "None" into cleaned output.
            return ""
        if not isinstance(text, str):
            text = str(text)

        # split() with no arguments drops leading/trailing whitespace too,
        # so no separate strip() is needed.
        return ' '.join(text.split())

    @staticmethod
    def _sanitize_value(value: Any) -> Any:
        """Clean one value, recursing into nested dictionaries and lists."""
        if isinstance(value, str):
            return DataValidator.clean_text(value)
        if isinstance(value, dict):
            return DataValidator.sanitize_data(value)
        if isinstance(value, list):
            return [DataValidator._sanitize_value(item) for item in value]
        return value

    @staticmethod
    def sanitize_data(data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize all string fields in a data dictionary.

        Strings are cleaned with ``clean_text``; dictionaries and lists are
        processed recursively at any nesting depth. Other values are
        copied through unchanged. The input dictionary is not modified.

        Args:
            data: Data dictionary to sanitize

        Returns:
            Sanitized data dictionary
        """
        return {
            key: DataValidator._sanitize_value(value)
            for key, value in data.items()
        }
|
|
|