"""Input validation functions for RAG system.""" from config import MAX_CONTENT_LENGTH, MAX_TAG_LENGTH import re def validate_content(content: str) -> tuple[bool, str]: """ Validate content string. Args: content: Content to validate Returns: (is_valid, error_message) """ if not content or not content.strip(): return False, "Content cannot be empty" if len(content) > MAX_CONTENT_LENGTH: return False, f"Content exceeds maximum length of {MAX_CONTENT_LENGTH} characters" return True, "" def validate_tag(tag: str) -> tuple[bool, str]: """ Validate tag string. Args: tag: Tag to validate Returns: (is_valid, error_message) """ if not tag or not tag.strip(): return False, "Tag cannot be empty" if len(tag) > MAX_TAG_LENGTH: return False, f"Tag exceeds maximum length of {MAX_TAG_LENGTH} characters" # Only allow alphanumeric, underscore, hyphen if not re.match(r'^[a-zA-Z0-9_-]+$', tag): return False, "Tag must contain only alphanumeric characters, underscores, and hyphens" return True, "" def validate_document_id(doc_id: str) -> tuple[bool, str]: """ Validate document ID format. Args: doc_id: Document ID to validate Returns: (is_valid, error_message) """ if not doc_id or not doc_id.strip(): return False, "Document ID cannot be empty" # Basic UUID format check (flexible for various ID formats) if len(doc_id) < 8 or len(doc_id) > 100: return False, "Document ID must be between 8 and 100 characters" return True, "" def sanitize_for_prompt(text: str) -> str: """ Sanitize text to prevent prompt injection attacks. Args: text: Text to sanitize Returns: Sanitized text """ # Remove common prompt injection patterns dangerous_patterns = [ r'(?i)ignore\s+previous\s+instructions', r'(?i)ignore\s+all\s+previous', r'(?i)disregard\s+previous', r'(?i)you\s+are\s+now', r'(?i)system\s+prompt', r'(?i)bypass\s+filters', ] sanitized = text for pattern in dangerous_patterns: sanitized = re.sub(pattern, '[FILTERED]', sanitized) # Limit repeated characters sanitized = re.sub(r'(.)\1{20,}', r'\1' * 20, sanitized) return sanitized