# rag-mcp/utils/validation.py

"""Input validation functions for RAG system."""
from config import MAX_CONTENT_LENGTH, MAX_TAG_LENGTH
import re

def validate_content(content: str) -> tuple[bool, str]:
    """
    Check that content is non-blank and within the configured size limit.

    Args:
        content: Content to validate

    Returns:
        (is_valid, error_message); error_message is "" when content is valid
    """
    # Collect the first failing rule (if any) and derive the flag from it.
    problem = ""

    if not (content and content.strip()):
        # Covers None, the empty string, and whitespace-only input.
        problem = "Content cannot be empty"
    elif len(content) > MAX_CONTENT_LENGTH:
        # Length is measured on the raw string, surrounding whitespace included.
        problem = f"Content exceeds maximum length of {MAX_CONTENT_LENGTH} characters"

    return not problem, problem

def validate_tag(tag: str) -> tuple[bool, str]:
    """
    Validate tag string.

    A tag is accepted when it is non-blank, no longer than MAX_TAG_LENGTH,
    and consists solely of ASCII letters, digits, underscores, and hyphens.

    Args:
        tag: Tag to validate

    Returns:
        (is_valid, error_message); error_message is "" when the tag is valid
    """
    if not tag or not tag.strip():
        return False, "Tag cannot be empty"

    if len(tag) > MAX_TAG_LENGTH:
        return False, f"Tag exceeds maximum length of {MAX_TAG_LENGTH} characters"

    # Only allow alphanumeric, underscore, hyphen.
    # fullmatch is used instead of match with '^...$' because '$' also matches
    # just before a trailing newline, which would let a tag like "news\n" pass.
    if not re.fullmatch(r'[a-zA-Z0-9_-]+', tag):
        return False, "Tag must contain only alphanumeric characters, underscores, and hyphens"

    return True, ""

def validate_document_id(doc_id: str) -> tuple[bool, str]:
    """
    Check that a document ID is non-blank and of a plausible length.

    No particular ID format is enforced; only the length is checked.

    Args:
        doc_id: Document ID to validate

    Returns:
        (is_valid, error_message); error_message is "" when the ID is valid
    """
    shortest, longest = 8, 100

    is_blank = not doc_id or not doc_id.strip()
    if is_blank:
        return False, "Document ID cannot be empty"

    # Length is taken from the raw string, so surrounding whitespace counts.
    if shortest <= len(doc_id) <= longest:
        return True, ""

    return False, "Document ID must be between 8 and 100 characters"

# Phrases commonly seen in prompt-injection attempts. Compiled once,
# case-insensitively, and applied one after another in this order.
_INJECTION_PATTERNS = tuple(
    re.compile(phrase, re.IGNORECASE)
    for phrase in (
        r'ignore\s+previous\s+instructions',
        r'ignore\s+all\s+previous',
        r'disregard\s+previous',
        r'you\s+are\s+now',
        r'system\s+prompt',
        r'bypass\s+filters',
    )
)

# Longest run of a single repeated character that is left untouched.
_MAX_CHAR_RUN = 20

# One character followed by at least _MAX_CHAR_RUN further copies of itself.
# DOTALL lets "." match newlines, so floods of blank lines are capped too.
_LONG_RUN = re.compile(r'(.)\1{%d,}' % _MAX_CHAR_RUN, re.DOTALL)


def sanitize_for_prompt(text: str) -> str:
    """
    Sanitize text to reduce the risk of prompt injection attacks.

    Each occurrence of a known injection phrase (matched case-insensitively,
    with any whitespace between words) is replaced by "[FILTERED]". Then any
    run of more than 20 identical characters, newlines included, is cut down
    to 20. Only the fixed phrases listed in this module are filtered.

    Args:
        text: Text to sanitize

    Returns:
        Sanitized text
    """
    sanitized = text
    for pattern in _INJECTION_PATTERNS:
        sanitized = pattern.sub('[FILTERED]', sanitized)

    # Limit repeated characters
    return _LONG_RUN.sub(lambda run: run.group(1) * _MAX_CHAR_RUN, sanitized)