# Features:
# - Vector search with Pinecone + Vertex AI embeddings
# - Document relationships (link, unlink, related, graph)
# - Auto-link with LLM analysis
# - Intelligent merge with Gemini
#
# Modular structure:
# - clients/: Pinecone, Vertex AI
# - tools/: core, relations, stats
# - utils/: validation, logging
#
# Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
92 lines
2.3 KiB
Python
92 lines
2.3 KiB
Python
"""Input validation functions for RAG system."""
|
|
from config import MAX_CONTENT_LENGTH, MAX_TAG_LENGTH
|
|
import re
|
|
|
|
def validate_content(content: str) -> tuple[bool, str]:
    """
    Check that content is non-blank and within the configured size limit.

    Args:
        content: Content to validate

    Returns:
        (is_valid, error_message); error_message is "" when the content is valid
    """
    # Falsy values and whitespace-only strings are both treated as empty.
    is_blank = not content or not content.strip()
    if is_blank:
        return False, "Content cannot be empty"

    # The limit applies to the raw (unstripped) length.
    if len(content) <= MAX_CONTENT_LENGTH:
        return True, ""

    return False, f"Content exceeds maximum length of {MAX_CONTENT_LENGTH} characters"
|
|
|
|
def validate_tag(tag: str) -> tuple[bool, str]:
    """
    Validate tag string.

    A tag is valid when it is non-blank, no longer than MAX_TAG_LENGTH, and
    consists solely of ASCII letters, digits, underscores, and hyphens.

    Args:
        tag: Tag to validate

    Returns:
        (is_valid, error_message); error_message is "" when the tag is valid
    """
    if not tag or not tag.strip():
        return False, "Tag cannot be empty"

    if len(tag) > MAX_TAG_LENGTH:
        return False, f"Tag exceeds maximum length of {MAX_TAG_LENGTH} characters"

    # Only allow alphanumeric, underscore, hyphen.
    # fullmatch is used instead of match with a `$` anchor: `$` also matches
    # just before a trailing newline, which would let "tag\n" through.
    if not re.fullmatch(r'[a-zA-Z0-9_-]+', tag):
        return False, "Tag must contain only alphanumeric characters, underscores, and hyphens"

    return True, ""
|
|
|
|
def validate_document_id(doc_id: str) -> tuple[bool, str]:
    """
    Check that a document ID is non-blank and has a plausible length.

    Args:
        doc_id: Document ID to validate

    Returns:
        (is_valid, error_message); error_message is "" when the ID is valid
    """
    if not (doc_id and doc_id.strip()):
        return False, "Document ID cannot be empty"

    # Deliberately loose: only the raw length is checked, so UUIDs and other
    # ID formats between 8 and 100 characters are all accepted.
    id_length = len(doc_id)
    if 8 <= id_length <= 100:
        return True, ""

    return False, "Document ID must be between 8 and 100 characters"
|
|
|
|
def sanitize_for_prompt(text: str) -> str:
    """
    Sanitize text to reduce the risk of prompt injection attacks.

    Two transformations are applied, in order:
      1. Each occurrence of a known injection phrase (matched
         case-insensitively, with any whitespace between words) is replaced
         by the literal marker "[FILTERED]".
      2. Any run of more than 20 identical characters is truncated to 20.

    Args:
        text: Text to sanitize

    Returns:
        Sanitized text
    """
    # Common prompt injection phrases; (?i) makes each one case-insensitive.
    dangerous_patterns = (
        r'(?i)ignore\s+previous\s+instructions',
        r'(?i)ignore\s+all\s+previous',
        r'(?i)disregard\s+previous',
        r'(?i)you\s+are\s+now',
        r'(?i)system\s+prompt',
        r'(?i)bypass\s+filters',
    )

    sanitized = text
    for pattern in dangerous_patterns:
        sanitized = re.sub(pattern, '[FILTERED]', sanitized)

    # Limit repeated characters to 20 in a row. DOTALL is required so that `.`
    # also matches "\n"; without it, long runs of newlines were never capped.
    sanitized = re.sub(
        r'(.)\1{20,}',
        lambda run: run.group(1) * 20,
        sanitized,
        flags=re.DOTALL,
    )

    return sanitized
|