Initial commit: RAG MCP Server with relationship graph

Features: - Vector search with Pinecone + Vertex AI embeddings - Document relationships (link, unlink, related, graph) - Auto-link with LLM analysis - Intelligent merge with Gemini Modular structure: - clients/: Pinecone, Vertex AI - tools/: core, relations, stats - utils/: validation, logging Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 11:05:45 +09:00
commit 2858e0a344
17 changed files with 1450 additions and 0 deletions
--- a/utils/validation.py
+++ b/utils/validation.py
@@ -0,0 +1,91 @@
+"""Input validation functions for RAG system."""
+from config import MAX_CONTENT_LENGTH, MAX_TAG_LENGTH
+import re
+
+def validate_content(content: str) -> tuple[bool, str]:
+    """
+    Check that content is non-blank and within MAX_CONTENT_LENGTH.
+
+    Args:
+        content: Text to check; falsy or whitespace-only values are rejected
+
+    Returns:
+        (True, "") when acceptable, otherwise (False, reason)
+    """
+    if not (content and content.strip()):
+        problem = "Content cannot be empty"
+    elif len(content) > MAX_CONTENT_LENGTH:
+        problem = f"Content exceeds maximum length of {MAX_CONTENT_LENGTH} characters"
+    else:
+        problem = ""
+    return not problem, problem
+
+def validate_tag(tag: str) -> tuple[bool, str]:
+    """
+    Validate a tag: non-blank, at most MAX_TAG_LENGTH characters, only [a-zA-Z0-9_-].
+
+    Args:
+        tag: Tag to validate
+
+    Returns:
+        (is_valid, error_message); error_message is "" when the tag is valid
+    """
+    if not tag or not tag.strip():
+        return False, "Tag cannot be empty"
+
+    if len(tag) > MAX_TAG_LENGTH:
+        return False, f"Tag exceeds maximum length of {MAX_TAG_LENGTH} characters"
+
+    # fullmatch instead of match with '$': '$' also matches before a trailing newline
+    if not re.fullmatch(r'[a-zA-Z0-9_-]+', tag):
+        return False, "Tag must contain only alphanumeric characters, underscores, and hyphens"
+
+    return True, ""
+
+def validate_document_id(doc_id: str) -> tuple[bool, str]:
+    """
+    Check that a document ID is non-blank and 8 to 100 characters long.
+
+    Args:
+        doc_id: Identifier to check; only emptiness and length are inspected
+
+    Returns:
+        (True, "") when acceptable, otherwise (False, reason)
+    """
+    if not (doc_id and doc_id.strip()):
+        return False, "Document ID cannot be empty"
+
+    # Length-only gate: no particular ID syntax (UUID or otherwise) is enforced
+    id_length = len(doc_id)
+    if not 8 <= id_length <= 100:
+        return False, "Document ID must be between 8 and 100 characters"
+    return True, ""
+
+def sanitize_for_prompt(text: str) -> str:
+    """
+    Best-effort removal of common prompt-injection phrases; not a complete defense.
+
+    Args:
+        text: Text to sanitize
+
+    Returns:
+        Text with listed phrases replaced by "[FILTERED]" and character runs capped at 20
+    """
+    # Fixed denylist, matched case-insensitively; one substitution pass per pattern, in order
+    dangerous_patterns = [
+        r'(?i)ignore\s+previous\s+instructions',
+        r'(?i)ignore\s+all\s+previous',
+        r'(?i)disregard\s+previous',
+        r'(?i)you\s+are\s+now',
+        r'(?i)system\s+prompt',
+        r'(?i)bypass\s+filters',
+    ]
+
+    sanitized = text
+    for pattern in dangerous_patterns:
+        sanitized = re.sub(pattern, '[FILTERED]', sanitized)
+
+    # Cap runs of 21+ identical characters at 20; (?s) makes '.' match newlines too
+    sanitized = re.sub(r'(?s)(.)\1{20,}', r'\1' * 20, sanitized)
+
+    return sanitized