Initial commit: RAG MCP Server with relationship graph

Features:
- Vector search with Pinecone + Vertex AI embeddings
- Document relationships (link, unlink, related, graph)
- Auto-link with LLM analysis
- Intelligent merge with Gemini

Modular structure:
- clients/: Pinecone, Vertex AI
- tools/: core, relations, stats
- utils/: validation, logging

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
kappa
2026-02-03 11:05:45 +09:00
commit 2858e0a344
17 changed files with 1450 additions and 0 deletions

91
utils/validation.py Normal file
View File

@@ -0,0 +1,91 @@
"""Input validation functions for RAG system."""
from config import MAX_CONTENT_LENGTH, MAX_TAG_LENGTH
import re
def validate_content(content: str) -> tuple[bool, str]:
    """Check that content is non-blank and within the configured size limit.

    Args:
        content: Text to check.

    Returns:
        A ``(is_valid, error_message)`` pair. The message is an empty
        string when the content is valid.
    """
    # Reject missing, empty, and whitespace-only input alike.
    if not (content and content.strip()):
        return (False, "Content cannot be empty")

    # The limit is applied to the raw string, surrounding whitespace included.
    if len(content) <= MAX_CONTENT_LENGTH:
        return (True, "")
    return (False, f"Content exceeds maximum length of {MAX_CONTENT_LENGTH} characters")
def validate_tag(tag: str) -> tuple[bool, str]:
    """Check that a tag is non-blank, short enough, and uses only safe characters.

    A valid tag consists solely of ASCII letters, digits, underscores and
    hyphens, and is at most ``MAX_TAG_LENGTH`` characters long.

    Args:
        tag: Tag to validate.

    Returns:
        A ``(is_valid, error_message)`` pair. The message is an empty
        string when the tag is valid.
    """
    if not tag or not tag.strip():
        return False, "Tag cannot be empty"
    if len(tag) > MAX_TAG_LENGTH:
        return False, f"Tag exceeds maximum length of {MAX_TAG_LENGTH} characters"
    # Use fullmatch rather than match with a trailing "$": "$" also matches
    # just before a final newline, which would let a tag like "abc\n" through.
    if not re.fullmatch(r'[a-zA-Z0-9_-]+', tag):
        return False, "Tag must contain only alphanumeric characters, underscores, and hyphens"
    return True, ""
def validate_document_id(doc_id: str) -> tuple[bool, str]:
    """Check that a document ID is non-blank and has a plausible length.

    Args:
        doc_id: Document ID to validate.

    Returns:
        A ``(is_valid, error_message)`` pair. The message is an empty
        string when the ID is valid.
    """
    # Reject missing, empty, and whitespace-only IDs alike.
    if not (doc_id and doc_id.strip()):
        return (False, "Document ID cannot be empty")

    # Only the length is constrained, so UUIDs and other ID schemes both pass.
    id_length = len(doc_id)
    if 8 <= id_length <= 100:
        return (True, "")
    return (False, "Document ID must be between 8 and 100 characters")
# Phrases commonly seen in prompt-injection attempts. Each is matched
# case-insensitively, with any amount of whitespace between the words.
# Compiled once at import time instead of on every call.
_PROMPT_INJECTION_PATTERNS = tuple(
    re.compile(phrase, re.IGNORECASE)
    for phrase in (
        r'ignore\s+previous\s+instructions',
        r'ignore\s+all\s+previous',
        r'disregard\s+previous',
        r'you\s+are\s+now',
        r'system\s+prompt',
        r'bypass\s+filters',
    )
)

# A run of 21 or more copies of the same character. DOTALL lets "." match
# newlines as well, so long runs of blank lines are capped like any other run.
_LONG_CHAR_RUN = re.compile(r'(.)\1{20,}', re.DOTALL)


def sanitize_for_prompt(text: str) -> str:
    """Filter known prompt-injection phrases and cap long character runs.

    This is a best-effort denylist: each known phrase is replaced with
    ``[FILTERED]``, and any run of more than 20 identical characters
    (newlines included) is shortened to exactly 20.

    Args:
        text: Text to sanitize.

    Returns:
        The sanitized text.
    """
    sanitized = text
    # Patterns are applied one after another, in order, so each pass sees the
    # output of the previous one.
    for pattern in _PROMPT_INJECTION_PATTERNS:
        sanitized = pattern.sub('[FILTERED]', sanitized)

    # Collapse each over-long run down to 20 copies of its character.
    return _LONG_CHAR_RUN.sub(lambda run: run.group(1) * 20, sanitized)