Initial commit: RAG MCP Server with relationship graph
Features:
- Vector search with Pinecone + Vertex AI embeddings
- Document relationships (link, unlink, related, graph)
- Auto-link with LLM analysis
- Intelligent merge with Gemini

Modular structure:
- clients/: Pinecone, Vertex AI
- tools/: core, relations, stats
- utils/: validation, logging

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
11
utils/__init__.py
Normal file
11
utils/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
||||
"""Utility modules for RAG system."""
|
||||
from .validation import (
    sanitize_for_prompt,
    validate_content,
    validate_document_id,
    validate_tag,
)
from .logging import setup_logging, get_logger

# Names re-exported as the package's public API. sanitize_for_prompt is
# included so every public helper of the validation module is reachable
# from the package root, like its sibling validators.
__all__ = [
    "validate_content",
    "validate_tag",
    "validate_document_id",
    "sanitize_for_prompt",
    "setup_logging",
    "get_logger",
]
|
||||
30
utils/logging.py
Normal file
30
utils/logging.py
Normal file
@@ -0,0 +1,30 @@
|
||||
"""Logging configuration for RAG system."""
|
||||
import logging
|
||||
import sys
|
||||
|
||||
def setup_logging(level: str = "INFO") -> None:
    """
    Configure root logging to write formatted records to stdout.

    Note that ``logging.basicConfig`` does nothing when the root logger
    already has handlers, so only the first effective call takes hold.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
            matched case-insensitively. Any value that does not resolve to
            a numeric level constant on the ``logging`` module falls back
            to INFO.
    """
    # A bare getattr() lookup accepts any upper-case attribute of the
    # logging module (e.g. BASIC_FORMAT, which is a string); passing that
    # on makes basicConfig() raise ValueError instead of using the INFO
    # fallback. Only integer level constants are accepted here.
    resolved = getattr(logging, str(level).upper(), None)
    if not isinstance(resolved, int):
        resolved = logging.INFO

    logging.basicConfig(
        level=resolved,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[logging.StreamHandler(sys.stdout)],
    )
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
    """
    Return the logger registered under ``name``.

    Thin wrapper over ``logging.getLogger`` so callers obtain loggers through
    this package rather than importing ``logging`` themselves.

    Args:
        name: Logger name (usually __name__)

    Returns:
        Logger instance
    """
    return logging.getLogger(name)
|
||||
91
utils/validation.py
Normal file
91
utils/validation.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Input validation functions for RAG system."""
|
||||
from config import MAX_CONTENT_LENGTH, MAX_TAG_LENGTH
|
||||
import re
|
||||
|
||||
def validate_content(content: str) -> tuple[bool, str]:
    """
    Validate content string.

    Content is rejected when it is empty or whitespace-only, or when its
    length exceeds MAX_CONTENT_LENGTH.

    Args:
        content: Content to validate

    Returns:
        (is_valid, error_message); error_message is "" when valid
    """
    if not content or not content.strip():
        return False, "Content cannot be empty"

    # Length is measured on the raw value, surrounding whitespace included.
    if len(content) > MAX_CONTENT_LENGTH:
        return False, f"Content exceeds maximum length of {MAX_CONTENT_LENGTH} characters"

    return True, ""
|
||||
|
||||
def validate_tag(tag: str) -> tuple[bool, str]:
    """
    Validate tag string.

    A tag must be non-empty, at most MAX_TAG_LENGTH characters long, and
    consist solely of ASCII letters, digits, underscores and hyphens.

    Args:
        tag: Tag to validate

    Returns:
        (is_valid, error_message); error_message is "" when valid
    """
    if not tag or not tag.strip():
        return False, "Tag cannot be empty"

    if len(tag) > MAX_TAG_LENGTH:
        return False, f"Tag exceeds maximum length of {MAX_TAG_LENGTH} characters"

    # Only allow alphanumeric, underscore, hyphen.
    # fullmatch is required: with re.match and a "$" anchor, "$" also matches
    # just before a trailing newline, so "tag\n" would be accepted.
    if not re.fullmatch(r'[a-zA-Z0-9_-]+', tag):
        return False, "Tag must contain only alphanumeric characters, underscores, and hyphens"

    return True, ""
|
||||
|
||||
def validate_document_id(doc_id: str) -> tuple[bool, str]:
    """
    Validate document ID format.

    Only emptiness and length are checked; the characters of the ID are not
    inspected.

    Args:
        doc_id: Document ID to validate

    Returns:
        (is_valid, error_message); error_message is "" when valid
    """
    if not doc_id or not doc_id.strip():
        return False, "Document ID cannot be empty"

    # Length-only sanity check. No UUID structure is enforced, which keeps
    # the validator usable with other ID formats.
    if len(doc_id) < 8 or len(doc_id) > 100:
        return False, "Document ID must be between 8 and 100 characters"

    return True, ""
|
||||
|
||||
def sanitize_for_prompt(text: str) -> str:
    """
    Sanitize text to reduce the risk of prompt injection attacks.

    Known injection phrases are replaced with ``[FILTERED]`` (matched
    case-insensitively, with any whitespace between the words), and any
    character repeated more than 20 times in a row is cut down to 20
    repetitions. This is a phrase blacklist, so it is best-effort only.

    Args:
        text: Text to sanitize

    Returns:
        Sanitized text
    """
    # Remove common prompt injection patterns
    dangerous_patterns = [
        r'(?i)ignore\s+previous\s+instructions',
        r'(?i)ignore\s+all\s+previous',
        r'(?i)disregard\s+previous',
        r'(?i)you\s+are\s+now',
        r'(?i)system\s+prompt',
        r'(?i)bypass\s+filters',
    ]

    sanitized = text
    for pattern in dangerous_patterns:
        sanitized = re.sub(pattern, '[FILTERED]', sanitized)

    # Limit repeated characters. DOTALL makes "." match newlines as well, so
    # long runs of blank lines are capped just like any other character.
    sanitized = re.sub(
        r'(.)\1{20,}',
        lambda run: run.group(1) * 20,
        sanitized,
        flags=re.DOTALL,
    )

    return sanitized
|
||||
Reference in New Issue
Block a user