Add time-aware traffic logger and auto-retrain system

- Log traffic features with timestamps to CSV every 5s
- Add hour_sin/hour_cos time features (15 → 17 feature vector)
- Auto-retrain from traffic log at configurable interval (default 24h)
- Detect old 15-feature models and switch to learning mode
- SIGUSR1 now retrains from traffic log first, falls back to collect mode
- Add CLI: `ai traffic` (time-bucketed summary), `ai log` (recent entries)
- Add config keys: traffic_log_file, traffic_log_retention_days, retrain_interval, retrain_window

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
kaffa
2026-02-07 10:14:07 +09:00
parent 667c6eac81
commit 11c1ab0134
3 changed files with 337 additions and 10 deletions

View File

@@ -12,6 +12,7 @@ time-profile switching, and automatic escalation.
"""
import copy
import math
import os
import sys
import time
@@ -22,7 +23,7 @@ import logging.handlers
import csv
import pickle
from collections import defaultdict
from datetime import datetime
from datetime import datetime, timedelta
import yaml
@@ -80,6 +81,10 @@ DEFAULT_CONFIG = {
'min_packets_for_sample': 20,
'model_file': '/var/lib/xdp-defense/ai_model.pkl',
'training_data_file': '/var/lib/xdp-defense/training_data.csv',
'traffic_log_file': '/var/lib/xdp-defense/traffic_log.csv',
'traffic_log_retention_days': 7,
'retrain_interval': 86400,
'retrain_window': 86400,
},
}
@@ -267,6 +272,7 @@ class AIDetector:
with open(data_file, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow([
'hour_sin', 'hour_cos',
'total_packets', 'total_bytes', 'tcp_syn_count', 'tcp_other_count',
'udp_count', 'icmp_count', 'other_proto_count', 'unique_ips_approx',
'small_pkt_count', 'large_pkt_count',
@@ -280,17 +286,30 @@ class AIDetector:
log.error("AI training failed: %s", e)
def load_model(self):
"""Load a previously trained model."""
"""Load a previously trained model. Check feature dimension compatibility."""
model_file = self.cfg.get('model_file', '/var/lib/xdp-defense/ai_model.pkl')
if not os.path.exists(model_file):
return False
try:
with open(model_file, 'rb') as f:
data = pickle.load(f)
self.model = data['model']
self.scaler = data['scaler']
model = data['model']
scaler = data['scaler']
expected_features = 17
if hasattr(scaler, 'n_features_in_') and scaler.n_features_in_ != expected_features:
log.warning(
"Model has %d features, expected %d. Switching to learning mode.",
scaler.n_features_in_, expected_features
)
self.is_learning = True
return False
self.model = model
self.scaler = scaler
self.is_learning = False
log.info("AI model loaded from %s", model_file)
log.info("AI model loaded from %s (%d features)",
model_file, getattr(scaler, 'n_features_in_', '?'))
return True
except Exception as e:
log.error("Failed to load AI model: %s", e)
@@ -312,6 +331,51 @@ class AIDetector:
log.error("AI prediction error: %s", e)
return False, 0.0
def retrain_from_log(self):
    """Retrain the model from recent rows of the traffic log CSV.

    Reads the file named by the ``traffic_log_file`` config key and keeps
    rows whose timestamp (column 0, ISO format) lies within the last
    ``retrain_window`` seconds. Columns 2 onward of each kept row are used
    as the feature vector (column 0 is the timestamp, column 1 the hour).

    Rows are skipped, not fatal, when they are malformed, have the wrong
    number of features, carry a timestamp that cannot be compared with the
    cutoff, or contain non-finite values (NaN/inf).

    Returns:
        bool: True when enough samples were found and ``self._train()``
        was invoked; False when the log is missing or empty, too few
        samples are usable, or an unexpected error occurred.
    """
    expected_features = 17  # hour_sin, hour_cos + 15 traffic features
    min_samples = 10

    log_file = self.cfg.get('traffic_log_file', '/var/lib/xdp-defense/traffic_log.csv')
    if not os.path.exists(log_file):
        log.warning("Traffic log not found: %s", log_file)
        return False

    retrain_window = self.cfg.get('retrain_window', 86400)
    cutoff = datetime.now() - timedelta(seconds=retrain_window)

    try:
        samples = []
        with open(log_file, 'r', newline='') as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if header is None:
                log.warning("Traffic log is empty")
                return False

            for row in reader:
                try:
                    ts = datetime.fromisoformat(row[0])
                    if ts < cutoff:
                        continue
                    # Skip timestamp and hour; the rest are features.
                    features = [float(v) for v in row[2:]]
                except (ValueError, IndexError, TypeError):
                    # TypeError: tz-aware timestamp compared with the naive
                    # cutoff. One bad row must not abort the whole retrain.
                    continue
                if len(features) != expected_features:
                    continue
                # float() happily parses "nan"/"inf"; such rows would
                # poison or break the fit, so drop them here.
                if not all(math.isfinite(v) for v in features):
                    continue
                samples.append(features)

        if len(samples) < min_samples:
            log.warning("Not enough recent samples for retrain (%d)", len(samples))
            return False

        log.info("Auto-retrain: loading %d samples from traffic log (window=%ds)",
                 len(samples), retrain_window)
        self.training_data = samples
        # NOTE(review): True is returned whenever _train() returns normally;
        # if _train() swallows its own errors, callers cannot tell a failed
        # fit from a successful one - confirm against _train().
        self._train()
        return True

    except Exception as e:
        log.error("retrain_from_log failed: %s", e)
        return False
# ==================== ProfileManager ====================
@@ -421,6 +485,9 @@ class DDoSDaemon:
if self.ai_detector.enabled:
self.ai_detector.load_model()
self._last_retrain_time = self._get_model_mtime()
self._last_log_cleanup = time.time()
level = self.cfg['general'].get('log_level', 'info').upper()
log.setLevel(getattr(logging, level, logging.INFO))
@@ -482,8 +549,80 @@ class DDoSDaemon:
self._stop_event.set()
def _handle_sigusr1(self, signum, frame):
# Handler for SIGUSR1; signum and frame are not used in the body.
# NOTE(review): this view is a diff rendered without +/- markers, so old and
# new lines are interleaved. The next two statements look like the
# pre-change body (the commit message says SIGUSR1 "now retrains from
# traffic log first, falls back to collect mode") - confirm against the
# real file before treating them as live code.
log.info("SIGUSR1 received, requesting AI retrain...")
self.ai_detector.request_retrain()
# Post-change behaviour: try an immediate retrain from the traffic log.
log.info("SIGUSR1 received, triggering retrain from traffic log...")
if self.ai_detector.retrain_from_log():
# Record the retrain time so the interval-based auto-retrain check
# measures from this manual retrain.
self._last_retrain_time = time.time()
log.info("SIGUSR1 retrain completed successfully")
else:
# retrain_from_log() returned False: fall back to request_retrain().
log.warning("SIGUSR1 retrain failed (falling back to collect mode)")
self.ai_detector.request_retrain()
# ---- Traffic Logging ----
# Column order of traffic_log.csv: two bookkeeping columns, then the
# 17-value feature vector (2 time features, 10 counters, 5 derived ratios).
TRAFFIC_CSV_HEADER = (
    ['timestamp', 'hour']
    + ['hour_sin', 'hour_cos']
    + [
        'total_packets',
        'total_bytes',
        'tcp_syn_count',
        'tcp_other_count',
        'udp_count',
        'icmp_count',
        'other_proto_count',
        'unique_ips_approx',
        'small_pkt_count',
        'large_pkt_count',
    ]
    + [
        'syn_ratio',
        'udp_ratio',
        'icmp_ratio',
        'small_pkt_ratio',
        'avg_pkt_size',
    ]
)
def _log_traffic(self, now, hour, features):
    """Append one row to traffic_log.csv.

    The row is ``now.isoformat()``, ``hour`` formatted to 4 decimals, then
    each value of ``features`` formatted to 6 decimals. The header row
    (``self.TRAFFIC_CSV_HEADER``) is written first when the file does not
    exist yet or is empty.

    Args:
        now: datetime of the sample; written via ``isoformat()``.
        hour: fractional hour of day for the sample.
        features: iterable of numeric feature values.

    Any exception is logged and swallowed; this method does not raise.
    """
    log_file = self.cfg['ai'].get('traffic_log_file', '/var/lib/xdp-defense/traffic_log.csv')
    try:
        # dirname is '' for a bare filename; os.makedirs('') raises, which
        # previously caused every row to be dropped for such paths.
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        write_header = not os.path.exists(log_file) or os.path.getsize(log_file) == 0
        with open(log_file, 'a', newline='') as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(self.TRAFFIC_CSV_HEADER)
            row = [now.isoformat(), f'{hour:.4f}'] + [f'{v:.6f}' for v in features]
            writer.writerow(row)
    except Exception as e:
        log.error("Failed to write traffic log: %s", e)
def _cleanup_traffic_log(self):
    """Remove entries older than retention_days from traffic_log.csv.

    Rows are streamed from the log into a sibling ``<log_file>.tmp`` file,
    which then replaces the log via ``os.replace``. A reader therefore
    never sees a half-written log, and a crash mid-cleanup leaves the
    original intact. The log is left untouched when no row has expired.

    Rows whose first column cannot be parsed or compared as a timestamp
    are kept. The header row, when present, is always preserved.

    Any exception is logged and swallowed; this method does not raise.
    """
    log_file = self.cfg['ai'].get('traffic_log_file', '/var/lib/xdp-defense/traffic_log.csv')
    retention_days = self.cfg['ai'].get('traffic_log_retention_days', 7)
    cutoff = datetime.now() - timedelta(days=retention_days)

    if not os.path.exists(log_file):
        return

    tmp_file = log_file + '.tmp'
    try:
        kept = 0
        dropped = 0
        with open(log_file, 'r', newline='') as src, \
                open(tmp_file, 'w', newline='') as dst:
            reader = csv.reader(src)
            writer = csv.writer(dst)
            header = next(reader, None)
            if header:
                writer.writerow(header)
            for row in reader:
                try:
                    expired = datetime.fromisoformat(row[0]) < cutoff
                except (ValueError, IndexError, TypeError):
                    expired = False  # keep unparseable rows
                if expired:
                    dropped += 1
                else:
                    writer.writerow(row)
                    kept += 1

        if dropped:
            # Carry the permission bits over so the replacement stays
            # readable by whoever could read the original log.
            os.chmod(tmp_file, os.stat(log_file).st_mode & 0o7777)
            os.replace(tmp_file, log_file)
        else:
            os.remove(tmp_file)

        log.info("Traffic log cleanup: kept %d rows (retention=%dd)", kept, retention_days)
    except Exception as e:
        log.error("Traffic log cleanup failed: %s", e)
        # Best effort: do not leave a stale temp file behind.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
def _get_model_mtime(self):
    """Return the model file's last-modified time as a Unix timestamp.

    Falls back to the current time when the file cannot be stat'ed
    (for example, when it does not exist).
    """
    path = self.cfg['ai'].get('model_file', '/var/lib/xdp-defense/ai_model.pkl')
    try:
        mtime = os.stat(path).st_mtime
    except OSError:
        mtime = time.time()
    return mtime
# ---- Worker Threads ----
@@ -552,6 +691,8 @@ class DDoSDaemon:
from xdp_common import read_percpu_features, dump_rate_counters, block_ip, is_whitelisted
prev_features = None
self._last_retrain_time = self._get_model_mtime()
self._last_log_cleanup = time.time()
while not self._stop_event.is_set():
interval = self._ai_interval
@@ -592,13 +733,39 @@ class DDoSDaemon:
avg_pkt_size = deltas[1] / total
deltas.extend([syn_ratio, udp_ratio, icmp_ratio, small_pkt_ratio, avg_pkt_size])
# Add time features (hour_sin, hour_cos) at the front
now = datetime.now()
hour = now.hour + now.minute / 60.0
hour_sin = math.sin(2 * math.pi * hour / 24)
hour_cos = math.cos(2 * math.pi * hour / 24)
deltas_with_time = [hour_sin, hour_cos] + deltas # 17 features
# Log to traffic CSV
self._log_traffic(now, hour, deltas_with_time)
# Periodic log file cleanup (once per day)
if time.time() - self._last_log_cleanup > 86400:
self._cleanup_traffic_log()
self._last_log_cleanup = time.time()
if self.ai_detector.is_learning:
self.ai_detector.collect_sample(deltas)
self.ai_detector.collect_sample(deltas_with_time)
if len(self.ai_detector.training_data) % 100 == 0:
log.debug("AI learning: %d samples collected",
len(self.ai_detector.training_data))
else:
is_anomaly, score = self.ai_detector.predict(deltas)
# Auto-retrain check
retrain_interval = self.cfg['ai'].get('retrain_interval', 86400)
if time.time() - self._last_retrain_time >= retrain_interval:
log.info("Auto-retrain triggered (interval=%ds)", retrain_interval)
if self.ai_detector.retrain_from_log():
self._last_retrain_time = time.time()
log.info("Auto-retrain completed successfully")
else:
log.warning("Auto-retrain failed, will retry next interval")
self._last_retrain_time = time.time()
is_anomaly, score = self.ai_detector.predict(deltas_with_time)
if is_anomaly:
log.warning(
"AI ANOMALY detected: score=%.4f deltas=%s",