feat(phase-5-3): Logger, Metrics, 알림 시스템 통합

Phase 5-3 모니터링 강화 작업의 통합을 완료했습니다.

변경사항:
- Logger 통합: console.log를 구조화된 로깅으로 전환 (9개 파일)
  - JSON 기반 로그, 환경별 자동 전환 (개발/프로덕션)
  - 타입 안전성 보장, 성능 측정 타이머 내장

- Metrics 통합: 실시간 성능 모니터링 시스템 연결 (3개 파일)
  - Circuit Breaker 상태 추적 (api_call_count, api_error_count, circuit_breaker_state)
  - 재시도(Retry) 횟수 추적 (retry_count)
  - OpenAI API 응답 시간 측정 (api_call_duration)

- 알림 통합: 장애 자동 알림 시스템 구현 (2개 파일)
  - Circuit Breaker OPEN 전환 시 (CLOSED → OPEN, HALF_OPEN → OPEN) → 관리자 Telegram 알림
  - 모든 재시도 소진 후 최종 실패 (retry_exhausted) → 관리자 Telegram 알림
  - Rate Limiting 적용 (1시간에 1회)

- 문서 업데이트:
  - CLAUDE.md: coder 에이전트 설명 강화 (20년+ 시니어 전문가)
  - README.md, docs/: 아키텍처 문서 추가

영향받은 파일: 16개 (수정 14개, 신규 2개)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
kappa
2026-01-19 21:23:38 +09:00
parent 410676e322
commit eee934391a
16 changed files with 675 additions and 777 deletions

View File

@@ -1,3 +1,6 @@
import { metrics } from './metrics';
import { notifyAdmin, NotificationOptions } from '../services/notification';
/**
* Circuit Breaker pattern implementation
*
@@ -42,6 +45,10 @@ export interface CircuitBreakerOptions {
resetTimeoutMs?: number;
/** Time window in ms for monitoring failures (default: 120000) */
monitoringWindowMs?: number;
/** Service name for metrics (default: 'unknown') */
serviceName?: string;
/** Admin notification options (optional) */
notification?: NotificationOptions;
}
/**
@@ -82,17 +89,25 @@ export class CircuitBreaker {
private readonly failureThreshold: number;
private readonly resetTimeoutMs: number;
private readonly monitoringWindowMs: number;
private readonly serviceName: string;
private readonly notification?: NotificationOptions;
/**
 * Creates a circuit breaker, applying built-in defaults for any option that
 * is not supplied, then logs the resolved settings and records the starting
 * state metric.
 *
 * @param options - Optional overrides. Defaults applied here:
 *   failureThreshold 5, resetTimeoutMs 60000, monitoringWindowMs 120000,
 *   serviceName 'unknown'. `notification` stays undefined when omitted.
 */
constructor(options?: CircuitBreakerOptions) {
  // Normalise a missing options argument once instead of optional-chaining each read.
  const overrides: CircuitBreakerOptions = options ?? {};

  this.failureThreshold = overrides.failureThreshold ?? 5;
  this.resetTimeoutMs = overrides.resetTimeoutMs ?? 60000;
  this.monitoringWindowMs = overrides.monitoringWindowMs ?? 120000;
  this.serviceName = overrides.serviceName ?? 'unknown';
  this.notification = overrides.notification;

  const { serviceName, failureThreshold, resetTimeoutMs, monitoringWindowMs } = this;
  console.log('[CircuitBreaker] Initialized', {
    serviceName,
    failureThreshold,
    resetTimeoutMs,
    monitoringWindowMs,
  });

  // Record the initial state metric (value 0 is the CLOSED state).
  const labels = { service: serviceName };
  metrics.record('circuit_breaker_state', 0, labels);
}
/**
@@ -137,6 +152,9 @@ export class CircuitBreaker {
this.openedAt = null;
this.successCount = 0;
this.failureCount = 0;
// 상태 메트릭 기록 (CLOSED)
metrics.record('circuit_breaker_state', 0, { service: this.serviceName });
}
/**
@@ -162,6 +180,9 @@ export class CircuitBreaker {
if (elapsed >= this.resetTimeoutMs) {
console.log('[CircuitBreaker] Reset timeout reached, transitioning to HALF_OPEN');
this.state = CircuitState.HALF_OPEN;
// 상태 메트릭 기록 (HALF_OPEN)
metrics.record('circuit_breaker_state', 2, { service: this.serviceName });
}
}
}
@@ -177,6 +198,9 @@ export class CircuitBreaker {
this.state = CircuitState.CLOSED;
this.failures = [];
this.openedAt = null;
// 상태 메트릭 기록 (CLOSED)
metrics.record('circuit_breaker_state', 0, { service: this.serviceName });
}
}
@@ -197,6 +221,25 @@ export class CircuitBreaker {
console.log('[CircuitBreaker] Half-open test failed, reopening circuit');
this.state = CircuitState.OPEN;
this.openedAt = now;
// 상태 메트릭 기록 (OPEN)
metrics.record('circuit_breaker_state', 1, { service: this.serviceName });
// 관리자 알림 전송 (HALF_OPEN → OPEN 전환)
if (this.notification) {
notifyAdmin(
'circuit_breaker',
{
service: this.serviceName,
error: 'Test request failed in HALF_OPEN state',
context: 'Circuit breaker reopened after failed test'
},
this.notification
).catch(() => {
// 알림 실패는 무시 (메인 로직에 영향 없음)
});
}
return;
}
@@ -208,6 +251,24 @@ export class CircuitBreaker {
);
this.state = CircuitState.OPEN;
this.openedAt = now;
// 상태 메트릭 기록 (OPEN)
metrics.record('circuit_breaker_state', 1, { service: this.serviceName });
// 관리자 알림 전송 (CLOSED → OPEN 전환)
if (this.notification) {
notifyAdmin(
'circuit_breaker',
{
service: this.serviceName,
error: error.message || 'Unknown error',
context: `Failure threshold: ${this.failureThreshold}, Current failures: ${this.failures.length}`
},
this.notification
).catch(() => {
// 알림 실패는 무시 (메인 로직에 영향 없음)
});
}
}
}
}
@@ -234,6 +295,9 @@ export class CircuitBreaker {
throw error;
}
// API 호출 카운트 증가
metrics.increment('api_call_count', { service: this.serviceName });
try {
// Execute the function
const result = await fn();
@@ -243,6 +307,9 @@ export class CircuitBreaker {
return result;
} catch (error) {
// API 에러 카운트 증가
metrics.increment('api_error_count', { service: this.serviceName });
// Record failure
const err = error instanceof Error ? error : new Error(String(error));
this.onFailure(err);

View File

@@ -5,11 +5,14 @@
* ```typescript
* const result = await retryWithBackoff(
* async () => fetch('https://api.example.com'),
* { maxRetries: 3, initialDelayMs: 1000 }
* { maxRetries: 3, initialDelayMs: 1000, serviceName: 'external-api' }
* );
* ```
*/
import { metrics } from './metrics';
import { notifyAdmin, NotificationOptions } from '../services/notification';
/**
* Configuration options for retry behavior
*/
@@ -24,6 +27,10 @@ export interface RetryOptions {
backoffMultiplier?: number;
/** Whether to add random jitter to delays (default: true) */
jitter?: boolean;
/** Service name for metrics tracking (optional) */
serviceName?: string;
/** Notification options for admin alerts (optional) */
notification?: NotificationOptions;
}
/**
@@ -103,6 +110,8 @@ export async function retryWithBackoff<T>(
maxDelayMs = 10000,
backoffMultiplier = 2,
jitter = true,
serviceName = 'unknown',
notification,
} = options || {};
let lastError: Error;
@@ -127,6 +136,22 @@ export async function retryWithBackoff<T>(
`[Retry] All ${maxRetries + 1} attempts failed. Last error:`,
lastError.message
);
// Send admin notification if configured
if (notification) {
notifyAdmin(
'retry_exhausted',
{
service: serviceName,
error: lastError.message,
context: `All ${maxRetries + 1} attempts failed`,
},
notification
).catch(() => {
// Ignore notification failures
});
}
throw new RetryError(
`Operation failed after ${maxRetries + 1} attempts: ${lastError.message}`,
maxRetries + 1,
@@ -134,6 +159,14 @@ export async function retryWithBackoff<T>(
);
}
// Track retry metric (only for actual retries, not first attempt)
if (attempt > 0) {
metrics.increment('retry_count', {
service: serviceName,
attempt: String(attempt),
});
}
// Calculate delay for next retry
const delay = calculateDelay(
attempt,