- Add new `app` package to manage application initialization and lifecycle
- Refactor `main.go` to use new application management approach
- Implement graceful shutdown with context timeout and signal handling
- Add dependency injection container initialization
- Enhance logging with configurable log levels and structured logging
- Update configuration loading and server initialization process
- Modify Jitsi configuration in `.env` for custom deployment
- Improve error handling and logging throughout application startup
- Centralize application startup and shutdown logic in single package

Introduces a more robust and flexible application management system with improved initialization, logging, and shutdown capabilities.
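The graceful shutdown described above (context timeout plus OS signal handling) generally follows the pattern sketched below. This is a minimal, standard-library-only illustration of the approach, not the actual `app` package code; the listen address and the 15-second timeout are placeholder values.

package main

import (
    "context"
    "errors"
    "log"
    "net/http"
    "os/signal"
    "syscall"
    "time"
)

func main() {
    // Cancel the root context when SIGINT or SIGTERM is received.
    ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
    defer stop()

    srv := &http.Server{Addr: ":8080", Handler: http.DefaultServeMux}

    go func() {
        if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
            log.Printf("server error: %v", err)
            stop()
        }
    }()

    // Block until a shutdown signal arrives, then drain in-flight
    // requests within a bounded timeout.
    <-ctx.Done()
    shutdownCtx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
    defer cancel()
    if err := srv.Shutdown(shutdownCtx); err != nil {
        log.Printf("graceful shutdown failed: %v", err)
    }
}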
326 lines · 8.4 KiB · Go
package monitoring

import (
    "context"
    "sync"
    "time"

    "attune-heart-therapy/internal/errors"
    "attune-heart-therapy/internal/logger"
)

// ErrorTracker tracks and monitors application errors
type ErrorTracker struct {
    log           *logger.Logger
    errorCounts   map[errors.ErrorCode]int64
    errorTimes    map[errors.ErrorCode][]time.Time
    mu            sync.RWMutex
    windowSize    time.Duration
    maxWindowSize int
}

// NewErrorTracker creates a new error tracker
func NewErrorTracker() *ErrorTracker {
    return &ErrorTracker{
        log:           logger.New("error_tracker"),
        errorCounts:   make(map[errors.ErrorCode]int64),
        errorTimes:    make(map[errors.ErrorCode][]time.Time),
        windowSize:    time.Hour,
        maxWindowSize: 1000, // Keep last 1000 occurrences per error type
    }
}

// TrackError records an error occurrence
func (et *ErrorTracker) TrackError(ctx context.Context, appErr *errors.AppError, userID string, endpoint string) {
    et.mu.Lock()
    defer et.mu.Unlock()

    now := time.Now()

    // Increment error count
    et.errorCounts[appErr.Code]++

    // Add timestamp to error times
    if _, exists := et.errorTimes[appErr.Code]; !exists {
        et.errorTimes[appErr.Code] = make([]time.Time, 0)
    }

    et.errorTimes[appErr.Code] = append(et.errorTimes[appErr.Code], now)

    // Keep only recent errors within window
    et.cleanupOldErrors(appErr.Code, now)

    // Log error with tracking information
    fields := map[string]interface{}{
        "error_code":   appErr.Code,
        "error_count":  et.errorCounts[appErr.Code],
        "recent_count": len(et.errorTimes[appErr.Code]),
        "endpoint":     endpoint,
        "http_status":  appErr.HTTPStatus,
    }

    if userID != "" {
        fields["user_id"] = userID
    }

    if traceID := ctx.Value("trace_id"); traceID != nil {
        fields["trace_id"] = traceID
    }

    // Add error fields if available
    if appErr.Fields != nil {
        for k, v := range appErr.Fields {
            fields["error_"+k] = v
        }
    }

    // Check for error rate spikes
    if et.isErrorSpike(appErr.Code) {
        et.log.Error("Error rate spike detected", appErr.Cause, fields)
        et.alertOnErrorSpike(appErr.Code, fields)
    } else if appErr.HTTPStatus >= 500 {
        et.log.Error("Server error tracked", appErr.Cause, fields)
    } else {
        et.log.Warn("Client error tracked", fields)
    }
}

// cleanupOldErrors removes errors outside the time window
func (et *ErrorTracker) cleanupOldErrors(code errors.ErrorCode, now time.Time) {
    cutoff := now.Add(-et.windowSize)
    times := et.errorTimes[code]

    // Find first index within window
    start := 0
    for i, t := range times {
        if t.After(cutoff) {
            start = i
            break
        }
    }

    // Keep only recent errors
    et.errorTimes[code] = times[start:]

    // Limit to max window size
    if len(et.errorTimes[code]) > et.maxWindowSize {
        et.errorTimes[code] = et.errorTimes[code][len(et.errorTimes[code])-et.maxWindowSize:]
    }
}

// isErrorSpike checks if there's an unusual spike in error rate
func (et *ErrorTracker) isErrorSpike(code errors.ErrorCode) bool {
    times := et.errorTimes[code]
    if len(times) < 10 {
        return false
    }

    // Check if we have more than 10 errors in the last 5 minutes
    fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
    recentCount := 0

    for _, t := range times {
        if t.After(fiveMinutesAgo) {
            recentCount++
        }
    }

    return recentCount >= 10
}

// alertOnErrorSpike handles error spike alerts
func (et *ErrorTracker) alertOnErrorSpike(code errors.ErrorCode, fields map[string]interface{}) {
    // In a production system, this would send alerts to monitoring systems
    // like PagerDuty, Slack, or email notifications
    et.log.Error("ALERT: Error spike detected", nil, map[string]interface{}{
        "alert_type":   "error_spike",
        "error_code":   code,
        "recent_count": len(et.errorTimes[code]),
        "details":      fields,
    })
}

// GetErrorStats returns error statistics
func (et *ErrorTracker) GetErrorStats() map[string]interface{} {
    et.mu.RLock()
    defer et.mu.RUnlock()

    stats := map[string]interface{}{
        "total_errors":  make(map[errors.ErrorCode]int64),
        "recent_errors": make(map[errors.ErrorCode]int),
        "error_rates":   make(map[errors.ErrorCode]float64),
    }

    now := time.Now()
    oneHourAgo := now.Add(-time.Hour)

    for code, count := range et.errorCounts {
        stats["total_errors"].(map[errors.ErrorCode]int64)[code] = count

        // Count recent errors
        recentCount := 0
        if times, exists := et.errorTimes[code]; exists {
            for _, t := range times {
                if t.After(oneHourAgo) {
                    recentCount++
                }
            }
        }

        stats["recent_errors"].(map[errors.ErrorCode]int)[code] = recentCount
        stats["error_rates"].(map[errors.ErrorCode]float64)[code] = float64(recentCount) / 60.0 // errors per minute
    }

    return stats
}

// MetricsCollector collects application metrics
type MetricsCollector struct {
    log            *logger.Logger
    requestCounts  map[string]int64
    responseTimes  map[string][]time.Duration
    activeRequests int64
    mu             sync.RWMutex
}

// NewMetricsCollector creates a new metrics collector
func NewMetricsCollector() *MetricsCollector {
    return &MetricsCollector{
        log:           logger.New("metrics_collector"),
        requestCounts: make(map[string]int64),
        responseTimes: make(map[string][]time.Duration),
    }
}

// RecordRequest records a request metric
func (mc *MetricsCollector) RecordRequest(endpoint string, method string, statusCode int, duration time.Duration) {
    mc.mu.Lock()
    defer mc.mu.Unlock()

    key := method + " " + endpoint
    mc.requestCounts[key]++

    if _, exists := mc.responseTimes[key]; !exists {
        mc.responseTimes[key] = make([]time.Duration, 0)
    }

    mc.responseTimes[key] = append(mc.responseTimes[key], duration)

    // Keep only last 1000 response times per endpoint
    if len(mc.responseTimes[key]) > 1000 {
        mc.responseTimes[key] = mc.responseTimes[key][len(mc.responseTimes[key])-1000:]
    }

    // Log slow requests
    if duration > 5*time.Second {
        mc.log.Warn("Slow request detected", map[string]interface{}{
            "endpoint":    endpoint,
            "method":      method,
            "status_code": statusCode,
            "duration_ms": duration.Milliseconds(),
        })
    }
}

// IncrementActiveRequests increments the active request counter
func (mc *MetricsCollector) IncrementActiveRequests() {
    mc.mu.Lock()
    defer mc.mu.Unlock()
    mc.activeRequests++
}

// DecrementActiveRequests decrements the active request counter
func (mc *MetricsCollector) DecrementActiveRequests() {
    mc.mu.Lock()
    defer mc.mu.Unlock()
    if mc.activeRequests > 0 {
        mc.activeRequests--
    }
}

// GetMetrics returns collected metrics
func (mc *MetricsCollector) GetMetrics() map[string]interface{} {
    mc.mu.RLock()
    defer mc.mu.RUnlock()

    metrics := map[string]interface{}{
        "active_requests":    mc.activeRequests,
        "request_counts":     make(map[string]int64),
        "avg_response_times": make(map[string]float64),
    }

    // Copy request counts
    for k, v := range mc.requestCounts {
        metrics["request_counts"].(map[string]int64)[k] = v
    }

    // Calculate average response times
    for endpoint, times := range mc.responseTimes {
        if len(times) > 0 {
            var total time.Duration
            for _, t := range times {
                total += t
            }
            avg := total / time.Duration(len(times))
            metrics["avg_response_times"].(map[string]float64)[endpoint] = float64(avg.Milliseconds())
        }
    }

    return metrics
}

// Monitor provides comprehensive application monitoring
type Monitor struct {
    ErrorTracker     *ErrorTracker
    MetricsCollector *MetricsCollector
    log              *logger.Logger
}

// NewMonitor creates a new application monitor
func NewMonitor() *Monitor {
    return &Monitor{
        ErrorTracker:     NewErrorTracker(),
        MetricsCollector: NewMetricsCollector(),
        log:              logger.New("monitor"),
    }
}

// GetHealthStatus returns comprehensive health status
func (m *Monitor) GetHealthStatus() map[string]interface{} {
    errorStats := m.ErrorTracker.GetErrorStats()
    metrics := m.MetricsCollector.GetMetrics()

    status := map[string]interface{}{
        "status":    "healthy",
        "timestamp": time.Now().UTC().Format(time.RFC3339),
        "errors":    errorStats,
        "metrics":   metrics,
    }

    // Determine overall health based on error rates
    if recentErrors, ok := errorStats["recent_errors"].(map[errors.ErrorCode]int); ok {
        totalRecentErrors := 0
        for _, count := range recentErrors {
            totalRecentErrors += count
        }

        if totalRecentErrors > 100 { // More than 100 errors in the last hour
            status["status"] = "degraded"
        }

        if totalRecentErrors > 500 { // More than 500 errors in the last hour
            status["status"] = "unhealthy"
        }
    }

    return status
}

// Global monitor instance
var globalMonitor = NewMonitor()

// GetGlobalMonitor returns the global monitor instance
func GetGlobalMonitor() *Monitor {
    return globalMonitor
}
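For orientation, the sketch below shows one way the monitoring package could be wired into an HTTP server: a middleware that records request metrics and a JSON health endpoint backed by GetHealthStatus. This is illustrative only; the import path, route names, and listen address are assumptions, and the real wiring presumably lives in the new `app` package described in the commit summary.

package main

import (
    "encoding/json"
    "net/http"
    "time"

    "attune-heart-therapy/internal/monitoring" // assumed import path
)

// statusRecorder captures the response status code so it can be reported
// to the metrics collector.
type statusRecorder struct {
    http.ResponseWriter
    status int
}

func (r *statusRecorder) WriteHeader(code int) {
    r.status = code
    r.ResponseWriter.WriteHeader(code)
}

// withMonitoring tracks active requests and records per-request metrics
// around the wrapped handler.
func withMonitoring(next http.Handler) http.Handler {
    mon := monitoring.GetGlobalMonitor()
    return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
        mon.MetricsCollector.IncrementActiveRequests()
        defer mon.MetricsCollector.DecrementActiveRequests()

        rec := &statusRecorder{ResponseWriter: w, status: http.StatusOK}
        start := time.Now()
        next.ServeHTTP(rec, req)
        mon.MetricsCollector.RecordRequest(req.URL.Path, req.Method, rec.status, time.Since(start))
    })
}

func main() {
    mux := http.NewServeMux()
    // Expose the monitor's health status as a JSON endpoint.
    mux.HandleFunc("/health", func(w http.ResponseWriter, req *http.Request) {
        w.Header().Set("Content-Type", "application/json")
        _ = json.NewEncoder(w).Encode(monitoring.GetGlobalMonitor().GetHealthStatus())
    })

    _ = http.ListenAndServe(":8080", withMonitoring(mux))
}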