metrics: add Prometheus metrics for health checks, worker pool, batch processor, and cache

mstgnz · mstgnz · commit 8e4bd9177e3c · 2024-12-29T22:00:50.000+03:00
diff --git a/handler/health.go b/handler/health.go
@@ -6,6 +6,7 @@ import (
 
 	"github.com/gofiber/fiber/v2"
 	"github.com/minio/minio-go/v7"
+	"github.com/mstgnz/cdn/pkg/observability"
 	"github.com/mstgnz/cdn/service"
 )
 
@@ -38,9 +39,9 @@ func (h *HealthChecker) HealthCheck(c *fiber.Ctx) error {
 		c.Status(fiber.StatusServiceUnavailable)
 	}
 
-	status := map[string]interface{}{
+	status := map[string]any{
 		"status": overallStatus,
-		"services": map[string]interface{}{
+		"services": map[string]any{
 			"minio": minioHealth,
 			"aws":   awsHealth,
 			"cache": cacheHealth,
@@ -52,32 +53,60 @@ func (h *HealthChecker) HealthCheck(c *fiber.Ctx) error {
 }
 
 func (h *HealthChecker) checkMinioHealth(ctx context.Context) string {
+	start := time.Now() // reference point for the duration measured in the deferred closure
+	defer func() { // runs on every return path, so failed probes are timed as well
+		duration := time.Since(start).Seconds()
+		observability.ServiceHealthCheckDuration.WithLabelValues("minio").Observe(duration) // probe latency in seconds
+		observability.LastHealthCheckTimestamp.WithLabelValues("minio").Set(float64(time.Now().Unix())) // Unix seconds when the probe finished
+	}()
+
 	if _, err := h.minioClient.ListBuckets(ctx); err != nil {
+		observability.ServiceHealth.WithLabelValues("minio").Set(0) // 0 = unhealthy
 		return "unhealthy: " + err.Error()
 	}
+	observability.ServiceHealth.WithLabelValues("minio").Set(1) // 1 = healthy
 	return "healthy"
 }
 
 func (h *HealthChecker) checkAwsHealth(ctx context.Context) string {
+	start := time.Now() // reference point for the duration measured in the deferred closure
+	defer func() { // runs on every return path, so failed probes are timed as well
+		duration := time.Since(start).Seconds()
+		observability.ServiceHealthCheckDuration.WithLabelValues("aws").Observe(duration) // probe latency in seconds
+		observability.LastHealthCheckTimestamp.WithLabelValues("aws").Set(float64(time.Now().Unix())) // Unix seconds when the probe finished
+	}()
+
 	if _, err := h.awsService.ListBuckets(); err != nil {
+		observability.ServiceHealth.WithLabelValues("aws").Set(0) // 0 = unhealthy; note that ctx is not passed to ListBuckets above
 		return "unhealthy: " + err.Error()
 	}
+	observability.ServiceHealth.WithLabelValues("aws").Set(1) // 1 = healthy
 	return "healthy"
 }
 
 func (h *HealthChecker) checkCacheHealth(ctx context.Context) string {
+	start := time.Now() // reference point for the duration measured in the deferred closure
+	defer func() { // runs on every return path; the measured time covers the Set and, if reached, the Get below
+		duration := time.Since(start).Seconds()
+		observability.ServiceHealthCheckDuration.WithLabelValues("cache").Observe(duration) // probe latency in seconds
+		observability.LastHealthCheckTimestamp.WithLabelValues("cache").Set(float64(time.Now().Unix())) // Unix seconds when the probe finished
+	}()
+
 	testKey := "health:test"
 	testValue := []byte("test")
 
 	// Try to set a test value
 	if err := h.cache.Set(testKey, testValue, time.Second); err != nil {
+		observability.ServiceHealth.WithLabelValues("cache").Set(0) // 0 = unhealthy: the write failed
 		return "unhealthy: set failed - " + err.Error()
 	}
 
 	// Try to get the test value
 	if _, err := h.cache.Get(testKey); err != nil {
+		observability.ServiceHealth.WithLabelValues("cache").Set(0) // 0 = unhealthy: the read-back failed
 		return "unhealthy: get failed - " + err.Error()
 	}
 
+	observability.ServiceHealth.WithLabelValues("cache").Set(1) // 1 = healthy: both the write and the read-back succeeded
 	return "healthy"
 }
diff --git a/pkg/batch/processor.go b/pkg/batch/processor.go
@@ -12,7 +12,7 @@ import (
 // BatchItem represents a single item in a batch
 type BatchItem struct {
 	ID      string
-	Data    interface{}
+	Data    any // spelling change only: any is an alias for interface{}
 	Error   error
 	Success bool
 }
@@ -132,26 +132,40 @@ func (b *BatchProcessor) processBatchWithRetry(items []BatchItem) {
 
 	var processed []BatchItem
 	retries := 0
+	start := time.Now() // taken once, so every duration observed below is cumulative across attempts
 
 	for retries <= b.config.MaxRetries {
-		start := time.Now()
 		processed = b.processor(items)
 		duration := time.Since(start).Seconds()
 
-		observability.StorageOperationDuration.WithLabelValues("batch_process", "bulk").Observe(duration)
-
 		failed := 0
+		success := 0
 		for _, item := range processed {
-			if !item.Success {
+			if item.Success {
+				success++
+			} else {
 				failed++
 			}
 		}
 
+		// Label the attempt by its real outcome. Item counters are per attempt:
+		// the whole batch is re-run on a retry and its items are counted again.
+		status := "success"
+		if failed > 0 {
+			status = "failed"
+		}
+		observability.BatchProcessingDuration.WithLabelValues(status).Observe(duration)
+		observability.BatchItemsProcessed.WithLabelValues("success").Add(float64(success))
+		observability.BatchItemsProcessed.WithLabelValues("failed").Add(float64(failed))
+		observability.BatchProcessorQueueSize.Set(float64(len(b.items)))
+
 		if failed == 0 {
 			break
 		}
 
 		retries++
 		if retries <= b.config.MaxRetries {
+			// Count a retry only when another attempt will actually run.
+			observability.BatchRetries.Inc()
 			b.logger.Warn().
 				Int("retry", retries).
diff --git a/pkg/observability/metrics.go b/pkg/observability/metrics.go
@@ -45,6 +45,130 @@ var (
 		},
 		[]string{"operation", "provider"},
 	)
+
+	// Health Check Metrics
+	ServiceHealth = promauto.NewGaugeVec( // one gauge per "service" label value
+		prometheus.GaugeOpts{
+			Name: "service_health_status",
+			Help: "Current health status of services (1 for healthy, 0 for unhealthy)",
+		},
+		[]string{"service"}, // handler/health.go uses "minio", "aws" and "cache"
+	)
+
+	ServiceHealthCheckDuration = promauto.NewHistogramVec( // probe latency histogram, one per "service"
+		prometheus.HistogramOpts{
+			Name:    "service_health_check_duration_seconds",
+			Help:    "Duration of health checks in seconds",
+			Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5}, // upper bounds from 5ms to 5s
+		},
+		[]string{"service"}, // handler/health.go uses "minio", "aws" and "cache"
+	)
+
+	LastHealthCheckTimestamp = promauto.NewGaugeVec( // handler/health.go sets this from time.Now().Unix(), i.e. Unix seconds
+		prometheus.GaugeOpts{
+			Name: "service_last_health_check_timestamp", // NOTE(review): Prometheus naming guidance suggests a unit suffix such as _seconds
+			Help: "Timestamp of the last health check",
+		},
+		[]string{"service"}, // handler/health.go uses "minio", "aws" and "cache"
+	)
+
+	// Worker Pool Metrics
+	WorkerPoolQueueSize = promauto.NewGauge( // pkg/worker/pool.go sets this to len(p.jobQueue) after a job's response is sent
+		prometheus.GaugeOpts{
+			Name: "worker_pool_queue_size",
+			Help: "Current number of jobs in the worker pool queue",
+		},
+	)
+
+	WorkerPoolActiveWorkers = promauto.NewGauge( // incremented by a worker when it picks up a job
+		prometheus.GaugeOpts{
+			Name: "worker_pool_active_workers",
+			Help: "Current number of active workers in the pool",
+		},
+	)
+
+	WorkerJobProcessingDuration = promauto.NewHistogramVec( // observed per job outcome in pkg/worker/pool.go
+		prometheus.HistogramOpts{
+			Name:    "worker_job_processing_duration_seconds",
+			Help:    "Duration of job processing in seconds",
+			Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5}, // upper bounds from 1ms to 5s
+		},
+		[]string{"status"}, // pkg/worker/pool.go uses "success" and "failure"
+	)
+
+	WorkerJobRetries = promauto.NewCounterVec( // incremented each time a job attempt returns an error
+		prometheus.CounterOpts{
+			Name: "worker_job_retries_total",
+			Help: "Total number of job retries",
+		},
+		[]string{"job_type"}, // pkg/worker/pool.go currently always passes "image_processing"
+	)
+
+	// Batch Processor Metrics
+	BatchProcessorQueueSize = promauto.NewGauge( // pkg/batch/processor.go sets this to len(b.items) after each attempt
+		prometheus.GaugeOpts{
+			Name: "batch_processor_queue_size",
+			Help: "Current number of items in the batch processor queue",
+		},
+	)
+
+	BatchProcessingDuration = promauto.NewHistogramVec( // observed once per batch attempt in pkg/batch/processor.go
+		prometheus.HistogramOpts{
+			Name:    "batch_processing_duration_seconds",
+			Help:    "Duration of batch processing in seconds",
+			Buckets: []float64{.01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, // upper bounds from 10ms to 10s
+		},
+		[]string{"status"}, // one series per status label value
+	)
+
+	BatchItemsProcessed = promauto.NewCounterVec( // added to after every attempt, so retried items are counted again
+		prometheus.CounterOpts{
+			Name: "batch_items_processed_total",
+			Help: "Total number of items processed by the batch processor",
+		},
+		[]string{"status"}, // pkg/batch/processor.go uses "success" and "failed"
+	)
+
+	BatchRetries = promauto.NewCounter( // unlabelled counter, incremented from the retry loop in pkg/batch/processor.go
+		prometheus.CounterOpts{
+			Name: "batch_retries_total",
+			Help: "Total number of batch retries",
+		},
+	)
+
+	// Cache Metrics
+	CacheOperations = promauto.NewCounterVec( // NOTE(review): callers are not visible in this part of the diff
+		prometheus.CounterOpts{
+			Name: "cache_operations_total",
+			Help: "Total number of cache operations",
+		},
+		[]string{"operation", "status"}, // label order matters for WithLabelValues: operation first, then status
+	)
+
+	CacheHitRatio = promauto.NewGaugeVec( // a gauge, so the ratio must be computed and Set by the caller
+		prometheus.GaugeOpts{
+			Name: "cache_hit_ratio",
+			Help: "Cache hit ratio for different operations",
+		},
+		[]string{"operation"},
+	)
+
+	CacheOperationDuration = promauto.NewHistogramVec( // latency histogram per operation and status
+		prometheus.HistogramOpts{
+			Name:    "cache_operation_duration_seconds",
+			Help:    "Duration of cache operations in seconds",
+			Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5}, // upper bounds from 1ms to 500ms; slower operations land in +Inf
+		},
+		[]string{"operation", "status"}, // label order matters for WithLabelValues: operation first, then status
+	)
+
+	CacheSize = promauto.NewGaugeVec( // size in bytes, one gauge per "type" label value
+		prometheus.GaugeOpts{
+			Name: "cache_size_bytes",
+			Help: "Size of cached data in bytes",
+		},
+		[]string{"type"},
+	)
 )
 
 // MetricsHandler HTTP handler for Prometheus metrics
diff --git a/pkg/worker/pool.go b/pkg/worker/pool.go
@@ -3,7 +3,6 @@ package worker
 import (
 	"context"
 	"fmt"
-	"strconv"
 	"sync"
 	"time"
 
@@ -115,22 +114,24 @@ func (p *Pool) worker(id int) {
 				return
 			}
 
+			observability.WorkerPoolActiveWorkers.Inc()
+
 			var err error
 			retries := 0
+			start := time.Now()
 
 			for retries <= p.maxRetries {
-				start := time.Now()
 				err = job.Task()
 				duration := time.Since(start).Seconds()
 
-				// Record metrics
-				observability.ImageProcessingDuration.WithLabelValues("worker_" + strconv.Itoa(id)).Observe(duration)
-
 				if err == nil {
+					observability.WorkerJobProcessingDuration.WithLabelValues("success").Observe(duration)
 					break
 				}
 
 				retries++
+				observability.WorkerJobRetries.WithLabelValues("image_processing").Inc()
+
 				p.logger.Error().
 					Err(err).
 					Str("jobID", job.ID).
@@ -142,10 +143,20 @@ func (p *Pool) worker(id int) {
 					time.Sleep(p.retryDelay)
 					continue
 				}
+
+				observability.WorkerJobProcessingDuration.WithLabelValues("failure").Observe(duration)
 			}
 
 			job.Response <- err
 
+			// Decrement explicitly rather than with defer: a deferred call runs only
+			// when worker returns, not when this job is done, so the gauge would only grow.
+			observability.WorkerPoolActiveWorkers.Dec()
+
+			// Update queue size metric
+			queueSize := float64(len(p.jobQueue))
+			observability.WorkerPoolQueueSize.Set(queueSize)
+
 		case <-p.ctx.Done():
 			return
 		}
diff --git a/service/cache.go b/service/cache.go