
Commit 2c04acf

Improve cache observability
1 parent 83d72ef commit 2c04acf


6 files changed, +82 -47 lines changed


app/proxy/proxy.go

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ func Handle(w http.ResponseWriter, r *http.Request) {
 		}
 		monitor.ErrorToSentry(err, map[string]string{"request": fmt.Sprintf("%+v", rpcReq), "response": fmt.Sprintf("%+v", rpcRes)})
 		observeFailure(metrics.GetDuration(r), rpcReq.Method, metrics.FailureKindNet)
-		logger.Log().Errorf("error calling lbrynet: %v, request: %+v", err, rpcReq)
+		logger.Log().Errorf("error calling sdk method %s: %s", rpcReq.Method, err)
 		return
 	}

app/query/cache.go

Lines changed: 18 additions & 14 deletions
@@ -8,7 +8,6 @@ import (
 	"fmt"
 	"time"

-	"github.com/OdyseeTeam/odysee-api/internal/metrics"
 	"github.com/OdyseeTeam/odysee-api/internal/monitor"
 	"github.com/OdyseeTeam/odysee-api/pkg/rpcerrors"

@@ -35,9 +34,9 @@ type QueryCache struct {
 }

 func NewQueryCache(store cache.CacheInterface[any]) *QueryCache {
-	m := marshaler.New(store)
+	marshal := marshaler.New(store)
 	return &QueryCache{
-		cache:        m,
+		cache:        marshal,
 		singleflight: &singleflight.Group{},
 	}
 }
@@ -49,31 +48,34 @@ func (c *QueryCache) Retrieve(query *Query, getter func() (any, error)) (*Cached
 		Params: query.Params(),
 	}

-	ctx, cancel := context.WithTimeout(context.Background(), 1000*time.Millisecond)
+	ctx, cancel := context.WithTimeout(context.Background(), 5000*time.Millisecond)
 	defer cancel()

+	start := time.Now()
+
 	hit, err := c.cache.Get(ctx, cacheReq, &CachedResponse{})
 	if err != nil {
 		if !errors.Is(err, &store.NotFound{}) {
-			metrics.SturdyQueryCacheErrorCount.WithLabelValues(cacheReq.Method).Inc()
+			ObserveQueryCacheOperation(CacheOperationGet, CacheResultError, cacheReq.Method, start)
 			return nil, fmt.Errorf("error during cache.get: %w", err)
 		}

-		metrics.SturdyQueryCacheMissCount.WithLabelValues(cacheReq.Method).Inc()
+		ObserveQueryCacheOperation(CacheOperationGet, CacheResultMiss, cacheReq.Method, start)

 		if getter == nil {
 			log.Warnf("nil getter provided for %s", cacheReq.Method)
-			metrics.SturdyQueryCacheErrorCount.WithLabelValues(cacheReq.Method).Inc()
 			return nil, nil
 		}

+		log.Infof("cache miss for %s, key=%s, duration=%.2fs", cacheReq.Method, cacheReq.GetCacheKey(), time.Since(start).Seconds())
 		// Cold object retrieval after cache miss
-		log.Infof("cold object retrieval for %s [%s]", cacheReq.Method, cacheReq.GetCacheKey())
+		start := time.Now()
 		obj, err, _ := c.singleflight.Do(cacheReq.GetCacheKey(), getter)
 		if err != nil {
-			metrics.SturdyQueryCacheErrorCount.WithLabelValues(cacheReq.Method).Inc()
+			ObserveQueryRetrievalDuration(CacheResultError, cacheReq.Method, start)
 			return nil, fmt.Errorf("error calling getter: %w", err)
 		}
+		ObserveQueryRetrievalDuration(CacheResultSuccess, cacheReq.Method, start)

 		res, ok := obj.(*jsonrpc.RPCResponse)
 		if !ok {
@@ -84,27 +86,29 @@ func (c *QueryCache) Retrieve(query *Query, getter func() (any, error)) (*Cached
 		if res.Error != nil {
 			log.Debugf("rpc error received (%s), not caching", cacheReq.Method)
 		} else {
+			start := time.Now()
 			err = c.cache.Set(
 				ctx, cacheReq, cacheResp,
 				store.WithExpiration(cacheReq.Expiration()),
 				store.WithTags(cacheReq.Tags()),
 			)
 			if err != nil {
-				metrics.SturdyQueryCacheErrorCount.WithLabelValues(cacheReq.Method).Inc()
+				ObserveQueryCacheOperation(CacheOperationSet, CacheResultError, cacheReq.Method, start)
 				monitor.ErrorToSentry(fmt.Errorf("error during cache.set: %w", err), map[string]string{"method": cacheReq.Method})
-				return cacheResp, fmt.Errorf("error during cache.set: %w", err)
+				log.Warnf("error during cache.set: %s", err)
+				return cacheResp, nil
 			}
+			ObserveQueryCacheOperation(CacheOperationSet, CacheResultSuccess, cacheReq.Method, start)
 		}

 		return cacheResp, nil
 	}
-	log.Debugf("cache hit for %s [%s]", cacheReq.Method, cacheReq.GetCacheKey())
+	log.Infof("cache hit for %s, key=%s, duration=%.2fs", cacheReq.Method, cacheReq.GetCacheKey(), time.Since(start).Seconds())
+	ObserveQueryCacheOperation(CacheOperationGet, CacheResultHit, cacheReq.Method, start)
 	cacheResp, ok := hit.(*CachedResponse)
 	if !ok {
-		metrics.SturdyQueryCacheErrorCount.WithLabelValues(cacheReq.Method).Inc()
 		return nil, errors.New("unknown cache object retrieved")
 	}
-	metrics.SturdyQueryCacheHitCount.WithLabelValues(cacheReq.Method).Inc()
 	return cacheResp, nil
 }

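In the cold path above, c.singleflight.Do(cacheReq.GetCacheKey(), getter) is what keeps a burst of identical misses from hammering the SDK: concurrent callers that share a cache key share a single getter invocation, and only the first one actually executes it while the rest block on the in-flight call and receive its result. A minimal standalone sketch of that behaviour using golang.org/x/sync/singleflight directly; the key, payload and timings are illustrative, not taken from the repository:

package main

import (
	"fmt"
	"sync"
	"time"

	"golang.org/x/sync/singleflight"
)

func main() {
	var g singleflight.Group
	var mu sync.Mutex
	calls := 0

	// Simulated "getter": the expensive upstream call Retrieve would make on a cache miss.
	getter := func() (any, error) {
		mu.Lock()
		calls++
		mu.Unlock()
		time.Sleep(100 * time.Millisecond)
		return "resolved payload", nil
	}

	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// All goroutines use the same key, so singleflight coalesces them.
			v, err, shared := g.Do("resolve:what", getter)
			fmt.Println(v, err, shared)
		}()
	}
	wg.Wait()
	fmt.Println("getter invocations:", calls) // typically 1: concurrent callers are coalesced
}
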
app/query/metrics.go

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+package query
+
+import (
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+)
+
+const (
+	CacheOperationGet = "get"
+	CacheOperationSet = "set"
+
+	CacheResultHit     = "hit"
+	CacheResultMiss    = "miss"
+	CacheResultSuccess = "success"
+	CacheResultError   = "error"
+)
+
+var (
+	queryRetrievalDurationBuckets = []float64{0.025, 0.05, 0.1, 0.25, 0.4, 1, 2.5, 5, 10, 25, 50, 100, 300}
+	cacheDurationBuckets          = []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0}
+
+	QueryCacheOperationDuration = promauto.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: "query_cache",
+			Name:      "operation_duration_seconds",
+			Help:      "Cache operation latency",
+			Buckets:   cacheDurationBuckets,
+		},
+		[]string{"operation", "result", "method"},
+	)
+	QueryRetrievalDuration = promauto.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: "query_cache",
+			Name:      "retrieval_duration_seconds",
+			Help:      "Latency for cold cache retrieval",
+			Buckets:   queryRetrievalDurationBuckets,
+		},
+		[]string{"result", "method"},
+	)
+)
+
+func ObserveQueryCacheOperation(operation, result, method string, start time.Time) {
+	QueryCacheOperationDuration.WithLabelValues(operation, result, method).Observe(float64(time.Since(start).Seconds()))
+}
+
+func ObserveQueryRetrievalDuration(result, method string, start time.Time) {
+	QueryRetrievalDuration.WithLabelValues(result, method).Observe(float64(time.Since(start).Seconds()))
+}

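Since the Prometheus client joins Namespace and Name with an underscore, the two collectors above surface as query_cache_operation_duration_seconds and query_cache_retrieval_duration_seconds, labelled by operation/result/method and result/method respectively. A rough, self-contained sketch of the observation pattern used in Retrieve (capture time.Now() before the operation, call the helper once on completion), with a locally declared histogram standing in for the ones registered here; the port, label values and sleep are placeholders, not part of the commit:

package main

import (
	"log"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Stand-in for QueryCacheOperationDuration: same shape, local registration.
var opDuration = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: "query_cache",
		Name:      "operation_duration_seconds",
		Help:      "Cache operation latency",
		Buckets:   []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0},
	},
	[]string{"operation", "result", "method"},
)

// observe mirrors the helper pattern: one Observe per completed operation.
func observe(operation, result, method string, start time.Time) {
	opDuration.WithLabelValues(operation, result, method).Observe(time.Since(start).Seconds())
}

func main() {
	// Simulate a cache get that ends in a miss.
	start := time.Now()
	time.Sleep(3 * time.Millisecond) // placeholder for the real cache lookup
	observe("get", "miss", "resolve", start)

	// Scraping http://localhost:2112/metrics will show series such as
	// query_cache_operation_duration_seconds_bucket{operation="get",result="miss",method="resolve",...}.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":2112", nil))
}
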
internal/metrics/metrics.go

Lines changed: 8 additions & 27 deletions
@@ -47,7 +47,7 @@ const (
 )

 var (
-	callsSecondsBuckets = []float64{0.005, 0.025, 0.05, 0.1, 0.25, 0.4, 1, 2, 5, 10, 20, 60, 120, 300}
+	callDurationBuckets = []float64{0.005, 0.025, 0.05, 0.1, 0.25, 0.4, 1, 2, 5, 10, 20, 60, 120, 300}

 	IAPIAuthSuccessDurations = promauto.NewHistogram(prometheus.HistogramOpts{
 		Namespace: nsIAPI,
@@ -85,7 +85,7 @@ var (
 			Subsystem: "e2e_calls",
 			Name:      "total_seconds",
 			Help:      "End-to-end method call latency distributions",
-			Buckets:   callsSecondsBuckets,
+			Buckets:   callDurationBuckets,
 		},
 		[]string{"method"},
 	)
@@ -95,7 +95,7 @@ var (
 			Subsystem: "e2e_calls",
 			Name:      "failed_seconds",
 			Help:      "Failed end-to-end method call latency distributions",
-			Buckets:   callsSecondsBuckets,
+			Buckets:   callDurationBuckets,
 		},
 		[]string{"method", "kind"},
 	)
@@ -124,7 +124,7 @@ var (
 			Subsystem: "calls",
 			Name:      "total_seconds",
 			Help:      "Method call latency distributions",
-			Buckets:   callsSecondsBuckets,
+			Buckets:   callDurationBuckets,
 		},
 		[]string{"method", "endpoint", "origin"},
 	)
@@ -134,7 +134,7 @@ var (
 			Subsystem: "calls",
 			Name:      "failed_seconds",
 			Help:      "Failed method call latency distributions",
-			Buckets:   callsSecondsBuckets,
+			Buckets:   callDurationBuckets,
 		},
 		[]string{"method", "endpoint", "origin", "kind"},
 	)
@@ -176,25 +176,6 @@ var (
 		Help: "Total number of errors retrieving queries from the local cache",
 	}, []string{"method"})

-	SturdyQueryCacheHitCount = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: nsSturdyCache,
-		Subsystem: "query",
-		Name:      "hit_count",
-		Help:      "Total number of queries found in sturdycache",
-	}, []string{"method"})
-	SturdyQueryCacheMissCount = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: nsSturdyCache,
-		Subsystem: "query",
-		Name:      "miss_count",
-		Help:      "Total number of queries that were not in sturdycache",
-	}, []string{"method"})
-	SturdyQueryCacheErrorCount = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: nsSturdyCache,
-		Subsystem: "query",
-		Name:      "error_count",
-		Help:      "Total number of errors retrieving queries from sturdycache",
-	}, []string{"method"})
-
 	LbrynetWalletsLoaded = promauto.NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: nsLbrynet,
 		Subsystem: "wallets",
@@ -222,7 +203,7 @@ var (
 			Subsystem: "calls",
 			Name:      "total_seconds",
 			Help:      "How long do calls to lbrytv take (end-to-end)",
-			Buckets:   callsSecondsBuckets,
+			Buckets:   callDurationBuckets,
 		},
 		[]string{"path"},
 	)
@@ -278,7 +259,7 @@ var (
 			Subsystem: "calls",
 			Name:      "total_seconds",
 			Help:      "Method call latency distributions",
-			Buckets:   callsSecondsBuckets,
+			Buckets:   callDurationBuckets,
 		},
 		[]string{"method", "endpoint", "group"},
 	)
@@ -288,7 +269,7 @@ var (
 			Subsystem: "calls",
 			Name:      "failed_seconds",
 			Help:      "Failed method call latency distributions",
-			Buckets:   callsSecondsBuckets,
+			Buckets:   callDurationBuckets,
 		},
 		[]string{"method", "endpoint", "group", "kind"},
 	)

internal/status/status.go

Lines changed: 3 additions & 3 deletions
@@ -110,18 +110,18 @@ func StatusV2(w http.ResponseWriter, r *http.Request) {
 			c.Cache = query.CacheFromRequest(r)
 		}

-		rpcRes, err := c.Call(r.Context(), jsonrpc.NewRequest("resolve", map[string]interface{}{"urls": resolveURL}))
+		rpcRes, err := c.Call(r.Context(), jsonrpc.NewRequest(query.MethodResolve, map[string]interface{}{"urls": resolveURL}))

 		if err != nil {
 			srv.Error = err.Error()
 			srv.Status = statusOffline
 			failureDetected = true
-			logger.Log().Error("we're failing: ", err)
+			logger.Log().Errorf("status call resolve is failing: %s", err)
 		} else if rpcRes.Error != nil {
 			srv.Error = rpcRes.Error.Message
 			srv.Status = statusNotReady
 			failureDetected = true
-			logger.Log().Error("we're failing: ", err)
+			logger.Log().Errorf("status call resolve is failing: %s", err)
 		} else {
 			if user != nil {
 				response.User = &userData{

pkg/sturdycache/sturdycache.go

Lines changed: 2 additions & 2 deletions
@@ -25,8 +25,8 @@ func NewReplicatedCache(
 		Addr:         masterAddr,
 		Password:     password,
 		DB:           0,
-		PoolSize:     10,
-		MinIdleConns: 5,
+		PoolSize:     200,
+		MinIdleConns: 10,
 	})

 	masterStore := redis_store.NewRedis(masterClient)

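For reference, PoolSize and MinIdleConns above are standard go-redis client options: PoolSize caps the number of socket connections the client keeps in its pool, and MinIdleConns keeps that many warm connections open ahead of demand. A minimal sketch of a client built with the new values; the import path (go-redis v9), address and empty password are assumptions for illustration, not taken from the commit:

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/redis/go-redis/v9"
)

func main() {
	// Mirrors the options NewReplicatedCache passes for the master client,
	// with the pool settings bumped in this commit.
	client := redis.NewClient(&redis.Options{
		Addr:         "localhost:6379", // assumed address
		Password:     "",               // assumed empty password
		DB:           0,
		PoolSize:     200, // was 10
		MinIdleConns: 10,  // was 5
	})
	defer client.Close()

	if err := client.Ping(context.Background()).Err(); err != nil {
		log.Fatalf("redis unreachable: %v", err)
	}
	fmt.Println("pool stats:", client.PoolStats())
}
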