-
Notifications
You must be signed in to change notification settings - Fork 75
Add prefix cache aware scheduling #768
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ package metrics | |
|
||
import ( | ||
"context" | ||
"runtime/debug" | ||
"sync" | ||
"time" | ||
|
||
|
@@ -219,6 +220,40 @@ var ( | |
}, | ||
[]string{"commit"}, | ||
) | ||
|
||
// Prefix indexer Metrics | ||
PrefixCacheSize = compbasemetrics.NewGaugeVec( | ||
&compbasemetrics.GaugeOpts{ | ||
Subsystem: InferenceExtension, | ||
Name: "prefix_indexer_size", | ||
Help: "Size of the prefix indexer.", | ||
StabilityLevel: compbasemetrics.ALPHA, | ||
}, | ||
[]string{}, | ||
) | ||
|
||
PrefixCacheHitRatio = compbasemetrics.NewHistogramVec( | ||
&compbasemetrics.HistogramOpts{ | ||
Subsystem: InferenceExtension, | ||
Name: "prefix_indexer_hit_ratio", | ||
Help: "Ratio of prefix length matched to total prefix length in the cache lookup.", | ||
// Buckets from 0.0 to 1.0 in increments | ||
Buckets: []float64{0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, | ||
StabilityLevel: compbasemetrics.ALPHA, | ||
}, | ||
[]string{}, | ||
) | ||
|
||
PrefixCacheHitLength = compbasemetrics.NewHistogramVec( | ||
&compbasemetrics.HistogramOpts{ | ||
Subsystem: InferenceExtension, | ||
Name: "prefix_indexer_hit_bytes", | ||
Help: "Length of the prefix match in number of bytes in the cache lookup.", | ||
Buckets: []float64{0, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}, | ||
StabilityLevel: compbasemetrics.ALPHA, | ||
}, | ||
[]string{}, | ||
) | ||
) | ||
|
||
var registerMetrics sync.Once | ||
|
@@ -244,6 +279,10 @@ func Register() { | |
legacyregistry.MustRegister(SchedulerE2ELatency) | ||
|
||
legacyregistry.MustRegister(InferenceExtensionInfo) | ||
|
||
legacyregistry.MustRegister(PrefixCacheSize) | ||
legacyregistry.MustRegister(PrefixCacheHitRatio) | ||
legacyregistry.MustRegister(PrefixCacheHitLength) | ||
}) | ||
} | ||
|
||
|
@@ -352,8 +391,44 @@ func RecordSchedulerE2ELatency(duration time.Duration) { | |
SchedulerE2ELatency.WithLabelValues().Observe(duration.Seconds()) | ||
} | ||
|
||
// RecordPrefixCacheSize records the size of the prefix indexer in megabytes. | ||
func RecordPrefixCacheSize(size int64) { | ||
PrefixCacheSize.WithLabelValues().Set(float64(size)) | ||
} | ||
|
||
// RecordPrefixCacheMatch records both the hit ratio and hit length for a prefix indexer match. | ||
// matchedLength is the number of characters that matched, and totalLength is the total prefix length. | ||
func RecordPrefixCacheMatch(matchedLength, totalLength int) { | ||
// Record the hit length metric | ||
PrefixCacheHitLength.WithLabelValues().Observe(float64(matchedLength)) | ||
|
||
// Record the hit ratio metric if totalLength is positive | ||
if totalLength > 0 { | ||
ratio := float64(matchedLength) / float64(totalLength) | ||
PrefixCacheHitRatio.WithLabelValues().Observe(ratio) | ||
} | ||
} | ||
|
||
func RecordInferenceExtensionInfo() { | ||
if CommitSHA != "" { | ||
InferenceExtensionInfo.WithLabelValues(CommitSHA).Set(1) | ||
} | ||
} | ||
|
||
func init() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @JeffLuoo did you send a PR to populate the commit sha? is this really needed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure why is it still here. It should have been deleted because the CommitSHA is no longer fetched from the The original task is #579 which adds the "info" metrics for the inference gateway and start with version and hash for the build. I personally think the version might be more useful in terms of determining the current build of EPP, SHA is useful if you are testing out a specific change on top of the main. |
||
info, ok := debug.ReadBuildInfo() | ||
if !ok { | ||
return | ||
} | ||
|
||
var Commit = func(i *debug.BuildInfo) string { | ||
for _, setting := range i.Settings { | ||
if setting.Key == "vcs.revision" { | ||
return setting.Value | ||
} | ||
} | ||
return "" | ||
}(info) | ||
|
||
CommitSHA = Commit | ||
} |
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -0,0 +1,19 @@ | ||||
# HELP inference_extension_prefix_indexer_hit_bytes [ALPHA] Length of the prefix match in number of bytes in the cache lookup. | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can these be multiline string variables in the testfile(s)? Reduces the need for these files/dir There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Prior art:
|
||||
# TYPE inference_extension_prefix_indexer_hit_bytes histogram | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="0"} 2 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="16"} 5 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="32"} 5 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="64"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="128"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="256"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="512"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="1024"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="2048"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="4096"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="8192"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="16384"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="32768"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="65536"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_bucket{le="+Inf"} 6 | ||||
inference_extension_prefix_indexer_hit_bytes_sum 86 | ||||
inference_extension_prefix_indexer_hit_bytes_count 6 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# HELP inference_extension_prefix_indexer_hit_ratio [ALPHA] Ratio of prefix length matched to total prefix length in the cache lookup. | ||
# TYPE inference_extension_prefix_indexer_hit_ratio histogram | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="0"} 2 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="0.1"} 2 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="0.2"} 2 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="0.3"} 2 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="0.4"} 2 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="0.5"} 4 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="0.6"} 4 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="0.7"} 5 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="0.8"} 5 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="0.9"} 5 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="1"} 6 | ||
inference_extension_prefix_indexer_hit_ratio_bucket{le="+Inf"} 6 | ||
inference_extension_prefix_indexer_hit_ratio_sum 2.7 | ||
inference_extension_prefix_indexer_hit_ratio_count 6 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# HELP inference_extension_prefix_indexer_size [ALPHA] Size of the prefix indexer. | ||
# TYPE inference_extension_prefix_indexer_size gauge | ||
inference_extension_prefix_indexer_size{} 4096 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/* | ||
Copyright 2025 The Kubernetes Authors. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
// Package scheduling implements request scheduling algorithms. | ||
package scheduling | ||
|
||
import ( | ||
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" | ||
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter" | ||
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" | ||
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/prefix" | ||
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" | ||
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" | ||
) | ||
|
||
func CreateConfig(weights ScorerWeights, prefixConfig prefix.Config) *SchedulerConfig { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does the previous iteration of config restrict? Creating a V2 as one of the initial adopters of this framework may set a bad precedent for reusability |
||
prefixPlugin := prefix.New(prefixConfig) | ||
queuePlugin := &scorer.QueueScorer{} | ||
kvCachePlugin := &scorer.KVCacheScorer{} | ||
config := &SchedulerConfig{ | ||
PreSchedulePlugins: []plugins.PreSchedule{prefixPlugin}, | ||
PostSchedulePlugins: []plugins.PostSchedule{prefixPlugin}, | ||
Scorers: map[plugins.Scorer]int{ | ||
prefixPlugin: weights.Prefix, | ||
queuePlugin: weights.Queue, | ||
kvCachePlugin: weights.KVCache, | ||
}, | ||
Filters: []plugins.Filter{&sheddableRequestFilterV2{}}, | ||
Picker: &picker.MaxScorePicker{}, | ||
} | ||
return config | ||
} | ||
|
||
type ScorerWeights struct { | ||
Prefix int | ||
Queue int | ||
KVCache int | ||
} | ||
Comment on lines
+47
to
+51
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this be defined with the extension itself? |
||
|
||
type sheddableRequestFilterV2 struct{} | ||
|
||
func (p *sheddableRequestFilterV2) Name() string { | ||
return "sheddableRequestFilterV2" | ||
} | ||
|
||
func (p *sheddableRequestFilterV2) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { | ||
if ctx.Req.Critical { | ||
// Allow all pods to pass through if the request is critical, even if all pods reach their capacity. | ||
return pods | ||
} | ||
|
||
// Only allow pods that have enough capacity to handle the request. | ||
return filter.HasCapacityFilter.Filter(ctx, pods) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -256,6 +256,14 @@ var HasCapacityFilter = &baseFilter{ | |
filter: toFilterFunc(queueThresholdPredicate(config.Conf.QueueThresholdCritical).and(kvCacheThresholdPredicate(config.Conf.KVCacheThreshold))), | ||
} | ||
|
||
// NoopFilter is a filter that does not filter out any pods. | ||
var NoopFilter = &baseFilter{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are we adding this back? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ++ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ++ |
||
name: "noop", | ||
filter: toFilterFunc(func(req *types.LLMRequest, pod types.Pod) bool { | ||
return true | ||
}), | ||
} | ||
|
||
// podPredicate is a filter function to check whether a pod is desired. | ||
type podPredicate func(req *types.LLMRequest, pod types.Pod) bool | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
recommend to comment on the defaults and how they were selected.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I also suggest to define the defaults in the config file
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how do these weight defined, test based or brain storm?