Skip to content

Commit f0f3eca

Browse files
authored
Merge pull request #1053 from Concordium/fix/load-shed
Add load-shedding to the GRPC V2 service.
2 parents 91a7f8a + d472d01 commit f0f3eca

File tree

4 files changed

+50
-20
lines changed

4 files changed

+50
-20
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,13 @@
22

33
## Unreleased changes
44

5+
## 6.1.7
6+
7+
- Add load-shedding to the V2 GRPC API. In particular, if at the time of the
8+
request the node is already handling more than
9+
`CONCORDIUM_NODE_GRPC2_MAX_CONCURRENT_REQUESTS` requests then the incoming
10+
request will be immediately rejected.
11+
512
## 6.1.6
613

714
- Fix a regression in the start up time. When upgrading from an earlier version, the first start-up

concordium-node/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

concordium-node/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "concordium_node"
3-
version = "6.1.6" # must be kept in sync with 'is_compatible_version' in 'src/configuration.rs'
3+
version = "6.1.7" # must be kept in sync with 'is_compatible_version' in 'src/configuration.rs'
44
description = "Concordium Node"
55
authors = ["Concordium <[email protected]>"]
66
exclude = [".gitignore", ".gitlab-ci.yml", "test/**/*","**/**/.gitignore","**/**/.gitlab-ci.yml"]
@@ -78,7 +78,7 @@ tempfile = { version = "3.1" }
7878
tonic = { version = "0.9", features = ["tls"] }
7979
tonic-reflection = "0.9"
8080
tower-http = { version = "0.4", features = ["trace", "metrics"] }
81-
tower = "0.4"
81+
tower = {version = "0.4", features = ["load-shed"]}
8282
tonic-web = "0.9"
8383
prost = "0.11"
8484
tokio = { version = "1.20", features = ["macros", "rt-multi-thread", "signal", "io-util", "time"] }

concordium-node/src/grpc2.rs

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,12 +1064,18 @@ pub mod server {
10641064
.http2_keepalive_timeout(Some(std::time::Duration::from_secs(
10651065
config.keepalive_timeout,
10661066
)))
1067-
.layer(log_layer)
1068-
.layer(tower::limit::ConcurrencyLimitLayer::new(config.max_concurrent_requests))
1069-
// Note: the in-flight request layer applies after the limit layer. This is what we want so that the
1070-
// metric reflects the actual number of in-flight requests.
1067+
// Note: the in-flight request layer applies first here. Since we are using a load-shed
1068+
// layer just below this corresponds very directly to the number of requests being actually handled.
1069+
// The technical reason for this is that we cannot really stack the in flight requests layer
1070+
// below the stats layer since we want to transform some `Err` responses in the stats layer
1071+
// to Ok responses with a meaningful gRPC status code,
1072+
// but since the in flight request layer adds a guard to count in-flight requests this would
1073+
// mean we'd have to construct such a guard in the response, which is not possible.
10711074
.layer(in_flight_request_layer)
1072-
.layer(stats_layer);
1075+
.layer(stats_layer)
1076+
.layer(tower::load_shed::LoadShedLayer::new())
1077+
.layer(tower::limit::ConcurrencyLimitLayer::new(config.max_concurrent_requests))
1078+
.layer(log_layer);
10731079
if let Some(identity) = identity {
10741080
builder = builder
10751081
.tls_config(ServerTlsConfig::new().identity(identity))
@@ -2560,19 +2566,21 @@ fn get_grpc_code_label(code: tonic::Code) -> &'static str {
25602566
/// Actual middleware implementation updating the stats.
25612567
/// The middleware is called once for each gRPC request, even for the streaming
25622568
/// gRPC methods.
2563-
impl<S> tower::Service<hyper::Request<hyper::Body>> for StatsMiddleware<S>
2569+
impl<S, Body: hyper::body::HttpBody + Unpin + Send + Default>
2570+
tower::Service<hyper::Request<hyper::Body>> for StatsMiddleware<S>
25642571
where
25652572
S: tower::Service<
25662573
hyper::Request<hyper::Body>,
2567-
Response = hyper::Response<tonic::body::BoxBody>,
2574+
Response = hyper::Response<Body>,
2575+
Error = tower::BoxError,
25682576
> + Clone
25692577
+ Send
25702578
+ 'static,
25712579
S::Future: Send + 'static,
25722580
{
2573-
type Error = S::Error;
2581+
type Error = tower::BoxError;
25742582
type Future = futures::future::BoxFuture<'static, Result<Self::Response, Self::Error>>;
2575-
type Response = S::Response;
2583+
type Response = hyper::Response<Body>;
25762584

25772585
fn poll_ready(
25782586
&mut self,
@@ -2599,15 +2607,30 @@ where
25992607
// Time taken for the inner service to send back a response, meaning for
26002608
// streaming gRPC methods this is the duration for it to first return a stream.
26012609
let duration = request_received.elapsed().as_secs_f64();
2602-
if result.is_err() {
2603-
grpc_request_duration
2604-
.with_label_values(&[
2605-
endpoint_name.as_str(),
2606-
get_grpc_code_label(tonic::Code::Internal),
2607-
])
2608-
.observe(duration);
2610+
match result {
2611+
Err(e) => {
2612+
// If the load shed service terminated the request this will be signalled as
2613+
// an Err(Overloaded). So record resource exhaustion
2614+
// in the metrics.
2615+
let (code, response) = if e.is::<tower::load_shed::error::Overloaded>() {
2616+
// return a response with empty body of the correct type. `to_http`
2617+
// constructs a response with a `BoxBody` but
2618+
// here we need a more general one to make the service generic enough.
2619+
let new_response =
2620+
tonic::Status::resource_exhausted("Too many concurrent requests.")
2621+
.to_http()
2622+
.map(|_| Default::default());
2623+
(tonic::Code::ResourceExhausted, Ok(new_response))
2624+
} else {
2625+
(tonic::Code::Internal, Err(e))
2626+
};
2627+
grpc_request_duration
2628+
.with_label_values(&[endpoint_name.as_str(), get_grpc_code_label(code)])
2629+
.observe(duration);
2630+
return response;
2631+
}
2632+
Ok(result) => (result, duration),
26092633
}
2610-
(result?, duration)
26112634
};
26122635

26132636
// Check if the gRPC status header is part of the HTTP headers, if not check for

0 commit comments

Comments
 (0)