Skip to content

Commit 0ee1d31

Browse files
sjuddConvex, Inc.
authored and committed
Spawn scheduled jobs rather than executing them on a single thread (#23664)
Previously we used a single thread to poll all scheduled job futures. As we increase scheduled job execution concurrency, even without tons of blocking I/O or CPU-bound work in the jobs, we can end up starving some futures. Spawning each future separately effectively punts them back to tokio's work scheduler. This allows each job to be polled by different threads and for the work to be stolen if a particular worker is too busy. We're now able to process tasks at a much higher level of concurrency (1000+). GitOrigin-RevId: b240cb703e274c408b5bad181e2e3a68f4e07bd5
1 parent 53e6197 commit 0ee1d31

File tree

1 file changed

+49
-13
lines changed
  • crates/application/src/scheduled_jobs

1 file changed

+49
-13
lines changed

crates/application/src/scheduled_jobs/mod.rs

Lines changed: 49 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
use std::{
22
collections::HashSet,
3+
ops::Deref,
34
sync::Arc,
45
time::Duration,
56
};
@@ -43,6 +44,7 @@ use database::{
4344
};
4445
use errors::ErrorMetadataAnyhowExt;
4546
use futures::{
47+
channel::oneshot,
4648
future::Either,
4749
select_biased,
4850
stream::FuturesUnordered,
@@ -132,6 +134,19 @@ const INITIAL_BACKOFF: Duration = Duration::from_millis(10);
132134
const MAX_BACKOFF: Duration = Duration::from_secs(5);
133135

134136
pub struct ScheduledJobExecutor<RT: Runtime> {
137+
context: ScheduledJobContext<RT>,
138+
}
139+
140+
impl<RT: Runtime> Deref for ScheduledJobExecutor<RT> {
141+
type Target = ScheduledJobContext<RT>;
142+
143+
fn deref(&self) -> &Self::Target {
144+
&self.context
145+
}
146+
}
147+
148+
#[derive(Clone)]
149+
pub struct ScheduledJobContext<RT: Runtime> {
135150
rt: RT,
136151
database: Database<RT>,
137152
runner: Arc<ApplicationFunctionRunner<RT>>,
@@ -146,10 +161,12 @@ impl<RT: Runtime> ScheduledJobExecutor<RT> {
146161
function_log: FunctionExecutionLog<RT>,
147162
) -> impl Future<Output = ()> + Send {
148163
let executor = Self {
149-
rt,
150-
database,
151-
runner,
152-
function_log,
164+
context: ScheduledJobContext {
165+
rt,
166+
database,
167+
runner,
168+
function_log,
169+
},
153170
};
154171
async move {
155172
let mut backoff = Backoff::new(INITIAL_BACKOFF, MAX_BACKOFF);
@@ -170,10 +187,12 @@ impl<RT: Runtime> ScheduledJobExecutor<RT> {
170187
function_log: FunctionExecutionLog<RT>,
171188
) -> Self {
172189
Self {
173-
rt,
174-
database,
175-
runner,
176-
function_log,
190+
context: ScheduledJobContext {
191+
rt,
192+
database,
193+
runner,
194+
function_log,
195+
},
177196
}
178197
}
179198

@@ -231,7 +250,22 @@ impl<RT: Runtime> ScheduledJobExecutor<RT> {
231250
next_job_wait = Some(Duration::from_secs(5));
232251
break;
233252
}
234-
futures.push(self.execute_job(job, job_id));
253+
let (tx, rx) = oneshot::channel();
254+
let context = self.context.clone();
255+
self.rt.spawn("spawn_scheduled_job", async move {
256+
let result = context.execute_job(job, job_id).await;
257+
let _ = tx.send(result);
258+
});
259+
260+
futures.push(async move {
261+
let Ok(result) = rx.await else {
262+
// This should never happen, but if it does, it's the same scenario as if
263+
// the backend crashed during execution, which we have to handle anyway.
264+
report_error(&mut anyhow::anyhow!("Cancelled job!"));
265+
return job_id;
266+
};
267+
result
268+
});
235269
running_job_ids.insert(job_id);
236270
}
237271

@@ -243,6 +277,7 @@ impl<RT: Runtime> ScheduledJobExecutor<RT> {
243277

244278
let token = tx.into_token()?;
245279
let subscription = self.database.subscribe(token).await?;
280+
246281
select_biased! {
247282
job_id = futures.select_next_some() => {
248283
running_job_ids.remove(&job_id);
@@ -251,11 +286,13 @@ impl<RT: Runtime> ScheduledJobExecutor<RT> {
251286
}
252287
_ = subscription.wait_for_invalidation().fuse() => {
253288
},
254-
};
289+
}
255290
backoff.reset();
256291
}
257292
}
293+
}
258294

295+
impl<RT: Runtime> ScheduledJobContext<RT> {
259296
// This handles re-running the scheduled function on transient errors. It
260297
// guarantees that the job was successfully run or the job state changed.
261298
pub async fn execute_job(
@@ -265,8 +302,7 @@ impl<RT: Runtime> ScheduledJobExecutor<RT> {
265302
) -> ResolvedDocumentId {
266303
let mut backoff = Backoff::new(INITIAL_BACKOFF, MAX_BACKOFF);
267304
loop {
268-
let result = self.run_function(job.clone(), job_id).await;
269-
match result {
305+
match self.run_function(job.clone(), job_id).await {
270306
Ok(result) => {
271307
metrics::log_scheduled_job_success(backoff.failures());
272308
return result;
@@ -711,7 +747,7 @@ impl<RT: Runtime> ScheduledJobGarbageCollector<RT> {
711747
}
712748
_ = subscription.wait_for_invalidation().fuse() => {
713749
},
714-
};
750+
}
715751
backoff.reset();
716752
}
717753
}

0 commit comments

Comments
 (0)