From 2ff78f841832da6ad0a886f2e9282d136f7b95b7 Mon Sep 17 00:00:00 2001 From: Ruben Fiszel Date: Thu, 26 Sep 2024 10:32:15 +0200 Subject: [PATCH] feat(cli): add queues, workers and worker-groups commands (#4439) * all * all * all * all * all * all * all * all * all * all * all * all * all * all --- ...e9ad2c9fcbd75f55817b2114c04207f66e4a.json} | 4 +- ...a1cc0dde4141aa4c5002765a95432d0101ab.json} | 28 +- ...e97e8494393c29f7c8d915aa54da7ab7eed51.json | 21 - ...e9e5d24834082ebf2fe9fd3964fdd80b69ccb.json | 24 + ...c83b8b4da98bda204a40f045a62172cfb4ebb.json | 29 + ...8fbd983a4a3ab25648c562497a27c74b5c8c.json} | 10 +- ...24213010_add_last_occupancy_rates.down.sql | 5 + ...0924213010_add_last_occupancy_rates.up.sql | 5 + backend/src/main.rs | 2 + backend/src/monitor.rs | 87 +- backend/windmill-api/openapi.yaml | 44 + backend/windmill-api/src/configs.rs | 17 +- backend/windmill-api/src/jobs.rs | 50 +- backend/windmill-api/src/lib.rs | 2 +- backend/windmill-api/src/workers.rs | 10 +- backend/windmill-common/src/ee.rs~main | 73 ++ backend/windmill-common/src/ee.rs~main_0 | 76 ++ .../windmill-worker/src/ansible_executor.rs | 14 +- backend/windmill-worker/src/bash_executor.rs | 10 +- .../windmill-worker/src/bigquery_executor.rs | 5 +- backend/windmill-worker/src/bun_executor.rs | 29 +- backend/windmill-worker/src/common.rs | 913 ++---------------- .../windmill-worker/src/dedicated_worker.rs | 5 +- backend/windmill-worker/src/deno_executor.rs | 9 +- backend/windmill-worker/src/go_executor.rs | 15 +- .../windmill-worker/src/graphql_executor.rs | 5 +- backend/windmill-worker/src/handle_child.rs | 628 ++++++++++++ backend/windmill-worker/src/job_logger.rs | 283 ++++++ backend/windmill-worker/src/js_eval.rs | 5 +- backend/windmill-worker/src/lib.rs | 5 +- backend/windmill-worker/src/mssql_executor.rs | 5 +- backend/windmill-worker/src/mysql_executor.rs | 5 +- backend/windmill-worker/src/pg_executor.rs | 5 +- backend/windmill-worker/src/php_executor.rs | 10 +- .../windmill-worker/src/python_executor.rs | 16 +- backend/windmill-worker/src/rust_executor.rs | 12 +- .../windmill-worker/src/snowflake_executor.rs | 5 +- backend/windmill-worker/src/worker.rs | 59 +- .../windmill-worker/src/worker_lockfiles.rs | 31 + cli/gen/core/OpenAPI.ts | 2 +- cli/gen/services.gen.ts | 19 +- cli/gen/types.gen.ts | 19 + cli/instance.ts | 274 ++++-- cli/main.ts | 10 +- cli/queues.ts | 140 +++ cli/settings.ts | 17 +- cli/worker_groups.ts | 138 +++ cli/workers.ts | 105 ++ .../(root)/(logged)/workers/+page.svelte | 20 +- 49 files changed, 2245 insertions(+), 1060 deletions(-) rename backend/.sqlx/{query-9bf41c3161a02b7d0731c4e1d79519cef5255f5df1b759af3aa4985bb64313e5.json => query-41f68f4ce5bed783cf69e42da115e9ad2c9fcbd75f55817b2114c04207f66e4a.json} (78%) rename backend/.sqlx/{query-8375c1efeb1e2a2d2803052a2899bf70f4a6434eb91b4b05b9fb8420beae26af.json => query-6a497334c98bfaf70be44fced572a1cc0dde4141aa4c5002765a95432d0101ab.json} (68%) delete mode 100644 backend/.sqlx/query-9cf96fa6364b7f34dc83719b4a0e97e8494393c29f7c8d915aa54da7ab7eed51.json create mode 100644 backend/.sqlx/query-a439552f74ed0ba305e3d9cb99ae9e5d24834082ebf2fe9fd3964fdd80b69ccb.json create mode 100644 backend/.sqlx/query-c3b5abbf2c9079d597a55f7c63bc83b8b4da98bda204a40f045a62172cfb4ebb.json rename backend/.sqlx/{query-95cb1fe8658f98fb736d899fa21cd7378b0c9d3b5f3d6bd6cafcba273f8277d4.json => query-e968e879d3c52f7dd502c3cd15fc8fbd983a4a3ab25648c562497a27c74b5c8c.json} (50%) create mode 100644 backend/migrations/20240924213010_add_last_occupancy_rates.down.sql create mode 100644 backend/migrations/20240924213010_add_last_occupancy_rates.up.sql create mode 100644 backend/windmill-common/src/ee.rs~main create mode 100644 backend/windmill-common/src/ee.rs~main_0 create mode 100644 backend/windmill-worker/src/handle_child.rs create mode 100644 backend/windmill-worker/src/job_logger.rs create mode 100644 cli/queues.ts create mode 100644 cli/worker_groups.ts create mode 100644 cli/workers.ts diff --git a/backend/.sqlx/query-9bf41c3161a02b7d0731c4e1d79519cef5255f5df1b759af3aa4985bb64313e5.json b/backend/.sqlx/query-41f68f4ce5bed783cf69e42da115e9ad2c9fcbd75f55817b2114c04207f66e4a.json similarity index 78% rename from backend/.sqlx/query-9bf41c3161a02b7d0731c4e1d79519cef5255f5df1b759af3aa4985bb64313e5.json rename to backend/.sqlx/query-41f68f4ce5bed783cf69e42da115e9ad2c9fcbd75f55817b2114c04207f66e4a.json index 2eec9fba301b2..9397f52b83d48 100644 --- a/backend/.sqlx/query-9bf41c3161a02b7d0731c4e1d79519cef5255f5df1b759af3aa4985bb64313e5.json +++ b/backend/.sqlx/query-41f68f4ce5bed783cf69e42da115e9ad2c9fcbd75f55817b2114c04207f66e4a.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "INSERT INTO metrics (id, value)\n VALUES ($1, to_jsonb((SELECT EXTRACT(EPOCH FROM now() - scheduled_for)\n FROM queue WHERE tag = $2 AND running = false AND scheduled_for <= now() - ('3 seconds')::interval\n ORDER BY priority DESC NULLS LAST, scheduled_for, created_at LIMIT 1)))", + "query": "INSERT INTO metrics (id, value)\n VALUES ($1, to_jsonb((SELECT EXTRACT(EPOCH FROM now() - scheduled_for)\n FROM queue WHERE tag = $2 AND running = false AND scheduled_for <= now() - ('3 seconds')::interval\n ORDER BY priority DESC NULLS LAST, scheduled_for LIMIT 1)))", "describe": { "columns": [], "parameters": { @@ -11,5 +11,5 @@ }, "nullable": [] }, - "hash": "9bf41c3161a02b7d0731c4e1d79519cef5255f5df1b759af3aa4985bb64313e5" + "hash": "41f68f4ce5bed783cf69e42da115e9ad2c9fcbd75f55817b2114c04207f66e4a" } diff --git a/backend/.sqlx/query-8375c1efeb1e2a2d2803052a2899bf70f4a6434eb91b4b05b9fb8420beae26af.json b/backend/.sqlx/query-6a497334c98bfaf70be44fced572a1cc0dde4141aa4c5002765a95432d0101ab.json similarity index 68% rename from backend/.sqlx/query-8375c1efeb1e2a2d2803052a2899bf70f4a6434eb91b4b05b9fb8420beae26af.json rename to backend/.sqlx/query-6a497334c98bfaf70be44fced572a1cc0dde4141aa4c5002765a95432d0101ab.json index 4bcbb0380fddb..9df7dced7b398 100644 --- a/backend/.sqlx/query-8375c1efeb1e2a2d2803052a2899bf70f4a6434eb91b4b05b9fb8420beae26af.json +++ b/backend/.sqlx/query-6a497334c98bfaf70be44fced572a1cc0dde4141aa4c5002765a95432d0101ab.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "SELECT worker, worker_instance, EXTRACT(EPOCH FROM (now() - ping_at))::integer as last_ping, started_at, ip, jobs_executed, CASE WHEN $4 IS TRUE THEN current_job_id ELSE NULL END as last_job_id, CASE WHEN $4 IS TRUE THEN current_job_workspace_id ELSE NULL END as last_job_workspace_id, custom_tags, worker_group, wm_version, occupancy_rate, memory, vcpus, memory_usage, wm_memory_usage\n FROM worker_ping\n WHERE ($1::integer IS NULL AND ping_at > now() - interval '5 minute') OR (ping_at > now() - ($1 || ' seconds')::interval)\n ORDER BY ping_at desc LIMIT $2 OFFSET $3", + "query": "SELECT worker, worker_instance, EXTRACT(EPOCH FROM (now() - ping_at))::integer as last_ping, started_at, ip, jobs_executed,\n CASE WHEN $4 IS TRUE THEN current_job_id ELSE NULL END as last_job_id, CASE WHEN $4 IS TRUE THEN current_job_workspace_id ELSE NULL END as last_job_workspace_id, \n custom_tags, worker_group, wm_version, occupancy_rate, occupancy_rate_15s, occupancy_rate_5m, occupancy_rate_30m, memory, vcpus, memory_usage, wm_memory_usage\n FROM worker_ping\n WHERE ($1::integer IS NULL AND ping_at > now() - interval '5 minute') OR (ping_at > now() - ($1 || ' seconds')::interval)\n ORDER BY ping_at desc LIMIT $2 OFFSET $3", "describe": { "columns": [ { @@ -65,21 +65,36 @@ }, { "ordinal": 12, + "name": "occupancy_rate_15s", + "type_info": "Float4" + }, + { + "ordinal": 13, + "name": "occupancy_rate_5m", + "type_info": "Float4" + }, + { + "ordinal": 14, + "name": "occupancy_rate_30m", + "type_info": "Float4" + }, + { + "ordinal": 15, "name": "memory", "type_info": "Int8" }, { - "ordinal": 13, + "ordinal": 16, "name": "vcpus", "type_info": "Int8" }, { - "ordinal": 14, + "ordinal": 17, "name": "memory_usage", "type_info": "Int8" }, { - "ordinal": 15, + "ordinal": 18, "name": "wm_memory_usage", "type_info": "Int8" } @@ -108,8 +123,11 @@ true, true, true, + true, + true, + true, true ] }, - "hash": "8375c1efeb1e2a2d2803052a2899bf70f4a6434eb91b4b05b9fb8420beae26af" + "hash": "6a497334c98bfaf70be44fced572a1cc0dde4141aa4c5002765a95432d0101ab" } diff --git a/backend/.sqlx/query-9cf96fa6364b7f34dc83719b4a0e97e8494393c29f7c8d915aa54da7ab7eed51.json b/backend/.sqlx/query-9cf96fa6364b7f34dc83719b4a0e97e8494393c29f7c8d915aa54da7ab7eed51.json deleted file mode 100644 index 92825535a4d8e..0000000000000 --- a/backend/.sqlx/query-9cf96fa6364b7f34dc83719b4a0e97e8494393c29f7c8d915aa54da7ab7eed51.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "db_name": "PostgreSQL", - "query": "UPDATE worker_ping SET ping_at = now(), jobs_executed = $1, custom_tags = $2, occupancy_rate = $3, memory_usage = $4, wm_memory_usage = $5, vcpus = COALESCE($7, vcpus), memory = COALESCE($8, memory) WHERE worker = $6", - "describe": { - "columns": [], - "parameters": { - "Left": [ - "Int4", - "TextArray", - "Float4", - "Int8", - "Int8", - "Text", - "Int8", - "Int8" - ] - }, - "nullable": [] - }, - "hash": "9cf96fa6364b7f34dc83719b4a0e97e8494393c29f7c8d915aa54da7ab7eed51" -} diff --git a/backend/.sqlx/query-a439552f74ed0ba305e3d9cb99ae9e5d24834082ebf2fe9fd3964fdd80b69ccb.json b/backend/.sqlx/query-a439552f74ed0ba305e3d9cb99ae9e5d24834082ebf2fe9fd3964fdd80b69ccb.json new file mode 100644 index 0000000000000..691a6d5e31cc0 --- /dev/null +++ b/backend/.sqlx/query-a439552f74ed0ba305e3d9cb99ae9e5d24834082ebf2fe9fd3964fdd80b69ccb.json @@ -0,0 +1,24 @@ +{ + "db_name": "PostgreSQL", + "query": "UPDATE worker_ping SET ping_at = now(), jobs_executed = $1, custom_tags = $2,\n occupancy_rate = $3, memory_usage = $4, wm_memory_usage = $5, vcpus = COALESCE($7, vcpus),\n memory = COALESCE($8, memory), occupancy_rate_15s = $9, occupancy_rate_5m = $10, occupancy_rate_30m = $11 WHERE worker = $6", + "describe": { + "columns": [], + "parameters": { + "Left": [ + "Int4", + "TextArray", + "Float4", + "Int8", + "Int8", + "Text", + "Int8", + "Int8", + "Float4", + "Float4", + "Float4" + ] + }, + "nullable": [] + }, + "hash": "a439552f74ed0ba305e3d9cb99ae9e5d24834082ebf2fe9fd3964fdd80b69ccb" +} diff --git a/backend/.sqlx/query-c3b5abbf2c9079d597a55f7c63bc83b8b4da98bda204a40f045a62172cfb4ebb.json b/backend/.sqlx/query-c3b5abbf2c9079d597a55f7c63bc83b8b4da98bda204a40f045a62172cfb4ebb.json new file mode 100644 index 0000000000000..b545c4859953d --- /dev/null +++ b/backend/.sqlx/query-c3b5abbf2c9079d597a55f7c63bc83b8b4da98bda204a40f045a62172cfb4ebb.json @@ -0,0 +1,29 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT tag as \"tag!\", COUNT(*) as \"count!\"\n FROM completed_job\n WHERE started_at > NOW() - make_interval(secs => $1) AND ($2::text IS NULL OR workspace_id = $2)\n GROUP BY tag\n ORDER BY \"count!\" DESC\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "tag!", + "type_info": "Varchar" + }, + { + "ordinal": 1, + "name": "count!", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [ + "Float8", + "Text" + ] + }, + "nullable": [ + false, + null + ] + }, + "hash": "c3b5abbf2c9079d597a55f7c63bc83b8b4da98bda204a40f045a62172cfb4ebb" +} diff --git a/backend/.sqlx/query-95cb1fe8658f98fb736d899fa21cd7378b0c9d3b5f3d6bd6cafcba273f8277d4.json b/backend/.sqlx/query-e968e879d3c52f7dd502c3cd15fc8fbd983a4a3ab25648c562497a27c74b5c8c.json similarity index 50% rename from backend/.sqlx/query-95cb1fe8658f98fb736d899fa21cd7378b0c9d3b5f3d6bd6cafcba273f8277d4.json rename to backend/.sqlx/query-e968e879d3c52f7dd502c3cd15fc8fbd983a4a3ab25648c562497a27c74b5c8c.json index 991be7c97724e..3bacda16d936b 100644 --- a/backend/.sqlx/query-95cb1fe8658f98fb736d899fa21cd7378b0c9d3b5f3d6bd6cafcba273f8277d4.json +++ b/backend/.sqlx/query-e968e879d3c52f7dd502c3cd15fc8fbd983a4a3ab25648c562497a27c74b5c8c.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "UPDATE worker_ping SET ping_at = now(), current_job_id = $1, current_job_workspace_id = $2, memory_usage = $3, wm_memory_usage = $4 WHERE worker = $5", + "query": "UPDATE worker_ping SET ping_at = now(), current_job_id = $1, current_job_workspace_id = $2, memory_usage = $3, wm_memory_usage = $4,\n occupancy_rate = $6, occupancy_rate_15s = $7, occupancy_rate_5m = $8, occupancy_rate_30m = $9 WHERE worker = $5", "describe": { "columns": [], "parameters": { @@ -9,10 +9,14 @@ "Varchar", "Int8", "Int8", - "Text" + "Text", + "Float4", + "Float4", + "Float4", + "Float4" ] }, "nullable": [] }, - "hash": "95cb1fe8658f98fb736d899fa21cd7378b0c9d3b5f3d6bd6cafcba273f8277d4" + "hash": "e968e879d3c52f7dd502c3cd15fc8fbd983a4a3ab25648c562497a27c74b5c8c" } diff --git a/backend/migrations/20240924213010_add_last_occupancy_rates.down.sql b/backend/migrations/20240924213010_add_last_occupancy_rates.down.sql new file mode 100644 index 0000000000000..d2249d992d9bb --- /dev/null +++ b/backend/migrations/20240924213010_add_last_occupancy_rates.down.sql @@ -0,0 +1,5 @@ +-- Add down migration script here +ALTER TABLE worker_ping +DROP COLUMN occupancy_rate_15s, +DROP COLUMN occupancy_rate_5m, +DROP COLUMN occupancy_rate_30m; diff --git a/backend/migrations/20240924213010_add_last_occupancy_rates.up.sql b/backend/migrations/20240924213010_add_last_occupancy_rates.up.sql new file mode 100644 index 0000000000000..39033324c3ea0 --- /dev/null +++ b/backend/migrations/20240924213010_add_last_occupancy_rates.up.sql @@ -0,0 +1,5 @@ +-- Add up migration script here +ALTER TABLE worker_ping +ADD COLUMN occupancy_rate_15s REAL, +ADD COLUMN occupancy_rate_5m REAL, +ADD COLUMN occupancy_rate_30m REAL; diff --git a/backend/src/main.rs b/backend/src/main.rs index 20bc963bd5bb7..aa6993cf92d57 100644 --- a/backend/src/main.rs +++ b/backend/src/main.rs @@ -158,6 +158,7 @@ async fn cache_hub_scripts(file_path: Option) -> anyhow::Result<()> { "global", "global", "", + &mut None, ) .await?; tokio::fs::remove_dir_all(job_dir).await?; @@ -178,6 +179,7 @@ async fn cache_hub_scripts(file_path: Option) -> anyhow::Result<()> { "cache_init", windmill_worker::get_common_bun_proc_envs(None).await, false, + &mut None, ) .await?; } else { diff --git a/backend/src/monitor.rs b/backend/src/monitor.rs index 8397f134aacce..2c3220f237d9b 100644 --- a/backend/src/monitor.rs +++ b/backend/src/monitor.rs @@ -1071,73 +1071,68 @@ pub async fn monitor_db( } pub async fn expose_queue_metrics(db: &Pool) { - let tx = db.begin().await; - if let Ok(mut tx) = tx { - let last_check = sqlx::query_scalar!( + let last_check = sqlx::query_scalar!( "SELECT created_at FROM metrics WHERE id LIKE 'queue_count_%' ORDER BY created_at DESC LIMIT 1" ) .fetch_optional(db) .await .unwrap_or(Some(chrono::Utc::now())); - let metrics_enabled = METRICS_ENABLED.load(std::sync::atomic::Ordering::Relaxed); - let save_metrics = last_check - .map(|last_check| chrono::Utc::now() - last_check > chrono::Duration::seconds(25)) - .unwrap_or(true); + let metrics_enabled = METRICS_ENABLED.load(std::sync::atomic::Ordering::Relaxed); + let save_metrics = last_check + .map(|last_check| chrono::Utc::now() - last_check > chrono::Duration::seconds(25)) + .unwrap_or(true); - if metrics_enabled || save_metrics { - let queue_counts = sqlx::query!( - "SELECT tag, count(*) as count FROM queue WHERE + if metrics_enabled || save_metrics { + let queue_counts = sqlx::query!( + "SELECT tag, count(*) as count FROM queue WHERE scheduled_for <= now() - ('3 seconds')::interval AND running = false GROUP BY tag" - ) - .fetch_all(&mut *tx) - .await - .ok() - .unwrap_or_else(|| vec![]); + ) + .fetch_all(db) + .await + .ok() + .unwrap_or_else(|| vec![]); - for q in queue_counts { - let count = q.count.unwrap_or(0); - let tag = q.tag; - if metrics_enabled { - let metric = (*QUEUE_COUNT).with_label_values(&[&tag]); - metric.set(count as i64); - } + for q in queue_counts { + let count = q.count.unwrap_or(0); + let tag = q.tag; + if metrics_enabled { + let metric = (*QUEUE_COUNT).with_label_values(&[&tag]); + metric.set(count as i64); + } - // save queue_count and delay metrics per tag - if save_metrics { + // save queue_count and delay metrics per tag + if save_metrics { + sqlx::query!( + "INSERT INTO metrics (id, value) VALUES ($1, $2)", + format!("queue_count_{}", tag), + serde_json::json!(count) + ) + .execute(db) + .await + .ok(); + if count > 0 { sqlx::query!( - "INSERT INTO metrics (id, value) VALUES ($1, $2)", - format!("queue_count_{}", tag), - serde_json::json!(count) - ) - .execute(&mut *tx) - .await - .ok(); - if count > 0 { - sqlx::query!( "INSERT INTO metrics (id, value) VALUES ($1, to_jsonb((SELECT EXTRACT(EPOCH FROM now() - scheduled_for) FROM queue WHERE tag = $2 AND running = false AND scheduled_for <= now() - ('3 seconds')::interval - ORDER BY priority DESC NULLS LAST, scheduled_for, created_at LIMIT 1)))", + ORDER BY priority DESC NULLS LAST, scheduled_for LIMIT 1)))", format!("queue_delay_{}", tag), tag - ).execute(&mut *tx).await.ok(); - } + ).execute(db).await.ok(); } } } - - // clean queue metrics older than 14 days - sqlx::query!( - "DELETE FROM metrics WHERE id LIKE 'queue_%' AND created_at < NOW() - INTERVAL '14 day'" - ) - .execute(&mut *tx) - .await - .ok(); - - tx.commit().await.ok(); } + + // clean queue metrics older than 14 days + sqlx::query!( + "DELETE FROM metrics WHERE id LIKE 'queue_%' AND created_at < NOW() - INTERVAL '14 day'" + ) + .execute(db) + .await + .ok(); } pub async fn reload_smtp_config(db: &Pool) { diff --git a/backend/windmill-api/openapi.yaml b/backend/windmill-api/openapi.yaml index 3245b03ed5d61..b4954fb482358 100644 --- a/backend/windmill-api/openapi.yaml +++ b/backend/windmill-api/openapi.yaml @@ -5964,6 +5964,43 @@ paths: schema: type: integer + /jobs/completed/count_by_tag: + get: + summary: Count jobs by tag + operationId: countJobsByTag + tags: + - job + parameters: + - name: horizon_secs + in: query + description: Past Time horizon in seconds (when to start the count = now - horizon) (default is 3600) + required: false + schema: + type: integer + - name: workspace_id + in: query + description: Specific workspace ID to filter results (optional) + required: false + schema: + type: string + responses: + "200": + description: Job counts by tag + content: + application/json: + schema: + type: array + items: + type: object + properties: + tag: + type: string + count: + type: integer + required: + - tag + - count + /w/{workspace}/jobs_u/get/{id}: get: summary: get job @@ -6498,6 +6535,7 @@ paths: schema: type: string + /w/{workspace}/jobs_u/cancel/{id}/{resume_id}/{signature}: get: summary: cancel a job for a suspended flow @@ -11158,6 +11196,12 @@ components: type: string occupancy_rate: type: number + occupancy_rate_15s: + type: number + occupancy_rate_5m: + type: number + occupancy_rate_30m: + type: number memory: type: number vcpus: diff --git a/backend/windmill-api/src/configs.rs b/backend/windmill-api/src/configs.rs index aacfd49ef7570..64c8a1884ff8b 100644 --- a/backend/windmill-api/src/configs.rs +++ b/backend/windmill-api/src/configs.rs @@ -33,7 +33,7 @@ pub fn global_service() -> Router { #[derive(Serialize, Deserialize, FromRow)] struct Config { - name: String, + name: Option, config: serde_json::Value, } @@ -41,9 +41,18 @@ async fn list_worker_groups( authed: ApiAuthed, Extension(db): Extension, ) -> error::JsonResult> { - let configs_raw = sqlx::query_as!(Config, "SELECT * FROM config WHERE name LIKE 'worker__%'") - .fetch_all(&db) - .await?; + let mut configs_raw = + sqlx::query_as!(Config, "SELECT * FROM config WHERE name LIKE 'worker__%'") + .fetch_all(&db) + .await?; + // Remove the 'worker__' prefix from all config names + for config in configs_raw.iter_mut() { + if let Some(name) = &config.name { + if name.starts_with("worker__") { + config.name = Some(name.strip_prefix("worker__").unwrap().to_string()); + } + } + } let configs = if !authed.is_admin { let mut obfuscated_configs: Vec = vec![]; for config in configs_raw { diff --git a/backend/windmill-api/src/jobs.rs b/backend/windmill-api/src/jobs.rs index c43d2935e3bab..82e58c5322d8f 100644 --- a/backend/windmill-api/src/jobs.rs +++ b/backend/windmill-api/src/jobs.rs @@ -11,6 +11,7 @@ use axum::http::HeaderValue; use quick_cache::sync::Cache; use serde_json::value::RawValue; use sqlx::Pool; +use windmill_common::error::JsonResult; use std::collections::HashMap; #[cfg(feature = "prometheus")] use std::sync::atomic::Ordering; @@ -247,7 +248,7 @@ pub fn workspaced_service() -> Router { .route("/run/flow_dependencies", post(run_flow_dependencies_job)) } -pub fn global_service() -> Router { +pub fn workspace_unauthed_service() -> Router { Router::new() .route( "/resume/:job_id/:resume_id/:secret", @@ -291,7 +292,12 @@ pub fn global_service() -> Router { } pub fn global_root_service() -> Router { - Router::new().route("/db_clock", get(get_db_clock)) + Router::new() + .route("/db_clock", get(get_db_clock)) + .route( + "/completed/count_by_tag", + get(count_by_tag), + ) } #[derive(Deserialize)] @@ -5100,6 +5106,46 @@ async fn get_completed_job_result( Ok(Json(result).into_response()) } + + +#[derive(Deserialize)] +struct CountByTagQuery { + horizon_secs: Option, + workspace_id: Option, +} + +#[derive(Serialize)] +struct TagCount { + tag: String, + count: i64, +} + +async fn count_by_tag( + ApiAuthed { email, ..}: ApiAuthed, + Extension(db): Extension, + Query(query): Query, +) -> JsonResult> { + require_super_admin(&db, &email).await?; + let horizon = query.horizon_secs.unwrap_or(3600); // Default to 1 hour if not specified + + let counts = sqlx::query_as!( + TagCount, + r#" + SELECT tag as "tag!", COUNT(*) as "count!" + FROM completed_job + WHERE started_at > NOW() - make_interval(secs => $1) AND ($2::text IS NULL OR workspace_id = $2) + GROUP BY tag + ORDER BY "count!" DESC + "#, + horizon as f64, + query.workspace_id + ) + .fetch_all(&db) + .await?; + + Ok(Json(counts)) +} + #[derive(Serialize)] struct CompletedJobResult { started: Option, diff --git a/backend/windmill-api/src/lib.rs b/backend/windmill-api/src/lib.rs index b5bba9f6450f8..97d2865e2e79a 100644 --- a/backend/windmill-api/src/lib.rs +++ b/backend/windmill-api/src/lib.rs @@ -330,7 +330,7 @@ pub async fn run_server( ) .nest( "/w/:workspace_id/jobs_u", - jobs::global_service().layer(cors.clone()), + jobs::workspace_unauthed_service().layer(cors.clone()), ) .nest( "/w/:workspace_id/resources_u", diff --git a/backend/windmill-api/src/workers.rs b/backend/windmill-api/src/workers.rs index eecefe61467cf..2f8439bf244f2 100644 --- a/backend/windmill-api/src/workers.rs +++ b/backend/windmill-api/src/workers.rs @@ -54,6 +54,12 @@ struct WorkerPing { #[serde(skip_serializing_if = "Option::is_none")] occupancy_rate: Option, #[serde(skip_serializing_if = "Option::is_none")] + occupancy_rate_15s: Option, + #[serde(skip_serializing_if = "Option::is_none")] + occupancy_rate_5m: Option, + #[serde(skip_serializing_if = "Option::is_none")] + occupancy_rate_30m: Option, + #[serde(skip_serializing_if = "Option::is_none")] memory: Option, #[serde(skip_serializing_if = "Option::is_none")] vcpus: Option, @@ -88,7 +94,9 @@ async fn list_worker_pings( let rows = sqlx::query_as!( WorkerPing, - "SELECT worker, worker_instance, EXTRACT(EPOCH FROM (now() - ping_at))::integer as last_ping, started_at, ip, jobs_executed, CASE WHEN $4 IS TRUE THEN current_job_id ELSE NULL END as last_job_id, CASE WHEN $4 IS TRUE THEN current_job_workspace_id ELSE NULL END as last_job_workspace_id, custom_tags, worker_group, wm_version, occupancy_rate, memory, vcpus, memory_usage, wm_memory_usage + "SELECT worker, worker_instance, EXTRACT(EPOCH FROM (now() - ping_at))::integer as last_ping, started_at, ip, jobs_executed, + CASE WHEN $4 IS TRUE THEN current_job_id ELSE NULL END as last_job_id, CASE WHEN $4 IS TRUE THEN current_job_workspace_id ELSE NULL END as last_job_workspace_id, + custom_tags, worker_group, wm_version, occupancy_rate, occupancy_rate_15s, occupancy_rate_5m, occupancy_rate_30m, memory, vcpus, memory_usage, wm_memory_usage FROM worker_ping WHERE ($1::integer IS NULL AND ping_at > now() - interval '5 minute') OR (ping_at > now() - ($1 || ' seconds')::interval) ORDER BY ping_at desc LIMIT $2 OFFSET $3", diff --git a/backend/windmill-common/src/ee.rs~main b/backend/windmill-common/src/ee.rs~main new file mode 100644 index 0000000000000..482b61a0fe1f0 --- /dev/null +++ b/backend/windmill-common/src/ee.rs~main @@ -0,0 +1,73 @@ +#[cfg(feature = "enterprise")] +use crate::db::DB; +use crate::ee::LicensePlan::Community; +#[cfg(feature = "enterprise")] +use crate::error; +use serde::Deserialize; +use std::sync::Arc; +use tokio::sync::RwLock; + +lazy_static::lazy_static! { + pub static ref LICENSE_KEY_VALID: Arc> = Arc::new(RwLock::new(true)); + pub static ref LICENSE_KEY_ID: Arc> = Arc::new(RwLock::new("".to_string())); + pub static ref LICENSE_KEY: Arc> = Arc::new(RwLock::new("".to_string())); +} + +pub enum LicensePlan { + Community, + Pro, + Enterprise, +} + +pub async fn get_license_plan() -> LicensePlan { + // Implementation is not open source + return Community; +} + +#[derive(Deserialize)] +#[serde(untagged)] +pub enum CriticalErrorChannel {} + +pub enum CriticalAlertKind { + #[cfg(feature = "enterprise")] + CriticalError, + #[cfg(feature = "enterprise")] + RecoveredCriticalError, +} + +#[cfg(feature = "enterprise")] +pub async fn send_critical_alert( + _error_message: String, + _db: &DB, + _kind: CriticalAlertKind, + _channels: Option>, +) { +} + +#[cfg(feature = "enterprise")] +pub async fn schedule_key_renewal(_http_client: &reqwest::Client, _db: &crate::db::DB) -> () { + // Implementation is not open source +} + +#[cfg(feature = "enterprise")] +pub async fn renew_license_key( + _http_client: &reqwest::Client, + _db: &crate::db::DB, + _key: Option, + _manual: bool, +) -> String { + // Implementation is not open source + "".to_string() +} + +#[cfg(feature = "enterprise")] +pub async fn create_customer_portal_session( + _http_client: &reqwest::Client, + _key: Option, +) -> error::Result { + // Implementation is not open source + Ok("".to_string()) +} + +#[cfg(feature = "enterprise")] +pub async fn worker_groups_alerts(_db: &DB) {} diff --git a/backend/windmill-common/src/ee.rs~main_0 b/backend/windmill-common/src/ee.rs~main_0 new file mode 100644 index 0000000000000..2f32756e45617 --- /dev/null +++ b/backend/windmill-common/src/ee.rs~main_0 @@ -0,0 +1,76 @@ +#[cfg(feature = "enterprise")] +use crate::db::DB; +use crate::ee::LicensePlan::Community; +#[cfg(feature = "enterprise")] +use crate::error; +use serde::Deserialize; +use std::sync::Arc; +use tokio::sync::RwLock; + +lazy_static::lazy_static! { + pub static ref LICENSE_KEY_VALID: Arc> = Arc::new(RwLock::new(true)); + pub static ref LICENSE_KEY_ID: Arc> = Arc::new(RwLock::new("".to_string())); + pub static ref LICENSE_KEY: Arc> = Arc::new(RwLock::new("".to_string())); +} + +pub enum LicensePlan { + Community, + Pro, + Enterprise, +} + +pub async fn get_license_plan() -> LicensePlan { + // Implementation is not open source + return Community; +} + +#[derive(Deserialize)] +#[serde(untagged)] +pub enum CriticalErrorChannel { + Email { email: String }, + Slack { slack_channel: String }, +} + +pub enum CriticalAlertKind { + #[cfg(feature = "enterprise")] + CriticalError, + #[cfg(feature = "enterprise")] + RecoveredCriticalError, +} + +#[cfg(feature = "enterprise")] +pub async fn send_critical_alert( + _error_message: String, + _db: &DB, + _kind: CriticalAlertKind, + _channels: Option>, +) { +} + +#[cfg(feature = "enterprise")] +pub async fn schedule_key_renewal(_http_client: &reqwest::Client, _db: &crate::db::DB) -> () { + // Implementation is not open source +} + +#[cfg(feature = "enterprise")] +pub async fn renew_license_key( + _http_client: &reqwest::Client, + _db: &crate::db::DB, + _key: Option, + _manual: bool, +) -> String { + // Implementation is not open source + "".to_string() +} + +#[cfg(feature = "enterprise")] +pub async fn create_customer_portal_session( + _http_client: &reqwest::Client, + _key: Option, +) -> error::Result { + // Implementation is not open source + Ok("".to_string()) +} + +#[cfg(feature = "enterprise")] +pub async fn worker_groups_alerts(_db: &DB) {} diff --git a/backend/windmill-worker/src/ansible_executor.rs b/backend/windmill-worker/src/ansible_executor.rs index 297458633a80f..6639189f6147c 100644 --- a/backend/windmill-worker/src/ansible_executor.rs +++ b/backend/windmill-worker/src/ansible_executor.rs @@ -21,9 +21,10 @@ use windmill_queue::{append_logs, CanceledBy}; use crate::{ bash_executor::BIN_BASH, common::{ - get_reserved_variables, handle_child, read_and_check_result, start_child_process, - transform_json, + get_reserved_variables, read_and_check_result, start_child_process, transform_json, + OccupancyMetrics, }, + handle_child::handle_child, python_executor::{create_dependencies_dir, handle_python_reqs, pip_compile}, AuthedClientBackgroundTask, DISABLE_NSJAIL, DISABLE_NUSER, HOME_ENV, NSJAIL_PATH, PATH_ENV, TZ_ENV, @@ -50,6 +51,7 @@ async fn handle_ansible_python_deps( worker_dir: &str, mem_peak: &mut i32, canceled_by: &mut Option, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result> { create_dependencies_dir(job_dir).await; @@ -79,6 +81,7 @@ async fn handle_ansible_python_deps( db, worker_name, w_id, + &mut Some(occupancy_metrics), ) .await .map_err(|e| { @@ -102,6 +105,7 @@ async fn handle_ansible_python_deps( worker_name, job_dir, worker_dir, + &mut Some(occupancy_metrics), ) .await?; additional_python_paths.append(&mut venv_path); @@ -118,6 +122,7 @@ async fn install_galaxy_collections( mem_peak: &mut i32, canceled_by: &mut Option, db: &sqlx::Pool, + occupancy_metrics: &mut OccupancyMetrics, ) -> anyhow::Result<()> { write_file(job_dir, "requirements.yml", collections_yml)?; @@ -160,6 +165,7 @@ async fn install_galaxy_collections( "ansible galaxy install", None, false, + &mut Some(occupancy_metrics), ) .await?; @@ -198,6 +204,7 @@ pub async fn handle_ansible_job( shared_mount: &str, base_internal_url: &str, envs: HashMap, + occupancy_metrics: &mut OccupancyMetrics, ) -> windmill_common::error::Result> { check_ansible_exists()?; @@ -216,6 +223,7 @@ pub async fn handle_ansible_job( worker_dir, mem_peak, canceled_by, + occupancy_metrics, ) .await?; @@ -289,6 +297,7 @@ pub async fn handle_ansible_job( mem_peak, canceled_by, db, + occupancy_metrics, ) .await?; } @@ -424,6 +433,7 @@ fi "python run", job.timeout, false, + &mut Some(occupancy_metrics), ) .await?; read_and_check_result(job_dir).await diff --git a/backend/windmill-worker/src/bash_executor.rs b/backend/windmill-worker/src/bash_executor.rs index 412eb6b440637..e9b135d27715e 100644 --- a/backend/windmill-worker/src/bash_executor.rs +++ b/backend/windmill-worker/src/bash_executor.rs @@ -24,9 +24,10 @@ lazy_static::lazy_static! { use crate::{ common::{ - build_args_map, get_reserved_variables, handle_child, read_file, read_file_content, - start_child_process, + build_args_map, get_reserved_variables, read_file, read_file_content, start_child_process, + OccupancyMetrics, }, + handle_child::handle_child, AuthedClientBackgroundTask, DISABLE_NSJAIL, DISABLE_NUSER, HOME_ENV, NSJAIL_PATH, PATH_ENV, POWERSHELL_CACHE_DIR, POWERSHELL_PATH, TZ_ENV, }; @@ -49,6 +50,7 @@ pub async fn handle_bash_job( base_internal_url: &str, worker_name: &str, envs: HashMap, + occupancy_metrics: &mut OccupancyMetrics, ) -> Result, Error> { let logs1 = "\n\n--- BASH CODE EXECUTION ---\n".to_string(); append_logs(&job.id, &job.workspace_id, logs1, db).await; @@ -142,6 +144,7 @@ pub async fn handle_bash_job( "bash run", job.timeout, true, + &mut Some(occupancy_metrics), ) .await?; @@ -194,6 +197,7 @@ pub async fn handle_powershell_job( base_internal_url: &str, worker_name: &str, envs: HashMap, + occupancy_metrics: &mut OccupancyMetrics, ) -> Result, Error> { let pwsh_args = { let args = build_args_map(job, client, db).await?.map(Json); @@ -275,6 +279,7 @@ pub async fn handle_powershell_job( "powershell install", job.timeout, false, + &mut Some(occupancy_metrics), ) .await?; } @@ -378,6 +383,7 @@ $env:PSModulePath = \"{}:$PSModulePathBackup\"", "powershell run", job.timeout, false, + &mut Some(occupancy_metrics), ) .await?; diff --git a/backend/windmill-worker/src/bigquery_executor.rs b/backend/windmill-worker/src/bigquery_executor.rs index ea240b87c9baa..1552686b79c94 100644 --- a/backend/windmill-worker/src/bigquery_executor.rs +++ b/backend/windmill-worker/src/bigquery_executor.rs @@ -14,7 +14,8 @@ use windmill_queue::{CanceledBy, HTTP_CLIENT}; use serde::Deserialize; -use crate::common::run_future_with_polling_update_job_poller; +use crate::common::OccupancyMetrics; +use crate::handle_child::run_future_with_polling_update_job_poller; use crate::{ common::{build_args_values, resolve_job_timeout}, AuthedClientBackgroundTask, @@ -210,6 +211,7 @@ pub async fn do_bigquery( canceled_by: &mut Option, worker_name: &str, column_order: &mut Option>, + occupancy_metrics: &mut OccupancyMetrics, ) -> windmill_common::error::Result> { let bigquery_args = build_args_values(job, client, db).await?; @@ -364,6 +366,7 @@ pub async fn do_bigquery( result_f.map_err(to_anyhow), worker_name, &job.workspace_id, + &mut Some(occupancy_metrics), ) .await?; diff --git a/backend/windmill-worker/src/bun_executor.rs b/backend/windmill-worker/src/bun_executor.rs index ca2f00d4b0d10..e17f140b3211b 100644 --- a/backend/windmill-worker/src/bun_executor.rs +++ b/backend/windmill-worker/src/bun_executor.rs @@ -13,10 +13,11 @@ use crate::common::build_envs_map; use crate::{ common::{ - create_args_and_out_file, get_main_override, get_reserved_variables, handle_child, - parse_npm_config, read_file, read_file_content, read_result, start_child_process, - write_file_binary, + create_args_and_out_file, get_main_override, get_reserved_variables, parse_npm_config, + read_file, read_file_content, read_result, start_child_process, write_file_binary, + OccupancyMetrics, }, + handle_child::handle_child, AuthedClientBackgroundTask, BUNFIG_INSTALL_SCOPES, BUN_BUNDLE_CACHE_DIR, BUN_CACHE_DIR, BUN_DEPSTAR_CACHE_DIR, BUN_PATH, DISABLE_NSJAIL, DISABLE_NUSER, HOME_ENV, NODE_BIN_PATH, NODE_PATH, NPM_CONFIG_REGISTRY, NPM_PATH, NSJAIL_PATH, PATH_ENV, TZ_ENV, @@ -69,6 +70,7 @@ pub async fn gen_bun_lockfile( export_pkg: bool, raw_deps: Option, npm_mode: bool, + occupancy_metrics: &mut Option<&mut OccupancyMetrics>, ) -> Result> { let common_bun_proc_envs: HashMap = get_common_bun_proc_envs(None).await; @@ -125,6 +127,7 @@ pub async fn gen_bun_lockfile( "bun build", None, false, + occupancy_metrics, ) .await?; } else { @@ -149,6 +152,7 @@ pub async fn gen_bun_lockfile( worker_name, common_bun_proc_envs, npm_mode, + occupancy_metrics, ) .await?; } else { @@ -230,6 +234,7 @@ pub async fn install_bun_lockfile( worker_name: &str, common_bun_proc_envs: HashMap, npm_mode: bool, + occupancy_metrics: &mut Option<&mut OccupancyMetrics>, ) -> Result<()> { let mut child_cmd = Command::new(if npm_mode { &*NPM_PATH } else { &*BUN_PATH }); child_cmd @@ -296,6 +301,7 @@ pub async fn install_bun_lockfile( "bun install", None, false, + occupancy_metrics, ) .await? } else { @@ -436,6 +442,7 @@ pub async fn generate_wrapper_mjs( mem_peak: &mut i32, canceled_by: &mut Option, common_bun_proc_envs: &HashMap, + occupancy_metrics: &mut Option<&mut OccupancyMetrics>, ) -> Result<()> { let mut child = Command::new(&*BUN_PATH); child @@ -459,6 +466,7 @@ pub async fn generate_wrapper_mjs( "bun build", timeout, false, + occupancy_metrics, ) .await?; fs::rename( @@ -479,6 +487,7 @@ pub async fn generate_bun_bundle( mem_peak: &mut i32, canceled_by: &mut Option, common_bun_proc_envs: &HashMap, + occupancy_metrics: &mut OccupancyMetrics, ) -> Result<()> { let mut child = Command::new(&*BUN_PATH); child @@ -503,6 +512,7 @@ pub async fn generate_bun_bundle( "bun build", timeout, false, + &mut Some(occupancy_metrics), ) .await?; } else { @@ -609,6 +619,7 @@ pub async fn prebundle_bun_script( base_internal_url: &str, worker_name: &str, token: &str, + occupancy_metrics: &mut OccupancyMetrics, ) -> Result<()> { let (local_path, remote_path) = compute_bundle_local_and_remote_path( inner_content, @@ -656,6 +667,7 @@ pub async fn prebundle_bun_script( &mut 0, &mut None, &common_bun_proc_envs, + occupancy_metrics, ) .await?; @@ -751,6 +763,7 @@ pub async fn handle_bun_job( envs: HashMap, shared_mount: &str, new_args: &mut Option>>, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result> { let mut annotation = windmill_common::worker::get_annotation(inner_content); @@ -861,6 +874,7 @@ pub async fn handle_bun_job( worker_name, common_bun_proc_envs.clone(), annotation.npm_mode, + &mut Some(occupancy_metrics), ) .await?; @@ -888,7 +902,6 @@ pub async fn handle_bun_job( // if !*DISABLE_NSJAIL || !empty_trusted_deps || has_custom_config_registry { let logs1 = "\n\n--- BUN INSTALL ---\n".to_string(); append_logs(&job.id, &job.workspace_id, logs1, db).await; - let _ = gen_bun_lockfile( mem_peak, canceled_by, @@ -903,6 +916,7 @@ pub async fn handle_bun_job( false, None, annotation.npm_mode, + &mut Some(occupancy_metrics), ) .await?; @@ -1128,6 +1142,7 @@ try {{ mem_peak, canceled_by, &common_bun_proc_envs, + occupancy_metrics, ) .await?; if !local_path.is_empty() { @@ -1169,6 +1184,7 @@ try {{ mem_peak, canceled_by, &common_bun_proc_envs, + &mut Some(occupancy_metrics), ) .await?; } @@ -1204,6 +1220,7 @@ try {{ worker_name, &job.workspace_id, false, + occupancy_metrics, ) .await?; tracing::info!( @@ -1360,6 +1377,7 @@ try {{ "bun run", job.timeout, false, + &mut Some(occupancy_metrics), ) .await?; @@ -1501,6 +1519,7 @@ pub async fn start_worker( worker_name, common_bun_proc_envs.clone(), annotation.npm_mode, + &mut None, ) .await?; tracing::info!("dedicated worker requirements installed: {reqs}"); @@ -1521,6 +1540,7 @@ pub async fn start_worker( false, None, annotation.npm_mode, + &mut None, ) .await?; } @@ -1617,6 +1637,7 @@ for await (const line of Readline.createInterface({{ input: process.stdin }})) { &mut mem_peak, &mut canceled_by, &common_bun_proc_envs, + &mut None, ) .await?; } diff --git a/backend/windmill-worker/src/common.rs b/backend/windmill-worker/src/common.rs index 1cda8fb03b041..406f128917835 100644 --- a/backend/windmill-worker/src/common.rs +++ b/backend/windmill-worker/src/common.rs @@ -1,15 +1,7 @@ use async_recursion::async_recursion; -use deno_ast::swc::parser::lexer::util::CharExt; -use futures::Future; -use itertools::Itertools; -#[cfg(any(target_os = "linux", target_os = "macos"))] -use nix::sys::signal::{self, Signal}; -#[cfg(any(target_os = "linux", target_os = "macos"))] -use nix::unistd::Pid; +use itertools::Itertools; -#[cfg(all(feature = "enterprise", feature = "parquet"))] -use object_store::path::Path; use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::value::RawValue; @@ -18,18 +10,15 @@ use sqlx::types::Json; use sqlx::{Pool, Postgres}; use tokio::process::Command; use tokio::{fs::File, io::AsyncReadExt}; -use windmill_common::error::to_anyhow; use windmill_common::jobs::ENTRYPOINT_OVERRIDE; -#[cfg(all(feature = "enterprise", feature = "parquet"))] -use windmill_common::s3_helpers::OBJECT_STORE_CACHE_SETTINGS; + #[cfg(feature = "parquet")] use windmill_common::s3_helpers::{ get_etag_or_empty, LargeFileStorage, ObjectStoreResource, S3Object, }; use windmill_common::variables::{build_crypt_with_key_suffix, decrypt_value_with_mc}; use windmill_common::worker::{ - get_windmill_memory_usage, get_worker_memory_usage, to_raw_value, write_file, CLOUD_HOSTED, - ROOT_CACHE_DIR, TMP_DIR, WORKER_CONFIG, + to_raw_value, write_file, CLOUD_HOSTED, ROOT_CACHE_DIR, WORKER_CONFIG, }; use windmill_common::{ error::{self, Error}, @@ -38,43 +27,21 @@ use windmill_common::{ }; use anyhow::{anyhow, Result}; -use windmill_queue::{append_logs, CanceledBy}; -#[cfg(any(target_os = "linux", target_os = "macos"))] -use std::os::unix::process::ExitStatusExt; - -use std::process::ExitStatus; -use std::sync::atomic::AtomicU32; -use std::sync::Arc; use std::{ collections::{hash_map::DefaultHasher, HashMap}, hash::{Hash, Hasher}, - io, panic, time::Duration, }; -use tracing::{trace_span, Instrument}; use uuid::Uuid; use windmill_common::{variables, DB}; -#[cfg(feature = "enterprise")] -use windmill_common::job_metrics; - -use tokio::{ - io::{AsyncBufReadExt, AsyncWriteExt, BufReader}, - process::Child, - sync::{broadcast, watch}, - time::{interval, sleep, Instant, MissedTickBehavior}, -}; - -use futures::{ - future::{self, ready, FutureExt}, - stream, StreamExt, -}; +use tokio::{io::AsyncWriteExt, process::Child, time::Instant}; use crate::{ AuthedClient, AuthedClientBackgroundTask, JOB_DEFAULT_TIMEOUT, MAX_RESULT_SIZE, - MAX_TIMEOUT_DURATION, MAX_WAIT_FOR_SIGINT, MAX_WAIT_FOR_SIGTERM, + MAX_TIMEOUT_DURATION, }; pub async fn build_args_map<'a>( @@ -466,36 +433,6 @@ pub fn get_main_override(args: Option<&Json>>>) -> .flatten(); } -async fn get_mem_peak(pid: Option, nsjail: bool) -> i32 { - if pid.is_none() { - return -1; - } - let pid = if nsjail { - // This is a bit hacky, but the process id of the nsjail process is the pid of nsjail + 1. - // Ideally, we would get the number from fork() itself. This works in MOST cases. - pid.unwrap() + 1 - } else { - pid.unwrap() - }; - - if let Ok(file) = File::open(format!("/proc/{}/status", pid)).await { - let mut lines = BufReader::new(file).lines(); - while let Some(line) = lines.next_line().await.unwrap_or(None) { - if line.starts_with("VmHWM:") { - return line - .split_whitespace() - .nth(1) - .and_then(|s| s.parse::().ok()) - .unwrap_or(-1); - }; - } - -2 - } else { - // rand::random::() % 100 // to remove - used to fake memory data on MacOS - -3 - } -} - pub fn sizeof_val(v: &serde_json::Value) -> usize { std::mem::size_of::() + match v { @@ -516,63 +453,6 @@ pub fn sizeof_val(v: &serde_json::Value) -> usize { } } -pub async fn run_future_with_polling_update_job_poller( - job_id: Uuid, - timeout: Option, - db: &DB, - mem_peak: &mut i32, - canceled_by_ref: &mut Option, - result_f: Fut, - worker_name: &str, - w_id: &str, -) -> error::Result -where - Fut: Future>, -{ - let (tx, rx) = broadcast::channel::<()>(3); - - let update_job = update_job_poller( - job_id, - db, - mem_peak, - canceled_by_ref, - || async { 0 }, - worker_name, - w_id, - rx, - ); - - let timeout_ms = u64::try_from( - resolve_job_timeout(&db, &w_id, job_id, timeout) - .await - .0 - .as_millis(), - ) - .unwrap_or(200000); - - let rows = tokio::select! { - biased; - result = tokio::time::timeout(std::time::Duration::from_millis(timeout_ms), result_f) => result - .map_err(|e| { - tracing::error!("Query timeout: {}", e); - Error::ExecutionErr(format!("Query timeout after (>{}s)", timeout_ms/1000)) - })?, - ex = update_job, if job_id != Uuid::nil() => { - match ex { - UpdateJobPollingExit::Done => Err(Error::ExecutionErr("Job cancelled".to_string())).map_err(to_anyhow)?, - UpdateJobPollingExit::AlreadyCompleted => Err(Error::AlreadyCompleted("Job already completed".to_string())).map_err(to_anyhow)?, - } - } - }?; - drop(tx); - Ok(rows) -} - -pub enum UpdateJobPollingExit { - Done, - AlreadyCompleted, -} - pub async fn update_worker_ping_for_failed_init_script( db: &DB, worker_name: &str, @@ -594,672 +474,83 @@ pub async fn update_worker_ping_for_failed_init_script( tracing::error!("Error updating worker ping for failed init script: {e:?}"); } } - -pub async fn update_job_poller( - job_id: Uuid, - db: &DB, - mem_peak: &mut i32, - canceled_by_ref: &mut Option, - get_mem: F, - worker_name: &str, - w_id: &str, - mut rx: broadcast::Receiver<()>, -) -> UpdateJobPollingExit -where - F: Fn() -> Fut, - Fut: Future, -{ - let update_job_interval = Duration::from_millis(500); - - let db = db.clone(); - - let mut interval = interval(update_job_interval); - interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - - let mut i = 0; - - #[cfg(feature = "enterprise")] - let mut memory_metric_id: Result = - Err(Error::NotFound("not yet initialized".to_string())); - - loop { - tokio::select!( - _ = rx.recv() => break, - _ = interval.tick() => { - // update the last_ping column every 5 seconds - i+=1; - if i == 1 || i % 10 == 0 { - let memory_usage = get_worker_memory_usage(); - let wm_memory_usage = get_windmill_memory_usage(); - tracing::info!("job {job_id} on {worker_name} in {w_id} worker memory snapshot {}kB/{}kB", memory_usage.unwrap_or_default()/1024, wm_memory_usage.unwrap_or_default()/1024); - if job_id != Uuid::nil() { - sqlx::query!( - "UPDATE worker_ping SET ping_at = now(), current_job_id = $1, current_job_workspace_id = $2, memory_usage = $3, wm_memory_usage = $4 WHERE worker = $5", - &job_id, - &w_id, - memory_usage, - wm_memory_usage, - &worker_name - ) - .execute(&db) - .await - .expect("update worker ping"); - } - } - let current_mem = get_mem().await; - if current_mem > *mem_peak { - *mem_peak = current_mem - } - tracing::info!("job {job_id} on {worker_name} in {w_id} still running. mem: {current_mem}kB, peak mem: {mem_peak}kB"); - - - let update_job_row = i == 2 || (!*SLOW_LOGS && (i < 20 || (i < 120 && i % 5 == 0) || i % 10 == 0)) || i % 20 == 0; - if update_job_row { - #[cfg(feature = "enterprise")] - { - if job_id != Uuid::nil() { - - // tracking metric starting at i >= 2 b/c first point it useless and we don't want to track metric for super fast jobs - if i == 2 { - memory_metric_id = job_metrics::register_metric_for_job( - &db, - w_id.to_string(), - job_id, - "memory_kb".to_string(), - job_metrics::MetricKind::TimeseriesInt, - Some("Job Memory Footprint (kB)".to_string()), - ) - .await; - } - if let Ok(ref metric_id) = memory_metric_id { - if let Err(err) = job_metrics::record_metric(&db, w_id.to_string(), job_id, metric_id.to_owned(), job_metrics::MetricNumericValue::Integer(current_mem)).await { - tracing::error!("Unable to save memory stat for job {} in workspace {}. Error was: {:?}", job_id, w_id, err); - } - } - } - } - if job_id != Uuid::nil() { - let (canceled, canceled_by, canceled_reason, already_completed) = sqlx::query_as::<_, (bool, Option, Option, bool)>("UPDATE queue SET mem_peak = $1, last_ping = now() WHERE id = $2 RETURNING canceled, canceled_by, canceled_reason, false") - .bind(*mem_peak) - .bind(job_id) - .fetch_optional(&db) - .await - .unwrap_or_else(|e| { - tracing::error!(%e, "error updating job {job_id}: {e:#}"); - Some((false, None, None, false)) - }) - .unwrap_or_else(|| { - // if the job is not in queue, it can only be in the completed_job so it is already complete - (false, None, None, true) - }); - if already_completed { - return UpdateJobPollingExit::AlreadyCompleted - } - if canceled { - canceled_by_ref.replace(CanceledBy { - username: canceled_by.clone(), - reason: canceled_reason.clone(), - }); - break - } - } - } - }, - ); - } - tracing::info!("job {job_id} finished"); - - UpdateJobPollingExit::Done +pub struct OccupancyMetrics { + pub running_job_started_at: Option, + pub total_duration_of_running_jobs: f32, + pub worker_occupancy_rate_history: Vec<(f32, f32)>, + pub start_time: Instant, } -pub enum CompactLogs { - NotEE, - NoS3, - S3, -} - -async fn compact_logs( - job_id: Uuid, - w_id: &str, - db: &DB, - nlogs: String, - total_size: Arc, - compact_kind: CompactLogs, - _worker_name: &str, -) -> error::Result<(String, String)> { - let mut prev_logs = sqlx::query_scalar!( - "SELECT logs FROM job_logs WHERE job_id = $1 AND workspace_id = $2", - job_id, - w_id - ) - .fetch_optional(db) - .await? - .flatten() - .unwrap_or_default(); - let size = prev_logs.char_indices().count() as i32; - let nlogs_len = nlogs.char_indices().count(); - let to_keep_in_db = usize::max( - usize::min(nlogs_len, 3000), - nlogs_len % LARGE_LOG_THRESHOLD_SIZE, - ); - let extra_split = to_keep_in_db < nlogs_len; - let stored_in_storage_len = if extra_split { - nlogs_len - to_keep_in_db - } else { - 0 - }; - let extra_to_newline = nlogs - .chars() - .skip(stored_in_storage_len) - .find_position(|x| x.is_line_break()) - .map(|(i, _)| i) - .unwrap_or(to_keep_in_db); - let stored_in_storage_to_newline = stored_in_storage_len + extra_to_newline; - - let (append_to_storage, stored_in_db) = if extra_split { - if stored_in_storage_to_newline == nlogs.len() { - (nlogs.as_ref(), "".to_string()) - } else { - let split_idx = nlogs - .char_indices() - .nth(stored_in_storage_to_newline) - .map(|(i, _)| i) - .unwrap_or(0); - let (append_to_storage, stored_in_db) = nlogs.split_at(split_idx); - // tracing::error!("{append_to_storage} ||||| {stored_in_db}"); - // tracing::error!( - // "{:?} {:?} {} {}", - // excess_prev_logs.lines().last(), - // current_logs.lines().next(), - // split_idx, - // excess_size_modulo - // ); - (append_to_storage, stored_in_db.to_string()) - } - } else { - // tracing::error!("{:?}", nlogs.lines().last()); - ("", nlogs.to_string()) - }; - - let new_size_with_excess = size + stored_in_storage_to_newline as i32; - - let new_size = total_size.fetch_add( - new_size_with_excess as u32, - std::sync::atomic::Ordering::SeqCst, - ) + new_size_with_excess as u32; - - let path = format!( - "logs/{job_id}/{}_{new_size}.txt", - chrono::Utc::now().timestamp_millis() - ); - - let mut new_current_logs = match compact_kind { - CompactLogs::NoS3 => format!("\n[windmill] No object storage set in instance settings. Previous logs have been saved to disk at {path}"), - CompactLogs::S3 => format!("\n[windmill] Previous logs have been saved to object storage at {path}"), - CompactLogs::NotEE => format!("\n[windmill] Previous logs have been saved to disk at {path}"), - }; - new_current_logs.push_str(&stored_in_db); - - sqlx::query!( - "UPDATE job_logs SET logs = $1, log_offset = $2, - log_file_index = array_append(coalesce(log_file_index, array[]::text[]), $3) - WHERE workspace_id = $4 AND job_id = $5", - new_current_logs, - new_size as i32, - path, - w_id, - job_id - ) - .execute(db) - .await?; - prev_logs.push_str(&append_to_storage); - - return Ok((prev_logs, path)); -} - -async fn default_disk_log_storage( - job_id: Uuid, - w_id: &str, - db: &DB, - nlogs: String, - total_size: Arc, - compact_kind: CompactLogs, - worker_name: &str, -) { - match compact_logs( - job_id, - &w_id, - &db, - nlogs, - total_size, - compact_kind, - worker_name, - ) - .await - { - Err(e) => tracing::error!("Could not compact logs for job {job_id}: {e:?}",), - Ok((prev_logs, path)) => { - let path = format!("{}/{}", TMP_DIR, path); - let splitted = &path.split("/").collect_vec(); - tokio::fs::create_dir_all(splitted.into_iter().take(splitted.len() - 1).join("/")) - .await - .map_err(|e| { - tracing::error!("Could not create logs directory: {e:?}",); - e - }) - .ok(); - let created = tokio::fs::File::create(&path).await; - if let Err(e) = created { - tracing::error!("Could not create logs file {path}: {e:?}",); - return; - } - if let Err(e) = tokio::fs::write(&path, prev_logs).await { - tracing::error!("Could not write to logs file {path}: {e:?}"); - } else { - tracing::info!("Logs length of {job_id} has exceeded a threshold. Previous logs have been saved to disk at {path}"); - } +impl OccupancyMetrics { + pub fn new(start_time: Instant) -> Self { + OccupancyMetrics { + running_job_started_at: None, + total_duration_of_running_jobs: 0.0, + worker_occupancy_rate_history: Vec::new(), + start_time, } } -} -async fn append_job_logs( - job_id: Uuid, - w_id: String, - logs: String, - db: DB, - must_compact_logs: bool, - total_size: Arc, - worker_name: String, -) -> () { - if must_compact_logs { - #[cfg(all(feature = "enterprise", feature = "parquet"))] - if let Some(os) = OBJECT_STORE_CACHE_SETTINGS.read().await.clone() { - match compact_logs( - job_id, - &w_id, - &db, - logs, - total_size, - CompactLogs::S3, - &worker_name, - ) - .await - { - Err(e) => tracing::error!("Could not compact logs for job {job_id}: {e:?}",), - Ok((prev_logs, path)) => { - tracing::info!("Logs length of {job_id} has exceeded a threshold. Previous logs have been saved to object storage at {path}"); - let path2 = path.clone(); - if let Err(e) = os - .put(&Path::from(path), prev_logs.to_string().into_bytes().into()) - .await - { - tracing::error!("Could not save logs to s3: {e:?}"); - } - tracing::info!("Logs of {job_id} saved to object storage at {path2}"); - } - } - } else { - default_disk_log_storage( - job_id, - &w_id, - &db, - logs, - total_size, - CompactLogs::NoS3, - &worker_name, - ) - .await; - } - - #[cfg(not(all(feature = "enterprise", feature = "parquet")))] - { - default_disk_log_storage( - job_id, - &w_id, - &db, - logs, - total_size, - CompactLogs::NotEE, - &worker_name, - ) - .await; - } - } else { - append_logs(&job_id, w_id, logs, db).await; - } -} - -pub const LARGE_LOG_THRESHOLD_SIZE: usize = 9000; -/// - wait until child exits and return with exit status -/// - read lines from stdout and stderr and append them to the "queue"."logs" -/// quitting early if output exceedes MAX_LOG_SIZE characters (not bytes) -/// - update the `last_line` and `logs` strings with the program output -/// - update "queue"."last_ping" every five seconds -/// - kill process if we exceed timeout or "queue"."canceled" is set -#[tracing::instrument(level = "trace", skip_all)] -pub async fn handle_child( - job_id: &Uuid, - db: &Pool, - mem_peak: &mut i32, - canceled_by_ref: &mut Option, - mut child: Child, - nsjail: bool, - worker: &str, - w_id: &str, - child_name: &str, - custom_timeout: Option, - sigterm: bool, -) -> error::Result<()> { - let start = Instant::now(); - - let pid = child.id(); - #[cfg(target_os = "linux")] - if let Some(pid) = pid { - //set the highest oom priority - if let Some(mut file) = File::create(format!("/proc/{pid}/oom_score_adj")) - .await - .map_err(|e| { - tracing::error!("Could not create oom_score_file to pid {pid}: {e:#}"); - e - }) - .ok() - { - let _ = file.write_all(b"1000").await; - let _ = file.sync_all().await; - } - } else { - tracing::info!("could not get child pid"); - } - let (set_too_many_logs, mut too_many_logs) = watch::channel::(false); - let (tx, rx) = broadcast::channel::<()>(3); - let mut rx2 = tx.subscribe(); - - let output = child_joined_output_stream(&mut child); - - let job_id = job_id.clone(); - - /* the cancellation future is polled on by `wait_on_child` while - * waiting for the child to exit normally */ - let update_job = update_job_poller( - job_id, - db, - mem_peak, - canceled_by_ref, - || get_mem_peak(pid, nsjail), - worker, - w_id, - rx, - ); - - #[derive(PartialEq, Debug)] - enum KillReason { - TooManyLogs, - Timeout, - Cancelled, - AlreadyCompleted, - } - - let (timeout_duration, timeout_warn_msg) = - resolve_job_timeout(&db, w_id, job_id, custom_timeout).await; - if let Some(msg) = timeout_warn_msg { - append_logs(&job_id, w_id, msg.as_str(), db).await; - } - - /* a future that completes when the child process exits */ - let wait_on_child = async { - let db = db.clone(); - - let kill_reason = tokio::select! { - biased; - result = child.wait() => return result.map(Ok), - Ok(()) = too_many_logs.changed() => KillReason::TooManyLogs, - _ = sleep(timeout_duration) => KillReason::Timeout, - ex = update_job, if job_id != Uuid::nil() => match ex { - UpdateJobPollingExit::Done => KillReason::Cancelled, - UpdateJobPollingExit::AlreadyCompleted => KillReason::AlreadyCompleted, - }, - }; - tx.send(()).expect("rx should never be dropped"); - drop(tx); - - let set_reason = async { - if kill_reason == KillReason::Timeout { - if let Err(err) = sqlx::query( - r#" - UPDATE queue - SET canceled = true - , canceled_by = 'timeout' - , canceled_reason = $1 - WHERE id = $2 - "#, - ) - .bind(format!("duration > {}", timeout_duration.as_secs())) - .bind(job_id) - .execute(&db) - .await + pub fn update_occupancy_metrics(&mut self) -> (f32, Option, Option, Option) { + let metrics = self; + let current_occupied_duration = metrics + .running_job_started_at + .map(|started_at| started_at.elapsed().as_secs_f32()) + .unwrap_or(0.0); + let total_occupation = metrics.total_duration_of_running_jobs + current_occupied_duration; + + let elapsed = metrics.start_time.elapsed().as_secs_f32(); + + let (occupancy_rate_15s, occupancy_rate_5m, occupancy_rate_30m) = + if !metrics.worker_occupancy_rate_history.is_empty() { + let mut total_occupation_15s = 0.0; + let mut total_occupation_5m = 0.0; + let mut total_occupation_30m = 0.0; + let mut index30m = 0; + for (i, (past_total_occupation, time)) in + metrics.worker_occupancy_rate_history.iter().enumerate() { - tracing::error!(%job_id, %err, "error setting cancelation reason for job {job_id}: {err}"); - } - } - }; - - if let Some(id) = child.id() { - if *MAX_WAIT_FOR_SIGINT > 0 { - #[cfg(any(target_os = "linux", target_os = "macos"))] - signal::kill(Pid::from_raw(id as i32), Signal::SIGINT).unwrap(); - - for _ in 0..*MAX_WAIT_FOR_SIGINT { - if child.try_wait().is_ok_and(|x| x.is_some()) { - break; + let diff = elapsed - time; + if diff < 1800.0 && total_occupation_30m == 0.0 { + total_occupation_30m = (total_occupation - past_total_occupation) / diff; + index30m = i; } - sleep(Duration::from_secs(1)).await; - } - if child.try_wait().is_ok_and(|x| x.is_some()) { - set_reason.await; - return Ok(Err(kill_reason)); - } - } - if sigterm { - #[cfg(any(target_os = "linux", target_os = "macos"))] - signal::kill(Pid::from_raw(id as i32), Signal::SIGTERM).unwrap(); - - for _ in 0..*MAX_WAIT_FOR_SIGTERM { - if child.try_wait().is_ok_and(|x| x.is_some()) { + if diff < 300.0 && total_occupation_5m == 0.0 { + total_occupation_5m = (total_occupation - past_total_occupation) / diff; + } + if diff < 15.0 { + total_occupation_15s = (total_occupation - past_total_occupation) / diff; break; } - sleep(Duration::from_secs(1)).await; - } - if child.try_wait().is_ok_and(|x| x.is_some()) { - set_reason.await; - return Ok(Err(kill_reason)); } - } - } - /* send SIGKILL and reap child process */ - let (_, kill) = future::join(set_reason, child.kill()).await; - kill.map(|()| Err(kill_reason)) - }; - - /* a future that reads output from the child and appends to the database */ - let lines = async move { - let max_log_size = if *CLOUD_HOSTED { - MAX_RESULT_SIZE - } else { - usize::MAX - }; + //drop all elements before the oldest one in 30m windows + metrics.worker_occupancy_rate_history.drain(..index30m); - /* log_remaining is zero when output limit was reached */ - let mut log_remaining = if *CLOUD_HOSTED { - max_log_size - } else { - usize::MAX - }; - let mut result = io::Result::Ok(()); - let mut output = output.take_until(async { - let _ = rx2.recv().await; - //wait at most 50ms after end of a script for output stream to end - tokio::time::sleep(Duration::from_millis(50)).await; - }).boxed(); - /* `do_write` resolves the task, but does not contain the Result. - * It's useful to know if the task completed. */ - let (mut do_write, mut write_result) = tokio::spawn(ready(())).remote_handle(); - - let mut log_total_size: u64 = 0; - let pg_log_total_size = Arc::new(AtomicU32::new(0)); - - while let Some(line) = output.by_ref().next().await { - - let do_write_ = do_write.shared(); - - let delay = if start.elapsed() < Duration::from_secs(10) { - Duration::from_millis(500) - } else if start.elapsed() < Duration::from_secs(60){ - Duration::from_millis(2500) - } else { - Duration::from_millis(5000) - }; - - let delay = if *SLOW_LOGS { - delay * 10 + ( + Some(total_occupation_15s), + Some(total_occupation_5m), + Some(total_occupation_30m), + ) } else { - delay + (None, None, None) }; - - let mut read_lines = stream::once(async { line }) - .chain(output.by_ref()) - /* after receiving a line, continue until some delay has passed - * _and_ the previous database write is complete */ - .take_until(future::join(sleep(delay), do_write_.clone())) - .boxed(); - - /* Read up until an error is encountered, - * handle log lines first and then the error... */ - let mut joined = String::new(); - - while let Some(line) = read_lines.next().await { - - match line { - Ok(line) => { - if line.is_empty() { - continue; - } - append_with_limit(&mut joined, &line, &mut log_remaining); - if log_remaining == 0 { - tracing::info!(%job_id, "Too many logs lines for job {job_id}"); - let _ = set_too_many_logs.send(true); - joined.push_str(&format!( - "Job logs or result reached character limit of {MAX_RESULT_SIZE}; killing job." - )); - /* stop reading and drop our streams fairly quickly */ - break; - } - } - Err(err) => { - result = Err(err); - break; - } - } - } - - - /* Ensure the last flush completed before starting a new one. - * - * This shouldn't pause since `take_until()` reads lines until `do_write` - * resolves. We only stop reading lines before `take_until()` resolves if we reach - * EOF or a read error. In those cases, waiting on a database query to complete is - * fine because we're done. */ - - if let Some(Ok(p)) = do_write_ - .then(|()| write_result) - .await - .err() - .map(|err| err.try_into_panic()) - { - panic::resume_unwind(p); - } - - - let joined_len = joined.len() as u64; - log_total_size += joined_len; - let compact_logs = log_total_size > LARGE_LOG_THRESHOLD_SIZE as u64; - if compact_logs { - log_total_size = 0; - } - - let worker_name = worker.to_string(); - let w_id2 = w_id.to_string(); - (do_write, write_result) = tokio::spawn(append_job_logs(job_id, w_id2, joined, db.clone(), compact_logs, pg_log_total_size.clone(), worker_name)).remote_handle(); - - - - if let Err(err) = result { - tracing::error!(%job_id, %err, "error reading output for job {job_id} '{child_name}': {err}"); - break; - } - - if *set_too_many_logs.borrow() { - break; - } - } - - /* drop our end of the pipe */ - drop(output); - - if let Some(Ok(p)) = do_write - .then(|()| write_result) - .await - .err() - .map(|err| err.try_into_panic()) - { - panic::resume_unwind(p); - } - }.instrument(trace_span!("child_lines")); - - let (wait_result, _) = tokio::join!(wait_on_child, lines); - - let success = wait_result.is_ok() - && wait_result.as_ref().unwrap().is_ok() - && wait_result.as_ref().unwrap().as_ref().unwrap().success(); - tracing::info!(%job_id, %success, %mem_peak, %worker, "child process '{child_name}' took {}ms", start.elapsed().as_millis()); - - match wait_result { - _ if *too_many_logs.borrow() => Err(Error::ExecutionErr(format!( - "logs or result reached limit. (current max size: {MAX_RESULT_SIZE} characters)" - ))), - Ok(Ok(status)) => process_status(status), - Ok(Err(kill_reason)) => match kill_reason { - KillReason::AlreadyCompleted => { - Err(Error::AlreadyCompleted("Job already completed".to_string())) - } - _ => Err(Error::ExecutionErr(format!( - "job process killed because {kill_reason:#?}" - ))), - }, - Err(err) => Err(Error::ExecutionErr(format!("job process io error: {err}"))), + let occupancy_rate = total_occupation / elapsed; + + //push the current occupancy rate and the timestamp + metrics + .worker_occupancy_rate_history + .push((total_occupation, elapsed)); + + ( + occupancy_rate, + occupancy_rate_15s, + occupancy_rate_5m, + occupancy_rate_30m, + ) } } -pub fn process_status(status: ExitStatus) -> error::Result<()> { - if status.success() { - Ok(()) - } else if let Some(code) = status.code() { - Err(error::Error::ExitStatus(code)) - } else { - #[cfg(any(target_os = "linux", target_os = "macos"))] - return Err(error::Error::ExecutionErr(format!( - "process terminated by signal: {:#?}, stopped_signal: {:#?}, core_dumped: {}", - status.signal(), - status.stopped_signal(), - status.core_dumped() - ))); - - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - return Err(error::Error::ExecutionErr(String::from( - "process terminated by signal", - ))); - } -} pub async fn start_child_process(mut cmd: Command, executable: &str) -> Result { return cmd .spawn() @@ -1324,80 +615,6 @@ pub async fn resolve_job_timeout( } } -/// takes stdout and stderr from Child, panics if either are not present -/// -/// builds a stream joining both stdout and stderr each read line by line -fn child_joined_output_stream( - child: &mut Child, -) -> impl stream::FusedStream> { - let stderr = child - .stderr - .take() - .expect("child did not have a handle to stdout"); - - let stdout = child - .stdout - .take() - .expect("child did not have a handle to stdout"); - - let stdout = BufReader::new(stdout).lines(); - let stderr = BufReader::new(stderr).lines(); - stream::select(lines_to_stream(stderr), lines_to_stream(stdout)) -} - -pub fn lines_to_stream( - mut lines: tokio::io::Lines, -) -> impl futures::Stream> { - stream::poll_fn(move |cx| { - std::pin::Pin::new(&mut lines) - .poll_next_line(cx) - .map(|result| result.transpose()) - }) -} - -lazy_static::lazy_static! { - static ref RE_00: Regex = Regex::new('\u{00}'.to_string().as_str()).unwrap(); - pub static ref NO_LOGS_AT_ALL: bool = std::env::var("NO_LOGS_AT_ALL").ok().is_some_and(|x| x == "1" || x == "true"); - pub static ref SLOW_LOGS: bool = std::env::var("SLOW_LOGS").ok().is_some_and(|x| x == "1" || x == "true"); -} -// as a detail, `BufReader::lines()` removes \n and \r\n from the strings it yields, -// so this pushes \n to thd destination string in each call -fn append_with_limit(dst: &mut String, src: &str, limit: &mut usize) { - if *NO_LOGS_AT_ALL { - return; - } - let src_str; - let src = { - src_str = RE_00.replace_all(src, ""); - src_str.as_ref() - }; - if !*CLOUD_HOSTED { - dst.push('\n'); - dst.push_str(&src); - return; - } else { - if *limit > 0 { - dst.push('\n'); - } - *limit -= 1; - } - - let src_len = src.chars().count(); - if src_len <= *limit { - dst.push_str(&src); - *limit -= src_len; - } else { - let byte_pos = src - .char_indices() - .skip(*limit) - .next() - .map(|(byte_pos, _)| byte_pos) - .unwrap_or(0); - dst.push_str(&src[0..byte_pos]); - *limit = 0; - } -} - pub async fn hash_args( _db: &DB, _client: &AuthedClient, diff --git a/backend/windmill-worker/src/dedicated_worker.rs b/backend/windmill-worker/src/dedicated_worker.rs index fbbc161de9500..08d2ccdc5427d 100644 --- a/backend/windmill-worker/src/dedicated_worker.rs +++ b/backend/windmill-worker/src/dedicated_worker.rs @@ -29,8 +29,7 @@ use windmill_queue::append_logs; use anyhow::Context; use crate::{ - common::{process_status, start_child_process}, - JobCompleted, JobCompletedSender, MAX_BUFFERED_DEDICATED_JOBS, + common::start_child_process, JobCompleted, JobCompletedSender, MAX_BUFFERED_DEDICATED_JOBS, }; use futures::{future, Future}; @@ -78,6 +77,8 @@ pub async fn handle_dedicated_process( ) -> std::result::Result<(), error::Error> { //do not cache local dependencies + use crate::handle_child::process_status; + let mut child = { let mut cmd = Command::new(command_path); cmd.current_dir(job_dir) diff --git a/backend/windmill-worker/src/deno_executor.rs b/backend/windmill-worker/src/deno_executor.rs index 146d67be0748a..fbb875a0120c2 100644 --- a/backend/windmill-worker/src/deno_executor.rs +++ b/backend/windmill-worker/src/deno_executor.rs @@ -7,9 +7,10 @@ use windmill_queue::{append_logs, CanceledBy}; use crate::{ common::{ - create_args_and_out_file, get_main_override, get_reserved_variables, handle_child, - parse_npm_config, read_file, read_result, start_child_process, + create_args_and_out_file, get_main_override, get_reserved_variables, parse_npm_config, + read_file, read_result, start_child_process, OccupancyMetrics, }, + handle_child::handle_child, AuthedClientBackgroundTask, DENO_CACHE_DIR, DENO_PATH, DISABLE_NSJAIL, HOME_ENV, NPM_CONFIG_REGISTRY, PATH_ENV, TZ_ENV, }; @@ -94,6 +95,7 @@ pub async fn generate_deno_lock( w_id: &str, worker_name: &str, base_internal_url: &str, + occupancy_metrics: &mut Option<&mut OccupancyMetrics>, ) -> error::Result { let _ = write_file(job_dir, "main.ts", code)?; @@ -146,6 +148,7 @@ pub async fn generate_deno_lock( "deno cache", None, false, + occupancy_metrics, ) .await?; } else { @@ -173,6 +176,7 @@ pub async fn handle_deno_job( worker_name: &str, envs: HashMap, new_args: &mut Option>>, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result> { // let mut start = Instant::now(); let logs1 = "\n\n--- DENO CODE EXECUTION ---\n".to_string(); @@ -397,6 +401,7 @@ try {{ "deno run", job.timeout, false, + &mut Some(occupancy_metrics), ) .await?; // logs.push_str(format!("execute: {:?}\n", start.elapsed().as_millis()).as_str()); diff --git a/backend/windmill-worker/src/go_executor.rs b/backend/windmill-worker/src/go_executor.rs index 202584c653e82..b1398590052c4 100644 --- a/backend/windmill-worker/src/go_executor.rs +++ b/backend/windmill-worker/src/go_executor.rs @@ -15,9 +15,10 @@ use windmill_queue::{append_logs, CanceledBy}; use crate::{ common::{ - capitalize, create_args_and_out_file, get_reserved_variables, handle_child, read_result, - start_child_process, + capitalize, create_args_and_out_file, get_reserved_variables, read_result, + start_child_process, OccupancyMetrics, }, + handle_child::handle_child, AuthedClientBackgroundTask, DISABLE_NSJAIL, DISABLE_NUSER, GOPRIVATE, GOPROXY, GO_BIN_CACHE_DIR, GO_CACHE_DIR, HOME_ENV, NSJAIL_PATH, PATH_ENV, TZ_ENV, }; @@ -44,6 +45,7 @@ pub async fn handle_go_job( base_internal_url: &str, worker_name: &str, envs: HashMap, + occupation_metrics: &mut OccupancyMetrics, ) -> Result, Error> { //go does not like executing modules at temp root let job_dir = &format!("{job_dir}/go"); @@ -88,6 +90,7 @@ pub async fn handle_go_job( skip_tidy, worker_name, &job.workspace_id, + occupation_metrics, ) .await?; @@ -202,6 +205,7 @@ func Run(req Req) (interface{{}}, error){{ "go build", None, false, + &mut Some(occupation_metrics), ) .await?; @@ -297,6 +301,7 @@ func Run(req Req) (interface{{}}, error){{ "go run", job.timeout, false, + &mut Some(occupation_metrics), ) .await?; @@ -336,6 +341,7 @@ pub async fn install_go_dependencies( has_sum: bool, worker_name: &str, w_id: &str, + occupation_metrics: &mut OccupancyMetrics, ) -> error::Result { if !skip_go_mod { gen_go_mymod(code, job_dir).await?; @@ -359,6 +365,7 @@ pub async fn install_go_dependencies( "go init", None, false, + &mut Some(occupation_metrics), ) .await?; @@ -424,9 +431,9 @@ pub async fn install_go_dependencies( &format!("go {mod_command}"), None, false, + &mut Some(occupation_metrics), ) - .await - .map_err(|e| Error::ExecutionErr(format!("Lockfile generation failed: {e:?}")))?; + .await?; if (!new_lockfile || has_sum) && non_dep_job { return Ok("".to_string()); diff --git a/backend/windmill-worker/src/graphql_executor.rs b/backend/windmill-worker/src/graphql_executor.rs index db1d47920fe62..8fb8cf13b9f9d 100644 --- a/backend/windmill-worker/src/graphql_executor.rs +++ b/backend/windmill-worker/src/graphql_executor.rs @@ -12,7 +12,8 @@ use windmill_queue::{CanceledBy, HTTP_CLIENT}; use serde::Deserialize; -use crate::common::run_future_with_polling_update_job_poller; +use crate::common::OccupancyMetrics; +use crate::handle_child::run_future_with_polling_update_job_poller; use crate::{common::build_args_map, AuthedClientBackgroundTask}; #[derive(Deserialize)] @@ -41,6 +42,7 @@ pub async fn do_graphql( mem_peak: &mut i32, canceled_by: &mut Option, worker_name: &str, + occupation_metrics: &mut OccupancyMetrics, ) -> windmill_common::error::Result> { let args = build_args_map(job, client, db).await?.map(Json); let job_args = if args.is_some() { @@ -151,6 +153,7 @@ pub async fn do_graphql( result_f, worker_name, &job.workspace_id, + &mut Some(occupation_metrics), ) .await?; diff --git a/backend/windmill-worker/src/handle_child.rs b/backend/windmill-worker/src/handle_child.rs new file mode 100644 index 0000000000000..e71629f3cacce --- /dev/null +++ b/backend/windmill-worker/src/handle_child.rs @@ -0,0 +1,628 @@ +use futures::Future; + +#[cfg(any(target_os = "linux", target_os = "macos"))] +use nix::sys::signal::{self, Signal}; +#[cfg(any(target_os = "linux", target_os = "macos"))] +use nix::unistd::Pid; + +use sqlx::{Pool, Postgres}; +use tokio::fs::File; +use windmill_common::error::to_anyhow; + +use windmill_common::error::{self, Error}; + +use windmill_common::worker::{get_windmill_memory_usage, get_worker_memory_usage, CLOUD_HOSTED}; + +use anyhow::Result; +use windmill_queue::{append_logs, CanceledBy}; + +#[cfg(any(target_os = "linux", target_os = "macos"))] +use std::os::unix::process::ExitStatusExt; + +use std::process::ExitStatus; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; +use std::{io, panic, time::Duration}; + +use tracing::{trace_span, Instrument}; +use uuid::Uuid; +use windmill_common::DB; + +#[cfg(feature = "enterprise")] +use windmill_common::job_metrics; + +use tokio::{ + io::{AsyncBufReadExt, AsyncWriteExt, BufReader}, + process::Child, + sync::{broadcast, watch}, + time::{interval, sleep, Instant, MissedTickBehavior}, +}; + +use futures::{ + future::{self, ready, FutureExt}, + stream, StreamExt, +}; + +use crate::common::{resolve_job_timeout, OccupancyMetrics}; +use crate::job_logger::{append_job_logs, append_with_limit, LARGE_LOG_THRESHOLD_SIZE}; +use crate::{MAX_RESULT_SIZE, MAX_WAIT_FOR_SIGINT, MAX_WAIT_FOR_SIGTERM}; + +lazy_static::lazy_static! { + pub static ref SLOW_LOGS: bool = std::env::var("SLOW_LOGS").ok().is_some_and(|x| x == "1" || x == "true"); +} +/// - wait until child exits and return with exit status +/// - read lines from stdout and stderr and append them to the "queue"."logs" +/// quitting early if output exceedes MAX_LOG_SIZE characters (not bytes) +/// - update the `last_line` and `logs` strings with the program output +/// - update "queue"."last_ping" every five seconds +/// - kill process if we exceed timeout or "queue"."canceled" is set +#[tracing::instrument(level = "trace", skip_all)] +pub async fn handle_child( + job_id: &Uuid, + db: &Pool, + mem_peak: &mut i32, + canceled_by_ref: &mut Option, + mut child: Child, + nsjail: bool, + worker: &str, + w_id: &str, + child_name: &str, + custom_timeout: Option, + sigterm: bool, + occupancy_metrics: &mut Option<&mut OccupancyMetrics>, +) -> error::Result<()> { + let start = Instant::now(); + + let pid = child.id(); + #[cfg(target_os = "linux")] + if let Some(pid) = pid { + //set the highest oom priority + if let Some(mut file) = File::create(format!("/proc/{pid}/oom_score_adj")) + .await + .map_err(|e| { + tracing::error!("Could not create oom_score_file to pid {pid}: {e:#}"); + e + }) + .ok() + { + let _ = file.write_all(b"1000").await; + let _ = file.sync_all().await; + } + } else { + tracing::info!("could not get child pid"); + } + let (set_too_many_logs, mut too_many_logs) = watch::channel::(false); + let (tx, rx) = broadcast::channel::<()>(3); + let mut rx2 = tx.subscribe(); + + let output = child_joined_output_stream(&mut child); + + let job_id = job_id.clone(); + + /* the cancellation future is polled on by `wait_on_child` while + * waiting for the child to exit normally */ + let update_job = update_job_poller( + job_id, + db, + mem_peak, + canceled_by_ref, + || get_mem_peak(pid, nsjail), + worker, + w_id, + rx, + occupancy_metrics, + ); + + #[derive(PartialEq, Debug)] + enum KillReason { + TooManyLogs, + Timeout, + Cancelled, + AlreadyCompleted, + } + + let (timeout_duration, timeout_warn_msg) = + resolve_job_timeout(&db, w_id, job_id, custom_timeout).await; + if let Some(msg) = timeout_warn_msg { + append_logs(&job_id, w_id, msg.as_str(), db).await; + } + + /* a future that completes when the child process exits */ + let wait_on_child = async { + let db = db.clone(); + + let kill_reason = tokio::select! { + biased; + result = child.wait() => return result.map(Ok), + Ok(()) = too_many_logs.changed() => KillReason::TooManyLogs, + _ = sleep(timeout_duration) => KillReason::Timeout, + ex = update_job, if job_id != Uuid::nil() => match ex { + UpdateJobPollingExit::Done => KillReason::Cancelled, + UpdateJobPollingExit::AlreadyCompleted => KillReason::AlreadyCompleted, + }, + }; + tx.send(()).expect("rx should never be dropped"); + drop(tx); + + let set_reason = async { + if kill_reason == KillReason::Timeout { + if let Err(err) = sqlx::query( + r#" + UPDATE queue + SET canceled = true + , canceled_by = 'timeout' + , canceled_reason = $1 + WHERE id = $2 + "#, + ) + .bind(format!("duration > {}", timeout_duration.as_secs())) + .bind(job_id) + .execute(&db) + .await + { + tracing::error!(%job_id, %err, "error setting cancelation reason for job {job_id}: {err}"); + } + } + }; + + if let Some(id) = child.id() { + if *MAX_WAIT_FOR_SIGINT > 0 { + #[cfg(any(target_os = "linux", target_os = "macos"))] + signal::kill(Pid::from_raw(id as i32), Signal::SIGINT).unwrap(); + + for _ in 0..*MAX_WAIT_FOR_SIGINT { + if child.try_wait().is_ok_and(|x| x.is_some()) { + break; + } + sleep(Duration::from_secs(1)).await; + } + if child.try_wait().is_ok_and(|x| x.is_some()) { + set_reason.await; + return Ok(Err(kill_reason)); + } + } + if sigterm { + #[cfg(any(target_os = "linux", target_os = "macos"))] + signal::kill(Pid::from_raw(id as i32), Signal::SIGTERM).unwrap(); + + for _ in 0..*MAX_WAIT_FOR_SIGTERM { + if child.try_wait().is_ok_and(|x| x.is_some()) { + break; + } + sleep(Duration::from_secs(1)).await; + } + if child.try_wait().is_ok_and(|x| x.is_some()) { + set_reason.await; + return Ok(Err(kill_reason)); + } + } + } + /* send SIGKILL and reap child process */ + let (_, kill) = future::join(set_reason, child.kill()).await; + kill.map(|()| Err(kill_reason)) + }; + + /* a future that reads output from the child and appends to the database */ + let lines = async move { + + let max_log_size = if *CLOUD_HOSTED { + MAX_RESULT_SIZE + } else { + usize::MAX + }; + + /* log_remaining is zero when output limit was reached */ + let mut log_remaining = if *CLOUD_HOSTED { + max_log_size + } else { + usize::MAX + }; + let mut result = io::Result::Ok(()); + let mut output = output.take_until(async { + let _ = rx2.recv().await; + //wait at most 50ms after end of a script for output stream to end + tokio::time::sleep(Duration::from_millis(50)).await; + }).boxed(); + /* `do_write` resolves the task, but does not contain the Result. + * It's useful to know if the task completed. */ + let (mut do_write, mut write_result) = tokio::spawn(ready(())).remote_handle(); + + let mut log_total_size: u64 = 0; + let pg_log_total_size = Arc::new(AtomicU32::new(0)); + + while let Some(line) = output.by_ref().next().await { + + let do_write_ = do_write.shared(); + + let delay = if start.elapsed() < Duration::from_secs(10) { + Duration::from_millis(500) + } else if start.elapsed() < Duration::from_secs(60){ + Duration::from_millis(2500) + } else { + Duration::from_millis(5000) + }; + + let delay = if *SLOW_LOGS { + delay * 10 + } else { + delay + }; + + let mut read_lines = stream::once(async { line }) + .chain(output.by_ref()) + /* after receiving a line, continue until some delay has passed + * _and_ the previous database write is complete */ + .take_until(future::join(sleep(delay), do_write_.clone())) + .boxed(); + + /* Read up until an error is encountered, + * handle log lines first and then the error... */ + let mut joined = String::new(); + + while let Some(line) = read_lines.next().await { + + match line { + Ok(line) => { + if line.is_empty() { + continue; + } + append_with_limit(&mut joined, &line, &mut log_remaining); + if log_remaining == 0 { + tracing::info!(%job_id, "Too many logs lines for job {job_id}"); + let _ = set_too_many_logs.send(true); + joined.push_str(&format!( + "Job logs or result reached character limit of {MAX_RESULT_SIZE}; killing job." + )); + /* stop reading and drop our streams fairly quickly */ + break; + } + } + Err(err) => { + result = Err(err); + break; + } + } + } + + + /* Ensure the last flush completed before starting a new one. + * + * This shouldn't pause since `take_until()` reads lines until `do_write` + * resolves. We only stop reading lines before `take_until()` resolves if we reach + * EOF or a read error. In those cases, waiting on a database query to complete is + * fine because we're done. */ + + if let Some(Ok(p)) = do_write_ + .then(|()| write_result) + .await + .err() + .map(|err| err.try_into_panic()) + { + panic::resume_unwind(p); + } + + + let joined_len = joined.len() as u64; + log_total_size += joined_len; + let compact_logs = log_total_size > LARGE_LOG_THRESHOLD_SIZE as u64; + if compact_logs { + log_total_size = 0; + } + + let worker_name = worker.to_string(); + let w_id2 = w_id.to_string(); + (do_write, write_result) = tokio::spawn(append_job_logs(job_id, w_id2, joined, db.clone(), compact_logs, pg_log_total_size.clone(), worker_name)).remote_handle(); + + + + if let Err(err) = result { + tracing::error!(%job_id, %err, "error reading output for job {job_id} '{child_name}': {err}"); + break; + } + + if *set_too_many_logs.borrow() { + break; + } + } + + /* drop our end of the pipe */ + drop(output); + + if let Some(Ok(p)) = do_write + .then(|()| write_result) + .await + .err() + .map(|err| err.try_into_panic()) + { + panic::resume_unwind(p); + } + }.instrument(trace_span!("child_lines")); + + let (wait_result, _) = tokio::join!(wait_on_child, lines); + + let success = wait_result.is_ok() + && wait_result.as_ref().unwrap().is_ok() + && wait_result.as_ref().unwrap().as_ref().unwrap().success(); + tracing::info!(%job_id, %success, %mem_peak, %worker, "child process '{child_name}' took {}ms", start.elapsed().as_millis()); + + match wait_result { + _ if *too_many_logs.borrow() => Err(Error::ExecutionErr(format!( + "logs or result reached limit. (current max size: {MAX_RESULT_SIZE} characters)" + ))), + Ok(Ok(status)) => process_status(status), + Ok(Err(kill_reason)) => match kill_reason { + KillReason::AlreadyCompleted => { + Err(Error::AlreadyCompleted("Job already completed".to_string())) + } + _ => Err(Error::ExecutionErr(format!( + "job process killed because {kill_reason:#?}" + ))), + }, + Err(err) => Err(Error::ExecutionErr(format!("job process io error: {err}"))), + } +} + +async fn get_mem_peak(pid: Option, nsjail: bool) -> i32 { + if pid.is_none() { + return -1; + } + let pid = if nsjail { + // This is a bit hacky, but the process id of the nsjail process is the pid of nsjail + 1. + // Ideally, we would get the number from fork() itself. This works in MOST cases. + pid.unwrap() + 1 + } else { + pid.unwrap() + }; + + if let Ok(file) = File::open(format!("/proc/{}/status", pid)).await { + let mut lines = BufReader::new(file).lines(); + while let Some(line) = lines.next_line().await.unwrap_or(None) { + if line.starts_with("VmHWM:") { + return line + .split_whitespace() + .nth(1) + .and_then(|s| s.parse::().ok()) + .unwrap_or(-1); + }; + } + -2 + } else { + // rand::random::() % 100 // to remove - used to fake memory data on MacOS + -3 + } +} + +pub async fn run_future_with_polling_update_job_poller( + job_id: Uuid, + timeout: Option, + db: &DB, + mem_peak: &mut i32, + canceled_by_ref: &mut Option, + result_f: Fut, + worker_name: &str, + w_id: &str, + occupancy_metrics: &mut Option<&mut OccupancyMetrics>, +) -> error::Result +where + Fut: Future>, +{ + let (tx, rx) = broadcast::channel::<()>(3); + + let update_job = update_job_poller( + job_id, + db, + mem_peak, + canceled_by_ref, + || async { 0 }, + worker_name, + w_id, + rx, + occupancy_metrics, + ); + + let timeout_ms = u64::try_from( + resolve_job_timeout(&db, &w_id, job_id, timeout) + .await + .0 + .as_millis(), + ) + .unwrap_or(200000); + + let rows = tokio::select! { + biased; + result = tokio::time::timeout(std::time::Duration::from_millis(timeout_ms), result_f) => result + .map_err(|e| { + tracing::error!("Query timeout: {}", e); + Error::ExecutionErr(format!("Query timeout after (>{}s)", timeout_ms/1000)) + })?, + ex = update_job, if job_id != Uuid::nil() => { + match ex { + UpdateJobPollingExit::Done => Err(Error::ExecutionErr("Job cancelled".to_string())).map_err(to_anyhow)?, + UpdateJobPollingExit::AlreadyCompleted => Err(Error::AlreadyCompleted("Job already completed".to_string())).map_err(to_anyhow)?, + } + } + }?; + drop(tx); + Ok(rows) +} + +pub enum UpdateJobPollingExit { + Done, + AlreadyCompleted, +} + +pub async fn update_job_poller( + job_id: Uuid, + db: &DB, + mem_peak: &mut i32, + canceled_by_ref: &mut Option, + get_mem: F, + worker_name: &str, + w_id: &str, + mut rx: broadcast::Receiver<()>, + occupancy_metrics: &mut Option<&mut OccupancyMetrics>, +) -> UpdateJobPollingExit +where + F: Fn() -> Fut, + Fut: Future, +{ + let update_job_interval = Duration::from_millis(500); + + let db = db.clone(); + + let mut interval = interval(update_job_interval); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + + let mut i = 0; + + #[cfg(feature = "enterprise")] + let mut memory_metric_id: Result = + Err(Error::NotFound("not yet initialized".to_string())); + + loop { + tokio::select!( + _ = rx.recv() => break, + _ = interval.tick() => { + // update the last_ping column every 5 seconds + i+=1; + if i == 1 || i % 10 == 0 { + let memory_usage = get_worker_memory_usage(); + let wm_memory_usage = get_windmill_memory_usage(); + tracing::info!("job {job_id} on {worker_name} in {w_id} worker memory snapshot {}kB/{}kB", memory_usage.unwrap_or_default()/1024, wm_memory_usage.unwrap_or_default()/1024); + let occupancy = occupancy_metrics.as_mut().map(|x| x.update_occupancy_metrics()); + if job_id != Uuid::nil() { + sqlx::query!( + "UPDATE worker_ping SET ping_at = now(), current_job_id = $1, current_job_workspace_id = $2, memory_usage = $3, wm_memory_usage = $4, + occupancy_rate = $6, occupancy_rate_15s = $7, occupancy_rate_5m = $8, occupancy_rate_30m = $9 WHERE worker = $5", + &job_id, + &w_id, + memory_usage, + wm_memory_usage, + &worker_name, + occupancy.map(|x| x.0), + occupancy.and_then(|x| x.1), + occupancy.and_then(|x| x.2), + occupancy.and_then(|x| x.3), + ) + .execute(&db) + .await + .expect("update worker ping"); + } + } + let current_mem = get_mem().await; + if current_mem > *mem_peak { + *mem_peak = current_mem + } + tracing::info!("job {job_id} on {worker_name} in {w_id} still running. mem: {current_mem}kB, peak mem: {mem_peak}kB"); + + + let update_job_row = i == 2 || (!*SLOW_LOGS && (i < 20 || (i < 120 && i % 5 == 0) || i % 10 == 0)) || i % 20 == 0; + if update_job_row { + #[cfg(feature = "enterprise")] + { + if job_id != Uuid::nil() { + + // tracking metric starting at i >= 2 b/c first point it useless and we don't want to track metric for super fast jobs + if i == 2 { + memory_metric_id = job_metrics::register_metric_for_job( + &db, + w_id.to_string(), + job_id, + "memory_kb".to_string(), + job_metrics::MetricKind::TimeseriesInt, + Some("Job Memory Footprint (kB)".to_string()), + ) + .await; + } + if let Ok(ref metric_id) = memory_metric_id { + if let Err(err) = job_metrics::record_metric(&db, w_id.to_string(), job_id, metric_id.to_owned(), job_metrics::MetricNumericValue::Integer(current_mem)).await { + tracing::error!("Unable to save memory stat for job {} in workspace {}. Error was: {:?}", job_id, w_id, err); + } + } + } + } + if job_id != Uuid::nil() { + let (canceled, canceled_by, canceled_reason, already_completed) = sqlx::query_as::<_, (bool, Option, Option, bool)>("UPDATE queue SET mem_peak = $1, last_ping = now() WHERE id = $2 RETURNING canceled, canceled_by, canceled_reason, false") + .bind(*mem_peak) + .bind(job_id) + .fetch_optional(&db) + .await + .unwrap_or_else(|e| { + tracing::error!(%e, "error updating job {job_id}: {e:#}"); + Some((false, None, None, false)) + }) + .unwrap_or_else(|| { + // if the job is not in queue, it can only be in the completed_job so it is already complete + (false, None, None, true) + }); + if already_completed { + return UpdateJobPollingExit::AlreadyCompleted + } + if canceled { + canceled_by_ref.replace(CanceledBy { + username: canceled_by.clone(), + reason: canceled_reason.clone(), + }); + break + } + } + } + }, + ); + } + tracing::info!("job {job_id} finished"); + + UpdateJobPollingExit::Done +} + +/// takes stdout and stderr from Child, panics if either are not present +/// +/// builds a stream joining both stdout and stderr each read line by line +fn child_joined_output_stream( + child: &mut Child, +) -> impl stream::FusedStream> { + let stderr = child + .stderr + .take() + .expect("child did not have a handle to stdout"); + + let stdout = child + .stdout + .take() + .expect("child did not have a handle to stdout"); + + let stdout = BufReader::new(stdout).lines(); + let stderr = BufReader::new(stderr).lines(); + stream::select(lines_to_stream(stderr), lines_to_stream(stdout)) +} + +pub fn lines_to_stream( + mut lines: tokio::io::Lines, +) -> impl futures::Stream> { + stream::poll_fn(move |cx| { + std::pin::Pin::new(&mut lines) + .poll_next_line(cx) + .map(|result| result.transpose()) + }) +} + +pub fn process_status(status: ExitStatus) -> error::Result<()> { + if status.success() { + Ok(()) + } else if let Some(code) = status.code() { + Err(error::Error::ExitStatus(code)) + } else { + #[cfg(any(target_os = "linux", target_os = "macos"))] + return Err(error::Error::ExecutionErr(format!( + "process terminated by signal: {:#?}, stopped_signal: {:#?}, core_dumped: {}", + status.signal(), + status.stopped_signal(), + status.core_dumped() + ))); + + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + return Err(error::Error::ExecutionErr(String::from( + "process terminated by signal", + ))); + } +} diff --git a/backend/windmill-worker/src/job_logger.rs b/backend/windmill-worker/src/job_logger.rs new file mode 100644 index 0000000000000..32b2d0deec95e --- /dev/null +++ b/backend/windmill-worker/src/job_logger.rs @@ -0,0 +1,283 @@ +use deno_ast::swc::parser::lexer::util::CharExt; +use itertools::Itertools; + +#[cfg(all(feature = "enterprise", feature = "parquet"))] +use object_store::path::Path; +use regex::Regex; + +#[cfg(all(feature = "enterprise", feature = "parquet"))] +use windmill_common::s3_helpers::OBJECT_STORE_CACHE_SETTINGS; + +use windmill_common::error::{self}; +use windmill_common::worker::{CLOUD_HOSTED, TMP_DIR}; + +use windmill_queue::append_logs; + +use std::sync::atomic::AtomicU32; +use std::sync::Arc; + +use uuid::Uuid; +use windmill_common::DB; + +pub enum CompactLogs { + NotEE, + #[cfg(all(feature = "enterprise", feature = "parquet"))] + NoS3, + #[cfg(all(feature = "enterprise", feature = "parquet"))] + S3, +} + +async fn compact_logs( + job_id: Uuid, + w_id: &str, + db: &DB, + nlogs: String, + total_size: Arc, + compact_kind: CompactLogs, + _worker_name: &str, +) -> error::Result<(String, String)> { + let mut prev_logs = sqlx::query_scalar!( + "SELECT logs FROM job_logs WHERE job_id = $1 AND workspace_id = $2", + job_id, + w_id + ) + .fetch_optional(db) + .await? + .flatten() + .unwrap_or_default(); + let size = prev_logs.char_indices().count() as i32; + let nlogs_len = nlogs.char_indices().count(); + let to_keep_in_db = usize::max( + usize::min(nlogs_len, 3000), + nlogs_len % LARGE_LOG_THRESHOLD_SIZE, + ); + let extra_split = to_keep_in_db < nlogs_len; + let stored_in_storage_len = if extra_split { + nlogs_len - to_keep_in_db + } else { + 0 + }; + let extra_to_newline = nlogs + .chars() + .skip(stored_in_storage_len) + .find_position(|x| x.is_line_break()) + .map(|(i, _)| i) + .unwrap_or(to_keep_in_db); + let stored_in_storage_to_newline = stored_in_storage_len + extra_to_newline; + + let (append_to_storage, stored_in_db) = if extra_split { + if stored_in_storage_to_newline == nlogs.len() { + (nlogs.as_ref(), "".to_string()) + } else { + let split_idx = nlogs + .char_indices() + .nth(stored_in_storage_to_newline) + .map(|(i, _)| i) + .unwrap_or(0); + let (append_to_storage, stored_in_db) = nlogs.split_at(split_idx); + // tracing::error!("{append_to_storage} ||||| {stored_in_db}"); + // tracing::error!( + // "{:?} {:?} {} {}", + // excess_prev_logs.lines().last(), + // current_logs.lines().next(), + // split_idx, + // excess_size_modulo + // ); + (append_to_storage, stored_in_db.to_string()) + } + } else { + // tracing::error!("{:?}", nlogs.lines().last()); + ("", nlogs.to_string()) + }; + + let new_size_with_excess = size + stored_in_storage_to_newline as i32; + + let new_size = total_size.fetch_add( + new_size_with_excess as u32, + std::sync::atomic::Ordering::SeqCst, + ) + new_size_with_excess as u32; + + let path = format!( + "logs/{job_id}/{}_{new_size}.txt", + chrono::Utc::now().timestamp_millis() + ); + + let mut new_current_logs = match compact_kind { + CompactLogs::NoS3 => format!("\n[windmill] No object storage set in instance settings. Previous logs have been saved to disk at {path}"), + CompactLogs::S3 => format!("\n[windmill] Previous logs have been saved to object storage at {path}"), + CompactLogs::NotEE => format!("\n[windmill] Previous logs have been saved to disk at {path}"), + }; + new_current_logs.push_str(&stored_in_db); + + sqlx::query!( + "UPDATE job_logs SET logs = $1, log_offset = $2, + log_file_index = array_append(coalesce(log_file_index, array[]::text[]), $3) + WHERE workspace_id = $4 AND job_id = $5", + new_current_logs, + new_size as i32, + path, + w_id, + job_id + ) + .execute(db) + .await?; + prev_logs.push_str(&append_to_storage); + + return Ok((prev_logs, path)); +} + +async fn default_disk_log_storage( + job_id: Uuid, + w_id: &str, + db: &DB, + nlogs: String, + total_size: Arc, + compact_kind: CompactLogs, + worker_name: &str, +) { + match compact_logs( + job_id, + &w_id, + &db, + nlogs, + total_size, + compact_kind, + worker_name, + ) + .await + { + Err(e) => tracing::error!("Could not compact logs for job {job_id}: {e:?}",), + Ok((prev_logs, path)) => { + let path = format!("{}/{}", TMP_DIR, path); + let splitted = &path.split("/").collect_vec(); + tokio::fs::create_dir_all(splitted.into_iter().take(splitted.len() - 1).join("/")) + .await + .map_err(|e| { + tracing::error!("Could not create logs directory: {e:?}",); + e + }) + .ok(); + let created = tokio::fs::File::create(&path).await; + if let Err(e) = created { + tracing::error!("Could not create logs file {path}: {e:?}",); + return; + } + if let Err(e) = tokio::fs::write(&path, prev_logs).await { + tracing::error!("Could not write to logs file {path}: {e:?}"); + } else { + tracing::info!("Logs length of {job_id} has exceeded a threshold. Previous logs have been saved to disk at {path}"); + } + } + } +} + +pub(crate) async fn append_job_logs( + job_id: Uuid, + w_id: String, + logs: String, + db: DB, + must_compact_logs: bool, + total_size: Arc, + worker_name: String, +) -> () { + if must_compact_logs { + #[cfg(all(feature = "enterprise", feature = "parquet"))] + if let Some(os) = OBJECT_STORE_CACHE_SETTINGS.read().await.clone() { + match compact_logs( + job_id, + &w_id, + &db, + logs, + total_size, + CompactLogs::S3, + &worker_name, + ) + .await + { + Err(e) => tracing::error!("Could not compact logs for job {job_id}: {e:?}",), + Ok((prev_logs, path)) => { + tracing::info!("Logs length of {job_id} has exceeded a threshold. Previous logs have been saved to object storage at {path}"); + let path2 = path.clone(); + if let Err(e) = os + .put(&Path::from(path), prev_logs.to_string().into_bytes().into()) + .await + { + tracing::error!("Could not save logs to s3: {e:?}"); + } + tracing::info!("Logs of {job_id} saved to object storage at {path2}"); + } + } + } else { + default_disk_log_storage( + job_id, + &w_id, + &db, + logs, + total_size, + CompactLogs::NoS3, + &worker_name, + ) + .await; + } + + #[cfg(not(all(feature = "enterprise", feature = "parquet")))] + { + default_disk_log_storage( + job_id, + &w_id, + &db, + logs, + total_size, + CompactLogs::NotEE, + &worker_name, + ) + .await; + } + } else { + append_logs(&job_id, w_id, logs, db).await; + } +} + +pub const LARGE_LOG_THRESHOLD_SIZE: usize = 9000; + +lazy_static::lazy_static! { + static ref RE_00: Regex = Regex::new('\u{00}'.to_string().as_str()).unwrap(); + pub static ref NO_LOGS_AT_ALL: bool = std::env::var("NO_LOGS_AT_ALL").ok().is_some_and(|x| x == "1" || x == "true"); +} +// as a detail, `BufReader::lines()` removes \n and \r\n from the strings it yields, +// so this pushes \n to thd destination string in each call +pub fn append_with_limit(dst: &mut String, src: &str, limit: &mut usize) { + if *NO_LOGS_AT_ALL { + return; + } + let src_str; + let src = { + src_str = RE_00.replace_all(src, ""); + src_str.as_ref() + }; + if !*CLOUD_HOSTED { + dst.push('\n'); + dst.push_str(&src); + return; + } else { + if *limit > 0 { + dst.push('\n'); + } + *limit -= 1; + } + + let src_len = src.chars().count(); + if src_len <= *limit { + dst.push_str(&src); + *limit -= src_len; + } else { + let byte_pos = src + .char_indices() + .skip(*limit) + .next() + .map(|(byte_pos, _)| byte_pos) + .unwrap_or(0); + dst.push_str(&src[0..byte_pos]); + *limit = 0; + } +} diff --git a/backend/windmill-worker/src/js_eval.rs b/backend/windmill-worker/src/js_eval.rs index fb3a378861a62..24f83c86af1ef 100644 --- a/backend/windmill-worker/src/js_eval.rs +++ b/backend/windmill-worker/src/js_eval.rs @@ -40,7 +40,8 @@ use windmill_common::{error::Error, flow_status::JobResult, DB}; use windmill_queue::CanceledBy; use crate::{ - common::{run_future_with_polling_update_job_poller, unsafe_raw}, + common::{unsafe_raw, OccupancyMetrics}, + handle_child::run_future_with_polling_update_job_poller, AuthedClient, }; @@ -688,6 +689,7 @@ pub async fn eval_fetch_timeout( worker_name: &str, w_id: &str, load_client: bool, + occupation_metrics: &mut OccupancyMetrics, ) -> anyhow::Result<(Box, String)> { let (sender, mut receiver) = oneshot::channel::(); @@ -836,6 +838,7 @@ pub async fn eval_fetch_timeout( async { result_f.await? }, worker_name, w_id, + &mut Some(occupation_metrics), ) .await .map_err(|e| { diff --git a/backend/windmill-worker/src/lib.rs b/backend/windmill-worker/src/lib.rs index cf5c601c90d6d..2fc5d513dd902 100644 --- a/backend/windmill-worker/src/lib.rs +++ b/backend/windmill-worker/src/lib.rs @@ -5,6 +5,7 @@ mod mssql_executor; #[cfg(feature = "enterprise")] mod snowflake_executor; +mod ansible_executor; mod bash_executor; mod bun_executor; pub mod common; @@ -15,6 +16,8 @@ mod deno_executor; mod global_cache; mod go_executor; mod graphql_executor; +mod handle_child; +mod job_logger; mod js_eval; mod mysql_executor; mod pg_executor; @@ -25,7 +28,7 @@ mod rust_executor; mod worker; mod worker_flow; mod worker_lockfiles; -mod ansible_executor; + pub use worker::*; pub use result_processor::handle_job_error; diff --git a/backend/windmill-worker/src/mssql_executor.rs b/backend/windmill-worker/src/mssql_executor.rs index 2bffb2ee6136d..f50bad4619daa 100644 --- a/backend/windmill-worker/src/mssql_executor.rs +++ b/backend/windmill-worker/src/mssql_executor.rs @@ -14,7 +14,8 @@ use windmill_common::{error::to_anyhow, jobs::QueuedJob}; use windmill_parser_sql::{parse_db_resource, parse_mssql_sig}; use windmill_queue::{append_logs, CanceledBy}; -use crate::common::{build_args_values, run_future_with_polling_update_job_poller}; +use crate::common::{build_args_values, OccupancyMetrics}; +use crate::handle_child::run_future_with_polling_update_job_poller; use crate::AuthedClientBackgroundTask; #[derive(Deserialize)] @@ -39,6 +40,7 @@ pub async fn do_mssql( mem_peak: &mut i32, canceled_by: &mut Option, worker_name: &str, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result> { let mssql_args = build_args_values(job, client, db).await?; @@ -157,6 +159,7 @@ pub async fn do_mssql( result_f, worker_name, &job.workspace_id, + &mut Some(occupancy_metrics), ) .await?; diff --git a/backend/windmill-worker/src/mysql_executor.rs b/backend/windmill-worker/src/mysql_executor.rs index 9d16c6bde2bc7..03cde1d508e54 100644 --- a/backend/windmill-worker/src/mysql_executor.rs +++ b/backend/windmill-worker/src/mysql_executor.rs @@ -22,7 +22,8 @@ use windmill_parser_sql::{ use windmill_queue::CanceledBy; use crate::{ - common::{build_args_map, run_future_with_polling_update_job_poller}, + common::{build_args_map, OccupancyMetrics}, + handle_child::run_future_with_polling_update_job_poller, AuthedClientBackgroundTask, }; @@ -110,6 +111,7 @@ pub async fn do_mysql( canceled_by: &mut Option, worker_name: &str, column_order: &mut Option>, + occupancy_metrics: &mut OccupancyMetrics, ) -> windmill_common::error::Result> { let args = build_args_map(job, client, db).await?.map(Json); let job_args = if args.is_some() { @@ -295,6 +297,7 @@ pub async fn do_mysql( result_f, worker_name, &job.workspace_id, + &mut Some(occupancy_metrics), ) .await?; diff --git a/backend/windmill-worker/src/pg_executor.rs b/backend/windmill-worker/src/pg_executor.rs index 000042d3e62a4..7c4192cf66903 100644 --- a/backend/windmill-worker/src/pg_executor.rs +++ b/backend/windmill-worker/src/pg_executor.rs @@ -38,7 +38,8 @@ use windmill_parser_sql::{ }; use windmill_queue::CanceledBy; -use crate::common::{build_args_values, run_future_with_polling_update_job_poller, sizeof_val}; +use crate::common::{build_args_values, sizeof_val, OccupancyMetrics}; +use crate::handle_child::run_future_with_polling_update_job_poller; use crate::{AuthedClientBackgroundTask, MAX_RESULT_SIZE}; use bytes::{Buf, BytesMut}; use lazy_static::lazy_static; @@ -162,6 +163,7 @@ pub async fn do_postgresql( canceled_by: &mut Option, worker_name: &str, column_order: &mut Option>, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result> { let pg_args = build_args_values(job, client, db).await?; @@ -351,6 +353,7 @@ pub async fn do_postgresql( result_f, worker_name, &job.workspace_id, + &mut Some(occupancy_metrics), ) .await?; diff --git a/backend/windmill-worker/src/php_executor.rs b/backend/windmill-worker/src/php_executor.rs index cd2f7e24992a1..e62d96b3fd6fc 100644 --- a/backend/windmill-worker/src/php_executor.rs +++ b/backend/windmill-worker/src/php_executor.rs @@ -15,9 +15,10 @@ use windmill_queue::{append_logs, CanceledBy}; use crate::{ common::{ - create_args_and_out_file, get_main_override, get_reserved_variables, handle_child, - read_result, start_child_process, + create_args_and_out_file, get_main_override, get_reserved_variables, read_result, + start_child_process, OccupancyMetrics, }, + handle_child::handle_child, AuthedClientBackgroundTask, COMPOSER_CACHE_DIR, COMPOSER_PATH, DISABLE_NSJAIL, DISABLE_NUSER, NSJAIL_PATH, PHP_PATH, }; @@ -70,6 +71,7 @@ pub async fn composer_install( worker_name: &str, requirements: String, lock: Option, + occupancy_metrics: &mut OccupancyMetrics, ) -> Result { check_php_exists()?; @@ -101,6 +103,7 @@ pub async fn composer_install( "composer install", None, false, + &mut Some(occupancy_metrics), ) .await?; @@ -160,6 +163,7 @@ pub async fn handle_php_job( worker_name: &str, envs: HashMap, shared_mount: &str, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result> { check_php_exists()?; @@ -190,6 +194,7 @@ pub async fn handle_php_job( worker_name, composer_json, composer_lock, + occupancy_metrics, ) .await?; "require './vendor/autoload.php';" @@ -346,6 +351,7 @@ try {{ "php run", job.timeout, false, + &mut Some(occupancy_metrics), ) .await?; read_result(job_dir).await diff --git a/backend/windmill-worker/src/python_executor.rs b/backend/windmill-worker/src/python_executor.rs index 51584eb037d95..87c1c829ae125 100644 --- a/backend/windmill-worker/src/python_executor.rs +++ b/backend/windmill-worker/src/python_executor.rs @@ -55,9 +55,10 @@ use windmill_common::s3_helpers::OBJECT_STORE_CACHE_SETTINGS; use crate::{ common::{ - create_args_and_out_file, get_main_override, get_reserved_variables, handle_child, - read_file, read_result, start_child_process, + create_args_and_out_file, get_main_override, get_reserved_variables, read_file, + read_result, start_child_process, OccupancyMetrics, }, + handle_child::handle_child, AuthedClientBackgroundTask, DISABLE_NSJAIL, DISABLE_NUSER, HOME_ENV, HTTPS_PROXY, HTTP_PROXY, LOCK_CACHE_DIR, NO_PROXY, NSJAIL_PATH, PATH_ENV, PIP_CACHE_DIR, PIP_EXTRA_INDEX_URL, PIP_INDEX_URL, TZ_ENV, @@ -101,6 +102,7 @@ pub async fn pip_compile( db: &Pool, worker_name: &str, w_id: &str, + occupancy_metrics: &mut Option<&mut OccupancyMetrics>, ) -> error::Result { let mut logs = String::new(); logs.push_str(&format!("\nresolving dependencies...")); @@ -210,6 +212,7 @@ pub async fn pip_compile( "pip-compile", None, false, + occupancy_metrics, ) .await .map_err(|e| Error::ExecutionErr(format!("Lock file generation failed: {e:?}")))?; @@ -247,6 +250,7 @@ pub async fn handle_python_job( base_internal_url: &str, envs: HashMap, new_args: &mut Option>>, + occupancy_metrics: &mut OccupancyMetrics, ) -> windmill_common::error::Result> { let script_path = crate::common::use_flow_root_path(job.script_path()); let additional_python_paths = handle_python_deps( @@ -261,6 +265,7 @@ pub async fn handle_python_job( worker_dir, mem_peak, canceled_by, + &mut Some(occupancy_metrics), ) .await?; @@ -485,6 +490,7 @@ mount {{ "python run", job.timeout, false, + &mut Some(occupancy_metrics), ) .await?; @@ -753,6 +759,7 @@ async fn handle_python_deps( worker_dir: &str, mem_peak: &mut i32, canceled_by: &mut Option, + occupancy_metrics: &mut Option<&mut OccupancyMetrics>, ) -> error::Result> { create_dependencies_dir(job_dir).await; @@ -790,6 +797,7 @@ async fn handle_python_deps( db, worker_name, w_id, + occupancy_metrics, ) .await .map_err(|e| { @@ -813,6 +821,7 @@ async fn handle_python_deps( worker_name, job_dir, worker_dir, + occupancy_metrics, ) .await?; additional_python_paths.append(&mut venv_path); @@ -834,6 +843,7 @@ pub async fn handle_python_reqs( worker_name: &str, job_dir: &str, worker_dir: &str, + occupancy_metrics: &mut Option<&mut OccupancyMetrics>, ) -> error::Result> { let mut req_paths: Vec = vec![]; let mut vars = vec![("PATH", PATH_ENV.as_str())]; @@ -1089,6 +1099,7 @@ pub async fn handle_python_reqs( &format!("pip install {req}"), None, false, + occupancy_metrics, ) .await; tracing::info!( @@ -1173,6 +1184,7 @@ pub async fn start_worker( job_dir, &mut mem_peak, &mut canceled_by, + &mut None, ) .await?; diff --git a/backend/windmill-worker/src/rust_executor.rs b/backend/windmill-worker/src/rust_executor.rs index 3477eed51bc9a..ab6fdd247f2ce 100644 --- a/backend/windmill-worker/src/rust_executor.rs +++ b/backend/windmill-worker/src/rust_executor.rs @@ -15,9 +15,10 @@ use windmill_queue::{append_logs, CanceledBy}; use crate::{ common::{ - create_args_and_out_file, get_reserved_variables, handle_child, read_result, - start_child_process, + create_args_and_out_file, get_reserved_variables, read_result, start_child_process, + OccupancyMetrics, }, + handle_child::handle_child, AuthedClientBackgroundTask, DISABLE_NSJAIL, DISABLE_NUSER, HOME_ENV, NSJAIL_PATH, PATH_ENV, RUST_CACHE_DIR, TZ_ENV, }; @@ -113,6 +114,7 @@ pub async fn generate_cargo_lockfile( db: &sqlx::Pool, worker_name: &str, w_id: &str, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result { check_cargo_exists()?; @@ -137,6 +139,7 @@ pub async fn generate_cargo_lockfile( "cargo generate-lockfile", None, false, + &mut Some(occupancy_metrics), ) .await?; @@ -157,6 +160,7 @@ pub async fn build_rust_crate( w_id: &str, base_internal_url: &str, hash: &str, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result { let bin_path = format!("{}/{hash}", RUST_CACHE_DIR); @@ -185,6 +189,7 @@ pub async fn build_rust_crate( "rust build", None, false, + &mut Some(occupancy_metrics), ) .await?; append_logs(job_id, w_id, "\n\n", db).await; @@ -262,6 +267,7 @@ pub async fn handle_rust_job( base_internal_url: &str, worker_name: &str, envs: HashMap, + occupancy_metrics: &mut OccupancyMetrics, ) -> Result, Error> { check_cargo_exists()?; @@ -305,6 +311,7 @@ pub async fn handle_rust_job( &job.workspace_id, base_internal_url, &hash, + occupancy_metrics, ) .await? }; @@ -367,6 +374,7 @@ pub async fn handle_rust_job( "rust run", job.timeout, false, + &mut Some(occupancy_metrics), ) .await?; read_result(job_dir).await diff --git a/backend/windmill-worker/src/snowflake_executor.rs b/backend/windmill-worker/src/snowflake_executor.rs index 9c9bec7c9f993..bf9dae41ff394 100644 --- a/backend/windmill-worker/src/snowflake_executor.rs +++ b/backend/windmill-worker/src/snowflake_executor.rs @@ -18,7 +18,8 @@ use windmill_queue::{CanceledBy, HTTP_CLIENT}; use serde::{Deserialize, Serialize}; -use crate::common::{resolve_job_timeout, run_future_with_polling_update_job_poller}; +use crate::common::{resolve_job_timeout, OccupancyMetrics}; +use crate::handle_child::run_future_with_polling_update_job_poller; use crate::{common::build_args_values, AuthedClientBackgroundTask}; #[derive(Serialize)] @@ -237,6 +238,7 @@ pub async fn do_snowflake( canceled_by: &mut Option, worker_name: &str, column_order: &mut Option>, + occupancy_metrics: &mut OccupancyMetrics, ) -> windmill_common::error::Result> { let snowflake_args = build_args_values(job, client, db).await?; @@ -383,6 +385,7 @@ pub async fn do_snowflake( result_f.map_err(to_anyhow), worker_name, &job.workspace_id, + &mut Some(occupancy_metrics), ) .await?; *mem_peak = (r.get().len() / 1000) as i32; diff --git a/backend/windmill-worker/src/worker.rs b/backend/windmill-worker/src/worker.rs index 5c4a02431b6a3..0101db38a9e10 100644 --- a/backend/windmill-worker/src/worker.rs +++ b/backend/windmill-worker/src/worker.rs @@ -84,8 +84,8 @@ use rand::Rng; use crate::{ ansible_executor::handle_ansible_job, bash_executor::{handle_bash_job, handle_powershell_job}, bun_executor::handle_bun_job, common::{ - build_args_map, get_cached_resource_value_if_valid, get_reserved_variables, hash_args, update_worker_ping_for_failed_init_script, NO_LOGS_AT_ALL, SLOW_LOGS - }, deno_executor::handle_deno_job, go_executor::handle_go_job, graphql_executor::do_graphql, handle_job_error, js_eval::{eval_fetch_timeout, transpile_ts}, mysql_executor::do_mysql, pg_executor::do_postgresql, php_executor::handle_php_job, python_executor::handle_python_job, result_processor::{handle_receive_completed_job, process_result}, rust_executor::handle_rust_job, worker_flow::{ + build_args_map, get_cached_resource_value_if_valid, get_reserved_variables, hash_args, update_worker_ping_for_failed_init_script, OccupancyMetrics + }, deno_executor::handle_deno_job, go_executor::handle_go_job, graphql_executor::do_graphql, handle_child::SLOW_LOGS, handle_job_error, job_logger::NO_LOGS_AT_ALL, js_eval::{eval_fetch_timeout, transpile_ts}, mysql_executor::do_mysql, pg_executor::do_postgresql, php_executor::handle_php_job, python_executor::handle_python_job, result_processor::{handle_receive_completed_job, process_result}, rust_executor::handle_rust_job, worker_flow::{ handle_flow, update_flow_status_after_job_completion, update_flow_status_in_progress, Step, }, worker_lockfiles::{ handle_app_dependency_job, handle_dependency_job, handle_flow_dependency_job, @@ -698,6 +698,8 @@ fn add_outstanding_wait_time( }.in_current_span()); } + + #[tracing::instrument(name = "worker", level = "info", skip_all, fields(worker = %worker_name, hostname = %hostname))] pub async fn run_worker( db: &Pool, @@ -877,6 +879,7 @@ pub async fn run_worker NUM_SECS_READINGS { @@ -1360,16 +1366,24 @@ pub async fn run_worker { @@ -1713,7 +1729,7 @@ pub async fn run_worker, worker_name: &str, + occupancy_metrics: &mut OccupancyMetrics, ) -> windmill_common::error::Result<(Box, String)> { let args = build_args_map(job, client, db).await?.map(Json); let job_args = if args.is_some() { @@ -2033,6 +2050,7 @@ async fn do_nativets( worker_name, &job.workspace_id, true, + occupancy_metrics, ) .await?; Ok((result.0, result.1)) @@ -2057,7 +2075,7 @@ async fn handle_queued_job( base_internal_url: &str, rsmq: Option, job_completed_tx: JobCompletedSender, - worker_code_execution_metric: &mut f32, + occupancy_metrics: &mut OccupancyMetrics, _worker_flow_initial_transition_duration: Option, _worker_code_execution_duration: Option, ) -> windmill_common::error::Result { @@ -2273,6 +2291,7 @@ async fn handle_queued_job( base_internal_url, &client.get_token().await, rsmq.clone(), + occupancy_metrics, ) .await } @@ -2288,6 +2307,7 @@ async fn handle_queued_job( base_internal_url, &client.get_token().await, rsmq.clone(), + occupancy_metrics, ) .await } @@ -2302,6 +2322,7 @@ async fn handle_queued_job( base_internal_url, &client.get_token().await, rsmq.clone(), + occupancy_metrics, ) .await .map(|()| serde_json::from_str("{}").unwrap()), @@ -2328,9 +2349,10 @@ async fn handle_queued_job( worker_name, &mut column_order, &mut new_args, + occupancy_metrics, ) .await; - *worker_code_execution_metric += metric_timer.elapsed().as_secs_f32(); + occupancy_metrics.total_duration_of_running_jobs += metric_timer.elapsed().as_secs_f32(); #[cfg(feature = "prometheus")] timer.map(|x| x.stop_and_record()); r @@ -2496,6 +2518,7 @@ async fn handle_code_execution_job( worker_name: &str, column_order: &mut Option>, new_args: &mut Option>>, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result> { let ContentReqLangEnvs { content: inner_content, @@ -2551,6 +2574,7 @@ async fn handle_code_execution_job( canceled_by, worker_name, column_order, + occupancy_metrics, ) .await; } else if language == Some(ScriptLang::Mysql) { @@ -2563,6 +2587,7 @@ async fn handle_code_execution_job( canceled_by, worker_name, column_order, + occupancy_metrics, ) .await; } else if language == Some(ScriptLang::Bigquery) { @@ -2584,6 +2609,7 @@ async fn handle_code_execution_job( canceled_by, worker_name, column_order, + occupancy_metrics, ) .await; } @@ -2606,6 +2632,7 @@ async fn handle_code_execution_job( canceled_by, worker_name, column_order, + occupancy_metrics, ) .await; } @@ -2627,6 +2654,7 @@ async fn handle_code_execution_job( mem_peak, canceled_by, worker_name, + occupancy_metrics, ) .await; } @@ -2639,6 +2667,7 @@ async fn handle_code_execution_job( mem_peak, canceled_by, worker_name, + occupancy_metrics, ) .await; } else if language == Some(ScriptLang::Nativets) { @@ -2668,6 +2697,7 @@ async fn handle_code_execution_job( mem_peak, canceled_by, worker_name, + occupancy_metrics, ) .await?; append_logs(&job.id, &job.workspace_id, ts_logs, db).await; @@ -2729,6 +2759,7 @@ mount {{ base_internal_url, envs, new_args, + occupancy_metrics, ) .await } @@ -2746,6 +2777,7 @@ mount {{ worker_name, envs, new_args, + occupancy_metrics, ) .await } @@ -2765,6 +2797,7 @@ mount {{ envs, &shared_mount, new_args, + occupancy_metrics, ) .await } @@ -2782,6 +2815,7 @@ mount {{ base_internal_url, worker_name, envs, + occupancy_metrics, ) .await } @@ -2798,6 +2832,7 @@ mount {{ base_internal_url, worker_name, envs, + occupancy_metrics, ) .await } @@ -2814,6 +2849,7 @@ mount {{ base_internal_url, worker_name, envs, + occupancy_metrics, ) .await } @@ -2831,6 +2867,7 @@ mount {{ worker_name, envs, &shared_mount, + occupancy_metrics, ) .await } @@ -2848,6 +2885,7 @@ mount {{ base_internal_url, worker_name, envs, + occupancy_metrics, ) .await } @@ -2867,6 +2905,7 @@ mount {{ &shared_mount, base_internal_url, envs, + occupancy_metrics, ).await } _ => panic!("unreachable, language is not supported: {language:#?}"), diff --git a/backend/windmill-worker/src/worker_lockfiles.rs b/backend/windmill-worker/src/worker_lockfiles.rs index 6381dac896e6c..6f25ec606bedb 100644 --- a/backend/windmill-worker/src/worker_lockfiles.rs +++ b/backend/windmill-worker/src/worker_lockfiles.rs @@ -25,6 +25,7 @@ use windmill_parser_py_imports::parse_relative_imports; use windmill_parser_ts::parse_expr_for_imports; use windmill_queue::{append_logs, CanceledBy, PushIsolationLevel}; +use crate::common::OccupancyMetrics; use crate::python_executor::{create_dependencies_dir, handle_python_reqs, pip_compile}; use crate::rust_executor::{build_rust_crate, compute_rust_hash, generate_cargo_lockfile}; use crate::{ @@ -212,6 +213,7 @@ pub async fn handle_dependency_job, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result> { let raw_code = match job.raw_code { Some(ref code) => code.to_owned(), @@ -273,6 +275,7 @@ pub async fn handle_dependency_job, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result> { let job_path = job.script_path.clone().ok_or_else(|| { error::Error::InternalErr( @@ -605,6 +609,7 @@ pub async fn handle_flow_dependency_job( base_internal_url: &str, token: &str, locks_to_reload: &Option>, + occupancy_metrics: &mut OccupancyMetrics, // (modules to replace old seq (even unmmodified ones), new transaction, modified ids) ) ) -> Result<( Vec, @@ -754,6 +760,7 @@ async fn lock_modules<'c>( base_internal_url, token, locks_to_reload, + occupancy_metrics, )) .await?; e.value = FlowModuleValue::ForloopFlow { @@ -785,6 +792,7 @@ async fn lock_modules<'c>( base_internal_url, token, locks_to_reload, + occupancy_metrics, )) .await?; nmodified_ids.extend(inner_modified_ids); @@ -809,6 +817,7 @@ async fn lock_modules<'c>( base_internal_url, token, locks_to_reload, + occupancy_metrics, )) .await?; e.value = @@ -835,6 +844,7 @@ async fn lock_modules<'c>( base_internal_url, token, locks_to_reload, + occupancy_metrics, )) .await?; nmodified_ids.extend(inner_modified_ids); @@ -856,6 +866,7 @@ async fn lock_modules<'c>( base_internal_url, token, locks_to_reload, + occupancy_metrics, )) .await?; e.value = FlowModuleValue::BranchOne { branches: nbranches, default: ndefault } @@ -904,6 +915,7 @@ async fn lock_modules<'c>( ), false, None, + occupancy_metrics, ) .await; // @@ -1014,6 +1026,7 @@ async fn lock_modules_app( job_path: &str, base_internal_url: &str, token: &str, + occupancy_metrics: &mut OccupancyMetrics, ) -> Result { match value { Value::Object(mut m) => { @@ -1058,6 +1071,7 @@ async fn lock_modules_app( &format!("{}/app", job.script_path()), false, None, + occupancy_metrics, ) .await; match new_lock { @@ -1113,6 +1127,7 @@ async fn lock_modules_app( job_path, base_internal_url, token, + occupancy_metrics, ) .await?, ); @@ -1135,6 +1150,7 @@ async fn lock_modules_app( job_path, base_internal_url, token, + occupancy_metrics, ) .await?, ); @@ -1156,6 +1172,7 @@ pub async fn handle_app_dependency_job, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result<()> { let job_path = job.script_path.clone().ok_or_else(|| { error::Error::InternalErr( @@ -1185,6 +1202,7 @@ pub async fn handle_app_dependency_job, ) -> std::result::Result { create_dependencies_dir(job_dir).await; let req: std::result::Result = pip_compile( @@ -1270,6 +1289,7 @@ async fn python_dep( db, worker_name, w_id, + occupancy_metrics, ) .await; // install the dependencies to pre-fill the cache @@ -1284,6 +1304,7 @@ async fn python_dep( worker_name, job_dir, worker_dir, + occupancy_metrics, ) .await; @@ -1313,6 +1334,7 @@ async fn capture_dependency_job( script_path: &str, raw_deps: bool, npm_mode: Option, + occupancy_metrics: &mut OccupancyMetrics, ) -> error::Result { match job_language { ScriptLang::Python3 => { @@ -1342,6 +1364,7 @@ async fn capture_dependency_job( worker_name, w_id, worker_dir, + &mut Some(occupancy_metrics), ) .await } @@ -1364,6 +1387,7 @@ async fn capture_dependency_job( worker_name, w_id, worker_dir, + &mut Some(occupancy_metrics), ) .await } @@ -1385,6 +1409,7 @@ async fn capture_dependency_job( false, worker_name, w_id, + occupancy_metrics, ) .await } @@ -1404,6 +1429,7 @@ async fn capture_dependency_job( w_id, worker_name, base_internal_url, + &mut Some(occupancy_metrics), ) .await } @@ -1431,6 +1457,7 @@ async fn capture_dependency_job( None }, npm_mode, + &mut Some(occupancy_metrics), ) .await?; if req.is_some() && !raw_deps { @@ -1445,6 +1472,7 @@ async fn capture_dependency_job( base_internal_url, worker_name, &token, + occupancy_metrics, ) .await?; } @@ -1475,6 +1503,7 @@ async fn capture_dependency_job( worker_name, reqs, None, + occupancy_metrics, ) .await } @@ -1494,6 +1523,7 @@ async fn capture_dependency_job( db, worker_name, w_id, + occupancy_metrics, ) .await?; @@ -1507,6 +1537,7 @@ async fn capture_dependency_job( w_id, base_internal_url, &compute_rust_hash(&job_raw_code, Some(&lockfile)), + occupancy_metrics, ) .await?; Ok(lockfile) diff --git a/cli/gen/core/OpenAPI.ts b/cli/gen/core/OpenAPI.ts index 88e02785eea55..6af5246be253f 100644 --- a/cli/gen/core/OpenAPI.ts +++ b/cli/gen/core/OpenAPI.ts @@ -54,7 +54,7 @@ export const OpenAPI: OpenAPIConfig = { PASSWORD: undefined, TOKEN: getEnv("WM_TOKEN"), USERNAME: undefined, - VERSION: '1.398.1', + VERSION: '1.401.0', WITH_CREDENTIALS: true, interceptors: { request: new Interceptors(), diff --git a/cli/gen/services.gen.ts b/cli/gen/services.gen.ts index 9894d6de36c55..ef6dafe997fb7 100644 --- a/cli/gen/services.gen.ts +++ b/cli/gen/services.gen.ts @@ -3,7 +3,7 @@ import type { CancelablePromise } from './core/CancelablePromise.ts'; import { OpenAPI } from './core/OpenAPI.ts'; import { request as __request } from './core/request.ts'; -import type { BackendVersionResponse, BackendUptodateResponse, GetLicenseIdResponse, GetOpenApiYamlResponse, GetAuditLogData, GetAuditLogResponse, ListAuditLogsData, ListAuditLogsResponse, LoginData, LoginResponse, LogoutResponse, GetUserData, GetUserResponse, UpdateUserData, UpdateUserResponse, IsOwnerOfPathData, IsOwnerOfPathResponse, SetPasswordData, SetPasswordResponse, CreateUserGloballyData, CreateUserGloballyResponse, GlobalUserUpdateData, GlobalUserUpdateResponse, GlobalUsernameInfoData, GlobalUsernameInfoResponse, GlobalUserRenameData, GlobalUserRenameResponse, GlobalUserDeleteData, GlobalUserDeleteResponse, GlobalUsersOverwriteData, GlobalUsersOverwriteResponse, GlobalUsersExportResponse, DeleteUserData, DeleteUserResponse, ListWorkspacesResponse, IsDomainAllowedResponse, ListUserWorkspacesResponse, ListWorkspacesAsSuperAdminData, ListWorkspacesAsSuperAdminResponse, CreateWorkspaceData, CreateWorkspaceResponse, ExistsWorkspaceData, ExistsWorkspaceResponse, ExistsUsernameData, ExistsUsernameResponse, GetGlobalData, GetGlobalResponse, SetGlobalData, SetGlobalResponse, GetLocalResponse, TestSmtpData, TestSmtpResponse, TestCriticalChannelsData, TestCriticalChannelsResponse, TestLicenseKeyData, TestLicenseKeyResponse, TestObjectStorageConfigData, TestObjectStorageConfigResponse, SendStatsResponse, GetLatestKeyRenewalAttemptResponse, RenewLicenseKeyData, RenewLicenseKeyResponse, CreateCustomerPortalSessionData, CreateCustomerPortalSessionResponse, TestMetadataData, TestMetadataResponse, ListGlobalSettingsResponse, GetCurrentEmailResponse, RefreshUserTokenResponse, GetTutorialProgressResponse, UpdateTutorialProgressData, UpdateTutorialProgressResponse, LeaveInstanceResponse, GetUsageResponse, GetRunnableResponse, GlobalWhoamiResponse, ListWorkspaceInvitesResponse, WhoamiData, WhoamiResponse, AcceptInviteData, AcceptInviteResponse, DeclineInviteData, DeclineInviteResponse, InviteUserData, InviteUserResponse, AddUserData, AddUserResponse, DeleteInviteData, DeleteInviteResponse, ArchiveWorkspaceData, ArchiveWorkspaceResponse, UnarchiveWorkspaceData, UnarchiveWorkspaceResponse, DeleteWorkspaceData, DeleteWorkspaceResponse, LeaveWorkspaceData, LeaveWorkspaceResponse, GetWorkspaceNameData, GetWorkspaceNameResponse, ChangeWorkspaceNameData, ChangeWorkspaceNameResponse, ChangeWorkspaceIdData, ChangeWorkspaceIdResponse, WhoisData, WhoisResponse, ExistsEmailData, ExistsEmailResponse, ListUsersAsSuperAdminData, ListUsersAsSuperAdminResponse, ListPendingInvitesData, ListPendingInvitesResponse, GetSettingsData, GetSettingsResponse, GetDeployToData, GetDeployToResponse, GetIsPremiumData, GetIsPremiumResponse, GetPremiumInfoData, GetPremiumInfoResponse, SetAutomaticBillingData, SetAutomaticBillingResponse, EditSlackCommandData, EditSlackCommandResponse, RunSlackMessageTestJobData, RunSlackMessageTestJobResponse, EditDeployToData, EditDeployToResponse, EditAutoInviteData, EditAutoInviteResponse, EditWebhookData, EditWebhookResponse, EditCopilotConfigData, EditCopilotConfigResponse, GetCopilotInfoData, GetCopilotInfoResponse, EditErrorHandlerData, EditErrorHandlerResponse, EditLargeFileStorageConfigData, EditLargeFileStorageConfigResponse, EditWorkspaceGitSyncConfigData, EditWorkspaceGitSyncConfigResponse, EditWorkspaceDeployUiSettingsData, EditWorkspaceDeployUiSettingsResponse, EditWorkspaceDefaultAppData, EditWorkspaceDefaultAppResponse, EditDefaultScriptsData, EditDefaultScriptsResponse, GetDefaultScriptsData, GetDefaultScriptsResponse, SetEnvironmentVariableData, SetEnvironmentVariableResponse, GetWorkspaceEncryptionKeyData, GetWorkspaceEncryptionKeyResponse, SetWorkspaceEncryptionKeyData, SetWorkspaceEncryptionKeyResponse, GetWorkspaceDefaultAppData, GetWorkspaceDefaultAppResponse, GetLargeFileStorageConfigData, GetLargeFileStorageConfigResponse, GetWorkspaceUsageData, GetWorkspaceUsageResponse, ListUsersData, ListUsersResponse, ListUsersUsageData, ListUsersUsageResponse, ListUsernamesData, ListUsernamesResponse, UsernameToEmailData, UsernameToEmailResponse, CreateTokenData, CreateTokenResponse, CreateTokenImpersonateData, CreateTokenImpersonateResponse, DeleteTokenData, DeleteTokenResponse, ListTokensData, ListTokensResponse, GetOidcTokenData, GetOidcTokenResponse, CreateVariableData, CreateVariableResponse, EncryptValueData, EncryptValueResponse, DeleteVariableData, DeleteVariableResponse, UpdateVariableData, UpdateVariableResponse, GetVariableData, GetVariableResponse, GetVariableValueData, GetVariableValueResponse, ExistsVariableData, ExistsVariableResponse, ListVariableData, ListVariableResponse, ListContextualVariablesData, ListContextualVariablesResponse, LoginWithOauthData, LoginWithOauthResponse, ConnectSlackCallbackData, ConnectSlackCallbackResponse, ConnectSlackCallbackInstanceData, ConnectSlackCallbackInstanceResponse, ConnectCallbackData, ConnectCallbackResponse, CreateAccountData, CreateAccountResponse, RefreshTokenData, RefreshTokenResponse, DisconnectAccountData, DisconnectAccountResponse, DisconnectSlackData, DisconnectSlackResponse, ListOauthLoginsResponse, ListOauthConnectsResponse, GetOauthConnectData, GetOauthConnectResponse, CreateResourceData, CreateResourceResponse, DeleteResourceData, DeleteResourceResponse, UpdateResourceData, UpdateResourceResponse, UpdateResourceValueData, UpdateResourceValueResponse, GetResourceData, GetResourceResponse, GetResourceValueInterpolatedData, GetResourceValueInterpolatedResponse, GetResourceValueData, GetResourceValueResponse, ExistsResourceData, ExistsResourceResponse, ListResourceData, ListResourceResponse, ListSearchResourceData, ListSearchResourceResponse, ListResourceNamesData, ListResourceNamesResponse, CreateResourceTypeData, CreateResourceTypeResponse, FileResourceTypeToFileExtMapData, FileResourceTypeToFileExtMapResponse, DeleteResourceTypeData, DeleteResourceTypeResponse, UpdateResourceTypeData, UpdateResourceTypeResponse, GetResourceTypeData, GetResourceTypeResponse, ExistsResourceTypeData, ExistsResourceTypeResponse, ListResourceTypeData, ListResourceTypeResponse, ListResourceTypeNamesData, ListResourceTypeNamesResponse, QueryResourceTypesData, QueryResourceTypesResponse, ListHubIntegrationsData, ListHubIntegrationsResponse, ListHubFlowsResponse, GetHubFlowByIdData, GetHubFlowByIdResponse, ListHubAppsResponse, GetHubAppByIdData, GetHubAppByIdResponse, GetHubScriptContentByPathData, GetHubScriptContentByPathResponse, GetHubScriptByPathData, GetHubScriptByPathResponse, GetTopHubScriptsData, GetTopHubScriptsResponse, QueryHubScriptsData, QueryHubScriptsResponse, ListSearchScriptData, ListSearchScriptResponse, ListScriptsData, ListScriptsResponse, ListScriptPathsData, ListScriptPathsResponse, CreateDraftData, CreateDraftResponse, DeleteDraftData, DeleteDraftResponse, CreateScriptData, CreateScriptResponse, ToggleWorkspaceErrorHandlerForScriptData, ToggleWorkspaceErrorHandlerForScriptResponse, GetCustomTagsResponse, GeDefaultTagsResponse, IsDefaultTagsPerWorkspaceResponse, ArchiveScriptByPathData, ArchiveScriptByPathResponse, ArchiveScriptByHashData, ArchiveScriptByHashResponse, DeleteScriptByHashData, DeleteScriptByHashResponse, DeleteScriptByPathData, DeleteScriptByPathResponse, GetScriptByPathData, GetScriptByPathResponse, GetScriptByPathWithDraftData, GetScriptByPathWithDraftResponse, GetScriptHistoryByPathData, GetScriptHistoryByPathResponse, UpdateScriptHistoryData, UpdateScriptHistoryResponse, RawScriptByPathData, RawScriptByPathResponse, RawScriptByPathTokenedData, RawScriptByPathTokenedResponse, ExistsScriptByPathData, ExistsScriptByPathResponse, GetScriptByHashData, GetScriptByHashResponse, RawScriptByHashData, RawScriptByHashResponse, GetScriptDeploymentStatusData, GetScriptDeploymentStatusResponse, RunScriptByPathData, RunScriptByPathResponse, OpenaiSyncScriptByPathData, OpenaiSyncScriptByPathResponse, RunWaitResultScriptByPathData, RunWaitResultScriptByPathResponse, RunWaitResultScriptByPathGetData, RunWaitResultScriptByPathGetResponse, OpenaiSyncFlowByPathData, OpenaiSyncFlowByPathResponse, RunWaitResultFlowByPathData, RunWaitResultFlowByPathResponse, ResultByIdData, ResultByIdResponse, ListFlowPathsData, ListFlowPathsResponse, ListSearchFlowData, ListSearchFlowResponse, ListFlowsData, ListFlowsResponse, GetFlowHistoryData, GetFlowHistoryResponse, GetFlowVersionData, GetFlowVersionResponse, UpdateFlowHistoryData, UpdateFlowHistoryResponse, GetFlowByPathData, GetFlowByPathResponse, ToggleWorkspaceErrorHandlerForFlowData, ToggleWorkspaceErrorHandlerForFlowResponse, GetFlowByPathWithDraftData, GetFlowByPathWithDraftResponse, ExistsFlowByPathData, ExistsFlowByPathResponse, CreateFlowData, CreateFlowResponse, UpdateFlowData, UpdateFlowResponse, ArchiveFlowByPathData, ArchiveFlowByPathResponse, DeleteFlowByPathData, DeleteFlowByPathResponse, ListRawAppsData, ListRawAppsResponse, ExistsRawAppData, ExistsRawAppResponse, GetRawAppDataData, GetRawAppDataResponse, ListSearchAppData, ListSearchAppResponse, ListAppsData, ListAppsResponse, CreateAppData, CreateAppResponse, ExistsAppData, ExistsAppResponse, GetAppByPathData, GetAppByPathResponse, GetAppByPathWithDraftData, GetAppByPathWithDraftResponse, GetAppHistoryByPathData, GetAppHistoryByPathResponse, UpdateAppHistoryData, UpdateAppHistoryResponse, GetPublicAppBySecretData, GetPublicAppBySecretResponse, GetPublicResourceData, GetPublicResourceResponse, GetPublicSecretOfAppData, GetPublicSecretOfAppResponse, GetAppByVersionData, GetAppByVersionResponse, CreateRawAppData, CreateRawAppResponse, UpdateRawAppData, UpdateRawAppResponse, DeleteRawAppData, DeleteRawAppResponse, DeleteAppData, DeleteAppResponse, UpdateAppData, UpdateAppResponse, ExecuteComponentData, ExecuteComponentResponse, RunFlowByPathData, RunFlowByPathResponse, RestartFlowAtStepData, RestartFlowAtStepResponse, RunScriptByHashData, RunScriptByHashResponse, RunScriptPreviewData, RunScriptPreviewResponse, RunCodeWorkflowTaskData, RunCodeWorkflowTaskResponse, RunRawScriptDependenciesData, RunRawScriptDependenciesResponse, RunFlowPreviewData, RunFlowPreviewResponse, ListQueueData, ListQueueResponse, GetQueueCountData, GetQueueCountResponse, GetCompletedCountData, GetCompletedCountResponse, ListFilteredUuidsData, ListFilteredUuidsResponse, CancelSelectionData, CancelSelectionResponse, ListCompletedJobsData, ListCompletedJobsResponse, ListJobsData, ListJobsResponse, GetDbClockResponse, GetJobData, GetJobResponse, GetRootJobIdData, GetRootJobIdResponse, GetJobLogsData, GetJobLogsResponse, GetJobArgsData, GetJobArgsResponse, GetJobUpdatesData, GetJobUpdatesResponse, GetLogFileFromStoreData, GetLogFileFromStoreResponse, GetFlowDebugInfoData, GetFlowDebugInfoResponse, GetCompletedJobData, GetCompletedJobResponse, GetCompletedJobResultData, GetCompletedJobResultResponse, GetCompletedJobResultMaybeData, GetCompletedJobResultMaybeResponse, DeleteCompletedJobData, DeleteCompletedJobResponse, CancelQueuedJobData, CancelQueuedJobResponse, CancelPersistentQueuedJobsData, CancelPersistentQueuedJobsResponse, ForceCancelQueuedJobData, ForceCancelQueuedJobResponse, CreateJobSignatureData, CreateJobSignatureResponse, GetResumeUrlsData, GetResumeUrlsResponse, ResumeSuspendedJobGetData, ResumeSuspendedJobGetResponse, ResumeSuspendedJobPostData, ResumeSuspendedJobPostResponse, SetFlowUserStateData, SetFlowUserStateResponse, GetFlowUserStateData, GetFlowUserStateResponse, ResumeSuspendedFlowAsOwnerData, ResumeSuspendedFlowAsOwnerResponse, CancelSuspendedJobGetData, CancelSuspendedJobGetResponse, CancelSuspendedJobPostData, CancelSuspendedJobPostResponse, GetSuspendedJobFlowData, GetSuspendedJobFlowResponse, PreviewScheduleData, PreviewScheduleResponse, CreateScheduleData, CreateScheduleResponse, UpdateScheduleData, UpdateScheduleResponse, SetScheduleEnabledData, SetScheduleEnabledResponse, DeleteScheduleData, DeleteScheduleResponse, GetScheduleData, GetScheduleResponse, ExistsScheduleData, ExistsScheduleResponse, ListSchedulesData, ListSchedulesResponse, ListSchedulesWithJobsData, ListSchedulesWithJobsResponse, SetDefaultErrorOrRecoveryHandlerData, SetDefaultErrorOrRecoveryHandlerResponse, CreateHttpTriggerData, CreateHttpTriggerResponse, UpdateHttpTriggerData, UpdateHttpTriggerResponse, DeleteHttpTriggerData, DeleteHttpTriggerResponse, GetHttpTriggerData, GetHttpTriggerResponse, ListHttpTriggersData, ListHttpTriggersResponse, ExistsHttpTriggerData, ExistsHttpTriggerResponse, ExistsRouteData, ExistsRouteResponse, UsedData, UsedResponse, ListInstanceGroupsResponse, GetInstanceGroupData, GetInstanceGroupResponse, CreateInstanceGroupData, CreateInstanceGroupResponse, UpdateInstanceGroupData, UpdateInstanceGroupResponse, DeleteInstanceGroupData, DeleteInstanceGroupResponse, AddUserToInstanceGroupData, AddUserToInstanceGroupResponse, RemoveUserFromInstanceGroupData, RemoveUserFromInstanceGroupResponse, ExportInstanceGroupsResponse, OverwriteInstanceGroupsData, OverwriteInstanceGroupsResponse, ListGroupsData, ListGroupsResponse, ListGroupNamesData, ListGroupNamesResponse, CreateGroupData, CreateGroupResponse, UpdateGroupData, UpdateGroupResponse, DeleteGroupData, DeleteGroupResponse, GetGroupData, GetGroupResponse, AddUserToGroupData, AddUserToGroupResponse, RemoveUserToGroupData, RemoveUserToGroupResponse, ListFoldersData, ListFoldersResponse, ListFolderNamesData, ListFolderNamesResponse, CreateFolderData, CreateFolderResponse, UpdateFolderData, UpdateFolderResponse, DeleteFolderData, DeleteFolderResponse, GetFolderData, GetFolderResponse, GetFolderUsageData, GetFolderUsageResponse, AddOwnerToFolderData, AddOwnerToFolderResponse, RemoveOwnerToFolderData, RemoveOwnerToFolderResponse, ListWorkersData, ListWorkersResponse, ExistsWorkerWithTagData, ExistsWorkerWithTagResponse, GetQueueMetricsResponse, ListWorkerGroupsResponse, GetConfigData, GetConfigResponse, UpdateConfigData, UpdateConfigResponse, DeleteConfigData, DeleteConfigResponse, ListConfigsResponse, GetGranularAclsData, GetGranularAclsResponse, AddGranularAclsData, AddGranularAclsResponse, RemoveGranularAclsData, RemoveGranularAclsResponse, UpdateCaptureData, UpdateCaptureResponse, CreateCaptureData, CreateCaptureResponse, GetCaptureData, GetCaptureResponse, StarData, StarResponse, UnstarData, UnstarResponse, GetInputHistoryData, GetInputHistoryResponse, GetArgsFromHistoryOrSavedInputData, GetArgsFromHistoryOrSavedInputResponse, ListInputsData, ListInputsResponse, CreateInputData, CreateInputResponse, UpdateInputData, UpdateInputResponse, DeleteInputData, DeleteInputResponse, DuckdbConnectionSettingsData, DuckdbConnectionSettingsResponse, DuckdbConnectionSettingsV2Data, DuckdbConnectionSettingsV2Response, PolarsConnectionSettingsData, PolarsConnectionSettingsResponse, PolarsConnectionSettingsV2Data, PolarsConnectionSettingsV2Response, S3ResourceInfoData, S3ResourceInfoResponse, DatasetStorageTestConnectionData, DatasetStorageTestConnectionResponse, ListStoredFilesData, ListStoredFilesResponse, LoadFileMetadataData, LoadFileMetadataResponse, LoadFilePreviewData, LoadFilePreviewResponse, LoadParquetPreviewData, LoadParquetPreviewResponse, LoadTableRowCountData, LoadTableRowCountResponse, LoadCsvPreviewData, LoadCsvPreviewResponse, DeleteS3FileData, DeleteS3FileResponse, MoveS3FileData, MoveS3FileResponse, FileUploadData, FileUploadResponse, FileDownloadData, FileDownloadResponse, FileDownloadParquetAsCsvData, FileDownloadParquetAsCsvResponse, GetJobMetricsData, GetJobMetricsResponse, SetJobProgressData, SetJobProgressResponse, GetJobProgressData, GetJobProgressResponse, ListLogFilesData, ListLogFilesResponse, GetLogFileData, GetLogFileResponse, ListConcurrencyGroupsResponse, DeleteConcurrencyGroupData, DeleteConcurrencyGroupResponse, GetConcurrencyKeyData, GetConcurrencyKeyResponse, ListExtendedJobsData, ListExtendedJobsResponse, SearchJobsIndexData, SearchJobsIndexResponse } from './types.gen.ts'; +import type { BackendVersionResponse, BackendUptodateResponse, GetLicenseIdResponse, GetOpenApiYamlResponse, GetAuditLogData, GetAuditLogResponse, ListAuditLogsData, ListAuditLogsResponse, LoginData, LoginResponse, LogoutResponse, GetUserData, GetUserResponse, UpdateUserData, UpdateUserResponse, IsOwnerOfPathData, IsOwnerOfPathResponse, SetPasswordData, SetPasswordResponse, CreateUserGloballyData, CreateUserGloballyResponse, GlobalUserUpdateData, GlobalUserUpdateResponse, GlobalUsernameInfoData, GlobalUsernameInfoResponse, GlobalUserRenameData, GlobalUserRenameResponse, GlobalUserDeleteData, GlobalUserDeleteResponse, GlobalUsersOverwriteData, GlobalUsersOverwriteResponse, GlobalUsersExportResponse, DeleteUserData, DeleteUserResponse, ListWorkspacesResponse, IsDomainAllowedResponse, ListUserWorkspacesResponse, ListWorkspacesAsSuperAdminData, ListWorkspacesAsSuperAdminResponse, CreateWorkspaceData, CreateWorkspaceResponse, ExistsWorkspaceData, ExistsWorkspaceResponse, ExistsUsernameData, ExistsUsernameResponse, GetGlobalData, GetGlobalResponse, SetGlobalData, SetGlobalResponse, GetLocalResponse, TestSmtpData, TestSmtpResponse, TestCriticalChannelsData, TestCriticalChannelsResponse, TestLicenseKeyData, TestLicenseKeyResponse, TestObjectStorageConfigData, TestObjectStorageConfigResponse, SendStatsResponse, GetLatestKeyRenewalAttemptResponse, RenewLicenseKeyData, RenewLicenseKeyResponse, CreateCustomerPortalSessionData, CreateCustomerPortalSessionResponse, TestMetadataData, TestMetadataResponse, ListGlobalSettingsResponse, GetCurrentEmailResponse, RefreshUserTokenResponse, GetTutorialProgressResponse, UpdateTutorialProgressData, UpdateTutorialProgressResponse, LeaveInstanceResponse, GetUsageResponse, GetRunnableResponse, GlobalWhoamiResponse, ListWorkspaceInvitesResponse, WhoamiData, WhoamiResponse, AcceptInviteData, AcceptInviteResponse, DeclineInviteData, DeclineInviteResponse, InviteUserData, InviteUserResponse, AddUserData, AddUserResponse, DeleteInviteData, DeleteInviteResponse, ArchiveWorkspaceData, ArchiveWorkspaceResponse, UnarchiveWorkspaceData, UnarchiveWorkspaceResponse, DeleteWorkspaceData, DeleteWorkspaceResponse, LeaveWorkspaceData, LeaveWorkspaceResponse, GetWorkspaceNameData, GetWorkspaceNameResponse, ChangeWorkspaceNameData, ChangeWorkspaceNameResponse, ChangeWorkspaceIdData, ChangeWorkspaceIdResponse, WhoisData, WhoisResponse, ExistsEmailData, ExistsEmailResponse, ListUsersAsSuperAdminData, ListUsersAsSuperAdminResponse, ListPendingInvitesData, ListPendingInvitesResponse, GetSettingsData, GetSettingsResponse, GetDeployToData, GetDeployToResponse, GetIsPremiumData, GetIsPremiumResponse, GetPremiumInfoData, GetPremiumInfoResponse, SetAutomaticBillingData, SetAutomaticBillingResponse, EditSlackCommandData, EditSlackCommandResponse, RunSlackMessageTestJobData, RunSlackMessageTestJobResponse, EditDeployToData, EditDeployToResponse, EditAutoInviteData, EditAutoInviteResponse, EditWebhookData, EditWebhookResponse, EditCopilotConfigData, EditCopilotConfigResponse, GetCopilotInfoData, GetCopilotInfoResponse, EditErrorHandlerData, EditErrorHandlerResponse, EditLargeFileStorageConfigData, EditLargeFileStorageConfigResponse, EditWorkspaceGitSyncConfigData, EditWorkspaceGitSyncConfigResponse, EditWorkspaceDeployUiSettingsData, EditWorkspaceDeployUiSettingsResponse, EditWorkspaceDefaultAppData, EditWorkspaceDefaultAppResponse, EditDefaultScriptsData, EditDefaultScriptsResponse, GetDefaultScriptsData, GetDefaultScriptsResponse, SetEnvironmentVariableData, SetEnvironmentVariableResponse, GetWorkspaceEncryptionKeyData, GetWorkspaceEncryptionKeyResponse, SetWorkspaceEncryptionKeyData, SetWorkspaceEncryptionKeyResponse, GetWorkspaceDefaultAppData, GetWorkspaceDefaultAppResponse, GetLargeFileStorageConfigData, GetLargeFileStorageConfigResponse, GetWorkspaceUsageData, GetWorkspaceUsageResponse, ListUsersData, ListUsersResponse, ListUsersUsageData, ListUsersUsageResponse, ListUsernamesData, ListUsernamesResponse, UsernameToEmailData, UsernameToEmailResponse, CreateTokenData, CreateTokenResponse, CreateTokenImpersonateData, CreateTokenImpersonateResponse, DeleteTokenData, DeleteTokenResponse, ListTokensData, ListTokensResponse, GetOidcTokenData, GetOidcTokenResponse, CreateVariableData, CreateVariableResponse, EncryptValueData, EncryptValueResponse, DeleteVariableData, DeleteVariableResponse, UpdateVariableData, UpdateVariableResponse, GetVariableData, GetVariableResponse, GetVariableValueData, GetVariableValueResponse, ExistsVariableData, ExistsVariableResponse, ListVariableData, ListVariableResponse, ListContextualVariablesData, ListContextualVariablesResponse, LoginWithOauthData, LoginWithOauthResponse, ConnectSlackCallbackData, ConnectSlackCallbackResponse, ConnectSlackCallbackInstanceData, ConnectSlackCallbackInstanceResponse, ConnectCallbackData, ConnectCallbackResponse, CreateAccountData, CreateAccountResponse, RefreshTokenData, RefreshTokenResponse, DisconnectAccountData, DisconnectAccountResponse, DisconnectSlackData, DisconnectSlackResponse, ListOauthLoginsResponse, ListOauthConnectsResponse, GetOauthConnectData, GetOauthConnectResponse, CreateResourceData, CreateResourceResponse, DeleteResourceData, DeleteResourceResponse, UpdateResourceData, UpdateResourceResponse, UpdateResourceValueData, UpdateResourceValueResponse, GetResourceData, GetResourceResponse, GetResourceValueInterpolatedData, GetResourceValueInterpolatedResponse, GetResourceValueData, GetResourceValueResponse, ExistsResourceData, ExistsResourceResponse, ListResourceData, ListResourceResponse, ListSearchResourceData, ListSearchResourceResponse, ListResourceNamesData, ListResourceNamesResponse, CreateResourceTypeData, CreateResourceTypeResponse, FileResourceTypeToFileExtMapData, FileResourceTypeToFileExtMapResponse, DeleteResourceTypeData, DeleteResourceTypeResponse, UpdateResourceTypeData, UpdateResourceTypeResponse, GetResourceTypeData, GetResourceTypeResponse, ExistsResourceTypeData, ExistsResourceTypeResponse, ListResourceTypeData, ListResourceTypeResponse, ListResourceTypeNamesData, ListResourceTypeNamesResponse, QueryResourceTypesData, QueryResourceTypesResponse, ListHubIntegrationsData, ListHubIntegrationsResponse, ListHubFlowsResponse, GetHubFlowByIdData, GetHubFlowByIdResponse, ListHubAppsResponse, GetHubAppByIdData, GetHubAppByIdResponse, GetHubScriptContentByPathData, GetHubScriptContentByPathResponse, GetHubScriptByPathData, GetHubScriptByPathResponse, GetTopHubScriptsData, GetTopHubScriptsResponse, QueryHubScriptsData, QueryHubScriptsResponse, ListSearchScriptData, ListSearchScriptResponse, ListScriptsData, ListScriptsResponse, ListScriptPathsData, ListScriptPathsResponse, CreateDraftData, CreateDraftResponse, DeleteDraftData, DeleteDraftResponse, CreateScriptData, CreateScriptResponse, ToggleWorkspaceErrorHandlerForScriptData, ToggleWorkspaceErrorHandlerForScriptResponse, GetCustomTagsResponse, GeDefaultTagsResponse, IsDefaultTagsPerWorkspaceResponse, ArchiveScriptByPathData, ArchiveScriptByPathResponse, ArchiveScriptByHashData, ArchiveScriptByHashResponse, DeleteScriptByHashData, DeleteScriptByHashResponse, DeleteScriptByPathData, DeleteScriptByPathResponse, GetScriptByPathData, GetScriptByPathResponse, GetScriptByPathWithDraftData, GetScriptByPathWithDraftResponse, GetScriptHistoryByPathData, GetScriptHistoryByPathResponse, UpdateScriptHistoryData, UpdateScriptHistoryResponse, RawScriptByPathData, RawScriptByPathResponse, RawScriptByPathTokenedData, RawScriptByPathTokenedResponse, ExistsScriptByPathData, ExistsScriptByPathResponse, GetScriptByHashData, GetScriptByHashResponse, RawScriptByHashData, RawScriptByHashResponse, GetScriptDeploymentStatusData, GetScriptDeploymentStatusResponse, RunScriptByPathData, RunScriptByPathResponse, OpenaiSyncScriptByPathData, OpenaiSyncScriptByPathResponse, RunWaitResultScriptByPathData, RunWaitResultScriptByPathResponse, RunWaitResultScriptByPathGetData, RunWaitResultScriptByPathGetResponse, OpenaiSyncFlowByPathData, OpenaiSyncFlowByPathResponse, RunWaitResultFlowByPathData, RunWaitResultFlowByPathResponse, ResultByIdData, ResultByIdResponse, ListFlowPathsData, ListFlowPathsResponse, ListSearchFlowData, ListSearchFlowResponse, ListFlowsData, ListFlowsResponse, GetFlowHistoryData, GetFlowHistoryResponse, GetFlowVersionData, GetFlowVersionResponse, UpdateFlowHistoryData, UpdateFlowHistoryResponse, GetFlowByPathData, GetFlowByPathResponse, ToggleWorkspaceErrorHandlerForFlowData, ToggleWorkspaceErrorHandlerForFlowResponse, GetFlowByPathWithDraftData, GetFlowByPathWithDraftResponse, ExistsFlowByPathData, ExistsFlowByPathResponse, CreateFlowData, CreateFlowResponse, UpdateFlowData, UpdateFlowResponse, ArchiveFlowByPathData, ArchiveFlowByPathResponse, DeleteFlowByPathData, DeleteFlowByPathResponse, ListRawAppsData, ListRawAppsResponse, ExistsRawAppData, ExistsRawAppResponse, GetRawAppDataData, GetRawAppDataResponse, ListSearchAppData, ListSearchAppResponse, ListAppsData, ListAppsResponse, CreateAppData, CreateAppResponse, ExistsAppData, ExistsAppResponse, GetAppByPathData, GetAppByPathResponse, GetAppByPathWithDraftData, GetAppByPathWithDraftResponse, GetAppHistoryByPathData, GetAppHistoryByPathResponse, UpdateAppHistoryData, UpdateAppHistoryResponse, GetPublicAppBySecretData, GetPublicAppBySecretResponse, GetPublicResourceData, GetPublicResourceResponse, GetPublicSecretOfAppData, GetPublicSecretOfAppResponse, GetAppByVersionData, GetAppByVersionResponse, CreateRawAppData, CreateRawAppResponse, UpdateRawAppData, UpdateRawAppResponse, DeleteRawAppData, DeleteRawAppResponse, DeleteAppData, DeleteAppResponse, UpdateAppData, UpdateAppResponse, ExecuteComponentData, ExecuteComponentResponse, RunFlowByPathData, RunFlowByPathResponse, RestartFlowAtStepData, RestartFlowAtStepResponse, RunScriptByHashData, RunScriptByHashResponse, RunScriptPreviewData, RunScriptPreviewResponse, RunCodeWorkflowTaskData, RunCodeWorkflowTaskResponse, RunRawScriptDependenciesData, RunRawScriptDependenciesResponse, RunFlowPreviewData, RunFlowPreviewResponse, ListQueueData, ListQueueResponse, GetQueueCountData, GetQueueCountResponse, GetCompletedCountData, GetCompletedCountResponse, ListFilteredUuidsData, ListFilteredUuidsResponse, CancelSelectionData, CancelSelectionResponse, ListCompletedJobsData, ListCompletedJobsResponse, ListJobsData, ListJobsResponse, GetDbClockResponse, CountJobsByTagData, CountJobsByTagResponse, GetJobData, GetJobResponse, GetRootJobIdData, GetRootJobIdResponse, GetJobLogsData, GetJobLogsResponse, GetJobArgsData, GetJobArgsResponse, GetJobUpdatesData, GetJobUpdatesResponse, GetLogFileFromStoreData, GetLogFileFromStoreResponse, GetFlowDebugInfoData, GetFlowDebugInfoResponse, GetCompletedJobData, GetCompletedJobResponse, GetCompletedJobResultData, GetCompletedJobResultResponse, GetCompletedJobResultMaybeData, GetCompletedJobResultMaybeResponse, DeleteCompletedJobData, DeleteCompletedJobResponse, CancelQueuedJobData, CancelQueuedJobResponse, CancelPersistentQueuedJobsData, CancelPersistentQueuedJobsResponse, ForceCancelQueuedJobData, ForceCancelQueuedJobResponse, CreateJobSignatureData, CreateJobSignatureResponse, GetResumeUrlsData, GetResumeUrlsResponse, ResumeSuspendedJobGetData, ResumeSuspendedJobGetResponse, ResumeSuspendedJobPostData, ResumeSuspendedJobPostResponse, SetFlowUserStateData, SetFlowUserStateResponse, GetFlowUserStateData, GetFlowUserStateResponse, ResumeSuspendedFlowAsOwnerData, ResumeSuspendedFlowAsOwnerResponse, CancelSuspendedJobGetData, CancelSuspendedJobGetResponse, CancelSuspendedJobPostData, CancelSuspendedJobPostResponse, GetSuspendedJobFlowData, GetSuspendedJobFlowResponse, PreviewScheduleData, PreviewScheduleResponse, CreateScheduleData, CreateScheduleResponse, UpdateScheduleData, UpdateScheduleResponse, SetScheduleEnabledData, SetScheduleEnabledResponse, DeleteScheduleData, DeleteScheduleResponse, GetScheduleData, GetScheduleResponse, ExistsScheduleData, ExistsScheduleResponse, ListSchedulesData, ListSchedulesResponse, ListSchedulesWithJobsData, ListSchedulesWithJobsResponse, SetDefaultErrorOrRecoveryHandlerData, SetDefaultErrorOrRecoveryHandlerResponse, CreateHttpTriggerData, CreateHttpTriggerResponse, UpdateHttpTriggerData, UpdateHttpTriggerResponse, DeleteHttpTriggerData, DeleteHttpTriggerResponse, GetHttpTriggerData, GetHttpTriggerResponse, ListHttpTriggersData, ListHttpTriggersResponse, ExistsHttpTriggerData, ExistsHttpTriggerResponse, ExistsRouteData, ExistsRouteResponse, UsedData, UsedResponse, ListInstanceGroupsResponse, GetInstanceGroupData, GetInstanceGroupResponse, CreateInstanceGroupData, CreateInstanceGroupResponse, UpdateInstanceGroupData, UpdateInstanceGroupResponse, DeleteInstanceGroupData, DeleteInstanceGroupResponse, AddUserToInstanceGroupData, AddUserToInstanceGroupResponse, RemoveUserFromInstanceGroupData, RemoveUserFromInstanceGroupResponse, ExportInstanceGroupsResponse, OverwriteInstanceGroupsData, OverwriteInstanceGroupsResponse, ListGroupsData, ListGroupsResponse, ListGroupNamesData, ListGroupNamesResponse, CreateGroupData, CreateGroupResponse, UpdateGroupData, UpdateGroupResponse, DeleteGroupData, DeleteGroupResponse, GetGroupData, GetGroupResponse, AddUserToGroupData, AddUserToGroupResponse, RemoveUserToGroupData, RemoveUserToGroupResponse, ListFoldersData, ListFoldersResponse, ListFolderNamesData, ListFolderNamesResponse, CreateFolderData, CreateFolderResponse, UpdateFolderData, UpdateFolderResponse, DeleteFolderData, DeleteFolderResponse, GetFolderData, GetFolderResponse, GetFolderUsageData, GetFolderUsageResponse, AddOwnerToFolderData, AddOwnerToFolderResponse, RemoveOwnerToFolderData, RemoveOwnerToFolderResponse, ListWorkersData, ListWorkersResponse, ExistsWorkerWithTagData, ExistsWorkerWithTagResponse, GetQueueMetricsResponse, ListWorkerGroupsResponse, GetConfigData, GetConfigResponse, UpdateConfigData, UpdateConfigResponse, DeleteConfigData, DeleteConfigResponse, ListConfigsResponse, GetGranularAclsData, GetGranularAclsResponse, AddGranularAclsData, AddGranularAclsResponse, RemoveGranularAclsData, RemoveGranularAclsResponse, UpdateCaptureData, UpdateCaptureResponse, CreateCaptureData, CreateCaptureResponse, GetCaptureData, GetCaptureResponse, StarData, StarResponse, UnstarData, UnstarResponse, GetInputHistoryData, GetInputHistoryResponse, GetArgsFromHistoryOrSavedInputData, GetArgsFromHistoryOrSavedInputResponse, ListInputsData, ListInputsResponse, CreateInputData, CreateInputResponse, UpdateInputData, UpdateInputResponse, DeleteInputData, DeleteInputResponse, DuckdbConnectionSettingsData, DuckdbConnectionSettingsResponse, DuckdbConnectionSettingsV2Data, DuckdbConnectionSettingsV2Response, PolarsConnectionSettingsData, PolarsConnectionSettingsResponse, PolarsConnectionSettingsV2Data, PolarsConnectionSettingsV2Response, S3ResourceInfoData, S3ResourceInfoResponse, DatasetStorageTestConnectionData, DatasetStorageTestConnectionResponse, ListStoredFilesData, ListStoredFilesResponse, LoadFileMetadataData, LoadFileMetadataResponse, LoadFilePreviewData, LoadFilePreviewResponse, LoadParquetPreviewData, LoadParquetPreviewResponse, LoadTableRowCountData, LoadTableRowCountResponse, LoadCsvPreviewData, LoadCsvPreviewResponse, DeleteS3FileData, DeleteS3FileResponse, MoveS3FileData, MoveS3FileResponse, FileUploadData, FileUploadResponse, FileDownloadData, FileDownloadResponse, FileDownloadParquetAsCsvData, FileDownloadParquetAsCsvResponse, GetJobMetricsData, GetJobMetricsResponse, SetJobProgressData, SetJobProgressResponse, GetJobProgressData, GetJobProgressResponse, ListLogFilesData, ListLogFilesResponse, GetLogFileData, GetLogFileResponse, ListConcurrencyGroupsResponse, DeleteConcurrencyGroupData, DeleteConcurrencyGroupResponse, GetConcurrencyKeyData, GetConcurrencyKeyResponse, ListExtendedJobsData, ListExtendedJobsResponse, SearchJobsIndexData, SearchJobsIndexResponse } from './types.gen.ts'; /** * get backend version @@ -4284,6 +4284,23 @@ export const getDbClock = (): CancelablePromise => { return url: '/jobs/db_clock' }); }; +/** + * Count jobs by tag + * @param data The data for the request. + * @param data.horizonSecs Past Time horizon in seconds (when to start the count = now - horizon) (default is 3600) + * @param data.workspaceId Specific workspace ID to filter results (optional) + * @returns unknown Job counts by tag + * @throws ApiError + */ +export const countJobsByTag = (data: CountJobsByTagData = {}): CancelablePromise => { return __request(OpenAPI, { + method: 'GET', + url: '/jobs/completed/count_by_tag', + query: { + horizon_secs: data.horizonSecs, + workspace_id: data.workspaceId + } +}); }; + /** * get job * @param data The data for the request. diff --git a/cli/gen/types.gen.ts b/cli/gen/types.gen.ts index e066cf1436093..4e62e030b50a7 100644 --- a/cli/gen/types.gen.ts +++ b/cli/gen/types.gen.ts @@ -613,6 +613,9 @@ export type WorkerPing = { last_job_id?: string; last_job_workspace_id?: string; occupancy_rate?: number; + occupancy_rate_15s?: number; + occupancy_rate_5m?: number; + occupancy_rate_30m?: number; memory?: number; vcpus?: number; memory_usage?: number; @@ -4406,6 +4409,22 @@ export type ListJobsResponse = (Array); export type GetDbClockResponse = (number); +export type CountJobsByTagData = { + /** + * Past Time horizon in seconds (when to start the count = now - horizon) (default is 3600) + */ + horizonSecs?: number; + /** + * Specific workspace ID to filter results (optional) + */ + workspaceId?: string; +}; + +export type CountJobsByTagResponse = (Array<{ + tag: string; + count: number; +}>); + export type GetJobData = { id: string; noLogs?: boolean; diff --git a/cli/instance.ts b/cli/instance.ts index e71f237d525f6..cf2d5f5b02a5f 100644 --- a/cli/instance.ts +++ b/cli/instance.ts @@ -6,6 +6,7 @@ import { yamlParse, Command, setClient, + Table, } from "./deps.ts"; import * as wmill from "./gen/services.gen.ts"; @@ -62,39 +63,49 @@ export async function allInstances(): Promise { return []; } } -export async function addInstance() { - let remote = await Input.prompt({ - message: "Enter the remote url of this instance", - default: "https://app.windmill.dev/", - }); - remote = new URL(remote).toString(); // add trailing slash in all cases! - - const defaultName = new URL(remote).hostname; +export async function addInstance( + opts: {}, + instanceName: string | undefined, + remote: string | undefined, + token: string | undefined +) { + if (!remote) { + remote = await Input.prompt({ + message: "Enter the remote url of this instance", + default: "https://my.windmill.dev/", + }); + remote = new URL(remote).toString(); // add trailing slash in all cases! + } - const name = await Input.prompt({ - message: "Enter a name for this instance", - default: defaultName, - }); + if (!instanceName) { + const defaultName = new URL(remote).hostname.split(".")[0]; - const prefix = name.toLowerCase().replace(/[^a-z0-9]/g, ""); + instanceName = await Input.prompt({ + message: "Enter a name for this instance", + default: defaultName, + }); + } + const prefix = instanceName.toLowerCase().replace(/[^a-z0-9]/g, ""); - let token: string | undefined = undefined; while (!token) { token = await loginInteractive(remote); } await appendInstance({ - name, + name: instanceName, remote, token, prefix, }); log.info( - colors.green.underline(`Added instance ${name} with remote ${remote}!`) + colors.green.underline( + `Added instance ${instanceName} with remote ${remote}!` + ) ); + await switchI({}, instanceName); return { - name, + name: instanceName, remote, token, prefix, @@ -160,37 +171,59 @@ export function compareInstanceObjects( return changes; } -type InstanceSyncOptions = { +export type InstanceSyncOptions = { skipUsers?: boolean; skipSettings?: boolean; skipConfigs?: boolean; skipGroups?: boolean; includeWorkspaces?: boolean; + instance?: string; baseUrl?: string; + token?: string; + yes?: boolean; }; -async function instancePull(opts: GlobalOptions & InstanceSyncOptions) { +export async function pickInstance(opts: InstanceSyncOptions, allowNew: boolean) { const instances = await allInstances(); - let instance: Instance; - if (instances.length < 1) { - instance = await addInstance(); - } else { - const choice = (await Select.prompt({ - message: "Select an instance to pull from", - options: [ - ...instances.map((i) => ({ - name: `${i.name} (${i.remote})`, - value: i.name, - })), - { name: "Add new instance", value: "new" }, - ], - })) as unknown as string; - - if (choice === "new") { - instance = await addInstance(); + if (opts.baseUrl && opts.token) { + log.info("Using instance fully defined by --base-url and --token") + return { + name: "custom", + remote: opts.baseUrl, + token: opts.token, + prefix: "custom", + }; + } + if (!allowNew && instances.length < 1) { + throw new Error("No instance found, please add one first"); + } + const instanceName = await getActiveInstance(opts); + let instance: Instance | undefined = instances.find( + (i) => i.name === instanceName + ); + if (!instance) { + if (instances.length < 1) { + instance = await addInstance({}, undefined, undefined, undefined); } else { - instance = instances.find((i) => i.name === choice)!; + const choice = (await Select.prompt({ + message: "Select an instance", + options: [ + ...instances.map((i) => ({ + name: `${i.name} (${i.remote})`, + value: i.name, + })), + { name: "Add new instance", value: "new" }, + ], + })) as unknown as string; + + if (choice === "new") { + instance = await addInstance({}, undefined, undefined, undefined); + } else { + instance = instances.find((i) => i.name === choice)!; + } } + } else { + log.info(`Selected instance: ${instance.name}`); } setClient( @@ -198,6 +231,10 @@ async function instancePull(opts: GlobalOptions & InstanceSyncOptions) { instance.remote.slice(0, instance.remote.length - 1) ); + return instance; +} +async function instancePull(opts: GlobalOptions & InstanceSyncOptions) { + const instance = await pickInstance(opts, true); log.info("Pulling instance-level changes"); log.info(`remote (${instance.name}) -> local`); @@ -221,10 +258,13 @@ async function instancePull(opts: GlobalOptions & InstanceSyncOptions) { const totalChanges = uChanges + sChanges + cChanges + gChanges; if (totalChanges > 0) { - const confirm = await Confirm.prompt({ - message: `Do you want to apply these ${totalChanges} instance-level changes?`, - default: true, - }); + let confirm = true; + if (opts.yes !== true) { + confirm = await Confirm.prompt({ + message: `Do you want to pull these ${totalChanges} instance-level changes?`, + default: true, + }); + } if (confirm) { if (!opts.skipUsers && uChanges > 0) { @@ -318,32 +358,7 @@ async function instancePull(opts: GlobalOptions & InstanceSyncOptions) { async function instancePush(opts: GlobalOptions & InstanceSyncOptions) { let instances = await allInstances(); - let instance: Instance; - if (instances.length < 1) { - instance = await addInstance(); - } else { - const choice = (await Select.prompt({ - message: "Select an instance to push to", - options: [ - ...instances.map((i) => ({ - name: `${i.name} (${i.remote})`, - value: i.name, - })), - { name: "Add new instance", value: "new" }, - ], - })) as unknown as string; - - if (choice === "new") { - instance = await addInstance(); - } else { - instance = instances.find((i) => i.name === choice)!; - } - } - - setClient( - instance.token, - instance.remote.slice(0, instance.remote.length - 1) - ); + const instance = await pickInstance(opts, true); log.info("Pushing instance-level changes"); log.info!(`remote (${instance.name}) <- local`); @@ -368,10 +383,13 @@ async function instancePush(opts: GlobalOptions & InstanceSyncOptions) { const totalChanges = uChanges + sChanges + cChanges + gChanges; if (totalChanges > 0) { - const confirm = await Confirm.prompt({ - message: `Do you want to apply these ${totalChanges} instance-level changes?`, - default: true, - }); + let confirm = true; + if (opts.yes !== true) { + confirm = await Confirm.prompt({ + message: `Do you want to apply these ${totalChanges} instance-level changes?`, + default: true, + }); + } if (confirm) { if (!opts.skipUsers && uChanges > 0) { @@ -485,36 +503,140 @@ async function instancePush(opts: GlobalOptions & InstanceSyncOptions) { } } +async function switchI(opts: {}, instanceName: string) { + const all = await allInstances(); + if (all.findIndex((x) => x.name === instanceName) === -1) { + log.info( + colors.red.bold(`! This instance ${instanceName} does not exist locally.`) + ); + log.info("available instances:"); + for (const w of all) { + log.info(" - " + w.name); + } + return; + } + + await Deno.writeTextFile( + (await getRootStore()) + "/activeInstance", + instanceName + ); + + log.info(colors.green.underline(`Switched to instance ${instanceName}`)); +} + +export async function getActiveInstance(opts: { + instance?: string; +}): Promise { + if (opts.instance) { + return opts.instance; + } + try { + return await Deno.readTextFile((await getRootStore()) + "/activeInstance"); + } catch { + return undefined; + } +} + +async function whoami(opts: {}) { + await pickInstance({}, false); + try { + const whoamiInfo = await wmill.globalWhoami(); + log.info(colors.green.underline(`global whoami infos:`)); + log.info(JSON.stringify(whoamiInfo, null, 2)); + } catch (error) { + log.error(colors.red(`Failed to retrieve whoami information: ${error.message}`)); + } +} + const command = new Command() .description( "sync local with a remote instance or the opposite (push or pull)" ) - .action(() => - log.info("2 actions available, pull and push. Use -h to display help.") - ) + .action(async () => { + log.info( + "4 actions available, add, remove, switch, pull and push. Use -h to display help." + ); + const activeInstance = await getActiveInstance({}); + + new Table() + .header(["name", "remote", "token"]) + .padding(2) + .border(true) + .body( + (await allInstances()).map((x) => [ + x.name === activeInstance ? colors.underline(x.name) : x.name, + x.remote, + x.token.substring(0, 7) + "***", + ]) + ) + .render(); + if (activeInstance) { + log.info(`Selected instance: ${activeInstance}`); + } else { + log.info("No active instance selected"); + } + log.info("Use 'wmill instance add' to add a new instance"); + }) + .command("add") + .description("Add a new instance") + .action(addInstance as any) + .arguments("[instance_name:string] [remote:string] [token:string]") + .command("remove") + .description("Remove an instance") + .complete("instance", async () => (await allInstances()).map((x) => x.name)) + .arguments("") + .action(async (instance) => { + const instances = await allInstances(); + + const choice = (await Select.prompt({ + message: "Select an instance to remove", + options: instances.map((i) => ({ + name: `${i.name} (${i.remote})`, + value: i.name, + })), + })) as unknown as string; + + await removeInstance(choice); + log.info(colors.green.underline(`Removed instance ${choice}`)); + }) + .command("switch") + .complete("instance", async () => (await allInstances()).map((x) => x.name)) + .arguments("") + .description("Switch the current instance") + .action(switchI as any) .command("pull") .description( "Pull instance settings, users, configs, instance groups and overwrite local" ) + .option("--yes", "Pull without needing confirmation") .option("--skip-users", "Skip pulling users") .option("--skip-settings", "Skip pulling settings") .option("--skip-configs", "Skip pulling configs (worker groups and SMTP)") .option("--skip-groups", "Skip pulling instance groups") .option("--include-workspaces", "Also pull workspaces") + .action(instancePull as any) .command("push") .description( "Push instance settings, users, configs, group and overwrite remote" ) + .option("--yes", "Push without needing confirmation") .option("--skip-users", "Skip pushing users") .option("--skip-settings", "Skip pushing settings") .option("--skip-configs", "Skip pushing configs (worker groups and SMTP)") .option("--skip-groups", "Skip pushing instance groups") .option("--include-workspaces", "Also push workspaces") + .option( + "--instance", + "Name of the instance to push to, override the active instance" + ) .option( "--base-url", - "Base url to be passed to the instance settings instead of the local one" + "If used with --token, will be used as the base url for the instance" ) - .action(instancePush as any); + .action(instancePush as any) + .command("whoami") + .description("Display information about the currently logged-in user") + .action(whoami as any); export default command; diff --git a/cli/main.ts b/cli/main.ts index b5f9bb0899780..d86356d486245 100644 --- a/cli/main.ts +++ b/cli/main.ts @@ -19,6 +19,8 @@ import folder from "./folder.ts"; import schedule from "./schedule.ts"; import sync from "./sync.ts"; import instance from "./instance.ts"; +import workerGroups from "./worker_groups.ts"; + import dev from "./dev.ts"; import { fetchVersion } from "./context.ts"; import { GlobalOptions } from "./types.ts"; @@ -28,6 +30,8 @@ import { NpmProvider } from "./upgrade.ts"; import { pull as hubPull } from "./hub.ts"; import { pull, push } from "./sync.ts"; import { add as workspaceAdd } from "./workspace.ts"; +import workers from "./workers.ts"; +import queues from "./queues.ts"; export { flow, @@ -63,7 +67,7 @@ const command = new Command() .action(() => log.info(`Welcome to Windmill CLI ${VERSION}. Use -h for help.`) ) - .description("A simple CLI tool for windmill.") + .description("Windmill CLI") .globalOption( "--workspace ", @@ -117,7 +121,9 @@ const command = new Command() .command("dev", dev) .command("sync", sync) .command("instance", instance) - + .command("worker-groups", workerGroups) + .command("workers", workers) + .command("queues", queues) .command("version", "Show version information") .action(async (opts) => { console.log("CLI build against " + VERSION); diff --git a/cli/queues.ts b/cli/queues.ts new file mode 100644 index 0000000000000..1d027bf5038be --- /dev/null +++ b/cli/queues.ts @@ -0,0 +1,140 @@ +import { Command, Table } from "./deps.ts"; +import { log } from "./deps.ts"; +import * as wmill from "./gen/services.gen.ts"; +import { pickInstance } from "./instance.ts"; + +type Data = { + count: number; + later: number; + waiting: number; + running: number; + rps30s: string; + rps5min: string; + rps30min: string; + rps24h: string; +} + +type GlobalOptions = { + instance?: string; + baseUrl?: string; +}; + + +function createRow(tag: string, data: Record) { + if (data[tag]) { + return; + } else { + data[tag] = { + count: 0, + waiting: 0, + later: 0, + running: 0, + rps30s: "", + rps5min: "", + rps30min: "", + rps24h: "", + } + } +} +async function displayQueues(opts: GlobalOptions, workspace?: string) { + const activeInstance = await pickInstance(opts, true); + if (activeInstance) { + try { + const queuedJobs = await wmill.listQueue({workspace: workspace ?? 'admins', allWorkspaces: workspace === undefined}); + const jobCounts30s = await wmill.countJobsByTag({ + horizonSecs: 30, + workspaceId: workspace, + }); + const nowFromDb = new Date(await wmill.getDbClock()); + const jobCounts5min = await wmill.countJobsByTag({ + horizonSecs: 300, + workspaceId: workspace, + + }); + const jobCounts30min = await wmill.countJobsByTag({ + horizonSecs: 1800, + workspaceId: workspace, + + }); + const jobCounts24h = await wmill.countJobsByTag({ + horizonSecs: 86400, + workspaceId: workspace, + + }); + + const data: Record = {} + + for (const job of queuedJobs) { + createRow(job.tag, data); + const scheduledFor = new Date(job.scheduled_for ?? ""); + if (job.running) { + data[job.tag].running += 1; + } else if (scheduledFor <= nowFromDb) { + data[job.tag].waiting += 1; + } else { + data[job.tag].later += 1; + } + } + + for (const count of jobCounts30s) { + const tag = count.tag; + createRow(tag, data); + data[tag].rps30s = (count.count / 30).toFixed(3); + } + for (const count of jobCounts5min) { + const tag = count.tag; + createRow(tag, data); + data[tag].rps5min = (count.count / 300).toFixed(3); + } + for (const count of jobCounts30min) { + const tag = count.tag; + createRow(tag, data); + data[tag].rps30min = (count.count / 1800).toFixed(3); + } + + for (const count of jobCounts24h) { + const tag = count.tag; + createRow(tag, data); + data[tag].rps24h = (count.count / 86400).toFixed(3); + } + + const table = new Table(); + table.header([ + "", + "Running", + "Waiting", + "Later", + "RPS (30s)", + "RPS (5min)", + "RPS (30min)", + "RPS (24h)", + ]); + let body = [] + for (const tag in data) { + body.push([tag, data[tag].running, data[tag].waiting, data[tag].later, data[tag].rps30s, data[tag].rps5min, data[tag].rps30min, data[tag].rps24h]); + } + table.body(body).render(); + + } catch (error) { + log.error("Failed to fetch queue metrics:", error); + } + } else { + log.info("No active instance found"); + log.info("Use 'wmill instance add' to add a new instance"); + } +} + +const command = new Command() + .description("List all queues with their metrics") + .arguments("[workspace:string] the optional workspace to filter by (default to all workspaces)") + .option( + "--instance [instance]", + "Name of the instance to push to, override the active instance" + ) + .option( + "--base-url [baseUrl]", + "If used with --token, will be used as the base url for the instance" + ) + .action(displayQueues as any); + +export default command; diff --git a/cli/settings.ts b/cli/settings.ts index 9ef51e80d21ed..b3a1b6a272a3b 100644 --- a/cli/settings.ts +++ b/cli/settings.ts @@ -8,6 +8,7 @@ import { isSuperset } from "./types.ts"; import { deepEqual } from "./utils.ts"; import * as wmill from "./gen/services.gen.ts"; import { Config, GlobalSetting } from "./gen/types.gen.ts"; +import { removeWorkerPrefix } from "./worker_groups.ts"; export interface SimplifiedSettings { // slack_team_id?: string; @@ -354,7 +355,12 @@ export async function pushInstanceSettings( } export async function pullInstanceConfigs(preview = false) { - const remoteConfigs = await wmill.listConfigs(); + const remoteConfigs = (await wmill.listConfigs()).map((x) => { + return { + ...x, + name: removeWorkerPrefix(x.name), + }; + }); if (preview) { let localConfigs: Config[] = []; @@ -383,7 +389,12 @@ export async function pullInstanceConfigs(preview = false) { } export async function pushInstanceConfigs(preview: boolean = false) { - const remoteConfigs = await wmill.listConfigs(); + const remoteConfigs = (await wmill.listConfigs()).map((x) => { + return { + ...x, + name: removeWorkerPrefix(x.name), + }; + }); const localConfigs = (await Deno.readTextFile("instance_configs.yaml") .then((raw) => yamlParse(raw)) .catch(() => [])) as Config[]; @@ -404,7 +415,7 @@ export async function pushInstanceConfigs(preview: boolean = false) { } try { await wmill.updateConfig({ - name: config.name, + name: config.name.startsWith('worker__') ? config.name : `worker__${config.name}`, requestBody: config.config, }); } catch (err) { diff --git a/cli/worker_groups.ts b/cli/worker_groups.ts new file mode 100644 index 0000000000000..bd1eb6f56f7a3 --- /dev/null +++ b/cli/worker_groups.ts @@ -0,0 +1,138 @@ +import { Command, Confirm, setClient, Table } from "./deps.ts"; + +import { log } from "./deps.ts"; +import { allInstances, getActiveInstance, Instance, InstanceSyncOptions, pickInstance } from "./instance.ts"; +import * as wmill from "./gen/services.gen.ts"; +import { pullInstanceConfigs, pushInstanceConfigs } from "./settings.ts"; + +type GlobalOptions = { + instance?: string; + baseUrl?: string; +}; +export async function getInstance(opts: GlobalOptions) { + const instances = await allInstances(); + + const instanceName = await getActiveInstance(opts); + const instance = instances.find((i) => i.name === instanceName); + if (instance) { + setClient( + instance.token, + instance.remote.slice(0, instance.remote.length - 1) + ); + } + return instance; +} + +export function removeWorkerPrefix(name: string) { + if (name.startsWith("worker__")) { + return name.substring(8); + } + return name; +} + +export async function displayWorkerGroups(opts: void) { + log.info("2 actions available, pull and push."); + const activeInstance = await getActiveInstance({}); + + if (activeInstance) { + log.info("Active instance: " + activeInstance); + const instance = await getInstance({}); + if (instance) { + const wGroups = await wmill.listWorkerGroups(); + new Table() + .header(["name", "config"]) + .padding(2) + .border(true) + .body(wGroups.map((x) => [removeWorkerPrefix(x.name), JSON.stringify(x.config, null, 2)])) + .render(); + } else { + log.error(`Instance ${activeInstance} not found`); + } + } else { + log.info("No active instance found"); + log.info("Use 'wmill instance add' to add a new instance"); + } +} +async function pullWorkerGroups(opts: InstanceSyncOptions) { + await pickInstance(opts, true); + + const totalChanges = await pullInstanceConfigs(true) ?? 0; + + if (totalChanges === 0) { + log.info("No changes to apply"); + return; + } + + let confirm = true; + if (opts.yes !== true) { + confirm = await Confirm.prompt({ + message: `Do you want to pul these ${totalChanges} instance-level changes?`, + default: true, + }); + } + + if (confirm) { + await pullInstanceConfigs(false); + } +} + +async function pushWorkerGroups(opts: InstanceSyncOptions) { + await pickInstance(opts, true); + + const totalChanges = await pushInstanceConfigs(true) ?? 0; + + if (totalChanges === 0) { + log.info("No changes to apply"); + return; + } + + let confirm = true; + if (opts.yes !== true) { + confirm = await Confirm.prompt({ + message: `Do you want to apply these ${totalChanges} instance-level changes?`, + default: true, + }); + } + + if (confirm) { + await pushInstanceConfigs(false); + } +} + + + +const command = new Command() + .description("display worker groups, pull and push worker groups configs") + .action(displayWorkerGroups) + .command("pull") + .description( + "Pull worker groups (similar to `wmill instance pull --skip-users --skip-settings --skip-groups`)" + ) + .option( + "--instance", + "Name of the instance to push to, override the active instance" + ) + .option( + "--base-url", + "Base url to be passed to the instance settings instead of the local one" + ) + .option("--yes", "Pull without needing confirmation") + .action(pullWorkerGroups as any) + .command("push") + .description( + "Push instance settings, users, configs, group and overwrite remote" + ) + .option( + "--instance [instance]", + "Name of the instance to push to, override the active instance" + ) + .option( + "--base-url [baseUrl]", + "If used with --token, will be used as the base url for the instance" + ) + .option("--yes", "Push without needing confirmation") + .action(pushWorkerGroups as any); + + + +export default command; diff --git a/cli/workers.ts b/cli/workers.ts new file mode 100644 index 0000000000000..ff28a1c504fad --- /dev/null +++ b/cli/workers.ts @@ -0,0 +1,105 @@ +import { Command, Table } from "./deps.ts"; +import { log } from "./deps.ts"; +import * as wmill from "./gen/services.gen.ts"; +import { pickInstance } from "./instance.ts"; + +type GlobalOptions = { + instance?: string; + baseUrl?: string; +}; + + +function toPercent(value: number | undefined): string { + return value != undefined ? `${(value * 100).toFixed(1)}%` : '?%'; +} + +async function displayWorkers(opts: GlobalOptions) { + const activeInstance = await pickInstance(opts, true); + + if (activeInstance) { + const workerGroups = await wmill.listWorkerGroups(); + const workers = await wmill.listWorkers({ + pingSince: 10 + }); + + const groupedWorkers = workerGroups.map(group => { + + + const groupWorkers = workers.filter(worker => worker.worker_group === group.name); + return { + groupName: group.name, + workers: groupWorkers + }; + }); + + // Add workers that don't belong to any worker group + const ungroupedWorkers = workers.filter(worker => + !workerGroups.some(group => group.name === worker.worker_group) + ); + + if (ungroupedWorkers.length > 0) { + ungroupedWorkers.forEach(worker => { + const groupName = worker.worker_group || "Ungrouped"; + let group = groupedWorkers.find(g => g.groupName === groupName); + if (!group) { + group = { groupName, workers: [] }; + groupedWorkers.push(group); + } + group.workers.push(worker); + }); + } + + // Sort groupedWorkers + groupedWorkers.sort((a, b) => { + // Always put 'default' first + if (a.groupName === 'default') return -1; + if (b.groupName === 'default') return 1; + + // Then sort by number of workers (descending order) + return b.workers.length - a.workers.length; + }); + + groupedWorkers.forEach(group => { + log.info(`\nWorker Group: ${group.groupName} (${group.workers.length} workers)`); + if (group.workers.length === 0) { + log.info(" No workers in this group"); + } else { + + new Table() + .header(["Worker ID", "Host", "Queues", "Jobs", "Occupancy rate 15s/5m/30m/ever)", "Last job", "Last Ping"]) + .padding(2) + .border(true) + .maxColWidth(30) + .body(group.workers.map(worker => [ + worker.worker, + worker.worker_instance, + worker.custom_tags?.join(', ') || '', + worker.jobs_executed, + `${toPercent(worker.occupancy_rate_15s)}/${toPercent(worker.occupancy_rate_5m)}/${toPercent(worker.occupancy_rate_30m)}/${toPercent(worker.occupancy_rate)}`, + + worker.last_job_id ? worker.last_job_id + ' ' +worker.last_job_workspace_id : '', + `${worker.last_ping}s ago` + ])) + .render(); + } + }); + + } else { + log.info("No active instance found"); + log.info("Use 'wmill instance add' to add a new instance"); + } +} + +const command = new Command() + .description("List all workers grouped by worker groups") + .option( + "--instance [instance]", + "Name of the instance to push to, override the active instance" + ) + .option( + "--base-url [baseUrl]", + "If used with --token, will be used as the base url for the instance" + ) + .action(displayWorkers as any); + +export default command; diff --git a/frontend/src/routes/(root)/(logged)/workers/+page.svelte b/frontend/src/routes/(root)/(logged)/workers/+page.svelte index a1f9880348c2a..f3a595a56147a 100644 --- a/frontend/src/routes/(root)/(logged)/workers/+page.svelte +++ b/frontend/src/routes/(root)/(logged)/workers/+page.svelte @@ -74,7 +74,7 @@ async function loadWorkerGroups(): Promise { try { workerGroups = Object.fromEntries( - (await ConfigService.listWorkerGroups()).map((x) => [x.name.substring(8), x.config]) + (await ConfigService.listWorkerGroups()).map((x) => [x.name, x.config]) ) } catch (err) { sendUserToast(`Could not load worker groups: ${err}`, true) @@ -240,6 +240,14 @@ const openSearchWithPrefilledText: (t?: string) => void = getContext( 'openSearchWithPrefilledText' ) + + function displayOccupancyRate(occupancy_rate: number | undefined) { + if (occupancy_rate == undefined) { + return '--' + } + + return Math.ceil(occupancy_rate * 100) + '%' + } {#if $superadmin} @@ -503,7 +511,7 @@ Jobs ran {#if (!config || config?.dedicated_worker == undefined) && $superadmin} Last job - Occupancy rate + Occupancy rate
(15s/5m/30m/ever)
{/if} Memory usage
(Windmill)
Limits @@ -538,7 +546,7 @@ {#if workers} - {#each workers as { worker, custom_tags, last_ping, started_at, jobs_executed, last_job_id, last_job_workspace_id, occupancy_rate, wm_version, vcpus, memory, memory_usage, wm_memory_usage }} + {#each workers as { worker, custom_tags, last_ping, started_at, jobs_executed, last_job_id, last_job_workspace_id, occupancy_rate_15s, occupancy_rate_5m, occupancy_rate_30m, occupancy_rate, wm_version, vcpus, memory, memory_usage, wm_memory_usage }} {worker} @@ -564,7 +572,11 @@ {/if} - {Math.ceil((occupancy_rate ?? 0) * 100)}% + {displayOccupancyRate(occupancy_rate_15s)}/{displayOccupancyRate( + occupancy_rate_5m + )}/{displayOccupancyRate(occupancy_rate_30m)}/{displayOccupancyRate( + occupancy_rate + )} {/if}