From eac02ce9b709afb8d1a2dfb537c735fc8595b58d Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sun, 19 Jan 2025 22:24:57 +0100 Subject: [PATCH 01/15] add slurmctld stats --- pyslurm/core/slurmctld/__init__.py | 7 + pyslurm/core/slurmctld/stats.pxd | 136 +++++++++++++++ pyslurm/core/slurmctld/stats.pyx | 258 +++++++++++++++++++++++++++++ 3 files changed, 401 insertions(+) create mode 100644 pyslurm/core/slurmctld/stats.pxd create mode 100644 pyslurm/core/slurmctld/stats.pyx diff --git a/pyslurm/core/slurmctld/__init__.py b/pyslurm/core/slurmctld/__init__.py index fe8f970b..e61544dd 100644 --- a/pyslurm/core/slurmctld/__init__.py +++ b/pyslurm/core/slurmctld/__init__.py @@ -5,6 +5,13 @@ CgroupConfig, ) from .enums import ShutdownMode +from .stats import ( + Statistics, + RPCUserStatistic, + RPCUserStatistics, + RPCTypeStatistic, + RPCTypeStatistics, +) from .base import ( PingResponse, ping, diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd new file mode 100644 index 00000000..16b1600b --- /dev/null +++ b/pyslurm/core/slurmctld/stats.pxd @@ -0,0 +1,136 @@ +######################################################################### +# slurmctld/stats.pxd - pyslurm slurmctld statistics api (sdiag) +######################################################################### +# Copyright (C) 2025 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from pyslurm.slurm cimport ( + stats_info_response_msg_t, + stats_info_request_msg_t, + slurm_get_statistics, + slurm_reset_statistics, + slurm_free_stats_response_msg, + xfree, +) +from pyslurm.utils cimport cstr +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t +from pyslurm.utils.uint cimport ( + u16_parse, + u32_parse, + u64_parse, + u16_parse_bool, +) + +cdef extern const char *rpc_num2string(uint16_t msg_type) + +cdef parse_response(stats_info_response_msg_t *ptr) + + +cdef class RPCTypeStatistic: + + cdef public: + id + name + count + time + average_time + queued + dropped + cycle_last + cycle_max + + +cdef class RPCUserStatistic: + + cdef public: + user_id + user_name + count + time + average_time + + +cdef class RPCTypeStatistics(dict): + + @staticmethod + cdef RPCTypeStatistics from_ptr(stats_info_response_msg_t *ptr, rpc_queue_enabled) + + +cdef class RPCUserStatistics(dict): + + @staticmethod + cdef RPCUserStatistics from_ptr(stats_info_response_msg_t *ptr) + + +cdef class Statistics: + + cdef public: + request_time + data_since + server_thread_count + agent_queue_size + agent_count + agent_thread_count + dbd_agent_queue_size + rpc_queue_enabled + + jobs_submitted + jobs_started + jobs_completed + jobs_canceled + jobs_failed + jobs_pending + jobs_running + job_states_ts + + schedule_cycle_max + schedule_cycle_last + schedule_cycle_counter + schedule_cycle_mean + schedule_cycle_mean_depth + schedule_queue_len + cycles_per_minute + schedule_exit + + backfill_active + backfilled_jobs + last_backfilled_jobs + backfilled_het_jobs + backfill_last_cycle_when + backfill_last_cycle + backfill_cycle_last + backfill_cycle_max + backfill_total_cycles + backfill_cycle_mean + backfill_last_depth_cycle + backfill_last_depth_cycle_try_sched + backfill_mean_depth_cycle + backfill_mean_depth_cycle_try_sched + backfill_queue_len + backfill_queue_len_mean + backfill_table_size + backfill_table_size_mean + + gettimeofday_latency + + rpc_type_stats + rpc_user_stats diff --git a/pyslurm/core/slurmctld/stats.pyx b/pyslurm/core/slurmctld/stats.pyx new file mode 100644 index 00000000..f903e539 --- /dev/null +++ b/pyslurm/core/slurmctld/stats.pyx @@ -0,0 +1,258 @@ +######################################################################### +# slurmctld/stats.pyx - pyslurm slurmctld statistics api (sdiag) +######################################################################### +# Copyright (C) 2025 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm.core.error import verify_rpc, RPCError +from pyslurm.utils.uint import ( + u16_parse, + u32_parse, + u64_parse, +) +from pyslurm.constants import UNLIMITED +from pyslurm.utils.ctime import _raw_time +from pyslurm.utils.helpers import ( + instance_to_dict, + uid_to_name, +) +from pyslurm.utils import cstr +from pyslurm import xcollections + + +cdef class RPCTypeStatistic: + + def __init__(self): + self.id = 0 + self.name = None + self.count = 0 + self.time = 0 + self.average_time = 0 + self.queued = 0 + self.dropped = 0 + self.cycle_last = 0 + self.cycle_max = 0 + + def to_dict(self): + return instance_to_dict(self) + + +cdef class RPCUserStatistic: + + def __init__(self): + self.user_id = 0 + self.user_name = None + self.count = 0 + self.time = 0 + self.average_time = 0 + + def to_dict(self): + return instance_to_dict(self) + + +cdef class RPCTypeStatistics(dict): + + def __init__(self): + super().__init__() + + @staticmethod + cdef RPCTypeStatistics from_ptr(stats_info_response_msg_t *ptr, + rpc_queue_enabled): + out = RPCTypeStatistics() + + for i in range(ptr.rpc_type_size): + stats = RPCTypeStatistic() + stats.id = ptr.rpc_type_id[i] + stats.name = rpc_num2string(ptr.rpc_type_id[i]) + stats.count = ptr.rpc_type_cnt[i] + stats.time = ptr.rpc_type_time[i] + + if ptr.rpc_type_cnt[i]: + stats.average_time = ptr.rpc_type_time[i] / ptr.rpc_type_cnt[i] + + if rpc_queue_enabled: + stats.queued = ptr.rpc_type_queued[i] + stats.dropped = ptr.rpc_type_dropped[i] + stats.cycle_last = ptr.rpc_type_cycle_last[i] + stats.cycle_max = ptr.rpc_type_cycle_max[i] + + if stats.name: + out[stats.name] = stats + + return out + + @property + def total_queued(self): + return xcollections.sum_property(self, RPCTypeStatistic.queued) + + @property + def total_count(self): + return xcollections.sum_property(self, RPCTypeStatistic.count) + + @property + def total_time(self): + return xcollections.sum_property(self, RPCTypeStatistic.time) + + @property + def total_dropped(self): + return xcollections.sum_property(self, RPCTypeStatistic.dropped) + + +cdef class RPCUserStatistics(dict): + + def __init__(self): + super().__init__() + + @staticmethod + cdef RPCUserStatistics from_ptr(stats_info_response_msg_t *ptr): + out = RPCUserStatistics() + + for i in range(ptr.rpc_user_size): + user_id = ptr.rpc_user_id[i] + user = uid_to_name(user_id) + if not user: + continue + + stats = RPCUserStatistic() + stats.user_id = ptr.rpc_user_id[i] + stats.user_name = user + stats.count = ptr.rpc_user_cnt[i] + stats.time = ptr.rpc_user_time[i] + + if ptr.rpc_user_cnt[i]: + stats.average_time = ptr.rpc_user_time[i] / ptr.rpc_user_cnt[i] + + out[user] = stats + + return out + + @property + def total_queued(self): + return xcollections.sum_attr(self, "queued") + + +cdef parse_response(stats_info_response_msg_t *ptr): + cdef Statistics out = Statistics() + + cycle_count = ptr.schedule_cycle_counter + bf_cycle_count = ptr.bf_cycle_counter + + out.server_thread_count = ptr.server_thread_count + out.rpc_queue_enabled = True if ptr.rpc_queue_enabled else False + out.agent_queue_size = ptr.agent_queue_size + out.agent_count = ptr.agent_count + out.agent_thread_count = ptr.agent_thread_count + out.dbd_agent_queue_size = ptr.dbd_agent_queue_size + out.jobs_submitted = ptr.jobs_submitted + out.jobs_started = ptr.jobs_started + out.jobs_completed = ptr.jobs_completed + out.jobs_canceled = ptr.jobs_canceled + out.jobs_failed = ptr.jobs_failed + out.jobs_pending = ptr.jobs_pending + out.jobs_running = ptr.jobs_running + out.schedule_cycle_max = ptr.schedule_cycle_max + out.schedule_cycle_last = ptr.schedule_cycle_last + out.schedule_cycle_counter = cycle_count + out.schedule_queue_len = ptr.schedule_queue_len + + # TODO: job_states_ts ? + # TODO: scheduler exits + + if cycle_count > 0: + out.schedule_cycle_mean = ptr.schedule_cycle_sum / cycle_count + out.schedule_cycle_mean_depth = ptr.schedule_cycle_depth / cycle_count + + ts = ptr.req_time - ptr.req_time_start + if ts > 60: + out.cycles_per_minute = cycle_count / (ts / 60) + + + out.backfill_active = bool(ptr.bf_active) + out.backfilled_jobs = ptr.bf_backfilled_jobs + out.last_backfilled_jobs = ptr.bf_last_backfilled_jobs + out.backfilled_het_jobs = ptr.bf_backfilled_het_jobs + out.backfill_last_cycle_when = ptr.bf_when_last_cycle + out.backfill_last_cycle = ptr.bf_cycle_last + out.backfill_cycle_max = ptr.bf_cycle_max + out.backfill_total_cycles = bf_cycle_count + out.backfill_last_depth_cycle = ptr.bf_last_depth + out.backfill_last_depth_cycle_try_sched = ptr.bf_last_depth_try + out.backfill_queue_len = ptr.bf_queue_len + out.backfill_table_size = ptr.bf_table_size + + if bf_cycle_count > 0: + out.backfill_cycle_mean = ptr.bf_cycle_sum / bf_cycle_count + out.backfill_mean_depth_cycle = ptr.bf_depth_sum / bf_cycle_count + out.backfill_mean_depth_cycle_try_sched = ptr.bf_depth_try_sum / bf_cycle_count + out.backfill_queue_len_mean = ptr.bf_queue_len_sum / bf_cycle_count + out.backfill_table_size_mean = ptr.bf_table_size_sum / bf_cycle_count + + out.gettimeofday_latency = ptr.gettimeofday_latency + + out.rpc_type_stats = RPCTypeStatistics.from_ptr(ptr, out.rpc_queue_enabled) + out.rpc_user_stats = RPCUserStatistics.from_ptr(ptr) + + return out + + +cdef class Statistics: + + def __init__(self): + self.schedule_cycle_mean = 0 + self.schedule_cycle_mean_depth = 0 + self.cycles_per_minute = 0 + self.backfill_cycle_mean = 0 + self.backfill_mean_depth_cycle = 0 + self.backfill_mean_depth_cycle_try_sched = 0 + self.backfill_queue_len_mean = 0 + self.backfill_table_size_mean = 0 + + @staticmethod + def load(): + cdef: + stats_info_request_msg_t req + stats_info_response_msg_t *resp = NULL + Statistics out = None + + req.command_id = slurm.STAT_COMMAND_GET + verify_rpc(slurm_get_statistics(&resp, &req)) + + try: + out = parse_response(resp) + except Exception as e: + raise e + finally: + slurm_free_stats_response_msg(resp) + + return out + + @staticmethod + def reset(): + cdef stats_info_request_msg_t req + verify_rpc(slurm_reset_statistics(&req)) + + def to_dict(self): + out = instance_to_dict(self) + out["rpc_type_stats"] = xcollections.dict_recursive(self.rpc_type_stats) + out["rpc_user_stats"] = xcollections.dict_recursive(self.rpc_user_stats) + return out + + From ab6d007ea739339cf5248ff762af16293cc15973 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Tue, 21 Jan 2025 21:38:06 +0100 Subject: [PATCH 02/15] wip: docs and few attribute renames --- pyslurm/core/slurmctld/stats.pxd | 233 +++++++++++++++++++++++++++++-- pyslurm/core/slurmctld/stats.pyx | 127 ++++++++++------- 2 files changed, 299 insertions(+), 61 deletions(-) diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd index 16b1600b..d7319d98 100644 --- a/pyslurm/core/slurmctld/stats.pxd +++ b/pyslurm/core/slurmctld/stats.pxd @@ -45,8 +45,49 @@ cdef extern const char *rpc_num2string(uint16_t msg_type) cdef parse_response(stats_info_response_msg_t *ptr) +cdef class PendingRPC: + """A RPC in pending State + + Attributes: + id (int): + The numeric ID of the RPC type. + name (str): + The string representation of the RPC. + count (int): + How many RPCs are pending of this type. + """ + cdef public: + id + name + count + + cdef class RPCTypeStatistic: + """Statistics for a specific RPC Type + Attributes: + id (int): + The numeric ID of the RPC Type + name (str): + The string representation of the RPC + count (int): + How many times this RPC was issued since the last time the + statistics were cleared. + time (int): + How much total time it has taken to process this RPC. The unit is + microseconds + average_time (int): + How much time on average it has taken to process this RPC. The unit + is microseconds. + queued (int): + How many of these RPCs are still queued. + dropped (int): + How many of these RPCs have been dropped. + cycle_last (int): + Number of RPCs processed within the last RPC queue cycle + cycle_max (int): + Maximum number of RPCs processed within a RPC queue cycle + """ cdef public: id name @@ -70,28 +111,195 @@ cdef class RPCUserStatistic: cdef class RPCTypeStatistics(dict): + """Collection of [](pyslurm.slurmctld.RPCTypeStatistic)'s + Attributes: + count (int): + Total amount of RPCs made to the `slurmctld` since last reset. + time (int): + Total amount of time it has taken to process all RPCs made yet. + queued (int): + Total amount of RPCs queued. + queued (int): + Total amount of RPCs dropped. + """ @staticmethod cdef RPCTypeStatistics from_ptr(stats_info_response_msg_t *ptr, rpc_queue_enabled) cdef class RPCUserStatistics(dict): + """Collection of [](pyslurm.slurmctld.RPCUserStatistic)'s + Attributes: + count (int): + Total amount of RPCs made to the `slurmctld` since last reset. + time (int): + Total amount of time it has taken to process all RPCs made yet. + """ @staticmethod cdef RPCUserStatistics from_ptr(stats_info_response_msg_t *ptr) +cdef class PendingRPCStatistics(dict): + """Collection of [](pyslurm.slurmctld.PendingRPCStatistics) + + Attributes: + count (int): + Total amount of RPCs made to the `slurmctld` since last reset. + """ + @staticmethod + cdef PendingRPCStatistics from_ptr(stats_info_response_msg_t *ptr) + + cdef class Statistics: + """Statistics for the `slurmctld`. + Attributes: + request_time (int): + Time when the data was requested. This is a unix timestamp. + data_since (int): + The date when `slurmctld` started gathering statistics. This is a + unix timestamp. + server_thread_count (int): + The number of current active slurmctld threads. + rpc_queue_enabled (bool): + Whether RPC queuing is enabled. + agent_queue_size (int): + Count of enqueued outgoing RPC requests in an internal retry list. + agent_count (int): + Number of agent threads. + agent_thread_count (int): + Total count of active threads created by all the agent threads. + dbd_agent_queue_size (int): + Number of messages intended for the `slurmdbd`. If the `slurmdbd` + goes down, then this number starts going up. + jobs_submitted (int): + Number of jobs submitted since last reset + jobs_started (int): + Number of jobs started since last reset. This includes backfilled + jobs. + jobs_completed (int): + Number of jobs completed since last reset. + jobs_canceled (int): + Number of jobs canceled since last reset. + jobs_failed (int): + Number of jobs failed due to slurmd or other internal issues since + last reset. + jobs_pending (int): + Number of jobs pending. + jobs_running (int): + Number of jobs running. + schedule_cycle_last (int): + Time in microseconds for last scheduling cycle. + schedule_cycle_max (int): + Maximum time in microseconds for any scheduling cycle since last + reset. + schedule_cycle_counter (int): + Total run time in microseconds for all scheduling cycles since last + reset. + schedule_cycle_mean (int): + Mean time in microseconds for all scheduling cycles since last + reset. + schedule_cycle_mean_depth (int): + Mean of cycle depth. Depth means number of jobs processed in a + scheduling cycle. + schedule_cycles_per_minute (int): + Counter of scheduling executions per minute. + schedule_queue_length (int): + Length of jobs pending queue. + backfill_active (bool): + Whether these statistics have been gathered during backfilling + operation. + backfilled_jobs (int): + Number of jobs started thanks to backfilling since last slurm + start. + last_backfilled_jobs (int): + Number of jobs started thanks to backfilling since last time stats + where reset. By default these values are reset at midnight UTC + time. + backfilled_het_jobs (int): + Number of heterogeneous job components started thanks to + backfilling since last Slurm start. + backfill_cycle_counter (int): + Number of backfill scheduling cycles since last reset. + backfill_last_cycle_when (int): + Time when last backfill scheduling cycle happened. This is a unix + timestamp. + backfill_last_cycle (int): + Time in microseconds of last backfill scheduling cycle. It counts + only execution time, removing sleep time inside a scheduling cycle + when it executes for an extended period time. Note that locks are + released during the sleep time so that other work can proceed. + backfill_cycle_max (int): + Time in microseconds of maximum backfill scheduling cycle execution + since last reset. It counts only execution time, removing sleep + time inside a scheduling cycle when it executes for an extended + period time. Note that locks are released during the sleep time so + that other work can proceed. + backfill_cycle_mean (int): + Mean time in microseconds of backfilling scheduling cycles since + last reset. + backfill_last_depth (int): + Number of processed jobs during last backfilling scheduling cycle. + It counts every job even if that job can not be started due to + dependencies or limits. + backfill_last_depth_try (int): + Number of processed jobs during last backfilling scheduling cycle. + It counts only jobs with a chance to start using available + resources. These jobs consume more scheduling time than jobs which + are found can not be started due to dependencies or limits. + backfill_mean_depth (int): + Mean count of jobs processed during all backfilling scheduling + cycles since last reset. Jobs which are found to be ineligible to + run when examined by the backfill scheduler are not counted (e.g. + jobs submitted to multiple partitions and already started, jobs + which have reached a QOS or account limit such as maximum running + jobs for an account, etc). + backfill_mean_depth_try (int): + The subset of `backfill_mean_depth` that the backfill + scheduler attempted to schedule. + backfill_queue_len (int): + Number of jobs pending to be processed by backfilling algorithm. A + job is counted once for each partition it is queued to use. A + pending job array will normally be counted as one job (tasks of a + job array which have already been started/requeued or individually + modified will already have individual job records and are each + counted as a separate job). + backfill_queue_len_mean (int): + Mean count of jobs pending to be processed by backfilling + algorithm. A job is counted once for each partition it requested. A + pending job array will normally be counted as one job (tasks of a + job array which have already been started/requeued or individually + modified will already have individual job records and are each + counted as a separate job). + backfill_table_size (int): + Count of different time slots tested by the backfill scheduler in + its last iteration. + backfill_table_size_mean (int): + Mean count of different time slots tested by the backfill + scheduler. Larger counts increase the time required for the + backfill operation. The table size is influenced by many scheduling + parameters, including: bf_min_age_reserve, bf_min_prio_reserve, + bf_resolution, and bf_window. + gettimeofday_latency (int): + Latency of 1000 calls to the gettimeofday() syscall in + microseconds, as measured at controller startup. + rpcs_by_type (pyslurm.slurmctld.RPCTypeStatistics): + RPC Statistics organized by Type. + rpcs_by_user (pyslurm.slurmctld.RPCUserStatistics): + RPC Statistics organized by User. + pending_rpcs (pyslurm.slurmctld.PendingRPCStatistics): + Statistics for pending RPCs. + """ cdef public: request_time data_since server_thread_count + rpc_queue_enabled agent_queue_size agent_count agent_thread_count dbd_agent_queue_size - rpc_queue_enabled jobs_submitted jobs_started @@ -100,31 +308,31 @@ cdef class Statistics: jobs_failed jobs_pending jobs_running - job_states_ts + # job_states_time - schedule_cycle_max schedule_cycle_last + schedule_cycle_max schedule_cycle_counter schedule_cycle_mean schedule_cycle_mean_depth - schedule_queue_len - cycles_per_minute + schedule_cycles_per_minute + schedule_queue_length schedule_exit backfill_active backfilled_jobs last_backfilled_jobs backfilled_het_jobs + backfill_cycle_counter backfill_last_cycle_when backfill_last_cycle backfill_cycle_last backfill_cycle_max - backfill_total_cycles backfill_cycle_mean - backfill_last_depth_cycle - backfill_last_depth_cycle_try_sched - backfill_mean_depth_cycle - backfill_mean_depth_cycle_try_sched + backfill_last_depth + backfill_last_depth_try + backfill_mean_depth + backfill_mean_depth_try backfill_queue_len backfill_queue_len_mean backfill_table_size @@ -132,5 +340,6 @@ cdef class Statistics: gettimeofday_latency - rpc_type_stats - rpc_user_stats + rpcs_by_type + rpcs_by_user + pending_rpcs diff --git a/pyslurm/core/slurmctld/stats.pyx b/pyslurm/core/slurmctld/stats.pyx index f903e539..7bf8cf11 100644 --- a/pyslurm/core/slurmctld/stats.pyx +++ b/pyslurm/core/slurmctld/stats.pyx @@ -23,12 +23,6 @@ # cython: language_level=3 from pyslurm.core.error import verify_rpc, RPCError -from pyslurm.utils.uint import ( - u16_parse, - u32_parse, - u64_parse, -) -from pyslurm.constants import UNLIMITED from pyslurm.utils.ctime import _raw_time from pyslurm.utils.helpers import ( instance_to_dict, @@ -38,6 +32,14 @@ from pyslurm.utils import cstr from pyslurm import xcollections +cdef class PendingRPC: + + def __init__(self): + self.id = 0 + self.name = None + self.count = 0 + + cdef class RPCTypeStatistic: def __init__(self): @@ -86,7 +88,7 @@ cdef class RPCTypeStatistics(dict): stats.time = ptr.rpc_type_time[i] if ptr.rpc_type_cnt[i]: - stats.average_time = ptr.rpc_type_time[i] / ptr.rpc_type_cnt[i] + stats.average_time = int(ptr.rpc_type_time[i] / ptr.rpc_type_cnt[i]) if rpc_queue_enabled: stats.queued = ptr.rpc_type_queued[i] @@ -94,25 +96,24 @@ cdef class RPCTypeStatistics(dict): stats.cycle_last = ptr.rpc_type_cycle_last[i] stats.cycle_max = ptr.rpc_type_cycle_max[i] - if stats.name: - out[stats.name] = stats + out[stats.name] = stats return out @property - def total_queued(self): - return xcollections.sum_property(self, RPCTypeStatistic.queued) - - @property - def total_count(self): + def count(self): return xcollections.sum_property(self, RPCTypeStatistic.count) @property - def total_time(self): + def time(self): return xcollections.sum_property(self, RPCTypeStatistic.time) @property - def total_dropped(self): + def queued(self): + return xcollections.sum_property(self, RPCTypeStatistic.queued) + + @property + def dropped(self): return xcollections.sum_property(self, RPCTypeStatistic.dropped) @@ -127,10 +128,7 @@ cdef class RPCUserStatistics(dict): for i in range(ptr.rpc_user_size): user_id = ptr.rpc_user_id[i] - user = uid_to_name(user_id) - if not user: - continue - + user = uid_to_name(user_id, err_on_invalid=False) stats = RPCUserStatistic() stats.user_id = ptr.rpc_user_id[i] stats.user_name = user @@ -138,15 +136,43 @@ cdef class RPCUserStatistics(dict): stats.time = ptr.rpc_user_time[i] if ptr.rpc_user_cnt[i]: - stats.average_time = ptr.rpc_user_time[i] / ptr.rpc_user_cnt[i] + stats.average_time = int(ptr.rpc_user_time[i] / ptr.rpc_user_cnt[i]) - out[user] = stats + key = user if user is not None else str(user_id) + out[key] = stats return out @property - def total_queued(self): - return xcollections.sum_attr(self, "queued") + def count(self): + return xcollections.sum_property(self, RPCUserStatistic.count) + + @property + def time(self): + return xcollections.sum_property(self, RPCUserStatistic.time) + + +cdef class PendingRPCStatistics(dict): + + def __init__(self): + super().__init__() + + @staticmethod + cdef PendingRPCStatistics from_ptr(stats_info_response_msg_t *ptr): + out = PendingRPCStatistics() + + for i in range(ptr.rpc_queue_type_count): + stats = PendingRPC() + stats.id = ptr.rpc_queue_type_id[i] + stats.name = rpc_num2string(ptr.rpc_queue_type_id[i]) + stats.count = ptr.rpc_queue_count[i] + out[stats.name] = stats + + return out + + @property + def count(self): + return xcollections.sum_property(self, PendingRPCStatistics.count) cdef parse_response(stats_info_response_msg_t *ptr): @@ -155,6 +181,8 @@ cdef parse_response(stats_info_response_msg_t *ptr): cycle_count = ptr.schedule_cycle_counter bf_cycle_count = ptr.bf_cycle_counter + out.request_time = ptr.req_time + out.data_since = ptr.req_time_start out.server_thread_count = ptr.server_thread_count out.rpc_queue_enabled = True if ptr.rpc_queue_enabled else False out.agent_queue_size = ptr.agent_queue_size @@ -168,21 +196,21 @@ cdef parse_response(stats_info_response_msg_t *ptr): out.jobs_failed = ptr.jobs_failed out.jobs_pending = ptr.jobs_pending out.jobs_running = ptr.jobs_running - out.schedule_cycle_max = ptr.schedule_cycle_max - out.schedule_cycle_last = ptr.schedule_cycle_last - out.schedule_cycle_counter = cycle_count - out.schedule_queue_len = ptr.schedule_queue_len + out.schedule_cycle_last = int(ptr.schedule_cycle_last) + out.schedule_cycle_max = int(ptr.schedule_cycle_max) + out.schedule_cycle_counter = int(cycle_count) + out.schedule_queue_length = int(ptr.schedule_queue_len) - # TODO: job_states_ts ? + # TODO: job_states_time ? # TODO: scheduler exits if cycle_count > 0: - out.schedule_cycle_mean = ptr.schedule_cycle_sum / cycle_count - out.schedule_cycle_mean_depth = ptr.schedule_cycle_depth / cycle_count + out.schedule_cycle_mean = int(ptr.schedule_cycle_sum / cycle_count) + out.schedule_cycle_mean_depth = int(ptr.schedule_cycle_depth / cycle_count) ts = ptr.req_time - ptr.req_time_start if ts > 60: - out.cycles_per_minute = cycle_count / (ts / 60) + out.schedule_cycles_per_minute = int(cycle_count / (ts / 60)) out.backfill_active = bool(ptr.bf_active) @@ -192,23 +220,24 @@ cdef parse_response(stats_info_response_msg_t *ptr): out.backfill_last_cycle_when = ptr.bf_when_last_cycle out.backfill_last_cycle = ptr.bf_cycle_last out.backfill_cycle_max = ptr.bf_cycle_max - out.backfill_total_cycles = bf_cycle_count - out.backfill_last_depth_cycle = ptr.bf_last_depth - out.backfill_last_depth_cycle_try_sched = ptr.bf_last_depth_try + out.backfill_cycle_counter = bf_cycle_count + out.backfill_last_depth = ptr.bf_last_depth + out.backfill_last_depth_try = ptr.bf_last_depth_try out.backfill_queue_len = ptr.bf_queue_len out.backfill_table_size = ptr.bf_table_size if bf_cycle_count > 0: - out.backfill_cycle_mean = ptr.bf_cycle_sum / bf_cycle_count - out.backfill_mean_depth_cycle = ptr.bf_depth_sum / bf_cycle_count - out.backfill_mean_depth_cycle_try_sched = ptr.bf_depth_try_sum / bf_cycle_count - out.backfill_queue_len_mean = ptr.bf_queue_len_sum / bf_cycle_count - out.backfill_table_size_mean = ptr.bf_table_size_sum / bf_cycle_count + out.backfill_cycle_mean = int(ptr.bf_cycle_sum / bf_cycle_count) + out.backfill_mean_depth = int(ptr.bf_depth_sum / bf_cycle_count) + out.backfill_mean_depth_try = int(ptr.bf_depth_try_sum / bf_cycle_count) + out.backfill_queue_len_mean = int(ptr.bf_queue_len_sum / bf_cycle_count) + out.backfill_table_size_mean = int(ptr.bf_table_size_sum / bf_cycle_count) out.gettimeofday_latency = ptr.gettimeofday_latency - out.rpc_type_stats = RPCTypeStatistics.from_ptr(ptr, out.rpc_queue_enabled) - out.rpc_user_stats = RPCUserStatistics.from_ptr(ptr) + out.rpcs_by_type = RPCTypeStatistics.from_ptr(ptr, out.rpc_queue_enabled) + out.rpcs_by_user = RPCUserStatistics.from_ptr(ptr) + out.pending_rpcs = PendingRPCStatistics.from_ptr(ptr) return out @@ -218,10 +247,10 @@ cdef class Statistics: def __init__(self): self.schedule_cycle_mean = 0 self.schedule_cycle_mean_depth = 0 - self.cycles_per_minute = 0 + self.schedule_cycles_per_minute = 0 self.backfill_cycle_mean = 0 - self.backfill_mean_depth_cycle = 0 - self.backfill_mean_depth_cycle_try_sched = 0 + self.backfill_mean_depth = 0 + self.backfill_mean_depth_try = 0 self.backfill_queue_len_mean = 0 self.backfill_table_size_mean = 0 @@ -247,12 +276,12 @@ cdef class Statistics: @staticmethod def reset(): cdef stats_info_request_msg_t req + req.command_id = slurm.STAT_COMMAND_RESET verify_rpc(slurm_reset_statistics(&req)) def to_dict(self): out = instance_to_dict(self) - out["rpc_type_stats"] = xcollections.dict_recursive(self.rpc_type_stats) - out["rpc_user_stats"] = xcollections.dict_recursive(self.rpc_user_stats) + out["rpcs_by_type"] = xcollections.dict_recursive(self.rpcs_by_type) + out["rpcs_by_user"] = xcollections.dict_recursive(self.rpcs_by_user) + out["pending_rpcs"] = xcollections.dict_recursive(self.pending_rpcs) return out - - From 3abb079303b0bc13117a588f7889fb0a59338ed6 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Wed, 22 Jan 2025 13:58:10 +0100 Subject: [PATCH 03/15] wip: add SchedulerExit and BackfillExit statistics --- pyslurm/core/slurmctld/stats.pxd | 23 ++++++++++++++++++++++- pyslurm/core/slurmctld/stats.pyx | 21 +++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd index d7319d98..453cc123 100644 --- a/pyslurm/core/slurmctld/stats.pxd +++ b/pyslurm/core/slurmctld/stats.pxd @@ -45,8 +45,29 @@ cdef extern const char *rpc_num2string(uint16_t msg_type) cdef parse_response(stats_info_response_msg_t *ptr) +cdef class SchedulerExitStatistics: + + cdef public: + end_of_job_queue + default_queue_depth + max_job_start + max_sched_time + blocked_on_licences + + +cdef class BackfillExitStatistics: + + cdef public: + end_of_job_queue + max_job_start + max_job_test + max_time + node_space_size + state_changed + + cdef class PendingRPC: - """A RPC in pending State + """A RPC in pending State. Attributes: id (int): diff --git a/pyslurm/core/slurmctld/stats.pyx b/pyslurm/core/slurmctld/stats.pyx index 7bf8cf11..37fdf20c 100644 --- a/pyslurm/core/slurmctld/stats.pyx +++ b/pyslurm/core/slurmctld/stats.pyx @@ -32,6 +32,27 @@ from pyslurm.utils import cstr from pyslurm import xcollections +cdef class SchedulerExitStatistics: + + def __init__(self): + self.end_of_job_queue = 0 + self.default_queue_depth = 0 + self.max_job_start = 0 + self.max_sched_time = 0 + self.blocked_on_licenses = 0 + + +cdef class BackfillExitStatistics: + + def __init__(self): + self.end_of_job_queue = 0 + self.max_job_start = 0 + self.max_job_test = 0 + self.max_time = 0 + self.node_space_size = 0 + self.state_changed = 0 + + cdef class PendingRPC: def __init__(self): From a51ff1ad3fd89758341b32bc62c4b187da9f1be7 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Wed, 22 Jan 2025 15:24:31 +0100 Subject: [PATCH 04/15] implement BackfillExit and SchedulerExit statistics --- pyslurm/core/slurmctld/stats.pxd | 52 ++++++++++- pyslurm/core/slurmctld/stats.pyx | 143 +++++++++++++++++++++---------- 2 files changed, 145 insertions(+), 50 deletions(-) diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd index 453cc123..c5197af3 100644 --- a/pyslurm/core/slurmctld/stats.pxd +++ b/pyslurm/core/slurmctld/stats.pxd @@ -46,16 +46,58 @@ cdef parse_response(stats_info_response_msg_t *ptr) cdef class SchedulerExitStatistics: + """Conditions reached at the end of a scheduling run + Each attribute is simply a counter that describes how many times a specific + condition was met during the main scheduling run. + + Attributes: + end_of_job_queue (int): + Times the end of the job queue was reached. + default_queue_depth (int): + Reached the number of jobs allowed to be tested limit + max_job_start (int): + Reached the number of jobs allowed to start limit + blocked_on_licenses (int): + Times the scheduler blocked on licenses. + max_rpc_count (int): + Reached RPC Limit. + max_time (int): + Reached maximum allowed scheduler time for a cycle. + """ cdef public: end_of_job_queue default_queue_depth max_job_start - max_sched_time - blocked_on_licences + blocked_on_licenses + max_rpc_count + max_time + + @staticmethod + cdef SchedulerExitStatistics from_ptr(stats_info_response_msg_t *ptr) cdef class BackfillExitStatistics: + """Conditions reached at the end of a Backfill scheduling run. + + Each attribute is simply a counter that describes how many times a specific + condition was met during the Backfill scheduling run. + + Attributes: + end_of_job_queue (int): + Times the end of the job queue was reached. + max_job_start (int): + Reached the number of jobs allowed to start limit + max_job_test (int): + Reached the number of jobs allowed to attempt backfill scheduling + for. + max_time (int): + Reached maximum allowed scheduler time for a cycle. + node_space_size (int): + Reached the node_space table size limit. + state_changed (int): + System state changes. + """ cdef public: end_of_job_queue @@ -65,6 +107,9 @@ cdef class BackfillExitStatistics: node_space_size state_changed + @staticmethod + cdef BackfillExitStatistics from_ptr(stats_info_response_msg_t *ptr) + cdef class PendingRPC: """A RPC in pending State. @@ -338,7 +383,7 @@ cdef class Statistics: schedule_cycle_mean_depth schedule_cycles_per_minute schedule_queue_length - schedule_exit + schedule_exit_stats backfill_active backfilled_jobs @@ -358,6 +403,7 @@ cdef class Statistics: backfill_queue_len_mean backfill_table_size backfill_table_size_mean + backfill_exit_stats gettimeofday_latency diff --git a/pyslurm/core/slurmctld/stats.pyx b/pyslurm/core/slurmctld/stats.pyx index 37fdf20c..3fa0cf91 100644 --- a/pyslurm/core/slurmctld/stats.pyx +++ b/pyslurm/core/slurmctld/stats.pyx @@ -32,14 +32,39 @@ from pyslurm.utils import cstr from pyslurm import xcollections +# Make sure this is in sync with the current Slurm release we are targeting. +# Check in Slurm source at src/slurmctld/slurmctld.h +BF_EXIT_COUNT = 6 +SCHED_EXIT_COUNT = 6 + + cdef class SchedulerExitStatistics: def __init__(self): self.end_of_job_queue = 0 self.default_queue_depth = 0 self.max_job_start = 0 - self.max_sched_time = 0 self.blocked_on_licenses = 0 + self.max_rpc_count = 0 + self.max_time = 0 + + @staticmethod + cdef SchedulerExitStatistics from_ptr(stats_info_response_msg_t *ptr): + if ptr.schedule_exit_cnt != SCHED_EXIT_COUNT: + raise RPCError(msg="schedule_exit_cnt has an unexpected size. " + f"Got {ptr.schedule_exit_cnt}, expected {SCHED_EXIT_COUNT}.") + + out = SchedulerExitStatistics() + out.end_of_job_queue = ptr.schedule_exit[0] + out.default_queue_depth = ptr.schedule_exit[1] + out.max_job_start = ptr.schedule_exit[2] + out.blocked_on_licenses = ptr.schedule_exit[3] + out.max_rpc_count = ptr.schedule_exit[4] + out.max_time = ptr.schedule_exit[5] + return out + + def to_dict(self): + return instance_to_dict(self) cdef class BackfillExitStatistics: @@ -52,6 +77,24 @@ cdef class BackfillExitStatistics: self.node_space_size = 0 self.state_changed = 0 + @staticmethod + cdef BackfillExitStatistics from_ptr(stats_info_response_msg_t *ptr): + if ptr.bf_exit_cnt != BF_EXIT_COUNT: + raise RPCError(msg="bf_exit_cnt has an unexpected size. " + f"Got {ptr.bf_exit_cnt}, expected {BF_EXIT_COUNT}.") + + out = BackfillExitStatistics() + out.end_of_job_queue = ptr.bf_exit[0] + out.max_job_start = ptr.bf_exit[1] + out.max_job_test = ptr.bf_exit[2] + out.max_time = ptr.bf_exit[3] + out.node_space_size = ptr.bf_exit[4] + out.state_changed = ptr.bf_exit[5] + return out + + def to_dict(self): + return instance_to_dict(self) + cdef class PendingRPC: @@ -60,6 +103,9 @@ cdef class PendingRPC: self.name = None self.count = 0 + def to_dict(self): + return instance_to_dict(self) + cdef class RPCTypeStatistic: @@ -196,6 +242,53 @@ cdef class PendingRPCStatistics(dict): return xcollections.sum_property(self, PendingRPCStatistics.count) +cdef class Statistics: + + def __init__(self): + self.schedule_cycle_mean = 0 + self.schedule_cycle_mean_depth = 0 + self.schedule_cycles_per_minute = 0 + self.backfill_cycle_mean = 0 + self.backfill_mean_depth = 0 + self.backfill_mean_depth_try = 0 + self.backfill_queue_len_mean = 0 + self.backfill_table_size_mean = 0 + + @staticmethod + def load(): + cdef: + stats_info_request_msg_t req + stats_info_response_msg_t *resp = NULL + Statistics out = None + + req.command_id = slurm.STAT_COMMAND_GET + verify_rpc(slurm_get_statistics(&resp, &req)) + + try: + out = parse_response(resp) + except Exception as e: + raise e + finally: + slurm_free_stats_response_msg(resp) + + return out + + @staticmethod + def reset(): + cdef stats_info_request_msg_t req + req.command_id = slurm.STAT_COMMAND_RESET + verify_rpc(slurm_reset_statistics(&req)) + + def to_dict(self): + out = instance_to_dict(self) + out["rpcs_by_type"] = xcollections.dict_recursive(self.rpcs_by_type) + out["rpcs_by_user"] = xcollections.dict_recursive(self.rpcs_by_user) + out["pending_rpcs"] = xcollections.dict_recursive(self.pending_rpcs) + out["schedule_exit_stats"] = self.schedule_exit_stats.to_dict() + out["backfill_exit_stats"] = self.backfill_exit_stats.to_dict() + return out + + cdef parse_response(stats_info_response_msg_t *ptr): cdef Statistics out = Statistics() @@ -223,7 +316,6 @@ cdef parse_response(stats_info_response_msg_t *ptr): out.schedule_queue_length = int(ptr.schedule_queue_len) # TODO: job_states_time ? - # TODO: scheduler exits if cycle_count > 0: out.schedule_cycle_mean = int(ptr.schedule_cycle_sum / cycle_count) @@ -259,50 +351,7 @@ cdef parse_response(stats_info_response_msg_t *ptr): out.rpcs_by_type = RPCTypeStatistics.from_ptr(ptr, out.rpc_queue_enabled) out.rpcs_by_user = RPCUserStatistics.from_ptr(ptr) out.pending_rpcs = PendingRPCStatistics.from_ptr(ptr) + out.schedule_exit_stats = SchedulerExitStatistics.from_ptr(ptr) + out.backfill_exit_stats = BackfillExitStatistics.from_ptr(ptr) return out - - -cdef class Statistics: - - def __init__(self): - self.schedule_cycle_mean = 0 - self.schedule_cycle_mean_depth = 0 - self.schedule_cycles_per_minute = 0 - self.backfill_cycle_mean = 0 - self.backfill_mean_depth = 0 - self.backfill_mean_depth_try = 0 - self.backfill_queue_len_mean = 0 - self.backfill_table_size_mean = 0 - - @staticmethod - def load(): - cdef: - stats_info_request_msg_t req - stats_info_response_msg_t *resp = NULL - Statistics out = None - - req.command_id = slurm.STAT_COMMAND_GET - verify_rpc(slurm_get_statistics(&resp, &req)) - - try: - out = parse_response(resp) - except Exception as e: - raise e - finally: - slurm_free_stats_response_msg(resp) - - return out - - @staticmethod - def reset(): - cdef stats_info_request_msg_t req - req.command_id = slurm.STAT_COMMAND_RESET - verify_rpc(slurm_reset_statistics(&req)) - - def to_dict(self): - out = instance_to_dict(self) - out["rpcs_by_type"] = xcollections.dict_recursive(self.rpcs_by_type) - out["rpcs_by_user"] = xcollections.dict_recursive(self.rpcs_by_user) - out["pending_rpcs"] = xcollections.dict_recursive(self.pending_rpcs) - return out From 81bba6f80d3069642578e153dc7dfc04111baac0 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Wed, 22 Jan 2025 15:26:12 +0100 Subject: [PATCH 05/15] remove scripts/slurm_msg_type_dict.py This is superseded by the fact that we now use libslurmfull and can access the rpc_num2string function provided there. --- scripts/slurm_msg_type_dict.py | 46 ---------------------------------- 1 file changed, 46 deletions(-) delete mode 100755 scripts/slurm_msg_type_dict.py diff --git a/scripts/slurm_msg_type_dict.py b/scripts/slurm_msg_type_dict.py deleted file mode 100755 index dbd2d0dc..00000000 --- a/scripts/slurm_msg_type_dict.py +++ /dev/null @@ -1,46 +0,0 @@ -#! /usr/bin/env python3 -""" -Parse $slurmrepo/src/common/slurm_protocol_defs.h and create -a small C program that generates a mapping of the numeric -slurm msg types to their symbolic names. - -Example: - ./slurm_msg_type_dict.py $slurmrepo/src/common/slurm_protocol_defs.h > msgdict.c - gcc -o msgdict msgdict.c - ./msgdict -""" - -import re -import sys -import argparse - -def generate_c(header_file_name): - typedef_re = re.compile(r"\s*typedef\s+enum\s*{(.*?)}\s*slurm_msg_type_t\s*;", re.DOTALL) - symbol_re = re.compile(r"^\s*([A-Z0-9_]+)\s*[,=\n]") - - with open(header_file_name, mode="r", encoding="utf-8") as header_file: - header = header_file.read() - typedef = typedef_re.search(header) - if typedef is None: - print("could not identify the slurm_msg_type_t typedef in the header file") - sys.exit(1) - - print("""#include """) - print(typedef.group(0)) - print("""\n\nint main(void) {""") - for line in typedef.group(1).split("\n"): - symbol = symbol_re.match(line) - if symbol is not None: - print(f""" printf("%d: \\\"%s\\\",\\n", {symbol.group(1)}, "{symbol.group(1)}");""") - else: - print(f""" printf("\\n");""") - print(""" return 0;\n}""") - -def main(): - parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("header", help="$slurmrepo/src/common/slurm_protocol_defs.h") - args = parser.parse_args() - generate_c(args.header) - -if __name__ == "__main__": - main() From 67c68c88ed5be972af85089e3e95827601041d16 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Wed, 22 Jan 2025 15:34:59 +0100 Subject: [PATCH 06/15] import the new stats classes to the pyslurm.slurmctld module --- pyslurm/core/slurmctld/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyslurm/core/slurmctld/__init__.py b/pyslurm/core/slurmctld/__init__.py index e61544dd..4dc68875 100644 --- a/pyslurm/core/slurmctld/__init__.py +++ b/pyslurm/core/slurmctld/__init__.py @@ -7,6 +7,10 @@ from .enums import ShutdownMode from .stats import ( Statistics, + SchedulerExitStatistics, + BackfillExitStatistics, + PendingRPC, + PendingRPCStatistics, RPCUserStatistic, RPCUserStatistics, RPCTypeStatistic, From ddf49949c01584703d405bffb2491e9070b3c3b3 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Thu, 23 Jan 2025 17:50:07 +0100 Subject: [PATCH 07/15] wip: rename a few things --- pyslurm/core/slurmctld/__init__.py | 10 ++-- pyslurm/core/slurmctld/stats.pxd | 90 +++++++++++++++++++++--------- pyslurm/core/slurmctld/stats.pyx | 69 +++++++++++++---------- 3 files changed, 108 insertions(+), 61 deletions(-) diff --git a/pyslurm/core/slurmctld/__init__.py b/pyslurm/core/slurmctld/__init__.py index 4dc68875..67619729 100644 --- a/pyslurm/core/slurmctld/__init__.py +++ b/pyslurm/core/slurmctld/__init__.py @@ -7,13 +7,13 @@ from .enums import ShutdownMode from .stats import ( Statistics, - SchedulerExitStatistics, + ScheduleExitStatistics, BackfillExitStatistics, - PendingRPC, - PendingRPCStatistics, - RPCUserStatistic, + RPCPending, + RPCUser, + RPCType, + RPCPendingStatistics, RPCUserStatistics, - RPCTypeStatistic, RPCTypeStatistics, ) from .base import ( diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd index c5197af3..582213f7 100644 --- a/pyslurm/core/slurmctld/stats.pxd +++ b/pyslurm/core/slurmctld/stats.pxd @@ -45,7 +45,7 @@ cdef extern const char *rpc_num2string(uint16_t msg_type) cdef parse_response(stats_info_response_msg_t *ptr) -cdef class SchedulerExitStatistics: +cdef class ScheduleExitStatistics: """Conditions reached at the end of a scheduling run Each attribute is simply a counter that describes how many times a specific @@ -74,7 +74,7 @@ cdef class SchedulerExitStatistics: max_time @staticmethod - cdef SchedulerExitStatistics from_ptr(stats_info_response_msg_t *ptr) + cdef ScheduleExitStatistics from_ptr(stats_info_response_msg_t *ptr) cdef class BackfillExitStatistics: @@ -98,7 +98,6 @@ cdef class BackfillExitStatistics: state_changed (int): System state changes. """ - cdef public: end_of_job_queue max_job_start @@ -111,8 +110,8 @@ cdef class BackfillExitStatistics: cdef BackfillExitStatistics from_ptr(stats_info_response_msg_t *ptr) -cdef class PendingRPC: - """A RPC in pending State. +cdef class RPCPending: + """Statistics for a pending RPC. Attributes: id (int): @@ -128,8 +127,8 @@ cdef class PendingRPC: count -cdef class RPCTypeStatistic: - """Statistics for a specific RPC Type +cdef class RPCType: + """Statistics for a specific RPC Type. Attributes: id (int): @@ -150,9 +149,9 @@ cdef class RPCTypeStatistic: dropped (int): How many of these RPCs have been dropped. cycle_last (int): - Number of RPCs processed within the last RPC queue cycle + Number of RPCs processed within the last RPC queue cycle. cycle_max (int): - Maximum number of RPCs processed within a RPC queue cycle + Maximum number of RPCs processed within a RPC queue cycle. """ cdef public: id @@ -166,8 +165,24 @@ cdef class RPCTypeStatistic: cycle_max -cdef class RPCUserStatistic: +cdef class RPCUser: + """RPC Statistics for a specific User. + Attributes: + user_id (int): + The numeric ID of the User. + user_name (str): + The name of the User. + count (int): + How many times the User issued RPCs since the last time the + statistics were cleared. + time (int): + How much total time it has taken to process RPCs by this User. The + unit is microseconds + average_time (int): + How much time on average it has taken to process RPCs by this User. + The unit is microseconds. + """ cdef public: user_id user_name @@ -194,7 +209,7 @@ cdef class RPCTypeStatistics(dict): cdef class RPCUserStatistics(dict): - """Collection of [](pyslurm.slurmctld.RPCUserStatistic)'s + """Collection of [](pyslurm.slurmctld.RPCUser)'s Attributes: count (int): @@ -206,15 +221,15 @@ cdef class RPCUserStatistics(dict): cdef RPCUserStatistics from_ptr(stats_info_response_msg_t *ptr) -cdef class PendingRPCStatistics(dict): - """Collection of [](pyslurm.slurmctld.PendingRPCStatistics) +cdef class RPCPendingStatistics(dict): + """Collection of [](pyslurm.slurmctld.RPCPendingStatistics) Attributes: count (int): Total amount of RPCs made to the `slurmctld` since last reset. """ @staticmethod - cdef PendingRPCStatistics from_ptr(stats_info_response_msg_t *ptr) + cdef RPCPendingStatistics from_ptr(stats_info_response_msg_t *ptr) cdef class Statistics: @@ -269,6 +284,9 @@ cdef class Statistics: schedule_cycle_mean_depth (int): Mean of cycle depth. Depth means number of jobs processed in a scheduling cycle. + schedule_cycle_sum (int): + Total run time in microseconds for all scheduling cycles since last + reset format. schedule_cycles_per_minute (int): Counter of scheduling executions per minute. schedule_queue_length (int): @@ -288,10 +306,10 @@ cdef class Statistics: backfilling since last Slurm start. backfill_cycle_counter (int): Number of backfill scheduling cycles since last reset. - backfill_last_cycle_when (int): + backfill_cycle_last_when (int): Time when last backfill scheduling cycle happened. This is a unix timestamp. - backfill_last_cycle (int): + backfill_cycle_last (int): Time in microseconds of last backfill scheduling cycle. It counts only execution time, removing sleep time inside a scheduling cycle when it executes for an extended period time. Note that locks are @@ -305,15 +323,24 @@ cdef class Statistics: backfill_cycle_mean (int): Mean time in microseconds of backfilling scheduling cycles since last reset. + backfill_cycle_sum (int): + Total time in microseconds of backfilling scheduling cycles since + last reset. backfill_last_depth (int): Number of processed jobs during last backfilling scheduling cycle. It counts every job even if that job can not be started due to dependencies or limits. + backfill_depth_sum (int): + Total number of jobs processed during all backfilling scheduling + cycles since last reset. backfill_last_depth_try (int): Number of processed jobs during last backfilling scheduling cycle. It counts only jobs with a chance to start using available resources. These jobs consume more scheduling time than jobs which are found can not be started due to dependencies or limits. + backfill_depth_try_sum (int): + Subset of `backfill_depth_sum` that the backfill scheduler + attempted to schedule. backfill_mean_depth (int): Mean count of jobs processed during all backfilling scheduling cycles since last reset. Jobs which are found to be ineligible to @@ -324,14 +351,17 @@ cdef class Statistics: backfill_mean_depth_try (int): The subset of `backfill_mean_depth` that the backfill scheduler attempted to schedule. - backfill_queue_len (int): + backfill_queue_length (int): Number of jobs pending to be processed by backfilling algorithm. A job is counted once for each partition it is queued to use. A pending job array will normally be counted as one job (tasks of a job array which have already been started/requeued or individually modified will already have individual job records and are each counted as a separate job). - backfill_queue_len_mean (int): + backfill_queue_length_sum (int): + Total number of jobs pending to be processed by backfilling + algorithm since last reset. + backfill_queue_length_mean (int): Mean count of jobs pending to be processed by backfilling algorithm. A job is counted once for each partition it requested. A pending job array will normally be counted as one job (tasks of a @@ -341,6 +371,9 @@ cdef class Statistics: backfill_table_size (int): Count of different time slots tested by the backfill scheduler in its last iteration. + backfill_table_size_sum (int): + Total number of different time slots tested by the backfill + scheduler. backfill_table_size_mean (int): Mean count of different time slots tested by the backfill scheduler. Larger counts increase the time required for the @@ -354,7 +387,7 @@ cdef class Statistics: RPC Statistics organized by Type. rpcs_by_user (pyslurm.slurmctld.RPCUserStatistics): RPC Statistics organized by User. - pending_rpcs (pyslurm.slurmctld.PendingRPCStatistics): + rpcs_pending (pyslurm.slurmctld.RPCPendingStatistics): Statistics for pending RPCs. """ cdef public: @@ -381,32 +414,37 @@ cdef class Statistics: schedule_cycle_counter schedule_cycle_mean schedule_cycle_mean_depth + schedule_cycle_sum schedule_cycles_per_minute schedule_queue_length - schedule_exit_stats + schedule_exit backfill_active backfilled_jobs last_backfilled_jobs backfilled_het_jobs backfill_cycle_counter - backfill_last_cycle_when - backfill_last_cycle + backfill_cycle_last_when backfill_cycle_last backfill_cycle_max backfill_cycle_mean + backfill_cycle_sum backfill_last_depth + backfill_depth_sum backfill_last_depth_try + backfill_depth_try_sum backfill_mean_depth backfill_mean_depth_try - backfill_queue_len - backfill_queue_len_mean + backfill_queue_length + backfill_queue_length_sum + backfill_queue_length_mean backfill_table_size + backfill_table_size_sum backfill_table_size_mean - backfill_exit_stats + backfill_exit gettimeofday_latency rpcs_by_type rpcs_by_user - pending_rpcs + rpcs_pending diff --git a/pyslurm/core/slurmctld/stats.pyx b/pyslurm/core/slurmctld/stats.pyx index 3fa0cf91..081f992c 100644 --- a/pyslurm/core/slurmctld/stats.pyx +++ b/pyslurm/core/slurmctld/stats.pyx @@ -38,7 +38,7 @@ BF_EXIT_COUNT = 6 SCHED_EXIT_COUNT = 6 -cdef class SchedulerExitStatistics: +cdef class ScheduleExitStatistics: def __init__(self): self.end_of_job_queue = 0 @@ -49,12 +49,12 @@ cdef class SchedulerExitStatistics: self.max_time = 0 @staticmethod - cdef SchedulerExitStatistics from_ptr(stats_info_response_msg_t *ptr): + cdef ScheduleExitStatistics from_ptr(stats_info_response_msg_t *ptr): if ptr.schedule_exit_cnt != SCHED_EXIT_COUNT: raise RPCError(msg="schedule_exit_cnt has an unexpected size. " f"Got {ptr.schedule_exit_cnt}, expected {SCHED_EXIT_COUNT}.") - out = SchedulerExitStatistics() + out = ScheduleExitStatistics() out.end_of_job_queue = ptr.schedule_exit[0] out.default_queue_depth = ptr.schedule_exit[1] out.max_job_start = ptr.schedule_exit[2] @@ -96,7 +96,7 @@ cdef class BackfillExitStatistics: return instance_to_dict(self) -cdef class PendingRPC: +cdef class RPCPending: def __init__(self): self.id = 0 @@ -107,7 +107,7 @@ cdef class PendingRPC: return instance_to_dict(self) -cdef class RPCTypeStatistic: +cdef class RPCType: def __init__(self): self.id = 0 @@ -124,7 +124,7 @@ cdef class RPCTypeStatistic: return instance_to_dict(self) -cdef class RPCUserStatistic: +cdef class RPCUser: def __init__(self): self.user_id = 0 @@ -148,7 +148,7 @@ cdef class RPCTypeStatistics(dict): out = RPCTypeStatistics() for i in range(ptr.rpc_type_size): - stats = RPCTypeStatistic() + stats = RPCType() stats.id = ptr.rpc_type_id[i] stats.name = rpc_num2string(ptr.rpc_type_id[i]) stats.count = ptr.rpc_type_cnt[i] @@ -169,19 +169,19 @@ cdef class RPCTypeStatistics(dict): @property def count(self): - return xcollections.sum_property(self, RPCTypeStatistic.count) + return xcollections.sum_property(self, RPCType.count) @property def time(self): - return xcollections.sum_property(self, RPCTypeStatistic.time) + return xcollections.sum_property(self, RPCType.time) @property def queued(self): - return xcollections.sum_property(self, RPCTypeStatistic.queued) + return xcollections.sum_property(self, RPCType.queued) @property def dropped(self): - return xcollections.sum_property(self, RPCTypeStatistic.dropped) + return xcollections.sum_property(self, RPCType.dropped) cdef class RPCUserStatistics(dict): @@ -196,7 +196,7 @@ cdef class RPCUserStatistics(dict): for i in range(ptr.rpc_user_size): user_id = ptr.rpc_user_id[i] user = uid_to_name(user_id, err_on_invalid=False) - stats = RPCUserStatistic() + stats = RPCUser() stats.user_id = ptr.rpc_user_id[i] stats.user_name = user stats.count = ptr.rpc_user_cnt[i] @@ -212,24 +212,24 @@ cdef class RPCUserStatistics(dict): @property def count(self): - return xcollections.sum_property(self, RPCUserStatistic.count) + return xcollections.sum_property(self, RPCUser.count) @property def time(self): - return xcollections.sum_property(self, RPCUserStatistic.time) + return xcollections.sum_property(self, RPCUser.time) -cdef class PendingRPCStatistics(dict): +cdef class RPCPendingStatistics(dict): def __init__(self): super().__init__() @staticmethod - cdef PendingRPCStatistics from_ptr(stats_info_response_msg_t *ptr): - out = PendingRPCStatistics() + cdef RPCPendingStatistics from_ptr(stats_info_response_msg_t *ptr): + out = RPCPendingStatistics() for i in range(ptr.rpc_queue_type_count): - stats = PendingRPC() + stats = RPCPending() stats.id = ptr.rpc_queue_type_id[i] stats.name = rpc_num2string(ptr.rpc_queue_type_id[i]) stats.count = ptr.rpc_queue_count[i] @@ -239,7 +239,7 @@ cdef class PendingRPCStatistics(dict): @property def count(self): - return xcollections.sum_property(self, PendingRPCStatistics.count) + return xcollections.sum_property(self, RPCPendingStatistics.count) cdef class Statistics: @@ -249,10 +249,13 @@ cdef class Statistics: self.schedule_cycle_mean_depth = 0 self.schedule_cycles_per_minute = 0 self.backfill_cycle_mean = 0 + self.backfill_cycle_sum = 0 self.backfill_mean_depth = 0 self.backfill_mean_depth_try = 0 - self.backfill_queue_len_mean = 0 + self.backfill_queue_length_mean = 0 self.backfill_table_size_mean = 0 + self.backfill_queue_length_sum = 0 + self.backfill_table_size_sum = 0 @staticmethod def load(): @@ -283,9 +286,9 @@ cdef class Statistics: out = instance_to_dict(self) out["rpcs_by_type"] = xcollections.dict_recursive(self.rpcs_by_type) out["rpcs_by_user"] = xcollections.dict_recursive(self.rpcs_by_user) - out["pending_rpcs"] = xcollections.dict_recursive(self.pending_rpcs) - out["schedule_exit_stats"] = self.schedule_exit_stats.to_dict() - out["backfill_exit_stats"] = self.backfill_exit_stats.to_dict() + out["rpcs_pending"] = xcollections.dict_recursive(self.rpcs_pending) + out["schedule_exit"] = self.schedule_exit.to_dict() + out["backfill_exit"] = self.backfill_exit.to_dict() return out @@ -314,6 +317,7 @@ cdef parse_response(stats_info_response_msg_t *ptr): out.schedule_cycle_max = int(ptr.schedule_cycle_max) out.schedule_cycle_counter = int(cycle_count) out.schedule_queue_length = int(ptr.schedule_queue_len) + out.schedule_cycle_sum = int(ptr.schedule_cycle_sum) # TODO: job_states_time ? @@ -330,28 +334,33 @@ cdef parse_response(stats_info_response_msg_t *ptr): out.backfilled_jobs = ptr.bf_backfilled_jobs out.last_backfilled_jobs = ptr.bf_last_backfilled_jobs out.backfilled_het_jobs = ptr.bf_backfilled_het_jobs - out.backfill_last_cycle_when = ptr.bf_when_last_cycle - out.backfill_last_cycle = ptr.bf_cycle_last + out.backfill_cycle_last_when = ptr.bf_when_last_cycle + out.backfill_cycle_last = ptr.bf_cycle_last out.backfill_cycle_max = ptr.bf_cycle_max out.backfill_cycle_counter = bf_cycle_count + out.backfill_cycle_sum = ptr.bf_cycle_sum out.backfill_last_depth = ptr.bf_last_depth out.backfill_last_depth_try = ptr.bf_last_depth_try - out.backfill_queue_len = ptr.bf_queue_len + out.backfill_queue_length = ptr.bf_queue_len + out.backfill_queue_length_sum = ptr.bf_queue_len_sum out.backfill_table_size = ptr.bf_table_size + out.backfill_table_size_sum = ptr.bf_table_size_sum + out.backfill_depth_sum = ptr.bf_depth_sum + out.backfill_depth_try_sum = ptr.bf_depth_try_sum if bf_cycle_count > 0: out.backfill_cycle_mean = int(ptr.bf_cycle_sum / bf_cycle_count) out.backfill_mean_depth = int(ptr.bf_depth_sum / bf_cycle_count) out.backfill_mean_depth_try = int(ptr.bf_depth_try_sum / bf_cycle_count) - out.backfill_queue_len_mean = int(ptr.bf_queue_len_sum / bf_cycle_count) + out.backfill_queue_length_mean = int(ptr.bf_queue_len_sum / bf_cycle_count) out.backfill_table_size_mean = int(ptr.bf_table_size_sum / bf_cycle_count) out.gettimeofday_latency = ptr.gettimeofday_latency out.rpcs_by_type = RPCTypeStatistics.from_ptr(ptr, out.rpc_queue_enabled) out.rpcs_by_user = RPCUserStatistics.from_ptr(ptr) - out.pending_rpcs = PendingRPCStatistics.from_ptr(ptr) - out.schedule_exit_stats = SchedulerExitStatistics.from_ptr(ptr) - out.backfill_exit_stats = BackfillExitStatistics.from_ptr(ptr) + out.rpcs_pending = RPCPendingStatistics.from_ptr(ptr) + out.schedule_exit = ScheduleExitStatistics.from_ptr(ptr) + out.backfill_exit = BackfillExitStatistics.from_ptr(ptr) return out From 256337b6fa3c2d67eb3c46bfca866ba4c8abe89a Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Fri, 24 Jan 2025 13:43:56 +0100 Subject: [PATCH 08/15] wip: make sure we get the copyright notices right. --- pyslurm/core/slurmctld/stats.pxd | 22 ++++++++++++++++++---- pyslurm/core/slurmctld/stats.pyx | 16 +++++++++++++--- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd index 582213f7..1d7eedcf 100644 --- a/pyslurm/core/slurmctld/stats.pxd +++ b/pyslurm/core/slurmctld/stats.pxd @@ -3,6 +3,20 @@ ######################################################################### # Copyright (C) 2025 Toni Harzendorf # +######################################################################### +# Much of the documentation here (with some modifications) has been taken from: +# - https://slurm.schedmd.com/sdiag.html +# - https://github.com/SchedMD/slurm/blob/c28fcf4f15981f891df7893099bceda21e2c5e6e/src/sdiag/sdiag.c +# +# So for completeness, the appropriate Copyright notices are also written +# below: +# +# Copyright (C) 2010-2011 Barcelona Supercomputing Center. +# Copyright (C) 2010-2022 SchedMD LLC. +# +# Please also check the Slurm DISCLAIMER at: pyslurm/slurm/SLURM_DISCLAIMER +######################################################################### + # This file is part of PySlurm # # PySlurm is free software; you can redistribute it and/or modify @@ -87,7 +101,7 @@ cdef class BackfillExitStatistics: end_of_job_queue (int): Times the end of the job queue was reached. max_job_start (int): - Reached the number of jobs allowed to start limit + Reached the number of jobs allowed to start limit. max_job_test (int): Reached the number of jobs allowed to attempt backfill scheduling for. @@ -149,9 +163,10 @@ cdef class RPCType: dropped (int): How many of these RPCs have been dropped. cycle_last (int): - Number of RPCs processed within the last RPC queue cycle. + Count of RPCs processed within the last RPC queue cycle. cycle_max (int): - Maximum number of RPCs processed within a RPC queue cycle. + Maximum count of RPCs that have been processed within a RPC queue + cycle. """ cdef public: id @@ -407,7 +422,6 @@ cdef class Statistics: jobs_failed jobs_pending jobs_running - # job_states_time schedule_cycle_last schedule_cycle_max diff --git a/pyslurm/core/slurmctld/stats.pyx b/pyslurm/core/slurmctld/stats.pyx index 081f992c..d004bfd1 100644 --- a/pyslurm/core/slurmctld/stats.pyx +++ b/pyslurm/core/slurmctld/stats.pyx @@ -2,6 +2,19 @@ # slurmctld/stats.pyx - pyslurm slurmctld statistics api (sdiag) ######################################################################### # Copyright (C) 2025 Toni Harzendorf + +######################################################################### +# The implementation here is inspired by: +# - https://github.com/SchedMD/slurm/blob/c28fcf4f15981f891df7893099bceda21e2c5e6e/src/sdiag/sdiag.c +# +# So for completeness, the appropriate Copyright notices are also written +# below: +# +# Copyright (C) 2010-2011 Barcelona Supercomputing Center. +# Copyright (C) 2010-2022 SchedMD LLC. +# +# Please also check the Slurm DISCLAIMER at: pyslurm/slurm/SLURM_DISCLAIMER +######################################################################### # # This file is part of PySlurm # @@ -319,8 +332,6 @@ cdef parse_response(stats_info_response_msg_t *ptr): out.schedule_queue_length = int(ptr.schedule_queue_len) out.schedule_cycle_sum = int(ptr.schedule_cycle_sum) - # TODO: job_states_time ? - if cycle_count > 0: out.schedule_cycle_mean = int(ptr.schedule_cycle_sum / cycle_count) out.schedule_cycle_mean_depth = int(ptr.schedule_cycle_depth / cycle_count) @@ -329,7 +340,6 @@ cdef parse_response(stats_info_response_msg_t *ptr): if ts > 60: out.schedule_cycles_per_minute = int(cycle_count / (ts / 60)) - out.backfill_active = bool(ptr.bf_active) out.backfilled_jobs = ptr.bf_backfilled_jobs out.last_backfilled_jobs = ptr.bf_last_backfilled_jobs From b42fc124f429c15ace89967d416e55446505ae6c Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Fri, 24 Jan 2025 14:08:28 +0100 Subject: [PATCH 09/15] finalize documentation --- pyslurm/core/slurmctld/__init__.py | 1 + pyslurm/core/slurmctld/stats.pxd | 6 ++-- pyslurm/core/slurmctld/stats.pyx | 53 ++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/pyslurm/core/slurmctld/__init__.py b/pyslurm/core/slurmctld/__init__.py index 67619729..68be857f 100644 --- a/pyslurm/core/slurmctld/__init__.py +++ b/pyslurm/core/slurmctld/__init__.py @@ -6,6 +6,7 @@ ) from .enums import ShutdownMode from .stats import ( + diag, Statistics, ScheduleExitStatistics, BackfillExitStatistics, diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd index 1d7eedcf..3edcd00f 100644 --- a/pyslurm/core/slurmctld/stats.pxd +++ b/pyslurm/core/slurmctld/stats.pxd @@ -207,7 +207,7 @@ cdef class RPCUser: cdef class RPCTypeStatistics(dict): - """Collection of [](pyslurm.slurmctld.RPCTypeStatistic)'s + """Collection of [pyslurm.slurmctld.RPCType][] objects. Attributes: count (int): @@ -224,7 +224,7 @@ cdef class RPCTypeStatistics(dict): cdef class RPCUserStatistics(dict): - """Collection of [](pyslurm.slurmctld.RPCUser)'s + """Collection of [pyslurm.slurmctld.RPCUser][] objects. Attributes: count (int): @@ -237,7 +237,7 @@ cdef class RPCUserStatistics(dict): cdef class RPCPendingStatistics(dict): - """Collection of [](pyslurm.slurmctld.RPCPendingStatistics) + """Collection of [pyslurm.slurmctld.RPCPending][] objects. Attributes: count (int): diff --git a/pyslurm/core/slurmctld/stats.pyx b/pyslurm/core/slurmctld/stats.pyx index d004bfd1..2df630ae 100644 --- a/pyslurm/core/slurmctld/stats.pyx +++ b/pyslurm/core/slurmctld/stats.pyx @@ -272,6 +272,20 @@ cdef class Statistics: @staticmethod def load(): + """Load the Statistics of the `slurmctld`. + + Returns: + (pyslurm.slurmctld.Statistics): The Controller statistics. + + Raises: + (pyslurm.RPCError): When fetching the Statistics failed. + + Examples: + >>> from pyslurm import slurmctld + >>> stats = slurmctld.Statistics.load() + >>> print(stats.jobs_completed, stats.schedule_cycle_counter) + 10 20 + """ cdef: stats_info_request_msg_t req stats_info_response_msg_t *resp = NULL @@ -291,11 +305,30 @@ cdef class Statistics: @staticmethod def reset(): + """Reset the Statistics of the `slurmctld`. + + Raises: + (pyslurm.RPCError): When resetting the Statistics failed. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.Statistics.reset() + """ cdef stats_info_request_msg_t req req.command_id = slurm.STAT_COMMAND_RESET verify_rpc(slurm_reset_statistics(&req)) def to_dict(self): + """Convert the statistics to a dictionary. + + Returns: + (dict): Statistics as a dict. + + Examples: + >>> from pyslurm import slurmctld + >>> stats = slurmctld.Statistics.load() + >>> stats_dict = stats.to_dict() + """ out = instance_to_dict(self) out["rpcs_by_type"] = xcollections.dict_recursive(self.rpcs_by_type) out["rpcs_by_user"] = xcollections.dict_recursive(self.rpcs_by_user) @@ -305,6 +338,26 @@ cdef class Statistics: return out +def diag(): + """Load the Statistics of the `slurmctld`. + + This is a shortcut for [pyslurm.slurmctld.Statistics.load][] + + Returns: + (pyslurm.slurmctld.Statistics): The Controller statistics. + + Raises: + (pyslurm.RPCError): When fetching the Statistics failed. + + Examples: + >>> from pyslurm import slurmctld + >>> stats = slurmctld.Statistics.load() + >>> print(stats.jobs_completed, stats.schedule_cycle_counter) + 10 20 + """ + return Statistics.load() + + cdef parse_response(stats_info_response_msg_t *ptr): cdef Statistics out = Statistics() From d55460a1be3c1c873742e8235cfbb921cecc08fc Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 25 Jan 2025 11:55:56 +0100 Subject: [PATCH 10/15] prepare some test data, to ensure parsing from a pointer works. --- pyslurm/core/slurmctld/stats.pxd | 2 + pyslurm/core/slurmctld/stats.pyx | 65 ++++++++++++++++++++++++++++++ tests/unit/test_slurmctld.py | 69 ++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+) create mode 100644 tests/unit/test_slurmctld.py diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd index 3edcd00f..8035bed9 100644 --- a/pyslurm/core/slurmctld/stats.pxd +++ b/pyslurm/core/slurmctld/stats.pxd @@ -36,6 +36,7 @@ # cython: c_string_type=unicode, c_string_encoding=default # cython: language_level=3 +from libc.string cimport memset from pyslurm cimport slurm from pyslurm.slurm cimport ( stats_info_response_msg_t, @@ -44,6 +45,7 @@ from pyslurm.slurm cimport ( slurm_reset_statistics, slurm_free_stats_response_msg, xfree, + xmalloc, ) from pyslurm.utils cimport cstr from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t diff --git a/pyslurm/core/slurmctld/stats.pyx b/pyslurm/core/slurmctld/stats.pyx index 2df630ae..3990591b 100644 --- a/pyslurm/core/slurmctld/stats.pyx +++ b/pyslurm/core/slurmctld/stats.pyx @@ -427,3 +427,68 @@ cdef parse_response(stats_info_response_msg_t *ptr): out.backfill_exit = BackfillExitStatistics.from_ptr(ptr) return out + + +# Prepare some test data +def _parse_test_data(): + import datetime + + cdef stats_info_response_msg_t stats + memset(&stats, 0, sizeof(stats)) + + stats.req_time = int(datetime.datetime.now().timestamp()) + stats.req_time_start = int(datetime.datetime.now().timestamp()) - 200 + stats.jobs_submitted = 20 + stats.jobs_running = 3 + stats.schedule_cycle_counter = 10 + stats.schedule_cycle_last = 40 + stats.schedule_cycle_sum = 45 + + stats.bf_cycle_counter = 100 + stats.bf_active = 0 + stats.bf_backfilled_jobs = 10 + stats.bf_cycle_sum = 200 + stats.bf_depth_try_sum = 300 + stats.bf_queue_len_sum = 600 + stats.bf_table_size_sum = 200 + + stats.rpc_type_size = 3 + stats.rpc_type_id = xmalloc(sizeof(uint16_t) * stats.rpc_type_size) + stats.rpc_type_cnt = xmalloc(sizeof(uint32_t) * stats.rpc_type_size) + stats.rpc_type_time = xmalloc(sizeof(uint64_t) * stats.rpc_type_size) + + for i in range(stats.rpc_type_size): + stats.rpc_type_id[i] = 2000+i + stats.rpc_type_cnt[i] = i+1 + stats.rpc_type_time[i] = i+2 + + stats.rpc_user_size = 1 + stats.rpc_user_id = xmalloc(sizeof(uint32_t) * stats.rpc_user_size) + stats.rpc_user_cnt = xmalloc(sizeof(uint32_t) * stats.rpc_user_size) + stats.rpc_user_time = xmalloc(sizeof(uint64_t) * stats.rpc_user_size) + + for i in range(stats.rpc_user_size): + stats.rpc_user_id[i] = i + stats.rpc_user_cnt[i] = i+1 + stats.rpc_user_time[i] = i+2 + + stats.bf_exit_cnt = BF_EXIT_COUNT + stats.bf_exit = xmalloc(sizeof(uint32_t) * BF_EXIT_COUNT) + for i in range(stats.bf_exit_cnt): + stats.bf_exit[i] = i+1 + + stats.schedule_exit_cnt = SCHED_EXIT_COUNT + stats.schedule_exit = xmalloc(sizeof(uint32_t) * SCHED_EXIT_COUNT) + + for i in range(stats.schedule_exit_cnt): + stats.schedule_exit[i] = i+1 + + stats.rpc_queue_type_count = 5 + stats.rpc_queue_count = xmalloc(sizeof(uint32_t) * stats.rpc_queue_type_count) + stats.rpc_queue_type_id = xmalloc(sizeof(uint32_t) * stats.rpc_queue_type_count) + + for i in range(stats.rpc_queue_type_count): + stats.rpc_queue_count[i] = i+1 + stats.rpc_queue_type_id[i] = 2000+i + + return parse_response(&stats) diff --git a/tests/unit/test_slurmctld.py b/tests/unit/test_slurmctld.py new file mode 100644 index 00000000..71d36d34 --- /dev/null +++ b/tests/unit/test_slurmctld.py @@ -0,0 +1,69 @@ +######################################################################### +# test_slurmctld.py - slurmctld unit tests +######################################################################### +# Copyright (C) 2025 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +"""test_slurmctld.py - Unit test basic slurmctld functionalities.""" + +import pyslurm +from pyslurm import slurmctld +from pyslurm.core.slurmctld.stats import _parse_test_data + + +def test_statistics(): + stats = _parse_test_data() + assert stats.to_dict() + + assert len(stats.rpcs_by_type) == 3 + for typ, val in stats.rpcs_by_type.items(): + assert val.count > 0 + assert val.time > 0 + assert val.average_time > 0 + assert typ is not None + + assert len(stats.rpcs_pending) == 5 + for typ, val in stats.rpcs_pending.items(): + assert val.count > 0 + assert typ is not None + assert isinstance(typ, str) + + assert len(stats.rpcs_by_user) == 1 + for typ, val in stats.rpcs_by_user.items(): + assert val.user_id == 0 + assert val.user_name == "root" + assert val.time > 0 + assert val.average_time > 0 + assert typ is not None + + assert stats.schedule_exit + assert stats.backfill_exit + assert stats.jobs_submitted == 20 + assert stats.jobs_running == 3 + assert stats.schedule_cycle_last == 40 + assert stats.schedule_cycle_sum == 45 + assert stats.schedule_cycle_mean == 4 + assert stats.schedule_cycle_counter == 10 + + assert stats.backfill_cycle_counter == 100 + assert stats.backfill_active is False + assert stats.backfilled_jobs == 10 + assert stats.backfill_cycle_sum == 200 + assert stats.backfill_depth_try_sum == 300 + assert stats.backfill_queue_length_sum == 600 + assert stats.backfill_table_size_sum == 200 + assert stats.backfill_cycle_mean == 2 From ead942d82bc4e478770d9fb926e2e898eb17da78 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 25 Jan 2025 11:56:27 +0100 Subject: [PATCH 11/15] add integration tests for slurmctld stats --- tests/integration/test_slurmctld.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/integration/test_slurmctld.py b/tests/integration/test_slurmctld.py index 4d1f1004..83637361 100644 --- a/tests/integration/test_slurmctld.py +++ b/tests/integration/test_slurmctld.py @@ -107,3 +107,16 @@ def test_fair_share_dampening_factor(): with pytest.raises(pyslurm.RPCError, match=r"Invalid Dampening*"): slurmctld.set_fair_share_dampening_factor(99999999) + + +def test_statistics(): + stats = slurmctld.diag() + assert stats.to_dict() + assert len(stats.rpcs_by_type) > 0 + data_since = stats.data_since + + slurmctld.Statistics.reset() + new_stats = slurmctld.Statistics.load() + assert new_stats.to_dict() + # Check that resetting it was actually sucessful. + assert data_since < new_stats.data_since From dc56036b2241b82d39cea437f799c446e2c82636 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 25 Jan 2025 12:01:20 +0100 Subject: [PATCH 12/15] fix codespell error --- tests/integration/test_slurmctld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_slurmctld.py b/tests/integration/test_slurmctld.py index 83637361..5ad1551c 100644 --- a/tests/integration/test_slurmctld.py +++ b/tests/integration/test_slurmctld.py @@ -118,5 +118,5 @@ def test_statistics(): slurmctld.Statistics.reset() new_stats = slurmctld.Statistics.load() assert new_stats.to_dict() - # Check that resetting it was actually sucessful. + # Check that resetting it was actually successful. assert data_since < new_stats.data_since From b3ac8e4f4f0583df28df41877df723b33ad237f9 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 25 Jan 2025 12:35:04 +0100 Subject: [PATCH 13/15] wip docs --- docs/reference/slurmctld.md | 1 + pyslurm/core/slurmctld/stats.pxd | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/reference/slurmctld.md b/docs/reference/slurmctld.md index f13419fe..e0e1baa7 100644 --- a/docs/reference/slurmctld.md +++ b/docs/reference/slurmctld.md @@ -6,3 +6,4 @@ title: slurmctld handler: python options: members: yes + members_order: source diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd index 8035bed9..57e89601 100644 --- a/pyslurm/core/slurmctld/stats.pxd +++ b/pyslurm/core/slurmctld/stats.pxd @@ -293,8 +293,7 @@ cdef class Statistics: Maximum time in microseconds for any scheduling cycle since last reset. schedule_cycle_counter (int): - Total run time in microseconds for all scheduling cycles since last - reset. + Total amount of scheduling cycles ran since last reset. schedule_cycle_mean (int): Mean time in microseconds for all scheduling cycles since last reset. @@ -303,7 +302,7 @@ cdef class Statistics: scheduling cycle. schedule_cycle_sum (int): Total run time in microseconds for all scheduling cycles since last - reset format. + reset. schedule_cycles_per_minute (int): Counter of scheduling executions per minute. schedule_queue_length (int): From 8137edc6ba0b8d72f9591e3a928d328631155f8c Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 25 Jan 2025 12:48:13 +0100 Subject: [PATCH 14/15] update docs --- pyslurm/core/slurmctld/stats.pxd | 43 +++++++++----------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd index 57e89601..ccff880e 100644 --- a/pyslurm/core/slurmctld/stats.pxd +++ b/pyslurm/core/slurmctld/stats.pxd @@ -252,6 +252,8 @@ cdef class RPCPendingStatistics(dict): cdef class Statistics: """Statistics for the `slurmctld`. + For more information, also check out the Slurm [sdiag documentation](https://slurm.schedmd.com/sdiag.html). + Attributes: request_time (int): Time when the data was requested. This is a unix timestamp. @@ -259,7 +261,7 @@ cdef class Statistics: The date when `slurmctld` started gathering statistics. This is a unix timestamp. server_thread_count (int): - The number of current active slurmctld threads. + The number of current active `slurmctld` threads. rpc_queue_enabled (bool): Whether RPC queuing is enabled. agent_queue_size (int): @@ -281,7 +283,7 @@ cdef class Statistics: jobs_canceled (int): Number of jobs canceled since last reset. jobs_failed (int): - Number of jobs failed due to slurmd or other internal issues since + Number of jobs failed due to `slurmd` or other internal issues since last reset. jobs_pending (int): Number of jobs pending. @@ -315,8 +317,7 @@ cdef class Statistics: start. last_backfilled_jobs (int): Number of jobs started thanks to backfilling since last time stats - where reset. By default these values are reset at midnight UTC - time. + where reset. (which is midnight UTC time in this case) backfilled_het_jobs (int): Number of heterogeneous job components started thanks to backfilling since last Slurm start. @@ -326,16 +327,10 @@ cdef class Statistics: Time when last backfill scheduling cycle happened. This is a unix timestamp. backfill_cycle_last (int): - Time in microseconds of last backfill scheduling cycle. It counts - only execution time, removing sleep time inside a scheduling cycle - when it executes for an extended period time. Note that locks are - released during the sleep time so that other work can proceed. + Time in microseconds of last backfill scheduling cycle. backfill_cycle_max (int): Time in microseconds of maximum backfill scheduling cycle execution - since last reset. It counts only execution time, removing sleep - time inside a scheduling cycle when it executes for an extended - period time. Note that locks are released during the sleep time so - that other work can proceed. + since last reset. backfill_cycle_mean (int): Mean time in microseconds of backfilling scheduling cycles since last reset. @@ -352,38 +347,26 @@ cdef class Statistics: backfill_last_depth_try (int): Number of processed jobs during last backfilling scheduling cycle. It counts only jobs with a chance to start using available - resources. These jobs consume more scheduling time than jobs which - are found can not be started due to dependencies or limits. + resources. backfill_depth_try_sum (int): Subset of `backfill_depth_sum` that the backfill scheduler attempted to schedule. backfill_mean_depth (int): Mean count of jobs processed during all backfilling scheduling cycles since last reset. Jobs which are found to be ineligible to - run when examined by the backfill scheduler are not counted (e.g. - jobs submitted to multiple partitions and already started, jobs - which have reached a QOS or account limit such as maximum running - jobs for an account, etc). + run when examined by the backfill scheduler are not counted. backfill_mean_depth_try (int): The subset of `backfill_mean_depth` that the backfill scheduler attempted to schedule. backfill_queue_length (int): Number of jobs pending to be processed by backfilling algorithm. A - job is counted once for each partition it is queued to use. A - pending job array will normally be counted as one job (tasks of a - job array which have already been started/requeued or individually - modified will already have individual job records and are each - counted as a separate job). + job is counted once for each partition it is queued to use. backfill_queue_length_sum (int): Total number of jobs pending to be processed by backfilling algorithm since last reset. backfill_queue_length_mean (int): Mean count of jobs pending to be processed by backfilling - algorithm. A job is counted once for each partition it requested. A - pending job array will normally be counted as one job (tasks of a - job array which have already been started/requeued or individually - modified will already have individual job records and are each - counted as a separate job). + algorithm. backfill_table_size (int): Count of different time slots tested by the backfill scheduler in its last iteration. @@ -393,9 +376,7 @@ cdef class Statistics: backfill_table_size_mean (int): Mean count of different time slots tested by the backfill scheduler. Larger counts increase the time required for the - backfill operation. The table size is influenced by many scheduling - parameters, including: bf_min_age_reserve, bf_min_prio_reserve, - bf_resolution, and bf_window. + backfill operation. gettimeofday_latency (int): Latency of 1000 calls to the gettimeofday() syscall in microseconds, as measured at controller startup. From d59d0b598460715d079a26012a9644413f7bc2c3 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 25 Jan 2025 12:57:17 +0100 Subject: [PATCH 15/15] wip docs --- pyslurm/core/slurmctld/stats.pxd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyslurm/core/slurmctld/stats.pxd b/pyslurm/core/slurmctld/stats.pxd index ccff880e..5be8d4b5 100644 --- a/pyslurm/core/slurmctld/stats.pxd +++ b/pyslurm/core/slurmctld/stats.pxd @@ -218,7 +218,7 @@ cdef class RPCTypeStatistics(dict): Total amount of time it has taken to process all RPCs made yet. queued (int): Total amount of RPCs queued. - queued (int): + dropped (int): Total amount of RPCs dropped. """ @staticmethod @@ -243,7 +243,7 @@ cdef class RPCPendingStatistics(dict): Attributes: count (int): - Total amount of RPCs made to the `slurmctld` since last reset. + Total amount of RPCs currently pending. """ @staticmethod cdef RPCPendingStatistics from_ptr(stats_info_response_msg_t *ptr)