diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..f05284b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +FROM ros:noetic-ros-core-focal + +# install pip3 +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ros-noetic-diagnostic-aggregator \ + build-essential \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +#upgrade pip +RUN python3 -m pip install --upgrade pip +#install jetson-stats +RUN python3 -m pip install -U jetson-stats + +#source ros env +RUN echo "source /opt/ros/noetic/setup.bash" >> ~/.bashrc \ No newline at end of file diff --git a/param/jtop.yaml b/param/jtop.yaml index 9ca071a..385ee26 100644 --- a/param/jtop.yaml +++ b/param/jtop.yaml @@ -13,24 +13,34 @@ analyzers: type: diagnostic_aggregator/GenericAnalyzer path: GPU contains: 'gpu' - remove_prefix: jetson_stats + remove_prefix: jetson_stats gpu memory: type: diagnostic_aggregator/GenericAnalyzer path: Memory contains: 'mem' remove_prefix: jetson_stats mem + engine: + type: diagnostic_aggregator/GenericAnalyzer + path: Engine + contains: 'engine' + remove_prefix: jetson_stats engine temperatures: type: diagnostic_aggregator/GenericAnalyzer path: Temperatures contains: 'temp' - remove_prefix: jetson_stats + remove_prefix: jetson_stats temp power: type: diagnostic_aggregator/GenericAnalyzer path: Power contains: 'power' - remove_prefix: jetson_stats + remove_prefix: jetson_stats power board: type: diagnostic_aggregator/GenericAnalyzer path: board contains: 'board' - remove_prefix: jetson_stats board \ No newline at end of file + remove_prefix: jetson_stats board + fan: + type: diagnostic_aggregator/GenericAnalyzer + path: Board + contains: 'fan' + remove_prefix: jetson_stats fan \ No newline at end of file diff --git a/scripts/jetson_stats.py b/scripts/jetson_stats.py index 574461a..9cbb3b9 100755 --- a/scripts/jetson_stats.py +++ b/scripts/jetson_stats.py @@ -39,6 +39,7 @@ other_status, board_status, disk_status, + engine_status, cpu_status, fan_status, gpu_status, @@ -87,18 +88,18 @@ def stop(self): self.jetson.close() def fan_service(self, req): - # Try to set new nvpmodel - fan_mode = req.mode + # Update fan_service to use fan.profile instead of fan.mode + fan_profile = req.mode fan_speed = req.fanSpeed try: - self.jetson.fan.mode = fan_mode - if fan_mode == "manual": + self.jetson.fan.profile = fan_profile + if fan_profile == "manual": self.jetson.fan.speed = fan_speed except jtop.JtopException as e: rospy.logerr(e) - # Return same nvp model - fan_mode = str(self.jetson.fan.mode) - return fanResponse(fan_mode) + # Return current fan profile + fan_profile = str(self.jetson.fan.profile) + return fanResponse(fan_profile) def jetson_clocks_service(self, req): # Set new jetson_clocks @@ -122,21 +123,35 @@ def jetson_callback(self, jetson): # Status board and board info self.arr.status = [other_status(self.hardware, jetson, jtop.__version__)] # Make diagnostic message for each cpu - self.arr.status += [cpu_status(self.hardware, name, jetson.cpu[name]) for name in jetson.cpu] + self.arr.status += [cpu_status(self.hardware, name, cpu) + for name, cpu in enumerate(jetson.cpu['cpu'])] # Merge all other diagnostics - self.arr.status += [gpu_status(self.hardware, jetson.gpu[1])] - self.arr.status += [ram_status(self.hardware, jetson.ram, 'mem')] - self.arr.status += [swap_status(self.hardware, jetson.swap, 'mem')] - self.arr.status += [emc_status(self.hardware, jetson.emc, 'mem')] - # Temperature - self.arr.status += [temp_status(self.hardware, jetson.temperature, self.level_options)] - # Read power - total, power = jetson.power - if power: - self.arr.status += [power_status(self.hardware, total, power)] - # Fan controller + self.arr.status += [gpu_status(self.hardware, name, jetson.gpu[name]) + for name in self.jetson.gpu] + # Make diagnostic message for each engine + self.arr.status += [engine_status(self.hardware, name, engine) + for name, engine in jetson.engine.items()] + # Update access to jetson.memory + self.arr.status += [ram_status(self.hardware, jetson.memory['RAM'], 'mem')] + self.arr.status += [swap_status(self.hardware, jetson.memory['SWAP'], 'mem')] + if 'EMC' in jetson.memory: + self.arr.status += [emc_status(self.hardware, jetson.memory['EMC'], 'mem')] + # Make diagnostic message for each Temperature + self.arr.status += [temp_status(self.hardware, name, sensor) + for name, sensor in jetson.temperature.items()] + # Make diagnostic message for each power rail + self.arr.status += [power_status(self.hardware, name, rail) + for name, rail in self.jetson.power['rail'].items()] + if 'name' in self.jetson.power['tot']: + name_total = self.jetson.power['tot']['name'] + else: + name_total = 'ALL' + self.arr.status += [power_status(self.hardware, + name_total, jetson.power['tot'])] + # Update fan_status usage if jetson.fan: - self.arr.status += [fan_status(self.hardware, jetson.fan, 'board')] + self.arr.status += [fan_status(self.hardware, name, fan) + for name, fan in jetson.fan.items()] # Status board and board info self.arr.status += [self.board_status] # Add disk status diff --git a/src/ros_jetson_stats/utils.py b/src/ros_jetson_stats/utils.py index 5a41d45..ec0004d 100644 --- a/src/ros_jetson_stats/utils.py +++ b/src/ros_jetson_stats/utils.py @@ -1,92 +1,109 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -# Copyright (C) 2020, Raffaello Bonghi -# All rights reserved +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: # -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, -# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE -# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import rospy -from copy import deepcopy +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +# SPDX-License-Identifier: MIT + from diagnostic_msgs.msg import DiagnosticStatus, KeyValue +TEMPERATURE_MAX = 84 +TEMPERATURE_CRIT = 100 + + def size_min(num, divider=1.0, n=0, start=''): - if num >= divider * 1000.0: + return find_unit(num, 1024.0, divider, n, start) + + +def unit_min(num, divider=1.0, n=0, start=''): + return find_unit(num, 1000.0, divider, n, start) + + +def find_unit(size, power, divider=1.0, n=0, start=''): + n = 0 + power_labels = ['m', '', 'k', 'M', 'G', 'T'] + while size > power - 1: + divider *= power + size /= power n += 1 - divider *= 1000.0 - return size_min(num, divider, n, start) - else: - vect = ['', 'k', 'M', 'G', 'T'] - idx = vect.index(start) - return round(num / divider, 1), divider, vect[n + idx] + idx = power_labels.index(start) + return round(size, 1), divider, power_labels[n + idx] + + +def size_to_string(value, unit): + return value_to_string(value, unit, '', size_min) + + +def unit_to_string(value, unit, system_unit): + return value_to_string(value, unit, system_unit, unit_min) + + +def value_to_string(value, unit, system_unit, func): + value, _, unit = func(value, start=unit) + value_string = str(value) + if value >= 100: + value_string = value_string[:3].rstrip('.') + return f'{value_string}{unit}{system_unit}' def strfdelta(tdelta, fmt): - """ Print delta time - - https://stackoverflow.com/questions/8906926/formatting-python-timedelta-objects - """ - d = {"days": tdelta.days} - d["hours"], rem = divmod(tdelta.seconds, 3600) - d["minutes"], d["seconds"] = divmod(rem, 60) + d = {'days': tdelta.days} + d['hours'], rem = divmod(tdelta.seconds, 3600) + d['minutes'], d['seconds'] = divmod(rem, 60) return fmt.format(**d) def other_status(hardware, jetson, version): - """ - Generic information about jetson_clock and nvpmodel - """ values = [] nvpmodel = jetson.nvpmodel - text = "" + text = '' if nvpmodel is not None: - nvp_name = nvpmodel.name.replace('MODE_', '').replace('_', ' ') - values += [KeyValue("NV Power" , "{id} - {name}".format(id=nvpmodel.id, name=nvp_name))] - text += "NV Power[{id}] {name}".format(id=nvpmodel.id, name=nvp_name) + nvp_name = nvpmodel.name + nvpmodel_id = nvpmodel.id + values.append(KeyValue(key='NV Power-ID', value=str(nvpmodel.id))) + values.append(KeyValue(key='NV Power-Mode', value=nvp_name)) + text += f'NV Power[{nvpmodel_id}] {nvp_name}' jc = jetson.jetson_clocks + level = DiagnosticStatus.OK if jetson.jetson_clocks is not None: - if jetson.jetson_clocks.status in ["running", "inactive"]: + if jetson.jetson_clocks.status in ['running', 'inactive']: level = DiagnosticStatus.OK - elif "ing" in jc.status: + elif 'ing' in jc.status: level = DiagnosticStatus.WARN else: level = DiagnosticStatus.ERROR # Show if JetsonClock is enabled or not - values += [KeyValue("jetson_clocks" , "{status}".format(status=jc.status))] - values += [KeyValue("jetson_clocks on boot" , "{boot}".format(boot=jc.boot))] - text += " - JC {status}".format(status=jc.status) + values.append(KeyValue(key='jetson_clocks', value=str(jc.status))) + values.append(KeyValue(key='jetson_clocks on boot', value=str(jc.boot))) + text += f' - JC {jc.status}' # Uptime - uptime_string = strfdelta(jetson.uptime, "{days} days {hours}:{minutes}:{seconds}") - values += [KeyValue("Up Time" , "{time}".format(time=uptime_string))] + uptime_string = strfdelta( + jetson.uptime, '{days} days {hours}:{minutes}:{seconds}') + values.append(KeyValue(key='Up Time', value=str(uptime_string))) # Jtop version - values += [KeyValue("interval" , "{interval}".format(interval=jetson.interval))] - values += [KeyValue("jtop" , "v{version}".format(version=version))] + values.append(KeyValue(key='interval', value=str(jetson.interval))) + values.append(KeyValue(key='jtop', value=str(version))) # Make board diagnostic status status = DiagnosticStatus( level=level, - name='jetson_stats board status', + name='jetson_stats/board/Status', message=text, hardware_id=hardware, values=values) @@ -94,28 +111,42 @@ def other_status(hardware, jetson, version): def board_status(hardware, board, dgtype): - """ - Board information and libraries installed - """ + if board['platform']['Machine'] == 'x86_64': + platform = board['platform'] + system = platform['System'].upper() + machine = platform['Machine'] + distribution = platform['Distribution'] + release = platform['Release'].split('-')[0] + message = f'{system} {machine} machine - {distribution} [{release}]' + level = DiagnosticStatus.OK + elif 'L4T' in board['hardware']: + model = board['hardware']['Model'] + jetpack = board['hardware']['Jetpack'] + message = f'{model} - Jetpack {jetpack}' + level = DiagnosticStatus.OK + else: + message = 'Unrecognized hardware' + level = DiagnosticStatus.WARN values = [] for key, value in board['hardware'].items(): - values += [KeyValue(key, "{value}".format(value=value))] + values.append(KeyValue(key=key, value=str(value))) for key, value in board['libraries'].items(): - values += [KeyValue("lib " + key, "{value}".format(value=value))] + values.append(KeyValue(key='lib ' + key, value=str(value))) # Make board diagnostic status d_board = DiagnosticStatus( - name='jetson_stats {type} config'.format(type=dgtype), - message='Jetpack {jetpack}'.format(jetpack=board['hardware']['Jetpack']), + level=level, + name=f'jetson_stats/{dgtype}/Config', + message=message, hardware_id=hardware, values=values) return d_board def disk_status(hardware, disk, dgtype): - """ - Status disk - """ value = int(float(disk['used']) / float(disk['total']) * 100.0) + used = size_to_string(disk['used'], disk['unit']) + total = size_to_string(disk['total'], disk['unit']) + # Set level status if value >= 90: level = DiagnosticStatus.ERROR elif value >= 70: @@ -125,270 +156,229 @@ def disk_status(hardware, disk, dgtype): # Make board diagnostic status d_board = DiagnosticStatus( level=level, - name='jetson_stats {type} disk'.format(type=dgtype), - message="{0:2.1f}GB/{1:2.1f}GB".format(disk['used'], disk['total']), + name=f'jetson_stats/{dgtype}/Disk', + message=f'{used}/{total}', hardware_id=hardware, values=[ - KeyValue("Used", "{used:2.1f}".format(used=disk['used'])), - KeyValue("Total", "{total:2.1f}".format(total=disk['total'])), - KeyValue("Unit", "GB")]) + KeyValue(key='Used', value=used), + KeyValue(key='Total', value=total) + ]) return d_board def cpu_status(hardware, name, cpu): - """ - Decode a cpu stats - - Fields: - * min_freq - Minimum frequency in kHz - * max_freq - Maximum frequency in kHz - * frq - Running frequency in kHz - * governor - Governor selected - * val - Status CPU, value between [0, 100] - * model - Model Architecture - * IdleStates - """ message = 'OFF' values = [] if cpu: - if 'val' in cpu: - # read value - val = cpu['val'] - message = '{val}%'.format(val=val) + if 'idle' in cpu: + # Decode utilizaiton status + val = 100 - cpu['idle'] + freq_cpu = unit_to_string(cpu['freq']['cur'], 'k', 'Hz') # Make Diagnostic Status message with cpu info values = [ - KeyValue("Val", "{val}%".format(val=val)), - KeyValue("Freq", "{frq}".format(frq=cpu['frq'])), - KeyValue("Unit", "khz")] - if 'governor' in cpu: - values += [KeyValue("Governor", "{governor}".format(governor=cpu['governor']))] - if 'model' in cpu: - values += [KeyValue("Model", "{model}".format(model=cpu['model']))] - # Make Diagnostic message + KeyValue(key='Idle', value=f"{cpu['idle']:6.2f}%"), + KeyValue(key='User', value=f"{cpu['user']:6.2f}%"), + KeyValue(key='Nice', value=f"{cpu['nice']:6.2f}%"), + KeyValue(key='System', value=f"{cpu['system']:6.2f}%"), + KeyValue(key='Governor', value=cpu['governor']), + KeyValue(key='Freq', value=freq_cpu)] + # Update message + message = f'{val:6.2f}%' + if 'model' in cpu and cpu['model']: + values.append(KeyValue(key='Model', value=cpu['model'])) + + # Build diagnostic message d_cpu = DiagnosticStatus( - name='jetson_stats cpu {name}'.format(name=name), + name=f'jetson_stats/cpu/{name}', message=message, hardware_id=hardware, values=values) return d_cpu -def gpu_status(hardware, gpu): - """ - Decode and build a diagnostic status message +def gpu_status(hardware, name, gpu): + gpu_load = gpu['status']['load'] + used_gpu = f'{gpu_load:6.2f}%' + freq_gpu = unit_to_string(gpu['freq']['cur'], 'k', 'Hz') + railgate_string = 'Active' if gpu['status']['railgate'] else 'Disable' + scaling_string = 'Active' if gpu['status']['3d_scaling'] else 'Disable' + # Build diagnostic message + values = [ + KeyValue(key='Used', value=used_gpu), + KeyValue(key='Freq', value=freq_gpu), + KeyValue(key='Railgate', value=railgate_string), + KeyValue(key='3D scaling', value=scaling_string), + KeyValue(key='governor', value=gpu['freq']['governor']), + ] + + if 'GPC' in gpu['freq']: + for i, gpc in enumerate(gpu['freq']['GPC']): + values.append(KeyValue(key=f'GPC {i}', value=f'{gpc:6.2f}%')) + + if 'power_control' in gpu and gpu['power_control']: + values.append(KeyValue(key='Power Control', value=gpu['power_control'])) - Fields: - * min_freq - Minimum frequency in kHz - * max_freq - Maximum frequency in kHz - * frq - Running frequency in kHz - * val - Status GPU, value between [0, 100] - """ d_gpu = DiagnosticStatus( - name='jetson_stats gpu', - message='{val}%'.format(val=gpu['val']), + name=f'jetson_stats/gpu/{name}', + message=used_gpu, hardware_id=hardware, - values=[ - KeyValue('Val', '{val}%'.format(val=gpu['val'])), - KeyValue("Freq", "{frq}".format(frq=gpu['frq'])), - KeyValue("Unit", "khz")]) + values=values) return d_gpu -def fan_status(hardware, fan, dgtype): - """ - Fan speed and type of control - - Fields: - * auto - boolean with fan control. - * True = Automatic speed control enabled - * False = Automatic speed control disabled - * speed - Speed set. Value between [0, 100] (float) - * measure - Speed measured. Value between [0, 100] (float) - * rpm - Revolution Per Minute. This number can be 0 if the hardware does not implement this feature - * mode - Mode selected for your fan - """ - ctrl = "Ta" if fan.auto else "Tm" - if fan.speed is not None: - label = "{ctrl}={target: >3.0f}%".format(ctrl=ctrl, target=fan.speed) - else: - label = "{ctrl}".format(ctrl=ctrl) +def fan_status(hardware, name, fan): + values = [] + # List of all speeds + for idx, speed in enumerate(fan['speed']): + values.append(KeyValue(key=f'PWM {idx}', value=f'{speed: >3.0f}%')) + if 'rpm' in fan: + rpm_fan = fan['rpm'][idx] + values.append(KeyValue(key=f'RPM {idx}', value=f'{rpm_fan}RPM')) + values.append(KeyValue(key='Profile', value=fan['profile'])) + first_fan_speed = fan['speed'][0] + message = 'Fan0: ' if len(fan['speed']) > 1 else '' + message += f'{first_fan_speed: >3.0f}%' # Make fan diagnostic status d_fan = DiagnosticStatus( - name='jetson_stats {type} fan'.format(type=dgtype), - message='speed={speed}% {label}'.format(speed=fan['measure'], label=label), + name=f'jetson_stats/fan/{name}', + message=message, hardware_id=hardware, - values=[ - KeyValue("Mode", "{mode}".format(mode=fan['mode'])), - KeyValue("Speed", "{speed}".format(speed=fan['speed'])), - KeyValue("Measure", "{measure}".format(measure=fan['measure'])), - KeyValue("Automatic", "{ctrl}".format(ctrl=fan['auto'])), - KeyValue("RPM", "{rpm}".format(rpm=fan['rpm']))]) + values=values) return d_fan def ram_status(hardware, ram, dgtype): - """ - Make a RAM diagnostic status message - - Fields: - * use - status ram used - * shared - status of shared memory used from GPU - * tot - Total size RAM - * unit - Unit size RAM, usually in kB - * lfb - Largest Free Block (lfb) is a statistic about the memory allocator - * nblock - Number of block used - * size - Size of the largest free block - * unit - Unit size lfb - """ - lfb_status = ram['lfb'] - tot_ram, divider, unit_name = size_min(ram.get('tot', 0), start=ram.get('unit', 'M')) + # Build label string + used = size_to_string(ram['used'], 'k') + total = size_to_string(ram['tot'], 'k') + percent = f'{used}/{total}B' + lfb = ram['lfb'] + label_lfb = f'(lfb {lfb}x4MB)' # Make ram diagnostic status d_ram = DiagnosticStatus( - name='jetson_stats {type} ram'.format(type=dgtype), - message='{use:2.1f}{unit_ram}B/{tot:2.1f}{unit_ram}B (lfb {nblock}x{size}{unit}B)'.format( - use=ram['use'] / divider, - unit_ram=unit_name, - tot=tot_ram, - nblock=lfb_status['nblock'], - size=lfb_status['size'], - unit=lfb_status['unit']), + name=f'jetson_stats/{dgtype}/ram', + message=f'{percent} - {label_lfb}', hardware_id=hardware, values=[ - KeyValue("Use", "{use}".format(use=ram.get('use', 0))), - KeyValue("Shared", "{shared}".format(shared=ram.get('shared', 0))), - KeyValue("Total", "{tot}".format(tot=ram.get('tot', 0))), - KeyValue("Unit", "{unit}B".format(unit=ram.get('unit', 'M'))), - KeyValue("lfb", "{nblock}x{size}{unit}B".format( - nblock=lfb_status['nblock'], - size=lfb_status['size'], - unit=lfb_status['unit']))]) + KeyValue(key='Use', value=size_to_string(ram['used'], 'k')), + KeyValue(key='Shared', value=size_to_string(ram['shared'], 'k')), + KeyValue(key='Buffers', value=size_to_string(ram['buffers'], 'k')), + KeyValue(key='Cached', value=size_to_string(ram['cached'], 'k')), + KeyValue(key='Free', value=size_to_string(ram['free'], 'k')), + KeyValue(key='Total', value=size_to_string(ram['tot'], 'k')), + KeyValue(key='lfb', value=f'{lfb}x4MB')]) return d_ram def swap_status(hardware, swap, dgtype): - """ - Make a swap diagnostic message - - Fields: - * use - Amount of SWAP in use - * tot - Total amount of SWAP available for applications - * unit - Unit SWAP, usually in MB - * cached - * size - Cache size - * unit - Unit cache size - """ - swap_cached = swap.get('cached', {}) - tot_swap, divider, unit = size_min(swap.get('tot', 0), start=swap.get('unit', 'M')) - message = '{use}{unit_swap}B/{tot}{unit_swap}B (cached {size}{unit}B)'.format( - use=swap.get('use', 0) / divider, - tot=tot_swap, - unit_swap=unit, - size=swap_cached.get('size', '0'), - unit=swap_cached.get('unit', '')) + used = size_to_string(swap['used'], 'k') + total = size_to_string(swap['tot'], 'k') + cached = size_to_string(swap['cached'], 'k') + # Build list of keys + values = [ + KeyValue(key='Use', value=used), + KeyValue(key='Total', value=total), + KeyValue(key='Cached', value=cached) + ] + for swap_name in swap['table']: + swap_data = swap['table'][swap_name] + prio = swap_data['prio'] + used = size_to_string(swap_data['used'], 'k') + total = size_to_string(swap_data['size'], 'k') + boot = '- Boot' if swap_data['boot'] else '' + values.append(KeyValue(key=f'swap {swap_name}', value=f'{prio} - {used}/{total} {boot}')) # Make swap diagnostic status d_swap = DiagnosticStatus( - name='jetson_stats {type} swap'.format(type=dgtype), - message=message, + name=f'jetson_stats/{dgtype}/swap', + message=f'{used}/{total} (Cached {cached})', hardware_id=hardware, - values=[ - KeyValue("Use", "{use}".format(use=swap.get('use', 0))), - KeyValue("Total", "{tot}".format(tot=swap.get('tot', 0))), - KeyValue("Unit", "{unit}B".format(unit=swap.get('unit', 'M'))), - KeyValue("Cached", "{size}{unit}B".format(size=swap_cached.get('size', '0'), unit=swap_cached.get('unit', '')))]) + values=values) return d_swap -def power_status(hardware, total, power): - """ - Make a Power diagnostic message - - Fields: - * Two power dictionaries: - * total - The total power estimated is not available of the NVIDIA Jetson power comsumption - * power - A dictionary with all power comsumption - - For each power comsumption there are two fields: - * avg - Average power consumption in milliwatts - * cur - Current power consumption in milliwatts - """ +def power_status(hardware, name, power): values = [] - # Make list power - for watt in sorted(power): - value = power[watt] - watt_name = watt.replace("VDD_", "").replace("POM_", "").replace("_", " ") - values += [KeyValue(watt_name, "curr={curr}mW avg={avg}mW".format(curr=int(value['cur']), avg=int(value['avg'])))] - # Make voltage diagnostic status + # name = name.replace("VDDQ_", "").replace("VDD_", "").replace("_", " ") + unit_power = unit_to_string(power['power'], 'm', 'W') + unit_avg = unit_to_string(power['avg'], 'm', 'W') + # Add values + if 'volt' in power: + values.append(KeyValue(key='Volt', value=unit_to_string(power['volt'], 'm', 'W'))) + if 'curr' in power: + values.append(KeyValue(key='Current', value=unit_to_string(power['curr'], 'm', 'W'))) + values.append(KeyValue(key='Power', value=f'{unit_power}')) + values.append(KeyValue(key='Average', value=f'{unit_avg}')) + if 'warn' in power: + values.append(KeyValue(key='Warning', value=unit_to_string(power['warn'], 'm', 'W'))) + if 'crit' in power: + values.append(KeyValue(key='Critical', value=unit_to_string(power['crit'], 'm', 'W'))) + # Set status + level = DiagnosticStatus.OK d_volt = DiagnosticStatus( - name='jetson_stats power', - message='curr={curr}mW avg={avg}mW'.format(curr=int(total['cur']), avg=int(total['avg'])), + level=level, + name=f'jetson_stats/power/{name}', + message=f'Power: {unit_power} - Avg: {unit_avg}', hardware_id=hardware, values=values) return d_volt -def temp_status(hardware, temp, level_options): - """ - Make a temperature diagnostic message - - Fields: - * All temperatures are in Celsius - """ - values = [] +def temp_status(hardware, name, sensor): + temperature = sensor['temp'] + # Set color temperature + max_value = sensor['max'] if 'max' in sensor else TEMPERATURE_MAX + crit_value = sensor['crit'] if 'crit' in sensor else TEMPERATURE_CRIT + message = f'{temperature:3.2f}C' if sensor['online'] else 'Offline' + # Set temperature level level = DiagnosticStatus.OK - list_options = sorted(level_options.keys(), reverse=True) - max_temp = 20 - # List all temperatures - for key, value in temp.items(): - values += [KeyValue(key, "{value:8.2f}C".format(value=value))] - if value > max_temp: - # Add last high temperature - max_temp = value - # Make status message - for th in list_options: - if max_temp >= th: - level = level_options[th] - break - - if level is not DiagnosticStatus.OK: - max_temp_names = [] - # List off names - for key, value in temp.items(): - if value >= th: - # Store name - max_temp_names += [key] - # Write a message - message = '[' + ', '.join(max_temp_names) + '] are more than {temp} C'.format(temp=th) - else: - message = '{n_temp} temperatures reads'.format(n_temp=len(temp)) - # Make temperature diagnostic status + if temperature >= crit_value: + level = DiagnosticStatus.ERROR + message = f'{temperature:3.2f}C more than {crit_value:3.2f}C' + elif temperature >= max_value: + level = DiagnosticStatus.WARN + message = f'{temperature:3.2f}C more than {max_value:3.2f}C' + # Build diagnostic message d_temp = DiagnosticStatus( level=level, - name='jetson_stats temp', + name=f'jetson_stats/temp/{name}', message=message, hardware_id=hardware, - values=values) + values=[ + KeyValue(key='Warning', value=f'{max_value:3.2f}C'), + KeyValue(key='Critical', value=f'{crit_value:3.2f}C') + ]) return d_temp def emc_status(hardware, emc, dgtype): - """ - Make a EMC diagnostic message - - Fields: - * min_freq - Minimum frequency in kHz - * max_freq - Maximum frequency in kHz - * frq - Running frequency in kHz - * val - Status EMC, value between [0, 100] - * FreqOverride - Status override - """ + frequency = unit_to_string(emc['cur'], 'k', 'Hz') + emc_val = emc['val'] # Make EMC diagnostic status d_emc = DiagnosticStatus( - name='jetson_stats {type} emc'.format(type=dgtype), - message='{val}%'.format(val=emc['val']), + name=f'jetson_stats/{dgtype}/emc', + message=frequency, hardware_id=hardware, values=[ - KeyValue('Val', '{val}%'.format(val=emc['val'])), - KeyValue("Freq", "{frq}".format(frq=emc['frq'])), - KeyValue("Unit", "khz")]) + KeyValue(key='Freq', value=frequency), + KeyValue(key='Bandwidth', value=f'{emc_val:6.2f}%')]) return d_emc -# EOF + + +def engine_status(hardware, name_group, engines): + values = [] + message = '' + for name, engine in engines.items(): + if engine['online']: + frequency = unit_to_string(engine['cur'], 'k', 'Hz') + values.append(KeyValue(key=name, value=frequency)) + message += f'{name}: {frequency} ' if len(engines) > 1 else frequency + else: + values.append(KeyValue(key=name, value='Offline')) + message += f'{name}: Offline ' if len(engines) > 1 else 'Offline' + d_engine = DiagnosticStatus( + name=f'jetson_stats/engine/{name_group}', + message=message, + hardware_id=hardware, + values=values) + return d_engine +# EOF \ No newline at end of file