Skip to content

Commit 3816eb2

Browse files
authored
Merge pull request #481 from yungwine/accelerator
Support Accelerator Node update in MyTonCtrl
2 parents 5b457de + b46e4e3 commit 3816eb2

File tree

18 files changed

+735
-45
lines changed

18 files changed

+735
-45
lines changed

modules/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import typing
22
from dataclasses import dataclass
33

4+
from modules.collator import CollatorModule
45
from modules.module import MtcModule
56
from modules.pool import PoolModule
67
from modules.nominator_pool import NominatorPoolModule
@@ -18,6 +19,7 @@
1819
'single-nominator': SingleNominatorModule,
1920
'liquid-staking': ControllerModule,
2021
'liteserver': LiteserverModule,
22+
'collator': CollatorModule,
2123
'alert-bot': AlertBotModule,
2224
'prometheus': PrometheusModule
2325
}

modules/alert_bot.py

Lines changed: 139 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from modules.module import MtcModule
66
from mypylib.mypylib import get_timestamp, print_table, color_print
7-
from mytoncore import get_hostname
7+
from mytoncore import get_hostname, signed_int_to_hex64
88
from mytonctrl.utils import timestamp2utcdatetime
99

1010

@@ -34,6 +34,12 @@ def init_alerts():
3434
"Validator's wallet <code>{wallet}</code> balance is less than 10 TON: {balance} TON.",
3535
18 * HOUR
3636
),
37+
"low_wallet_balance_ok": Alert(
38+
"info",
39+
"Validator's wallet balance is back to normal",
40+
"Validator's wallet <code>{wallet}</code> balance is {balance} TON.",
41+
0
42+
),
3743
"db_usage_80": Alert(
3844
"high",
3945
"Node's db usage is more than 80%",
@@ -50,6 +56,12 @@ def init_alerts():
5056
or (and) set node\'s archive ttl to lower value.""",
5157
6 * HOUR
5258
),
59+
"db_usage_ok": Alert(
60+
"info",
61+
"Node's db usage is back to normal",
62+
"TON DB usage is back to normal: <b>{usage}%</b>.",
63+
0
64+
),
5365
"low_efficiency": Alert(
5466
"high",
5567
"Validator had efficiency less than 90% in the validation round",
@@ -62,24 +74,48 @@ def init_alerts():
6274
"Node is out of sync on more than 20 sec: <b>{sync} sec</b>.",
6375
300
6476
),
77+
"sync_ok": Alert(
78+
"info",
79+
"Node is back in sync",
80+
"Node is back in sync: <b>{sync} sec</b>.",
81+
0
82+
),
6583
"service_down": Alert(
6684
"critical",
67-
"Node is not running (service is down)",
68-
"validator.service is down.",
85+
"Node is not running",
86+
"Node is not running. Probably daemon service is down.",
6987
300
7088
),
89+
"service_down_ok": Alert(
90+
"info",
91+
"Node is recovered",
92+
"Node is running.",
93+
0
94+
),
7195
"adnl_connection_failed": Alert(
7296
"high",
7397
"Node is not answering to ADNL connection",
74-
"ADNL connection to node failed",
98+
"ADNL connection to node failed.",
7599
3 * HOUR
76100
),
101+
"adnl_connection_ok": Alert(
102+
"info",
103+
"ADNL connection restored",
104+
"ADNL connection to node is OK.",
105+
0
106+
),
77107
"zero_block_created": Alert(
78108
"critical",
79109
"Validator has not created any blocks in the last few hours",
80110
"Validator has not created any blocks in the last <b>{hours} hours</b>.",
81111
int(VALIDATION_PERIOD / 2.3)
82112
),
113+
"zero_block_created_ok": Alert(
114+
"info",
115+
"Validator resumed block production",
116+
"Validator resumed block production. Blocks produced in the recent window: <b>{blocks}</b> in ~<b>{hours}h</b>.",
117+
0
118+
),
83119
"validator_slashed": Alert(
84120
"high",
85121
"Validator has been slashed in the previous validation round",
@@ -116,12 +152,30 @@ def init_alerts():
116152
"Found proposals with hashes `{hashes}` that have significant amount of votes, but current validator didn't vote for them. Please check @tonstatus for more details.",
117153
VALIDATION_PERIOD
118154
),
155+
"voting_ok": Alert(
156+
"info",
157+
"All high-priority proposals are voted",
158+
"All high-priority proposals are voted or no longer require action.",
159+
0
160+
),
119161
"initial_sync_completed": Alert(
120162
"info",
121163
"Initial sync has been completed (info alert with no sound)",
122164
"Node initial sync has been completed",
123165
0
124-
)
166+
),
167+
"shard_collators_offline": Alert(
168+
"high",
169+
"All collators for specific shards are offline",
170+
"All collators for shards <code>{shards}</code> are offline.",
171+
3600
172+
),
173+
"shard_collators_ok": Alert(
174+
"info",
175+
"Shards have online collators again",
176+
"All required shards have online collators again.",
177+
0
178+
),
125179
}
126180

127181

@@ -156,7 +210,7 @@ def send_message(self, text: str, silent: bool = False, disable_web_page_preview
156210
if not response['ok']:
157211
raise Exception(f"send_message error: {response}")
158212

159-
def send_alert(self, alert_name: str, *args, **kwargs):
213+
def send_alert(self, alert_name: str, *args, track_active: bool = True, **kwargs):
160214
if not self.alert_is_enabled(alert_name):
161215
return
162216
last_sent = self.get_alert_sent(alert_name)
@@ -169,8 +223,10 @@ def send_alert(self, alert_name: str, *args, **kwargs):
169223
for key, value in kwargs.items():
170224
if isinstance(value, (int, float)):
171225
kwargs[key] = f'{value:,}'.replace(',', ' ') # make space separator for thousands
172-
173-
text = '🆘' if alert.severity != 'info' else ''
226+
if alert_name.endswith('_ok'):
227+
text = '✅'
228+
else:
229+
text = '🆘' if alert.severity != 'info' else ''
174230
text += f''' <b>Node {self.hostname}: {alert_name_readable} </b>
175231
176232
{alert.text.format(*args, **kwargs)}
@@ -190,6 +246,30 @@ def send_alert(self, alert_name: str, *args, **kwargs):
190246
if time.time() - last_sent > alert.timeout:
191247
self.send_message(text, alert.severity == "info") # send info alerts without sound
192248
self.set_alert_sent(alert_name)
249+
if track_active:
250+
self._set_alert_active(alert_name, True)
251+
252+
def resolve_alert(self, alert_name: str, ok_alert_name: str = None, **kwargs):
    """Clear the active flag of ``alert_name`` and announce the recovery.

    Does nothing unless ``alert_name`` is currently marked active. When it
    is, the matching recovery alert is sent first and the active flag is
    cleared afterwards.

    :param alert_name: name of the problem alert being resolved.
    :param ok_alert_name: name of the recovery alert to send; falls back
        to ``f"{alert_name}_ok"`` when not given (or falsy).
    :param kwargs: forwarded unchanged to ``send_alert`` for formatting
        the recovery message.
    """
    if self._is_alert_active(alert_name):
        recovery_name = ok_alert_name if ok_alert_name else f"{alert_name}_ok"
        # The recovery message is optional: it is skipped when no such alert
        # is registered in ALERTS or when it has been disabled.
        if recovery_name in ALERTS and self.alert_is_enabled(recovery_name):
            # track_active=False so the recovery alert is not itself marked active.
            self.send_alert(recovery_name, track_active=False, **kwargs)
        self._set_alert_active(alert_name, False)
264+
265+
def resolve_alert_group(self, alert_names: list, ok_alert_name: str, **kwargs):
    """Resolve several related alerts with a single recovery message.

    Does nothing unless at least one alert in ``alert_names`` is marked
    active. Otherwise one recovery alert is sent (if it is registered in
    ALERTS and enabled) and every active alert of the group is cleared.

    :param alert_names: names of the problem alerts forming the group.
    :param ok_alert_name: name of the shared recovery alert.
    :param kwargs: forwarded unchanged to ``send_alert`` for formatting
        the recovery message.
    """
    # Stop scanning at the first active alert; bail out if there is none.
    for candidate in alert_names:
        if self._is_alert_active(candidate):
            break
    else:
        return

    may_notify = ok_alert_name in ALERTS and self.alert_is_enabled(ok_alert_name)
    if may_notify:
        # track_active=False so the recovery alert is not itself marked active.
        self.send_alert(ok_alert_name, track_active=False, **kwargs)

    for candidate in alert_names:
        if not self._is_alert_active(candidate):
            continue
        self._set_alert_active(candidate, False)
193273

194274
def set_global_vars(self):
195275
# set global vars for correct alerts timeouts for current network
@@ -222,7 +302,7 @@ def get_alert_from_db(self, alert_name: str):
222302
if 'alerts' not in self.ton.local.db:
223303
self.ton.local.db['alerts'] = {}
224304
if alert_name not in self.ton.local.db['alerts']:
225-
self.ton.local.db['alerts'][alert_name] = {'sent': 0, 'enabled': True}
305+
self.ton.local.db['alerts'][alert_name] = {'sent': 0, 'enabled': True, 'active': False, 'resolved_sent': 0}
226306
return self.ton.local.db['alerts'][alert_name]
227307

228308
def set_alert_sent(self, alert_name: str):
@@ -242,6 +322,16 @@ def set_alert_enabled(self, alert_name: str, enabled: bool):
242322
alert['enabled'] = enabled
243323
self.ton.local.save()
244324

325+
def _set_alert_active(self, alert_name: str, active: bool):
    """Update the stored ``active`` flag of an alert.

    The record is only touched when the flag actually changes. On a
    change to inactive, ``resolved_sent`` is stamped with the current
    Unix time (whole seconds).

    :param alert_name: name of the alert whose record is updated.
    :param active: new value of the ``active`` flag.
    """
    record = self.get_alert_from_db(alert_name)
    if record.get('active') == active:
        return  # no state change, nothing to write
    record['active'] = active
    if active:
        return
    record['resolved_sent'] = int(time.time())
331+
332+
def _is_alert_active(self, alert_name: str) -> bool:
    """Return the stored ``active`` flag of an alert (``False`` when the key is absent)."""
    record = self.get_alert_from_db(alert_name)
    return record.get('active', False)
334+
245335
def enable_alert(self, args):
246336
if len(args) != 1:
247337
raise Exception("Usage: enable_alert <alert_name>")
@@ -314,6 +404,8 @@ def check_db_usage(self):
314404
self.send_alert("db_usage_95")
315405
elif usage > 80:
316406
self.send_alert("db_usage_80")
407+
else:
408+
self.resolve_alert_group(["db_usage_95", "db_usage_80"], "db_usage_ok", usage=int(usage))
317409

318410
def check_validator_wallet_balance(self):
319411
if not self.ton.using_validator():
@@ -325,6 +417,8 @@ def check_validator_wallet_balance(self):
325417
validator_account = self.ton.GetAccount(validator_wallet.addrB64)
326418
if validator_account.status != "empty" and validator_account.balance < 10:
327419
self.send_alert("low_wallet_balance", wallet=validator_wallet.addrB64, balance=validator_account.balance)
420+
else:
421+
self.resolve_alert("low_wallet_balance", ok_alert_name="low_wallet_balance_ok", wallet=validator_wallet.addrB64, balance=validator_account.balance)
328422

329423
def check_efficiency(self):
330424
if not self.ton.using_validator():
@@ -345,11 +439,15 @@ def check_validator_working(self):
345439
validator_status = self.ton.GetValidatorStatus()
346440
if not self.initial_sync and not validator_status.is_working:
347441
self.send_alert("service_down")
442+
elif not self.initial_sync and validator_status.is_working:
443+
self.resolve_alert("service_down", ok_alert_name="service_down_ok")
348444

349445
def check_sync(self):
350446
validator_status = self.ton.GetValidatorStatus()
351447
if not self.initial_sync and validator_status.is_working and validator_status.out_of_sync >= 20:
352448
self.send_alert("out_of_sync", sync=validator_status.out_of_sync)
449+
elif not self.initial_sync and validator_status.is_working and validator_status.out_of_sync < 20:
450+
self.resolve_alert("out_of_sync", ok_alert_name="sync_ok", sync=validator_status.out_of_sync)
353451

354452
def check_zero_blocks_created(self):
355453
if not self.ton.using_validator():
@@ -362,7 +460,10 @@ def check_zero_blocks_created(self):
362460
return
363461
validators = self.ton.GetValidatorsList(start=start, end=end)
364462
validator = self.validator_module.find_myself(validators)
365-
if validator is None or validator.blocks_created > 0:
463+
if validator is None:
464+
return
465+
if validator.blocks_created > 0:
466+
self.resolve_alert("zero_block_created", ok_alert_name="zero_block_created_ok", blocks=validator.blocks_created, hours=round(period / 3600))
366467
return
367468
self.send_alert("zero_block_created", hours=round(period / 3600))
368469

@@ -380,6 +481,8 @@ def check_adnl_connection_failed(self):
380481
if not ok:
381482
self.local.add_log(error, "warning")
382483
self.send_alert("adnl_connection_failed")
484+
else:
485+
self.resolve_alert("adnl_connection_failed", ok_alert_name="adnl_connection_ok")
383486

384487
def get_myself_from_election(self, config: dict):
385488
if not config["validators"]:
@@ -444,6 +547,8 @@ def check_voting(self):
444547
need_to_vote.append(offer['hash'])
445548
if need_to_vote:
446549
self.send_alert("voting", hashes=' '.join(need_to_vote))
550+
else:
551+
self.resolve_alert("voting", ok_alert_name="voting_ok")
447552

448553
def check_initial_sync(self):
449554
if not self.initial_sync:
@@ -452,6 +557,29 @@ def check_initial_sync(self):
452557
self.initial_sync = False
453558
self.send_alert("initial_sync_completed")
454559

560+
def check_online_collators(self):
    """Alert when some shard has no online collator at all.

    Skipped when the node is not used as a validator or when the collators
    list is empty or has no shards. Shards without configured collators are
    ignored. Sends ``shard_collators_offline`` listing the affected shards,
    or resolves that alert when no shard is affected.
    """
    if not self.ton.using_validator():
        return
    layout = self.validator_module.get_collators_list()
    if not (layout and layout['shards']):
        return
    # NOTE(review): presumably maps adnl_id -> truthy value when the collator
    # is online; confirm against validator_module.get_collators_stats.
    stats = self.validator_module.get_collators_stats()

    dead_shards = []
    for entry in layout['shards']:
        if not entry['collators']:
            continue
        liveness = [stats.get(member['adnl_id']) for member in entry['collators']]
        if any(liveness):
            continue
        # Shard is reported as "<workchain>:<shard as hex>".
        dead_shards.append(f"{entry['shard_id']['workchain']}:{signed_int_to_hex64(int(entry['shard_id']['shard']))}")

    if not dead_shards:
        self.resolve_alert("shard_collators_offline", ok_alert_name="shard_collators_ok")
        return
    self.send_alert("shard_collators_offline", shards=' '.join(dead_shards))
582+
455583
def check_status(self):
456584
if not self.ton.using_alert_bot():
457585
return
@@ -471,6 +599,7 @@ def check_status(self):
471599
self.local.try_function(self.check_stake_returned)
472600
self.local.try_function(self.check_voting)
473601
self.local.try_function(self.check_initial_sync)
602+
self.local.try_function(self.check_online_collators)
474603

475604
def add_console_commands(self, console):
476605
console.AddItem("enable_alert", self.enable_alert, self.local.translate("enable_alert_cmd"))

0 commit comments

Comments
 (0)