Skip to content

Commit

Permalink
Merge pull request #1126 from skalenetwork/fix-process-check
Browse files Browse the repository at this point in the history
Fix various checks issues
  • Loading branch information
badrogger authored Nov 4, 2024
2 parents 34ee3f6 + d8c2b7b commit 63d8319
Show file tree
Hide file tree
Showing 11 changed files with 74 additions and 96 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,6 @@ tests/skale-data/contracts_info/schain_ima_abi.json
temp_*

.DS_Store
.coverage*
.coverage*

.env
13 changes: 6 additions & 7 deletions core/schains/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
from core.schains.dkg.utils import get_secret_key_share_filepath
from core.schains.firewall.types import IRuleController
from core.schains.ima import get_ima_time_frame, get_migration_ts as get_ima_migration_ts
from core.schains.process import is_monitor_process_alive
from core.schains.rpc import (
check_endpoint_alive,
check_endpoint_blocks,
Expand Down Expand Up @@ -390,11 +389,6 @@ def blocks(self) -> CheckRes:
return CheckRes(check_endpoint_blocks(http_endpoint))
return CheckRes(False)

@property
def process(self) -> CheckRes:
"""Checks that sChain monitor process is running"""
return CheckRes(is_monitor_process_alive(self.schain_record.monitor_id))

@property
def exit_zero(self) -> CheckRes:
"""Check that sChain container exited with zero code"""
Expand Down Expand Up @@ -453,7 +447,12 @@ def __getattr__(self, attr: str) -> Any:
def get_name(self) -> str:
return self.name

def get_all(self, log: bool = True, save: bool = False, needed: Optional[List[str]] = None):
def get_all(
self,
log: bool = True,
save: bool = False,
needed: Optional[List[str]] = None
) -> dict:
needed = needed or API_ALLOWED_CHECKS

plain_checks = {}
Expand Down
6 changes: 3 additions & 3 deletions core/schains/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class SchainData:
dkg_status: int
is_deleted: bool
first_run: bool
repair_mode: bool
repair_ts: int

def to_dict(self) -> dict:
return {
Expand All @@ -30,7 +30,7 @@ def to_dict(self) -> dict:
'dkg_status': self.dkg_status,
'is_deleted': self.is_deleted,
'first_run': self.first_run,
'repair_mode': self.repair_mode
'repair_ts': self.repair_ts
}


Expand All @@ -52,7 +52,7 @@ def get_schain_info_by_name(skale: Skale, schain_name: str) -> SchainData:
record.dkg_status,
record.is_deleted,
record.first_run,
record.repair_mode
int(record.repair_date.timestamp())
)


Expand Down
7 changes: 7 additions & 0 deletions core/schains/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import os
import shutil
import signal
import time
from typing import Tuple

import pathlib
Expand Down Expand Up @@ -143,3 +144,9 @@ def terminate_process(
def is_monitor_process_alive(monitor_pid: int) -> bool:
    """Tell whether ``monitor_pid`` is set (non-zero) and passes ``check_pid``.

    A pid of ``0`` is treated as "not initialised" and short-circuits to
    ``False`` without consulting ``check_pid``.
    """
    if monitor_pid == 0:
        return False
    return check_pid(monitor_pid)


def is_process_healthy(schain_name: str, allowed_diff: int) -> bool:
    """Report whether the process recorded for ``schain_name`` looks healthy.

    The check passes only when ``get_schain_process_info`` yields a pid, that
    pid passes ``is_monitor_process_alive``, and the recorded timestamp is
    less than ``allowed_diff`` behind the current time.

    NOTE(review): the recorded timestamp is presumably a Unix timestamp in
    seconds (it is compared against ``int(time.time())``) -- confirm against
    ``get_schain_process_info``.
    """
    pid, recorded_ts = get_schain_process_info(schain_name)
    now = int(time.time())
    if pid is None:
        return False
    alive = is_monitor_process_alive(pid)
    if not alive:
        # Hand back the liveness result itself, as the chained `and` did.
        return alive
    return now - recorded_ts < allowed_diff
9 changes: 8 additions & 1 deletion tests/routes/node_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import socket
import datetime
import glob
import shutil
import socket

import pytest
import mock
Expand All @@ -15,6 +17,7 @@
from core.node import Node, NodeStatus
from core.node_config import NodeConfig
from core.schains.config.file_manager import ConfigFileManager
from tools.configs.schains import SCHAINS_DIR_PATH
from tools.configs.tg import TG_API_KEY, TG_CHAT_ID
from web.routes.node import node_bp
from web.helper import get_api_url
Expand Down Expand Up @@ -259,6 +262,10 @@ def test_exit_maintenance(skale_bp, node_config_in_maintenance):


def test_update_safe(skale, schain_on_contracts, schain_config, upstreams, skale_bp):
for path in glob.glob(f'{SCHAINS_DIR_PATH}/*'):
if not path.endswith(schain_on_contracts):
shutil.rmtree(path)

data = get_bp_data(
skale_bp,
get_api_url(BLUEPRINT_NAME, 'update-safe'),
Expand Down
2 changes: 1 addition & 1 deletion tests/routes/schains_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def test_get_schain(
'id': schain_id,
'mainnet_owner': skale.wallet.address,
'part_of_node': 1, 'dkg_status': 1, 'is_deleted': False,
'first_run': True, 'repair_mode': False
'first_run': True, 'repair_ts': int(r.repair_date.timestamp())
}
}

Expand Down
28 changes: 1 addition & 27 deletions tests/schains/checks_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from http import HTTPStatus

from collections import namedtuple
from multiprocessing import Process

import mock
import docker
Expand All @@ -20,9 +19,8 @@
schain_config_dir
)
from core.schains.config.schain_node import generate_schain_nodes
from core.schains.skaled_exit_codes import SkaledExitCodes
from core.schains.runner import get_container_info, get_image_name, run_ima_container
# from core.schains.cleaner import remove_ima_container
from core.schains.skaled_exit_codes import SkaledExitCodes

from tools.configs.containers import IMA_CONTAINER, SCHAIN_CONTAINER
from tools.helper import read_json
Expand Down Expand Up @@ -330,29 +328,6 @@ def test_exit_code(skale, rule_controller, schain_db, current_nodes, estate, dut
dutils.safe_rm(container_name)


def test_process(skale, rule_controller, schain_db, current_nodes, estate, dutils):
schain_record = SChainRecord.get_by_name(schain_db)
checks = SChainChecks(
schain_db,
TEST_NODE_ID,
schain_record=schain_record,
rule_controller=rule_controller,
stream_version=CONFIG_STREAM,
current_nodes=current_nodes,
last_dkg_successful=True,
estate=estate,
dutils=dutils
)
assert not checks.process.status

process = Process(target=time.sleep, args=(5,))
process.start()
schain_record.set_monitor_id(process.ident)
assert checks.process.status
process.join()
assert not checks.process.status


def test_get_all(schain_config, rule_controller, dutils, current_nodes, schain_db, estate):
schain_name = schain_config['skaleConfig']['sChain']['schainName']
schain_record = SChainRecord.get_by_name(schain_name)
Expand All @@ -377,7 +352,6 @@ def test_get_all(schain_config, rule_controller, dutils, current_nodes, schain_d
assert isinstance(checks_dict['rpc'], bool)
assert isinstance(checks_dict['blocks'], bool)
assert isinstance(checks_dict['ima_container'], bool)
assert isinstance(checks_dict['process'], bool)

estate.ima_linked = False
checks_without_ima = SChainChecksMock(
Expand Down
34 changes: 17 additions & 17 deletions tests/schains/info_test.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,42 @@
import freezegun

from core.schains.info import get_schain_info_by_name
from tests.utils import upsert_schain_record_with_config
from tests.utils import CURRENT_DATETIME, upsert_schain_record_with_config


@freezegun.freeze_time(CURRENT_DATETIME)
def test_get_schain_info_by_name(skale, schain_on_contracts, schain_db):
name = schain_on_contracts
upsert_schain_record_with_config(name)
schain_record = upsert_schain_record_with_config(name)
info = get_schain_info_by_name(skale, name)
expected_ts = int(schain_record.repair_date.timestamp())
assert info.name == name
assert info.schain_id == skale.schains.name_to_id(name)
assert info.part_of_node == 1
assert info.dkg_status == 1
assert not info.is_deleted
assert info.first_run
assert not info.repair_mode
assert info.repair_ts == expected_ts

assert info.to_dict() == {
'name': name,
'id': skale.schains.name_to_id(name),
'mainnet_owner': info.mainnet_owner,
'part_of_node': 1,
'dkg_status': 1,
'is_deleted': False,
'first_run': True,
'repair_mode': False
'name': name,
'id': skale.schains.name_to_id(name),
'mainnet_owner': info.mainnet_owner,
'part_of_node': 1,
'dkg_status': 1,
'is_deleted': False,
'first_run': True,
'repair_ts': expected_ts,
}


def test_get_schain_info_by_name_not_exist_contracts(
skale, schain_db
):
def test_get_schain_info_by_name_not_exist_contracts(skale, schain_db):
name = 'undefined_schain'
info = get_schain_info_by_name(skale, name)
assert info is None


def test_get_schain_info_by_name_not_exist_db(
skale, schain_on_contracts, db
):
def test_get_schain_info_by_name_not_exist_db(skale, schain_on_contracts, db):
name = schain_on_contracts
info = get_schain_info_by_name(skale, name)
assert info is None
2 changes: 2 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
""" SKALE test utilities """

import datetime
import os
import json
import random
Expand Down Expand Up @@ -40,6 +41,7 @@
from web.models.schain import upsert_schain_record

CURRENT_TS = 1594903080
CURRENT_DATETIME = datetime.datetime.utcfromtimestamp(CURRENT_TS)

DIR_PATH = os.path.dirname(os.path.realpath(__file__))
ENDPOINT = os.getenv('ENDPOINT')
Expand Down
55 changes: 20 additions & 35 deletions web/routes/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,14 @@
from core.node import get_check_report, get_skale_node_version
from core.node import get_current_nodes
from core.schains.checks import SChainChecks
from core.schains.firewall.utils import (
get_default_rule_controller,
get_sync_agent_ranges
)
from core.schains.ima import get_ima_log_checks
from core.schains.external_config import ExternalState
from core.schains.firewall.utils import get_default_rule_controller, get_sync_agent_ranges
from core.schains.ima import get_ima_log_checks
from core.schains.process import is_process_healthy
from tools.configs.schains import DKG_TIMEOUT_COEFFICIENT
from tools.sgx_utils import SGX_CERTIFICATES_FOLDER, SGX_SERVER_URL
from web.models.schain import SChainRecord
from web.helper import (
construct_err_response,
construct_ok_response,
get_api_url,
g_skale
)
from web.helper import construct_err_response, construct_ok_response, get_api_url, g_skale

logger = logging.getLogger(__name__)
BLUEPRINT_NAME = 'health'
Expand All @@ -56,9 +50,7 @@ def containers():
all = request.args.get('all') == 'True'
name_filter = request.args.get('name_filter') or ''
containers_list = g.docker_utils.get_containers_info(
all=all,
name_filter=name_filter,
format=True
all=all, name_filter=name_filter, format=True
)
return construct_ok_response(containers_list)

Expand All @@ -72,26 +64,21 @@ def schains_checks():
checks_filter = checks_filter.split(',')
node_id = g.config.id
if node_id is None:
return construct_err_response(status_code=HTTPStatus.BAD_REQUEST,
msg='No node installed')
return construct_err_response(status_code=HTTPStatus.BAD_REQUEST, msg='No node installed')

schains = g.skale.schains.get_schains_for_node(node_id)
allowed_diff = int(g.skale.constants_holder.get_dkg_timeout() * DKG_TIMEOUT_COEFFICIENT)
sync_agent_ranges = get_sync_agent_ranges(g.skale)
stream_version = get_skale_node_version()
estate = ExternalState(
chain_id=g.skale.web3.eth.chain_id,
ima_linked=True,
ranges=[]
)
estate = ExternalState(chain_id=g.skale.web3.eth.chain_id, ima_linked=True, ranges=[])
checks = []
for schain in schains:
if schain.name != '':
rotation_data = g.skale.node_rotation.get_rotation(schain.name)
rotation_id = rotation_data['rotation_id']
if SChainRecord.added(schain.name):
rc = get_default_rule_controller(
name=schain.name,
sync_agent_ranges=sync_agent_ranges
name=schain.name, sync_agent_ranges=sync_agent_ranges
)
current_nodes = get_current_nodes(g.skale, schain.name)
schain_record = SChainRecord.get_by_name(schain.name)
Expand All @@ -105,12 +92,14 @@ def schains_checks():
current_nodes=current_nodes,
last_dkg_successful=True,
estate=estate,
sync_node=False
sync_node=False,
).get_all(needed=checks_filter)
checks.append({
'name': schain.name,
'healthchecks': schain_checks
})
if not checks_filter or 'process' in checks_filter:
schain_checks.update(
{'process': is_process_healthy(schain.name, allowed_diff=allowed_diff)}
)

checks.append({'name': schain.name, 'healthchecks': schain_checks})
return construct_ok_response(checks)


Expand All @@ -119,8 +108,7 @@ def ima_log_checks():
logger.debug(request)
node_id = g.config.id
if node_id is None:
return construct_err_response(status_code=HTTPStatus.BAD_REQUEST,
msg='No node installed')
return construct_err_response(status_code=HTTPStatus.BAD_REQUEST, msg='No node installed')
checks = get_ima_log_checks()
return construct_ok_response(checks)

Expand Down Expand Up @@ -152,15 +140,12 @@ def sgx_info():
'status_https': status_https,
'sgx_server_url': SGX_SERVER_URL,
'sgx_keyname': g.config.sgx_key_name,
'sgx_wallet_version': version
'sgx_wallet_version': version,
}
return construct_ok_response(data=res)


@health_bp.route(
get_api_url(BLUEPRINT_NAME, 'check-report'),
methods=['GET']
)
@health_bp.route(get_api_url(BLUEPRINT_NAME, 'check-report'), methods=['GET'])
def check_report():
logger.debug(request)
report = get_check_report()
Expand Down
10 changes: 6 additions & 4 deletions web/routes/schains.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import logging
from dataclasses import asdict

from flask import Blueprint, g, request

Expand Down Expand Up @@ -89,10 +90,11 @@ def schains_list():
node_id = g.config.id
if node_id is None:
return construct_err_response(msg='No node installed')
schains_list = list(filter(
lambda s: s.get('name'),
g.skale.schains.get_schains_for_node(node_id)
))
schains_list = [
asdict(s)
for s in g.skale.schains.get_schains_for_node(node_id)
if s and s.name != ''
]
return construct_ok_response(schains_list)


Expand Down

0 comments on commit 63d8319

Please sign in to comment.