Skip to content

Commit

Permalink
Merge pull request #294 from w3f/slash
Browse files Browse the repository at this point in the history
Slash
  • Loading branch information
ironoa authored Jul 3, 2024
2 parents a8fb450 + 89b535d commit 7b4ed85
Show file tree
Hide file tree
Showing 10 changed files with 48 additions and 219 deletions.
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[![CircleCI](https://circleci.com/gh/w3f/polkadot-watcher-validator.svg?style=svg)](https://circleci.com/gh/w3f/polkadot-watcher-validator)

# polkadot-watcher-validator
# polkadot-watcher-validator

## How to Run

Expand All @@ -25,8 +25,7 @@ It can then monitor the status of the node, leveraging on mechanisms such as the

### Monitoring Features

- An Active validator has been detected offline
- An Active validator is risking to be caught offline, please act ASAP
- A validator has been reported for Slash
- A validator is not selected by the Phragmén algorithm to be part of the active set
- A validator has changed his payout address destination
- A validator has an unexpected payout address destination
Expand All @@ -40,11 +39,10 @@ If an expected destination address is specified in the config file, it is implic

### Resources

- validators staking and heartbeats: https://wiki.polkadot.network/docs/en/learn-staking#unresponsiveness
- session: https://wiki.polkadot.network/docs/en/glossary#session
- era: https://wiki.polkadot.network/docs/en/glossary#era
- polkadotJs library (recommended, Node.js + TypeScript): https://polkadot.js.org/docs/
- event, validators offline: https://polkadot.js.org/docs/substrate/events#someofflinevecidentificationtuple
- event, validator SlashReported: https://polkadot.js.org/docs/substrate/events/#slashreportedaccountid32-perbill-u32

## Configuration

Expand Down
26 changes: 4 additions & 22 deletions charts/polkadot-watcher/templates/alertrules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,33 +44,15 @@ spec:
severity: critical
origin: {{ .Values.prometheusRules.origin }}
{{ end }}
- alert: ValidatorOffline
- alert: ValidatorSlashed
annotations:
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/validator/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> was reported offline, an advanced double check can be carried <a href="https://{{`{{ $labels.network }}`}}.subscan.io/event?address=&module=imonline&event=someoffline">here</a>. Check the account for eventual <a href="https://{{`{{ $labels.network }}`}}.subscan.io/account/{{`{{ $labels.address }}`}}?tab=reward">slashes</a>. This message is going to RESOLVE by itself soon.'
runbook_url: "https://github.com/w3f/infrastructure/wiki/Validator-Offline"
expr: max without(instance,pod) (increase(polkadot_validator_offline_reports{environment="{{ .Values.config.environment }}"}[5m])) > 0
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/validator/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> was reported for Slash, an advanced double check can be carried <a href="https://{{`{{ $labels.network }}`}}.subscan.io/event?address={{`{{ $labels.address }}`}}&module=staking&event_id=slashreported">here</a>. This message is going to RESOLVE by itself soon.'
runbook_url: "https://github.com/w3f/infrastructure/wiki/Validator-Slashed"
expr: max without(instance,pod) (increase(polkadot_validator_slashed_reports{environment="{{ .Values.config.environment }}"}[5m])) > 0
for: 30s
labels:
severity: critical
origin: {{ .Values.prometheusRules.origin }}
{{ if ne .Values.prometheusRules.offlineRisk false }}
- alert: ValidatorOfflineRiskLong
annotations:
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/validator/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> has either not authored any block or sent any heartbeat yet in this session. It is risking to be caught offline'
expr: max without(instance,pod) (last_over_time(polkadot_validator_offline_risk_state{environment="{{ .Values.config.environment }}"}[10m])) > 0
for: 10m
labels:
severity: critical
origin: {{ .Values.prometheusRules.origin }}
- alert: ValidatorOfflineRiskShort
annotations:
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/validator/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> has either not authored any block or sent any heartbeat yet in this session. It is risking to be caught offline'
expr: max without(instance,pod) (last_over_time(polkadot_validator_offline_risk_state{environment="{{ .Values.config.environment }}"}[10m])) > 0
for: 8m
labels:
severity: warning
origin: {{ .Values.prometheusRules.origin }}
{{ end }}
- alert: ValidatorRewardDestinationChanged
annotations:
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/validator/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> may have changed his reward destination recently, please double check. This message is going to RESOLVE by itself soon.'
Expand Down
1 change: 0 additions & 1 deletion charts/polkadot-watcher/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ prometheusRules:
labels:
app: w3f
origin: cluster
offlineRisk: true
producerStall: true

resources:
Expand Down
39 changes: 6 additions & 33 deletions src/prometheus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,8 @@ import { PromClient } from './types';

export class Prometheus implements PromClient {

static readonly nameOfflineRiskMetric = 'polkadot_validator_offline_risk_state';

private blocksProducedReports: promClient.Counter<"network" | "name" | "address" | "environment">;
private offlineReports: promClient.Counter<"network" | "name" | "address" | "environment">;
private stateOfflineRisk: promClient.Gauge<"network" | "name" | "address" | "environment">;
private slashedReports: promClient.Counter<"network" | "name" | "address" | "environment">;
private stateOutOfActiveSet: promClient.Gauge<"network" | "name" | "address" | "environment">;

private payeeChangedReports: promClient.Counter<"network" | "name" | "address" | "environment">;
Expand All @@ -32,33 +29,14 @@ export class Prometheus implements PromClient {

increaseBlocksProducedReports(name: string, address: string): void {
this.blocksProducedReports.inc({network:this.network, name, address, environment: this.environment })
this.resetStatusOfflineRisk(name, address) //solve potential risk status
}

increaseOfflineReports(name: string, address: string): void {
this.offlineReports.inc({network:this.network, name, address, environment: this.environment });
}

setStatusOfflineRisk(name: string, address: string): void {
this.stateOfflineRisk.set({network:this.network, name, address, environment: this.environment }, 1);
}

resetStatusOfflineRisk(name: string, address: string): void {
this.stateOfflineRisk.set({network:this.network, name, address, environment: this.environment }, 0);
}

isStatusOfflineRiskFiring(name: string, address: string): boolean {
try {
return promClient.register.getSingleMetric(Prometheus.nameOfflineRiskMetric)['hashMap'][`name:${name},network:${this.network},address:${address},environment:${this.environment}`]['value'] == 1
} catch (error) {
this.resetStatusOfflineRisk(name, address)
return promClient.register.getSingleMetric(Prometheus.nameOfflineRiskMetric)['hashMap'][`name:${name},network:${this.network},address:${address},environment:${this.environment}`]['value'] == 1
}
increaseSlashedReports(name: string, address: string): void {
this.slashedReports.inc({network:this.network, name, address, environment: this.environment });
}

setStatusOutOfActiveSet(name: string, address: string): void{
this.stateOutOfActiveSet.set({network:this.network, name, address, environment: this.environment }, 1);
this.resetStatusOfflineRisk(name,address) //solve potential risk status
}

resetStatusOutOfActiveSet(name: string, address: string): void{
Expand Down Expand Up @@ -95,14 +73,9 @@ export class Prometheus implements PromClient {
help: 'Number of blocks produced by a validator',
labelNames: ['network', 'name', 'address', 'environment']
});
this.offlineReports = new promClient.Gauge({
name: 'polkadot_validator_offline_reports',
help: 'Times a validator has been reported offline',
labelNames: ['network', 'name', 'address', 'environment']
});
this.stateOfflineRisk = new promClient.Gauge({
name: Prometheus.nameOfflineRiskMetric,
help: 'Whether a validator has not produced a block and neither has sent an expected heartbeat yet. It is risking to be caught offline',
this.slashedReports = new promClient.Gauge({
name: 'polkadot_validator_slashed_reports',
help: 'Times a validator has been reported for slashing',
labelNames: ['network', 'name', 'address', 'environment']
});
this.stateOutOfActiveSet = new promClient.Gauge({
Expand Down
67 changes: 19 additions & 48 deletions src/subscriber.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ import {
InputConfig,
Subscribable,
PromClient,
ValidatorImOnlineParameters
} from './types';
import { getActiveEraIndex, isHeadAfterHeartbeatBlockThreshold, hasValidatorProvedOnline, isNewSessionEvent, isOfflineEvent } from './utils';
import { getActiveEraIndex } from './utils';

export class Subscriber {
private validators: Array<Subscribable>;
Expand Down Expand Up @@ -40,7 +39,7 @@ export class Subscriber {

public triggerConnectivityTest(): void {
const testAccountName = "CONNECTIVITY_TEST_NO_ACTION_REQUIRED"
this.promClient.increaseOfflineReports(testAccountName,testAccountName);
this.promClient.increaseSlashedReports(testAccountName,testAccountName);
}

private async _initInstanceVariables(): Promise<void>{
Expand All @@ -60,7 +59,7 @@ export class Subscriber {
private async _handleNewHeadSubscriptions(): Promise<void> {
this.api.rpc.chain.subscribeNewHeads(async (header) => {
this._producerHandler(header);
this._validatorStatusHandler(header);
this._validatorStatusHandler();
this._payeeChangeHandler(header);
this._commissionChangeHandler(header);
this._checkUnexpected();
Expand Down Expand Up @@ -98,11 +97,11 @@ export class Subscriber {
events.forEach(async (record) => {
const { event } = record;

if(isOfflineEvent(event)){
this._offlineEventHandler(event)
if(this.api.events.staking.SlashReported.is(event)){
this._slashedEventHandler(event)
}

if(isNewSessionEvent(event)){
if(this.api.events.session.NewSession.is(event)){
await this._newSessionEventHandler()
}
});
Expand All @@ -123,18 +122,16 @@ export class Subscriber {
}
}

private async _validatorStatusHandler(header: Header): Promise<void> {
const parameters = await this._getImOnlineParametersAtomic(header)
private async _validatorStatusHandler(): Promise<void> {

this.validators.forEach(async account => {

const validatorActiveSetIndex = parameters.validatorActiveSet.indexOf(account.address)
const validatorActiveSetIndex = this.validatorActiveSet.indexOf(account.address)
if ( validatorActiveSetIndex < 0 ) {
this.logger.debug(`Target ${account.name} is not present in the validation active set of era ${parameters.eraIndex}`);
this.logger.debug(`Target ${account.name} is not present in the validation active set of era ${this.currentEraIndex}`);
this.promClient.setStatusOutOfActiveSet(account.name,account.address);
} else {
this.promClient.resetStatusOutOfActiveSet(account.name,account.address);
await this._checkOfflineRiskStatus(parameters,account,validatorActiveSetIndex)
}
})

Expand Down Expand Up @@ -208,28 +205,18 @@ export class Subscriber {
}
}

private async _checkOfflineRiskStatus(parameters: ValidatorImOnlineParameters,validator: Subscribable,validatorActiveSetIndex: number): Promise<void>{
if ( await hasValidatorProvedOnline(validator,validatorActiveSetIndex,parameters.sessionIndex,this.api) ) {
this.promClient.resetStatusOfflineRisk(validator.name,validator.address);
} else if(parameters.isHeartbeatExpected) {
this.logger.info(`Target ${validator.name} has either not authored any block or sent any heartbeat yet in session:${parameters.sessionIndex}/era:${parameters.eraIndex}`);
this.promClient.setStatusOfflineRisk(validator.name,validator.address);
}
// else let it be as it is.
// with this solution, if a validator has been caught offline, it will eventually remain in a risk status also for the first half of the subsequent session.
}

private _offlineEventHandler(event: Event): void {
private _slashedEventHandler(event: Event): void {

const items = event.data[0];

(items as Tuple).forEach((item) => {
const offlineValidator = item[0];
this.logger.debug(`${offlineValidator} found offline`);
const account = this.validators.find((subject) => subject.address == offlineValidator);
const validator = item[0];
this.logger.debug(`${validator} has been reported for Slash`);
const account = this.validators.find((subject) => subject.address == validator);

if (account) {
this.logger.info(`Really bad... Target ${account.name} found offline`);
this.promClient.increaseOfflineReports(account.name, account.address);
this.logger.info(`Really bad... Target ${account.name} has been reported for Slash`);
this.promClient.increaseSlashedReports(account.name, account.address);
}
});
}
Expand All @@ -249,25 +236,9 @@ export class Subscriber {
await this._initValidatorsControllers();
}

private async _getImOnlineParametersAtomic(header: Header): Promise<ValidatorImOnlineParameters> {

const sessionIndex = this.sessionIndex
const eraIndex = this.currentEraIndex
const validatorActiveSet = this.validatorActiveSet
this.logger.debug(`Current EraIndex: ${eraIndex}\tCurrent SessionIndex: ${sessionIndex}`);
const isHeartbeatExpected = await isHeadAfterHeartbeatBlockThreshold(this.api,header)

return {
isHeartbeatExpected,
sessionIndex,
eraIndex,
validatorActiveSet
}
}

private _initCounterMetrics(): void {
this._initBlocksProducedMetrics();
this._initOfflineReportsMetrics()
this._initSlashedReportsMetrics()
this._initPayeeChangedMetrics();
this._initCommissionChangedMetrics();
}
Expand All @@ -280,11 +251,11 @@ export class Subscriber {
});
}

private _initOfflineReportsMetrics(): void {
private _initSlashedReportsMetrics(): void {
this.validators.forEach((account) => {
// always increase counters even the first time, so that we initialize the time series
// https://github.com/prometheus/prometheus/issues/1673
this.promClient.increaseOfflineReports(account.name, account.address);
this.promClient.increaseSlashedReports(account.name, account.address);
});
}

Expand Down
6 changes: 1 addition & 5 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,7 @@ export interface InputConfigFromGit {

export interface PromClient {
increaseBlocksProducedReports(name: string, address: string): void;
increaseOfflineReports(name: string, address: string): void;
setStatusOfflineRisk(name: string, address: string): void;
resetStatusOfflineRisk(name: string, address: string): void;
isStatusOfflineRiskFiring(name: string, address: string): boolean;
increaseSlashedReports(name: string, address: string): void;
setStatusOutOfActiveSet(name: string, address: string): void;
resetStatusOutOfActiveSet(name: string, address: string): void;
increasePayeeChangedReports(name: string, address: string): void;
Expand Down Expand Up @@ -73,7 +70,6 @@ export interface MatrixbotMsg {
}

export interface ValidatorImOnlineParameters {
isHeartbeatExpected: boolean;
sessionIndex: SessionIndex;
eraIndex: number;
validatorActiveSet: Vec<ValidatorId>;
Expand Down
40 changes: 0 additions & 40 deletions src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,56 +1,16 @@
/*eslint @typescript-eslint/no-use-before-define: ["error", { "variables": false }]*/

import { ApiPromise } from '@polkadot/api';
import { Event } from '@polkadot/types/interfaces/system';
import { SessionIndex, Header } from '@polkadot/types/interfaces';
import { Subscribable } from './types';
import { ZeroBN } from './constants';
import { LoggerSingleton } from './logger';

const logger = LoggerSingleton.getInstance()

export const isNewSessionEvent = (event: Event): boolean => {
return event.section == 'session' && event.method == 'NewSession';
}

export const isOfflineEvent = (event: Event): boolean => {
return event.section == 'imOnline' && event.method == 'SomeOffline';
}

export const hasValidatorProvedOnline = async (account: Subscribable, validatorIndex: number, sessionIndex: SessionIndex, api: ApiPromise): Promise<boolean> => {
return await _hasValidatorAuthoredBlocks(account,sessionIndex,api) || await _hasValidatorSentHeartbeats(validatorIndex,sessionIndex,api)
}

/**
 * Queries `staking.activeEra` and returns the era index it reports.
 *
 * @param api - API instance used to run the storage query
 * @returns the `index` field of the JSON representation of the query result
 */
export const getActiveEraIndex = async (api: ApiPromise): Promise<number> => {
  const activeEra = await api.query.staking.activeEra();
  const activeEraJson = activeEra.toJSON();
  return activeEraJson['index'];
}

export const isHeadAfterHeartbeatBlockThreshold = async (api: ApiPromise, header: Header): Promise<boolean> => {
return false
//I'm online pallet got removed: https://github.com/paritytech/polkadot-sdk/issues/4359
const currentBlock = header.number.toBn()
const blockThreshold = await api.query.imOnline.heartbeatAfter() //threshold after which an heartbeat is expected
logger.debug(`Current Block: ${currentBlock}\tHeartbeatBlock Threshold: ${blockThreshold}`);
return currentBlock.cmp(blockThreshold) > 0
}

/**
 * Sequential, awaitable counterpart of `Array.prototype.forEach`.
 *
 * Each invocation of `callback` is awaited before the next element is
 * visited, so asynchronous callbacks run strictly one after another.
 *
 * @param array - the elements to visit, in index order
 * @param callback - invoked with (element, index, array) for every element
 * @returns a promise that resolves once every callback has completed
 */
export async function asyncForEach<T>(array: Array<T>, callback: (arg0: T, arg1: number, arg2: Array<T>) => void): Promise<void> {
  for (const [position, element] of array.entries()) {
    await callback(element, position, array);
  }
}

const _hasValidatorAuthoredBlocks = async (validator: Subscribable, sessionIndex: SessionIndex, api: ApiPromise): Promise<boolean> => {
return true
//I'm online pallet got removed: https://github.com/paritytech/polkadot-sdk/issues/4359
const numBlocksAuthored = await api.query.imOnline.authoredBlocks(sessionIndex,validator.address)
return numBlocksAuthored.cmp(ZeroBN) > 0
}

const _hasValidatorSentHeartbeats = async (validatorIndex: number, sessionIndex: SessionIndex, api: ApiPromise): Promise<boolean> => {
return true
//I'm online pallet got removed: https://github.com/paritytech/polkadot-sdk/issues/4359
if (validatorIndex < 0) return false;
const hb = await api.query.imOnline.receivedHeartbeats(sessionIndex,validatorIndex)
return hb.toHuman() ? true : false
}
Loading

0 comments on commit 7b4ed85

Please sign in to comment.