Skip to content

Commit 29b4e6b

Browse files
Alert upon an unknown future upgrade (#4474)
1 parent: 5565231 · commit: 29b4e6b

File tree

1 file changed

+129
-2
lines changed

1 file changed

+129
-2
lines changed

node/node.go

Lines changed: 129 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ import (
1313
"fmt"
1414
"io"
1515
"io/fs"
16+
"math"
1617
"net"
1718
"net/netip"
1819
"os"
@@ -25,6 +26,7 @@ import (
2526
"github.com/prometheus/client_golang/prometheus/collectors"
2627
"github.com/prometheus/client_golang/prometheus/promhttp"
2728
"go.uber.org/zap"
29+
"golang.org/x/exp/maps"
2830

2931
"github.com/ava-labs/avalanchego/api/admin"
3032
"github.com/ava-labs/avalanchego/api/health"
@@ -105,6 +107,7 @@ const (
105107
responsesNamespace = constants.PlatformName + metric.NamespaceSeparator + "responses"
106108
rpcchainvmNamespace = constants.PlatformName + metric.NamespaceSeparator + "rpcchainvm"
107109
systemResourcesNamespace = constants.PlatformName + metric.NamespaceSeparator + "system_resources"
110+
upgradeNamespace = constants.PlatformName + metric.NamespaceSeparator + "upgrade"
108111
)
109112

110113
var (
@@ -113,8 +116,12 @@ var (
113116

114117
indexerDBPrefix = []byte{0x00}
115118

116-
errInvalidTLSKey = errors.New("invalid TLS key")
117-
errShuttingDown = errors.New("server shutting down")
119+
errInvalidTLSKey = errors.New("invalid TLS key")
120+
errShuttingDown = errors.New("server shutting down")
121+
errNoValidators = errors.New("no validators in the current validator set")
122+
errUpgradeNeeded = errors.New("unknown network upgrade detected")
123+
errUpgradeWithinTheDay = errors.New("unknown network upgrade detected - update as soon as possible")
124+
errUpgradeWithinTheHour = errors.New("imminent network upgrade detected - update immediately")
118125
)
119126

120127
// New returns an instance of Node
@@ -1498,6 +1505,126 @@ func (n *Node) initHealthAPI() error {
14981505
return fmt.Errorf("couldn't register bls health check: %w", err)
14991506
}
15001507

1508+
upgradeReg, err := metrics.MakeAndRegister(
1509+
n.MetricsGatherer,
1510+
upgradeNamespace,
1511+
)
1512+
if err != nil {
1513+
return fmt.Errorf("couldn't create upgrade metrics register: %w", err)
1514+
}
1515+
1516+
timeUntilUpgradeMetric := prometheus.NewGauge(prometheus.GaugeOpts{
1517+
Name: "time_until",
1518+
Help: "Time until an upcoming network upgrade (ns). +Inf means the upgrade is unscheduled.",
1519+
})
1520+
infinity := math.Inf(1)
1521+
timeUntilUpgradeMetric.Set(infinity)
1522+
if err := upgradeReg.Register(timeUntilUpgradeMetric); err != nil {
1523+
return fmt.Errorf("couldn't register time until upgrade metric: %w", err)
1524+
}
1525+
1526+
// TODO: This healthcheck calls both n.vdrs.GetMap and n.Net.PeerInfo which
1527+
// are expensive calls. This could be rewritten as an event based monitor to
1528+
// avoid expensive iteration.
1529+
var (
1530+
localUpgradeTime = n.Config.UpgradeConfig.GraniteTime
1531+
localUpgradeTimeUnix = uint64(localUpgradeTime.Unix())
1532+
lastLogTime time.Time
1533+
)
1534+
futureUpgradeCheck := health.CheckerFunc(func(context.Context) (interface{}, error) {
1535+
var (
1536+
currentValidators = n.vdrs.GetMap(constants.PrimaryNetworkID)
1537+
totalWeight uint64
1538+
)
1539+
for _, vdr := range currentValidators {
1540+
totalWeight += vdr.Weight
1541+
}
1542+
if totalWeight == 0 {
1543+
return nil, errNoValidators
1544+
}
1545+
1546+
var (
1547+
peers = n.Net.PeerInfo(maps.Keys(currentValidators))
1548+
upgradeTimes = make(map[uint64]uint64) // upgrade time -> stake weight
1549+
modeUpgradeTimeUnix uint64
1550+
modeUpgradeWeight uint64
1551+
)
1552+
for _, peer := range peers {
1553+
vdr := currentValidators[peer.ID]
1554+
upgradeWeight := upgradeTimes[peer.UpgradeTime]
1555+
upgradeWeight += vdr.Weight
1556+
upgradeTimes[peer.UpgradeTime] = upgradeWeight
1557+
1558+
if upgradeWeight > modeUpgradeWeight {
1559+
modeUpgradeTimeUnix = peer.UpgradeTime
1560+
modeUpgradeWeight = upgradeWeight
1561+
}
1562+
}
1563+
1564+
modeUpgradeWeightPortion := float64(modeUpgradeWeight) / float64(totalWeight)
1565+
result := map[string]interface{}{
1566+
"localUpgradeTime": localUpgradeTime,
1567+
"modeUpgradeTime": time.Unix(int64(modeUpgradeTimeUnix), 0).UTC(),
1568+
"modeUpgradeWeightPercentage": 100 * modeUpgradeWeightPortion,
1569+
"numUpgradeTimes": len(upgradeTimes),
1570+
}
1571+
if localUpgradeTimeUnix >= modeUpgradeTimeUnix || modeUpgradeWeightPortion < .5 {
1572+
timeUntilUpgradeMetric.Set(infinity)
1573+
return result, nil
1574+
}
1575+
1576+
const (
1577+
day = 24 * time.Hour
1578+
week = 7 * day
1579+
)
1580+
modeUpgradeTime := time.Unix(int64(modeUpgradeTimeUnix), 0)
1581+
timeUntilUpgrade := time.Until(modeUpgradeTime)
1582+
timeUntilUpgradeMetric.Set(float64(timeUntilUpgrade))
1583+
result["timeUntilUpgrade"] = timeUntilUpgrade.String()
1584+
1585+
var (
1586+
logFrequency time.Duration
1587+
log func(msg string, fields ...zap.Field)
1588+
err error
1589+
)
1590+
switch {
1591+
case timeUntilUpgrade > week:
1592+
logFrequency = 12 * time.Hour
1593+
log = n.Log.Info
1594+
case timeUntilUpgrade > 3*day:
1595+
logFrequency = 12 * time.Hour
1596+
log = n.Log.Warn
1597+
case timeUntilUpgrade > day:
1598+
logFrequency = time.Hour
1599+
log = n.Log.Warn
1600+
err = errUpgradeNeeded
1601+
case timeUntilUpgrade > time.Hour:
1602+
logFrequency = time.Hour
1603+
log = n.Log.Error
1604+
err = errUpgradeWithinTheDay
1605+
default:
1606+
logFrequency = 0 // log at the rate of the health check
1607+
log = n.Log.Error
1608+
err = errUpgradeWithinTheHour
1609+
}
1610+
1611+
if time.Since(lastLogTime) >= logFrequency {
1612+
log("unknown upgrade detected - this node should be updated to a compatible version",
1613+
zap.String("latestReleaseURL", "https://github.com/ava-labs/avalanchego/releases/latest"),
1614+
zap.Time("upgradeTime", modeUpgradeTime),
1615+
zap.Duration("timeUntilUpgrade", timeUntilUpgrade),
1616+
zap.Error(err),
1617+
)
1618+
lastLogTime = time.Now()
1619+
}
1620+
return result, err
1621+
})
1622+
1623+
err = n.health.RegisterHealthCheck("futureupgrade", futureUpgradeCheck, health.ApplicationTag)
1624+
if err != nil {
1625+
return fmt.Errorf("couldn't register future upgrade health check: %w", err)
1626+
}
1627+
15011628
handler, err := health.NewGetAndPostHandler(n.Log, n.health)
15021629
if err != nil {
15031630
return err

0 commit comments

Comments
 (0)