@@ -13,6 +13,7 @@ import (
1313 "fmt"
1414 "io"
1515 "io/fs"
16+ "math"
1617 "net"
1718 "net/netip"
1819 "os"
@@ -25,6 +26,7 @@ import (
2526 "github.com/prometheus/client_golang/prometheus/collectors"
2627 "github.com/prometheus/client_golang/prometheus/promhttp"
2728 "go.uber.org/zap"
29+ "golang.org/x/exp/maps"
2830
2931 "github.com/ava-labs/avalanchego/api/admin"
3032 "github.com/ava-labs/avalanchego/api/health"
@@ -105,6 +107,7 @@ const (
105107 responsesNamespace = constants .PlatformName + metric .NamespaceSeparator + "responses"
106108 rpcchainvmNamespace = constants .PlatformName + metric .NamespaceSeparator + "rpcchainvm"
107109 systemResourcesNamespace = constants .PlatformName + metric .NamespaceSeparator + "system_resources"
110+ upgradeNamespace = constants .PlatformName + metric .NamespaceSeparator + "upgrade"
108111)
109112
110113var (
@@ -113,8 +116,12 @@ var (
113116
114117 indexerDBPrefix = []byte {0x00 }
115118
116- errInvalidTLSKey = errors .New ("invalid TLS key" )
117- errShuttingDown = errors .New ("server shutting down" )
119+ errInvalidTLSKey = errors .New ("invalid TLS key" )
120+ errShuttingDown = errors .New ("server shutting down" )
121+ errNoValidators = errors .New ("no validators in the current validator set" )
122+ errUpgradeNeeded = errors .New ("unknown network upgrade detected" )
123+ errUpgradeWithinTheDay = errors .New ("unknown network upgrade detected - update as soon as possible" )
124+ errUpgradeWithinTheHour = errors .New ("imminent network upgrade detected - update immediately" )
118125)
119126
120127// New returns an instance of Node
@@ -1498,6 +1505,126 @@ func (n *Node) initHealthAPI() error {
14981505 return fmt .Errorf ("couldn't register bls health check: %w" , err )
14991506 }
15001507
1508+ upgradeReg , err := metrics .MakeAndRegister (
1509+ n .MetricsGatherer ,
1510+ upgradeNamespace ,
1511+ )
1512+ if err != nil {
1513+ return fmt .Errorf ("couldn't create upgrade metrics register: %w" , err )
1514+ }
1515+
1516+ timeUntilUpgradeMetric := prometheus .NewGauge (prometheus.GaugeOpts {
1517+ Name : "time_until" ,
1518+ Help : "Time until an upcoming network upgrade (ns). +Inf means the upgrade is unscheduled." ,
1519+ })
1520+ infinity := math .Inf (1 )
1521+ timeUntilUpgradeMetric .Set (infinity )
1522+ if err := upgradeReg .Register (timeUntilUpgradeMetric ); err != nil {
1523+ return fmt .Errorf ("couldn't register time until upgrade metric: %w" , err )
1524+ }
1525+
1526+ // TODO: This healthcheck calls both n.vdrs.GetMap and n.Net.PeerInfo which
1527+ // are expensive calls. This could be rewritten as an event based monitor to
1528+ // avoid expensive iteration.
1529+ var (
1530+ localUpgradeTime = n .Config .UpgradeConfig .GraniteTime
1531+ localUpgradeTimeUnix = uint64 (localUpgradeTime .Unix ())
1532+ lastLogTime time.Time
1533+ )
1534+ futureUpgradeCheck := health .CheckerFunc (func (context.Context ) (interface {}, error ) {
1535+ var (
1536+ currentValidators = n .vdrs .GetMap (constants .PrimaryNetworkID )
1537+ totalWeight uint64
1538+ )
1539+ for _ , vdr := range currentValidators {
1540+ totalWeight += vdr .Weight
1541+ }
1542+ if totalWeight == 0 {
1543+ return nil , errNoValidators
1544+ }
1545+
1546+ var (
1547+ peers = n .Net .PeerInfo (maps .Keys (currentValidators ))
1548+ upgradeTimes = make (map [uint64 ]uint64 ) // upgrade time -> stake weight
1549+ modeUpgradeTimeUnix uint64
1550+ modeUpgradeWeight uint64
1551+ )
1552+ for _ , peer := range peers {
1553+ vdr := currentValidators [peer .ID ]
1554+ upgradeWeight := upgradeTimes [peer .UpgradeTime ]
1555+ upgradeWeight += vdr .Weight
1556+ upgradeTimes [peer .UpgradeTime ] = upgradeWeight
1557+
1558+ if upgradeWeight > modeUpgradeWeight {
1559+ modeUpgradeTimeUnix = peer .UpgradeTime
1560+ modeUpgradeWeight = upgradeWeight
1561+ }
1562+ }
1563+
1564+ modeUpgradeWeightPortion := float64 (modeUpgradeWeight ) / float64 (totalWeight )
1565+ result := map [string ]interface {}{
1566+ "localUpgradeTime" : localUpgradeTime ,
1567+ "modeUpgradeTime" : time .Unix (int64 (modeUpgradeTimeUnix ), 0 ).UTC (),
1568+ "modeUpgradeWeightPercentage" : 100 * modeUpgradeWeightPortion ,
1569+ "numUpgradeTimes" : len (upgradeTimes ),
1570+ }
1571+ if localUpgradeTimeUnix >= modeUpgradeTimeUnix || modeUpgradeWeightPortion < .5 {
1572+ timeUntilUpgradeMetric .Set (infinity )
1573+ return result , nil
1574+ }
1575+
1576+ const (
1577+ day = 24 * time .Hour
1578+ week = 7 * day
1579+ )
1580+ modeUpgradeTime := time .Unix (int64 (modeUpgradeTimeUnix ), 0 )
1581+ timeUntilUpgrade := time .Until (modeUpgradeTime )
1582+ timeUntilUpgradeMetric .Set (float64 (timeUntilUpgrade ))
1583+ result ["timeUntilUpgrade" ] = timeUntilUpgrade .String ()
1584+
1585+ var (
1586+ logFrequency time.Duration
1587+ log func (msg string , fields ... zap.Field )
1588+ err error
1589+ )
1590+ switch {
1591+ case timeUntilUpgrade > week :
1592+ logFrequency = 12 * time .Hour
1593+ log = n .Log .Info
1594+ case timeUntilUpgrade > 3 * day :
1595+ logFrequency = 12 * time .Hour
1596+ log = n .Log .Warn
1597+ case timeUntilUpgrade > day :
1598+ logFrequency = time .Hour
1599+ log = n .Log .Warn
1600+ err = errUpgradeNeeded
1601+ case timeUntilUpgrade > time .Hour :
1602+ logFrequency = time .Hour
1603+ log = n .Log .Error
1604+ err = errUpgradeWithinTheDay
1605+ default :
1606+ logFrequency = 0 // log at the rate of the health check
1607+ log = n .Log .Error
1608+ err = errUpgradeWithinTheHour
1609+ }
1610+
1611+ if time .Since (lastLogTime ) >= logFrequency {
1612+ log ("unknown upgrade detected - this node should be updated to a compatible version" ,
1613+ zap .String ("latestReleaseURL" , "https://github.com/ava-labs/avalanchego/releases/latest" ),
1614+ zap .Time ("upgradeTime" , modeUpgradeTime ),
1615+ zap .Duration ("timeUntilUpgrade" , timeUntilUpgrade ),
1616+ zap .Error (err ),
1617+ )
1618+ lastLogTime = time .Now ()
1619+ }
1620+ return result , err
1621+ })
1622+
1623+ err = n .health .RegisterHealthCheck ("futureupgrade" , futureUpgradeCheck , health .ApplicationTag )
1624+ if err != nil {
1625+ return fmt .Errorf ("couldn't register future upgrade health check: %w" , err )
1626+ }
1627+
15011628 handler , err := health .NewGetAndPostHandler (n .Log , n .health )
15021629 if err != nil {
15031630 return err
0 commit comments