Skip to content

Commit d76f5ff

Browse files
Add retry logic when subscription fails
1 parent 034e2ad commit d76f5ff

File tree

6 files changed

+177
-68
lines changed

6 files changed

+177
-68
lines changed

redfish-exporter/.env

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,20 @@
1-
UPDATED="2025-01-22"
1+
UPDATED="2024-09-24"
22
DESCRIPTION="Redfish Event Listener/Exporter"
3-
LISTENER_IP="<Listener_IP>"
4-
LISTENER_PORT="<PORT>"
5-
METRICS_PORT="<MERTRICS_PORT>"
3+
LISTENER_IP="10.11.18.55"
4+
LISTENER_PORT="9003"
5+
METRICS_PORT="2112"
66
USE_SSL="false"
77
CERTFILE="path/to/certfile"
88
KEYFILE="path/to/keyfile"
9-
SLURM_CONTROL_NODE="<SLURM_CONTROL_NODE_IP>"
10-
# List of '|' separated reasons for avoiding drain action if there is a match
11-
SLURM_DRAIN_EXCLUDE_REASON_LIST="reason 1|reason 2"
9+
SLURM_CONTROL_NODE="10.235.34.47"
10+
SLURM_DRAIN_EXCLUDE_REASON_LIST="AMD|Pensando|RebootNeeded"
1211
SLURM_SCONTROL_PATH="/usr/bin/scontrol"
12+
TLS_TIMEOUT="15"
1313

14-
# Match RAS events received based on severity and a '|' separated list of message fields, and perform the drain action with DrainReasonPrefix set as the prefix in the reason
15-
# Message can be left empty if it doesn't need to be matched against; in that case only severity is matched
16-
# only DrainNode action is supported for now
1714
TRIGGER_EVENTS="[\
18-
{\"Severity\":\"Critical\",\"Message\":\"message 1|This is a critical test event\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNeeded\"},\
19-
{\"Severity\":\"Info\",\"Message\":\"message 3\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"},\
20-
{\"Severity\":\"Warning\",\"Message\":\"message 4|This is a test event message\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"}
15+
{\"Severity\":\"Critical\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'|This is an e2e critical test event\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNeeded\"},\
16+
{\"Severity\":\"Info\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"},\
17+
{\"Severity\":\"Warning\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'|This is an e2e test event message\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"}
2118
]"
2219

2320
# Subscription (v1.5+)
@@ -33,11 +30,11 @@ TRIGGER_EVENTS="[\
3330

3431
# Deprecated <v1.5
3532
SUBSCRIPTION_PAYLOAD="{\
36-
\"Destination\":\"http://<Listener_IP:Port>\",\
33+
\"Destination\":\"http://10.11.18.55:9003\",\
3734
\"EventTypes\":[\"Alert\"],\
3835
\"Protocol\":\"Redfish\",\
3936
\"Context\":\"YourContextData\",\
40-
\"Oem\":{\"Supermicro\":{\"EnableSubscription\":true}}\
37+
\"Oem\":{\"Supermicro\": {\"EnableSubscription\": true}}\
4138
}"
4239

4340
# Config for setting default labels in Prometheus counter metrics.
@@ -46,5 +43,10 @@ PROMETHEUS_CONFIG="{\
4643
}"
4744

4845
REDFISH_SERVERS="[\
49-
{\"ip\":\"https://<BMC_IP>\",\"username\":\"<username>\",\"password\":\"<password>\",\"loginType\":\"Session\",\"slurmNode\":\"<nodename\"}
50-
]"
46+
{\"ip\":\"https://10.235.37.54\",\"username\":\"ADMIN\",\"password\":\"PHHCJZUHDV\",\"loginType\":\"Session\",\"slurmNode\":\"smc300x-ccs-aus-GPUFCE9\"},
47+
{\"ip\":\"https://10.235.37.48\",\"username\":\"ADMIN\",\"password\":\"PHHCJZUHDV\",\"loginType\":\"Session\",\"slurmNode\":\"smc300x-ccs-aus-GPUFCE9\"}
48+
49+
]"
50+
51+
REDFISH_SERVERS_COMMON_CONFIG="{\
52+
\"hostSuffix\":\"ipmi.cluster\",\"username\":\"<username>\",\"password\":\"<password>\"}"

redfish-exporter/config.go

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,15 @@ package main
1919
import (
2020
"crypto/tls"
2121
"encoding/json"
22+
"fmt"
2223
"log"
24+
"net"
2325
"os"
2426
"strconv"
2527
"strings"
2628

2729
"github.com/joho/godotenv"
30+
"gopkg.in/yaml.v3"
2831
)
2932

3033
const (
@@ -61,6 +64,7 @@ type Config struct {
6164
context *tls.Config
6265
eventCount int
6366
dataBuffer []byte
67+
TlsTimeOut string
6468
}
6569

6670
type TriggerEvent struct {
@@ -74,7 +78,12 @@ type PrometheusConfig struct {
7478
Severity []string `json:"Severity"`
7579
}
7680

77-
func setupConfig() Config {
81+
type target struct {
82+
Targets []string `yaml:"targets"`
83+
Labels map[string]string `yaml:"labels"`
84+
}
85+
86+
func setupConfig(targetFile string) Config {
7887
// Load .env file
7988
err := godotenv.Load()
8089
if err != nil {
@@ -125,6 +134,7 @@ func setupConfig() Config {
125134
AppConfig.SlurmUser = os.Getenv("SLURM_USER")
126135
AppConfig.SlurmDrainExcludeStr = os.Getenv("SLURM_DRAIN_EXCLUDE_REASON_LIST")
127136
AppConfig.SlurmScontrolPath = os.Getenv("SLURM_SCONTROL_PATH")
137+
AppConfig.TlsTimeOut = os.Getenv("TLS_TIMEOUT")
128138

129139
subscriptionPayloadJSON := os.Getenv("SUBSCRIPTION_PAYLOAD")
130140
if err := json.Unmarshal([]byte(subscriptionPayloadJSON), &AppConfig.SubscriptionPayload); err != nil {
@@ -154,10 +164,63 @@ func setupConfig() Config {
154164
redfishServersJSON := os.Getenv("REDFISH_SERVERS")
155165
if redfishServersJSON == "" {
156166
log.Println("REDFISH_SERVERS environment variable is not set or is empty")
167+
} else {
168+
if err := json.Unmarshal([]byte(redfishServersJSON), &AppConfig.RedfishServers); err != nil {
169+
log.Fatalf("Failed to parse REDFISH_SERVERS: %v", err)
170+
}
171+
}
172+
173+
// Read and parse the REDFISH_SERVERS_COMMON_CONFIG environment variable
174+
redfishServersCommonConfigJSON := os.Getenv("REDFISH_SERVERS_COMMON_CONFIG")
175+
if redfishServersCommonConfigJSON == "" {
176+
log.Println("redfishServersCommonConfigJSON environment variable is not set or is empty")
157177
return AppConfig
158178
}
159-
if err := json.Unmarshal([]byte(redfishServersJSON), &AppConfig.RedfishServers); err != nil {
160-
log.Fatalf("Failed to parse REDFISH_SERVERS: %v", err)
179+
redfishServersCommonConfig := RedfishServersCommongConfig{}
180+
if err := json.Unmarshal([]byte(redfishServersCommonConfigJSON), &redfishServersCommonConfig); err != nil {
181+
log.Fatalf("Failed to parse REDFISH_SERVERS_COMMON_CONFIG: %v", err)
182+
}
183+
184+
if targetFile == "" {
185+
log.Println("No target file provided")
186+
return AppConfig
187+
}
188+
189+
targetYamlFile, err := os.ReadFile(targetFile)
190+
191+
if err != nil {
192+
log.Fatalf("Failed to read file: %v", targetFile)
193+
}
194+
195+
targets := []target{}
196+
197+
err = yaml.Unmarshal(targetYamlFile, &targets)
198+
199+
if err != nil {
200+
log.Fatalf("Error parsing target file: %v | err: %v", targetFile, err)
201+
}
202+
203+
for _, t := range targets {
204+
log.Println("target: ", t.Targets)
205+
206+
for _, hostName := range t.Targets {
207+
// add this target to Redfish servers
208+
server := RedfishServer{}
209+
bmcHost := fmt.Sprintf(hostName+".%v", redfishServersCommonConfig.HostSuffix)
210+
ips, err := net.LookupIP(bmcHost)
211+
if err != nil || len(ips) == 0 {
212+
log.Printf("[error] Couldn't get the IP for host: %v | ips: %v | err: %v", bmcHost, ips, err)
213+
continue
214+
}
215+
log.Println("IPs: ", ips)
216+
217+
server.IP = fmt.Sprintf("https://%v", ips[0])
218+
server.LoginType = "Session"
219+
server.Username = redfishServersCommonConfig.UserName
220+
server.Password = redfishServersCommonConfig.Password
221+
server.SlurmNode = hostName
222+
AppConfig.RedfishServers = append(AppConfig.RedfishServers, server)
223+
}
161224
}
162225

163226
return AppConfig

redfish-exporter/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ require (
99
github.com/nod-ai/ADA/redfish-exporter v0.0.0-20241002210630-2ef2d1070d90
1010
github.com/prometheus/client_golang v1.20.4
1111
github.com/stmcginnis/gofish v0.19.0
12+
gopkg.in/yaml.v3 v3.0.1
1213
)
1314

1415
require (

redfish-exporter/go.sum

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,6 @@ golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
2626
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
2727
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
2828
google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
29+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
30+
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
31+
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

redfish-exporter/main.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"os/signal"
2727
"strconv"
2828
"strings"
29+
"sync"
2930
"syscall"
3031
"time"
3132

@@ -36,15 +37,24 @@ import (
3637

3738
func main() {
3839
var (
39-
enableSlurm = flag.Bool("enable-slurm", false, "Enable slurm")
40+
targetFile string
41+
enableSlurm = flag.Bool("enable-slurm", false, "Enable slurm")
42+
subscriptionMapLock sync.Mutex // to guard access to the map
4043
)
44+
45+
flag.StringVar(&targetFile, "target", "", "Path to the target file for host/slurm node names")
4146
flag.Parse()
4247

4348
log.SetFlags(log.LstdFlags | log.Lshortfile)
4449
log.Println("Starting Redfish Event Listener/Exporter")
4550

51+
/*
52+
if targetFile == "" {
53+
log.Fatalf("Target file for host/slurm node names not set: Usage: ./amd-redfish-exporter --enable-slurm -target <filename>")
54+
}
55+
*/
4656
// Setup configuration
47-
AppConfig := setupConfig()
57+
AppConfig := setupConfig(targetFile)
4858

4959
// Log the initialized config
5060
log.Printf("Initialized Config: %+v", AppConfig)
@@ -57,8 +67,10 @@ func main() {
5767
go slurmQueue.ProcessEventActionQueue()
5868
}
5969

70+
subscriptionMap := make(map[string]string)
71+
6072
// Subscribe the listener to the event stream for all servers
61-
subscriptionMap, err := CreateSubscriptionsForAllServers(AppConfig.RedfishServers, AppConfig.SubscriptionPayload)
73+
err := CreateSubscriptionsForAllServers(AppConfig.RedfishServers, AppConfig.SubscriptionPayload, subscriptionMap, &subscriptionMapLock, AppConfig.TlsTimeOut)
6274
if err != nil {
6375
log.Fatal(err)
6476
}
@@ -99,7 +111,9 @@ func main() {
99111
time.Sleep(time.Second)
100112

101113
// Unsubscribe the listener from all servers
114+
subscriptionMapLock.Lock()
102115
DeleteSubscriptionsFromAllServers(AppConfig.RedfishServers, subscriptionMap)
116+
subscriptionMapLock.Unlock()
103117

104118
cancel()
105119

0 commit comments

Comments
 (0)