@@ -50,8 +50,8 @@ func TestEvaluate(t *testing.T) {
50
50
AtLeastPorts : 2 ,
51
51
AtLeastRate : 0 ,
52
52
},
53
- wantReason : "only 0 ports (>= 0 Gb/s) are active, expect at least 2" ,
54
- wantHealth : apiv1 .HealthStateTypeUnhealthy ,
53
+ wantReason : reasonThresholdNotSetSkipped ,
54
+ wantHealth : apiv1 .HealthStateTypeHealthy ,
55
55
},
56
56
{
57
57
name : "only rate threshold set" ,
@@ -60,7 +60,7 @@ func TestEvaluate(t *testing.T) {
60
60
AtLeastPorts : 0 ,
61
61
AtLeastRate : 200 ,
62
62
},
63
- wantReason : reasonNoIbIssueFound ,
63
+ wantReason : reasonThresholdNotSetSkipped ,
64
64
wantHealth : apiv1 .HealthStateTypeHealthy ,
65
65
},
66
66
{
@@ -431,13 +431,21 @@ func TestComponentCheck(t *testing.T) {
431
431
assert .Equal (t , apiv1 .HealthStateTypeHealthy , data .health )
432
432
assert .Equal (t , "NVIDIA NVML instance is nil" , data .reason )
433
433
434
- // Case 2: With NVML
435
- nvmlMock := & mockNVMLInstance {exists : true }
434
+ // Case 2: With NVML but missing product name
435
+ nvmlMock := & mockNVMLInstance {exists : true , productName : "" }
436
436
c .nvmlInstance = nvmlMock
437
437
result = c .Check ()
438
438
data , ok = result .(* checkResult )
439
439
require .True (t , ok )
440
440
assert .Equal (t , apiv1 .HealthStateTypeHealthy , data .health )
441
+ assert .Equal (t , "NVIDIA NVML is loaded but GPU is not detected (missing product name)" , data .reason )
442
+
443
+ // Case 3: With NVML and valid product name
444
+ nvmlMock .productName = "Tesla V100"
445
+ result = c .Check ()
446
+ data , ok = result .(* checkResult )
447
+ require .True (t , ok )
448
+ assert .Equal (t , apiv1 .HealthStateTypeHealthy , data .health )
441
449
assert .NotNil (t , data .IbstatOutput )
442
450
}
443
451
@@ -669,7 +677,8 @@ func (m *MockEventBucket) GetEvents() apiv1.Events {
669
677
670
678
// Test helpers for mocking NVML and IBStat
671
679
// mockNVMLInstance is a test double for the NVML instance consumed by the
// component under test.
//
// productName is the value handed back by ProductName(); tests leave it empty
// to exercise the "missing product name" path and set it (e.g. "Tesla V100")
// to get past that check.
// NOTE(review): exists presumably backs NVMLExists(), whose body is not
// visible in this hunk; confirm against the full file.
type mockNVMLInstance struct {
672
- exists bool
680
+ exists bool
681
+ productName string
673
682
}
674
683
675
684
func (m * mockNVMLInstance ) NVMLExists () bool {
@@ -686,7 +695,10 @@ func (m *mockNVMLInstance) Library() nvmllib.Library {
686
695
}
687
696
688
697
func (m * mockNVMLInstance ) ProductName () string {
689
- return "test"
698
+ if m .productName == "" {
699
+ return "" // Empty string for testing
700
+ }
701
+ return m .productName // Return custom value for testing
690
702
}
691
703
692
704
func (m * mockNVMLInstance ) Architecture () string {
@@ -886,7 +898,7 @@ func TestComponentCheckErrorCases(t *testing.T) {
886
898
return nil , errors .New ("ibstat error" )
887
899
},
888
900
getThresholdsFunc : mockGetThresholds ,
889
- nvmlInstance : & mockNVMLInstance {exists : true },
901
+ nvmlInstance : & mockNVMLInstance {exists : true , productName : "Tesla V100" },
890
902
}
891
903
892
904
result := c .Check ()
@@ -903,7 +915,7 @@ func TestComponentCheckErrorCases(t *testing.T) {
903
915
return nil , nil
904
916
},
905
917
getThresholdsFunc : mockGetThresholds ,
906
- nvmlInstance : & mockNVMLInstance {exists : true },
918
+ nvmlInstance : & mockNVMLInstance {exists : true , productName : "Tesla V100" },
907
919
}
908
920
909
921
result = c .Check ()
@@ -920,7 +932,7 @@ func TestComponentCheckErrorCases(t *testing.T) {
920
932
return nil , infiniband .ErrNoIbstatCommand
921
933
},
922
934
getThresholdsFunc : mockGetThresholds ,
923
- nvmlInstance : & mockNVMLInstance {exists : true },
935
+ nvmlInstance : & mockNVMLInstance {exists : true , productName : "Tesla V100" },
924
936
}
925
937
926
938
result = c .Check ()
@@ -943,7 +955,7 @@ func TestComponentCheckEventBucketOperations(t *testing.T) {
943
955
ctx : cctx ,
944
956
cancel : ccancel ,
945
957
eventBucket : mockBucket ,
946
- nvmlInstance : & mockNVMLInstance {exists : true },
958
+ nvmlInstance : & mockNVMLInstance {exists : true , productName : "Tesla V100" },
947
959
getIbstatOutputFunc : mockGetIbstatOutput ,
948
960
getThresholdsFunc : func () infiniband.ExpectedPortStates {
949
961
// Return thresholds that will trigger an unhealthy state
@@ -961,7 +973,7 @@ func TestComponentCheckEventBucketOperations(t *testing.T) {
961
973
962
974
// Verify that an event was inserted
963
975
events := mockBucket .GetEvents ()
964
- assert .NotEmpty (t , events )
976
+ require .NotEmpty (t , events )
965
977
assert .Equal (t , "ibstat" , events [0 ].Name )
966
978
assert .Equal (t , apiv1 .EventTypeWarning , events [0 ].Type )
967
979
}
@@ -1035,7 +1047,7 @@ func TestCheckWithEventErrors(t *testing.T) {
1035
1047
ctx : cctx ,
1036
1048
cancel : ccancel ,
1037
1049
eventBucket : errorBucket ,
1038
- nvmlInstance : & mockNVMLInstance {exists : true },
1050
+ nvmlInstance : & mockNVMLInstance {exists : true , productName : "Tesla V100" },
1039
1051
getIbstatOutputFunc : mockGetIbstatOutput ,
1040
1052
getThresholdsFunc : func () infiniband.ExpectedPortStates {
1041
1053
return infiniband.ExpectedPortStates {
@@ -1115,7 +1127,7 @@ func TestCheckWithExistingEvent(t *testing.T) {
1115
1127
ctx : cctx ,
1116
1128
cancel : ccancel ,
1117
1129
eventBucket : mockBucket ,
1118
- nvmlInstance : & mockNVMLInstance {exists : true },
1130
+ nvmlInstance : & mockNVMLInstance {exists : true , productName : "Tesla V100" },
1119
1131
getIbstatOutputFunc : mockGetIbstatOutput ,
1120
1132
getThresholdsFunc : func () infiniband.ExpectedPortStates {
1121
1133
return infiniband.ExpectedPortStates {
@@ -1185,7 +1197,7 @@ func TestCheckNilIbstatFunc(t *testing.T) {
1185
1197
c := & component {
1186
1198
ctx : cctx ,
1187
1199
cancel : ccancel ,
1188
- nvmlInstance : & mockNVMLInstance {exists : true },
1200
+ nvmlInstance : & mockNVMLInstance {exists : true , productName : "Tesla V100" },
1189
1201
getIbstatOutputFunc : nil , // Set to nil explicitly
1190
1202
getThresholdsFunc : mockGetThresholds ,
1191
1203
}
@@ -1196,3 +1208,159 @@ func TestCheckNilIbstatFunc(t *testing.T) {
1196
1208
assert .Equal (t , apiv1 .HealthStateTypeHealthy , data .health )
1197
1209
assert .Equal (t , "ibstat checker not found" , data .reason )
1198
1210
}
1211
+
1212
+ // TestComponentCheckOrder tests that the checks in the Check() method are evaluated in the correct order
1213
+ func TestComponentCheckOrder (t * testing.T ) {
1214
+ t .Parallel ()
1215
+
1216
+ // Create a context for tests
1217
+ cctx , ccancel := context .WithCancel (context .Background ())
1218
+ defer ccancel ()
1219
+
1220
+ var checksCalled []string
1221
+ trackCheck := func (name string ) {
1222
+ checksCalled = append (checksCalled , name )
1223
+ }
1224
+
1225
+ // 1. Test threshold check first
1226
+ // Create a component with threshold check that returns IsZero() true
1227
+ c := & component {
1228
+ ctx : cctx ,
1229
+ cancel : ccancel ,
1230
+ getThresholdsFunc : func () infiniband.ExpectedPortStates {
1231
+ trackCheck ("thresholds" )
1232
+ return infiniband.ExpectedPortStates {} // zero thresholds
1233
+ },
1234
+ }
1235
+
1236
+ result := c .Check ()
1237
+ data , ok := result .(* checkResult )
1238
+ require .True (t , ok )
1239
+ assert .Equal (t , apiv1 .HealthStateTypeHealthy , data .health )
1240
+ assert .Equal (t , reasonThresholdNotSetSkipped , data .reason )
1241
+ assert .Equal (t , []string {"thresholds" }, checksCalled )
1242
+
1243
+ // 2. Test NVML instance nil check
1244
+ checksCalled = nil // reset
1245
+ c = & component {
1246
+ ctx : cctx ,
1247
+ cancel : ccancel ,
1248
+ getThresholdsFunc : func () infiniband.ExpectedPortStates {
1249
+ trackCheck ("thresholds" )
1250
+ return infiniband.ExpectedPortStates {AtLeastPorts : 1 , AtLeastRate : 100 } // non-zero thresholds
1251
+ },
1252
+ nvmlInstance : nil , // nil NVML
1253
+ }
1254
+
1255
+ result = c .Check ()
1256
+ data , ok = result .(* checkResult )
1257
+ require .NotNil (t , data )
1258
+ require .True (t , ok )
1259
+ assert .Equal (t , apiv1 .HealthStateTypeHealthy , data .health )
1260
+ assert .Equal (t , "NVIDIA NVML instance is nil" , data .reason )
1261
+ assert .Equal (t , []string {"thresholds" }, checksCalled ) // Only threshold check should be called
1262
+
1263
+ // 3. Test NVML exists check
1264
+ checksCalled = nil // reset
1265
+ c = & component {
1266
+ ctx : cctx ,
1267
+ cancel : ccancel ,
1268
+ getThresholdsFunc : func () infiniband.ExpectedPortStates {
1269
+ trackCheck ("thresholds" )
1270
+ return infiniband.ExpectedPortStates {AtLeastPorts : 1 , AtLeastRate : 100 }
1271
+ },
1272
+ nvmlInstance : & mockNVMLInstance {
1273
+ exists : false , // NVML does not exist
1274
+ },
1275
+ }
1276
+
1277
+ result = c .Check ()
1278
+ data , ok = result .(* checkResult )
1279
+ require .True (t , ok )
1280
+ assert .Equal (t , apiv1 .HealthStateTypeHealthy , data .health )
1281
+ assert .Equal (t , "NVIDIA NVML library is not loaded" , data .reason )
1282
+ assert .Equal (t , []string {"thresholds" }, checksCalled )
1283
+
1284
+ // 4. Test ProductName check
1285
+ checksCalled = nil // reset
1286
+ c = & component {
1287
+ ctx : cctx ,
1288
+ cancel : ccancel ,
1289
+ getThresholdsFunc : func () infiniband.ExpectedPortStates {
1290
+ trackCheck ("thresholds" )
1291
+ return infiniband.ExpectedPortStates {AtLeastPorts : 1 , AtLeastRate : 100 }
1292
+ },
1293
+ nvmlInstance : & mockNVMLInstance {
1294
+ exists : true ,
1295
+ productName : "" , // Empty product name
1296
+ },
1297
+ }
1298
+
1299
+ result = c .Check ()
1300
+ data , ok = result .(* checkResult )
1301
+ require .True (t , ok )
1302
+ assert .Equal (t , apiv1 .HealthStateTypeHealthy , data .health )
1303
+ assert .Equal (t , "NVIDIA NVML is loaded but GPU is not detected (missing product name)" , data .reason )
1304
+ assert .Equal (t , []string {"thresholds" }, checksCalled )
1305
+
1306
+ // 5. Test ibstat function check
1307
+ checksCalled = nil // reset
1308
+ c = & component {
1309
+ ctx : cctx ,
1310
+ cancel : ccancel ,
1311
+ getThresholdsFunc : func () infiniband.ExpectedPortStates {
1312
+ trackCheck ("thresholds" )
1313
+ return infiniband.ExpectedPortStates {AtLeastPorts : 1 , AtLeastRate : 100 }
1314
+ },
1315
+ nvmlInstance : & mockNVMLInstance {
1316
+ exists : true ,
1317
+ productName : "Tesla V100" , // Valid product name
1318
+ },
1319
+ getIbstatOutputFunc : nil , // No ibstat function
1320
+ }
1321
+
1322
+ result = c .Check ()
1323
+ data , ok = result .(* checkResult )
1324
+ require .NotNil (t , data )
1325
+ require .True (t , ok )
1326
+ assert .Equal (t , apiv1 .HealthStateTypeHealthy , data .health )
1327
+ assert .Equal (t , "ibstat checker not found" , data .reason )
1328
+ assert .Equal (t , []string {"thresholds" }, checksCalled )
1329
+
1330
+ // 6. Test the full sequence passing all early checks
1331
+ checksCalled = nil // reset
1332
+ c = & component {
1333
+ ctx : cctx ,
1334
+ cancel : ccancel ,
1335
+ getThresholdsFunc : func () infiniband.ExpectedPortStates {
1336
+ trackCheck ("thresholds" )
1337
+ return infiniband.ExpectedPortStates {AtLeastPorts : 1 , AtLeastRate : 100 }
1338
+ },
1339
+ nvmlInstance : & mockNVMLInstance {
1340
+ exists : true ,
1341
+ productName : "Tesla V100" ,
1342
+ },
1343
+ getIbstatOutputFunc : func (ctx context.Context , ibstatCommands []string ) (* infiniband.IbstatOutput , error ) {
1344
+ trackCheck ("ibstat" )
1345
+ return & infiniband.IbstatOutput {
1346
+ Raw : "mock output" ,
1347
+ Parsed : infiniband.IBStatCards {
1348
+ {
1349
+ Name : "mlx5_0" ,
1350
+ Port1 : infiniband.IBStatPort {
1351
+ State : "Active" ,
1352
+ PhysicalState : "LinkUp" ,
1353
+ Rate : 200 ,
1354
+ },
1355
+ },
1356
+ },
1357
+ }, nil
1358
+ },
1359
+ }
1360
+
1361
+ result = c .Check ()
1362
+ data , ok = result .(* checkResult )
1363
+ require .NotNil (t , data )
1364
+ require .True (t , ok )
1365
+ assert .Equal (t , []string {"thresholds" , "ibstat" }, checksCalled ) // Both checks should be called
1366
+ }
0 commit comments