@@ -19,7 +19,6 @@ package oci
19
19
import (
20
20
"context"
21
21
"fmt"
22
- "net/http"
23
22
"sort"
24
23
"time"
25
24
@@ -29,6 +28,7 @@ import (
29
28
30
29
"github.com/NVIDIA/topograph/pkg/metrics"
31
30
"github.com/NVIDIA/topograph/pkg/topology"
31
+ "github.com/NVIDIA/topograph/pkg/translate"
32
32
)
33
33
34
34
type level int
@@ -39,109 +39,111 @@ const (
39
39
hpcIslandLevel
40
40
)
41
41
42
- func GenerateInstanceTopology (ctx context.Context , factory ClientFactory , cis []topology.ComputeInstances ) ([] * core.ComputeBareMetalHostSummary , error ) {
43
- var err error
44
- bareMetalHostSummaries := [] * core. ComputeBareMetalHostSummary {}
42
+ func GenerateInstanceTopology (ctx context.Context , factory ClientFactory , pageSize * int , cis []topology.ComputeInstances ) (hosts [] core.ComputeHostSummary , blockMap map [ string ] string , err error ) {
43
+ blockMap = make ( map [ string ] string )
44
+
45
45
for _ , ci := range cis {
46
- if bareMetalHostSummaries , err = generateInstanceTopology (ctx , factory , & ci , bareMetalHostSummaries ); err != nil {
47
- return nil , err
46
+ var client Client
47
+ if client , err = factory (ci .Region , pageSize ); err != nil {
48
+ return
49
+ }
50
+ if hosts , err = getComputeHostInfo (ctx , client , hosts , blockMap ); err != nil {
51
+ return
48
52
}
49
53
}
50
54
51
- return bareMetalHostSummaries , nil
55
+ return
52
56
}
53
57
54
- func getComputeCapacityTopologies (ctx context.Context , client Client ) (cct []core.ComputeCapacityTopologySummary , err error ) {
55
- compartmentId := client .TenancyOCID ()
56
-
57
- adRequest := identity.ListAvailabilityDomainsRequest {
58
- CompartmentId : & compartmentId ,
59
- }
58
+ func getComputeHostSummary (ctx context.Context , client Client , availabilityDomain * string ) ([]core.ComputeHostSummary , error ) {
59
+ var hosts []core.ComputeHostSummary
60
60
61
- timeStart := time . Now ()
62
- ads , err := client .ListAvailabilityDomains ( ctx , adRequest )
63
- if err != nil {
64
- return cct , fmt . Errorf ( "unable to get AD: %v" , err )
61
+ req := core. ListComputeHostsRequest {
62
+ CompartmentId : client .TenancyOCID (),
63
+ AvailabilityDomain : availabilityDomain ,
64
+ Limit : client . Limit (),
65
65
}
66
- requestLatency .WithLabelValues ("ListAvailabilityDomains" , ads .HTTPResponse ().Status ).Observe (time .Since (timeStart ).Seconds ())
67
66
68
- for _ , ad := range ads .Items {
69
- cctRequest := core.ListComputeCapacityTopologiesRequest {
70
- CompartmentId : & compartmentId ,
71
- AvailabilityDomain : ad .Name ,
67
+ for {
68
+ timeStart := time .Now ()
69
+ resp , err := client .ListComputeHosts (ctx , req )
70
+ requestLatency .WithLabelValues ("ListComputeHosts" , resp .HTTPResponse ().Status ).Observe (time .Since (timeStart ).Seconds ())
71
+ if err != nil {
72
+ return nil , err
72
73
}
73
74
74
- for {
75
- timeStart := time .Now ()
76
- resp , err := client .ListComputeCapacityTopologies (ctx , cctRequest )
77
- requestLatency .WithLabelValues ("ListComputeCapacityTopologies" , resp .HTTPResponse ().Status ).Observe (time .Since (timeStart ).Seconds ())
78
- if err != nil {
79
- if resp .HTTPResponse ().StatusCode == http .StatusNotFound {
80
- return cct , fmt .Errorf ("%v for getting ComputeCapacityTopology in %s: %v" , resp .HTTPResponse ().StatusCode , * ad .Name , err )
81
- } else {
82
- return cct , fmt .Errorf ("unable to get ComputeCapacity Topologies in %s : %v" , * ad .Name , err )
83
- }
84
- }
85
- cct = append (cct , resp .Items ... )
86
- klog .V (4 ).Infof ("Received computeCapacityTopology %d groups; processed %d" , len (resp .Items ), len (cct ))
87
- if resp .OpcNextPage != nil {
88
- cctRequest .Page = resp .OpcNextPage
89
- } else {
90
- break
91
- }
75
+ hosts = append (hosts , resp .Items ... )
76
+
77
+ if resp .OpcNextPage != nil {
78
+ req .Page = resp .OpcNextPage
79
+ } else {
80
+ break
92
81
}
93
82
}
94
83
95
- return cct , nil
84
+ return hosts , nil
96
85
}
97
86
98
- func getBMHSummaryPerComputeCapacityTopology (ctx context.Context , client Client , topologyID string ) (bmhSummary []core.ComputeBareMetalHostSummary , err error ) {
99
- compartmentId := client .TenancyOCID ()
100
- request := core.ListComputeCapacityTopologyComputeBareMetalHostsRequest {
101
- ComputeCapacityTopologyId : & topologyID ,
102
- CompartmentId : & compartmentId ,
87
+ // getLocalBlockMap returns a map between LocalBlocks and ComputeGpuMemoryFabrics
88
+ func getLocalBlockMap (ctx context.Context , client Client , availabilityDomain * string , blockMap map [string ]string ) error {
89
+ req := core.ListComputeGpuMemoryFabricsRequest {
90
+ CompartmentId : client .TenancyOCID (),
91
+ AvailabilityDomain : availabilityDomain ,
92
+ Limit : client .Limit (),
103
93
}
94
+
104
95
for {
105
96
timeStart := time .Now ()
106
- response , err := client .ListComputeCapacityTopologyComputeBareMetalHosts (ctx , request )
107
- requestLatency .WithLabelValues ("ListComputeCapacityTopologyComputeBareMetalHosts " , response .HTTPResponse ().Status ).Observe (time .Since (timeStart ).Seconds ())
97
+ resp , err := client .ListComputeGpuMemoryFabrics (ctx , req )
98
+ requestLatency .WithLabelValues ("ListComputeGpuMemoryFabrics " , resp .HTTPResponse ().Status ).Observe (time .Since (timeStart ).Seconds ())
108
99
if err != nil {
109
- klog .Errorln (err .Error ())
110
- break
100
+ return err
111
101
}
112
102
113
- bmhSummary = append (bmhSummary , response .Items ... )
103
+ for _ , fabrics := range resp .Items {
104
+ blockMap [* fabrics .ComputeLocalBlockId ] = * fabrics .Id
105
+ }
114
106
115
- if response .OpcNextPage != nil {
116
- request .Page = response .OpcNextPage
107
+ if resp .OpcNextPage != nil {
108
+ req .Page = resp .OpcNextPage
117
109
} else {
118
110
break
119
111
}
120
112
}
121
- return bmhSummary , nil
113
+
114
+ return nil
122
115
}
123
116
124
- func getBareMetalHostSummaries (ctx context.Context , client Client ) ([]core.ComputeBareMetalHostSummary , error ) {
125
- computeCapacityTopology , err := getComputeCapacityTopologies (ctx , client )
117
+ func getComputeHostInfo (ctx context.Context , client Client , hosts []core.ComputeHostSummary , blockMap map [string ]string ) ([]core.ComputeHostSummary , error ) {
118
+ req := identity.ListAvailabilityDomainsRequest {
119
+ CompartmentId : client .TenancyOCID (),
120
+ }
121
+
122
+ timeStart := time .Now ()
123
+ resp , err := client .ListAvailabilityDomains (ctx , req )
126
124
if err != nil {
127
- return nil , fmt .Errorf ("unable to get compute capacity topologies : %s " , err . Error () )
125
+ return nil , fmt .Errorf ("unable to get availability domains : %v " , err )
128
126
}
129
- klog . V ( 4 ). Infof ( "Received computeCapacityTopology for %d groups " , len ( computeCapacityTopology ))
127
+ requestLatency . WithLabelValues ( "ListAvailabilityDomains " , resp . HTTPResponse (). Status ). Observe ( time . Since ( timeStart ). Seconds ( ))
130
128
131
- var bareMetalHostSummaries []core.ComputeBareMetalHostSummary
132
- for _ , cct := range computeCapacityTopology {
133
- bareMetalHostSummary , err := getBMHSummaryPerComputeCapacityTopology (ctx , client , * cct .Id )
129
+ for _ , ad := range resp .Items {
130
+ summary , err := getComputeHostSummary (ctx , client , ad .Name )
134
131
if err != nil {
135
- return nil , fmt .Errorf ("unable to get bare metal hosts info: %s" , err .Error ())
132
+ return nil , fmt .Errorf ("unable to get hosts info: %v" , err )
133
+ }
134
+ hosts = append (hosts , summary ... )
135
+
136
+ if err = getLocalBlockMap (ctx , client , ad .Name , blockMap ); err != nil {
137
+ return nil , fmt .Errorf ("unable to get local block map: %v" , err )
136
138
}
137
- bareMetalHostSummaries = append (bareMetalHostSummaries , bareMetalHostSummary ... )
138
139
}
139
- klog .V (4 ).Infof ("Returning bareMetalHostSummaries for %d nodes" , len (bareMetalHostSummaries ))
140
140
141
- return bareMetalHostSummaries , nil
141
+ klog .V (4 ).Infof ("Returning host info for %d nodes and %d blocks" , len (hosts ), len (blockMap ))
142
+
143
+ return hosts , nil
142
144
}
143
145
144
- func toGraph (bareMetalHostSummaries []* core.ComputeBareMetalHostSummary , cis []topology.ComputeInstances ) (* topology.Vertex , error ) {
146
+ func toGraph (hosts []core.ComputeHostSummary , blockMap map [ string ] string , cis []topology.ComputeInstances ) (* topology.Vertex , error ) {
145
147
instanceToNodeMap := make (map [string ]string )
146
148
for _ , ci := range cis {
147
149
for instance , node := range ci .Instances {
@@ -152,18 +154,25 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []t
152
154
153
155
nodes := make (map [string ]* topology.Vertex )
154
156
forest := make (map [string ]* topology.Vertex )
157
+ domainMap := translate .NewDomainMap ()
158
+
155
159
levelWiseSwitchCount := map [level ]int {localBlockLevel : 0 , networkBlockLevel : 0 , hpcIslandLevel : 0 }
156
- bareMetalHostSummaries = filterAndSort (bareMetalHostSummaries , instanceToNodeMap )
157
- for _ , bmhSummary := range bareMetalHostSummaries {
158
- nodeName := instanceToNodeMap [* bmhSummary . InstanceId ]
159
- delete (instanceToNodeMap , * bmhSummary . InstanceId )
160
+ hosts = filterAndSort (hosts , instanceToNodeMap )
161
+ for _ , host := range hosts {
162
+ nodeName := instanceToNodeMap [* host . Id ]
163
+ delete (instanceToNodeMap , * host . Id )
160
164
161
165
instance := & topology.Vertex {
162
166
Name : nodeName ,
163
- ID : * bmhSummary .InstanceId ,
167
+ ID : * host .Id ,
168
+ }
169
+
170
+ localBlockId := * host .LocalBlockId
171
+
172
+ if blockDomain , ok := blockMap [localBlockId ]; ok {
173
+ domainMap .AddHost (blockDomain , nodeName )
164
174
}
165
175
166
- localBlockId := * bmhSummary .ComputeLocalBlockId
167
176
localBlock , ok := nodes [localBlockId ]
168
177
if ! ok {
169
178
levelWiseSwitchCount [localBlockLevel ]++
@@ -176,7 +185,7 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []t
176
185
}
177
186
localBlock .Vertices [instance .ID ] = instance
178
187
179
- networkBlockId := * bmhSummary . ComputeNetworkBlockId
188
+ networkBlockId := * host . NetworkBlockId
180
189
networkBlock , ok := nodes [networkBlockId ]
181
190
if ! ok {
182
191
levelWiseSwitchCount [networkBlockLevel ]++
@@ -189,7 +198,7 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []t
189
198
}
190
199
networkBlock .Vertices [localBlockId ] = localBlock
191
200
192
- hpcIslandId := * bmhSummary . ComputeHpcIslandId
201
+ hpcIslandId := * host . HpcIslandId
193
202
hpcIsland , ok := nodes [hpcIslandId ]
194
203
if ! ok {
195
204
levelWiseSwitchCount [hpcIslandLevel ]++
@@ -231,75 +240,61 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []t
231
240
Vertices : make (map [string ]* topology.Vertex ),
232
241
}
233
242
root .Vertices [topology .TopologyTree ] = treeRoot
243
+ if len (domainMap ) != 0 {
244
+ root .Vertices [topology .TopologyBlock ] = domainMap .ToBlocks ()
245
+ }
234
246
return root , nil
235
247
236
248
}
237
249
238
- func filterAndSort (bareMetalHostSummaries []* core.ComputeBareMetalHostSummary , instanceToNodeMap map [string ]string ) []* core.ComputeBareMetalHostSummary {
239
- var filtered []* core.ComputeBareMetalHostSummary
240
- for _ , bmh := range bareMetalHostSummaries {
241
- if bmh . InstanceId == nil {
242
- klog .V ( 5 ). Infof ( "Instance ID is nil for bmhSummary %s" , bmh .String ())
250
+ func filterAndSort (hosts []core.ComputeHostSummary , instanceToNodeMap map [string ]string ) []core.ComputeHostSummary {
251
+ var filtered []core.ComputeHostSummary
252
+ for _ , host := range hosts {
253
+ if host . Id == nil {
254
+ klog .Warningf ( "InstanceID is nil for host %s" , host .String ())
243
255
continue
244
256
}
245
257
246
- if bmh . ComputeLocalBlockId == nil {
247
- klog .Warningf ("ComputeLocalBlockId is nil for instance %q" , * bmh . InstanceId )
248
- missingAncestor .WithLabelValues ("localBlock " , * bmh . InstanceId ).Add (float64 (1 ))
258
+ if host . LocalBlockId == nil {
259
+ klog .Warningf ("LocalBlockId is nil for instance %q" , * host . Id )
260
+ missingAncestor .WithLabelValues ("LocalBlock " , * host . Id ).Add (float64 (1 ))
249
261
continue
250
262
}
251
263
252
- if bmh . ComputeNetworkBlockId == nil {
253
- klog .Warningf ("ComputeNetworkBlockId is nil for instance %q" , * bmh . InstanceId )
254
- missingAncestor .WithLabelValues ("networkBlock" , * bmh . InstanceId ).Add (float64 (1 ))
264
+ if host . NetworkBlockId == nil {
265
+ klog .Warningf ("NetworkBlockId is nil for instance %q" , * host . Id )
266
+ missingAncestor .WithLabelValues ("networkBlock" , * host . Id ).Add (float64 (1 ))
255
267
continue
256
268
}
257
269
258
- if bmh . ComputeHpcIslandId == nil {
259
- klog .Warningf ("ComputeHpcIslandId is nil for instance %q" , * bmh . InstanceId )
260
- missingAncestor .WithLabelValues ("hpcIsland" , * bmh . InstanceId ).Add (float64 (1 ))
270
+ if host . HpcIslandId == nil {
271
+ klog .Warningf ("HpcIslandId is nil for instance %q" , * host . Id )
272
+ missingAncestor .WithLabelValues ("hpcIsland" , * host . Id ).Add (float64 (1 ))
261
273
continue
262
274
}
263
275
264
- if _ , ok := instanceToNodeMap [* bmh . InstanceId ]; ok {
265
- klog .V (4 ).Infof ("Adding bmhSummary %s" , bmh .String ())
266
- filtered = append (filtered , bmh )
276
+ if _ , ok := instanceToNodeMap [* host . Id ]; ok {
277
+ klog .V (4 ).Infof ("Adding host %s" , host .String ())
278
+ filtered = append (filtered , host )
267
279
} else {
268
- klog .V (4 ).Infof ("Skipping bmhSummary %s" , bmh .String ())
280
+ klog .V (4 ).Infof ("Skipping host %s" , host .String ())
269
281
}
270
282
}
271
283
272
284
sort .Slice (filtered , func (i , j int ) bool {
273
- if filtered [i ].ComputeHpcIslandId != filtered [j ].ComputeHpcIslandId {
274
- return * filtered [i ].ComputeHpcIslandId < * filtered [j ].ComputeHpcIslandId
285
+ if filtered [i ].HpcIslandId != filtered [j ].HpcIslandId {
286
+ return * filtered [i ].HpcIslandId < * filtered [j ].HpcIslandId
275
287
}
276
288
277
- if filtered [i ].ComputeNetworkBlockId != filtered [j ].ComputeNetworkBlockId {
278
- return * filtered [i ].ComputeNetworkBlockId < * filtered [j ].ComputeNetworkBlockId
289
+ if filtered [i ].NetworkBlockId != filtered [j ].NetworkBlockId {
290
+ return * filtered [i ].NetworkBlockId < * filtered [j ].NetworkBlockId
279
291
}
280
292
281
- if filtered [i ].ComputeLocalBlockId != filtered [j ].ComputeLocalBlockId {
282
- return * filtered [i ].ComputeLocalBlockId < * filtered [j ].ComputeLocalBlockId
293
+ if filtered [i ].LocalBlockId != filtered [j ].LocalBlockId {
294
+ return * filtered [i ].LocalBlockId < * filtered [j ].LocalBlockId
283
295
}
284
296
285
- return * filtered [i ].InstanceId < * filtered [j ].InstanceId
297
+ return * filtered [i ].Id < * filtered [j ].Id
286
298
})
287
299
return filtered
288
300
}
289
-
290
- func generateInstanceTopology (ctx context.Context , factory ClientFactory , ci * topology.ComputeInstances , bareMetalHostSummaries []* core.ComputeBareMetalHostSummary ) ([]* core.ComputeBareMetalHostSummary , error ) {
291
- client , err := factory (ci .Region )
292
- if err != nil {
293
- return nil , err
294
- }
295
-
296
- bmh , err := getBareMetalHostSummaries (ctx , client )
297
- if err != nil {
298
- return nil , fmt .Errorf ("unable to populate compute capacity topology: %s" , err .Error ())
299
- }
300
-
301
- for _ , bm := range bmh {
302
- bareMetalHostSummaries = append (bareMetalHostSummaries , & bm )
303
- }
304
- return bareMetalHostSummaries , nil
305
- }
0 commit comments