Skip to content

Commit 10b89eb

Browse files
committed
implement mock API for OCI
Signed-off-by: Dmitry Shmulevich <[email protected]>
1 parent cfe5bfc commit 10b89eb

File tree

12 files changed

+356
-142
lines changed

12 files changed

+356
-142
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,4 +112,5 @@ replace (
112112
github.com/aws/aws-sdk-go-v2/credentials v1.17.45 => github.com/pkedy/aws-sdk-go-v2/credentials v0.0.0-20241115203348-0198b6c98cd9
113113
github.com/aws/aws-sdk-go-v2/service/autoscaling v1.48.0 => github.com/pkedy/aws-sdk-go-v2/service/autoscaling v0.0.0-20241115203348-0198b6c98cd9
114114
github.com/aws/aws-sdk-go-v2/service/ec2 v1.187.0 => github.com/pkedy/aws-sdk-go-v2/service/ec2 v0.0.0-20241115203348-0198b6c98cd9
115+
github.com/oracle/oci-go-sdk/v65 v65.78.0 => ../../oracle/oci-go-sdk/v65
115116
)

go.sum

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,8 +161,6 @@ github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA
161161
github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To=
162162
github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk=
163163
github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0=
164-
github.com/oracle/oci-go-sdk/v65 v65.78.0 h1:iM7lFFA7cJkUD4tmrlsAHWgL3HuTuF9mdvTAliMkcFA=
165-
github.com/oracle/oci-go-sdk/v65 v65.78.0/go.mod h1:IBEV9l1qBzUpo7zgGaRUhbB05BVfcDGYRFBCPlTcPp0=
166164
github.com/pkedy/aws-sdk-go-v2 v0.0.0-20241115203348-0198b6c98cd9 h1:QhMFD0yJ9nEj4BCX9lREQ7twLM5oEL8y9UwKsRNJamo=
167165
github.com/pkedy/aws-sdk-go-v2 v0.0.0-20241115203348-0198b6c98cd9/go.mod h1:2SK5n0a2karNTv5tbP1SjsX0uhttou00v/HpXKM1ZUo=
168166
github.com/pkedy/aws-sdk-go-v2/service/ec2 v0.0.0-20241115203348-0198b6c98cd9 h1:wA7yd0OxRH3EWuKaJ7ijRowlWgH2b99nrP+d10+0Sc4=

pkg/providers/oci/instance_topology.go

Lines changed: 111 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package oci
1919
import (
2020
"context"
2121
"fmt"
22-
"net/http"
2322
"sort"
2423
"time"
2524

@@ -29,6 +28,7 @@ import (
2928

3029
"github.com/NVIDIA/topograph/pkg/metrics"
3130
"github.com/NVIDIA/topograph/pkg/topology"
31+
"github.com/NVIDIA/topograph/pkg/translate"
3232
)
3333

3434
type level int
@@ -39,109 +39,111 @@ const (
3939
hpcIslandLevel
4040
)
4141

42-
func GenerateInstanceTopology(ctx context.Context, factory ClientFactory, cis []topology.ComputeInstances) ([]*core.ComputeBareMetalHostSummary, error) {
43-
var err error
44-
bareMetalHostSummaries := []*core.ComputeBareMetalHostSummary{}
42+
func GenerateInstanceTopology(ctx context.Context, factory ClientFactory, pageSize *int, cis []topology.ComputeInstances) (hosts []core.ComputeHostSummary, blockMap map[string]string, err error) {
43+
blockMap = make(map[string]string)
44+
4545
for _, ci := range cis {
46-
if bareMetalHostSummaries, err = generateInstanceTopology(ctx, factory, &ci, bareMetalHostSummaries); err != nil {
47-
return nil, err
46+
var client Client
47+
if client, err = factory(ci.Region, pageSize); err != nil {
48+
return
49+
}
50+
if hosts, err = getComputeHostInfo(ctx, client, hosts, blockMap); err != nil {
51+
return
4852
}
4953
}
5054

51-
return bareMetalHostSummaries, nil
55+
return
5256
}
5357

54-
func getComputeCapacityTopologies(ctx context.Context, client Client) (cct []core.ComputeCapacityTopologySummary, err error) {
55-
compartmentId := client.TenancyOCID()
56-
57-
adRequest := identity.ListAvailabilityDomainsRequest{
58-
CompartmentId: &compartmentId,
59-
}
58+
func getComputeHostSummary(ctx context.Context, client Client, availabilityDomain *string) ([]core.ComputeHostSummary, error) {
59+
var hosts []core.ComputeHostSummary
6060

61-
timeStart := time.Now()
62-
ads, err := client.ListAvailabilityDomains(ctx, adRequest)
63-
if err != nil {
64-
return cct, fmt.Errorf("unable to get AD: %v", err)
61+
req := core.ListComputeHostsRequest{
62+
CompartmentId: client.TenancyOCID(),
63+
AvailabilityDomain: availabilityDomain,
64+
Limit: client.Limit(),
6565
}
66-
requestLatency.WithLabelValues("ListAvailabilityDomains", ads.HTTPResponse().Status).Observe(time.Since(timeStart).Seconds())
6766

68-
for _, ad := range ads.Items {
69-
cctRequest := core.ListComputeCapacityTopologiesRequest{
70-
CompartmentId: &compartmentId,
71-
AvailabilityDomain: ad.Name,
67+
for {
68+
timeStart := time.Now()
69+
resp, err := client.ListComputeHosts(ctx, req)
70+
requestLatency.WithLabelValues("ListComputeHosts", resp.HTTPResponse().Status).Observe(time.Since(timeStart).Seconds())
71+
if err != nil {
72+
return nil, err
7273
}
7374

74-
for {
75-
timeStart := time.Now()
76-
resp, err := client.ListComputeCapacityTopologies(ctx, cctRequest)
77-
requestLatency.WithLabelValues("ListComputeCapacityTopologies", resp.HTTPResponse().Status).Observe(time.Since(timeStart).Seconds())
78-
if err != nil {
79-
if resp.HTTPResponse().StatusCode == http.StatusNotFound {
80-
return cct, fmt.Errorf("%v for getting ComputeCapacityTopology in %s: %v", resp.HTTPResponse().StatusCode, *ad.Name, err)
81-
} else {
82-
return cct, fmt.Errorf("unable to get ComputeCapacity Topologies in %s : %v", *ad.Name, err)
83-
}
84-
}
85-
cct = append(cct, resp.Items...)
86-
klog.V(4).Infof("Received computeCapacityTopology %d groups; processed %d", len(resp.Items), len(cct))
87-
if resp.OpcNextPage != nil {
88-
cctRequest.Page = resp.OpcNextPage
89-
} else {
90-
break
91-
}
75+
hosts = append(hosts, resp.Items...)
76+
77+
if resp.OpcNextPage != nil {
78+
req.Page = resp.OpcNextPage
79+
} else {
80+
break
9281
}
9382
}
9483

95-
return cct, nil
84+
return hosts, nil
9685
}
9786

98-
func getBMHSummaryPerComputeCapacityTopology(ctx context.Context, client Client, topologyID string) (bmhSummary []core.ComputeBareMetalHostSummary, err error) {
99-
compartmentId := client.TenancyOCID()
100-
request := core.ListComputeCapacityTopologyComputeBareMetalHostsRequest{
101-
ComputeCapacityTopologyId: &topologyID,
102-
CompartmentId: &compartmentId,
87+
// getLocalBlockMap returns a map between LocalBlocks and ComputeGpuMemoryFabrics
88+
func getLocalBlockMap(ctx context.Context, client Client, availabilityDomain *string, blockMap map[string]string) error {
89+
req := core.ListComputeGpuMemoryFabricsRequest{
90+
CompartmentId: client.TenancyOCID(),
91+
AvailabilityDomain: availabilityDomain,
92+
Limit: client.Limit(),
10393
}
94+
10495
for {
10596
timeStart := time.Now()
106-
response, err := client.ListComputeCapacityTopologyComputeBareMetalHosts(ctx, request)
107-
requestLatency.WithLabelValues("ListComputeCapacityTopologyComputeBareMetalHosts", response.HTTPResponse().Status).Observe(time.Since(timeStart).Seconds())
97+
resp, err := client.ListComputeGpuMemoryFabrics(ctx, req)
98+
requestLatency.WithLabelValues("ListComputeGpuMemoryFabrics", resp.HTTPResponse().Status).Observe(time.Since(timeStart).Seconds())
10899
if err != nil {
109-
klog.Errorln(err.Error())
110-
break
100+
return err
111101
}
112102

113-
bmhSummary = append(bmhSummary, response.Items...)
103+
for _, fabrics := range resp.Items {
104+
blockMap[*fabrics.ComputeLocalBlockId] = *fabrics.Id
105+
}
114106

115-
if response.OpcNextPage != nil {
116-
request.Page = response.OpcNextPage
107+
if resp.OpcNextPage != nil {
108+
req.Page = resp.OpcNextPage
117109
} else {
118110
break
119111
}
120112
}
121-
return bmhSummary, nil
113+
114+
return nil
122115
}
123116

124-
func getBareMetalHostSummaries(ctx context.Context, client Client) ([]core.ComputeBareMetalHostSummary, error) {
125-
computeCapacityTopology, err := getComputeCapacityTopologies(ctx, client)
117+
func getComputeHostInfo(ctx context.Context, client Client, hosts []core.ComputeHostSummary, blockMap map[string]string) ([]core.ComputeHostSummary, error) {
118+
req := identity.ListAvailabilityDomainsRequest{
119+
CompartmentId: client.TenancyOCID(),
120+
}
121+
122+
timeStart := time.Now()
123+
resp, err := client.ListAvailabilityDomains(ctx, req)
126124
if err != nil {
127-
return nil, fmt.Errorf("unable to get compute capacity topologies: %s", err.Error())
125+
return nil, fmt.Errorf("unable to get availability domains: %v", err)
128126
}
129-
klog.V(4).Infof("Received computeCapacityTopology for %d groups", len(computeCapacityTopology))
127+
requestLatency.WithLabelValues("ListAvailabilityDomains", resp.HTTPResponse().Status).Observe(time.Since(timeStart).Seconds())
130128

131-
var bareMetalHostSummaries []core.ComputeBareMetalHostSummary
132-
for _, cct := range computeCapacityTopology {
133-
bareMetalHostSummary, err := getBMHSummaryPerComputeCapacityTopology(ctx, client, *cct.Id)
129+
for _, ad := range resp.Items {
130+
summary, err := getComputeHostSummary(ctx, client, ad.Name)
134131
if err != nil {
135-
return nil, fmt.Errorf("unable to get bare metal hosts info: %s", err.Error())
132+
return nil, fmt.Errorf("unable to get hosts info: %v", err)
133+
}
134+
hosts = append(hosts, summary...)
135+
136+
if err = getLocalBlockMap(ctx, client, ad.Name, blockMap); err != nil {
137+
return nil, fmt.Errorf("unable to get local block map: %v", err)
136138
}
137-
bareMetalHostSummaries = append(bareMetalHostSummaries, bareMetalHostSummary...)
138139
}
139-
klog.V(4).Infof("Returning bareMetalHostSummaries for %d nodes", len(bareMetalHostSummaries))
140140

141-
return bareMetalHostSummaries, nil
141+
klog.V(4).Infof("Returning host info for %d nodes and %d blocks", len(hosts), len(blockMap))
142+
143+
return hosts, nil
142144
}
143145

144-
func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []topology.ComputeInstances) (*topology.Vertex, error) {
146+
func toGraph(hosts []core.ComputeHostSummary, blockMap map[string]string, cis []topology.ComputeInstances) (*topology.Vertex, error) {
145147
instanceToNodeMap := make(map[string]string)
146148
for _, ci := range cis {
147149
for instance, node := range ci.Instances {
@@ -152,18 +154,25 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []t
152154

153155
nodes := make(map[string]*topology.Vertex)
154156
forest := make(map[string]*topology.Vertex)
157+
domainMap := translate.NewDomainMap()
158+
155159
levelWiseSwitchCount := map[level]int{localBlockLevel: 0, networkBlockLevel: 0, hpcIslandLevel: 0}
156-
bareMetalHostSummaries = filterAndSort(bareMetalHostSummaries, instanceToNodeMap)
157-
for _, bmhSummary := range bareMetalHostSummaries {
158-
nodeName := instanceToNodeMap[*bmhSummary.InstanceId]
159-
delete(instanceToNodeMap, *bmhSummary.InstanceId)
160+
hosts = filterAndSort(hosts, instanceToNodeMap)
161+
for _, host := range hosts {
162+
nodeName := instanceToNodeMap[*host.Id]
163+
delete(instanceToNodeMap, *host.Id)
160164

161165
instance := &topology.Vertex{
162166
Name: nodeName,
163-
ID: *bmhSummary.InstanceId,
167+
ID: *host.Id,
168+
}
169+
170+
localBlockId := *host.LocalBlockId
171+
172+
if blockDomain, ok := blockMap[localBlockId]; ok {
173+
domainMap.AddHost(blockDomain, nodeName)
164174
}
165175

166-
localBlockId := *bmhSummary.ComputeLocalBlockId
167176
localBlock, ok := nodes[localBlockId]
168177
if !ok {
169178
levelWiseSwitchCount[localBlockLevel]++
@@ -176,7 +185,7 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []t
176185
}
177186
localBlock.Vertices[instance.ID] = instance
178187

179-
networkBlockId := *bmhSummary.ComputeNetworkBlockId
188+
networkBlockId := *host.NetworkBlockId
180189
networkBlock, ok := nodes[networkBlockId]
181190
if !ok {
182191
levelWiseSwitchCount[networkBlockLevel]++
@@ -189,7 +198,7 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []t
189198
}
190199
networkBlock.Vertices[localBlockId] = localBlock
191200

192-
hpcIslandId := *bmhSummary.ComputeHpcIslandId
201+
hpcIslandId := *host.HpcIslandId
193202
hpcIsland, ok := nodes[hpcIslandId]
194203
if !ok {
195204
levelWiseSwitchCount[hpcIslandLevel]++
@@ -231,75 +240,61 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []t
231240
Vertices: make(map[string]*topology.Vertex),
232241
}
233242
root.Vertices[topology.TopologyTree] = treeRoot
243+
if len(domainMap) != 0 {
244+
root.Vertices[topology.TopologyBlock] = domainMap.ToBlocks()
245+
}
234246
return root, nil
235247

236248
}
237249

238-
func filterAndSort(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, instanceToNodeMap map[string]string) []*core.ComputeBareMetalHostSummary {
239-
var filtered []*core.ComputeBareMetalHostSummary
240-
for _, bmh := range bareMetalHostSummaries {
241-
if bmh.InstanceId == nil {
242-
klog.V(5).Infof("Instance ID is nil for bmhSummary %s", bmh.String())
250+
func filterAndSort(hosts []core.ComputeHostSummary, instanceToNodeMap map[string]string) []core.ComputeHostSummary {
251+
var filtered []core.ComputeHostSummary
252+
for _, host := range hosts {
253+
if host.Id == nil {
254+
klog.Warningf("InstanceID is nil for host %s", host.String())
243255
continue
244256
}
245257

246-
if bmh.ComputeLocalBlockId == nil {
247-
klog.Warningf("ComputeLocalBlockId is nil for instance %q", *bmh.InstanceId)
248-
missingAncestor.WithLabelValues("localBlock", *bmh.InstanceId).Add(float64(1))
258+
if host.LocalBlockId == nil {
259+
klog.Warningf("LocalBlockId is nil for instance %q", *host.Id)
260+
missingAncestor.WithLabelValues("LocalBlock", *host.Id).Add(float64(1))
249261
continue
250262
}
251263

252-
if bmh.ComputeNetworkBlockId == nil {
253-
klog.Warningf("ComputeNetworkBlockId is nil for instance %q", *bmh.InstanceId)
254-
missingAncestor.WithLabelValues("networkBlock", *bmh.InstanceId).Add(float64(1))
264+
if host.NetworkBlockId == nil {
265+
klog.Warningf("NetworkBlockId is nil for instance %q", *host.Id)
266+
missingAncestor.WithLabelValues("networkBlock", *host.Id).Add(float64(1))
255267
continue
256268
}
257269

258-
if bmh.ComputeHpcIslandId == nil {
259-
klog.Warningf("ComputeHpcIslandId is nil for instance %q", *bmh.InstanceId)
260-
missingAncestor.WithLabelValues("hpcIsland", *bmh.InstanceId).Add(float64(1))
270+
if host.HpcIslandId == nil {
271+
klog.Warningf("HpcIslandId is nil for instance %q", *host.Id)
272+
missingAncestor.WithLabelValues("hpcIsland", *host.Id).Add(float64(1))
261273
continue
262274
}
263275

264-
if _, ok := instanceToNodeMap[*bmh.InstanceId]; ok {
265-
klog.V(4).Infof("Adding bmhSummary %s", bmh.String())
266-
filtered = append(filtered, bmh)
276+
if _, ok := instanceToNodeMap[*host.Id]; ok {
277+
klog.V(4).Infof("Adding host %s", host.String())
278+
filtered = append(filtered, host)
267279
} else {
268-
klog.V(4).Infof("Skipping bmhSummary %s", bmh.String())
280+
klog.V(4).Infof("Skipping host %s", host.String())
269281
}
270282
}
271283

272284
sort.Slice(filtered, func(i, j int) bool {
273-
if filtered[i].ComputeHpcIslandId != filtered[j].ComputeHpcIslandId {
274-
return *filtered[i].ComputeHpcIslandId < *filtered[j].ComputeHpcIslandId
285+
if filtered[i].HpcIslandId != filtered[j].HpcIslandId {
286+
return *filtered[i].HpcIslandId < *filtered[j].HpcIslandId
275287
}
276288

277-
if filtered[i].ComputeNetworkBlockId != filtered[j].ComputeNetworkBlockId {
278-
return *filtered[i].ComputeNetworkBlockId < *filtered[j].ComputeNetworkBlockId
289+
if filtered[i].NetworkBlockId != filtered[j].NetworkBlockId {
290+
return *filtered[i].NetworkBlockId < *filtered[j].NetworkBlockId
279291
}
280292

281-
if filtered[i].ComputeLocalBlockId != filtered[j].ComputeLocalBlockId {
282-
return *filtered[i].ComputeLocalBlockId < *filtered[j].ComputeLocalBlockId
293+
if filtered[i].LocalBlockId != filtered[j].LocalBlockId {
294+
return *filtered[i].LocalBlockId < *filtered[j].LocalBlockId
283295
}
284296

285-
return *filtered[i].InstanceId < *filtered[j].InstanceId
297+
return *filtered[i].Id < *filtered[j].Id
286298
})
287299
return filtered
288300
}
289-
290-
func generateInstanceTopology(ctx context.Context, factory ClientFactory, ci *topology.ComputeInstances, bareMetalHostSummaries []*core.ComputeBareMetalHostSummary) ([]*core.ComputeBareMetalHostSummary, error) {
291-
client, err := factory(ci.Region)
292-
if err != nil {
293-
return nil, err
294-
}
295-
296-
bmh, err := getBareMetalHostSummaries(ctx, client)
297-
if err != nil {
298-
return nil, fmt.Errorf("unable to populate compute capacity topology: %s", err.Error())
299-
}
300-
301-
for _, bm := range bmh {
302-
bareMetalHostSummaries = append(bareMetalHostSummaries, &bm)
303-
}
304-
return bareMetalHostSummaries, nil
305-
}

0 commit comments

Comments
 (0)