Skip to content

Commit

Permalink
make switch names in oci shorter
Browse files Browse the repository at this point in the history
  • Loading branch information
XRFXLP committed Oct 8, 2024
1 parent d612876 commit e3d37bc
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 43 deletions.
107 changes: 73 additions & 34 deletions pkg/oci/instance_topology.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"net/http"
"sort"
"time"

OCICommon "github.com/oracle/oci-go-sdk/v65/common"
Expand All @@ -30,6 +31,14 @@ import (
"github.com/NVIDIA/topograph/pkg/common"
)

// level identifies a tier of the OCI switch hierarchy. Its numeric value is
// used as the first number in generated switch names ("Switch.<level>.<n>").
type level int

// Tier values start at 1; the zero value is deliberately left unnamed.
const (
	_ level = iota
	// localBlockLevel is the tier whose switches directly contain instances.
	localBlockLevel
	// networkBlockLevel is the tier whose switches contain local blocks.
	networkBlockLevel
	// hpcIslandLevel is the tier above network blocks.
	hpcIslandLevel
)

func GenerateInstanceTopology(ctx context.Context, creds OCICommon.ConfigurationProvider, cis []common.ComputeInstances) ([]*core.ComputeBareMetalHostSummary, error) {
var err error
bareMetalHostSummaries := []*core.ComputeBareMetalHostSummary{}
Expand Down Expand Up @@ -144,73 +153,51 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []c

nodes := make(map[string]*common.Vertex)
forest := make(map[string]*common.Vertex)

levelWiseSwitchCount := map[level]int{localBlockLevel: 0, networkBlockLevel: 0, hpcIslandLevel: 0}
bareMetalHostSummaries = filterAndSort(bareMetalHostSummaries, instanceToNodeMap)
for _, bmhSummary := range bareMetalHostSummaries {
if bmhSummary.InstanceId == nil {
klog.V(5).Infof("Skipped bmhSummary %s", bmhSummary.String())
continue
}
nodeName, ok := instanceToNodeMap[*bmhSummary.InstanceId]
if !ok {
klog.V(5).Infof("Node not found for instance ID %s", *bmhSummary.InstanceId)
continue
}
klog.V(4).Infof("Found node %q instance %q", nodeName, *bmhSummary.InstanceId)
nodeName := instanceToNodeMap[*bmhSummary.InstanceId]
delete(instanceToNodeMap, *bmhSummary.InstanceId)

instance := &common.Vertex{
Name: nodeName,
ID: *bmhSummary.InstanceId,
}

localBlockId := "lb_nil"
if bmhSummary.ComputeLocalBlockId != nil {
localBlockId = *bmhSummary.ComputeLocalBlockId
} else {
klog.Warningf("ComputeLocalBlockId is nil for instance %q", *bmhSummary.InstanceId)
missingAncestor.WithLabelValues("localBlock", nodeName).Add(float64(1))
}

localBlockId := *bmhSummary.ComputeLocalBlockId
localBlock, ok := nodes[localBlockId]
if !ok {
levelWiseSwitchCount[localBlockLevel]++
localBlock = &common.Vertex{
ID: localBlockId,
Vertices: make(map[string]*common.Vertex),
Name: fmt.Sprintf("Switch.%d.%d", localBlockLevel, levelWiseSwitchCount[localBlockLevel]),
}
nodes[localBlockId] = localBlock
}
localBlock.Vertices[instance.ID] = instance

networkBlockId := "nw_nil"
if bmhSummary.ComputeNetworkBlockId != nil {
networkBlockId = *bmhSummary.ComputeNetworkBlockId
} else {
klog.Warningf("ComputeNetworkBlockId is nil for instance %q", *bmhSummary.InstanceId)
missingAncestor.WithLabelValues("networkBlock", nodeName).Add(float64(1))
}

networkBlockId := *bmhSummary.ComputeNetworkBlockId
networkBlock, ok := nodes[networkBlockId]
if !ok {
levelWiseSwitchCount[networkBlockLevel]++
networkBlock = &common.Vertex{
ID: networkBlockId,
Vertices: make(map[string]*common.Vertex),
Name: fmt.Sprintf("Switch.%d.%d", networkBlockLevel, levelWiseSwitchCount[networkBlockLevel]),
}
nodes[networkBlockId] = networkBlock
}
networkBlock.Vertices[localBlockId] = localBlock

hpcIslandId := "hpc_nil"
if bmhSummary.ComputeHpcIslandId != nil {
hpcIslandId = *bmhSummary.ComputeHpcIslandId
} else {
klog.Warningf("ComputeHpcIslandId is nil for instance %q", *bmhSummary.InstanceId)
missingAncestor.WithLabelValues("hpcIsland", nodeName).Add(float64(1))
}
hpcIslandId := *bmhSummary.ComputeHpcIslandId
hpcIsland, ok := nodes[hpcIslandId]
if !ok {
levelWiseSwitchCount[hpcIslandLevel]++
hpcIsland = &common.Vertex{
ID: hpcIslandId,
Vertices: make(map[string]*common.Vertex),
Name: fmt.Sprintf("Switch.%d.%d", hpcIslandLevel, levelWiseSwitchCount[hpcIslandLevel]),
}
nodes[hpcIslandId] = hpcIsland
forest[hpcIslandId] = hpcIsland
Expand Down Expand Up @@ -244,6 +231,58 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []c

}

// filterAndSort returns the bare metal host summaries that can be placed in
// the topology graph, in a deterministic order.
//
// A summary is dropped when:
//   - its InstanceId is nil;
//   - its ComputeLocalBlockId, ComputeNetworkBlockId or ComputeHpcIslandId is
//     nil (a warning is logged and the missingAncestor counter is incremented
//     with the ancestor kind and the instance ID as label values);
//   - its instance ID is not a key of instanceToNodeMap.
//
// instanceToNodeMap is only read, never modified. The result is nil when no
// summary survives the filter.
//
// Survivors are sorted by HPC island ID, then network block ID, then local
// block ID, then instance ID, all compared as strings.
func filterAndSort(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, instanceToNodeMap map[string]string) []*core.ComputeBareMetalHostSummary {
	var filtered []*core.ComputeBareMetalHostSummary
	for _, bmh := range bareMetalHostSummaries {
		if bmh.InstanceId == nil {
			klog.V(5).Infof("Instance ID is nil for bmhSummary %s", bmh.String())
			continue
		}

		if bmh.ComputeLocalBlockId == nil {
			klog.Warningf("ComputeLocalBlockId is nil for instance %q", *bmh.InstanceId)
			missingAncestor.WithLabelValues("localBlock", *bmh.InstanceId).Add(float64(1))
			continue
		}

		if bmh.ComputeNetworkBlockId == nil {
			klog.Warningf("ComputeNetworkBlockId is nil for instance %q", *bmh.InstanceId)
			missingAncestor.WithLabelValues("networkBlock", *bmh.InstanceId).Add(float64(1))
			continue
		}

		if bmh.ComputeHpcIslandId == nil {
			klog.Warningf("ComputeHpcIslandId is nil for instance %q", *bmh.InstanceId)
			missingAncestor.WithLabelValues("hpcIsland", *bmh.InstanceId).Add(float64(1))
			continue
		}

		if _, ok := instanceToNodeMap[*bmh.InstanceId]; !ok {
			klog.V(4).Infof("Skipping bmhSummary %s", bmh.String())
			continue
		}
		klog.V(4).Infof("Adding bmhSummary %s", bmh.String())
		filtered = append(filtered, bmh)
	}

	sort.Slice(filtered, func(i, j int) bool {
		a, b := filtered[i], filtered[j]

		// Compare the pointed-to strings, not the pointers themselves. Equal
		// IDs held in different *string values would otherwise look different,
		// the "<" below would then report false, and the lower-level keys
		// would never be consulted, leaving the order inconsistent.
		if *a.ComputeHpcIslandId != *b.ComputeHpcIslandId {
			return *a.ComputeHpcIslandId < *b.ComputeHpcIslandId
		}

		if *a.ComputeNetworkBlockId != *b.ComputeNetworkBlockId {
			return *a.ComputeNetworkBlockId < *b.ComputeNetworkBlockId
		}

		if *a.ComputeLocalBlockId != *b.ComputeLocalBlockId {
			return *a.ComputeLocalBlockId < *b.ComputeLocalBlockId
		}

		return *a.InstanceId < *b.InstanceId
	})
	return filtered
}

func generateInstanceTopology(ctx context.Context, provider OCICommon.ConfigurationProvider, ci *common.ComputeInstances, bareMetalHostSummaries []*core.ComputeBareMetalHostSummary) ([]*core.ComputeBareMetalHostSummary, error) {
identityClient, err := identity.NewIdentityClientWithConfigurationProvider(provider)
if err != nil {
Expand Down
34 changes: 25 additions & 9 deletions pkg/translate/output.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,15 @@ func ToSLURM(wr io.Writer, root *common.Vertex) error {
leaves := make(map[string][]string)
parents := []*common.Vertex{}
queue := []*common.Vertex{root}
idToName := make(map[string]string)

for len(queue) > 0 {
v := queue[0]
queue = queue[1:]
if len(v.ID) != 0 {
parents = append(parents, v)
}

idToName[v.ID] = v.Name
for _, w := range v.Vertices {
if len(w.Vertices) == 0 { // it's a leaf; don't add to queue
_, ok := leaves[v.ID]
Expand All @@ -55,14 +56,19 @@ func ToSLURM(wr io.Writer, root *common.Vertex) error {

for _, sw := range parents {
if _, ok := leaves[sw.ID]; !ok {
if err := writeSwitch(wr, sw); err != nil {
return err
}
writeSwitch(wr, sw)
}
}

var comment, switchName string
for sw, nodes := range leaves {
_, err := wr.Write([]byte(fmt.Sprintf("SwitchName=%s Nodes=%s\n", sw, strings.Join(compress(nodes), ","))))
if idToName[sw] != "" {
comment = fmt.Sprintf("# %s=%s\n", idToName[sw], sw)
switchName = idToName[sw]
} else {
comment = ""
switchName = sw
}
_, err := wr.Write([]byte(fmt.Sprintf("%sSwitchName=%s Nodes=%s\n", comment, switchName, strings.Join(compress(nodes), ","))))
if err != nil {
return err
}
Expand All @@ -78,10 +84,20 @@ func writeSwitch(wr io.Writer, v *common.Vertex) error {

arr := make([]string, 0, len(v.Vertices))
for _, node := range v.Vertices {
arr = append(arr, node.ID)
if node.Name == "" {
arr = append(arr, node.ID)
} else {
arr = append(arr, node.Name)
}
}

_, err := wr.Write([]byte(fmt.Sprintf("SwitchName=%s Switches=%s\n", v.ID, strings.Join(compress(arr), ","))))
var comment string
if v.Name == "" {
comment = ""
v.Name = v.ID
} else {
comment = fmt.Sprintf("# %s=%s\n", v.Name, v.ID)
}
_, err := wr.Write([]byte(fmt.Sprintf("%sSwitchName=%s Switches=%s\n", comment, v.Name, strings.Join(compress(arr), ","))))
if err != nil {
return err
}
Expand Down
62 changes: 62 additions & 0 deletions pkg/translate/output_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"testing"

"github.com/stretchr/testify/require"
"gitlab-master.nvidia.com/dgxcloud/tools/cluster-topology-generator/pkg/common"

Check failure on line 24 in pkg/translate/output_test.go

View workflow job for this annotation

GitHub Actions / test

no required module provides package gitlab-master.nvidia.com/dgxcloud/tools/cluster-topology-generator/pkg/common; to add it:
)

const (
Expand All @@ -32,6 +33,17 @@ SwitchName=S3 Nodes=Node[304-306]
testConfig2 = `SwitchName=S1 Switches=S[2-3]
SwitchName=S3 Nodes=Node[304-306]
SwitchName=S2 Nodes=Node[201-202],Node205
`
shortNameExpectedResult = `# switch.3.1=hpcislandid-1
SwitchName=switch.3.1 Switches=switch.2.[1-2]
# switch.2.1=network-block-1
SwitchName=switch.2.1 Switches=switch.1.1
# switch.2.2=network-block-2
SwitchName=switch.2.2 Switches=switch.1.2
# switch.1.1=local-block-1
SwitchName=switch.1.1 Nodes=node-1
# switch.1.2=local-block-2
SwitchName=switch.1.2 Nodes=node-2
`
)

Expand All @@ -48,6 +60,56 @@ func TestToSLURM(t *testing.T) {
}
}

// TestToSlurmNameShortener feeds ToSLURM a three-tier tree whose switches have
// both a long ID and a short Name, and expects the output to use the short
// names, with a "# name=id" comment line before each switch entry.
//
// NOTE(review): ToSLURM appears to range over maps while emitting lines, so
// the relative order of sibling switches may not be stable — confirm this
// exact-match comparison cannot flake.
func TestToSlurmNameShortener(t *testing.T) {
	// Leaves: compute nodes, keyed in their parent by node name.
	node1 := &common.Vertex{ID: "node-1-id", Name: "node-1"}
	node2 := &common.Vertex{ID: "node-2-id", Name: "node-2"}

	// Tier 1: one local block per node.
	localBlock1 := &common.Vertex{
		ID:       "local-block-1",
		Name:     "switch.1.1",
		Vertices: map[string]*common.Vertex{"node-1": node1},
	}
	localBlock2 := &common.Vertex{
		ID:       "local-block-2",
		Name:     "switch.1.2",
		Vertices: map[string]*common.Vertex{"node-2": node2},
	}

	// Tier 2: one network block per local block.
	networkBlock1 := &common.Vertex{
		ID:       "network-block-1",
		Name:     "switch.2.1",
		Vertices: map[string]*common.Vertex{"local-block-1": localBlock1},
	}
	networkBlock2 := &common.Vertex{
		ID:       "network-block-2",
		Name:     "switch.2.2",
		Vertices: map[string]*common.Vertex{"local-block-2": localBlock2},
	}

	// Tier 3: a single HPC island holding both network blocks.
	island := &common.Vertex{
		ID:   "hpcislandid-1",
		Name: "switch.3.1",
		Vertices: map[string]*common.Vertex{
			"network-block-1": networkBlock1,
			"network-block-2": networkBlock2,
		},
	}

	// The root has neither ID nor Name; it only anchors the tree.
	root := &common.Vertex{
		Vertices: map[string]*common.Vertex{"hpcislandid-1": island},
	}

	var out bytes.Buffer
	require.NoError(t, ToSLURM(&out, root))
	require.Equal(t, shortNameExpectedResult, out.String())
}

func TestCompress(t *testing.T) {
testCases := []struct {
name string
Expand Down

0 comments on commit e3d37bc

Please sign in to comment.