Skip to content

Commit

Permalink
Merge pull request #15 from NVIDIA/test-model
Browse files Browse the repository at this point in the history
Topograph model import for use with toposim
  • Loading branch information
henryh2 authored Oct 25, 2024
2 parents 3f45adb + bc2d9ff commit 7ce1be9
Show file tree
Hide file tree
Showing 13 changed files with 289 additions and 268 deletions.
22 changes: 19 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,16 +99,32 @@ systemctl disable topograph.service
systemctl daemon-reload
```

#### Testing the Service
To verify the service is running correctly, you can use the following commands:
#### Verifying Health
To verify the service is healthy, you can use the following command:

```bash
curl http://localhost:49021/healthz
```

#### Using Toposim
To test the service on a simulated cluster, first add the following line to `/etc/topograph/topograph-config.yaml` so that all topology requests are forwarded to toposim.
```yaml
forward_service_url: dns:localhost:49025
```
Then run the topograph service as normal.

id=$(curl -s -X POST -H "Content-Type: application/json" -d '{"provider":{"name":"test"},"engine":{"name":"test"}}' http://localhost:49021/v1/generate)
Next, start the toposim service as follows, setting the path to the test model that you want to use in the simulation:
```bash
/usr/local/bin/topograph -m /usr/local/bin/tests/models/<cluster-model>.yaml
```

You can then verify the topology results via simulation by querying topograph using the `test` provider and engine, and specifying the test model path as a parameter to the provider, as follows:
```bash
id=$(curl -s -X POST -H "Content-Type: application/json" -d '{"provider":{"name":"test", "params":{"model_path":"/usr/local/bin/topograph/tests/models/<cluster-model>.yaml"}},"engine":{"name":"test"}}' http://localhost:49021/v1/generate)

curl -s "http://localhost:49021/v1/topology?uid=$id"
```
Note that the path specified in the topograph query should point to the same model file that was provided to toposim.

#### Using the Cluster Topology Generator

Expand Down
3 changes: 2 additions & 1 deletion cmd/toposim/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/oklog/run"
"k8s.io/klog/v2"

"github.com/NVIDIA/topograph/pkg/models"
"github.com/NVIDIA/topograph/pkg/toposim"
)

Expand All @@ -50,7 +51,7 @@ func mainInternal() error {
return fmt.Errorf("must specify topology model path and listening port")
}

model, err := toposim.NewModelFromFile(path)
model, err := models.NewModelFromFile(path)
if err != nil {
return err
}
Expand Down
1 change: 1 addition & 0 deletions pkg/common/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const (
KeyTopoConfigmapNamespace = "topology_configmap_namespace"
KeyBlockSizes = "block_sizes"
KeySkipReload = "skip_reload"
KeyModelPath = "model_path"

KeyPlugin = "plugin"
ValTopologyTree = "topology/tree"
Expand Down
8 changes: 5 additions & 3 deletions pkg/common/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,14 @@ type TopologyRequest struct {
}

type provider struct {
Name string `json:"name"`
Creds map[string]string `json:"creds"` // access credentials
Name string `json:"name"`
Creds map[string]string `json:"creds"` // access credentials
Params map[string]string `json:"params"`
}

type engine struct {
Name string `json:"name"`
Params map[string]string `json:"params"` // access credentials
Params map[string]string `json:"params"`
}

type ComputeInstances struct {
Expand All @@ -108,6 +109,7 @@ func (p *TopologyRequest) String() string {
sb.WriteString("TopologyRequest:\n")
sb.WriteString(fmt.Sprintf(" Provider: %s\n", p.Provider.Name))
sb.WriteString(map2string(p.Provider.Creds, " Credentials", true, "\n"))
sb.WriteString(map2string(p.Provider.Params, " Parameters", false, "\n"))
sb.WriteString(fmt.Sprintf(" Engine: %s\n", p.Engine.Name))
sb.WriteString(map2string(p.Engine.Params, " Parameters", false, "\n"))
sb.WriteString(" Nodes: ")
Expand Down
6 changes: 5 additions & 1 deletion pkg/common/types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ func TestPayload(t *testing.T) {
print: `TopologyRequest:
Provider:
Credentials: []
Parameters: []
Engine:
Parameters: []
Nodes:
Expand All @@ -58,7 +59,8 @@ func TestPayload(t *testing.T) {
"creds": {
"access_key_id": "id",
"secret_access_key": "secret"
}
},
"params": {}
},
"engine": {
"name": "slurm",
Expand Down Expand Up @@ -94,6 +96,7 @@ func TestPayload(t *testing.T) {
"access_key_id": "id",
"secret_access_key": "secret",
},
Params: map[string]string{},
},
Engine: engine{
Name: "slurm",
Expand Down Expand Up @@ -124,6 +127,7 @@ func TestPayload(t *testing.T) {
print: `TopologyRequest:
Provider: aws
Credentials: [access_key_id:*** secret_access_key:***]
Parameters: []
Engine: slurm
Parameters: [block_sizes:30,120 plugin:topology/block]
Nodes: region1: [instance1:node1 instance2:node2 instance3:node3] region2: [instance4:node4 instance5:node5 instance6:node6]
Expand Down
25 changes: 20 additions & 5 deletions pkg/factory/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"net/http"

"github.com/NVIDIA/topograph/pkg/common"
"github.com/NVIDIA/topograph/pkg/models"
"github.com/NVIDIA/topograph/pkg/providers/aws"
"github.com/NVIDIA/topograph/pkg/providers/baremetal"
"github.com/NVIDIA/topograph/pkg/providers/cw"
Expand All @@ -30,7 +31,7 @@ import (
"github.com/NVIDIA/topograph/pkg/translate"
)

func GetProvider(provider string) (common.Provider, *common.HTTPError) {
func GetProvider(provider string, params map[string]string) (common.Provider, *common.HTTPError) {
var (
prv common.Provider
err error
Expand All @@ -48,7 +49,7 @@ func GetProvider(provider string) (common.Provider, *common.HTTPError) {
case common.ProviderBM:
prv, err = baremetal.GetProvider()
case common.ProviderTest:
prv = GetTestProvider()
prv, err = GetTestProvider(params)
default:
return nil, common.NewHTTPError(http.StatusBadRequest, fmt.Sprintf("unsupported provider %q", provider))
}
Expand All @@ -65,11 +66,25 @@ type testProvider struct {
instance2node map[string]string
}

func GetTestProvider() *testProvider {
func GetTestProvider(params map[string]string) (*testProvider, error) {
p := &testProvider{}
p.tree, p.instance2node = translate.GetTreeTestSet(false)

return p
var modelPath string
if len(params) != 0 {
modelPath = params[common.KeyModelPath]
}

if len(modelPath) == 0 {
p.tree, p.instance2node = translate.GetTreeTestSet(false)
} else {
model, err := models.NewModelFromFile(modelPath)
if err != nil {
return nil, err
}
p.tree, p.instance2node = model.ToTree()
}

return p, nil
}

func (p *testProvider) GetCredentials(_ map[string]string) (interface{}, error) {
Expand Down
50 changes: 49 additions & 1 deletion pkg/toposim/model.go → pkg/models/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@
* limitations under the License.
*/

package toposim
package models

import (
"fmt"
"os"

"gopkg.in/yaml.v3"

"github.com/NVIDIA/topograph/pkg/common"
)

type Model struct {
Expand Down Expand Up @@ -137,3 +139,49 @@ func getNetworkLayers(name string, swmap map[string]string) ([]string, error) {
name = parent
}
}

// ToTree converts the model into a topology tree and an instance-to-node map.
//
// Every entry of model.Nodes becomes a vertex without children and is mapped
// to itself in the returned instance map. Every switch becomes a vertex whose
// children are its sub-switches plus the nodes of the capacity blocks it
// references. Switches that are never listed as a sub-switch of another switch
// are attached to an unnamed root vertex, which is returned.
//
// NOTE(review): a sub-switch or node name with no matching definition in the
// model is stored as a nil child vertex; confirm the model is validated before
// this method is called.
func (model *Model) ToTree() (*common.Vertex, map[string]string) {
	// One childless vertex per node; each instance key maps to itself.
	instances := make(map[string]string)
	leaves := make(map[string]*common.Vertex)
	for key, node := range model.Nodes {
		instances[key] = key
		leaves[key] = &common.Vertex{ID: node.Name, Name: node.Name}
	}

	// One vertex per switch, created up front so that parents can link to
	// children regardless of declaration order.
	switches := make(map[string]*common.Vertex)
	for _, sw := range model.Switches {
		switches[sw.Name] = &common.Vertex{ID: sw.Name, Vertices: make(map[string]*common.Vertex)}
	}

	// Link every switch to its sub-switches and to the nodes of its capacity
	// blocks, remembering which names appear as somebody's child.
	children := make(map[string]bool)
	for _, sw := range model.Switches {
		parent := switches[sw.Name]
		for _, child := range sw.Switches {
			children[child] = true
			parent.Vertices[child] = switches[child]
		}
		for _, wanted := range sw.CapacityBlocks {
			for _, block := range model.CapacityBlocks {
				if block.Name != wanted {
					continue
				}
				for _, name := range block.Nodes {
					parent.Vertices[name] = leaves[name]
				}
				// Only the first capacity block with a matching name is used.
				break
			}
		}
	}

	// Any switch that is nobody's child hangs off the hidden root.
	root := &common.Vertex{Vertices: make(map[string]*common.Vertex)}
	for name, vertex := range switches {
		if !children[name] {
			root.Vertices[name] = vertex
		}
	}
	return root, instances
}
4 changes: 2 additions & 2 deletions pkg/toposim/model_test.go → pkg/models/model_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package toposim
package models

import (
"testing"
Expand All @@ -23,7 +23,7 @@ import (
)

func TestNewModelFromFile(t *testing.T) {
cfg, err := NewModelFromFile("testdata/toposim.yaml")
cfg, err := NewModelFromFile("../../tests/models/medium-h100.yaml")
require.NoError(t, err)

expected := &Model{
Expand Down
Loading

0 comments on commit 7ce1be9

Please sign in to comment.