Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 22 additions & 37 deletions test/kubernetes/testcluster/objects.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,51 +187,18 @@ func (n *Namespace) GetService(name string, spec v13.ServiceSpec) *v13.Service {

// ContainerResourcesRequest holds arguments to set requested resource on a container.
type ContainerResourcesRequest struct {
CPUResources string // CPUResources to request. Note: Will be overridden by flag above.
MemoryResources string // MemoryResources to request. Note: Will be overridden by flag above.
GPU bool
TPU bool
GPU bool
TPU bool
}

// String returns a string representation of the `ContainerResourcesRequest`.
func (crr ContainerResourcesRequest) String() string {
return fmt.Sprintf("cpu=%q memory=%q gpu=%v tpu=%v", crr.CPUResources, crr.MemoryResources, crr.GPU, crr.TPU)
return fmt.Sprintf("gpu=%v tpu=%v", crr.GPU, crr.TPU)
}

// SetContainerResources sets container resources.
// Sets both the resource limits and requests as container runtimes honor
// them differently.
// `containerName` is optional if the pod has exactly one container.
func SetContainerResources(pod *v13.Pod, containerName string, requests ContainerResourcesRequest) (*v13.Pod, error) {
resourceList := v13.ResourceList{}
if requests.CPUResources != "" {
resourceList[v13.ResourceCPU] = resource.MustParse(requests.CPUResources)
}
if requests.MemoryResources != "" {
resourceList[v13.ResourceMemory] = resource.MustParse(requests.MemoryResources)
}

if requests.GPU {
acceleratorCount, ok := pod.Spec.NodeSelector[NodepoolNumAcceleratorsKey]
if !ok {
return nil, fmt.Errorf("cannot determine number of accelerators that the pod should use, make sure to call ConfigurePodForRuntimeTestNodepool first")
}
resourceList[v13.ResourceName("nvidia.com/gpu")] = resource.MustParse(acceleratorCount)
}

if requests.TPU {
acceleratorCount, ok := pod.Spec.NodeSelector[NodepoolTPUNumAcceleratorKey]
if !ok {
return nil, fmt.Errorf("cannot determine number of accelerators that the pod should use, make sure to call ConfigurePodForRuntimeTestNodepool first")
}
resourceList[v13.ResourceName("google.com/tpu")] = resource.MustParse(acceleratorCount)
}

requirements := v13.ResourceRequirements{
Limits: resourceList,
Requests: resourceList,
}

var containerToChange *v13.Container
if containerName == "" {
switch len(pod.Spec.Containers) {
Expand All @@ -252,7 +219,25 @@ func SetContainerResources(pod *v13.Pod, containerName string, requests Containe
if containerToChange == nil {
return nil, fmt.Errorf("container %q not found", containerName)
}
containerToChange.Resources = requirements
for _, resourceList := range []v13.ResourceList{
containerToChange.Resources.Limits,
containerToChange.Resources.Requests,
} {
if requests.GPU {
acceleratorCount, ok := pod.Spec.NodeSelector[NodepoolNumAcceleratorsKey]
if !ok {
return nil, fmt.Errorf("cannot determine number of accelerators that the pod should use, make sure to call ConfigurePodForRuntimeTestNodepool first")
}
resourceList[v13.ResourceName("nvidia.com/gpu")] = resource.MustParse(acceleratorCount)
}
if requests.TPU {
acceleratorCount, ok := pod.Spec.NodeSelector[NodepoolTPUNumAcceleratorKey]
if !ok {
return nil, fmt.Errorf("cannot determine number of accelerators that the pod should use, make sure to call ConfigurePodForRuntimeTestNodepool first")
}
resourceList[v13.ResourceName("google.com/tpu")] = resource.MustParse(acceleratorCount)
}
}
return pod, nil
}

Expand Down
55 changes: 38 additions & 17 deletions test/kubernetes/testcluster/testcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,17 +166,36 @@ type MachineInfo struct {

// IsVirtual is whether the machine type is a virtual machine.
IsVirtual bool

// MaxPodCores is the maximum number of cores to set in a pod.
// If set to 0, but the runtime requires explicit limits, the
// default value is based on `defaultMaxResourceUtilization`.
MaxPodCores int

// MaxPodMemoryGiB is the maximum amount of memory in GiB to set in a pod.
// If set to 0, but the runtime requires explicit limits, the
// default value is based on `defaultMaxResourceUtilization`.
MaxPodMemoryGiB int
}

// defaultMaxResourceUtilization is the default maximum resource utilization for a machine type.
const defaultMaxResourceUtilization = 0.8

// KnownMachineTypes is a map of known GCE machine types to their info.
var KnownMachineTypes = map[string]*MachineInfo{
"n1-standard-4": {NumCores: 4, MemoryGiB: 15, IsVirtual: true},
"n2-standard-4": {NumCores: 4, MemoryGiB: 16, IsVirtual: true},
"n2-standard-8": {NumCores: 8, MemoryGiB: 32, IsVirtual: true},
"n2d-standard-8": {NumCores: 8, MemoryGiB: 32, IsVirtual: true},
"g2-standard-8": {NumCores: 8, MemoryGiB: 32, IsVirtual: true},
"ct4p-hightpu-4t": {NumCores: 240, MemoryGiB: 407, IsVirtual: true},
"c3-standard-192-metal": {NumCores: 192, MemoryGiB: 768, IsVirtual: false},
"n1-standard-4": {NumCores: 4, MemoryGiB: 15, IsVirtual: true},
"n2-standard-4": {NumCores: 4, MemoryGiB: 16, IsVirtual: true},
"n2-standard-8": {NumCores: 8, MemoryGiB: 32, IsVirtual: true},
"n2d-standard-8": {NumCores: 8, MemoryGiB: 32, IsVirtual: true},
"g2-standard-8": {NumCores: 8, MemoryGiB: 32, IsVirtual: true},
"ct4p-hightpu-4t": {NumCores: 240, MemoryGiB: 407, IsVirtual: true},
"c3-standard-192-metal": {
NumCores: 192,
MemoryGiB: 768,
IsVirtual: false,
MaxPodCores: 144,
MaxPodMemoryGiB: 64, // More than this causes "Large hotplug" when using ACPI hotplugging.
},
}

// TestCluster wraps clusters with their individual ClientSets so that helper methods can be called.
Expand Down Expand Up @@ -730,17 +749,19 @@ func (t *TestCluster) applyCommonPodConfigurations(ctx context.Context, np *Node
// Apply the runtime we've chosen, whether by override or autodetection.
applyRuntime.ApplyPodSpec(podSpec)
if applyRuntime.RequiresExplicitResourceLimits() {
const (
overheadMargin = 0.2
leftoverRatio = 1.0 - overheadMargin
)
cores := int(float64(np.spec.NumCores) * leftoverRatio)
if cores < 1 {
cores = 1
targetCores := np.spec.MaxPodCores
if targetCores == 0 {
targetCores = int(float64(np.spec.NumCores) * defaultMaxResourceUtilization)
}
if targetCores < 1 {
targetCores = 1
}
targetMemoryMiB := np.spec.MaxPodMemoryGiB * 1024
if targetMemoryMiB == 0 {
targetMemoryMiB = int(float64(np.spec.MemoryGiB*1024) * defaultMaxResourceUtilization)
}
memMiB := int(float64(np.spec.MemoryGiB) * 1024 * leftoverRatio)
resCPU := resource.MustParse(fmt.Sprintf("%d", cores))
resMem := resource.MustParse(fmt.Sprintf("%dMi", memMiB))
resCPU := resource.MustParse(fmt.Sprintf("%d", targetCores))
resMem := resource.MustParse(fmt.Sprintf("%dMi", targetMemoryMiB))
for _, containers := range [][]v13.Container{
podSpec.InitContainers,
podSpec.Containers,
Expand Down
Loading