Skip to content

Commit 781fa6c

Browse files
authored
fix: mpi flow and add resourceClaim (#3446) (#3844)
Signed-off-by: Rohan Varma <[email protected]>
1 parent 6c624f2 commit 781fa6c

File tree

11 files changed, with 380 additions and 14 deletions.

deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10181,6 +10181,26 @@ spec:
1018110181
Resources requested and limits for this component, including CPU, memory,
1018210182
GPUs/devices, and any runtime-specific resources.
1018310183
properties:
10184+
claims:
10185+
items:
10186+
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
10187+
properties:
10188+
name:
10189+
description: |-
10190+
Name must match the name of one entry in pod.spec.resourceClaims of
10191+
the Pod where this field is used. It makes that resource available
10192+
inside a container.
10193+
type: string
10194+
request:
10195+
description: |-
10196+
Request is the name chosen for a request in the referenced claim.
10197+
If empty, everything from the claim is made available, otherwise
10198+
only the result of this request.
10199+
type: string
10200+
required:
10201+
- name
10202+
type: object
10203+
type: array
1018410204
limits:
1018510205
properties:
1018610206
cpu:

deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10312,6 +10312,26 @@ spec:
1031210312
Resources requested and limits for this component, including CPU, memory,
1031310313
GPUs/devices, and any runtime-specific resources.
1031410314
properties:
10315+
claims:
10316+
items:
10317+
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
10318+
properties:
10319+
name:
10320+
description: |-
10321+
Name must match the name of one entry in pod.spec.resourceClaims of
10322+
the Pod where this field is used. It makes that resource available
10323+
inside a container.
10324+
type: string
10325+
request:
10326+
description: |-
10327+
Request is the name chosen for a request in the referenced claim.
10328+
If empty, everything from the claim is made available, otherwise
10329+
only the result of this request.
10330+
type: string
10331+
required:
10332+
- name
10333+
type: object
10334+
type: array
1031510335
limits:
1031610336
properties:
1031710337
cpu:

deploy/cloud/operator/api/dynamo/common/common.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ type ResourceItem struct {
3232
}
3333

3434
type Resources struct {
35-
Requests *ResourceItem `json:"requests,omitempty"`
36-
Limits *ResourceItem `json:"limits,omitempty"`
35+
Requests *ResourceItem `json:"requests,omitempty"`
36+
Limits *ResourceItem `json:"limits,omitempty"`
37+
Claims []corev1.ResourceClaim `json:"claims,omitempty"`
3738
}
3839

3940
type DeploymentTargetHPAConf struct {

deploy/cloud/operator/api/dynamo/common/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10181,6 +10181,26 @@ spec:
1018110181
Resources requested and limits for this component, including CPU, memory,
1018210182
GPUs/devices, and any runtime-specific resources.
1018310183
properties:
10184+
claims:
10185+
items:
10186+
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
10187+
properties:
10188+
name:
10189+
description: |-
10190+
Name must match the name of one entry in pod.spec.resourceClaims of
10191+
the Pod where this field is used. It makes that resource available
10192+
inside a container.
10193+
type: string
10194+
request:
10195+
description: |-
10196+
Request is the name chosen for a request in the referenced claim.
10197+
If empty, everything from the claim is made available, otherwise
10198+
only the result of this request.
10199+
type: string
10200+
required:
10201+
- name
10202+
type: object
10203+
type: array
1018410204
limits:
1018510205
properties:
1018610206
cpu:

deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10312,6 +10312,26 @@ spec:
1031210312
Resources requested and limits for this component, including CPU, memory,
1031310313
GPUs/devices, and any runtime-specific resources.
1031410314
properties:
10315+
claims:
10316+
items:
10317+
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
10318+
properties:
10319+
name:
10320+
description: |-
10321+
Name must match the name of one entry in pod.spec.resourceClaims of
10322+
the Pod where this field is used. It makes that resource available
10323+
inside a container.
10324+
type: string
10325+
request:
10326+
description: |-
10327+
Request is the name chosen for a request in the referenced claim.
10328+
If empty, everything from the claim is made available, otherwise
10329+
only the result of this request.
10330+
type: string
10331+
required:
10332+
- name
10333+
type: object
10334+
type: array
1031510335
limits:
1031610336
properties:
1031710337
cpu:

deploy/cloud/operator/internal/controller_common/resource.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,12 @@ func GetResourcesConfig(resources *common.Resources) (*corev1.ResourceRequiremen
468468
currentResources.Requests[corev1.ResourceName(k)] = q
469469
}
470470
}
471+
if resources.Claims != nil {
472+
if currentResources.Claims == nil {
473+
currentResources.Claims = make([]corev1.ResourceClaim, 0)
474+
}
475+
currentResources.Claims = append(currentResources.Claims, resources.Claims...)
476+
}
471477
return currentResources, nil
472478
}
473479

deploy/cloud/operator/internal/dynamo/backend_trtllm.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,12 +143,12 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
143143

144144
// Build mpirun command with explicit SSH configuration and environment variables
145145
// Wrap the entire command (trtllm-llmapi-launch + original command) in bash -c for proper shell interpretation
146-
wrappedCommand := fmt.Sprintf("bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch %s'", originalCommand)
146+
wrappedCommand := fmt.Sprintf("bash -c 'trtllm-llmapi-launch %s'", originalCommand)
147147

148148
// Generate environment variable flags for mpirun
149149
envVarsStr := generateEnvVarFlags(container.Env)
150150

151-
mpirunCmd := fmt.Sprintf("mpirun --oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args \"-p %d -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" %s %s",
151+
mpirunCmd := fmt.Sprintf("mpirun --allow-run-as-root --oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args \"-p %d -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" %s %s",
152152
totalGPUs,
153153
workerHosts,
154154
commonconsts.MpiRunSshPort,

0 commit comments

Comments
 (0)