Skip to content

Commit 5b1258f

Browse files
sutaakar authored and openshift-merge-bot[bot] committed
Specify Worker replicas in PyTorchJob only for existing workers
1 parent 8bbd0c4 commit 5b1258f

File tree

1 file changed

+121
-118
lines changed

1 file changed

+121
-118
lines changed

tests/kfto/kfto_training_test.go

Lines changed: 121 additions & 118 deletions
Original file line number | Diff line number | Diff line change
@@ -275,148 +275,151 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
275275
},
276276
},
277277
},
278-
kftov1.PyTorchJobReplicaTypeWorker: {
279-
Replicas: Ptr(int32(numberOfWorkerNodes)),
280-
RestartPolicy: "OnFailure",
281-
Template: corev1.PodTemplateSpec{
282-
ObjectMeta: metav1.ObjectMeta{
283-
Labels: map[string]string{
284-
"app": "kfto-llm",
285-
},
286-
},
287-
Spec: corev1.PodSpec{
288-
Affinity: &corev1.Affinity{
289-
PodAntiAffinity: &corev1.PodAntiAffinity{
290-
RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
291-
{
292-
LabelSelector: &metav1.LabelSelector{
293-
MatchLabels: map[string]string{
294-
"app": "kfto-llm",
295-
},
296-
},
297-
TopologyKey: "kubernetes.io/hostname",
278+
},
279+
},
280+
}
281+
// Declare the Worker replica spec separately: if it is declared with 0 replicas, the operator keeps creating and deleting worker pods.
282+
if numberOfWorkerNodes > 0 {
283+
tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeWorker] = &kftov1.ReplicaSpec{
284+
Replicas: Ptr(int32(numberOfWorkerNodes)),
285+
RestartPolicy: "OnFailure",
286+
Template: corev1.PodTemplateSpec{
287+
ObjectMeta: metav1.ObjectMeta{
288+
Labels: map[string]string{
289+
"app": "kfto-llm",
290+
},
291+
},
292+
Spec: corev1.PodSpec{
293+
Affinity: &corev1.Affinity{
294+
PodAntiAffinity: &corev1.PodAntiAffinity{
295+
RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
296+
{
297+
LabelSelector: &metav1.LabelSelector{
298+
MatchLabels: map[string]string{
299+
"app": "kfto-llm",
298300
},
299301
},
302+
TopologyKey: "kubernetes.io/hostname",
300303
},
301304
},
302-
Tolerations: []corev1.Toleration{
305+
},
306+
},
307+
Tolerations: []corev1.Toleration{
308+
{
309+
Key: gpu.ResourceLabel,
310+
Operator: corev1.TolerationOpExists,
311+
},
312+
},
313+
InitContainers: []corev1.Container{
314+
{
315+
Name: "copy-model",
316+
Image: GetBloomModelImage(),
317+
ImagePullPolicy: corev1.PullIfNotPresent,
318+
VolumeMounts: []corev1.VolumeMount{
303319
{
304-
Key: gpu.ResourceLabel,
305-
Operator: corev1.TolerationOpExists,
320+
Name: "tmp-volume",
321+
MountPath: "/tmp",
306322
},
307323
},
308-
InitContainers: []corev1.Container{
324+
Command: []string{"/bin/sh", "-c"},
325+
Args: []string{"mkdir /tmp/model; cp -r /models/bloom-560m /tmp/model"},
326+
},
327+
{
328+
Name: "copy-dataset",
329+
Image: GetAlpacaDatasetImage(),
330+
ImagePullPolicy: corev1.PullIfNotPresent,
331+
VolumeMounts: []corev1.VolumeMount{
309332
{
310-
Name: "copy-model",
311-
Image: GetBloomModelImage(),
312-
ImagePullPolicy: corev1.PullIfNotPresent,
313-
VolumeMounts: []corev1.VolumeMount{
314-
{
315-
Name: "tmp-volume",
316-
MountPath: "/tmp",
317-
},
318-
},
319-
Command: []string{"/bin/sh", "-c"},
320-
Args: []string{"mkdir /tmp/model; cp -r /models/bloom-560m /tmp/model"},
333+
Name: "tmp-volume",
334+
MountPath: "/tmp",
321335
},
336+
},
337+
Command: []string{"/bin/sh", "-c"},
338+
Args: []string{"mkdir /tmp/all_datasets; cp -r /dataset/* /tmp/all_datasets;ls /tmp/all_datasets"},
339+
},
340+
},
341+
Containers: []corev1.Container{
342+
{
343+
Name: "pytorch",
344+
Image: baseImage,
345+
ImagePullPolicy: corev1.PullIfNotPresent,
346+
Command: []string{
347+
"/bin/bash", "-c",
348+
`torchrun /etc/config/hf_llm_training.py \
349+
--model_uri /tmp/model/bloom-560m \
350+
--model_dir /tmp/model/bloom-560m \
351+
--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
352+
--transformer_type AutoModelForCausalLM \
353+
--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
354+
--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,
355+
},
356+
Env: []corev1.EnvVar{
322357
{
323-
Name: "copy-dataset",
324-
Image: GetAlpacaDatasetImage(),
325-
ImagePullPolicy: corev1.PullIfNotPresent,
326-
VolumeMounts: []corev1.VolumeMount{
327-
{
328-
Name: "tmp-volume",
329-
MountPath: "/tmp",
330-
},
331-
},
332-
Command: []string{"/bin/sh", "-c"},
333-
Args: []string{"mkdir /tmp/all_datasets; cp -r /dataset/* /tmp/all_datasets;ls /tmp/all_datasets"},
358+
Name: "HF_HOME",
359+
Value: "/tmp/.cache",
334360
},
335-
},
336-
Containers: []corev1.Container{
337361
{
338-
Name: "pytorch",
339-
Image: baseImage,
340-
ImagePullPolicy: corev1.PullIfNotPresent,
341-
Command: []string{
342-
"/bin/bash", "-c",
343-
`torchrun /etc/config/hf_llm_training.py \
344-
--model_uri /tmp/model/bloom-560m \
345-
--model_dir /tmp/model/bloom-560m \
346-
--dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
347-
--transformer_type AutoModelForCausalLM \
348-
--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
349-
--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,
350-
},
351-
Env: []corev1.EnvVar{
352-
{
353-
Name: "HF_HOME",
354-
Value: "/tmp/.cache",
355-
},
356-
{
357-
Name: "TRITON_CACHE_DIR",
358-
Value: "/tmp/.triton",
359-
},
360-
{
361-
Name: "TOKENIZERS_PARALLELISM",
362-
Value: "false",
363-
},
364-
{
365-
Name: "NCCL_DEBUG",
366-
Value: "INFO",
367-
},
368-
},
369-
VolumeMounts: []corev1.VolumeMount{
370-
{
371-
Name: "config-volume",
372-
MountPath: "/etc/config",
373-
},
374-
{
375-
Name: "tmp-volume",
376-
MountPath: "/tmp",
377-
},
378-
},
379-
Resources: corev1.ResourceRequirements{
380-
Requests: corev1.ResourceList{
381-
corev1.ResourceCPU: resource.MustParse("2"),
382-
corev1.ResourceMemory: resource.MustParse("8Gi"),
383-
corev1.ResourceName(gpu.ResourceLabel): resource.MustParse(fmt.Sprint(numGpus)),
384-
},
385-
Limits: corev1.ResourceList{
386-
corev1.ResourceCPU: resource.MustParse("2"),
387-
corev1.ResourceMemory: resource.MustParse("8Gi"),
388-
corev1.ResourceName(gpu.ResourceLabel): resource.MustParse(fmt.Sprint(numGpus)),
389-
},
390-
},
391-
SecurityContext: &corev1.SecurityContext{
392-
RunAsNonRoot: Ptr(true),
393-
ReadOnlyRootFilesystem: Ptr(true),
394-
},
362+
Name: "TRITON_CACHE_DIR",
363+
Value: "/tmp/.triton",
364+
},
365+
{
366+
Name: "TOKENIZERS_PARALLELISM",
367+
Value: "false",
368+
},
369+
{
370+
Name: "NCCL_DEBUG",
371+
Value: "INFO",
395372
},
396373
},
397-
Volumes: []corev1.Volume{
374+
VolumeMounts: []corev1.VolumeMount{
398375
{
399-
Name: "config-volume",
400-
VolumeSource: corev1.VolumeSource{
401-
ConfigMap: &corev1.ConfigMapVolumeSource{
402-
LocalObjectReference: corev1.LocalObjectReference{
403-
Name: config.Name,
404-
},
405-
},
406-
},
376+
Name: "config-volume",
377+
MountPath: "/etc/config",
407378
},
408379
{
409-
Name: "tmp-volume",
410-
VolumeSource: corev1.VolumeSource{
411-
EmptyDir: &corev1.EmptyDirVolumeSource{},
380+
Name: "tmp-volume",
381+
MountPath: "/tmp",
382+
},
383+
},
384+
Resources: corev1.ResourceRequirements{
385+
Requests: corev1.ResourceList{
386+
corev1.ResourceCPU: resource.MustParse("2"),
387+
corev1.ResourceMemory: resource.MustParse("8Gi"),
388+
corev1.ResourceName(gpu.ResourceLabel): resource.MustParse(fmt.Sprint(numGpus)),
389+
},
390+
Limits: corev1.ResourceList{
391+
corev1.ResourceCPU: resource.MustParse("2"),
392+
corev1.ResourceMemory: resource.MustParse("8Gi"),
393+
corev1.ResourceName(gpu.ResourceLabel): resource.MustParse(fmt.Sprint(numGpus)),
394+
},
395+
},
396+
SecurityContext: &corev1.SecurityContext{
397+
RunAsNonRoot: Ptr(true),
398+
ReadOnlyRootFilesystem: Ptr(true),
399+
},
400+
},
401+
},
402+
Volumes: []corev1.Volume{
403+
{
404+
Name: "config-volume",
405+
VolumeSource: corev1.VolumeSource{
406+
ConfigMap: &corev1.ConfigMapVolumeSource{
407+
LocalObjectReference: corev1.LocalObjectReference{
408+
Name: config.Name,
412409
},
413410
},
414411
},
415412
},
413+
{
414+
Name: "tmp-volume",
415+
VolumeSource: corev1.VolumeSource{
416+
EmptyDir: &corev1.EmptyDirVolumeSource{},
417+
},
418+
},
416419
},
417420
},
418421
},
419-
},
422+
}
420423
}
421424

422425
tuningJob, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})

0 commit comments

Comments
 (0)