@@ -275,148 +275,151 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
275275 },
276276 },
277277 },
278- kftov1 .PyTorchJobReplicaTypeWorker : {
279- Replicas : Ptr (int32 (numberOfWorkerNodes )),
280- RestartPolicy : "OnFailure" ,
281- Template : corev1.PodTemplateSpec {
282- ObjectMeta : metav1.ObjectMeta {
283- Labels : map [string ]string {
284- "app" : "kfto-llm" ,
285- },
286- },
287- Spec : corev1.PodSpec {
288- Affinity : & corev1.Affinity {
289- PodAntiAffinity : & corev1.PodAntiAffinity {
290- RequiredDuringSchedulingIgnoredDuringExecution : []corev1.PodAffinityTerm {
291- {
292- LabelSelector : & metav1.LabelSelector {
293- MatchLabels : map [string ]string {
294- "app" : "kfto-llm" ,
295- },
296- },
297- TopologyKey : "kubernetes.io/hostname" ,
278+ },
279+ },
280+ }
281+ // Declare the worker replica spec separately: if a worker replica is declared with 0 pods, the operator keeps creating and deleting worker pods.
282+ if numberOfWorkerNodes > 0 {
283+ tuningJob .Spec .PyTorchReplicaSpecs [kftov1 .PyTorchJobReplicaTypeWorker ] = & kftov1.ReplicaSpec {
284+ Replicas : Ptr (int32 (numberOfWorkerNodes )),
285+ RestartPolicy : "OnFailure" ,
286+ Template : corev1.PodTemplateSpec {
287+ ObjectMeta : metav1.ObjectMeta {
288+ Labels : map [string ]string {
289+ "app" : "kfto-llm" ,
290+ },
291+ },
292+ Spec : corev1.PodSpec {
293+ Affinity : & corev1.Affinity {
294+ PodAntiAffinity : & corev1.PodAntiAffinity {
295+ RequiredDuringSchedulingIgnoredDuringExecution : []corev1.PodAffinityTerm {
296+ {
297+ LabelSelector : & metav1.LabelSelector {
298+ MatchLabels : map [string ]string {
299+ "app" : "kfto-llm" ,
298300 },
299301 },
302+ TopologyKey : "kubernetes.io/hostname" ,
300303 },
301304 },
302- Tolerations : []corev1.Toleration {
305+ },
306+ },
307+ Tolerations : []corev1.Toleration {
308+ {
309+ Key : gpu .ResourceLabel ,
310+ Operator : corev1 .TolerationOpExists ,
311+ },
312+ },
313+ InitContainers : []corev1.Container {
314+ {
315+ Name : "copy-model" ,
316+ Image : GetBloomModelImage (),
317+ ImagePullPolicy : corev1 .PullIfNotPresent ,
318+ VolumeMounts : []corev1.VolumeMount {
303319 {
304- Key : gpu . ResourceLabel ,
305- Operator : corev1 . TolerationOpExists ,
320+ Name : "tmp-volume" ,
321+ MountPath : "/tmp" ,
306322 },
307323 },
308- InitContainers : []corev1.Container {
324+ Command : []string {"/bin/sh" , "-c" },
325+ Args : []string {"mkdir /tmp/model; cp -r /models/bloom-560m /tmp/model" },
326+ },
327+ {
328+ Name : "copy-dataset" ,
329+ Image : GetAlpacaDatasetImage (),
330+ ImagePullPolicy : corev1 .PullIfNotPresent ,
331+ VolumeMounts : []corev1.VolumeMount {
309332 {
310- Name : "copy-model" ,
311- Image : GetBloomModelImage (),
312- ImagePullPolicy : corev1 .PullIfNotPresent ,
313- VolumeMounts : []corev1.VolumeMount {
314- {
315- Name : "tmp-volume" ,
316- MountPath : "/tmp" ,
317- },
318- },
319- Command : []string {"/bin/sh" , "-c" },
320- Args : []string {"mkdir /tmp/model; cp -r /models/bloom-560m /tmp/model" },
333+ Name : "tmp-volume" ,
334+ MountPath : "/tmp" ,
321335 },
336+ },
337+ Command : []string {"/bin/sh" , "-c" },
338+ Args : []string {"mkdir /tmp/all_datasets; cp -r /dataset/* /tmp/all_datasets;ls /tmp/all_datasets" },
339+ },
340+ },
341+ Containers : []corev1.Container {
342+ {
343+ Name : "pytorch" ,
344+ Image : baseImage ,
345+ ImagePullPolicy : corev1 .PullIfNotPresent ,
346+ Command : []string {
347+ "/bin/bash" , "-c" ,
348+ `torchrun /etc/config/hf_llm_training.py \
349+ --model_uri /tmp/model/bloom-560m \
350+ --model_dir /tmp/model/bloom-560m \
351+ --dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
352+ --transformer_type AutoModelForCausalLM \
353+ --training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
354+ --lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'` ,
355+ },
356+ Env : []corev1.EnvVar {
322357 {
323- Name : "copy-dataset" ,
324- Image : GetAlpacaDatasetImage (),
325- ImagePullPolicy : corev1 .PullIfNotPresent ,
326- VolumeMounts : []corev1.VolumeMount {
327- {
328- Name : "tmp-volume" ,
329- MountPath : "/tmp" ,
330- },
331- },
332- Command : []string {"/bin/sh" , "-c" },
333- Args : []string {"mkdir /tmp/all_datasets; cp -r /dataset/* /tmp/all_datasets;ls /tmp/all_datasets" },
358+ Name : "HF_HOME" ,
359+ Value : "/tmp/.cache" ,
334360 },
335- },
336- Containers : []corev1.Container {
337361 {
338- Name : "pytorch" ,
339- Image : baseImage ,
340- ImagePullPolicy : corev1 .PullIfNotPresent ,
341- Command : []string {
342- "/bin/bash" , "-c" ,
343- `torchrun /etc/config/hf_llm_training.py \
344- --model_uri /tmp/model/bloom-560m \
345- --model_dir /tmp/model/bloom-560m \
346- --dataset_file /tmp/all_datasets/alpaca_data_tenth.json \
347- --transformer_type AutoModelForCausalLM \
348- --training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
349- --lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'` ,
350- },
351- Env : []corev1.EnvVar {
352- {
353- Name : "HF_HOME" ,
354- Value : "/tmp/.cache" ,
355- },
356- {
357- Name : "TRITON_CACHE_DIR" ,
358- Value : "/tmp/.triton" ,
359- },
360- {
361- Name : "TOKENIZERS_PARALLELISM" ,
362- Value : "false" ,
363- },
364- {
365- Name : "NCCL_DEBUG" ,
366- Value : "INFO" ,
367- },
368- },
369- VolumeMounts : []corev1.VolumeMount {
370- {
371- Name : "config-volume" ,
372- MountPath : "/etc/config" ,
373- },
374- {
375- Name : "tmp-volume" ,
376- MountPath : "/tmp" ,
377- },
378- },
379- Resources : corev1.ResourceRequirements {
380- Requests : corev1.ResourceList {
381- corev1 .ResourceCPU : resource .MustParse ("2" ),
382- corev1 .ResourceMemory : resource .MustParse ("8Gi" ),
383- corev1 .ResourceName (gpu .ResourceLabel ): resource .MustParse (fmt .Sprint (numGpus )),
384- },
385- Limits : corev1.ResourceList {
386- corev1 .ResourceCPU : resource .MustParse ("2" ),
387- corev1 .ResourceMemory : resource .MustParse ("8Gi" ),
388- corev1 .ResourceName (gpu .ResourceLabel ): resource .MustParse (fmt .Sprint (numGpus )),
389- },
390- },
391- SecurityContext : & corev1.SecurityContext {
392- RunAsNonRoot : Ptr (true ),
393- ReadOnlyRootFilesystem : Ptr (true ),
394- },
362+ Name : "TRITON_CACHE_DIR" ,
363+ Value : "/tmp/.triton" ,
364+ },
365+ {
366+ Name : "TOKENIZERS_PARALLELISM" ,
367+ Value : "false" ,
368+ },
369+ {
370+ Name : "NCCL_DEBUG" ,
371+ Value : "INFO" ,
395372 },
396373 },
397- Volumes : []corev1.Volume {
374+ VolumeMounts : []corev1.VolumeMount {
398375 {
399- Name : "config-volume" ,
400- VolumeSource : corev1.VolumeSource {
401- ConfigMap : & corev1.ConfigMapVolumeSource {
402- LocalObjectReference : corev1.LocalObjectReference {
403- Name : config .Name ,
404- },
405- },
406- },
376+ Name : "config-volume" ,
377+ MountPath : "/etc/config" ,
407378 },
408379 {
409- Name : "tmp-volume" ,
410- VolumeSource : corev1.VolumeSource {
411- EmptyDir : & corev1.EmptyDirVolumeSource {},
380+ Name : "tmp-volume" ,
381+ MountPath : "/tmp" ,
382+ },
383+ },
384+ Resources : corev1.ResourceRequirements {
385+ Requests : corev1.ResourceList {
386+ corev1 .ResourceCPU : resource .MustParse ("2" ),
387+ corev1 .ResourceMemory : resource .MustParse ("8Gi" ),
388+ corev1 .ResourceName (gpu .ResourceLabel ): resource .MustParse (fmt .Sprint (numGpus )),
389+ },
390+ Limits : corev1.ResourceList {
391+ corev1 .ResourceCPU : resource .MustParse ("2" ),
392+ corev1 .ResourceMemory : resource .MustParse ("8Gi" ),
393+ corev1 .ResourceName (gpu .ResourceLabel ): resource .MustParse (fmt .Sprint (numGpus )),
394+ },
395+ },
396+ SecurityContext : & corev1.SecurityContext {
397+ RunAsNonRoot : Ptr (true ),
398+ ReadOnlyRootFilesystem : Ptr (true ),
399+ },
400+ },
401+ },
402+ Volumes : []corev1.Volume {
403+ {
404+ Name : "config-volume" ,
405+ VolumeSource : corev1.VolumeSource {
406+ ConfigMap : & corev1.ConfigMapVolumeSource {
407+ LocalObjectReference : corev1.LocalObjectReference {
408+ Name : config .Name ,
412409 },
413410 },
414411 },
415412 },
413+ {
414+ Name : "tmp-volume" ,
415+ VolumeSource : corev1.VolumeSource {
416+ EmptyDir : & corev1.EmptyDirVolumeSource {},
417+ },
418+ },
416419 },
417420 },
418421 },
419- },
422+ }
420423 }
421424
422425 tuningJob , err := test .Client ().Kubeflow ().KubeflowV1 ().PyTorchJobs (namespace ).Create (test .Ctx (), tuningJob , metav1.CreateOptions {})
0 commit comments