Skip to content

Commit debe387

Browse files
sutaakar authored and openshift-merge-bot[bot] committed
Add KFTO upgrade test for uninterrupted training
1 parent 296c1d6 commit debe387

File tree

2 files changed

+114
-0
lines changed

2 files changed

+114
-0
lines changed

tests/kfto/upgrade/kfto_kueue_sft_upgrade_test.go renamed to tests/kfto/upgrade/kfto_kueue_sft_upgrade_training_test.go

File renamed without changes.
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package kfto
18+
19+
import (
20+
"testing"
21+
22+
. "github.com/onsi/gomega"
23+
kftocore "github.com/opendatahub-io/distributed-workloads/tests/kfto/core"
24+
. "github.com/project-codeflare/codeflare-common/support"
25+
26+
corev1 "k8s.io/api/core/v1"
27+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28+
29+
kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
30+
)
31+
32+
// Fixed identifiers shared by the setup and verify halves of the sleep
// PyTorchJob upgrade scenario. Both tests refer to the same namespace and job
// by these names.
var (
	// sleepNamespaceName is the namespace that holds the sleep PyTorchJob.
	sleepNamespaceName = "test-kfto-upgrade-sleep"
	// sleepPyTorchJobName is the name given to the sleep PyTorchJob.
	sleepPyTorchJobName = "pytorch-upgrade-sleep"
)
36+
37+
func TestSetupSleepPytorchjob(t *testing.T) {
38+
test := With(t)
39+
40+
// Create a namespace
41+
namespace := &corev1.Namespace{
42+
ObjectMeta: metav1.ObjectMeta{
43+
Name: sleepNamespaceName,
44+
},
45+
}
46+
_, err := test.Client().Core().CoreV1().Namespaces().Create(test.Ctx(), namespace, metav1.CreateOptions{})
47+
test.Expect(err).NotTo(HaveOccurred())
48+
49+
// Create training PyTorch job
50+
createSleepPyTorchJob(test, sleepNamespaceName)
51+
52+
// Make sure the PyTorch job is running, waiting for Training operator upgrade
53+
test.Eventually(kftocore.PytorchJob(test, sleepNamespaceName, sleepPyTorchJobName), TestTimeoutShort).
54+
Should(WithTransform(kftocore.PytorchJobConditionRunning, Equal(corev1.ConditionTrue)))
55+
}
56+
57+
func TestVerifySleepPytorchjob(t *testing.T) {
58+
test := With(t)
59+
namespace := GetNamespaceWithName(test, sleepNamespaceName)
60+
61+
// Cleanup namespace in the end
62+
defer DeleteTestNamespace(test, namespace)
63+
64+
// PyTorch job should be still running
65+
test.Expect(kftocore.PytorchJob(test, sleepNamespaceName, sleepPyTorchJobName)(test)).
66+
Should(WithTransform(kftocore.PytorchJobConditionRunning, Equal(corev1.ConditionTrue)))
67+
68+
// Pod job should be running without restart
69+
test.Expect(GetPods(test, sleepNamespaceName, metav1.ListOptions{})).
70+
Should(
71+
And(
72+
HaveLen(1),
73+
ContainElement(WithTransform(sleepPodRestartCount, BeNumerically("==", 0))),
74+
),
75+
)
76+
}
77+
78+
func createSleepPyTorchJob(test Test, namespace string) *kftov1.PyTorchJob {
79+
tuningJob := &kftov1.PyTorchJob{
80+
ObjectMeta: metav1.ObjectMeta{
81+
Name: sleepPyTorchJobName,
82+
},
83+
Spec: kftov1.PyTorchJobSpec{
84+
PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
85+
"Master": {
86+
Replicas: Ptr(int32(1)),
87+
RestartPolicy: "OnFailure",
88+
Template: corev1.PodTemplateSpec{
89+
Spec: corev1.PodSpec{
90+
Containers: []corev1.Container{
91+
{
92+
Name: "pytorch",
93+
Image: "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0",
94+
ImagePullPolicy: corev1.PullIfNotPresent,
95+
Args: []string{"24h"},
96+
},
97+
},
98+
},
99+
},
100+
},
101+
},
102+
},
103+
}
104+
105+
tuningJob, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
106+
test.Expect(err).NotTo(HaveOccurred())
107+
test.T().Logf("Created PytorchJob %s/%s successfully", tuningJob.Namespace, tuningJob.Name)
108+
109+
return tuningJob
110+
}
111+
112+
func sleepPodRestartCount(pod corev1.Pod) int {
113+
return int(pod.Status.ContainerStatuses[0].RestartCount)
114+
}

0 commit comments

Comments
 (0)