Skip to content

Commit f0df855

Browse files
committed
Adjust KFTO PyTorchJob upgrade tests to be idempotent
1 parent f3697a9 commit f0df855

File tree

3 files changed

+81
-54
lines changed

3 files changed

+81
-54
lines changed

tests/kfto/core/support.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,14 @@ func PytorchJob(t Test, namespace, name string) func(g Gomega) *kftov1.PyTorchJo
4949
}
5050
}
5151

52+
func PytorchJobs(t Test, namespace string) func(g Gomega) []kftov1.PyTorchJob {
53+
return func(g Gomega) []kftov1.PyTorchJob {
54+
jobs, err := t.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).List(t.Ctx(), metav1.ListOptions{})
55+
g.Expect(err).NotTo(HaveOccurred())
56+
return jobs.Items
57+
}
58+
}
59+
5260
// PytorchJobConditionRunning returns the status that PytorchJobCondition
// reports for the JobRunning condition type on the given PyTorchJob.
func PytorchJobConditionRunning(job *kftov1.PyTorchJob) corev1.ConditionStatus {
	runningStatus := PytorchJobCondition(job, kftov1.JobRunning)
	return runningStatus
}

tests/kfto/upgrade/kfto_kueue_sft_upgrade_training_test.go

Lines changed: 59 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
kueueacv1beta1 "sigs.k8s.io/kueue/client-go/applyconfiguration/kueue/v1beta1"
2727

2828
corev1 "k8s.io/api/core/v1"
29+
"k8s.io/apimachinery/pkg/api/errors"
2930
"k8s.io/apimachinery/pkg/api/resource"
3031
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3132

@@ -42,14 +43,7 @@ var (
4243
func TestSetupPytorchjob(t *testing.T) {
4344
test := With(t)
4445

45-
// Create a namespace
46-
namespace := &corev1.Namespace{
47-
ObjectMeta: metav1.ObjectMeta{
48-
Name: namespaceName,
49-
},
50-
}
51-
_, err := test.Client().Core().CoreV1().Namespaces().Create(test.Ctx(), namespace, metav1.CreateOptions{})
52-
test.Expect(err).NotTo(HaveOccurred())
46+
createOrGetUpgradeTestNamespace(test, namespaceName)
5347

5448
// Create a ConfigMap with training dataset and configuration
5549
configData := map[string][]byte{
@@ -59,47 +53,31 @@ func TestSetupPytorchjob(t *testing.T) {
5953
config := CreateConfigMap(test, namespaceName, configData)
6054

6155
// Create Kueue resources
62-
resourceFlavor := &kueuev1beta1.ResourceFlavor{
63-
ObjectMeta: metav1.ObjectMeta{
64-
Name: resourceFlavorName,
65-
},
66-
}
67-
resourceFlavor, err = test.Client().Kueue().KueueV1beta1().ResourceFlavors().Create(test.Ctx(), resourceFlavor, metav1.CreateOptions{})
56+
resourceFlavor := kueueacv1beta1.ResourceFlavor(resourceFlavorName)
57+
_, err := test.Client().Kueue().KueueV1beta1().ResourceFlavors().Apply(test.Ctx(), resourceFlavor, metav1.ApplyOptions{FieldManager: "setup-PyTorchJob", Force: true})
6858
test.Expect(err).NotTo(HaveOccurred())
6959

70-
clusterQueue := &kueuev1beta1.ClusterQueue{
71-
ObjectMeta: metav1.ObjectMeta{
72-
Name: clusterQueueName,
73-
},
74-
Spec: kueuev1beta1.ClusterQueueSpec{
75-
NamespaceSelector: &metav1.LabelSelector{},
76-
ResourceGroups: []kueuev1beta1.ResourceGroup{
77-
{
78-
CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory")},
79-
Flavors: []kueuev1beta1.FlavorQuotas{
80-
{
81-
Name: kueuev1beta1.ResourceFlavorReference(resourceFlavor.Name),
82-
Resources: []kueuev1beta1.ResourceQuota{
83-
{
84-
Name: corev1.ResourceCPU,
85-
NominalQuota: resource.MustParse("8"),
86-
},
87-
{
88-
Name: corev1.ResourceMemory,
89-
NominalQuota: resource.MustParse("12Gi"),
90-
},
91-
},
92-
},
93-
},
94-
},
95-
},
96-
StopPolicy: Ptr(kueuev1beta1.Hold),
97-
},
98-
}
99-
clusterQueue, err = test.Client().Kueue().KueueV1beta1().ClusterQueues().Create(test.Ctx(), clusterQueue, metav1.CreateOptions{})
60+
clusterQueue := kueueacv1beta1.ClusterQueue(clusterQueueName).WithSpec(
61+
kueueacv1beta1.ClusterQueueSpec().
62+
WithNamespaceSelector(metav1.LabelSelector{}).
63+
WithResourceGroups(
64+
kueueacv1beta1.ResourceGroup().WithCoveredResources(
65+
corev1.ResourceName("cpu"), corev1.ResourceName("memory"),
66+
).WithFlavors(
67+
kueueacv1beta1.FlavorQuotas().
68+
WithName(kueuev1beta1.ResourceFlavorReference(resourceFlavorName)).
69+
WithResources(
70+
kueueacv1beta1.ResourceQuota().WithName(corev1.ResourceCPU).WithNominalQuota(resource.MustParse("8")),
71+
kueueacv1beta1.ResourceQuota().WithName(corev1.ResourceMemory).WithNominalQuota(resource.MustParse("12Gi")),
72+
),
73+
),
74+
).
75+
WithStopPolicy(kueuev1beta1.Hold),
76+
)
77+
_, err = test.Client().Kueue().KueueV1beta1().ClusterQueues().Apply(test.Ctx(), clusterQueue, metav1.ApplyOptions{FieldManager: "setup-PyTorchJob", Force: true})
10078
test.Expect(err).NotTo(HaveOccurred())
10179

102-
localQueue := CreateKueueLocalQueue(test, namespaceName, clusterQueue.Name, AsDefaultQueue)
80+
localQueue := CreateKueueLocalQueue(test, namespaceName, clusterQueueName, AsDefaultQueue)
10381

10482
// Create training PyTorch job
10583
tuningJob := createPyTorchJob(test, namespaceName, localQueue.Name, *config)
@@ -133,6 +111,17 @@ func TestRunPytorchjob(t *testing.T) {
133111
}
134112

135113
func createPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
114+
// Does PyTorchJob already exist?
115+
_, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Get(test.Ctx(), pyTorchJobName, metav1.GetOptions{})
116+
if err == nil {
117+
// If yes then delete it and wait until there are no PyTorchJobs in the namespace
118+
err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Delete(test.Ctx(), pyTorchJobName, metav1.DeleteOptions{})
119+
test.Expect(err).NotTo(HaveOccurred())
120+
test.Eventually(kftocore.PytorchJobs(test, namespace), TestTimeoutShort).Should(BeEmpty())
121+
} else if !errors.IsNotFound(err) {
122+
test.T().Fatalf("Error retrieving PyTorchJob with name `%s`: %v", pyTorchJobName, err)
123+
}
124+
136125
tuningJob := &kftov1.PyTorchJob{
137126
ObjectMeta: metav1.ObjectMeta{
138127
Name: pyTorchJobName,
@@ -186,6 +175,10 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
186175
Name: "tmp-volume",
187176
MountPath: "/tmp",
188177
},
178+
{
179+
Name: "output-volume",
180+
MountPath: "/mnt/output",
181+
},
189182
},
190183
Resources: corev1.ResourceRequirements{
191184
Requests: corev1.ResourceList{
@@ -226,6 +219,12 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
226219
EmptyDir: &corev1.EmptyDirVolumeSource{},
227220
},
228221
},
222+
{
223+
Name: "output-volume",
224+
VolumeSource: corev1.VolumeSource{
225+
EmptyDir: &corev1.EmptyDirVolumeSource{},
226+
},
227+
},
229228
},
230229
},
231230
},
@@ -234,9 +233,23 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
234233
},
235234
}
236235

237-
tuningJob, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
236+
tuningJob, err = test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
238237
test.Expect(err).NotTo(HaveOccurred())
239238
test.T().Logf("Created PytorchJob %s/%s successfully", tuningJob.Namespace, tuningJob.Name)
240239

241240
return tuningJob
242241
}
242+
243+
func createOrGetUpgradeTestNamespace(test Test, name string, options ...Option[*corev1.Namespace]) (namespace *corev1.Namespace) {
244+
// Verify that the namespace really exists and return it, create it if doesn't exist yet
245+
namespace, err := test.Client().Core().CoreV1().Namespaces().Get(test.Ctx(), name, metav1.GetOptions{})
246+
if err == nil {
247+
return
248+
} else if errors.IsNotFound(err) {
249+
test.T().Logf("%s namespace doesn't exists. Creating ...", name)
250+
return CreateTestNamespaceWithName(test, name, options...)
251+
} else {
252+
test.T().Fatalf("Error retrieving namespace with name `%s`: %v", name, err)
253+
}
254+
return
255+
}

tests/kfto/upgrade/kfto_sft_upgrade_sleep_test.go

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
. "github.com/project-codeflare/codeflare-common/support"
2525

2626
corev1 "k8s.io/api/core/v1"
27+
"k8s.io/apimachinery/pkg/api/errors"
2728
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2829

2930
kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
@@ -38,13 +39,7 @@ func TestSetupSleepPytorchjob(t *testing.T) {
3839
test := With(t)
3940

4041
// Create a namespace
41-
namespace := &corev1.Namespace{
42-
ObjectMeta: metav1.ObjectMeta{
43-
Name: sleepNamespaceName,
44-
},
45-
}
46-
_, err := test.Client().Core().CoreV1().Namespaces().Create(test.Ctx(), namespace, metav1.CreateOptions{})
47-
test.Expect(err).NotTo(HaveOccurred())
42+
createOrGetUpgradeTestNamespace(test, sleepNamespaceName)
4843

4944
// Create training PyTorch job
5045
createSleepPyTorchJob(test, sleepNamespaceName)
@@ -76,6 +71,17 @@ func TestVerifySleepPytorchjob(t *testing.T) {
7671
}
7772

7873
func createSleepPyTorchJob(test Test, namespace string) *kftov1.PyTorchJob {
74+
// Does PyTorchJob already exist?
75+
_, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Get(test.Ctx(), sleepPyTorchJobName, metav1.GetOptions{})
76+
if err == nil {
77+
// If yes then delete it and wait until there are no PyTorchJobs in the namespace
78+
err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Delete(test.Ctx(), sleepPyTorchJobName, metav1.DeleteOptions{})
79+
test.Expect(err).NotTo(HaveOccurred())
80+
test.Eventually(kftocore.PytorchJobs(test, namespace), TestTimeoutShort).Should(BeEmpty())
81+
} else if !errors.IsNotFound(err) {
82+
test.T().Fatalf("Error retrieving PyTorchJob with name `%s`: %v", sleepPyTorchJobName, err)
83+
}
84+
7985
tuningJob := &kftov1.PyTorchJob{
8086
ObjectMeta: metav1.ObjectMeta{
8187
Name: sleepPyTorchJobName,
@@ -102,7 +108,7 @@ func createSleepPyTorchJob(test Test, namespace string) *kftov1.PyTorchJob {
102108
},
103109
}
104110

105-
tuningJob, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
111+
tuningJob, err = test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
106112
test.Expect(err).NotTo(HaveOccurred())
107113
test.T().Logf("Created PytorchJob %s/%s successfully", tuningJob.Namespace, tuningJob.Name)
108114

0 commit comments

Comments
 (0)