Skip to content

Commit f39ebd6

Browse files
sutaakar authored and openshift-merge-bot[bot] committed
Implement upgrade test for Training operator
The test verifies that the Training operator is able to monitor a PyTorchJob created by the previous version.
1 parent f8b262d commit f39ebd6

File tree

8 files changed

+308
-73
lines changed

8 files changed

+308
-73
lines changed

.github/workflows/odh-release.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ jobs:
3838

3939
- name: Compile tests
4040
run: |
41-
go test -c -o compiled-tests/kfto ./tests/kfto/
41+
go test -c -o compiled-tests/kfto ./tests/kfto/core
42+
go test -c -o compiled-tests/kfto-upgrade ./tests/kfto/upgrade
4243
4344
- name: Creates a release in GitHub
4445
run: |
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package kfto
17+
package core
1818

1919
import (
2020
"os"
@@ -28,7 +28,7 @@ const (
2828
)
2929

3030
func GetFmsHfTuningImage() string {
31-
return lookupEnvOrDefault(fmsHfTuningImageEnvVar, "quay.io/modh/fms-hf-tuning:d0bd35b0297c28b87ee6caa32d5966d77587591f")
31+
return lookupEnvOrDefault(fmsHfTuningImageEnvVar, "quay.io/modh/fms-hf-tuning:bd8bf628cd739c7a201a976bc3c1096785353f1a")
3232
}
3333

3434
func GetBloomModelImage() string {
Lines changed: 3 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package kfto
17+
package core
1818

1919
import (
2020
"testing"
@@ -30,39 +30,6 @@ import (
3030
kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
3131
)
3232

33-
func PytorchJob(t Test, namespace, name string) func(g Gomega) *kftov1.PyTorchJob {
34-
return func(g Gomega) *kftov1.PyTorchJob {
35-
job, err := t.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Get(t.Ctx(), name, metav1.GetOptions{})
36-
g.Expect(err).NotTo(HaveOccurred())
37-
return job
38-
}
39-
}
40-
41-
func PytorchJobConditionRunning(job *kftov1.PyTorchJob) corev1.ConditionStatus {
42-
return PytorchJobCondition(job, kftov1.JobRunning)
43-
}
44-
45-
func PytorchJobConditionSucceeded(job *kftov1.PyTorchJob) corev1.ConditionStatus {
46-
return PytorchJobCondition(job, kftov1.JobSucceeded)
47-
}
48-
49-
func PytorchJobConditionSuspended(job *kftov1.PyTorchJob) corev1.ConditionStatus {
50-
return PytorchJobCondition(job, kftov1.JobSuspended)
51-
}
52-
53-
func PytorchJobCondition(job *kftov1.PyTorchJob, conditionType kftov1.JobConditionType) corev1.ConditionStatus {
54-
for _, condition := range job.Status.Conditions {
55-
if condition.Type == conditionType {
56-
return condition.Status
57-
}
58-
}
59-
return corev1.ConditionUnknown
60-
}
61-
62-
func OwnerReferenceName(meta metav1.Object) string {
63-
return meta.GetOwnerReferences()[0].Name
64-
}
65-
6633
func TestPytorchjobWithSFTtrainer(t *testing.T) {
6734
test := With(t)
6835

@@ -119,7 +86,7 @@ func TestPytorchjobWithSFTtrainer(t *testing.T) {
11986
)
12087

12188
// Make sure the PyTorch job is running
122-
test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutShort).
89+
test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).
12390
Should(WithTransform(PytorchJobConditionRunning, Equal(corev1.ConditionTrue)))
12491

12592
// Make sure the PyTorch job succeed
@@ -174,7 +141,7 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
174141
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)
175142

176143
// Make sure the PyTorch job is running
177-
test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutShort).
144+
test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).
178145
Should(WithTransform(PytorchJobConditionRunning, Equal(corev1.ConditionTrue)))
179146

180147
// Create second training PyTorch job

tests/kfto/core/support.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
Copyright 2023.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package core
18+
19+
import (
20+
"embed"
21+
22+
. "github.com/onsi/gomega"
23+
. "github.com/project-codeflare/codeflare-common/support"
24+
25+
corev1 "k8s.io/api/core/v1"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
28+
kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
29+
)
30+
31+
//go:embed *.json
32+
var files embed.FS
33+
34+
func ReadFile(t Test, fileName string) []byte {
35+
t.T().Helper()
36+
file, err := files.ReadFile(fileName)
37+
t.Expect(err).NotTo(HaveOccurred())
38+
return file
39+
}
40+
41+
func PytorchJob(t Test, namespace, name string) func(g Gomega) *kftov1.PyTorchJob {
42+
return func(g Gomega) *kftov1.PyTorchJob {
43+
job, err := t.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Get(t.Ctx(), name, metav1.GetOptions{})
44+
g.Expect(err).NotTo(HaveOccurred())
45+
return job
46+
}
47+
}
48+
49+
func PytorchJobConditionRunning(job *kftov1.PyTorchJob) corev1.ConditionStatus {
50+
return PytorchJobCondition(job, kftov1.JobRunning)
51+
}
52+
53+
func PytorchJobConditionSucceeded(job *kftov1.PyTorchJob) corev1.ConditionStatus {
54+
return PytorchJobCondition(job, kftov1.JobSucceeded)
55+
}
56+
57+
func PytorchJobConditionSuspended(job *kftov1.PyTorchJob) corev1.ConditionStatus {
58+
return PytorchJobCondition(job, kftov1.JobSuspended)
59+
}
60+
61+
func PytorchJobCondition(job *kftov1.PyTorchJob, conditionType kftov1.JobConditionType) corev1.ConditionStatus {
62+
for _, condition := range job.Status.Conditions {
63+
if condition.Type == conditionType {
64+
return condition.Status
65+
}
66+
}
67+
return corev1.ConditionUnknown
68+
}
File renamed without changes.

tests/kfto/support.go

Lines changed: 0 additions & 34 deletions
This file was deleted.

0 commit comments

Comments
 (0)