Skip to content

Commit 02b6637

Browse files
sutaakar authored and openshift-merge-bot[bot] committed
Add support for Kueue in KFTO e2e test
1 parent 605a47c commit 02b6637

File tree

6 files changed

+123
-67
lines changed

6 files changed

+123
-67
lines changed

Makefile

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ setup-kueue: ## Set up Kueue for e2e tests.
66
kubectl apply --server-side -k "github.com/opendatahub-io/kueue/config/rhoai"
77
echo "Wait for Kueue deployment"
88
kubectl -n opendatahub wait --timeout=300s --for=condition=Available deployments --all
9-
echo "Creating Kueue Resources, to be removed once tests creates own Kueue resources"
10-
kubectl apply -f kueue-config.yaml
119

1210
.PHONY: setup-kfto
1311
setup-kfto: ## Set up Training operator for e2e tests.

go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@ toolchain go1.21.5
77
require (
88
github.com/kubeflow/training-operator v1.7.0
99
github.com/onsi/gomega v1.31.1
10-
github.com/project-codeflare/codeflare-common v0.0.0-20240422163521-380101642c8f
10+
github.com/project-codeflare/codeflare-common v0.0.0-20240430071721-f782f78e5bb8
1111
k8s.io/api v0.29.2
1212
k8s.io/apimachinery v0.29.2
13+
sigs.k8s.io/kueue v0.6.2
1314
)
1415

1516
require (
@@ -78,7 +79,6 @@ require (
7879
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
7980
sigs.k8s.io/controller-runtime v0.17.0 // indirect
8081
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
81-
sigs.k8s.io/kueue v0.6.2 // indirect
8282
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
8383
sigs.k8s.io/yaml v1.4.0 // indirect
8484
)

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -361,8 +361,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
361361
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
362362
github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
363363
github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
364-
github.com/project-codeflare/codeflare-common v0.0.0-20240422163521-380101642c8f h1:9Uron4ej4Tt5ULX5CMzjmPqIZu3q/m07d4jhbNSwdPY=
365-
github.com/project-codeflare/codeflare-common v0.0.0-20240422163521-380101642c8f/go.mod h1:tlPi2e1HZQuf7AAFc7keWdVUNcxV+Gfh6Ss4KAQs1O0=
364+
github.com/project-codeflare/codeflare-common v0.0.0-20240430071721-f782f78e5bb8 h1:qTGgufSQF2L5IaG3FSE7mqGCLKeb2XBGWHSaMrIo7Gk=
365+
github.com/project-codeflare/codeflare-common v0.0.0-20240430071721-f782f78e5bb8/go.mod h1:tlPi2e1HZQuf7AAFc7keWdVUNcxV+Gfh6Ss4KAQs1O0=
366366
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
367367
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
368368
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=

kueue-config.yaml

Lines changed: 0 additions & 28 deletions
This file was deleted.

tests/kfto/environment.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*
2+
Copyright 2024
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package kfto
18+
19+
import (
20+
"os"
21+
)
22+
23+
const (
24+
// The environment variable for FMS HF Tuning image to be tested
25+
fmsHfTuningImageEnvVar = "FMS_HF_TUNING_IMAGE"
26+
)
27+
28+
func GetFmsHfTuningImage() string {
29+
return lookupEnvOrDefault(fmsHfTuningImageEnvVar, "quay.io/modh/fms-hf-tuning:b71215c3ae202eab9da1d347f52b89feb3d0378c")
30+
}
31+
32+
func lookupEnvOrDefault(key, value string) string {
33+
if v, ok := os.LookupEnv(key); ok {
34+
return v
35+
}
36+
return value
37+
}

tests/kfto/kfto_kueue_sft_test.go

Lines changed: 82 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -19,30 +19,40 @@ package kfto
1919
import (
2020
"testing"
2121

22-
"github.com/onsi/gomega"
2322
. "github.com/onsi/gomega"
2423
. "github.com/project-codeflare/codeflare-common/support"
24+
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
2525

2626
corev1 "k8s.io/api/core/v1"
27+
"k8s.io/apimachinery/pkg/api/resource"
2728
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2829

2930
kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
3031
)
3132

32-
func PytorchJob(t Test, namespace, name string) func(g gomega.Gomega) *kftov1.PyTorchJob {
33-
return func(g gomega.Gomega) *kftov1.PyTorchJob {
33+
func PytorchJob(t Test, namespace, name string) func(g Gomega) *kftov1.PyTorchJob {
34+
return func(g Gomega) *kftov1.PyTorchJob {
3435
job, err := t.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Get(t.Ctx(), name, metav1.GetOptions{})
35-
g.Expect(err).NotTo(gomega.HaveOccurred())
36+
g.Expect(err).NotTo(HaveOccurred())
3637
return job
3738
}
3839
}
3940

40-
// s
41-
func PytorchJobCondition(job *kftov1.PyTorchJob) string {
42-
if len(job.Status.Conditions) == 0 {
43-
return ""
41+
func PytorchJobConditionRunning(job *kftov1.PyTorchJob) corev1.ConditionStatus {
42+
return PytorchJobCondition(job, kftov1.JobRunning)
43+
}
44+
45+
func PytorchJobConditionSucceeded(job *kftov1.PyTorchJob) corev1.ConditionStatus {
46+
return PytorchJobCondition(job, kftov1.JobSucceeded)
47+
}
48+
49+
func PytorchJobCondition(job *kftov1.PyTorchJob, conditionType kftov1.JobConditionType) corev1.ConditionStatus {
50+
for _, condition := range job.Status.Conditions {
51+
if condition.Type == conditionType {
52+
return condition.Status
53+
}
4454
}
45-
return job.Status.Conditions[len(job.Status.Conditions)-1].Reason
55+
return corev1.ConditionUnknown
4656
}
4757

4858
func TestPytorchjobWithSFTtrainer(t *testing.T) {
@@ -51,29 +61,45 @@ func TestPytorchjobWithSFTtrainer(t *testing.T) {
5161

5262
// Create a namespace
5363
namespace := test.NewTestNamespace()
54-
config := &corev1.ConfigMap{
55-
TypeMeta: metav1.TypeMeta{
56-
APIVersion: corev1.SchemeGroupVersion.String(),
57-
Kind: "ConfigMap",
58-
},
59-
ObjectMeta: metav1.ObjectMeta{
60-
Name: "my-config",
61-
Namespace: namespace.Name,
62-
Labels: map[string]string{
63-
"kueue.x-k8s.io/queue-name": "lq-trainer",
64+
65+
// Create a ConfigMap with training dataset and configuration
66+
configData := map[string][]byte{
67+
"config.json": ReadFile(test, "config.json"),
68+
"twitter_complaints_small.json": ReadFile(test, "twitter_complaints_small.json"),
69+
}
70+
config := CreateConfigMap(test, namespace.Name, configData)
71+
72+
// Create Kueue resources
73+
resourceFlavor := CreateKueueResourceFlavor(test, kueuev1beta1.ResourceFlavorSpec{})
74+
defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
75+
cqSpec := kueuev1beta1.ClusterQueueSpec{
76+
NamespaceSelector: &metav1.LabelSelector{},
77+
ResourceGroups: []kueuev1beta1.ResourceGroup{
78+
{
79+
CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory")},
80+
Flavors: []kueuev1beta1.FlavorQuotas{
81+
{
82+
Name: kueuev1beta1.ResourceFlavorReference(resourceFlavor.Name),
83+
Resources: []kueuev1beta1.ResourceQuota{
84+
{
85+
Name: corev1.ResourceCPU,
86+
NominalQuota: resource.MustParse("8"),
87+
},
88+
{
89+
Name: corev1.ResourceMemory,
90+
NominalQuota: resource.MustParse("12Gi"),
91+
},
92+
},
93+
},
94+
},
6495
},
6596
},
66-
BinaryData: map[string][]byte{
67-
"config.json": ReadFile(test, "config.json"),
68-
"twitter_complaints_small.json": ReadFile(test, "twitter_complaints_small.json"),
69-
},
70-
Immutable: Ptr(true),
7197
}
98+
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
99+
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
100+
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
72101

73-
config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{})
74-
test.Expect(err).NotTo(HaveOccurred())
75-
test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name)
76-
102+
// Run training PyTorch job
77103
tuningJob := &kftov1.PyTorchJob{
78104
TypeMeta: metav1.TypeMeta{
79105
APIVersion: corev1.SchemeGroupVersion.String(),
@@ -82,18 +108,21 @@ func TestPytorchjobWithSFTtrainer(t *testing.T) {
82108
ObjectMeta: metav1.ObjectMeta{
83109
Name: "kfto-sft",
84110
Namespace: namespace.Name,
111+
Labels: map[string]string{
112+
"kueue.x-k8s.io/queue-name": localQueue.Name,
113+
},
85114
},
86115
Spec: kftov1.PyTorchJobSpec{
87116
PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
88-
"Master": &kftov1.ReplicaSpec{
117+
"Master": {
89118
Replicas: Ptr(int32(1)),
90119
RestartPolicy: "Never",
91120
Template: corev1.PodTemplateSpec{
92121
Spec: corev1.PodSpec{
93122
Containers: []corev1.Container{
94123
{
95124
Name: "pytorch",
96-
Image: "quay.io/tedchang/sft-trainer:dev",
125+
Image: GetFmsHfTuningImage(),
97126
ImagePullPolicy: corev1.PullIfNotPresent,
98127
Command: []string{"python", "/app/launch_training.py"},
99128
Env: []corev1.EnvVar{
@@ -108,6 +137,12 @@ func TestPytorchjobWithSFTtrainer(t *testing.T) {
108137
MountPath: "/etc/config",
109138
},
110139
},
140+
Resources: corev1.ResourceRequirements{
141+
Requests: corev1.ResourceList{
142+
corev1.ResourceCPU: resource.MustParse("2"),
143+
corev1.ResourceMemory: resource.MustParse("5Gi"),
144+
},
145+
},
111146
},
112147
},
113148
Volumes: []corev1.Volume{
@@ -116,7 +151,7 @@ func TestPytorchjobWithSFTtrainer(t *testing.T) {
116151
VolumeSource: corev1.VolumeSource{
117152
ConfigMap: &corev1.ConfigMapVolumeSource{
118153
LocalObjectReference: corev1.LocalObjectReference{
119-
Name: "my-config",
154+
Name: config.Name,
120155
},
121156
Items: []corev1.KeyToPath{
122157
{
@@ -139,10 +174,24 @@ func TestPytorchjobWithSFTtrainer(t *testing.T) {
139174
},
140175
}
141176

142-
tuningJob, err = test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace.Name).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
177+
tuningJob, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace.Name).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
143178
test.Expect(err).NotTo(HaveOccurred())
144179
test.T().Logf("Created PytorchJob %s/%s successfully", tuningJob.Namespace, tuningJob.Name)
145180

146-
test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).Should(WithTransform(PytorchJobCondition, Equal("PyTorchJobSucceeded")))
181+
// Make sure the Kueue Workload is admitted
182+
test.Eventually(KueueWorkloads(test, namespace.Name), TestTimeoutLong).
183+
Should(
184+
And(
185+
HaveLen(1),
186+
ContainElement(WithTransform(KueueWorkloadAdmitted, BeTrueBecause("Workload failed to be admitted"))),
187+
),
188+
)
189+
190+
// Make sure the PyTorch job is running
191+
test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutShort).
192+
Should(WithTransform(PytorchJobConditionRunning, Equal(corev1.ConditionTrue)))
193+
194+
// Make sure the PyTorch job succeeds
195+
test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).Should(WithTransform(PytorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
147196
test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
148197
}

0 commit comments

Comments
 (0)