Skip to content

Commit b9b43e7

Browse files
Added provision to assert job status, updated HPO script for disconnected and removed local-queue parameter usage from cluster configuration to make it optional
1 parent 77edf83 commit b9b43e7

15 files changed

+720
-523
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ toolchain go1.21.5
77
require (
88
github.com/kubeflow/training-operator v1.7.0
99
github.com/onsi/gomega v1.31.1
10-
github.com/project-codeflare/codeflare-common v0.0.0-20240809123324-d44e319ba556
10+
github.com/project-codeflare/codeflare-common v0.0.0-20240827080155-9234d23ff47d
1111
github.com/prometheus/client_golang v1.18.0
1212
github.com/prometheus/common v0.45.0
1313
github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
363363
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
364364
github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
365365
github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
366-
github.com/project-codeflare/codeflare-common v0.0.0-20240809123324-d44e319ba556 h1:4SI3d63CNZ+7sKQ1JEqLmNzGSgVXqz3aT3+aDXRgo18=
367-
github.com/project-codeflare/codeflare-common v0.0.0-20240809123324-d44e319ba556/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
366+
github.com/project-codeflare/codeflare-common v0.0.0-20240827080155-9234d23ff47d h1:hbfF20rw/NHvXNXYLuxPjCnBS5Lotvt6rU0S9DLs0HU=
367+
github.com/project-codeflare/codeflare-common v0.0.0-20240827080155-9234d23ff47d/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
368368
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
369369
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
370370
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=

tests/kfto/core/kfto_kueue_sft_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
7979
}
8080
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
8181
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
82-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
82+
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
8383

8484
// Create training PyTorch job
8585
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)
@@ -143,7 +143,7 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
143143
}
144144
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
145145
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
146-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
146+
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
147147

148148
// Create first training PyTorch job
149149
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)

tests/kfto/upgrade/kfto_kueue_sft_upgrade_training_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ func TestSetupPytorchjob(t *testing.T) {
9999
clusterQueue, err = test.Client().Kueue().KueueV1beta1().ClusterQueues().Create(test.Ctx(), clusterQueue, metav1.CreateOptions{})
100100
test.Expect(err).NotTo(HaveOccurred())
101101

102-
localQueue := CreateKueueLocalQueue(test, namespaceName, clusterQueue.Name)
102+
localQueue := CreateKueueLocalQueue(test, namespaceName, clusterQueue.Name, AsDefaultQueue)
103103

104104
// Create training PyTorch job
105105
tuningJob := createPyTorchJob(test, namespaceName, localQueue.Name, *config)

tests/odh/mnist_ray_test.go

Lines changed: 47 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"bytes"
2121
"fmt"
2222
"testing"
23+
"time"
2324

2425
. "github.com/onsi/gomega"
2526
. "github.com/project-codeflare/codeflare-common/support"
@@ -77,11 +78,11 @@ func mnistRay(t *testing.T, numGpus int) {
7778
}
7879
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
7980
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
80-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
81+
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
8182

8283
// Test configuration
8384
jupyterNotebookConfigMapFileName := "mnist_ray_mini.ipynb"
84-
mnist := readMnistPy(test)
85+
mnist := readMnistScriptTemplate(test, "resources/mnist.py")
8586
if numGpus > 0 {
8687
mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"gpu\""), 1)
8788
} else {
@@ -91,7 +92,7 @@ func mnistRay(t *testing.T, numGpus int) {
9192
// MNIST Ray Notebook
9293
jupyterNotebookConfigMapFileName: ReadFile(test, "resources/mnist_ray_mini.ipynb"),
9394
"mnist.py": mnist,
94-
"requirements.txt": readRequirementsTxt(test),
95+
"requirements.txt": ReadFile(test, "resources/requirements.txt"),
9596
})
9697

9798
// Define the regular(non-admin) user
@@ -102,7 +103,7 @@ func mnistRay(t *testing.T, numGpus int) {
102103
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
103104

104105
// Create Notebook CR
105-
createNotebook(test, namespace, userToken, localQueue.Name, config.Name, jupyterNotebookConfigMapFileName, numGpus)
106+
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
106107

107108
// Gracefully cleanup Notebook
108109
defer func() {
@@ -111,7 +112,7 @@ func mnistRay(t *testing.T, numGpus int) {
111112
}()
112113

113114
// Make sure the RayCluster is created and running
114-
test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
115+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
115116
Should(
116117
And(
117118
HaveLen(1),
@@ -128,32 +129,53 @@ func mnistRay(t *testing.T, numGpus int) {
128129
),
129130
)
130131

131-
// Make sure the RayCluster finishes and is deleted
132-
test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
133-
Should(HaveLen(0))
134-
}
132+
time.Sleep(30 * time.Second)
135133

136-
func readRequirementsTxt(test Test) []byte {
137-
// Read the requirements.txt from resources and perform replacements for custom values using go template
138-
props := struct {
139-
PipIndexUrl string
140-
PipTrustedHost string
141-
}{
142-
PipIndexUrl: "--index " + string(GetPipIndexURL()),
143-
}
134+
// Fetch created raycluster
135+
rayClusterName := "mnisttest"
136+
rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Get(test.Ctx(), rayClusterName, metav1.GetOptions{})
137+
test.Expect(err).ToNot(HaveOccurred())
144138

145-
// Provide trusted host only if defined
146-
if len(GetPipTrustedHost()) > 0 {
147-
props.PipTrustedHost = "--trusted-host " + GetPipTrustedHost()
139+
// Initialise raycluster client to interact with raycluster to get rayjob details using REST-API
140+
dashboardUrl := GetDashboardUrl(test, namespace, rayCluster)
141+
rayClusterClientConfig := RayClusterClientConfig{Address: dashboardUrl.String(), Client: nil, InsecureSkipVerify: true}
142+
rayClient, err := NewRayClusterClient(rayClusterClientConfig, test.Config().BearerToken)
143+
if err != nil {
144+
test.T().Errorf("%s", err)
148145
}
149146

150-
template, err := files.ReadFile("resources/requirements.txt")
151-
test.Expect(err).NotTo(HaveOccurred())
147+
jobID := GetTestJobId(test, rayClient, dashboardUrl.Host)
148+
test.Expect(jobID).ToNot(Equal(nil))
149+
150+
// Wait for the job to be succeeded or failed
151+
var rayJobStatus string
152+
fmt.Printf("Waiting for job to be Succeeded...\n")
153+
test.Eventually(func() string {
154+
resp, err := rayClient.GetJobDetails(jobID)
155+
test.Expect(err).ToNot(HaveOccurred())
156+
rayJobStatusVal := resp.Status
157+
if rayJobStatusVal == "SUCCEEDED" || rayJobStatusVal == "FAILED" {
158+
fmt.Printf("JobStatus : %s\n", rayJobStatusVal)
159+
rayJobStatus = rayJobStatusVal
160+
return rayJobStatus
161+
}
162+
if rayJobStatus != rayJobStatusVal && rayJobStatusVal != "SUCCEEDED" {
163+
fmt.Printf("JobStatus : %s...\n", rayJobStatusVal)
164+
rayJobStatus = rayJobStatusVal
165+
}
166+
return rayJobStatus
167+
}, TestTimeoutDouble, 3*time.Second).Should(Or(Equal("SUCCEEDED"), Equal("FAILED")), "Job did not complete within the expected time")
168+
test.Expect(rayJobStatus).To(Equal("SUCCEEDED"), "RayJob failed !")
169+
170+
// Store job logs in output directory
171+
WriteRayJobAPILogs(test, rayClient, jobID)
152172

153-
return ParseTemplate(test, template, props)
173+
// Make sure the RayCluster finishes and is deleted
174+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
175+
Should(HaveLen(0))
154176
}
155177

156-
func readMnistPy(test Test) []byte {
178+
func readMnistScriptTemplate(test Test, filePath string) []byte {
157179
// Read the mnist.py from resources and perform replacements for custom values using go template
158180
storage_bucket_endpoint, storage_bucket_endpoint_exists := GetStorageBucketDefaultEndpoint()
159181
storage_bucket_access_key_id, storage_bucket_access_key_id_exists := GetStorageBucketAccessKeyId()
@@ -184,7 +206,7 @@ func readMnistPy(test Test) []byte {
184206
StorageBucketMnistDir: storage_bucket_mnist_dir,
185207
StorageBucketMnistDirExists: storage_bucket_mnist_dir_exists,
186208
}
187-
template, err := files.ReadFile("resources/mnist.py")
209+
template, err := files.ReadFile(filePath)
188210
test.Expect(err).NotTo(HaveOccurred())
189211

190212
return ParseTemplate(test, template, props)

tests/odh/mnist_raytune_hpo_test.go

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"bytes"
2121
"fmt"
2222
"testing"
23+
"time"
2324

2425
. "github.com/onsi/gomega"
2526
. "github.com/project-codeflare/codeflare-common/support"
@@ -76,11 +77,11 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
7677
}
7778
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
7879
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
79-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
80+
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
8081

8182
// Test configuration
8283
jupyterNotebookConfigMapFileName := "mnist_hpo_raytune.ipynb"
83-
mnist_hpo := ReadFile(test, "resources/mnist_hpo.py")
84+
mnist_hpo := readMnistScriptTemplate(test, "resources/mnist_hpo.py")
8485

8586
if numGpus > 0 {
8687
mnist_hpo = bytes.Replace(mnist_hpo, []byte("gpu_value=\"has to be specified\""), []byte("gpu_value=\"1\""), 1)
@@ -103,7 +104,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
103104
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
104105

105106
// Create Notebook CR
106-
createNotebook(test, namespace, userToken, localQueue.Name, config.Name, jupyterNotebookConfigMapFileName, numGpus)
107+
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
107108

108109
// Gracefully cleanup Notebook
109110
defer func() {
@@ -112,7 +113,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
112113
}()
113114

114115
// Make sure the RayCluster is created and running
115-
test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
116+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
116117
Should(
117118
And(
118119
HaveLen(1),
@@ -128,8 +129,48 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
128129
ContainElement(WithTransform(KueueWorkloadAdmitted, BeTrueBecause("Workload failed to be admitted"))),
129130
),
130131
)
132+
time.Sleep(30 * time.Second)
133+
134+
// Fetch created raycluster
135+
rayClusterName := "mnisthpotest"
136+
rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Get(test.Ctx(), rayClusterName, metav1.GetOptions{})
137+
test.Expect(err).ToNot(HaveOccurred())
138+
139+
// Initialise raycluster client to interact with raycluster to get rayjob details using REST-API
140+
dashboardUrl := GetDashboardUrl(test, namespace, rayCluster)
141+
rayClusterClientConfig := RayClusterClientConfig{Address: dashboardUrl.String(), Client: nil, InsecureSkipVerify: true}
142+
rayClient, err := NewRayClusterClient(rayClusterClientConfig, test.Config().BearerToken)
143+
if err != nil {
144+
test.T().Errorf("%s", err)
145+
}
146+
147+
jobID := GetTestJobId(test, rayClient, dashboardUrl.Host)
148+
test.Expect(jobID).ToNot(Equal(nil))
149+
150+
// Wait for the job to be succeeded or failed
151+
var rayJobStatus string
152+
fmt.Printf("Waiting for job to be Succeeded...\n")
153+
test.Eventually(func() string {
154+
resp, err := rayClient.GetJobDetails(jobID)
155+
test.Expect(err).ToNot(HaveOccurred())
156+
rayJobStatusVal := resp.Status
157+
if rayJobStatusVal == "SUCCEEDED" || rayJobStatusVal == "FAILED" {
158+
fmt.Printf("JobStatus : %s\n", rayJobStatusVal)
159+
rayJobStatus = rayJobStatusVal
160+
return rayJobStatus
161+
}
162+
if rayJobStatus != rayJobStatusVal && rayJobStatusVal != "SUCCEEDED" {
163+
fmt.Printf("JobStatus : %s...\n", rayJobStatusVal)
164+
rayJobStatus = rayJobStatusVal
165+
}
166+
return rayJobStatus
167+
}, TestTimeoutDouble, 3*time.Second).Should(Or(Equal("SUCCEEDED"), Equal("FAILED")), "Job did not complete within the expected time")
168+
test.Expect(rayJobStatus).To(Equal("SUCCEEDED"), "RayJob failed !")
169+
170+
// Store job logs in output directory
171+
WriteRayJobAPILogs(test, rayClient, jobID)
131172

132173
// Make sure the RayCluster finishes and is deleted
133-
test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
174+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
134175
Should(HaveLen(0))
135176
}

tests/odh/notebook.go

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,32 @@ type NotebookProps struct {
4141
OpenDataHubNamespace string
4242
RayImage string
4343
NotebookImage string
44-
LocalQueue string
4544
NotebookConfigMapName string
4645
NotebookConfigMapFileName string
4746
NotebookPVC string
4847
NumGpus int
48+
PipIndexUrl string
49+
PipTrustedHost string
50+
S3BucketName string
51+
S3AccessKeyId string
52+
S3SecretAccessKey string
53+
S3DefaultRegion string
4954
}
5055

51-
func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, localQueue, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int) {
56+
func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int) {
5257
// Create PVC for Notebook
5358
notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", corev1.ReadWriteOnce)
59+
s3BucketName, s3BucketNameExists := GetStorageBucketName()
60+
s3AccessKeyId, _ := GetStorageBucketAccessKeyId()
61+
s3SecretAccessKey, _ := GetStorageBucketSecretKey()
62+
s3DefaultRegion, _ := GetStorageBucketDefaultRegion()
63+
64+
if !s3BucketNameExists {
65+
s3BucketName = "''"
66+
s3AccessKeyId = "''"
67+
s3SecretAccessKey = "''"
68+
s3DefaultRegion = "''"
69+
}
5470

5571
// Read the Notebook CR from resources and perform replacements for custom values using go template
5672
notebookProps := NotebookProps{
@@ -61,11 +77,16 @@ func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, l
6177
OpenDataHubNamespace: GetOpenDataHubNamespace(test),
6278
RayImage: GetRayImage(),
6379
NotebookImage: GetNotebookImage(test),
64-
LocalQueue: localQueue,
6580
NotebookConfigMapName: jupyterNotebookConfigMapName,
6681
NotebookConfigMapFileName: jupyterNotebookConfigMapFileName,
6782
NotebookPVC: notebookPVC.Name,
6883
NumGpus: numGpus,
84+
S3BucketName: s3BucketName,
85+
S3AccessKeyId: s3AccessKeyId,
86+
S3SecretAccessKey: s3SecretAccessKey,
87+
S3DefaultRegion: s3DefaultRegion,
88+
PipIndexUrl: GetPipIndexURL(),
89+
PipTrustedHost: GetPipTrustedHost(),
6990
}
7091
notebookTemplate, err := files.ReadFile("resources/custom-nb-small.yaml")
7192
test.Expect(err).NotTo(gomega.HaveOccurred())

tests/odh/resources/custom-nb-small.yaml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,20 @@ spec:
5050
value: {{.NotebookImage}}
5151
- name: JUPYTER_NOTEBOOK_PORT
5252
value: "8888"
53+
- name: AWS_ACCESS_KEY_ID
54+
value: {{.S3AccessKeyId}}
55+
- name: AWS_SECRET_ACCESS_KEY
56+
value: {{.S3SecretAccessKey}}
57+
- name: AWS_DEFAULT_REGION
58+
value: {{.S3DefaultRegion}}
59+
- name: AWS_S3_BUCKET
60+
value: {{.S3BucketName}}
61+
- name: PIP_INDEX_URL
62+
value: {{.PipIndexUrl}}
63+
- name: PIP_TRUSTED_HOST
64+
value: {{.PipTrustedHost}}
5365
image: {{.NotebookImage}}
54-
command: ["/bin/sh", "-c", "pip install papermill && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} -p local_queue {{.LocalQueue}} -p openshift_api_url {{.OpenShiftApiUrl}} -p kubernetes_user_bearer_token {{.KubernetesUserBearerToken}} -p num_gpus {{ .NumGpus }} --log-output && sleep infinity"]
66+
command: ["/bin/sh", "-c", "pip install papermill && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} -p openshift_api_url {{.OpenShiftApiUrl}} -p kubernetes_user_bearer_token {{.KubernetesUserBearerToken}} -p num_gpus {{ .NumGpus }} --log-output && sleep infinity"]
5567
# args: ["pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/mcad.ipynb /opt/app-root/src/mcad-out.ipynb" ]
5668
imagePullPolicy: Always
5769
# livenessProbe:
@@ -158,4 +170,4 @@ spec:
158170
secretName: jupyter-nb-kube-3aadmin-tls
159171
- name: {{.NotebookConfigMapName}}
160172
configMap:
161-
name: {{.NotebookConfigMapName}}
173+
name: {{.NotebookConfigMapName}}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
torchvision==0.18.0
2+
minio

tests/odh/resources/mnist.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,11 +133,20 @@ def prepare_data(self):
133133
secret_key = "{{.StorageBucketSecretKey}}"
134134
bucket_name = "{{.StorageBucketName}}"
135135

136+
# remove prefix if specified in storage bucket endpoint url
137+
secure = True
138+
if endpoint.startswith("https://"):
139+
endpoint = endpoint[len("https://") :]
140+
elif endpoint.startswith("http://"):
141+
endpoint = endpoint[len("http://") :]
142+
secure = False
143+
136144
client = Minio(
137145
endpoint,
138146
access_key=access_key,
139147
secret_key=secret_key,
140148
cert_check=False,
149+
secure=secure
141150
)
142151

143152
if not os.path.exists(dataset_dir):

0 commit comments

Comments
 (0)