@@ -46,14 +46,14 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string, numGpus i
4646 test := With (t )
4747
4848 // Create a namespace
49- namespace := test . NewTestNamespace ( )
49+ namespace := GetOrCreateTestNamespace ( test )
5050
5151 // Create a ConfigMap with training dataset and configuration
5252 configData := map [string ][]byte {
5353 "config.json" : ReadFile (test , modelConfigFile ),
5454 "twitter_complaints_small.json" : ReadFile (test , "twitter_complaints_small.json" ),
5555 }
56- config := CreateConfigMap (test , namespace . Name , configData )
56+ config := CreateConfigMap (test , namespace , configData )
5757
5858 // Create Kueue resources
5959 resourceFlavor := CreateKueueResourceFlavor (test , kueuev1beta1.ResourceFlavorSpec {})
@@ -87,13 +87,19 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string, numGpus i
8787 }
8888 clusterQueue := CreateKueueClusterQueue (test , cqSpec )
8989 defer test .Client ().Kueue ().KueueV1beta1 ().ClusterQueues ().Delete (test .Ctx (), clusterQueue .Name , metav1.DeleteOptions {})
90- localQueue := CreateKueueLocalQueue (test , namespace .Name , clusterQueue .Name , AsDefaultQueue )
90+ localQueue := CreateKueueLocalQueue (test , namespace , clusterQueue .Name , AsDefaultQueue )
91+ defer test .Client ().Kueue ().KueueV1beta1 ().LocalQueues (namespace ).Delete (test .Ctx (), localQueue .Name , metav1.DeleteOptions {})
92+
93+ // Create PVC for trained model
94+ outputPvc := CreatePersistentVolumeClaim (test , namespace , "10Gi" , corev1 .ReadWriteOnce )
95+ defer test .Client ().Core ().CoreV1 ().PersistentVolumeClaims (namespace ).Delete (test .Ctx (), outputPvc .Name , metav1.DeleteOptions {})
9196
9297 // Create training PyTorch job
93- tuningJob := createPyTorchJob (test , namespace .Name , localQueue .Name , * config , numGpus )
98+ tuningJob := createPyTorchJob (test , namespace , localQueue .Name , * config , numGpus , outputPvc .Name )
99+ defer test .Client ().Kubeflow ().KubeflowV1 ().PyTorchJobs (namespace ).Delete (test .Ctx (), tuningJob .Name , * metav1 .NewDeleteOptions (0 ))
94100
95101 // Make sure the Kueue Workload is admitted
96- test .Eventually (KueueWorkloads (test , namespace . Name ), TestTimeoutLong ).
102+ test .Eventually (KueueWorkloads (test , namespace ), TestTimeoutLong ).
97103 Should (
98104 And (
99105 HaveLen (1 ),
@@ -102,26 +108,31 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string, numGpus i
102108 )
103109
104110 // Make sure the PyTorch job is running
105- test .Eventually (PytorchJob (test , namespace . Name , tuningJob .Name ), TestTimeoutLong ).
111+ test .Eventually (PytorchJob (test , namespace , tuningJob .Name ), TestTimeoutLong ).
106112 Should (WithTransform (PytorchJobConditionRunning , Equal (corev1 .ConditionTrue )))
107113
108114 // Make sure the PyTorch job succeeds
109- test .Eventually (PytorchJob (test , namespace . Name , tuningJob .Name ), TestTimeoutMedium ).Should (WithTransform (PytorchJobConditionSucceeded , Equal (corev1 .ConditionTrue )))
115+ test .Eventually (PytorchJob (test , namespace , tuningJob .Name ), TestTimeoutMedium ).Should (WithTransform (PytorchJobConditionSucceeded , Equal (corev1 .ConditionTrue )))
110116 test .T ().Logf ("PytorchJob %s/%s ran successfully" , tuningJob .Namespace , tuningJob .Name )
117+
118+ _ , bucketEndpointSet := GetStorageBucketDefaultEndpoint ()
119+ if bucketEndpointSet {
120+ uploadToS3 (test , namespace , outputPvc .Name , "model" )
121+ }
111122}
112123
113124func TestPytorchjobUsingKueueQuota (t * testing.T ) {
114125 test := With (t )
115126
116127 // Create a namespace
117- namespace := test . NewTestNamespace ( )
128+ namespace := GetOrCreateTestNamespace ( test )
118129
119130 // Create a ConfigMap with training dataset and configuration
120131 configData := map [string ][]byte {
121132 "config.json" : ReadFile (test , "config.json" ),
122133 "twitter_complaints_small.json" : ReadFile (test , "twitter_complaints_small.json" ),
123134 }
124- config := CreateConfigMap (test , namespace . Name , configData )
135+ config := CreateConfigMap (test , namespace , configData )
125136
126137 // Create limited Kueue resources to run just one Pytorchjob at a time
127138 resourceFlavor := CreateKueueResourceFlavor (test , kueuev1beta1.ResourceFlavorSpec {})
@@ -151,36 +162,44 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
151162 }
152163 clusterQueue := CreateKueueClusterQueue (test , cqSpec )
153164 defer test .Client ().Kueue ().KueueV1beta1 ().ClusterQueues ().Delete (test .Ctx (), clusterQueue .Name , metav1.DeleteOptions {})
154- localQueue := CreateKueueLocalQueue (test , namespace .Name , clusterQueue .Name , AsDefaultQueue )
165+ localQueue := CreateKueueLocalQueue (test , namespace , clusterQueue .Name , AsDefaultQueue )
166+
167+ // Create first PVC for trained model
168+ outputPvc := CreatePersistentVolumeClaim (test , namespace , "10Gi" , corev1 .ReadWriteOnce )
169+ defer test .Client ().Core ().CoreV1 ().PersistentVolumeClaims (namespace ).Delete (test .Ctx (), outputPvc .Name , metav1.DeleteOptions {})
155170
156171 // Create first training PyTorch job
157- tuningJob := createPyTorchJob (test , namespace . Name , localQueue .Name , * config , 0 )
172+ tuningJob := createPyTorchJob (test , namespace , localQueue .Name , * config , 0 , outputPvc . Name )
158173
159174 // Make sure the PyTorch job is running
160- test .Eventually (PytorchJob (test , namespace . Name , tuningJob .Name ), TestTimeoutLong ).
175+ test .Eventually (PytorchJob (test , namespace , tuningJob .Name ), TestTimeoutLong ).
161176 Should (WithTransform (PytorchJobConditionRunning , Equal (corev1 .ConditionTrue )))
162177
178+ // Create second PVC for trained model; the deferred cleanup must delete this claim, not the first one
179+ secondOutputPvc := CreatePersistentVolumeClaim (test , namespace , "10Gi" , corev1 .ReadWriteOnce )
180+ defer test .Client ().Core ().CoreV1 ().PersistentVolumeClaims (namespace ).Delete (test .Ctx (), secondOutputPvc .Name , metav1.DeleteOptions {})
181+
163182 // Create second training PyTorch job
164- secondTuningJob := createPyTorchJob (test , namespace . Name , localQueue .Name , * config , 0 )
183+ secondTuningJob := createPyTorchJob (test , namespace , localQueue .Name , * config , 0 , secondOutputPvc . Name )
165184
166185 // Make sure the second PyTorch job is suspended, waiting for first job to finish
167- test .Eventually (PytorchJob (test , namespace . Name , secondTuningJob .Name ), TestTimeoutShort ).
186+ test .Eventually (PytorchJob (test , namespace , secondTuningJob .Name ), TestTimeoutShort ).
168187 Should (WithTransform (PytorchJobConditionSuspended , Equal (corev1 .ConditionTrue )))
169188
170189 // Make sure the first PyTorch job succeeds
171- test .Eventually (PytorchJob (test , namespace . Name , tuningJob .Name ), TestTimeoutLong ).Should (WithTransform (PytorchJobConditionSucceeded , Equal (corev1 .ConditionTrue )))
190+ test .Eventually (PytorchJob (test , namespace , tuningJob .Name ), TestTimeoutLong ).Should (WithTransform (PytorchJobConditionSucceeded , Equal (corev1 .ConditionTrue )))
172191 test .T ().Logf ("PytorchJob %s/%s ran successfully" , tuningJob .Namespace , tuningJob .Name )
173192
174193 // Second PyTorch job should be started now
175- test .Eventually (PytorchJob (test , namespace . Name , secondTuningJob .Name ), TestTimeoutShort ).
194+ test .Eventually (PytorchJob (test , namespace , secondTuningJob .Name ), TestTimeoutShort ).
176195 Should (WithTransform (PytorchJobConditionRunning , Equal (corev1 .ConditionTrue )))
177196
178197 // Make sure the second PyTorch job succeeds
179- test .Eventually (PytorchJob (test , namespace . Name , secondTuningJob .Name ), TestTimeoutLong ).Should (WithTransform (PytorchJobConditionSucceeded , Equal (corev1 .ConditionTrue )))
198+ test .Eventually (PytorchJob (test , namespace , secondTuningJob .Name ), TestTimeoutLong ).Should (WithTransform (PytorchJobConditionSucceeded , Equal (corev1 .ConditionTrue )))
180199 test .T ().Logf ("PytorchJob %s/%s ran successfully" , secondTuningJob .Namespace , secondTuningJob .Name )
181200}
182201
183- func createPyTorchJob (test Test , namespace , localQueueName string , config corev1.ConfigMap , numGpus int ) * kftov1.PyTorchJob {
202+ func createPyTorchJob (test Test , namespace , localQueueName string , config corev1.ConfigMap , numGpus int , outputPvcName string ) * kftov1.PyTorchJob {
184203 tuningJob := & kftov1.PyTorchJob {
185204 TypeMeta : metav1.TypeMeta {
186205 APIVersion : corev1 .SchemeGroupVersion .String (),
@@ -248,6 +267,10 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
248267 Name : "tmp-volume" ,
249268 MountPath : "/tmp" ,
250269 },
270+ {
271+ Name : "output-volume" ,
272+ MountPath : "/mnt/output" ,
273+ },
251274 },
252275 Resources : corev1.ResourceRequirements {
253276 Requests : corev1.ResourceList {
@@ -284,6 +307,14 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
284307 EmptyDir : & corev1.EmptyDirVolumeSource {},
285308 },
286309 },
310+ {
311+ Name : "output-volume" ,
312+ VolumeSource : corev1.VolumeSource {
313+ PersistentVolumeClaim : & corev1.PersistentVolumeClaimVolumeSource {
314+ ClaimName : outputPvcName ,
315+ },
316+ },
317+ },
287318 },
288319 },
289320 },
0 commit comments