Embed file in ConfigMap and create notebook

KPostOffice · openshift-merge-bot[bot] · commit 004214562e1e · 2025-02-06T08:38:03.000Z
Signed-off-by: Kevin <kpostlet@redhat.com>
diff --git a/tests/kfto/kfto_mnist_sdk_test.go b/tests/kfto/kfto_mnist_sdk_test.go
@@ -17,37 +17,65 @@ limitations under the License.
 package kfto
 
 import (
+	"strings"
 	"testing"
 	"time"
 
 	. "github.com/onsi/gomega"
 	. "github.com/project-codeflare/codeflare-common/support"
+
+	v1 "k8s.io/api/core/v1"
 )
 
 func TestMnistSDK(t *testing.T) {
 	test := With(t)
 
 	// Create a namespace
 	namespace := test.NewTestNamespace()
-
-	jupyterNotebookConfigMapFileName := "mnist_kfto.ipynb"
-	mnist := readMnistScriptTemplate(test, "resources/mnist.py")
-
-	jupyterNotebook := ReadFile(test, "resources/mnist_kfto.ipynb")
-	config := CreateConfigMap(test, namespace.Name, map[string][]byte{
-		jupyterNotebookConfigMapFileName: jupyterNotebook,
-		"mnist.py":                       mnist,
-	})
-
-	// Define the regular(non-admin) user
 	userName := GetNotebookUserName(test)
 	userToken := GetNotebookUserToken(test)
+	jupyterNotebookConfigMapFileName := "mnist_kfto.ipynb"
+	mnist := readMnistScriptTemplate(test, "resources/kfto_sdk_mnist.py")
 
 	// Create role binding with Namespace specific admin cluster role
 	CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
 
+	requiredChangesInNotebook := map[string]string{
+		"${api_url}":        GetOpenShiftApiUrl(test),
+		"${train_function}": "train_func_2",
+		"${password}":       userToken,
+		"${num_gpus}":       "2",
+		"${namespace}":      namespace.Name,
+	}
+	// Fill every ${...} placeholder of the notebook template with the values above
+	jupyterNotebook := string(ReadFile(test, "resources/mnist_kfto.ipynb"))
+	for oldValue, newValue := range requiredChangesInNotebook {
+		jupyterNotebook = strings.ReplaceAll(jupyterNotebook, oldValue, newValue)
+	}
+	// Store the rendered notebook and the training script in one ConfigMap
+	config := CreateConfigMap(test, namespace.Name, map[string][]byte{
+		jupyterNotebookConfigMapFileName: []byte(jupyterNotebook),
+		"kfto_sdk_mnist.py":              mnist,
+	})
+
 	// Create Notebook CR
 	createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, 0)
+
+	// Gracefully cleanup Notebook; a function is passed so the list is re-read on every poll
+	defer func() {
+		deleteNotebook(test, namespace)
+		test.Eventually(func() int { return len(listNotebooks(test, namespace)) }, TestTimeoutGpuProvisioning).Should(BeZero())
+	}()
+
+	// Make sure pytorch job is created (test-bound Eventually with an explicit timeout)
+	test.Eventually(PyTorchJob(test, namespace.Name, "pytorch-ddp"), TestTimeoutGpuProvisioning).
+		Should(WithTransform(PyTorchJobConditionRunning, Equal(v1.ConditionTrue)))
+
+	// Make sure that the job eventually succeeds
+	test.Eventually(PyTorchJob(test, namespace.Name, "pytorch-ddp"), TestTimeoutGpuProvisioning).
+		Should(WithTransform(PyTorchJobConditionSucceeded, Equal(v1.ConditionTrue)))
+
+	// TODO: write torch job logs?
 	time.Sleep(60 * time.Second)
 }
 
diff --git a/tests/kfto/notebook.go b/tests/kfto/notebook.go
@@ -125,3 +125,20 @@ func GetNotebookImage(t Test) string {
 	}
 	return notebook_image
 }
+
+// deleteNotebook deletes the notebookResource object named "jupyter-nb-kube-3aadmin" in namespace and fails the test if the call errors.
+func deleteNotebook(test Test, namespace *corev1.Namespace) {
+	test.Expect(test.Client().Dynamic().Resource(notebookResource).Namespace(namespace.Name).Delete(test.Ctx(), "jupyter-nb-kube-3aadmin", metav1.DeleteOptions{})).NotTo(gomega.HaveOccurred())
+}
+
+// listNotebooks returns a pointer to every notebookResource object listed in namespace; a List error fails the test.
+func listNotebooks(test Test, namespace *corev1.Namespace) []*unstructured.Unstructured {
+	ntbs, err := test.Client().Dynamic().Resource(notebookResource).Namespace(namespace.Name).List(test.Ctx(), metav1.ListOptions{})
+	test.Expect(err).NotTo(gomega.HaveOccurred())
+	// Point at the slice elements: the address of a range variable is one shared variable before Go 1.22.
+	ntbsp := make([]*unstructured.Unstructured, 0, len(ntbs.Items))
+	for i := range ntbs.Items {
+		ntbsp = append(ntbsp, &ntbs.Items[i])
+	}
+	return ntbsp
+}
diff --git a/tests/kfto/resources/kfto_sdk_mnist.py b/tests/kfto/resources/kfto_sdk_mnist.py
@@ -0,0 +1,140 @@
+def train_func():
+    """Train a small MLP on MNIST with DDP, on this rank's GPU when CUDA is available and on CPU otherwise."""
+    import os
+    import torch
+    import torch.distributed as dist
+    import torch.nn as nn
+    import torch.optim as optim
+    from torchvision import datasets, transforms
+    from torch.utils.data import DataLoader, DistributedSampler
+
+    # Initialize distributed process group; backend and device both follow the same CUDA check
+    use_cuda = torch.cuda.is_available()
+    dist.init_process_group(backend="nccl" if use_cuda else "gloo")
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+    local_rank = int(os.getenv("LOCAL_RANK", 0))
+    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
+    if use_cuda:
+        torch.cuda.set_device(device)
+
+    # Configuration
+    batch_size = 64
+    epochs = 5
+    learning_rate = 0.01
+
+    # Dataset and DataLoader
+    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
+    train_dataset = datasets.MNIST(root="/tmp/datasets/mnist", train=True, download=True, transform=transform)
+    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
+    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, sampler=train_sampler)
+
+    # Model, Loss, and Optimizer (DDP takes device_ids only for CUDA modules)
+    model = nn.Sequential(
+        nn.Flatten(),
+        nn.Linear(28 * 28, 128),
+        nn.ReLU(),
+        nn.Linear(128, 10)
+    ).to(device)
+    model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank] if use_cuda else None)
+    criterion = nn.CrossEntropyLoss().to(device)
+    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
+
+    # Training loop
+    for epoch in range(epochs):
+        train_sampler.set_epoch(epoch)  # without this every epoch reuses the same shuffle order
+        model.train()
+        epoch_loss = 0
+        for batch_idx, (data, target) in enumerate(train_loader):
+            data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)
+            optimizer.zero_grad()
+            output = model(data)
+            loss = criterion(output, target)
+            loss.backward()
+            optimizer.step()
+            epoch_loss += loss.item()
+        # Log epoch stats
+        print(f"Rank {rank} | Epoch {epoch + 1}/{epochs} | Loss: {epoch_loss / len(train_loader)}")
+    # Cleanup
+    dist.destroy_process_group()
+
+def train_func_2():
+    """Train a small CNN on FashionMNIST with DDP, on this rank's GPU when CUDA is available and on CPU otherwise."""
+    import os
+    import torch
+    import torch.nn.functional as F
+    from torch.utils.data import DistributedSampler
+    from torchvision import datasets, transforms
+    import torch.distributed as dist
+
+    # [1] Setup PyTorch DDP. Distributed environment will be set automatically by Training Operator.
+    use_cuda = torch.cuda.is_available()
+    dist.init_process_group(backend="nccl" if use_cuda else "gloo")
+    Distributor = torch.nn.parallel.DistributedDataParallel
+    local_rank = int(os.getenv("LOCAL_RANK", 0))
+    print(
+        "Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}".format(
+            dist.get_world_size(), dist.get_rank(), local_rank
+        )
+    )
+
+    # [2] Create PyTorch CNN Model.
+    class Net(torch.nn.Module):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.conv1 = torch.nn.Conv2d(1, 20, 5, 1)
+            self.conv2 = torch.nn.Conv2d(20, 50, 5, 1)
+            self.fc1 = torch.nn.Linear(4 * 4 * 50, 500)
+            self.fc2 = torch.nn.Linear(500, 10)
+
+        def forward(self, x):
+            x = F.relu(self.conv1(x))
+            x = F.max_pool2d(x, 2, 2)
+            x = F.relu(self.conv2(x))
+            x = F.max_pool2d(x, 2, 2)
+            x = x.view(-1, 4 * 4 * 50)
+            x = F.relu(self.fc1(x))
+            x = self.fc2(x)
+            return F.log_softmax(x, dim=1)
+
+    # [3] Attach model to this rank's device (GPU, or CPU alongside the gloo backend) and distributor.
+    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
+    model = Distributor(Net().to(device))
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
+
+    # [4] Setup FashionMNIST dataloader and distribute data across PyTorchJob workers.
+    dataset = datasets.FashionMNIST(
+        "./data",
+        download=True,
+        train=True,
+        transform=transforms.Compose([transforms.ToTensor()]),
+    )
+    train_loader = torch.utils.data.DataLoader(
+        dataset=dataset,
+        batch_size=128,
+        sampler=DistributedSampler(dataset),
+    )
+
+    # [5] Start model Training.
+    for epoch in range(3):
+        train_loader.sampler.set_epoch(epoch)  # without this every epoch reuses the same shuffle order
+        for batch_idx, (data, target) in enumerate(train_loader):
+            # Attach Tensors to the device.
+            data, target = data.to(device), target.to(device)
+            optimizer.zero_grad()
+            output = model(data)
+            loss = F.nll_loss(output, target)
+            loss.backward()
+            optimizer.step()
+            if batch_idx % 10 == 0 and dist.get_rank() == 0:
+                print(
+                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
+                        epoch,
+                        batch_idx * len(data),
+                        len(train_loader.dataset),
+                        100.0 * batch_idx / len(train_loader),
+                        loss.item(),
+                    )
+                )
+
+    dist.destroy_process_group()  # release the process group once training is done
diff --git a/tests/kfto/resources/mnist_kfto.ipynb b/tests/kfto/resources/mnist_kfto.ipynb