Update KFTO test to read the Alpaca dataset from the Alpaca dataset image instead of downloading it from Hugging Face

abhijeet-dhumal · openshift-merge-bot[bot] · commit 9bd695948f5a · 2024-11-29T08:38:33.000Z
diff --git a/tests/kfto/core/hf_llm_training.py b/tests/kfto/core/hf_llm_training.py
@@ -21,7 +21,7 @@
 import json
 import os
 
-from datasets import load_from_disk, Dataset
+from datasets import load_dataset, Dataset
 from datasets.distributed import split_dataset_by_node
 from peft import LoraConfig, get_peft_model
 import transformers
@@ -71,28 +71,26 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
     return model, tokenizer
 
 # This function is a modified version of the original.
-def load_and_preprocess_data(dataset_dir, transformer_type, tokenizer):
+def load_and_preprocess_data(dataset_file, transformer_type, tokenizer):
     # Load and preprocess the dataset
     logger.info("Load and preprocess dataset")
 
-    file_path = os.path.realpath(dataset_dir)
+    file_path = os.path.realpath(dataset_file)
 
-    if transformer_type != AutoModelForImageClassification:
-        dataset = load_from_disk(file_path)
+    dataset=load_dataset('json',data_files=file_path)
 
+    if transformer_type != AutoModelForImageClassification:
         logger.info(f"Dataset specification: {dataset}")
         logger.info("-" * 40)
 
         logger.info("Tokenize dataset")
         # TODO (andreyvelich): Discuss how user should set the tokenizer function.
         num_cores = os.cpu_count()
         dataset = dataset.map(
-            lambda x: tokenizer(x["text"], padding=True, truncation=True, max_length=128),
+            lambda x: tokenizer(x["output"], padding=True, truncation=True, max_length=128),
             batched=True,
             num_proc=num_cores
         )
-    else:
-        dataset = load_from_disk(file_path)
 
     # Check if dataset contains `train` key. Otherwise, load full dataset to train_data.
     if "train" in dataset:
@@ -175,7 +173,7 @@ def parse_arguments():
     parser.add_argument("--model_uri", help="model uri")
     parser.add_argument("--transformer_type", help="model transformer type")
     parser.add_argument("--model_dir", help="directory containing model")
-    parser.add_argument("--dataset_dir", help="directory containing dataset")
+    parser.add_argument("--dataset_file", help="dataset file path")
     parser.add_argument("--lora_config", help="lora_config")
     parser.add_argument(
         "--training_parameters", help="hugging face training parameters"
@@ -197,7 +195,7 @@ def parse_arguments():
 
     logger.info("Preprocess dataset")
     train_data, eval_data = load_and_preprocess_data(
-        args.dataset_dir, transformer_type, tokenizer
+        args.dataset_file, transformer_type, tokenizer
     )
 
     logger.info("Setup LoRA config for model")
diff --git a/tests/kfto/core/kfto_training_test.go b/tests/kfto/core/kfto_training_test.go
@@ -106,26 +106,16 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 								},
 								{
 									Name:            "copy-dataset",
-									Image:           "registry.access.redhat.com/ubi9/python-311:9.5-1730564330",
+									Image:           GetAlpacaDatasetImage(),
 									ImagePullPolicy: corev1.PullIfNotPresent,
 									VolumeMounts: []corev1.VolumeMount{
 										{
 											Name:      "tmp-volume",
 											MountPath: "/tmp",
 										},
 									},
-									Command: []string{
-										"/bin/sh",
-										"-c",
-										`pip install --target /tmp/.local datasets && \
-									HF_HOME=/tmp/.cache PYTHONPATH=/tmp/.local python -c "from datasets import load_dataset; dataset = load_dataset('tatsu-lab/alpaca', split='train[:100]'); dataset.save_to_disk('/tmp/dataset')"`,
-									},
-									Env: []corev1.EnvVar{
-										{
-											Name:  "HF_HOME",
-											Value: "/tmp/.cache",
-										},
-									},
+									Command: []string{"/bin/sh", "-c"},
+									Args:    []string{"mkdir /tmp/all_datasets; cp -r /dataset/* /tmp/all_datasets;ls /tmp/all_datasets"},
 								},
 							},
 							Containers: []corev1.Container{
@@ -138,7 +128,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 										`python /etc/config/hf_llm_training.py \
 										--model_uri /tmp/model/bloom-560m \
 										--model_dir /tmp/model/bloom-560m \
-										--dataset_dir /tmp/dataset \
+										--dataset_file /tmp/all_datasets/alpaca_data_hundredth.json \
 										--transformer_type AutoModelForCausalLM \
 										--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch"}' \
 										--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,