Skip to content

Commit 095c68e

Browse files
sutaakar and openshift-merge-bot[bot]
authored and committed
Add multiGPU finetuning tests for granite, llama and mixtral models
1 parent d82f293 commit 095c68e

9 files changed

+210
-104
lines changed

go.mod

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@ toolchain go1.21.5
77
require (
88
github.com/kubeflow/training-operator v1.7.0
99
github.com/onsi/gomega v1.31.1
10-
github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7
10+
github.com/openshift/api v0.0.0-20230718161610-2a3e8b481cec
11+
github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22
1112
github.com/prometheus/client_golang v1.18.0
1213
github.com/prometheus/common v0.45.0
14+
github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0
1315
k8s.io/api v0.29.2
1416
k8s.io/apimachinery v0.29.2
1517
sigs.k8s.io/kueue v0.6.2
@@ -51,13 +53,11 @@ require (
5153
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
5254
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
5355
github.com/openshift-online/ocm-sdk-go v0.1.368 // indirect
54-
github.com/openshift/api v0.0.0-20230718161610-2a3e8b481cec // indirect
5556
github.com/openshift/client-go v0.0.0-20230718165156-6014fb98e86a // indirect
5657
github.com/pkg/errors v0.9.1 // indirect
5758
github.com/project-codeflare/appwrapper v0.8.0 // indirect
5859
github.com/prometheus/client_model v0.5.0 // indirect
5960
github.com/prometheus/procfs v0.12.0 // indirect
60-
github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0 // indirect
6161
github.com/sirupsen/logrus v1.9.3 // indirect
6262
github.com/spf13/pflag v1.0.5 // indirect
6363
golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
363363
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
364364
github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
365365
github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
366-
github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7 h1:XTK5l2FRO3BbSk4Qn9xAwsRFTJ4IeGljymQWcfYLlMI=
367-
github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
366+
github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22 h1:gjbp5kz/azRGmRBJBS6ZmoW2PHGsvYj2Mi0Dre/x5KI=
367+
github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
368368
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
369369
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
370370
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=

tests/kfto/core/config_GPU.json renamed to tests/kfto/core/config_granite_20b_code_instruct.json

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,19 @@
11
{
2-
"model_name_or_path": "/tmp/model/bloom-560m",
3-
"training_data_path": "/tmp/dataset/alpaca_data.json",
4-
"output_dir": "/tmp/out",
2+
"model_name_or_path": "ibm-granite/granite-20b-code-instruct",
3+
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
4+
"output_dir": "/mnt/output/model",
55
"num_train_epochs": 1.0,
6-
"per_device_train_batch_size": 4,
6+
"per_device_train_batch_size": 1,
77
"per_device_eval_batch_size": 4,
88
"gradient_accumulation_steps": 4,
99
"evaluation_strategy": "no",
1010
"save_strategy": "epoch",
1111
"learning_rate": 1e-5,
1212
"weight_decay": 0.0,
1313
"lr_scheduler_type": "cosine",
14-
"logging_steps": 1.0,
15-
"packing": false,
1614
"include_tokens_per_second": true,
1715
"response_template": "\n### Response:",
1816
"dataset_text_field": "output",
1917
"use_flash_attn": false,
20-
"torch_dtype": "float32",
21-
"tokenizer_name_or_path": "/tmp/model/bloom-560m"
18+
"tokenizer_name_or_path": "ibm-granite/granite-20b-code-instruct"
2219
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"model_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
3+
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
4+
"output_dir": "/mnt/output/model",
5+
"num_train_epochs": 1.0,
6+
"per_device_train_batch_size": 1,
7+
"per_device_eval_batch_size": 4,
8+
"gradient_accumulation_steps": 4,
9+
"evaluation_strategy": "no",
10+
"save_strategy": "epoch",
11+
"learning_rate": 1e-5,
12+
"weight_decay": 0.0,
13+
"lr_scheduler_type": "cosine",
14+
"include_tokens_per_second": true,
15+
"response_template": "\n### Response:",
16+
"dataset_text_field": "output",
17+
"use_flash_attn": false,
18+
"tokenizer_name_or_path": "meta-llama/Llama-2-13b-chat-hf"
19+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3-70B-Instruct",
3+
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
4+
"output_dir": "/mnt/output/model",
5+
"num_train_epochs": 1.0,
6+
"per_device_train_batch_size": 1,
7+
"per_device_eval_batch_size": 4,
8+
"gradient_accumulation_steps": 4,
9+
"evaluation_strategy": "no",
10+
"save_strategy": "epoch",
11+
"learning_rate": 1e-5,
12+
"weight_decay": 0.0,
13+
"lr_scheduler_type": "cosine",
14+
"include_tokens_per_second": true,
15+
"response_template": "\n### Response:",
16+
"dataset_text_field": "output",
17+
"use_flash_attn": false,
18+
"tokenizer_name_or_path": "meta-llama/Meta-Llama-3-70B-Instruct",
19+
"peft_method": "lora"
20+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
3+
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
4+
"output_dir": "/mnt/output/model",
5+
"num_train_epochs": 1.0,
6+
"per_device_train_batch_size": 1,
7+
"per_device_eval_batch_size": 4,
8+
"gradient_accumulation_steps": 4,
9+
"evaluation_strategy": "no",
10+
"save_strategy": "epoch",
11+
"learning_rate": 1e-5,
12+
"weight_decay": 0.0,
13+
"lr_scheduler_type": "cosine",
14+
"include_tokens_per_second": true,
15+
"response_template": "\n### Response:",
16+
"dataset_text_field": "output",
17+
"use_flash_attn": false,
18+
"tokenizer_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct"
19+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1",
3+
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
4+
"output_dir": "/mnt/output/model",
5+
"num_train_epochs": 1.0,
6+
"per_device_train_batch_size": 1,
7+
"per_device_eval_batch_size": 4,
8+
"gradient_accumulation_steps": 1,
9+
"evaluation_strategy": "no",
10+
"save_strategy": "epoch",
11+
"learning_rate": 1e-5,
12+
"weight_decay": 0.0,
13+
"lr_scheduler_type": "cosine",
14+
"include_tokens_per_second": true,
15+
"response_template": "\n### Response:",
16+
"dataset_text_field": "output",
17+
"use_flash_attn": false,
18+
"tokenizer_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1"
19+
}
20+

tests/kfto/core/environment.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ const (
2929
bloomModelImageEnvVar = "BLOOM_MODEL_IMAGE"
3030
// The environment variable referring to image containing Stanford Alpaca dataset
3131
alpacaDatasetImageEnvVar = "ALPACA_DATASET_IMAGE"
32+
// The environment variable for HuggingFace token to download models which require authentication
33+
huggingfaceTokenEnvVar = "HF_TOKEN"
34+
// The environment variable specifying existing namespace to be used for multiGPU tests
35+
multiGpuNamespaceEnvVar = "MULTIGPU_NAMESPACE"
3236
)
3337

3438
func GetFmsHfTuningImage(t Test) string {
@@ -47,6 +51,22 @@ func GetAlpacaDatasetImage() string {
4751
return lookupEnvOrDefault(alpacaDatasetImageEnvVar, "quay.io/ksuta/alpaca-dataset@sha256:c0492ff0005c13ac491e00d074902aa9dd21a49691945b122da23db3a3b3ac76")
4852
}
4953

54+
func GetHuggingFaceToken(t Test) string {
55+
image, ok := os.LookupEnv(huggingfaceTokenEnvVar)
56+
if !ok {
57+
t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify HuggingFace token to download models.", huggingfaceTokenEnvVar)
58+
}
59+
return image
60+
}
61+
62+
func GetMultiGpuNamespace(t Test) string {
63+
image, ok := os.LookupEnv(multiGpuNamespaceEnvVar)
64+
if !ok {
65+
t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify namespace to be used for multiGPU tests.", multiGpuNamespaceEnvVar)
66+
}
67+
return image
68+
}
69+
5070
func lookupEnvOrDefault(key, value string) string {
5171
if v, ok := os.LookupEnv(key); ok {
5272
return v

0 commit comments

Comments
 (0)