Commit 53c2eb4

Updated training script to use sequential dataset pre-processing instead of parallel to avoid MD5 hash checks
1 parent 9bd6959 commit 53c2eb4

File tree: 1 file changed (+1, -2 lines)

tests/kfto/core/hf_llm_training.py

Lines changed: 1 addition & 2 deletions
@@ -85,11 +85,10 @@ def load_and_preprocess_data(dataset_file, transformer_type, tokenizer):
 
     logger.info("Tokenize dataset")
     # TODO (andreyvelich): Discuss how user should set the tokenizer function.
-    num_cores = os.cpu_count()
     dataset = dataset.map(
         lambda x: tokenizer(x["output"], padding=True, truncation=True, max_length=128),
         batched=True,
-        num_proc=num_cores
+        keep_in_memory=True
     )
 
     # Check if dataset contains `train` key. Otherwise, load full dataset to train_data.
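For reference, a minimal standalone sketch of what the tokenization step now does, assuming a Hugging Face datasets.Dataset and a transformers tokenizer; the dataset file and model name below are illustrative placeholders, not values taken from the script:

    from datasets import load_dataset
    from transformers import AutoTokenizer

    # Illustrative inputs; the real script builds its tokenizer and dataset
    # from the arguments passed to load_and_preprocess_data.
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    dataset = load_dataset("json", data_files="dataset.json", split="train")

    # Without num_proc, map() runs in a single process (sequential pre-processing),
    # which per the commit message avoids the MD5 hash checks triggered by the
    # parallel path. keep_in_memory=True keeps the mapped result in RAM instead
    # of writing a cache file to disk.
    dataset = dataset.map(
        lambda x: tokenizer(x["output"], padding=True, truncation=True, max_length=128),
        batched=True,
        keep_in_memory=True,
    )

Dropping num_proc trades some throughput on large datasets for a simpler single-process path; for the small test datasets this script targets, the difference should be negligible.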
