From 029a58cf33bd421022ab543c2b267b4acda9b3cb Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Fri, 28 Mar 2025 22:39:27 -0700
Subject: [PATCH 1/4] fix llm hp optimization error

Signed-off-by: helenxie-bit
---
 .../kubeflow/storage_initializer/hugging_face.py  |  4 +++-
 .../kubeflow/storage_initializer/requirements.txt |  8 ++++----
 sdk/python/kubeflow/trainer/hf_llm_training.py    | 11 +++++++++++
 sdk/python/kubeflow/trainer/requirements.txt      |  9 +++++----
 sdk/python/setup.py                               |  2 +-
 5 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/sdk/python/kubeflow/storage_initializer/hugging_face.py b/sdk/python/kubeflow/storage_initializer/hugging_face.py
index bb6eb6a1c0..5b53b27aa4 100644
--- a/sdk/python/kubeflow/storage_initializer/hugging_face.py
+++ b/sdk/python/kubeflow/storage_initializer/hugging_face.py
@@ -72,7 +72,9 @@ def download_model_and_tokenizer(self):
             trust_remote_code=True,
         )
         transformers.AutoTokenizer.from_pretrained(
-            self.model, cache_dir=VOLUME_PATH_MODEL
+            self.model,
+            token=self.config.access_token,
+            cache_dir=VOLUME_PATH_MODEL,
         )
 
 
diff --git a/sdk/python/kubeflow/storage_initializer/requirements.txt b/sdk/python/kubeflow/storage_initializer/requirements.txt
index 4aa157c00c..eb0facfd58 100644
--- a/sdk/python/kubeflow/storage_initializer/requirements.txt
+++ b/sdk/python/kubeflow/storage_initializer/requirements.txt
@@ -1,4 +1,4 @@
-peft==0.3.0
-datasets==2.21.0
-transformers==4.38.0
-boto3==1.33.9
+peft==0.15.1
+datasets==3.5.0
+transformers==4.50.2
+boto3==1.37.22
diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py
index a79445bae2..0c591946ee 100644
--- a/sdk/python/kubeflow/trainer/hf_llm_training.py
+++ b/sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -13,6 +13,7 @@
     AutoModelForImageClassification,
     AutoTokenizer,
     DataCollatorForLanguageModeling,
+    DataCollatorWithPadding,
     Trainer,
     TrainingArguments,
 )
@@ -58,6 +59,9 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, num_labels
     # Freeze model parameters
     for param in model.parameters():
         param.requires_grad = False
+    
+    if not tokenizer.pad_token:
+        tokenizer.pad_token = tokenizer.eos_token
 
     return model, tokenizer
 
@@ -148,6 +152,13 @@ def train_model(model, transformer_type, train_data, eval_data, tokenizer, train
             pad_to_multiple_of=8,
             mlm=False,
         )
+    else:
+        logger.info("Add general data collator with padding")
+        logger.info("-" * 40)
+        trainer.data_collator = DataCollatorWithPadding(
+            tokenizer,
+            pad_to_multiple_of=8,
+        )
 
     # Train the model.
     trainer.train()
diff --git a/sdk/python/kubeflow/trainer/requirements.txt b/sdk/python/kubeflow/trainer/requirements.txt
index f820ccddc8..9f70cd754d 100644
--- a/sdk/python/kubeflow/trainer/requirements.txt
+++ b/sdk/python/kubeflow/trainer/requirements.txt
@@ -1,4 +1,5 @@
-peft==0.3.0
-datasets==2.21.0
-transformers==4.38.0
-accelerate==0.28.0
+peft==0.15.1
+datasets==3.5.0
+transformers==4.50.2
+accelerate==1.5.2
+tensorboard==2.19.0
diff --git a/sdk/python/setup.py b/sdk/python/setup.py
index bd1389cfc2..6c82104550 100644
--- a/sdk/python/setup.py
+++ b/sdk/python/setup.py
@@ -64,6 +64,6 @@
     tests_require=TESTS_REQUIRES,
    extras_require={
         "test": TESTS_REQUIRES,
-        "huggingface": ["transformers==4.38.0", "peft==0.3.0"],
+        "huggingface": ["transformers==4.50.2", "peft==0.15.1"],
     },
 )

From 175acaf8aa110d15e67631bb8852b0426d917bc8 Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Fri, 28 Mar 2025 22:47:35 -0700
Subject: [PATCH 2/4] fix pre-commit error

Signed-off-by: helenxie-bit
---
 sdk/python/kubeflow/trainer/hf_llm_training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py
index 0c591946ee..475878b5ee 100644
--- a/sdk/python/kubeflow/trainer/hf_llm_training.py
+++ b/sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -59,7 +59,7 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, num_labels
     # Freeze model parameters
     for param in model.parameters():
         param.requires_grad = False
-    
+
     if not tokenizer.pad_token:
         tokenizer.pad_token = tokenizer.eos_token
 

From 2db9b2b42f95395ec9121b6a56d936aa76dad5d3 Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Sun, 30 Mar 2025 21:17:12 -0700
Subject: [PATCH 3/4] fix json serialization error

Signed-off-by: helenxie-bit
---
 sdk/python/kubeflow/training/api/training_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py
index 34db0d9942..ea0836b980 100644
--- a/sdk/python/kubeflow/training/api/training_client.py
+++ b/sdk/python/kubeflow/training/api/training_client.py
@@ -294,7 +294,7 @@ def train(
                 VOLUME_PATH_DATASET,
                 "--lora_config",
                 json.dumps(
-                    trainer_parameters.lora_config.__dict__, cls=utils.SetEncoder
+                    trainer_parameters.lora_config.to_dict(), cls=utils.SetEncoder
                 ),
                 "--training_parameters",
                 json.dumps(trainer_parameters.training_parameters.to_dict()),

From 7cd174b564ce162175dedf018d17579b922d630c Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Sun, 30 Mar 2025 21:50:20 -0700
Subject: [PATCH 4/4] fix warning message '`evaluation_strategy` is deprecated
 and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy`
 instead'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: helenxie-bit
---
 .../test/e2e-fine-tune-llm/test_e2e_pytorch_fine_tune_llm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/python/test/e2e-fine-tune-llm/test_e2e_pytorch_fine_tune_llm.py b/sdk/python/test/e2e-fine-tune-llm/test_e2e_pytorch_fine_tune_llm.py
index 9d6f1c48bf..81415faf98 100644
--- a/sdk/python/test/e2e-fine-tune-llm/test_e2e_pytorch_fine_tune_llm.py
+++ b/sdk/python/test/e2e-fine-tune-llm/test_e2e_pytorch_fine_tune_llm.py
@@ -55,7 +55,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace="default"):
             training_parameters=transformers.TrainingArguments(
output_dir="test_trainer", save_strategy="no", - evaluation_strategy="no", + eval_strategy="no", do_eval=False, disable_tqdm=True, log_level="info",