
Commit 6eb5145

Rocketknight1 and gante authored
TF Examples Rewrite (#18451)
* Finished QA example
* Dodge a merge conflict
* Update text classification and LM examples
* Update NER example
* New Keras metrics WIP, fix NER example
* Update NER example
* Update MC, summarization and translation examples
* Add XLA warnings when shapes are variable
* Make sure batch_size is consistently scaled by num_replicas
* Add PushToHubCallback to all models
* Add docs links for KerasMetricCallback
* Add docs links for prepare_tf_dataset and jit_compile
* Correct inferred model names
* Don't assume the dataset has 'lang'
* Don't assume the dataset has 'lang'
* Write metrics in text classification
* Add 'framework' to TrainingArguments and TFTrainingArguments
* Export metrics in all examples and add tests
* Fix training args for Flax
* Update command line args for translation test
* make fixup
* Fix accidentally running other tests in fp16
* Remove do_train/do_eval from run_clm.py
* Remove do_train/do_eval from run_mlm.py
* Add tensorflow tests to circleci
* Fix circleci
* Update examples/tensorflow/language-modeling/run_mlm.py
  Co-authored-by: Joao Gante <[email protected]>
* Update examples/tensorflow/test_tensorflow_examples.py
  Co-authored-by: Joao Gante <[email protected]>
* Update examples/tensorflow/translation/run_translation.py
  Co-authored-by: Joao Gante <[email protected]>
* Update examples/tensorflow/token-classification/run_ner.py
  Co-authored-by: Joao Gante <[email protected]>
* Fix save path for tests
* Fix some model card kwargs
* Explain the magical -1000
* Actually enable tests this time
* Skip text classification PR until we fix shape inference
* make fixup

Co-authored-by: Joao Gante <[email protected]>
1 parent d7e2d7b commit 6eb5145

File tree

15 files changed: +1491 additions, -661 deletions


.circleci/config.yml

Lines changed: 67 additions & 0 deletions
@@ -658,6 +658,71 @@ jobs:
             - store_artifacts:
                 path: ~/transformers/reports

+    run_examples_tensorflow:
+        working_directory: ~/transformers
+        docker:
+            - image: cimg/python:3.7.12
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+            PYTEST_TIMEOUT: 120
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                keys:
+                    - v0.5-tensorflow_examples-{{ checksum "setup.py" }}
+                    - v0.5-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tensorflow,sentencepiece,testing]
+            - run: pip install -r examples/tensorflow/_tests_requirements.txt
+            - save_cache:
+                key: v0.5-tensorflow_examples-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+            - run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
+            - store_artifacts:
+                path: ~/transformers/test_preparation.txt
+            - run: |
+                if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee tests_output.txt
+                fi
+            - store_artifacts:
+                path: ~/transformers/tensorflow_examples_output.txt
+            - store_artifacts:
+                path: ~/transformers/reports
+
+    run_examples_tensorflow_all:
+        working_directory: ~/transformers
+        docker:
+            - image: cimg/python:3.7.12
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+            PYTEST_TIMEOUT: 120
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                keys:
+                    - v0.5-tensorflow_examples-{{ checksum "setup.py" }}
+                    - v0.5-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tensorflow,sentencepiece,testing]
+            - run: pip install -r examples/tensorflow/_tests_requirements.txt
+            - save_cache:
+                key: v0.5-tensorflow_examples-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+            - run: |
+                TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee examples_output.txt
+            - store_artifacts:
+                path: ~/transformers/tensorflow_examples_output.txt
+            - store_artifacts:
+                path: ~/transformers/reports
+
     run_examples_flax:
         working_directory: ~/transformers
         docker:
@@ -1000,6 +1065,7 @@ workflows:
             - check_code_quality
             - check_repository_consistency
             - run_examples_torch
+            - run_examples_tensorflow
             - run_examples_flax
             - run_tests_custom_tokenizers
             - run_tests_torch_and_tf
@@ -1022,6 +1088,7 @@ workflows:
                             - main
         jobs:
             - run_examples_torch_all
+            - run_examples_tensorflow_all
             - run_examples_flax_all
             - run_tests_torch_and_tf_all
             - run_tests_torch_and_flax_all
examples/tensorflow/_tests_requirements.txt

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+tensorflow
+tensorboard
+scikit-learn
+seqeval
+psutil
+sacrebleu >= 1.4.12
+git+https://github.com/huggingface/accelerate@main#egg=accelerate
+rouge-score
+tensorflow_datasets
+matplotlib
+git-python==1.0.3
+faiss-cpu
+streamlit
+elasticsearch
+nltk
+pandas
+datasets >= 1.13.3
+fire
+pytest
+conllu
+sentencepiece != 0.1.92
+protobuf
+jiwer
+librosa
+evaluate >= 0.2.0

examples/tensorflow/language-modeling/run_clm.py

Lines changed: 114 additions & 45 deletions
@@ -22,6 +22,8 @@
 """
 # You can also adapt this script on your own clm task. Pointers for this are left as comments.

+import json
+
 # region Imports
 import logging
 import math
@@ -46,8 +48,8 @@
     TF_MODEL_FOR_CAUSAL_LM_MAPPING,
     AutoConfig,
     AutoTokenizer,
-    DefaultDataCollator,
     HfArgumentParser,
+    PushToHubCallback,
     TFAutoModelForCausalLM,
     TFTrainingArguments,
     create_optimizer,
@@ -205,21 +207,6 @@ def __post_init__(self):
                 assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."


-# endregion
-
-# region Helper classes
-class SavePretrainedCallback(tf.keras.callbacks.Callback):
-    # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
-    # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
-    # that saves the model with this method after each epoch.
-    def __init__(self, output_dir, **kwargs):
-        super().__init__()
-        self.output_dir = output_dir
-
-    def on_epoch_end(self, epoch, logs=None):
-        self.model.save_pretrained(self.output_dir)
-
-
 # endregion


@@ -299,19 +286,22 @@ def main():
         raw_datasets = load_dataset(
             data_args.dataset_name,
             data_args.dataset_config_name,
+            cache_dir=model_args.cache_dir,
             use_auth_token=True if model_args.use_auth_token else None,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
                 use_auth_token=True if model_args.use_auth_token else None,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
                 use_auth_token=True if model_args.use_auth_token else None,
             )
     else:
@@ -321,16 +311,39 @@
             data_files["train"] = data_args.train_file
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+        extension = (
+            data_args.train_file.split(".")[-1]
+            if data_args.train_file is not None
+            else data_args.validation_file.split(".")[-1]
+        )
         if extension == "txt":
             extension = "text"
             dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
         raw_datasets = load_dataset(
             extension,
             data_files=data_files,
+            cache_dir=model_args.cache_dir,
             use_auth_token=True if model_args.use_auth_token else None,
             **dataset_args,
         )
+        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+                **dataset_args,
+            )
+            raw_datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+                **dataset_args,
+            )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
     # endregion
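
The hunks above lean on the datasets slicing syntax to carve a validation set out of "train" whenever the dataset ships without one. A minimal, self-contained sketch of that pattern (the dataset name and the 5% figure are illustrative placeholders, not values from this commit):

from datasets import load_dataset

# The first 5% of "train" becomes the validation split; the remaining 95% stays as train.
validation_split = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")
train_split = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[5%:]")
print(len(train_split), len(validation_split))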
@@ -446,7 +459,7 @@ def group_texts(examples):
         eval_dataset = eval_dataset.select(range(max_eval_samples))

     # Log a few random samples from the training set:
-    for index in random.sample(range(len(train_dataset)), 3):
+    for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
         logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
     # endregion

@@ -465,44 +478,88 @@ def group_texts(examples):

     # region TF Dataset preparation
     num_replicas = training_args.strategy.num_replicas_in_sync
-    data_collator = DefaultDataCollator(return_tensors="tf")
     options = tf.data.Options()
     options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

-    tf_train_dataset = train_dataset.to_tf_dataset(
-        # labels are passed as input, as we will use the model's internal loss
-        columns=[col for col in train_dataset.features if col != "special_tokens_mask"],
+    # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
+    # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
+    # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
+    # yourself if you use this method, whereas they are automatically inferred from the model input names when
+    # using model.prepare_tf_dataset()
+    # For more info see the docs:
+    # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
+    # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
+
+    tf_train_dataset = model.prepare_tf_dataset(
+        train_dataset,
         shuffle=True,
         batch_size=num_replicas * training_args.per_device_train_batch_size,
-        collate_fn=data_collator,
-        drop_remainder=True,
     ).with_options(options)

-    tf_eval_dataset = eval_dataset.to_tf_dataset(
-        # labels are passed as input, as we will use the model's internal loss
-        columns=[col for col in eval_dataset.features if col != "special_tokens_mask"],
+    tf_eval_dataset = model.prepare_tf_dataset(
+        eval_dataset,
         shuffle=False,
-        batch_size=num_replicas * training_args.per_device_train_batch_size,
-        collate_fn=data_collator,
+        batch_size=num_replicas * training_args.per_device_eval_batch_size,
         drop_remainder=True,
     ).with_options(options)
     # endregion

     # region Optimizer and loss
-    batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
+    num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
+    if training_args.warmup_steps > 0:
+        num_warmup_steps = training_args.warmup_steps
+    elif training_args.warmup_ratio > 0:
+        num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
+    else:
+        num_warmup_steps = 0
+
     # Bias and layernorm weights are automatically excluded from the decay
     optimizer, lr_schedule = create_optimizer(
         init_lr=training_args.learning_rate,
-        num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
-        num_warmup_steps=training_args.warmup_steps,
+        num_train_steps=num_train_steps,
+        num_warmup_steps=num_warmup_steps,
         adam_beta1=training_args.adam_beta1,
         adam_beta2=training_args.adam_beta2,
         adam_epsilon=training_args.adam_epsilon,
         weight_decay_rate=training_args.weight_decay,
+        adam_global_clipnorm=training_args.max_grad_norm,
     )

     # no user-specified loss = will use the model internal loss
-    model.compile(optimizer=optimizer)
+    model.compile(optimizer=optimizer, jit_compile=training_args.xla)
+    # endregion
+
+    # region Preparing push_to_hub and model card
+    push_to_hub_model_id = training_args.push_to_hub_model_id
+    model_name = model_args.model_name_or_path.split("/")[-1]
+    if not push_to_hub_model_id:
+        if data_args.dataset_name is not None:
+            push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
+        else:
+            push_to_hub_model_id = f"{model_name}-finetuned-clm"
+
+    model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
+    if data_args.dataset_name is not None:
+        model_card_kwargs["dataset_tags"] = data_args.dataset_name
+        if data_args.dataset_config_name is not None:
+            model_card_kwargs["dataset_args"] = data_args.dataset_config_name
+            model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+        else:
+            model_card_kwargs["dataset"] = data_args.dataset_name
+
+    if training_args.push_to_hub:
+        callbacks = [
+            PushToHubCallback(
+                output_dir=training_args.output_dir,
+                model_id=push_to_hub_model_id,
+                organization=training_args.push_to_hub_organization,
+                token=training_args.push_to_hub_token,
+                tokenizer=tokenizer,
+                **model_card_kwargs,
+            )
+        ]
+    else:
+        callbacks = []
     # endregion

     # region Training and validation
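
The comments added in the hunk above describe the model.prepare_tf_dataset() workflow that replaces the hand-rolled to_tf_dataset() calls. For readers who want to see the pattern outside the example script, here is a minimal standalone sketch of the same flow (prepare a tf.data.Dataset, build an optimizer with create_optimizer(), compile without a loss so the model's internal LM loss is used). The model name, dataset, and hyperparameters are placeholders and not taken from this commit; XLA is shown via the jit_compile flag just as in the diff.

from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForCausalLM, create_optimizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
model = TFAutoModelForCausalLM.from_pretrained("gpt2")

raw = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

def tokenize(batch):
    # Fixed-length padding keeps this sketch simple; run_clm.py instead packs tokens with group_texts().
    out = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)
    out["labels"] = out["input_ids"].copy()  # labels are passed as inputs so the model computes its own LM loss
    return out

train_ds = raw.map(tokenize, batched=True, remove_columns=raw.column_names)

# Columns and collation are inferred from the model's input signature.
tf_train = model.prepare_tf_dataset(train_ds, batch_size=8, shuffle=True)

num_train_steps = len(tf_train) * 3  # 3 epochs
optimizer, lr_schedule = create_optimizer(init_lr=5e-5, num_train_steps=num_train_steps, num_warmup_steps=0)
model.compile(optimizer=optimizer, jit_compile=False)  # set jit_compile=True to opt in to XLA, as the script does via --xla
model.fit(tf_train, epochs=3)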
@@ -512,33 +569,45 @@ def group_texts(examples):
     logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
     logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")

+    # For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints
+    # to the Hugging Face Hub rather than just pushing the finished model.
+    # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback
+
     history = model.fit(
         tf_train_dataset,
         validation_data=tf_eval_dataset,
         epochs=int(training_args.num_train_epochs),
-        steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
-        callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
+        callbacks=callbacks,
     )
+    train_loss = history.history["loss"][-1]
     try:
-        train_perplexity = math.exp(history.history["loss"][-1])
+        train_perplexity = math.exp(train_loss)
     except OverflowError:
         train_perplexity = math.inf
+    logger.info(f" Final train loss: {train_loss:.3f}")
+    logger.info(f" Final train perplexity: {train_perplexity:.3f}")
+    validation_loss = history.history["val_loss"][-1]
     try:
-        validation_perplexity = math.exp(history.history["val_loss"][-1])
+        validation_perplexity = math.exp(validation_loss)
     except OverflowError:
         validation_perplexity = math.inf
-    logger.info(f" Final train loss: {history.history['loss'][-1]:.3f}")
-    logger.info(f" Final train perplexity: {train_perplexity:.3f}")
-    logger.info(f" Final validation loss: {history.history['val_loss'][-1]:.3f}")
+    logger.info(f" Final validation loss: {validation_loss:.3f}")
     logger.info(f" Final validation perplexity: {validation_perplexity:.3f}")
-    # endregion

     if training_args.output_dir is not None:
-        model.save_pretrained(training_args.output_dir)
+        output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
+        results_dict = dict()
+        results_dict["train_loss"] = train_loss
+        results_dict["train_perplexity"] = train_perplexity
+        results_dict["eval_loss"] = validation_loss
+        results_dict["eval_perplexity"] = validation_perplexity
+        with open(output_eval_file, "w") as writer:
+            writer.write(json.dumps(results_dict))
+    # endregion

-    if training_args.push_to_hub:
-        # You'll probably want to include some of your own metadata here!
-        model.push_to_hub()
+    if training_args.output_dir is not None and not training_args.push_to_hub:
+        # If we're not pushing to hub, at least save a local copy when we're done
+        model.save_pretrained(training_args.output_dir)


 if __name__ == "__main__":
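
The hunk above swaps the old SavePretrainedCallback and end-of-run model.push_to_hub() for PushToHubCallback, which the added comment recommends for saving intermediate checkpoints during long runs. As a quick illustration of how the callback attaches to a Keras training loop, here is a minimal sketch continuing the placeholder model, tokenizer, and tf_train from the earlier sketch; the output directory name is hypothetical, and it assumes you are already authenticated (for example via `huggingface-cli login`).

from transformers import PushToHubCallback

# output_dir is a local checkpoint directory; passing the tokenizer uploads it too, so the Hub repo is usable directly.
push_callback = PushToHubCallback(output_dir="clm_output", tokenizer=tokenizer)
model.fit(tf_train, epochs=3, callbacks=[push_callback])

Pushing happens during training rather than only at the end, which is exactly the "intermediate checkpoints for long runs" use case the in-diff comment describes.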
