System Info
- transformers version: 4.20.1
- Platform: Darwin-21.5.0-x86_64-i386-64bit
- Python version: 3.7.2
- Huggingface_hub version: 0.8.1
- PyTorch version (GPU?): 1.12.0 (False)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: no
- Using distributed or parallel set-up in script?: no
Who can help?
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
Hi!
I'm trying to fine-tune a WMT model on my dataset, but I'm running into strange behaviour. The code was taken from the official translation notebook listed on the website: https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb
Data: https://www.kaggle.com/datasets/nltkdata/wmt15-eval
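Before the full script, here is a quick sanity check on the data that I run on my side (this snippet is my own addition, not from the notebook; the file names are the ones used in the script below, and dropping a possible trailing empty line after split('\n') is my assumption):
# Sanity check (my own sketch): the two files should be line-aligned parallel data.
# .read().split('\n') leaves a trailing empty string when the file ends with a newline,
# so empty lines are dropped here before pairing the sentences.
with open('newstest-2015-100sents.en-ru.src.en') as f:
    en_lines = [line for line in f.read().split('\n') if line.strip()]
with open('newstest-2015-100sents.en-ru.ref.ru') as f:
    ru_lines = [line for line in f.read().split('\n') if line.strip()]
assert len(en_lines) == len(ru_lines), (len(en_lines), len(ru_lines))
print(len(en_lines), "sentence pairs")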
Code to reproduce:
import pandas as pd
from datasets import Dataset, load_metric
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
# Note: .src.en holds the English source sentences, .ref.ru the Russian references.
with open('newstest-2015-100sents.en-ru.src.en') as f:
    en = f.read()
with open('newstest-2015-100sents.en-ru.ref.ru') as f:
    ru = f.read()
en = en.split('\n')
ru = ru.split('\n')
df_all = pd.DataFrame({'en': en, 'ru': ru})
df = Dataset.from_pandas(df_all)
metric = load_metric("sacrebleu")
dataset_splitted = df.shuffle(1337).train_test_split(0.1)
model_checkpoint = 'facebook/wmt19-en-ru'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
max_input_length = 128
max_target_length = 128
def preprocess_function(examples):
    inputs = [ex for ex in examples["en"]]
    targets = [ex for ex in examples["ru"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_datasets = dataset_splitted.map(preprocess_function, batched=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    "./tmp",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
)
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
The traceback I get:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
/var/folders/cv/dmhc689x3gn9vgg44b67yl2c0000gq/T/ipykernel_29677/4032920361.py in <module>
----> 1 trainer.train()
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1411 resume_from_checkpoint=resume_from_checkpoint,
1412 trial=trial,
-> 1413 ignore_keys_for_eval=ignore_keys_for_eval,
1414 )
1415
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1649 tr_loss_step = self.training_step(model, inputs)
1650 else:
-> 1651 tr_loss_step = self.training_step(model, inputs)
1652
1653 if (
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/transformers/trainer.py in training_step(self, model, inputs)
2343
2344 with self.compute_loss_context_manager():
-> 2345 loss = self.compute_loss(model, inputs)
2346
2347 if self.args.n_gpu > 1:
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
2375 else:
2376 labels = None
-> 2377 outputs = model(**inputs)
2378 # Save past state if it exists
2379 # TODO: this needs to be fixed and made cleaner later.
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/transformers/models/fsmt/modeling_fsmt.py in forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1175 output_attentions=output_attentions,
1176 output_hidden_states=output_hidden_states,
-> 1177 return_dict=return_dict,
1178 )
1179 lm_logits = outputs[0]
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/transformers/models/fsmt/modeling_fsmt.py in forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
1079 output_attentions=output_attentions,
1080 output_hidden_states=output_hidden_states,
-> 1081 return_dict=return_dict,
1082 )
1083
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/transformers/models/fsmt/modeling_fsmt.py in forward(self, input_ids, encoder_hidden_states, encoder_padding_mask, decoder_padding_mask, decoder_causal_mask, head_mask, cross_attn_head_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
722 # assert input_ids.ne(self.padding_idx).any()
723
--> 724 x = self.embed_tokens(input_ids) * self.embed_scale
725 x += positions
726 x = nn.functional.dropout(x, p=self.dropout, training=self.training)
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/torch/nn/modules/sparse.py in forward(self, input)
158 return F.embedding(
159 input, self.weight, self.padding_idx, self.max_norm,
--> 160 self.norm_type, self.scale_grad_by_freq, self.sparse)
161
162 def extra_repr(self) -> str:
~/pet_projects/fairseq_experiments/venv/lib/python3.7/site-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2197 # remove once script supports set_grad_enabled
2198 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2199 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2200
2201
IndexError: index out of range in self
Expected behavior
I expect training to run without errors, as in the official notebook. Could you please help me figure out what's wrong with the trainer setup?
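In case it helps, below is a minimal inspection sketch I put together (my own guess at where to look, not a confirmed diagnosis): it builds one batch by hand with the same DataCollatorForSeq2Seq and prints whether the -100 padding values (the same ones compute_metrics has to mask out before decoding) show up in any tensor handed to the model, since torch.embedding raises exactly this IndexError for an out-of-range or negative index. The names features and batch are just for illustration.
# Inspection sketch (my own assumption about where to look, not a confirmed fix):
# build one batch by hand with the same collator and look at what it contains.
features = [tokenized_datasets["train"][i] for i in range(4)]
# Keep only the columns the collator expects (the Trainer drops the other columns itself).
features = [
    {k: f[k] for k in ("input_ids", "attention_mask", "labels")}
    for f in features
]
batch = data_collator(features)
for name, tensor in batch.items():
    print(name, tuple(tensor.shape), "min id:", tensor.min().item())
print("decoder_input_ids present:", "decoder_input_ids" in batch)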