
Commit 77b2f9a

feat: update CTC task.
1 parent 111aaf0 commit 77b2f9a

7 files changed: +101 -80 lines

baselines/run_classifier.py (+8 -8)
@@ -139,14 +139,14 @@ def main():
         train_samples = data_processor.get_train_sample()
         eval_samples = data_processor.get_dev_sample()
 
-        if args.task_name != 'ee':
-            train_dataset = dataset_class(train_samples, data_processor, mode='train')
-            eval_dataset = dataset_class(eval_samples, data_processor, mode='eval')
-        else:
+        if args.task_name == 'ee' or args.task_name == 'ctc':
             train_dataset = dataset_class(train_samples, data_processor, tokenizer, mode='train',
                                           model_type=args.model_type, ngram_dict=ngram_dict, max_length=args.max_length)
             eval_dataset = dataset_class(eval_samples, data_processor, tokenizer, mode='eval',
                                          model_type=args.model_type, ngram_dict=ngram_dict, max_length=args.max_length)
+        else:
+            train_dataset = dataset_class(train_samples, data_processor, mode='train')
+            eval_dataset = dataset_class(eval_samples, data_processor, mode='eval')
 
         model = model_class.from_pretrained(os.path.join(args.model_dir, args.model_name),
                                             num_labels=data_processor.num_labels)
@@ -167,12 +167,12 @@ def main():
         data_processor = data_processor_class(root=args.data_dir)
         test_samples = data_processor.get_test_sample()
 
-        if args.task_name != 'ee':
-            test_dataset = dataset_class(test_samples, data_processor, mode='test')
-        else:
+        if args.task_name == 'ee' or args.task_name == 'ctc':
             test_dataset = dataset_class(test_samples, data_processor, tokenizer, mode='test', ngram_dict=ngram_dict,
                                          max_length=args.max_length, model_type=args.model_type)
-
+        else:
+            test_dataset = dataset_class(test_samples, data_processor, mode='test')
+
         model = model_class.from_pretrained(args.output_dir, num_labels=data_processor.num_labels)
         trainer = trainer_class(args=args, model=model, data_processor=data_processor,
                                 tokenizer=tokenizer, logger=logger, model_class=model_class, ngram_dict=ngram_dict)
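Note on the change: 'ctc' now takes the same construction path as 'ee', whose dataset tokenizes inside __getitem__ and therefore needs the tokenizer, max_length, and the ZEN ngram_dict up front. The same routing could be written as a membership test; a minimal sketch (illustrative only, not code from this commit):

    # Hypothetical refactor: tasks whose datasets tokenize per sample.
    TOKENIZE_IN_DATASET = ('ee', 'ctc')
    if args.task_name in TOKENIZE_IN_DATASET:
        train_dataset = dataset_class(train_samples, data_processor, tokenizer, mode='train',
                                      model_type=args.model_type, ngram_dict=ngram_dict,
                                      max_length=args.max_length)
    else:
        train_dataset = dataset_class(train_samples, data_processor, mode='train')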
Binary file changed (-143 bytes); not shown.
Binary file changed (680 bytes); not shown.

cblue/data/data_process.py (+1 -1)
@@ -693,7 +693,7 @@ def _pre_process(self, path, is_predict=False):
         samples = load_json(path)
         outputs = {'text': [], 'label': [], 'id': []}
         for sample in samples:
-            outputs['text'].append(sample['text'])
+            outputs['text'].append("\002".join([ t for t in list(sample["text"].lower())]))
             outputs['id'].append(sample['id'])
             if not is_predict:
                 outputs['label'].append(self.label2id[sample['label']])
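Note on the change: each CTC text is now lower-cased, exploded into single characters, and stored joined by the "\002" control character, so the dataset can recover the exact character sequence later. A round-trip sketch (illustrative only; the sample string is made up):

    text = "2型糖尿病患者"                        # hypothetical sample['text']
    encoded = "\002".join(list(text.lower()))    # what _pre_process now stores
    chars = encoded.split("\002")                # what CTCDataset.__init__ recovers
    assert chars == list(text.lower())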

cblue/data/dataset.py (+31 -5)
@@ -267,26 +267,52 @@ def __init__(
         self,
         samples,
         data_processor,
-        mode='train'
+        tokenizer,
+        max_length=128,
+        mode='train',
+        model_type='bert',
+        ngram_dict=None
     ):
         super(CTCDataset, self).__init__()
 
-        self.texts = samples['text']
+        self.texts = [text.split("\002") for text in samples['text']]
         self.ids = samples['id']
 
         if mode != 'test':
             self.labels = samples['label']
         self.data_processor = data_processor
         self.mode = mode
+        self.ngram_dict = ngram_dict
+        self.max_length = max_length
+        self.tokenizer = tokenizer
+        self.model_type = model_type
 
     def __getitem__(self, idx):
         text = self.texts[idx]
+        if self.model_type == 'zen':
+            inputs = convert_examples_to_features(text1=text, ngram_dict=self.ngram_dict,
+                                                  tokenizer=self.tokenizer, max_seq_length=self.max_length)
+        else:
+            inputs = self.tokenizer.encode_plus(text, padding='max_length', max_length=self.max_length, truncation=True)
 
         if self.mode != 'test':
-            label = self.labels[idx]
-            return text, label
+            if self.model_type == 'zen':
+                return inputs['input_ids'], inputs['token_type_ids'], \
+                       inputs['attention_mask'], self.labels[idx], inputs['input_ngram_ids'], \
+                       inputs['ngram_attention_mask'], inputs['ngram_token_type_ids'], \
+                       inputs['ngram_position_matrix']
+            else:
+                return np.array(inputs['input_ids']), np.array(inputs['token_type_ids']), \
+                       np.array(inputs['attention_mask']), self.labels[idx]
         else:
-            return text
+            if self.model_type == 'zen':
+                return inputs['input_ids'], inputs['token_type_ids'], \
+                       inputs['attention_mask'], inputs['input_ngram_ids'], \
+                       inputs['ngram_attention_mask'], inputs['ngram_token_type_ids'], \
+                       inputs['ngram_position_matrix']
+            else:
+                return np.array(inputs['input_ids']), np.array(inputs['token_type_ids']), \
+                       np.array(inputs['attention_mask']),
 
     def __len__(self):
         return len(self.texts)
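Note on the change: tokenization moves from the trainer into the dataset. Because self.texts holds character lists, the non-ZEN branch hands encode_plus a list of tokens rather than a raw string, so each character maps to exactly one position in input_ids. A usage sketch of that path (illustrative only; the checkpoint name is a placeholder):

    import numpy as np
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')  # placeholder checkpoint
    chars = list("2型糖尿病患者".lower())       # what self.texts[idx] holds after the split
    inputs = tokenizer.encode_plus(chars, padding='max_length', max_length=32, truncation=True)
    # same triple the non-ZEN __getitem__ returns (plus the label in train/eval mode)
    features = (np.array(inputs['input_ids']), np.array(inputs['token_type_ids']),
                np.array(inputs['attention_mask']))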

cblue/trainer/train.py (+54 -59)
@@ -1129,28 +1129,25 @@ def __init__(
     def training_step(self, model, item):
         model.train()
 
-        text1 = item[0]
-        labels = item[1].to(self.args.device)
-        if self.args.model_type == 'zen':
-            inputs = convert_examples_to_features(text1=text1, ngram_dict=self.ngram_dict,
-                                                  tokenizer=self.tokenizer, max_seq_length=self.args.max_length,
-                                                  return_tensors=True)
-        else:
-            inputs = self.tokenizer(text1, padding='max_length', max_length=self.args.max_length,
-                                    truncation=True, return_tensors='pt')
+        input_ids = item[0].to(self.args.device)
+        token_type_ids = item[1].to(self.args.device)
+        attention_mask = item[2].to(self.args.device)
+        labels = item[3].to(self.args.device)
 
         if self.args.model_type == 'zen':
-            inputs['input_ngram_ids'] = inputs['input_ngram_ids'].to(self.args.device)
-            inputs['ngram_position_matrix'] = inputs['ngram_position_matrix'].to(self.args.device)
-            inputs['ngram_attention_mask'] = inputs['ngram_attention_mask'].to(self.args.device)
-            inputs['ngram_token_type_ids'] = inputs['ngram_token_type_ids'].to(self.args.device)
-
-        inputs['input_ids'] = inputs['input_ids'].to(self.args.device)
-        inputs['attention_mask'] = inputs['attention_mask'].to(self.args.device)
-        inputs['token_type_ids'] = inputs['token_type_ids'].to(self.args.device)
+            input_ngram_ids = item[4].to(self.args.device)
+            ngram_attention_mask = item[5].to(self.args.device)
+            ngram_token_type_ids = item[6].to(self.args.device)
+            ngram_position_matrix = item[7].to(self.args.device)
 
         # default using 'Transformers' library models.
-        outputs = model(labels=labels, **inputs)
+        if self.args.model_type == 'zen':
+            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
+                            labels=labels, ngram_ids=input_ngram_ids, ngram_positions=ngram_position_matrix,
+                            ngram_attention_mask=ngram_attention_mask, ngram_token_type_ids=ngram_token_type_ids)
+        else:
+            outputs = model(labels=labels, input_ids=input_ids, token_type_ids=token_type_ids,
+                            attention_mask=attention_mask)
         loss = outputs[0]
         loss.backward()

@@ -1170,29 +1167,28 @@ def evaluate(self, model):
         for step, item in enumerate(eval_dataloader):
             model.eval()
 
-            text1 = item[0]
-            labels = item[1].to(args.device)
-
-            if self.args.model_type == 'zen':
-                inputs = convert_examples_to_features(text1=text1, ngram_dict=self.ngram_dict,
-                                                      tokenizer=self.tokenizer, max_seq_length=self.args.max_length,
-                                                      return_tensors=True)
-            else:
-                inputs = self.tokenizer(text1, padding='max_length', max_length=self.args.max_length,
-                                        truncation=True, return_tensors='pt')
-
-            if self.args.model_type == 'zen':
-                inputs['input_ngram_ids'] = inputs['input_ngram_ids'].to(self.args.device)
-                inputs['ngram_position_matrix'] = inputs['ngram_position_matrix'].to(self.args.device)
-                inputs['ngram_attention_mask'] = inputs['ngram_attention_mask'].to(self.args.device)
-                inputs['ngram_token_type_ids'] = inputs['ngram_token_type_ids'].to(self.args.device)
+            input_ids = item[0].to(self.args.device)
+            token_type_ids = item[1].to(self.args.device)
+            attention_mask = item[2].to(self.args.device)
+            labels = item[3].to(self.args.device)
 
-            inputs['input_ids'] = inputs['input_ids'].to(self.args.device)
-            inputs['attention_mask'] = inputs['attention_mask'].to(self.args.device)
-            inputs['token_type_ids'] = inputs['token_type_ids'].to(self.args.device)
+            if args.model_type == 'zen':
+                input_ngram_ids = item[4].to(self.args.device)
+                ngram_attention_mask = item[5].to(self.args.device)
+                ngram_token_type_ids = item[6].to(self.args.device)
+                ngram_position_matrix = item[7].to(self.args.device)
 
             with torch.no_grad():
-                outputs = model(labels=labels, **inputs)
+                if self.args.model_type == 'zen':
+                    outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
+                                    labels=labels, ngram_ids=input_ngram_ids,
+                                    ngram_positions=ngram_position_matrix,
+                                    ngram_token_type_ids=ngram_token_type_ids,
+                                    ngram_attention_mask=ngram_attention_mask)
+                else:
+                    outputs = model(labels=labels, input_ids=input_ids, token_type_ids=token_type_ids,
+                                    attention_mask=attention_mask)
+
                 loss, logits = outputs[:2]
 
                 if preds is None:
@@ -1222,32 +1218,31 @@ def predict(self, test_dataset, model):
         for step, item in enumerate(test_dataloader):
             model.eval()
 
-            text1 = item
-
-            if self.args.model_type == 'zen':
-                inputs = convert_examples_to_features(text1=text1, ngram_dict=self.ngram_dict,
-                                                      tokenizer=self.tokenizer, max_seq_length=self.args.max_length,
-                                                      return_tensors=True)
-            else:
-                inputs = self.tokenizer(text1, padding='max_length', max_length=self.args.max_length,
-                                        truncation=True, return_tensors='pt')
-
-            if self.args.model_type == 'zen':
-                inputs['input_ngram_ids'] = inputs['input_ngram_ids'].to(self.args.device)
-                inputs['ngram_position_matrix'] = inputs['ngram_position_matrix'].to(self.args.device)
-                inputs['ngram_attention_mask'] = inputs['ngram_attention_mask'].to(self.args.device)
-                inputs['ngram_token_type_ids'] = inputs['ngram_token_type_ids'].to(self.args.device)
+            input_ids = item[0].to(self.args.device)
+            token_type_ids = item[1].to(self.args.device)
+            attention_mask = item[2].to(self.args.device)
 
-            inputs['input_ids'] = inputs['input_ids'].to(self.args.device)
-            inputs['attention_mask'] = inputs['attention_mask'].to(self.args.device)
-            inputs['token_type_ids'] = inputs['token_type_ids'].to(self.args.device)
+            if args.model_type == 'zen':
+                input_ngram_ids = item[3].to(self.args.device)
+                ngram_attention_mask = item[4].to(self.args.device)
+                ngram_token_type_ids = item[5].to(self.args.device)
+                ngram_position_matrix = item[6].to(self.args.device)
 
             with torch.no_grad():
-                outputs = model(**inputs)
+                if self.args.model_type == 'zen':
+                    outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
+                                    ngram_ids=input_ngram_ids,
+                                    ngram_positions=ngram_position_matrix,
+                                    ngram_token_type_ids=ngram_token_type_ids,
+                                    ngram_attention_mask=ngram_attention_mask)
+                else:
+                    outputs = model(input_ids=input_ids, token_type_ids=token_type_ids,
+                                    attention_mask=attention_mask)
+
                 if args.model_type == 'zen':
-                    logits = outputs
+                    logits = outputs.detach()
                 else:
-                    logits = outputs[0]
+                    logits = outputs[0].detach()
 
                 if preds is None:
                     preds = logits.detach().cpu().numpy()
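Note on the change: with tokenization done in CTCDataset, the DataLoader's default collation stacks the per-sample numpy arrays (and int labels) into batched tensors, which training_step, evaluate, and predict now unpack positionally from item. A self-contained sketch of that hand-off (illustrative stand-in class, not from the repo):

    import numpy as np
    import torch
    from torch.utils.data import DataLoader, Dataset

    class ToyCTC(Dataset):                          # stand-in for CTCDataset's BERT path
        def __len__(self):
            return 4
        def __getitem__(self, idx):
            ids = np.zeros(32, dtype=np.int64)      # placeholder input_ids
            return ids, np.zeros_like(ids), np.ones_like(ids), 1   # ..., label

    batch = next(iter(DataLoader(ToyCTC(), batch_size=2)))
    input_ids, token_type_ids, attention_mask, labels = batch      # stacked tensors
    assert isinstance(input_ids, torch.Tensor) and input_ids.shape == (2, 32)
    assert labels.tolist() == [1, 1]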

examples/run_ctc.sh (+7 -7)
@@ -4,7 +4,7 @@ DATA_DIR="CBLUEDatasets"
 TASK_NAME="ctc"
 MODEL_TYPE="bert"
 MODEL_DIR="data/model_data"
-MODEL_NAME="chinese-bert-wwm-ext"
+MODEL_NAME="chinese-roberta-large"
 OUTPUT_DIR="data/output"
 RESULT_OUTPUT_DIR="data/result_output"

@@ -23,15 +23,15 @@ if [ $# == 0 ]; then
         --result_output_dir=${RESULT_OUTPUT_DIR} \
         --do_train \
         --max_length=${MAX_LENGTH} \
-        --train_batch_size=16 \
-        --eval_batch_size=16 \
-        --learning_rate=3e-5 \
-        --epochs=3 \
+        --train_batch_size=24 \
+        --eval_batch_size=64 \
+        --learning_rate=2e-5 \
+        --epochs=5 \
         --warmup_proportion=0.1 \
-        --earlystop_patience=3 \
+        --earlystop_patience=10 \
         --logging_steps=200 \
         --save_steps=200 \
-        --seed=2021
+        --seed=1000
 elif [ $1 == "predict" ]; then
     python baselines/run_classifier.py \
         --data_dir=${DATA_DIR} \
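Note on the new hyperparameters: the baseline moves to a larger RoBERTa checkpoint with bigger batches, a lower peak learning rate, more epochs, and a much higher early-stop patience. Under the usual BERT-style semantics, warmup_proportion=0.1 ramps the learning rate over the first tenth of all optimisation steps; a back-of-envelope sketch (the training-set size is an assumed placeholder):

    train_examples = 20000                               # assumption: substitute the real CTC split size
    batch_size, epochs = 24, 5                           # values from the updated script
    steps_per_epoch = -(-train_examples // batch_size)   # ceil division
    total_steps = steps_per_epoch * epochs
    warmup_steps = int(0.1 * total_steps)                # warmup_proportion=0.1
    print(total_steps, warmup_steps)                     # 4170 417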
