Commit 8e615dd

Merge pull request #200 from hirofumi0810/transformer
Update CSJ configuration files
2 parents a5d037a + 6eb09a6 commit 8e615dd

15 files changed: +213 −6 lines

examples/csj/s5/conf/asr/blstm_mocha.yaml renamed to examples/csj/s5/conf/asr/mocha/blstm_mocha.yaml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ batch_size: 30
 optimizer: adam
 n_epochs: 25
 convert_to_sgd_epoch: 100
-print_step: 400
+print_step: 800
 metric: edit_distance
 lr: 1e-3
 lr_decay_type: always
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
### topology
n_stacks: 1
n_skips: 1
max_n_frames: 1600
conv_in_channel: 1
conv_channels: "32_32"
conv_kernel_sizes: "(3,3)_(3,3)"
conv_strides: "(1,1)_(1,1)"
conv_poolings: "(2,2)_(2,2)"
conv_batch_norm: false
subsample: "1_1_1_1_1"
enc_type: conv_blstm
bidirectional_sum_fwd_bwd: true
enc_n_units: 512
enc_n_projs: 0
enc_n_layers: 5
subsample_type: drop
lc_chunk_size_left: 40
lc_chunk_size_right: 40
attn_type: mocha
mocha_chunk_size: 4
mocha_init_r: -4
mocha_eps: 1e-6
mocha_std: 1.0
mocha_1dconv: false
mocha_quantity_loss_weight: 1.0 ### this is important
mocha_latency_metric: decot
mocha_latency_loss_weight: 0.0
mocha_decot_lookahead: 16
attn_sharpening_factor: 1.0
attn_dim: 512
attn_n_heads: 1
dec_type: lstm
dec_n_units: 1024
dec_n_projs: 0
dec_n_layers: 1
dec_bottleneck_dim: 1024 ### this is effective
emb_dim: 512
tie_embedding: false
ctc_fc_list: "512"
### optimization
batch_size: 20
optimizer: adam
n_epochs: 25
convert_to_sgd_epoch: 100
print_step: 800
metric: edit_distance
lr: 1e-3
lr_decay_type: always
lr_decay_start_epoch: 10
lr_decay_rate: 0.85
lr_decay_patient_n_epochs: 0
early_stop_patient_n_epochs: 5
sort_stop_epoch: 100
eval_start_epoch: 1
warmup_start_lr: 1e-4
warmup_n_steps: 0
### initialization
param_init: 0.1
### regularization
clip_grad_norm: 5.0
dropout_in: 0.0
dropout_enc: 0.4
dropout_dec: 0.4
dropout_emb: 0.4
dropout_att: 0.0
weight_decay: 1e-6
lsm_prob: 0.1
### MTL
ctc_weight: 0.3
ctc_lsm_prob: 0.1
mtl_per_batch: false
task_specific_layer: false
### alignment
train_word_alignment: /home/inaguma/kaldi/egs/csj/s5/exp/tri4_ali_nodup/split_word_alignments
dev_word_alignment: /home/inaguma/kaldi/egs/csj/s5/exp/tri4_ali_train_dev/split_word_alignments
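The MTL weights in this config (ctc_weight: 0.3, lsm_prob: 0.1, mocha_quantity_loss_weight: 1.0, mocha_latency_loss_weight: 0.0) are combined into a single training objective. Below is a minimal sketch of such a weighted sum; the function and argument names are illustrative assumptions, not the repository's actual API.

def total_loss(loss_att, loss_ctc, loss_quantity, loss_latency=0.0,
               ctc_weight=0.3, quantity_weight=1.0, latency_weight=0.0):
    # Attention cross-entropy (already label-smoothed with lsm_prob) and the
    # auxiliary CTC loss are interpolated by ctc_weight, then the MoChA
    # regularizers are added with their own weights from the config above.
    loss = (1.0 - ctc_weight) * loss_att + ctc_weight * loss_ctc
    loss += quantity_weight * loss_quantity   # mocha_quantity_loss_weight
    loss += latency_weight * loss_latency     # mocha_latency_loss_weight (0 here)
    return loss

# e.g. total_loss(loss_att=2.3, loss_ctc=3.1, loss_quantity=0.05)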
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
### topology
n_stacks: 1
n_skips: 1
max_n_frames: 1600
conv_in_channel: 1
conv_channels: "32_32"
conv_kernel_sizes: "(3,3)_(3,3)"
conv_strides: "(1,1)_(1,1)"
conv_poolings: "(2,2)_(2,2)"
conv_batch_norm: false
subsample: "1_1_1_1_1"
enc_type: conv_blstm
bidirectional_sum_fwd_bwd: true
enc_n_units: 512
enc_n_projs: 0
enc_n_layers: 5
subsample_type: drop
lc_chunk_size_left: 40
lc_chunk_size_right: 40
attn_type: mocha
mocha_chunk_size: 4
mocha_init_r: -4
mocha_eps: 1e-6
mocha_std: 1.0
mocha_1dconv: false
mocha_quantity_loss_weight: 0.0 ### this is important
mocha_latency_metric: minlt
mocha_latency_loss_weight: 1.0
attn_sharpening_factor: 1.0
attn_dim: 512
attn_n_heads: 1
dec_type: lstm
dec_n_units: 1024
dec_n_projs: 0
dec_n_layers: 1
dec_bottleneck_dim: 1024 ### this is effective
emb_dim: 512
tie_embedding: false
ctc_fc_list: "512"
### optimization
batch_size: 20
optimizer: adam
n_epochs: 25
convert_to_sgd_epoch: 100
print_step: 800
metric: edit_distance
lr: 1e-3
lr_decay_type: always
lr_decay_start_epoch: 10
lr_decay_rate: 0.85
lr_decay_patient_n_epochs: 0
early_stop_patient_n_epochs: 5
sort_stop_epoch: 100
eval_start_epoch: 1
warmup_start_lr: 1e-4
warmup_n_steps: 0
### initialization
param_init: 0.1
### regularization
clip_grad_norm: 5.0
dropout_in: 0.0
dropout_enc: 0.4
dropout_dec: 0.4
dropout_emb: 0.4
dropout_att: 0.0
weight_decay: 1e-6
lsm_prob: 0.1
### MTL
ctc_weight: 0.3
ctc_lsm_prob: 0.1
mtl_per_batch: false
task_specific_layer: false
### alignment
train_word_alignment: /home/inaguma/kaldi/egs/csj/s5/exp/tri4_ali_nodup/split_word_alignments
dev_word_alignment: /home/inaguma/kaldi/egs/csj/s5/exp/tri4_ali_train_dev/split_word_alignments
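This variant disables the quantity loss and instead weights a minimum-latency (minlt) term via mocha_latency_loss_weight, presumably using reference token boundaries derived from the Kaldi word alignments listed under ### alignment. A rough sketch of what such a term could compute is shown below; the tensor layout, names, and the exact definition are assumptions, not the repository's implementation.

import torch

def min_latency_loss(alpha, ref_boundaries, token_mask):
    # alpha: [B, L, T] monotonic attention over encoder frames per output token.
    # ref_boundaries: [B, L] reference end frames from forced alignments.
    # token_mask: [B, L] 1 for real tokens, 0 for padding.
    # Penalizes the expected emission frame deviating from the reference boundary.
    T = alpha.size(-1)
    frames = torch.arange(T, dtype=alpha.dtype, device=alpha.device)  # [T]
    expected = (alpha * frames).sum(-1)                               # [B, L]
    delay = (expected - ref_boundaries.to(alpha.dtype)).abs()
    return (delay * token_mask).sum() / token_mask.sum()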

examples/csj/s5/conf/asr/lstm_mocha.yaml renamed to examples/csj/s5/conf/asr/mocha/lstm_mocha.yaml

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ batch_size: 30
 optimizer: adam
 n_epochs: 30 # for uni
 convert_to_sgd_epoch: 100
-print_step: 400
+print_step: 800
 metric: edit_distance
 lr: 1e-3
 lr_decay_type: always
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # optimization
-n_epochs: 15 # 20->15
+n_epochs: 20 # 25->20
 print_step: 800
 lr_decay_start_epoch: 5
 lr_decay_rate: 0.8
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# optimization
n_epochs: 40 # 20->40
print_step: 1600 # 800->1600
lr_decay_start_epoch: 5
lr_decay_rate: 0.9

# mask
freq_width: 13
n_freq_masks: 2
time_width: 50
n_time_masks: 2
time_width_upper: 1.0
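The # mask block carries SpecAugment-style parameters: n_freq_masks bands up to freq_width bins wide and n_time_masks spans up to time_width frames long, with time_width_upper treated here as a cap relative to the utterance length (an assumption about its meaning). A minimal NumPy sketch of such masking, written against these parameter names only and not the repository's implementation:

import numpy as np

def spec_augment(x, freq_width=13, n_freq_masks=2,
                 time_width=50, n_time_masks=2, time_width_upper=1.0):
    # x: [n_frames, n_freq_bins] log-mel features; zero out random
    # frequency bands and time spans as configured above. Illustrative only.
    x = x.copy()
    n_frames, n_bins = x.shape
    for _ in range(n_freq_masks):
        f = np.random.randint(0, freq_width + 1)
        f0 = np.random.randint(0, max(1, n_bins - f))
        x[:, f0:f0 + f] = 0.0
    max_t = min(time_width, int(time_width_upper * n_frames))
    for _ in range(n_time_masks):
        t = np.random.randint(0, max_t + 1)
        t0 = np.random.randint(0, max(1, n_frames - t))
        x[t0:t0 + t, :] = 0.0
    return x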
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# optimization
n_epochs: 40 # 20->40
print_step: 1600 # 800->1600
lr_decay_start_epoch: 5
lr_decay_rate: 0.9

# mask
freq_width: 27
n_freq_masks: 2
time_width: 100
n_time_masks: 2
time_width_upper: 1.0
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# optimization
n_epochs: 40 # 20->40
print_step: 1600 # 800->1600
lr_decay_start_epoch: 5
lr_decay_rate: 0.9

# mask
freq_width: 27
n_freq_masks: 2
time_width: 50
n_time_masks: 2
time_width_upper: 1.0
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
# optimization
n_epochs: 50 # 25->50
print_step: 2400 # 400->2400

# mask
freq_width: 27
n_freq_masks: 2
time_width: 100
n_time_masks: 2
time_width_upper: 1.0
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
# optimization
n_epochs: 50 # 25->50
print_step: 2400 # 400->2400

# mask
freq_width: 27
n_freq_masks: 2
time_width: 50
n_time_masks: 2
time_width_upper: 1.0
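These short files only override a handful of keys (epochs, print interval, SpecAugment masks) on top of a base ASR config such as the MoChA ones above. A minimal sketch of how such an override could be applied when launching training, assuming plain PyYAML; the file paths in the usage comment are hypothetical:

import yaml

def load_config(base_path, *override_paths):
    # Load a base YAML config and apply small override configs on top,
    # later files winning on key conflicts. Illustrative only.
    with open(base_path) as f:
        config = yaml.safe_load(f)
    for path in override_paths:
        with open(path) as f:
            config.update(yaml.safe_load(f) or {})
    return config

# e.g. (hypothetical paths):
# config = load_config("conf/asr/mocha/blstm_mocha.yaml",
#                      "conf/data/spec_augment.yaml")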

examples/csj/s5/conf/lm/transformer_xl.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,8 +1,8 @@
 # topology
 lm_type: transformer_xl
+n_layers: 12
 transformer_d_model: 512
 transformer_d_ff: 2048
-n_layers: 12
 transformer_n_heads: 8
 tie_embedding: true
 # optimization

examples/csj/s5/conf/lm/transformerlm.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,9 +1,9 @@
 # topology
 lm_type: transformer
-transformer_d_model: 512
-transformer_d_ff: 2048
 n_layers: 12
 transformer_pe_type: add
+transformer_d_model: 512
+transformer_d_ff: 2048
 transformer_n_heads: 8
 tie_embedding: true
 # optimization
