Skip to content
This repository was archived by the owner on Nov 21, 2023. It is now read-only.

Fixes all current issues with FLORES V1 #40

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
3 changes: 0 additions & 3 deletions floresv1/configs/neen.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
"--warmup-init-lr 1e-7",
"--warmup-updates 4000",
"--lr 0.003",
"--min-lr 1e-9",
"--clip-norm 0.0",
"--dropout 0.3",
"--criterion label_smoothed_cross_entropy",
Expand Down Expand Up @@ -79,7 +78,6 @@
"--warmup-init-lr 1e-7",
"--warmup-updates 4000",
"--lr 0.0007",
"--min-lr 1e-9",
"--clip-norm 0.0",
"--dropout 0.2",
"--criterion label_smoothed_cross_entropy",
Expand Down Expand Up @@ -132,7 +130,6 @@
"--warmup-init-lr 1e-7",
"--warmup-updates 4000",
"--lr 0.001",
"--min-lr 1e-9",
"--clip-norm 0.0",
"--dropout 0.1",
"--criterion label_smoothed_cross_entropy",
Expand Down
31 changes: 13 additions & 18 deletions floresv1/download-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ NE_OPUS_DATASETS=(
"$NE_ROOT/GNOME.en-ne"
"$NE_ROOT/Ubuntu.en-ne"
"$NE_ROOT/KDE4.en-ne"
"$NE_ROOT/GlobalVoices.en-ne"
)

NE_OPUS_URLS=(
"https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-ne.txt.zip"
"https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-ne.txt.zip"
"https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-ne.txt.zip"
"https://opus.nlpl.eu/download.php?f=GlobalVoices/v2018q4/moses/en-ne.txt.zip"
)

REMOVE_FILE_PATHS=()
Expand Down Expand Up @@ -96,6 +98,10 @@ download_opus_data() {
cat ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$SRC
cat ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$TGT

# remove blank lines from GlobalVoices
sed -i '/^$/d' ${DATASETS[4]}.$SRC
sed -i '/^$/d' ${DATASETS[4]}.$TGT

REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC )
REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT )
}
Expand All @@ -107,19 +113,6 @@ REMOVE_FILE_PATHS+=( ${SI_OPUS_DATASETS[3]}.$SRC ${SI_OPUS_DATASETS[3]}.$SI_TGT

download_opus_data $NE_ROOT $NE_TGT


# Download and extract Global Voices data
GLOBAL_VOICES="$NE_ROOT/globalvoices.2018q4.ne-en"
GLOBAL_VOICES_URL="http://www.casmacat.eu/corpus/global-voices/globalvoices.ne-en.xliff.gz"

download_data $GLOBAL_VOICES.gz $GLOBAL_VOICES_URL
gunzip -Nf $GLOBAL_VOICES.gz

sed -ne 's?.*<source>\(.*\)</source>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$NE_TGT
sed -ne 's?.*<target[^>]*>\(.*\)</target>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$SRC

REMOVE_FILE_PATHS+=( $GLOBAL_VOICES )

# Download and extract the bible dataset
BIBLE_TOOLS=$ROOT/bible-corpus-tools
XML_BIBLES=$ROOT/XML_Bibles
Expand Down Expand Up @@ -151,12 +144,14 @@ cat $XML_BIBLES_DUP/aligned/*/Nepali.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$NE_T
REMOVE_FILE_PATHS+=( bible-corpus-1.2.1 bible.tar.gz $BIBLE_TOOLS $XML_BIBLES $XML_BIBLES_DUP )


# If you want to download en-hi then you have to manually do it here https://www.cfilt.iitb.ac.in/iitb_parallel/dataset.html

# Download parallel en-hi corpus
download_data $DATA/en-hi.tgz "http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz"
# download_data $DATA/en-hi.tgz "http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz"
#download_data $DATA/en-hi.tgz "https://www.cse.iitb.ac.in/~anoopk/share/iitb_en_hi_parallel/iitb_corpus_download/parallel.tgz"
tar xvzf $DATA/en-hi.tgz
cp parallel/* $HI_ROOT/
REMOVE_FILE_PATHS+=( parallel $DATA/en-hi.tgz )
# tar xvzf $DATA/en-hi.tgz
# cp parallel/* $HI_ROOT/
# REMOVE_FILE_PATHS+=( parallel $DATA/en-hi.tgz )


# Download and extract the Penn Treebank dataset
Expand Down Expand Up @@ -204,7 +199,7 @@ REMOVE_FILE_PATHS+=( $NE_DICT dictionaries )


# Download test sets
download_data $DATA/wikipedia_en_ne_si_test_sets.tgz "https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz"
download_data $DATA/wikipedia_en_ne_si_test_sets.tgz "https://github.com/facebookresearch/flores/raw/main/data/wikipedia_en_ne_si_test_sets.tgz"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was fixed in a previous PR, it might cause merge conflicts.

REMOVE_FILE_PATHS+=( $MOSES $NE_TAGGED original.zip $DATA/nepali-penn-treebank.$SRC.patch $DATA/nepali-penn-treebank.$NE_TGT.patch )

pushd $DATA/
Expand Down
2 changes: 1 addition & 1 deletion floresv1/prepare-neen.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ ARCHIVES=(
TRAIN_SETS=(
"all-clean-ne/bible_dup.en-ne"
"all-clean-ne/bible.en-ne"
"all-clean-ne/globalvoices.2018q4.ne-en"
"all-clean-ne/GlobalVoices.en-ne"
"all-clean-ne/GNOMEKDEUbuntu.en-ne"
"all-clean-ne/nepali-penn-treebank"
)
Expand Down
2 changes: 1 addition & 1 deletion floresv1/scripts/download_indic.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ROOT=$(dirname "$0")
INDICNLP=$ROOT/indic_nlp_library
if [ ! -e $INDICNLP ]; then
echo "Cloning Indic NLP Library..."
git -C $ROOT clone https://github.com/anoopkunchukuttan/indic_nlp_library.git
git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git $INDICNLP
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

already fixed in a previous PR

pushd $INDICNLP
git reset --hard 0a5e01f2701e0df5bc1f9905334cd7916d874c16
popd
Expand Down
9 changes: 5 additions & 4 deletions floresv1/scripts/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def translate_files(args, dest_dir, input_files):
--max-len-b {args.max_len_b} \
--buffer-size {args.buffer_size} \
--max-tokens {args.max_tokens} \
--skip-invalid-size-inputs-valid-test \
--num-workers {args.cpu} > {{output_file}} && \
echo "finished" >> {{output_file}}
"""
Expand Down Expand Up @@ -124,12 +125,12 @@ def main():
args.cuda_visible_device_ids = args.cuda_visible_device_ids or list(range(torch.cuda.device_count()))

chkpnt = torch.load(args.model)
model_args = chkpnt['args']
model_cfg = chkpnt['cfg']
if args.source_lang is None or args.target_lang is None:
args.source_lang = args.source_lang or model_args.source_lang
args.target_lang = args.target_lang or model_args.target_lang
args.source_lang = args.source_lang or model_cfg['task']['source_lang']
args.target_lang = args.target_lang or model_cfg['task']['target_lang']
if args.databin is None:
args.databin = args.databin or model_args.data
args.databin = args.databin or model_cfg['task']['data']

root_dir = os.path.dirname(os.path.realpath(__file__))
translation_dir = os.path.join(args.dest or root_dir, 'translations', f'{args.source_lang}-{args.target_lang}')
Expand Down