|
| 1 | +# This is used to translate English to French. |
| 2 | + |
| 3 | +import pandas as pd |
| 4 | +from gensim.models import KeyedVectors |
| 5 | +import nltk |
| 6 | +import unicodedata |
| 7 | +import string |
| 8 | + |
| 9 | +# Loading in the French embeddings. |
| 10 | + |
# French word vectors, loaded from a word2vec-format file.
fr_embeddings = KeyedVectors.load_word2vec_format('wiki.multi.fr.vec')

# Read the file through a context manager so the handle is closed right
# away; `f` still holds the raw text of capitals.txt, as before.
with open('capitals.txt', 'r') as _capitals_file:
    f = _capitals_file.read()

# Unique tokens found in capitals.txt.
set_words = set(nltk.word_tokenize(f))
| 14 | + |
def load_translations():
    '''
    Build the English-to-French lookup and the matching French vectors.

    Reads the space-delimited word-pair file 'en-fr.txt', keeps the pairs
    whose capitalized English word is a token of capitals.txt (the
    module-level `set_words`), and then applies a fixed list of manual
    corrections. When an English word occurs on several rows, the first
    row wins.

    Returns:
        (en_to_fr, fr_to_vec): `en_to_fr` maps a capitalized English word
        to a French word; `fr_to_vec` maps a French word to its vector
        taken from the module-level `fr_embeddings`.

    Raises:
        KeyError: if a selected French word, or one of the manual
            corrections, is missing from `fr_embeddings`.
    '''
    # NOTE(review): with the default header handling, the first line of the
    # file becomes the column names, so that pair is never examined --
    # confirm the file has a header or that losing the first pair is fine.
    dict_fr = pd.read_csv('en-fr.txt', delimiter=' ')

    en_to_fr = {}
    fr_to_vec = {}
    # Walk the first two columns by position. pandas parses tokens such as
    # "nan" or "null" as missing values (float NaN); those rows are skipped.
    for en, fr in zip(dict_fr.iloc[:, 0], dict_fr.iloc[:, 1]):
        if not (isinstance(en, str) and isinstance(fr, str)):
            continue
        en = en.capitalize()
        # Membership is tested on the dict itself; first occurrence wins.
        if en in set_words and en not in en_to_fr:
            en_to_fr[en] = fr
            fr_to_vec[fr] = fr_embeddings[fr]

    # Drop vectors of dictionary picks that (presumably) correspond to the
    # entries corrected below. pop() tolerates a pick that was never made.
    for stale in ('syrienne', 'iranienne', 'malien', 'arménienne',
                  'chilien', 'équateur'):
        fr_to_vec.pop(stale, None)

    # Each entry is (English key or None, key stored in fr_to_vec, word
    # looked up in fr_embeddings). The order is kept as-is so that dict
    # insertion order does not change.
    corrections = (
        ('Chile', 'chili', 'chili'),
        ('Iran', 'iran', 'iran'),
        ('Turkey', 'turquie', 'turquie'),
        ('Syria', 'syrie', 'syrie'),
        ('Nigeria', 'nigeria', 'nigeria'),
        ('Mali', 'mali', 'mali'),
        # NOTE(review): stored under the unaccented key 'grece' and without
        # an en_to_fr entry -- confirm this is intended.
        (None, 'grece', 'grèce'),
        ('Armenia', 'arménie', 'arménie'),
        ('Ecuador', 'ecuador', 'ecuador'),
        ('Niger', 'niger', 'niger'),
    )
    for en, fr_key, fr_word in corrections:
        if en is not None:
            en_to_fr[en] = fr_key
        fr_to_vec[fr_key] = fr_embeddings[fr_word]
    return en_to_fr, fr_to_vec
| 59 | + |
def remove_accents(data):
    '''
    Return `data` reduced to lowercase ASCII letters.

    The text is NFKD-normalized first, which splits an accented letter
    into its base letter plus combining marks. Every character that is not
    an ASCII letter (marks, digits, spaces, punctuation) is then dropped.
    '''
    decomposed = unicodedata.normalize('NFKD', data)
    letters_only = [ch for ch in decomposed if ch in string.ascii_letters]
    return ''.join(letters_only).lower()
0 commit comments