Skip to content

Commit 29cd5f6

Browse files
authored
Word Embeddings and computing PCA
1 parent 2e452a9 commit 29cd5f6

14 files changed

+128001
-0
lines changed

word_embeddings_and_PCA/GaussianScatterPCA.svg

+5,758
Loading

word_embeddings_and_PCA/III_Principal_component_analysis.ipynb

+415
Large diffs are not rendered by default.

word_embeddings_and_PCA/II_Manipulating_word_embeddings.ipynb

+985
Large diffs are not rendered by default.

word_embeddings_and_PCA/IV_Compute_PCA.ipynb

+1,698
Large diffs are not rendered by default.

word_embeddings_and_PCA/I_Vector_and_matrix_operation_using_Numpy.ipynb

+824
Large diffs are not rendered by default.

word_embeddings_and_PCA/capitals.txt

+4,952
Large diffs are not rendered by default.

word_embeddings_and_PCA/en-fr.txt

+113,287
Large diffs are not rendered by default.

word_embeddings_and_PCA/map.jpg

570 KB
Loading

word_embeddings_and_PCA/utils.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import numpy as np
2+
3+
4+
def get_vectors(embeddings, words):
    """Stack the embedding vectors of ``words`` into a single matrix.

    Input:
        embeddings: a mapping (anything supporting ``embeddings[word]``)
            from a word to its embedding vector.
        words: an iterable of words to look up, in the desired row order.
    Output:
        X: a 2-D float array whose i-th row is the embedding of the i-th
            word. When ``words`` is empty, an array of shape (0, 300) is
            returned, which is what the previous implementation produced.
    Raises:
        Whatever ``embeddings[word]`` raises for an unknown word
        (``KeyError`` for a plain dict).
    """
    # Collect the rows first and stack once: re-stacking inside the loop
    # copies the whole matrix on every iteration.
    rows = [embeddings[word] for word in words]
    if not rows:
        # 300 is the embedding width the original code hard-coded.
        return np.zeros((0, 300))
    stacked = np.vstack(rows)
    # The original stacked onto a float64 seed row, so the result was always
    # promoted to at least float64; keep that dtype contract.
    return stacked.astype(np.result_type(stacked.dtype, np.float64), copy=False)

word_embeddings_and_PCA/utils_vecs.py

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# This is used to translate English to French
2+
3+
import pandas as pd
4+
from gensim.models import KeyedVectors
5+
import nltk
6+
import unicodedata
7+
import string
8+
9+
# Loading in the French embeddings.
10+
11+
fr_embeddings = KeyedVectors.load_word2vec_format('wiki.multi.fr.vec')
# Read capitals.txt through a context manager so the file handle is closed
# as soon as the text has been read; `f` still holds the file's full text.
with open('capitals.txt', 'r') as capitals_file:
    f = capitals_file.read()
# Set of distinct tokens found in capitals.txt, used for membership tests.
set_words = set(nltk.word_tokenize(f))
14+
15+
def load_translations():
    """Build the English->French dictionary and the French word vectors.

    Reads the space-delimited file 'en-fr.txt', taking its first column as
    the English word and its second column as the French word. An English
    word is capitalized and kept only if it appears in the module-level
    ``set_words``; the first translation seen for a word wins. A fixed list
    of manual corrections is applied afterwards.

    NOTE(review): ``pd.read_csv`` is called with its default header
    handling, so the first line of 'en-fr.txt' is consumed as column names
    and is not treated as a translation pair -- confirm this is intended.

    Output:
        en_to_fr: dict mapping a capitalized English word to a French word.
        fr_to_vec: dict mapping a French word to its vector taken from the
            module-level ``fr_embeddings``.
    Raises:
        KeyError: if a French word is missing from ``fr_embeddings``, or if
            one of the entries scheduled for removal was never added.
    """
    dict_fr = pd.read_csv('en-fr.txt', delimiter = ' ')

    en_to_fr = {}
    fr_to_vec = {}
    # Walk both columns positionally instead of indexing row by row.
    for en, fr in zip(dict_fr.iloc[:, 0], dict_fr.iloc[:, 1]):
        # Empty cells are read as float NaN; they cannot be capitalized and
        # can never match a token in set_words, so skip them.
        if isinstance(en, float):
            continue
        en = en.capitalize()
        # Dict membership is O(1); keep only the first translation seen.
        if en in set_words and en not in en_to_fr:
            en_to_fr[en] = fr
            fr_to_vec[fr] = fr_embeddings[fr]

    # Drop the vectors of translations that are replaced by the manual
    # corrections below (presumably the dictionary's first hit for these
    # countries was not the country name -- TODO confirm).
    for stale in ('syrienne', 'iranienne', 'malien', 'arménienne',
                  'chilien', 'équateur'):
        del fr_to_vec[stale]

    # Manual corrections: (English word or None, key stored in fr_to_vec,
    # word looked up in fr_embeddings). The order matches the original
    # statement order so dictionary insertion order is unchanged.
    corrections = (
        ('Chile', 'chili', 'chili'),
        ('Iran', 'iran', 'iran'),
        ('Turkey', 'turquie', 'turquie'),
        ('Syria', 'syrie', 'syrie'),
        ('Nigeria', 'nigeria', 'nigeria'),
        ('Mali', 'mali', 'mali'),
        # Stored under the unaccented key; en_to_fr is left untouched here.
        (None, 'grece', 'grèce'),
        ('Armenia', 'arménie', 'arménie'),
        ('Ecuador', 'ecuador', 'ecuador'),
        ('Niger', 'niger', 'niger'),
    )
    for english, fr_key, vocab_word in corrections:
        if english is not None:
            en_to_fr[english] = fr_key
        fr_to_vec[fr_key] = fr_embeddings[vocab_word]

    return en_to_fr, fr_to_vec
59+
60+
def remove_accents(data):
    """Return ``data`` lower-cased and reduced to unaccented ASCII letters.

    The text is NFKD-normalized, which splits accented characters into a
    base letter plus combining marks. Only characters found in
    ``string.ascii_letters`` are kept, so combining marks, digits,
    whitespace and punctuation are all dropped.
    """
    decomposed = unicodedata.normalize('NFKD', data)
    letters = []
    for char in decomposed:
        if char in string.ascii_letters:
            letters.append(char)
    joined = ''.join(letters)
    return joined.lower()

word_embeddings_and_PCA/vectors.jpg

165 KB
Loading

word_embeddings_and_PCA/vectorsf.jpg

498 KB
Loading
Binary file not shown.

word_embeddings_and_PCA/word_embf.jpg

47.9 KB
Loading

0 commit comments

Comments
 (0)