|
| 1 | +import re |
| 2 | +import string |
| 3 | + |
| 4 | +import numpy as np |
| 5 | +import pandas as pd |
| 6 | +from nltk.corpus import stopwords |
| 7 | +from nltk.stem import PorterStemmer |
| 8 | +from nltk.tokenize import TweetTokenizer |
| 9 | + |
| 10 | + |
def process_tweet(tweet):
    '''
    Clean, tokenize, and stem a tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of lowercased, stemmed tokens with stock
            tickers, "RT" retweet markers, hyperlinks, hash signs,
            @handles, English stopwords, and punctuation removed
    '''
    stemmer = PorterStemmer()
    # Use a set so each stopword membership test below is O(1) instead of
    # an O(n) scan of the stopword list.
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove only the hash # sign from hashtags, keeping the word itself
    tweet = re.sub(r'#', '', tweet)
    # tokenize: lowercase, strip @handles, and shorten elongated words
    # (e.g. "soooo" -> "sooo") via reduce_len
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # keep only non-stopword, non-punctuation tokens, stemmed
    tweets_clean = [stemmer.stem(word)
                    for word in tweet_tokens
                    if word not in stopwords_english
                    and word not in string.punctuation]

    return tweets_clean
| 44 | + |
| 45 | + |
def get_dict(file_name):
    """
    Build an English-to-French dictionary from a space-delimited file where
    the first column is the English word and the second column is its
    French translation.

    Input:
        file_name: path to the space-delimited two-column file
    Output:
        etof: dict mapping English word -> French word (if a word appears
            more than once, the last occurrence wins, as in a row loop)
    """
    # NOTE(review): read_csv treats the file's first row as a header, so
    # the first word pair is consumed as column names — confirm the data
    # files actually include a header row.
    my_file = pd.read_csv(file_name, delimiter=' ')
    # Positional .iloc column access replaces the fragile chained
    # my_file.loc[i][0] lookup and builds the dict in one vectorized pass
    # instead of a Python-level loop over every row.
    etof = dict(zip(my_file.iloc[:, 0], my_file.iloc[:, 1]))
    return etof
| 60 | + |
| 61 | + |
def cosine_similarity(A, B):
    '''
    Compute the cosine similarity between two word vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: a numpy array which corresponds to a word vector
    Output:
        cos: cosine of the angle between A and B, in [-1, 1]
    '''
    # Removed the dead `cos = -10` placeholder left over from the
    # original assignment scaffolding — it was unconditionally overwritten.
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    # NOTE: if either vector is all zeros this divides by zero and yields
    # nan (with a runtime warning), matching the original behavior.
    cos = dot / (norma * normb)

    return cos
0 commit comments