Skip to content

Commit 2bb173b

Browse files
authored
English to French Translator
1 parent 29cd5f6 commit 2bb173b

19 files changed

+17009
-0
lines changed

English_to_French_Language_translator/III_Machine_Translation.ipynb

+2,054
Large diffs are not rendered by default.

English_to_French_Language_translator/II_Hash_functions_and_multiplanes.ipynb

+661
Large diffs are not rendered by default.

English_to_French_Language_translator/I_Vector_manipulation_in_Python.ipynb

+402
Large diffs are not rendered by default.
50.2 KB
Loading
422 KB
Loading
503 KB
Loading
94.3 KB
Loading

English_to_French_Language_translator/en-fr.test.txt

+2,943
Large diffs are not rendered by default.

English_to_French_Language_translator/en-fr.train.txt

+10,872
Large diffs are not rendered by default.
Binary file not shown.
334 KB
Loading
Binary file not shown.
Loading
Loading
132 KB
Loading
Loading
91.8 KB
Loading
57.3 KB
Loading
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import re
2+
import string
3+
4+
import numpy as np
5+
import pandas as pd
6+
from nltk.corpus import stopwords
7+
from nltk.stem import PorterStemmer
8+
from nltk.tokenize import TweetTokenizer
9+
10+
11+
def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        a list of stemmed, lower-cased tokens with tickers, the leading
        "RT" marker, hyperlinks, hash signs, handles, English stopwords
        and punctuation removed
    '''
    stemmer = PorterStemmer()
    # A set makes the per-token stopword test below a constant-time lookup.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks: match only the URL itself (up to the next
    # whitespace) so that words following a link on the same line are kept
    tweet = re.sub(r'https?://\S+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets: lower-case, drop @handles, shorten repeated characters
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # NOTE: string.punctuation is a str, so `word not in string.punctuation`
    # is a substring test; it drops single punctuation characters (and any
    # token that happens to be a contiguous run of that string).
    return [
        stemmer.stem(word)  # stemming word
        for word in tweet_tokens
        if word not in stopwords_english      # remove stopwords
        and word not in string.punctuation    # remove punctuation
    ]
44+
45+
46+
def get_dict(file_name):
    """
    Return the English-to-French dictionary stored in a two-column file.

    Each line of the file holds one space-separated pair: the English word
    in the first column and its French translation in the second.

    Input:
        file_name: path (or any source accepted by pandas.read_csv) of the
            space-delimited word-pair file
    Output:
        etof: dict mapping each English word to its French translation; when
            an English word appears on several lines, the last one wins

    NOTE(review): read_csv is called without `header=None`, so pandas uses the
    first line of the file as column names and that pair is NOT part of the
    returned dictionary. This is kept as-is for backward compatibility;
    confirm whether it is intended.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')
    # Select the columns by position: the column labels are whatever words
    # happen to be on the first line, so label-based access is not usable.
    english_words = my_file.iloc[:, 0]
    french_words = my_file.iloc[:, 1]
    # Pair the two columns row by row; later duplicates overwrite earlier ones.
    etof = dict(zip(english_words, french_words))

    return etof
60+
61+
62+
def cosine_similarity(A, B):
    '''
    Compute the cosine similarity between two word vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: A numpy array which corresponds to a word vector
    Output:
        cos: numerical number representing the cosine similarity between A and B.

    No zero check is performed: if either vector has zero norm the division
    is 0/0 and the result follows NumPy's division semantics (nan).
    '''
    dot = np.dot(A, B)
    # Product of the two Euclidean norms normalizes the dot product.
    norm_product = np.linalg.norm(A) * np.linalg.norm(B)
    cos = dot / norm_product

    return cos

0 commit comments

Comments
 (0)