|
| 1 | +import re |
| 2 | +import string |
| 3 | + |
| 4 | +import numpy as np |
| 5 | +import pandas as pd |
| 6 | +from nltk.corpus import stopwords |
| 7 | +from nltk.stem import PorterStemmer |
| 8 | +from nltk.tokenize import TweetTokenizer |
| 9 | + |
| 10 | + |
def process_tweet(tweet):
    '''
    Clean, tokenize, and stem a tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of lowercased, stemmed tokens with stock
            tickers, "RT" retweet markers, hyperlinks, hash signs,
            @handles, English stopwords, and punctuation removed
    '''
    stemmer = PorterStemmer()
    # Use a set so each stopword membership test below is O(1) instead of
    # an O(n) scan of the stopword list.
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove only the hash # sign from hashtags, keeping the word itself
    tweet = re.sub(r'#', '', tweet)
    # tokenize: lowercase, strip @handles, and shorten elongated words
    # (e.g. "soooo" -> "sooo") via reduce_len
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # keep only non-stopword, non-punctuation tokens, stemmed
    tweets_clean = [stemmer.stem(word)
                    for word in tweet_tokens
                    if word not in stopwords_english
                    and word not in string.punctuation]

    return tweets_clean
| 44 | + |
| 45 | + |
def get_dict(file_name):
    """
    Build an English-to-French dictionary from a space-delimited file where
    the first column is the English word and the second column is its
    French translation.

    Input:
        file_name: path to the space-delimited two-column file
    Output:
        etof: dict mapping English word -> French word (if a word appears
            more than once, the last occurrence wins, as in a row loop)
    """
    # NOTE(review): read_csv treats the file's first row as a header, so
    # the first word pair is consumed as column names — confirm the data
    # files actually include a header row.
    my_file = pd.read_csv(file_name, delimiter=' ')
    # Positional .iloc column access replaces the fragile chained
    # my_file.loc[i][0] lookup and builds the dict in one vectorized pass
    # instead of a Python-level loop over every row.
    etof = dict(zip(my_file.iloc[:, 0], my_file.iloc[:, 1]))
    return etof
| 60 | + |
| 61 | + |
def cosine_similarity(A, B):
    '''
    Compute the cosine similarity between two word vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: a numpy array which corresponds to a word vector
    Output:
        cos: cosine of the angle between A and B, in [-1, 1]
    '''
    # Removed the dead `cos = -10` placeholder left over from the
    # original assignment scaffolding — it was unconditionally overwritten.
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    # NOTE: if either vector is all zeros this divides by zero and yields
    # nan (with a runtime warning), matching the original behavior.
    cos = dot / (norma * normb)

    return cos
0 commit comments