Commit 568998b
work with semeval 2016 data
1 parent 2e61687

File tree

9 files changed: +144 -83 lines

.gitignore (+4 -4)

@@ -1,5 +1,5 @@
-
-/.idea
-/data
-/third
+/.idea/
+/cache/
+/data/
+/third/
 __pycache__/

senti/bin/main.py (+56 -53)

@@ -12,7 +12,7 @@
 from senti.rand import seed_rng
 from senti.score import *
 from senti.senti_models import *
-from senti.utils import BalancedSlice, FieldExtractor, RepeatSr, JSONDecoder
+from senti.utils import BalancedSlice, FieldExtractor, JSONDecoder, RepeatSr, temp_chdir


 class SentiData:
@@ -37,22 +37,22 @@ def __init__(self):
         self.classes_ = [0, 1, 2]
         self.average_classes = [0, 2]
         # data
-        os.chdir('data/twitter')
-        labelled_dir = 'semeval'
-        self.train_objs = JSONDecoder(stack.enter_context(open('{}/train.json'.format(labelled_dir))))
-        self.train_docs = FieldExtractor(self.train_objs, 'text')
-        self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'label'), 'int32')
-        distant_srs = [stack.enter_context(open('emote/class_{}.txt'.format(i), encoding='utf-8')) for i in [0, 2]]
-        self.distant_docs = BalancedSlice(distant_srs)
-        self.distant_labels = BalancedSlice((RepeatSr(0), RepeatSr(2)))
-        unsup_sr = stack.enter_context(open('unsup/all.txt', encoding='utf-8'))
-        self.unsup_docs = BalancedSlice([unsup_sr])
-        self.val_objs = JSONDecoder(stack.enter_context(open('{}/val.json'.format(labelled_dir))))
-        self.val_docs = FieldExtractor(self.val_objs, 'text')
-        self.val_labels = FieldExtractor(self.val_objs, 'label')
-        self.test_objs = JSONDecoder(stack.enter_context(open('{}/test.json'.format(labelled_dir))))
-        self.test_docs = FieldExtractor(self.test_objs, 'text')
-        self.test_labels = FieldExtractor(self.test_objs, 'label')
+        self.data_dir = 'data/twitter/semeval_2016'
+        with temp_chdir(self.data_dir):
+            self.train_objs = JSONDecoder(stack.enter_context(open('train.json')))
+            self.train_docs = FieldExtractor(self.train_objs, 'text')
+            self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'label'), 'int32')
+            distant_srs = [stack.enter_context(open('../emote/class_{}.txt'.format(i), encoding='utf-8')) for i in [0, 2]]
+            self.distant_docs = BalancedSlice(distant_srs)
+            self.distant_labels = BalancedSlice((RepeatSr(0), RepeatSr(2)))
+            unsup_sr = stack.enter_context(open('../unsup/all.txt', encoding='utf-8'))
+            self.unsup_docs = BalancedSlice([unsup_sr])
+            self.val_objs = JSONDecoder(stack.enter_context(open('val.json')))
+            self.val_docs = FieldExtractor(self.val_objs, 'text')
+            self.val_labels = FieldExtractor(self.val_objs, 'label')
+            self.test_objs = JSONDecoder(stack.enter_context(open('test.json')))
+            self.test_docs = FieldExtractor(self.test_objs, 'text')
+            self.test_labels = FieldExtractor(self.test_objs, 'label')


 class IMDBData(SentiData):
@@ -63,18 +63,19 @@ def __init__(self):
         self.classes_ = [0, 1, 2]
         self.average_classes = [0, 2]
         # data
-        os.chdir('data/imdb')
-        self.train_objs = JSONDecoder(stack.enter_context(open('train.json')))
-        self.train_docs = FieldExtractor(self.train_objs, 'text')
-        self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'label'), 'int32')
-        unsup_sr = stack.enter_context(open('unsup.json'))
-        self.unsup_docs = BalancedSlice([FieldExtractor(unsup_sr, 'text')])
-        self.val_objs = JSONDecoder(stack.enter_context(open('val.json')))
-        self.val_docs = FieldExtractor(self.val_objs, 'text')
-        self.val_labels = FieldExtractor(self.val_objs, 'label')
-        self.test_objs = JSONDecoder(stack.enter_context(open('test.json')))
-        self.test_docs = FieldExtractor(self.test_objs, 'text')
-        self.test_labels = FieldExtractor(self.test_objs, 'label')
+        self.data_dir = 'data/imdb'
+        with temp_chdir(self.data_dir):
+            self.train_objs = JSONDecoder(stack.enter_context(open('train.json')))
+            self.train_docs = FieldExtractor(self.train_objs, 'text')
+            self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'label'), 'int32')
+            unsup_sr = stack.enter_context(open('unsup.json'))
+            self.unsup_docs = BalancedSlice([FieldExtractor(unsup_sr, 'text')])
+            self.val_objs = JSONDecoder(stack.enter_context(open('val.json')))
+            self.val_docs = FieldExtractor(self.val_objs, 'text')
+            self.val_labels = FieldExtractor(self.val_objs, 'label')
+            self.test_objs = JSONDecoder(stack.enter_context(open('test.json')))
+            self.test_docs = FieldExtractor(self.test_objs, 'text')
+            self.test_labels = FieldExtractor(self.test_objs, 'label')


 class YelpData(SentiData):
@@ -85,16 +86,17 @@ def __init__(self):
         self.classes_ = [1, 2, 3, 4, 5]
         self.average_classes = [1, 2, 3, 4, 5]
         # data
-        os.chdir('data/yelp')
-        self.train_objs = JSONDecoder(stack.enter_context(open('train.json')))
-        self.train_docs = FieldExtractor(self.train_objs, 'text')
-        self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'stars'), 'int32')
-        self.val_objs = JSONDecoder(stack.enter_context(open('val.json')))
-        self.val_docs = FieldExtractor(self.val_objs, 'text')
-        self.val_labels = FieldExtractor(self.val_objs, 'stars')
-        self.test_objs = JSONDecoder(stack.enter_context(open('test.json')))
-        self.test_docs = FieldExtractor(self.test_objs, 'text')
-        self.test_labels = FieldExtractor(self.test_objs, 'stars')
+        self.data_dir = 'data/yelp'
+        with temp_chdir(self.data_dir):
+            self.train_objs = JSONDecoder(stack.enter_context(open('train.json')))
+            self.train_docs = FieldExtractor(self.train_objs, 'text')
+            self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'stars'), 'int32')
+            self.val_objs = JSONDecoder(stack.enter_context(open('val.json')))
+            self.val_docs = FieldExtractor(self.val_objs, 'text')
+            self.val_labels = FieldExtractor(self.val_objs, 'stars')
+            self.test_objs = JSONDecoder(stack.enter_context(open('test.json')))
+            self.test_docs = FieldExtractor(self.test_objs, 'text')
+            self.test_labels = FieldExtractor(self.test_objs, 'stars')


 def main():
@@ -109,11 +111,11 @@ def main():

     # train
     senti_models = SentiModels(data)
-    pipeline_name, pipeline = senti_models.fit_voting()
+    # pipeline_name, pipeline = senti_models.fit_voting()
     # pipeline_name, pipeline = senti_models.fit_logreg()
     # pipeline_name, pipeline = senti_models.fit_word2vec_bayes()
     # pipeline_name, pipeline = senti_models.fit_svm()
-    # pipeline_name, pipeline = senti_models.fit_nn_word()
+    pipeline_name, pipeline = senti_models.fit_nn_word()
     # pipeline_name, pipeline = senti_models.fit_cnn_char()
     # pipeline_name, pipeline = senti_models.fit_cnn_word_char()
     # pipeline_name, pipeline = senti_models.fit_rnn_char_cnn_word()
@@ -125,22 +127,23 @@ def main():
     ]

     # predict & write results
-    classes_ = np.array([0, 1, 2])
     for name, objs, docs, labels in test_data:
-        os.makedirs('results/{}'.format(name), exist_ok=True)
         try:
             probs = pipeline.predict_proba(docs)
         except AttributeError:
-            probs = LabelBinarizer().fit(classes_).transform(pipeline.predict(docs))
-        with open('results/{}/{}.json'.format(name, pipeline_name), 'w') as results_sr:
-            for obj, prob in zip(objs, probs):
-                results_sr.write(json.dumps({
-                    'id': obj['id'], 'label': int(classes_[np.argmax(prob)]),
-                    'probs': [(c.item(), prob.item()) for c, prob in zip(classes_, prob)]
-                }) + '\n')
-        print('{} data: '.format(name))
-        labels = np.fromiter(labels, dtype='int32')
-        write_score('results/{}/{}'.format(name, pipeline_name), labels, probs, classes_, (0, 2))
+            probs = LabelBinarizer().fit(data.classes_).transform(pipeline.predict(docs))
+        results_dir = os.path.join(data.data_dir, 'results', name)
+        os.makedirs(results_dir, exist_ok=True)
+        with temp_chdir(results_dir):
+            with open('{}.json'.format(pipeline_name), 'w') as results_sr:
+                for obj, prob in zip(objs, probs):
+                    results_sr.write(json.dumps({
+                        'id': obj['id'], 'label': data.classes_[np.argmax(prob)],
+                        'probs': [(c, prob.item()) for c, prob in zip(data.classes_, prob)]
+                    }) + '\n')
+            print('{} data: '.format(name))
+            labels = np.fromiter(labels, dtype='int32')
+            write_score('{}'.format(pipeline_name), labels, probs, data.classes_, data.average_classes)

 if __name__ == '__main__':
     main()
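The temp_chdir helper newly imported from senti.utils is used throughout this file but its implementation is not shown in this diff. A minimal sketch of such a context manager, assuming it simply saves and restores the working directory:

import contextlib
import os

@contextlib.contextmanager
def temp_chdir(path):
    # Switch to `path` for the duration of the with-block, then restore
    # the previous working directory even if an exception is raised.
    prev_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev_dir)

Restoring the old directory on exit is what lets relative paths elsewhere in the program (e.g. 'data/google/...' in senti_models.py below) keep working after a SentiData constructor has run, which the old bare os.chdir calls broke.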

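When the chosen pipeline has no predict_proba, the fallback in main() one-hot encodes its hard predictions so the downstream code still sees an (n_samples, n_classes) score array, now binarized against data.classes_ instead of a hard-coded [0, 1, 2]. A small standalone illustration:

import numpy as np
from sklearn.preprocessing import LabelBinarizer

classes_ = [0, 1, 2]
preds = np.array([2, 0, 1])  # hard labels, as returned by pipeline.predict(docs)
probs = LabelBinarizer().fit(classes_).transform(preds)
print(probs)
# [[0 0 1]
#  [1 0 0]
#  [0 1 0]]

np.argmax over each one-hot row then recovers the predicted label, so the JSON-writing loop works identically for probabilistic and hard classifiers.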
senti/bin/summarize.py (+2 -1)

@@ -1,9 +1,10 @@
 #!/usr/bin/env python

 import os
-import pandas as pd
 from collections import OrderedDict

+import pandas as pd
+

 def main():
     os.chdir('data/twitter')

senti/data/imdb.py (+1 -1)

@@ -1,8 +1,8 @@
 #!/usr/bin/env python

+import itertools
 import json
 import os
-import itertools

 from senti.rand import *

senti/data/twitter.py (+57 -10)

@@ -81,14 +81,24 @@ def write_split_emote(cls):
 class SemEvalData:
     class_map = {'negative': 0, 'neutral': 1, 'positive': 2}

+    @classmethod
+    def write_download(cls, out_path, download_path):
+        with open(download_path) as download_sr, open(out_path, 'a+') as out_sr:
+            for line in download_sr:
+                doc_id, label, text = re.match(r'(?:\d+\t)?(\d+)\t(negative|neutral|positive)\t(.+)', line).groups()
+                text = html.unescape(html.unescape(text))
+                if text == 'Not Available':
+                    continue
+                out_sr.write(json.dumps({'id': doc_id, 'text': text, 'label': cls.class_map[label]}) + '\n')
+
     @classmethod
     def write_unitn(cls, out_path, unitn_path, download_path, is_train):
-        with open(unitn_path) as unitn_sr, open(download_path) as download_sr, open(out_path, 'w') as out_sr:
+        with open(unitn_path) as unitn_sr, open(download_path) as download_sr, open(out_path, 'a+') as out_sr:
             for unitn_line, download_line in zip(unitn_sr, download_sr):
                 doc_id_unitn, label_unitn, text_unitn = \
                     re.match(r'\d+\t(\d+)\t(negative|neutral|positive)\t(.+)', unitn_line).groups()
                 doc_id_download, label_download, text_download = \
-                    re.match(r'\d+\t(\d+)\t(negative|neutral|positive)\t('r'.+)', download_line).groups()
+                    re.match(r'\d+\t(\d+)\t(negative|neutral|positive)\t(.+)', download_line).groups()
                 text_unitn = text_unitn.encode().decode('unicode-escape')
                 text_unitn = text_unitn.replace(r'’', '\'')
                 if is_train:
@@ -110,7 +120,7 @@ def write_unitn(cls, out_path, unitn_path, download_path, is_train):

     @classmethod
     def write_test(cls, out_path, download_path, test_path):
-        with open(download_path) as in_sr, open(test_path) as labels_sr, open(out_path, 'w') as out_sr:
+        with open(download_path) as in_sr, open(test_path) as labels_sr, open(out_path, 'a+') as out_sr:
             for line, label_line in zip(in_sr, labels_sr):
                 doc_id, text = re.match(r'NA\t(T\d+)\tunknwn\t(.+)', line).groups()
                 text = html.unescape(html.unescape(text))
@@ -139,22 +149,59 @@ def shuffle_lines(names, in_dir, out_dir):
 def main():
     os.chdir('data/twitter')
     seed_rng(1234)
+
+    # unsup
     # UnsupData.unescape_unsup()
     # UnsupData.write_all_emote()
     # UnsupData.write_split_emote()
+
+    # semeval 2015
+    # for file_name in os.listdir('semeval_2015'):
+    #     os.remove(os.path.join('semeval_2015', file_name))
     # SemEvalData.write_unitn(
-    #     'semeval/val.json', 'input/unitn/dev/gold/twitter-dev-gold-B.tsv',
-    #     'input/dev/gold/twitter-dev-gold-B-downloaded.tsv', False
+    #     'semeval_2015/train.json',
+    #     'input/semeval2015_task10_all/unitn/train/cleansed/twitter-train-cleansed-B.txt',
+    #     'input/semeval2015_task10_all/train/cleansed/twitter-train-cleansed-B-downloaded.tsv', True
     # )
     # SemEvalData.write_unitn(
-    #     'semeval/train.json', 'input/unitn/train/cleansed/twitter-train-cleansed-B.txt',
-    #     'input/train/cleansed/twitter-train-cleansed-B-downloaded.tsv', True
+    #     'semeval_2015/val.json',
+    #     'input/semeval2015_task10_all/unitn/dev/gold/twitter-dev-gold-B.tsv',
+    #     'input/semeval2015_task10_all/dev/gold/twitter-dev-gold-B-downloaded.tsv', False
     # )
     # SemEvalData.write_test(
-    #     'semeval/test.json', 'input/test/SemEval2015-task10-test-B-input.txt',
-    #     'input/test/SemEval2015-task10-test-B-gold.txt'
+    #     'semeval_2015/test.json',
+    #     'input/semeval2015_task10_all/test/SemEval2015-task10-test-B-input.txt',
+    #     'input/semeval2015_task10_all/test/SemEval2015-task10-test-B-gold.txt'
     # )
-    shuffle_lines(['train.json', 'val.json', 'test.json'], 'semeval', 'semeval_random')
+
+    # semeval 2015 random
+    # shuffle_lines(['train.json', 'val.json', 'test.json'], 'semeval_2015', 'semeval_2015_random')
+
+    # semeval 2016
+    for file_name in os.listdir('semeval_2016'):
+        os.remove(os.path.join('semeval_2016', file_name))
+    SemEvalData.write_download(
+        'semeval_2016/train.json',
+        'input/semeval2016-task4.traindev/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.out.txt',
+    )
+    SemEvalData.write_unitn(
+        'semeval_2016/train.json',
+        'input/semeval2015_task10_all/unitn/train/cleansed/twitter-train-cleansed-B.txt',
+        'input/semeval2015_task10_all/train/cleansed/twitter-train-cleansed-B-downloaded.tsv', True
+    )
+    SemEvalData.write_unitn(
+        'semeval_2016/train.json',
+        'input/semeval2015_task10_all/unitn/dev/gold/twitter-dev-gold-B.tsv',
+        'input/semeval2015_task10_all/dev/gold/twitter-dev-gold-B-downloaded.tsv', False
+    )
+    SemEvalData.write_download(
+        'semeval_2016/val.json',
+        'input/semeval2016-task4.traindev/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.out.txt',
+    )
+    SemEvalData.write_download(
+        'semeval_2016/test.json',
+        'input/semeval2016-task4.traindev/test/100_topics_100_tweets.sentence-three-point.subtask-A.test.out.txt',
+    )


 if __name__ == '__main__':

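In write_download, the regex makes the leading topic-id column optional via (?:\d+\t)?, so the same parser accepts both two- and three-column layouts of the SemEval downloads, and html.unescape is applied twice because the distributed tweets are double-escaped (e.g. '&amp;amp;'). A standalone sketch of the parse; the sample line is made up:

import html
import json
import re

class_map = {'negative': 0, 'neutral': 1, 'positive': 2}
line = '628949369883000832\tpositive\tnew office for Mac is great &amp;amp; all\n'
doc_id, label, text = re.match(r'(?:\d+\t)?(\d+)\t(negative|neutral|positive)\t(.+)', line).groups()
text = html.unescape(html.unescape(text))  # '&amp;amp;' -> '&amp;' -> '&'
print(json.dumps({'id': doc_id, 'text': text, 'label': class_map[label]}))

Opening the outputs with 'a+' instead of 'w' is what lets main() funnel the 2016 download plus both 2015 UniTN splits into the same semeval_2016/train.json, and is also why the target directory is emptied first.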
senti/data/yelp.py (+1 -1)

@@ -1,7 +1,7 @@
 #!/usr/bin/env python

-import os
 import itertools
+import os

 from senti.rand import *

senti/score.py (+1 -0)

@@ -15,6 +15,7 @@


 def write_score(name, gold_labels, pred_scores, classes, average_classes):
+    classes, average_classes = np.array(classes), np.array(average_classes)
     gold_scores = LabelBinarizer().fit(classes).transform(gold_labels)
     pred_labels = classes[np.argmax(pred_scores, axis=1)]

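The added cast matters because main() now passes plain Python lists (data.classes_, data.average_classes) rather than a pre-built array: the fancy indexing on the following line only works on np.ndarray. A quick illustration:

import numpy as np

classes = [0, 1, 2]  # plain list, as now passed in from main()
pred_scores = np.array([[0.1, 0.2, 0.7],
                        [0.8, 0.1, 0.1]])
# classes[np.argmax(pred_scores, axis=1)] raises TypeError on a list:
# "only integer scalar arrays can be converted to a scalar index"
classes = np.array(classes)
print(classes[np.argmax(pred_scores, axis=1)])  # [2 0]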
senti/senti_models.py (+12 -12)

@@ -181,7 +181,7 @@ def fit_logreg(self):
             # ]).fit(self.unsup_docs[:10**6])),
             # ('w2v_word_avg_google', Pipeline([
             #     ('tokenize', tokenize_sense),
-            #     ('feature', Word2VecAverage(joblib.load('../google/GoogleNews-vectors-negative300.pickle'))),
+            #     ('feature', Word2VecAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
             # ])),
             # ('w2v_word_norm_avg', Pipeline([
             #     ('tokenize', tokenize_sense),
@@ -191,7 +191,7 @@ def fit_logreg(self):
             # ]).fit(self.unsup_docs[:10**6])),
             ('w2v_word_norm_avg_google', Pipeline([
                 ('tokenize', tokenize_sense),
-                ('feature', Word2VecNormAverage(joblib.load('../google/GoogleNews-vectors-negative300.pickle'))),
+                ('feature', Word2VecNormAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
             ])),
             # ('w2v_word_max', Pipeline([
             #     ('tokenize', tokenize_sense),
@@ -201,7 +201,7 @@ def fit_logreg(self):
             # ]).fit(self.unsup_docs[:10**6])),
             # ('w2v_word_max_google', Pipeline([
             #     ('tokenize', tokenize_sense),
-            #     ('feature', Word2VecMax(joblib.load('../google/GoogleNews-vectors-negative300.pickle'))),
+            #     ('feature', Word2VecMax(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
             # ])),
             # ('w2v_word_inv', ToCorporas(Pipeline([
             #     ('tokenize', MapCorporas(tokenize_sense)),
@@ -233,7 +233,7 @@ def fit_word2vec_bayes(self):

     def _fit_embedding_word(self, embedding_type, construct_docs, tokenize_, d=None):
         if embedding_type == 'google':
-            embeddings_ = joblib.load('../google/GoogleNews-vectors-negative300.pickle')
+            embeddings_ = joblib.load('data/google/GoogleNews-vectors-negative300.pickle')
             embeddings_ = SimpleNamespace(X=embeddings_.syn0, vocab={w: v.index for w, v in embeddings_.vocab.items()})
         elif embedding_type == 'twitter':
             estimator = Pipeline([
@@ -277,18 +277,18 @@ def fit_nn_word(self):
             ('tokenize', tokenize_sense),
             ('embeddings', emb),
         ])
-        # cf = CNNWord(
-        #     batch_size=64, emb_X=emb.X, input_size=56, conv_param=(100, [3, 4, 5]), dense_params=[],
-        #     output_size=3, static_mode=1, max_norm=3, f1_classes=[0, 2]
-        # )
+        cf = CNNWord(
+            batch_size=64, emb_X=emb.X, input_size=56, conv_param=(100, [3, 4, 5]), dense_params=[],
+            output_size=3, static_mode=1, max_norm=3, f1_classes=[0, 2]
+        )
         # cf = CNNWordPredInteraction(
         #     batch_size=64, emb_X=emb.X, input_size=56, conv_param=(100, [3, 4, 5]), dense_params=[],
         #     output_size=3, max_norm=3, f1_classes=[0, 2]
         # )
         # cf = RNNWord(batch_size=64, emb_X=emb.X, lstm_param=300, output_size=3, f1_classes=[0, 2])
-        cf = RNNMultiWord(
-            batch_size=64, input_size=56, emb_X=emb.X, conv_param=3, lstm_param=300, output_size=3, f1_classes=[0, 2]
-        )
+        # cf = RNNMultiWord(
+        #     batch_size=64, input_size=56, emb_X=emb.X, conv_param=3, lstm_param=300, output_size=3, f1_classes=[0, 2]
+        # )
         kw = dict(val_docs=ft.transform(self.val_docs), val_y=self.val_labels())
         cf.fit(ft.transform(distant_docs), distant_labels(), epoch_size=10**4, max_epochs=20, **kw)
         cf.fit(ft.transform(self.train_docs), self.train_labels(), epoch_size=1000, max_epochs=100, **kw)
@@ -371,7 +371,7 @@ def fit_multiview_cnn_word_cnn_char(self):
         return 'multiview_cnn_word_cnn_char(embedding={})'.format(emb_type), estimator

     def _fit_rnn_embedding(self):
-        emb_word = joblib.load('../google/GoogleNews-vectors-negative300.pickle')
+        emb_word = joblib.load('data/google/GoogleNews-vectors-negative300.pickle')
         alphabet = ' abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"/\\|_@#$%^&*~`+-=<>()[]{}'
         emb_char = self._fit_embedding_char('none', alphabet, 300)
         ft_char = Pipeline([

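_fit_embedding_word adapts what appears to be a pickled gensim-style Word2Vec model (a .syn0 matrix plus a .vocab of entries carrying .index) into a plain namespace holding just an embedding matrix X and a word-to-row vocab dict. A minimal illustration of that adaptation, with a stub standing in for the real GoogleNews pickle (the gensim-style attributes are an assumption):

from types import SimpleNamespace

import numpy as np

# Stub in place of joblib.load('data/google/GoogleNews-vectors-negative300.pickle').
model = SimpleNamespace(
    syn0=np.random.rand(3, 300),  # one 300-d row per vocabulary word
    vocab={'good': SimpleNamespace(index=0),
           'bad': SimpleNamespace(index=1),
           'ok': SimpleNamespace(index=2)},
)
emb = SimpleNamespace(X=model.syn0, vocab={w: v.index for w, v in model.vocab.items()})
print(emb.vocab['good'], emb.X.shape)  # 0 (3, 300)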