Commit 568998b
work with semeval 2016 data
1 parent 2e61687

File tree

9 files changed: +144 -83 lines

.gitignore (+4 -4)

@@ -1,5 +1,5 @@
-
-/.idea
-/data
-/third
+/.idea/
+/cache/
+/data/
+/third/
 __pycache__/

senti/bin/main.py (+56 -53)

@@ -12,7 +12,7 @@
 from senti.rand import seed_rng
 from senti.score import *
 from senti.senti_models import *
-from senti.utils import BalancedSlice, FieldExtractor, RepeatSr, JSONDecoder
+from senti.utils import BalancedSlice, FieldExtractor, JSONDecoder, RepeatSr, temp_chdir


 class SentiData:
@@ -37,22 +37,22 @@ def __init__(self):
         self.classes_ = [0, 1, 2]
         self.average_classes = [0, 2]
         # data
-        os.chdir('data/twitter')
-        labelled_dir = 'semeval'
-        self.train_objs = JSONDecoder(stack.enter_context(open('{}/train.json'.format(labelled_dir))))
-        self.train_docs = FieldExtractor(self.train_objs, 'text')
-        self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'label'), 'int32')
-        distant_srs = [stack.enter_context(open('emote/class_{}.txt'.format(i), encoding='utf-8')) for i in [0, 2]]
-        self.distant_docs = BalancedSlice(distant_srs)
-        self.distant_labels = BalancedSlice((RepeatSr(0), RepeatSr(2)))
-        unsup_sr = stack.enter_context(open('unsup/all.txt', encoding='utf-8'))
-        self.unsup_docs = BalancedSlice([unsup_sr])
-        self.val_objs = JSONDecoder(stack.enter_context(open('{}/val.json'.format(labelled_dir))))
-        self.val_docs = FieldExtractor(self.val_objs, 'text')
-        self.val_labels = FieldExtractor(self.val_objs, 'label')
-        self.test_objs = JSONDecoder(stack.enter_context(open('{}/test.json'.format(labelled_dir))))
-        self.test_docs = FieldExtractor(self.test_objs, 'text')
-        self.test_labels = FieldExtractor(self.test_objs, 'label')
+        self.data_dir = 'data/twitter/semeval_2016'
+        with temp_chdir(self.data_dir):
+            self.train_objs = JSONDecoder(stack.enter_context(open('train.json')))
+            self.train_docs = FieldExtractor(self.train_objs, 'text')
+            self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'label'), 'int32')
+            distant_srs = [stack.enter_context(open('../emote/class_{}.txt'.format(i), encoding='utf-8')) for i in [0, 2]]
+            self.distant_docs = BalancedSlice(distant_srs)
+            self.distant_labels = BalancedSlice((RepeatSr(0), RepeatSr(2)))
+            unsup_sr = stack.enter_context(open('../unsup/all.txt', encoding='utf-8'))
+            self.unsup_docs = BalancedSlice([unsup_sr])
+            self.val_objs = JSONDecoder(stack.enter_context(open('val.json')))
+            self.val_docs = FieldExtractor(self.val_objs, 'text')
+            self.val_labels = FieldExtractor(self.val_objs, 'label')
+            self.test_objs = JSONDecoder(stack.enter_context(open('test.json')))
+            self.test_docs = FieldExtractor(self.test_objs, 'text')
+            self.test_labels = FieldExtractor(self.test_objs, 'label')


 class IMDBData(SentiData):
@@ -63,18 +63,19 @@ def __init__(self):
         self.classes_ = [0, 1, 2]
         self.average_classes = [0, 2]
         # data
-        os.chdir('data/imdb')
-        self.train_objs = JSONDecoder(stack.enter_context(open('train.json')))
-        self.train_docs = FieldExtractor(self.train_objs, 'text')
-        self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'label'), 'int32')
-        unsup_sr = stack.enter_context(open('unsup.json'))
-        self.unsup_docs = BalancedSlice([FieldExtractor(unsup_sr, 'text')])
-        self.val_objs = JSONDecoder(stack.enter_context(open('val.json')))
-        self.val_docs = FieldExtractor(self.val_objs, 'text')
-        self.val_labels = FieldExtractor(self.val_objs, 'label')
-        self.test_objs = JSONDecoder(stack.enter_context(open('test.json')))
-        self.test_docs = FieldExtractor(self.test_objs, 'text')
-        self.test_labels = FieldExtractor(self.test_objs, 'label')
+        self.data_dir = 'data/imdb'
+        with temp_chdir(self.data_dir):
+            self.train_objs = JSONDecoder(stack.enter_context(open('train.json')))
+            self.train_docs = FieldExtractor(self.train_objs, 'text')
+            self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'label'), 'int32')
+            unsup_sr = stack.enter_context(open('unsup.json'))
+            self.unsup_docs = BalancedSlice([FieldExtractor(unsup_sr, 'text')])
+            self.val_objs = JSONDecoder(stack.enter_context(open('val.json')))
+            self.val_docs = FieldExtractor(self.val_objs, 'text')
+            self.val_labels = FieldExtractor(self.val_objs, 'label')
+            self.test_objs = JSONDecoder(stack.enter_context(open('test.json')))
+            self.test_docs = FieldExtractor(self.test_objs, 'text')
+            self.test_labels = FieldExtractor(self.test_objs, 'label')


 class YelpData(SentiData):
@@ -85,16 +86,17 @@ def __init__(self):
         self.classes_ = [1, 2, 3, 4, 5]
         self.average_classes = [1, 2, 3, 4, 5]
         # data
-        os.chdir('data/yelp')
-        self.train_objs = JSONDecoder(stack.enter_context(open('train.json')))
-        self.train_docs = FieldExtractor(self.train_objs, 'text')
-        self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'stars'), 'int32')
-        self.val_objs = JSONDecoder(stack.enter_context(open('val.json')))
-        self.val_docs = FieldExtractor(self.val_objs, 'text')
-        self.val_labels = FieldExtractor(self.val_objs, 'stars')
-        self.test_objs = JSONDecoder(stack.enter_context(open('test.json')))
-        self.test_docs = FieldExtractor(self.test_objs, 'text')
-        self.test_labels = FieldExtractor(self.test_objs, 'stars')
+        self.data_dir = 'data/yelp'
+        with temp_chdir(self.data_dir):
+            self.train_objs = JSONDecoder(stack.enter_context(open('train.json')))
+            self.train_docs = FieldExtractor(self.train_objs, 'text')
+            self.train_labels = np.fromiter(FieldExtractor(self.train_objs, 'stars'), 'int32')
+            self.val_objs = JSONDecoder(stack.enter_context(open('val.json')))
+            self.val_docs = FieldExtractor(self.val_objs, 'text')
+            self.val_labels = FieldExtractor(self.val_objs, 'stars')
+            self.test_objs = JSONDecoder(stack.enter_context(open('test.json')))
+            self.test_docs = FieldExtractor(self.test_objs, 'text')
+            self.test_labels = FieldExtractor(self.test_objs, 'stars')


 def main():
@@ -109,11 +111,11 @@ def main():

     # train
     senti_models = SentiModels(data)
-    pipeline_name, pipeline = senti_models.fit_voting()
+    # pipeline_name, pipeline = senti_models.fit_voting()
     # pipeline_name, pipeline = senti_models.fit_logreg()
     # pipeline_name, pipeline = senti_models.fit_word2vec_bayes()
     # pipeline_name, pipeline = senti_models.fit_svm()
-    # pipeline_name, pipeline = senti_models.fit_nn_word()
+    pipeline_name, pipeline = senti_models.fit_nn_word()
     # pipeline_name, pipeline = senti_models.fit_cnn_char()
     # pipeline_name, pipeline = senti_models.fit_cnn_word_char()
     # pipeline_name, pipeline = senti_models.fit_rnn_char_cnn_word()
@@ -125,22 +127,23 @@ def main():
     ]

     # predict & write results
-    classes_ = np.array([0, 1, 2])
     for name, objs, docs, labels in test_data:
-        os.makedirs('results/{}'.format(name), exist_ok=True)
         try:
             probs = pipeline.predict_proba(docs)
         except AttributeError:
-            probs = LabelBinarizer().fit(classes_).transform(pipeline.predict(docs))
-        with open('results/{}/{}.json'.format(name, pipeline_name), 'w') as results_sr:
-            for obj, prob in zip(objs, probs):
-                results_sr.write(json.dumps({
-                    'id': obj['id'], 'label': int(classes_[np.argmax(prob)]),
-                    'probs': [(c.item(), prob.item()) for c, prob in zip(classes_, prob)]
-                }) + '\n')
-        print('{} data: '.format(name))
-        labels = np.fromiter(labels, dtype='int32')
-        write_score('results/{}/{}'.format(name, pipeline_name), labels, probs, classes_, (0, 2))
+            probs = LabelBinarizer().fit(data.classes_).transform(pipeline.predict(docs))
+        results_dir = os.path.join(data.data_dir, 'results', name)
+        os.makedirs(results_dir, exist_ok=True)
+        with temp_chdir(results_dir):
+            with open('{}.json'.format(pipeline_name), 'w') as results_sr:
+                for obj, prob in zip(objs, probs):
+                    results_sr.write(json.dumps({
+                        'id': obj['id'], 'label': data.classes_[np.argmax(prob)],
+                        'probs': [(c, prob.item()) for c, prob in zip(data.classes_, prob)]
+                    }) + '\n')
+            print('{} data: '.format(name))
+            labels = np.fromiter(labels, dtype='int32')
+            write_score('{}'.format(pipeline_name), labels, probs, data.classes_, data.average_classes)

 if __name__ == '__main__':
     main()
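The temp_chdir helper newly imported from senti.utils is used throughout this file but its implementation is not shown in this diff. A minimal sketch of such a context manager, assuming it simply saves and restores the working directory:

import contextlib
import os

@contextlib.contextmanager
def temp_chdir(path):
    # Switch to `path` for the duration of the with-block, then restore
    # the previous working directory even if an exception is raised.
    prev_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev_dir)

Restoring the old directory on exit is what lets relative paths elsewhere in the program (e.g. 'data/google/...' in senti_models.py below) keep working after a SentiData constructor has run, which the old bare os.chdir calls broke.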

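When the chosen pipeline has no predict_proba, the fallback in main() one-hot encodes its hard predictions so the downstream code still sees an (n_samples, n_classes) score array, now binarized against data.classes_ instead of a hard-coded [0, 1, 2]. A small standalone illustration:

import numpy as np
from sklearn.preprocessing import LabelBinarizer

classes_ = [0, 1, 2]
preds = np.array([2, 0, 1])  # hard labels, as returned by pipeline.predict(docs)
probs = LabelBinarizer().fit(classes_).transform(preds)
print(probs)
# [[0 0 1]
#  [1 0 0]
#  [0 1 0]]

np.argmax over each one-hot row then recovers the predicted label, so the JSON-writing loop works identically for probabilistic and hard classifiers.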
senti/bin/summarize.py (+2 -1)

@@ -1,9 +1,10 @@
 #!/usr/bin/env python

 import os
-import pandas as pd
 from collections import OrderedDict

+import pandas as pd
+

 def main():
     os.chdir('data/twitter')

senti/data/imdb.py (+1 -1)

@@ -1,8 +1,8 @@
 #!/usr/bin/env python

+import itertools
 import json
 import os
-import itertools

 from senti.rand import *

senti/data/twitter.py (+57 -10)

@@ -81,14 +81,24 @@ def write_split_emote(cls):
 class SemEvalData:
     class_map = {'negative': 0, 'neutral': 1, 'positive': 2}

+    @classmethod
+    def write_download(cls, out_path, download_path):
+        with open(download_path) as download_sr, open(out_path, 'a+') as out_sr:
+            for line in download_sr:
+                doc_id, label, text = re.match(r'(?:\d+\t)?(\d+)\t(negative|neutral|positive)\t(.+)', line).groups()
+                text = html.unescape(html.unescape(text))
+                if text == 'Not Available':
+                    continue
+                out_sr.write(json.dumps({'id': doc_id, 'text': text, 'label': cls.class_map[label]}) + '\n')
+
     @classmethod
     def write_unitn(cls, out_path, unitn_path, download_path, is_train):
-        with open(unitn_path) as unitn_sr, open(download_path) as download_sr, open(out_path, 'w') as out_sr:
+        with open(unitn_path) as unitn_sr, open(download_path) as download_sr, open(out_path, 'a+') as out_sr:
             for unitn_line, download_line in zip(unitn_sr, download_sr):
                 doc_id_unitn, label_unitn, text_unitn = \
                     re.match(r'\d+\t(\d+)\t(negative|neutral|positive)\t(.+)', unitn_line).groups()
                 doc_id_download, label_download, text_download = \
-                    re.match(r'\d+\t(\d+)\t(negative|neutral|positive)\t('r'.+)', download_line).groups()
+                    re.match(r'\d+\t(\d+)\t(negative|neutral|positive)\t(.+)', download_line).groups()
                 text_unitn = text_unitn.encode().decode('unicode-escape')
                 text_unitn = text_unitn.replace(r'’', '\'')
                 if is_train:
@@ -110,7 +120,7 @@ def write_unitn(cls, out_path, unitn_path, download_path, is_train):

     @classmethod
     def write_test(cls, out_path, download_path, test_path):
-        with open(download_path) as in_sr, open(test_path) as labels_sr, open(out_path, 'w') as out_sr:
+        with open(download_path) as in_sr, open(test_path) as labels_sr, open(out_path, 'a+') as out_sr:
             for line, label_line in zip(in_sr, labels_sr):
                 doc_id, text = re.match(r'NA\t(T\d+)\tunknwn\t(.+)', line).groups()
                 text = html.unescape(html.unescape(text))
@@ -139,22 +149,59 @@ def shuffle_lines(names, in_dir, out_dir):
 def main():
     os.chdir('data/twitter')
     seed_rng(1234)
+
+    # unsup
     # UnsupData.unescape_unsup()
     # UnsupData.write_all_emote()
     # UnsupData.write_split_emote()
+
+    # semeval 2015
+    # for file_name in os.listdir('semeval_2015'):
+    #     os.remove(os.path.join('semeval_2015', file_name))
     # SemEvalData.write_unitn(
-    #     'semeval/val.json', 'input/unitn/dev/gold/twitter-dev-gold-B.tsv',
-    #     'input/dev/gold/twitter-dev-gold-B-downloaded.tsv', False
+    #     'semeval_2015/train.json',
+    #     'input/semeval2015_task10_all/unitn/train/cleansed/twitter-train-cleansed-B.txt',
+    #     'input/semeval2015_task10_all/train/cleansed/twitter-train-cleansed-B-downloaded.tsv', True
     # )
     # SemEvalData.write_unitn(
-    #     'semeval/train.json', 'input/unitn/train/cleansed/twitter-train-cleansed-B.txt',
-    #     'input/train/cleansed/twitter-train-cleansed-B-downloaded.tsv', True
+    #     'semeval_2015/val.json',
+    #     'input/semeval2015_task10_all/unitn/dev/gold/twitter-dev-gold-B.tsv',
+    #     'input/semeval2015_task10_all/dev/gold/twitter-dev-gold-B-downloaded.tsv', False
     # )
     # SemEvalData.write_test(
-    #     'semeval/test.json', 'input/test/SemEval2015-task10-test-B-input.txt',
-    #     'input/test/SemEval2015-task10-test-B-gold.txt'
+    #     'semeval_2015/test.json',
+    #     'input/semeval2015_task10_all/test/SemEval2015-task10-test-B-input.txt',
+    #     'input/semeval2015_task10_all/test/SemEval2015-task10-test-B-gold.txt'
     # )
-    shuffle_lines(['train.json', 'val.json', 'test.json'], 'semeval', 'semeval_random')
+
+    # semeval 2015 random
+    # shuffle_lines(['train.json', 'val.json', 'test.json'], 'semeval_2015', 'semeval_2015_random')
+
+    # semeval 2016
+    for file_name in os.listdir('semeval_2016'):
+        os.remove(os.path.join('semeval_2016', file_name))
+    SemEvalData.write_download(
+        'semeval_2016/train.json',
+        'input/semeval2016-task4.traindev/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.out.txt',
+    )
+    SemEvalData.write_unitn(
+        'semeval_2016/train.json',
+        'input/semeval2015_task10_all/unitn/train/cleansed/twitter-train-cleansed-B.txt',
+        'input/semeval2015_task10_all/train/cleansed/twitter-train-cleansed-B-downloaded.tsv', True
+    )
+    SemEvalData.write_unitn(
+        'semeval_2016/train.json',
+        'input/semeval2015_task10_all/unitn/dev/gold/twitter-dev-gold-B.tsv',
+        'input/semeval2015_task10_all/dev/gold/twitter-dev-gold-B-downloaded.tsv', False
+    )
+    SemEvalData.write_download(
+        'semeval_2016/val.json',
+        'input/semeval2016-task4.traindev/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.out.txt',
+    )
+    SemEvalData.write_download(
+        'semeval_2016/test.json',
+        'input/semeval2016-task4.traindev/test/100_topics_100_tweets.sentence-three-point.subtask-A.test.out.txt',
+    )


 if __name__ == '__main__':

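In write_download, the regex makes the leading topic-id column optional via (?:\d+\t)?, so the same parser accepts both two- and three-column layouts of the SemEval downloads, and html.unescape is applied twice because the distributed tweets are double-escaped (e.g. '&amp;amp;'). A standalone sketch of the parse; the sample line is made up:

import html
import json
import re

class_map = {'negative': 0, 'neutral': 1, 'positive': 2}
line = '628949369883000832\tpositive\tnew office for Mac is great &amp;amp; all\n'
doc_id, label, text = re.match(r'(?:\d+\t)?(\d+)\t(negative|neutral|positive)\t(.+)', line).groups()
text = html.unescape(html.unescape(text))  # '&amp;amp;' -> '&amp;' -> '&'
print(json.dumps({'id': doc_id, 'text': text, 'label': class_map[label]}))

Opening the outputs with 'a+' instead of 'w' is what lets main() funnel the 2016 download plus both 2015 UniTN splits into the same semeval_2016/train.json, and is also why the target directory is emptied first.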
senti/data/yelp.py (+1 -1)

@@ -1,7 +1,7 @@
 #!/usr/bin/env python

-import os
 import itertools
+import os

 from senti.rand import *

senti/score.py (+1 -0)

@@ -15,6 +15,7 @@


 def write_score(name, gold_labels, pred_scores, classes, average_classes):
+    classes, average_classes = np.array(classes), np.array(average_classes)
     gold_scores = LabelBinarizer().fit(classes).transform(gold_labels)
     pred_labels = classes[np.argmax(pred_scores, axis=1)]

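The added cast matters because main() now passes plain Python lists (data.classes_, data.average_classes) rather than a pre-built array: the fancy indexing on the following line only works on np.ndarray. A quick illustration:

import numpy as np

classes = [0, 1, 2]  # plain list, as now passed in from main()
pred_scores = np.array([[0.1, 0.2, 0.7],
                        [0.8, 0.1, 0.1]])
# classes[np.argmax(pred_scores, axis=1)] raises TypeError on a list:
# "only integer scalar arrays can be converted to a scalar index"
classes = np.array(classes)
print(classes[np.argmax(pred_scores, axis=1)])  # [2 0]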
senti/senti_models.py (+12 -12)

@@ -181,7 +181,7 @@ def fit_logreg(self):
             # ]).fit(self.unsup_docs[:10**6])),
             # ('w2v_word_avg_google', Pipeline([
             #     ('tokenize', tokenize_sense),
-            #     ('feature', Word2VecAverage(joblib.load('../google/GoogleNews-vectors-negative300.pickle'))),
+            #     ('feature', Word2VecAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
             # ])),
             # ('w2v_word_norm_avg', Pipeline([
             #     ('tokenize', tokenize_sense),
@@ -191,7 +191,7 @@ def fit_logreg(self):
             # ]).fit(self.unsup_docs[:10**6])),
             ('w2v_word_norm_avg_google', Pipeline([
                 ('tokenize', tokenize_sense),
-                ('feature', Word2VecNormAverage(joblib.load('../google/GoogleNews-vectors-negative300.pickle'))),
+                ('feature', Word2VecNormAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
             ])),
             # ('w2v_word_max', Pipeline([
             #     ('tokenize', tokenize_sense),
@@ -201,7 +201,7 @@ def fit_logreg(self):
             # ]).fit(self.unsup_docs[:10**6])),
             # ('w2v_word_max_google', Pipeline([
             #     ('tokenize', tokenize_sense),
-            #     ('feature', Word2VecMax(joblib.load('../google/GoogleNews-vectors-negative300.pickle'))),
+            #     ('feature', Word2VecMax(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
             # ])),
             # ('w2v_word_inv', ToCorporas(Pipeline([
             #     ('tokenize', MapCorporas(tokenize_sense)),
@@ -233,7 +233,7 @@ def fit_word2vec_bayes(self):

     def _fit_embedding_word(self, embedding_type, construct_docs, tokenize_, d=None):
         if embedding_type == 'google':
-            embeddings_ = joblib.load('../google/GoogleNews-vectors-negative300.pickle')
+            embeddings_ = joblib.load('data/google/GoogleNews-vectors-negative300.pickle')
             embeddings_ = SimpleNamespace(X=embeddings_.syn0, vocab={w: v.index for w, v in embeddings_.vocab.items()})
         elif embedding_type == 'twitter':
             estimator = Pipeline([
@@ -277,18 +277,18 @@ def fit_nn_word(self):
             ('tokenize', tokenize_sense),
             ('embeddings', emb),
         ])
-        # cf = CNNWord(
-        #     batch_size=64, emb_X=emb.X, input_size=56, conv_param=(100, [3, 4, 5]), dense_params=[],
-        #     output_size=3, static_mode=1, max_norm=3, f1_classes=[0, 2]
-        # )
+        cf = CNNWord(
+            batch_size=64, emb_X=emb.X, input_size=56, conv_param=(100, [3, 4, 5]), dense_params=[],
+            output_size=3, static_mode=1, max_norm=3, f1_classes=[0, 2]
+        )
         # cf = CNNWordPredInteraction(
         #     batch_size=64, emb_X=emb.X, input_size=56, conv_param=(100, [3, 4, 5]), dense_params=[],
         #     output_size=3, max_norm=3, f1_classes=[0, 2]
         # )
         # cf = RNNWord(batch_size=64, emb_X=emb.X, lstm_param=300, output_size=3, f1_classes=[0, 2])
-        cf = RNNMultiWord(
-            batch_size=64, input_size=56, emb_X=emb.X, conv_param=3, lstm_param=300, output_size=3, f1_classes=[0, 2]
-        )
+        # cf = RNNMultiWord(
+        #     batch_size=64, input_size=56, emb_X=emb.X, conv_param=3, lstm_param=300, output_size=3, f1_classes=[0, 2]
+        # )
         kw = dict(val_docs=ft.transform(self.val_docs), val_y=self.val_labels())
         cf.fit(ft.transform(distant_docs), distant_labels(), epoch_size=10**4, max_epochs=20, **kw)
         cf.fit(ft.transform(self.train_docs), self.train_labels(), epoch_size=1000, max_epochs=100, **kw)
@@ -371,7 +371,7 @@ def fit_multiview_cnn_word_cnn_char(self):
         return 'multiview_cnn_word_cnn_char(embedding={})'.format(emb_type), estimator

     def _fit_rnn_embedding(self):
-        emb_word = joblib.load('../google/GoogleNews-vectors-negative300.pickle')
+        emb_word = joblib.load('data/google/GoogleNews-vectors-negative300.pickle')
         alphabet = ' abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"/\\|_@#$%^&*~`+-=<>()[]{}'
         emb_char = self._fit_embedding_char('none', alphabet, 300)
         ft_char = Pipeline([

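_fit_embedding_word adapts what appears to be a pickled gensim-style Word2Vec model (a .syn0 matrix plus a .vocab of entries carrying .index) into a plain namespace holding just an embedding matrix X and a word-to-row vocab dict. A minimal illustration of that adaptation, with a stub standing in for the real GoogleNews pickle (the gensim-style attributes are an assumption):

from types import SimpleNamespace

import numpy as np

# Stub in place of joblib.load('data/google/GoogleNews-vectors-negative300.pickle').
model = SimpleNamespace(
    syn0=np.random.rand(3, 300),  # one 300-d row per vocabulary word
    vocab={'good': SimpleNamespace(index=0),
           'bad': SimpleNamespace(index=1),
           'ok': SimpleNamespace(index=2)},
)
emb = SimpleNamespace(X=model.syn0, vocab={w: v.index for w, v in model.vocab.items()})
print(emb.vocab['good'], emb.X.shape)  # 0 (3, 300)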