12
12
from senti .rand import seed_rng
13
13
from senti .score import *
14
14
from senti .senti_models import *
15
- from senti .utils import BalancedSlice , FieldExtractor , RepeatSr , JSONDecoder
15
+ from senti .utils import BalancedSlice , FieldExtractor , JSONDecoder , RepeatSr , temp_chdir
16
16
17
17
18
18
class SentiData :
@@ -37,22 +37,22 @@ def __init__(self):
37
37
self .classes_ = [0 , 1 , 2 ]
38
38
self .average_classes = [0 , 2 ]
39
39
# data
40
- os . chdir ( 'data/twitter' )
41
- labelled_dir = 'semeval'
42
- self .train_objs = JSONDecoder (stack .enter_context (open ('{}/ train.json' . format ( labelled_dir ) )))
43
- self .train_docs = FieldExtractor (self .train_objs , 'text' )
44
- self .train_labels = np .fromiter (FieldExtractor (self .train_objs , 'label' ), 'int32' )
45
- distant_srs = [stack .enter_context (open ('emote/class_{}.txt' .format (i ), encoding = 'utf-8' )) for i in [0 , 2 ]]
46
- self .distant_docs = BalancedSlice (distant_srs )
47
- self .distant_labels = BalancedSlice ((RepeatSr (0 ), RepeatSr (2 )))
48
- unsup_sr = stack .enter_context (open ('unsup/all.txt' , encoding = 'utf-8' ))
49
- self .unsup_docs = BalancedSlice ([unsup_sr ])
50
- self .val_objs = JSONDecoder (stack .enter_context (open ('{}/ val.json' . format ( labelled_dir ) )))
51
- self .val_docs = FieldExtractor (self .val_objs , 'text' )
52
- self .val_labels = FieldExtractor (self .val_objs , 'label' )
53
- self .test_objs = JSONDecoder (stack .enter_context (open ('{}/ test.json' . format ( labelled_dir ) )))
54
- self .test_docs = FieldExtractor (self .test_objs , 'text' )
55
- self .test_labels = FieldExtractor (self .test_objs , 'label' )
40
+ self . data_dir = 'data/twitter/semeval_2016'
41
+ with temp_chdir ( self . data_dir ):
42
+ self .train_objs = JSONDecoder (stack .enter_context (open ('train.json' )))
43
+ self .train_docs = FieldExtractor (self .train_objs , 'text' )
44
+ self .train_labels = np .fromiter (FieldExtractor (self .train_objs , 'label' ), 'int32' )
45
+ distant_srs = [stack .enter_context (open ('../ emote/class_{}.txt' .format (i ), encoding = 'utf-8' )) for i in [0 , 2 ]]
46
+ self .distant_docs = BalancedSlice (distant_srs )
47
+ self .distant_labels = BalancedSlice ((RepeatSr (0 ), RepeatSr (2 )))
48
+ unsup_sr = stack .enter_context (open ('../ unsup/all.txt' , encoding = 'utf-8' ))
49
+ self .unsup_docs = BalancedSlice ([unsup_sr ])
50
+ self .val_objs = JSONDecoder (stack .enter_context (open ('val.json' )))
51
+ self .val_docs = FieldExtractor (self .val_objs , 'text' )
52
+ self .val_labels = FieldExtractor (self .val_objs , 'label' )
53
+ self .test_objs = JSONDecoder (stack .enter_context (open ('test.json' )))
54
+ self .test_docs = FieldExtractor (self .test_objs , 'text' )
55
+ self .test_labels = FieldExtractor (self .test_objs , 'label' )
56
56
57
57
58
58
class IMDBData (SentiData ):
@@ -63,18 +63,19 @@ def __init__(self):
63
63
self .classes_ = [0 , 1 , 2 ]
64
64
self .average_classes = [0 , 2 ]
65
65
# data
66
- os .chdir ('data/imdb' )
67
- self .train_objs = JSONDecoder (stack .enter_context (open ('train.json' )))
68
- self .train_docs = FieldExtractor (self .train_objs , 'text' )
69
- self .train_labels = np .fromiter (FieldExtractor (self .train_objs , 'label' ), 'int32' )
70
- unsup_sr = stack .enter_context (open ('unsup.json' ))
71
- self .unsup_docs = BalancedSlice ([FieldExtractor (unsup_sr , 'text' )])
72
- self .val_objs = JSONDecoder (stack .enter_context (open ('val.json' )))
73
- self .val_docs = FieldExtractor (self .val_objs , 'text' )
74
- self .val_labels = FieldExtractor (self .val_objs , 'label' )
75
- self .test_objs = JSONDecoder (stack .enter_context (open ('test.json' )))
76
- self .test_docs = FieldExtractor (self .test_objs , 'text' )
77
- self .test_labels = FieldExtractor (self .test_objs , 'label' )
66
+ self .data_dir = 'data/imdb'
67
+ with temp_chdir (self .data_dir ):
68
+ self .train_objs = JSONDecoder (stack .enter_context (open ('train.json' )))
69
+ self .train_docs = FieldExtractor (self .train_objs , 'text' )
70
+ self .train_labels = np .fromiter (FieldExtractor (self .train_objs , 'label' ), 'int32' )
71
+ unsup_sr = stack .enter_context (open ('unsup.json' ))
72
+ self .unsup_docs = BalancedSlice ([FieldExtractor (unsup_sr , 'text' )])
73
+ self .val_objs = JSONDecoder (stack .enter_context (open ('val.json' )))
74
+ self .val_docs = FieldExtractor (self .val_objs , 'text' )
75
+ self .val_labels = FieldExtractor (self .val_objs , 'label' )
76
+ self .test_objs = JSONDecoder (stack .enter_context (open ('test.json' )))
77
+ self .test_docs = FieldExtractor (self .test_objs , 'text' )
78
+ self .test_labels = FieldExtractor (self .test_objs , 'label' )
78
79
79
80
80
81
class YelpData (SentiData ):
@@ -85,16 +86,17 @@ def __init__(self):
85
86
self .classes_ = [1 , 2 , 3 , 4 , 5 ]
86
87
self .average_classes = [1 , 2 , 3 , 4 , 5 ]
87
88
# data
88
- os .chdir ('data/yelp' )
89
- self .train_objs = JSONDecoder (stack .enter_context (open ('train.json' )))
90
- self .train_docs = FieldExtractor (self .train_objs , 'text' )
91
- self .train_labels = np .fromiter (FieldExtractor (self .train_objs , 'stars' ), 'int32' )
92
- self .val_objs = JSONDecoder (stack .enter_context (open ('val.json' )))
93
- self .val_docs = FieldExtractor (self .val_objs , 'text' )
94
- self .val_labels = FieldExtractor (self .val_objs , 'stars' )
95
- self .test_objs = JSONDecoder (stack .enter_context (open ('test.json' )))
96
- self .test_docs = FieldExtractor (self .test_objs , 'text' )
97
- self .test_labels = FieldExtractor (self .test_objs , 'stars' )
89
+ self .data_dir = 'data/yelp'
90
+ with temp_chdir (self .data_dir ):
91
+ self .train_objs = JSONDecoder (stack .enter_context (open ('train.json' )))
92
+ self .train_docs = FieldExtractor (self .train_objs , 'text' )
93
+ self .train_labels = np .fromiter (FieldExtractor (self .train_objs , 'stars' ), 'int32' )
94
+ self .val_objs = JSONDecoder (stack .enter_context (open ('val.json' )))
95
+ self .val_docs = FieldExtractor (self .val_objs , 'text' )
96
+ self .val_labels = FieldExtractor (self .val_objs , 'stars' )
97
+ self .test_objs = JSONDecoder (stack .enter_context (open ('test.json' )))
98
+ self .test_docs = FieldExtractor (self .test_objs , 'text' )
99
+ self .test_labels = FieldExtractor (self .test_objs , 'stars' )
98
100
99
101
100
102
def main ():
@@ -109,11 +111,11 @@ def main():
109
111
110
112
# train
111
113
senti_models = SentiModels (data )
112
- pipeline_name , pipeline = senti_models .fit_voting ()
114
+ # pipeline_name, pipeline = senti_models.fit_voting()
113
115
# pipeline_name, pipeline = senti_models.fit_logreg()
114
116
# pipeline_name, pipeline = senti_models.fit_word2vec_bayes()
115
117
# pipeline_name, pipeline = senti_models.fit_svm()
116
- # pipeline_name, pipeline = senti_models.fit_nn_word()
118
+ pipeline_name , pipeline = senti_models .fit_nn_word ()
117
119
# pipeline_name, pipeline = senti_models.fit_cnn_char()
118
120
# pipeline_name, pipeline = senti_models.fit_cnn_word_char()
119
121
# pipeline_name, pipeline = senti_models.fit_rnn_char_cnn_word()
@@ -125,22 +127,23 @@ def main():
125
127
]
126
128
127
129
# predict & write results
128
- classes_ = np .array ([0 , 1 , 2 ])
129
130
for name , objs , docs , labels in test_data :
130
- os .makedirs ('results/{}' .format (name ), exist_ok = True )
131
131
try :
132
132
probs = pipeline .predict_proba (docs )
133
133
except AttributeError :
134
- probs = LabelBinarizer ().fit (classes_ ).transform (pipeline .predict (docs ))
135
- with open ('results/{}/{}.json' .format (name , pipeline_name ), 'w' ) as results_sr :
136
- for obj , prob in zip (objs , probs ):
137
- results_sr .write (json .dumps ({
138
- 'id' : obj ['id' ], 'label' : int (classes_ [np .argmax (prob )]),
139
- 'probs' : [(c .item (), prob .item ()) for c , prob in zip (classes_ , prob )]
140
- }) + '\n ' )
141
- print ('{} data: ' .format (name ))
142
- labels = np .fromiter (labels , dtype = 'int32' )
143
- write_score ('results/{}/{}' .format (name , pipeline_name ), labels , probs , classes_ , (0 , 2 ))
134
+ probs = LabelBinarizer ().fit (data .classes_ ).transform (pipeline .predict (docs ))
135
+ results_dir = os .path .join (data .data_dir , 'results' , name )
136
+ os .makedirs (results_dir , exist_ok = True )
137
+ with temp_chdir (results_dir ):
138
+ with open ('{}.json' .format (pipeline_name ), 'w' ) as results_sr :
139
+ for obj , prob in zip (objs , probs ):
140
+ results_sr .write (json .dumps ({
141
+ 'id' : obj ['id' ], 'label' : data .classes_ [np .argmax (prob )],
142
+ 'probs' : [(c , prob .item ()) for c , prob in zip (data .classes_ , prob )]
143
+ }) + '\n ' )
144
+ print ('{} data: ' .format (name ))
145
+ labels = np .fromiter (labels , dtype = 'int32' )
146
+ write_score ('{}' .format (pipeline_name ), labels , probs , data .classes_ , data .average_classes )
144
147
145
148
if __name__ == '__main__' :
146
149
main ()
0 commit comments