Skip to content

Commit c8d6ad5

Browse files
committed
Release V2.0.1 code: bugfixes
- Removed the duplicate 'license' entry in setup.py, which blocked installation
- Updated file modes for pickle dumps and loads (binary read/write)
- Fixed other Python 3 bugs
- Updated DeepLIFT API calls
- Updated code style
- Added a requirements.txt file
1 parent 92eaafd commit c8d6ad5

24 files changed

+235
-214
lines changed

.gitignore

+2-2
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ _secret/
55
*.png
66
venv/
77
*TODO*
8-
*.pkl
8+
_data/*.pkl
99

1010
# Batch scripts / output
1111
*.sbatch
@@ -20,7 +20,7 @@ __pycache__/
2020
*.py[cod]
2121
*$py.class
2222

23-
# C extensions
23+
# C extensions
2424
*.so
2525

2626
# Distribution / packaging

README.md

+11-16
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ Please visit [riddle.ai](https://riddle.ai).
1414
### Dependencies
1515
Python Libraries:
1616
* Keras (`keras`)
17-
* DeepLIFT (`deeplift`, install from GitHub)
17+
* DeepLIFT (`deeplift`, available on GitHub)
1818
* TensorFlow (`tensorflow`) or Theano (`theano`)
1919
* scikit-learn (`sklearn`)
2020
* NumPy (`numpy`)
@@ -38,12 +38,8 @@ Execute the following command in the outer *repository* folder (not `riddle/ridd
3838
You can clone the GitHub repo and go from there:
3939
```
4040
% git clone --recursive git://github.com/jisungk/riddle.git
41-
```
42-
43-
Alternatively, you can install RIDDLE and DeepLIFT from GitHub using `pip`:
44-
```
45-
% pip install git+git://github.com/kundajelab/deeplift.git # DeepLIFT
46-
% pip install git+git://github.com/jisungk/riddle.git # RIDDLE
41+
% cd riddle
42+
% pip install -r requirements.txt
4743
```
4844

4945
#### How can I run the RIDDLE pipeline?
@@ -61,17 +57,16 @@ Please refer to the example data file `dummy.txt` and the accompanying `README`
6157

6258
### Authors
6359

64-
[Ji-Sung Kim](http://jisungkim.com)
65-
Princeton University
66-
*hello (at) jisungkim.com*
60+
[Ji-Sung Kim](http://jisungkim.com)
61+
Princeton University
62+
*hello (at) jisungkim.com* (technical inquiries)
6763

68-
[Xin Gao](https://scholar.google.com/citations?user=wqdK8ugAAAAJ&hl=en), Associate Professor
69-
King Abdullah University of Science and Technology
70-
*xin.gao (at) kaust.edu.sa*
64+
[Xin Gao](https://scholar.google.com/citations?user=wqdK8ugAAAAJ&hl=en), Associate Professor
65+
King Abdullah University of Science and Technology
7166

72-
[Andrey Rzhetsky](https://scholar.google.com/citations?user=HXCMYLsAAAAJ&hl=en), Edna K. Papazian Professor
73-
University of Chicago
74-
*andrey.rzhetsky (at) uchicago.edu*
67+
[Andrey Rzhetsky](https://scholar.google.com/citations?user=HXCMYLsAAAAJ&hl=en), Edna K. Papazian Professor
68+
University of Chicago
69+
*andrey.rzhetsky (at) uchicago.edu* (research inquiries)
7570

7671
### License & Attribution
7772
All media (including but not limited to designs, images and logos) are copyrighted by Ji-Sung Kim (2017).

deeplift

Submodule deeplift updated 63 files

interpret_riddle.py

+6-5
Original file line number | Diff line number | Diff line change
@@ -188,20 +188,20 @@ def run(data_fn, prop_missing=0., max_num_feature=-1,
188188
start = time.time()
189189

190190
temp_mlp = MLP(num_feature=num_feature, num_class=num_class)
191-
keras_model = load_model(full_out_dir + '/model.h5')
191+
hdf5_path = full_out_dir + '/model.h5'
192192
sums_D, sums_D2, sums_contribs, pairs = \
193193
feature_importance.get_diff_sums(
194-
keras_model,
194+
hdf5_path,
195195
x_test_unvec,
196196
process_x_func=temp_mlp.process_x,
197197
num_feature=num_feature,
198198
num_class=num_class)
199199

200-
with open(full_out_dir + '/sums_D.pkl', 'w') as f:
200+
with open(full_out_dir + '/sums_D.pkl', 'wb') as f:
201201
pickle.dump(sums_D, f)
202-
with open(full_out_dir + '/sums_D2.pkl', 'w') as f:
202+
with open(full_out_dir + '/sums_D2.pkl', 'wb') as f:
203203
pickle.dump(sums_D2, f)
204-
with open(full_out_dir + '/sums_contribs.pkl', 'w') as f:
204+
with open(full_out_dir + '/sums_contribs.pkl', 'wb') as f:
205205
pickle.dump(sums_contribs, f)
206206

207207
list_sums_D.append(sums_D)
@@ -233,6 +233,7 @@ def compute_total_sums(list_sums):
233233
print('-' * 72)
234234
print()
235235

236+
236237
def main():
237238
"""Main method."""
238239
np.random.seed(SEED) # for reproducibility, must be before Keras imports!

other_clf.py

+10-7
Original file line number | Diff line number | Diff line change
@@ -115,14 +115,16 @@ def run(ModelClass, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
115115
x_train_unvec, y_train, idx_feat_dict,
116116
method=feature_selection, num_feature=num_feature,
117117
max_num_feature=max_num_feature)
118-
x_train_unvec = subset_reencode_features(x_train_unvec, feat_encoding_dict)
119-
x_test_unvec = subset_reencode_features(x_test_unvec, feat_encoding_dict)
118+
x_train_unvec = subset_reencode_features(
119+
x_train_unvec, feat_encoding_dict)
120+
x_test_unvec = subset_reencode_features(
121+
x_test_unvec, feat_encoding_dict)
120122
num_feature = max_num_feature
121123

122124
x_train = vectorize_features(x_train_unvec, num_feature)
123125
x_test = vectorize_features(x_test_unvec, num_feature)
124126

125-
args = dict(init_args) # copy dictionary
127+
args = dict(init_args) # copy dictionary
126128
args.update(params[k_idx])
127129

128130
start = time.time()
@@ -173,20 +175,21 @@ def run_kfold(data_fn, method='logit', prop_missing=0., max_num_feature=-1,
173175
"""
174176
start = time.time()
175177

176-
try: # load saved parameters
178+
try: # load saved parameters
177179
param_path = get_param_path(cache_dir, method, data_fn, prop_missing,
178180
max_num_feature, feature_selection)
179-
with open(param_path, 'r') as f:
181+
with open(param_path, 'rb') as f:
180182
params = pickle.load(f)
181183
except:
182184
warnings.warn('Cannot load parameters from: {}\n'.format(param_path) +
183185
'Need to do parameter search; run parameter_search.py')
184186
raise
185187

186-
# TODO(jisungkim) handle binary and multiclass separately, don't assume multiclass!
188+
# TODO(jisungkim) handle binary and multiclass separately, don't assume
189+
# multiclass!
187190
if method == 'logit':
188191
from sklearn.linear_model import LogisticRegression as ModelClass
189-
init_args = {'multi_class': 'multinomial', 'solver':'lbfgs'}
192+
init_args = {'multi_class': 'multinomial', 'solver': 'lbfgs'}
190193
elif method == 'random_forest':
191194
from sklearn.ensemble import RandomForestClassifier as ModelClass
192195
init_args = {}

parameter_search.py

+9-10
Original file line number | Diff line number | Diff line change
@@ -141,14 +141,14 @@ def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
141141
start = time.time()
142142
if method == 'riddle':
143143
model_class = MLP
144-
init_args = {'num_feature':num_feature, 'num_class': num_class}
144+
init_args = {'num_feature': num_feature, 'num_class': num_class}
145145
param_dist = {
146146
'num_hidden_layer': 2, # [1, 2]
147147
'num_hidden_node': 512, # [128, 256, 512]
148148
'activation': ['prelu', 'relu'],
149149
'dropout': tuning.Uniform(lo=0.2, hi=0.8),
150150
'learning_rate': tuning.UniformLogSpace(10, lo=-6, hi=-1),
151-
}
151+
}
152152
best_param = tuning.random_search(
153153
model_class, init_args, param_dist, x_val_unvec, y_val,
154154
num_class=num_class, k=TUNING_K, num_search=num_search)
@@ -167,7 +167,7 @@ def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
167167
'max_features': ['sqrt', 'log2', None],
168168
'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0, hi=7),
169169
'n_estimators': tuning.UniformIntegerLogSpace(base=2, lo=4, hi=8)
170-
}
170+
}
171171
elif method == 'linear_svm':
172172
from sklearn.svm import SVC
173173
# remark: due to a bug in scikit-learn / libsvm, the sparse 'linear'
@@ -177,30 +177,30 @@ def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
177177
probability=True, cache_size=1000)
178178
param_dist = {
179179
'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1)
180-
}
180+
}
181181
elif method == 'poly_svm':
182182
from sklearn.svm import SVC
183183
estimator = SVC(kernel='poly', probability=True, cache_size=1000)
184184
param_dist = {
185185
'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
186186
'degree': [2, 3, 4],
187187
'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
188-
}
188+
}
189189
elif method == 'rbf_svm':
190190
from sklearn.svm import SVC
191191
estimator = SVC(kernel='rbf', probability=True, cache_size=1000)
192192
param_dist = {
193193
'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
194194
'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
195-
}
195+
}
196196
elif method == 'gbdt':
197197
from xgboost import XGBClassifier
198198
estimator = XGBClassifier(objective='multi:softprob')
199199
param_dist = {
200200
'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0, hi=5),
201201
'n_estimators': tuning.UniformIntegerLogSpace(base=2, lo=4, hi=8),
202202
'learning_rate': tuning.UniformLogSpace(base=10, lo=-3, hi=0)
203-
}
203+
}
204204
else:
205205
raise ValueError('unknown method: {}'.format(method))
206206

@@ -209,7 +209,6 @@ def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
209209
scoring=loss_scorer)
210210
param_search.fit(x_val, y_val)
211211

212-
213212
best_param = param_search.best_params_
214213

215214
print('Best parameters for {} for k_idx={}: {} found in {:.3f} s'
@@ -258,7 +257,7 @@ def run_kfold(data_fn, method='logit', prop_missing=0., max_num_feature=-1,
258257
param_path = get_param_path(cache_dir, method, data_fn, prop_missing,
259258
max_num_feature, feature_selection)
260259
if not force_run and os.path.isfile(param_path):
261-
warnings.warn('Already did search for {}, not performing search'
260+
warnings.warn('Already did search for {}, skipping the search'
262261
.format(method))
263262
return
264263

@@ -275,7 +274,7 @@ def run_kfold(data_fn, method='logit', prop_missing=0., max_num_feature=-1,
275274
k_idx=k_idx, k=k, num_search=num_search, perm_indices=perm_indices)
276275

277276
recursive_mkdir(FLAGS.cache_dir)
278-
with open(param_path, 'w') as f: # save
277+
with open(param_path, 'wb') as f: # save
279278
pickle.dump(params, f)
280279

281280
print('Finished parameter search for method: {}'.format(method))

requirements.txt

+8
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
keras
2+
tensorflow
3+
scikit-learn
4+
xgboost
5+
numpy
6+
scipy
7+
matplotlib
8+
h5py

riddle.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -181,7 +181,7 @@ def run_kfold(data_fn, prop_missing=0., max_num_feature=-1,
181181

182182
base_out_dir = get_base_out_dir(out_dir, 'riddle', data_fn, prop_missing,
183183
max_num_feature, feature_selection)
184-
recursive_mkdir(cache_dir)
184+
recursive_mkdir(base_out_dir)
185185

186186
# get common data
187187
x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict, perm_indices = (
@@ -197,10 +197,10 @@ def run_kfold(data_fn, prop_missing=0., max_num_feature=-1,
197197
for idx, feat in idx_feat_dict.items():
198198
f.write('{}\t{}\n'.format(idx, feat))
199199

200-
try: # load saved parameters
200+
try: # load saved parameters
201201
param_path = get_param_path(cache_dir, 'riddle', data_fn, prop_missing,
202202
max_num_feature, feature_selection)
203-
with open(param_path, 'r') as f:
203+
with open(param_path, 'rb') as f:
204204
params = pickle.load(f)
205205

206206
# for legacy compatibility

riddle/__init__.py

+1-2
Original file line number | Diff line number | Diff line change
@@ -11,11 +11,10 @@
1111
from . import roc
1212
from . import tuning
1313

14-
__version__ = '2.0.0'
14+
__version__ = '2.0.1'
1515

1616

1717
def hello():
1818
"""Print out the current version."""
1919
print('Hello, World')
2020
print('My name is RIDDLE {}'.format(__version__))
21-

riddle/emr.py

+8-7
Original file line number | Diff line number | Diff line change
@@ -106,7 +106,8 @@ def get_k_fold_partition(x_unvec, y, k_idx, k, perm_indices):
106106
(x_unvec_train, y_train), (x_unvec_val, y_val) = _split_data(
107107
x_unvec_train, y_train, k_idx=0, k=10, perm_indices=val_perm_indices)
108108

109-
assert len(x_unvec_train) + len(x_unvec_val) + len(x_unvec_test) == len(x_unvec)
109+
assert len(x_unvec_train) + len(x_unvec_val) + \
110+
len(x_unvec_test) == len(x_unvec)
110111

111112
return x_unvec_train, y_train, x_unvec_val, y_val, x_unvec_test, y_test
112113

@@ -125,18 +126,18 @@ def get_icd9_descript_dict(path):
125126
lines = _read_file(path)
126127
icd9_descript_dict = {}
127128

128-
for l in lines[1:]: # ignore first line which is column names
129+
for l in lines[1:]: # ignore first line which is column names
129130
elems = l.split('\t')
130131

131132
try:
132-
assert len(elems) == 8 # number of columns should be 8
133+
assert len(elems) == 8 # number of columns should be 8
133134
except:
134135
print('Problem with following line while loading icd9_descript_dict:')
135136
print(l)
136137
raise
137138

138-
icd9 = elems[0] # ICD9 code should be in the first column
139-
descript = elems[1] # description should be in the second column
139+
icd9 = elems[0] # ICD9 code should be in the first column
140+
descript = elems[1] # description should be in the second column
140141

141142
# check if the ICD9 code is a category and if so, append a label
142143
is_category = len(icd9.split('.')) == 1
@@ -199,13 +200,13 @@ def _clean_data(data, icd9_descript_dict, no_onset_age=True):
199200
icd9s.sort(key=lambda i: int(i[1]))
200201

201202
if no_onset_age:
202-
icd9s = [i[0] for i in icd9s] # remove onset age
203+
icd9s = [i[0] for i in icd9s] # remove onset age
203204
else:
204205
icd9s = [':'.join(i) for i in icd9s]
205206
features.extend(icd9s)
206207

207208
x_raw.append(features)
208-
y_raw.append(line[RAW_CLASS_COL]) # extract class
209+
y_raw.append(line[RAW_CLASS_COL]) # extract class
209210
except:
210211
print('WARNING: error on line #{} with case:'.format(idx))
211212
print(' '.join(line))

0 commit comments

Comments
 (0)