jisungk
diff --git a/‎.gitignore
+2-2 b/‎.gitignore
+2-2
diff --git a/‎README.md
+11-16 b/‎README.md
+11-16
diff --git a/‎deeplift b/‎deeplift
diff --git a/‎interpret_riddle.py
+6-5 b/‎interpret_riddle.py
+6-5
diff --git a/‎other_clf.py
+10-7 b/‎other_clf.py
+10-7
diff --git a/‎parameter_search.py
+9-10 b/‎parameter_search.py
+9-10
diff --git a/‎requirements.txt
+8 b/‎requirements.txt
+8
diff --git a/‎riddle.py
+3-3 b/‎riddle.py
+3-3
diff --git a/‎riddle/__init__.py
+1-2 b/‎riddle/__init__.py
+1-2
diff --git a/‎riddle/emr.py
+8-7 b/‎riddle/emr.py
+8-7
@@ -5,7 +5,7 @@ _secret/
 *.png
 venv/
 *TODO*
-*.pkl
+_data/*.pkl
 
 # Batch scripts / output
 *.sbatch
@@ -20,7 +20,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 
-# C extensions
+# C extensions
 *.so
 
 # Distribution / packaging
 
@@ -14,7 +14,7 @@ Please visit [riddle.ai](https://riddle.ai).
 ### Dependencies
 Python Libraries:
 * Keras (`keras`)
-* DeepLIFT (`deeplift`, install from GitHub)
+* DeepLIFT (`deeplift`, available on GitHub)
 * TensorFlow (`tensorflow`) or Theano (`theano`)
 * scikit-learn (`sklearn`)
 * NumPy (`numpy`)
@@ -38,12 +38,8 @@ Execute the following command in the outer *repository* folder (not `riddle/ridd
 You can clone the GitHub repo and go from there:
 ```
 % git clone --recursive https://github.com/jisungk/riddle.git
-```
-
-Alternatively, you can install RIDDLE and DeepLIFT from GitHub using `pip`:
-```
-% pip install git+git://github.com/kundajelab/deeplift.git # DeepLIFT
-% pip install git+git://github.com/jisungk/riddle.git      # RIDDLE
+% cd riddle
+% pip install -r requirements.txt
 ```
 
 #### How can I run the RIDDLE pipeline?
@@ -61,17 +57,16 @@ Please refer to the example data file `dummy.txt` and the accompanying `README`
 
 ### Authors
 
-[Ji-Sung Kim](http://jisungkim.com)
-Princeton University
-*hello (at) jisungkim.com*
+[Ji-Sung Kim](http://jisungkim.com)  
+Princeton University  
+*hello (at) jisungkim.com* (technical inquiries)  
 
-[Xin Gao](https://scholar.google.com/citations?user=wqdK8ugAAAAJ&hl=en), Associate Professor
-King Abdullah University of Science and Technology
-*xin.gao (at) kaust.edu.sa*
+[Xin Gao](https://scholar.google.com/citations?user=wqdK8ugAAAAJ&hl=en), Associate Professor  
+King Abdullah University of Science and Technology  
 
-[Andrey Rzhetsky](https://scholar.google.com/citations?user=HXCMYLsAAAAJ&hl=en), Edna K. Papazian Professor
-University of Chicago
-*andrey.rzhetsky (at) uchicago.edu*
+[Andrey Rzhetsky](https://scholar.google.com/citations?user=HXCMYLsAAAAJ&hl=en), Edna K. Papazian Professor  
+University of Chicago  
+*andrey.rzhetsky (at) uchicago.edu* (research inquiries)  
 
 ### License & Attribution
 All media (including but not limited to designs, images and logos) are copyrighted by Ji-Sung Kim (2017).
 
@@ -188,20 +188,20 @@ def run(data_fn, prop_missing=0., max_num_feature=-1,
         start = time.time()
 
         temp_mlp = MLP(num_feature=num_feature, num_class=num_class)
-        keras_model = load_model(full_out_dir + '/model.h5')
+        hdf5_path = full_out_dir + '/model.h5'
         sums_D, sums_D2, sums_contribs, pairs = \
             feature_importance.get_diff_sums(
-                keras_model,
+                hdf5_path,
                 x_test_unvec,
                 process_x_func=temp_mlp.process_x,
                 num_feature=num_feature,
                 num_class=num_class)
 
-        with open(full_out_dir + '/sums_D.pkl', 'w') as f:
+        with open(full_out_dir + '/sums_D.pkl', 'wb') as f:
             pickle.dump(sums_D, f)
-        with open(full_out_dir + '/sums_D2.pkl', 'w') as f:
+        with open(full_out_dir + '/sums_D2.pkl', 'wb') as f:
             pickle.dump(sums_D2, f)
-        with open(full_out_dir + '/sums_contribs.pkl', 'w') as f:
+        with open(full_out_dir + '/sums_contribs.pkl', 'wb') as f:
             pickle.dump(sums_contribs, f)
 
         list_sums_D.append(sums_D)
@@ -233,6 +233,7 @@ def compute_total_sums(list_sums):
     print('-' * 72)
     print()
 
+
 def main():
     """Main method."""
     np.random.seed(SEED)  # for reproducibility, must be before Keras imports!
 
@@ -115,14 +115,16 @@ def run(ModelClass, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
             x_train_unvec, y_train, idx_feat_dict,
             method=feature_selection, num_feature=num_feature,
             max_num_feature=max_num_feature)
-        x_train_unvec = subset_reencode_features(x_train_unvec, feat_encoding_dict)
-        x_test_unvec = subset_reencode_features(x_test_unvec, feat_encoding_dict)
+        x_train_unvec = subset_reencode_features(
+            x_train_unvec, feat_encoding_dict)
+        x_test_unvec = subset_reencode_features(
+            x_test_unvec, feat_encoding_dict)
         num_feature = max_num_feature
 
     x_train = vectorize_features(x_train_unvec, num_feature)
     x_test = vectorize_features(x_test_unvec, num_feature)
 
-    args = dict(init_args) # copy dictionary
+    args = dict(init_args)  # copy dictionary
     args.update(params[k_idx])
 
     start = time.time()
@@ -173,20 +175,21 @@ def run_kfold(data_fn, method='logit', prop_missing=0., max_num_feature=-1,
     """
     start = time.time()
 
-    try: # load saved parameters
+    try:  # load saved parameters
         param_path = get_param_path(cache_dir, method, data_fn, prop_missing,
                                     max_num_feature, feature_selection)
-        with open(param_path, 'r') as f:
+        with open(param_path, 'rb') as f:
             params = pickle.load(f)
     except:
         warnings.warn('Cannot load parameters from: {}\n'.format(param_path) +
                       'Need to do parameter search; run parameter_search.py')
         raise
 
-    # TODO(jisungkim) handle binary and multiclass separately, don't assume multiclass!
+    # TODO(jisungkim) handle binary and multiclass separately, don't assume
+    # multiclass!
     if method == 'logit':
         from sklearn.linear_model import LogisticRegression as ModelClass
-        init_args = {'multi_class': 'multinomial', 'solver':'lbfgs'}
+        init_args = {'multi_class': 'multinomial', 'solver': 'lbfgs'}
     elif method == 'random_forest':
         from sklearn.ensemble import RandomForestClassifier as ModelClass
         init_args = {}
 
@@ -141,14 +141,14 @@ def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
     start = time.time()
     if method == 'riddle':
         model_class = MLP
-        init_args = {'num_feature':num_feature, 'num_class': num_class}
+        init_args = {'num_feature': num_feature, 'num_class': num_class}
         param_dist = {
             'num_hidden_layer': 2,  # [1, 2]
             'num_hidden_node': 512,  # [128, 256, 512]
             'activation': ['prelu', 'relu'],
             'dropout': tuning.Uniform(lo=0.2, hi=0.8),
             'learning_rate': tuning.UniformLogSpace(10, lo=-6, hi=-1),
-            }
+        }
         best_param = tuning.random_search(
             model_class, init_args, param_dist, x_val_unvec, y_val,
             num_class=num_class, k=TUNING_K, num_search=num_search)
@@ -167,7 +167,7 @@ def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
                 'max_features': ['sqrt', 'log2', None],
                 'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0, hi=7),
                 'n_estimators': tuning.UniformIntegerLogSpace(base=2, lo=4, hi=8)
-                }
+            }
         elif method == 'linear_svm':
             from sklearn.svm import SVC
             # remark: due to a bug in scikit-learn / libsvm, the sparse 'linear'
@@ -177,30 +177,30 @@ def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
                             probability=True, cache_size=1000)
             param_dist = {
                 'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1)
-                }
+            }
         elif method == 'poly_svm':
             from sklearn.svm import SVC
             estimator = SVC(kernel='poly', probability=True, cache_size=1000)
             param_dist = {
                 'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
                 'degree': [2, 3, 4],
                 'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
-                }
+            }
         elif method == 'rbf_svm':
             from sklearn.svm import SVC
             estimator = SVC(kernel='rbf', probability=True, cache_size=1000)
             param_dist = {
                 'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
                 'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
-                }
+            }
         elif method == 'gbdt':
             from xgboost import XGBClassifier
             estimator = XGBClassifier(objective='multi:softprob')
             param_dist = {
                 'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0, hi=5),
                 'n_estimators': tuning.UniformIntegerLogSpace(base=2, lo=4, hi=8),
                 'learning_rate': tuning.UniformLogSpace(base=10, lo=-3, hi=0)
-                }
+            }
         else:
             raise ValueError('unknown method: {}'.format(method))
 
@@ -209,7 +209,6 @@ def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
             scoring=loss_scorer)
         param_search.fit(x_val, y_val)
 
-
         best_param = param_search.best_params_
 
     print('Best parameters for {} for k_idx={}: {} found in {:.3f} s'
@@ -258,7 +257,7 @@ def run_kfold(data_fn, method='logit', prop_missing=0., max_num_feature=-1,
     param_path = get_param_path(cache_dir, method, data_fn, prop_missing,
                                 max_num_feature, feature_selection)
     if not force_run and os.path.isfile(param_path):
-        warnings.warn('Already did search for {}, not performing search'
+        warnings.warn('Already did search for {}, skipping the search'
                       .format(method))
         return
 
@@ -275,7 +274,7 @@ def run_kfold(data_fn, method='logit', prop_missing=0., max_num_feature=-1,
             k_idx=k_idx, k=k, num_search=num_search, perm_indices=perm_indices)
 
     recursive_mkdir(FLAGS.cache_dir)
-    with open(param_path, 'w') as f:  # save
+    with open(param_path, 'wb') as f:  # save
         pickle.dump(params, f)
 
     print('Finished parameter search for method: {}'.format(method))
 
@@ -0,0 +1,8 @@
+keras
+tensorflow
+scikit-learn
+xgboost
+numpy
+scipy
+matplotlib
+h5py
@@ -181,7 +181,7 @@ def run_kfold(data_fn, prop_missing=0., max_num_feature=-1,
 
     base_out_dir = get_base_out_dir(out_dir, 'riddle', data_fn, prop_missing,
                                     max_num_feature, feature_selection)
-    recursive_mkdir(cache_dir)
+    recursive_mkdir(base_out_dir)
 
     # get common data
     x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict, perm_indices = (
@@ -197,10 +197,10 @@ def run_kfold(data_fn, prop_missing=0., max_num_feature=-1,
         for idx, feat in idx_feat_dict.items():
             f.write('{}\t{}\n'.format(idx, feat))
 
-    try: # load saved parameters
+    try:  # load saved parameters
         param_path = get_param_path(cache_dir, 'riddle', data_fn, prop_missing,
                                     max_num_feature, feature_selection)
-        with open(param_path, 'r') as f:
+        with open(param_path, 'rb') as f:
             params = pickle.load(f)
 
         # for legacy compatibility
 
@@ -11,11 +11,10 @@
 from . import roc
 from . import tuning
 
-__version__ = '2.0.0'
+__version__ = '2.0.1'
 
 
 def hello():
     """Print a greeting followed by the current RIDDLE version."""
     print('Hello, World')
     print('My name is RIDDLE {}'.format(__version__))
-
@@ -106,7 +106,8 @@ def get_k_fold_partition(x_unvec, y, k_idx, k, perm_indices):
     (x_unvec_train, y_train), (x_unvec_val, y_val) = _split_data(
         x_unvec_train, y_train, k_idx=0, k=10, perm_indices=val_perm_indices)
 
-    assert len(x_unvec_train) + len(x_unvec_val) + len(x_unvec_test) == len(x_unvec)
+    assert len(x_unvec_train) + len(x_unvec_val) + \
+        len(x_unvec_test) == len(x_unvec)
 
     return x_unvec_train, y_train, x_unvec_val, y_val, x_unvec_test, y_test
 
@@ -125,18 +126,18 @@ def get_icd9_descript_dict(path):
     lines = _read_file(path)
     icd9_descript_dict = {}
 
-    for l in lines[1:]: # ignore first line which is column names
+    for l in lines[1:]:  # ignore first line which is column names
         elems = l.split('\t')
 
         try:
-            assert len(elems) == 8 # number of columns should be 8
+            assert len(elems) == 8  # number of columns should be 8
         except:
             print('Problem with following line while loading icd9_descript_dict:')
             print(l)
             raise
 
-        icd9 = elems[0] # ICD9 code should be in the first column
-        descript = elems[1] # description should be in the second column
+        icd9 = elems[0]  # ICD9 code should be in the first column
+        descript = elems[1]  # description should be in the second column
 
         # check if the ICD9 code is a category and if so, append a label
         is_category = len(icd9.split('.')) == 1
@@ -199,13 +200,13 @@ def _clean_data(data, icd9_descript_dict, no_onset_age=True):
             icd9s.sort(key=lambda i: int(i[1]))
 
             if no_onset_age:
-                icd9s = [i[0] for i in icd9s] # remove onset age
+                icd9s = [i[0] for i in icd9s]  # remove onset age
             else:
                 icd9s = [':'.join(i) for i in icd9s]
             features.extend(icd9s)
 
             x_raw.append(features)
-            y_raw.append(line[RAW_CLASS_COL]) # extract class
+            y_raw.append(line[RAW_CLASS_COL])  # extract class
         except:
             print('WARNING: error on line #{} with case:'.format(idx))
             print(' '.join(line))