Creation of mapping layer

dcrescim · dcrescim · commit 91c823eb5d3b · 2014-06-21T00:12:21.000-05:00
diff --git a/DFMapper.py b/DFMapper.py
@@ -0,0 +1,152 @@
+import numpy as np
+import ipdb
+import itertools
+
+def isinstance_func(x):
+  """Report whether `x` exposes a `__call__` attribute."""
+  try:
+    getattr(x, '__call__')
+  except AttributeError:
+    return False
+  return True
+
+# Promotes a one-dimensional numpy array to a single-row matrix
+def row(arr):
+  """Return `arr` reshaped to (1, n) when it is 1-D; otherwise return it as is."""
+  dims = arr.shape
+  if len(dims) != 1:
+    return arr
+  return arr.reshape(1, dims[0])
+
+def col(arr):
+  """Return `arr` reshaped to (n, 1) when it is 1-D; otherwise return it as is."""
+  dims = arr.shape
+  if len(dims) != 1:
+    return arr
+  return arr.reshape(dims[0], 1)
+
+
+def explode(matrix, order):
+  """Build the element-wise products of every `order`-sized column combination.
+
+  matrix: two-dimensional numpy array; it is not modified.
+  order: how many distinct columns are multiplied together; must be > 1.
+
+  Returns an array with one column per combination, in the order produced
+  by itertools.combinations over the column indices. When `matrix` has
+  fewer than `order` columns there are no combinations and an array with
+  zero columns (and the same number of rows) is returned.
+  """
+  assert order > 1, "order is not greater than 1"
+
+  n_rows = matrix.shape[0]
+  n_cols = matrix.shape[1]
+
+  new_cols = []
+  # range (not xrange) so the module also runs on Python 3
+  for combo in itertools.combinations(range(n_cols), order):
+    # Copy the first column so the in-place products never touch `matrix`
+    combo_column = np.copy(matrix[:, combo[0]])
+    for cur_column_index in combo[1:]:
+      combo_column *= matrix[:, cur_column_index]
+
+    # Store as an (n, 1) column so hstack lays the results side by side
+    new_cols.append(combo_column.reshape(len(combo_column), 1))
+
+  # np.hstack raises on an empty list, so handle "no combinations" here
+  if not new_cols:
+    return np.empty((n_rows, 0), dtype=matrix.dtype)
+
+  return np.hstack(new_cols)
+
+
+class DFMapper(object):
+  """Map dataframe columns to X / Y arrays through per-column pipelines.
+
+  Column mappings are registered with add_X / add_Y / add_index and are
+  evaluated in registration order by fit / transform / fit_transform.
+  Each pipeline step is either a callable, which is applied directly, or
+  an object offering fit / transform / fit_transform methods.
+  """
+
+  def __init__(self):
+    # (columns, settings) pairs, in registration order
+    self.dict_list = []
+    # Output of the most recently evaluated index mapping, if any
+    self.index = None
+    # Mapper-wide options; 'explode' is the only key this class reads
+    self.options = {}
+
+  def _add(self, key, function_list, is_X, is_Y, is_index, as_col=True):
+    """Register one column mapping.
+
+    key: a column name, or a list of column names, of the original data.
+    function_list: a single pipeline step or a list of steps; None means
+      an empty pipeline. Each step is one of the following
+        - A class that implements the Transformer API
+        - A function
+    is_X / is_Y / is_index: which output this mapping feeds.
+    as_col: when true the selected values are handed to the pipeline as a
+      numpy array; otherwise the raw `df[key]` selection is handed over.
+    """
+    if function_list is None:
+      # Fresh list per registration; a shared default would be aliased
+      # by every mapping registered without an explicit pipeline
+      function_list = []
+    elif not isinstance(function_list, list):
+      function_list = [function_list]
+
+    if isinstance(key, str):
+      key = [key]
+
+    dict_values = {
+      'pipeline': function_list,
+      'is_X': is_X,
+      'is_Y': is_Y,
+      'is_index': is_index,
+      'as_col': as_col,
+    }
+
+    self.dict_list.append((key, dict_values))
+
+  def add_X(self, key, function_list=None, as_col=True):
+    """Register `key` as a feature (X) mapping."""
+    self._add(key, function_list, is_X=True, is_Y=False, is_index=False, as_col=as_col)
+
+  def add_Y(self, key, function_list=None, as_col=True):
+    """Register `key` as a target (Y) mapping."""
+    self._add(key, function_list, is_X=False, is_Y=True, is_index=False, as_col=as_col)
+
+  def add_index(self, key, function_list=None, as_col=True):
+    """Register `key` as the index mapping; its output is kept in self.index."""
+    self._add(key, function_list, is_X=False, is_Y=False, is_index=True, as_col=as_col)
+
+  def add_option(self, key, val=True):
+    """Set a mapper-wide option, e.g. add_option('explode', 2)."""
+    self.options[key] = val
+
+  def evaluate(self, key, dict_options, df, eval_type):
+    """Run one registered pipeline against `df`.
+
+    key: list of column names to select from `df`.
+    dict_options: the settings dict built by _add.
+    df: dataframe-like object; must support `name in df` and `df[key]`.
+    eval_type: 'fit', 'transform' or 'fit_transform'.
+
+    Returns the pipeline output, or None when a Y column is absent from
+    `df`, or when eval_type is 'fit' and the last step is a transformer
+    (that step is only fitted).
+
+    Raises ValueError when an X column is absent from `df`, or when a
+    transformer step is reached with an unsupported eval_type.
+    """
+    for el in key:
+      if el not in df:
+        # If you are missing an X column, this is bad.
+        #   You should find it.
+        if dict_options['is_X']:
+          raise ValueError("The column %s is not in your dataframe" % (el,))
+
+        # If you are missing Y columns, that is not a big deal
+        #   You could just be transforming the test set.
+        if dict_options['is_Y']:
+          return None
+
+    if dict_options['as_col']:
+      cur_val = col(df[key].values)
+    else:
+      cur_val = df[key]
+
+    pipeline = dict_options['pipeline']
+    last_index = len(pipeline) - 1
+    for (index, f) in enumerate(pipeline):
+      if isinstance_func(f):
+        # Plain callables are stateless: apply them whatever eval_type is
+        cur_val = f(cur_val)
+      elif 'fit_transform' == eval_type:
+        cur_val = f.fit_transform(cur_val)
+      elif 'transform' == eval_type:
+        cur_val = f.transform(cur_val)
+      elif 'fit' == eval_type:
+        # Just call fit at the end
+        # otherwise call fit transform
+        if index == last_index:
+          f.fit(cur_val)
+          return None
+        cur_val = f.fit_transform(cur_val)
+      else:
+        # Explicit raise instead of assert so the check survives python -O
+        raise ValueError("Only support options fit, transform and fit_transform")
+
+    return cur_val
+
+  def eval_and_coalesce(self, df, eval_type):
+    """Evaluate every registered mapping and stack the X and Y outputs.
+
+    Returns (results_X, results_Y). Each is the horizontal stack of the
+    matching pipeline outputs, or an empty array when there were none.
+    An index mapping's output is stored on self.index instead. When the
+    'explode' option is set and X is non-empty, the column products from
+    explode() are appended to X.
+    """
+    results_X = []
+    results_Y = []
+    for (key, dict_options) in self.dict_list:
+      cur_val = self.evaluate(key, dict_options, df, eval_type)
+
+      # This occurs when you are trying to evaluate
+      # a key that is not in the dataframe.
+      # Identity test: `== None` on an array compares element-wise.
+      if cur_val is None:
+        continue
+
+      if dict_options['is_X']:
+        results_X.append(cur_val)
+      if dict_options['is_Y']:
+        results_Y.append(cur_val)
+      if dict_options['is_index']:
+        self.index = cur_val
+
+    results_X = np.hstack(results_X) if results_X else np.array([])
+    results_Y = np.hstack(results_Y) if results_Y else np.array([])
+
+    if ('explode' in self.options) and (len(results_X) > 0):
+      order = self.options['explode']
+      results_X = np.hstack([results_X, explode(results_X, order)])
+    return results_X, results_Y
+
+  def fit(self, df):
+    """Fit every transformer step on `df` and return self."""
+    self.eval_and_coalesce(df, 'fit')
+    return self
+
+  def transform(self, df):
+    """Apply the already-fitted pipelines to `df`; returns (X, Y)."""
+    return self.eval_and_coalesce(df, 'transform')
+
+  def fit_transform(self, df):
+    """Fit the pipelines on `df` and return the transformed (X, Y)."""
+    return self.eval_and_coalesce(df, 'fit_transform')
+
diff --git a/README.md b/README.md
@@ -0,0 +1,11 @@
+DFMapper
+========
+
+This module is heavily influenced by the awesome sklearn-pandas module, as well as the Pipeline class from scikit-learn.
+
+More often than not, one has to perform the same transformations on both the training set and the test set, which leads to duplicated code and is a source of errors. This gets more complicated if one has to determine (say) the number of categorical variables and then use that mapping on the test set.
+
+This module aims to make that whole process easier. By creating a DFMapper object, one can use the Transformer API to map both the training dataframe and the test dataframe, which makes the code much easier to understand and much more maintainable. 
+
+Here are some example uses.
+