Skip to content

Commit 91c823e

Browse files
committed
Creation of mapping layer
0 parents  commit 91c823e

File tree

2 files changed

+163
-0
lines changed

2 files changed

+163
-0
lines changed

DFMapper.py

+152
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import numpy as np
2+
import ipdb
3+
import itertools
4+
5+
def isinstance_func(x):
    """Return True when `x` can be called like a function.

    The mapper uses this to tell plain callables (applied as ``f(value)``)
    apart from objects that are driven through fit / transform methods.
    """
    # callable() is the built-in check; probing for a `__call__` attribute
    # with hasattr() can be fooled by __getattr__ or instance attributes.
    return callable(x)
7+
8+
# Takes numpy array, and returns a row array
9+
def row(arr):
    """Reshape a 1-D array into a single row of shape (1, n).

    Input whose shape does not have exactly one dimension is handed
    back untouched.
    """
    shape = arr.shape
    if len(shape) != 1:
        return arr
    (length,) = shape
    return arr.reshape(1, length)
13+
14+
def col(arr):
    """Reshape a 1-D array into a single column of shape (n, 1).

    Input whose shape does not have exactly one dimension is handed
    back untouched.
    """
    shape = arr.shape
    if len(shape) != 1:
        return arr
    (length,) = shape
    return arr.reshape(length, 1)
18+
19+
20+
def explode(matrix, order):
    """Build interaction columns from a 2-D matrix.

    For every combination of `order` distinct column indices (in the
    order produced by itertools.combinations) the element-wise product
    of those columns becomes one output column.

    Parameters:
        matrix: 2-D numpy array; its columns are the inputs.
        order: number of columns multiplied together; must be > 1.

    Returns:
        A 2-D array with one column per combination. When `matrix` has
        fewer than `order` columns there are no combinations and an
        empty array of shape (n_rows, 0) is returned.

    Raises:
        AssertionError: if `order` is not greater than 1.
    """
    assert order > 1, "order is not greater than 1"
    n_cols = matrix.shape[1]

    new_cols = []
    # range() works on Python 2 and 3; xrange() exists only on Python 2.
    for combo in itertools.combinations(range(n_cols), order):
        # Copy the first column so the in-place products below never
        # write back into `matrix`.
        combo_column = np.copy(matrix[:, combo[0]])
        for cur_column_index in combo[1:]:
            combo_column *= matrix[:, cur_column_index]
        new_cols.append(combo_column)

    if not new_cols:
        # Stacking an empty list raises, so return an explicit
        # zero-column result that callers can still hstack.
        return np.empty((matrix.shape[0], 0), dtype=matrix.dtype)

    # column_stack turns each 1-D product into a column and keeps
    # already-2-D pieces as they are.
    return np.column_stack(new_cols)
37+
38+
39+
class DFMapper(object):
    """Map dataframe columns to feature (X) and target (Y) arrays.

    Each registered entry pairs one or more column names with a pipeline
    of steps. A step is either a plain callable, applied as ``f(value)``,
    or an object exposing ``fit`` / ``transform`` / ``fit_transform``.
    The mapper offers the same three methods, so one configuration can
    be fitted on one dataframe and replayed on another.
    """

    def __init__(self):
        # Registered entries in insertion order:
        # (list of column names, settings dict built by _add).
        self.dict_list = []
        # Value produced by the most recently evaluated index entry.
        self.index = None
        # Mapper-wide options set through add_option (e.g. 'explode').
        self.options = {}

    def _add(self, key, function_list, is_X, is_Y, is_index, as_col=True):
        """Register one entry.

        Parameters:
            key: a column name, or a list of column names, of the
                original data. A single string is wrapped into a list.
            function_list: one pipeline step or a list of steps. Each
                step is a function or an object implementing the
                Transformer API. None means an empty pipeline.
            is_X, is_Y, is_index: role flags stored with the entry.
            as_col: when True the selected values are passed through
                col() before the pipeline runs; otherwise ``df[key]``
                is used as is.
        """
        if function_list is None:
            function_list = []
        elif not isinstance(function_list, list):
            function_list = [function_list]

        if isinstance(key, str):
            key = [key]

        dict_values = {
            'pipeline': function_list,
            'is_X': is_X,
            'is_Y': is_Y,
            'is_index': is_index,
            'as_col': as_col,
        }
        self.dict_list.append((key, dict_values))

    # The add_* helpers default to None rather than a literal [] so that
    # entries never share a single default pipeline list.

    def add_X(self, key, function_list=None, as_col=True):
        """Register `key` as feature column(s) with the given pipeline."""
        self._add(key, function_list, is_X=True, is_Y=False,
                  is_index=False, as_col=as_col)

    def add_Y(self, key, function_list=None, as_col=True):
        """Register `key` as target column(s) with the given pipeline."""
        self._add(key, function_list, is_X=False, is_Y=True,
                  is_index=False, as_col=as_col)

    def add_index(self, key, function_list=None, as_col=True):
        """Register `key` as the index column(s) with the given pipeline."""
        self._add(key, function_list, is_X=False, is_Y=False,
                  is_index=True, as_col=as_col)

    def add_option(self, key, val=True):
        """Store a mapper-wide option, e.g. ``add_option('explode', 2)``."""
        self.options[key] = val

    def evaluate(self, key, dict_options, df, eval_type):
        """Run one entry's pipeline on the columns `key` of `df`.

        Parameters:
            key: list of column names.
            dict_options: the settings dict stored by _add.
            df: the dataframe to read from.
            eval_type: 'fit', 'transform' or 'fit_transform'.

        Returns:
            The pipeline output, or None when a Y column is missing or
            when a 'fit' run ends on a transformer step.

        Raises:
            ValueError: if an X column is missing from `df`.
            AssertionError: if `eval_type` is unsupported and a
                transformer step is reached.
        """
        for el in key:
            if el not in df:
                # A missing X column is an error the caller must fix.
                if dict_options['is_X']:
                    raise ValueError(
                        "The column %s is not in your dataframe" % (el,))

                # A missing Y column is not a big deal: the caller may
                # simply be transforming the test set.
                if dict_options['is_Y']:
                    return None

        if dict_options['as_col']:
            cur_val = col(df[key].values)
        else:
            cur_val = df[key]

        pipeline = dict_options['pipeline']
        last_index = len(pipeline) - 1
        for index, f in enumerate(pipeline):
            if isinstance_func(f):
                cur_val = f(cur_val)
            elif eval_type == 'fit_transform':
                cur_val = f.fit_transform(cur_val)
            elif eval_type == 'transform':
                cur_val = f.transform(cur_val)
            elif eval_type == 'fit':
                # Only the final step is fitted without transforming;
                # earlier steps must transform to feed the next one.
                if index == last_index:
                    f.fit(cur_val)
                    return None
                cur_val = f.fit_transform(cur_val)
            else:
                # Raised explicitly so the check survives `python -O`.
                raise AssertionError(
                    "Only support options fit, transform and fit_transform")

        return cur_val

    def eval_and_coalesce(self, df, eval_type):
        """Evaluate every entry and stack the results.

        Returns:
            A tuple (results_X, results_Y). Each is the horizontal stack
            of the matching entries' outputs, or an empty array when
            there are none. If the 'explode' option is set and X has
            rows, the interaction columns from explode() are appended
            to X. The index entry's output is stored on ``self.index``.
        """
        results_X = []
        results_Y = []
        for key, dict_options in self.dict_list:
            cur_val = self.evaluate(key, dict_options, df, eval_type)

            # None means there is nothing to collect for this entry
            # (e.g. its key is not in the dataframe). Identity is used
            # because `== None` on an array compares element-wise.
            if cur_val is None:
                continue

            if dict_options['is_X']:
                results_X.append(cur_val)
            if dict_options['is_Y']:
                results_Y.append(cur_val)
            if dict_options['is_index']:
                self.index = cur_val

        results_X = np.hstack(results_X) if results_X else np.array([])
        results_Y = np.hstack(results_Y) if results_Y else np.array([])

        if ('explode' in self.options) and (len(results_X) > 0):
            order = self.options['explode']
            results_X = np.hstack([results_X, explode(results_X, order)])
        return results_X, results_Y

    def fit(self, df):
        """Fit every entry's pipeline on `df` and return self."""
        self.eval_and_coalesce(df, 'fit')
        return self

    def transform(self, df):
        """Apply the pipelines to `df`; returns (X, Y)."""
        return self.eval_and_coalesce(df, 'transform')

    def fit_transform(self, df):
        """Fit the pipelines on `df` and apply them; returns (X, Y)."""
        return self.eval_and_coalesce(df, 'fit_transform')
152+

README.md

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
DFMapper
2+
========
3+
4+
This module is heavily influenced by the awesome sklearn-pandas module, as well as the Pipeline class from scikit-learn.
5+
6+
More often than not, one has to perform the same transformations on both the training set and the test set, which leads to duplicated code and is a source of errors. This gets more complicated if one has to determine, say, the number of categorical variables on the training set and then reuse that mapping on the test set.
7+
8+
This module aims to make that whole process easier. By creating a DFMapper object, one can use the Transformer API to map both the training dataframe and the test dataframe, which makes the code much easier to understand and much more maintainable.
9+
10+
Here are some example uses.
11+

0 commit comments

Comments
 (0)