@@ -14,16 +14,16 @@
 
 @dataclass
 class RegressionModel(BaseEstimator):
-    """Machine learning model to estimate the mean reward function (:math:`q(x,a):= \\mathbb{E}_{r \\sim p(r|x,a)} [r|x,a]`).
+    """Machine learning model to estimate the mean reward function (:math:`q(x,a):= \\mathbb{E}[r|x,a]`).
 
     Note
     -------
-    Reward (or outcome) :math:`Y` must be either binary or continuous.
+    Reward (or outcome) :math:`r` must be either binary or continuous.
 
     Parameters
     ------------
     base_model: BaseEstimator
-        Model class to be used to estimate the mean reward function.
+        A machine learning model used to estimate the mean reward function.
 
     n_actions: int
         Number of actions.
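
For orientation, a minimal sketch of constructing the class as documented above (assumes obp's public import path and scikit-learn's LogisticRegression; the sizes are hypothetical toy values):

    from sklearn.linear_model import LogisticRegression
    from obp.ope import RegressionModel

    # estimate q(x,a) = E[r|x,a] with one base model per slot position
    regression_model = RegressionModel(
        base_model=LogisticRegression(random_state=12345),
        n_actions=10,             # number of actions in the logged data
        len_list=1,               # slate size; positions are ignored when 1
        fitting_method="normal",  # or "iw" / "mrdr", validated in __post_init__
    )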
@@ -66,7 +66,7 @@ def __post_init__(self) -> None:
             "normal",
             "iw",
             "mrdr",
-        ], f"fitting method must be one of 'normal', 'iw', or 'mrdr', but {self.fitting_method} is given"
+        ], f"fitting_method must be one of 'normal', 'iw', or 'mrdr', but {self.fitting_method} is given"
         assert self.n_actions > 1 and isinstance(
             self.n_actions, int
         ), f"n_actions must be an integer larger than 1, but {self.n_actions} is given"
@@ -101,9 +101,10 @@ def fit(
         reward: array-like, shape (n_rounds,)
             Observed rewards (or outcome) in each round, i.e., :math:`r_t`.
 
-        pscore: Optional[np.ndarray], default=None
-            Propensity scores, the action choice probabilities by behavior policy,
+        pscore: array-like, shape (n_rounds,), default=None
+            Action choice probabilities (propensity score) of a behavior policy
             in the training logged bandit feedback.
+            When None is given, the behavior policy is assumed to be a uniform one.
 
         position: array-like, shape (n_rounds,), default=None
             Positions of each round in the given logged bandit feedback.
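
The fallback documented above ("When None is given...") is implemented further down in this diff: a missing pscore is replaced by a uniform behavior policy. A sketch of what that amounts to (toy values):

    import numpy as np

    n_actions = 10
    action = np.array([3, 1, 7])  # logged actions, shape (n_rounds,)
    # np.ones_like(action) is an integer array, but true division promotes
    # the result to float, so every round gets probability 1 / n_actions
    pscore = np.ones_like(action) / n_actions  # array([0.1, 0.1, 0.1])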
@@ -123,20 +124,26 @@ def fit(
             position=position,
             action_context=self.action_context,
         )
+        n_rounds = context.shape[0]
+
         if self.len_list == 1:
             position = np.zeros_like(action)
         else:
             assert (
-                position is not None
-            ), "position has to be set when len_list is larger than 1"
+                isinstance(position, np.ndarray) and position.ndim == 1
+            ), f"when len_list > 1, position must be a 1-dimensional ndarray"
         if self.fitting_method in ["iw", "mrdr"]:
             assert (
-                action_dist is not None
-            ), "When either 'iw' or 'mrdr' is used as the 'fitting_method' argument, then action_dist must be given"
-            assert (
-                pscore is not None
-            ), "When either 'iw' or 'mrdr' is used as the 'fitting_method' argument, then pscore must be given"
-        n_data = context.shape[0]
+                isinstance(action_dist, np.ndarray) and action_dist.ndim == 3
+            ), f"when fitting_method is either 'iw' or 'mrdr', action_dist must be a 3-dimensional ndarray"
+            assert action_dist.shape == (
+                n_rounds,
+                self.n_actions,
+                self.len_list,
+            ), f"shape of action_dist must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list})"
+            if pscore is None:
+                pscore = np.ones_like(action) / self.n_actions
+
         for position_ in np.arange(self.len_list):
             idx = position == position_
             X = self._pre_process_for_reg_model(
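
The tightened assertions above pin action_dist to a 3-dimensional array of shape (n_rounds, n_actions, len_list). A sketch of an evaluation-policy distribution that passes them (uniform policy, hypothetical sizes):

    import numpy as np

    n_rounds, n_actions, len_list = 5, 3, 1
    # uniform evaluation policy: every action equally likely at each position
    action_dist = np.ones((n_rounds, n_actions, len_list)) / n_actions
    assert isinstance(action_dist, np.ndarray) and action_dist.ndim == 3
    assert action_dist.shape == (n_rounds, n_actions, len_list)
    # action probabilities at each (round, position) pair sum to one
    assert np.allclose(action_dist.sum(axis=1), 1.0)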
@@ -149,19 +156,19 @@ def fit(
                 self.base_model_list[position_].fit(X, reward[idx])
             else:
                 action_dist_at_position = action_dist[
-                    np.arange(n_data), action, position_ * np.ones(n_data, dtype=int)
+                    np.arange(n_rounds),
+                    action,
+                    position_ * np.ones(n_rounds, dtype=int),
                 ][idx]
                 if self.fitting_method == "iw":
                     sample_weight = action_dist_at_position / pscore[idx]
                     self.base_model_list[position_].fit(
                         X, reward[idx], sample_weight=sample_weight
                     )
                 elif self.fitting_method == "mrdr":
-                    sample_weight = (
-                        action_dist_at_position
-                        * (1.0 - pscore[idx])
-                        / (pscore[idx] ** 2)
-                    )
+                    sample_weight = action_dist_at_position
+                    sample_weight *= 1.0 - pscore[idx]
+                    sample_weight /= pscore[idx] ** 2
                     self.base_model_list[position_].fit(
                         X, reward[idx], sample_weight=sample_weight
                     )
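
The rewritten 'mrdr' branch computes the same weight as before, pi_e(a|x) * (1 - pi_b(a|x)) / pi_b(a|x)^2; chaining in-place *= and /= just avoids allocating intermediate result arrays for the products. A quick equivalence check on toy arrays (the copy() is only so the sketch leaves its input intact):

    import numpy as np

    action_dist_at_position = np.array([0.5, 0.2, 0.9])  # pi_e(a_t | x_t)
    pscore_idx = np.array([0.25, 0.5, 0.1])              # pi_b(a_t | x_t)

    # old expression form
    old = action_dist_at_position * (1.0 - pscore_idx) / (pscore_idx ** 2)

    # new in-place form, as in the diff
    new = action_dist_at_position.copy()
    new *= 1.0 - pscore_idx
    new /= pscore_idx ** 2

    assert np.allclose(old, new)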
@@ -215,7 +222,7 @@ def fit_predict(
         n_folds: int = 1,
         random_state: Optional[int] = None,
     ) -> None:
-        """Fit the regression model on given logged bandit feedback data and then predict the mean reward function of the same data.
+        """Fit the regression model on given logged bandit feedback data and predict the reward function of the same data.
 
         Note
         ------
@@ -234,8 +241,9 @@ def fit_predict(
             Observed rewards (or outcome) in each round, i.e., :math:`r_t`.
 
         pscore: array-like, shape (n_rounds,), default=None
-            Propensity scores, the action choice probabilities by behavior policy,
+            Action choice probabilities (propensity score) of a behavior policy
             in the training logged bandit feedback.
+            When None is given, the behavior policy is assumed to be a uniform one.
 
         position: array-like, shape (n_rounds,), default=None
             Positions of each round in the given logged bandit feedback.
@@ -248,7 +256,7 @@ def fit_predict(
 
         n_folds: int, default=1
             Number of folds in the cross-fitting procedure.
-            When 1 is given, then the regression model is trained on the whole logged bandit feedback data.
+            When 1 is given, the regression model is trained on the whole logged bandit feedback data.
 
         random_state: int, default=None
             `random_state` affects the ordering of the indices, which controls the randomness of each fold.
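
For reference, a minimal end-to-end sketch of the cross-fitting path through fit_predict (assumes obp's SyntheticBanditDataset for toy logged data and scikit-learn's LogisticRegression; all sizes hypothetical):

    from sklearn.linear_model import LogisticRegression
    from obp.dataset import SyntheticBanditDataset
    from obp.ope import RegressionModel

    dataset = SyntheticBanditDataset(n_actions=10, random_state=12345)
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=1000)

    regression_model = RegressionModel(
        base_model=LogisticRegression(random_state=12345),
        n_actions=dataset.n_actions,
    )
    estimated_rewards = regression_model.fit_predict(
        context=bandit_feedback["context"],   # (n_rounds, dim_context)
        action=bandit_feedback["action"],     # (n_rounds,)
        reward=bandit_feedback["reward"],     # (n_rounds,)
        pscore=bandit_feedback["pscore"],     # (n_rounds,), or None for uniform
        n_folds=3,  # n_folds >= 2 takes the KFold cross-fitting branch below
        random_state=12345,
    )
    # estimated_rewards has shape (n_rounds, n_actions, len_list)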
@@ -260,22 +268,36 @@ def fit_predict(
             Estimated expected rewards for new data by the regression model.
 
         """
+        check_bandit_feedback_inputs(
+            context=context,
+            action=action,
+            reward=reward,
+            pscore=pscore,
+            position=position,
+            action_context=self.action_context,
+        )
+        n_rounds = context.shape[0]
+
         assert n_folds > 0 and isinstance(
             n_folds, int
         ), f"n_folds must be a positive integer, but {n_folds} is given"
         if self.len_list == 1:
             position = np.zeros_like(action)
         else:
             assert (
-                position is not None
-            ), "position has to be set when len_list is larger than 1"
+                isinstance(position, np.ndarray) and position.ndim == 1
+            ), f"when len_list > 1, position must be a 1-dimensional ndarray"
         if self.fitting_method in ["iw", "mrdr"]:
             assert (
-                action_dist is not None
-            ), "When either 'iw' or 'mrdr' is used as the 'fitting_method' argument, then action_dist must be given"
-            assert (
-                pscore is not None
-            ), "When either 'iw' or 'mrdr' is used as the 'fitting_method' argument, then pscore must be given"
+                isinstance(action_dist, np.ndarray) and action_dist.ndim == 3
+            ), f"when fitting_method is either 'iw' or 'mrdr', action_dist must be a 3-dimensional ndarray"
+            assert action_dist.shape == (
+                n_rounds,
+                self.n_actions,
+                self.len_list,
+            ), f"shape of action_dist must be (n_rounds, n_actions, len_list)={n_rounds, self.n_actions, self.len_list}, but is {action_dist.shape}"
+            if pscore is None:
+                pscore = np.ones_like(action) / self.n_actions
 
         if n_folds == 1:
             self.fit(
@@ -289,11 +311,11 @@ def fit_predict(
             return self.predict(context=context)
         else:
             estimated_rewards_by_reg_model = np.zeros(
-                (context.shape[0], self.n_actions, self.len_list)
+                (n_rounds, self.n_actions, self.len_list)
            )
-            skf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
-            skf.get_n_splits(context)
-            for train_idx, test_idx in skf.split(context):
+            kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
+            kf.get_n_splits(context)
+            for train_idx, test_idx in kf.split(context):
                 action_dist_tr = (
                     action_dist[train_idx] if action_dist is not None else action_dist
                 )
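
A side note on the rename: skf suggested sklearn's StratifiedKFold, but the object is (and presumably always was) a plain KFold, so kf is the accurate name. The per-fold pattern the loop implements is standard cross-fitting; a self-contained sketch with stand-in data and a stand-in Ridge model:

    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import KFold

    rng = np.random.default_rng(12345)
    context = rng.normal(size=(100, 5))      # stand-in for the logged contexts
    reward = rng.binomial(1, 0.3, size=100)  # stand-in for the logged rewards

    estimated = np.zeros(100)
    kf = KFold(n_splits=3, shuffle=True, random_state=12345)
    for train_idx, test_idx in kf.split(context):
        # fit on the training folds only ...
        model = Ridge().fit(context[train_idx], reward[train_idx])
        # ... and predict for the held-out fold, so no round's estimate
        # comes from a model that saw that round's own reward
        estimated[test_idx] = model.predict(context[test_idx])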