Commit 621720b

Merge pull request #145 from st-tech/feature/synthetic-dataset
Modify Synthetic Reward/Behavior Policy Functions
2 parents fc5d628 + d2d9171 commit 621720b

10 files changed (+710 / -258 lines)

obp/dataset/__init__.py

Lines changed: 10 additions & 0 deletions

@@ -4,7 +4,12 @@
 from obp.dataset.real import OpenBanditDataset
 from obp.dataset.synthetic import linear_behavior_policy
 from obp.dataset.synthetic import linear_reward_function
+from obp.dataset.synthetic import logistic_polynomial_reward_function
 from obp.dataset.synthetic import logistic_reward_function
+from obp.dataset.synthetic import logistic_sparse_reward_function
+from obp.dataset.synthetic import polynomial_behavior_policy
+from obp.dataset.synthetic import polynomial_reward_function
+from obp.dataset.synthetic import sparse_reward_function
 from obp.dataset.synthetic import SyntheticBanditDataset
 from obp.dataset.synthetic_continuous import linear_behavior_policy_continuous
 from obp.dataset.synthetic_continuous import linear_reward_funcion_continuous
@@ -24,8 +29,13 @@
     "OpenBanditDataset",
     "SyntheticBanditDataset",
     "logistic_reward_function",
+    "logistic_polynomial_reward_function",
+    "logistic_sparse_reward_function",
     "linear_reward_function",
+    "polynomial_reward_function",
+    "sparse_reward_function",
     "linear_behavior_policy",
+    "polynomial_behavior_policy",
     "MultiClassToBanditReduction",
     "SyntheticContinuousBanditDataset",
     "linear_reward_funcion_continuous",

obp/dataset/synthetic.py

Lines changed: 469 additions & 74 deletions
Large diffs are not rendered by default.
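The new reward/behavior policy functions themselves are defined in obp/dataset/synthetic.py, whose diff is not rendered above. As a rough usage sketch (not part of this diff), assuming the new callables share the (context, action_context, random_state) signature of the existing logistic_reward_function and linear_behavior_policy helpers:

# Hedged sketch (not taken from this commit): wiring the newly exported
# reward/behavior policy functions into SyntheticBanditDataset.
from obp.dataset import (
    SyntheticBanditDataset,
    logistic_polynomial_reward_function,
    polynomial_behavior_policy,
)

dataset = SyntheticBanditDataset(
    n_actions=10,
    dim_context=5,
    reward_type="binary",  # logistic_* reward functions produce probabilities in [0, 1]
    reward_function=logistic_polynomial_reward_function,
    behavior_policy_function=polynomial_behavior_policy,
    random_state=12345,
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10000)
print(bandit_feedback["action"].shape, bandit_feedback["reward"].mean())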

obp/dataset/synthetic_continuous.py

Lines changed: 9 additions & 9 deletions

@@ -25,7 +25,7 @@ class SyntheticContinuousBanditDataset(BaseBanditDataset):
 -----
 By calling the `obtain_batch_bandit_feedback` method several times, we have different bandit samples with the same setting.
 This can be used to estimate confidence intervals of the performances of OPE estimators for continuous actions.
-If None is set as `behavior_policy_function`, the synthetic data will be context-free bandit feedback.
+If None is given as `behavior_policy_function`, the synthetic data will be context-free bandit feedback.

 Parameters
 -----------
@@ -47,13 +47,13 @@ class SyntheticContinuousBanditDataset(BaseBanditDataset):
 reward_function: Callable[[np.ndarray, np.ndarray], np.ndarray]], default=None
     Function generating expected reward for each given action-context pair,
     i.e., :math:`\\mu: \\mathcal{X} \\times \\mathcal{A} \\rightarrow \\mathbb{R}`.
-    If None is set, context **independent** expected reward for each action will be
+    If None is given, context **independent** expected reward for each action will be
     sampled from the uniform distribution automatically.

 behavior_policy_function: Callable[[np.ndarray, np.ndarray], np.ndarray], default=None
     Function generating the propensity score of continuous actions,
     i.e., :math:`\\f: \\mathcal{X} \\rightarrow \\mathbb{R}^{\\mathcal{A}}`.
-    If None is set, context **independent** uniform distribution will be used (uniform behavior policy).
+    If None is given, context **independent** uniform distribution will be used (uniform behavior policy).

 random_state: int, default=12345
     Controls the random seed in sampling synthetic slate bandit dataset.
@@ -274,7 +274,7 @@ def linear_reward_funcion_continuous(
 Parameters
 -----------
 context: array-like, shape (n_rounds, dim_context)
-    Context vectors characterizing each round (such as user information).
+    Context vectors characterizing each data (such as user information).

 action: array-like, shape (n_rounds,)
     Continuous action values.
@@ -311,7 +311,7 @@ def quadratic_reward_funcion_continuous(
 Parameters
 -----------
 context: array-like, shape (n_rounds, dim_context)
-    Context vectors characterizing each round (such as user information).
+    Context vectors characterizing each data (such as user information).

 action: array-like, shape (n_rounds,)
     Continuous action values.
@@ -353,7 +353,7 @@ def linear_behavior_policy_continuous(
 Parameters
 -----------
 context: array-like, shape (n_rounds, dim_context)
-    Context vectors characterizing each round (such as user information).
+    Context vectors characterizing each data (such as user information).

 random_state: int, default=None
     Controls the random seed in sampling parameters.
@@ -381,7 +381,7 @@ def linear_synthetic_policy_continuous(context: np.ndarray) -> np.ndarray:
 Parameters
 -----------
 context: array-like, shape (n_rounds, dim_context)
-    Context vectors characterizing each round (such as user information).
+    Context vectors characterizing each data (such as user information).

 Returns
 ---------
@@ -400,7 +400,7 @@ def threshold_synthetic_policy_continuous(context: np.ndarray) -> np.ndarray:
 Parameters
 -----------
 context: array-like, shape (n_rounds, dim_context)
-    Context vectors characterizing each round (such as user information).
+    Context vectors characterizing each data (such as user information).

 Returns
 ---------
@@ -419,7 +419,7 @@ def sign_synthetic_policy_continuous(context: np.ndarray) -> np.ndarray:
 Parameters
 -----------
 context: array-like, shape (n_rounds, dim_context)
-    Context vectors characterizing each round (such as user information).
+    Context vectors characterizing each data (such as user information).

 Returns
 ---------
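The hunks above document the None defaults of the continuous dataset: no behavior policy function means context-free feedback, no reward function means context-independent expected rewards. A minimal, hedged sketch of the non-default path; only reward_function, behavior_policy_function, and random_state appear in the docstring hunks, so the other argument names and the n_rounds keyword are assumptions:

# Hedged sketch (not taken from this commit): SyntheticContinuousBanditDataset
# paired with the linear helpers whose docstrings are edited above.
from obp.dataset import (
    SyntheticContinuousBanditDataset,
    linear_reward_funcion_continuous,  # "funcion" spelling matches the exported name
    linear_behavior_policy_continuous,
)

dataset = SyntheticContinuousBanditDataset(
    dim_context=5,  # assumed parameter name
    reward_function=linear_reward_funcion_continuous,
    behavior_policy_function=linear_behavior_policy_continuous,  # None would give context-free feedback
    random_state=12345,
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10000)
print(bandit_feedback["action"][:5], bandit_feedback["reward"][:5])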

obp/dataset/synthetic_slate.py

Lines changed: 12 additions & 13 deletions

@@ -11,6 +11,7 @@
 from typing import Union

 import numpy as np
+from scipy.special import logit
 from scipy.special import perm
 from scipy.stats import truncnorm
 from sklearn.utils import check_random_state
@@ -34,7 +35,7 @@ class SyntheticSlateBanditDataset(BaseBanditDataset):
 we have different bandit samples with the same setting.
 This can be used to estimate confidence intervals of the performances of Slate OPE estimators.

-If None is set as `behavior_policy_function`, the synthetic data will be context-free bandit feedback.
+If None is given as `behavior_policy_function`, the synthetic data will be context-free bandit feedback.

 Parameters
 -----------
@@ -93,13 +94,13 @@ class SyntheticSlateBanditDataset(BaseBanditDataset):
 base_reward_function: Callable[[np.ndarray, np.ndarray], np.ndarray], default=None
     Function generating expected reward for each given action-context pair,
     i.e., :math:`\\mu: \\mathcal{X} \\times \\mathcal{A} \\rightarrow \\mathbb{R}`.
-    If None is set, context **independent** expected reward for each action will be
+    If None is given, context **independent** expected reward for each action will be
     sampled from the uniform distribution automatically.

 behavior_policy_function: Callable[[np.ndarray, np.ndarray], np.ndarray], default=None
     Function generating logit value of each action in action space,
     i.e., :math:`\\f: \\mathcal{X} \\rightarrow \\mathbb{R}^{\\mathcal{A}}`.
-    If None is set, context **independent** uniform distribution will be used (uniform behavior policy).
+    If None is given, context **independent** uniform distribution will be used (uniform behavior policy).

 is_factorizable: bool
     A boolean parameter whether to use factorizable evaluation policy (which choose slot actions independently) or not.
@@ -865,7 +866,7 @@ def calc_ground_truth_policy_value(
 Parameters
 -----------
 context: array-like, shape (n_rounds, dim_context)
-    Context vectors characterizing each round (such as user information).
+    Context vectors characterizing each data (such as user information).

 evaluation_policy_logit_: array-like, shape (n_rounds, n_unique_action)
     Evaluation policy function generating logit value of each action in action space.
@@ -1029,7 +1030,7 @@ def generate_evaluation_policy_pscore(
     When 'random' is given, we calculate the three variants of the propensity scores of the uniform random policy.

 context: array-like, shape (n_rounds, dim_context)
-    Context vectors characterizing each round (such as user information).
+    Context vectors characterizing each data (such as user information).

 action: array-like, shape (n_rounds * len_list,), default=None
     Actions sampled by a behavior policy.
@@ -1257,10 +1258,10 @@ def action_interaction_reward_function(
 Parameters
 -----------
 context: array-like, shape (n_rounds, dim_context)
-    Context vectors characterizing each round (such as user information).
+    Context vectors characterizing each data (such as user information).

 action_context: array-like, shape (n_unique_action, dim_action_context)
-    Vector representation for each action.
+    Vector representation of actions.

 action: array-like, shape (n_rounds * len_list, ) or (len(enumerated_slate_actions) * len_list, )
     When is_enumerated=False, action corresponds to actions sampled by a (often behavior) policy.
@@ -1270,7 +1271,7 @@ def action_interaction_reward_function(
 base_reward_function: Callable[[np.ndarray, np.ndarray], np.ndarray]], default=None
     Function generating expected reward for each given action-context pair,
     i.e., :math:`\\mu: \\mathcal{X} \\times \\mathcal{A} \\rightarrow \\mathbb{R}`.
-    If None is set, context **independent** expected reward for each action will be
+    If None is given, context **independent** expected reward for each action will be
     sampled from the uniform distribution automatically.

 reward_type: str, default='binary'
@@ -1365,9 +1366,7 @@ def action_interaction_reward_function(
     context=context, action_context=action_context, random_state=random_state
 )
 if reward_type == "binary":
-    expected_reward = np.log(expected_reward / (1 - expected_reward)).astype(
-        "float16"
-    )
+    expected_reward = logit(expected_reward)
 expected_reward_factual = np.zeros_like(action_2d, dtype="float16")
 for position_ in np.arange(len_list):
     tmp_fixed_reward = expected_reward[
@@ -1424,10 +1423,10 @@ def linear_behavior_policy_logit(
 Parameters
 -----------
 context: array-like, shape (n_rounds, dim_context)
-    Context vectors characterizing each round (such as user information).
+    Context vectors characterizing each data (such as user information).

 action_context: array-like, shape (n_unique_action, dim_action_context)
-    Vector representation for each action.
+    Vector representation of actions.

 random_state: int, default=None
     Controls the random seed in sampling dataset.
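The `action_interaction_reward_function` hunk above swaps a hand-rolled log-odds computation (with an early `float16` cast) for `scipy.special.logit`. A quick standalone check of the equivalence, not taken from the diff:

# Illustrative check: scipy.special.logit(p) == log(p / (1 - p)); the rewritten
# code delegates to SciPy and keeps full precision instead of casting to float16.
import numpy as np
from scipy.special import logit

expected_reward = np.array([0.1, 0.5, 0.9])
manual = np.log(expected_reward / (1 - expected_reward))
assert np.allclose(logit(expected_reward), manual)
print(logit(expected_reward))  # [-2.19722458  0.          2.19722458]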

obp/ope/regression_model.py

Lines changed: 2 additions & 2 deletions

@@ -114,7 +114,7 @@ def fit(

 position: array-like, shape (n_rounds,), default=None
     Position of recommendation interface where action was presented in each round of the given logged bandit data.
-    If None is set, a regression model assumes that there is only one position.
+    If None is given, a regression model assumes that there is only one position.
     When `len_list` > 1, this position argument has to be set.

 action_dist: array-like, shape (n_rounds, n_actions, len_list), default=None
@@ -258,7 +258,7 @@ def fit_predict(

 position: array-like, shape (n_rounds,), default=None
     Position of recommendation interface where action was presented in each round of the given logged bandit data.
-    If None is set, a regression model assumes that there is only one position.
+    If None is given, a regression model assumes that there is only one position.
     When `len_list` > 1, this position argument has to be set.

 action_dist: array-like, shape (n_rounds, n_actions, len_list), default=None
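Both hunks above only reword the `position` default: `None` means the regression model treats the data as single-position, and with `len_list > 1` the argument must be supplied. A hedged sketch of the call site; argument names other than `position` and `len_list` follow the library's documented API but are not confirmed by this diff:

# Hedged sketch: passing position=None to fit_predict when len_list == 1.
# Synthetic arrays stand in for real logged bandit feedback.
import numpy as np
from sklearn.linear_model import LogisticRegression
from obp.ope import RegressionModel

n_rounds, n_actions, dim_context = 1000, 10, 5
rng = np.random.default_rng(12345)
context = rng.normal(size=(n_rounds, dim_context))
action = rng.integers(n_actions, size=n_rounds)
reward = rng.integers(2, size=n_rounds)

regression_model = RegressionModel(
    n_actions=n_actions,
    len_list=1,                      # a single position ...
    base_model=LogisticRegression(max_iter=1000),
)
estimated_rewards = regression_model.fit_predict(
    context=context,
    action=action,
    reward=reward,
    position=None,                   # ... so None is accepted; required when len_list > 1
)
print(estimated_rewards.shape)       # expected: (n_rounds, n_actions, len_list)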

obp/policy/offline.py

Lines changed: 18 additions & 12 deletions

@@ -255,7 +255,7 @@ def sample_action(
 Note
 --------
 This `sample_action` method samples a **non-repetitive** ranking of actions for new data
-:math:`x \\in \\mathcal{X}` via the so-colled "Gumbel Softmax trick" as follows.
+:math:`x \\in \\mathcal{X}` via the so-called "Gumbel Softmax trick" as follows.

 .. math::

@@ -264,6 +264,7 @@ def sample_action(
 :math:`\\tau` is a temperature hyperparameter.
 :math:`f: \\mathcal{X} \\times \\mathcal{A} \\times \\mathcal{K} \\rightarrow \\mathbb{R}_{+}`
 is a scoring function which is now implemented in the `predict_score` method.
+When `len_list > 0`, the expected rewards estimated at different positions will be averaged to form :math:`f(x,a)`.
 :math:`\\gamma_{x,a}` is a random variable sampled from the Gumbel distribution.
 By sorting the actions based on :math:`\\s (x,a)` for each context, we can efficiently sample a ranking from
 the Plackett-Luce ranking distribution.
@@ -282,7 +283,7 @@ def sample_action(

 Returns
 -----------
-action: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
+sampled_ranking: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
     Ranking of actions sampled by the Gumbel softmax trick.

 """
@@ -291,13 +292,15 @@ def sample_action(

 n_rounds = context.shape[0]
 random_ = check_random_state(random_state)
-sampled_action = np.zeros((n_rounds, self.n_actions, self.len_list))
+sampled_ranking = np.zeros((n_rounds, self.n_actions, self.len_list))
 scores = self.predict_score(context=context).mean(2) / tau
 scores += random_.gumbel(size=scores.shape)
-ranking = np.argsort(-scores, axis=1)
+sampled_ranking_full = np.argsort(-scores, axis=1)
 for position_ in np.arange(self.len_list):
-    sampled_action[np.arange(n_rounds), ranking[:, position_], position_] = 1
-return sampled_action
+    sampled_ranking[
+        np.arange(n_rounds), sampled_ranking_full[:, position_], position_
+    ] = 1
+return sampled_ranking

 def predict_proba(
     self,
@@ -479,7 +482,7 @@ def predict(
 -----------
 action_dist: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
     Deterministic action choices by the QLearner.
-    The output can contain duplicate items (when `len_list > 2`).
+    The output can contain duplicated items (when `len_list > 1`).

 """
 check_array(array=context, name="context", expected_dim=2)
@@ -494,7 +497,7 @@ def predict(
 action_dist[
     np.arange(n_rounds),
     q_hat_argmax[:, p],
-    np.ones(n_rounds, dtype=int) * p,
+    p,
 ] = 1
 return action_dist

@@ -528,7 +531,7 @@ def sample_action(
 Note
 --------
 This `sample_action` method samples a ranking of (non-repetitive) actions for new data
-based on :math:`\\hat{q}` and the so-colled "Gumbel Softmax trick" as follows.
+based on :math:`\\hat{q}` and the so-called "Gumbel Softmax trick" as follows.

 .. math::

@@ -537,6 +540,7 @@ def sample_action(
 :math:`\\tau` is a temperature hyperparameter.
 :math:`\\hat{q}: \\mathcal{X} \\times \\mathcal{A} \\times \\mathcal{K} \\rightarrow \\mathbb{R}_{+}`
 is a q function estimator, which is now implemented in the `predict_score` method.
+When `len_list > 0`, the expected rewards estimated at different positions will be averaged to form :math:`f(x,a)`.
 :math:`\\gamma_{x,a}` is a random variable sampled from the Gumbel distribution.
 By sorting the actions based on :math:`\\s (x,a)` for each context, we can efficiently sample a ranking from
 the Plackett-Luce ranking distribution.
@@ -586,11 +590,12 @@ def predict_proba(

 .. math::

-    \\pi (a | x) = \\frac{\\mathrm{exp}( \\hat{q}(x,a) / \\tau)}{\\sum_{a^{\\prime} \\in \\mathcal{A}} \\mathrm{exp}( \\hat{q}(x,a^{\\prime}) / \\tau)}
+    \\pi_{k} (a | x) = \\frac{\\mathrm{exp}( \\hat{q}_{k}(x,a) / \\tau)}{\\sum_{a^{\\prime} \\in \\mathcal{A}} \\mathrm{exp}( \\hat{q}_{k}(x,a^{\\prime}) / \\tau)}

+where :math:`\\pi_{k} (a | x)` is the resulting action choice probabilities at position :math:`k`.
 :math:`\\tau` is a temperature hyperparameter.
 :math:`\\hat{q}: \\mathcal{X} \\times \\mathcal{A} \\times \\mathcal{K} \\rightarrow \\mathbb{R}_{+}`
-is a q function estimator, which is now implemented in the `predict_score` method.
+is a q function estimator for position :math:`k`, which is now implemented in the `predict_score` method.

 Parameters
 ----------------
@@ -1290,7 +1295,7 @@ def sample_action(
 Note
 --------
 This `sample_action` method samples a **non-repetitive** ranking of actions for new data
-:math:`x \\in \\mathcal{X}` via the so-colled "Gumbel Softmax trick" as follows.
+:math:`x \\in \\mathcal{X}` via the so-called "Gumbel Softmax trick" as follows.

 .. math::

@@ -1299,6 +1304,7 @@ def sample_action(
 :math:`\\tau` is a temperature hyperparameter.
 :math:`f: \\mathcal{X} \\times \\mathcal{A} \\times \\mathcal{K} \\rightarrow \\mathbb{R}_{+}`
 is a scoring function which is now implemented in the `predict_score` method.
+When `len_list > 0`, the expected rewards estimated at different positions will be averaged to form :math:`f(x,a)`.
 :math:`\\gamma_{x,a}` is a random variable sampled from the Gumbel distribution.
 By sorting the actions based on :math:`\\s (x,a)` for each context, we can efficiently sample a ranking from
 the Plackett-Luce ranking distribution.
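The rewritten `sample_action` above implements the Gumbel softmax (Gumbel top-k) trick described in its docstring: divide the scores by the temperature, add Gumbel noise, and argsort to obtain a non-repetitive ranking distributed according to the Plackett-Luce model. A standalone NumPy sketch of the same mechanics (variable names are illustrative, not the library's):

# Standalone illustration of the Gumbel softmax trick used by sample_action:
# perturbing scores/tau with Gumbel noise and sorting yields a Plackett-Luce
# sample over rankings without computing any ranking probabilities explicitly.
import numpy as np
from sklearn.utils import check_random_state

n_rounds, n_actions, len_list, tau = 4, 6, 3, 1.0
random_ = check_random_state(12345)

scores = random_.normal(size=(n_rounds, n_actions)) / tau  # stands in for predict_score(...).mean(2) / tau
scores += random_.gumbel(size=scores.shape)                # Gumbel perturbation
ranking_full = np.argsort(-scores, axis=1)                 # best-to-worst action indices per round

# One-hot encode the top `len_list` actions position by position,
# mirroring the loop in the rewritten sample_action.
sampled_ranking = np.zeros((n_rounds, n_actions, len_list))
for position_ in np.arange(len_list):
    sampled_ranking[np.arange(n_rounds), ranking_full[:, position_], position_] = 1

print(sampled_ranking.sum(axis=1))  # each (round, position) pair selects exactly one action

The related QLearner `predict` hunk simply replaces `np.ones(n_rounds, dtype=int) * p` with the scalar `p`; NumPy's advanced indexing broadcasts the scalar across the position axis, so the behavior is unchanged.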
