
new feature: is_factorizable option in SyntheticSlateBanditDataset #100


Merged
merged 5 commits into from May 28, 2021
158 changes: 103 additions & 55 deletions obp/dataset/synthetic_slate.py
@@ -4,7 +4,7 @@
 """Class for Generating Synthetic Slate Logged Bandit Feedback."""
 from dataclasses import dataclass
 from typing import Optional, Callable, Tuple, Union, List
-from itertools import permutations
+from itertools import permutations, product
 
 import numpy as np
 from scipy.stats import truncnorm
@@ -94,6 +94,10 @@ class SyntheticSlateBanditDataset(BaseBanditDataset):
         i.e., :math:`f: \\mathcal{X} \\rightarrow \\mathbb{R}^{\\mathcal{A}}`.
         If None is set, context **independent** uniform distribution will be used (uniform behavior policy).
 
+    is_factorizable: bool
+        Whether to use a factorizable policy, which samples the action of each slot independently of the other slots.
+        When `n_unique_action` and `len_list` are large, this should be set to True, because handling all non-factorizable permutations becomes computationally prohibitive.
+
     random_state: int, default=12345
         Controls the random seed in sampling synthetic slate bandit dataset.
 
@@ -180,6 +184,7 @@ class SyntheticSlateBanditDataset(BaseBanditDataset):
     behavior_policy_function: Optional[
         Callable[[np.ndarray, np.ndarray], np.ndarray]
     ] = None
+    is_factorizable: bool = False
     random_state: int = 12345
     dataset_name: str = "synthetic_slate_bandit_dataset"
 
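Note: a minimal usage sketch of the new flag; the constructor arguments below are illustrative values, not taken from this PR, while the class and method names are the existing obp API.

import numpy as np
from obp.dataset import SyntheticSlateBanditDataset

dataset = SyntheticSlateBanditDataset(
    n_unique_action=10,
    len_list=3,
    dim_context=2,
    reward_type="binary",
    random_state=12345,
    is_factorizable=True,  # slot actions are drawn independently; a slate may repeat an action
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=1000)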
@@ -389,23 +394,28 @@ def obtain_pscore_given_evaluation_policy_logit(
             total=n_rounds,
         ):
             unique_action_set = np.arange(self.n_unique_action)
+            score_ = softmax(evaluation_policy_logit_[i : i + 1])[0]
             pscore_i = 1.0
             for position_ in np.arange(self.len_list):
                 action_ = action[i * self.len_list + position_]
                 action_index_ = np.where(unique_action_set == action_)[0][0]
-                score_ = softmax(
-                    evaluation_policy_logit_[i : i + 1, unique_action_set]
-                )[0][action_index_]
                 # calculate joint pscore
-                pscore_i *= score_
+                pscore_i *= score_[action_index_]
                 pscore_cascade[i * self.len_list + position_] = pscore_i
-                unique_action_set = np.delete(
-                    unique_action_set, unique_action_set == action_
-                )
-                # calculate marginal pscore
+                # update the pscore given the remaining items for nonfactorizable policy
+                if not self.is_factorizable and position_ != self.len_list - 1:
+                    unique_action_set = np.delete(
+                        unique_action_set, unique_action_set == action_
+                    )
+                    score_ = softmax(
+                        evaluation_policy_logit_[i : i + 1, unique_action_set]
+                    )[0]
+                # calculate pscore_item_position
                 if return_pscore_item_position:
                     if position_ == 0:
                         pscore_item_position_i_l = pscore_i
+                    elif self.is_factorizable:
+                        pscore_item_position_i_l = score_[action_index_]
                     else:
                         pscore_item_position_i_l = 0.0
                         for action_list in permutations(
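Note: a standalone sketch of the pscore logic in the hunk above, assuming hypothetical logits and using scipy's softmax in place of obp's internal helper.

import numpy as np
from scipy.special import softmax

logit = np.array([1.0, 0.5, 0.2, -0.3])  # hypothetical logits over n_unique_action=4
slate = [2, 0]  # a slate with len_list=2

# factorizable: score_ is computed once from the full action set; slots are independent
p = softmax(logit)
pscore_factorizable = p[slate[0]] * p[slate[1]]

# nonfactorizable: the chosen action is removed and the remaining logits are re-softmaxed
p_first = softmax(logit)[slate[0]]
remaining = np.delete(np.arange(4), slate[0])
p_second = softmax(logit[remaining])[np.where(remaining == slate[1])[0][0]]
pscore_nonfactorizable = p_first * p_second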
@@ -481,11 +491,9 @@ def sample_action_and_obtain_pscore(
             total=n_rounds,
         ):
             unique_action_set = np.arange(self.n_unique_action)
+            score_ = softmax(behavior_policy_logit_[i : i + 1, unique_action_set])[0]
             pscore_i = 1.0
             for position_ in np.arange(self.len_list):
-                score_ = softmax(behavior_policy_logit_[i : i + 1, unique_action_set])[
-                    0
-                ]
                 sampled_action = self.random_.choice(
                     unique_action_set, p=score_, replace=False
                 )
@@ -496,13 +504,20 @@
                 # calculate joint pscore
                 pscore_i *= score_[sampled_action_index]
                 pscore_cascade[i * self.len_list + position_] = pscore_i
-                unique_action_set = np.delete(
-                    unique_action_set, unique_action_set == sampled_action
-                )
-                # calculate marginal pscore
+                # update the pscore given the remaining items for nonfactorizable behavior policy
+                if not self.is_factorizable and position_ != self.len_list - 1:
+                    unique_action_set = np.delete(
+                        unique_action_set, unique_action_set == sampled_action
+                    )
+                    score_ = softmax(
+                        behavior_policy_logit_[i : i + 1, unique_action_set]
+                    )[0]
+                # calculate pscore_item_position
                 if return_pscore_item_position:
                     if self.behavior_policy_function is None:  # uniform random
                         pscore_item_position_i_l = 1 / self.n_unique_action
+                    elif self.is_factorizable:
+                        pscore_item_position_i_l = score_[sampled_action_index]
                     elif position_ == 0:
                         pscore_item_position_i_l = pscore_i
                     else:
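Note: the behavioral difference in sampling can be sketched as follows; the logits are hypothetical and numpy's Generator stands in for the dataset's internal random_ state.

import numpy as np
from scipy.special import softmax

rng = np.random.default_rng(12345)  # stands in for self.random_
logit = np.array([1.0, 0.5, 0.2, -0.3])  # hypothetical behavior policy logits
n_unique_action, len_list = 4, 3

# factorizable: every slot draws from the full action set, so duplicates can occur
slate_factorizable = rng.choice(n_unique_action, size=len_list, p=softmax(logit))

# nonfactorizable: each draw removes the sampled action and renormalizes the rest
candidates = np.arange(n_unique_action)
slate = []
for _ in range(len_list):
    sampled = rng.choice(candidates, p=softmax(logit[candidates]))
    slate.append(sampled)
    candidates = candidates[candidates != sampled]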
@@ -741,44 +756,52 @@ def calc_on_policy_policy_value(
         return reward.sum() / np.unique(slate_id).shape[0]
 
     def calc_ground_truth_policy_value(
-        self, evaluation_policy_logit: np.ndarray, context: np.ndarray
+        self,
+        context: np.ndarray,
+        evaluation_policy_logit_: np.ndarray,
     ):
         """Calculate the ground-truth policy value of the given evaluation policy logit and context
 
         Parameters
         -----------
-        evaluation_policy_logit: array-like, shape (n_rounds, n_unique_action)
-            Evaluation policy function generating logit value of each action in action space.
-
         context: array-like, shape (n_rounds, dim_context)
             Context vectors characterizing each round (such as user information).
 
+        evaluation_policy_logit_: array-like, shape (n_rounds, n_unique_action)
+            Evaluation policy function generating logit value of each action in action space.
+
         """
         if (
-            not isinstance(evaluation_policy_logit, np.ndarray)
-            or evaluation_policy_logit.ndim != 2
+            not isinstance(evaluation_policy_logit_, np.ndarray)
+            or evaluation_policy_logit_.ndim != 2
         ):
-            raise ValueError("evaluation_policy_logit must be 2-dimensional ndarray")
-        if evaluation_policy_logit.shape[1] != self.n_unique_action:
+            raise ValueError("evaluation_policy_logit_ must be 2-dimensional ndarray")
+        if evaluation_policy_logit_.shape[1] != self.n_unique_action:
             raise ValueError(
-                "the size of axis 1 of evaluation_policy_logit must be the same as n_unique_action"
+                "the size of axis 1 of evaluation_policy_logit_ must be the same as n_unique_action"
             )
         if not isinstance(context, np.ndarray) or context.ndim != 2:
             raise ValueError("context must be 2-dimensional ndarray")
         if context.shape[1] != self.dim_context:
             raise ValueError(
                 "the size of axis 1 of context must be the same as dim_context"
             )
-        if evaluation_policy_logit.shape[0] != context.shape[0]:
+        if evaluation_policy_logit_.shape[0] != context.shape[0]:
             raise ValueError(
-                "the length of evaluation_policy_logit and context must be same"
+                "the length of evaluation_policy_logit_ and context must be the same"
             )
 
-        enumerated_slate_actions = [
-            _ for _ in permutations(np.arange(self.n_unique_action), self.len_list)
-        ]
+        if self.is_factorizable:
+            enumerated_slate_actions = [
+                _
+                for _ in product(np.arange(self.n_unique_action), repeat=self.len_list)
+            ]
+        else:
+            enumerated_slate_actions = [
+                _ for _ in permutations(np.arange(self.n_unique_action), self.len_list)
+            ]
         n_slate_actions = len(enumerated_slate_actions)
-        n_rounds = len(evaluation_policy_logit)
+        n_rounds = len(evaluation_policy_logit_)
         policy_value = 0
 
         for i in tqdm(
@@ -788,13 +811,24 @@ def calc_ground_truth_policy_value(
         ):
             # calculate pscore for each combinatorial set of items (i.e., slate actions)
             pscores = []
-            for action_list in enumerated_slate_actions:
-                pscores.append(
-                    self._calc_pscore_given_action_list(
-                        action_list=action_list,
-                        policy_logit_i_=evaluation_policy_logit[i : i + 1],
+            if self.is_factorizable:
+                pscore_when_factorizable = softmax(evaluation_policy_logit_[i : i + 1])[
+                    0
+                ]
+                for action_list in enumerated_slate_actions:
+                    pscores.append(
+                        np.cumprod(
+                            [pscore_when_factorizable[a_] for a_ in action_list]
+                        )[-1]
+                    )
+            else:
+                for action_list in enumerated_slate_actions:
+                    pscores.append(
+                        self._calc_pscore_given_action_list(
+                            action_list=action_list,
+                            policy_logit_i_=evaluation_policy_logit_[i : i + 1],
+                        )
                     )
-                )
             pscores = np.array(pscores)
 
             # calculate expected slate-level reward for each combinatorial set of items (i.e., slate actions)
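Note: the size of the enumerated action space and the factorizable joint pscore can be checked with this sketch; n_unique_action, len_list, and the slot distribution are hypothetical values. (np.cumprod(...)[-1] is equivalent to np.prod(...) here; the diff reuses the cumprod idiom from the cascade pscores.)

import numpy as np
from itertools import permutations, product
from math import perm

n_unique_action, len_list = 4, 3
# nonfactorizable slates are permutations without repetition: nPr of them
assert len(list(permutations(range(n_unique_action), len_list))) == perm(n_unique_action, len_list)  # 24
# factorizable slates allow repetition: n ** len_list of them
assert len(list(product(range(n_unique_action), repeat=len_list))) == n_unique_action ** len_list  # 64

# the factorizable joint pscore is the product of independent slot probabilities
p = np.array([0.4, 0.3, 0.2, 0.1])  # hypothetical slot distribution
action_list = (2, 2, 0)  # duplicates are legal under a factorizable policy
assert np.isclose(np.cumprod([p[a] for a in action_list])[-1], p[2] * p[2] * p[0])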
@@ -905,20 +939,30 @@ def generate_evaluation_policy_pscore(
             raise ValueError("context must be 2-dimensional ndarray")
 
         # [Caution]: OverflowError raises when integer division result is too large for a float
-        random_pscore_cascade = (
-            1.0
-            / np.tile(
-                np.arange(
-                    self.n_unique_action, self.n_unique_action - self.len_list, -1
-                ),
-                (context.shape[0], 1),
+        if self.is_factorizable:
+            random_pscore_cascade = (
+                (np.ones((context.shape[0], self.len_list)) / self.n_unique_action)
+                .cumprod(axis=1)
+                .flatten()
             )
+            random_pscore = np.ones(context.shape[0] * self.len_list) / (
+                self.n_unique_action ** self.len_list
+            )
+        else:
+            random_pscore_cascade = (
+                1.0
+                / np.tile(
+                    np.arange(
+                        self.n_unique_action, self.n_unique_action - self.len_list, -1
+                    ),
+                    (context.shape[0], 1),
+                )
+                .cumprod(axis=1)
+                .flatten()
+            )
+            random_pscore = np.ones(context.shape[0] * self.len_list) / perm(
+                self.n_unique_action, self.len_list
+            )
-            .cumprod(axis=1)
-            .flatten()
-        )
-        random_pscore = np.ones(context.shape[0] * self.len_list) / perm(
-            self.n_unique_action, self.len_list
-        )
         random_pscore_item_position = (
             np.ones(context.shape[0] * self.len_list) / self.n_unique_action
         )
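Note: a quick numeric check of the uniform-random pscore formulas in the hunk above, with hypothetical n and L.

import numpy as np
from math import perm

n, L = 4, 3  # hypothetical n_unique_action and len_list

# factorizable uniform policy: each slot is an independent uniform draw
cascade_factorizable = (np.ones(L) / n).cumprod()  # [1/4, 1/16, 1/64]
joint_factorizable = 1 / n ** L  # 1/64

# nonfactorizable uniform policy: draws without replacement
cascade_permutation = (1.0 / np.arange(n, n - L, -1)).cumprod()  # [1/4, 1/12, 1/24]
joint_permutation = 1 / perm(n, L)  # 1/24

assert np.isclose(cascade_factorizable[-1], joint_factorizable)
assert np.isclose(cascade_permutation[-1], joint_permutation)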
@@ -994,15 +1038,15 @@ def _calc_epsilon_greedy_pscore(
             When bandit_feedback is obtained by `obtain_batch_bandit_feedback`, we can obtain action_2d as follows: bandit_feedback["action"].reshape((n_rounds, len_list))
             When evaluation_policy_type is 'random', this argument is unnecessary.
 
-        random_pscore: array-like, shape (n_unique_action * len_list)
+        random_pscore: array-like, shape (n_unique_action * len_list, )
             Joint action choice probabilities of the slate given context (:math:`x`) when the evaluation policy is random.
             i.e., :math:`\\frac{1}{{}_{n} P _r}`, where :math:`n` is `n_unique_actions` and :math:`r` is `len_list`.
 
-        random_pscore_item_position: array-like, shape (n_unique_action * len_list)
+        random_pscore_item_position: array-like, shape (n_unique_action * len_list, )
             Marginal action choice probabilities of each slot given context (:math:`x`) when the evaluation policy is random.
             i.e., :math:`\\frac{1}{n}`, where :math:`n` is `n_unique_actions`.
 
-        random_pscore_cascade: array-like, shape (n_unique_action * len_list)
+        random_pscore_cascade: array-like, shape (n_unique_action * len_list, )
             Joint action choice probabilities above the slot (:math:`k`) in each slate given context (:math:`x`) when the evaluation policy is random.
             i.e., :math:`\\frac{1}{{}_{n} P _k}`, where :math:`n` is `n_unique_actions`.
 
@@ -1024,8 +1068,12 @@ def _calc_epsilon_greedy_pscore(
         """
         if not isinstance(action_2d, np.ndarray) or action_2d.ndim != 2:
             raise ValueError("action_2d must be 2-dimensional ndarray")
-        if set([np.unique(x).shape[0] for x in action_2d]) != set([self.len_list]):
-            raise ValueError("actions of each slate must not be duplicated")
+        if not self.is_factorizable and set(
+            [np.unique(x).shape[0] for x in action_2d]
+        ) != set([self.len_list]):
+            raise ValueError(
+                "when is_factorizable=False, actions of each slate must not be duplicated"
+            )
         action_match_flag = sorted_actions == action_2d
         pscore_flg = np.repeat(action_match_flag.all(axis=1), self.len_list)
         pscore_item_position_flg = action_match_flag.flatten()
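Note: the relaxed validation can be exercised standalone; check_slate_duplicates is a hypothetical helper mirroring the check above.

import numpy as np

def check_slate_duplicates(action_2d: np.ndarray, len_list: int, is_factorizable: bool) -> None:
    # duplicates within a slate are only legal when the policy is factorizable
    if not is_factorizable and set(np.unique(x).shape[0] for x in action_2d) != {len_list}:
        raise ValueError(
            "when is_factorizable=False, actions of each slate must not be duplicated"
        )

check_slate_duplicates(np.array([[0, 1, 2], [2, 1, 0]]), len_list=3, is_factorizable=False)  # passes
check_slate_duplicates(np.array([[0, 0, 2]]), len_list=3, is_factorizable=True)  # passes: duplicates allowed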