Modify Synthetic Reward/Behavior Policy Functions #145

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 10 commits merged on Nov 13, 2021
10 changes: 10 additions & 0 deletions obp/dataset/__init__.py
@@ -4,7 +4,12 @@
from obp.dataset.real import OpenBanditDataset
from obp.dataset.synthetic import linear_behavior_policy
from obp.dataset.synthetic import linear_reward_function
from obp.dataset.synthetic import logistic_polynomial_reward_function
from obp.dataset.synthetic import logistic_reward_function
from obp.dataset.synthetic import logistic_sparse_reward_function
from obp.dataset.synthetic import polynomial_behavior_policy
from obp.dataset.synthetic import polynomial_reward_function
from obp.dataset.synthetic import sparse_reward_function
from obp.dataset.synthetic import SyntheticBanditDataset
from obp.dataset.synthetic_continuous import linear_behavior_policy_continuous
from obp.dataset.synthetic_continuous import linear_reward_funcion_continuous
@@ -24,8 +29,13 @@
"OpenBanditDataset",
"SyntheticBanditDataset",
"logistic_reward_function",
"logistic_polynomial_reward_function",
"logistic_sparse_reward_function",
"linear_reward_function",
"polynomial_reward_function",
"sparse_reward_function",
"linear_behavior_policy",
"polynomial_behavior_policy",
"MultiClassToBanditReduction",
"SyntheticContinuousBanditDataset",
"linear_reward_funcion_continuous",
543 changes: 469 additions & 74 deletions obp/dataset/synthetic.py

Large diffs are not rendered by default.
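The full diff of `obp/dataset/synthetic.py` is not rendered, but the `__init__.py` changes above show which callables it newly exports. As a minimal sketch (not taken from this PR, and assuming the new functions keep the `(context, action_context, random_state)` callable interface of the existing `logistic_reward_function` and `linear_behavior_policy`), they can be plugged into `SyntheticBanditDataset` like this:

from obp.dataset import (
    SyntheticBanditDataset,
    logistic_polynomial_reward_function,
    polynomial_behavior_policy,
)

# the polynomial variants are used exactly like the pre-existing linear/logistic ones
dataset = SyntheticBanditDataset(
    n_actions=10,
    dim_context=5,
    reward_function=logistic_polynomial_reward_function,
    behavior_policy_function=polynomial_behavior_policy,
    random_state=12345,
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=1000)
print(bandit_feedback["action"].shape)  # (1000,)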

18 changes: 9 additions & 9 deletions obp/dataset/synthetic_continuous.py
@@ -25,7 +25,7 @@ class SyntheticContinuousBanditDataset(BaseBanditDataset):
-----
By calling the `obtain_batch_bandit_feedback` method several times, we have different bandit samples with the same setting.
This can be used to estimate confidence intervals of the performances of OPE estimators for continuous actions.
If None is set as `behavior_policy_function`, the synthetic data will be context-free bandit feedback.
If None is given as `behavior_policy_function`, the synthetic data will be context-free bandit feedback.

Parameters
-----------
@@ -47,13 +47,13 @@ class SyntheticContinuousBanditDataset(BaseBanditDataset):
reward_function: Callable[[np.ndarray, np.ndarray], np.ndarray]], default=None
Function generating expected reward for each given action-context pair,
i.e., :math:`\\mu: \\mathcal{X} \\times \\mathcal{A} \\rightarrow \\mathbb{R}`.
If None is set, context **independent** expected reward for each action will be
If None is given, context **independent** expected reward for each action will be
sampled from the uniform distribution automatically.

behavior_policy_function: Callable[[np.ndarray, np.ndarray], np.ndarray], default=None
Function generating the propensity score of continuous actions,
i.e., :math:`\\f: \\mathcal{X} \\rightarrow \\mathbb{R}^{\\mathcal{A}}`.
If None is set, context **independent** uniform distribution will be used (uniform behavior policy).
If None is given, context **independent** uniform distribution will be used (uniform behavior policy).

random_state: int, default=12345
Controls the random seed in sampling synthetic slate bandit dataset.
@@ -274,7 +274,7 @@ def linear_reward_funcion_continuous(
Parameters
-----------
context: array-like, shape (n_rounds, dim_context)
Context vectors characterizing each round (such as user information).
Context vectors characterizing each data (such as user information).

action: array-like, shape (n_rounds,)
Continuous action values.
@@ -311,7 +311,7 @@ def quadratic_reward_funcion_continuous(
Parameters
-----------
context: array-like, shape (n_rounds, dim_context)
Context vectors characterizing each round (such as user information).
Context vectors characterizing each data (such as user information).

action: array-like, shape (n_rounds,)
Continuous action values.
@@ -353,7 +353,7 @@ def linear_behavior_policy_continuous(
Parameters
-----------
context: array-like, shape (n_rounds, dim_context)
Context vectors characterizing each round (such as user information).
Context vectors characterizing each data (such as user information).

random_state: int, default=None
Controls the random seed in sampling parameters.
@@ -381,7 +381,7 @@ def linear_synthetic_policy_continuous(context: np.ndarray) -> np.ndarray:
Parameters
-----------
context: array-like, shape (n_rounds, dim_context)
Context vectors characterizing each round (such as user information).
Context vectors characterizing each data (such as user information).

Returns
---------
@@ -400,7 +400,7 @@ def threshold_synthetic_policy_continuous(context: np.ndarray) -> np.ndarray:
Parameters
-----------
context: array-like, shape (n_rounds, dim_context)
Context vectors characterizing each round (such as user information).
Context vectors characterizing each data (such as user information).

Returns
---------
@@ -419,7 +419,7 @@ def sign_synthetic_policy_continuous(context: np.ndarray) -> np.ndarray:
Parameters
-----------
context: array-like, shape (n_rounds, dim_context)
Context vectors characterizing each round (such as user information).
Context vectors characterizing each data (such as user information).

Returns
---------
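All of the continuous synthetic policies documented above share one contract: they map a context array of shape `(n_rounds, dim_context)` to one continuous action value per data point. A user-defined policy can therefore sit alongside `linear_synthetic_policy_continuous` and friends; the sketch below (a hypothetical `clipped_mean_synthetic_policy_continuous`, not part of this PR) only illustrates that shape contract.

import numpy as np


def clipped_mean_synthetic_policy_continuous(context: np.ndarray) -> np.ndarray:
    """Hypothetical deterministic policy for continuous actions.

    Same contract as linear_synthetic_policy_continuous:
    context has shape (n_rounds, dim_context); the output has shape (n_rounds,).
    """
    # per-round mean of the context features, clipped to a fixed action range
    return np.clip(context.mean(axis=1), -1.0, 1.0)


context = np.random.normal(size=(100, 5))
assert clipped_mean_synthetic_policy_continuous(context).shape == (100,)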
25 changes: 12 additions & 13 deletions obp/dataset/synthetic_slate.py
@@ -11,6 +11,7 @@
from typing import Union

import numpy as np
from scipy.special import logit
from scipy.special import perm
from scipy.stats import truncnorm
from sklearn.utils import check_random_state
@@ -34,7 +35,7 @@ class SyntheticSlateBanditDataset(BaseBanditDataset):
we have different bandit samples with the same setting.
This can be used to estimate confidence intervals of the performances of Slate OPE estimators.

If None is set as `behavior_policy_function`, the synthetic data will be context-free bandit feedback.
If None is given as `behavior_policy_function`, the synthetic data will be context-free bandit feedback.

Parameters
-----------
@@ -93,13 +94,13 @@ class SyntheticSlateBanditDataset(BaseBanditDataset):
base_reward_function: Callable[[np.ndarray, np.ndarray], np.ndarray], default=None
Function generating expected reward for each given action-context pair,
i.e., :math:`\\mu: \\mathcal{X} \\times \\mathcal{A} \\rightarrow \\mathbb{R}`.
If None is set, context **independent** expected reward for each action will be
If None is given, context **independent** expected reward for each action will be
sampled from the uniform distribution automatically.

behavior_policy_function: Callable[[np.ndarray, np.ndarray], np.ndarray], default=None
Function generating logit value of each action in action space,
i.e., :math:`\\f: \\mathcal{X} \\rightarrow \\mathbb{R}^{\\mathcal{A}}`.
If None is set, context **independent** uniform distribution will be used (uniform behavior policy).
If None is given, context **independent** uniform distribution will be used (uniform behavior policy).

is_factorizable: bool
A boolean parameter whether to use factorizable evaluation policy (which choose slot actions independently) or not.
@@ -865,7 +866,7 @@ def calc_ground_truth_policy_value(
Parameters
-----------
context: array-like, shape (n_rounds, dim_context)
Context vectors characterizing each round (such as user information).
Context vectors characterizing each data (such as user information).

evaluation_policy_logit_: array-like, shape (n_rounds, n_unique_action)
Evaluation policy function generating logit value of each action in action space.
@@ -1029,7 +1030,7 @@ def generate_evaluation_policy_pscore(
When 'random' is given, we calculate the three variants of the propensity scores of the uniform random policy.

context: array-like, shape (n_rounds, dim_context)
Context vectors characterizing each round (such as user information).
Context vectors characterizing each data (such as user information).

action: array-like, shape (n_rounds * len_list,), default=None
Actions sampled by a behavior policy.
@@ -1257,10 +1258,10 @@ def action_interaction_reward_function(
Parameters
-----------
context: array-like, shape (n_rounds, dim_context)
Context vectors characterizing each round (such as user information).
Context vectors characterizing each data (such as user information).

action_context: array-like, shape (n_unique_action, dim_action_context)
Vector representation for each action.
Vector representation of actions.

action: array-like, shape (n_rounds * len_list, ) or (len(enumerated_slate_actions) * len_list, )
When is_enumerated=False, action corresponds to actions sampled by a (often behavior) policy.
@@ -1270,7 +1271,7 @@ def action_interaction_reward_function(
base_reward_function: Callable[[np.ndarray, np.ndarray], np.ndarray]], default=None
Function generating expected reward for each given action-context pair,
i.e., :math:`\\mu: \\mathcal{X} \\times \\mathcal{A} \\rightarrow \\mathbb{R}`.
If None is set, context **independent** expected reward for each action will be
If None is given, context **independent** expected reward for each action will be
sampled from the uniform distribution automatically.

reward_type: str, default='binary'
@@ -1365,9 +1366,7 @@ def action_interaction_reward_function(
context=context, action_context=action_context, random_state=random_state
)
if reward_type == "binary":
expected_reward = np.log(expected_reward / (1 - expected_reward)).astype(
"float16"
)
expected_reward = logit(expected_reward)
expected_reward_factual = np.zeros_like(action_2d, dtype="float16")
for position_ in np.arange(len_list):
tmp_fixed_reward = expected_reward[
@@ -1424,10 +1423,10 @@ def linear_behavior_policy_logit(
Parameters
-----------
context: array-like, shape (n_rounds, dim_context)
Context vectors characterizing each round (such as user information).
Context vectors characterizing each data (such as user information).

action_context: array-like, shape (n_unique_action, dim_action_context)
Vector representation for each action.
Vector representation of actions.

random_state: int, default=None
Controls the random seed in sampling dataset.
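The one behavioral tweak in this file replaces the handwritten log-odds transform (and its `float16` cast at that point) with `scipy.special.logit`. The two expressions are mathematically identical, as a quick standalone check confirms:

import numpy as np
from scipy.special import logit

expected_reward = np.random.uniform(0.05, 0.95, size=(4, 3))
manual = np.log(expected_reward / (1 - expected_reward))  # old formulation
assert np.allclose(logit(expected_reward), manual)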
4 changes: 2 additions & 2 deletions obp/ope/regression_model.py
@@ -114,7 +114,7 @@ def fit(

position: array-like, shape (n_rounds,), default=None
Position of recommendation interface where action was presented in each round of the given logged bandit data.
If None is set, a regression model assumes that there is only one position.
If None is given, a regression model assumes that there is only one position.
When `len_list` > 1, this position argument has to be set.

action_dist: array-like, shape (n_rounds, n_actions, len_list), default=None
@@ -258,7 +258,7 @@ def fit_predict(

position: array-like, shape (n_rounds,), default=None
Position of recommendation interface where action was presented in each round of the given logged bandit data.
If None is set, a regression model assumes that there is only one position.
If None is given, a regression model assumes that there is only one position.
When `len_list` > 1, this position argument has to be set.

action_dist: array-like, shape (n_rounds, n_actions, len_list), default=None
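For context on the `position` argument described above: when `len_list == 1`, `position` can simply be left as None and the regression model treats all data as coming from a single position. A minimal sketch of that default usage (assuming the standard `RegressionModel` constructor and the bandit feedback dictionary produced by `SyntheticBanditDataset`):

from sklearn.linear_model import LogisticRegression

from obp.dataset import SyntheticBanditDataset, logistic_reward_function
from obp.ope import RegressionModel

dataset = SyntheticBanditDataset(
    n_actions=10,
    dim_context=5,
    reward_function=logistic_reward_function,
    random_state=12345,
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=1000)

regression_model = RegressionModel(
    n_actions=dataset.n_actions,
    len_list=1,
    base_model=LogisticRegression(max_iter=1000),
)
estimated_rewards = regression_model.fit_predict(
    context=bandit_feedback["context"],
    action=bandit_feedback["action"],
    reward=bandit_feedback["reward"],
    position=None,  # single-position setting, so None is fine
)
print(estimated_rewards.shape)  # (n_rounds, n_actions, len_list) = (1000, 10, 1)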
30 changes: 18 additions & 12 deletions obp/policy/offline.py
@@ -255,7 +255,7 @@ def sample_action(
Note
--------
This `sample_action` method samples a **non-repetitive** ranking of actions for new data
:math:`x \\in \\mathcal{X}` via the so-colled "Gumbel Softmax trick" as follows.
:math:`x \\in \\mathcal{X}` via the so-called "Gumbel Softmax trick" as follows.

.. math::

@@ -264,6 +264,7 @@ def sample_action(
:math:`\\tau` is a temperature hyperparameter.
:math:`f: \\mathcal{X} \\times \\mathcal{A} \\times \\mathcal{K} \\rightarrow \\mathbb{R}_{+}`
is a scoring function which is now implemented in the `predict_score` method.
When `len_list > 0`, the expected rewards estimated at different positions will be averaged to form :math:`f(x,a)`.
:math:`\\gamma_{x,a}` is a random variable sampled from the Gumbel distribution.
By sorting the actions based on :math:`\\s (x,a)` for each context, we can efficiently sample a ranking from
the Plackett-Luce ranking distribution.
@@ -282,7 +283,7 @@

Returns
-----------
action: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
sampled_ranking: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
Ranking of actions sampled by the Gumbel softmax trick.

"""
@@ -291,13 +292,15 @@

n_rounds = context.shape[0]
random_ = check_random_state(random_state)
sampled_action = np.zeros((n_rounds, self.n_actions, self.len_list))
sampled_ranking = np.zeros((n_rounds, self.n_actions, self.len_list))
scores = self.predict_score(context=context).mean(2) / tau
scores += random_.gumbel(size=scores.shape)
ranking = np.argsort(-scores, axis=1)
sampled_ranking_full = np.argsort(-scores, axis=1)
for position_ in np.arange(self.len_list):
sampled_action[np.arange(n_rounds), ranking[:, position_], position_] = 1
return sampled_action
sampled_ranking[
np.arange(n_rounds), sampled_ranking_full[:, position_], position_
] = 1
return sampled_ranking

def predict_proba(
self,
Expand Down Expand Up @@ -479,7 +482,7 @@ def predict(
-----------
action_dist: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
Deterministic action choices by the QLearner.
The output can contain duplicate items (when `len_list > 2`).
The output can contain duplicated items (when `len_list > 1`).

"""
check_array(array=context, name="context", expected_dim=2)
@@ -494,7 +497,7 @@
action_dist[
np.arange(n_rounds),
q_hat_argmax[:, p],
np.ones(n_rounds, dtype=int) * p,
p,
] = 1
return action_dist

@@ -528,7 +531,7 @@
Note
--------
This `sample_action` method samples a ranking of (non-repetitive) actions for new data
based on :math:`\\hat{q}` and the so-colled "Gumbel Softmax trick" as follows.
based on :math:`\\hat{q}` and the so-called "Gumbel Softmax trick" as follows.

.. math::

@@ -537,6 +540,7 @@
:math:`\\tau` is a temperature hyperparameter.
:math:`\\hat{q}: \\mathcal{X} \\times \\mathcal{A} \\times \\mathcal{K} \\rightarrow \\mathbb{R}_{+}`
is a q function estimator, which is now implemented in the `predict_score` method.
When `len_list > 0`, the expected rewards estimated at different positions will be averaged to form :math:`f(x,a)`.
:math:`\\gamma_{x,a}` is a random variable sampled from the Gumbel distribution.
By sorting the actions based on :math:`\\s (x,a)` for each context, we can efficiently sample a ranking from
the Plackett-Luce ranking distribution.
Expand Down Expand Up @@ -586,11 +590,12 @@ def predict_proba(

.. math::

\\pi (a | x) = \\frac{\\mathrm{exp}( \\hat{q}(x,a) / \\tau)}{\\sum_{a^{\\prime} \\in \\mathcal{A}} \\mathrm{exp}( \\hat{q}(x,a^{\\prime}) / \\tau)}
\\pi_{k} (a | x) = \\frac{\\mathrm{exp}( \\hat{q}_{k}(x,a) / \\tau)}{\\sum_{a^{\\prime} \\in \\mathcal{A}} \\mathrm{exp}( \\hat{q}_{k}(x,a^{\\prime}) / \\tau)}

where :math:`\\pi_{k} (a | x)` is the resulting action choice probabilities at position :math:`k`.
:math:`\\tau` is a temperature hyperparameter.
:math:`\\hat{q}: \\mathcal{X} \\times \\mathcal{A} \\times \\mathcal{K} \\rightarrow \\mathbb{R}_{+}`
is a q function estimator, which is now implemented in the `predict_score` method.
is a q function estimator for position :math:`k`, which is now implemented in the `predict_score` method.

Parameters
----------------
@@ -1290,7 +1295,7 @@ def sample_action(
Note
--------
This `sample_action` method samples a **non-repetitive** ranking of actions for new data
:math:`x \\in \\mathcal{X}` via the so-colled "Gumbel Softmax trick" as follows.
:math:`x \\in \\mathcal{X}` via the so-called "Gumbel Softmax trick" as follows.

.. math::

@@ -1299,6 +1304,7 @@
:math:`\\tau` is a temperature hyperparameter.
:math:`f: \\mathcal{X} \\times \\mathcal{A} \\times \\mathcal{K} \\rightarrow \\mathbb{R}_{+}`
is a scoring function which is now implemented in the `predict_score` method.
When `len_list > 0`, the expected rewards estimated at different positions will be averaged to form :math:`f(x,a)`.
:math:`\\gamma_{x,a}` is a random variable sampled from the Gumbel distribution.
By sorting the actions based on :math:`\\s (x,a)` for each context, we can efficiently sample a ranking from
the Plackett-Luce ranking distribution.
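As a reference for the notes above, the "Gumbel Softmax trick" used by `sample_action` boils down to adding Gumbel noise to the temperature-scaled scores and sorting, which yields a draw from the Plackett-Luce ranking distribution. A self-contained sketch (with random scores standing in for `predict_score(context).mean(2)`):

import numpy as np
from sklearn.utils import check_random_state

n_rounds, n_actions, len_list, tau = 4, 6, 3, 1.0
random_ = check_random_state(12345)

# stand-in for the (position-averaged) scores f(x, a)
scores = random_.normal(size=(n_rounds, n_actions)) / tau
# adding Gumbel noise and sorting descending samples a Plackett-Luce ranking
scores += random_.gumbel(size=scores.shape)
ranking = np.argsort(-scores, axis=1)

# one-hot encode the top-`len_list` actions per round, as sample_action does
sampled_ranking = np.zeros((n_rounds, n_actions, len_list))
for position_ in np.arange(len_list):
    sampled_ranking[np.arange(n_rounds), ranking[:, position_], position_] = 1

# each slot holds exactly one action, and actions do not repeat within a round
assert np.all(sampled_ranking.sum(axis=1) == 1)
assert np.all(sampled_ranking.sum(axis=2) <= 1)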