Skip to content

Feature: Make Hyperparameter tuning method flexible #131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmark/ope/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ It is possible to run multiple experimental settings easily by using the `--mult
For example, the following script sweeps over all simulations including the three campaigns ('all', 'men', and 'women') and two different behavior policies ('random' and 'bts').

```bash
poetry run python benchmark_ope_estimators.py setting.campaign=all,men,women setting.behavior_policy=random.bts --multirun
poetry run python benchmark_ope_estimators.py setting.campaign=all,men,women setting.behavior_policy=random,bts --multirun
```

The experimental results (including the pairwise hypothesis test results) will be stored in the `logs/` directory.
Expand Down
3 changes: 1 addition & 2 deletions benchmark/ope/benchmark_ope_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,13 @@ def main(cfg: DictConfig) -> None:

# compared ope estimators
lambdas = list(dict(cfg.estimator_hyperparams)["lambdas"])
taus = list(dict(cfg.estimator_hyperparams)["taus"])
ope_estimators = [
InverseProbabilityWeighting(estimator_name="IPW"),
SelfNormalizedInverseProbabilityWeighting(estimator_name="SNIPW"),
DirectMethod(estimator_name="DM"),
DoublyRobust(estimator_name="DR"),
SelfNormalizedDoublyRobust(estimator_name="SNDR"),
SwitchDoublyRobustTuning(taus=taus, estimator_name="Switch-DR"),
SwitchDoublyRobustTuning(lambdas=lambdas, estimator_name="Switch-DR"),
DoublyRobustWithShrinkageTuning(lambdas=lambdas, estimator_name="DRos"),
]

Expand Down
10 changes: 0 additions & 10 deletions benchmark/ope/conf/estimator_hyperparams/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,3 @@ lambdas:
- 1000
- 5000
- 10000
taus:
- 1
- 5
- 10
- 50
- 100
- 500
- 1000
- 5000
- 10000
4 changes: 2 additions & 2 deletions examples/multiclass/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ python evaluate_off_policy_estimators.py\
# snipw 0.006797 0.004094
# dr 0.007780 0.004492
# sndr 0.007210 0.004089
# switch-dr (tau=1) 0.173282 0.020025
# switch-dr (tau=100) 0.007780 0.004492
# switch-dr (lambda=1) 0.173282 0.020025
# switch-dr (lambda=100) 0.007780 0.004492
# dr-os (lambda=1) 0.079629 0.014008
# dr-os (lambda=100) 0.008031 0.004634
# =============================================
Expand Down
4 changes: 2 additions & 2 deletions examples/multiclass/evaluate_off_policy_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@
SelfNormalizedInverseProbabilityWeighting(),
DoublyRobust(),
SelfNormalizedDoublyRobust(),
SwitchDoublyRobust(tau=1.0, estimator_name="switch-dr (tau=1)"),
SwitchDoublyRobust(tau=100.0, estimator_name="switch-dr (tau=100)"),
SwitchDoublyRobust(lambda_=1.0, estimator_name="switch-dr (lambda=1)"),
SwitchDoublyRobust(lambda_=100.0, estimator_name="switch-dr (lambda=100)"),
DoublyRobustWithShrinkage(lambda_=1.0, estimator_name="dr-os (lambda=1)"),
DoublyRobustWithShrinkage(lambda_=100.0, estimator_name="dr-os (lambda=100)"),
]
Expand Down
4 changes: 2 additions & 2 deletions examples/synthetic/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ python evaluate_off_policy_estimators.py\
# snipw 0.007543 0.005196
# dr 0.008099 0.006659
# sndr 0.008054 0.004911
# switch-dr (tau=1) 0.195878 0.012146
# switch-dr (tau=100) 0.008099 0.006659
# switch-dr (lambda=1) 0.195878 0.012146
# switch-dr (lambda=100) 0.008099 0.006659
# dr-os (lambda=1) 0.195642 0.012151
# dr-os (lambda=100) 0.175285 0.012801
# =============================================
Expand Down
4 changes: 2 additions & 2 deletions examples/synthetic/evaluate_off_policy_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@
SelfNormalizedInverseProbabilityWeighting(),
DoublyRobust(),
SelfNormalizedDoublyRobust(),
SwitchDoublyRobust(tau=1.0, estimator_name="switch-dr (tau=1)"),
SwitchDoublyRobust(tau=100.0, estimator_name="switch-dr (tau=100)"),
SwitchDoublyRobust(lambda_=1.0, estimator_name="switch-dr (lambda=1)"),
SwitchDoublyRobust(lambda_=100.0, estimator_name="switch-dr (lambda=100)"),
DoublyRobustWithShrinkage(lambda_=1.0, estimator_name="dr-os (lambda=1)"),
DoublyRobustWithShrinkage(lambda_=100.0, estimator_name="dr-os (lambda=100)"),
]
Expand Down
161 changes: 120 additions & 41 deletions obp/ope/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import torch
from sklearn.utils import check_scalar

from .helper import estimate_high_probability_upper_bound_bias
from .helper import estimate_high_probability_upper_bound_bias, estimate_bias_in_ope
from ..utils import (
estimate_confidence_interval_by_bootstrap,
check_ope_inputs,
Expand Down Expand Up @@ -551,6 +551,8 @@ def _estimate_mse_score(
pscore: np.ndarray,
action_dist: np.ndarray,
position: Optional[np.ndarray] = None,
use_bias_upper_bound: bool = True,
delta: float = 0.05,
**kwargs,
) -> float:
"""Estimate the MSE score of a given clipping hyperparameter to conduct hyperparameter tuning.
Expand All @@ -572,6 +574,13 @@ def _estimate_mse_score(
position: array-like, shape (n_rounds,), default=None
Position of recommendation interface where action was presented in each round of the given logged bandit feedback.

use_bias_upper_bound: bool, default=True
Whether to use bias upper bound in hyperparameter tuning.
If False, direct bias estimator is used to estimate the MSE.

delta: float, default=0.05
A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality.

Returns
----------
estimated_mse_score: float
Expand All @@ -596,12 +605,17 @@ def _estimate_mse_score(

# estimate the (high probability) upper bound of the bias of IPW with clipping
iw = action_dist[np.arange(n_rounds), action, position] / pscore
bias_upper_bound = estimate_high_probability_upper_bound_bias(
reward=reward,
iw=iw,
iw_hat=np.minimum(iw, self.lambda_),
)
estimated_mse_score = sample_variance + (bias_upper_bound ** 2)
if use_bias_upper_bound:
bias_term = estimate_high_probability_upper_bound_bias(
reward=reward, iw=iw, iw_hat=np.minimum(iw, self.lambda_), delta=delta
)
else:
bias_term = estimate_bias_in_ope(
reward=reward,
iw=iw,
iw_hat=np.minimum(iw, self.lambda_),
)
estimated_mse_score = sample_variance + (bias_term ** 2)

return estimated_mse_score

Expand Down Expand Up @@ -1310,6 +1324,8 @@ def _estimate_mse_score(
action_dist: np.ndarray,
estimated_rewards_by_reg_model: np.ndarray,
position: Optional[np.ndarray] = None,
use_bias_upper_bound: bool = True,
delta: float = 0.05,
) -> float:
"""Estimate the MSE score of a given clipping hyperparameter to conduct hyperparameter tuning.

Expand All @@ -1335,6 +1351,13 @@ def _estimate_mse_score(
estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.

use_bias_upper_bound: bool, default=True
Whether to use bias upper bound in hyperparameter tuning.
If False, direct bias estimator is used to estimate the MSE.

delta: float, default=0.05
A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality.

Returns
----------
estimated_mse_score: float
Expand All @@ -1360,13 +1383,26 @@ def _estimate_mse_score(

# estimate the (high probability) upper bound of the bias of DR with clipping
iw = action_dist[np.arange(n_rounds), action, position] / pscore
bias_upper_bound = estimate_high_probability_upper_bound_bias(
reward=reward,
iw=iw,
iw_hat=np.minimum(iw, self.lambda_),
q_hat=estimated_rewards_by_reg_model[np.arange(n_rounds), action, position],
)
estimated_mse_score = sample_variance + (bias_upper_bound ** 2)
if use_bias_upper_bound:
bias_term = estimate_high_probability_upper_bound_bias(
reward=reward,
iw=iw,
iw_hat=np.minimum(iw, self.lambda_),
q_hat=estimated_rewards_by_reg_model[
np.arange(n_rounds), action, position
],
delta=delta,
)
else:
bias_term = estimate_bias_in_ope(
reward=reward,
iw=iw,
iw_hat=np.minimum(iw, self.lambda_),
q_hat=estimated_rewards_by_reg_model[
np.arange(n_rounds), action, position
],
)
estimated_mse_score = sample_variance + (bias_term ** 2)

return estimated_mse_score

Expand Down Expand Up @@ -1487,20 +1523,20 @@ class SwitchDoublyRobust(DoublyRobust):

.. math::

\\hat{V}_{\\mathrm{SwitchDR}} (\\pi_e; \\mathcal{D}, \\hat{q}, \\tau)
:= \\mathbb{E}_{\\mathcal{D}} [\\hat{q}(x_t,\\pi_e) + w(x_t,a_t) (r_t - \\hat{q}(x_t,a_t)) \\mathbb{I} \\{ w(x_t,a_t) \\le \\tau \\}],
\\hat{V}_{\\mathrm{SwitchDR}} (\\pi_e; \\mathcal{D}, \\hat{q}, \\lambda)
:= \\mathbb{E}_{\\mathcal{D}} [\\hat{q}(x_t,\\pi_e) + w(x_t,a_t) (r_t - \\hat{q}(x_t,a_t)) \\mathbb{I} \\{ w(x_t,a_t) \\le \\lambda \\}],

where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by
a behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`.
:math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`.
:math:`\\tau (\\ge 0)` is a switching hyperparameter, which decides the threshold for the importance weight.
:math:`\\lambda (\\ge 0)` is a switching hyperparameter, which decides the threshold for the importance weight.
:math:`\\hat{q} (x,a)` is an estimated expected reward given :math:`x` and :math:`a`.
:math:`\\hat{q} (x_t,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`.
To estimate the mean reward function, please use `obp.ope.regression_model.RegressionModel`.

Parameters
----------
tau: float, default=np.inf
lambda_: float, default=np.inf
Switching hyperparameter. When importance weight is larger than this parameter, DM is applied, otherwise DR is used.
This hyperparameter should be larger than or equal to 0., otherwise it is meaningless.

Expand All @@ -1520,19 +1556,19 @@ class SwitchDoublyRobust(DoublyRobust):

"""

tau: float = np.inf
lambda_: float = np.inf
estimator_name: str = "switch-dr"

def __post_init__(self) -> None:
"""Initialize Class."""
check_scalar(
self.tau,
name="tau",
self.lambda_,
name="lambda_",
target_type=(int, float),
min_val=0.0,
)
if self.tau != self.tau:
raise ValueError("tau must not be nan")
if self.lambda_ != self.lambda_:
raise ValueError("lambda_ must not be nan")

def _estimate_round_rewards(
self,
Expand Down Expand Up @@ -1576,7 +1612,7 @@ def _estimate_round_rewards(
"""
n_rounds = action.shape[0]
iw = action_dist[np.arange(n_rounds), action, position] / pscore
switch_indicator = np.array(iw <= self.tau, dtype=int)
switch_indicator = np.array(iw <= self.lambda_, dtype=int)
q_hat_at_position = estimated_rewards_by_reg_model[
np.arange(n_rounds), :, position
]
Expand Down Expand Up @@ -1613,6 +1649,8 @@ def _estimate_mse_score(
action_dist: np.ndarray,
estimated_rewards_by_reg_model: np.ndarray,
position: Optional[np.ndarray] = None,
use_bias_upper_bound: bool = False,
delta: float = 0.05,
) -> float:
"""Estimate the MSE score of a given switching hyperparameter to conduct hyperparameter tuning.

Expand All @@ -1638,10 +1676,17 @@ def _estimate_mse_score(
When None is given, the effect of position on the reward will be ignored.
(If only one action is chosen and there is no position, then you can just ignore this argument.)

use_bias_upper_bound: bool, default=False
Whether to use bias upper bound in hyperparameter tuning.
If False, direct bias estimator is used to estimate the MSE.

delta: float, default=0.05
A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality.

Returns
----------
estimated_mse_score: float
Estimated MSE score of a given switching hyperparameter `tau`.
Estimated MSE score of a given switching hyperparameter `lambda_`.
MSE score is the sum of (high probability) upper bound of bias and the sample variance.
This is estimated using the automatic hyperparameter tuning procedure
based on Section 5 of Su et al.(2020).
Expand All @@ -1663,13 +1708,26 @@ def _estimate_mse_score(

# estimate the (high probability) upper bound of the bias of Switch-DR
iw = action_dist[np.arange(n_rounds), action, position] / pscore
bias_upper_bound = estimate_high_probability_upper_bound_bias(
reward=reward,
iw=iw,
iw_hat=iw * np.array(iw <= self.tau, dtype=int),
q_hat=estimated_rewards_by_reg_model[np.arange(n_rounds), action, position],
)
estimated_mse_score = sample_variance + (bias_upper_bound ** 2)
if use_bias_upper_bound:
bias_term = estimate_high_probability_upper_bound_bias(
reward=reward,
iw=iw,
iw_hat=iw * np.array(iw <= self.lambda_, dtype=int),
q_hat=estimated_rewards_by_reg_model[
np.arange(n_rounds), action, position
],
delta=delta,
)
else:
bias_term = estimate_bias_in_ope(
reward=reward,
iw=iw,
iw_hat=iw * np.array(iw <= self.lambda_, dtype=int),
q_hat=estimated_rewards_by_reg_model[
np.arange(n_rounds), action, position
],
)
estimated_mse_score = sample_variance + (bias_term ** 2)

return estimated_mse_score

Expand Down Expand Up @@ -1703,8 +1761,7 @@ class DoublyRobustWithShrinkage(DoublyRobust):
w_{o} (x_t,a_t;\\lambda) := \\frac{\\lambda}{w^2(x_t,a_t) + \\lambda} w(x_t,a_t).

When :math:`\\lambda=0`, we have :math:`w_{o} (x,a;\\lambda)=0` corresponding to the DM estimator.
In contrast, as :math:`\\lambda \\rightarrow \\infty`, :math:`w_{o} (x,a;\\lambda)` increases and in the limit becomes equal to
the original importance weight, corresponding to the standard DR estimator.
In contrast, as :math:`\\lambda \\rightarrow \\infty`, :math:`w_{o} (x,a;\\lambda)` increases and in the limit becomes equal to the original importance weight, corresponding to the standard DR estimator.

Parameters
----------
Expand Down Expand Up @@ -1815,6 +1872,8 @@ def _estimate_mse_score(
action_dist: np.ndarray,
estimated_rewards_by_reg_model: np.ndarray,
position: Optional[np.ndarray] = None,
use_bias_upper_bound: bool = False,
delta: float = 0.05,
) -> float:
"""Estimate the MSE score of a given shrinkage hyperparameter to conduct hyperparameter tuning.

Expand All @@ -1838,6 +1897,13 @@ def _estimate_mse_score(
position: array-like, shape (n_rounds,), default=None
Position of recommendation interface where action was presented in each round of the given logged bandit feedback.

use_bias_upper_bound: bool, default=False
Whether to use bias upper bound in hyperparameter tuning.
If False, direct bias estimator is used to estimate the MSE.

delta: float, default=0.05
A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality.

Returns
----------
estimated_mse_score: float
Expand Down Expand Up @@ -1867,12 +1933,25 @@ def _estimate_mse_score(
iw_hat = (self.lambda_ * iw) / (iw ** 2 + self.lambda_)
else:
iw_hat = iw
bias_upper_bound = estimate_high_probability_upper_bound_bias(
reward=reward,
iw=iw,
iw_hat=iw_hat,
q_hat=estimated_rewards_by_reg_model[np.arange(n_rounds), action, position],
)
estimated_mse_score = sample_variance + (bias_upper_bound ** 2)
if use_bias_upper_bound:
bias_term = estimate_high_probability_upper_bound_bias(
reward=reward,
iw=iw,
iw_hat=iw_hat,
q_hat=estimated_rewards_by_reg_model[
np.arange(n_rounds), action, position
],
delta=delta,
)
else:
bias_term = estimate_bias_in_ope(
reward=reward,
iw=iw,
iw_hat=iw_hat,
q_hat=estimated_rewards_by_reg_model[
np.arange(n_rounds), action, position
],
)
estimated_mse_score = sample_variance + (bias_term ** 2)

return estimated_mse_score
Loading