[Review] Feature: Balanced-OPE estimators #146


Merged: 20 commits, Jan 12, 2022
Changes from 17 commits
1,256 changes: 1,256 additions & 0 deletions examples/quickstart/balanced-ope-deterministic-evaluation-policy.ipynb

Large diffs are not rendered by default.

1,258 changes: 1,258 additions & 0 deletions examples/quickstart/balanced-ope-stochastic-evaluation-policy.ipynb

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions obp/ope/__init__.py
@@ -7,6 +7,7 @@
from obp.ope.estimators import SelfNormalizedDoublyRobust
from obp.ope.estimators import SelfNormalizedInverseProbabilityWeighting
from obp.ope.estimators import SwitchDoublyRobust
from obp.ope.estimators import BalancedInverseProbabilityWeighting
from obp.ope.estimators_continuous import (
KernelizedSelfNormalizedInverseProbabilityWeighting,
)
@@ -31,6 +32,8 @@
from obp.ope.meta_continuous import ContinuousOffPolicyEvaluation
from obp.ope.meta_slate import SlateOffPolicyEvaluation
from obp.ope.regression_model import RegressionModel
from obp.ope.classification_model import ImportanceWeightEstimator
from obp.ope.classification_model import PropensityScoreEstimator


__all__ = [
@@ -57,6 +60,9 @@
"SelfNormalizedSlateRewardInteractionIPS",
"SelfNormalizedSlateIndependentIPS",
"SelfNormalizedSlateStandardIPS",
"BalancedInverseProbabilityWeighting",
"ImportanceWeightEstimator",
"PropensityScoreEstimator",
"BaseContinuousOffPolicyEstimator",
"KernelizedInverseProbabilityWeighting",
"KernelizedSelfNormalizedInverseProbabilityWeighting",
@@ -76,6 +82,7 @@
"DoublyRobustWithShrinkage",
"SwitchDoublyRobust",
"SelfNormalizedDoublyRobust",
"BalancedInverseProbabilityWeighting",
]


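For orientation, the three names added to `__all__` above are the new public entry points. A minimal import sketch (the inline comments are informal descriptions inferred from the file names, not the classes' docstrings):

```python
# New public classes exported by this PR (see the obp/ope/__init__.py hunks above).
from obp.ope import (
    BalancedInverseProbabilityWeighting,  # the B-IPW estimator (obp/ope/estimators.py)
    ImportanceWeightEstimator,            # classifier-based importance-weight estimate (obp/ope/classification_model.py)
    PropensityScoreEstimator,             # classifier-based propensity-score estimate (obp/ope/classification_model.py)
)
```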
660 changes: 660 additions & 0 deletions obp/ope/classification_model.py

Large diffs are not rendered by default.
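`classification_model.py` (not rendered above) adds `ImportanceWeightEstimator` and `PropensityScoreEstimator`. Below is a rough sketch of the intended workflow, assuming the interface mirrors the existing `RegressionModel` (a scikit-learn `base_model` plus a cross-fitting `fit_predict`); the constructor and argument names are assumptions, not confirmed by this diff:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

from obp.ope import ImportanceWeightEstimator

# Synthetic logged bandit feedback (shapes follow the usual obp conventions).
n_rounds, n_actions, dim_context, len_list = 1000, 3, 5, 1
rng = np.random.default_rng(12345)
context = rng.normal(size=(n_rounds, dim_context))  # x_t
action = rng.integers(n_actions, size=n_rounds)     # a_t chosen by the behavior policy
# Evaluation policy's action distribution, shape (n_rounds, n_actions, len_list).
action_dist = np.full((n_rounds, n_actions, len_list), 1.0 / n_actions)

# Fit a classifier that separates behavior-policy actions from evaluation-policy
# actions, and convert its predictions into importance weights w-hat(x_t, a_t).
iw_estimator = ImportanceWeightEstimator(
    n_actions=n_actions,
    base_model=LogisticRegression(),  # any scikit-learn-compatible classifier
)
estimated_importance_weights = iw_estimator.fit_predict(
    context=context,
    action=action,
    action_dist=action_dist,
    random_state=12345,
)
print(estimated_importance_weights.shape)  # (n_rounds,), non-negative (see obp/utils.py below)
```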

449 changes: 384 additions & 65 deletions obp/ope/estimators.py

Large diffs are not rendered by default.
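`estimators.py` adds `BalancedInverseProbabilityWeighting` itself. For reviewers' reference, balanced IPW replaces the analytical importance weight `\pi_e(a_t|x_t) / \pi_b(a_t|x_t)` with the classifier-based estimate `\hat{w}(x_t, a_t)` documented in `obp/utils.py` below; in its self-normalized form (the exact normalization should be checked against the diff) the estimator reads roughly:

```latex
\hat{V}_{\mathrm{B\text{-}IPW}}(\pi_e; \mathcal{D})
  := \frac{\sum_{t=1}^{n} \hat{w}(x_t, a_t)\, r_t}{\sum_{t=1}^{n} \hat{w}(x_t, a_t)},
\qquad
\hat{w}(x_t, a_t) \approx \frac{\pi_e(a_t \mid x_t)}{\pi_b(a_t \mid x_t)}.
```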

272 changes: 212 additions & 60 deletions obp/ope/estimators_tuning.py

Large diffs are not rendered by default.

193 changes: 178 additions & 15 deletions obp/ope/meta.py

Large diffs are not rendered by default.
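`meta.py` threads the new `estimated_importance_weights` argument through `OffPolicyEvaluation`. Continuing the `ImportanceWeightEstimator` sketch above (the keyword name and plumbing are assumptions based on this diff; `bandit_feedback` is the usual obp logged-feedback dict, e.g. from `obp.dataset.SyntheticBanditDataset.obtain_batch_bandit_feedback`):

```python
from obp.ope import BalancedInverseProbabilityWeighting, OffPolicyEvaluation

# `action_dist` and `estimated_importance_weights` come from the sketch above;
# `bandit_feedback` is a logged-feedback dict holding the logged context/action/reward.
ope = OffPolicyEvaluation(
    bandit_feedback=bandit_feedback,
    ope_estimators=[BalancedInverseProbabilityWeighting()],
)
estimated_policy_values = ope.estimate_policy_values(
    action_dist=action_dist,
    estimated_importance_weights=estimated_importance_weights,  # new in this PR (assumed keyword)
)
print(estimated_policy_values)  # {estimator_name: estimated policy value}
```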

10 changes: 5 additions & 5 deletions obp/policy/offline_continuous.py
@@ -317,7 +317,7 @@ def _create_train_data_for_opl(
context: array-like, shape (n_rounds, dim_context)
Context vectors in each round, i.e., :math:`x_t`.

action: array-like or Tensor, shape (n_rounds,)
action: array-like, shape (n_rounds,)
Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.

reward: array-like, shape (n_rounds,)
@@ -401,7 +401,7 @@ def fit(
context: array-like, shape (n_rounds, dim_context)
Context vectors in each round, i.e., :math:`x_t`.

action: array-like or Tensor, shape (n_rounds,)
action: array-like, shape (n_rounds,)
Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.

reward: array-like, shape (n_rounds,)
@@ -550,7 +550,7 @@ def _estimate_policy_value(
pscore: Tensor, shape (batch_size,)
Action choice probabilities of a behavior policy (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.

action_by_current_policy: array-like or Tensor, shape (batch_size,)
action_by_current_policy: Tensor, shape (batch_size,)
Continuous action values given by the current policy.

Returns
@@ -863,7 +863,7 @@ def _create_train_data_for_q_func_estimation(
context: array-like, shape (n_rounds, dim_context)
Context vectors in each round, i.e., :math:`x_t`.

action: array-like or Tensor, shape (n_rounds,)
action: array-like, shape (n_rounds,)
Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.

reward: array-like, shape (n_rounds,)
@@ -933,7 +933,7 @@ def fit(
context: array-like, shape (n_rounds, dim_context)
Context vectors in each round, i.e., :math:`x_t`.

action: array-like or Tensor, shape (n_rounds,)
action: array-like, shape (n_rounds,)
Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.

reward: array-like, shape (n_rounds,)
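The docstring fixes above only tighten the declared types: the continuous policy learners' public `fit` takes plain array-like inputs, while torch Tensors appear only in internal helpers such as `_estimate_policy_value` (`action_by_current_policy`). A tiny shape sketch of the documented inputs (values are placeholders):

```python
import numpy as np

n_rounds, dim_context = 1000, 5
rng = np.random.default_rng(0)

context = rng.normal(size=(n_rounds, dim_context))  # (n_rounds, dim_context), x_t
action = rng.uniform(0.0, 1.0, size=n_rounds)       # (n_rounds,), continuous a_t from the behavior policy
reward = rng.normal(size=n_rounds)                   # (n_rounds,), r_t
```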
11 changes: 11 additions & 0 deletions obp/utils.py
@@ -305,6 +305,7 @@ def check_ope_inputs(
reward: Optional[np.ndarray] = None,
pscore: Optional[np.ndarray] = None,
estimated_rewards_by_reg_model: Optional[np.ndarray] = None,
estimated_importance_weights: Optional[np.ndarray] = None,
) -> Optional[ValueError]:
"""Check inputs for ope.

@@ -329,6 +330,8 @@
estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None
Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.

estimated_importance_weights: array-like, shape (n_rounds,), default=None
Importance weights estimated via supervised classification, i.e., :math:`\\hat{w}(x_t, a_t)`.
"""
# action_dist
check_array(array=action_dist, name="action_dist", expected_dim=3)
@@ -360,6 +363,14 @@
"Expected `estimated_rewards_by_reg_model.shape == action_dist.shape`, but found it False"
)

if estimated_importance_weights is not None:
if not (action.shape[0] == estimated_importance_weights.shape[0]):
raise ValueError(
"Expected `action.shape[0] == estimated_importance_weights.shape[0]`, but found it False"
)
if np.any(estimated_importance_weights < 0):
raise ValueError("estimated_importance_weights must be non-negative")

# action, reward
if action is not None or reward is not None:
check_array(array=action, name="action", expected_dim=1)
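The new `estimated_importance_weights` branch in `check_ope_inputs` enforces two things: one weight per logged round and non-negativity. A quick sanity check of that behavior (a sketch; only keyword arguments visible in this file are passed, and the remaining optional arguments are assumed to keep their defaults):

```python
import numpy as np
from obp.utils import check_ope_inputs

n_rounds, n_actions, len_list = 5, 3, 1
action_dist = np.full((n_rounds, n_actions, len_list), 1.0 / n_actions)
action = np.arange(n_rounds) % n_actions  # integer actions in [0, n_actions)
reward = np.zeros(n_rounds)

# Passes: one non-negative weight per logged round.
check_ope_inputs(
    action_dist=action_dist,
    action=action,
    reward=reward,
    estimated_importance_weights=np.ones(n_rounds),
)

# Raises ValueError: a negative weight is rejected by the new check.
try:
    check_ope_inputs(
        action_dist=action_dist,
        action=action,
        reward=reward,
        estimated_importance_weights=np.array([1.0, -0.5, 1.0, 1.0, 1.0]),
    )
except ValueError as err:
    print(err)  # estimated_importance_weights must be non-negative
```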