[Review] Feature: Balanced-OPE estimators #146


Merged: 20 commits, Jan 12, 2022
Changes from 17 commits
1,256 changes: 1,256 additions & 0 deletions examples/quickstart/balanced-ope-deterministic-evaluation-policy.ipynb

Large diffs are not rendered by default.

1,258 changes: 1,258 additions & 0 deletions examples/quickstart/balanced-ope-stochastic-evaluation-policy.ipynb

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions obp/ope/__init__.py
@@ -7,6 +7,7 @@
from obp.ope.estimators import SelfNormalizedDoublyRobust
from obp.ope.estimators import SelfNormalizedInverseProbabilityWeighting
from obp.ope.estimators import SwitchDoublyRobust
from obp.ope.estimators import BalancedInverseProbabilityWeighting
from obp.ope.estimators_continuous import (
KernelizedSelfNormalizedInverseProbabilityWeighting,
)
@@ -31,6 +32,8 @@
from obp.ope.meta_continuous import ContinuousOffPolicyEvaluation
from obp.ope.meta_slate import SlateOffPolicyEvaluation
from obp.ope.regression_model import RegressionModel
from obp.ope.classification_model import ImportanceWeightEstimator
from obp.ope.classification_model import PropensityScoreEstimator


__all__ = [
@@ -57,6 +60,9 @@
"SelfNormalizedSlateRewardInteractionIPS",
"SelfNormalizedSlateIndependentIPS",
"SelfNormalizedSlateStandardIPS",
"BalancedInverseProbabilityWeighting",
"ImportanceWeightEstimator",
"PropensityScoreEstimator",
"BaseContinuousOffPolicyEstimator",
"KernelizedInverseProbabilityWeighting",
"KernelizedSelfNormalizedInverseProbabilityWeighting",
@@ -76,6 +82,7 @@
"DoublyRobustWithShrinkage",
"SwitchDoublyRobust",
"SelfNormalizedDoublyRobust",
"BalancedInverseProbabilityWeighting",
]


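For orientation, the three names added to `__all__` above are the new public entry points. A minimal import sketch (the inline comments are informal descriptions inferred from the file names, not the classes' docstrings):

```python
# New public classes exported by this PR (see the obp/ope/__init__.py hunks above).
from obp.ope import (
    BalancedInverseProbabilityWeighting,  # the B-IPW estimator (obp/ope/estimators.py)
    ImportanceWeightEstimator,            # classifier-based importance-weight estimate (obp/ope/classification_model.py)
    PropensityScoreEstimator,             # classifier-based propensity-score estimate (obp/ope/classification_model.py)
)
```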
660 changes: 660 additions & 0 deletions obp/ope/classification_model.py

Large diffs are not rendered by default.
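`classification_model.py` (not rendered above) adds `ImportanceWeightEstimator` and `PropensityScoreEstimator`. Below is a rough sketch of the intended workflow, assuming the interface mirrors the existing `RegressionModel` (a scikit-learn `base_model` plus a cross-fitting `fit_predict`); the constructor and argument names are assumptions, not confirmed by this diff:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

from obp.ope import ImportanceWeightEstimator

# Synthetic logged bandit feedback (shapes follow the usual obp conventions).
n_rounds, n_actions, dim_context, len_list = 1000, 3, 5, 1
rng = np.random.default_rng(12345)
context = rng.normal(size=(n_rounds, dim_context))  # x_t
action = rng.integers(n_actions, size=n_rounds)     # a_t chosen by the behavior policy
# Evaluation policy's action distribution, shape (n_rounds, n_actions, len_list).
action_dist = np.full((n_rounds, n_actions, len_list), 1.0 / n_actions)

# Fit a classifier that separates behavior-policy actions from evaluation-policy
# actions, and convert its predictions into importance weights w-hat(x_t, a_t).
iw_estimator = ImportanceWeightEstimator(
    n_actions=n_actions,
    base_model=LogisticRegression(),  # any scikit-learn-compatible classifier
)
estimated_importance_weights = iw_estimator.fit_predict(
    context=context,
    action=action,
    action_dist=action_dist,
    random_state=12345,
)
print(estimated_importance_weights.shape)  # (n_rounds,), non-negative (see obp/utils.py below)
```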

449 changes: 384 additions & 65 deletions obp/ope/estimators.py

Large diffs are not rendered by default.
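`estimators.py` adds `BalancedInverseProbabilityWeighting` itself. For reviewers' reference, balanced IPW replaces the analytical importance weight `\pi_e(a_t|x_t) / \pi_b(a_t|x_t)` with the classifier-based estimate `\hat{w}(x_t, a_t)` documented in `obp/utils.py` below; in its self-normalized form (the exact normalization should be checked against the diff) the estimator reads roughly:

```latex
\hat{V}_{\mathrm{B\text{-}IPW}}(\pi_e; \mathcal{D})
  := \frac{\sum_{t=1}^{n} \hat{w}(x_t, a_t)\, r_t}{\sum_{t=1}^{n} \hat{w}(x_t, a_t)},
\qquad
\hat{w}(x_t, a_t) \approx \frac{\pi_e(a_t \mid x_t)}{\pi_b(a_t \mid x_t)}.
```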

272 changes: 212 additions & 60 deletions obp/ope/estimators_tuning.py

Large diffs are not rendered by default.

193 changes: 178 additions & 15 deletions obp/ope/meta.py

Large diffs are not rendered by default.
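`meta.py` threads the new `estimated_importance_weights` argument through `OffPolicyEvaluation`. Continuing the `ImportanceWeightEstimator` sketch above (the keyword name and plumbing are assumptions based on this diff; `bandit_feedback` is the usual obp logged-feedback dict, e.g. from `obp.dataset.SyntheticBanditDataset.obtain_batch_bandit_feedback`):

```python
from obp.ope import BalancedInverseProbabilityWeighting, OffPolicyEvaluation

# `action_dist` and `estimated_importance_weights` come from the sketch above;
# `bandit_feedback` is a logged-feedback dict holding the logged context/action/reward.
ope = OffPolicyEvaluation(
    bandit_feedback=bandit_feedback,
    ope_estimators=[BalancedInverseProbabilityWeighting()],
)
estimated_policy_values = ope.estimate_policy_values(
    action_dist=action_dist,
    estimated_importance_weights=estimated_importance_weights,  # new in this PR (assumed keyword)
)
print(estimated_policy_values)  # {estimator_name: estimated policy value}
```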

10 changes: 5 additions & 5 deletions obp/policy/offline_continuous.py
@@ -317,7 +317,7 @@ def _create_train_data_for_opl(
context: array-like, shape (n_rounds, dim_context)
Context vectors in each round, i.e., :math:`x_t`.

action: array-like or Tensor, shape (n_rounds,)
action: array-like, shape (n_rounds,)
Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.

reward: array-like, shape (n_rounds,)
@@ -401,7 +401,7 @@ def fit(
context: array-like, shape (n_rounds, dim_context)
Context vectors in each round, i.e., :math:`x_t`.

action: array-like or Tensor, shape (n_rounds,)
action: array-like, shape (n_rounds,)
Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.

reward: array-like, shape (n_rounds,)
@@ -550,7 +550,7 @@ def _estimate_policy_value(
pscore: Tensor, shape (batch_size,)
Action choice probabilities of a behavior policy (generalized propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.

action_by_current_policy: array-like or Tensor, shape (batch_size,)
action_by_current_policy: Tensor, shape (batch_size,)
Continuous action values given by the current policy.

Returns
@@ -863,7 +863,7 @@ def _create_train_data_for_q_func_estimation(
context: array-like, shape (n_rounds, dim_context)
Context vectors in each round, i.e., :math:`x_t`.

action: array-like or Tensor, shape (n_rounds,)
action: array-like, shape (n_rounds,)
Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.

reward: array-like, shape (n_rounds,)
@@ -933,7 +933,7 @@ def fit(
context: array-like, shape (n_rounds, dim_context)
Context vectors in each round, i.e., :math:`x_t`.

action: array-like or Tensor, shape (n_rounds,)
action: array-like, shape (n_rounds,)
Continuous action values sampled by a behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.

reward: array-like, shape (n_rounds,)
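The docstring fixes above only tighten the declared types: the continuous policy learners' public `fit` takes plain array-like inputs, while torch Tensors appear only in internal helpers such as `_estimate_policy_value` (`action_by_current_policy`). A tiny shape sketch of the documented inputs (values are placeholders):

```python
import numpy as np

n_rounds, dim_context = 1000, 5
rng = np.random.default_rng(0)

context = rng.normal(size=(n_rounds, dim_context))  # (n_rounds, dim_context), x_t
action = rng.uniform(0.0, 1.0, size=n_rounds)       # (n_rounds,), continuous a_t from the behavior policy
reward = rng.normal(size=n_rounds)                   # (n_rounds,), r_t
```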
11 changes: 11 additions & 0 deletions obp/utils.py
@@ -305,6 +305,7 @@ def check_ope_inputs(
reward: Optional[np.ndarray] = None,
pscore: Optional[np.ndarray] = None,
estimated_rewards_by_reg_model: Optional[np.ndarray] = None,
estimated_importance_weights: Optional[np.ndarray] = None,
) -> Optional[ValueError]:
"""Check inputs for ope.

@@ -329,6 +330,8 @@
estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None
Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.

estimated_importance_weights: array-like, shape (n_rounds,), default=None
Importance weights estimated via supervised classification, i.e., :math:`\\hat{w}(x_t, a_t)`.
"""
# action_dist
check_array(array=action_dist, name="action_dist", expected_dim=3)
@@ -360,6 +363,14 @@
"Expected `estimated_rewards_by_reg_model.shape == action_dist.shape`, but found it False"
)

if estimated_importance_weights is not None:
if not (action.shape[0] == estimated_importance_weights.shape[0]):
raise ValueError(
"Expected `action.shape[0] == estimated_importance_weights.shape[0]`, but found it False"
)
if np.any(estimated_importance_weights < 0):
raise ValueError("estimated_importance_weights must be non-negative")

# action, reward
if action is not None or reward is not None:
check_array(array=action, name="action", expected_dim=1)
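The new `estimated_importance_weights` branch in `check_ope_inputs` enforces two things: one weight per logged round and non-negativity. A quick sanity check of that behavior (a sketch; only keyword arguments visible in this file are passed, and the remaining optional arguments are assumed to keep their defaults):

```python
import numpy as np
from obp.utils import check_ope_inputs

n_rounds, n_actions, len_list = 5, 3, 1
action_dist = np.full((n_rounds, n_actions, len_list), 1.0 / n_actions)
action = np.arange(n_rounds) % n_actions  # integer actions in [0, n_actions)
reward = np.zeros(n_rounds)

# Passes: one non-negative weight per logged round.
check_ope_inputs(
    action_dist=action_dist,
    action=action,
    reward=reward,
    estimated_importance_weights=np.ones(n_rounds),
)

# Raises ValueError: a negative weight is rejected by the new check.
try:
    check_ope_inputs(
        action_dist=action_dist,
        action=action,
        reward=reward,
        estimated_importance_weights=np.array([1.0, -0.5, 1.0, 1.0, 1.0]),
    )
except ValueError as err:
    print(err)  # estimated_importance_weights must be non-negative
```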