Evaluating Standard Off-Policy Estimators with Small Sample Open-Bandit Dataset¶

Overview¶

Evaluating 3 standard off-policy estimators (DirectMethod, DoublyRobust, and InverseProbabilityWeighting) on small sample open-bandit dataset.

These OPE estimators will estimate the performance of BernoulliTS policy (counterfactual/evaluation policy) using data generated by Random policy (behavior policy).

Imports

import argparse
from pathlib import Path

from joblib import delayed
from joblib import Parallel
import numpy as np
from pandas import DataFrame
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import yaml

Dataloader¶

Abstract Base Class for Logged Bandit Feedback.

from abc import ABCMeta
from abc import abstractmethod

class BaseBanditDataset(metaclass=ABCMeta):
    """Base Class for Synthetic Bandit Dataset."""

    @abstractmethod
    def obtain_batch_bandit_feedback(self) -> None:
        """Obtain batch logged bandit feedback."""
        raise NotImplementedError

class BaseRealBanditDataset(BaseBanditDataset):
    """Base Class for Real-World Bandit Dataset."""

    @abstractmethod
    def load_raw_data(self) -> None:
        """Load raw dataset."""
        raise NotImplementedError

    @abstractmethod
    def pre_process(self) -> None:
        """Preprocess raw dataset."""
        raise NotImplementedError

Dataset Class for Real-World Logged Bandit Feedback.

from dataclasses import dataclass
from logging import basicConfig
from logging import getLogger
from logging import INFO
from pathlib import Path
from typing import Optional
from typing import Tuple
from typing import Union
from typing import Dict

import numpy as np
import pandas as pd
from scipy.stats import rankdata
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_random_state
from sklearn.utils import check_scalar

# dataset type
BanditFeedback = Dict[str, Union[int, np.ndarray]]

logger = getLogger(__name__)
basicConfig(level=INFO)

OBD_DATA_PATH = '/content/zr-obp/obd'

@dataclass
class OpenBanditDataset(BaseRealBanditDataset):
    """Class for loading and preprocessing Open Bandit Dataset.
    Note
    -----
    Users are free to implement their own feature engineering by overriding the `pre_process` method.
    Parameters
    -----------
    behavior_policy: str
        Name of the behavior policy that generated the logged bandit feedback data.
        Must be either 'random' or 'bts'.
    campaign: str
        One of the three possible campaigns considered in ZOZOTOWN.
        Must be one of "all", "men", or "women".
    data_path: str or Path, default=None
        Path where the Open Bandit Dataset is stored.
    dataset_name: str, default='obd'
        Name of the dataset.
    References
    ------------
    Yuta Saito, Shunsuke Aihara, Megumi Matsutani, Yusuke Narita.
    "Open Bandit Dataset and Pipeline: Towards Realistic and Reproducible Off-Policy Evaluation.", 2020.
    """

    behavior_policy: str
    campaign: str
    data_path: Optional[Union[str, Path]] = None
    dataset_name: str = "obd"

    def __post_init__(self) -> None:
        """Initialize Open Bandit Dataset Class."""
        if self.behavior_policy not in [
            "bts",
            "random",
        ]:
            raise ValueError(
                f"behavior_policy must be either of 'bts' or 'random', but {self.behavior_policy} is given"
            )

        if self.campaign not in [
            "all",
            "men",
            "women",
        ]:
            raise ValueError(
                f"campaign must be one of 'all', 'men', or 'women', but {self.campaign} is given"
            )

        if self.data_path is None:
            self.data_path = Path(OBD_DATA_PATH)
        else:
            if isinstance(self.data_path, Path):
                pass
            elif isinstance(self.data_path, str):
                self.data_path = Path(self.data_path)
            else:
                raise ValueError("data_path must be a string or Path")
        self.data_path = self.data_path / self.behavior_policy / self.campaign
        self.raw_data_file = f"{self.campaign}.csv"

        self.load_raw_data()
        self.pre_process()

    @property
    def n_rounds(self) -> int:
        """Total number of rounds contained in the logged bandit dataset."""
        return self.data.shape[0]

    @property
    def n_actions(self) -> int:
        """Number of actions."""
        return int(self.action.max() + 1)

    @property
    def dim_context(self) -> int:
        """Dimensions of context vectors."""
        return self.context.shape[1]

    @property
    def len_list(self) -> int:
        """Length of recommendation lists."""
        return int(self.position.max() + 1)

    @classmethod
    def calc_on_policy_policy_value_estimate(
        cls,
        behavior_policy: str,
        campaign: str,
        data_path: Optional[Path] = None,
        test_size: float = 0.3,
        is_timeseries_split: bool = False,
    ) -> float:
        """Calculate on-policy policy value estimate (used as a ground-truth policy value).
        Parameters
        ----------
        behavior_policy: str
            Name of the behavior policy that generated the log data.
            Must be either 'random' or 'bts'.
        campaign: str
            One of the three possible campaigns considered in ZOZOTOWN (i.e., "all", "men", and "women").
        data_path: Path, default=None
            Path where the Open Bandit Dataset exists.
        test_size: float, default=0.3
            Proportion of the dataset included in the test split.
            If float, should be between 0.0 and 1.0.
            This argument matters only when `is_timeseries_split=True` (the out-sample case).
        is_timeseries_split: bool, default=False
            If true, split the original logged bandit feedback data by time series.
        Returns
        ---------
        on_policy_policy_value_estimate: float
            Policy value of the behavior policy estimated by on-policy estimation, i.e., :math:`\\mathbb{E}_{\\mathcal{D}} [r_t]`.
            where :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`.
            This parameter is used as a ground-truth policy value in the evaluation of OPE estimators.
        """
        bandit_feedback = cls(
            behavior_policy=behavior_policy, campaign=campaign, data_path=data_path
        ).obtain_batch_bandit_feedback(
            test_size=test_size, is_timeseries_split=is_timeseries_split
        )
        if is_timeseries_split:
            bandit_feedback_test = bandit_feedback[1]
        else:
            bandit_feedback_test = bandit_feedback
        return bandit_feedback_test["reward"].mean()

    def load_raw_data(self) -> None:
        """Load raw open bandit dataset."""
        self.data = pd.read_csv(self.data_path / self.raw_data_file, index_col=0)
        self.item_context = pd.read_csv(
            self.data_path / "item_context.csv", index_col=0
        )
        self.data.sort_values("timestamp", inplace=True)
        self.action = self.data["item_id"].values
        self.position = (rankdata(self.data["position"].values, "dense") - 1).astype(
            int
        )
        self.reward = self.data["click"].values
        self.pscore = self.data["propensity_score"].values

    def pre_process(self) -> None:
        """Preprocess raw open bandit dataset.
        Note
        -----
        This is the default feature engineering and please override this method to
        implement your own preprocessing.
        see https://github.com/st-tech/zr-obp/blob/master/examples/examples_with_obd/custom_dataset.py for example.
        """
        user_cols = self.data.columns.str.contains("user_feature")
        self.context = pd.get_dummies(
            self.data.loc[:, user_cols], drop_first=True
        ).values
        item_feature_0 = self.item_context["item_feature_0"]
        item_feature_cat = self.item_context.drop("item_feature_0", 1).apply(
            LabelEncoder().fit_transform
        )
        self.action_context = pd.concat([item_feature_cat, item_feature_0], 1).values

    def obtain_batch_bandit_feedback(
        self, test_size: float = 0.3, is_timeseries_split: bool = False
    ) -> Union[BanditFeedback, Tuple[BanditFeedback, BanditFeedback]]:
        """Obtain batch logged bandit feedback.
        Parameters
        -----------
        test_size: float, default=0.3
            Proportion of the dataset included in the test split.
            If float, should be between 0.0 and 1.0.
            This argument matters only when `is_timeseries_split=True` (the out-sample case).
        is_timeseries_split: bool, default=False
            If true, split the original logged bandit feedback data into train and test sets based on time series.
        Returns
        --------
        bandit_feedback: BanditFeedback
            A dictionary containing batch logged bandit feedback data collected by a behavior policy.
            The keys of the dictionary are as follows.
            - n_rounds: number of rounds (size) of the logged bandit data
            - n_actions: number of actions (:math:`|\mathcal{A}|`)
            - action: action variables sampled by a behavior policy
            - position: positions where actions are recommended
            - reward: reward variables
            - pscore: action choice probabilities by a behavior policy
            - context: context vectors such as user-related features and user-item affinity scores
            - action_context: item-related context vectors
        """
        if not isinstance(is_timeseries_split, bool):
            raise TypeError(
                f"`is_timeseries_split` must be a bool, but {type(is_timeseries_split)} is given"
            )

        if is_timeseries_split:
            check_scalar(
                test_size,
                name="target_size",
                target_type=(float),
                min_val=0.0,
                max_val=1.0,
            )
            n_rounds_train = np.int(self.n_rounds * (1.0 - test_size))
            bandit_feedback_train = dict(
                n_rounds=n_rounds_train,
                n_actions=self.n_actions,
                action=self.action[:n_rounds_train],
                position=self.position[:n_rounds_train],
                reward=self.reward[:n_rounds_train],
                pscore=self.pscore[:n_rounds_train],
                context=self.context[:n_rounds_train],
                action_context=self.action_context,
            )
            bandit_feedback_test = dict(
                n_rounds=(self.n_rounds - n_rounds_train),
                n_actions=self.n_actions,
                action=self.action[n_rounds_train:],
                position=self.position[n_rounds_train:],
                reward=self.reward[n_rounds_train:],
                pscore=self.pscore[n_rounds_train:],
                context=self.context[n_rounds_train:],
                action_context=self.action_context,
            )
            return bandit_feedback_train, bandit_feedback_test
        else:
            return dict(
                n_rounds=self.n_rounds,
                n_actions=self.n_actions,
                action=self.action,
                position=self.position,
                reward=self.reward,
                pscore=self.pscore,
                context=self.context,
                action_context=self.action_context,
            )

    def sample_bootstrap_bandit_feedback(
        self,
        sample_size: Optional[int] = None,
        test_size: float = 0.3,
        is_timeseries_split: bool = False,
        random_state: Optional[int] = None,
    ) -> BanditFeedback:
        """Obtain bootstrap logged bandit feedback.
        Parameters
        -----------
        sample_size: int, default=None
            Number of data sampled by bootstrap.
            When None is given, the original data size (n_rounds) is used as `sample_size`.
            The value must be smaller than the original data size.
        test_size: float, default=0.3
            Proportion of the dataset included in the test split.
            If float, should be between 0.0 and 1.0.
            This argument matters only when `is_timeseries_split=True` (the out-sample case).
        is_timeseries_split: bool, default=False
            If true, split the original logged bandit feedback data into train and test sets based on time series.
        random_state: int, default=None
            Controls the random seed in bootstrap sampling.
        Returns
        --------
        bandit_feedback: BanditFeedback
            A dictionary containing logged bandit feedback data sampled independently from the original data with replacement.
            The keys of the dictionary are as follows.
            - n_rounds: number of rounds (size) of the logged bandit data
            - n_actions: number of actions
            - action: action variables sampled by a behavior policy
            - position: positions where actions are recommended by a behavior policy
            - reward: reward variables
            - pscore: action choice probabilities by a behavior policy
            - context: context vectors such as user-related features and user-item affinity scores
            - action_context: item-related context vectors
        """
        if is_timeseries_split:
            bandit_feedback = self.obtain_batch_bandit_feedback(
                test_size=test_size, is_timeseries_split=is_timeseries_split
            )[0]
        else:
            bandit_feedback = self.obtain_batch_bandit_feedback(
                test_size=test_size, is_timeseries_split=is_timeseries_split
            )
        n_rounds = bandit_feedback["n_rounds"]
        if sample_size is None:
            sample_size = bandit_feedback["n_rounds"]
        else:
            check_scalar(
                sample_size,
                name="sample_size",
                target_type=(int),
                min_val=0,
                max_val=n_rounds,
            )
        random_ = check_random_state(random_state)
        bootstrap_idx = random_.choice(
            np.arange(n_rounds), size=sample_size, replace=True
        )
        for key_ in ["action", "position", "reward", "pscore", "context"]:
            bandit_feedback[key_] = bandit_feedback[key_][bootstrap_idx]
        bandit_feedback["n_rounds"] = sample_size
        return bandit_feedback

OPE Estimators¶

Utils¶

from abc import ABCMeta
from abc import abstractmethod
from dataclasses import dataclass
from typing import Dict
from typing import Optional
from typing import Union

import numpy as np
import pandas as pd
from sklearn.utils import check_random_state
from sklearn.utils import check_scalar
import torch

def check_array(
    array: np.ndarray,
    name: str,
    expected_dim: int = 1,
) -> ValueError:
    """Input validation on an array.
    Parameters
    -------------
    array: object
        Input object to check.
    name: str
        Name of the input array.
    expected_dim: int, default=1
        Expected dimension of the input array.
    """
    if not isinstance(array, np.ndarray):
        raise ValueError(f"{name} must be {expected_dim}D array, but got {type(array)}")
    if array.ndim != expected_dim:
        raise ValueError(
            f"{name} must be {expected_dim}D array, but got {array.ndim}D array"
        )

def check_confidence_interval_arguments(
    alpha: float = 0.05,
    n_bootstrap_samples: int = 10000,
    random_state: Optional[int] = None,
) -> Optional[ValueError]:
    """Check confidence interval arguments.
    Parameters
    ----------
    alpha: float, default=0.05
        Significance level.
    n_bootstrap_samples: int, default=10000
        Number of resampling performed in the bootstrap procedure.
    random_state: int, default=None
        Controls the random seed in bootstrap sampling.
    Returns
    ----------
    estimated_confidence_interval: Dict[str, float]
        Dictionary storing the estimated mean and upper-lower confidence bounds.
    """
    check_random_state(random_state)
    check_scalar(alpha, "alpha", float, min_val=0.0, max_val=1.0)
    check_scalar(n_bootstrap_samples, "n_bootstrap_samples", int, min_val=1)

def estimate_confidence_interval_by_bootstrap(
    samples: np.ndarray,
    alpha: float = 0.05,
    n_bootstrap_samples: int = 10000,
    random_state: Optional[int] = None,
) -> Dict[str, float]:
    """Estimate confidence interval by nonparametric bootstrap-like procedure.
    Parameters
    ----------
    samples: array-like
        Empirical observed samples to be used to estimate cumulative distribution function.
    alpha: float, default=0.05
        Significance level.
    n_bootstrap_samples: int, default=10000
        Number of resampling performed in the bootstrap procedure.
    random_state: int, default=None
        Controls the random seed in bootstrap sampling.
    Returns
    ----------
    estimated_confidence_interval: Dict[str, float]
        Dictionary storing the estimated mean and upper-lower confidence bounds.
    """
    check_confidence_interval_arguments(
        alpha=alpha, n_bootstrap_samples=n_bootstrap_samples, random_state=random_state
    )

    boot_samples = list()
    random_ = check_random_state(random_state)
    for _ in np.arange(n_bootstrap_samples):
        boot_samples.append(np.mean(random_.choice(samples, size=samples.shape[0])))
    lower_bound = np.percentile(boot_samples, 100 * (alpha / 2))
    upper_bound = np.percentile(boot_samples, 100 * (1.0 - alpha / 2))
    return {
        "mean": np.mean(boot_samples),
        f"{100 * (1. - alpha)}% CI (lower)": lower_bound,
        f"{100 * (1. - alpha)}% CI (upper)": upper_bound,
    }

def check_ope_inputs(
    action_dist: np.ndarray,
    position: Optional[np.ndarray] = None,
    action: Optional[np.ndarray] = None,
    reward: Optional[np.ndarray] = None,
    pscore: Optional[np.ndarray] = None,
    estimated_rewards_by_reg_model: Optional[np.ndarray] = None,
) -> Optional[ValueError]:
    """Check inputs for ope.
    Parameters
    -----------
    action_dist: array-like, shape (n_rounds, n_actions, len_list)
        Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
    position: array-like, shape (n_rounds,), default=None
        Position of recommendation interface where action was presented in each round of the given logged bandit data.
    action: array-like, shape (n_rounds,), default=None
        Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
    reward: array-like, shape (n_rounds,), default=None
        Observed rewards (or outcome) in each round, i.e., :math:`r_t`.
    pscore: array-like, shape (n_rounds,), default=None
        Propensity scores, the probability of selecting each action by behavior policy,
        in the given logged bandit data.
    estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None
        Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
    """
    # action_dist
    check_array(array=action_dist, name="action_dist", expected_dim=3)
    if not np.allclose(action_dist.sum(axis=1), 1):
        raise ValueError("action_dist must be a probability distribution")

    # position
    if position is not None:
        check_array(array=position, name="position", expected_dim=1)
        if not (position.shape[0] == action_dist.shape[0]):
            raise ValueError(
                "Expected `position.shape[0] == action_dist.shape[0]`, but found it False"
            )
        if not (np.issubdtype(position.dtype, np.integer) and position.min() >= 0):
            raise ValueError("position elements must be non-negative integers")
        if position.max() >= action_dist.shape[2]:
            raise ValueError(
                "position elements must be smaller than `action_dist.shape[2]`"
            )
    elif action_dist.shape[2] > 1:
        raise ValueError(
            "position elements must be given when `action_dist.shape[2] > 1`"
        )

    # estimated_rewards_by_reg_model
    if estimated_rewards_by_reg_model is not None:
        if estimated_rewards_by_reg_model.shape != action_dist.shape:
            raise ValueError(
                "Expected `estimated_rewards_by_reg_model.shape == action_dist.shape`, but found it False"
            )

    # action, reward
    if action is not None or reward is not None:
        check_array(array=action, name="action", expected_dim=1)
        check_array(array=reward, name="reward", expected_dim=1)
        if not (action.shape[0] == reward.shape[0]):
            raise ValueError(
                "Expected `action.shape[0] == reward.shape[0]`, but found it False"
            )
        if not (np.issubdtype(action.dtype, np.integer) and action.min() >= 0):
            raise ValueError("action elements must be non-negative integers")
        if action.max() >= action_dist.shape[1]:
            raise ValueError(
                "action elements must be smaller than `action_dist.shape[1]`"
            )

    # pscore
    if pscore is not None:
        if pscore.ndim != 1:
            raise ValueError("pscore must be 1-dimensional")
        if not (action.shape[0] == reward.shape[0] == pscore.shape[0]):
            raise ValueError(
                "Expected `action.shape[0] == reward.shape[0] == pscore.shape[0]`, but found it False"
            )
        if np.any(pscore <= 0):
            raise ValueError("pscore must be positive")

def estimate_bias_in_ope(
    reward: np.ndarray,
    iw: np.ndarray,
    iw_hat: np.ndarray,
    q_hat: Optional[np.ndarray] = None,
) -> float:
    """Helper to estimate a bias in OPE.
    Parameters
    ----------
    reward: array-like, shape (n_rounds,)
        Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`.
    iw: array-like, shape (n_rounds,)
        Importance weight in each round of the logged bandit feedback, i.e., :math:`w(x,a)=\\pi_e(a|x)/ \\pi_b(a|x)`.
    iw_hat: array-like, shape (n_rounds,)
        Importance weight (IW) modified by a hyparpareter. How IW is modified depends on the estimator as follows.
            - clipping: :math:`\\hat{w}(x,a) := \\min \\{ \\lambda, w(x,a) \\}`
            - switching: :math:`\\hat{w}(x,a) := w(x,a) \\cdot \\mathbb{I} \\{ w(x,a) < \\lambda \\}`
            - shrinkage: :math:`\\hat{w}(x,a) := (\\lambda w(x,a)) / (\\lambda + w^2(x,a))`
        where :math:`\\lambda` is a hyperparameter value.
    q_hat: array-like, shape (n_rounds,), default=None
        Estimated expected reward given context :math:`x_t` and action :math:`a_t`.
    Returns
    ----------
    estimated_bias: float
        Estimated the bias in OPE.
        This is based on the direct bias estimation stated on page 17 of Su et al.(2020).
    References
    ----------
    Yi Su, Maria Dimakopoulou, Akshay Krishnamurthy, and Miroslav Dudik.
    "Doubly Robust Off-Policy Evaluation with Shrinkage.", 2020.
    """
    n_rounds = reward.shape[0]
    if q_hat is None:
        q_hat = np.zeros(n_rounds)
    estimated_bias_arr = (iw - iw_hat) * (reward - q_hat)
    estimated_bias = np.abs(estimated_bias_arr.mean())

    return estimated_bias

Abstract Class¶

@dataclass
class BaseOffPolicyEstimator(metaclass=ABCMeta):
    """Base class for OPE estimators."""

    @abstractmethod
    def _estimate_round_rewards(self) -> np.ndarray:
        """Estimate round-wise (or sample-wise) rewards."""
        raise NotImplementedError

    @abstractmethod
    def estimate_policy_value(self) -> float:
        """Estimate the policy value of evaluation policy."""
        raise NotImplementedError

    @abstractmethod
    def estimate_interval(self) -> Dict[str, float]:
        """Estimate confidence interval of policy value by nonparametric bootstrap procedure."""
        raise NotImplementedError

Direct Method¶

@dataclass
class DirectMethod(BaseOffPolicyEstimator):
    """Direct Method (DM).
    Note
    -------
    DM first learns a supervised machine learning model, such as ridge regression and gradient boosting,
    to estimate the mean reward function (:math:`q(x,a) = \\mathbb{E}[r|x,a]`).
    It then uses it to estimate the policy value as follows.
    .. math::
        \\hat{V}_{\\mathrm{DM}} (\\pi_e; \\mathcal{D}, \\hat{q})
        &:= \\mathbb{E}_{\\mathcal{D}} \\left[ \\sum_{a \\in \\mathcal{A}} \\hat{q} (x_t,a) \\pi_e(a|x_t) \\right],    \\\\
        & =  \\mathbb{E}_{\\mathcal{D}}[\\hat{q} (x_t,\\pi_e)],
    where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by
    a behavior policy :math:`\\pi_b`. :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`.
    :math:`\\hat{q} (x,a)` is an estimated expected reward given :math:`x` and :math:`a`.
    :math:`\\hat{q} (x_t,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`.
    To estimate the mean reward function, please use `obp.ope.regression_model.RegressionModel`, which supports several fitting methods specific to OPE.
    If the regression model (:math:`\\hat{q}`) is a good approximation to the true mean reward function,
    this estimator accurately estimates the policy value of the evaluation policy.
    If the regression function fails to approximate the mean reward function well,
    however, the final estimator is no longer consistent.
    Parameters
    ----------
    estimator_name: str, default='dm'.
        Name of the estimator.
    References
    ----------
    Alina Beygelzimer and John Langford.
    "The offset tree for learning with partial labels.", 2009.
    Miroslav Dudík, Dumitru Erhan, John Langford, and Lihong Li.
    "Doubly Robust Policy Evaluation and Optimization.", 2014.
    """

    estimator_name: str = "dm"

    def _estimate_round_rewards(
        self,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: np.ndarray,
        position: Optional[np.ndarray] = None,
        **kwargs,
    ) -> np.ndarray:
        """Estimate the policy value of evaluation policy.
        Parameters
        ----------
        action_dist: array-like or Tensor, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like or Tensor, shape (n_rounds, n_actions, len_list)
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
        position: array-like or Tensor, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            When None is given, the effect of position on the reward will be ignored.
            (If only one action is chosen and there is no posion, then you can just ignore this argument.)
        Returns
        ----------
        estimated_rewards: array-like or Tensor, shape (n_rounds,)
            Rewards of each round estimated by the DM estimator.
        """
        if position is None:
            position = np.zeros(action_dist.shape[0], dtype=int)
        n_rounds = position.shape[0]
        q_hat_at_position = estimated_rewards_by_reg_model[
            np.arange(n_rounds), :, position
        ]
        pi_e_at_position = action_dist[np.arange(n_rounds), :, position]

        if isinstance(action_dist, np.ndarray):
            return np.average(
                q_hat_at_position,
                weights=pi_e_at_position,
                axis=1,
            )
        else:
            raise ValueError("action must be 1D array")

    def estimate_policy_value(
        self,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: np.ndarray,
        position: Optional[np.ndarray] = None,
        **kwargs,
    ) -> float:
        """Estimate the policy value of evaluation policy.
        Parameters
        ----------
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
        position: array-like, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            When None is given, the effect of position on the reward will be ignored.
            (If only one action is chosen and there is no posion, then you can just ignore this argument.)
        Returns
        ----------
        V_hat: float
            Estimated policy value (performance) of a given evaluation policy.
        """
        check_array(
            array=estimated_rewards_by_reg_model,
            name="estimated_rewards_by_reg_model",
            expected_dim=3,
        )
        check_ope_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            position=position,
        )
        if position is None:
            position = np.zeros(action_dist.shape[0], dtype=int)

        return self._estimate_round_rewards(
            position=position,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            action_dist=action_dist,
        ).mean()

    def estimate_interval(
        self,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: np.ndarray,
        position: Optional[np.ndarray] = None,
        alpha: float = 0.05,
        n_bootstrap_samples: int = 10000,
        random_state: Optional[int] = None,
        **kwargs,
    ) -> Dict[str, float]:
        """Estimate confidence interval of policy value by nonparametric bootstrap procedure.
        Parameters
        ----------
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
        position: array-like, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            When None is given, the effect of position on the reward will be ignored.
            (If only one action is chosen and there is no posion, then you can just ignore this argument.)
        alpha: float, default=0.05
            Significance level.
        n_bootstrap_samples: int, default=10000
            Number of resampling performed in the bootstrap procedure.
        random_state: int, default=None
            Controls the random seed in bootstrap sampling.
        Returns
        ----------
        estimated_confidence_interval: Dict[str, float]
            Dictionary storing the estimated mean and upper-lower confidence bounds.
        """
        check_array(
            array=estimated_rewards_by_reg_model,
            name="estimated_rewards_by_reg_model",
            expected_dim=3,
        )
        check_ope_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            position=position,
        )
        if position is None:
            position = np.zeros(action_dist.shape[0], dtype=int)

        estimated_round_rewards = self._estimate_round_rewards(
            position=position,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            action_dist=action_dist,
        )
        return estimate_confidence_interval_by_bootstrap(
            samples=estimated_round_rewards,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )

Doubly Robust¶

@dataclass
class DoublyRobust(BaseOffPolicyEstimator):
    """Doubly Robust (DR) Estimator.
    Note
    -------
    Similar to DM, DR first learns a supervised machine learning model, such as ridge regression and gradient boosting,
    to estimate the mean reward function (:math:`q(x,a) = \\mathbb{E}[r|x,a]`).
    It then uses it to estimate the policy value as follows.
    .. math::
        \\hat{V}_{\\mathrm{DR}} (\\pi_e; \\mathcal{D}, \\hat{q})
        := \\mathbb{E}_{\\mathcal{D}}[\\hat{q}(x_t,\\pi_e) +  w(x_t,a_t) (r_t - \\hat{q}(x_t,a_t))],
    where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by
    a behavior policy :math:`\\pi_b`.
    :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`.
    :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`.
    :math:`\\hat{q} (x,a)` is an estimated expected reward given :math:`x` and :math:`a`.
    :math:`\\hat{q} (x_t,\\pi):= \\mathbb{E}_{a \\sim \\pi(a|x)}[\\hat{q}(x,a)]` is the expectation of the estimated reward function over :math:`\\pi`.
    When the weight-clipping is applied, a large importance weight is clipped as :math:`\\hat{w}(x,a) := \\min \\{ \\lambda, w(x,a) \\}`
    where :math:`\\lambda (>0)` is a hyperparameter that decides a maximum allowed importance weight.
    To estimate the mean reward function, please use `obp.ope.regression_model.RegressionModel`,
    which supports several fitting methods specific to OPE such as *more robust doubly robust*.
    DR mimics IPW to use a weighted version of rewards, but DR also uses the estimated mean reward
    function (the regression model) as a control variate to decrease the variance.
    It preserves the consistency of IPW if either the importance weight or
    the mean reward estimator is accurate (a property called double robustness).
    Moreover, DR is semiparametric efficient when the mean reward estimator is correctly specified.
    Parameters
    ----------
    lambda_: float, default=np.inf
        A maximum possible value of the importance weight.
        When a positive finite value is given, importance weights larger than `lambda_` will be clipped.
        DoublyRobust with a finite positive `lambda_` corresponds to Doubly Robust with Pessimistic Shrinkage of Su et al.(2020) or CAB-DR of Su et al.(2019).
    estimator_name: str, default='dr'.
        Name of the estimator.
    References
    ----------
    Miroslav Dudík, Dumitru Erhan, John Langford, and Lihong Li.
    "Doubly Robust Policy Evaluation and Optimization.", 2014.
    Mehrdad Farajtabar, Yinlam Chow, and Mohammad Ghavamzadeh.
    "More Robust Doubly Robust Off-policy Evaluation.", 2018.
    Yi Su, Lequn Wang, Michele Santacatterina, and Thorsten Joachims.
    "CAB: Continuous Adaptive Blending Estimator for Policy Evaluation and Learning", 2019.
    Yi Su, Maria Dimakopoulou, Akshay Krishnamurthy, and Miroslav Dudík.
    "Doubly robust off-policy evaluation with shrinkage.", 2020.
    """

    lambda_: float = np.inf
    estimator_name: str = "dr"

    def __post_init__(self) -> None:
        """Initialize Class."""
        check_scalar(
            self.lambda_,
            name="lambda_",
            target_type=(int, float),
            min_val=0.0,
        )
        if self.lambda_ != self.lambda_:
            raise ValueError("lambda_ must not be nan")

    def _estimate_round_rewards(
        self,
        reward: np.ndarray,
        action: np.ndarray,
        pscore: np.ndarray,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: np.ndarray,
        position: Optional[np.ndarray] = None,
        **kwargs,
    ) -> np.ndarray:
        """Estimate round-wise (or sample-wise) rewards.
        Parameters
        ----------
        reward: array-like or Tensor, shape (n_rounds,)
            Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`.
        action: array-like or Tensor, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        pscore: array-like or Tensor, shape (n_rounds,)
            Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.
        action_dist: array-like or Tensor, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model or Tensor: array-like, shape (n_rounds, n_actions, len_list)
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
        position: array-like or Tensor, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            When None is given, the effect of position on the reward will be ignored.
            (If only one action is chosen and there is no posion, then you can just ignore this argument.)
        Returns
        ----------
        estimated_rewards: array-like or Tensor, shape (n_rounds,)
            Rewards of each round estimated by the DR estimator.
        """
        if position is None:
            position = np.zeros(action_dist.shape[0], dtype=int)
        n_rounds = action.shape[0]
        iw = action_dist[np.arange(n_rounds), action, position] / pscore
        # weight clipping
        if isinstance(iw, np.ndarray):
            iw = np.minimum(iw, self.lambda_)
        q_hat_at_position = estimated_rewards_by_reg_model[
            np.arange(n_rounds), :, position
        ]
        q_hat_factual = estimated_rewards_by_reg_model[
            np.arange(n_rounds), action, position
        ]
        pi_e_at_position = action_dist[np.arange(n_rounds), :, position]

        if isinstance(reward, np.ndarray):
            estimated_rewards = np.average(
                q_hat_at_position,
                weights=pi_e_at_position,
                axis=1,
            )
        else:
            raise ValueError("reward must be 1D array")

        estimated_rewards += iw * (reward - q_hat_factual)
        return estimated_rewards

    def estimate_policy_value(
        self,
        reward: np.ndarray,
        action: np.ndarray,
        pscore: np.ndarray,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: np.ndarray,
        position: Optional[np.ndarray] = None,
    ) -> float:
        """Estimate the policy value of evaluation policy.
        Parameters
        ----------
        reward: array-like, shape (n_rounds,)
            Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`.
        action: array-like, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        pscore: array-like, shape (n_rounds,)
            Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
        position: array-like, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            When None is given, the effect of position on the reward will be ignored.
            (If only one action is chosen and there is no posion, then you can just ignore this argument.)
        Returns
        ----------
        V_hat: float
            Policy value estimated by the DR estimator.
        """
        check_array(
            array=estimated_rewards_by_reg_model,
            name="estimated_rewards_by_reg_model",
            expected_dim=3,
        )
        check_array(array=reward, name="reward", expected_dim=1)
        check_array(array=action, name="action", expected_dim=1)
        check_array(array=pscore, name="pscore", expected_dim=1)
        check_ope_inputs(
            action_dist=action_dist,
            position=position,
            action=action,
            reward=reward,
            pscore=pscore,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
        if position is None:
            position = np.zeros(action_dist.shape[0], dtype=int)

        return self._estimate_round_rewards(
            reward=reward,
            action=action,
            position=position,
            pscore=pscore,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        ).mean()

    def estimate_interval(
        self,
        reward: np.ndarray,
        action: np.ndarray,
        pscore: np.ndarray,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: np.ndarray,
        position: Optional[np.ndarray] = None,
        alpha: float = 0.05,
        n_bootstrap_samples: int = 10000,
        random_state: Optional[int] = None,
        **kwargs,
    ) -> Dict[str, float]:
        """Estimate confidence interval of policy value by nonparametric bootstrap procedure.
        Parameters
        ----------
        reward: array-like, shape (n_rounds,)
            Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`.
        action: array-like, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        pscore: array-like, shape (n_rounds,)
            Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
        position: array-like, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            When None is given, the effect of position on the reward will be ignored.
            (If only one action is chosen and there is no posion, then you can just ignore this argument.)
        alpha: float, default=0.05
            Significance level.
        n_bootstrap_samples: int, default=10000
            Number of resampling performed in the bootstrap procedure.
        random_state: int, default=None
            Controls the random seed in bootstrap sampling.
        Returns
        ----------
        estimated_confidence_interval: Dict[str, float]
            Dictionary storing the estimated mean and upper-lower confidence bounds.
        """
        check_array(
            array=estimated_rewards_by_reg_model,
            name="estimated_rewards_by_reg_model",
            expected_dim=3,
        )
        check_array(array=reward, name="reward", expected_dim=1)
        check_array(array=action, name="action", expected_dim=1)
        check_array(array=pscore, name="pscore", expected_dim=1)
        check_ope_inputs(
            action_dist=action_dist,
            position=position,
            action=action,
            reward=reward,
            pscore=pscore,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
        if position is None:
            position = np.zeros(action_dist.shape[0], dtype=int)

        estimated_round_rewards = self._estimate_round_rewards(
            reward=reward,
            action=action,
            position=position,
            pscore=pscore,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
        return estimate_confidence_interval_by_bootstrap(
            samples=estimated_round_rewards,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )

    def _estimate_mse_score(
        self,
        reward: np.ndarray,
        action: np.ndarray,
        pscore: np.ndarray,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: np.ndarray,
        position: Optional[np.ndarray] = None,
        use_bias_upper_bound: bool = True,
        delta: float = 0.05,
    ) -> float:
        """Estimate the MSE score of a given clipping hyperparameter to conduct hyperparameter tuning.
        Parameters
        ----------
        reward: array-like, shape (n_rounds,)
            Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`.
        action: array-like, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        pscore: array-like, shape (n_rounds,)
            Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        position: array-like, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            When None is given, the effect of position on the reward will be ignored.
            (If only one action is chosen and there is no posion, then you can just ignore this argument.)
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
        use_bias_upper_bound: bool, default=True
            Whether to use bias upper bound in hyperparameter tuning.
            If False, direct bias estimator is used to estimate the MSE.
        delta: float, default=0.05
            A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality.
        Returns
        ----------
        estimated_mse_score: float
            Estimated MSE score of a given clipping hyperparameter `lambda_`.
            MSE score is the sum of (high probability) upper bound of bias and the sample variance.
            This is estimated using the automatic hyperparameter tuning procedure
            based on Section 5 of Su et al.(2020).
        """
        n_rounds = reward.shape[0]
        # estimate the sample variance of DR with clipping
        sample_variance = np.var(
            self._estimate_round_rewards(
                reward=reward,
                action=action,
                pscore=pscore,
                action_dist=action_dist,
                estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
                position=position,
            )
        )
        sample_variance /= n_rounds

        # estimate the (high probability) upper bound of the bias of DR with clipping
        iw = action_dist[np.arange(n_rounds), action, position] / pscore
        if use_bias_upper_bound:
            bias_term = estimate_high_probability_upper_bound_bias(
                reward=reward,
                iw=iw,
                iw_hat=np.minimum(iw, self.lambda_),
                q_hat=estimated_rewards_by_reg_model[
                    np.arange(n_rounds), action, position
                ],
                delta=delta,
            )
        else:
            bias_term = estimate_bias_in_ope(
                reward=reward,
                iw=iw,
                iw_hat=np.minimum(iw, self.lambda_),
                q_hat=estimated_rewards_by_reg_model[
                    np.arange(n_rounds), action, position
                ],
            )
        estimated_mse_score = sample_variance + (bias_term ** 2)

        return estimated_mse_score

Inverse Probability Weighting¶

@dataclass
class InverseProbabilityWeighting(BaseOffPolicyEstimator):
    """Inverse Probability Weighting (IPW) Estimator.
    Note
    -------
    Inverse Probability Weighting (IPW) estimates the policy value of evaluation policy :math:`\\pi_e` by
    .. math::
        \\hat{V}_{\\mathrm{IPW}} (\\pi_e; \\mathcal{D}) := \\mathbb{E}_{\\mathcal{D}} [ w(x_t,a_t) r_t],
    where :math:`\\mathcal{D}=\\{(x_t,a_t,r_t)\\}_{t=1}^{T}` is logged bandit feedback data with :math:`T` rounds collected by
    a behavior policy :math:`\\pi_b`. :math:`w(x,a):=\\pi_e (a|x)/\\pi_b (a|x)` is the importance weight given :math:`x` and :math:`a`.
    :math:`\\mathbb{E}_{\\mathcal{D}}[\\cdot]` is the empirical average over :math:`T` observations in :math:`\\mathcal{D}`.
    When the weight-clipping is applied, a large importance weight is clipped as :math:`\\hat{w}(x,a) := \\min \\{ \\lambda, w(x,a) \\}`
    where :math:`\\lambda (>0)` is a hyperparameter that decides a maximum allowed importance weight.
    IPW re-weights the rewards by the ratio of the evaluation policy and behavior policy (importance weight).
    When the behavior policy is known, IPW is unbiased and consistent for the true policy value.
    However, it can have a large variance, especially when the evaluation policy significantly deviates from the behavior policy.
    Parameters
    ------------
    lambda_: float, default=np.inf
        A maximum possible value of the importance weight.
        When a positive finite value is given, importance weights larger than `lambda_` will be clipped.
    estimator_name: str, default='ipw'.
        Name of the estimator.
    References
    ------------
    Alex Strehl, John Langford, Lihong Li, and Sham M Kakade.
    "Learning from Logged Implicit Exploration Data"., 2010.
    Miroslav Dudík, Dumitru Erhan, John Langford, and Lihong Li.
    "Doubly Robust Policy Evaluation and Optimization.", 2014.
    Yi Su, Maria Dimakopoulou, Akshay Krishnamurthy, and Miroslav Dudik.
    "Doubly Robust Off-Policy Evaluation with Shrinkage.", 2020.
    """

    lambda_: float = np.inf
    estimator_name: str = "ipw"

    def __post_init__(self) -> None:
        """Initialize Class."""
        check_scalar(
            self.lambda_,
            name="lambda_",
            target_type=(int, float),
            min_val=0.0,
        )
        if self.lambda_ != self.lambda_:
            raise ValueError("lambda_ must not be nan")

    def _estimate_round_rewards(
        self,
        reward: np.ndarray,
        action: np.ndarray,
        pscore: np.ndarray,
        action_dist: np.ndarray,
        position: Optional[np.ndarray] = None,
        **kwargs,
    ) -> np.ndarray:
        """Estimate round-wise (or sample-wise) rewards.
        Parameters
        ----------
        reward: array-like or Tensor, shape (n_rounds,)
            Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`.
        action: array-like or Tensor, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        pscore: array-like or Tensor, shape (n_rounds,)
            Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.
        action_dist: array-like or Tensor, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        position: array-like or Tensor, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            When None is given, the effect of position on the reward will be ignored.
            (If only one action is chosen and there is no posion, then you can just ignore this argument.)
        Returns
        ----------
        estimated_rewards: array-like or Tensor, shape (n_rounds,)
            Rewards of each round estimated by IPW.
        """
        if position is None:
            position = np.zeros(action_dist.shape[0], dtype=int)
        iw = action_dist[np.arange(action.shape[0]), action, position] / pscore
        # weight clipping
        if isinstance(iw, np.ndarray):
            iw = np.minimum(iw, self.lambda_)
        return reward * iw

    def estimate_policy_value(
        self,
        reward: np.ndarray,
        action: np.ndarray,
        pscore: np.ndarray,
        action_dist: np.ndarray,
        position: Optional[np.ndarray] = None,
        **kwargs,
    ) -> np.ndarray:
        """Estimate the policy value of evaluation policy.
        Parameters
        ----------
        reward: array-like, shape (n_rounds,)
            Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`.
        action: array-like, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        pscore: array-like, shape (n_rounds,)
            Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        position: array-like, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            When None is given, the effect of position on the reward will be ignored.
            (If only one action is chosen and there is no posion, then you can just ignore this argument.)
        Returns
        ----------
        V_hat: float
            Estimated policy value (performance) of a given evaluation policy.
        """
        check_array(array=reward, name="reward", expected_dim=1)
        check_array(array=action, name="action", expected_dim=1)
        check_array(array=pscore, name="pscore", expected_dim=1)
        check_ope_inputs(
            action_dist=action_dist,
            position=position,
            action=action,
            reward=reward,
            pscore=pscore,
        )
        if position is None:
            position = np.zeros(action_dist.shape[0], dtype=int)

        return self._estimate_round_rewards(
            reward=reward,
            action=action,
            position=position,
            pscore=pscore,
            action_dist=action_dist,
        ).mean()

    def estimate_interval(
        self,
        reward: np.ndarray,
        action: np.ndarray,
        pscore: np.ndarray,
        action_dist: np.ndarray,
        position: Optional[np.ndarray] = None,
        alpha: float = 0.05,
        n_bootstrap_samples: int = 10000,
        random_state: Optional[int] = None,
        **kwargs,
    ) -> Dict[str, float]:
        """Estimate confidence interval of policy value by nonparametric bootstrap procedure.
        Parameters
        ----------
        reward: array-like, shape (n_rounds,)
            Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`.
        action: array-like, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        pscore: array-like, shape (n_rounds,)
            Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        position: array-like, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            When None is given, the effect of position on the reward will be ignored.
            (If only one action is chosen and there is no posion, then you can just ignore this argument.)
        alpha: float, default=0.05
            Significance level.
        n_bootstrap_samples: int, default=10000
            Number of resampling performed in the bootstrap procedure.
        random_state: int, default=None
            Controls the random seed in bootstrap sampling.
        Returns
        ----------
        estimated_confidence_interval: Dict[str, float]
            Dictionary storing the estimated mean and upper-lower confidence bounds.
        """
        check_array(array=reward, name="reward", expected_dim=1)
        check_array(array=action, name="action", expected_dim=1)
        check_array(array=pscore, name="pscore", expected_dim=1)
        check_ope_inputs(
            action_dist=action_dist,
            position=position,
            action=action,
            reward=reward,
            pscore=pscore,
        )
        if position is None:
            position = np.zeros(action_dist.shape[0], dtype=int)

        estimated_round_rewards = self._estimate_round_rewards(
            reward=reward,
            action=action,
            position=position,
            pscore=pscore,
            action_dist=action_dist,
        )
        return estimate_confidence_interval_by_bootstrap(
            samples=estimated_round_rewards,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )

    def _estimate_mse_score(
        self,
        reward: np.ndarray,
        action: np.ndarray,
        pscore: np.ndarray,
        action_dist: np.ndarray,
        position: Optional[np.ndarray] = None,
        use_bias_upper_bound: bool = True,
        delta: float = 0.05,
        **kwargs,
    ) -> float:
        """Estimate the MSE score of a given clipping hyperparameter to conduct hyperparameter tuning.
        Parameters
        ----------
        reward: array-like, shape (n_rounds,)
            Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`.
        action: array-like, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        pscore: array-like, shape (n_rounds,)
            Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        position: array-like, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
        use_bias_upper_bound: bool, default=True
            Whether to use bias upper bound in hyperparameter tuning.
            If False, direct bias estimator is used to estimate the MSE.
        delta: float, default=0.05
            A confidence delta to construct a high probability upper bound based on the Bernstein’s inequality.
        Returns
        ----------
        estimated_mse_score: float
            Estimated MSE score of a given clipping hyperparameter `lambda_`.
            MSE score is the sum of (high probability) upper bound of bias and the sample variance.
            This is estimated using the automatic hyperparameter tuning procedure
            based on Section 5 of Su et al.(2020).
        """
        n_rounds = reward.shape[0]
        # estimate the sample variance of IPW with clipping
        sample_variance = np.var(
            self._estimate_round_rewards(
                reward=reward,
                action=action,
                pscore=pscore,
                action_dist=action_dist,
                position=position,
            )
        )
        sample_variance /= n_rounds

        # estimate the (high probability) upper bound of the bias of IPW with clipping
        iw = action_dist[np.arange(n_rounds), action, position] / pscore
        if use_bias_upper_bound:
            bias_term = estimate_high_probability_upper_bound_bias(
                reward=reward, iw=iw, iw_hat=np.minimum(iw, self.lambda_), delta=delta
            )
        else:
            bias_term = estimate_bias_in_ope(
                reward=reward,
                iw=iw,
                iw_hat=np.minimum(iw, self.lambda_),
            )
        estimated_mse_score = sample_variance + (bias_term ** 2)

        return estimated_mse_score

Off-Policy Evaluation Class¶

Off-Policy Evaluation Class to Streamline OPE.

from dataclasses import dataclass
from logging import getLogger
from pathlib import Path
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
import seaborn as sns
from sklearn.utils import check_scalar

logger = getLogger(__name__)

@dataclass
class OffPolicyEvaluation:
    """Class to conduct OPE by multiple estimators simultaneously.
    Parameters
    -----------
    bandit_feedback: BanditFeedback
        Logged bandit feedback data used to conduct OPE.
    ope_estimators: List[BaseOffPolicyEstimator]
        List of OPE estimators used to evaluate the policy value of evaluation policy.
        Estimators must follow the interface of `obp.ope.BaseOffPolicyEstimator`.
    Examples
    ----------
    .. code-block:: python
        # a case for implementing OPE of the BernoulliTS policy
        # using log data generated by the Random policy
        >>> from obp.dataset import OpenBanditDataset
        >>> from obp.policy import BernoulliTS
        >>> from obp.ope import OffPolicyEvaluation, InverseProbabilityWeighting as IPW
        # (1) Data loading and preprocessing
        >>> dataset = OpenBanditDataset(behavior_policy='random', campaign='all')
        >>> bandit_feedback = dataset.obtain_batch_bandit_feedback()
        >>> bandit_feedback.keys()
        dict_keys(['n_rounds', 'n_actions', 'action', 'position', 'reward', 'pscore', 'context', 'action_context'])
        # (2) Off-Policy Learning
        >>> evaluation_policy = BernoulliTS(
            n_actions=dataset.n_actions,
            len_list=dataset.len_list,
            is_zozotown_prior=True, # replicate the policy in the ZOZOTOWN production
            campaign="all",
            random_state=12345
        )
        >>> action_dist = evaluation_policy.compute_batch_action_dist(
            n_sim=100000, n_rounds=bandit_feedback["n_rounds"]
        )
        # (3) Off-Policy Evaluation
        >>> ope = OffPolicyEvaluation(bandit_feedback=bandit_feedback, ope_estimators=[IPW()])
        >>> estimated_policy_value = ope.estimate_policy_values(action_dist=action_dist)
        >>> estimated_policy_value
        {'ipw': 0.004553...}
        # policy value improvement of BernoulliTS over the Random policy estimated by IPW
        >>> estimated_policy_value_improvement = estimated_policy_value['ipw'] / bandit_feedback['reward'].mean()
        # our OPE procedure suggests that BernoulliTS improves Random by 19.81%
        >>> print(estimated_policy_value_improvement)
        1.198126...
    """

    bandit_feedback: BanditFeedback
    ope_estimators: List[BaseOffPolicyEstimator]

    def __post_init__(self) -> None:
        """Initialize class."""
        for key_ in ["action", "position", "reward", "pscore"]:
            if key_ not in self.bandit_feedback:
                raise RuntimeError(f"Missing key of {key_} in 'bandit_feedback'.")
        self.ope_estimators_ = dict()
        self.is_model_dependent = False
        for estimator in self.ope_estimators:
            self.ope_estimators_[estimator.estimator_name] = estimator
            if isinstance(estimator, DirectMethod) or isinstance(estimator, DoublyRobust):
                self.is_model_dependent = True

    def _create_estimator_inputs(
        self,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: Optional[
            Union[np.ndarray, Dict[str, np.ndarray]]
        ] = None,
    ) -> Dict[str, Dict[str, np.ndarray]]:
        """Create input dictionary to estimate policy value using subclasses of `BaseOffPolicyEstimator`"""
        check_array(array=action_dist, name="action_dist", expected_dim=3)
        if estimated_rewards_by_reg_model is None:
            pass
        elif isinstance(estimated_rewards_by_reg_model, dict):
            for estimator_name, value in estimated_rewards_by_reg_model.items():
                check_array(
                    array=value,
                    name=f"estimated_rewards_by_reg_model[{estimator_name}]",
                    expected_dim=3,
                )
                if value.shape != action_dist.shape:
                    raise ValueError(
                        f"Expected `estimated_rewards_by_reg_model[{estimator_name}].shape == action_dist.shape`, but found it False."
                    )
        elif estimated_rewards_by_reg_model.shape != action_dist.shape:
            raise ValueError(
                "Expected `estimated_rewards_by_reg_model.shape == action_dist.shape`, but found it False"
            )
        estimator_inputs = {
            estimator_name: {
                input_: self.bandit_feedback[input_]
                for input_ in ["reward", "action", "position", "pscore"]
            }
            for estimator_name in self.ope_estimators_
        }

        for estimator_name in self.ope_estimators_:
            estimator_inputs[estimator_name]["action_dist"] = action_dist
            if isinstance(estimated_rewards_by_reg_model, dict):
                if estimator_name in estimated_rewards_by_reg_model:
                    estimator_inputs[estimator_name][
                        "estimated_rewards_by_reg_model"
                    ] = estimated_rewards_by_reg_model[estimator_name]
                else:
                    estimator_inputs[estimator_name][
                        "estimated_rewards_by_reg_model"
                    ] = None
            else:
                estimator_inputs[estimator_name][
                    "estimated_rewards_by_reg_model"
                ] = estimated_rewards_by_reg_model

        return estimator_inputs

    def estimate_policy_values(
        self,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: Optional[
            Union[np.ndarray, Dict[str, np.ndarray]]
        ] = None,
    ) -> Dict[str, float]:
        """Estimate the policy value of evaluation policy.
        Parameters
        ------------
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None
            Expected rewards given each round, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
            When an array-like is given, all OPE estimators use it.
            When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used.
            When it is not given, model-dependent estimators such as DM and DR cannot be used.
        Returns
        ----------
        policy_value_dict: Dict[str, float]
            Dictionary containing estimated policy values by OPE estimators.
        """
        if self.is_model_dependent:
            if estimated_rewards_by_reg_model is None:
                raise ValueError(
                    "When model dependent estimators such as DM or DR are used, `estimated_rewards_by_reg_model` must be given"
                )

        policy_value_dict = dict()
        estimator_inputs = self._create_estimator_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
        for estimator_name, estimator in self.ope_estimators_.items():
            policy_value_dict[estimator_name] = estimator.estimate_policy_value(
                **estimator_inputs[estimator_name]
            )

        return policy_value_dict

    def estimate_intervals(
        self,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: Optional[
            Union[np.ndarray, Dict[str, np.ndarray]]
        ] = None,
        alpha: float = 0.05,
        n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
    ) -> Dict[str, Dict[str, float]]:
        """Estimate confidence intervals of policy values using nonparametric bootstrap procedure.
        Parameters
        ------------
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
            When an array-like is given, all OPE estimators use it.
            When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used.
            When it is not given, model-dependent estimators such as DM and DR cannot be used.
        alpha: float, default=0.05
            Significance level.
        n_bootstrap_samples: int, default=100
            Number of resampling performed in the bootstrap procedure.
        random_state: int, default=None
            Controls the random seed in bootstrap sampling.
        Returns
        ----------
        policy_value_interval_dict: Dict[str, Dict[str, float]]
            Dictionary containing confidence intervals of estimated policy value estimated
            using nonparametric bootstrap procedure.
        """
        if self.is_model_dependent:
            if estimated_rewards_by_reg_model is None:
                raise ValueError(
                    "When model dependent estimators such as DM or DR are used, `estimated_rewards_by_reg_model` must be given"
                )

        check_confidence_interval_arguments(
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )
        policy_value_interval_dict = dict()
        estimator_inputs = self._create_estimator_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
        for estimator_name, estimator in self.ope_estimators_.items():
            policy_value_interval_dict[estimator_name] = estimator.estimate_interval(
                **estimator_inputs[estimator_name],
                alpha=alpha,
                n_bootstrap_samples=n_bootstrap_samples,
                random_state=random_state,
            )

        return policy_value_interval_dict

    def summarize_off_policy_estimates(
        self,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: Optional[
            Union[np.ndarray, Dict[str, np.ndarray]]
        ] = None,
        alpha: float = 0.05,
        n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
    ) -> Tuple[DataFrame, DataFrame]:
        """Summarize policy values and their confidence intervals estimated by OPE estimators.
        Parameters
        ------------
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None
            Expected rewards given each round, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
            When an array-like is given, all OPE estimators use it.
            When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used.
            When it is not given, model-dependent estimators such as DM and DR cannot be used.
        alpha: float, default=0.05
            Significance level.
        n_bootstrap_samples: int, default=100
            Number of resampling performed in the bootstrap procedure.
        random_state: int, default=None
            Controls the random seed in bootstrap sampling.
        Returns
        ----------
        (policy_value_df, policy_value_interval_df): Tuple[DataFrame, DataFrame]
            Policy values and their confidence intervals Estimated by OPE estimators.
        """
        policy_value_df = DataFrame(
            self.estimate_policy_values(
                action_dist=action_dist,
                estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            ),
            index=["estimated_policy_value"],
        )
        policy_value_interval_df = DataFrame(
            self.estimate_intervals(
                action_dist=action_dist,
                estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
                alpha=alpha,
                n_bootstrap_samples=n_bootstrap_samples,
                random_state=random_state,
            )
        )
        policy_value_of_behavior_policy = self.bandit_feedback["reward"].mean()
        policy_value_df = policy_value_df.T
        if policy_value_of_behavior_policy <= 0:
            logger.warning(
                f"Policy value of the behavior policy is {policy_value_of_behavior_policy} (<=0); relative estimated policy value is set to np.nan"
            )
            policy_value_df["relative_estimated_policy_value"] = np.nan
        else:
            policy_value_df["relative_estimated_policy_value"] = (
                policy_value_df.estimated_policy_value / policy_value_of_behavior_policy
            )
        return policy_value_df, policy_value_interval_df.T

    def visualize_off_policy_estimates(
        self,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: Optional[
            Union[np.ndarray, Dict[str, np.ndarray]]
        ] = None,
        alpha: float = 0.05,
        is_relative: bool = False,
        n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_policy_value.png",
    ) -> None:
        """Visualize policy values estimated by OPE estimators.
        Parameters
        ----------
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
            When an array-like is given, all OPE estimators use it.
            When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used.
            When it is not given, model-dependent estimators such as DM and DR cannot be used.
        alpha: float, default=0.05
            Significance level.
        n_bootstrap_samples: int, default=100
            Number of resampling performed in the bootstrap procedure.
        random_state: int, default=None
            Controls the random seed in bootstrap sampling.
        is_relative: bool, default=False,
            If True, the method visualizes the estimated policy values of evaluation policy
            relative to the ground-truth policy value of behavior policy.
        fig_dir: Path, default=None
            Path to store the bar figure.
            If 'None' is given, the figure will not be saved.
        fig_name: str, default="estimated_policy_value.png"
            Name of the bar figure.
        """
        if fig_dir is not None:
            assert isinstance(fig_dir, Path), "fig_dir must be a Path"
        if fig_name is not None:
            assert isinstance(fig_name, str), "fig_dir must be a string"

        estimated_round_rewards_dict = dict()
        estimator_inputs = self._create_estimator_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
        for estimator_name, estimator in self.ope_estimators_.items():
            estimated_round_rewards_dict[
                estimator_name
            ] = estimator._estimate_round_rewards(**estimator_inputs[estimator_name])
        estimated_round_rewards_df = DataFrame(estimated_round_rewards_dict)
        estimated_round_rewards_df.rename(
            columns={key: key.upper() for key in estimated_round_rewards_dict.keys()},
            inplace=True,
        )
        if is_relative:
            estimated_round_rewards_df /= self.bandit_feedback["reward"].mean()

        plt.style.use("ggplot")
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.barplot(
            data=estimated_round_rewards_df,
            ax=ax,
            ci=100 * (1 - alpha),
            n_boot=n_bootstrap_samples,
            seed=random_state,
        )
        plt.xlabel("OPE Estimators", fontsize=25)
        plt.ylabel(
            f"Estimated Policy Value (± {np.int(100*(1 - alpha))}% CI)", fontsize=20
        )
        plt.yticks(fontsize=15)
        plt.xticks(fontsize=25 - 2 * len(self.ope_estimators))

        if fig_dir:
            fig.savefig(str(fig_dir / fig_name))

    def evaluate_performance_of_estimators(
        self,
        ground_truth_policy_value: float,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: Optional[
            Union[np.ndarray, Dict[str, np.ndarray]]
        ] = None,
        metric: str = "relative-ee",
    ) -> Dict[str, float]:
        """Evaluate estimation performance of OPE estimators.
        Note
        ------
        Evaluate the estimation performance of OPE estimators by relative estimation error (relative-EE) or squared error (SE):
        .. math ::
            \\text{Relative-EE} (\\hat{V}; \\mathcal{D}) = \\left|  \\frac{\\hat{V}(\\pi; \\mathcal{D}) - V(\\pi)}{V(\\pi)} \\right|,
        .. math ::
            \\text{SE} (\\hat{V}; \\mathcal{D}) = \\left(\\hat{V}(\\pi; \\mathcal{D}) - V(\\pi) \\right)^2,
        where :math:`V({\\pi})` is the ground-truth policy value of the evalation policy :math:`\\pi_e` (often estimated using on-policy estimation).
        :math:`\\hat{V}(\\pi; \\mathcal{D})` is an estimated policy value by an OPE estimator :math:`\\hat{V}` and logged bandit feedback :math:`\\mathcal{D}`.
        Parameters
        ----------
        ground_truth policy value: float
            Ground_truth policy value of evaluation policy, i.e., :math:`V(\\pi_e)`.
            With Open Bandit Dataset, we use an on-policy estimate of the policy value as its ground-truth.
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
            When an array-like is given, all OPE estimators use it.
            When a dict is given, if the dict has the name of a estimator as a key, the corresponding value is used.
            When it is not given, model-dependent estimators such as DM and DR cannot be used.
        metric: str, default="relative-ee"
            Evaluation metric used to evaluate and compare the estimation performance of OPE estimators.
            Must be "relative-ee" or "se".
        Returns
        ----------
        eval_metric_ope_dict: Dict[str, float]
            Dictionary containing evaluation metric for evaluating the estimation performance of OPE estimators.
        """
        check_scalar(
            ground_truth_policy_value,
            "ground_truth_policy_value",
            float,
        )
        if metric not in ["relative-ee", "se"]:
            raise ValueError(
                f"metric must be either 'relative-ee' or 'se', but {metric} is given"
            )
        if metric == "relative-ee" and ground_truth_policy_value == 0.0:
            raise ValueError(
                "ground_truth_policy_value must be non-zero when metric is relative-ee"
            )

        eval_metric_ope_dict = dict()
        estimator_inputs = self._create_estimator_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
        for estimator_name, estimator in self.ope_estimators_.items():
            estimated_policy_value = estimator.estimate_policy_value(
                **estimator_inputs[estimator_name]
            )
            if metric == "relative-ee":
                relative_ee_ = estimated_policy_value - ground_truth_policy_value
                relative_ee_ /= ground_truth_policy_value
                eval_metric_ope_dict[estimator_name] = np.abs(relative_ee_)
            elif metric == "se":
                se_ = (estimated_policy_value - ground_truth_policy_value) ** 2
                eval_metric_ope_dict[estimator_name] = se_
        return eval_metric_ope_dict

    def summarize_estimators_comparison(
        self,
        ground_truth_policy_value: float,
        action_dist: np.ndarray,
        estimated_rewards_by_reg_model: Optional[
            Union[np.ndarray, Dict[str, np.ndarray]]
        ] = None,
        metric: str = "relative-ee",
    ) -> DataFrame:
        """Summarize performance comparisons of OPE estimators.
        Parameters
        ----------
        ground_truth policy value: float
            Ground_truth policy value of evaluation policy, i.e., :math:`V(\\pi_e)`.
            With Open Bandit Dataset, we use an on-policy estimate of the policy value as ground-truth.
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list), default=None
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
            When it is not given, model-dependent estimators such as DM and DR cannot be used.
        metric: str, default="relative-ee"
            Evaluation metric used to evaluate and compare the estimation performance of OPE estimators.
            Must be either "relative-ee" or "se".
        Returns
        ----------
        eval_metric_ope_df: DataFrame
            Evaluation metric to evaluate and compare the estimation performance of OPE estimators.
        """
        eval_metric_ope_df = DataFrame(
            self.evaluate_performance_of_estimators(
                ground_truth_policy_value=ground_truth_policy_value,
                action_dist=action_dist,
                estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
                metric=metric,
            ),
            index=[metric],
        )
        return eval_metric_ope_df.T

    def visualize_off_policy_estimates_of_multiple_policies(
        self,
        policy_name_list: List[str],
        action_dist_list: List[np.ndarray],
        estimated_rewards_by_reg_model: Optional[
            Union[np.ndarray, Dict[str, np.ndarray]]
        ] = None,
        alpha: float = 0.05,
        is_relative: bool = False,
        n_bootstrap_samples: int = 100,
        random_state: Optional[int] = None,
        fig_dir: Optional[Path] = None,
        fig_name: str = "estimated_policy_value.png",
    ) -> None:
        """Visualize policy values estimated by OPE estimators.
        Parameters
        ----------
        policy_name_list: List[str]
            List of the names of evaluation policies.
        action_dist_list: List[array-like, shape (n_rounds, n_actions, len_list)]
            List of action choice probabilities by the evaluation policies (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list) or Dict[str, array-like], default=None
            Expected rewards given context, action, and position estimated by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
            When an array-like is given, all OPE estimators use it.
            When a dict is given, if the dict has the name of an estimator as a key, the corresponding value is used.
            When it is not given, model-dependent estimators such as DM and DR cannot be used.
        alpha: float, default=0.05
            Significance level.
        n_bootstrap_samples: int, default=100
            Number of resampling performed in the bootstrap procedure.
        random_state: int, default=None
            Controls the random seed in bootstrap sampling.
        is_relative: bool, default=False,
            If True, the method visualizes the estimated policy values of evaluation policy
            relative to the ground-truth policy value of behavior policy.
        fig_dir: Path, default=None
            Path to store the bar figure.
            If 'None' is given, the figure will not be saved.
        fig_name: str, default="estimated_policy_value.png"
            Name of the bar figure.
        """
        if len(policy_name_list) != len(action_dist_list):
            raise ValueError(
                "the length of policy_name_list must be the same as action_dist_list"
            )
        if fig_dir is not None:
            assert isinstance(fig_dir, Path), "fig_dir must be a Path"
        if fig_name is not None:
            assert isinstance(fig_name, str), "fig_dir must be a string"

        estimated_round_rewards_dict = {
            estimator_name: {} for estimator_name in self.ope_estimators_
        }

        for policy_name, action_dist in zip(policy_name_list, action_dist_list):
            estimator_inputs = self._create_estimator_inputs(
                action_dist=action_dist,
                estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            )
            for estimator_name, estimator in self.ope_estimators_.items():
                estimated_round_rewards_dict[estimator_name][
                    policy_name
                ] = estimator._estimate_round_rewards(
                    **estimator_inputs[estimator_name]
                )

        plt.style.use("ggplot")
        fig = plt.figure(figsize=(8, 6.2 * len(self.ope_estimators_)))

        for i, estimator_name in enumerate(self.ope_estimators_):
            estimated_round_rewards_df = DataFrame(
                estimated_round_rewards_dict[estimator_name]
            )
            if is_relative:
                estimated_round_rewards_df /= self.bandit_feedback["reward"].mean()

            ax = fig.add_subplot(len(action_dist_list), 1, i + 1)
            sns.barplot(
                data=estimated_round_rewards_df,
                ax=ax,
                ci=100 * (1 - alpha),
                n_boot=n_bootstrap_samples,
                seed=random_state,
            )
            ax.set_title(estimator_name.upper(), fontsize=20)
            ax.set_ylabel(
                f"Estimated Policy Value (± {np.int(100*(1 - alpha))}% CI)", fontsize=20
            )
            plt.yticks(fontsize=15)
            plt.xticks(fontsize=25 - 2 * len(policy_name_list))

        if fig_dir:
            fig.savefig(str(fig_dir / fig_name))

Base Models¶

Utils¶

def check_bandit_feedback_inputs(
    context: np.ndarray,
    action: np.ndarray,
    reward: np.ndarray,
    expected_reward: Optional[np.ndarray] = None,
    position: Optional[np.ndarray] = None,
    pscore: Optional[np.ndarray] = None,
    action_context: Optional[np.ndarray] = None,
) -> Optional[ValueError]:
    """Check inputs for bandit learning or simulation.
    Parameters
    -----------
    context: array-like, shape (n_rounds, dim_context)
        Context vectors in each round, i.e., :math:`x_t`.
    action: array-like, shape (n_rounds,)
        Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
    reward: array-like, shape (n_rounds,)
        Observed rewards (or outcome) in each round, i.e., :math:`r_t`.
    expected_reward: array-like, shape (n_rounds, n_actions), default=None
        Expected rewards (or outcome) in each round, i.e., :math:`\\mathbb{E}[r_t]`.
    position: array-like, shape (n_rounds,), default=None
        Position of recommendation interface where action was presented in each round of the given logged bandit data.
    pscore: array-like, shape (n_rounds,), default=None
        Propensity scores, the probability of selecting each action by behavior policy,
        in the given logged bandit data.
    action_context: array-like, shape (n_actions, dim_action_context)
        Context vectors characterizing each action.
    """
    check_array(array=context, name="context", expected_dim=2)
    check_array(array=action, name="action", expected_dim=1)
    check_array(array=reward, name="reward", expected_dim=1)
    if not (np.issubdtype(action.dtype, np.integer) and action.min() >= 0):
        raise ValueError("action elements must be non-negative integers")

    if expected_reward is not None:
        check_array(array=expected_reward, name="expected_reward", expected_dim=2)
        if not (
            context.shape[0]
            == action.shape[0]
            == reward.shape[0]
            == expected_reward.shape[0]
        ):
            raise ValueError(
                "Expected `context.shape[0] == action.shape[0] == reward.shape[0] == expected_reward.shape[0]`"
                ", but found it False"
            )
        if action.max() >= expected_reward.shape[1]:
            raise ValueError(
                "action elements must be smaller than `expected_reward.shape[1]`"
            )
    if pscore is not None:
        check_array(array=pscore, name="pscore", expected_dim=1)
        if not (
            context.shape[0] == action.shape[0] == reward.shape[0] == pscore.shape[0]
        ):
            raise ValueError(
                "Expected `context.shape[0] == action.shape[0] == reward.shape[0] == pscore.shape[0]`"
                ", but found it False"
            )
        if np.any(pscore <= 0):
            raise ValueError("pscore must be positive")

    if position is not None:
        check_array(array=position, name="position", expected_dim=1)
        if not (
            context.shape[0] == action.shape[0] == reward.shape[0] == position.shape[0]
        ):
            raise ValueError(
                "Expected `context.shape[0] == action.shape[0] == reward.shape[0] == position.shape[0]`"
                ", but found it False"
            )
        if not (np.issubdtype(position.dtype, np.integer) and position.min() >= 0):
            raise ValueError("position elements must be non-negative integers")
    else:
        if not (context.shape[0] == action.shape[0] == reward.shape[0]):
            raise ValueError(
                "Expected `context.shape[0] == action.shape[0] == reward.shape[0]`"
                ", but found it False"
            )
    if action_context is not None:
        check_array(array=action_context, name="action_context", expected_dim=2)
        if action.max() >= action_context.shape[0]:
            raise ValueError(
                "action elements must be smaller than `action_context.shape[0]`"
            )

Regression Model¶

Regression Model Class for Estimating Mean Reward Functions.

from dataclasses import dataclass
from typing import Optional

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.base import is_classifier
from sklearn.model_selection import KFold
from sklearn.utils import check_random_state
from sklearn.utils import check_scalar

@dataclass
class RegressionModel(BaseEstimator):
    """Machine learning model to estimate the mean reward function (:math:`q(x,a):= \\mathbb{E}[r|x,a]`).
    Note
    -------
    Reward (or outcome) :math:`r` must be either binary or continuous.
    Parameters
    ------------
    base_model: BaseEstimator
        A machine learning model used to estimate the mean reward function.
    n_actions: int
        Number of actions.
    len_list: int, default=1
        Length of a list of actions recommended in each impression.
        When Open Bandit Dataset is used, 3 should be set.
    action_context: array-like, shape (n_actions, dim_action_context), default=None
        Context vector characterizing action (i.e., vector representation of each action).
        If not given, one-hot encoding of the action variable is used as default.
    fitting_method: str, default='normal'
        Method to fit the regression model.
        Must be one of ['normal', 'iw', 'mrdr'] where 'iw' stands for importance weighting and
        'mrdr' stands for more robust doubly robust.
    References
    -----------
    Mehrdad Farajtabar, Yinlam Chow, and Mohammad Ghavamzadeh.
    "More Robust Doubly Robust Off-policy Evaluation.", 2018.
    Yi Su, Maria Dimakopoulou, Akshay Krishnamurthy, and Miroslav Dudik.
    "Doubly Robust Off-Policy Evaluation with Shrinkage.", 2020.
    Yusuke Narita, Shota Yasui, and Kohei Yata.
    "Off-policy Bandit and Reinforcement Learning.", 2020.
    """

    base_model: BaseEstimator
    n_actions: int
    len_list: int = 1
    action_context: Optional[np.ndarray] = None
    fitting_method: str = "normal"

    def __post_init__(self) -> None:
        """Initialize Class."""
        check_scalar(self.n_actions, "n_actions", int, min_val=2)
        check_scalar(self.len_list, "len_list", int, min_val=1)
        if not (
            isinstance(self.fitting_method, str)
            and self.fitting_method in ["normal", "iw", "mrdr"]
        ):
            raise ValueError(
                f"fitting_method must be one of 'normal', 'iw', or 'mrdr', but {self.fitting_method} is given"
            )
        if not isinstance(self.base_model, BaseEstimator):
            raise ValueError(
                "base_model must be BaseEstimator or a child class of BaseEstimator"
            )

        self.base_model_list = [
            clone(self.base_model) for _ in np.arange(self.len_list)
        ]
        if self.action_context is None:
            self.action_context = np.eye(self.n_actions, dtype=int)

    def fit(
        self,
        context: np.ndarray,
        action: np.ndarray,
        reward: np.ndarray,
        pscore: Optional[np.ndarray] = None,
        position: Optional[np.ndarray] = None,
        action_dist: Optional[np.ndarray] = None,
    ) -> None:
        """Fit the regression model on given logged bandit feedback data.
        Parameters
        ----------
        context: array-like, shape (n_rounds, dim_context)
            Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`.
        action: array-like, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        reward: array-like, shape (n_rounds,)
            Reward observed in each round of the logged bandit feedback, i.e., :math:`r_t`.
        pscore: array-like, shape (n_rounds,)
            Action choice probabilities of behavior policy (propensity scores), i.e., :math:`\\pi_b(a_t|x_t)`.
            When None is given, behavior policy is assumed to be uniform.
        position: array-like, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            If None is set, a regression model assumes that there is only one position.
            When `len_list` > 1, this position argument has to be set.
        action_dist: array-like, shape (n_rounds, n_actions, len_list), default=None
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
            When either of 'iw' or 'mrdr' is used as the 'fitting_method' argument, then `action_dist` must be given.
        """
        check_bandit_feedback_inputs(
            context=context,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            action_context=self.action_context,
        )
        n_rounds = context.shape[0]

        if position is None or self.len_list == 1:
            position = np.zeros_like(action)
        else:
            if position.max() >= self.len_list:
                raise ValueError(
                    f"position elements must be smaller than len_list, but the maximum value is {position.max()} (>= {self.len_list})"
                )
        if self.fitting_method in ["iw", "mrdr"]:
            if not (isinstance(action_dist, np.ndarray) and action_dist.ndim == 3):
                raise ValueError(
                    "when fitting_method is either 'iw' or 'mrdr', action_dist (a 3-dimensional ndarray) must be given"
                )
            if action_dist.shape != (n_rounds, self.n_actions, self.len_list):
                raise ValueError(
                    f"shape of action_dist must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list}), but is {action_dist.shape}"
                )
            if not np.allclose(action_dist.sum(axis=1), 1):
                raise ValueError("action_dist must be a probability distribution")
        if pscore is None:
            pscore = np.ones_like(action) / self.n_actions

        for position_ in np.arange(self.len_list):
            idx = position == position_
            X = self._pre_process_for_reg_model(
                context=context[idx],
                action=action[idx],
                action_context=self.action_context,
            )
            if X.shape[0] == 0:
                raise ValueError(f"No training data at position {position_}")
            # train the base model according to the given `fitting method`
            if self.fitting_method == "normal":
                self.base_model_list[position_].fit(X, reward[idx])
            else:
                action_dist_at_position = action_dist[
                    np.arange(n_rounds),
                    action,
                    position_ * np.ones(n_rounds, dtype=int),
                ][idx]
                if self.fitting_method == "iw":
                    sample_weight = action_dist_at_position / pscore[idx]
                    self.base_model_list[position_].fit(
                        X, reward[idx], sample_weight=sample_weight
                    )
                elif self.fitting_method == "mrdr":
                    sample_weight = action_dist_at_position
                    sample_weight *= 1.0 - pscore[idx]
                    sample_weight /= pscore[idx] ** 2
                    self.base_model_list[position_].fit(
                        X, reward[idx], sample_weight=sample_weight
                    )

    def predict(self, context: np.ndarray) -> np.ndarray:
        """Predict the mean reward function.
        Parameters
        -----------
        context: array-like, shape (n_rounds_of_new_data, dim_context)
            Context vectors of new data.
        Returns
        -----------
        estimated_rewards_by_reg_model: array-like, shape (n_rounds_of_new_data, n_actions, len_list)
            Expected rewards of new data estimated by the regression model.
        """
        n_rounds_of_new_data = context.shape[0]
        ones_n_rounds_arr = np.ones(n_rounds_of_new_data, int)
        estimated_rewards_by_reg_model = np.zeros(
            (n_rounds_of_new_data, self.n_actions, self.len_list)
        )
        for action_ in np.arange(self.n_actions):
            for position_ in np.arange(self.len_list):
                X = self._pre_process_for_reg_model(
                    context=context,
                    action=action_ * ones_n_rounds_arr,
                    action_context=self.action_context,
                )
                estimated_rewards_ = (
                    self.base_model_list[position_].predict_proba(X)[:, 1]
                    if is_classifier(self.base_model_list[position_])
                    else self.base_model_list[position_].predict(X)
                )
                estimated_rewards_by_reg_model[
                    np.arange(n_rounds_of_new_data),
                    action_ * ones_n_rounds_arr,
                    position_ * ones_n_rounds_arr,
                ] = estimated_rewards_
        return estimated_rewards_by_reg_model

    def fit_predict(
        self,
        context: np.ndarray,
        action: np.ndarray,
        reward: np.ndarray,
        pscore: Optional[np.ndarray] = None,
        position: Optional[np.ndarray] = None,
        action_dist: Optional[np.ndarray] = None,
        n_folds: int = 1,
        random_state: Optional[int] = None,
    ) -> np.ndarray:
        """Fit the regression model on given logged bandit feedback data and predict the reward function of the same data.
        Note
        ------
        When `n_folds` is larger than 1, then the cross-fitting procedure is applied.
        See the reference for the details about the cross-fitting technique.
        Parameters
        ----------
        context: array-like, shape (n_rounds, dim_context)
            Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`.
        action: array-like, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        reward: array-like, shape (n_rounds,)
            Observed rewards (or outcome) in each round, i.e., :math:`r_t`.
        pscore: array-like, shape (n_rounds,), default=None
            Action choice probabilities (propensity score) of a behavior policy
            in the training logged bandit feedback.
            When None is given, the the behavior policy is assumed to be a uniform one.
        position: array-like, shape (n_rounds,), default=None
            Position of recommendation interface where action was presented in each round of the given logged bandit data.
            If None is set, a regression model assumes that there is only one position.
            When `len_list` > 1, this position argument has to be set.
        action_dist: array-like, shape (n_rounds, n_actions, len_list), default=None
            Action choice probabilities of evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.
            When either of 'iw' or 'mrdr' is used as the 'fitting_method' argument, then `action_dist` must be given.
        n_folds: int, default=1
            Number of folds in the cross-fitting procedure.
            When 1 is given, the regression model is trained on the whole logged bandit feedback data.
            Please refer to https://arxiv.org/abs/2002.08536 about the details of the cross-fitting procedure.
        random_state: int, default=None
            `random_state` affects the ordering of the indices, which controls the randomness of each fold.
            See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html for the details.
        Returns
        -----------
        estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
            Expected rewards of new data estimated by the regression model.
        """
        check_bandit_feedback_inputs(
            context=context,
            action=action,
            reward=reward,
            pscore=pscore,
            position=position,
            action_context=self.action_context,
        )
        n_rounds = context.shape[0]

        check_scalar(n_folds, "n_folds", int, min_val=1)
        check_random_state(random_state)

        if position is None or self.len_list == 1:
            position = np.zeros_like(action)
        else:
            if position.max() >= self.len_list:
                raise ValueError(
                    f"position elements must be smaller than len_list, but the maximum value is {position.max()} (>= {self.len_list})"
                )
        if self.fitting_method in ["iw", "mrdr"]:
            if not (isinstance(action_dist, np.ndarray) and action_dist.ndim == 3):
                raise ValueError(
                    "when fitting_method is either 'iw' or 'mrdr', action_dist (a 3-dimensional ndarray) must be given"
                )
            if action_dist.shape != (n_rounds, self.n_actions, self.len_list):
                raise ValueError(
                    f"shape of action_dist must be (n_rounds, n_actions, len_list)=({n_rounds, self.n_actions, self.len_list}), but is {action_dist.shape}"
                )
        if pscore is None:
            pscore = np.ones_like(action) / self.n_actions

        if n_folds == 1:
            self.fit(
                context=context,
                action=action,
                reward=reward,
                pscore=pscore,
                position=position,
                action_dist=action_dist,
            )
            return self.predict(context=context)
        else:
            estimated_rewards_by_reg_model = np.zeros(
                (n_rounds, self.n_actions, self.len_list)
            )
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
        kf.get_n_splits(context)
        for train_idx, test_idx in kf.split(context):
            action_dist_tr = (
                action_dist[train_idx] if action_dist is not None else action_dist
            )
            self.fit(
                context=context[train_idx],
                action=action[train_idx],
                reward=reward[train_idx],
                pscore=pscore[train_idx],
                position=position[train_idx],
                action_dist=action_dist_tr,
            )
            estimated_rewards_by_reg_model[test_idx, :, :] = self.predict(
                context=context[test_idx]
            )
        return estimated_rewards_by_reg_model

    def _pre_process_for_reg_model(
        self,
        context: np.ndarray,
        action: np.ndarray,
        action_context: np.ndarray,
    ) -> np.ndarray:
        """Preprocess feature vectors to train a regression model.
        Note
        -----
        Please override this method if you want to use another feature enginnering
        for training the regression model.
        Parameters
        -----------
        context: array-like, shape (n_rounds,)
            Context vectors observed in each round of the logged bandit feedback, i.e., :math:`x_t`.
        action: array-like, shape (n_rounds,)
            Action sampled by behavior policy in each round of the logged bandit feedback, i.e., :math:`a_t`.
        action_context: array-like, shape shape (n_actions, dim_action_context)
            Context vector characterizing action (i.e., vector representation of each action).
        """
        return np.c_[context, action_context[action]]

Policies¶

from dataclasses import dataclass
import os
from typing import Optional
  
import enum

import numpy as np
from sklearn.utils import check_scalar

# import pkg_resources
import yaml

from abc import ABCMeta
from abc import abstractmethod
from dataclasses import dataclass
from typing import Optional

import numpy as np
from sklearn.utils import check_random_state
from sklearn.utils import check_scalar

%%writefile prior_bts.yaml
all:
  alpha:
    - 47.0
    - 8.0
    - 62.0
    - 142.0
    - 3.0
    - 14.0
    - 7.0
    - 857.0
    - 12.0
    - 15.0
    - 6.0
    - 100.0
    - 48.0
    - 23.0
    - 71.0
    - 61.0
    - 13.0
    - 16.0
    - 518.0
    - 30.0
    - 7.0
    - 4.0
    - 23.0
    - 8.0
    - 10.0
    - 11.0
    - 11.0
    - 18.0
    - 121.0
    - 11.0
    - 11.0
    - 10.0
    - 14.0
    - 9.0
    - 204.0
    - 58.0
    - 3.0
    - 19.0
    - 42.0
    - 1013.0
    - 2.0
    - 328.0
    - 15.0
    - 31.0
    - 14.0
    - 138.0
    - 45.0
    - 55.0
    - 23.0
    - 38.0
    - 10.0
    - 401.0
    - 52.0
    - 6.0
    - 3.0
    - 6.0
    - 5.0
    - 32.0
    - 35.0
    - 133.0
    - 52.0
    - 820.0
    - 43.0
    - 195.0
    - 8.0
    - 42.0
    - 40.0
    - 4.0
    - 32.0
    - 30.0
    - 9.0
    - 22.0
    - 6.0
    - 23.0
    - 5.0
    - 54.0
    - 8.0
    - 22.0
    - 65.0
    - 246.0
  beta:
    - 12198.0
    - 3566.0
    - 15993.0
    - 35522.0
    - 2367.0
    - 4609.0
    - 3171.0
    - 181745.0
    - 4372.0
    - 4951.0
    - 3100.0
    - 24665.0
    - 13210.0
    - 7061.0
    - 18061.0
    - 17449.0
    - 5644.0
    - 6787.0
    - 111326.0
    - 8776.0
    - 3334.0
    - 2271.0
    - 7389.0
    - 2659.0
    - 3665.0
    - 4724.0
    - 3561.0
    - 5085.0
    - 27407.0
    - 4601.0
    - 4756.0
    - 4120.0
    - 4736.0
    - 3788.0
    - 45292.0
    - 14719.0
    - 2189.0
    - 5589.0
    - 11995.0
    - 222255.0
    - 2308.0
    - 70034.0
    - 4801.0
    - 8274.0
    - 5421.0
    - 31912.0
    - 12213.0
    - 13576.0
    - 6230.0
    - 10382.0
    - 4141.0
    - 85731.0
    - 12811.0
    - 2707.0
    - 2250.0
    - 2668.0
    - 2886.0
    - 9581.0
    - 9465.0
    - 28336.0
    - 12062.0
    - 162793.0
    - 12107.0
    - 41240.0
    - 3162.0
    - 11604.0
    - 10818.0
    - 2923.0
    - 8897.0
    - 8654.0
    - 4000.0
    - 6580.0
    - 3174.0
    - 6766.0
    - 2602.0
    - 14506.0
    - 3968.0
    - 7523.0
    - 16532.0
    - 51964.0
men:
  alpha:
    - 47.0
    - 8.0
    - 62.0
    - 142.0
    - 3.0
    - 6.0
    - 100.0
    - 48.0
    - 23.0
    - 71.0
    - 61.0
    - 13.0
    - 16.0
    - 518.0
    - 30.0
    - 7.0
    - 4.0
    - 23.0
    - 8.0
    - 10.0
    - 11.0
    - 11.0
    - 18.0
    - 121.0
    - 11.0
    - 4.0
    - 32.0
    - 30.0
    - 9.0
    - 22.0
    - 6.0
    - 23.0
    - 5.0
    - 54.0
  beta:
    - 12198.0
    - 3566.0
    - 15993.0
    - 35522.0
    - 2367.0
    - 3100.0
    - 24665.0
    - 13210.0
    - 7061.0
    - 18061.0
    - 17449.0
    - 5644.0
    - 6787.0
    - 111326.0
    - 8776.0
    - 3334.0
    - 2271.0
    - 7389.0
    - 2659.0
    - 3665.0
    - 4724.0
    - 3561.0
    - 5085.0
    - 27407.0
    - 4601.0
    - 2923.0
    - 8897.0
    - 8654.0
    - 4000.0
    - 6580.0
    - 3174.0
    - 6766.0
    - 2602.0
    - 14506.0
women:
  alpha:
    - 12.0
    - 7.0
    - 984.0
    - 13.0
    - 15.0
    - 15.0
    - 11.0
    - 14.0
    - 9.0
    - 200.0
    - 72.0
    - 3.0
    - 14.0
    - 49.0
    - 1278.0
    - 3.0
    - 325.0
    - 14.0
    - 27.0
    - 14.0
    - 169.0
    - 48.0
    - 47.0
    - 18.0
    - 40.0
    - 12.0
    - 447.0
    - 46.0
    - 5.0
    - 3.0
    - 5.0
    - 7.0
    - 35.0
    - 34.0
    - 99.0
    - 30.0
    - 880.0
    - 51.0
    - 182.0
    - 6.0
    - 45.0
    - 39.0
    - 10.0
    - 24.0
    - 72.0
    - 229.0
  beta:
    - 3612.0
    - 3173.0
    - 204484.0
    - 4517.0
    - 4765.0
    - 5331.0
    - 4131.0
    - 4728.0
    - 4028.0
    - 44280.0
    - 17918.0
    - 2309.0
    - 4339.0
    - 12922.0
    - 270771.0
    - 2480.0
    - 68475.0
    - 5129.0
    - 7367.0
    - 5819.0
    - 38026.0
    - 13047.0
    - 11604.0
    - 5394.0
    - 10912.0
    - 4439.0
    - 94485.0
    - 10700.0
    - 2679.0
    - 2319.0
    - 2578.0
    - 3288.0
    - 9566.0
    - 9775.0
    - 20120.0
    - 7317.0
    - 172026.0
    - 13673.0
    - 37329.0
    - 3365.0
    - 10911.0
    - 10734.0
    - 4278.0
    - 7574.0
    - 16826.0
    - 47462.0

Writing prior_bts.yaml

# configurations to replicate the Bernoulli Thompson Sampling policy used in ZOZOTOWN production
prior_bts_file = "prior_bts.yaml"
with open(prior_bts_file, "rb") as f:
    production_prior_for_bts = yaml.safe_load(f)

class PolicyType(enum.Enum):
    """Policy type.
    Attributes
    ----------
    CONTEXT_FREE:
        The policy type is contextfree.
    CONTEXTUAL:
        The policy type is contextual.
    OFFLINE:
        The policy type is offline.
    """

    CONTEXT_FREE = enum.auto()
    CONTEXTUAL = enum.auto()
    OFFLINE = enum.auto()

    def __repr__(self) -> str:

        return str(self)

Base Context Free Policy¶

@dataclass
class BaseContextFreePolicy(metaclass=ABCMeta):
    """Base class for context-free bandit policies.
    Parameters
    ----------
    n_actions: int
        Number of actions.
    len_list: int, default=1
        Length of a list of actions recommended in each impression.
        When Open Bandit Dataset is used, 3 should be set.
    batch_size: int, default=1
        Number of samples used in a batch parameter update.
    random_state: int, default=None
        Controls the random seed in sampling actions.
    """

    n_actions: int
    len_list: int = 1
    batch_size: int = 1
    random_state: Optional[int] = None

    def __post_init__(self) -> None:
        """Initialize Class."""
        check_scalar(self.n_actions, "n_actions", int, min_val=2)
        check_scalar(self.len_list, "len_list", int, min_val=1, max_val=self.n_actions)
        check_scalar(self.batch_size, "batch_size", int, min_val=1)
        self.n_trial = 0
        self.random_ = check_random_state(self.random_state)
        self.action_counts = np.zeros(self.n_actions, dtype=int)
        self.action_counts_temp = np.zeros(self.n_actions, dtype=int)
        self.reward_counts_temp = np.zeros(self.n_actions)
        self.reward_counts = np.zeros(self.n_actions)

    @property
    def policy_type(self) -> PolicyType:
        """Type of the bandit policy."""
        return PolicyType.CONTEXT_FREE

    def initialize(self) -> None:
        """Initialize Parameters."""
        self.n_trial = 0
        self.random_ = check_random_state(self.random_state)
        self.action_counts = np.zeros(self.n_actions, dtype=int)
        self.action_counts_temp = np.zeros(self.n_actions, dtype=int)
        self.reward_counts_temp = np.zeros(self.n_actions)
        self.reward_counts = np.zeros(self.n_actions)

    @abstractmethod
    def select_action(self) -> np.ndarray:
        """Select a list of actions."""
        raise NotImplementedError

    @abstractmethod
    def update_params(self, action: int, reward: float) -> None:
        """Update policy parameters."""
        raise NotImplementedError

Epsilon Greedy¶

@dataclass
class EpsilonGreedy(BaseContextFreePolicy):
    """Epsilon Greedy policy.
    Parameters
    ----------
    n_actions: int
        Number of actions.
    len_list: int, default=1
        Length of a list of actions recommended in each impression.
        When Open Bandit Dataset is used, 3 should be set.
    batch_size: int, default=1
        Number of samples used in a batch parameter update.
    random_state: int, default=None
        Controls the random seed in sampling actions.
    epsilon: float, default=1.
        Exploration hyperparameter that must take value in the range of [0., 1.].
    policy_name: str, default=f'egreedy_{epsilon}'.
        Name of bandit policy.
    """

    epsilon: float = 1.0

    def __post_init__(self) -> None:
        """Initialize Class."""
        check_scalar(self.epsilon, "epsilon", float, min_val=0.0, max_val=1.0)
        self.policy_name = f"egreedy_{self.epsilon}"
        super().__post_init__()

    def select_action(self) -> np.ndarray:
        """Select a list of actions.
        Returns
        ----------
        selected_actions: array-like, shape (len_list, )
            List of selected actions.
        """
        if (self.random_.rand() > self.epsilon) and (self.action_counts.min() > 0):
            predicted_rewards = self.reward_counts / self.action_counts
            return predicted_rewards.argsort()[::-1][: self.len_list]
        else:
            return self.random_.choice(
                self.n_actions, size=self.len_list, replace=False
            )

    def update_params(self, action: int, reward: float) -> None:
        """Update policy parameters.
        Parameters
        ----------
        action: int
            Selected action by the policy.
        reward: float
            Observed reward for the chosen action and position.
        """
        self.n_trial += 1
        self.action_counts_temp[action] += 1
        self.reward_counts_temp[action] += reward
        if self.n_trial % self.batch_size == 0:
            self.action_counts = np.copy(self.action_counts_temp)
            self.reward_counts = np.copy(self.reward_counts_temp)

Random¶

@dataclass
class Random(EpsilonGreedy):
    """Random policy
    Parameters
    ----------
    n_actions: int
        Number of actions.
    len_list: int, default=1
        Length of a list of actions recommended in each impression.
        When Open Bandit Dataset is used, 3 should be set.
    batch_size: int, default=1
        Number of samples used in a batch parameter update.
    random_state: int, default=None
        Controls the random seed in sampling actions.
    epsilon: float, default=1.
        Exploration hyperparameter that must take value in the range of [0., 1.].
    policy_name: str, default='random'.
        Name of bandit policy.
    """

    policy_name: str = "random"

    def compute_batch_action_dist(
        self,
        n_rounds: int = 1,
    ) -> np.ndarray:
        """Compute the distribution over actions by Monte Carlo simulation.
        Parameters
        ----------
        n_rounds: int, default=1
            Number of rounds in the distribution over actions.
            (the size of the first axis of `action_dist`)
        Returns
        ----------
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Probability estimates of each arm being the best one for each sample, action, and position.
        """
        action_dist = np.ones((n_rounds, self.n_actions, self.len_list)) * (
            1 / self.n_actions
        )
        return action_dist

BernoulliTS¶

@dataclass
class BernoulliTS(BaseContextFreePolicy):
    """Bernoulli Thompson Sampling Policy
    Parameters
    ----------
    n_actions: int
        Number of actions.
    len_list: int, default=1
        Length of a list of actions recommended in each impression.
        When Open Bandit Dataset is used, 3 should be set.
    batch_size: int, default=1
        Number of samples used in a batch parameter update.
    random_state: int, default=None
        Controls the random seed in sampling actions.
    alpha: array-like, shape (n_actions, ), default=None
        Prior parameter vector for Beta distributions.
    beta: array-like, shape (n_actions, ), default=None
        Prior parameter vector for Beta distributions.
    is_zozotown_prior: bool, default=False
        Whether to use hyperparameters for the beta distribution used
        at the start of the data collection period in ZOZOTOWN.
    campaign: str, default=None
        One of the three possible campaigns considered in ZOZOTOWN, "all", "men", and "women".
    policy_name: str, default='bts'
        Name of bandit policy.
    """

    alpha: Optional[np.ndarray] = None
    beta: Optional[np.ndarray] = None
    is_zozotown_prior: bool = False
    campaign: Optional[str] = None
    policy_name: str = "bts"

    def __post_init__(self) -> None:
        """Initialize class."""
        super().__post_init__()
        if self.is_zozotown_prior:
            if self.campaign is None:
                raise Exception(
                    "`campaign` must be specified when `is_zozotown_prior` is True."
                )
            self.alpha = production_prior_for_bts[self.campaign]["alpha"]
            self.beta = production_prior_for_bts[self.campaign]["beta"]
        else:
            self.alpha = np.ones(self.n_actions) if self.alpha is None else self.alpha
            self.beta = np.ones(self.n_actions) if self.beta is None else self.beta

    def select_action(self) -> np.ndarray:
        """Select a list of actions.
        Returns
        ----------
        selected_actions: array-like, shape (len_list, )
            List of selected actions.
        """
        predicted_rewards = self.random_.beta(
            a=self.reward_counts + self.alpha,
            b=(self.action_counts - self.reward_counts) + self.beta,
        )
        return predicted_rewards.argsort()[::-1][: self.len_list]

    def update_params(self, action: int, reward: float) -> None:
        """Update policy parameters.
        Parameters
        ----------
        action: int
            Selected action by the policy.
        reward: float
            Observed reward for the chosen action and position.
        """
        self.n_trial += 1
        self.action_counts_temp[action] += 1
        self.reward_counts_temp[action] += reward
        if self.n_trial % self.batch_size == 0:
            self.action_counts = np.copy(self.action_counts_temp)
            self.reward_counts = np.copy(self.reward_counts_temp)

    def compute_batch_action_dist(
        self,
        n_rounds: int = 1,
        n_sim: int = 100000,
    ) -> np.ndarray:
        """Compute the distribution over actions by Monte Carlo simulation.
        Parameters
        ----------
        n_rounds: int, default=1
            Number of rounds in the distribution over actions.
            (the size of the first axis of `action_dist`)
        n_sim: int, default=100000
            Number of simulations in the Monte Carlo simulation to compute the distribution over actions.
        Returns
        ----------
        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Probability estimates of each arm being the best one for each sample, action, and position.
        """
        action_count = np.zeros((self.n_actions, self.len_list))
        for _ in np.arange(n_sim):
            selected_actions = self.select_action()
            for pos in np.arange(self.len_list):
                action_count[selected_actions[pos], pos] += 1
        action_dist = np.tile(
            action_count / n_sim,
            (n_rounds, 1, 1),
        )
        return action_dist

Run¶

Policies

evaluation_policy_dict = dict(bts=BernoulliTS, random=Random)
evaluation_policy_dict

{'bts': __main__.BernoulliTS, 'random': __main__.Random}

Base models

%%writefile hyperparams.yaml
lightgbm:
    n_estimators: 30
    learning_rate: 0.01
    max_depth: 5
    min_samples_leaf: 10
    random_state: 12345
logistic_regression:
    max_iter: 10000
    C: 100
    random_state: 12345
random_forest:
    n_estimators: 30
    max_depth: 5
    min_samples_leaf: 10
    random_state: 12345

Overwriting hyperparams.yaml

# hyperparameters of the regression model used in model dependent OPE estimators
with open("hyperparams.yaml", "rb") as f:
    hyperparams = yaml.safe_load(f)
hyperparams

{'lightgbm': {'learning_rate': 0.01,
  'max_depth': 5,
  'min_samples_leaf': 10,
  'n_estimators': 30,
  'random_state': 12345},
 'logistic_regression': {'C': 100, 'max_iter': 10000, 'random_state': 12345},
 'random_forest': {'max_depth': 5,
  'min_samples_leaf': 10,
  'n_estimators': 30,
  'random_state': 12345}}

base_model_dict = dict(
    logistic_regression=LogisticRegression,
    lightgbm=GradientBoostingClassifier,
    random_forest=RandomForestClassifier,
)
base_model_dict

{'lightgbm': sklearn.ensemble._gb.GradientBoostingClassifier,
 'logistic_regression': sklearn.linear_model._logistic.LogisticRegression,
 'random_forest': sklearn.ensemble._forest.RandomForestClassifier}

OPE estimators

ope_estimators = [DirectMethod(), InverseProbabilityWeighting(), DoublyRobust()]
ope_estimators

[DirectMethod(estimator_name='dm'),
 InverseProbabilityWeighting(lambda_=inf, estimator_name='ipw'),
 DoublyRobust(lambda_=inf, estimator_name='dr')]

parser = argparse.ArgumentParser(description="evaluate off-policy estimators.")
parser.add_argument(
    "--n_runs",
    type=int,
    default=1,
    help="number of bootstrap sampling in the experiment.",
)
parser.add_argument(
    "--evaluation_policy",
    type=str,
    choices=["bts", "random"],
    default='bts',
    help="evaluation policy, bts or random.",
)
parser.add_argument(
    "--base_model",
    type=str,
    choices=["logistic_regression", "lightgbm", "random_forest"],
    default='lightgbm',
    help="base ML model for regression model, logistic_regression, random_forest or lightgbm.",
)
parser.add_argument(
    "--behavior_policy",
    type=str,
    choices=["bts", "random"],
    default='random',
    help="behavior policy, bts or random.",
)
parser.add_argument(
    "--campaign",
    type=str,
    choices=["all", "men", "women"],
    default='all',
    help="campaign name, men, women, or all.",
)
parser.add_argument(
    "--n_sim_to_compute_action_dist",
    type=float,
    default=1000000,
    help="number of monte carlo simulation to compute the action distribution of bts.",
)
parser.add_argument(
    "--n_jobs",
    type=int,
    default=1,
    help="the maximum number of concurrently running jobs.",
)
parser.add_argument("--random_state", type=int, default=12345)

args = parser.parse_args(args={})
print(args)

Namespace(base_model='lightgbm', behavior_policy='random', campaign='all', evaluation_policy='bts', n_jobs=1, n_runs=1, n_sim_to_compute_action_dist=1000000, random_state=12345)

# configurations
n_runs = args.n_runs
base_model = args.base_model
evaluation_policy = args.evaluation_policy
behavior_policy = args.behavior_policy
campaign = args.campaign
n_sim_to_compute_action_dist = args.n_sim_to_compute_action_dist
n_jobs = args.n_jobs
random_state = args.random_state
np.random.seed(random_state)

!git clone https://github.com/st-tech/zr-obp.git

Cloning into 'zr-obp'...
remote: Enumerating objects: 4993, done.
remote: Counting objects: 100% (2007/2007), done.
remote: Compressing objects: 100% (860/860), done.
remote: Total 4993 (delta 1404), reused 1661 (delta 1135), pack-reused 2986
Receiving objects: 100% (4993/4993), 27.54 MiB | 29.23 MiB/s, done.
Resolving deltas: 100% (3306/3306), done.

obd = OpenBanditDataset(behavior_policy=behavior_policy, campaign=campaign, data_path=Path('/content/zr-obp/obd'))
obd

OpenBanditDataset(behavior_policy='random', campaign='all', data_path=PosixPath('/content/zr-obp/obd/random/all'), dataset_name='obd')

Compute action distribution by evaluation policy

kwargs = dict(
    n_actions=obd.n_actions, len_list=obd.len_list, random_state=random_state
)

if evaluation_policy == "bts":
    kwargs["is_zozotown_prior"] = True
    kwargs["campaign"] = campaign

policy = evaluation_policy_dict[evaluation_policy](**kwargs)

action_dist_single_round = policy.compute_batch_action_dist(
    n_sim=n_sim_to_compute_action_dist
)

Ground-truth policy value of an evaluation policy, which is estimated with factual (observed) rewards (on-policy estimation)

ground_truth_policy_value = OpenBanditDataset.calc_on_policy_policy_value_estimate(
    behavior_policy=evaluation_policy,
    campaign=campaign,
)

def process(b: int):
    # sample bootstrap from batch logged bandit feedback
    bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
    # estimate the mean reward function with an ML model
    regression_model = RegressionModel(
        n_actions=obd.n_actions,
        len_list=obd.len_list,
        action_context=obd.action_context,
        base_model=base_model_dict[base_model](**hyperparams[base_model]),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback["context"],
        action=bandit_feedback["action"],
        reward=bandit_feedback["reward"],
        position=bandit_feedback["position"],
        pscore=bandit_feedback["pscore"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    action_dist = np.tile(
        action_dist_single_round, (bandit_feedback["n_rounds"], 1, 1)
    )
    relative_ee_b = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )

    return relative_ee_b

processed = Parallel(n_jobs=n_jobs, verbose=50)([delayed(process)(i) for i in np.arange(n_runs)])

relative_ee_dict = {est.estimator_name: dict() for est in ope_estimators}

for b, relative_ee_b in enumerate(processed):
    for (estimator_name, relative_ee_) in relative_ee_b.items():
        relative_ee_dict[estimator_name][b] = relative_ee_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s finished

relative_ee_df = DataFrame(relative_ee_dict).describe().T.round(6)

print("=" * 30)
print(f"random_state={random_state}")
print("-" * 30)
print(relative_ee_df[["mean", "std"]])
print("=" * 30)

==============================
random_state=12345
------------------------------
         mean  std
dm   0.034354  NaN
ipw  0.100573  NaN
dr   0.096567  NaN
==============================

# save results of the evaluation of off-policy estimators in './logs' directory.
log_path = Path("./logs") / behavior_policy / campaign
log_path.mkdir(exist_ok=True, parents=True)
relative_ee_df.to_csv(log_path / "relative_ee_of_ope_estimators.csv")