Batch Learning from Bandit Feedback

Imports

import math
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn.model_selection
import sklearn.preprocessing
import sklearn.linear_model 

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.datasets import load_digits, load_breast_cancer, load_wine, fetch_openml

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from vowpalwabbit import pyvw  # required by the VowpalWabbit policy learner below

Utils

def create_interactions(X: np.ndarray, T: np.ndarray, one_hot_labeler=None) -> tuple:
    """Stack covariates X, one-hot encoded actions T, and all X-T interaction columns."""
    if one_hot_labeler is None:
        lb_fit = sklearn.preprocessing.LabelBinarizer().fit(T)
    else:
        lb_fit = one_hot_labeler
    T = lb_fit.transform(T)
    # Placeholder for the covariate-action interaction columns.
    XT = np.full(shape=[X.shape[0], X.shape[1] * T.shape[1]], fill_value=np.nan)
    cnt = 0
    for i in range(X.shape[1]):
        for j in range(T.shape[1]):
            XT[:, cnt] = X[:, i] * T[:, j]
            cnt += 1
    X_full = np.column_stack((X, T, XT))
    return X_full, lb_fit
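
As a quick illustration (not part of the original experiments; `X_demo` and `T_demo` are made-up inputs), the helper returns the covariates, the one-hot encoded actions, and every covariate-action product side by side:

X_demo = np.random.normal(size=(5, 3))   # 5 samples, 3 covariates
T_demo = np.array([0, 1, 2, 1, 0])       # 3 distinct actions
X_full_demo, lb = create_interactions(X_demo, T_demo)
print(X_full_demo.shape)                 # (5, 15): 3 covariates + 3 one-hot columns + 9 interactions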

Supervised to Bandit Transform (STBT)

class STBT:
   
    """
    Performs Supervised to Bandit Conversion for classification 
    datasets. This conversion is generally used to test the limits of 
    counterfactual learning in a well-controlled environment [1,2,3]. 
    
    Parameters
    ----------
    
    train_frac : float, default: 0.50
        It should be between 0.0 and 1.0 and represents the
        proportion of the dataset to include in the train split.
        
    permute : bool, default: False
        Randomly permute the data before the random split between train and test.
        
    logging_type : str, default: "uniform"
        The type of logging policy. If "uniform", uniform random samples from the 
        labels $y$ to simulate a logging policy. If "biased", the logging policy
        is a stochastic function of the covariates.
        
    sample_frac : float, default: None
        A sample fraction between (0.0,1.0]. This is the sample fraction of the
        training data used to fit the target policy. By default, the full
        training set is used. 
     
    References
    ----------
    
    .. [1] N. Jiang, and  L. Li, Doubly Robust Off-policy Value Evaluation for Reinforcement Learning, 
           Proceedings of Machine Learning Research, 48, 652--661, 2016.
    .. [2] A. Swaminathan and T. Joachims, Batch Learning from Logged Bandit Feedback through 
           Counterfactual Risk Minimization, Journal of Machine Learning Research, 16(52),
           1731--1755, 2015.
    .. [3] A. Swaminathan and T. Joachims, The self-normalized estimator for counterfactual learning, 
           Advances in Neural Information Processing Systems, 28, 16(52), 3231--3239, 2015.
           
    Examples
    --------
    >>> np.random.seed(42)
    >>> X, y = get_data(dataset='ecoli')  
    >>> obj = STBT()
    >>> sample_batch = obj.generate_batch(X, y)
    >>> sample_batch.y_train_logging[0:5]
    array([1, 1, 0, 0, 0])
    
    """
    
    def __init__(self, train_frac: float = 0.50, permute: bool = False, logging_type: str = 'uniform',
                 sample_frac: float = None):
        self.train_frac = train_frac
        self.permute = permute
        self.logging_type = logging_type
        self.sample_frac = sample_frac
        
    def __repr__(self):
        
        items = ("%s = %r" % (k, v) for k, v in self.__dict__.items())
        return "<%s: {%s}>" % (self.__class__.__name__, ', '.join(items))
    
    def _validate_input(self):
        if not isinstance(self.train_frac, float) or not (0.0 < self.train_frac < 1.0):
            raise ValueError("`train_frac` should be a float in (0.0,1.0), got %s" % self.train_frac)
            
        if self.sample_frac is not None and not (0.0 < self.sample_frac <= 1.0):
            raise ValueError("`sample_frac` should be a float in (0.0,1.0], got %s" % self.sample_frac)

        if self.logging_type not in ['uniform', 'biased']:
            raise ValueError("`logging_type` should be either 'uniform' or 'biased', got %s" % self.logging_type)
    
    def _softmax(self, x, axis=-1):
        
        kw = dict(axis=axis, keepdims=True)
        xrel = x - x.max(**kw)
        exp_xrel = np.exp(xrel)
        p = exp_xrel / exp_xrel.sum(**kw)  
         
        return p
        
         
    def generate_batch(self, X: np.ndarray, y: np.ndarray, **kwargs):
        """Generate Supervised to Bandit batch
        
        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array of shape (n_samples,)
            Target vector relative to X.
        **kwargs : Arguments passed to fit method in 
                   `sklearn.linear_model.LogisticRegression` class.
        
        Returns
        -------
        self : STBT
            The fitted instance, with the following attributes set:

        X_train : array of shape (n_train_samples, n_features)
        y_train : array of shape (n_train_samples,)
        X_test : array of shape (n_test_samples, n_features)
        y_test : array of shape (n_test_samples,)
        y_train_logging : array of shape (n_train_samples,)
            Logging policy labels on train data
        train_logging_probs : array of shape (n_train_samples, n_classes)     
            Logging policy probabilities on train data
        train_logging_prob : array of shape (n_train_samples,)
            Logging policy probability corresponding to the chosen logging label on train data
        y_train_logging_idx : array of shape (n_train_samples, n_classes)
            Binary matrix with 1s indicating which action was taken by the logging policy in train data
           
        y_test_logging : array of shape (n_test_samples,)
             Logging policy labels on test data
        test_logging_probs : array of shape (n_test_samples, n_classes)   
            Logging policy probabilities on test data
        test_logging_prob : array of shape (n_test_samples,)
            Logging policy probability corresponding to the chosen logging label on test data
       
        y_train_target : array of shape (n_train_samples,)
            Target policy labels on train data
        train_target_probs : array of shape (n_train_samples, n_classes)
            Target policy probabilities on train data
        train_target_prob : array of shape (n_train_samples,)
            Target policy probability corresponding to the chosen target label on train data
       
        y_test_target : array of shape (n_test_samples,)
            Target policy labels on test data
        test_target_probs : array of shape (n_test_samples, n_classes)
            Target policy probabilities on test data
        test_target_prob : array of shape (n_test_samples,)
            Target policy probability corresponding to the chosen target label on test data
       
        true_target_value_test : float
            True value of Target policy on test data
       
        train_logging_reward : array of shape (n_train_samples,)
            Observed reward of logging policy on train data 
        test_logging_reward : array of shape (n_test_samples,)
            Observed reward of logging policy on test data 
            
        """
        self._validate_input()
        
        self.generate_batch_call = True
        
        self.dual = False
        
        if self.permute:
            permute = np.random.permutation(X.shape[0])
            X = X[permute, :]
            y = y[permute]

        self.X_train, self.X_test, self.y_train, self.y_test = \
            sklearn.model_selection.train_test_split(X, y,
                train_size = self.train_frac) 
            
        n_train_samples, n_features = self.X_train.shape
        n_test_samples = self.X_test.shape[0]

    
        y_train_u = np.unique(self.y_train)
        
        if self.logging_type == 'uniform':

            self.y_train_logging = np.random.choice(y_train_u, size=n_train_samples)  
            self.train_logging_prob = np.repeat(1.0/len(y_train_u), n_train_samples)
            self.train_logging_probs = np.repeat(self.train_logging_prob.reshape(-1,1), len(y_train_u), axis=1)
            self.y_test_logging = np.random.choice(y_train_u, size=n_test_samples)  
            self.test_logging_prob = np.repeat(1.0/len(y_train_u), n_test_samples)
            self.test_logging_probs = np.repeat(self.test_logging_prob.reshape(-1,1), len(y_train_u), axis=1)
            
            self.y_train_logging_idx = np.full((n_train_samples, len(y_train_u)), False, dtype=bool)
            for i in range(n_train_samples):
                self.y_train_logging_idx[i, np.where(y_train_u==self.y_train_logging[i])[0][0]] = True 
        
        else:
            
            W = np.random.normal(0, 1, (n_features, len(y_train_u)))
            lp_train = self.X_train @ W
            lp_test = self.X_test @ W
            self.train_logging_probs = self._softmax(lp_train)
            self.test_logging_probs = self._softmax(lp_test)
            
            self.y_train_logging_idx = np.full((n_train_samples, len(y_train_u)), False, dtype=bool)
            y_test_logging_idx = np.full((n_test_samples, len(y_train_u)), False, dtype=bool)
            
            for sample in range(n_train_samples):
                choice = np.random.multinomial(1, self.train_logging_probs[sample,:], size = 1)[-1]
                self.y_train_logging_idx[sample, :] = choice
            
            for sample in range(n_test_samples):
                choice = np.random.multinomial(1, self.test_logging_probs[sample,:], size = 1)[-1]
                y_test_logging_idx[sample, :] = choice
            
            self.y_train_logging = np.array([y_train_u,]*n_train_samples)[self.y_train_logging_idx]
            self.y_test_logging = np.array([y_train_u,]*n_test_samples)[y_test_logging_idx]
            
            self.train_logging_prob = self.train_logging_probs[self.y_train_logging_idx]
            self.test_logging_prob = self.test_logging_probs[y_test_logging_idx]
            
        if self.sample_frac is not None:
            n_subsamples = math.ceil(self.sample_frac * n_train_samples)
            idx_subsamples = np.random.randint(n_train_samples, size=n_subsamples)
            X_train_subsamples = self.X_train[idx_subsamples, :]
            y_train_subsamples = self.y_train[idx_subsamples]
            if n_subsamples < n_features:
                self.dual=True
            target_policy = sklearn.linear_model.LogisticRegression(**kwargs, dual=self.dual).fit(X_train_subsamples, y_train_subsamples)
       
        else:
            if n_train_samples < n_features:
                self.dual=True
            target_policy = sklearn.linear_model.LogisticRegression(**kwargs, dual=self.dual).fit(self.X_train, self.y_train)
        
        self.train_target_probs = target_policy.predict_proba(self.X_train)
        self.test_target_probs = target_policy.predict_proba(self.X_test)
         
        y_train_target = list()
        train_target_prob = list()
        y_test_target = list()
        test_target_prob = list()
         
        for i in range(n_train_samples):
            y_train_target_i = np.random.choice(y_train_u, size=1, 
                                                 replace=False, p=self.train_target_probs[i,:])[0]
            y_train_target.append(y_train_target_i)
            train_target_prob.append(self.train_target_probs[i, np.where(y_train_u==y_train_target_i)[0][0]])
        self.y_train_target = np.array(y_train_target)
        self.train_target_prob = np.array(train_target_prob)
        
        for i in range(n_test_samples):
            y_test_target_i = np.random.choice(y_train_u, size=1, 
                                                 replace=False, p=self.test_target_probs[i,:])[0]
            y_test_target.append(y_test_target_i)
            test_target_prob.append(self.test_target_probs[i, np.where(y_train_u==y_test_target_i)[0][0]])
        self.y_test_target = np.array(y_test_target)
        self.test_target_prob = np.array(test_target_prob)
        
        self.true_target_value_test = np.mean(1 * (self.y_test == self.y_test_target))
        
        self.train_logging_reward = 1 * (self.y_train == self.y_train_logging)
        self.test_logging_reward = 1 * (self.y_test == self.y_test_logging)
           
        return self
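
A minimal usage sketch (the seed, dataset, and `logging_type` choice are illustrative, not taken from the original run): generate a batch with the covariate-dependent logging policy and inspect a few of the attributes documented above.

np.random.seed(0)
X, y = get_data(dataset='ecoli')          # defined in the "Sample datasets" section below
batch = STBT(logging_type='biased', permute=True).generate_batch(X, y, max_iter=1000)
print(batch.train_logging_probs.shape)    # (n_train_samples, n_classes)
print(batch.train_logging_prob[:3])       # propensity of each logged action
print(batch.true_target_value_test)       # value of the target policy on the test split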

Off-Policy Evaluation Estimators

class PolicyEvaluation:
        """
        Performs off-policy evaluation with bandit feedback. 
        
        Parameters
        ----------
        method : str, default: 'ips'.
            The policy evaluation method. The default is 'ips'.
            It should be one of: 'ips' (Inverse Propensity Score), 
            'dm' (Direct Method), 'dr' (Doubly Robust), 'switch'
            (SWITCH estimator).
            
        tau : float, default: 0.001.
            Hyperparameter added to the IPS or SWITCH estimator for numerical stability. 
            
            For method='ips', the logging probabilities in the test set get adjusted by
            the max(logging probabilities, tau).
            
            For method = 'switch', when logging probabilities are larger than this parameter,
            the 'dm' estimator is applied, otherwise the 'dr' estimator is applied. 
            
            
        References
        ----------
    
        .. [1] Y. Wang, A. Agarwal and M. Dud\'{\i}k, Optimal and Adaptive Off-policy Evaluation in Contextual Bandits, 
               Proceedings of Machine Learning Research, 70, 3589--3597, 2017.
        .. [2] N. Jiang, and  L. Li, Doubly Robust Off-policy Value Evaluation for Reinforcement Learning, 
               Proceedings of Machine Learning Research, 48, 652--661, 2016.
        .. [3] K{\"u}nzel, S., Sekhon, J., Bickel, P. and Yu, B., Metalearners for estimating heterogeneous 
               treatment effects using machine learning, Proceedings of the National Academy of Sciences, 
               116(10), 4156--4165, 2019. 
               
        Examples
        --------
        >>> np.random.seed(42)
        >>> from blbf.STBT import STBT
        >>> from blbf.PolicyEvaluation import PolicyEvaluation 
        >>> X, y = get_data(dataset='ecoli')
        >>> obj = STBT(train_frac= 0.5)
        >>> data = obj.generate_batch(X, y, max_iter=1000)
        >>> PolicyEvaluation(method='dr').evaluate_policy(data = data)
        0.7241601514218099
        """
    
        def __init__(self, method: str = 'ips', tau: float = 0.001):
            
            self.method = method
            self.tau = tau 
            
            valid_methods = ['ips', 'dm', 'dr', 'switch']
            if self.method not in valid_methods:
                raise ValueError("%s is not a valid method." % self.method)
        
            if self.tau <= 0 or self.tau > 1:
                raise ValueError("`tau` must be in the (0, 1] interval, got %s." % self.tau)
                       
        def __repr__(self):
            
            items = ("%s = %r" % (k, v) for k, v in self.__dict__.items())
            return "<%s: {%s}>" % (self.__class__.__name__, ', '.join(items))
        
        def evaluate_policy(self, data, clf: str = 'LogisticRegression', **kwargs) -> float:
            """
            Parameters
            ----------
            data : STBT object
                This must be a Supervised to Bandit Transform (STBT) class with fitted 
                `generate_batch` method.
                
            clf : str, default: 'LogisticRegression'
                A sklearn classification estimator. Must be one of 'LogisticRegression', 
                'LogisticRegressionCV', 'RandomForestClassifier', or 'SVC'.
           
            **kwargs : Arguments passed to clf.
    
            Returns
            -------
            float.
              The estimated value of the policy.
        
            """
              
            if not hasattr(data, 'generate_batch_call'):
                raise TypeError("The method `generate_batch` must be called first on the instance: %s." % (data))
                                    
            if self.method == 'ips':
                
                if self.tau is not None:
                    adj_test_logging_prob = np.maximum(self.tau, data.test_logging_prob)  
                
                else:
                    adj_test_logging_prob = data.test_logging_prob
                
                v = np.mean(data.test_logging_reward * (data.y_test_logging == data.y_test_target) / adj_test_logging_prob)
                
            else:
                XY_train, lb_fit = create_interactions(data.X_train, data.y_train_logging)
                m = eval(clf)(**kwargs).fit(XY_train, data.train_logging_reward)
                XY_test_target, _ = create_interactions(data.X_test, data.y_test_target, one_hot_labeler = lb_fit)
                test_target_pred_reward = m.predict_proba(XY_test_target)[:,1]
                
                if self.method in ['dr', 'switch']:
                    XY_test_logging, _ = create_interactions(data.X_test, data.y_test_logging, one_hot_labeler = lb_fit)
                    test_logging_pred_reward = m.predict_proba(XY_test_logging)[:,1]
                    dr_adj = (data.test_logging_reward - test_logging_pred_reward) * \
                                 (data.y_test_logging == data.y_test_target) / data.test_logging_prob
                
                if self.method == 'dm':
                    v = np.mean(test_target_pred_reward) 
                    
                elif self.method == 'dr':
                    v = np.mean(test_target_pred_reward + dr_adj)
                        
                elif self.method == 'switch':
                    switch_indicator = np.array(data.test_logging_prob <= self.tau, dtype=int)
                    switch_estimator_rewards = (1-switch_indicator) * (dr_adj + test_target_pred_reward)
                    switch_estimator_rewards += switch_indicator * test_target_pred_reward
                    v = np.mean(switch_estimator_rewards)
          
            return v
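
The four estimators can be compared on a single simulated batch, for example as follows (a sketch; the dataset, seed, and `max_iter` value are illustrative, and `get_data` is defined further below):

np.random.seed(123)
X, y = get_data(dataset='glass')
batch = STBT(train_frac=0.5).generate_batch(X, y, max_iter=1000)
for method in ['ips', 'dm', 'dr', 'switch']:
    v_hat = PolicyEvaluation(method=method).evaluate_policy(data=batch, max_iter=1000)
    print('%s: %.4f (true value: %.4f)' % (method, v_hat, batch.true_target_value_test))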

Sample datasets used in experiments

def get_data(dataset: str = None, scale: bool = True) -> tuple:
    """Get data (features and labels) used in experiments.
    
    Parameters
    ----------
    
    dataset : str, default: None 
        It should be one of: 'ecoli', 'glass', 'letter-recognition', 
        'lymphography', 'yeast', 'digits', 'breast-cancer', 'wine', or 
        'mnist'.
        
    scale : bool, default: True
        Standardize features by zero mean and unit variance.
        
    Returns
    -------
    
    tuple, length=2 
        tuple containing features-target split of inputs.
        
    References
    ----------
    Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. 
    Irvine, CA: University of California, School of Information and Computer Science.
    
    Examples
    --------
    >>> X, y = get_data(dataset='ecoli')
    >>> X[0,:]
    array([0.49, 0.29, 0.48, 0.5 , 0.56, 0.24, 0.35])
    
    """
    
    if dataset not in ['ecoli', 'glass', 'letter-recognition', 'lymphography', 'yeast', 
                       'digits', 'breast-cancer', 'wine', 'mnist']:
        raise ValueError("Invalid dataset provided.")
    
    if dataset in ['ecoli', 'glass', 'letter-recognition', 'lymphography', 'yeast']:
        path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/' 
        f = path + dataset + "/" + dataset + ".data"
     
    if dataset in  ['ecoli', 'yeast']:
        df = pd.read_table(f, delim_whitespace=True, header=None)
    elif dataset in [ 'glass', 'letter-recognition', 'lymphography']:
        df = pd.read_csv(f, header=None)
    elif dataset == 'digits':
        df = load_digits()
        X = df.data
        y = df.target
    elif dataset == 'breast-cancer':
        df = load_breast_cancer()
        X = df.data
        y = df.target
    elif dataset == 'wine':
        df = load_wine()
        X = df.data
        y = df.target
        
    if dataset == 'ecoli':
        y = preprocessing.LabelEncoder().fit_transform(df.iloc[:,-1])
        X = df.iloc[:,1:8].values
        
    elif dataset == 'glass':
        y = df.iloc[:,-1].values
        X = df.iloc[:, 1:(df.shape[1]-1)].values
        
    elif dataset in ['letter-recognition', 'lymphography']:
        y = preprocessing.LabelEncoder().fit_transform(df.iloc[:,0])
        X = df.iloc[:, 1:(df.shape[1])].values
     
    elif dataset == 'yeast':
        y = preprocessing.LabelEncoder().fit_transform(df.iloc[:,-1])
        X = df.iloc[:,1:9].values
        
    elif dataset == 'mnist':
        X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
        y = y.astype('int64') 

    if scale:
        scaler = preprocessing.StandardScaler()
        X = scaler.fit_transform(X)
    
    return X, y

Compare the following off-policy evaluation methods; the corresponding estimator formulas, as implemented in PolicyEvaluation above, are sketched after the list.

  • IPS: Inverse Propensity Score

  • DM: Direct Method (Reward Prediction)

  • DR: Doubly Robust

  • SWITCH: Switch Estimator
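
In symbols (a sketch of what `PolicyEvaluation` computes: $\pi$ is the target policy, $a_i$ the logged action, $p_0$ its propensity, $r_i$ the observed reward, $\hat{r}$ the fitted reward model, and $\tau$ the threshold):

\hat{V}_{\mathrm{IPS}} = \frac{1}{n}\sum_{i=1}^{n} \frac{\mathbb{1}\{\pi(x_i) = a_i\}\, r_i}{\max\big(p_0(a_i \mid x_i), \tau\big)}

\hat{V}_{\mathrm{DM}} = \frac{1}{n}\sum_{i=1}^{n} \hat{r}\big(x_i, \pi(x_i)\big)

\hat{V}_{\mathrm{DR}} = \frac{1}{n}\sum_{i=1}^{n} \Big[ \hat{r}\big(x_i, \pi(x_i)\big) + \frac{\mathbb{1}\{\pi(x_i) = a_i\}\,\big(r_i - \hat{r}(x_i, a_i)\big)}{p_0(a_i \mid x_i)} \Big]

\hat{V}_{\mathrm{SWITCH}} = \frac{1}{n}\sum_{i=1}^{n} \Big[ \mathbb{1}\{p_0(a_i \mid x_i) > \tau\}\,\mathrm{DR}_i + \mathbb{1}\{p_0(a_i \mid x_i) \le \tau\}\,\hat{r}\big(x_i, \pi(x_i)\big) \Big]

where $\mathrm{DR}_i$ denotes the $i$-th summand of the DR estimator.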

class ComparePolicyEvaluation:
    
    def __init__(self, B: int = 100, datasets: list = None):
        self.B = B
        self.datasets = datasets
        
    def __repr__(self):
            
        items = ("%s = %r" % (k, v) for k, v in self.__dict__.items())
        return "<%s: {%s}>" % (self.__class__.__name__, ', '.join(items))

    def fit_policies(self, **kwargs) -> pd.DataFrame:
        
        if self.datasets is None:
            self.datasets = ['ecoli', 'glass', 'lymphography', 'yeast', 
                        'digits', 'breast-cancer', 'wine'] # 'letter-recognition'
        dat = list()
        true_value = list()
        ips = list()
        dm = list()
        dr = list()
        switch = list()
        
        for s in self.datasets:
            for b in range(self.B):
                if (b % 10) == 0:
                    print("Sample: %d - Dataset: %s" % (b, s))
                X, y = get_data(dataset=s)
                d = STBT().generate_batch(X, y, max_iter=1000)
                dat.append(s)
                true_value.append(d.true_target_value_test)
                ips.append(PolicyEvaluation(method='ips').evaluate_policy(data = d))
                dm.append(PolicyEvaluation(method='dm').evaluate_policy(data = d, **kwargs))
                dr.append(PolicyEvaluation(method='dr').evaluate_policy(data = d, **kwargs))
                switch.append(PolicyEvaluation(method='switch').evaluate_policy(data = d, **kwargs))
           
        res = pd.DataFrame.from_dict({'dataset':dat, 'true_value':true_value, 'ips':ips,
                                     'dm': dm, 'dr':dr, 'switch': switch})
    
        # Bias
        res['ips_bias'] = res['true_value'].values - res['ips'].values
        res['dm_bias'] = res['true_value'].values - res['dm'].values
        res['dr_bias'] = res['true_value'].values - res['dr'].values
        res['switch_bias'] = res['true_value'].values - res['switch'].values
        
        # Relative risk
        res['ips_rr'] = np.abs((res['true_value'].values - res['ips'].values)/res['true_value'].values)
        res['dm_rr'] = np.abs((res['true_value'].values - res['dm'].values)/res['true_value'].values)
        res['dr_rr'] = np.abs((res['true_value'].values - res['dr'].values)/res['true_value'].values)
        res['switch_rr'] = np.abs((res['true_value'].values - res['switch'].values)/res['true_value'].values)
        
        self.res = res
       
        return self
    
    def get_summary_stats(self):
        
        res_summary = self.res.groupby(['dataset'], as_index=False).agg({
                            'ips_bias': ['mean','std'], 
                            'dm_bias': ['mean','std'],
                            'dr_bias': ['mean','std'],
                            'switch_bias': ['mean','std'],
                            'ips_rr': ['mean','std'], 
                            'dm_rr': ['mean','std'],
                            'dr_rr': ['mean','std'],
                            'switch_rr': ['mean','std']
                            })
        
        self.res_summary = res_summary
        return self
    
    def plot_bias(self):
        
        res_long = pd.melt(self.res, id_vars=['dataset'], var_name = 'method', value_name = "bias",
                  value_vars=['ips_bias',  'dm_bias', 'dr_bias', 'switch_bias'])

        ax = sns.catplot(x="method", y="bias", col = "dataset", kind = "box", 
                          col_wrap=3, data=res_long)
        for i in range(len(ax.axes)):
            ax_i = ax.axes[i]
            ax_i.axhline(0, ls="--")
        
        plt.show()
cpe = ComparePolicyEvaluation(B=100).fit_policies(max_iter=1000)
Sample: 0 - Dataset: ecoli
Sample: 10 - Dataset: ecoli
...
Sample: 80 - Dataset: wine
Sample: 90 - Dataset: wine
cpe.get_summary_stats()
cpe.res_summary
dataset ips_bias dm_bias dr_bias switch_bias ips_rr dm_rr dr_rr switch_rr
mean std mean std mean std mean std mean std mean std mean std mean std
0 breast-cancer 0.015439 0.056258 0.031097 0.014548 0.002050 0.009696 0.002050 0.009696 0.051048 0.033579 0.032639 0.015079 0.008363 0.006185 0.008363 0.006185
1 digits -0.000356 0.099657 0.271003 0.036489 -0.003003 0.045847 -0.003003 0.045847 0.086759 0.063050 0.291985 0.039215 0.039939 0.029041 0.039939 0.029041
2 ecoli 0.036131 0.159114 0.221680 0.069068 0.021395 0.082412 0.021395 0.082412 0.168851 0.126703 0.288876 0.088675 0.086244 0.068031 0.086244 0.068031
3 glass -0.005234 0.135409 0.108039 0.077705 -0.015941 0.102587 -0.015941 0.102587 0.231339 0.168950 0.231582 0.136500 0.177798 0.135564 0.177798 0.135564
4 lymphography -0.030135 0.166516 0.210184 0.112056 -0.006171 0.097335 -0.006171 0.097335 0.166880 0.141465 0.278812 0.132798 0.100086 0.078494 0.100086 0.078494
5 wine 0.004831 0.121543 0.123857 0.042780 -0.000622 0.038650 -0.000622 0.038650 0.099961 0.082773 0.132158 0.045198 0.031962 0.026401 0.031962 0.026401
6 yeast -0.010755 0.071710 0.080556 0.037969 -0.006330 0.058608 -0.006330 0.058608 0.127633 0.097028 0.178583 0.078501 0.105852 0.077361 0.105852 0.077361
cpe.plot_bias()
[Figure: boxplots of the bias of the ips, dm, dr, and switch estimators, one panel per dataset, with a dashed reference line at zero bias]

Off-Policy Learning Estimators

class EvaluationMetrics:
    
    def __init__(self) -> None:
        pass
    
    @staticmethod
    def error_rate(y_pred, y) -> float:
        er = 1 - np.mean(1 * (y_pred == y))
        
        return er

class BanditDataset(Dataset):
    
    def __init__(self, X, y, p0, r, y_idx):
        self.X = X
        self.y = y
        self.p0 = p0
        self.r = r
        self.y_idx = y_idx
        
    def __getitem__(self, index):
        return self.X[index], self.y[index], self.p0[index], self.r[index], self.y_idx[index]
        
    def __len__ (self):
        return len(self.X)
    
    
class LinearModel(torch.nn.Module):
    def __init__(self, n_features, n_actions):
        super(LinearModel, self).__init__()
        self.linear = torch.nn.Linear(n_features, n_actions)

    def forward(self, x):
        xw_plus_b = self.linear(x)
        return xw_plus_b # batch size x n_actions
    
class NonLinearModel(torch.nn.Module):
    def __init__(self, n_features, n_actions, n_hidden=3):
        super().__init__()
        self.l1 = nn.Linear(n_features,n_hidden)
        self.l2 = nn.Linear(n_hidden,n_actions)
        
    def forward(self, x):
        return self.l2(F.relu(self.l1(x)))
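
Both policy models map a batch of feature vectors to one unnormalized score per action; a quick shape check (the sizes here are arbitrary):

demo_model = NonLinearModel(n_features=7, n_actions=4, n_hidden=16)
demo_x = torch.randn(32, 7)
print(demo_model(demo_x).shape)   # torch.Size([32, 4]), one score per action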
   

class RewardPredictor(EvaluationMetrics):
    """
    Performs policy learning by directly predicting the reward as a function of covariates, 
    actions, and their interactions. 
    
    References
    ----------
    
    .. [1] A. Swaminathan and T. Joachims, Batch Learning from Logged Bandit Feedback through 
           Counterfactual Risk Minimization, Journal of Machine Learning Research, 16(52),
           1731--1755, 2015.
    .. [2] A. Swaminathan and T. Joachims, The self-normalized estimator for counterfactual learning, 
           Advances in Neural Information Processing Systems, 28, 16(52), 3231--3239, 2015.
    .. [3] A. Swaminathan, T. Joachims, and M. de Rijke, Deep Learning with Logged Bandit Feedback, 
           International Conference on Learning Representations,  2018.
    
    """
        
    
    def __init__(self) -> None:
        pass
     
    def __repr__(self) -> str:
    
        items = ("%s = %r" % (k, v) for k, v in self.__dict__.items())
        return "<%s: {%s}>" % (self.__class__.__name__, ', '.join(items))
    
    def learn_policy(self, data, clf: str = 'LogisticRegression', **kwargs) -> None:
        """
        Parameters
        ----------
        data : STBT object
            This must be a Supervised to Bandit Transform (STBT) class with fitted 
            `generate_batch` method.
            
        clf : str, default: 'LogisticRegression'
            A sklearn classification estimator. Must be one of 'LogisticRegression', 
            'LogisticRegressionCV', 'RandomForestClassifier', or 'SVC'.
       
        **kwargs : Arguments passed to clf.

        Returns
        -------
        self
            The fitted object; the estimated best policy on the test data is stored in `est_best_policy`.
        
        """    
    
        XY_train, lb_fit = create_interactions(data.X_train, data.y_train_logging)
        y_train_logging_u = np.unique(data.y_train_logging)
        self.train_pred_reward_arr = np.zeros(shape=[data.X_train.shape[0], len(y_train_logging_u)]) 
        self.test_pred_reward_arr = np.zeros(shape=[data.X_test.shape[0], len(y_train_logging_u)]) 
        m = eval(clf)(**kwargs).fit(XY_train, data.train_logging_reward)
        
        for i, yval in enumerate(y_train_logging_u):
           XY_train_yval, _ = create_interactions(data.X_train, np.repeat(yval, data.X_train.shape[0]), one_hot_labeler = lb_fit)
           XY_test_yval, _ = create_interactions(data.X_test, np.repeat(yval, data.X_test.shape[0]), one_hot_labeler = lb_fit)
           self.train_pred_reward_arr[:,i] = m.predict_proba(XY_train_yval)[:,1]
           self.test_pred_reward_arr[:,i] = m.predict_proba(XY_test_yval)[:,1]
             
        self.est_best_policy = np.array(y_train_logging_u[np.argmax(self.test_pred_reward_arr, axis=1)])
             
        return self
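
A hedged usage sketch (dataset, seed, and `max_iter` are illustrative): fit the reward model on the logged batch and compare the learned policy against the held-out supervised labels.

np.random.seed(7)
X, y = get_data(dataset='wine')
batch = STBT().generate_batch(X, y, max_iter=1000)
rp = RewardPredictor().learn_policy(data=batch, max_iter=1000)
print(rp.error_rate(rp.est_best_policy, batch.y_test))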
    
class OutcomeWeightedLearning(EvaluationMetrics):
    """
    Performs policy learning by transforming the learning problem into a 
    weighted multi-class classification problem. 
    
    
    References
    ----------
    
    .. [1] Y. Zhao, D. Zeng, A.J. Rush and M. R. Kosorok, Estimating Individualized Treatment 
           Rules Using Outcome Weighted Learning, Journal of the American Statistical Association, 
           107:499, 1106-1118, 2012, DOI: 10.1080/01621459.2012.695674.
    """
        
    def __init__(self) -> None:
        pass
     
    def __repr__(self) -> str:
    
        items = ("%s = %r" % (k, v) for k, v in self.__dict__.items())
        return "<%s: {%s}>" % (self.__class__.__name__, ', '.join(items))
    
    def learn_policy(self, data, clf: str = 'SVC', **kwargs) -> None:
    
        """
        Parameters
        ----------
        data : STBT object
            This must be a Supervised to Bandit Transform (STBT) class with fitted 
            `generate_batch` method.
            
        clf : str, default: 'SVC'
            A sklearn classification estimator. Must be one of 'LogisticRegression', 
            'LogisticRegressionCV', 'RandomForestClassifier', or 'SVC'.
       
        **kwargs : Arguments passed to clf.

        Returns
        -------
        self
            The fitted object; the estimated best policy on the test data is stored in `est_best_policy`.
        
        """    
        
        wt = data.train_logging_reward / data.train_logging_prob
        
        if clf in ['SVC', 'RandomForestClassifier']:
            m = eval(clf)(**kwargs).fit(data.X_train, data.y_train_logging, sample_weight = wt)
        elif clf in ['LogisticRegression', 'LogisticRegressionCV']:
            m = eval(clf)(multi_class='multinomial', **kwargs).fit(data.X_train, data.y_train_logging, sample_weight = wt)
        
        self.est_best_policy = m.predict(data.X_test)
        
        return self
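
Usage follows the same pattern (a sketch; the default SVC classifier is used and the dataset/seed are illustrative):

np.random.seed(7)
X, y = get_data(dataset='wine')
batch = STBT().generate_batch(X, y, max_iter=1000)
owl = OutcomeWeightedLearning().learn_policy(data=batch)
print(owl.error_rate(owl.est_best_policy, batch.y_test))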
         

class VowpalWabbit(EvaluationMetrics):
    """
    Performs policy learning using Vowpal Wabbit. 
    
    Parameters
    ----------
    method : str, default: 'dr'
        The policy evaluation approach used to optimize the policy. Vowpal Wabbit 
        offers four ways to specify the contextual bandit estimator:
            * Inverse Propensity Score: 'ips'
            * Doubly Robust: 'dr'
            * Direct Method: 'dm'
            * Multi Task Regression/Importance Weighted Regression: 'mtr' 
       
    References
    ----------
    
    .. [1] A. Bietti and A. Agarwal and J. Langford, A Contextual Bandit Bake-off, 
        arXiv preprint arXiv:1802.04064, 2018.
    
    """

    def __init__(self, method = 'dr') -> None:
        self.method = method
     
    def __repr__(self) -> str:
    
        items = ("%s = %r" % (k, v) for k, v in self.__dict__.items())
        return "<%s: {%s}>" % (self.__class__.__name__, ', '.join(items))
    
    def _train_vw(self, data):
        n_actions = len(np.unique(data.y_train_logging))
        # Declare a contextual bandit problem with `n_actions` actions (--cb) and
        # select the estimator used for policy optimization via --cb_type.
        vw = pyvw.vw("--cb " + str(n_actions) + " --cb_type " + self.method)
        for i in range(data.X_train.shape[0]):
            action = data.y_train_logging[i]
            cost = 1 - data.train_logging_reward[i] # VW expects a cost rather than a reward
            probability = data.train_logging_prob[i]
            train_features_ls = list()
            for f in range(data.X_train.shape[1]):
                train_features_ls.append(str(data.X_train[i, f]))
            train_features = " ".join(train_features_ls)
            learn_example = str(action) + ":" + str(cost) + ":" + str(probability) + " | " + train_features
            vw.learn(learn_example) 
        return vw
    
    def _predict_vw(self, vw_object, data):
        predictions = list()
        for i in range(data.X_test.shape[0]):
            # Rebuild the feature list for each test example; otherwise features
            # from previous examples would accumulate in the VW input string.
            test_features_ls = list()
            for f in range(data.X_test.shape[1]):
                test_features_ls.append(str(data.X_test[i, f]))
            features = " ".join(test_features_ls)
            test_example = " | " + features
            pred = vw_object.predict(test_example) 
            predictions.append(pred)
        predictions = np.array(predictions)
    
        return predictions 
        
    
    def learn_policy(self, data) -> None:
        """
        Parameters
        ----------
        data : STBT object
            This must be a Supervised to Bandit Transform (STBT) class with fitted 
            `generate_batch` method.
            
        Returns
        -------
        self
            The fitted object; the estimated best policy on the test data is stored in `est_best_policy`.
        
        """ 
        
        vw_fit = self._train_vw(data)
        self.est_best_policy = self._predict_vw(vw_fit, data)
        
        return self

class CounterfactualRiskMinimization(EvaluationMetrics):
    """
    Performs policy learning using the Counterfactual Risk Minimization 
    approach proposed in [1], and later refined in [2].
    
    Parameters
    ----------
    
    batch_size : int, default: 96
        The number of samples per batch to load.
    learning_rate : float, default: 0.001
        Stochastic gradient descent learning rate.
    weight_decay : float, default: 0.001
        L2 regularization on parameters.
    lambda_ : float, default: 0.5
        Variance regularization. Penalty on the variance of the 
        learnt policy relative to the logging policy.
    self_normalize : bool, default: True
        Whether to self-normalize the IPS estimator. See [2].
    clipping : float, default: 100.
        Clipping of the importance sampling weights. See [1].
    verbose : bool, default: False
        Whether to print the POEM loss during training.
    
    References
    ----------
    
    .. [1] A. Swaminathan and T. Joachims, Batch Learning from Logged Bandit Feedback through 
            Counterfactual Risk Minimization, Journal of Machine Learning Research, 16(52),
            1731--1755, 2015.
    .. [2] A. Swaminathan and T. Joachims, The self-normalized estimator for counterfactual learning, 
            Advances in Neural Information Processing Systems, 28, 16(52), 3231--3239, 2015.
    .. [3] A. Swaminathan, T. Joachims, and M. de Rijke, Deep Learning with Logged Bandit Feedback, 
            International Conference on Learning Representations,  2018.
    
    """

    
    def __init__(self, batch_size: int = 96, learning_rate: float = 0.001, weight_decay: float = 0.001, 
                 lambda_: float = 0.5, self_normalize: bool = True, clipping : float = 100.,
                 verbose: bool = False) -> None:
        
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.lambda_ = lambda_
        self.self_normalize = self_normalize
        self.clipping = clipping 
        self.verbose = verbose
       
    def __repr__(self) -> str:
    
        items = ("%s = %r" % (k, v) for k, v in self.__dict__.items())
        return "<%s: {%s}>" % (self.__class__.__name__, ', '.join(items))
    
    def _poem_loss(self, pi, p0, r, y_idx, Lambda):
        
        # Guard against an all-zero-reward batch, which would make the
        # self-normalized estimate degenerate; use a tiny constant reward instead.
        if torch.sum(r) == 0: 
            r = torch.repeat_interleave(torch.tensor(1e-05, dtype=torch.float), len(r))
        
        bsz = pi.shape[0]
        softmax_pi = F.softmax(pi, dim=1)
        pi_i = softmax_pi.masked_select(y_idx)
        log_importance = torch.log(pi_i) - torch.log(p0) 
        importance = torch.exp(log_importance)    
        clip_importance_vals = torch.repeat_interleave(torch.tensor(self.clipping, dtype=torch.float), len(importance))
        importance = torch.min(clip_importance_vals, importance)
        off_policy_est = torch.mul(importance, r)
        # Eq.(8) in [2] 
        var_n = torch.sum(torch.mul(torch.pow(torch.sub(r, off_policy_est), 2), torch.pow(torch.div(pi_i, p0), 2)))
        var_d = torch.pow(torch.sum(torch.div(pi_i, p0)), 2)
        empirical_var = torch.div(var_n, var_d)
        if self.self_normalize:
            effective_sample_size = torch.sum(importance).detach() # turns off requires grad
            mean_off_policy_est = torch.div(torch.sum(off_policy_est), effective_sample_size) 
        else:
            mean_off_policy_est = torch.mean(off_policy_est)
        
        penalty = torch.mul(Lambda, torch.sqrt(torch.div(empirical_var, bsz)))
        loss = torch.mul(-1.0, mean_off_policy_est) + penalty
        
        return loss
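
    # In symbols, `_poem_loss` minimizes the negative (clipped, optionally
    # self-normalized) IPS estimate plus a variance penalty. A sketch of the
    # per-batch objective, with w_i = min(M, pi_w(a_i|x_i) / p0(a_i|x_i)) the
    # clipped importance weight, M the clipping constant and n the batch size:
    #
    #     loss  = -V_hat + lambda * sqrt(Var_hat / n)
    #     V_hat = sum_i(w_i * r_i) / sum_i(w_i)    if self_normalize
    #     V_hat = mean_i(w_i * r_i)                otherwise
    #
    # Var_hat is the empirical variance term of Eq. (8) in [2], computed from the
    # unclipped ratios pi_w(a_i|x_i) / p0(a_i|x_i).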

   
    
    def learn_policy(self, model, data, epochs: int = 500) -> None:

        """
        Parameters
        ----------
        data : STBT object
            This must be a Supervised to Bandit Transform (STBT) class with fitted 
            `generate_batch` method.
        epochs : int, default: 500
            Number of training epochs.
            
        Returns
        -------
        self
            The fitted object; the estimated best policy on the test data is stored in `est_best_policy`.
        
        """ 
        
        train_ds = BanditDataset(torch.from_numpy(data.X_train).float(),
                                 torch.from_numpy(data.y_train_logging).long(), 
                                 torch.from_numpy(data.train_logging_prob).float(), 
                                 torch.from_numpy(data.train_logging_reward).long(),
                                 torch.from_numpy(data.y_train_logging_idx).bool())
        
        
        n_features = train_ds.X.shape[1]
        actions = torch.unique(train_ds.y)
        n_actions = len(actions)
       
        train_dl = DataLoader(train_ds, self.batch_size)
        
        Model = model(n_features = n_features, n_actions = n_actions)
        
        optimizer = torch.optim.Adam(Model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)  
       
        for epoch in range(epochs):
            Model.train()
            train_epoch_loss = 0.
            for x_batch,y_batch,p0_batch,r_batch,y_idx_batch in train_dl:
                pi = Model(x_batch)
                loss = self._poem_loss(pi, p0_batch, r_batch, y_idx_batch, self.lambda_)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                train_epoch_loss += loss.item()
            if self.verbose:
                if epoch % 100 == 0:
                    print(f'Epoch {epoch}: | Train Poem Loss: {train_epoch_loss/len(train_dl):.5f}')
            
        Model.eval()
        with torch.no_grad():
            X_test = torch.from_numpy(data.X_test).float()
            pred = Model(X_test)
            est_best_policy = actions[torch.argmax(pred, dim=1)]
            
        self.est_best_policy = est_best_policy.numpy()
         
        return self
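
A minimal training sketch (the dataset, seed, `lambda_`, and epoch count are illustrative):

np.random.seed(7)
X, y = get_data(dataset='wine')
batch = STBT().generate_batch(X, y, max_iter=1000)
crm = CounterfactualRiskMinimization(lambda_=0.1, verbose=True)
crm.learn_policy(model=LinearModel, data=batch, epochs=200)
print(crm.error_rate(crm.est_best_policy, batch.y_test))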
    
    

class CounterfactualRiskMinimizationCV(CounterfactualRiskMinimization, EvaluationMetrics):
    """
    Tune variance penalty for Counterfactual Risk Minimization.
    
    Parameters
    ----------
    
    batch_size : int, default: 96
        The number of samples per batch to load.
    learning_rate : float, default: 0.001
        Stochastic gradient descent learning rate.
    weight_decay : float, default: 0.001
        L2 regularization on parameters.
    self_normalize : bool, default: True
        Whether to self-normalize the IPS estimator. See [2].
    clipping : float, default: 100.
        Clipping of the importance sampling weights. See [1].
    verbose : bool, default: False
        Whether to print the POEM loss during training.
    lambda_ : 1D array, optional
        Grid of variance-penalty values; defaults to 10 values spaced 
        logarithmically between 1e-4 and 1e+01.
    
    """
    
    def __init__(self, batch_size: int = 96, learning_rate: float = 0.001, weight_decay: float = 0.001, 
                 self_normalize: bool = True, clipping : float = 100., verbose: bool = False, 
                 lambda_: np.ndarray = None) -> None:
        
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.self_normalize = self_normalize
        self.clipping = clipping 
        self.verbose = verbose
        
        if lambda_ is None:
            self.lambda_ = 10 ** np.linspace(-4., 1., 10) # search in log scale
        else:
            self.lambda_= lambda_
                           
    def __repr__(self) -> str:
    
        items = ("%s = %r" % (k, v) for k, v in self.__dict__.items())
        return "<%s: {%s}>" % (self.__class__.__name__, ', '.join(items))
    
    def _get_params_min_loss(self, x):
        x = x.numpy()
        xmin_idx = np.unravel_index(x.argmin(), x.shape)
        l_best = self.lambda_[xmin_idx[0]]
          
        return l_best

    
    def learn_policy(self, model, data, valid_frac: float = 0.5, epochs: int = 500) -> None:
        """
        Parameters
        ----------
        data : STBT object
            This must be a Supervised to Bandit Transform (STBT) class with fitted 
            `generate_batch` method.
        valid_frac : float, default: 0.5
            Fraction of the training data held out for validation. Test data are not modified. 
        epochs : int, default: 500
            Number of training epochs.
            
        Returns
        -------
        self
            The fitted object; the estimated best policy on the test data is stored in `est_best_policy`.
        
        """ 
        
        self.epochs = epochs
        self.valid_frac = valid_frac
        
        n_train_samples, n_features = data.X_train.shape
        idx_valid_samples = np.random.choice(range(n_train_samples), 
                                              size = int(np.floor(n_train_samples * valid_frac)), replace = False)
        
        train_ds = BanditDataset(torch.from_numpy(np.delete(data.X_train, idx_valid_samples, axis=0)).float(),
                                  torch.from_numpy(np.delete(data.y_train_logging, idx_valid_samples)).long(), 
                                  torch.from_numpy(np.delete(data.train_logging_prob, idx_valid_samples)).float(), 
                                  torch.from_numpy(np.delete(data.train_logging_reward, idx_valid_samples)).long(),
                                  torch.from_numpy(np.delete(data.y_train_logging_idx, idx_valid_samples, axis=0)).bool())
        
        
        valid_ds = BanditDataset(torch.from_numpy(data.X_train[idx_valid_samples, :]).float(),
                                  torch.from_numpy(data.y_train_logging[idx_valid_samples]).long(), 
                                  torch.from_numpy(data.train_logging_prob[idx_valid_samples]).float(), 
                                  torch.from_numpy(data.train_logging_reward[idx_valid_samples]).long(),
                                  torch.from_numpy(data.y_train_logging_idx[idx_valid_samples, :]).bool())
            
        y_train = np.delete(data.y_train, idx_valid_samples, axis=0)
        y_valid = data.y_train[idx_valid_samples]
        X_test = torch.from_numpy(data.X_test).float()
        
        actions = torch.unique(train_ds.y)
        n_actions = len(actions)
       
        train_dl = DataLoader(train_ds, self.batch_size)
        valid_dl = DataLoader(valid_ds, self.batch_size)
        
        Model = model(n_features=n_features, n_actions=n_actions)
        
        optimizer = torch.optim.Adam(Model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)  
           
        self.train_tot_loss_hist = torch.zeros(len(self.lambda_), epochs)   
        self.valid_tot_loss_hist = torch.zeros(len(self.lambda_), epochs)
        self.valid_acc = torch.zeros(len(self.lambda_), epochs) 
        self.train_acc = torch.zeros(len(self.lambda_), epochs) 
        self.test_acc = torch.zeros(len(self.lambda_), epochs) 
    
        for l_idx, l in enumerate(self.lambda_):
       
            for epoch in range(epochs):
                Model.train()
                train_epoch_loss = 0.
                for x_batch,y_batch,p0_batch,r_batch,y_idx_batch in train_dl:
                    pi = Model(x_batch)
                    loss = self._poem_loss(pi, p0_batch, r_batch, y_idx_batch, l)
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                    train_epoch_loss += loss.item()
                self.train_tot_loss_hist[l_idx, epoch] = train_epoch_loss/len(train_dl)
                if self.verbose:
                    if epoch % 100 == 0:
                        print(f'Epoch: {epoch} | Train Poem Loss: {train_epoch_loss/len(train_dl):.5f}')
                
                Model.eval()
                with torch.no_grad():
                    valid_tot_loss=0.
                    for x_batch,y_batch,p0_batch,r_batch,y_idx_batch in valid_dl:
                        pi = Model(x_batch)
                        valid_loss = self._poem_loss(pi, p0_batch, r_batch, y_idx_batch, l)
                        valid_tot_loss += valid_loss.item()
                self.valid_tot_loss_hist[l_idx, epoch] = valid_tot_loss/len(valid_dl)
                if self.verbose:
                      if epoch % 100 == 0:
                          print(f'Epoch: {epoch} | Valid Poem Loss: {valid_tot_loss/len(valid_dl):.5f}')
                
                pred_train = Model(train_ds.X)
                est_best_policy_train = actions[torch.argmax(pred_train, dim=1)]
                est_best_policy_train = est_best_policy_train.numpy()
                self.train_acc[l_idx, epoch] = self.error_rate(est_best_policy_train, y_train)
                pred_valid = Model(valid_ds.X)
                est_best_policy_valid = actions[torch.argmax(pred_valid, dim=1)]
                est_best_policy_valid = est_best_policy_valid.numpy()
                self.valid_acc[l_idx, epoch] = self.error_rate(est_best_policy_valid, y_valid)
                
                pred_test = Model(X_test)
                est_best_policy_test = actions[torch.argmax(pred_test, dim=1)]
                est_best_policy_test = est_best_policy_test.numpy()
                self.test_acc[l_idx, epoch] = self.error_rate(est_best_policy_test, data.y_test)
            
                          
        self.l_best = self._get_params_min_loss(self.valid_tot_loss_hist)
            
    
        
        crm = CounterfactualRiskMinimization(lambda_=self.l_best, batch_size = self.batch_size,
                                              learning_rate = self.learning_rate, weight_decay = self.weight_decay,
                                              clipping = self.clipping, self_normalize = self.self_normalize, verbose = self.verbose)
         
        crm.learn_policy(model=model, data=data, epochs=epochs) 
        self.est_best_policy = crm.est_best_policy
                
        return self
    
    def plot_cv_loss(self):
         
        train_loss_flatten = self.train_tot_loss_hist.T.flatten(1).numpy()
        valid_loss_flatten = self.valid_tot_loss_hist.T.flatten(1).numpy()
        
        train_acc_flatten =  self.train_acc.T.flatten(1).numpy()
        valid_acc_flatten = self.valid_acc.T.flatten(1).numpy()
        test_acc_flatten = self.test_acc.T.flatten(1).numpy()
        
        fig, axs = plt.subplots(2, 3)
        fs = 8
        
        for l_idx, l in enumerate(self.lambda_):
            axs[0, 0].plot(train_loss_flatten[:,l_idx], label = round(l, 4))
            axs[0, 1].plot(valid_loss_flatten[:,l_idx], label = round(l, 4))
            axs[1, 0].plot(train_acc_flatten[:,l_idx], label = round(l, 4))
            axs[1, 1].plot(valid_acc_flatten[:,l_idx], label = round(l, 4))
            axs[1, 2].plot(test_acc_flatten[:,l_idx], label = round(l, 4))
            
        axs[0, 0].set_title("Train: Poem Loss", fontsize=fs)
        axs[0, 1].set_title("Validation: Poem Loss", fontsize=fs)
        axs[1, 0].set_title("Train: Accuracy", fontsize=fs)
        axs[1, 1].set_title("Validation: Accuracy", fontsize=fs)
        axs[1, 2].set_title("Test: Accuracy", fontsize=fs)
        
        for i, ax in enumerate(axs.flat):
            if i < 2:
                ax.set_xlabel(xlabel='Epoch', fontsize=fs)
                ax.set_ylabel(ylabel='Loss', fontsize=fs)
            else:
                ax.set_xlabel(xlabel='Epoch', fontsize=fs)
                ax.set_ylabel(ylabel='Accuracy', fontsize=fs)
        fig.legend(self.lambda_, loc='upper right', fontsize=fs)
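
A usage sketch for the tuned version (the lambda grid, validation fraction, and epoch count are illustrative):

np.random.seed(7)
X, y = get_data(dataset='wine')
batch = STBT().generate_batch(X, y, max_iter=1000)
crm_cv = CounterfactualRiskMinimizationCV(lambda_=10 ** np.linspace(-3., 0., 4))
crm_cv.learn_policy(model=LinearModel, data=batch, valid_frac=0.3, epochs=200)
print(crm_cv.l_best)                                          # selected variance penalty
print(crm_cv.error_rate(crm_cv.est_best_policy, batch.y_test))
crm_cv.plot_cv_loss()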

Alternative using Doubly Robust (as opposed to the IPS estimator)

# class CounterfactualRiskMinimization(EvaluationMetrics):
#     """
#     Performs policy learning using the Counterfactual Risk Minimization 
#     approach proposed in [1], and later refined in [2].
    
#     Parameters
#     ----------
#     batch_size : int, default: 96
#         The number of samples per batch to load 
#     learning_rate : float, default: 0.01
#         Stochastic gradient descent learning rate 
#     weight_decay : float, default: 0.001
#         L2 regularization on parameters
#     lambda_ : float, default: 0.1
#         Variance regularization. Penalty on the variance of the 
#         learnt policy relative to the logging policy.
#     self_normalize: bool, default: True
#         Whether to normalize the IPS estimator. See [2].
#     clipping: float, default: 100.
#         Clipping the importance sample weights. See [1].
#     verbose: bool, default: False
#         Whether to print Poem Loss during training .
    
#     References
#     ----------
    
#     .. [1] A. Swaminathan and T. Joachims, Batch Learning from Logged Bandit Feedback through 
#            Counterfactual Risk Minimization, Journal of Machine Learning Research, 16(52),
#            1731--1755, 2015.
#     .. [2] A. Swaminathan and T. Joachims, The self-normalized estimator for counterfactual learning, 
#            Advances in Neural Information Processing Systems, 28, 16(52), 3231--3239, 2015.
#     .. [3] A. Swaminathan, T. Joachims, and M. de Rijke, Deep Learning with Logged Bandit Feedback, 
#            International Conference on Learning Representations,  2018.
    
#     """

    
#     def __init__(self, batch_size: int = 96, learning_rate: float = 0.01, weight_decay: float = 0.001, 
#                  lambda_: float = 0.5, self_normalize: bool = True, clipping : float = 100.,
#                  verbose: bool = False) -> None:
#         self.batch_size = batch_size
#         self.learning_rate = learning_rate
#         self.weight_decay = weight_decay
#         self.lambda_ = lambda_
#         self.self_normalize = self_normalize
#         self.verbose = verbose
#         self.clipping = clipping 
     
#     def __repr__(self) -> str:
    
#         items = ("%s = %r" % (k, v) for k, v in self.__dict__.items())
#         return "<%s: {%s}>" % (self.__class__.__name__, ', '.join(items))
  
#     def _poem_loss(self, pi, p0, r, r_pred, y_idx, Lambda, self_normalize):
        
#         #if torch.sum(r) == 0: 
#         #    r = torch.repeat_interleave(torch.tensor(1e-05, dtype=torch.float), len(r))
        
        
#         bsz = pi.shape[0]
#         softmax_pi = F.softmax(pi, dim=1)
#         pi_i = softmax_pi.masked_select(y_idx)
#         r_pred_i = r_pred.masked_select(y_idx)
#         importance = torch.div(pi_i, p0) 
#         clip_importance_vals = torch.repeat_interleave(torch.tensor(self.clipping, dtype=torch.float), len(importance))
#         importance_clipped = torch.min(clip_importance_vals, importance)
#         reward_residual = torch.sub(r, r_pred_i)
#         weighted_reward_pred = torch.sum(torch.mul(softmax_pi, r_pred), dim=1)
#         off_policy_est = torch.add(torch.mul(importance_clipped, reward_residual), weighted_reward_pred)
#         empirical_var = torch.var(off_policy_est)
        
#         if self_normalize:
#             effective_sample_size = torch.sum(importance_clipped).detach() # turns off requires grad
#             sum_off_policy_est = torch.div(torch.sum(off_policy_est), effective_sample_size) 
#         else:
#             sum_off_policy_est = torch.sum(off_policy_est)
        
#         penalty = torch.mul(Lambda, torch.sqrt(torch.div(empirical_var, bsz)))
#         loss = torch.mul(-1.0, sum_off_policy_est) + penalty
             
#         return loss

    
#     def learn_policy(self, model, data, epochs: int = 500) -> None:

#         """
#         Parameters
#         ----------
#         data : STBT object
#             This must be a Supervised to Bandit Transform (STBT) class with fitted 
#             `generate_batch` method.
#         epochs : int, default 
#             Number of training epochs.
            
#         Returns
#         -------
#         int.
#           The predicted best policy.
        
#         """ 
        
#         rp = RewardPredictor().learn_policy(data=data, max_iter=1000)
        
#         train_ds = BanditDataset(torch.from_numpy(data.X_train).float(),
#                                  torch.from_numpy(data.y_train_logging).long(), 
#                                  torch.from_numpy(data.train_logging_prob).float(), 
#                                  torch.from_numpy(data.train_logging_reward).long(),
#                                  torch.from_numpy(data.y_train_logging_idx).bool(),
#                                  torch.from_numpy(rp.train_pred_reward_arr).float() 
#                                  )
        
        
#         n_features = train_ds.X.shape[1]
#         actions = torch.unique(train_ds.y)
#         n_actions = len(actions)
       
#         train_dl = DataLoader(train_ds, self.batch_size)
        
#         Model = model(n_features = n_features, n_actions = n_actions)
        
#         optimizer = torch.optim.SGD(Model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)  
        
#         for epoch in range(epochs):
#             Model.train()
#             train_epoch_loss = 0.
#             for x_batch, y_batch, p0_batch, r_batch, y_idx_batch, r_pred_batch in train_dl:
#                 pi = Model(x_batch)
#                 loss = self._poem_loss(pi, p0_batch, r_batch, r_pred_batch, y_idx_batch, self.lambda_, self.self_normalize)
#                 loss.backward()
#                 optimizer.step()
#                 optimizer.zero_grad()
#                 train_epoch_loss += loss.item()
#             if self.verbose:
#                 print(f'Epoch {epoch}: | Train Poem Loss: {train_epoch_loss/len(train_dl):.5f}')
            
#         Model.eval()
#         with torch.no_grad():
#            X_test = torch.from_numpy(data.X_test).float()
#            pred = Model(X_test)
#            est_best_policy = actions[torch.argmax(pred, dim=1)]
            
#         self.est_best_policy = est_best_policy.numpy()
         
#         return self
    
    

# class CounterfactualRiskMinimizationCV(CounterfactualRiskMinimization, EvaluationMetrics):
#     """
#     Tune variance regularizer for Counterfactual Risk Minimization.
    
#     Parameters
#     ----------
    
#     batch_size : int, default: 96
#         The number of samples per batch to load.
#     learning_rate : float, default: 0.01
#         Stochastic gradient descent learning rate.
#     weight_decay : float, default: 0.001
#         L2 regularization on parameters.
#     clipping: float, default: 100.
#         Clipping the importance sample weights. See [1].
#     self_normalize: bool, default: True
#         Whether to normalize the IPS estimator. See [2].
#     verbose: bool, default: False
#         Whether to print the Poem Loss during training.
#     lambda_ : 1D array, optional
#         Grid of variance-regularization values to search over; defaults to
#         10 values spaced logarithmically between 1e-04 and 1e+01.
    
#     """
    
#     def __init__(self, batch_size: int = 96, learning_rate: float = 0.01, weight_decay: float = 0.001, 
#                  clipping : float = 100., self_normalize: bool = True, verbose: bool = False, 
#                  lambda_: np.ndarray = None) -> None:
        
#         self.batch_size = batch_size
#         self.learning_rate = learning_rate
#         self.weight_decay = weight_decay
#         self.clipping = clipping 
#         self.self_normalize = self_normalize
#         self.verbose = verbose
        
#         if lambda_ is None:
#             self.lambda_ = 10 ** np.linspace(-4., 1., 10) # search in log scale
#         else:
#             self.lambda_= lambda_
                           
#     def __repr__(self) -> str:
    
#         items = ("%s = %r" % (k, v) for k, v in self.__dict__.items())
#         return "<%s: {%s}>" % (self.__class__.__name__, ', '.join(items))
    
#     def _get_params_min_loss(self, x):
#         x = x.numpy()
#         xmin_idx = np.unravel_index(x.argmin(), x.shape)
#         l_best = self.lambda_[xmin_idx[0]]
          
#         return l_best

    
#     def learn_policy(self, model, data, valid_frac: float = 0.5, epochs: int = 500) -> None:
#         """
#         Parameters
#         ----------
#         model : class
#             The policy model class; it is instantiated as 
#             `model(n_features=..., n_actions=...)`.
#         data : STBT object
#             This must be a Supervised to Bandit Transform (STBT) object with a fitted 
#             `generate_batch` method.
#         valid_frac : float, default: 0.5
#             Fraction of the training data held out for validation. Test data are not modified.
#         epochs : int, default: 500
#             Number of training epochs.

#         Returns
#         -------
#         self
#             The fitted object; the predicted best policy (an array of actions
#             for the test set) is stored in `est_best_policy`.
        
#         """ 
        
#         self.epochs = epochs
#         self.valid_frac = valid_frac
        
#         rp = RewardPredictor().learn_policy(data=data, max_iter=1000)
        
#         n_train_samples, n_features = data.X_train.shape
#         idx_valid_samples = np.random.choice(range(n_train_samples), 
#                                              size = int(np.floor(n_train_samples * valid_frac)), replace = False)
        
#         train_ds = BanditDataset(torch.from_numpy(np.delete(data.X_train, idx_valid_samples, axis=0)).float(),
#                                  torch.from_numpy(np.delete(data.y_train_logging, idx_valid_samples)).long(), 
#                                  torch.from_numpy(np.delete(data.train_logging_prob, idx_valid_samples)).float(), 
#                                  torch.from_numpy(np.delete(data.train_logging_reward, idx_valid_samples)).long(),
#                                  torch.from_numpy(np.delete(data.y_train_logging_idx, idx_valid_samples, axis=0)).bool(), 
#                                  torch.from_numpy(np.delete(rp.train_pred_reward_arr, idx_valid_samples, axis=0)).float()
#                                  )
        
        
#         valid_ds = BanditDataset(torch.from_numpy(data.X_train[idx_valid_samples, :]).float(),
#                                  torch.from_numpy(data.y_train_logging[idx_valid_samples]).long(), 
#                                  torch.from_numpy(data.train_logging_prob[idx_valid_samples]).float(), 
#                                  torch.from_numpy(data.train_logging_reward[idx_valid_samples]).long(),
#                                  torch.from_numpy(data.y_train_logging_idx[idx_valid_samples, :]).bool(),
#                                  torch.from_numpy(rp.train_pred_reward_arr[idx_valid_samples, :]).float()
#                                  )
            
#         y_train = np.delete(data.y_train, idx_valid_samples, axis=0)
#         y_valid = data.y_train[idx_valid_samples]
#         X_test = torch.from_numpy(data.X_test).float()
        
#         actions = torch.unique(train_ds.y)
#         n_actions = len(actions)
       
#         train_dl = DataLoader(train_ds, self.batch_size)
#         valid_dl = DataLoader(valid_ds, self.batch_size)
        
#         Model = model(n_features=n_features, n_actions=n_actions)
        
#         optimizer = torch.optim.SGD(Model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)  
           
#         self.train_tot_loss_hist = torch.zeros(len(self.lambda_), epochs)   
#         self.valid_tot_loss_hist = torch.zeros(len(self.lambda_), epochs)
#         self.valid_acc = torch.zeros(len(self.lambda_), epochs) 
#         self.train_acc = torch.zeros(len(self.lambda_), epochs) 
#         self.test_acc = torch.zeros(len(self.lambda_), epochs) 
    
#         for l_idx, l in enumerate(self.lambda_):
       
#             for epoch in range(epochs):
#                 Model.train()
#                 train_epoch_loss = 0.
#                 for x_batch, y_batch, p0_batch,r_batch,y_idx_batch,r_pred_batch in train_dl:
#                     pi = Model(x_batch)
#                     loss = self._poem_loss(pi, p0_batch, r_batch, r_pred_batch, y_idx_batch, l, self.self_normalize)
#                     loss.backward()
#                     optimizer.step()
#                     optimizer.zero_grad()
#                     train_epoch_loss += loss.item()
#                 self.train_tot_loss_hist[l_idx, epoch] = train_epoch_loss/len(train_dl)
#                 if self.verbose:
#                     print(f'Epoch: {epoch} | Train Poem Loss: {train_epoch_loss/len(train_dl):.5f}')
                
#                 Model.eval()
#                 with torch.no_grad():
#                     valid_tot_loss=0.
#                     for x_batch,y_batch,p0_batch,r_batch,y_idx_batch,r_pred_batch in valid_dl:
#                         pi = Model(x_batch)
#                         valid_loss = self._poem_loss(pi, p0_batch, r_batch, r_pred_batch, y_idx_batch, l, self.self_normalize)
#                         valid_tot_loss += valid_loss.item()
#                 self.valid_tot_loss_hist[l_idx, epoch] = valid_tot_loss/len(valid_dl)
#                 if self.verbose:
#                       print(f'Epoch: {epoch} | Valid Poem Loss: {valid_tot_loss/len(valid_dl):.5f}')
                
#                 pred_train = Model(train_ds.X)
#                 est_best_policy_train = actions[torch.argmax(pred_train, dim=1)]
#                 est_best_policy_train = est_best_policy_train.numpy()
#                 self.train_acc[l_idx, epoch] = self.error_rate(est_best_policy_train, y_train)
#                 pred_valid = Model(valid_ds.X)
#                 est_best_policy_valid = actions[torch.argmax(pred_valid, dim=1)]
#                 est_best_policy_valid = est_best_policy_valid.numpy()
#                 self.valid_acc[l_idx, epoch] = self.error_rate(est_best_policy_valid, y_valid)
                
#                 pred_test = Model(X_test)
#                 est_best_policy_test = actions[torch.argmax(pred_test, dim=1)]
#                 est_best_policy_test = est_best_policy_test.numpy()
#                 self.test_acc[l_idx, epoch] = self.error_rate(est_best_policy_test, data.y_test)
            
                          
#         self.l_best = self._get_params_min_loss(self.valid_tot_loss_hist)
            
    
        
#         crm = CounterfactualRiskMinimization(lambda_=self.l_best, batch_size = self.batch_size,
#                                              learning_rate = self.learning_rate, weight_decay = self.weight_decay,
#                                              clipping = self.clipping, self_normalize = self.self_normalize, verbose = self.verbose)
        
#         crm.learn_policy(model=model, data=data, epochs=epochs) 
#         self.est_best_policy = crm.est_best_policy
                
#         return self
    
#     def plot_cv_loss(self):
         
#         train_loss_flatten = self.train_tot_loss_hist.T.flatten(1).numpy()
#         valid_loss_flatten = self.valid_tot_loss_hist.T.flatten(1).numpy()
        
#         train_acc_flatten =  self.train_acc.T.flatten(1).numpy()
#         valid_acc_flatten = self.valid_acc.T.flatten(1).numpy()
#         test_acc_flatten = self.test_acc.T.flatten(1).numpy()
        
#         fig, axs = plt.subplots(2, 3)
#         fs = 8
        
#         for l_idx, l in enumerate(self.lambda_):
#             axs[0, 0].plot(train_loss_flatten[:,l_idx], label = round(l, 4))
#             axs[0, 1].plot(valid_loss_flatten[:,l_idx], label = round(l, 4))
#             axs[1, 0].plot(train_acc_flatten[:,l_idx], label = round(l, 4))
#             axs[1, 1].plot(valid_acc_flatten[:,l_idx], label = round(l, 4))
#             axs[1, 2].plot(test_acc_flatten[:,l_idx], label = round(l, 4))
            
#         axs[0, 0].set_title("Train: Poem Loss", fontsize=fs)
#         axs[0, 1].set_title("Validation: Poem Loss", fontsize=fs)
#         axs[1, 0].set_title("Train: Accuracy", fontsize=fs)
#         axs[1, 1].set_title("Validation: Accuracy", fontsize=fs)
#         axs[1, 2].set_title("Test: Accuracy", fontsize=fs)
        
#         for i, ax in enumerate(axs.flat):
#             if i < 2:
#                 ax.set_xlabel(xlabel='Epoch', fontsize=fs)
#                 ax.set_ylabel(ylabel='Loss', fontsize=fs)
#             else:
#                 ax.set_xlabel(xlabel='Epoch', fontsize=fs)
#                 ax.set_ylabel(ylabel='Accuracy', fontsize=fs)
#         fig.legend(self.lambda_, loc='upper right', fontsize=fs)

Read data

np.random.seed(1)
X, y = get_data(dataset= 'glass')

Perform Supervised-to-Bandit Conversion

Performs Supervised to Bandit Conversion for classification datasets. This conversion is generally used to test the limits of counterfactual learning in a well-controlled environment.

Here, we take a supervised dataset with features x and labeled classes y and simulate a bandit-feedback dataset from a logging policy. This involves (i) simulating a stochastic logging policy, which is either uniform over the labels (logging_type='uniform') or a stochastic function of the covariates (logging_type='biased'), and (ii) recording a positive reward whenever the action chosen by the logging policy matches the optimal action (the true label), and no reward otherwise.
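
For concreteness, the sketch below illustrates the 'biased' variant of this conversion. The function simulate_bandit_feedback and its internals are illustrative assumptions only, not the STBT API.

import numpy as np
from sklearn.linear_model import LogisticRegression

def simulate_bandit_feedback(X, y, seed=0):
    # Illustrative sketch only (not the STBT API): a stochastic logging policy
    # that depends on the covariates, i.e. the "biased" setting described above.
    rng = np.random.default_rng(seed)
    logging_model = LogisticRegression(max_iter=1000).fit(X, y)
    probs = logging_model.predict_proba(X)        # pi_0(a | x) for every action a
    classes = logging_model.classes_
    # Sample the logged action and record its propensity under the logging policy.
    a_idx = np.array([rng.choice(len(classes), p=p / p.sum()) for p in probs])
    a_logged = classes[a_idx]
    propensity = probs[np.arange(len(X)), a_idx]
    # Positive reward only when the logged action equals the true label.
    reward = (a_logged == y).astype(int)
    return a_logged, propensity, reward

# a_logged, p0, r = simulate_bandit_feedback(X, y)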

data = STBT(train_frac= 0.5, logging_type='biased').generate_batch(X, y, max_iter=1000)

Skyline

Best possible error rate, assuming full feedback is available. This can only be computed in the simulation, since in practice we only observe bandit feedback.

clf = LogisticRegressionCV(multi_class='multinomial', max_iter=2000).fit(data.X_train, data.y_train)
optimal_policy = clf.predict(data.X_test)
print("Skyline Error:", EvaluationMetrics.error_rate(optimal_policy, data.y_test))
Skyline Error: 0.30841121495327106

Reward Predictor (RP)

The reward predictor (the "direct method") fits a model of the observed reward as a function of context and action on the logged data, and then, for each test context, chooses the action with the highest predicted reward.

rp = RewardPredictor()
rp.learn_policy(data, max_iter=1000)
print("Reward Predictor Error:", rp.error_rate(rp.est_best_policy, data.y_test))
Reward Predictor Error: 0.7009345794392523
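
For intuition, a minimal sketch of this direct-method recipe follows. The function direct_method and its details (appending the raw action as an extra feature, assuming numeric action labels) are illustrative assumptions and not the RewardPredictor implementation.

import numpy as np
from sklearn.linear_model import LogisticRegression

def direct_method(X, a_logged, reward, X_test, actions):
    # Illustrative sketch only: model the reward from (context, action) pairs.
    # A fuller implementation would one-hot encode the action and add interactions.
    Z = np.column_stack((X, a_logged))            # assumes numeric action labels
    reward_model = LogisticRegression(max_iter=1000).fit(Z, reward)
    # Score every candidate action for each test context and act greedily.
    scores = np.column_stack([
        reward_model.predict_proba(np.column_stack((X_test, np.full(len(X_test), a))))[:, 1]
        for a in actions
    ])
    return np.asarray(actions)[np.argmax(scores, axis=1)]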

Outcome Weighted Learning (OWL)

Outcome weighted learning [4] casts policy learning as reward-weighted classification: the logged action is treated as the label and each example is weighted by its reward over its logging propensity. Here the weighted classifier is a cross-validated logistic regression.

owl = OutcomeWeightedLearning()
owl.learn_policy(data, clf = 'LogisticRegressionCV', max_iter=1000)
print("OWL-LR:", owl.error_rate(owl.est_best_policy, data.y_test))
OWL-LR: 0.7289719626168225
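
A minimal sketch of this recipe, under simplifying assumptions (binary reward, logged propensities available); outcome_weighted_learning is illustrative and not necessarily how OutcomeWeightedLearning is implemented here.

from sklearn.linear_model import LogisticRegression

def outcome_weighted_learning(X, a_logged, reward, propensity, X_test):
    # Illustrative sketch only: reward-weighted classification in the spirit of [4].
    # The logged action is the label; weighting by reward / propensity up-weights
    # actions that earned a reward despite being unlikely under the logging policy.
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X, a_logged, sample_weight=reward / propensity)
    return clf.predict(X_test)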

Counterfactual Risk Minimization (CRM)
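
The learner minimizes a POEM-style objective [1,2]: a clipped, doubly-robust, (optionally) self-normalized IPS estimate of the policy value, penalized by its empirical variance. Reading off the `_poem_loss` implementation above (where `pi` plays the role of $\pi_w$, `p0` of $p_0$, `r_pred` of $\hat{r}$, `clipping` of $M$, and `lambda_` of $\lambda$), the per-batch objective is approximately

$$
u_i = \underbrace{\min\!\Big(M,\ \frac{\pi_w(y_i \mid x_i)}{p_0(y_i \mid x_i)}\Big)}_{w_i}\,\big(r_i - \hat{r}(x_i, y_i)\big) + \sum_{a}\pi_w(a \mid x_i)\,\hat{r}(x_i, a),
\qquad
\mathcal{L} = -\,\frac{\sum_{i=1}^{n} u_i}{\sum_{i=1}^{n} w_i} + \lambda \sqrt{\frac{\widehat{\mathrm{Var}}(u)}{n}},
$$

where $n$ is the batch size and the denominator $\sum_i w_i$ is replaced by $1$ when `self_normalize=False`. Dividing by the (detached) sum of clipped weights acts as an effective sample size and is the self-normalization advocated in [2].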

crm = CounterfactualRiskMinimization(verbose=True, lambda_ = 1e-06)
crm.learn_policy(model=LinearModel, data=data, epochs = 2000)
print("CRM:", crm.error_rate(crm.est_best_policy, data.y_test))
Epoch 0: | Train Poem Loss: -0.10087
Epoch 100: | Train Poem Loss: -0.23397
Epoch 200: | Train Poem Loss: -0.37680
Epoch 300: | Train Poem Loss: -0.49386
Epoch 400: | Train Poem Loss: -0.54733
Epoch 500: | Train Poem Loss: -0.57395
Epoch 600: | Train Poem Loss: -0.58959
Epoch 700: | Train Poem Loss: -0.60032
Epoch 800: | Train Poem Loss: -0.60866
Epoch 900: | Train Poem Loss: -0.61578
Epoch 1000: | Train Poem Loss: -0.62223
Epoch 1100: | Train Poem Loss: -0.62831
Epoch 1200: | Train Poem Loss: -0.63413
Epoch 1300: | Train Poem Loss: -0.63967
Epoch 1400: | Train Poem Loss: -0.64488
Epoch 1500: | Train Poem Loss: -0.64968
Epoch 1600: | Train Poem Loss: -0.65401
Epoch 1700: | Train Poem Loss: -0.65786
Epoch 1800: | Train Poem Loss: -0.66122
Epoch 1900: | Train Poem Loss: -0.66412
CRM: 0.719626168224299

Experiments

## Params
B = 10 # Number of simulations
EPOCHS = 500
LOGGING_TYPE = 'biased'
MODEL = LinearModel
LAMBDA = 1e-06
DATASETS = ['ecoli', 'glass', 'lymphography', 'yeast', 'digits', 'breast-cancer', 'wine', 'letter-recognition']
dat = list()
skyline_error = list()
randomized_error = list()
reward_predictor_error = list()
owl_lrcv_error = list()
crm_error = list()
for s in DATASETS:
    
    X, y = get_data(dataset=s)
    
    for b in range(B):
        if (b % 10) == 0:
            print("Sample: %d - Dataset: %s" % (b, s))
        
        d = STBT(logging_type = LOGGING_TYPE).generate_batch(X, y, max_iter=1000)
        dat.append(s)    
       
        skyline = LogisticRegression(multi_class='multinomial', max_iter=2000).fit(d.X_train, d.y_train)
        optimal_policy = skyline.predict(d.X_test)
        
        rp = RewardPredictor().learn_policy(data=d, max_iter=1000)
        erm_lrcv = OutcomeWeightedLearning().learn_policy(data=d, clf = 'LogisticRegressionCV', max_iter=1000)
        crm = CounterfactualRiskMinimization(lambda_=LAMBDA).learn_policy(model=MODEL, data=d, epochs=EPOCHS)     
        
        skyline_error.append(EvaluationMetrics.error_rate(optimal_policy, d.y_test))
        randomized_error.append(EvaluationMetrics.error_rate(d.y_test_logging, d.y_test))
        reward_predictor_error.append(rp.error_rate(rp.est_best_policy, d.y_test))
        owl_lrcv_error.append(erm_lrcv.error_rate(erm_lrcv.est_best_policy, d.y_test))
        crm_error.append(crm.error_rate(crm.est_best_policy, d.y_test))
Sample: 0 - Dataset: ecoli
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_split.py:667: UserWarning: The least populated class in y has only 4 members, which is less than n_splits=5.
  % (min_groups, self.n_splits)), UserWarning)
Sample: 0 - Dataset: glass
Sample: 0 - Dataset: lymphography
Sample: 0 - Dataset: yeast
/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
Sample: 0 - Dataset: digits
Sample: 0 - Dataset: breast-cancer
Sample: 0 - Dataset: wine
Sample: 0 - Dataset: letter-recognition
res = pd.DataFrame.from_dict({'dataset': dat, 'skyline_error': skyline_error, 'randomized_error': randomized_error, 'reward_predictor_error': reward_predictor_error,
                              'owl_lrcv_error':owl_lrcv_error, 'crm_error':crm_error})

res_summary = res.groupby(['dataset'], as_index=False).agg({
                            'skyline_error': ['mean','std'], 
                            'randomized_error': ['mean','std'], 
                            'reward_predictor_error': ['mean','std'],
                            'owl_lrcv_error': ['mean','std'],
                            'crm_error': ['mean','std']
                            })

res_summary
Test error rate by dataset and method (mean and standard deviation over B = 10 simulations):

dataset              skyline_error        randomized_error     reward_predictor_error   owl_lrcv_error       crm_error
                     mean      std        mean      std        mean      std            mean      std        mean      std
breast-cancer        0.029123  0.009792   0.558596  0.191854   0.026667  0.009813       0.209825  0.162483   0.064561  0.051467
digits               0.036151  0.004104   0.888432  0.054886   0.376529  0.096568       0.529366  0.120459   0.426251  0.111795
ecoli                0.132937  0.018231   0.835714  0.093679   0.278175  0.043127       0.397619  0.178276   0.310714  0.080024
glass                0.401869  0.037642   0.825234  0.061372   0.588785  0.103134       0.591589  0.080883   0.629907  0.093996
letter-recognition   0.228200  0.004190   0.966280  0.011429   0.737560  0.031631       0.713590  0.030353   0.741250  0.028611
lymphography         0.168919  0.032638   0.748649  0.075965   0.295946  0.067553       0.347297  0.111444   0.410811  0.119722
wine                 0.026966  0.022596   0.723596  0.098695   0.065169  0.047611       0.365169  0.104097   0.158427  0.094074
yeast                0.411456  0.011078   0.901482  0.024659   0.516981  0.058388       0.606334  0.062139   0.569946  0.045408

References

[1] A. Swaminathan and T. Joachims, Batch Learning from Logged Bandit Feedback through Counterfactual Risk Minimization, Journal of Machine Learning Research, 16(52), 1731–1755, 2015.

[2] A. Swaminathan and T. Joachims, The self-normalized estimator for counterfactual learning, Advances in Neural Information Processing Systems, 28, 3231–3239, 2015.

[3] A. Swaminathan, T. Joachims, and M. de Rijke, Deep Learning with Logged Bandit Feedback, International Conference on Learning Representations, 2018.

[4] Y. Zhao, D. Zeng, A. J. Rush and M. R. Kosorok, Estimating Individualized Treatment Rules Using Outcome Weighted Learning, Journal of the American Statistical Association, 107(499), 1106–1118, 2012. DOI: 10.1080/01621459.2012.695674.

[5] Y. Wang, A. Agarwal and M. Dudík, Optimal and Adaptive Off-policy Evaluation in Contextual Bandits, Proceedings of Machine Learning Research, 70, 3589–3597, 2017.

[6] M. Dudík, J. Langford and L. Li, Doubly Robust Policy Evaluation and Learning, CoRR, 2011. http://arxiv.org/abs/1103.4601

[7] S. Künzel, J. Sekhon, P. Bickel and B. Yu, Metalearners for estimating heterogeneous treatment effects using machine learning, Proceedings of the National Academy of Sciences, 116(10), 4156–4165, 2019.

[8] Batch Learning from Bandit Feedback (BLBF). https://github.com/leoguelman/BLBF.