Optimal Off-Policy Evaluation from Multiple Logging Policies¶
Imports¶
%tensorflow_version 1.x
TensorFlow 1.x selected.
from typing import Dict, List
import argparse
import time
import pickle
import warnings
from pathlib import Path
import yaml
import numpy as np
import pandas as pd
from dataclasses import dataclass
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import ConvergenceWarning
import tensorflow as tf
from tensorflow.python.framework import ops
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)
Configurations¶
!mkdir -p conf
%%writefile conf/policy_params.yaml
evaluation: 1.0
behavior1: 0.95
behavior2: 0.05
Writing conf/policy_params.yaml
%%writefile conf/q_func_hyperparams.yaml
eta: 0.01
std: 0.01
lam: 0.001
batch_size: 256
epochs: 200
Writing conf/q_func_hyperparams.yaml
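As a quick sanity check (an optional sketch, not part of the original run), the two configuration files written above can be read back with yaml.safe_load:

with open("./conf/policy_params.yaml", "rb") as f:
    print(yaml.safe_load(f))  # {'evaluation': 1.0, 'behavior1': 0.95, 'behavior2': 0.05}
with open("./conf/q_func_hyperparams.yaml", "rb") as f:
    print(yaml.safe_load(f))  # {'eta': 0.01, 'std': 0.01, 'lam': 0.001, 'batch_size': 256, 'epochs': 200}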
Dataset¶
!mkdir -p data/optdigits
!wget -q --show-progress -O data/optdigits/optdigits.tra https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra
!wget -q --show-progress -O data/optdigits/optdigits.tes https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes
data/optdigits/optd 100%[===================>] 550.43K 741KB/s in 0.7s
data/optdigits/optd 100%[===================>] 258.51K 436KB/s in 0.6s
def load_datasets(
data: str, ratio: float, test_size: float = 0.5, random_state: int = 12345
):
"""Load and preprocess raw multiclass classification data."""
data_path = Path(f"data/{data}")
le = LabelEncoder()
if data == "optdigits":
data_ = np.r_[
np.loadtxt(data_path / f"{data}.tra", delimiter=","),
np.loadtxt(data_path / f"{data}.tes", delimiter=","),
]
elif data == "pendigits":
data_ = np.r_[
np.loadtxt(data_path / f"{data}.tra", delimiter=","),
np.loadtxt(data_path / f"{data}.tes", delimiter=","),
]
elif data == "sat":
data_ = np.r_[
np.loadtxt(data_path / f"{data}.trn", delimiter=" "),
np.loadtxt(data_path / f"{data}.tst", delimiter=" "),
]
data_[:, -1] = np.where(data_[:, -1] == 7, 5, data_[:, -1] - 1)
elif data == "letter":
data_ = np.genfromtxt(
data_path / "letter-recognition.data", delimiter=",", dtype="str"
)
data_ = np.c_[data_[:, 1:], le.fit_transform(data_[:, 0])].astype(float)
np.random.shuffle(data_)
data_tr, data_ev = train_test_split(
data_, test_size=test_size, random_state=random_state
)
n_train, n_eval = data_tr.shape[0], data_ev.shape[0]
    n_dim = data_tr.shape[1] // 2
y_tr, y_ev = data_tr[:, -1].astype(int), data_ev[:, -1].astype(int)
n_class = np.unique(y_tr).shape[0]
y_full_ev = np.zeros((n_eval, n_class))
y_full_ev[np.arange(n_eval), y_ev] = 1
X_tr, X_ev = data_tr[:, :-1], data_ev[:, :-1]
X_tr1, X_tr2 = data_tr[:, :n_dim], data_tr[:, n_dim:]
X_ev1, X_ev2 = data_ev[:, :n_dim], data_ev[:, n_dim:]
# multiple logger index generation
ratio1 = ratio / (1 + ratio)
    n_eval1 = int(n_eval * ratio1)
idx1 = np.ones(n_eval, dtype=bool)
idx1[n_eval1:] = False
return dict(
n_train=n_train,
n_eval=n_eval,
n_dim=n_dim,
n_class=n_class,
n_behavior_policies=2,
X_tr=X_tr,
X_tr1=X_tr1,
X_tr2=X_tr2,
X_ev=X_ev,
X_ev1=X_ev1,
X_ev2=X_ev2,
y_tr=y_tr,
y_ev=y_ev,
y_full_ev=y_full_ev,
idx1=idx1,
ratio1=(n_eval1 / n_eval),
)
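A brief usage sketch (optional; it assumes the optdigits files downloaded above, and the variable name data_dict_example is made up for illustration): with ratio=1.0 the two loggers are assigned roughly equal halves of the evaluation split.

data_dict_example = load_datasets(data="optdigits", ratio=1.0, test_size=0.5)
print(data_dict_example["n_train"], data_dict_example["n_eval"], data_dict_example["n_class"])  # each split gets half of the optdigits rows; n_class is 10
print(data_dict_example["ratio1"])  # 0.5: logger 1 covers half of the evaluation samples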
def generate_bandit_feedback(data_dict: Dict, pi_b1: np.ndarray, pi_b2: np.ndarray):
"""Generate logged bandit feedback data."""
n_eval = data_dict["n_eval"]
idx1, ratio1 = data_dict["idx1"], data_dict["ratio1"]
idx1_expanded = np.expand_dims(idx1, 1)
pi_b = pi_b1 * idx1_expanded + pi_b2 * (1 - idx1_expanded)
pi_b_star = pi_b1 * ratio1 + pi_b2 * (1.0 - ratio1)
action_set = np.arange(data_dict["n_class"])
actions = np.zeros(data_dict["n_eval"], dtype=int)
for i, pvals in enumerate(pi_b):
actions[i] = np.random.choice(action_set, p=pvals)
rewards = data_dict["y_full_ev"][np.arange(n_eval), actions]
return dict(
n_eval=data_dict["n_eval"],
n_class=data_dict["n_class"],
X_ev=data_dict["X_ev"],
pi_b=pi_b,
pi_b_star=pi_b_star,
actions=actions,
idx1=idx1,
rewards=rewards,
)
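Another small sketch (hypothetical, reusing data_dict_example from above): with two uniform-random behavior policies, the mean observed reward should land near 1 / n_class.

n_eval_ex, n_class_ex = data_dict_example["n_eval"], data_dict_example["n_class"]
pi_uniform = np.full((n_eval_ex, n_class_ex), 1.0 / n_class_ex)
feedback_example = generate_bandit_feedback(data_dict=data_dict_example, pi_b1=pi_uniform, pi_b2=pi_uniform)
print(feedback_example["rewards"].mean())  # roughly 0.1 for the 10-class optdigits data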
Policies¶
def train_policies(data_dict: Dict, random_state: int = 0) -> List[np.ndarray]:
"""Train evaluation and behavior policies."""
with open("./conf/policy_params.yaml", "rb") as f:
policy_params = yaml.safe_load(f)
policy_list = list()
for pol in policy_params.keys():
# make label predictions
        X_tr, y_tr = data_dict["X_tr"], data_dict["y_tr"]
clf = LogisticRegression(
random_state=random_state,
solver="lbfgs",
multi_class="multinomial",
).fit(X=X_tr, y=y_tr)
        preds = clf.predict(X=data_dict["X_ev"]).astype(int)
# transform predictions into distribution over actions
alpha = policy_params[pol]
pi = np.zeros((data_dict["n_eval"], data_dict["n_class"]))
pi[:, :] = (1.0 - alpha) / data_dict["n_class"]
pi[np.arange(data_dict["n_eval"]), preds] = (
alpha + (1.0 - alpha) / data_dict["n_class"]
)
policy_list.append(pi)
return policy_list
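A quick check on the returned policies (a sketch reusing data_dict_example): each row is a proper distribution over actions, and the evaluation policy (alpha = 1.0 in policy_params.yaml) is deterministic.

pi_e_ex, pi_b1_ex, pi_b2_ex = train_policies(data_dict=data_dict_example)
for pi in (pi_e_ex, pi_b1_ex, pi_b2_ex):
    assert np.allclose(pi.sum(axis=1), 1.0)
print(pi_e_ex.max(axis=1).min())  # 1.0: the evaluation policy puts all its mass on the predicted label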
Estimators¶
def calc_ground_truth(y_true: np.ndarray, pi: np.ndarray) -> float:
"""Calculate the ground-truth policy value of an eval policy"""
return pi[np.arange(y_true.shape[0]), y_true].mean()
def calc_ipw(
rewards: np.ndarray,
actions: np.ndarray,
pi_b: np.ndarray,
pi_e: np.ndarray,
) -> float:
    """Estimate the policy value via inverse probability weighting (IPW / importance sampling)."""
n_data = actions.shape[0]
iw = pi_e[np.arange(n_data), actions] / pi_b[np.arange(n_data), actions]
return (rewards * iw).mean()
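To see what calc_ipw does on a hypothetical toy problem (two actions with known reward means; all names below are made up for illustration), the importance-weighted average should land near the evaluation policy's true value of 0.2 * 0.3 + 0.8 * 0.7 = 0.62:

rng = np.random.RandomState(0)
n, n_act = 100000, 2
pi_b_toy = np.tile([0.8, 0.2], (n, 1))         # logging policy
pi_e_toy = np.tile([0.2, 0.8], (n, 1))         # evaluation policy
actions_toy = (rng.rand(n) > 0.8).astype(int)  # actions sampled from the logging policy
rewards_toy = rng.binomial(1, np.where(actions_toy == 1, 0.7, 0.3))
print(calc_ipw(rewards=rewards_toy, actions=actions_toy, pi_b=pi_b_toy, pi_e=pi_e_toy))  # close to 0.62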
def calc_var(
rewards: np.ndarray,
actions: np.ndarray,
pi_b: np.ndarray,
pi_e: np.ndarray,
estimated_q_func: np.ndarray,
) -> float:
    """Compute the sample variance of the doubly robust score, used for precision weighting."""
n_data = actions.shape[0]
v = np.average(estimated_q_func, weights=pi_e, axis=1)
shifted_rewards = rewards - estimated_q_func[np.arange(n_data), actions]
iw = pi_e[np.arange(n_data), actions] / pi_b[np.arange(n_data), actions]
return np.var(shifted_rewards * iw + v)
def calc_weighted(
rewards: np.ndarray,
actions: np.ndarray,
idx1: np.ndarray,
pi_b: np.ndarray,
pi_e: np.ndarray,
estimated_q_func: np.ndarray = None,
n_fold: int = 2,
) -> float:
    """Estimate the policy value by precision-weighting per-logger DR estimates (two-fold cross-fitting)."""
estimated_rewards_list = list()
if estimated_q_func is None:
        estimated_q_func = np.zeros((actions.shape[0], int(actions.max() + 1)))
kf = KFold(n_splits=n_fold, shuffle=True, random_state=12345)
for train_idx, test_idx in kf.split(rewards):
rewards_tr, rewards_ev = rewards[train_idx], rewards[test_idx]
actions_tr, actions_ev = actions[train_idx], actions[test_idx]
idx1_tr, idx1_ev = idx1[train_idx], idx1[test_idx]
pi_b_tr, pi_b_ev = pi_b[train_idx], pi_b[test_idx]
pi_e_tr, pi_e_ev = pi_e[train_idx], pi_e[test_idx]
estimated_q_func_tr = estimated_q_func[train_idx]
estimated_q_func_ev = estimated_q_func[test_idx]
        # estimate the stratum weights (lambda) on one of the folds
n_data1, n_data2 = idx1_tr.sum(), (~idx1_tr).sum()
var1 = calc_var(
rewards=rewards_tr[idx1_tr],
actions=actions_tr[idx1_tr],
pi_b=pi_b_tr[idx1_tr],
pi_e=pi_e_tr[idx1_tr],
estimated_q_func=estimated_q_func_tr[idx1_tr],
)
var2 = calc_var(
rewards=rewards_tr[~idx1_tr],
actions=actions_tr[~idx1_tr],
pi_b=pi_b_tr[~idx1_tr],
pi_e=pi_e_tr[~idx1_tr],
estimated_q_func=estimated_q_func_tr[~idx1_tr],
)
denominator = (n_data1 / var1) + (n_data2 / var2)
lam1 = (n_data1 / var1) / denominator
lam2 = (n_data2 / var2) / denominator
# estimate the policy value with the other fold
iw1 = (
pi_e_ev[idx1_ev, actions_ev[idx1_ev]]
/ pi_b_ev[idx1_ev, actions_ev[idx1_ev]]
)
iw2 = (
pi_e_ev[~idx1_ev, actions_ev[~idx1_ev]]
/ pi_b_ev[~idx1_ev, actions_ev[~idx1_ev]]
)
v1 = np.average(estimated_q_func_ev[idx1_ev], weights=pi_e_ev[idx1_ev], axis=1)
v2 = np.average(
estimated_q_func_ev[~idx1_ev], weights=pi_e_ev[~idx1_ev], axis=1
)
shifted_rewards1 = (
rewards_ev[idx1_ev] - estimated_q_func_ev[idx1_ev, actions_ev[idx1_ev]]
)
shifted_rewards2 = (
rewards_ev[~idx1_ev] - estimated_q_func_ev[~idx1_ev, actions_ev[~idx1_ev]]
)
estimated_rewards = lam1 * (iw1 * shifted_rewards1 + v1).mean()
estimated_rewards += lam2 * (iw2 * shifted_rewards2 + v2).mean()
estimated_rewards_list.append(estimated_rewards)
return np.mean(estimated_rewards_list)
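The weights lam1 and lam2 above are inverse-variance (precision) weights: the stratum whose DR scores are less noisy per sample gets more weight. A small numeric sketch with hypothetical values:

n_data1, n_data2 = 1000, 1000
var1, var2 = 0.5, 2.0  # hypothetical score variances for the two strata
denominator = (n_data1 / var1) + (n_data2 / var2)
print((n_data1 / var1) / denominator, (n_data2 / var2) / denominator)  # 0.8 0.2: the low-variance stratum dominates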
def calc_dr(
rewards: np.ndarray,
actions: np.ndarray,
estimated_q_func: np.ndarray,
pi_b: np.ndarray,
pi_e: np.ndarray,
) -> float:
    """Estimate the policy value with the doubly robust (DR) estimator."""
n_data = actions.shape[0]
v = np.average(estimated_q_func, weights=pi_e, axis=1)
iw = pi_e[np.arange(n_data), actions] / pi_b[np.arange(n_data), actions]
shifted_rewards = rewards - estimated_q_func[np.arange(n_data), actions]
return (iw * shifted_rewards + v).mean()
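Continuing the toy problem from the calc_ipw sketch above, the DR estimate also lands near the true value 0.62 even with a deliberately crude q-function, because the importance-weighted term corrects the model's bias:

q_toy = np.tile([0.25, 0.75], (n, 1))  # a misspecified reward model (the true means are 0.3 and 0.7)
print(calc_dr(rewards=rewards_toy, actions=actions_toy, estimated_q_func=q_toy, pi_b=pi_b_toy, pi_e=pi_e_toy))  # close to 0.62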
def estimate_q_func(
bandit_feedback,
pi_e: np.ndarray,
fitting_method: str = "naive",
k_fold: int = 2,
) -> np.ndarray:
    """Estimate the q-function (expected reward per action) with k-fold cross-fitting."""
# hyperparam
with open("./conf/q_func_hyperparams.yaml", "rb") as f:
q_func_hyperparams = yaml.safe_load(f)
X = bandit_feedback["X_ev"]
y = bandit_feedback["rewards"]
pi_b_star = bandit_feedback["pi_b_star"]
idx1 = bandit_feedback["idx1"].astype(int)
a = pd.get_dummies(bandit_feedback["actions"]).values
skf = StratifiedKFold(n_splits=k_fold)
skf.get_n_splits(X, y)
estimated_q_func = np.zeros((bandit_feedback["n_eval"], bandit_feedback["n_class"]))
for train_idx, test_idx in skf.split(X, y):
X_tr, X_ev = X[train_idx], X[test_idx]
y_tr, a_tr = y[train_idx], a[train_idx].astype(float)
pi_e_tr = pi_e[train_idx]
pi_b_star_tr = pi_b_star[train_idx]
idx1_tr = idx1[train_idx]
ops.reset_default_graph()
clf = QFuncEstimator(
num_features=X_tr.shape[1],
num_classes=bandit_feedback["n_class"],
fitting_method=fitting_method,
eta=q_func_hyperparams["eta"],
std=q_func_hyperparams["std"],
lam=q_func_hyperparams["lam"],
batch_size=q_func_hyperparams["batch_size"],
epochs=q_func_hyperparams["epochs"],
)
clf.train(
X=X_tr,
a=a_tr,
y=y_tr,
pi_e=pi_e_tr,
pi_b_star=pi_b_star_tr,
idx1=idx1_tr,
)
for a_idx in np.arange(bandit_feedback["n_class"]):
estimated_q_func_for_a = clf.predict(X=X_ev, a_idx=a_idx)[:, a_idx]
estimated_q_func[test_idx, a_idx] = estimated_q_func_for_a
clf.s.close()
return estimated_q_func
@dataclass
class QFuncEstimator:
num_features: int
num_classes: int
eta: float = 0.01
std: float = 0.01
lam: float = 0.001
batch_size: int = 256
epochs: int = 200
fitting_method: str = "stratified"
def __post_init__(self) -> None:
"""Initialize Class."""
tf.set_random_seed(0)
self.s = tf.Session()
self.create_placeholders()
self.build_graph()
self.create_losses()
self.add_optimizer()
def create_placeholders(self) -> None:
"""Create the placeholders to be used."""
self.input_X = tf.placeholder(
"float32", shape=(None, self.num_features), name="input_X"
)
self.input_A = tf.placeholder(
"float32", shape=(None, self.num_classes), name="input_A"
)
self.input_R = tf.placeholder("float32", shape=(None,), name="input_R")
self.input_pi_e = tf.placeholder(
"float32", shape=(None, self.num_classes), name="input_pi_e"
)
self.input_pi_b_star = tf.placeholder(
"float32", shape=(None, self.num_classes), name="input_pi_b_star"
)
self.input_idx1 = tf.placeholder("float32", shape=(None,), name="input_idx1")
def build_graph(self) -> None:
"""Build the main tensorflow graph with embedding layers."""
self.weights = tf.Variable(
tf.random_normal(
[self.num_features + self.num_classes, self.num_classes],
stddev=self.std,
)
)
self.bias = tf.Variable(tf.random_normal([self.num_classes], stddev=self.std))
with tf.variable_scope("prediction"):
input_X = tf.concat([self.input_X, self.input_A], axis=1)
self.preds = tf.sigmoid(tf.matmul(input_X, self.weights) + self.bias)
def create_losses(self) -> None:
"""Create the losses."""
with tf.name_scope("loss"):
shifted_rewards = self.input_R - tf.reduce_sum(
self.preds * self.input_A, axis=1
)
if self.fitting_method == "normal":
self.loss = tf.reduce_mean(tf.square(shifted_rewards))
else:
ratio1 = tf.reduce_mean(self.input_idx1)
input_idx2 = tf.ones_like(self.input_idx1) - self.input_idx1
ratio2 = tf.reduce_mean(input_idx2)
pi_e = tf.reduce_sum(self.input_pi_e * self.input_A, 1)
pi_b_star = tf.reduce_sum(self.input_pi_b_star * self.input_A, 1)
v = tf.reduce_sum(self.input_pi_e * self.preds, 1)
phi = (pi_e / pi_b_star) * shifted_rewards + v
phi1 = self.input_idx1 * phi
phi2 = input_idx2 * phi
if self.fitting_method == "stratified":
self.loss = ratio1 * tf.reduce_mean(tf.square(phi1))
self.loss += ratio2 * tf.reduce_mean(tf.square(phi2))
self.loss -= ratio1 * tf.square(tf.reduce_mean(phi1))
self.loss -= ratio2 * tf.square(tf.reduce_mean(phi2))
elif self.fitting_method == "naive":
self.loss = tf.reduce_mean(tf.square(phi))
self.loss -= tf.square(tf.reduce_mean(phi))
self.var_list = [self.weights, self.bias]
l2_reg = [tf.nn.l2_loss(v) for v in self.var_list]
self.loss += self.lam * tf.add_n(l2_reg)
def add_optimizer(self) -> None:
"""Add the required optimizer to the graph."""
with tf.name_scope("optimizer"):
self.apply_grads = tf.train.MomentumOptimizer(
learning_rate=self.eta, momentum=0.8
).minimize(self.loss, var_list=self.var_list)
def train(
self,
X: np.ndarray,
a: np.ndarray,
y: np.ndarray,
pi_e: np.ndarray,
pi_b_star: np.ndarray,
idx1: np.ndarray,
    ) -> None:
        """Fit the q-function model with mini-batch momentum SGD."""
self.s.run(tf.global_variables_initializer())
for _ in np.arange(self.epochs):
arr = np.arange(X.shape[0])
np.random.shuffle(arr)
for idx in np.arange(0, X.shape[0], self.batch_size):
arr_ = arr[idx : idx + self.batch_size]
self.s.run(
self.apply_grads,
feed_dict={
self.input_X: X[arr_],
self.input_A: a[arr_],
self.input_R: y[arr_],
self.input_pi_e: pi_e[arr_],
self.input_pi_b_star: pi_b_star[arr_],
self.input_idx1: idx1[arr_],
},
)
    def predict(self, X: np.ndarray, a_idx: int) -> np.ndarray:
        """Predict rewards for all actions, with action a_idx fed as the one-hot action input."""
a_ = np.zeros((X.shape[0], self.num_classes))
a_[:, a_idx] = 1
return self.s.run(self.preds, feed_dict={self.input_X: X, self.input_A: a_})
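A small smoke test for QFuncEstimator on random data (a sketch only; it assumes the TF 1.x runtime selected above, and every name here is hypothetical):

ops.reset_default_graph()
rng_toy = np.random.RandomState(0)
n_toy, n_feat_toy, n_act_toy = 512, 8, 3
q_model_toy = QFuncEstimator(num_features=n_feat_toy, num_classes=n_act_toy, epochs=5)
q_model_toy.train(
    X=rng_toy.rand(n_toy, n_feat_toy),
    a=np.eye(n_act_toy)[rng_toy.randint(n_act_toy, size=n_toy)],
    y=rng_toy.binomial(1, 0.5, size=n_toy).astype(float),
    pi_e=np.full((n_toy, n_act_toy), 1.0 / n_act_toy),
    pi_b_star=np.full((n_toy, n_act_toy), 1.0 / n_act_toy),
    idx1=rng_toy.binomial(1, 0.5, size=n_toy).astype(float),
)
print(q_model_toy.predict(X=rng_toy.rand(5, n_feat_toy), a_idx=0).shape)  # (5, 3)
q_model_toy.s.close()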
def estimate_pi_b(bandit_feedback: Dict, k_fold: int = 2) -> Dict:
    """Estimate pi_b1, pi_b2, and pi_b_star with logistic regression and k-fold cross-fitting."""
X = bandit_feedback["X_ev"]
idx1 = bandit_feedback["idx1"]
a = bandit_feedback["actions"]
skf = StratifiedKFold(n_splits=k_fold, shuffle=True)
skf.get_n_splits(X, a)
estimated_pi_b1 = np.zeros((bandit_feedback["n_eval"], bandit_feedback["n_class"]))
estimated_pi_b2 = np.zeros((bandit_feedback["n_eval"], bandit_feedback["n_class"]))
estimated_pi_b_star = np.zeros(
(bandit_feedback["n_eval"], bandit_feedback["n_class"])
)
for train_idx, test_idx in skf.split(X, a):
X_tr, X_ev = X[train_idx], X[test_idx]
idx1_tr, a_tr = idx1[train_idx], a[train_idx]
clf = LogisticRegression(random_state=12345)
clf.fit(X=X_tr[idx1_tr], y=a_tr[idx1_tr])
estimated_pi_b1[test_idx, :] = clf.predict_proba(X_ev)
clf = LogisticRegression(random_state=12345)
clf.fit(X=X_tr[~idx1_tr], y=a_tr[~idx1_tr])
estimated_pi_b2[test_idx, :] = clf.predict_proba(X_ev)
clf = LogisticRegression(random_state=12345)
clf.fit(X=X_tr, y=a_tr)
estimated_pi_b_star[test_idx, :] = clf.predict_proba(X_ev)
idx1 = np.expand_dims(idx1.astype(int), 1)
bandit_feedback["pi_b"] = np.clip(
idx1 * estimated_pi_b1 + (1 - idx1) * estimated_pi_b2, 1e-6, 1.0
)
bandit_feedback["pi_b_star"] = np.clip(estimated_pi_b_star, 1e-6, 1.0)
return bandit_feedback
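A sketch of how the behavior-policy estimation plugs in (optional; it reuses the hypothetical feedback_example from the generate_bandit_feedback sketch above): the function overwrites pi_b and pi_b_star with cross-fitted logistic-regression estimates.

feedback_with_estimated_pi_b = estimate_pi_b(bandit_feedback=dict(feedback_example))  # shallow copy keeps the toy feedback intact
print(feedback_with_estimated_pi_b["pi_b"].shape)  # (n_eval, n_class)
print(np.allclose(feedback_with_estimated_pi_b["pi_b_star"].sum(axis=1), 1.0, atol=1e-3))  # True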
Run¶
def calc_rel_rmse(policy_value_true: np.ndarray, policy_value_estimated: np.ndarray) -> float:
    """Calculate the relative root mean squared error over simulation runs."""
return np.sqrt(
(((policy_value_true - policy_value_estimated) / policy_value_true) ** 2).mean()
)
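For instance (hypothetical numbers), with a true value of 0.9 in every simulation and estimates of 0.8 and 1.0, the relative RMSE is about 0.111:

print(calc_rel_rmse(policy_value_true=np.array([0.9, 0.9]), policy_value_estimated=np.array([0.8, 1.0])))  # about 0.11111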
parser = argparse.ArgumentParser()
parser.add_argument("--num_sims", "-n", type=int, default=200)
parser.add_argument("--data", "-d", type=str, default='optdigits') # data in ['optdigits','pendigits','sat','letter']
parser.add_argument("--test_size", "-t", type=float, default=0.7)
parser.add_argument("--is_estimate_pi_b", "-i", default=True, action="store_true")
args = parser.parse_args(args={})
print(args)
Namespace(data='optdigits', is_estimate_pi_b=True, num_sims=200, test_size=0.7)
# configurations
num_sims = args.num_sims
data = args.data
test_size = args.test_size
is_estimate_pi_b = args.is_estimate_pi_b
np.random.seed(12345)
ratio_list = [0.1, 0.2, 0.5, 1, 2, 4, 10]
estimator_names = [
"ground_truth",
"IS-Avg",
"IS",
"IS-PW(f)",
"DR-Avg",
"DR-PW",
"DR",
"MRDR",
"SMRDR",
]
log_path = Path("log") / data / f"test_size={test_size}" / (
    "estimated_pi_b" if is_estimate_pi_b else "true_pi_b"
)
log_path.mkdir(parents=True, exist_ok=True)
raw_results_path = log_path / "raw_results"
raw_results_path.mkdir(parents=True, exist_ok=True)
rel_rmse_results = {
    name: {r: 0.0 for r in ratio_list} for name in estimator_names
}
for ratio in ratio_list:
start = time.time()
ope_results = {name: np.zeros(num_sims) for name in estimator_names}
for sim_id in np.arange(num_sims):
# load and split data
data_dict = load_datasets(
data=data, test_size=test_size, ratio=ratio, random_state=sim_id
)
# train eval and two behavior policies
pi_e, pi_b1, pi_b2 = train_policies(
data_dict=data_dict,
random_state=sim_id,
)
# generate bandit feedback
bandit_feedback_ = generate_bandit_feedback(
data_dict=data_dict, pi_b1=pi_b1, pi_b2=pi_b2
)
# estimate pi_b1, pi_b2, and pi_b_star with 2-fold cross-fitting
if is_estimate_pi_b:
bandit_feedback = estimate_pi_b(bandit_feedback=bandit_feedback_)
else:
bandit_feedback = bandit_feedback_
# estimate q-function with 2-fold cross-fitting
estimated_q_func = estimate_q_func(
bandit_feedback=bandit_feedback,
pi_e=pi_e,
fitting_method="normal",
)
estimated_q_func_with_mrdr_wrong = estimate_q_func(
bandit_feedback=bandit_feedback,
pi_e=pi_e,
fitting_method="naive",
)
estimated_q_func_with_mrdr = estimate_q_func(
bandit_feedback=bandit_feedback,
pi_e=pi_e,
fitting_method="stratified",
)
# off-policy evaluation
ope_results["ground_truth"][sim_id] = calc_ground_truth(
y_true=data_dict["y_ev"], pi=pi_e
)
ope_results["IS-Avg"][sim_id] = calc_ipw(
rewards=bandit_feedback["rewards"],
actions=bandit_feedback["actions"],
pi_b=bandit_feedback["pi_b"],
pi_e=pi_e,
)
ope_results["IS"][sim_id] = calc_ipw(
rewards=bandit_feedback["rewards"],
actions=bandit_feedback["actions"],
pi_b=bandit_feedback["pi_b_star"],
pi_e=pi_e,
)
ope_results["IS-PW(f)"][sim_id] = calc_weighted(
rewards=bandit_feedback["rewards"],
actions=bandit_feedback["actions"],
idx1=bandit_feedback["idx1"],
pi_b=bandit_feedback["pi_b"],
pi_e=pi_e,
)
ope_results["DR-Avg"][sim_id] = calc_dr(
rewards=bandit_feedback["rewards"],
actions=bandit_feedback["actions"],
estimated_q_func=estimated_q_func,
pi_b=bandit_feedback["pi_b"],
pi_e=pi_e,
)
ope_results["DR-PW"][sim_id] = calc_weighted(
rewards=bandit_feedback["rewards"],
actions=bandit_feedback["actions"],
idx1=bandit_feedback["idx1"],
pi_b=bandit_feedback["pi_b"],
pi_e=pi_e,
estimated_q_func=estimated_q_func,
)
ope_results["DR"][sim_id] = calc_dr(
rewards=bandit_feedback["rewards"],
actions=bandit_feedback["actions"],
estimated_q_func=estimated_q_func,
pi_b=bandit_feedback["pi_b_star"],
pi_e=pi_e,
)
ope_results["MRDR"][sim_id] = calc_dr(
rewards=bandit_feedback["rewards"],
actions=bandit_feedback["actions"],
estimated_q_func=estimated_q_func_with_mrdr_wrong,
pi_b=bandit_feedback["pi_b_star"],
pi_e=pi_e,
)
ope_results["SMRDR"][sim_id] = calc_dr(
rewards=bandit_feedback["rewards"],
actions=bandit_feedback["actions"],
estimated_q_func=estimated_q_func_with_mrdr,
pi_b=bandit_feedback["pi_b_star"],
pi_e=pi_e,
)
if ((sim_id + 1) % 20) == 0:
print(
f"ratio={ratio}-{sim_id+1}th: {np.round((time.time() - start) / 60, 2)}min"
)
# save raw off-policy evaluation results.
with open(raw_results_path / f"ratio={ratio}.pkl", mode="wb") as f:
pickle.dump(ope_results, f)
for estimator in estimator_names:
rel_rmse_results[estimator][ratio] = calc_rel_rmse(
policy_value_true=ope_results["ground_truth"],
policy_value_estimated=ope_results[estimator],
)
print(f"finish ratio={ratio}: {np.round((time.time() - start) / 60, 2)}min")
print("=" * 50)
# save results of the evaluation of OPE
rel_rmse_results_df = pd.DataFrame(rel_rmse_results).drop("ground_truth", axis=1)
rel_rmse_results_df.T.round(5).to_csv(log_path / "rel_rmse.csv")
ratio=0.1-20th: 4.68min
ratio=0.1-40th: 9.34min
ratio=0.1-60th: 14.03min
ratio=0.1-80th: 18.7min
ratio=0.1-100th: 23.36min
ratio=0.1-120th: 28.05min
ratio=0.1-140th: 32.74min
ratio=0.1-160th: 37.42min
ratio=0.1-180th: 42.12min
ratio=0.1-200th: 46.86min
finish ratio=0.1: 46.86min
==================================================
ratio=0.2-20th: 4.73min
ratio=0.2-40th: 9.39min
ratio=0.2-60th: 14.04min
ratio=0.2-80th: 18.72min
ratio=0.2-100th: 23.4min
ratio=0.2-120th: 28.04min
ratio=0.2-140th: 32.7min
ratio=0.2-160th: 37.38min
ratio=0.2-180th: 42.08min
ratio=0.2-200th: 46.76min
finish ratio=0.2: 46.76min
==================================================
ratio=0.5-20th: 4.68min
ratio=0.5-40th: 9.34min
ratio=0.5-60th: 14.04min
ratio=0.5-80th: 18.75min
ratio=0.5-100th: 23.48min
ratio=0.5-120th: 28.18min
ratio=0.5-140th: 32.87min
ratio=0.5-160th: 37.55min
ratio=0.5-180th: 42.25min
ratio=0.5-200th: 46.96min
finish ratio=0.5: 46.96min
==================================================
ratio=1-20th: 4.74min
ratio=1-40th: 9.52min
ratio=1-60th: 14.19min
ratio=1-80th: 18.85min
ratio=1-100th: 23.49min
ratio=1-120th: 28.19min
ratio=1-140th: 32.92min
ratio=1-160th: 37.63min
ratio=1-180th: 42.35min
ratio=1-200th: 47.13min
finish ratio=1: 47.13min
==================================================
ratio=2-20th: 4.73min
ratio=2-40th: 9.43min
ratio=2-60th: 14.12min
ratio=2-80th: 18.83min
ratio=2-100th: 23.55min
ratio=2-120th: 28.28min
ratio=2-140th: 32.95min
ratio=2-160th: 37.7min
ratio=2-180th: 42.41min
ratio=2-200th: 47.1min
finish ratio=2: 47.1min
==================================================
ratio=4-20th: 4.69min
ratio=4-40th: 9.34min
ratio=4-60th: 13.98min
ratio=4-80th: 18.6min
ratio=4-100th: 23.22min
ratio=4-120th: 27.85min
ratio=4-140th: 32.52min
ratio=4-160th: 37.22min
ratio=4-180th: 41.96min
ratio=4-200th: 46.65min
finish ratio=4: 46.65min
==================================================
ratio=10-20th: 4.71min
ratio=10-40th: 9.48min
ratio=10-60th: 14.27min
ratio=10-80th: 19.05min
ratio=10-100th: 23.73min
ratio=10-120th: 28.38min
ratio=10-140th: 33.03min
ratio=10-160th: 37.7min
ratio=10-180th: 42.36min
ratio=10-200th: 47.02min
finish ratio=10: 47.02min
==================================================
pd.read_csv('./log/optdigits/test_size=0.7/estimated_pi_b/rel_rmse.csv', index_col=0)
|  | 0.1 | 0.2 | 0.5 | 1.0 | 2.0 | 4.0 | 10.0 |
|---|---|---|---|---|---|---|---|
| IS-Avg | 208.48864 | 535.32569 | 2637.91477 | 905.71031 | 135.95046 | 87.50504 | 4222.50872 |
| IS | 0.51822 | 0.53494 | 0.52986 | 0.49601 | 0.44738 | 0.40806 | 0.41228 |
| IS-PW(f) | 28.30699 | 1.17752 | 0.75899 | 3.61226 | 43.51955 | 107.73743 | 46.19869 |
| DR-Avg | 131.46072 | 225.88591 | 510.52975 | 319.42255 | 117.08375 | 62.25383 | 232.50566 |
| DR-PW | 6.19500 | 0.06585 | 0.06765 | 0.23945 | 9.66307 | 96.67823 | 54.94231 |
| DR | 0.08051 | 0.07398 | 0.05631 | 0.03551 | 0.02426 | 0.05821 | 0.20872 |
| MRDR | 0.04909 | 0.05391 | 0.05649 | 0.06436 | 0.06722 | 0.11189 | 0.40688 |
| SMRDR | 0.04077 | 0.04424 | 0.05186 | 0.05897 | 0.07331 | 0.11474 | 0.44046 |
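To visualize how each estimator's relative RMSE varies with the logger sample-size ratio, here is an optional plotting sketch (it assumes matplotlib is available and is not part of the original run):

import matplotlib.pyplot as plt

results_df = pd.read_csv("./log/optdigits/test_size=0.7/estimated_pi_b/rel_rmse.csv", index_col=0)
for estimator_name, row in results_df.iterrows():
    plt.plot(row.index.astype(float), row.values, marker="o", label=estimator_name)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("ratio of logger-1 to logger-2 samples")
plt.ylabel("relative RMSE")
plt.legend()
plt.show()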
!apt-get install tree
!tree --du -h ./log
./log
└── [119K] optdigits
└── [115K] test_size=0.7
└── [111K] estimated_pi_b
├── [106K] raw_results
│ ├── [ 15K] ratio=0.1.pkl
│ ├── [ 15K] ratio=0.2.pkl
│ ├── [ 15K] ratio=0.5.pkl
│ ├── [ 15K] ratio=10.pkl
│ ├── [ 15K] ratio=1.pkl
│ ├── [ 15K] ratio=2.pkl
│ └── [ 15K] ratio=4.pkl
└── [ 557] rel_rmse.csv
123K used in 4 directories, 8 files