Source code for dueling_bandit.environment

from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np
from scipy.sparse import diags, eye
from scipy.sparse.linalg import eigs

class RankCentrality:
    """Rank Centrality for spectral ranking from pairwise comparisons.

    Attributes:
        k: Number of items; the count matrices passed to
            ``stationary_distribution`` are combined with a ``k x k``
            identity matrix, so they must be ``k x k`` as well.
    """

    def __init__(self, k: int):
        self.k = k

    def stationary_distribution(self, wins: np.ndarray, losses: np.ndarray) -> np.ndarray:
        """Compute the stationary distribution of the comparison random walk.

        The empirical matrix ``P[i, j] = wins[i, j] / (wins[i, j] + losses[i, j])``
        is built first; pairs with no observations get the uninformative
        value ``1 / k`` and the diagonal is zeroed.  Rows are then normalised
        into a transition matrix ``T`` and the returned vector is the
        eigenvector of ``(I - T).T`` whose eigenvalue is closest to zero,
        made non-negative and scaled to sum to one.

        The matrix is small and dense, so a dense eigen-decomposition is
        used.  This is deterministic and works for every ``k >= 1``, whereas
        ARPACK (``scipy.sparse.linalg.eigs``) needs ``k < N - 1`` requested
        eigenpairs and starts from a random vector.

        NOTE(review): the walk moves from ``i`` to ``j`` in proportion to
        ``wins[i, j]``.  Whether that is the intended orientation depends on
        the convention callers use for ``wins`` / ``losses`` -- confirm.

        Args:
            wins (np.ndarray): Win counts (k x k).
            losses (np.ndarray): Loss counts (k x k).

        Returns:
            np.ndarray: Stationary distribution (pi), shape ``(k,)``.
        """
        wins = np.asarray(wins, dtype=float)
        losses = np.asarray(losses, dtype=float)

        total = wins + losses
        unobserved = total == 0
        # Divide by 1 where nothing was observed to avoid 0/0; those entries
        # are overwritten with the uniform prior 1/k right after.
        P = np.where(unobserved, 1.0 / self.k, wins / np.where(unobserved, 1.0, total))
        np.fill_diagonal(P, 0.0)

        degrees = P.sum(axis=1)
        # A row with no outgoing mass is left as an all-zero row instead of
        # producing a division by zero.
        degrees = np.where(degrees == 0, 1.0, degrees)
        transition = P / degrees[:, None]

        M = np.eye(self.k) - transition
        eigvals, eigvecs = np.linalg.eig(M.T)
        # Same selection rule as ARPACK's which='SM': smallest |eigenvalue|.
        smallest = int(np.argmin(np.abs(eigvals)))
        pi = np.abs(eigvecs[:, smallest].real)
        pi /= pi.sum()
        return pi
@dataclass
class DuelingBanditEnv:
    """Dueling Bandit environment with Bradley-Terry model.

    ``P[a, b]`` is used by ``duel`` as the probability that item ``a`` beats
    item ``b``.  ``__post_init__`` derives ``k`` (number of items, taken from
    ``P.shape[0]``), ``rng`` (a NumPy generator seeded with ``seed``) and
    ``rank`` (a ``RankCentrality`` helper for ``k`` items).
    """

    P: np.ndarray
    features: Optional[np.ndarray] = None
    seed: Optional[int] = None

    def __post_init__(self):
        self.k = self.P.shape[0]
        self.rng = np.random.default_rng(self.seed)
        self.rank = RankCentrality(self.k)

    @classmethod
    def random_bt(cls, k: int, d: int = 0, seed: Optional[int] = None) -> "DuelingBanditEnv":
        """Generate a random Bradley-Terry environment.

        Args:
            k: Number of items.
            d: Feature dimension; when ``d > 0`` a ``(k, d)`` matrix of
                standard-normal features is attached, otherwise ``features``
                is ``None``.
            seed: Seed for the generator that draws the latent values; the
                same seed is also handed to the returned environment.

        Returns:
            DuelingBanditEnv: Environment whose off-diagonal preferences are
            a logistic function of latent-value differences and whose
            diagonal is 0.5.
        """
        rng = np.random.default_rng(seed)
        utilities = rng.normal(size=k)
        # With this sign, P[i, j] = sigmoid(utilities[j] - utilities[i]):
        # the item with the *smallest* drawn value is the strongest one.
        P = 1.0 / (1.0 + np.exp(utilities[:, None] - utilities[None, :]))
        np.fill_diagonal(P, 0.5)
        features = rng.normal(size=(k, d)) if d > 0 else None
        return cls(P, features, seed)

    def _bt_scores(self) -> np.ndarray:
        """Return one score per item: the row mean of ``logit(P)``.

        Works on a copy of ``P``.  The diagonal (an item compared with
        itself) is set to 0.5 so it contributes a logit of exactly 0, and
        probabilities are clipped away from 0 and 1 so that deterministic
        preferences give large finite logits instead of ``inf`` / ``nan``.
        NaN entries are left in place and skipped by ``nanmean``.
        """
        probs = np.array(self.P, dtype=float)
        np.fill_diagonal(probs, 0.5)
        eps = np.finfo(float).eps
        probs = np.clip(probs, eps, 1.0 - eps)
        logits = -np.log(1.0 / probs - 1.0)
        return np.nanmean(logits, axis=1)

    def duel(self, a: int, b: int) -> Tuple[int, int]:
        """Perform a duel between two items.

        Args:
            a: Index of the first item.
            b: Index of the second item.

        Returns:
            Tuple[int, int]: ``(a, b)`` with probability ``P[a, b]``,
            otherwise ``(b, a)``.
        """
        if self.rng.random() < self.P[a, b]:
            return a, b
        return b, a

    def best_arm(self) -> int:
        """Return the index of the item with the highest BTL score."""
        return int(np.argmax(self._bt_scores()))

    def true_rank(self) -> np.ndarray:
        """Return the true ranking of items, highest score first."""
        return np.argsort(-self._bt_scores())

    def delta12(self) -> float:
        """Compute the separation metric Delta_1,2.

        This is ``(P[first, second] - 0.5) ** 2`` for the two top-ranked
        items, so at least two items are required (indexing the second
        one fails otherwise).
        """
        first, second = np.argsort(-self._bt_scores())[:2]
        return float((self.P[first, second] - 0.5) ** 2)