Source code for deeptab.metrics.classification

"""Classification metrics (Accuracy, F1, AUROC, AUPRC, LogLoss, ECE, BrierScore).

All standard metrics delegate to :mod:`sklearn.metrics` internally.
The wrapper classes add the :class:`DeepTabMetric` interface (``name``,
``higher_is_better``, ``needs_raw``) and normalise DeepTab-specific
prediction formats (2-D probability arrays vs 1-D label arrays).

:class:`ExpectedCalibrationError` is the only class without a sklearn
equivalent and is therefore implemented from scratch.

Quick reference
---------------

.. list-table::
   :header-rows: 1
   :widths: 28 14 20 38

   * - Class
     - ``name``
     - ``higher_is_better``
     - Notes
   * - :class:`Accuracy`
     - ``"accuracy"``
     - ``True``
     - Fraction correct; **higher = better**
   * - :class:`F1Score`
     - ``"f1"``
     - ``True``
     - Harmonic mean precision/recall; **higher = better**
   * - :class:`AUROC`
     - ``"auroc"``
     - ``True``
     - Needs probability scores; **higher = better**
   * - :class:`AUPRC`
     - ``"auprc"``
     - ``True``
     - Better than AUROC for imbalanced data; **higher = better**
   * - :class:`LogLoss`
     - ``"log_loss"``
     - ``False``
     - Cross-entropy; lower = better
   * - :class:`BrierScore`
     - ``"brier"``
     - ``False``
     - Mean squared error of predicted probabilities; lower = better
   * - :class:`ExpectedCalibrationError`
     - ``"ece"``
     - ``False``
     - 0 = perfectly calibrated; lower = better
"""

from __future__ import annotations

import itertools

import numpy as np
from sklearn.metrics import accuracy_score as _accuracy
from sklearn.metrics import average_precision_score as _auprc
from sklearn.metrics import brier_score_loss as _brier
from sklearn.metrics import f1_score as _f1
from sklearn.metrics import log_loss as _log_loss
from sklearn.metrics import roc_auc_score as _auroc

from .base import DeepTabMetric


class Accuracy(DeepTabMetric):
    """Classification accuracy -- delegates to :func:`sklearn.metrics.accuracy_score`.

    Accepted prediction formats:

    * 2-D array with two or more columns: per-class scores; the predicted
      label of each row is the argmax over columns.
    * 1-D array (or 2-D array with a single column) of integer dtype: used
      unchanged as class labels, so multiclass labels are preserved.
    * Any other 1-D / single-column array: treated as positive-class
      scores and thresholded at 0.5.
    """

    name = "accuracy"
    higher_is_better = True

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the fraction of samples whose predicted label equals ``y_true``.

        Parameters
        ----------
        y_true : np.ndarray
            Ground-truth labels; flattened to 1-D.
        y_pred : np.ndarray
            Predictions in one of the formats listed in the class docstring.

        Returns
        -------
        float
            Accuracy as computed by :func:`sklearn.metrics.accuracy_score`.
        """
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred)
        if y_pred.ndim == 2 and y_pred.shape[1] > 1:
            labels = np.argmax(y_pred, axis=1)
        else:
            # A single column must not go through argmax: it would always
            # yield class 0 regardless of the score.
            flat = y_pred.ravel()
            if flat.dtype.kind in "iu":
                # Integer arrays are already labels; thresholding them would
                # collapse every class >= 1 into class 1.
                labels = flat
            else:
                labels = (flat >= 0.5).astype(int)
        return float(_accuracy(y_true, labels))
class F1Score(DeepTabMetric):
    """F1 Score -- delegates to :func:`sklearn.metrics.f1_score`.

    Predictions are normalised to labels before scoring:

    * 2-D array with two or more columns: argmax over columns.
    * 1-D array (or 2-D array with a single column) of integer dtype: used
      unchanged as class labels, so multiclass labels are preserved.
    * Any other 1-D / single-column array: treated as positive-class
      scores and thresholded at 0.5.

    Parameters
    ----------
    average : str
        Averaging strategy: ``"binary"`` (default), ``"macro"``, or ``"weighted"``.

    Raises
    ------
    ValueError
        If ``average`` is not one of the supported strategies.
    """

    name = "f1"
    higher_is_better = True

    def __init__(self, average: str = "binary") -> None:
        if average not in ("binary", "macro", "weighted"):
            raise ValueError(f"average must be 'binary', 'macro', or 'weighted', got {average!r}")
        self.average = average

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the F1 score of ``y_pred`` against ``y_true``.

        ``zero_division=0`` is passed to sklearn, so an undefined score is
        reported as 0.
        """
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred)
        if y_pred.ndim == 2 and y_pred.shape[1] > 1:
            labels = np.argmax(y_pred, axis=1)
        else:
            # A single column must not go through argmax: it would always
            # yield class 0 regardless of the score.
            flat = y_pred.ravel()
            if flat.dtype.kind in "iu":
                # Integer arrays are already labels; thresholding them would
                # collapse every class >= 1 into class 1.
                labels = flat
            else:
                labels = (flat >= 0.5).astype(int)
        return float(_f1(y_true, labels, average=self.average, zero_division=0))  # type: ignore[arg-type]

    def __repr__(self) -> str:
        return f"F1Score(average={self.average!r})"
class AUROC(DeepTabMetric):
    """Area Under the ROC Curve -- delegates to :func:`sklearn.metrics.roc_auc_score`.

    Parameters
    ----------
    average : str
        ``"macro"`` (default) or ``"weighted"``. Only forwarded to sklearn
        when the prediction array is 2-D with a column count other than two.
    """

    name = "auroc"
    higher_is_better = True

    def __init__(self, average: str = "macro") -> None:
        self.average = average

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the AUROC, or ``nan`` when sklearn raises ``ValueError``."""
        targets = np.asarray(y_true).ravel()
        predictions = np.asarray(y_pred)

        # Select the score array and any extra sklearn keyword arguments
        # according to the shape of the predictions.
        extra: dict = {}
        if predictions.ndim != 2:
            scores = predictions.ravel()
        elif predictions.shape[1] == 2:
            # Two columns: score with the second column only.
            scores = predictions[:, 1]
        else:
            scores = predictions
            extra = {"multi_class": "ovr", "average": self.average}

        try:
            return float(_auroc(targets, scores, **extra))
        except ValueError:
            return float("nan")

    def __repr__(self) -> str:
        return f"AUROC(average={self.average!r})"
class AUPRC(DeepTabMetric):
    """Area Under the Precision-Recall Curve -- delegates to
    :func:`sklearn.metrics.average_precision_score`.

    Accepts 1-D scores, a 2-D array with a single column (flattened), or a
    2-D array with two or more columns (the second column is used).
    """

    name = "auprc"
    higher_is_better = True

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the average precision, or ``nan`` when sklearn raises ``ValueError``."""
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred)
        if y_pred.ndim == 2 and y_pred.shape[1] > 1:
            scores = y_pred[:, 1]
        else:
            # Covers 1-D input and a single-column 2-D array; indexing
            # column 1 of an (n, 1) array would raise IndexError.
            scores = y_pred.ravel()
        try:
            return float(_auprc(y_true, scores))
        except ValueError:
            return float("nan")
class LogLoss(DeepTabMetric):
    """Cross-Entropy / Log Loss -- delegates to :func:`sklearn.metrics.log_loss`."""

    name = "log_loss"
    higher_is_better = False

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the log loss of ``y_pred`` against the flattened ``y_true``."""
        targets = np.asarray(y_true).ravel()
        # Predictions are passed through in their original shape.
        probabilities = np.asarray(y_pred)
        loss = _log_loss(targets, probabilities)
        return float(loss)
class BrierScore(DeepTabMetric):
    """Brier Score -- delegates to :func:`sklearn.metrics.brier_score_loss`.

    Accepts 1-D probability scores, a 2-D array with a single column
    (flattened), or a 2-D array with two or more columns (the second column
    is used).
    """

    name = "brier"
    higher_is_better = False

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the Brier score of the selected probabilities against ``y_true``."""
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred, dtype=float)
        if y_pred.ndim == 2 and y_pred.shape[1] > 1:
            probs = y_pred[:, 1]
        else:
            # Covers 1-D input and a single-column 2-D array; indexing
            # column 1 of an (n, 1) array would raise IndexError.
            probs = y_pred.ravel()
        return float(_brier(y_true, probs))
class ExpectedCalibrationError(DeepTabMetric):
    """Expected Calibration Error (ECE).

    sklearn does not provide ECE natively, so this is a custom implementation.
    Bins predictions by confidence and measures the gap between mean
    confidence and accuracy per bin, weighted by the fraction of samples in
    the bin.

    Bins are equal-width over ``[0, 1]``. Each bin is half-open ``[lo, hi)``
    except the last, which is closed so that a confidence of exactly 1.0 is
    counted.

    Parameters
    ----------
    n_bins : int
        Number of confidence bins. Default 10.

    Raises
    ------
    ValueError
        If ``n_bins`` is smaller than 1.
    """

    name = "ece"
    higher_is_better = False

    def __init__(self, n_bins: int = 10) -> None:
        # With no bins the score would silently be 0.0, which reads as
        # "perfectly calibrated"; reject it up front instead.
        if n_bins < 1:
            raise ValueError(f"n_bins must be a positive integer, got {n_bins!r}")
        self.n_bins = n_bins

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the ECE of ``y_pred`` against ``y_true``.

        Parameters
        ----------
        y_true : np.ndarray
            Ground-truth labels; flattened to 1-D.
        y_pred : np.ndarray
            Either a 2-D array with two or more columns (confidence is the
            row maximum, prediction the row argmax), or a 1-D / single-column
            array of positive-class probabilities (prediction is
            ``p >= 0.5``, confidence is ``max(p, 1 - p)``).

        Returns
        -------
        float
            Weighted mean absolute gap between accuracy and confidence;
            0.0 when there are no samples.
        """
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred, dtype=float)

        if y_pred.ndim == 2 and y_pred.shape[1] > 1:
            confidence = y_pred.max(axis=1)
            preds = y_pred.argmax(axis=1)
        else:
            # A single column must not go through argmax: it would always
            # predict class 0.
            positive = y_pred.ravel()
            preds = (positive >= 0.5).astype(int)
            confidence = np.where(preds == 1, positive, 1.0 - positive)

        correct = (preds == y_true).astype(float)
        bin_edges = np.linspace(0.0, 1.0, self.n_bins + 1)
        last_bin = self.n_bins - 1
        n = len(y_true)

        ece = 0.0
        for index, (lo, hi) in enumerate(itertools.pairwise(bin_edges)):
            # Close the final bin on the right so confidence == 1.0 is not
            # dropped from every bin.
            below_hi = confidence <= hi if index == last_bin else confidence < hi
            mask = (confidence >= lo) & below_hi
            count = mask.sum()
            if count == 0:
                continue
            gap = abs(correct[mask].mean() - confidence[mask].mean())
            ece += count / n * gap
        return float(ece)

    def __repr__(self) -> str:
        return f"ExpectedCalibrationError(n_bins={self.n_bins})"