Source code for deeptab.metrics.classification

"""Classification metrics (Accuracy, F1, AUROC, AUPRC, LogLoss, ECE, BrierScore).

All standard metrics delegate to :mod:`sklearn.metrics` internally.
The wrapper classes add the :class:`DeepTabMetric` interface (``name``,
``higher_is_better``, ``needs_raw``) and normalise DeepTab-specific
prediction formats (2-D probability arrays vs 1-D label arrays).

:class:`ExpectedCalibrationError` is the only class without a sklearn
equivalent and is therefore implemented from scratch.

Quick reference
---------------

.. list-table::
   :header-rows: 1
   :widths: 28 14 20 38

   * - Class
     - ``name``
     - ``higher_is_better``
     - Notes
   * - :class:`Accuracy`
     - ``"accuracy"``
     - ``True``
     - Fraction correct; **higher = better**
   * - :class:`F1Score`
     - ``"f1"``
     - ``True``
     - Harmonic mean precision/recall; **higher = better**
   * - :class:`AUROC`
     - ``"auroc"``
     - ``True``
     - Needs probability scores; **higher = better**
   * - :class:`AUPRC`
     - ``"auprc"``
     - ``True``
     - Better than AUROC for imbalanced data; **higher = better**
   * - :class:`LogLoss`
     - ``"log_loss"``
     - ``False``
     - Cross-entropy; lower = better
   * - :class:`BrierScore`
     - ``"brier"``
     - ``False``
     - MSE of probability; lower = better
   * - :class:`ExpectedCalibrationError`
     - ``"ece"``
     - ``False``
     - 0 = perfectly calibrated; lower = better
"""

from __future__ import annotations

import itertools

import numpy as np
from sklearn.metrics import accuracy_score as _accuracy
from sklearn.metrics import average_precision_score as _auprc
from sklearn.metrics import brier_score_loss as _brier
from sklearn.metrics import f1_score as _f1
from sklearn.metrics import log_loss as _log_loss
from sklearn.metrics import roc_auc_score as _auroc

from .base import DeepTabMetric



[docs]
class Accuracy(DeepTabMetric):
    """Classification accuracy -- delegates to :func:`sklearn.metrics.accuracy_score`.

    Accepts 1-D integer labels or 2-D probability arrays (argmax is taken).
    """

    name = "accuracy"
    higher_is_better = True

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred)
        labels = np.argmax(y_pred, axis=1) if y_pred.ndim == 2 else (y_pred.ravel() >= 0.5).astype(int)
        return float(_accuracy(y_true, labels))




[docs]
class F1Score(DeepTabMetric):
    """F1 Score -- delegates to :func:`sklearn.metrics.f1_score`.

    Parameters
    ----------
    average : str
        Averaging strategy: ``"binary"`` (default), ``"macro"``, or
        ``"weighted"``.
    """

    name = "f1"
    higher_is_better = True

    def __init__(self, average: str = "binary") -> None:
        if average not in ("binary", "macro", "weighted"):
            raise ValueError(f"average must be 'binary', 'macro', or 'weighted', got {average!r}")
        self.average = average

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred)
        labels = np.argmax(y_pred, axis=1) if y_pred.ndim == 2 else (y_pred.ravel() >= 0.5).astype(int)
        return float(_f1(y_true, labels, average=self.average, zero_division=0))  # type: ignore[arg-type]

    def __repr__(self) -> str:
        return f"F1Score(average={self.average!r})"




[docs]
class AUROC(DeepTabMetric):
    """Area Under the ROC Curve -- delegates to :func:`sklearn.metrics.roc_auc_score`.

    Parameters
    ----------
    average : str
        ``"macro"`` (default) or ``"weighted"``.  Ignored for binary tasks.
    """

    name = "auroc"
    higher_is_better = True

    def __init__(self, average: str = "macro") -> None:
        self.average = average

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred)
        try:
            if y_pred.ndim == 2 and y_pred.shape[1] == 2:
                return float(_auroc(y_true, y_pred[:, 1]))
            elif y_pred.ndim == 2:
                return float(_auroc(y_true, y_pred, multi_class="ovr", average=self.average))
            else:
                return float(_auroc(y_true, y_pred.ravel()))
        except ValueError:
            return float("nan")

    def __repr__(self) -> str:
        return f"AUROC(average={self.average!r})"




[docs]
class AUPRC(DeepTabMetric):
    """Area Under the Precision-Recall Curve -- delegates to
    :func:`sklearn.metrics.average_precision_score`.
    """

    name = "auprc"
    higher_is_better = True

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred)
        scores = y_pred[:, 1] if y_pred.ndim == 2 else y_pred.ravel()
        try:
            return float(_auprc(y_true, scores))
        except ValueError:
            return float("nan")




[docs]
class LogLoss(DeepTabMetric):
    """Cross-Entropy / Log Loss -- delegates to :func:`sklearn.metrics.log_loss`."""

    name = "log_loss"
    higher_is_better = False

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        return float(_log_loss(np.asarray(y_true).ravel(), np.asarray(y_pred)))




[docs]
class BrierScore(DeepTabMetric):
    """Brier Score -- delegates to :func:`sklearn.metrics.brier_score_loss`.

    Accepts 1-D probability scores or a 2-D array (second column is used).
    """

    name = "brier"
    higher_is_better = False

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred, dtype=float)
        probs = y_pred[:, 1] if y_pred.ndim == 2 else y_pred.ravel()
        return float(_brier(y_true, probs))




[docs]
class ExpectedCalibrationError(DeepTabMetric):
    """Expected Calibration Error (ECE).

    sklearn does not provide ECE natively, so this is a custom implementation.
    Bins predictions by confidence and measures the gap between mean confidence
    and accuracy per bin.

    Parameters
    ----------
    n_bins : int
        Number of confidence bins.  Default 10.
    """

    name = "ece"
    higher_is_better = False

    def __init__(self, n_bins: int = 10) -> None:
        self.n_bins = n_bins

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred, dtype=float)
        if y_pred.ndim == 2:
            confidence = y_pred.max(axis=1)
            preds = y_pred.argmax(axis=1)
        else:
            confidence = np.where(y_pred >= 0.5, y_pred, 1.0 - y_pred).ravel()
            preds = (y_pred.ravel() >= 0.5).astype(int)
        correct = (preds == y_true).astype(float)

        bin_edges = np.linspace(0.0, 1.0, self.n_bins + 1)
        ece = 0.0
        n = len(y_true)
        for lo, hi in itertools.pairwise(bin_edges):
            mask = (confidence >= lo) & (confidence < hi)
            if mask.sum() == 0:
                continue
            acc = correct[mask].mean()
            conf = confidence[mask].mean()
            ece += mask.sum() / n * abs(acc - conf)
        return float(ece)

    def __repr__(self) -> str:
        return f"ExpectedCalibrationError(n_bins={self.n_bins})"