# Source code for qunfold.sklearn

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.multiclass import unique_labels


# [docs]
class CVClassifier(BaseEstimator, ClassifierMixin):
  """An ensemble of classifiers that are trained from cross-validation folds.

  All objects of this type have a fixed attribute `oob_score = True` and, when trained, a fitted attribute `self.oob_decision_function_`, just like scikit-learn bagging classifiers.

  Args:
      estimator: A classifier that implements the API of scikit-learn.
      n_estimators (optional): The number of stratified cross-validation folds. Defaults to `5`.
      random_state (optional): The random state for stratification. Defaults to `None`.

  Attributes:
      classes_: The class labels found in the training labels, as returned by `unique_labels`.
      estimators_: A list with one fitted clone of `estimator` per cross-validation fold.
      i_classes_: A list with one integer array per fitted estimator, holding the column indices (into `classes_`) of the classes that this estimator has seen.
      oob_decision_function_: An array of shape `(n_samples, n_classes)`. The row of each training sample is the `predict_proba` output of the estimator whose training fold did not contain this sample. Columns of classes that are unknown to this estimator remain zero.

  Examples:
      Here, we create an instance of ACC that trains a logistic regression classifier with 10 cross-validation folds.

          >>> ACC(CVClassifier(LogisticRegression(), 10))
  """
  def __init__(self, estimator, n_estimators=5, random_state=None):
    self.estimator = estimator
    self.n_estimators = n_estimators
    self.random_state = random_state
    self.oob_score = True # the whole point of this class is to have an oob_score

  def fit(self, X, y):
    """Fit one clone of the base estimator on the training part of each fold.

    Args:
        X: The training features. Rows are selected via `X[indices]` with integer index arrays, so `X` must support this kind of positional indexing (e.g., a NumPy array).
        y: The training labels, one per row of `X`. Any array-like is accepted.

    Returns:
        This fitted instance, `self`.
    """
    # Rows are selected by position below. Without this conversion, a plain list
    # would raise and a pandas Series would be looked up by index label instead.
    y = np.asarray(y)
    classes = unique_labels(y)
    class_mapping = { label: index for index, label in enumerate(classes) }
    oob_decision_function = np.zeros((len(y), len(classes)))
    estimators = []
    i_classes_per_estimator = [] # the indices of each estimator's subset of classes
    skf = StratifiedKFold(
        n_splits = self.n_estimators,
        random_state = self.random_state,
        shuffle = True
    )
    for i_trn, i_tst in skf.split(X, y):
      estimator = clone(self.estimator).fit(X[i_trn], y[i_trn])

      # a fold might lack some classes; map the estimator's columns to ours
      i_classes = np.array([ class_mapping[_class] for _class in estimator.classes_ ])

      # each held-out sample is predicted by the one estimator that has not seen it
      y_pred = estimator.predict_proba(X[i_tst])
      oob_decision_function[i_tst[:, np.newaxis], i_classes[np.newaxis, :]] = y_pred
      estimators.append(estimator)
      i_classes_per_estimator.append(i_classes)

    # Publish the fitted state only after all folds have succeeded, so that a
    # failed fit does not leave behind an instance that looks fitted.
    self.estimators_ = estimators
    self.i_classes_ = i_classes_per_estimator
    self.classes_ = classes
    self.oob_decision_function_ = oob_decision_function
    return self

  def predict_proba(self, X):
    """Predict class probabilities as the average over all fold estimators.

    Args:
        X: The features to predict. Must provide `X.shape[0]` as the number of samples.

    Returns:
        An array of shape `(n_samples, n_classes)` with columns ordered like `self.classes_`. An estimator contributes zero to the columns of classes it has not seen.

    Raises:
        NotFittedError: If `fit` has not been called successfully before.
    """
    if not hasattr(self, "classes_"):
      raise NotFittedError(
          f"This {type(self).__name__} instance is not fitted yet. "
          "Call 'fit' with appropriate arguments before using this estimator."
      )
    y_pred = np.zeros((len(self.estimators_), X.shape[0], len(self.classes_)))
    for i, (estimator, i_classes) in enumerate(zip(self.estimators_, self.i_classes_)):
      # y_pred[i] is a view; write this estimator's columns at their positions in classes_
      y_pred[i][:, i_classes] = estimator.predict_proba(X)
    return np.mean(y_pred, axis=0) # shape (n_samples, n_classes)

  def predict(self, X):
    """Predict the class label with the highest average probability.

    Args:
        X: The features to predict; passed on to `predict_proba`.

    Returns:
        An array with one element of `self.classes_` per sample.

    Raises:
        NotFittedError: If `fit` has not been called successfully before.
    """
    y_pred = self.predict_proba(X).argmax(axis=1) # class indices
    return self.classes_[y_pred]