
"""
Models following scikit-learn's estimator API.
"""
from sklearn.base import BaseEstimator

from . import algorithms
from . import families
from .utils import (
    sigmoid, dot, add_intercept, mean_squared_error, accuracy_score, exp,
    poisson_deviance
)


class _GLM(BaseEstimator):

    @property
    def family(self):
        """
        The ``dask_glm.families`` family this estimator is for;
        implemented by each subclass.
        """

    def __init__(self, fit_intercept=True, solver='admm', regularizer='l2',
                 max_iter=100, tol=1e-4, lamduh=1.0, rho=1,
                 over_relax=1, abstol=1e-4, reltol=1e-2):
        self.fit_intercept = fit_intercept
        self.solver = solver
        self.regularizer = regularizer
        self.max_iter = max_iter
        self.tol = tol
        self.lamduh = lamduh
        self.rho = rho
        self.over_relax = over_relax
        self.abstol = abstol
        self.reltol = reltol

        self.coef_ = None
        self.intercept_ = None
        self._coef = None  # coef, maybe with intercept

        # Collect the names of the keyword arguments the chosen solver
        # accepts; every solver takes ``max_iter`` and ``family``.
        fit_kwargs = {'max_iter', 'tol', 'family'}

        if solver == 'admm':
            # ADMM uses its own ``abstol``/``reltol`` stopping criteria
            # instead of ``tol``.
            fit_kwargs.discard('tol')
            fit_kwargs.update({
                'regularizer', 'lamduh', 'rho', 'over_relax', 'abstol',
                'reltol'
            })
        elif solver in ('proximal_grad', 'lbfgs'):
            fit_kwargs.update({'regularizer', 'lamduh'})

        self._fit_kwargs = {k: getattr(self, k) for k in fit_kwargs}

    def fit(self, X, y=None):
        X_ = self._maybe_add_intercept(X)
        self._coef = algorithms._solvers[self.solver](X_, y, **self._fit_kwargs)

        if self.fit_intercept:
            # The intercept column is appended last, so the final entry of
            # the solution vector is the intercept.
            self.coef_ = self._coef[:-1]
            self.intercept_ = self._coef[-1]
        else:
            self.coef_ = self._coef
        return self

    def _maybe_add_intercept(self, X):
        if self.fit_intercept:
            return add_intercept(X)
        else:
            return X
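

# An informal summary (for illustration only; derived from ``_GLM.__init__``
# above, not enforced anywhere): the keyword arguments forwarded to each
# solver work out to
#
#     'gradient_descent', 'newton':  max_iter, tol, family
#     'lbfgs', 'proximal_grad':      max_iter, tol, family, regularizer, lamduh
#     'admm':                        max_iter, family, regularizer, lamduh,
#                                    rho, over_relax, abstol, reltol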


class LogisticRegression(_GLM):
    """
    Estimator for logistic regression.

    Parameters
    ----------
    fit_intercept : bool, default True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.
    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
        Solver to use. See :ref:`api.algorithms` for details.
    regularizer : {'l1', 'l2'}
        Regularizer to use. See :ref:`api.regularizers` for details.
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    max_iter : int, default 100
        Maximum number of iterations taken for the solvers to converge.
    tol : float, default 1e-4
        Tolerance for stopping criteria. Ignored for ``admm`` solver.
    lamduh : float, default 1.0
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    rho, over_relax, abstol, reltol : float
        Only used with the ``admm`` solver.

    Attributes
    ----------
    coef_ : array, shape (n_features,)
        The learned value for the model's coefficients.
    intercept_ : float or None
        The learned value for the intercept, if one was added
        to the model.

    Examples
    --------
    >>> from dask_glm.datasets import make_classification
    >>> X, y = make_classification()
    >>> lr = LogisticRegression()
    >>> lr.fit(X, y)
    >>> lr.predict(X)
    >>> lr.predict_proba(X)
    >>> lr.score(X, y)
    """
    @property
    def family(self):
        return families.Logistic

    def predict(self, X):
        return self.predict_proba(X) > .5  # TODO: verify, multiclass broken

    def predict_proba(self, X):
        X_ = self._maybe_add_intercept(X)
        return sigmoid(dot(X_, self._coef))

    def score(self, X, y):
        return accuracy_score(y, self.predict(X))
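

# Usage sketch (an informal note, assuming dask array inputs; results stay
# lazy until computed):
#
#     proba = lr.predict_proba(X)   # probabilities, sigmoid(X @ coef)
#     labels = lr.predict(X)        # boolean array: proba > 0.5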


class LinearRegression(_GLM):
    """
    Estimator for a linear model using Ordinary Least Squares.

    Parameters
    ----------
    fit_intercept : bool, default True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.
    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
        Solver to use. See :ref:`api.algorithms` for details.
    regularizer : {'l1', 'l2'}
        Regularizer to use. See :ref:`api.regularizers` for details.
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    max_iter : int, default 100
        Maximum number of iterations taken for the solvers to converge.
    tol : float, default 1e-4
        Tolerance for stopping criteria. Ignored for ``admm`` solver.
    lamduh : float, default 1.0
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    rho, over_relax, abstol, reltol : float
        Only used with the ``admm`` solver.

    Attributes
    ----------
    coef_ : array, shape (n_features,)
        The learned value for the model's coefficients.
    intercept_ : float or None
        The learned value for the intercept, if one was added
        to the model.

    Examples
    --------
    >>> from dask_glm.datasets import make_regression
    >>> X, y = make_regression()
    >>> est = LinearRegression()
    >>> est.fit(X, y)
    >>> est.predict(X)
    >>> est.score(X, y)
    """
    @property
    def family(self):
        return families.Normal

    def predict(self, X):
        X_ = self._maybe_add_intercept(X)
        return dot(X_, self._coef)

    def score(self, X, y):
        return mean_squared_error(y, self.predict(X))


class PoissonRegression(_GLM):
    """
    Estimator for Poisson regression.

    Parameters
    ----------
    fit_intercept : bool, default True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.
    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
        Solver to use. See :ref:`api.algorithms` for details.
    regularizer : {'l1', 'l2'}
        Regularizer to use. See :ref:`api.regularizers` for details.
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    max_iter : int, default 100
        Maximum number of iterations taken for the solvers to converge.
    tol : float, default 1e-4
        Tolerance for stopping criteria. Ignored for ``admm`` solver.
    lamduh : float, default 1.0
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    rho, over_relax, abstol, reltol : float
        Only used with the ``admm`` solver.

    Attributes
    ----------
    coef_ : array, shape (n_features,)
        The learned value for the model's coefficients.
    intercept_ : float or None
        The learned value for the intercept, if one was added
        to the model.

    Examples
    --------
    >>> from dask_glm.datasets import make_poisson
    >>> X, y = make_poisson()
    >>> pr = PoissonRegression()
    >>> pr.fit(X, y)
    >>> pr.predict(X)
    >>> pr.get_deviance(X, y)
    """
    @property
    def family(self):
        return families.Poisson

    def predict(self, X):
        X_ = self._maybe_add_intercept(X)
        return exp(dot(X_, self._coef))

    def get_deviance(self, X, y):
        return poisson_deviance(y, self.predict(X))
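

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the library API), assuming the
    # ``dask_glm.datasets`` helpers referenced in the docstrings above.
    from dask_glm.datasets import (
        make_classification, make_poisson, make_regression
    )

    X, y = make_classification()
    lr = LogisticRegression().fit(X, y)       # fit() returns the estimator
    print('logistic coef:', lr.coef_, 'intercept:', lr.intercept_)

    X, y = make_regression()
    ols = LinearRegression().fit(X, y)
    print('OLS coef:', ols.coef_, 'intercept:', ols.intercept_)

    X, y = make_poisson()
    pr = PoissonRegression().fit(X, y)
    print('poisson coef:', pr.coef_, 'intercept:', pr.intercept_)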