
"""
Models following scikit-learn's estimator API.
"""
from sklearn.base import BaseEstimator

from . import algorithms
from . import families
from .utils import (
    sigmoid, dot, add_intercept, mean_squared_error, accuracy_score, exp,
    poisson_deviance
)


class _GLM(BaseEstimator):

    @property
    def family(self):
        """
        The ``dask_glm.families`` family this estimator is for;
        implemented by each subclass.
        """

    def __init__(self, fit_intercept=True, solver='admm', regularizer='l2',
                 max_iter=100, tol=1e-4, lamduh=1.0, rho=1,
                 over_relax=1, abstol=1e-4, reltol=1e-2):
        self.fit_intercept = fit_intercept
        self.solver = solver
        self.regularizer = regularizer
        self.max_iter = max_iter
        self.tol = tol
        self.lamduh = lamduh
        self.rho = rho
        self.over_relax = over_relax
        self.abstol = abstol
        self.reltol = reltol

        self.coef_ = None
        self.intercept_ = None
        self._coef = None  # coef, maybe with intercept

        # Collect the names of the keyword arguments the chosen solver
        # accepts; every solver takes ``max_iter`` and ``family``.
        fit_kwargs = {'max_iter', 'tol', 'family'}

        if solver == 'admm':
            # ADMM uses its own ``abstol``/``reltol`` stopping criteria
            # instead of ``tol``.
            fit_kwargs.discard('tol')
            fit_kwargs.update({
                'regularizer', 'lamduh', 'rho', 'over_relax', 'abstol',
                'reltol'
            })
        elif solver in ('proximal_grad', 'lbfgs'):
            fit_kwargs.update({'regularizer', 'lamduh'})

        self._fit_kwargs = {k: getattr(self, k) for k in fit_kwargs}

    def fit(self, X, y=None):
        X_ = self._maybe_add_intercept(X)
        self._coef = algorithms._solvers[self.solver](X_, y, **self._fit_kwargs)

        if self.fit_intercept:
            # The intercept column is appended last, so the final entry of
            # the solution vector is the intercept.
            self.coef_ = self._coef[:-1]
            self.intercept_ = self._coef[-1]
        else:
            self.coef_ = self._coef
        return self

    def _maybe_add_intercept(self, X):
        if self.fit_intercept:
            return add_intercept(X)
        else:
            return X
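

# An informal summary (for illustration only; derived from ``_GLM.__init__``
# above, not enforced anywhere): the keyword arguments forwarded to each
# solver work out to
#
#     'gradient_descent', 'newton':  max_iter, tol, family
#     'lbfgs', 'proximal_grad':      max_iter, tol, family, regularizer, lamduh
#     'admm':                        max_iter, family, regularizer, lamduh,
#                                    rho, over_relax, abstol, reltol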


class LogisticRegression(_GLM):
    """
    Estimator for logistic regression.

    Parameters
    ----------
    fit_intercept : bool, default True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.
    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
        Solver to use. See :ref:`api.algorithms` for details.
    regularizer : {'l1', 'l2'}
        Regularizer to use. See :ref:`api.regularizers` for details.
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    max_iter : int, default 100
        Maximum number of iterations taken for the solvers to converge.
    tol : float, default 1e-4
        Tolerance for stopping criteria. Ignored for ``admm`` solver.
    lamduh : float, default 1.0
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    rho, over_relax, abstol, reltol : float
        Only used with the ``admm`` solver.

    Attributes
    ----------
    coef_ : array, shape (n_features,)
        The learned value for the model's coefficients.
    intercept_ : float or None
        The learned value for the intercept, if one was added
        to the model.

    Examples
    --------
    >>> from dask_glm.datasets import make_classification
    >>> X, y = make_classification()
    >>> lr = LogisticRegression()
    >>> lr.fit(X, y)
    >>> lr.predict(X)
    >>> lr.predict_proba(X)
    >>> lr.score(X, y)
    """
    @property
    def family(self):
        return families.Logistic

    def predict(self, X):
        return self.predict_proba(X) > .5  # TODO: verify, multiclass broken

    def predict_proba(self, X):
        X_ = self._maybe_add_intercept(X)
        return sigmoid(dot(X_, self._coef))

    def score(self, X, y):
        return accuracy_score(y, self.predict(X))
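

# Usage sketch (an informal note, assuming dask array inputs; results stay
# lazy until computed):
#
#     proba = lr.predict_proba(X)   # probabilities, sigmoid(X @ coef)
#     labels = lr.predict(X)        # boolean array: proba > 0.5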


class LinearRegression(_GLM):
    """
    Estimator for a linear model using Ordinary Least Squares.

    Parameters
    ----------
    fit_intercept : bool, default True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.
    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
        Solver to use. See :ref:`api.algorithms` for details.
    regularizer : {'l1', 'l2'}
        Regularizer to use. See :ref:`api.regularizers` for details.
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    max_iter : int, default 100
        Maximum number of iterations taken for the solvers to converge.
    tol : float, default 1e-4
        Tolerance for stopping criteria. Ignored for ``admm`` solver.
    lamduh : float, default 1.0
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    rho, over_relax, abstol, reltol : float
        Only used with the ``admm`` solver.

    Attributes
    ----------
    coef_ : array, shape (n_features,)
        The learned value for the model's coefficients.
    intercept_ : float or None
        The learned value for the intercept, if one was added
        to the model.

    Examples
    --------
    >>> from dask_glm.datasets import make_regression
    >>> X, y = make_regression()
    >>> est = LinearRegression()
    >>> est.fit(X, y)
    >>> est.predict(X)
    >>> est.score(X, y)
    """
    @property
    def family(self):
        return families.Normal

    def predict(self, X):
        X_ = self._maybe_add_intercept(X)
        return dot(X_, self._coef)

    def score(self, X, y):
        return mean_squared_error(y, self.predict(X))


class PoissonRegression(_GLM):
    """
    Estimator for Poisson regression.

    Parameters
    ----------
    fit_intercept : bool, default True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.
    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
        Solver to use. See :ref:`api.algorithms` for details.
    regularizer : {'l1', 'l2'}
        Regularizer to use. See :ref:`api.regularizers` for details.
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    max_iter : int, default 100
        Maximum number of iterations taken for the solvers to converge.
    tol : float, default 1e-4
        Tolerance for stopping criteria. Ignored for ``admm`` solver.
    lamduh : float, default 1.0
        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
    rho, over_relax, abstol, reltol : float
        Only used with the ``admm`` solver.

    Attributes
    ----------
    coef_ : array, shape (n_features,)
        The learned value for the model's coefficients.
    intercept_ : float or None
        The learned value for the intercept, if one was added
        to the model.

    Examples
    --------
    >>> from dask_glm.datasets import make_poisson
    >>> X, y = make_poisson()
    >>> pr = PoissonRegression()
    >>> pr.fit(X, y)
    >>> pr.predict(X)
    >>> pr.get_deviance(X, y)
    """
    @property
    def family(self):
        return families.Poisson

    def predict(self, X):
        X_ = self._maybe_add_intercept(X)
        return exp(dot(X_, self._coef))

    def get_deviance(self, X, y):
        return poisson_deviance(y, self.predict(X))
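

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the library API), assuming the
    # ``dask_glm.datasets`` helpers referenced in the docstrings above.
    from dask_glm.datasets import (
        make_classification, make_poisson, make_regression
    )

    X, y = make_classification()
    lr = LogisticRegression().fit(X, y)       # fit() returns the estimator
    print('logistic coef:', lr.coef_, 'intercept:', lr.intercept_)

    X, y = make_regression()
    ols = LinearRegression().fit(X, y)
    print('OLS coef:', ols.coef_, 'intercept:', ols.intercept_)

    X, y = make_poisson()
    pr = PoissonRegression().fit(X, y)
    print('poisson coef:', pr.coef_, 'intercept:', pr.intercept_)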