Source code for dask_glm.algorithms

"""Optimization algorithms for solving minimizaiton problems.
"""

from __future__ import absolute_import, division, print_function

from dask import delayed, persist, compute, set_options
import functools
import numpy as np
import dask.array as da
from scipy.optimize import fmin_l_bfgs_b


from dask_glm.utils import dot, normalize
from dask_glm.families import Logistic
from dask_glm.regularizers import Regularizer


def compute_stepsize_dask(beta, step, Xbeta, Xstep,
                          y, curr_val,
                          family=Logistic,
                          stepSize=1.0,
                          armijoMult=0.1,
                          backtrackMult=0.1):
    """Compute the optimal stepsize

    Parameters
    ----------
    beta : array-like
    step : array-like
    Xbeta : array-like
    Xstep : array-like
    y : array-like
    curr_val : float
    family : Family, optional
    stepSize : float, optional
    armijoMult : float, optional
    backtrackMult : float, optional

    Returns
    -------
    stepSize : float
    beta : array-like
    Xbeta : array-like
    func : float
    """
    loglike = family.loglike
    beta, step, Xbeta, Xstep, y, curr_val = persist(
        beta, step, Xbeta, Xstep, y, curr_val)
    obeta, oXbeta = beta, Xbeta
    (step,) = compute(step)
    steplen = (step ** 2).sum()
    lf = curr_val
    func = 0
    for ii in range(100):
        beta = obeta - stepSize * step
        if ii and (beta == obeta).all():
            stepSize = 0
            break

        Xbeta = oXbeta - stepSize * Xstep
        func = loglike(Xbeta, y)
        Xbeta, func = persist(Xbeta, func)

        df = lf - compute(func)[0]
        if df >= armijoMult * stepSize * steplen:
            break
        stepSize *= backtrackMult

    return stepSize, beta, Xbeta, func

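# Illustrative sketch (not part of the original module): the same Armijo-style
# backtracking rule that compute_stepsize_dask applies, shown with plain numpy
# on an arbitrary smooth objective. The helper name and inputs are hypothetical,
# chosen only to make the sufficient-decrease test concrete.
def _backtracking_demo(beta, grad, objective,
                       stepSize=1.0, armijoMult=0.1, backtrackMult=0.1):
    lf = objective(beta)
    steplen = (grad ** 2).sum()
    for _ in range(100):
        trial = beta - stepSize * grad
        # accept once the decrease is at least armijoMult * stepSize * ||grad||^2
        if lf - objective(trial) >= armijoMult * stepSize * steplen:
            return stepSize, trial
        stepSize *= backtrackMult
    return 0.0, beta

# e.g. _backtracking_demo(np.array([2.0, -1.0]), np.array([2.0, -1.0]),
#                         lambda b: 0.5 * (b ** 2).sum())
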
@normalize
def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
    """
    Michael Grant's implementation of Gradient Descent.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    max_iter : int
        maximum number of iterations to attempt before declaring
        failure to converge
    tol : float
        Maximum allowed change from prior iteration required to
        declare convergence
    family : Family

    Returns
    -------
    beta : array-like, shape (n_features,)
    """
    loglike, gradient = family.loglike, family.gradient
    n, p = X.shape
    firstBacktrackMult = 0.1
    nextBacktrackMult = 0.5
    armijoMult = 0.1
    stepGrowth = 1.25
    stepSize = 1.0
    recalcRate = 10
    backtrackMult = firstBacktrackMult
    beta = np.zeros(p)

    for k in range(max_iter):
        # how necessary is this recalculation?
        if k % recalcRate == 0:
            Xbeta = X.dot(beta)
            func = loglike(Xbeta, y)

        grad = gradient(Xbeta, X, y)
        Xgradient = X.dot(grad)

        # backtracking line search
        lf = func
        stepSize, _, _, func = compute_stepsize_dask(beta, grad,
                                                     Xbeta, Xgradient,
                                                     y, func, family=family,
                                                     backtrackMult=backtrackMult,
                                                     armijoMult=armijoMult,
                                                     stepSize=stepSize)

        beta, stepSize, Xbeta, lf, func, grad, Xgradient = persist(
            beta, stepSize, Xbeta, lf, func, grad, Xgradient)

        stepSize, lf, func, grad = compute(stepSize, lf, func, grad)

        beta = beta - stepSize * grad  # tiny bit of repeat work here to avoid communication
        Xbeta = Xbeta - stepSize * Xgradient

        if stepSize == 0:
            break

        df = lf - func
        df /= max(func, lf)

        if df < tol:
            break
        stepSize *= stepGrowth
        backtrackMult = nextBacktrackMult

    return beta

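def _example_gradient_descent():
    # Hypothetical usage sketch (not part of the original module): fit logistic
    # regression coefficients with gradient_descent on small synthetic data.
    # The shapes, chunk sizes, and tolerance below are illustrative assumptions.
    X = da.random.random((1000, 5), chunks=(100, 5))
    y = (da.random.random(1000, chunks=100) > 0.5).astype(float)
    return gradient_descent(X, y, max_iter=50, tol=1e-8)
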
@normalize
def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
    """Newton's Method for Logistic Regression.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    max_iter : int
        maximum number of iterations to attempt before declaring
        failure to converge
    tol : float
        Maximum allowed change from prior iteration required to
        declare convergence
    family : Family

    Returns
    -------
    beta : array-like, shape (n_features,)
    """
    gradient, hessian = family.gradient, family.hessian
    n, p = X.shape
    beta = np.zeros(p)  # always init to zeros?
    Xbeta = dot(X, beta)

    iter_count = 0
    converged = False

    while not converged:
        beta_old = beta

        # should this use map_blocks()?
        hess = hessian(Xbeta, X)
        grad = gradient(Xbeta, X, y)

        hess, grad = da.compute(hess, grad)

        # should this be dask or numpy?
        # currently uses Python 3 specific syntax
        step, _, _, _ = np.linalg.lstsq(hess, grad)
        beta = (beta_old - step)

        iter_count += 1

        # should change this criterion
        coef_change = np.absolute(beta_old - beta)
        converged = (
            (not np.any(coef_change > tol)) or (iter_count > max_iter))

        if not converged:
            Xbeta = dot(X, beta)  # numpy -> dask conversion of beta

    return beta

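def _example_newton():
    # Hypothetical usage sketch (not part of the original module): fit logistic
    # regression coefficients with Newton's method. The data, shapes, and chunk
    # sizes are illustrative assumptions.
    X = da.random.random((500, 3), chunks=(250, 3))
    y = (da.random.random(500, chunks=250) > 0.5).astype(float)
    return newton(X, y, max_iter=20, tol=1e-8)
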
@normalize
def admm(X, y, regularizer='l1', lamduh=0.1, rho=1, over_relax=1,
         max_iter=250, abstol=1e-4, reltol=1e-2, family=Logistic, **kwargs):
    """
    Alternating Direction Method of Multipliers

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    regularizer : str or Regularizer
    lamduh : float
    rho : float
    over_relax : float
    max_iter : int
        maximum number of iterations to attempt before declaring
        failure to converge
    abstol, reltol : float
    family : Family

    Returns
    -------
    beta : array-like, shape (n_features,)
    """
    pointwise_loss = family.pointwise_loss
    pointwise_gradient = family.pointwise_gradient
    regularizer = Regularizer.get(regularizer)

    def create_local_gradient(func):
        @functools.wraps(func)
        def wrapped(beta, X, y, z, u, rho):
            return func(beta, X, y) + rho * (beta - z + u)
        return wrapped

    def create_local_f(func):
        @functools.wraps(func)
        def wrapped(beta, X, y, z, u, rho):
            return func(beta, X, y) + (rho / 2) * np.dot(beta - z + u,
                                                         beta - z + u)
        return wrapped

    f = create_local_f(pointwise_loss)
    fprime = create_local_gradient(pointwise_gradient)

    nchunks = getattr(X, 'npartitions', 1)
    # nchunks = X.npartitions
    (n, p) = X.shape
    # XD = X.to_delayed().flatten().tolist()
    # yD = y.to_delayed().flatten().tolist()

    if isinstance(X, da.Array):
        XD = X.rechunk((None, X.shape[-1])).to_delayed().flatten().tolist()
    else:
        XD = [X]
    if isinstance(y, da.Array):
        yD = y.rechunk((None, y.shape[-1])).to_delayed().flatten().tolist()
    else:
        yD = [y]

    z = np.zeros(p)
    u = np.array([np.zeros(p) for i in range(nchunks)])
    betas = np.array([np.ones(p) for i in range(nchunks)])

    for k in range(max_iter):

        # x-update step
        new_betas = [delayed(local_update)(xx, yy, bb, z, uu, rho,
                                           f=f, fprime=fprime)
                     for xx, yy, bb, uu in zip(XD, yD, betas, u)]
        new_betas = np.array(da.compute(*new_betas))

        beta_hat = over_relax * new_betas + (1 - over_relax) * z

        # z-update step
        zold = z.copy()
        ztilde = np.mean(beta_hat + np.array(u), axis=0)
        z = regularizer.proximal_operator(ztilde, lamduh / (rho * nchunks))

        # u-update step
        u += beta_hat - z

        # check for convergence
        primal_res = np.linalg.norm(new_betas - z)
        dual_res = np.linalg.norm(rho * (z - zold))

        eps_pri = np.sqrt(p * nchunks) * abstol + reltol * np.maximum(
            np.linalg.norm(new_betas),
            np.sqrt(nchunks) * np.linalg.norm(z))
        eps_dual = np.sqrt(p * nchunks) * abstol + \
            reltol * np.linalg.norm(rho * u)

        if primal_res < eps_pri and dual_res < eps_dual:
            break

    return z

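def _example_admm():
    # Hypothetical usage sketch (not part of the original module): L1-regularized
    # logistic regression via ADMM, with one local_update subproblem per chunk of X.
    # The data and the lamduh/rho values are illustrative assumptions.
    X = da.random.random((1000, 4), chunks=(250, 4))
    y = (da.random.random(1000, chunks=250) > 0.5).astype(float)
    return admm(X, y, regularizer='l1', lamduh=0.1, rho=1, max_iter=100)
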
def local_update(X, y, beta, z, u, rho, f, fprime, solver=fmin_l_bfgs_b):
    beta = beta.ravel()
    u = u.ravel()
    z = z.ravel()
    solver_args = (X, y, z, u, rho)
    beta, f, d = solver(f, beta, fprime=fprime, args=solver_args,
                        maxiter=200, maxfun=250)

    return beta


@normalize
def lbfgs(X, y, regularizer=None, lamduh=1.0, max_iter=100, tol=1e-4,
          family=Logistic, verbose=False, **kwargs):
    """L-BFGS solver using scipy.optimize implementation

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    max_iter : int
        maximum number of iterations to attempt before declaring
        failure to converge
    tol : float
        Maximum allowed change from prior iteration required to
        declare convergence
    family : Family

    Returns
    -------
    beta : array-like, shape (n_features,)
    """
    pointwise_loss = family.pointwise_loss
    pointwise_gradient = family.pointwise_gradient
    if regularizer is not None:
        regularizer = Regularizer.get(regularizer)
        pointwise_loss = regularizer.add_reg_f(pointwise_loss, lamduh)
        pointwise_gradient = regularizer.add_reg_grad(pointwise_gradient,
                                                      lamduh)

    n, p = X.shape
    beta0 = np.zeros(p)

    def compute_loss_grad(beta, X, y):
        loss_fn = pointwise_loss(beta, X, y)
        gradient_fn = pointwise_gradient(beta, X, y)
        loss, gradient = compute(loss_fn, gradient_fn)
        return loss, gradient.copy()

    with set_options(fuse_ave_width=0):  # optimization slows this down
        beta, loss, info = fmin_l_bfgs_b(
            compute_loss_grad, beta0, fprime=None,
            args=(X, y),
            iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)

    return beta

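def _example_lbfgs():
    # Hypothetical usage sketch (not part of the original module): an
    # unregularized fit and an L2-regularized fit with the scipy L-BFGS-B driver.
    # The 'l2' regularizer name, the lamduh value, and the data are assumptions
    # made for illustration.
    X = da.random.random((1000, 4), chunks=(250, 4))
    y = (da.random.random(1000, chunks=250) > 0.5).astype(float)
    beta_plain = lbfgs(X, y, max_iter=50, tol=1e-4)
    beta_l2 = lbfgs(X, y, regularizer='l2', lamduh=0.5)
    return beta_plain, beta_l2
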
@normalize
def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
                  max_iter=100, tol=1e-8, **kwargs):
    """
    Proximal gradient solver with backtracking line search.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    max_iter : int
        maximum number of iterations to attempt before declaring
        failure to converge
    tol : float
        Maximum allowed change from prior iteration required to
        declare convergence
    family : Family
    verbose : bool, default False
        whether to print diagnostic information during convergence

    Returns
    -------
    beta : array-like, shape (n_features,)
    """
    n, p = X.shape
    firstBacktrackMult = 0.1
    nextBacktrackMult = 0.5
    armijoMult = 0.1
    stepGrowth = 1.25
    stepSize = 1.0
    recalcRate = 10
    backtrackMult = firstBacktrackMult
    beta = np.zeros(p)
    regularizer = Regularizer.get(regularizer)

    for k in range(max_iter):
        # Compute the gradient
        if k % recalcRate == 0:
            Xbeta = X.dot(beta)
            func = family.loglike(Xbeta, y)

        gradient = family.gradient(Xbeta, X, y)

        Xbeta, func, gradient = persist(
            Xbeta, func, gradient)

        obeta = beta

        # Compute the step size
        lf = func
        for ii in range(100):
            beta = regularizer.proximal_operator(obeta - stepSize * gradient,
                                                 stepSize * lamduh)
            step = obeta - beta
            Xbeta = X.dot(beta)

            Xbeta, beta = persist(Xbeta, beta)

            func = family.loglike(Xbeta, y)
            func = persist(func)[0]
            func = compute(func)[0]
            df = lf - func
            if df > 0:
                break
            stepSize *= backtrackMult

        if stepSize == 0:
            break

        df /= max(func, lf)
        if df < tol:
            break

        stepSize *= stepGrowth
        backtrackMult = nextBacktrackMult

    # L2-regularization returned a dask-array
    try:
        return beta.compute()
    except AttributeError:
        return beta

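def _example_proximal_grad():
    # Hypothetical usage sketch (not part of the original module): sparse logistic
    # regression via proximal gradient with the 'l1' proximal operator. The data
    # and the lamduh value are illustrative assumptions.
    X = da.random.random((1000, 6), chunks=(250, 6))
    y = (da.random.random(1000, chunks=250) > 0.5).astype(float)
    return proximal_grad(X, y, regularizer='l1', lamduh=0.01, max_iter=100)
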
_solvers = {
    'admm': admm,
    'gradient_descent': gradient_descent,
    'newton': newton,
    'lbfgs': lbfgs,
    'proximal_grad': proximal_grad
}
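def _example_solver_lookup(name='lbfgs'):
    # Hypothetical usage sketch (not part of the original module): the _solvers
    # mapping lets a caller pick an algorithm by name. The data here is an
    # illustrative assumption; any key of _solvers can be passed as `name`.
    X = da.random.random((200, 3), chunks=(100, 3))
    y = (da.random.random(200, chunks=100) > 0.5).astype(float)
    solver = _solvers[name]
    return solver(X, y)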