"""Optimization algorithms for solving minimizaiton problems.
"""
from __future__ import absolute_import, division, print_function
from dask import delayed, persist, compute, set_options
import functools
import numpy as np
import dask.array as da
from scipy.optimize import fmin_l_bfgs_b
from dask_glm.utils import dot, normalize
from dask_glm.families import Logistic
from dask_glm.regularizers import Regularizer


def compute_stepsize_dask(beta, step, Xbeta, Xstep, y, curr_val,
                          family=Logistic, stepSize=1.0,
                          armijoMult=0.1, backtrackMult=0.1):
"""Compute the optimal stepsize
beta : array-like
step : float
XBeta : array-lie
Xstep :
y : array-like
curr_val : float
famlily : Family, optional
stepSize : float, optional
armijoMult : float, optional
backtrackMult : float, optional
Returns
-------
stepSize : flaot
beta : array-like
xBeta : array-like
func : callable
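
    Examples
    --------
    Illustrative only (example added for clarity; the tiny NumPy inputs
    below are arbitrary, and in practice dask collections are passed):

    >>> import numpy as np
    >>> from dask_glm.algorithms import compute_stepsize_dask
    >>> from dask_glm.families import Logistic
    >>> X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    >>> y = np.array([1.0, 0.0, 1.0])
    >>> beta = np.zeros(2)
    >>> Xbeta = X.dot(beta)
    >>> grad = Logistic.gradient(Xbeta, X, y)
    >>> size, beta, Xbeta, func = compute_stepsize_dask(
    ...     beta, grad, Xbeta, X.dot(grad), y, Logistic.loglike(Xbeta, y))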
"""
loglike = family.loglike
beta, step, Xbeta, Xstep, y, curr_val = persist(beta, step, Xbeta, Xstep, y, curr_val)
obeta, oXbeta = beta, Xbeta
(step,) = compute(step)
steplen = (step ** 2).sum()
lf = curr_val
func = 0
for ii in range(100):
beta = obeta - stepSize * step
if ii and (beta == obeta).all():
stepSize = 0
break
Xbeta = oXbeta - stepSize * Xstep
func = loglike(Xbeta, y)
Xbeta, func = persist(Xbeta, func)
df = lf - compute(func)[0]
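        # Armijo sufficient-decrease condition: accept the step once the
        # decrease in the objective is at least armijoMult * stepSize * ||step||^2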
if df >= armijoMult * stepSize * steplen:
break
stepSize *= backtrackMult
return stepSize, beta, Xbeta, func


@normalize
def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
"""
Michael Grant's implementation of Gradient Descent.
Parameters
----------
X : array-like, shape (n_samples, n_features)
y : array-like, shape (n_samples,)
max_iter : int
maximum number of iterations to attempt before declaring
failure to converge
    tol : float
        Relative change in the objective below which convergence
        is declared
family : Family
Returns
-------
beta : array-like, shape (n_features,)
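
    Examples
    --------
    Illustrative only (example added for clarity; the sizes and chunking
    below are arbitrary):

    >>> import dask.array as da
    >>> from dask_glm.algorithms import gradient_descent
    >>> X = da.random.random((100, 2), chunks=(50, 2))
    >>> y = (da.random.random(100, chunks=50) > 0.5).astype('float64')
    >>> beta = gradient_descent(X, y, max_iter=5)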
"""
loglike, gradient = family.loglike, family.gradient
n, p = X.shape
firstBacktrackMult = 0.1
nextBacktrackMult = 0.5
armijoMult = 0.1
stepGrowth = 1.25
stepSize = 1.0
recalcRate = 10
backtrackMult = firstBacktrackMult
beta = np.zeros(p)
for k in range(max_iter):
# how necessary is this recalculation?
if k % recalcRate == 0:
Xbeta = X.dot(beta)
func = loglike(Xbeta, y)
grad = gradient(Xbeta, X, y)
Xgradient = X.dot(grad)
# backtracking line search
lf = func
stepSize, _, _, func = compute_stepsize_dask(beta, grad,
Xbeta, Xgradient,
y, func, family=family,
backtrackMult=backtrackMult,
armijoMult=armijoMult,
stepSize=stepSize)
beta, stepSize, Xbeta, lf, func, grad, Xgradient = persist(
beta, stepSize, Xbeta, lf, func, grad, Xgradient)
stepSize, lf, func, grad = compute(stepSize, lf, func, grad)
beta = beta - stepSize * grad # tiny bit of repeat work here to avoid communication
Xbeta = Xbeta - stepSize * Xgradient
if stepSize == 0:
break
df = lf - func
df /= max(func, lf)
if df < tol:
break
stepSize *= stepGrowth
backtrackMult = nextBacktrackMult
return beta


@normalize
def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
"""Newtons Method for Logistic Regression.
Parameters
----------
X : array-like, shape (n_samples, n_features)
y : array-like, shape (n_samples,)
max_iter : int
maximum number of iterations to attempt before declaring
failure to converge
    tol : float
        Maximum absolute change in the coefficients between iterations
        below which convergence is declared
family : Family
Returns
-------
beta : array-like, shape (n_features,)
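
    Examples
    --------
    Illustrative only (example added for clarity; the sizes and chunking
    below are arbitrary):

    >>> import dask.array as da
    >>> from dask_glm.algorithms import newton
    >>> X = da.random.random((100, 2), chunks=(50, 2))
    >>> y = (da.random.random(100, chunks=50) > 0.5).astype('float64')
    >>> beta = newton(X, y, max_iter=5)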
"""
gradient, hessian = family.gradient, family.hessian
n, p = X.shape
beta = np.zeros(p) # always init to zeros?
Xbeta = dot(X, beta)
iter_count = 0
converged = False
while not converged:
beta_old = beta
# should this use map_blocks()?
hess = hessian(Xbeta, X)
grad = gradient(Xbeta, X, y)
hess, grad = da.compute(hess, grad)
# should this be dask or numpy?
# currently uses Python 3 specific syntax
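        # Newton step: solve hess @ step = grad; lstsq is used so a singular
        # or ill-conditioned Hessian does not raise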
step, _, _, _ = np.linalg.lstsq(hess, grad)
beta = (beta_old - step)
iter_count += 1
# should change this criterion
coef_change = np.absolute(beta_old - beta)
converged = (
(not np.any(coef_change > tol)) or (iter_count > max_iter))
if not converged:
            Xbeta = dot(X, beta)  # numpy -> dask conversion of beta
return beta


@normalize
def admm(X, y, regularizer='l1', lamduh=0.1, rho=1, over_relax=1,
         max_iter=250, abstol=1e-4, reltol=1e-2, family=Logistic, **kwargs):
"""
Alternating Direction Method of Multipliers
Parameters
----------
X : array-like, shape (n_samples, n_features)
y : array-like, shape (n_samples,)
regularizer : str or Regularizer
    lamduh : float
        regularization strength
    rho : float
        ADMM penalty parameter
    over_relax : float
        over-relaxation parameter
max_iter : int
maximum number of iterations to attempt before declaring
failure to converge
    abstol, reltol : float
        absolute and relative tolerances for the primal and dual
        residual stopping criteria
family : Family
Returns
-------
beta : array-like, shape (n_features,)
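
    Examples
    --------
    Illustrative only (example added for clarity; sizes, chunking and
    ``lamduh`` below are arbitrary):

    >>> import dask.array as da
    >>> from dask_glm.algorithms import admm
    >>> X = da.random.random((100, 2), chunks=(50, 2))
    >>> y = (da.random.random(100, chunks=50) > 0.5).astype('float64')
    >>> beta = admm(X, y, regularizer='l1', lamduh=0.01, max_iter=5)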
"""
pointwise_loss = family.pointwise_loss
pointwise_gradient = family.pointwise_gradient
regularizer = Regularizer.get(regularizer)
def create_local_gradient(func):
@functools.wraps(func)
def wrapped(beta, X, y, z, u, rho):
return func(beta, X, y) + rho * (beta - z + u)
return wrapped
def create_local_f(func):
@functools.wraps(func)
def wrapped(beta, X, y, z, u, rho):
return func(beta, X, y) + (rho / 2) * np.dot(beta - z + u,
beta - z + u)
return wrapped
f = create_local_f(pointwise_loss)
fprime = create_local_gradient(pointwise_gradient)
nchunks = getattr(X, 'npartitions', 1)
(n, p) = X.shape
if isinstance(X, da.Array):
XD = X.rechunk((None, X.shape[-1])).to_delayed().flatten().tolist()
else:
XD = [X]
if isinstance(y, da.Array):
yD = y.rechunk((None, y.shape[-1])).to_delayed().flatten().tolist()
else:
yD = [y]
z = np.zeros(p)
u = np.array([np.zeros(p) for i in range(nchunks)])
betas = np.array([np.ones(p) for i in range(nchunks)])
for k in range(max_iter):
# x-update step
new_betas = [delayed(local_update)(xx, yy, bb, z, uu, rho, f=f,
fprime=fprime) for
xx, yy, bb, uu in zip(XD, yD, betas, u)]
new_betas = np.array(da.compute(*new_betas))
beta_hat = over_relax * new_betas + (1 - over_relax) * z
# z-update step
zold = z.copy()
ztilde = np.mean(beta_hat + np.array(u), axis=0)
z = regularizer.proximal_operator(ztilde, lamduh / (rho * nchunks))
# u-update step
u += beta_hat - z
# check for convergence
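        # standard ADMM primal/dual residual stopping criterion
        # (cf. Boyd et al. 2011, "Distributed Optimization and Statistical
        # Learning via the Alternating Direction Method of Multipliers", sec. 3.3)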
primal_res = np.linalg.norm(new_betas - z)
dual_res = np.linalg.norm(rho * (z - zold))
eps_pri = np.sqrt(p * nchunks) * abstol + reltol * np.maximum(
np.linalg.norm(new_betas), np.sqrt(nchunks) * np.linalg.norm(z))
eps_dual = np.sqrt(p * nchunks) * abstol + \
reltol * np.linalg.norm(rho * u)
if primal_res < eps_pri and dual_res < eps_dual:
break
return z


def local_update(X, y, beta, z, u, rho, f, fprime, solver=fmin_l_bfgs_b):
beta = beta.ravel()
u = u.ravel()
z = z.ravel()
solver_args = (X, y, z, u, rho)
beta, f, d = solver(f, beta, fprime=fprime, args=solver_args,
maxiter=200,
maxfun=250)
return beta


@normalize
def lbfgs(X, y, regularizer=None, lamduh=1.0, max_iter=100, tol=1e-4,
          family=Logistic, verbose=False, **kwargs):
"""L-BFGS solver using scipy.optimize implementation
Parameters
----------
X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    regularizer : str or Regularizer, optional
        regularization to apply to the loss (default: no regularization)
    lamduh : float
        regularization strength
max_iter : int
maximum number of iterations to attempt before declaring
failure to converge
    tol : float
        Projected gradient tolerance; passed to
        scipy.optimize.fmin_l_bfgs_b as ``pgtol``
    family : Family
    verbose : bool, default False
        whether to print diagnostic information during convergence
Returns
-------
beta : array-like, shape (n_features,)
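
    Examples
    --------
    Illustrative only (example added for clarity; sizes, chunking and
    ``lamduh`` below are arbitrary):

    >>> import dask.array as da
    >>> from dask_glm.algorithms import lbfgs
    >>> X = da.random.random((100, 2), chunks=(50, 2))
    >>> y = (da.random.random(100, chunks=50) > 0.5).astype('float64')
    >>> beta = lbfgs(X, y, regularizer='l2', lamduh=0.001, max_iter=10)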
"""
pointwise_loss = family.pointwise_loss
pointwise_gradient = family.pointwise_gradient
if regularizer is not None:
regularizer = Regularizer.get(regularizer)
pointwise_loss = regularizer.add_reg_f(pointwise_loss, lamduh)
pointwise_gradient = regularizer.add_reg_grad(pointwise_gradient, lamduh)
n, p = X.shape
beta0 = np.zeros(p)
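    # fmin_l_bfgs_b accepts a single callable returning (loss, gradient)
    # when fprime is None and approx_grad is False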
def compute_loss_grad(beta, X, y):
loss_fn = pointwise_loss(beta, X, y)
gradient_fn = pointwise_gradient(beta, X, y)
loss, gradient = compute(loss_fn, gradient_fn)
return loss, gradient.copy()
    with set_options(fuse_ave_width=0):  # optimization slows this down
beta, loss, info = fmin_l_bfgs_b(
compute_loss_grad, beta0, fprime=None,
args=(X, y),
iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)
return beta


@normalize
def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
                  max_iter=100, tol=1e-8, **kwargs):
"""
Parameters
----------
X : array-like, shape (n_samples, n_features)
y : array-like, shape (n_samples,)
max_iter : int
maximum number of iterations to attempt before declaring
failure to converge
    tol : float
        Relative change in the objective below which convergence
        is declared
family : Family
    regularizer : str or Regularizer
    lamduh : float
        regularization strength
Returns
-------
beta : array-like, shape (n_features,)
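
    Examples
    --------
    Illustrative only (example added for clarity; sizes, chunking and
    ``lamduh`` below are arbitrary):

    >>> import dask.array as da
    >>> from dask_glm.algorithms import proximal_grad
    >>> X = da.random.random((100, 2), chunks=(50, 2))
    >>> y = (da.random.random(100, chunks=50) > 0.5).astype('float64')
    >>> beta = proximal_grad(X, y, regularizer='l1', lamduh=0.01, max_iter=5)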
"""
n, p = X.shape
firstBacktrackMult = 0.1
nextBacktrackMult = 0.5
armijoMult = 0.1
stepGrowth = 1.25
stepSize = 1.0
recalcRate = 10
backtrackMult = firstBacktrackMult
beta = np.zeros(p)
regularizer = Regularizer.get(regularizer)
for k in range(max_iter):
# Compute the gradient
if k % recalcRate == 0:
Xbeta = X.dot(beta)
func = family.loglike(Xbeta, y)
gradient = family.gradient(Xbeta, X, y)
Xbeta, func, gradient = persist(
Xbeta, func, gradient)
obeta = beta
# Compute the step size
lf = func
for ii in range(100):
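            # proximal gradient step: move along the negative gradient, then
            # apply the regularizer's proximal operator (e.g. soft-thresholding
            # for L1) with threshold stepSize * lamduh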
beta = regularizer.proximal_operator(obeta - stepSize * gradient, stepSize * lamduh)
step = obeta - beta
Xbeta = X.dot(beta)
Xbeta, beta = persist(Xbeta, beta)
func = family.loglike(Xbeta, y)
func = persist(func)[0]
func = compute(func)[0]
df = lf - func
if df > 0:
break
stepSize *= backtrackMult
if stepSize == 0:
break
df /= max(func, lf)
if df < tol:
break
stepSize *= stepGrowth
backtrackMult = nextBacktrackMult
# L2-regularization returned a dask-array
try:
return beta.compute()
except AttributeError:
return beta


_solvers = {
'admm': admm,
'gradient_descent': gradient_descent,
'newton': newton,
'lbfgs': lbfgs,
'proximal_grad': proximal_grad
}