diff --git a/docs/requirements.txt b/docs/requirements.txt index 196e93cc..4fbf8336 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,3 +8,4 @@ matplotlib nbinteract scipy jupyter-book +sympy \ No newline at end of file diff --git a/ngboost/.vscode/settings.json b/ngboost/.vscode/settings.json new file mode 100644 index 00000000..699e1e10 --- /dev/null +++ b/ngboost/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "D:\\src\\ngboost\\venv\\python.exe" +} \ No newline at end of file diff --git a/ngboost/distns/__init__.py b/ngboost/distns/__init__.py index 1804ba1d..94805422 100644 --- a/ngboost/distns/__init__.py +++ b/ngboost/distns/__init__.py @@ -9,3 +9,7 @@ from .normal import Normal, NormalFixedVar # NOQA from .poisson import Poisson # NOQA from .t import T, TFixedDf, TFixedDfFixedVar # NOQA +from .gamma import Gamma +from .beta import Beta +from .logitnormal import LogitNormal +from .betabernoulli import BetaBernoulli \ No newline at end of file diff --git a/ngboost/distns/beta.py b/ngboost/distns/beta.py new file mode 100644 index 00000000..6a40cad2 --- /dev/null +++ b/ngboost/distns/beta.py @@ -0,0 +1,83 @@ +import numpy as np +import numpy +import math +import scipy as sp +from scipy.stats import beta as dist +from scipy.special import polygamma, beta + +from ngboost.distns.distn import RegressionDistn +from ngboost.scores import CRPScore, LogScore + +import sympy as sym +from sympy import stats as symstats +from sympy.printing.lambdarepr import NumPyPrinter +from sympy.utilities.lambdify import lambdastr + +logalpha, logbeta, x = sym.symbols('logalpha logbeta x') + +distr = symstats.Beta('dist', sym.exp(logalpha), sym.exp(logbeta)) +score = -sym.log(symstats.density( distr ).pdf(x)) +def neg_loglikelihood_sympy(logalpha, logbeta, x): + return score + +# Need to create str, because non-numpy functions are used: polygamma; and exp is not translated into numpy inside polygamma +def newlambdify(args, funs): + funcstr = lambdastr(args, funs, printer=NumPyPrinter) + funcstr = funcstr.replace( + ' exp', 'numpy.exp' + ) + return eval(funcstr) + +neg_loglikelihood = np.vectorize( newlambdify((logalpha, logbeta, x), neg_loglikelihood_sympy(logalpha, logbeta, x)) ) +D_0 = np.vectorize( newlambdify( (logalpha, logbeta, x), sym.diff(neg_loglikelihood_sympy(logalpha, logbeta, x), logalpha)) ) +D_1 = np.vectorize( newlambdify( (logalpha, logbeta, x), sym.diff(neg_loglikelihood_sympy(logalpha, logbeta, x), logbeta)) ) +# FI_0_0 = np.vectorize( newlambdify( (logalpha, logbeta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logalpha), logalpha))).subs(x, distr))))) ) +# FI_0_1 = np.vectorize( newlambdify( (logalpha, logbeta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logalpha), logbeta))).subs(x, distr))))) ) +# FI_1_0 = np.vectorize( newlambdify( (logalpha, logbeta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logbeta), logalpha))).subs(x, distr))))) ) +# FI_1_1 = np.vectorize( newlambdify( (logalpha, logbeta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logbeta), logbeta))).subs(x, distr))))) ) + + + +class BetaLogScore(LogScore): + + def score(self, Y): + return -dist.logpdf(Y, a=self.alpha, b=self.beta, loc=0, scale=1) + # return neg_loglikelihood(k=self.a, logtheta=self.logscale, x=Y) + + def d_score(self, Y): + D = np.zeros((len(Y), 2)) # first col is dS/da, second col is dS/d(log(scale)) + D[:, 0] = 
D_0(logalpha=self.logalpha, logbeta=self.logbeta, x=Y) + D[:, 1] = D_1(logalpha=self.logalpha, logbeta=self.logbeta, x=Y) + return D + +class Beta(RegressionDistn): + + n_params = 2 + scores = [BetaLogScore] + + def __init__(self, params): + # save the parameters + self._params = params + + # create other objects that will be useful later + self.logalpha = params[0] + self.logbeta = params[1] + self.alpha = np.exp(params[0]) + self.beta = np.exp(params[1]) # since params[1] is log(scale) + self.dist = dist(a=self.alpha, b=self.beta) + + def fit(Y): + alpha, beta, loc1, scale1 = dist.fit(Y, floc=0, fscale=1) # use scipy's implementation + return np.array([np.log(alpha), np.log(beta)]) + + def sample(self, m): + return np.array([self.dist.rvs() for i in range(m)]) + + def __getattr__(self, name): # gives us access to Laplace.mean() required for RegressionDist.predict() + if name in dir(self.dist): + return getattr(self.dist, name) + return None + + @property + def params(self): + return {'alpha':self.alpha, 'beta':self.beta} \ No newline at end of file diff --git a/ngboost/distns/betabernoulli.py b/ngboost/distns/betabernoulli.py new file mode 100644 index 00000000..678a8493 --- /dev/null +++ b/ngboost/distns/betabernoulli.py @@ -0,0 +1,180 @@ +from scipy.stats import betabinom as dist +import numpy as np +from ngboost.distns.distn import RegressionDistn +from ngboost.scores import LogScore +from scipy.special import digamma, polygamma +from array import array +import sys + +class BetaBernoulliLogScore(LogScore): + def score(self, Y): + return -self.dist.logpmf(Y) + + def d_alpha(self, Y): + return self.alpha * ( + digamma(self.alpha + self.beta) + + digamma(Y + self.alpha) + - digamma(self.alpha + self.beta + 1) + - digamma(self.alpha) + ) + + def d_beta(self, Y): + return self.beta * ( + digamma(self.alpha + self.beta) + + digamma(-Y + self.beta + 1) + - digamma(self.alpha + self.beta + 1) + - digamma(self.beta) + ) + + def d_alpha_alpha(self, Y): + return ( + (polygamma(1, Y + self.alpha) + + polygamma(1, self.alpha + self.beta) - + polygamma(1, self.alpha + self.beta + 1) - + polygamma(1, self.alpha) + ) * self.alpha**2 + self.d_alpha(Y) + ) + + def d_alpha_beta(self, Y): + return ( + self.alpha * self.beta * ( + polygamma(1, self.alpha + self.beta) - + polygamma(1, self.alpha + self.beta + 1) + ) + ) + + def d_beta_beta(self, Y): + return ( + (polygamma(1, -Y + self.beta + 1) + + polygamma(1, self.alpha + self.beta) - + polygamma(1, self.alpha + self.beta + 1) - + polygamma(1, self.beta) + ) * self.beta**2 + self.d_beta(Y) + ) + def d_score(self, Y): + D = np.zeros( + (len(Y), 2) + ) # first col is dS/d(log(α)), second col is dS/d(log(β)) + D[:, 0] = -self.d_alpha(Y) + D[:, 1] = -self.d_beta(Y) + return D + + # Variance + def metric(self): + FI = np.zeros((self.alpha.shape[0], 2, 2)) + FI[:, 0, 0] = ( + self.d_alpha(0)*self.d_alpha(0)*self.dist.pmf(0) + + self.d_alpha(1)*self.d_alpha(1)*self.dist.pmf(1) + ) + # FI[:, 1, 0] = ( + # self.d_alpha(0)*self.d_beta(0)*self.dist.pmf(0) + + # self.d_alpha(1)*self.d_beta(1)*self.dist.pmf(1) + # ) + # FI[:, 0, 1] = ( + # self.d_alpha(0)*self.d_beta(0)*self.dist.pmf(0) + + # self.d_alpha(1)*self.d_beta(1)*self.dist.pmf(1) + # ) + FI[:, 1, 1] = ( + self.d_beta(0)*self.d_beta(0)*self.dist.pmf(0) + + self.d_beta(1)*self.d_beta(1)*self.dist.pmf(1) + ) + return FI + + # Hessian + # def metric(self): + # FI = np.zeros((self.alpha.shape[0], 2, 2)) + # FI[:, 0, 0] = self.d_alpha_alpha(0) * self.dist.pmf(0) + self.d_alpha_alpha(1) * self.dist.pmf(1) + # # 
FI[:, 1, 0] = self.d_alpha_beta(0) * self.dist.pmf(0) + self.d_alpha_beta(1) * self.dist.pmf(1) + # # FI[:, 0, 1] = self.d_alpha_beta(0) * self.dist.pmf(0) + self.d_alpha_beta(1) * self.dist.pmf(1) + # FI[:, 1, 1] = self.d_beta_beta(0) * self.dist.pmf(0) + self.d_beta_beta(1) * self.dist.pmf(1) + # return FI + +class BetaBernoulli(RegressionDistn): + + n_params = 2 + scores = [BetaBernoulliLogScore] + + def __init__(self, params): + # save the parameters + self._params = params + + # create other objects that will be useful later + self.log_alpha = params[0] + self.log_beta = params[1] + self.alpha = np.exp(self.log_alpha) + self.beta = np.exp(self.log_beta) + self.dist = dist(n=1, a=self.alpha, b=self.beta) + + def fit(Y): + def fit_alpha_beta_py(success, alpha0=1.5, beta0=5, niter=1000): + # based on https://github.com/lfiaschi/fastbetabino/blob/master/fastbetabino.pyx + + # This optimisation works for Beta-Binomial distribution in general. + # For Beta-Bernoulli it's simplified by fixing the trials to 1. + trials = np.ones_like(Y) + + alpha_old = alpha0 + beta_old = beta0 + + for it in range(niter): + + alpha = ( + alpha_old + * ( + sum( + digamma(c + alpha_old) - digamma(alpha_old) + for c, i in zip(success, trials) + ) + ) + / ( + sum( + digamma(i + alpha_old + beta_old) + - digamma(alpha_old + beta_old) + for c, i in zip(success, trials) + ) + ) + ) + + beta = ( + beta_old + * ( + sum( + digamma(i - c + beta_old) - digamma(beta_old) + for c, i in zip(success, trials) + ) + ) + / ( + sum( + digamma(i + alpha_old + beta_old) + - digamma(alpha_old + beta_old) + for c, i in zip(success, trials) + ) + ) + ) + + # print('alpha {} | {} beta {} | {}'.format(alpha,alpha_old,beta,beta_old)) + sys.stdout.flush() + + if np.abs(alpha - alpha_old) and np.abs(beta - beta_old) < 1e-10: + # print('early stop') + break + + alpha_old = alpha + beta_old = beta + + return alpha, beta + + alpha, beta = fit_alpha_beta_py(Y) + return np.array([np.log(alpha), np.log(beta)]) + + def sample(self, m): + return np.array([self.dist.rvs() for i in range(m)]) + + def __getattr__(self, name): + if name in dir(self.dist): + return getattr(self.dist, name) + return None + + @property + def params(self): + return {"alpha": self.alpha, "beta": self.beta} \ No newline at end of file diff --git a/ngboost/distns/betabinomial.py b/ngboost/distns/betabinomial.py new file mode 100644 index 00000000..262b3829 --- /dev/null +++ b/ngboost/distns/betabinomial.py @@ -0,0 +1,148 @@ +from scipy.stats import betabinom as dist +import numpy as np +from ngboost.distns.distn import RegressionDistn +from ngboost.scores import LogScore +from scipy.special import digamma, polygamma +from array import array +import sys +from scipy.special import binom as binomial +import numpy +import math +import sympy as sym +from sympy import stats as symstats +from sympy.printing.lambdarepr import NumPyPrinter +from sympy.utilities.lambdify import lambdastr +import re + +# Need to create str, because non-numpy functions are used: polygamma; and exp is not translated into numpy inside polygamma +def newlambdify(args, funs): + funcstr = lambdastr(args, funs, printer=NumPyPrinter) + funcstr = funcstr.replace( + ' exp', 'numpy.exp' + ).replace( + 'builtins.sum', 'sum' + ) + funcstr = re.sub(r'\bk_\d*\b', '_k', funcstr) + return eval(funcstr) + +n, logalpha, logbeta, x = sym.symbols('n, logalpha, logbeta, x') + +distr = symstats.BetaBinomial('dist', n, sym.exp(logalpha), sym.exp(logbeta)) +score = -sym.log(symstats.density( distr ).pmf(x)) +def 
neg_loglikelihood_sympy(n, logalpha, logbeta, x): + return score + +neg_loglikelihood = np.vectorize( newlambdify( (n, logalpha, logbeta, x), neg_loglikelihood_sympy(n, logalpha, logbeta, x) ) ) +D_0 = np.vectorize( newlambdify( (n, logalpha, logbeta, x), sym.diff(neg_loglikelihood_sympy(n, logalpha, logbeta, x), logalpha) )) +D_1 = np.vectorize( newlambdify( (n, logalpha, logbeta, x), sym.diff(neg_loglikelihood_sympy(n, logalpha, logbeta, x), logbeta))) +FI_0_0 = np.vectorize( newlambdify( (n, logalpha, logbeta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logalpha), logalpha))).subs(x, distr)))) )) +FI_0_1 = np.vectorize( newlambdify( (n, logalpha, logbeta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logalpha), logbeta))).subs(x, distr)))) )) +FI_1_0 = np.vectorize( newlambdify( (n, logalpha, logbeta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logbeta), logalpha))).subs(x, distr)))) )) +FI_1_1 = np.vectorize( newlambdify( (n, logalpha, logbeta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logbeta), logbeta))).subs(x, distr)))) )) + + +class BetaBinomialLogScore(LogScore): + def score(self, Y): + return neg_loglikelihood(n=self.n, logalpha=self.logalpha, logbeta=np.log(self.logbeta), x=Y) + + def d_score(self, Y): + D = np.zeros((len(Y), 2)) # first col is dS/d(log(α)), second col is dS/d(log(β)) + D[:, 0] = D_0(n=self.n, logalpha=self.logalpha, logbeta=np.log(self.logbeta), x=Y) + D[:, 1] = D_1(n=self.n, logalpha=self.logalpha, logbeta=np.log(self.logbeta), x=Y) + return D + + # Variance + def metric(self): + FI = np.zeros((self.var.shape[0], 2, 2)) + FI[:, 0, 0] = FI_0_0(n=self.n, logalpha=self.logalpha, logbeta=np.log(self.logbeta)) + FI[:, 0, 1] = FI_0_1(n=self.n, logalpha=self.logalpha, logbeta=np.log(self.logbeta)) + FI[:, 1, 0] = FI_1_0(n=self.n, logalpha=self.logalpha, logbeta=np.log(self.logbeta)) + FI[:, 1, 1] = FI_1_1(n=self.n, logalpha=self.logalpha, logbeta=np.log(self.logbeta)) + return FI + +class BetaBinomial(RegressionDistn): + + n_params = 2 + scores = [BetaBinomialLogScore] + + def __init__(self, params): + # save the parameters + self._params = params + + # create other objects that will be useful later + self.logalpha = params[0] + self.logbeta = params[1] + self.alpha = np.exp(self.logalpha) + self.beta = np.exp(self.logbeta) + self.dist = dist(n=1, a=self.alpha, b=self.beta) + + def fit(n, Y): + def fit_alpha_beta_py(trials, success, alpha0=1.5, beta0=5, niter=1000): + # based on https://github.com/lfiaschi/fastbetabino/blob/master/fastbetabino.pyx + + alpha_old = alpha0 + beta_old = beta0 + + for it in range(niter): + + alpha = ( + alpha_old + * ( + sum( + digamma(c + alpha_old) - digamma(alpha_old) + for c, i in zip(success, trials) + ) + ) + / ( + sum( + digamma(i + alpha_old + beta_old) + - digamma(alpha_old + beta_old) + for c, i in zip(success, trials) + ) + ) + ) + + beta = ( + beta_old + * ( + sum( + digamma(i - c + beta_old) - digamma(beta_old) + for c, i in zip(success, trials) + ) + ) + / ( + sum( + digamma(i + alpha_old + beta_old) + - digamma(alpha_old + beta_old) + for c, i in zip(success, trials) + ) + ) + ) + + # print('alpha {} | {} beta {} | {}'.format(alpha,alpha_old,beta,beta_old)) + sys.stdout.flush() + + if np.abs(alpha - alpha_old) and np.abs(beta - beta_old) < 1e-10: + # print('early stop') + break + + alpha_old = alpha + beta_old = beta + + return alpha, beta + + alpha, beta = fit_alpha_beta_py(n, 
Y) + return np.array([np.log(alpha), np.log(beta)]) + + def sample(self, m): + return np.array([self.dist.rvs() for i in range(m)]) + + def __getattr__(self, name): + if name in dir(self.dist): + return getattr(self.dist, name) + return None + + @property + def params(self): + return {"alpha": self.alpha, "beta": self.beta} + \ No newline at end of file diff --git a/ngboost/distns/gamma.py b/ngboost/distns/gamma.py new file mode 100644 index 00000000..45ea97c1 --- /dev/null +++ b/ngboost/distns/gamma.py @@ -0,0 +1,92 @@ +"""The NGBoost Gamma distribution and scores""" +import numpy as np +import scipy as sp +from scipy.stats import gamma as dist +from scipy.special import polygamma +import numpy +import math +from ngboost.distns.distn import RegressionDistn +from ngboost.scores import CRPScore, LogScore + +import sympy as sym +from sympy import stats as symstats + +from sympy.printing.lambdarepr import NumPyPrinter +from sympy.utilities.lambdify import lambdastr + +# Need to create str, because non-numpy functions are used: polygamma; and exp is not translated into numpy inside polygamma +def newlambdify(args, funs): + funcstr = lambdastr(args, funs, printer=NumPyPrinter) + funcstr = funcstr.replace( + ' exp', 'numpy.exp' + ) + return eval(funcstr) + +logk, logtheta, x = sym.symbols('logk logtheta x') + +distr = symstats.Gamma('dist', sym.exp(logk), sym.exp(logtheta)) +score = -sym.log(symstats.density( distr ).pdf(x)) +def neg_loglikelihood_sympy(mu, logsigma, x): + return -sym.log(symstats.density( symstats.Gamma('dist', sym.exp(logk), sym.exp(logtheta)) ).pdf(x)) + +# Need to vectorize, because FI matrix is calculated by simulation, +# plus I guess polygamma doesn't support vector operations +neg_loglikelihood = np.vectorize( newlambdify((logk, logtheta, x), neg_loglikelihood_sympy(logk, logtheta, x) ) ) +D_0 = np.vectorize( newlambdify( (logk, logtheta, x), sym.diff(neg_loglikelihood_sympy(logk, logtheta, x), logk) ) ) +D_1 = np.vectorize( newlambdify( (logk, logtheta, x), sym.diff(neg_loglikelihood_sympy(logk, logtheta, x), logtheta) ) ) +FI_0_0 = np.vectorize( newlambdify( (logk, logtheta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logk), logk))).subs(x, distr)))) ) ) +FI_0_1 = np.vectorize( newlambdify( (logk, logtheta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logk), logtheta))).subs(x, distr)))) ) ) +FI_1_0 = np.vectorize( newlambdify( (logk, logtheta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logtheta), logk))).subs(x, distr)))) ) ) +FI_1_1 = np.vectorize( newlambdify( (logk, logtheta), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logtheta), logtheta))).subs(x, distr)))) ) ) + +class GammaLogScore(LogScore): + + def score(self, Y): + return -dist.logpdf(Y, self.a, loc=np.zeros_like(self.a), scale=self.scale) + # return neg_loglikelihood(k=self.a, logtheta=self.logscale, x=Y) + + def d_score(self, Y): + D = np.zeros((len(Y), 2)) # first col is dS/da, second col is dS/d(log(scale)) + D[:, 0] = D_0(logk=self.loga, logtheta=self.logscale, x=Y) + D[:, 1] = D_1(logk=self.loga, logtheta=self.logscale, x=Y) + return D + + # no closed form + # def metric(self): + # FI = np.zeros((self.scale.shape[0], 2, 2)) + # FI[:, 0, 0] = FI_0_0(logk=self.loga, logtheta=self.logscale) + # FI[:, 0, 1] = FI_0_1(logk=self.loga, logtheta=self.logscale) + # FI[:, 1, 0] = FI_1_0(logk=self.loga, logtheta=self.logscale) + # FI[:, 1, 1] = FI_1_1(logk=self.loga, 
logtheta=self.logscale) + +class Gamma(RegressionDistn): + + n_params = 2 + scores = [GammaLogScore] + + def __init__(self, params): + # save the parameters + self._params = params + + # create other objects that will be useful later + self.loga = params[0] + self.logscale = params[1] + self.a = np.exp(params[0]) + self.scale = np.exp(params[1]) # since params[1] is log(scale) + self.dist = dist(a=self.a, loc=0, scale=self.scale) + + def fit(Y): + a, loc, scale = dist.fit(Y) # use scipy's implementation + return np.array([np.log(a), np.log(scale)]) + + def sample(self, m): + return np.array([self.dist.rvs() for i in range(m)]) + + def __getattr__(self, name): # gives us access to Laplace.mean() required for RegressionDist.predict() + if name in dir(self.dist): + return getattr(self.dist, name) + return None + + @property + def params(self): + return {'a':self.a, 'scale':self.scale} diff --git a/ngboost/distns/logitnormal.py b/ngboost/distns/logitnormal.py new file mode 100644 index 00000000..a2f21b7d --- /dev/null +++ b/ngboost/distns/logitnormal.py @@ -0,0 +1,88 @@ +"""The NGBoost LogitNormal distribution and scores""" +import numpy as np +import scipy as sp +from scipy.stats import norm as dist + +from ngboost.distns.distn import RegressionDistn +from ngboost.scores import CRPScore, LogScore + +import sympy as sym +from sympy import stats as symstats + +mu, logsigma, x = sym.symbols('mu logsigma x') + +pdf = 1/(sym.exp(logsigma)*sym.sqrt(2*sym.pi))*sym.exp(-(sym.log(x/(1-x))-mu)**2/(2*sym.exp(logsigma)**2))*1/(x*(1-x)) + +distr = symstats.ContinuousRV(x, pdf, set=sym.Interval(-sym.oo, sym.oo)) +score = -sym.log(pdf) + +def neg_loglikelihood_sympy(mu, logsigma, x): + return score + +neg_loglikelihood = sym.lambdify((mu, logsigma, x), neg_loglikelihood_sympy(mu, logsigma, x), 'numpy') +D_0 = sym.lambdify( (mu, logsigma, x), sym.factor(sym.expand(sym.diff(neg_loglikelihood_sympy(mu, logsigma, x), mu))), 'numpy') +D_1 = sym.lambdify( (mu, logsigma, x), sym.factor(sym.expand(sym.diff(neg_loglikelihood_sympy(mu, logsigma, x), logsigma))), 'numpy') +FI_0_0 = sym.lambdify( (mu, logsigma), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, mu), mu))).subs(x, distr)))), 'numpy') +# FI_0_1 = sym.lambdify( (mu, logsigma), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, mu), logsigma))).subs(x, distr)))), 'numpy') +# FI_1_0 = sym.lambdify( (mu, logsigma), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logsigma), mu))).subs(x, distr)))), 'numpy') +# FI_1_1 = sym.lambdify( (mu, logsigma), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logsigma), logsigma))).subs(x, distr)))), 'numpy') + +class LogitNormalLogScore(LogScore): + + def score(self, Y): + return neg_loglikelihood(mu=self.loc, logsigma=np.log(self.scale), x=Y) + + def d_score(self, Y): + print(D_0(mu=self.loc, logsigma=np.log(self.scale), x=Y)) + print(D_1(mu=self.loc, logsigma=np.log(self.scale), x=Y)) + + D = np.zeros((len(Y), 2)) + D[:, 0] = D_0(mu=self.loc, logsigma=np.log(self.scale), x=Y) + D[:, 1] = D_1(mu=self.loc, logsigma=np.log(self.scale), x=Y) + return D + + # def metric(self): + # FI = np.zeros((self.var.shape[0], 2, 2)) + # FI[:, 0, 0] = FI_0_0(mu=self.loc, logsigma=np.log(self.scale)) + # FI[:, 0, 1] = 0 # FI_0_1(mu=self.loc, logsigma=np.log(self.scale)) + # FI[:, 1, 0] = 0 # FI_1_0(mu=self.loc, logsigma=np.log(self.scale)) + # FI[:, 1, 1] = FI_0_0(mu=self.loc, logsigma=np.log(self.scale)) # 
FI_1_1(mu=self.loc, logsigma=np.log(self.scale)) + # return FI + +class LogitNormal(RegressionDistn): + """ + Implements the normal distribution for NGBoost. + + The normal distribution has two parameters, loc and scale, which are + the mean and standard deviation, respectively. + This distribution has both LogScore and CRPScore implemented for it. + """ + + n_params = 2 + scores = [LogitNormalLogScore] + + def __init__(self, params): + super().__init__(params) + self.loc = params[0] + self.scale = np.exp(params[1]) + self.var = self.scale ** 2 + self.dist = dist(loc=self.loc, scale=self.scale) + + def fit(Y): + # m, s = sp.stats.norm.fit(np.log(Y/(1-Y))) + m, s = 0, 1 + return np.array([m, np.log(s)]) + + def sample(self, m): + return np.array([self.rvs() for i in range(m)]) + + def __getattr__( + self, name + ): # gives us Normal.mean() required for RegressionDist.predict() + if name in dir(self.dist): + return getattr(self.dist, name) + return None + + @property + def params(self): + return {"loc": self.loc, "scale": self.scale} diff --git a/ngboost/distns/normal.py b/ngboost/distns/normal.py index f948d72e..ab96c767 100644 --- a/ngboost/distns/normal.py +++ b/ngboost/distns/normal.py @@ -6,21 +6,41 @@ from ngboost.distns.distn import RegressionDistn from ngboost.scores import CRPScore, LogScore +import sympy as sym +from sympy import stats as symstats + +mu, logsigma, x = sym.symbols('mu logsigma x') + +distr = symstats.Normal('dist', mu, sym.exp(logsigma)) +score = -sym.log(symstats.density( distr ).pdf(x)) +def neg_loglikelihood_sympy(mu, logsigma, x): + return score + +neg_loglikelihood = sym.lambdify((mu, logsigma, x), neg_loglikelihood_sympy(mu, logsigma, x), 'numpy') +D_0 = sym.lambdify( (mu, logsigma, x), sym.diff(neg_loglikelihood_sympy(mu, logsigma, x), mu), 'numpy') +D_1 = sym.lambdify( (mu, logsigma, x), sym.diff(neg_loglikelihood_sympy(mu, logsigma, x), logsigma), 'numpy') +FI_0_0 = sym.lambdify( (mu, logsigma), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, mu), mu))).subs(x, distr)))), 'numpy') +FI_0_1 = sym.lambdify( (mu, logsigma), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, mu), logsigma))).subs(x, distr)))), 'numpy') +FI_1_0 = sym.lambdify( (mu, logsigma), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logsigma), mu))).subs(x, distr)))), 'numpy') +FI_1_1 = sym.lambdify( (mu, logsigma), sym.factor(sym.expand(symstats.E(sym.factor(sym.expand(sym.diff(sym.diff(score, logsigma), logsigma))).subs(x, distr)))), 'numpy') class NormalLogScore(LogScore): + def score(self, Y): - return -self.dist.logpdf(Y) + return neg_loglikelihood(mu=self.loc, logsigma=np.log(self.scale), x=Y) - def d_score(self, Y): + def d_score(self, Y): D = np.zeros((len(Y), 2)) - D[:, 0] = (self.loc - Y) / self.var - D[:, 1] = 1 - ((self.loc - Y) ** 2) / self.var + D[:, 0] = D_0(mu=self.loc, logsigma=np.log(self.scale), x=Y) + D[:, 1] = D_1(mu=self.loc, logsigma=np.log(self.scale), x=Y) return D def metric(self): FI = np.zeros((self.var.shape[0], 2, 2)) - FI[:, 0, 0] = 1 / self.var - FI[:, 1, 1] = 2 + FI[:, 0, 0] = FI_0_0(mu=self.loc, logsigma=np.log(self.scale)) + FI[:, 0, 1] = FI_0_1(mu=self.loc, logsigma=np.log(self.scale)) + FI[:, 1, 0] = FI_1_0(mu=self.loc, logsigma=np.log(self.scale)) + FI[:, 1, 1] = FI_1_1(mu=self.loc, logsigma=np.log(self.scale)) return FI diff --git a/ngboost/ngboost.py b/ngboost/ngboost.py index e547c295..0e09cf02 100644 --- a/ngboost/ngboost.py +++ b/ngboost/ngboost.py @@ -8,7 
+8,7 @@ from sklearn.tree import DecisionTreeRegressor from sklearn.utils import check_array, check_random_state, check_X_y -from ngboost.distns import Normal, k_categorical +from ngboost.distns import * from ngboost.learners import default_tree_learner from ngboost.manifold import manifold from ngboost.scores import LogScore @@ -76,19 +76,31 @@ def __init__( self.tol = tol self.random_state = check_random_state(random_state) self.best_val_loss_itr = None + if hasattr(self.Dist, "multi_output"): + self.multi_output = self.Dist.multi_output + else: + self.multi_output = False def __getstate__(self): state = self.__dict__.copy() # Remove the unpicklable entries. del state["Manifold"] + state["Dist_name"] = self.Dist.__name__ if self.Dist.__name__ == "Categorical": del state["Dist"] state["K"] = self.Dist.n_params + 1 + elif self.Dist.__name__ == "MVN": + del state["Dist"] + state["K"] = (-3 + (9 + 8 * (self.Dist.n_params)) ** 0.5) / 2 return state def __setstate__(self, state_dict): + # Recreate the object which could not pickle + name_to_dist_dict = {"Categorical": k_categorical, "MVN": MultivariateNormal} if "K" in state_dict.keys(): - state_dict["Dist"] = k_categorical(state_dict["K"]) + state_dict["Dist"] = name_to_dist_dict[state_dict["Dist_name"]]( + state_dict["K"] + ) state_dict["Manifold"] = manifold(state_dict["Score"], state_dict["Dist"]) self.__dict__ = state_dict @@ -137,6 +149,7 @@ def sample(self, X, Y, sample_weight, params): ) def fit_base(self, X, grads, sample_weight=None): + models = [ clone(self.Base).fit(X, g, sample_weight=sample_weight) for g in grads.T ] @@ -218,7 +231,7 @@ def fit( if Y is None: raise ValueError("y cannot be None") - X, Y = check_X_y(X, Y, y_numeric=True) + X, Y = check_X_y(X, Y, y_numeric=True, multi_output=self.multi_output) self.n_features = X.shape[1] @@ -227,7 +240,9 @@ def fit( params = self.pred_param(X) if X_val is not None and Y_val is not None: - X_val, Y_val = check_X_y(X_val, Y_val, y_numeric=True) + X_val, Y_val = check_X_y( + X_val, Y_val, y_numeric=True, multi_output=self.multi_output + ) val_params = self.pred_param(X_val) val_loss_list = [] best_val_loss = np.inf diff --git a/pyproject.toml b/pyproject.toml index 6782cd29..b8b3d3c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ numpy = ">=1.17, <2.0" scipy = ">=1.3, <2.0" tqdm = ">=4.4, <5.0" lifelines = ">=0.25, <0.29" +sympy = "^1.7.1" [tool.poetry.dev-dependencies] pytest = "^6.1.2"
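
Note on the shared recipe: every new distribution in this patch builds its negative log-likelihood symbolically with SymPy, differentiates it with respect to the internal (log-scale) parameters, takes expectations to obtain the Fisher information, and lambdifies the results into vectorised NumPy callables used by score, d_score and metric. Below is a minimal standalone sketch of that recipe for the Normal case, so the symbolic output can be checked against the closed forms that ngboost/distns/normal.py hard-coded before this patch ((loc - Y) / var, 1 - (loc - Y)**2 / var, and a Fisher information of diag(1/var, 2)); the array values at the end are illustrative only:

import numpy as np
import sympy as sym
from sympy import stats as symstats

# Internal parameters: the mean and the *log* of the standard deviation,
# so boosting can update sigma on an unconstrained scale.
mu, logsigma, x = sym.symbols("mu logsigma x")

distr = symstats.Normal("dist", mu, sym.exp(logsigma))
nll = -sym.log(symstats.density(distr).pdf(x))  # negative log-likelihood (the LogScore)

# Gradients of the score with respect to the internal parameters.
d_mu = sym.simplify(sym.diff(nll, mu))              # mathematically (mu - x) * exp(-2*logsigma)
d_logsigma = sym.simplify(sym.diff(nll, logsigma))  # mathematically 1 - (mu - x)**2 * exp(-2*logsigma)

# Fisher information: expectation of the second derivatives, taken by
# substituting the random variable back in for x.
fi_mu_mu = sym.simplify(symstats.E(sym.diff(nll, mu, 2).subs(x, distr)))           # 1 / sigma**2
fi_logsigma = sym.simplify(symstats.E(sym.diff(nll, logsigma, 2).subs(x, distr)))  # 2

# Lambdify into NumPy callables, as the patch does for each distribution.
D_0 = sym.lambdify((mu, logsigma, x), d_mu, "numpy")
D_1 = sym.lambdify((mu, logsigma, x), d_logsigma, "numpy")

# Sanity check against the closed forms NormalLogScore used before this patch.
loc, scale, y = np.array([0.5]), np.array([2.0]), np.array([1.7])
var = scale ** 2
assert np.allclose(D_0(loc, np.log(scale), y), (loc - y) / var)
assert np.allclose(D_1(loc, np.log(scale), y), 1 - (loc - y) ** 2 / var)
print(fi_mu_mu, fi_logsigma)  # exp(-2*logsigma) and 2, i.e. 1/var and 2

The same pattern extends to the Gamma, Beta, Logit-Normal and Beta-Bernoulli files; the newlambdify string rewriting there is only needed because polygamma has no direct NumPy translation, as the comments in those files note.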