Ce serveur Gitlab sera éteint le 30 juin 2020, pensez à migrer vos projets vers les serveurs gitlab-research.centralesupelec.fr et gitlab-student.centralesupelec.fr !

Commit 23439433 authored by John Myles White's avatar John Myles White

Move fit methods into distribution-specific files

Add EmpiricalUnivariateDistribution
parent 00dd06fb
......@@ -29,7 +29,7 @@ export # types
Dirichlet,
DiscreteUniform,
DoubleExponential,
EmpiricalDistribution,
EmpiricalUnivariateDistribution,
Erlang,
Exponential,
FDist,
......@@ -90,6 +90,7 @@ export # types
linkinv, # inverse link mapping eta to mu
logccdf, # ccdf returning log-probability
logcdf, # cdf returning log-probability
loglikelihood, # log probability of array of IID draws
logpdf, # log probability density
logpdf!, # evaluate log pdf to provided storage
logpmf, # log probability mass
......@@ -158,6 +159,7 @@ include(joinpath("univariate", "cauchy.jl"))
include(joinpath("univariate", "chi.jl"))
include(joinpath("univariate", "chisq.jl"))
include(joinpath("univariate", "discreteuniform.jl"))
include(joinpath("univariate", "empirical.jl"))
include(joinpath("univariate", "exponential.jl"))
include(joinpath("univariate", "fdist.jl"))
include(joinpath("univariate", "gamma.jl"))
......@@ -208,9 +210,6 @@ include("glmtools.jl")
# REPL representations
include("show.jl")
# Fit distributions to data
include("fit.jl")
# Kernel density estimators
include("kde.jl")
......
......@@ -77,3 +77,7 @@ function draw(table::DiscreteDistributionTable)
end
return table.table[bound][index]
end
function Base.show(io::IO, table::DiscreteDistributionTable)
    # Minimal textual representation: just the type name.
    print(io, "DiscreteDistributionTable")
end
......@@ -85,6 +85,22 @@ logcdf(d::Distribution, q::Real) = log(cdf(d,q))
logccdf(d::Distribution, q::Real) = log(ccdf(d,q))
# Log-likelihood of an array of IID draws from a univariate distribution:
# the sum of the per-observation log-densities.
function loglikelihood(d::UnivariateDistribution, X::Array)
    total = 0.0
    for x in X
        total += logpdf(d, x)
    end
    return total
end
# Log-likelihood of IID multivariate draws; each column of X is one
# observation vector.
function loglikelihood(d::MultivariateDistribution, X::Matrix)
    total = 0.0
    n_obs = size(X, 2)
    for j in 1:n_obs
        total += logpdf(d, X[:, j])
    end
    return total
end
logpdf(d::Distribution, x::Real) = log(pdf(d,x))
function logpdf!(r::AbstractArray, d::UnivariateDistribution, x::AbstractArray)
......
# Digamma function psi(x) = d/dx log(Gamma(x)), delegated to the Rmath
# C library via ccall. Defined only for Float64 arguments.
function digamma(x::Float64) # Use Int instead?
    ccall(("digamma", :libRmath), Float64, (Float64, ), x)
end
# Trigamma function, the second derivative of log(Gamma(x)), delegated
# to the Rmath C library via ccall.
function trigamma(x::Float64) # Use Int instead?
    ccall(("trigamma", :libRmath), Float64, (Float64, ), x)
end
# MLE for the Bernoulli success probability: the sample mean of 0/1 data.
# Rejects any observation outside the support.
function fit(::Type{Bernoulli}, x::Array)
    for value in x
        if !insupport(Bernoulli(), value)
            error("Bernoulli observations must be 0/1 values")
        end
    end
    return Bernoulli(mean(x))
end
# Method-of-moments estimator for the Beta distribution.
# BUG FIX: the validation error message previously said "Bernoulli
# observations"; this is the Beta fit.
function fit(::Type{Beta}, x::Array)
    for i in 1:length(x)
        if !insupport(Beta(), x[i])
            error("Beta observations must be in [0,1]")
        end
    end
    x_bar = mean(x)
    v_bar = var(x)
    # Shared factor of both moment equations, hoisted out.
    k = ((x_bar * (1.0 - x_bar)) / v_bar) - 1.0
    a = x_bar * k
    b = (1.0 - x_bar) * k
    return Beta(a, b)
end
# Fit a Binomial from a single observation of x successes in n trials;
# the estimated success probability is the observed proportion x / n.
function fit(::Type{Binomial}, x::Real, n::Real)
    if x < 0 || x > n
        error("For binomial observations, x must lie in [0, n]")
    end
    return Binomial(int(n), x / n)
end
# Categorical
# Cauchy
# Chisq
# Dirichlet
# Fit a DiscreteUniform whose support is the observed range of the data.
# NOTE(review): min(x)/max(x) on an array rely on the old-Julia reduction
# semantics (modern Julia spells these minimum/maximum) - confirm against
# the toolchain in use.
function fit(::Type{DiscreteUniform}, x::Array)
    DiscreteUniform(min(x), max(x))
end
# MLE for the Exponential scale parameter: the sample mean.
# Every observation must lie in the non-negative support.
function fit(::Type{Exponential}, x::Array)
    for value in x
        if !insupport(Exponential(), value)
            error("Exponential observations must be non-negative values")
        end
    end
    return Exponential(mean(x))
end
# FDist
# MLE of the Gamma shape parameter via the Newton-type fixed-point
# update of Minka (2002), "Estimating a Gamma distribution"; the scale
# is then mean(x) / shape.
# BUG FIX: `iterations` was never incremented inside the loop, so the
# 10,000-iteration cap and the convergence-failure error below could
# never trigger (risking an infinite loop on non-converging data).
function fit(::Type{Gamma}, x::Array)
    log_mx = log(mean(x))   # log of the sample mean (loop invariant)
    mlog_x = mean(log(x))   # mean of the logged data (loop invariant)
    a_old = 0.5 / (log_mx - mlog_x)  # standard initial guess for the shape
    a_new = a_old
    z = 1.0 / a_old + (mlog_x - log_mx + log(a_old) - digamma(a_old)) / (a_old^2 * (1.0 / a_old - trigamma(a_old)))
    a_new, a_old = 1.0 / z, a_new
    iterations = 1
    while abs(a_old - a_new) > 1e-16 && iterations <= 10_000
        z = 1.0 / a_old + (mlog_x - log_mx + log(a_old) - digamma(a_old)) / (a_old^2 * (1.0 / a_old - trigamma(a_old)))
        a_new, a_old = 1.0 / z, a_new
        iterations += 1
    end
    if iterations > 10_000
        error("Parameter estimation failed to converge in 10,000 iterations")
    end
    Gamma(a_new, mean(x) / a_new)
end
# MLE for the Geometric success probability: the reciprocal of the
# sample mean, after coercing the observations to integers.
function fit(::Type{Geometric}, x::Array)
    counts = convert(Array{Int}, x)
    return Geometric(1.0 / mean(counts))
end
# HyperGeometric
# Logistic
# MLE for the Laplace distribution: location = sample median,
# scale = mean absolute deviation from the median.
function fit(::Type{Laplace}, x::Array)
    location = median(x)
    total_dev = 0.0
    for value in x
        total_dev += abs(value - location)
    end
    return Laplace(location, total_dev / length(x))
end
# logNormal
# function fit(Multinomial)
# end
# Assumes observations are rows
# Positive-definite errors
# function fit{T <: Real}(x::Matrix{T}, d::MultivariateNormal)
# MultivariateNormal(reshape(mean(x, 1), size(x, 2)), cov(x))
# end
# fit([1.0 -0.1; -2.0 0.1], MultivariateNormal())
# NegativeBinomial
# NoncentralBeta
# NoncentralChisq
# NoncentralF
# NoncentralT
# Normal
# Not strict MLE
# Fit a Normal by the sample mean and sample standard deviation.
function fit(::Type{Normal}, x::Array)
    mu = mean(x)
    sigma = std(x)
    return Normal(mu, sigma)
end
# Poisson
# MLE for the Poisson rate: the sample mean. Every observation must be
# a non-negative integer.
function fit(::Type{Poisson}, x::Array)
    for value in x
        if !insupport(Poisson(), value)
            error("Poisson observations must be non-negative integers")
        end
    end
    return Poisson(mean(x))
end
# TDist
# Uniform
# Fit a continuous Uniform whose endpoints are the observed extremes.
# NOTE(review): min(x)/max(x) on an array rely on the old-Julia reduction
# semantics (modern Julia spells these minimum/maximum).
function fit(::Type{Uniform}, x::Array)
    Uniform(min(x), max(x))
end
# Weibull
......@@ -115,3 +115,7 @@ function var(d::Multinomial)
end
return S
end
# Fit a Multinomial from a matrix whose columns are count vectors.
# The trial count is taken from the first column's total; the weight
# vector is the per-category mean across columns.
# NOTE(review): assumes every column sums to the same total, and that the
# Multinomial constructor accepts/normalizes mean counts rather than
# probabilities - TODO confirm against the constructor's contract.
function fit(::Type{Multinomial}, X::Matrix)
    return Multinomial(sum(X[:, 1]), vec(mean(X, 2)))
end
......@@ -146,6 +146,10 @@ function var(d::MultivariateNormal)
return U'U
end
# Fit a MultivariateNormal from a matrix whose columns are observations:
# mean vector = row means of X; covariance = sample covariance of X'
# (cov expects observations in rows, hence the transpose).
function fit{T <: Real}(::Type{MultivariateNormal}, X::Matrix{T})
    MultivariateNormal(vec(mean(X, 2)), cov(X'))
end
function chol_ldiv!(chol::Cholesky{Float64}, u::VecOrMat{Float64})
Base.LinAlg.LAPACK.trtrs!('U', 'T', 'N', chol.UL, u)
end
......@@ -63,6 +63,15 @@ skewness(d::Bernoulli) = (1.0 - 2.0 * d.prob) / std(d)
var(d::Bernoulli) = d.prob * (1.0 - d.prob)
# MLE for the Bernoulli success probability: the sample mean of 0/1 data.
function fit(::Type{Bernoulli}, x::Array)
    for value in x
        if !insupport(Bernoulli(), value)
            error("Bernoulli observations must be in {0, 1}")
        end
    end
    return Bernoulli(mean(x))
end
# GLM methods
function devresid(d::Bernoulli, y::Real, mu::Real, wt::Real)
2wt * (xlogxdmu(y, mu) + xlogxdmu(1.0 - y, 1.0 - mu))
......
......@@ -70,3 +70,16 @@ function var(d::Beta)
ab = d.alpha + d.beta
return d.alpha * d.beta / (ab * ab * (ab + 1.0))
end
# Method-of-moments estimator for the Beta distribution.
# BUG FIX: the validation error message previously said "Bernoulli
# observations"; this is the Beta fit.
function fit(::Type{Beta}, x::Array)
    for i in 1:length(x)
        if !insupport(Beta(), x[i])
            error("Beta observations must be in [0,1]")
        end
    end
    x_bar = mean(x)
    v_bar = var(x)
    # Shared factor of both moment equations, hoisted out.
    k = ((x_bar * (1.0 - x_bar)) / v_bar) - 1.0
    a = x_bar * k
    b = (1.0 - x_bar) * k
    return Beta(a, b)
end
......@@ -49,3 +49,10 @@ modes(d::Binomial) = iround([d.size * d.prob])
skewness(d::Binomial) = (1.0 - 2.0 * d.prob) / std(d)
var(d::Binomial) = d.size * d.prob * (1.0 - d.prob)
# Fit a Binomial from a single observation of x successes in n trials;
# the estimated success probability is the observed proportion x / n.
function fit(::Type{Binomial}, x::Real, n::Real)
    if x < 0 || x > n
        error("For binomial observations, x must lie in [0, n]")
    end
    return Binomial(int(n), x / n)
end
......@@ -95,3 +95,8 @@ function var(d::Categorical, m::Number)
end
return s
end
# Fit a Categorical distribution to integer category labels.
# NOTE(review): unfinished stub - the input x is ignored and a default
# Categorical() is returned; the per-category counts still need to be
# tallied here.
function fit{T <: Real}(::Type{Categorical}, x::Array{T})
    # Counts for all categories
    return Categorical()
end
......@@ -37,3 +37,9 @@ modes(d::Cauchy) = [d.location]
skewness(d::Cauchy) = NaN
var(d::Cauchy) = NaN
# Robust estimator for the Cauchy distribution: location = sample median,
# scale = half the interquartile range.
# NOTE(review): assumes iqr(x) returns the (lower, upper) quartile pair
# rather than their difference - TODO confirm against iqr's definition.
function fit{T <: Real}(::Type{Cauchy}, x::Array{T})
    c = median(x)
    l, u = iqr(x)
    return Cauchy(c, (u - l) / 2.0)
end
......@@ -68,3 +68,7 @@ rand(d::DiscreteUniform) = d.a + rand(0:(d.b - d.a))
skewness(d::DiscreteUniform) = 0.0
var(d::DiscreteUniform) = ((d.b - d.a + 1.0)^2 - 1.0) / 12.0
# Fit a DiscreteUniform whose support is the observed range of the data.
# NOTE(review): min(x)/max(x) on an array rely on the old-Julia reduction
# semantics (modern Julia spells these minimum/maximum).
function fit{T <: Real}(::Type{DiscreteUniform}, x::Array{T})
    DiscreteUniform(min(x), max(x))
end
##############################################################################
#
# REFERENCES: "Statistical Distributions"
#
##############################################################################
# A continuous univariate distribution backed directly by observed data.
# Summary statistics are computed once at construction and stored in
# fields; the cdf field holds the empirical CDF as a closure.
immutable EmpiricalUnivariateDistribution <: ContinuousUnivariateDistribution
    values::Vector{Float64}   # observations, sorted by the constructor below
    cdf::Function             # empirical CDF of the data (ecdf closure)
    entropy::Float64          # NaN when built by the default constructor
    kurtosis::Float64         # NaN when built by the default constructor
    mean::Float64             # sample mean
    median::Float64           # sample median
    modes::Vector{Float64}    # empty: modes are not estimated
    skewness::Float64         # NaN when built by the default constructor
    var::Float64              # sample variance
end
# Convenience constructor: precompute the sorted data, the empirical CDF
# and the basic sample statistics. Entropy, kurtosis and skewness are
# left as NaN; modes are left empty.
function EmpiricalUnivariateDistribution(x::Vector)
    sorted_values = sort(x)
    empirical_cdf = ecdf(x)
    EmpiricalUnivariateDistribution(sorted_values,
                                    empirical_cdf,
                                    NaN,
                                    NaN,
                                    mean(x),
                                    median(x),
                                    Float64[],
                                    NaN,
                                    var(x))
end
# Accessors that read the statistics stored on the distribution at
# construction time.
cdf(d::EmpiricalUnivariateDistribution, q::Real) = d.cdf(q)
entropy(d::EmpiricalUnivariateDistribution) = d.entropy
# An empirical distribution supports exactly the observed data points.
function insupport(d::EmpiricalUnivariateDistribution, x::Number)
    return contains(d.values, x)
end
kurtosis(d::EmpiricalUnivariateDistribution) = d.kurtosis
mean(d::EmpiricalUnivariateDistribution) = d.mean
median(d::EmpiricalUnivariateDistribution) = d.median
modes(d::EmpiricalUnivariateDistribution) = Float64[]
# Density placeholder: returns the uniform weight 1/n for every input.
# NOTE(review): x is ignored, so this is only meaningful as an
# equal-weight point mass over the stored values.
function pdf(d::EmpiricalUnivariateDistribution, x::Real)
    # TODO: Create lookup table for discrete case
    1.0 / length(d.values)
end
# Order-statistic quantile: take the (floor(p * n) + 1)-th sorted value,
# clamped so that p == 1.0 still maps to the largest observation.
function quantile(d::EmpiricalUnivariateDistribution, p::Real)
    n = length(d.values)
    index = min(ifloor(p * n) + 1, n)
    return d.values[index]
end
# Draw by resampling uniformly from the stored observations.
function rand(d::EmpiricalUnivariateDistribution)
    i = rand(1:length(d.values))
    return d.values[i]
end
# Skewness is not estimated for empirical distributions.
skewness(d::EmpiricalUnivariateDistribution) = NaN
# Sample variance stored at construction time.
var(d::EmpiricalUnivariateDistribution) = d.var
# "Fitting" an empirical distribution just wraps the raw data.
function fit{T <: Real}(::Type{EmpiricalUnivariateDistribution},
                        x::Vector{T})
    EmpiricalUnivariateDistribution(x)
end
......@@ -91,3 +91,12 @@ rand!(d::Exponential, A::Array{Float64}) = d.scale * Random.randmtzig_fill_exprn
skewness(d::Exponential) = 2.0
var(d::Exponential) = d.scale * d.scale
# MLE for the Exponential scale parameter: the sample mean.
# Every observation must lie in the non-negative support.
function fit(::Type{Exponential}, x::Array)
    for value in x
        if !insupport(Exponential(), value)
            error("Exponential observations must be non-negative values")
        end
    end
    return Exponential(mean(x))
end
......@@ -91,3 +91,24 @@ end
skewness(d::Gamma) = 2.0 / sqrt(d.shape)
var(d::Gamma) = d.shape * d.scale * d.scale
# MLE of the Gamma shape parameter via the Newton-type fixed-point
# update of Minka (2002), "Estimating a Gamma distribution"; the scale
# is then mean(x) / shape.
# BUG FIX: `iterations` was never incremented inside the loop, so the
# 10,000-iteration cap and the convergence-failure error below could
# never trigger (risking an infinite loop on non-converging data).
function fit(::Type{Gamma}, x::Array)
    log_mx = log(mean(x))   # log of the sample mean (loop invariant)
    mlog_x = mean(log(x))   # mean of the logged data (loop invariant)
    a_old = 0.5 / (log_mx - mlog_x)  # standard initial guess for the shape
    a_new = a_old
    z = 1.0 / a_old + (mlog_x - log_mx + log(a_old) - digamma(a_old)) / (a_old^2 * (1.0 / a_old - trigamma(a_old)))
    a_new, a_old = 1.0 / z, a_new
    iterations = 1
    while abs(a_old - a_new) > 1e-16 && iterations <= 10_000
        z = 1.0 / a_old + (mlog_x - log_mx + log(a_old) - digamma(a_old)) / (a_old^2 * (1.0 / a_old - trigamma(a_old)))
        a_new, a_old = 1.0 / z, a_new
        iterations += 1
    end
    if iterations > 10_000
        error("Parameter estimation failed to converge in 10,000 iterations")
    end
    Gamma(a_new, mean(x) / a_new)
end
......@@ -51,3 +51,5 @@ end
skewness(d::Geometric) = (2.0 - d.prob) / sqrt(1.0 - d.prob)
var(d::Geometric) = (1.0 - d.prob) / d.prob^2
# MLE for the Geometric success probability: the reciprocal of the
# sample mean.
function fit(::Type{Geometric}, x::Array)
    return Geometric(1.0 / mean(x))
end
......@@ -67,3 +67,14 @@ skewness(d::Laplace) = 0.0
std(d::Laplace) = sqrt(2.0) * d.scale
var(d::Laplace) = 2.0 * d.scale^2
# MLE for the Laplace distribution: location = sample median,
# scale = mean absolute deviation from the median.
function fit(::Type{Laplace}, x::Array)
    location = median(x)
    total_dev = 0.0
    for value in x
        total_dev += abs(value - location)
    end
    return Laplace(location, total_dev / length(x))
end
......@@ -26,3 +26,8 @@ function var(d::logNormal)
sigsq = d.sdlog^2
return (exp(sigsq) - 1) * exp(2d.meanlog + sigsq)
end
# MLE for the log-normal: fit a Normal to the log-transformed data.
# NOTE(review): log(x) on an array uses the old-Julia vectorized
# semantics (elementwise log).
function fit{T <: Real}(::Type{logNormal}, x::Array{T})
    lx = log(x)
    return logNormal(mean(lx), std(lx))
end
......@@ -46,3 +46,5 @@ skewness(d::Normal) = 0.0
std(d::Normal) = d.std
var(d::Normal) = d.std^2
# Fit a Normal by the sample mean and sample standard deviation.
fit{T <: Real}(::Type{Normal}, x::Array{T}) = Normal(mean(x), std(x))
......@@ -36,6 +36,15 @@ end
var(d::Poisson) = d.lambda
# MLE for the Poisson rate: the sample mean. Every observation must be
# a non-negative integer.
function fit(::Type{Poisson}, x::Array)
    for value in x
        if !insupport(Poisson(), value)
            error("Poisson observations must be non-negative integers")
        end
    end
    return Poisson(mean(x))
end
# GLM Methods
function devresid(d::Poisson, y::Real, mu::Real, wt::Real)
......
......@@ -44,3 +44,7 @@ function var(d::Uniform)
w = d.b - d.a
return w * w / 12.0
end
# Fit a continuous Uniform whose endpoints are the observed extremes.
# NOTE(review): min(x)/max(x) on an array rely on the old-Julia reduction
# semantics (modern Julia spells these minimum/maximum).
function fit{T <: Real}(::Type{Uniform}, x::Vector{T})
    Uniform(min(x), max(x))
end
......@@ -10,7 +10,8 @@ fit(DiscreteUniform, rand(DiscreteUniform(300_000, 700_000), N))
fit(Exponential, rand(Exponential(0.1), N))
fit(Gamma, rand(Gamma(7.9, 3.1), N))
# TODO: Re-enable when polygamma gets merged
# fit(Gamma, rand(Gamma(7.9, 3.1), N))
fit(Geometric, rand(Geometric(0.1), N))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment