
Commit ec786a51 authored by Simon Byrne

added KSDist and KSOneSided, implemented basic algorithms.

parent 71e9cfa1
@@ -46,6 +46,8 @@ export # types
    InverseWishart,
    InvertedGamma,
    Kolmogorov,
    KSDist,
    KSOneSided,
    Laplace,
    Levy,
    # Link,
@@ -195,6 +197,8 @@ include(joinpath("univariate", "gumbel.jl"))
include(joinpath("univariate", "hypergeometric.jl"))
include(joinpath("univariate", "invertedgamma.jl"))
include(joinpath("univariate", "kolmogorov.jl"))
include(joinpath("univariate", "ksdist.jl"))
include(joinpath("univariate", "ksonesided.jl"))
include(joinpath("univariate", "laplace.jl"))
include(joinpath("univariate", "levy.jl"))
include(joinpath("univariate", "logistic.jl"))
# Kolmogorov distribution
# defined as sup_{t \in [0,1]} |B(t)|, where B(t) is a Brownian bridge
# used in the Kolmogorov--Smirnov test for large n.
immutable Kolmogorov <: ContinuousUnivariateDistribution
end
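
Reviewer's aside, not part of the commit: this limiting distribution is what gives the familiar large-sample critical values for the two-sided KS test. A minimal sketch, assuming the ccdf defined in kolmogorov.jl below:

# sketch: large-sample 5% critical value for sqrt(n)*D_n
d = Kolmogorov()
ccdf(d, 1.358)   # ≈ 0.05, the usual two-sided 5% critical value
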
@@ -44,3 +45,25 @@ function ccdf(d::Kolmogorov,x::Real)
    end
    2.0*s
end
# TODO: figure out how best to truncate series
function pdf(d::Kolmogorov,x::Real)
    if x <= 0.0
        return 0.0
    elseif x <= 1.0
        # theta-series form: terms decay rapidly for small x
        c = π/(2.0*x)
        s = 0.0
        for i = 1:20
            k = ((2*i-1)*c)^2
            s += (k-1.0)*exp(-k/2.0)
        end
        return √2π*s/x^2
    else
        # alternating series: terms decay rapidly for large x
        s = 0.0
        for i = 1:20
            s += (iseven(i) ? -1 : 1)*i^2*exp(-2.0*(i*x)^2)
        end
        return 8.0*x*s
    end
end
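
One way to resolve the TODO above (an untested sketch, not the author's method): truncate adaptively once a term no longer moves the partial sum, instead of a fixed 20 iterations. The terms of the small-x branch eventually decay monotonically, so a relative-epsilon stop is safe:

# sketch: adaptive truncation for the small-x series of pdf(d::Kolmogorov, x)
x = 0.8          # hypothetical point in the 0 < x <= 1 branch
c = π/(2.0*x)
s = 0.0
i = 1
while true
    k = ((2*i-1)*c)^2
    t = (k-1.0)*exp(-k/2.0)
    s += t
    abs(t) <= eps(abs(s)) && break   # term no longer changes the sum
    i += 1
end
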
# Distribution of the (two-sided) Kolmogorov-Smirnov statistic
#   D_n = \sup_x |\hat{F}_n(x) - F(x)|
# sqrt(n)*D_n converges in distribution to the Kolmogorov distribution.
immutable KSDist <: ContinuousUnivariateDistribution
    n::Int
end

insupport(d::KSDist, x::Real) = 1/(2*d.n) <= x <= 1.0
# Simard and L'Ecuyer (2011) meta-algorithm
function cdf(d::KSDist,x::Real)
    n = d.n
    b = x*n
    # known exact values
    if b <= 0.5
        return 0.0
    elseif b <= 1.0
        # could be improved
        return exp(lfact(n)+n*(log(2.0*b-1.0)-log(n)))
    elseif x >= 1.0
        return 1.0
    elseif b >= n-1
        return 1.0 - 2.0*(1.0-x)^n
    end
    a = b*x
    if a >= 18.0
        return 1.0
    elseif n <= 140
        if a <= 0.754693
            return durbin(d,x)
        elseif a <= 4.0
            return pomeranz(d,x)
        else
            # Miller (1956) approximation
            return 1.0 - 2.0*ccdf(KSOneSided(d.n),x)
        end
    elseif n <= 10_000
        if b*sqrt(n) <= 1.4
            return durbin(d,x)
        end
    end
    pelzgood(d,x)
end
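
With the meta-algorithm in place, a two-sided KS p-value is just the upper tail of this distribution; a usage sketch with made-up numbers:

# sketch: p-value for a hypothetical observed statistic d_obs at sample size n
n, d_obs = 100, 0.15
pval = ccdf(KSDist(n), d_obs)
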
# Simard and L'Ecuyer (2011) meta-algorithm
function ccdf(d::KSDist,x::Real)
    n = d.n
    b = x*n
    # Ruben and Gambino (1982) known exact values
    if b <= 0.5
        return 1.0
    elseif b <= 1.0
        return 1.0-exp(lfact(n)+n*(log(2.0*b-1.0)-log(n)))
    elseif x >= 1.0
        return 0.0
    elseif b >= n-1
        return 2.0*(1.0-x)^n
    end
    a = b*x
    if a >= 370.0
        return 0.0
    elseif a >= 4.0 || (n > 140 && a >= 2.2)
        # Miller (1956) approximation
        return 2.0*ccdf(KSOneSided(d.n),x)
    end
    1.0-cdf(d,x)
end
# Durbin matrix CDF method, based on Marsaglia, Tsang and Wang (2003)
# modified to avoid need for exponent tracking
function durbin(d::KSDist,x::Float64)
    # a = x*x*n
    # if (a > 7.24) || ((a > 3.76) && (n > 99))
    #     return 1 - 2.0*exp(-(2.000071+.331/sqrt(n)+1.409/n)*a)
    # end
    n = d.n
    k = iceil(n*x)
    h = k-n*x
    m = 2*k-1
    H = Array(Float64,m,m)
    for i = 1:m, j = 1:m
        H[i,j] = i-j+1 >= 0 ? 1.0 : 0.0
    end
    for i = 1:m
        H[i,1] -= h^i
        H[m,i] -= h^(m-i+1)
    end
    H[m,1] += h > 0.5 ? (2.0*h-1.0)^m : 0.0
    for i = 1:m, j = 1:m
        # we can avoid keeping track of the exponent by dividing by e
        # (from Stirling's approximation)
        H[i,j] /= e
        for g = 1:max(i-j+1,0)
            H[i,j] /= g
        end
    end
    Q = H^n
    s = Q[k,k]
    # need to multiply by n!*(e/n)^n
    if n < 500
        for i = 1:n
            s *= i/n*e
        end
    else
        # 3rd-order Stirling's approximation more accurate for large n
        twn = 12.0*n
        s *= sqrt(2.0*pi*n)*(1.0 + twn\(1 + (2.0*twn)\(1 - (15.0*twn)\139.0)))
    end
    s
end
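
A quick sanity check one could run (sketch only): for n = 1 the statistic is D_1 = max(U, 1-U) with U uniform, so P(D_1 <= x) = 2x - 1 on [1/2, 1], which the matrix method should reproduce exactly:

# sketch: durbin against the closed form P(D_1 <= x) = 2x - 1
durbin(KSDist(1), 0.75)   # should give 0.5
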
# CDF method of Pelz and Good (1976)
# based on asymptotic series, accurate for large n
# TODO: only the leading Kolmogorov term is implemented here; the
# 1/sqrt(n), 1/n and 1/(n*sqrt(n)) correction terms remain to be added.
function pelzgood(d::KSDist,x::Real)
    n = d.n
    # leading term: the Kolmogorov limit of sqrt(n)*D_n
    cdf(Kolmogorov(),sqrt(n)*x)
end
# TODO: Pomeranz (1974) recursion, not yet implemented;
# requires the exact value and remainders of n*x
function pomeranz(d::KSDist,x::Real)
    n = d.n
    t = big(x)*n
    error("Pomeranz algorithm not yet implemented")
end
\ No newline at end of file
# Distribution of the (two-sided) Kolmogorov-Smirnov statistic
#   D_n = \sup_x |\hat{F}_n(x) - F(x)|
# sqrt(n)*D_n converges in distribution to the Kolmogorov distribution.
immutable KSDist <: ContinuousUnivariateDistribution
    n::Int
end

insupport(d::KSDist, x::Real) = 1/(2*d.n) <= x <= 1.0
# TODO: implement Simard and L'Ecuyer (2011) meta-algorithm
# requires Pomeranz and Pelz-Good algorithms
function cdf(d::KSDist,x::Real)
    n = d.n
    b = x*n
    # known exact values
    if b <= 0.5
        return 0.0
    elseif b <= 1.0
        # accuracy could be improved
        return exp(lfact(n)+n*(log(2.0*b-1.0)-log(n)))
    elseif x >= 1.0
        return 1.0
    elseif b >= n-1
        return 1.0 - 2.0*(1.0-x)^n
    end
    a = b*x
    if a >= 18.0
        return 1.0
    elseif n <= 10_000
        if a <= 4.0
            return cdf_durbin(d,x)
        else
            return 1.0 - ccdf_miller(d,x)
        end
    else
        # large n: fall back on the Kolmogorov limit of sqrt(n)*D_n
        return cdf(Kolmogorov(),sqrt(a))
    end
end
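
The fallback uses sqrt(a) = sqrt(n)*x, i.e. the Kolmogorov limit again. A hedged comparison sketch at a hypothetical size where the exact branch is still taken, to see how close the asymptotic value is:

# sketch: exact (Durbin) branch vs Kolmogorov asymptotic
n, x = 5000, 0.02
cdf(KSDist(n), x)               # routed to cdf_durbin (a = n*x^2 = 2 <= 4)
cdf(Kolmogorov(), sqrt(n)*x)    # asymptotic value, close for n this large
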
function ccdf(d::KSDist,x::Real)
    n = d.n
    b = x*n
    # Ruben and Gambino (1982) known exact values
    if b <= 0.5
        return 1.0
    elseif b <= 1.0
        return 1.0-exp(lfact(n)+n*(log(2.0*b-1.0)-log(n)))
    elseif x >= 1.0
        return 0.0
    elseif b >= n-1
        return 2.0*(1.0-x)^n
    end
    a = b*x
    if a >= 370.0
        return 0.0
    elseif a >= 4.0 || (n > 140 && a >= 2.2)
        return ccdf_miller(d,x)
    else
        return 1.0-cdf(d,x)
    end
end
# Durbin matrix CDF method, based on Marsaglia, Tsang and Wang (2003)
# modified to avoid need for exponent tracking
function cdf_durbin(d::KSDist,x::Float64)
    n = d.n
    k, ch, h = ceil_rems_mult(n,x)
    m = 2*k-1
    H = Array(Float64,m,m)
    for i = 1:m, j = 1:m
        H[i,j] = i-j+1 >= 0 ? 1.0 : 0.0
    end
    r = 1.0
    for i = 1:m
        # (1-h^i) = (1-h)(1+h+...+h^(i-1))
        H[i,1] = H[m,m-i+1] = ch*r
        r += h^i
    end
    H[m,1] += h <= 0.5 ? -h^m : -h^m+(h-ch)^m
    for i = 1:m, j = 1:m
        for g = 1:max(i-j+1,0)
            H[i,j] /= g
        end
        # we can avoid keeping track of the exponent by dividing by e
        # (from Stirling's approximation)
        H[i,j] /= e
    end
    Q = H^n
    s = Q[k,k]
    s*stirling(n)
end
# Miller (1956) approximation
function ccdf_miller(d::KSDist, x::Real)
    2.0*ccdf(KSOneSided(d.n),x)
end
## these functions are used in the Durbin and Pomeranz algorithms

# calculate exact remainders the easy way
function floor_rems_mult(n,x)
    t = big(x)*big(n)
    fl = floor(t)
    lrem = t - fl
    urem = (fl+one(fl)) - t
    return convert(typeof(n),fl), convert(typeof(x),lrem), convert(typeof(x),urem)
end

function ceil_rems_mult(n,x)
    t = big(x)*big(n)
    cl = ceil(t)
    lrem = t - (cl - one(cl))
    urem = cl - t
    return convert(typeof(n),cl), convert(typeof(x),lrem), convert(typeof(x),urem)
end
# n!*(e/n)^n
function stirling(n)
    if n < 500
        s = 1.0
        for i = 1:n
            s *= i/n*e
        end
        return s
    else
        # 3rd-order Stirling's approximation more accurate for large n
        twn = 12.0*n
        return sqrt(2.0*pi*n)*(1.0 + twn\(1 + (2.0*twn)\(1 - (15.0*twn)\139.0)))
    end
end
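
A spot check of the helper against its definition (sketch):

# sketch: stirling(n) should match n!*(e/n)^n directly for small n
n = 10
stirling(n)              # loop branch
factorial(n)*(e/n)^n     # ≈ 7.99 — agrees to rounding
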
# Distribution of the one-sided Kolmogorov-Smirnov test statistic:
#   D^+_n = \sup_x (\hat{F}_n(x) - F(x))
immutable KSOneSided <: ContinuousUnivariateDistribution
    n::Int
end

insupport(d::KSOneSided, x::Real) = 0.0 <= x <= 1.0
# formula of Birnbaum and Tingey (1951)
function ccdf(d::KSOneSided,x::Real)
    if x >= 1.0
        return 0.0
    elseif x <= 0.0
        return 1.0
    end
    n = d.n
    s = 0.0
    for j = 0:ifloor(n-n*x)
        p = x+j/n
        s += pdf(Binomial(n,p),j) / p
    end
    s*x
end

cdf(d::KSOneSided,x::Real) = 1.0 - ccdf(d,x)
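
Usage sketch for the one-sided distribution (hypothetical numbers): the p-value of an observed D^+ is its upper tail via the Birnbaum–Tingey sum:

# sketch: one-sided KS p-value for a hypothetical observed statistic
n, dplus = 50, 0.2
pval = ccdf(KSOneSided(n), dplus)   # P(D+_50 > 0.2)
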