-
Notifications
You must be signed in to change notification settings - Fork 433
Add multivariate hypergeometric distribution #1963
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 21 commits
dd6376e
8bcfaba
7f494af
47e416d
585caff
43e824a
53b005f
f2d6b4f
4bdaca4
d17c249
9e37f95
91d9da5
74240a6
88878b4
e6b67d7
f7b61ff
58f992d
65c83f7
dbeeb04
30c7882
fc068db
3f5985c
a5f8cc0
17ee592
a01a3e1
4d2ed35
c0a6cc0
5fb20ff
4d58d63
b0c2386
53f875a
7042107
947fed3
d096ba5
20d68b8
4853cbc
84dfbb6
cccd79f
6c20f7e
b3496ab
26e33f7
afe12d1
6b0ad57
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -59,6 +59,7 @@ MvNormalCanon | |
| MvLogitNormal | ||
| MvLogNormal | ||
| Dirichlet | ||
| MvHypergeometric | ||
| ``` | ||
|
|
||
| ## Addition Methods | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| """ | ||
| The [Multivariate hypergeometric distribution](https://en.wikipedia.org/wiki/Hypergeometric_distribution#Multivariate_hypergeometric_distribution) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Julia docstrings should contain the type or function signature in the first line, indented by four spaces. Generally, could you make this docstring consistent with docstrings of other existing distribution types?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've added the function signature. The doc string is based on the multinomial distribution docstring. Is there a different distribution I should be following? |
||
| generalizes the *hypergeometric distribution*. Consider ``n`` draws from a finite population containing ``k`` types of elements. Suppose that the population has size ``M`` and there are ``m_i`` elements of type ``i`` for ``i = 1, .., k`` with ``m_1+...m_k = M``. Let ``X = (X_1, ..., X_k)`` where ``X_i`` represents the number of elements of type ``i`` drawn, then the distribution of ``X`` is a multivariate hypergeometric distribution. Each sample of a multivariate hypergeometric distribution is a ``k``-dimensional integer vector that sums to ``n`` and satisfies ``0 \\le X_i \\le m_i``. | ||
| The probability mass function is given by | ||
| ```math | ||
| f(x; m, n) = {{{m_1 \\choose x_1}{m_2 \\choose x_2}\\cdots {m_k \\choose x_k}}\\over {N \\choose n}}, | ||
| \\quad x_1 + \\cdots + x_k = n, \\quad x_i \\le m_i | ||
| ``` | ||
| ```julia | ||
| MvHypergeometric(m, n) # Multivariate hypergeometric distribution for a population with | ||
| # m = (m_1, ..., m_k) elements of type 1 to k and n draws | ||
| ``` | ||
| """ | ||
| struct MvHypergeometric <: DiscreteMultivariateDistribution | ||
| m::AbstractVector{Int} # number of elements of each type | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| n::Int # number of draws | ||
| function MvHypergeometric(m::AbstractVector{Int}, n::Int; check_args::Bool=true) | ||
| @check_args( | ||
| MvHypergeometric, | ||
| (m, all(m .>= zero.(n))), | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| zero(n) <= n <= sum(m), | ||
| ) | ||
| new(m, n) | ||
| end | ||
| end | ||
|
|
||
|
|
||
| # Parameters | ||
|
|
||
| ncategories(d::MvHypergeometric) = length(d.m) | ||
| length(d::MvHypergeometric) = ncategories(d) | ||
| nelements(d::MvHypergeometric) = d.m | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ntrials(d::MvHypergeometric) = d.n | ||
|
|
||
| params(d::MvHypergeometric) = (d.m, d.n) | ||
Michael-Howes marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| # Statistics | ||
|
|
||
| mean(d::MvHypergeometric) = d.n .* d.m ./ sum(d.m) | ||
|
|
||
| function var(d::MvHypergeometric) | ||
| m = nelements(d) | ||
| k = length(m) | ||
| n = ntrials(d) | ||
| M = sum(m) | ||
| p = m / M | ||
| f = n * (M - n) / (M - 1) | ||
|
|
||
| v = Vector{Real}(undef, k) | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| for i = 1:k | ||
| @inbounds p_i = p[i] | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| v[i] = f * p_i * (1 - p_i) | ||
| end | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| v | ||
| end | ||
|
|
||
| function cov(d::MvHypergeometric) | ||
| m = nelements(d) | ||
| k = length(m) | ||
| n = ntrials(d) | ||
| M = sum(m) | ||
| p = m / M | ||
| f = n * (M - n) / (M - 1) | ||
|
|
||
| C = Matrix{Real}(undef, k, k) | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| for j = 1:k | ||
| pj = p[j] | ||
| for i = 1:j-1 | ||
| @inbounds C[i, j] = -f * p[i] * pj | ||
|
||
| end | ||
|
|
||
| @inbounds C[j, j] = f * pj * (1 - pj) | ||
| end | ||
|
|
||
| for j = 1:k-1 | ||
| for i = j+1:k | ||
| @inbounds C[i, j] = C[j, i] | ||
| end | ||
| end | ||
| C | ||
| end | ||
|
|
||
|
|
||
| # Evaluation | ||
| function insupport(d::MvHypergeometric, x::AbstractVector{T}) where T<:Real | ||
| k = length(d) | ||
| m = nelements(d) | ||
| length(x) == k || return false | ||
| s = 0.0 | ||
| for i = 1:k | ||
| @inbounds xi = x[i] | ||
| if !(isinteger(xi) && xi >= 0 && xi <= m[i]) | ||
| return false | ||
| end | ||
| s += xi | ||
| end | ||
| return s == ntrials(d) # integer computation would not yield truncation errors | ||
| end | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| function _logpdf(d::MvHypergeometric, x::AbstractVector{T}) where T<:Real | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| m = nelements(d) | ||
| M = sum(m) | ||
| n = ntrials(d) | ||
| insupport(d, x) || return -Inf | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| s = -logabsbinomial(M, n)[1] | ||
| for i = 1:length(m) | ||
| @inbounds xi = x[i] | ||
| @inbounds m_i = m[i] | ||
| s += logabsbinomial(m_i, xi)[1] | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| end | ||
| return s | ||
| end | ||
|
|
||
| # Sampling is performed by sequentially sampling each entry from the | ||
| # hypergeometric distribution | ||
| _rand!(rng::AbstractRNG, d::MvHypergeometric, x::AbstractVector{Int}) = | ||
| mvhypergeom_rand!(rng, nelements(d), ntrials(d), x) | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| function mvhypergeom_rand!(rng::AbstractRNG, m::AbstractVector{Int}, n::Int, | ||
| x::AbstractVector{<:Real}) | ||
| k = length(m) | ||
| length(x) == k || throw(DimensionMismatch("Invalid argument dimension.")) | ||
|
|
||
| M = sum(m) | ||
| i = 0 | ||
| km1 = k - 1 | ||
|
|
||
| while i < km1 && n > 0 | ||
| i += 1 | ||
| @inbounds mi = m[i] | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| # Sample from hypergeometric distribution. Element of type i are | ||
| # considered successes. All other elements are considered failures. | ||
| xi = rand(rng, Hypergeometric(mi, M - mi, n)) | ||
| @inbounds x[i] = xi | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| # Remove elements of type i from population and group to be sampled. | ||
| n -= xi | ||
| M -= mi | ||
| end | ||
|
|
||
| if i == km1 | ||
| @inbounds x[k] = n | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| else # n must have been zero. | ||
| z = zero(eltype(x)) | ||
| for j = i+1:k | ||
| @inbounds x[j] = z | ||
Michael-Howes marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| end | ||
| end | ||
|
|
||
| return x | ||
| end | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,133 @@ | ||
| # Tests for Multivariate Hypergeometric | ||
|
|
||
| using Distributions | ||
| using Test | ||
|
|
||
| @testset "Multivariate Hypergeometric" begin | ||
| @test_throws DomainError MvHypergeometric([5, 3, -2], 4) | ||
| @test_throws ArgumentError MvHypergeometric([5, 3], 10) | ||
|
|
||
| m = [5, 3, 2] | ||
| n = 4 | ||
| d = MvHypergeometric(m, n) | ||
| @test length(d) == 3 | ||
| @test d.n == n | ||
| @test nelements(d) == m | ||
| @test ncategories(d) == length(m) | ||
| @test params(d) == (m, n) | ||
|
|
||
| @test mean(d) ≈ [2.0, 1.2, 0.8] | ||
| @test var(d) ≈ [2 / 3, 56 / 100, 32 / 75] | ||
|
|
||
| covmat = cov(d) | ||
| @test covmat ≈ (8 / 3) .* [1/4 -3/20 -1/10; -3/20 21/100 -3/50; -1/10 -3/50 4/25] | ||
|
|
||
| @test insupport(d, [2, 1, 1]) | ||
| @test !insupport(d, [3, 2, 1]) | ||
| @test !insupport(d, [0, 0, 4]) | ||
|
|
||
|
|
||
| # random sampling | ||
| x = rand(d) | ||
| @test isa(x, Vector{Int}) | ||
| @test sum(x) == n | ||
| @test all(x .>= 0) | ||
| @test all(x .<= m) | ||
| @test insupport(d, x) | ||
|
|
||
| x = rand(d, 100) | ||
| @test isa(x, Matrix{Int}) | ||
| @test all(sum(x, dims=1) .== n) | ||
| @test all(x .>= 0) | ||
| @test all(x .<= m) | ||
| @test all(insupport(d, x)) | ||
|
|
||
| # random sampling with many catergories | ||
| m = [20, 2, 2, 2, 1, 1, 1] | ||
| n = 5 | ||
| d2 = MvHypergeometric(m, n) | ||
| x = rand(d2) | ||
| @test isa(x, Vector{Int}) | ||
| @test sum(x) == n | ||
| @test all(x .>= 0) | ||
| @test all(x .<= m) | ||
| @test insupport(d2, x) | ||
|
|
||
| # random sampling with a large category | ||
| m = [2, 1000] | ||
| n = 5 | ||
| d3 = MvHypergeometric(m, n) | ||
| x = rand(d3) | ||
| @test isa(x, Vector{Int}) | ||
| @test sum(x) == n | ||
| @test all(x .>= 0) | ||
| @test all(x .<= m) | ||
| @test insupport(d3, x) | ||
|
|
||
| # log pdf | ||
| x = [2, 1, 1] | ||
| @test pdf(d, x) ≈ 2 / 7 | ||
| @test logpdf(d, x) ≈ log(2 / 7) | ||
| @test logpdf(d, x) ≈ log(pdf(d, x)) | ||
|
|
||
| x = rand(d, 100) | ||
| pv = pdf(d, x) | ||
| lp = logpdf(d, x) | ||
| for i in 1:size(x, 2) | ||
| @test pv[i] ≈ pdf(d, x[:, i]) | ||
| @test lp[i] ≈ logpdf(d, x[:, i]) | ||
| end | ||
|
|
||
| # test degenerate cases | ||
| d1 = MvHypergeometric([1], 1) | ||
| @test logpdf(d1, [1]) ≈ 0 | ||
| @test logpdf(d1, [0]) == -Inf | ||
| d2 = MvHypergeometric([2, 0], 1) | ||
| @test logpdf(d2, [1, 0]) ≈ 0 | ||
| @test logpdf(d2, [0, 1]) == -Inf | ||
|
|
||
| d3 = MvHypergeometric([5, 0, 0, 0], 3) | ||
| @test logpdf(d3, [3, 0, 0, 0]) ≈ 0 | ||
| @test logpdf(d3, [2, 1, 0, 0]) == -Inf | ||
| @test logpdf(d3, [2, 0, 0, 0]) == -Inf | ||
|
|
||
| # behavior with n = 0 | ||
| d0 = MvHypergeometric([5, 3, 2], 0) | ||
| @test logpdf(d0, [0, 0, 0]) ≈ 0 | ||
| @test logpdf(d0, [1, 0, 0]) == -Inf | ||
|
|
||
| @test rand(d0) == [0, 0, 0] | ||
| @test mean(d0) == [0.0, 0.0, 0.0] | ||
| @test var(d0) == [0.0, 0.0, 0.0] | ||
| @test insupport(d0, [0, 0, 0]) | ||
| @test !insupport(d0, [1, 0, 0]) | ||
| @test length(d0) == 3 | ||
|
|
||
| # compare with hypergeometric | ||
| ns = 3 | ||
| nf = 5 | ||
| n = 4 | ||
| dh1 = MvHypergeometric([ns, nf], n) | ||
| dh2 = Hypergeometric(ns, nf, n) | ||
|
|
||
| x = 2 | ||
| @test pdf(dh1, [x, n - x]) ≈ pdf(dh2, x) | ||
| x = 3 | ||
| @test pdf(dh1, [x, n - x]) ≈ pdf(dh2, x) | ||
|
|
||
| # comparing marginals to hypergeometric | ||
| m = [5, 3, 2] | ||
| n = 4 | ||
| d = MvHypergeometric(m, n) | ||
| dh = Hypergeometric(m[1], sum(m[2:end]), n) | ||
| x1 = 2 | ||
| @test pdf(dh, x1) ≈ sum([pdf(d, [x1, x2, n - x1 - x2]) for x2 in 0:m[2]]) | ||
|
|
||
| # comparing conditionals to hypergeometric | ||
| x1 = 2 | ||
| dh = Hypergeometric(m[2], m[3], n - x1) | ||
| q = sum([pdf(d, [x1, x2, n - x1 - x2]) for x2 in 0:m[2]]) | ||
| for x2 = 0:m[2] | ||
| @test pdf(dh, x2) ≈ pdf(d, [x1, x2, n - x1 - x2]) / q | ||
| end | ||
| end |
Uh oh!
There was an error while loading. Please reload this page.