diff --git a/src/distributions/add_noise.jl b/src/distributions/add_noise.jl index f4f2a8b..433ec92 100644 --- a/src/distributions/add_noise.jl +++ b/src/distributions/add_noise.jl @@ -1,3 +1,8 @@ +""" + noisy_value::Float64 ~ AddNoise(mean::Float64, std::Float64) + +Adds normally-distributed random noise (with standard deviation `std`) to the value `mean`. +""" struct AddNoise <: PCleanDistribution end has_discrete_proposal(::AddNoise) = false diff --git a/src/distributions/add_typos.jl b/src/distributions/add_typos.jl index 8ffcc4e..9e025c0 100644 --- a/src/distributions/add_typos.jl +++ b/src/distributions/add_typos.jl @@ -1,5 +1,26 @@ import StringDistances: DamerauLevenshtein, evaluate +""" + word_with_typos::String ~ AddTypos(word::String, max_typos=nothing) + +Add a random number of random typos to `word`. + +The distribution on the number of typos added to a word depends on the word +length. On average there is approximately 1 typo for every 45 characters in the +input word when `max_typos` is large or not provided. + +The typos can be one of several types: + +- insertion: insert a random lower-case letter at a random location + +- deletion: delete a random character + +- substitution: replace a random character with a random lower-case letter + +- transpose: swap a random pair of two consecutive letters + +NOTE: The log-density is approximate. +""" struct AddTypos <: PCleanDistribution end has_discrete_proposal(::AddTypos) = false diff --git a/src/distributions/maybe_swap.jl b/src/distributions/maybe_swap.jl index e4cdbc8..a23f7a3 100644 --- a/src/distributions/maybe_swap.jl +++ b/src/distributions/maybe_swap.jl @@ -1,3 +1,8 @@ +""" + MaybeSwap(val, options, prob) + +With probability `prob`, return a random element from `options`, otherwise return `val`. 
+""" struct MaybeSwap <: PCleanDistribution end supports_explicitly_missing_observations(::MaybeSwap) = true diff --git a/src/distributions/string_prior.jl b/src/distributions/string_prior.jl index 7270b81..d4a88e0 100644 --- a/src/distributions/string_prior.jl +++ b/src/distributions/string_prior.jl @@ -1,5 +1,13 @@ using CSV +""" + str::String ~ StringPrior(min_length, max_length, proposal_atoms::Vector{String}) + +Sample a string of random length from a simple bigram model fit to English text. + +The string length is uniformly distributed between `min_length` and `max_length` (inclusive). +The alphabet is the set {'a', 'b', .., 'z', ' ', '.'}. +""" struct StringPrior <: PCleanDistribution end letter_probs_file = joinpath(dirname(pathof(PClean)), "distributions", "lmparams", "letter_probabilities.csv") diff --git a/src/distributions/time_prior.jl b/src/distributions/time_prior.jl index c22f6f5..d2e9b07 100644 --- a/src/distributions/time_prior.jl +++ b/src/distributions/time_prior.jl @@ -1,5 +1,12 @@ using CSV +""" + timestamp::String ~ TimePrior(proposal_atoms::Vector{String}) + +Return a random time stamp of the form `@sprintf("%d:%02d %s", hours, minutes, ampm)`. + +The `hours`, `minutes` and `ampm` are drawn uniformly from {1, .., 12}, {0, .., 59}, and {"a.m.", "p.m."} respectively. +""" struct TimePrior <: PCleanDistribution end has_discrete_proposal(::TimePrior) = true