From e1519e029b8ee3e73e681c37686fe005b7868ba4 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 22 Aug 2025 15:44:56 -0500 Subject: [PATCH 1/5] remove local definitions of fulldummy and nesting operator These are now implemented as part of RegressionFormulae.jl. At some point, we may need to provide more specialized methods for complex random effects structures (i.e. interaction with correlation suppression), but these can and should be implemented as methods of RegressionFormuale.jl's functions. --- Project.toml | 3 +- src/MixedModels.jl | 1 + src/randomeffectsterm.jl | 64 ---------------------------------------- test/FactorReTerm.jl | 9 +----- 4 files changed, 4 insertions(+), 73 deletions(-) diff --git a/Project.toml b/Project.toml index 02dcd04cc..ef5533d78 100644 --- a/Project.toml +++ b/Project.toml @@ -19,6 +19,7 @@ PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +RegressionFormulae = "545c379f-4ec2-4339-9aea-38f2fb6a8ba2" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -93,4 +94,4 @@ Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Aqua", "DataFrames", "ExplicitImports", "FiniteDiff", "ForwardDiff", "InteractiveUtils", "PRIMA", "RegressionFormulae", "StableRNGs", "Suppressor", "Test"] +test = ["Aqua", "DataFrames", "ExplicitImports", "FiniteDiff", "ForwardDiff", "InteractiveUtils", "PRIMA", "StableRNGs", "Suppressor", "Test"] diff --git a/src/MixedModels.jl b/src/MixedModels.jl index 852be8fdd..121079262 100644 --- a/src/MixedModels.jl +++ b/src/MixedModels.jl @@ -25,6 +25,7 @@ using NLopt: NLopt using PrecompileTools: PrecompileTools, @setup_workload, @compile_workload using ProgressMeter: ProgressMeter, Progress, finish!, next! using Random: Random, AbstractRNG, randn! +using RegressionFormulae: fulldummy using SparseArrays: SparseArrays, SparseMatrixCSC, SparseVector, dropzeros!, nnz using SparseArrays: nonzeros, nzrange, rowvals, sparse using StaticArrays: StaticArrays, SVector diff --git a/src/randomeffectsterm.jl b/src/randomeffectsterm.jl index c7fdacdf0..6cea36035 100644 --- a/src/randomeffectsterm.jl +++ b/src/randomeffectsterm.jl @@ -164,70 +164,6 @@ function _ranef_refs( return refs, uniques end -# TODO: remove all of this and either -# - require users to use RegressionFormulae.jl -# - add a dependency on RegressionFormulae.jl - -function StatsModels.apply_schema( - t::FunctionTerm{typeof(/)}, sch::StatsModels.FullRank, Mod::Type{<:MixedModel} -) - if length(t.args) ≠ 2 - throw(ArgumentError("malformed nesting term: $t (Exactly two arguments required")) - end - - first, second = apply_schema.(t.args, Ref(sch), Mod) - - if !(typeof(first) <: CategoricalTerm) - throw( - ArgumentError( - "nesting terms requires categorical grouping term, got $first. Manually specify $first as `CategoricalTerm` in hints/contrasts" - ), - ) - end - - return first + fulldummy(first) & second -end - -# add some syntax to manually promote to full dummy coding -function fulldummy(t::AbstractTerm) - return throw( - ArgumentError( - "can't promote $t (of type $(typeof(t))) to full dummy " * - "coding (only CategoricalTerms)", - ), - ) -end - -""" - fulldummy(term::CategoricalTerm) - -Assign "contrasts" that include all indicator columns (dummy variables) and an intercept column. - -This will result in an under-determined set of contrasts, which is not a problem in the random -effects because of the regularization, or "shrinkage", of the conditional modes. - -The interaction of `fulldummy` with complex random effects is subtle and complex with numerous -potential edge cases. As we discover these edge cases, we will document and determine their -behavior. Until such time, please check the model summary to verify that the expansion is -working as you expected. If it is not, please report a use case on GitHub. -""" -function fulldummy(t::CategoricalTerm) - new_contrasts = StatsModels.ContrastsMatrix( - StatsModels.FullDummyCoding(), t.contrasts.levels - ) - return t = CategoricalTerm(t.sym, new_contrasts) -end - -function fulldummy(x) - return throw(ArgumentError("fulldummy isn't supported outside of a MixedModel formula")) -end - -function StatsModels.apply_schema( - t::FunctionTerm{typeof(fulldummy)}, sch::StatsModels.FullRank, Mod::Type{<:MixedModel} -) - return fulldummy(apply_schema.(t.args, Ref(sch), Mod)...) -end - # specify zero correlation struct ZeroCorr <: AbstractReTerm term::RandomEffectsTerm diff --git a/test/FactorReTerm.jl b/test/FactorReTerm.jl index 8f1993795..d84bea415 100644 --- a/test/FactorReTerm.jl +++ b/test/FactorReTerm.jl @@ -172,12 +172,10 @@ end f=string.(repeat('A':'C'; outer=6))) @testset "fulldummy" begin - @test_throws ArgumentError fulldummy(1) - f = @formula(y ~ 1 + fulldummy(f)) f1 = apply_schema(f, schema(dat)) @test typeof(last(f1.rhs.terms)) <: FunctionTerm{typeof(fulldummy)} - @test_throws ArgumentError modelcols(f1, dat) + @test_throws MethodError modelcols(f1, dat) f2 = apply_schema(f, schema(dat), MixedModel) @test typeof(last(f2.rhs.terms)) <: CategoricalTerm{<:StatsModels.FullDummyCoding} @@ -240,11 +238,6 @@ end @formula(0 ~ 1 + a / b), schema(d2), MixedModel ) - # errors for too much nesting - @test_throws ArgumentError apply_schema( - @formula(0 ~ 1 + b / c / a), schema(d2), MixedModel - ) - # fitted model to test amalgamate and fnames, and equivalence with other formulations psts = dataset("pastes") m = fit( From caa41170d4e63dac3437bf54e256dc69e21e24df Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 22 Aug 2025 15:50:30 -0500 Subject: [PATCH 2/5] docs tweak --- docs/src/constructors.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/src/constructors.md b/docs/src/constructors.md index b6cf4a642..ef3dd3dc3 100644 --- a/docs/src/constructors.md +++ b/docs/src/constructors.md @@ -188,9 +188,6 @@ DisplayAs.Text(ans) # hide (Notice that the variance component for `days: 1` is estimated as zero, so the correlations for this component are undefined and expressed as `NaN`, not a number.) An alternative is to force all the levels of `days` as indicators using `fulldummy` encoding. -```@docs -fulldummy -``` ```@example Main fit(MixedModel, @formula(reaction ~ 1 + days + (1 + fulldummy(days)|subj)), sleepstudy, contrasts = Dict(:days => DummyCoding())) From 076535bd7a9e485ada1408012054d794fed1b5c8 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 22 Aug 2025 15:51:23 -0500 Subject: [PATCH 3/5] NEWS --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 790eabd58..b1633a52d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,7 @@ MixedModels v5.0.0 Release Notes - Internal code around the default optimizer has been restructured. In particular, the NLopt backend has been moved to a submodule, which will make it easier to move it to an extension if we promote another backend to the default. [#853] - Internal code around optimization in profiling has been restructuring so that fitting done during calls to `profile` respect the `backend` and `optimizer` settings. [#853] - The `prfit!` convenience function has been removed. [#853] +- The local implementation of `fulldummy` and the nesting syntax has been removed and a dependency on RegressionFormulae.jl for their implementation has been added. [#855] MixedModels v4.38.0 Release Notes ============================== From f5a7945c114cfe5e0a626fffcfdd0b43685e17c1 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sat, 23 Aug 2025 14:21:19 -0500 Subject: [PATCH 4/5] add documentation on formula syntax --- docs/make.jl | 1 + docs/src/formula_syntax.md | 71 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 docs/src/formula_syntax.md diff --git a/docs/make.jl b/docs/make.jl index 1ee57b3c4..ed1b5f7d7 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -22,6 +22,7 @@ makedocs(; "rankdeficiency.md", "mime.md", "derivatives.md", + "formula_syntax.md", "api.md", ], ) diff --git a/docs/src/formula_syntax.md b/docs/src/formula_syntax.md new file mode 100644 index 000000000..5c2edb6ba --- /dev/null +++ b/docs/src/formula_syntax.md @@ -0,0 +1,71 @@ + +# Formula syntax + +MixedModels.jl uses the variant of the Wilkinson-Rogers (1973) notation for models of (co)variance implemented by [StatsModels.jl](https://juliastats.org/StatsModels.jl/stable/formula/#Modeling-tabular-data). +Additionally, MixedModels.jl extends this syntax to use the pipe `|` as the grouping operator. +Further extensions are provided by [RegressionFormulae.jl](https://github.com/kleinschmidt/RegressionFormulae.jl?tab=readme-ov-file), in particular the use of the slash `/` as the nesting operator and the use of the caret `^` to indicate main effects and interactions up to a specified order. +Currently, MixedModels.jl loads RegressionFormulae.jl by default, though this may change in a future release. +If you require specific functionality from RegressionFormulae.jl, it is best to load it directly so that you can control the version used. + +## General rules + +- "Addition" (`+`) indicates additive, i.e., main effects: `a + b` indicates main effects of `a` and `b`. +- "Multiplication" (`*`) indicates crossing: main effects and interactions between two terms: `a * b` indicates main effects of `a` and `b` as well as their interaction. +- Usual algebraic rules apply (associativity and distributivity): + - `(a + b) * c` is equivalent to `a * c + b * c` + - `a * b * c` corresponds to main effects of `a`, `b`, and `c`, as well as all three two-way interactions and the three-way interaction. +- Categorical terms are expanded into the associated indicators/contrast variables. See the [StatsModels.jl documentation on contrasts](https://juliastats.org/StatsModels.jl/stable/contrasts/) for more information. +- Interactions are expressed with the ampersand (`&`). (This is contrast to R, which uses the colon `:` for this operation.). `a&b` is the interaction of `a` and `b`. For categorical terms, appropriate combinations of indicators/contrast variables are generated. +- Tilde (`~`) is used to separate response from predictors. +- The intercept is indicated by `1`. +- `y ~ 1 + (a + b) * c` is read as: + - The response variable is `y`. + - The model contains an intercept. + - The model contains main effects of `a`, `b`, and `c`. + - The model contains interactions between `a` and `c` and between `b` and `c` but not `a` and `b`. +- An intercept is included by default, i.e. there is an implicit `1 + ` in every formula. The intercept may be suppressed by including a `0 + ` in the formula. (In contrast to R, the use of `-1` is **not** supported.) + +### MixedModels.jl provided extensions + +- The pipe operator (`|`) indicates grouping or blocking. +- `(1 + a | subject)` indicates "by-subject random effects for the intercept and main effect `a`". +- This is in line with the usual statistical reading of `|` as "conditional on". + +### RegressionFormulae.jl provided extensions + +- "Exponentiation" (`^`) works like repeated multiplication and generates all multiplicative and additive terms up to the given order. + - `(a + b + c)^2` generates `a + b + c + a&b + a&c + b&c`, but not `a&b&c`. + - The presence of interaction terms within the base will result in redundant terms and is currently unsupported. +- `fulldummy(a)` assigns "contrasts" to `a` that include all indicator columns (dummy variables) and an intercept column. The resulting overparameterization is generally useful in the fixed effects only as part of nesting. +- The slash operator (`/`) indicates nesting: + - `a / b` is read as "`b` is nested within `a`". + - `a / b` expands to `a + fulldummy(a) & b`. +- It is generally not necessary to specify nesting in the blocking variables, when the inner levels are unique across outer levels. In other words, in a study with children (`C1`, `C2`, etc. ) nested within schools (`S1`, `S2`, etc.), + - it is not necessary to specify the nesting when `C1` identifies a unique child across schools. In other words, intercept-only random effects terms can be written as `(1|C) + `(1|S)`. + - it is necessary to specify the nesting when chid identifiers are re-used across schools, e.g. `C1` refers to a child in `S1` and a different child in `S2`. In this case, the nested syntax `(1|S/C)` expands to `(1|S) + (1|S&C)`. The interaction term in the second blocking variable generates unique labels for each child across schools. + + + +## Mixed models in Wilkinson-Rogers and mathematical notation + +Models fit with MixedModels.jl are generally linear mixed-effects models with unconstrained random effects covariance matrices and homoskedastic, normally distributed residuals. +Under these assumptions, the model specification + +`response ~ 1 + (age + sex) * education * n_children + (1 | subject)` + +corresponds to the statistical model + +\begin{align*} +\left(Y |\mathcal{B}=b\right) &\sim N\left(X\beta + Zb, \sigma^2 I \right) \\ +\mathcal{B} &\sim N\left(0, G\right) +\end{align*} + +for which we wish to obtain the maximum-likelihood estimates for $G$ and thus the fixed-effects $\beta$. + +- The model contains no restrictions on $G$, except that it is positive semidefinite. +- The response Y is the value of a given response. +- The fixed-effects design matrix X consists of columns for + - the intercept, age, sex, education, and number of children (contrast coded as appropriate) + - the interaction of all lower order terms, excluding interactions between age and sex +- The random-effects design matrix Z includes a column for + - the intercept for each subject From c1e4b9ddca54b679d52d8ed175e9da063b913e07 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sat, 23 Aug 2025 14:36:38 -0500 Subject: [PATCH 5/5] fix math formatting --- docs/src/formula_syntax.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/src/formula_syntax.md b/docs/src/formula_syntax.md index 5c2edb6ba..640cafdf2 100644 --- a/docs/src/formula_syntax.md +++ b/docs/src/formula_syntax.md @@ -55,17 +55,19 @@ Under these assumptions, the model specification corresponds to the statistical model +```math \begin{align*} \left(Y |\mathcal{B}=b\right) &\sim N\left(X\beta + Zb, \sigma^2 I \right) \\ \mathcal{B} &\sim N\left(0, G\right) \end{align*} +``` -for which we wish to obtain the maximum-likelihood estimates for $G$ and thus the fixed-effects $\beta$. +for which we wish to obtain the maximum-likelihood estimates for ``G`` and thus the fixed-effects ``\beta``. -- The model contains no restrictions on $G$, except that it is positive semidefinite. -- The response Y is the value of a given response. -- The fixed-effects design matrix X consists of columns for +- The model contains no restrictions on ``G``, except that it is positive semidefinite. +- The response ``Y`` is the value of a given response. +- The fixed-effects design matrix ``X`` consists of columns for - the intercept, age, sex, education, and number of children (contrast coded as appropriate) - the interaction of all lower order terms, excluding interactions between age and sex -- The random-effects design matrix Z includes a column for +- The random-effects design matrix ``Z`` includes a column for - the intercept for each subject