88# ' @param ... One or more selector functions to choose variables for this check.
99# ' See [selections()] for more details. You will usually want to use
1010# ' [recipes::all_predictors()] and/or [recipes::all_outcomes()] here.
11- # ' @param n The minimum number of data points required for training. If this is
12- # ' NULL, the total number of predictors will be used.
11+ # ' @param min_data_points The minimum number of data points required for
12+ # ' training. If this is NULL, the total number of predictors will be used.
1313# ' @param epi_keys A character vector of column names on which to group the data
1414# ' and check threshold within each group. Useful if your forecaster trains
1515# ' per group (for example, per geo_value).
1818# ' created.
1919# ' @param trained A logical for whether the selectors in `...`
2020# ' have been resolved by [prep()].
21- # ' @param columns An internal argument that tracks which columns are evaluated
22- # ' for this check. Should not be used by the user.
2321# ' @param id A character string that is unique to this check to identify it.
2422# ' @param skip A logical. If `TRUE`, only training data is checked, while if
2523# ' `FALSE`, both training and predicting data is checked. Technically, this
check_enough_data <-
  function(recipe,
           ...,
           min_data_points = NULL,
           epi_keys = NULL,
           drop_na = TRUE,
           role = NA,
           trained = FALSE,
           skip = TRUE,
           id = rand_id("enough_data")) {
    # Register the check on the recipe. The selectors in `...` are captured
    # unevaluated here; the prep() method resolves them against the training
    # data and, when `min_data_points` is still NULL, substitutes the number
    # of selected columns.
    recipes::add_check(
      recipe,
      check_enough_data_new(
        min_data_points = min_data_points,
        epi_keys = epi_keys,
        drop_na = drop_na,
        terms = enquos(...),
        role = role,
        trained = trained,
        # `columns` is internal bookkeeping, not a user-facing argument, so a
        # freshly added check always starts with it unset.
        columns = NULL,
        skip = skip,
        id = id
      )
    )
  }
7269
7370check_enough_data_new <-
74- function (n , epi_keys , drop_na , terms , role , trained , columns , skip , id ) {
71+ function (min_data_points , epi_keys , drop_na , terms ,
72+ role , trained , columns , skip , id ) {
7573 recipes :: check(
7674 subclass = " enough_data" ,
7775 prefix = " check_" ,
78- n = n ,
76+ min_data_points = min_data_points ,
7977 epi_keys = epi_keys ,
8078 drop_na = drop_na ,
8179 terms = terms ,
@@ -90,15 +88,14 @@ check_enough_data_new <-
9088# ' @export
9189prep.check_enough_data <- function (x , training , info = NULL , ... ) {
9290 col_names <- recipes :: recipes_eval_select(x $ terms , training , info )
93- if (is.null(x $ n )) {
94- x $ n <- length(col_names )
91+ if (is.null(x $ min_data_points )) {
92+ x $ min_data_points <- length(col_names )
9593 }
9694
9795 check_enough_data_core(training , x , col_names , " train" )
9896
99-
10097 check_enough_data_new(
101- n = x $ n ,
98+ min_data_points = x $ min_data_points ,
10299 epi_keys = x $ epi_keys ,
103100 drop_na = x $ drop_na ,
104101 terms = x $ terms ,
@@ -119,7 +116,7 @@ bake.check_enough_data <- function(object, new_data, ...) {
119116
120117# ' @export
print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) {
  # Header printed ahead of the selected terms; it reports the threshold
  # stored on the check object in `min_data_points`.
  title <- paste0("Check enough data (n = ", x$min_data_points, ") for ")
  recipes::print_step(x$columns, x$terms, x$trained, title, width)
  # Hand the object back invisibly so printing does not echo it a second time.
  invisible(x)
}
@@ -132,7 +129,7 @@ tidy.check_enough_data <- function(x, ...) {
132129 res <- tibble(terms = recipes :: sel2char(x $ terms ))
133130 }
134131 res $ id <- x $ id
135- res $ n <- x $ n
132+ res $ min_data_points <- x $ min_data_points
136133 res $ epi_keys <- x $ epi_keys
137134 res $ drop_na <- x $ drop_na
138135 res
@@ -145,18 +142,18 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
145142 any_missing_data <- epi_df %> %
146143 mutate(any_are_na = rowSums(across(any_of(.env $ col_names ), ~ is.na(.x ))) > 0 ) %> %
147144 # count the number of rows where they're all not na
148- summarise(sum(any_are_na == 0 ) < .env $ step_obj $ n , .groups = " drop" )
145+ summarise(sum(any_are_na == 0 ) < .env $ step_obj $ min_data_points , .groups = " drop" )
149146 any_missing_data <- any_missing_data %> %
150147 summarize(across(all_of(setdiff(names(any_missing_data ), step_obj $ epi_keys )), any )) %> %
151148 any()
152149
153- # figuring out which individual columns (if any) are to blame for this darth
150+ # figuring out which individual columns (if any) are to blame for this dearth
154151 # of data
155152 cols_not_enough_data <- epi_df %> %
156153 summarise(
157154 across(
158155 all_of(.env $ col_names ),
159- ~ sum(! is.na(.x )) < .env $ step_obj $ n
156+ ~ sum(! is.na(.x )) < .env $ step_obj $ min_data_points
160157 ),
161158 .groups = " drop"
162159 ) %> %
@@ -176,12 +173,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
176173 } else {
177174 # if we're not dropping na values, just count
178175 cols_not_enough_data <- epi_df %> %
179- summarise(
180- across(
181- all_of(.env $ col_names ),
182- ~ dplyr :: n() < .env $ step_obj $ n
183- )
184- )
176+ summarise(across(all_of(.env $ col_names ), ~ dplyr :: n() < .env $ step_obj $ min_data_points ))
185177 any_missing_data <- cols_not_enough_data %> %
186178 summarize(across(all_of(.env $ col_names ), all )) %> %
187179 all()
0 commit comments