From 0c21801b591258a2483bf2a745d5efdb30fe7d59 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Feb 2025 11:31:57 -0800 Subject: [PATCH 001/107] refactor: hoist some epi_slide_opt pre-processing to helpers For re-use in an epix_epi_slide_opt --- R/slide.R | 292 +++++++++++++++----------- man/across_ish_names_info.Rd | 46 ++++ man/upstream_slide_f_info.Rd | 23 ++ man/upstream_slide_f_possibilities.Rd | 16 ++ 4 files changed, 259 insertions(+), 118 deletions(-) create mode 100644 man/across_ish_names_info.Rd create mode 100644 man/upstream_slide_f_info.Rd create mode 100644 man/upstream_slide_f_possibilities.Rd diff --git a/R/slide.R b/R/slide.R index abc7c3b77..b0de0f8d2 100644 --- a/R/slide.R +++ b/R/slide.R @@ -549,6 +549,164 @@ get_before_after_from_window <- function(window_size, align, time_type) { list(before = before, after = after) } + +#' Information about upstream (`{data.table}`/`{slider}`) slide functions +#' +#' Underlies [`upstream_slide_f_info`]. +#' +#' @keywords internal +upstream_slide_f_possibilities <- tibble::tribble( + ~f, ~package, ~namer, + frollmean, "data.table", ~ if (is.logical(.x)) "prop" else "av", + frollsum, "data.table", ~ if (is.logical(.x)) "count" else "sum", + frollapply, "data.table", ~"slide", + slide_sum, "slider", ~ if (is.logical(.x)) "count" else "sum", + slide_prod, "slider", ~"prod", + slide_mean, "slider", ~ if (is.logical(.x)) "prop" else "av", + slide_min, "slider", ~"min", + slide_max, "slider", ~"max", + slide_all, "slider", ~"all", + slide_any, "slider", ~"any", +) + +#' Validate & get information about an upstream slide function +#' +#' @param .f function such as `data.table::frollmean` or `slider::slide_mean`; +#' must appear in [`upstream_slide_f_possibilities`] +#' @return named list with two elements: `from_package`, a string containing the +#' upstream package name ("data.table" or "slider"), and `namer`, a function +#' that takes a column to call `.f` on and outputs a basic name or +#' abbreviation for what operation `.f` represents on that kind of column +#' (e.g., "sum", "av", "count"). +#' +#' @keywords internal +upstream_slide_f_info <- function(.f) { + # Check that slide function `.f` is one of those short-listed from + # `data.table` and `slider` (or a function that has the exact same definition, + # e.g. if the function has been reexported or defined locally). Extract some + # metadata. `namer` will be mapped over columns (.x will be a column, not the + # entire edf). + f_info_row <- upstream_slide_f_possibilities %>% + filter(map_lgl(.data$f, ~ identical(.f, .x))) + if (nrow(f_info_row) == 0L) { + # `f` is from somewhere else and not supported + cli_abort( + c( + "problem with {rlang::expr_label(rlang::caller_arg(f))}", + "i" = "`f` must be one of `data.table`'s rolling functions (`frollmean`, + `frollsum`, `frollapply`. See `?data.table::roll`) or one of + `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, + etc. See `?slider::\`summary-slide\`` for more options)." + ), + class = "epiprocess__epi_slide_opt__unsupported_slide_function", + epiprocess__f = .f + ) + } + if (nrow(f_info_row) > 1L) { + cli_abort('epiprocess internal error: looking up `.f` in table of possible + functions yielded multiple matches. Please report it using "New + issue" at https://github.com/cmu-delphi/epiprocess/issues, using + reprex::reprex to provide a minimal reproducible example.') + } + f_from_package <- f_info_row$package + list( + from_package = f_from_package, + namer = unwrap(f_info_row$namer) + ) +} + +#' Calculate input and output column names for an `{epiprocess}` [`dplyr::across`]-like operations +#' +#' @param .x data.frame to perform input column tidyselection on +#' @param time_type as in [`new_epi_df`] +#' @param col_names_quo enquosed input column tidyselect expression +#' @param .f_namer function taking an input column object and outputting a name +#' for a corresponding output column; see [`upstream_slide_f_info`] +#' @param .window_size as in [`epi_slide_opt`] +#' @param .align as in [`epi_slide_opt`] +#' @param .prefix as in [`epi_slide_opt`] +#' @param .suffix as in [`epi_slide_opt`] +#' @param .new_col_names as in [`epi_slide_opt`] +#' @return named list with two elements: `input_col_names`, chr, subset of +#' `names(.x)`; and `output_colnames`, chr, same length as `input_col_names` +#' +#' @keywords internal +across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, .window_size, .align, .prefix, .suffix, .new_col_names) { + # The position of a given column can be differ between input `.x` and + # `.data_group` since the grouping step by default drops grouping columns. + # To avoid rerunning `eval_select` for every `.data_group`, convert + # positions of user-provided `col_names` into string column names. We avoid + # using `names(pos)` directly for robustness and in case we later want to + # allow users to rename fields via tidyselection. + pos <- eval_select(col_names_quo, data = .x, allow_rename = FALSE) + input_col_names <- names(.x)[pos] + + # Handle output naming + if ((!is.null(.prefix) || !is.null(.suffix)) && !is.null(.new_col_names)) { + cli_abort( + "Can't use both .prefix/.suffix and .new_col_names at the same time.", + class = "epiprocess__epi_slide_opt_incompatible_naming_args" + ) + } + assert_string(.prefix, null.ok = TRUE) + assert_string(.suffix, null.ok = TRUE) + assert_character(.new_col_names, len = length(input_col_names), null.ok = TRUE) + if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) { + .suffix <- "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}" + # ^ does not account for any arguments specified to underlying functions via + # `...` such as `na.rm =`, nor does it distinguish between functions from + # different packages accomplishing the same type of computation. Those are + # probably only set one way per task, so this probably produces cleaner + # names without clashes (though maybe some confusion if switching between + # code with different settings). + } + if (!is.null(.prefix) || !is.null(.suffix)) { + .prefix <- .prefix %||% "" + .suffix <- .suffix %||% "" + if (identical(.window_size, Inf)) { + n <- "running_" + time_unit_abbr <- "" + align_abbr <- "" + } else { + n <- time_delta_to_n_steps(.window_size, time_type) + time_unit_abbr <- time_type_unit_abbr(time_type) + align_abbr <- c(right = "", center = "c", left = "l")[[.align]] + } + glue_env <- rlang::env( + .n = n, + .time_unit_abbr = time_unit_abbr, + .align_abbr = align_abbr, + .f_abbr = purrr::map_chr(.x[, c(input_col_names)], .f_namer), # compat between DT and tbl selection + quo_get_env(col_names_quo) + ) + .new_col_names <- unclass( + glue(.prefix, .envir = glue_env) + + input_col_names + + glue(.suffix, .envir = glue_env) + ) + } else { + # `.new_col_names` was provided by user; we don't need to do anything. + } + if (any(.new_col_names %in% names(.x))) { + cli_abort(c( + "Naming conflict between new columns and existing columns", + "x" = "Overlapping names: {format_varnames(intersect(.new_col_names, names(.x)))}" + ), class = "epiprocess__epi_slide_opt_old_new_name_conflict") + } + if (anyDuplicated(.new_col_names)) { + cli_abort(c( + "New column names contain duplicates", + "x" = "Duplicated names: {format_varnames(unique(.new_col_names[duplicated(.new_col_names)]))}" + ), class = "epiprocess__epi_slide_opt_new_name_duplicated") + } + output_col_names <- .new_col_names + + return(list( + input_col_names = input_col_names, + output_col_names = output_col_names + )) +} + #' Optimized slide functions for common cases #' #' @description `epi_slide_opt` allows sliding an n-timestep [data.table::froll] @@ -748,59 +906,12 @@ epi_slide_opt <- function( # Check for duplicated time values within groups assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) - # The position of a given column can be differ between input `.x` and - # `.data_group` since the grouping step by default drops grouping columns. - # To avoid rerunning `eval_select` for every `.data_group`, convert - # positions of user-provided `col_names` into string column names. We avoid - # using `names(pos)` directly for robustness and in case we later want to - # allow users to rename fields via tidyselection. + # Validate/process .col_names, .f: col_names_quo <- enquo(.col_names) - pos <- eval_select(col_names_quo, data = .x, allow_rename = FALSE) - col_names_chr <- names(.x)[pos] - - # Check that slide function `.f` is one of those short-listed from - # `data.table` and `slider` (or a function that has the exact same definition, - # e.g. if the function has been reexported or defined locally). Extract some - # metadata. `namer` will be mapped over columns (.x will be a column, not the - # entire edf). - f_possibilities <- - tibble::tribble( - ~f, ~package, ~namer, - frollmean, "data.table", ~ if (is.logical(.x)) "prop" else "av", - frollsum, "data.table", ~ if (is.logical(.x)) "count" else "sum", - frollapply, "data.table", ~"slide", - slide_sum, "slider", ~ if (is.logical(.x)) "count" else "sum", - slide_prod, "slider", ~"prod", - slide_mean, "slider", ~ if (is.logical(.x)) "prop" else "av", - slide_min, "slider", ~"min", - slide_max, "slider", ~"max", - slide_all, "slider", ~"all", - slide_any, "slider", ~"any", - ) - f_info <- f_possibilities %>% - filter(map_lgl(.data$f, ~ identical(.f, .x))) - if (nrow(f_info) == 0L) { - # `f` is from somewhere else and not supported - cli_abort( - c( - "problem with {rlang::expr_label(rlang::caller_arg(f))}", - "i" = "`f` must be one of `data.table`'s rolling functions (`frollmean`, - `frollsum`, `frollapply`. See `?data.table::roll`) or one of - `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, - etc. See `?slider::\`summary-slide\`` for more options)." - ), - class = "epiprocess__epi_slide_opt__unsupported_slide_function", - epiprocess__f = .f - ) - } - if (nrow(f_info) > 1L) { - cli_abort('epiprocess internal error: looking up `.f` in table of possible - functions yielded multiple matches. Please report it using "New - issue" at https://github.com/cmu-delphi/epiprocess/issues, using - reprex::reprex to provide a minimal reproducible example.') - } - f_from_package <- f_info$package + f_info <- upstream_slide_f_info(.f) + f_from_package <- f_info$from_package + # Validate/process .ref_time_values: user_provided_rtvs <- !is.null(.ref_time_values) if (!user_provided_rtvs) { .ref_time_values <- unique(.x$time_value) @@ -830,65 +941,10 @@ epi_slide_opt <- function( validate_slide_window_arg(.window_size, time_type) window_args <- get_before_after_from_window(.window_size, .align, time_type) - # Handle output naming - if ((!is.null(.prefix) || !is.null(.suffix)) && !is.null(.new_col_names)) { - cli_abort( - "Can't use both .prefix/.suffix and .new_col_names at the same time.", - class = "epiprocess__epi_slide_opt_incompatible_naming_args" - ) - } - assert_string(.prefix, null.ok = TRUE) - assert_string(.suffix, null.ok = TRUE) - assert_character(.new_col_names, len = length(col_names_chr), null.ok = TRUE) - if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) { - .suffix <- "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}" - # ^ does not account for any arguments specified to underlying functions via - # `...` such as `na.rm =`, nor does it distinguish between functions from - # different packages accomplishing the same type of computation. Those are - # probably only set one way per task, so this probably produces cleaner - # names without clashes (though maybe some confusion if switching between - # code with different settings). - } - if (!is.null(.prefix) || !is.null(.suffix)) { - .prefix <- .prefix %||% "" - .suffix <- .suffix %||% "" - if (identical(.window_size, Inf)) { - n <- "running_" - time_unit_abbr <- "" - align_abbr <- "" - } else { - n <- time_delta_to_n_steps(.window_size, time_type) - time_unit_abbr <- time_type_unit_abbr(time_type) - align_abbr <- c(right = "", center = "c", left = "l")[[.align]] - } - glue_env <- rlang::env( - .n = n, - .time_unit_abbr = time_unit_abbr, - .align_abbr = align_abbr, - .f_abbr = purrr::map_chr(.x[col_names_chr], unwrap(f_info$namer)), - quo_get_env(col_names_quo) - ) - .new_col_names <- unclass( - glue(.prefix, .envir = glue_env) + - col_names_chr + - glue(.suffix, .envir = glue_env) - ) - } else { - # `.new_col_names` was provided by user; we don't need to do anything. - } - if (any(.new_col_names %in% names(.x))) { - cli_abort(c( - "Naming conflict between new columns and existing columns", - "x" = "Overlapping names: {format_varnames(intersect(.new_col_names, names(.x)))}" - ), class = "epiprocess__epi_slide_opt_old_new_name_conflict") - } - if (anyDuplicated(.new_col_names)) { - cli_abort(c( - "New column names contain duplicates", - "x" = "Duplicated names: {format_varnames(unique(.new_col_names[duplicated(.new_col_names)]))}" - ), class = "epiprocess__epi_slide_opt_new_name_duplicated") - } - result_col_names <- .new_col_names + # Handle output naming: + names_info <- across_ish_names_info(.x, time_type, col_names_quo, f_info$namer, .window_size, .align, .prefix, .suffix, .new_col_names) + input_col_names <- names_info$input_col_names + output_col_names <- names_info$output_col_names # Make a complete date sequence between min(.x$time_value) and max(.x$time_value). date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) @@ -933,23 +989,23 @@ epi_slide_opt <- function( # be; shift results to the left by `after` timesteps. if (window_args$before != Inf) { window_size <- window_args$before + window_args$after + 1L - roll_output <- .f(x = .data_group[, col_names_chr], n = window_size, ...) + roll_output <- .f(x = .data_group[, input_col_names], n = window_size, ...) } else { window_size <- list(seq_along(.data_group$time_value)) - roll_output <- .f(x = .data_group[, col_names_chr], n = window_size, adaptive = TRUE, ...) + roll_output <- .f(x = .data_group[, input_col_names], n = window_size, adaptive = TRUE, ...) } if (window_args$after >= 1) { - .data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { + .data_group[, output_col_names] <- purrr::map(roll_output, function(.x) { c(.x[(window_args$after + 1L):length(.x)], rep(NA, window_args$after)) }) } else { - .data_group[, result_col_names] <- roll_output + .data_group[, output_col_names] <- roll_output } } if (f_from_package == "slider") { - for (i in seq_along(col_names_chr)) { - .data_group[, result_col_names[i]] <- .f( - x = .data_group[[col_names_chr[i]]], + for (i in seq_along(input_col_names)) { + .data_group[, output_col_names[i]] <- .f( + x = .data_group[[input_col_names[i]]], before = as.numeric(window_args$before), after = as.numeric(window_args$after), ... @@ -968,7 +1024,7 @@ epi_slide_opt <- function( group_by(!!!.x_orig_groups) if (.all_rows) { - result[!vec_in(result$time_value, ref_time_values), result_col_names] <- NA + result[!vec_in(result$time_value, ref_time_values), output_col_names] <- NA } else if (user_provided_rtvs) { result <- result[vec_in(result$time_value, ref_time_values), ] } diff --git a/man/across_ish_names_info.Rd b/man/across_ish_names_info.Rd new file mode 100644 index 000000000..70eb40c4f --- /dev/null +++ b/man/across_ish_names_info.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/slide.R +\name{across_ish_names_info} +\alias{across_ish_names_info} +\title{Calculate input and output column names for an \code{{epiprocess}} \code{\link[dplyr:across]{dplyr::across}}-like operations} +\usage{ +across_ish_names_info( + .x, + time_type, + col_names_quo, + .f_namer, + .window_size, + .align, + .prefix, + .suffix, + .new_col_names +) +} +\arguments{ +\item{.x}{data.frame to perform input column tidyselection on} + +\item{time_type}{as in \code{\link{new_epi_df}}} + +\item{col_names_quo}{enquosed input column tidyselect expression} + +\item{.f_namer}{function taking an input column object and outputting a name +for a corresponding output column; see \code{\link{upstream_slide_f_info}}} + +\item{.window_size}{as in \code{\link{epi_slide_opt}}} + +\item{.align}{as in \code{\link{epi_slide_opt}}} + +\item{.prefix}{as in \code{\link{epi_slide_opt}}} + +\item{.suffix}{as in \code{\link{epi_slide_opt}}} + +\item{.new_col_names}{as in \code{\link{epi_slide_opt}}} +} +\value{ +named list with two elements: \code{input_col_names}, chr, subset of +\code{names(.x)}; and \code{output_colnames}, chr, same length as \code{input_col_names} +} +\description{ +Calculate input and output column names for an \code{{epiprocess}} \code{\link[dplyr:across]{dplyr::across}}-like operations +} +\keyword{internal} diff --git a/man/upstream_slide_f_info.Rd b/man/upstream_slide_f_info.Rd new file mode 100644 index 000000000..b0e928c4d --- /dev/null +++ b/man/upstream_slide_f_info.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/slide.R +\name{upstream_slide_f_info} +\alias{upstream_slide_f_info} +\title{Validate & get information about an upstream slide function} +\usage{ +upstream_slide_f_info(.f) +} +\arguments{ +\item{.f}{function such as \code{data.table::frollmean} or \code{slider::slide_mean}; +must appear in \code{\link{upstream_slide_f_possibilities}}} +} +\value{ +named list with two elements: \code{from_package}, a string containing the +upstream package name ("data.table" or "slider"), and \code{namer}, a function +that takes a column to call \code{.f} on and outputs a basic name or +abbreviation for what operation \code{.f} represents on that kind of column +(e.g., "sum", "av", "count"). +} +\description{ +Validate & get information about an upstream slide function +} +\keyword{internal} diff --git a/man/upstream_slide_f_possibilities.Rd b/man/upstream_slide_f_possibilities.Rd new file mode 100644 index 000000000..9ba949ef1 --- /dev/null +++ b/man/upstream_slide_f_possibilities.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/slide.R +\docType{data} +\name{upstream_slide_f_possibilities} +\alias{upstream_slide_f_possibilities} +\title{Information about upstream (\code{{data.table}}/\code{{slider}}) slide functions} +\format{ +An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 10 rows and 3 columns. +} +\usage{ +upstream_slide_f_possibilities +} +\description{ +Underlies \code{\link{upstream_slide_f_info}}. +} +\keyword{internal} From 8471a284764f76574e8b61d086ad53b8539181f6 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Feb 2025 13:03:31 -0800 Subject: [PATCH 002/107] feat: WIP epix_epi_slide_opt --- DESCRIPTION | 2 + NAMESPACE | 11 ++ R/epiprocess-package.R | 8 ++ R/epix_epi_slide_opt.R | 143 +++++++++++++++++++++++++ R/patch.R | 236 +++++++++++++++++++++++++++++++++++++++++ man/tbl_diff2.Rd | 44 ++++++++ man/tbl_patch.Rd | 26 +++++ 7 files changed, 470 insertions(+) create mode 100644 R/epix_epi_slide_opt.R create mode 100644 R/patch.R create mode 100644 man/tbl_diff2.Rd create mode 100644 man/tbl_patch.Rd diff --git a/DESCRIPTION b/DESCRIPTION index abb0cf86c..013d32600 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -99,6 +99,7 @@ Collate: 'epi_df.R' 'epi_df_forbidden_methods.R' 'epiprocess-package.R' + 'epix_epi_slide_opt.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' 'grouped_epi_archive.R' @@ -106,6 +107,7 @@ Collate: 'key_colnames.R' 'methods-epi_df.R' 'outliers.R' + 'patch.R' 'reexports.R' 'revision_analysis.R' 'slide.R' diff --git a/NAMESPACE b/NAMESPACE index 6f3ef6a11..0fce99981 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,8 @@ S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) +S3method(epix_epi_slide_opt,epi_archive) +S3method(epix_epi_slide_opt,grouped_epi_archive) S3method(epix_slide,epi_archive) S3method(epix_slide,grouped_epi_archive) S3method(epix_truncate_versions_after,epi_archive) @@ -73,6 +75,7 @@ export(epi_slide_mean) export(epi_slide_opt) export(epi_slide_sum) export(epix_as_of) +export(epix_epi_slide_opt) export(epix_fill_through_version) export(epix_merge) export(epix_slide) @@ -144,6 +147,7 @@ importFrom(data.table,address) importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) +importFrom(data.table,fifelse) importFrom(data.table,frollapply) importFrom(data.table,frollmean) importFrom(data.table,frollsum) @@ -201,6 +205,7 @@ importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,.env) importFrom(rlang,arg_match) +importFrom(rlang,arg_match0) importFrom(rlang,caller_arg) importFrom(rlang,caller_env) importFrom(rlang,check_dots_empty) @@ -236,10 +241,12 @@ importFrom(slider,slide_sum) importFrom(stats,cor) importFrom(stats,median) importFrom(tibble,as_tibble) +importFrom(tibble,is_tibble) importFrom(tibble,new_tibble) importFrom(tibble,validate_tibble) importFrom(tidyr,complete) importFrom(tidyr,full_seq) +importFrom(tidyr,nest) importFrom(tidyr,unnest) importFrom(tidyselect,any_of) importFrom(tidyselect,eval_select) @@ -252,12 +259,16 @@ importFrom(vctrs,"vec_slice<-") importFrom(vctrs,vec_cast) importFrom(vctrs,vec_data) importFrom(vctrs,vec_duplicate_any) +importFrom(vctrs,vec_duplicate_id) importFrom(vctrs,vec_equal) importFrom(vctrs,vec_in) +importFrom(vctrs,vec_match) importFrom(vctrs,vec_order) importFrom(vctrs,vec_rbind) importFrom(vctrs,vec_recycle_common) importFrom(vctrs,vec_rep) +importFrom(vctrs,vec_rep_each) +importFrom(vctrs,vec_seq_along) importFrom(vctrs,vec_size) importFrom(vctrs,vec_slice) importFrom(vctrs,vec_sort) diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 61eccf993..9f01904b9 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -17,6 +17,7 @@ #' @importFrom cli pluralize #' @importFrom cli qty #' @importFrom data.table as.data.table +#' @importFrom data.table fifelse #' @importFrom data.table key #' @importFrom data.table setkeyv #' @importFrom dplyr arrange @@ -26,16 +27,23 @@ #' @importFrom lifecycle deprecated #' @importFrom purrr list_rbind #' @importFrom rlang %||% +#' @importFrom rlang arg_match0 #' @importFrom rlang is_bare_integerish +#' @importFrom tibble is_tibble +#' @importFrom tidyr nest #' @importFrom tools toTitleCase #' @importFrom vctrs vec_cast #' @importFrom vctrs vec_data +#' @importFrom vctrs vec_duplicate_id #' @importFrom vctrs vec_equal #' @importFrom vctrs vec_in +#' @importFrom vctrs vec_match #' @importFrom vctrs vec_order #' @importFrom vctrs vec_rbind #' @importFrom vctrs vec_recycle_common #' @importFrom vctrs vec_rep +#' @importFrom vctrs vec_rep_each +#' @importFrom vctrs vec_seq_along #' @importFrom vctrs vec_slice #' @importFrom vctrs vec_slice<- #' @importFrom vctrs vec_sort diff --git a/R/epix_epi_slide_opt.R b/R/epix_epi_slide_opt.R new file mode 100644 index 000000000..24ffba919 --- /dev/null +++ b/R/epix_epi_slide_opt.R @@ -0,0 +1,143 @@ +epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f, f_from_package, before, after, time_type, out_colnames) { + if (before == Inf) { + cli_abort("epiprocess internal error: epix_epi_slide_opt_one_epikey() called with before == Inf") + } + unit_step <- epiprocess:::unit_time_delta(time_type) + prev_inp_snapshot <- NULL + prev_out_snapshot <- NULL + result <- map(seq_len(nrow(updates)), function(update_i) { + version <- updates$version[[update_i]] + inp_update <- updates$subtbl[[update_i]] # TODO decide whether DT + ## setDF(inp_update) + ## inp_update <- new_tibble(inp_update, nrow = nrow(inp_update)) + inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value") + inp_update_min_t <- min(inp_update$time_value) # TODO check efficiency + inp_update_max_t <- max(inp_update$time_value) + # If the input had updates in the range t1..t2, this could produce changes + # in slide outputs in the range t1-after..t2+before, and to compute those + # slide values, we need to look at the input snapshot from + # t1-after-before..t2+before+after. + slide_min_t <- inp_update_min_t - (before + after) * unit_step + slide_max_t <- inp_update_max_t + (before + after) * unit_step + slide_n <- time_delta_to_n_steps(slide_max_t - slide_min_t, time_type) + 1L + slide_time_values <- slide_min_t + 0L:(slide_n - 1L) * unit_step + slide_inp_backrefs <- vec_match(slide_time_values, inp_snapshot$time_value) + slide <- inp_snapshot[slide_inp_backrefs, ] # TODO vs. DT key index vs .... + slide$time_value <- slide_time_values + # TODO ensure before & after as integers? + # TODO parameterize naming, slide function, options, ... + if (f_from_package == "data.table") { + for (col_i in seq_along(in_colnames)) { + # FIXME other arg forwarding + slide[[out_colnames[[col_i]]]] <- f(slide[[in_colnames[[col_i]]]], before + after + 1L) + } + } else if (f_from_package == "slider") { + for (col_i in seq_along(in_colnames)) { + # with adaptive tails that incorporate fewer inputs: + # FIXME other arg forwarding + out_col <- f(slide[[in_colnames[[col_i]]]], before = before, after = after) + # XXX is this actually required or being done at the right time? we are + # already chopping off a good amount that might include this? + # + # FIXME can this generate an error on very short series? + vec_slice(out_col, seq_len(before)) <- NA + vec_slice(out_col, slide_n - after + seq_len(after)) <- NA + slide[[out_colnames[[col_i]]]] <- out_col + } + } else { + cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}") + } + out_update <- slide[ + # Get back to t1-after..t2+before; times outside this range were included + # only so those inside would have enough context for their slide + # computations, but these "context" rows may contain invalid slide + # computation outputs: + vec_rep_each(c(FALSE, TRUE, FALSE), c(before, slide_n - before - after, after)) & + # Only include time_values that appeared in the input snapshot: + !is.na(slide_inp_backrefs), + ] + out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") + out_snapshot <- tbl_patch(prev_out_snapshot, out_diff) + prev_inp_snapshot <<- inp_snapshot + prev_out_snapshot <<- out_snapshot # TODO avoid need to patch twice? + out_diff$version <- version + out_diff + }) + result +} + +#' @export +epix_epi_slide_opt <- + function(.x, .col_names, .f, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL # , + ## .ref_time_values = NULL, .all_rows = FALSE + ) { + UseMethod("epix_epi_slide_opt") + } + +#' @method epix_epi_slide_opt grouped_epi_archive +#' @export +epix_epi_slide_opt.grouped_epi_archive <- function(.x, ...) { + assert_set_equal( + group_vars(.x), + key_colnames(.x, exclude = c("time_value", "version")) + ) + orig_group_vars <- group_vars(.x) + orig_drop <- .x$private$drop + .x %>% + ungroup() %>% + epix_epi_slide_opt(...) %>% + group_by(pick(all_of(orig_group_vars)), .drop = orig_drop) +} + +#' @method epix_epi_slide_opt epi_archive +#' @export +epix_epi_slide_opt.epi_archive <- + function(.x, .col_names, .f, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, + ## , .ref_time_values = NULL, .all_rows = FALSE + .progress = FALSE) { + # Extract metadata: + time_type <- .x$time_type + epikey_names <- key_colnames(.x, exclude = c("time_value", "version")) + # Validation & pre-processing: + .align <- arg_match(.align) + f_info <- upstream_slide_f_info(.f) + col_names_quo <- enquo(.col_names) + names_info <- across_ish_names_info(.x$DT, time_type, col_names_quo, f_info$namer, .window_size, .align, .prefix, .suffix, .new_col_names) + window_args <- get_before_after_from_window(.window_size, .align, time_type) + assert( + checkmate::check_logical(.progress, any.missing = FALSE, len = 1L, names = "unnamed"), + checkmate::check_string(.progress) + ) + if (isTRUE(.progress)) { + .progress <- "Time series processed:" + } + use_progress <- !isFALSE(.progress) + # Perform the slide: + updates_grouped <- .x$DT %>% + as.data.frame() %>% + as_tibble(.name_repair = "minimal") %>% + # 0 rows input -> 0 rows output, so we can just say drop = TRUE: + grouped_df(epikey_names, TRUE) + if (use_progress) progress_bar_id <- cli::cli_progress_bar(.progress, total = n_groups(updates_grouped)) + result <- updates_grouped %>% + group_modify(function(group_values, group_key) { + group_updates <- group_values %>% + nest(.by = version, .key = "subtbl") %>% + arrange(version) + res <- epix_epi_slide_opt_one_epikey(group_updates, names_info$input_col_names, .f, f_info$from_package, window_args$before, window_args$after, time_type, names_info$output_col_names) %>% + list_rbind() + if (use_progress) cli::cli_progress_update(id = progress_bar_id) + res + }) %>% + ungroup() %>% + new_epi_archive( + .x$geo_type, .x$time_type, .x$other_keys, + .x$clobberable_versions_start, .x$versions_end + ) + if (use_progress) cli::cli_progress_done(id = progress_bar_id) + result + } diff --git a/R/patch.R b/R/patch.R new file mode 100644 index 000000000..b582488a6 --- /dev/null +++ b/R/patch.R @@ -0,0 +1,236 @@ +# TODO use these in apply_compactify +approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, recurse = approx_equal, inds1 = NULL, inds2 = NULL) { + vecs <- list(vec1, vec2) + if (!is.null(inds1)) { + # XXX could have logical or integerish inds; just leave it to later checks + # to hopefully abort for now + } else { + vecs <- vec_recycle_common(!!!vecs) + } + vecs <- vec_cast_common(!!!vecs, .to = .ptype) + approx_equal0(vecs[[1]], vecs[[2]], abs_tol, na_equal, rec = approx_equal, inds1, inds2) +} + +approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, recurse = approx_equal0, inds1 = NULL, inds2 = NULL) { + if (is_bare_numeric(vec1) && abs_tol != 0) { + if (!is.null(inds1)) { + vec1 <- vec1[inds1] + vec2 <- vec2[inds2] + } + # perf: since we're working with bare numerics and logicals: we can use + # fifelse, and stop it from propagating attributes when there's no special + # class to guide the meaning + res <- fifelse( + !is.na(vec1) & !is.na(vec2), + abs(vec1 - vec2) <= abs_tol, + if (na_equal) is.na(vec1) & is.na(vec2) else FALSE + ) + attributes(res) <- NULL + return(res) + } else if (is.data.frame(vec1) && abs_tol != 0) { + # (we only need to manually recurse if we potentially have columns that would + # be specially processed by the above) + if (ncol(vec1) == 0) { + rep(TRUE, nrow(vec1)) + } else { + Reduce(`&`, lapply(seq_len(ncol(vec1)), function(col_i) { + recurse(vec1[[col_i]], vec2[[col_i]], abs_tol, na_equal, recurse, inds1, inds2) + })) + } + } else { + # No special handling for any other types/situations. Makes sense for + # unclassed atomic things; bare lists and certain vctrs classes might want + # recursion / specialization, though. + if (!is.null(inds1)) { + vec1 <- vec_slice(vec1, inds1) + vec2 <- vec_slice(vec2, inds2) + } + res <- vec_equal(vec1, vec2, na_equal = na_equal) + return(res) + } +} + +#' Calculate compact patch to move from one snapshot/update to another +#' +#' @param earlier_snapshot tibble or `NULL`; `NULL` represents that there was no +#' data before `later_tbl`. +#' @param later_tbl tibble; must have the same column names as +#' `earlier_snapshot` if it is a tibble. +#' @param ukey_names character; column names that together, form a unique key +#' for `earlier_snapshot` and for `later_tbl`. This is unchecked; see +#' [`check_ukey_unique`] if you don't already have this guaranteed. +#' @param later_format "snapshot" or "update"; default is "snapshot". If +#' "snapshot", `later_tbl` will be interpreted as a full snapshot of the data +#' set including all ukeys, and any ukeys that are in `earlier_snapshot` but +#' not in `later_tbl` are interpreted as deletions, which are currently +#' (imprecisely) represented in the output patch as revisions of all +#' non-`ukey_names` columns to NA values (using `{vctrs}`). If "update", then +#' it's assumed that any deletions have already been represented this way in +#' `later_tbl` and any ukeys not in `later_tbl` are simply unchanged; we are +#' just ensuring that the update is fully compact for the given +#' `compactify_abs_tol`. +#' @param compactify_abs_tol compactification tolerance; see `apply_compactify` +#' @return a tibble in compact "update" (diff) format +tbl_diff2 <- function(earlier_snapshot, later_tbl, + ukey_names, + later_format = c("snapshot", "update"), + compactify_abs_tol = 0) { + # Most input validation + handle NULL earlier_snapshot. This is a small function so + # use faster validation variants: + if (!is_tibble(later_tbl)) { + cli_abort("`later_tbl` must be a tibble") + } + if (is.null(earlier_snapshot)) { + return(later_tbl) + } + if (!is_tibble(earlier_snapshot)) { + cli_abort("`earlier_snapshot` must be a tibble or `NULL`") + } + later_format <- arg_match0(later_format, c("snapshot", "update")) + if (!(is.vector(compactify_abs_tol, mode = "numeric") && length(compactify_abs_tol) == 1L && compactify_abs_tol >= 0)) { + # Give a specific message: + assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L) + # Fallback e.g. for invalid classes not caught by assert_numeric: + cli_abort("`compactify_abs_tol` must be a length-1 double/integer >= 0") + } + + # Extract metadata: + earlier_n <- nrow(earlier_snapshot) + later_n <- nrow(later_tbl) + tbl_names <- names(earlier_snapshot) + val_names <- tbl_names[!tbl_names %in% ukey_names] + + # More input validation: + if (!identical(tbl_names, names(later_tbl))) { + # XXX is this check actually necessary? + cli_abort(c("`earlier_snapshot` and `later_tbl` should have identical column + names and ordering.", + "*" = "`earlier_snapshot` colnames: {format_chr_deparse(tbl_names)}", + "*" = "`later_tbl` colnames: {format_chr_deparse(names(later_tbl))}" + )) + } + + combined_tbl <- vec_rbind(earlier_snapshot, later_tbl) + combined_n <- nrow(combined_tbl) + + # We'll also need epikeytimes and value columns separately: + combined_ukeys <- combined_tbl[ukey_names] + combined_vals <- combined_tbl[val_names] + + # We have five types of rows in combined_tbl: + # 1. From earlier_snapshot, no matching ukey in later_tbl (deletion; turn vals to + # NAs to match epi_archive format) + # 2. From earlier_snapshot, with matching ukey in later_tbl (context; exclude from + # result) + # 3. From later_tbl, with matching ukey in earlier_snapshot, with value "close" (change + # that we'll compactify away) + # 4. From later_tbl, with matching ukey in earlier_snapshot, value not "close" (change + # that we'll record) + # 5. From later_tbl, with no matching ukey in later_tbl (addition) + + # For "snapshot" later_format, we need to filter to 1., 4., and 5., and alter + # values for 1. For "update" later_format, we need to filter to 4. and 5. + + # (For compactify_abs_tol = 0, we could potentially streamline things by dropping + # ukey+val duplicates (cases 2. and 3.).) + + # Row indices of first occurrence of each ukey; will be the same as + # seq_len(combined_n) except for when that ukey has been re-reported in + # `later_tbl`, in which case (3. or 4.) it will point back to the row index of + # the same ukey in `earlier_snapshot`: + combined_ukey_firsts <- vec_duplicate_id(combined_ukeys) + + # Which rows from combined are cases 3. or 4.? + combined_ukey_is_repeat <- combined_ukey_firsts != seq_len(combined_n) + # For each row in 3. or 4., row numbers of the ukey appearance in earlier: + ukey_repeat_first_i <- combined_ukey_firsts[combined_ukey_is_repeat] + + # Which rows from combined are in case 3.? + combined_compactify_away <- rep(FALSE, combined_n) + combined_compactify_away[combined_ukey_is_repeat] <- + approx_equal0(combined_vals, + combined_vals, + # TODO move inds closer to vals to not be as confusing? + abs_tol = compactify_abs_tol, + na_equal = TRUE, + inds1 = combined_ukey_is_repeat, + inds2 = ukey_repeat_first_i + ) + + # Which rows from combined are in cases 3., 4., or 5.? + combined_from_later <- vec_rep_each(c(FALSE, TRUE), c(earlier_n, later_n)) + + if (later_format == "update") { + # Cases 4. and 5.: + combined_tbl <- combined_tbl[combined_from_later & !combined_compactify_away, ] + } else { # later_format == "snapshot" + # Which rows from combined are in case 1.? + combined_is_deletion <- vec_rep_each(c(TRUE, FALSE), c(earlier_n, later_n)) + combined_is_deletion[ukey_repeat_first_i] <- FALSE + # Which rows from combined are in cases 1., 4., or 5.? + combined_include <- combined_is_deletion | combined_from_later & !combined_compactify_away + combined_tbl <- combined_tbl[combined_include, ] + # Represent deletion in 1. with NA-ing of all value columns. (In some + # previous approaches to epi_diff2, this seemed to be faster than using + # vec_rbind(case_1_ukeys, cases_45_tbl) or bind_rows to fill with NAs, and more + # general than data.table's rbind(case_1_ukeys, cases_45_tbl, fill = TRUE).) + combined_tbl[combined_is_deletion[combined_include], val_names] <- NA + } + + combined_tbl +} + +epi_diff2 <- function(earlier_snapshot, later_edf, + later_format = c("snapshot", "update"), + compactify_abs_tol = 0) { + ukey_names <- key_colnames(later_edf) + dplyr_reconstruct(tbl_diff2(as_tibble(earlier_snapshot), as_tibble(later_edf), ukey_names, later_format, compactify_abs_tol), later_edf) +} + +# XXX vs. tbl_patch_apply? + + + +#' Apply an update (e.g., from `tbl_diff2`) to a snapshot +#' +#' @param snapshot tibble or `NULL`; entire data set as of some version, or +#' `NULL` to treat `update` as the initial version of the data set. +#' @param update tibble; ukeys + initial values for added rows, ukeys + new +#' values for changed rows. Deletions must be imprecisely represented as +#' changing all values to NAs. +#' @param ukey_names character; names of columns that should form a unique key +#' for `snapshot` and for `update`. Uniqueness is unchecked; if you don't have +#' this guaranteed, see [`check_ukey_unique()`]. +#' @return tibble; snapshot of the data set with the update applied. +tbl_patch <- function(snapshot, update, ukey_names) { + # Most input validation. This is a small function so use faster validation + # variants: + if (!is_tibble(update)) { + # XXX debating about whether to have a specialized class for updates/diffs. + # Seems nice for type-based reasoning and might remove some args from + # interfaces, but would require constructor/converter functions for that + # type. + cli_abort("`update` must be a tibble") + } + if (is.null(snapshot)) { + return(update) + } + if (!is_tibble(snapshot)) { + cli_abort("`snapshot` must be a tibble") + } + + result_tbl <- vec_rbind(update, snapshot) + + dup_ids <- vec_duplicate_id(result_tbl[ukey_names]) + not_overwritten <- dup_ids == vec_seq_along(result_tbl) + result_tbl <- result_tbl[not_overwritten, ] + + ## result_tbl <- arrange_canonical(result_tbl) + + result_tbl +} + +epi_patch <- function(snapshot, update) { + ukey_names <- key_colnames(update) + dplyr_reconstruct(tbl_patch(as_tibble(snapshot), as_tibble(update), ukey_names), update) +} diff --git a/man/tbl_diff2.Rd b/man/tbl_diff2.Rd new file mode 100644 index 000000000..d9b652258 --- /dev/null +++ b/man/tbl_diff2.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{tbl_diff2} +\alias{tbl_diff2} +\title{Calculate compact patch to move from one snapshot/update to another} +\usage{ +tbl_diff2( + earlier_snapshot, + later_tbl, + ukey_names, + later_format = c("snapshot", "update"), + compactify_abs_tol = 0 +) +} +\arguments{ +\item{earlier_snapshot}{tibble or \code{NULL}; \code{NULL} represents that there was no +data before \code{later_tbl}.} + +\item{later_tbl}{tibble; must have the same column names as +\code{earlier_snapshot} if it is a tibble.} + +\item{ukey_names}{character; column names that together, form a unique key +for \code{earlier_snapshot} and for \code{later_tbl}. This is unchecked; see +\code{\link{check_ukey_unique}} if you don't already have this guaranteed.} + +\item{later_format}{"snapshot" or "update"; default is "snapshot". If +"snapshot", \code{later_tbl} will be interpreted as a full snapshot of the data +set including all ukeys, and any ukeys that are in \code{earlier_snapshot} but +not in \code{later_tbl} are interpreted as deletions, which are currently +(imprecisely) represented in the output patch as revisions of all +non-\code{ukey_names} columns to NA values (using \code{{vctrs}}). If "update", then +it's assumed that any deletions have already been represented this way in +\code{later_tbl} and any ukeys not in \code{later_tbl} are simply unchanged; we are +just ensuring that the update is fully compact for the given +\code{compactify_abs_tol}.} + +\item{compactify_abs_tol}{compactification tolerance; see \code{apply_compactify}} +} +\value{ +a tibble in compact "update" (diff) format +} +\description{ +Calculate compact patch to move from one snapshot/update to another +} diff --git a/man/tbl_patch.Rd b/man/tbl_patch.Rd new file mode 100644 index 000000000..2b4e8288b --- /dev/null +++ b/man/tbl_patch.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{tbl_patch} +\alias{tbl_patch} +\title{Apply an update (e.g., from \code{tbl_diff2}) to a snapshot} +\usage{ +tbl_patch(snapshot, update, ukey_names) +} +\arguments{ +\item{snapshot}{tibble or \code{NULL}; entire data set as of some version, or +\code{NULL} to treat \code{update} as the initial version of the data set.} + +\item{update}{tibble; ukeys + initial values for added rows, ukeys + new +values for changed rows. Deletions must be imprecisely represented as +changing all values to NAs.} + +\item{ukey_names}{character; names of columns that should form a unique key +for \code{snapshot} and for \code{update}. Uniqueness is unchecked; if you don't have +this guaranteed, see \code{\link[=check_ukey_unique]{check_ukey_unique()}}.} +} +\value{ +tibble; snapshot of the data set with the update applied. +} +\description{ +Apply an update (e.g., from \code{tbl_diff2}) to a snapshot +} From 4cc0961b78e1b20718f07053d5c9d4091fca08e5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 28 Feb 2025 12:47:46 -0800 Subject: [PATCH 003/107] epix_epi_slide_opt: don't re-order input columns in output --- NAMESPACE | 1 + R/epiprocess-package.R | 1 + R/epix_epi_slide_opt.R | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 0fce99981..168b7507d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -156,6 +156,7 @@ importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(data.table,setDF) +importFrom(data.table,setcolorder) importFrom(data.table,setkeyv) importFrom(dplyr,"%>%") importFrom(dplyr,across) diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 9f01904b9..8a380c092 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -19,6 +19,7 @@ #' @importFrom data.table as.data.table #' @importFrom data.table fifelse #' @importFrom data.table key +#' @importFrom data.table setcolorder #' @importFrom data.table setkeyv #' @importFrom dplyr arrange #' @importFrom dplyr grouped_df diff --git a/R/epix_epi_slide_opt.R b/R/epix_epi_slide_opt.R index 24ffba919..bb201cf18 100644 --- a/R/epix_epi_slide_opt.R +++ b/R/epix_epi_slide_opt.R @@ -56,7 +56,7 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f, f_from_packag # Only include time_values that appeared in the input snapshot: !is.na(slide_inp_backrefs), ] - out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") + out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") # TODO avoid redundant diff2 work? though depends on compactify parms... out_snapshot <- tbl_patch(prev_out_snapshot, out_diff) prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- out_snapshot # TODO avoid need to patch twice? @@ -139,5 +139,7 @@ epix_epi_slide_opt.epi_archive <- .x$clobberable_versions_start, .x$versions_end ) if (use_progress) cli::cli_progress_done(id = progress_bar_id) + # Keep ordering of old columns, place new columns at end: + setcolorder(result$DT, names(.x$DT)) result } From 3be711e9cc8b3c46484645dc697c6c13e94c9a66 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 3 Mar 2025 10:09:01 -0800 Subject: [PATCH 004/107] feat: support before = Inf in epix_epi_slide_opt --- R/epix_epi_slide_opt.R | 69 +++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/R/epix_epi_slide_opt.R b/R/epix_epi_slide_opt.R index bb201cf18..d42f1c3c8 100644 --- a/R/epix_epi_slide_opt.R +++ b/R/epix_epi_slide_opt.R @@ -1,7 +1,5 @@ +# TODO just make this an epi_slide_opt impl? epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f, f_from_package, before, after, time_type, out_colnames) { - if (before == Inf) { - cli_abort("epiprocess internal error: epix_epi_slide_opt_one_epikey() called with before == Inf") - } unit_step <- epiprocess:::unit_time_delta(time_type) prev_inp_snapshot <- NULL prev_out_snapshot <- NULL @@ -11,25 +9,41 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f, f_from_packag ## setDF(inp_update) ## inp_update <- new_tibble(inp_update, nrow = nrow(inp_update)) inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value") - inp_update_min_t <- min(inp_update$time_value) # TODO check efficiency - inp_update_max_t <- max(inp_update$time_value) - # If the input had updates in the range t1..t2, this could produce changes - # in slide outputs in the range t1-after..t2+before, and to compute those - # slide values, we need to look at the input snapshot from - # t1-after-before..t2+before+after. - slide_min_t <- inp_update_min_t - (before + after) * unit_step - slide_max_t <- inp_update_max_t + (before + after) * unit_step - slide_n <- time_delta_to_n_steps(slide_max_t - slide_min_t, time_type) + 1L - slide_time_values <- slide_min_t + 0L:(slide_n - 1L) * unit_step + if (before == Inf) { + if (after != 0) { + cli_abort('.window_size = Inf is only supported with .align = "right"') + } + # We need to use the entire input snapshot range, filling in time gaps. We + # shouldn't pad the ends. + slide_min_t <- min(inp_snapshot$time_value) # TODO check efficiency + slide_max_t <- max(inp_snapshot$time_value) + } else { + # If the input had updates in the range t1..t2, this could produce changes + # in slide outputs in the range t1-after..t2+before, and to compute those + # slide values, we need to look at the input snapshot from + # t1-after-before..t2+before+after. + inp_update_min_t <- min(inp_update$time_value) # TODO check efficiency + inp_update_max_t <- max(inp_update$time_value) + slide_min_t <- inp_update_min_t - (before + after) * unit_step + slide_max_t <- inp_update_max_t + (before + after) * unit_step + } + slide_nrow <- time_delta_to_n_steps(slide_max_t - slide_min_t, time_type) + 1L + slide_time_values <- slide_min_t + 0L:(slide_nrow - 1L) * unit_step slide_inp_backrefs <- vec_match(slide_time_values, inp_snapshot$time_value) slide <- inp_snapshot[slide_inp_backrefs, ] # TODO vs. DT key index vs .... slide$time_value <- slide_time_values # TODO ensure before & after as integers? # TODO parameterize naming, slide function, options, ... if (f_from_package == "data.table") { + if (before == Inf) { + n_arg <- seq_len(slide_nrow) + adaptive_arg <- TRUE + } else { + n_arg <- before + after + 1L + adaptive_arg <- FALSE + } for (col_i in seq_along(in_colnames)) { - # FIXME other arg forwarding - slide[[out_colnames[[col_i]]]] <- f(slide[[in_colnames[[col_i]]]], before + after + 1L) + slide[[out_colnames[[col_i]]]] <- f(slide[[in_colnames[[col_i]]]], n_arg, adaptive = adaptive_arg) } } else if (f_from_package == "slider") { for (col_i in seq_along(in_colnames)) { @@ -41,21 +55,26 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f, f_from_packag # # FIXME can this generate an error on very short series? vec_slice(out_col, seq_len(before)) <- NA - vec_slice(out_col, slide_n - after + seq_len(after)) <- NA + vec_slice(out_col, slide_nrow - after + seq_len(after)) <- NA slide[[out_colnames[[col_i]]]] <- out_col } } else { cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}") } - out_update <- slide[ - # Get back to t1-after..t2+before; times outside this range were included - # only so those inside would have enough context for their slide - # computations, but these "context" rows may contain invalid slide - # computation outputs: - vec_rep_each(c(FALSE, TRUE, FALSE), c(before, slide_n - before - after, after)) & - # Only include time_values that appeared in the input snapshot: - !is.na(slide_inp_backrefs), - ] + rows_should_keep <- + if (before == Inf) { + # Re-introduce time gaps: + !is.na(slide_inp_backrefs) + } else { + # Get back to t1-after..t2+before; times outside this range were included + # only so those inside would have enough context for their slide + # computations, but these "context" rows may contain invalid slide + # computation outputs: + vec_rep_each(c(FALSE, TRUE, FALSE), c(before, slide_nrow - before - after, after)) & + # Only include time_values that appeared in the input snapshot: + !is.na(slide_inp_backrefs) + } + out_update <- slide[rows_should_keep, ] out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") # TODO avoid redundant diff2 work? though depends on compactify parms... out_snapshot <- tbl_patch(prev_out_snapshot, out_diff) prev_inp_snapshot <<- inp_snapshot From 78c78852d1e21ffd79b4caffab204e0150b4d5c3 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 3 Mar 2025 10:36:36 -0800 Subject: [PATCH 005/107] feat(epi_slide_opt): improve feedback when .f is forgotten --- R/epiprocess-package.R | 1 + R/slide.R | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 8a380c092..93af0b737 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -6,6 +6,7 @@ #' @importFrom checkmate anyInfinite anyMissing assert assert_character #' @importFrom checkmate assert_class assert_data_frame assert_int assert_list #' @importFrom checkmate assert_false +#' @importFrom checkmate assert_function #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt #' @importFrom checkmate assert_string #' @importFrom checkmate assert_subset diff --git a/R/slide.R b/R/slide.R index b0de0f8d2..0da569deb 100644 --- a/R/slide.R +++ b/R/slide.R @@ -581,6 +581,8 @@ upstream_slide_f_possibilities <- tibble::tribble( #' #' @keywords internal upstream_slide_f_info <- function(.f) { + assert_function(.f) + # Check that slide function `.f` is one of those short-listed from # `data.table` and `slider` (or a function that has the exact same definition, # e.g. if the function has been reexported or defined locally). Extract some @@ -776,7 +778,6 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, .windo #' @importFrom purrr map map_lgl #' @importFrom data.table frollmean frollsum frollapply #' @importFrom lubridate as.period -#' @importFrom checkmate assert_function #' @importFrom slider slide_sum slide_prod slide_mean slide_min slide_max slide_all slide_any #' @export #' @seealso [`epi_slide`] for the more general slide function @@ -1164,7 +1165,6 @@ epi_slide_sum <- function( #' `before` and `after` args are assumed to have been validated by the calling #' function (using `validate_slide_window_arg`). #' -#' @importFrom checkmate assert_function #' @keywords internal full_date_seq <- function(x, before, after, time_type) { if (!time_type %in% c("day", "week", "yearmonth", "integer")) { From f5c235f61196e746af253ed4549092e599745a41 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 3 Mar 2025 10:47:03 -0800 Subject: [PATCH 006/107] fix(epix_epi_slide_opt): support ... forwarding --- R/epix_epi_slide_opt.R | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/R/epix_epi_slide_opt.R b/R/epix_epi_slide_opt.R index d42f1c3c8..502a757e4 100644 --- a/R/epix_epi_slide_opt.R +++ b/R/epix_epi_slide_opt.R @@ -1,5 +1,5 @@ # TODO just make this an epi_slide_opt impl? -epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f, f_from_package, before, after, time_type, out_colnames) { +epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_from_package, before, after, time_type, out_colnames) { unit_step <- epiprocess:::unit_time_delta(time_type) prev_inp_snapshot <- NULL prev_out_snapshot <- NULL @@ -43,13 +43,13 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f, f_from_packag adaptive_arg <- FALSE } for (col_i in seq_along(in_colnames)) { - slide[[out_colnames[[col_i]]]] <- f(slide[[in_colnames[[col_i]]]], n_arg, adaptive = adaptive_arg) + slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], n_arg, adaptive = adaptive_arg) } } else if (f_from_package == "slider") { for (col_i in seq_along(in_colnames)) { # with adaptive tails that incorporate fewer inputs: # FIXME other arg forwarding - out_col <- f(slide[[in_colnames[[col_i]]]], before = before, after = after) + out_col <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after) # XXX is this actually required or being done at the right time? we are # already chopping off a good amount that might include this? # @@ -109,7 +109,6 @@ epix_epi_slide_opt.grouped_epi_archive <- function(.x, ...) { epix_epi_slide_opt(...) %>% group_by(pick(all_of(orig_group_vars)), .drop = orig_drop) } - #' @method epix_epi_slide_opt epi_archive #' @export epix_epi_slide_opt.epi_archive <- @@ -123,9 +122,17 @@ epix_epi_slide_opt.epi_archive <- epikey_names <- key_colnames(.x, exclude = c("time_value", "version")) # Validation & pre-processing: .align <- arg_match(.align) - f_info <- upstream_slide_f_info(.f) + .f_info <- upstream_slide_f_info(.f) + .f_dots_baked <- + if (rlang::dots_n(...) == 0L) { + # Leaving `.f` unchanged slightly improves computation speed and trims + # debug stack traces: + .f + } else { + purrr::partial(.f, ...) + } col_names_quo <- enquo(.col_names) - names_info <- across_ish_names_info(.x$DT, time_type, col_names_quo, f_info$namer, .window_size, .align, .prefix, .suffix, .new_col_names) + names_info <- across_ish_names_info(.x$DT, time_type, col_names_quo, .f_info$namer, .window_size, .align, .prefix, .suffix, .new_col_names) window_args <- get_before_after_from_window(.window_size, .align, time_type) assert( checkmate::check_logical(.progress, any.missing = FALSE, len = 1L, names = "unnamed"), @@ -147,7 +154,7 @@ epix_epi_slide_opt.epi_archive <- group_updates <- group_values %>% nest(.by = version, .key = "subtbl") %>% arrange(version) - res <- epix_epi_slide_opt_one_epikey(group_updates, names_info$input_col_names, .f, f_info$from_package, window_args$before, window_args$after, time_type, names_info$output_col_names) %>% + res <- epix_epi_slide_opt_one_epikey(group_updates, names_info$input_col_names, .f_dots_baked, .f_info$from_package, window_args$before, window_args$after, time_type, names_info$output_col_names) %>% list_rbind() if (use_progress) cli::cli_progress_update(id = progress_bar_id) res From 15e8c7da721b418793eb3192c2ecec64099beb89 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 3 Mar 2025 12:27:14 -0800 Subject: [PATCH 007/107] Clean up some comments&code + note bug --- R/epix_epi_slide_opt.R | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/R/epix_epi_slide_opt.R b/R/epix_epi_slide_opt.R index 502a757e4..22b83eb1a 100644 --- a/R/epix_epi_slide_opt.R +++ b/R/epix_epi_slide_opt.R @@ -30,10 +30,11 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_ slide_nrow <- time_delta_to_n_steps(slide_max_t - slide_min_t, time_type) + 1L slide_time_values <- slide_min_t + 0L:(slide_nrow - 1L) * unit_step slide_inp_backrefs <- vec_match(slide_time_values, inp_snapshot$time_value) + # Get additional values needed from inp_snapshot + perform any NA + # tail-padding needed to make slider results a fixed window size rather than + # adaptive at tails + perform any NA gap-filling needed: slide <- inp_snapshot[slide_inp_backrefs, ] # TODO vs. DT key index vs .... slide$time_value <- slide_time_values - # TODO ensure before & after as integers? - # TODO parameterize naming, slide function, options, ... if (f_from_package == "data.table") { if (before == Inf) { n_arg <- seq_len(slide_nrow) @@ -43,20 +44,12 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_ adaptive_arg <- FALSE } for (col_i in seq_along(in_colnames)) { + # FIXME wrong with .align = "left" slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], n_arg, adaptive = adaptive_arg) } } else if (f_from_package == "slider") { for (col_i in seq_along(in_colnames)) { - # with adaptive tails that incorporate fewer inputs: - # FIXME other arg forwarding - out_col <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after) - # XXX is this actually required or being done at the right time? we are - # already chopping off a good amount that might include this? - # - # FIXME can this generate an error on very short series? - vec_slice(out_col, seq_len(before)) <- NA - vec_slice(out_col, slide_nrow - after + seq_len(after)) <- NA - slide[[out_colnames[[col_i]]]] <- out_col + slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after) } } else { cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}") From 2cc751da97c0608dcdfc5511b9b0fb2c04bba6f1 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 3 Mar 2025 12:56:18 -0800 Subject: [PATCH 008/107] fix(epix_epi_slide_opt): on data.table `.f`s with `.align != "right"` --- R/epix_epi_slide_opt.R | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/R/epix_epi_slide_opt.R b/R/epix_epi_slide_opt.R index 22b83eb1a..6256efb4e 100644 --- a/R/epix_epi_slide_opt.R +++ b/R/epix_epi_slide_opt.R @@ -36,16 +36,25 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_ slide <- inp_snapshot[slide_inp_backrefs, ] # TODO vs. DT key index vs .... slide$time_value <- slide_time_values if (f_from_package == "data.table") { - if (before == Inf) { - n_arg <- seq_len(slide_nrow) - adaptive_arg <- TRUE - } else { - n_arg <- before + after + 1L - adaptive_arg <- FALSE - } + # if (before == Inf) { + # n_arg <- seq_len(slide_nrow) + # adaptive_arg <- TRUE + # } else { + # n_arg <- before + after + 1L + # adaptive_arg <- FALSE + # } for (col_i in seq_along(in_colnames)) { - # FIXME wrong with .align = "left" - slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], n_arg, adaptive = adaptive_arg) + if (before == Inf) { + slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], seq_len(slide_nrow), adaptive = TRUE) + } else { + out_col <- f_dots_baked(slide[[in_colnames[[col_i]]]], before + after + 1L) + if (after != 0L) { + # data.table always puts NAs at tails, even with na.rm = TRUE; chop + # off extra NAs from beginning and place missing NAs at end: + out_col <- c(out_col[seq(after + 1L, slide_nrow)], rep(NA, after)) + } + slide[[out_colnames[[col_i]]]] <- out_col + } } } else if (f_from_package == "slider") { for (col_i in seq_along(in_colnames)) { From 9cc8468689cc59e025f3549aa5ee4dcf2b5ac7a9 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 3 Mar 2025 15:01:03 -0800 Subject: [PATCH 009/107] Clear out some more comments Changing min() to [[1L]] or min & max to range don't seem to make a notable performance difference, and require extra assumptions / uglier code. --- R/epix_epi_slide_opt.R | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/R/epix_epi_slide_opt.R b/R/epix_epi_slide_opt.R index 6256efb4e..9e09af8f6 100644 --- a/R/epix_epi_slide_opt.R +++ b/R/epix_epi_slide_opt.R @@ -5,9 +5,7 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_ prev_out_snapshot <- NULL result <- map(seq_len(nrow(updates)), function(update_i) { version <- updates$version[[update_i]] - inp_update <- updates$subtbl[[update_i]] # TODO decide whether DT - ## setDF(inp_update) - ## inp_update <- new_tibble(inp_update, nrow = nrow(inp_update)) + inp_update <- updates$subtbl[[update_i]] inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value") if (before == Inf) { if (after != 0) { @@ -15,34 +13,27 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_ } # We need to use the entire input snapshot range, filling in time gaps. We # shouldn't pad the ends. - slide_min_t <- min(inp_snapshot$time_value) # TODO check efficiency - slide_max_t <- max(inp_snapshot$time_value) + slide_t_min <- min(inp_snapshot$time_value) + slide_t_max <- max(inp_snapshot$time_value) } else { # If the input had updates in the range t1..t2, this could produce changes # in slide outputs in the range t1-after..t2+before, and to compute those # slide values, we need to look at the input snapshot from # t1-after-before..t2+before+after. - inp_update_min_t <- min(inp_update$time_value) # TODO check efficiency - inp_update_max_t <- max(inp_update$time_value) - slide_min_t <- inp_update_min_t - (before + after) * unit_step - slide_max_t <- inp_update_max_t + (before + after) * unit_step + inp_update_t_min <- min(inp_update$time_value) + inp_update_t_max <- max(inp_update$time_value) + slide_t_min <- inp_update_t_min - (before + after) * unit_step + slide_t_max <- inp_update_t_max + (before + after) * unit_step } - slide_nrow <- time_delta_to_n_steps(slide_max_t - slide_min_t, time_type) + 1L - slide_time_values <- slide_min_t + 0L:(slide_nrow - 1L) * unit_step + slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L + slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step slide_inp_backrefs <- vec_match(slide_time_values, inp_snapshot$time_value) # Get additional values needed from inp_snapshot + perform any NA # tail-padding needed to make slider results a fixed window size rather than # adaptive at tails + perform any NA gap-filling needed: - slide <- inp_snapshot[slide_inp_backrefs, ] # TODO vs. DT key index vs .... + slide <- inp_snapshot[slide_inp_backrefs, ] slide$time_value <- slide_time_values if (f_from_package == "data.table") { - # if (before == Inf) { - # n_arg <- seq_len(slide_nrow) - # adaptive_arg <- TRUE - # } else { - # n_arg <- before + after + 1L - # adaptive_arg <- FALSE - # } for (col_i in seq_along(in_colnames)) { if (before == Inf) { slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], seq_len(slide_nrow), adaptive = TRUE) From 762f04afac3c1e70a681cc2cec7b7301194c5ea4 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 4 Mar 2025 11:07:55 -0800 Subject: [PATCH 010/107] Fix missing ukey_names arg --- R/epix_epi_slide_opt.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/epix_epi_slide_opt.R b/R/epix_epi_slide_opt.R index 9e09af8f6..74cbcd8cc 100644 --- a/R/epix_epi_slide_opt.R +++ b/R/epix_epi_slide_opt.R @@ -69,7 +69,7 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_ } out_update <- slide[rows_should_keep, ] out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") # TODO avoid redundant diff2 work? though depends on compactify parms... - out_snapshot <- tbl_patch(prev_out_snapshot, out_diff) + out_snapshot <- tbl_patch(prev_out_snapshot, out_diff, "time_value") prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- out_snapshot # TODO avoid need to patch twice? out_diff$version <- version From bd0a7087fe9e7d3abc00e6069abe0ba9fee53f2e Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 4 Mar 2025 11:19:37 -0800 Subject: [PATCH 011/107] Check for missing & improper ukey_names args --- R/patch.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/patch.R b/R/patch.R index b582488a6..ba04246d5 100644 --- a/R/patch.R +++ b/R/patch.R @@ -86,6 +86,9 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, if (!is_tibble(earlier_snapshot)) { cli_abort("`earlier_snapshot` must be a tibble or `NULL`") } + if (!is.character(ukey_names) || !all(ukey_names %in% names(earlier_snapshot))) { + cli_abort("`ukey_names` must be a subset of column names") + } later_format <- arg_match0(later_format, c("snapshot", "update")) if (!(is.vector(compactify_abs_tol, mode = "numeric") && length(compactify_abs_tol) == 1L && compactify_abs_tol >= 0)) { # Give a specific message: @@ -218,6 +221,9 @@ tbl_patch <- function(snapshot, update, ukey_names) { if (!is_tibble(snapshot)) { cli_abort("`snapshot` must be a tibble") } + if (!is.character(ukey_names) || !all(ukey_names %in% names(snapshot))) { + cli_abort("`ukey_names` must be a subset of column names") + } result_tbl <- vec_rbind(update, snapshot) From 860831398c5f3cd897ab49e9c30dd5e006fb76af Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 4 Mar 2025 11:57:43 -0800 Subject: [PATCH 012/107] Remove some commented ideas that aren't quick wins --- R/epix_epi_slide_opt.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/epix_epi_slide_opt.R b/R/epix_epi_slide_opt.R index 74cbcd8cc..8fb9d5905 100644 --- a/R/epix_epi_slide_opt.R +++ b/R/epix_epi_slide_opt.R @@ -1,4 +1,3 @@ -# TODO just make this an epi_slide_opt impl? epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_from_package, before, after, time_type, out_colnames) { unit_step <- epiprocess:::unit_time_delta(time_type) prev_inp_snapshot <- NULL @@ -68,16 +67,18 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_ !is.na(slide_inp_backrefs) } out_update <- slide[rows_should_keep, ] - out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") # TODO avoid redundant diff2 work? though depends on compactify parms... + out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") out_snapshot <- tbl_patch(prev_out_snapshot, out_diff, "time_value") prev_inp_snapshot <<- inp_snapshot - prev_out_snapshot <<- out_snapshot # TODO avoid need to patch twice? + prev_out_snapshot <<- out_snapshot out_diff$version <- version out_diff }) result } +# TODO just make this an epi_slide_opt impl? + #' @export epix_epi_slide_opt <- function(.x, .col_names, .f, ..., From 111bac53be0547cce9f71d121cf6c836b76bfb27 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 4 Mar 2025 18:18:47 -0800 Subject: [PATCH 013/107] WIP cleaning up approx_equal --- NAMESPACE | 3 +++ R/epiprocess-package.R | 3 +++ R/patch.R | 42 +++++++++++++++++++++++++----------------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 168b7507d..2d4760771 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -258,6 +258,7 @@ importFrom(utils,capture.output) importFrom(utils,tail) importFrom(vctrs,"vec_slice<-") importFrom(vctrs,vec_cast) +importFrom(vctrs,vec_cast_common) importFrom(vctrs,vec_data) importFrom(vctrs,vec_duplicate_any) importFrom(vctrs,vec_duplicate_id) @@ -266,10 +267,12 @@ importFrom(vctrs,vec_in) importFrom(vctrs,vec_match) importFrom(vctrs,vec_order) importFrom(vctrs,vec_rbind) +importFrom(vctrs,vec_recycle) importFrom(vctrs,vec_recycle_common) importFrom(vctrs,vec_rep) importFrom(vctrs,vec_rep_each) importFrom(vctrs,vec_seq_along) importFrom(vctrs,vec_size) +importFrom(vctrs,vec_size_common) importFrom(vctrs,vec_slice) importFrom(vctrs,vec_sort) diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 93af0b737..a0a6f0424 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -35,6 +35,7 @@ #' @importFrom tidyr nest #' @importFrom tools toTitleCase #' @importFrom vctrs vec_cast +#' @importFrom vctrs vec_cast_common #' @importFrom vctrs vec_data #' @importFrom vctrs vec_duplicate_id #' @importFrom vctrs vec_equal @@ -42,10 +43,12 @@ #' @importFrom vctrs vec_match #' @importFrom vctrs vec_order #' @importFrom vctrs vec_rbind +#' @importFrom vctrs vec_recycle #' @importFrom vctrs vec_recycle_common #' @importFrom vctrs vec_rep #' @importFrom vctrs vec_rep_each #' @importFrom vctrs vec_seq_along +#' @importFrom vctrs vec_size_common #' @importFrom vctrs vec_slice #' @importFrom vctrs vec_slice<- #' @importFrom vctrs vec_sort diff --git a/R/patch.R b/R/patch.R index ba04246d5..245be4741 100644 --- a/R/patch.R +++ b/R/patch.R @@ -1,30 +1,38 @@ # TODO use these in apply_compactify -approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, recurse = approx_equal, inds1 = NULL, inds2 = NULL) { - vecs <- list(vec1, vec2) - if (!is.null(inds1)) { - # XXX could have logical or integerish inds; just leave it to later checks - # to hopefully abort for now +approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = NULL, inds2 = NULL) { + # Recycle inds if provided; vecs if not: + common_size <- vec_size_common( + if (is.null(inds1)) vec1 else inds1, + if (is.null(inds2)) vec2 else inds2 + ) + if (is.null(inds1)) { + vec1 <- vec_recycle(vec1, common_size) } else { - vecs <- vec_recycle_common(!!!vecs) + inds1 <- vec_recycle(inds1, common_size) } - vecs <- vec_cast_common(!!!vecs, .to = .ptype) - approx_equal0(vecs[[1]], vecs[[2]], abs_tol, na_equal, rec = approx_equal, inds1, inds2) + if (is.null(inds2)) { + vec2 <- vec_recycle(vec2, common_size) + } else { + inds2 <- vec_recycle(inds2, common_size) + } + vecs <- vec_cast_common(vec1, vec2, .to = .ptype) + approx_equal0(vecs[[1]], vecs[[2]], abs_tol, na_equal, inds1, inds2) } -approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, recurse = approx_equal0, inds1 = NULL, inds2 = NULL) { +approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = NULL) { if (is_bare_numeric(vec1) && abs_tol != 0) { - if (!is.null(inds1)) { - vec1 <- vec1[inds1] - vec2 <- vec2[inds2] - } - # perf: since we're working with bare numerics and logicals: we can use - # fifelse, and stop it from propagating attributes when there's no special - # class to guide the meaning + # perf: since we're working with bare numerics and logicals: we can use `[` + # and `fifelse`. Matching vec_equal, we ignore names and other attributes. + + # FIXME matrices can make their way in here though... + if (!is.null(inds1)) vec1 <- vec1[inds1] + if (!is.null(inds2)) vec2 <- vec2[inds2] res <- fifelse( !is.na(vec1) & !is.na(vec2), abs(vec1 - vec2) <= abs_tol, if (na_equal) is.na(vec1) & is.na(vec2) else FALSE ) + # `fifelse` inherits any unrecognized attributes; drop them instead: attributes(res) <- NULL return(res) } else if (is.data.frame(vec1) && abs_tol != 0) { @@ -34,7 +42,7 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, recurse = approx_equal0 rep(TRUE, nrow(vec1)) } else { Reduce(`&`, lapply(seq_len(ncol(vec1)), function(col_i) { - recurse(vec1[[col_i]], vec2[[col_i]], abs_tol, na_equal, recurse, inds1, inds2) + approx_equal0(vec1[[col_i]], vec2[[col_i]], abs_tol, na_equal, inds1, inds2) })) } } else { From 1d8a5b9de6a46237e73b146cdd0f3aedd01367b0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Mar 2025 09:44:07 -0800 Subject: [PATCH 014/107] fix(apply_compactify): avoid arrange on data.table, `i` parsing issues --- R/archive.R | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index 922371f1c..6652ccea1 100644 --- a/R/archive.R +++ b/R/archive.R @@ -424,10 +424,23 @@ apply_compactify <- function(updates_df, ukey_names, abs_tol = 0) { } assert_numeric(abs_tol, len = 1, lower = 0) - if (!is.data.table(updates_df) || !identical(key(updates_df), ukey_names)) { + if (is.data.table(updates_df)) { + if (!identical(key(updates_df), ukey_names)) { + cli_abort(c("`ukey_names` should match `key(updates_df)`", + "i" = "`ukey_names` was {format_chr_deparse(ukey_names)}", + "i" = "`key(updates_df)` was {format_chr_deparse(key(updates_df))}" + )) + } + } else { updates_df <- updates_df %>% arrange(pick(all_of(ukey_names))) } - updates_df[!update_is_locf(updates_df, ukey_names, abs_tol), ] + + # In case updates_df is a data.table, store keep flags in a local: "When the + # first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table + # looks for var in calling scope". In case it's not a data.table, make sure to + # use df[i,] not just df[i]. + to_keep <- !update_is_locf(updates_df, ukey_names, abs_tol) + updates_df[to_keep, ] } #' get the entries that `compactify` would remove From a224336dc4598becb4e184ab29d96aee1ebf8e10 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Mar 2025 10:10:09 -0800 Subject: [PATCH 015/107] perf: speed up compactification with `approx_equal` --- NAMESPACE | 2 -- R/archive.R | 66 ++++++++++++----------------------- R/patch.R | 1 - man/is_locf.Rd | 32 ----------------- tests/testthat/test-archive.R | 4 +-- 5 files changed, 25 insertions(+), 80 deletions(-) delete mode 100644 man/is_locf.Rd diff --git a/NAMESPACE b/NAMESPACE index 2d4760771..14a001dfa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -179,7 +179,6 @@ importFrom(dplyr,if_all) importFrom(dplyr,if_any) importFrom(dplyr,if_else) importFrom(dplyr,is_grouped_df) -importFrom(dplyr,lag) importFrom(dplyr,mutate) importFrom(dplyr,pick) importFrom(dplyr,pull) @@ -219,7 +218,6 @@ importFrom(rlang,expr_label) importFrom(rlang,f_env) importFrom(rlang,f_rhs) importFrom(rlang,is_bare_integerish) -importFrom(rlang,is_bare_numeric) importFrom(rlang,is_environment) importFrom(rlang,is_formula) importFrom(rlang,is_function) diff --git a/R/archive.R b/R/archive.R index 6652ccea1..a9200a7ff 100644 --- a/R/archive.R +++ b/R/archive.R @@ -477,50 +477,30 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) { ekt_names <- ukey_names[ukey_names != "version"] val_names <- all_names[!all_names %in% ukey_names] - Reduce(`&`, lapply(updates_col_refs[ekt_names], is_locf, abs_tol, TRUE)) & - Reduce(`&`, lapply(updates_col_refs[val_names], is_locf, abs_tol, FALSE)) -} - -#' Checks to see if a value in a vector is LOCF -#' @description LOCF meaning last observation carried forward (to later -#' versions). Lags the vector by 1, then compares with itself. If `is_key` is -#' `TRUE`, only values that are exactly the same between the lagged and -#' original are considered LOCF. If `is_key` is `FALSE` and `vec` is a vector -#' of numbers ([`base::is.numeric`]), then approximate equality will be used, -#' checking whether the absolute difference between each pair of entries is -#' `<= abs_tol`; if `vec` is something else, then exact equality is used -#' instead. -#' -#' @details -#' -#' We include epikey-time columns in LOCF comparisons as part of an optimization -#' to avoid slower grouped operations while still ensuring that the first -#' observation for each time series will not be marked as LOCF. We test these -#' key columns for exact equality to prevent chopping off consecutive -#' time_values during flat periods when `abs_tol` is high. -#' -#' We use exact equality for non-`is.numeric` double/integer columns such as -#' dates, datetimes, difftimes, `tsibble::yearmonth`s, etc., as these may be -#' used as part of re-indexing or grouping procedures, and we don't want to -#' change the number of groups for those operations when we remove LOCF data -#' during compactification. -#' -#' @importFrom dplyr lag if_else -#' @importFrom rlang is_bare_numeric -#' @importFrom vctrs vec_equal -#' @keywords internal -is_locf <- function(vec, abs_tol, is_key) { # nolint: object_usage_linter - lag_vec <- lag(vec) - if (is.vector(vec, mode = "numeric") && !is_key) { - # (integer or double vector, no class (& no dims); maybe names, which we'll - # ignore like `vec_equal`); not a key column - unname(if_else( - !is.na(vec) & !is.na(lag_vec), - abs(vec - lag_vec) <= abs_tol, - is.na(vec) & is.na(lag_vec) - )) + n_updates <- nrow(arranged_updates_df) + if (n_updates == 0L) { + logical(0L) + } else if (n_updates == 1L) { + FALSE # sole observation is not LOCF } else { - vec_equal(vec, lag_vec, na_equal = TRUE) + ekts_tbl <- new_tibble(updates_col_refs[ekt_names]) + vals_tbl <- new_tibble(updates_col_refs[val_names]) + # n_updates >= 2L so we can use `:` naturally (this is the reason for + # separating out n_updates == 1L from this case): + inds1 <- 2L:n_updates + inds2 <- 1L:(n_updates - 1L) + c( + FALSE, # first observation is not LOCF + approx_equal0(ekts_tbl, + inds1 = inds1, ekts_tbl, inds2 = inds2, + # check ekt cols without tolerance: + abs_tol = 0, na_equal = TRUE + ) & + approx_equal0(vals_tbl, + inds1 = inds1, vals_tbl, inds2 = inds2, + abs_tol = abs_tol, na_equal = TRUE + ) + ) } } diff --git a/R/patch.R b/R/patch.R index 245be4741..2f0a2f00f 100644 --- a/R/patch.R +++ b/R/patch.R @@ -1,4 +1,3 @@ -# TODO use these in apply_compactify approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = NULL, inds2 = NULL) { # Recycle inds if provided; vecs if not: common_size <- vec_size_common( diff --git a/man/is_locf.Rd b/man/is_locf.Rd deleted file mode 100644 index f8f8eefcb..000000000 --- a/man/is_locf.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R -\name{is_locf} -\alias{is_locf} -\title{Checks to see if a value in a vector is LOCF} -\usage{ -is_locf(vec, abs_tol, is_key) -} -\description{ -LOCF meaning last observation carried forward (to later -versions). Lags the vector by 1, then compares with itself. If \code{is_key} is -\code{TRUE}, only values that are exactly the same between the lagged and -original are considered LOCF. If \code{is_key} is \code{FALSE} and \code{vec} is a vector -of numbers (\code{\link[base:numeric]{base::is.numeric}}), then approximate equality will be used, -checking whether the absolute difference between each pair of entries is -\verb{<= abs_tol}; if \code{vec} is something else, then exact equality is used -instead. -} -\details{ -We include epikey-time columns in LOCF comparisons as part of an optimization -to avoid slower grouped operations while still ensuring that the first -observation for each time series will not be marked as LOCF. We test these -key columns for exact equality to prevent chopping off consecutive -time_values during flat periods when \code{abs_tol} is high. - -We use exact equality for non-\code{is.numeric} double/integer columns such as -dates, datetimes, difftimes, \code{tsibble::yearmonth}s, etc., as these may be -used as part of re-indexing or grouping procedures, and we don't want to -change the number of groups for those operations when we remove LOCF data -during compactification. -} -\keyword{internal} diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 0e84b03ba..12d9ed413 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -217,8 +217,8 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns expect_error(as_epi_archive(tbl3), class = "epiprocess__time_value_version_mismatch") }) -test_that("is_locf works as expected", { +test_that("is_locf replacement works as expected", { vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1) - expect_equal(is_locf(vec, .Machine$double.eps^0.5, FALSE), as.logical(is_repeated)) + expect_equal(c(FALSE, approx_equal(head(vec, -1L), tail(vec, -1L), .Machine$double.eps^0.5, na_equal = TRUE)), as.logical(is_repeated)) }) From ab010c8b975a3180f3f104088dd34bedd89b65ae Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Mar 2025 10:52:00 -0800 Subject: [PATCH 016/107] fix(approx_equal): missing import --- NAMESPACE | 1 + R/epiprocess-package.R | 1 + 2 files changed, 2 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 14a001dfa..aa936b4ae 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -218,6 +218,7 @@ importFrom(rlang,expr_label) importFrom(rlang,f_env) importFrom(rlang,f_rhs) importFrom(rlang,is_bare_integerish) +importFrom(rlang,is_bare_numeric) importFrom(rlang,is_environment) importFrom(rlang,is_formula) importFrom(rlang,is_function) diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index a0a6f0424..ab4d2ba26 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -31,6 +31,7 @@ #' @importFrom rlang %||% #' @importFrom rlang arg_match0 #' @importFrom rlang is_bare_integerish +#' @importFrom rlang is_bare_numeric #' @importFrom tibble is_tibble #' @importFrom tidyr nest #' @importFrom tools toTitleCase From 72056a3a17b2577960996d8b9351ab9f0ce3b589 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Mar 2025 10:52:09 -0800 Subject: [PATCH 017/107] docs(approx_equal): roxygen2 + comment on inconsistencies/bugs --- R/patch.R | 20 +++++++++++++++++++- man/approx_equal.Rd | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 man/approx_equal.Rd diff --git a/R/patch.R b/R/patch.R index 2f0a2f00f..69de6d709 100644 --- a/R/patch.R +++ b/R/patch.R @@ -1,3 +1,18 @@ +#' Test two vctrs vectors for equality with some tolerance in some cases +#' +#' @param vec1,vec2 vctrs vectors (includes data frames) +#' @param abs_tol tolerance; will be used for bare numeric `vec1`, `vec2`, or +#' any such columns within `vec1`, `vec2` if they are data frames +#' @param na_equal should `NA`s be considered equal to each other? (In +#' epiprocess, we usually want this to be `TRUE`, but that doesn't match the +#' [`vctrs::vec_equal()`] default, so this is mandatory.) +#' @param .ptype as in [`vctrs::vec_equal()`] +#' @param inds1,inds2 optional (row) indices into vec1 and vec2; output should +#' be consistent with `vec_slice`-ing to these indices beforehand, but can +#' give faster computation if `vec1` and `vec2` are data frames. +#' +#' @return logical vector; no nonmissing entries if `na_equal = TRUE`. Behavior +#' may differ from `vec_equal` with non-`NA` `NaN`s involved. approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = NULL, inds2 = NULL) { # Recycle inds if provided; vecs if not: common_size <- vec_size_common( @@ -30,6 +45,9 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N !is.na(vec1) & !is.na(vec2), abs(vec1 - vec2) <= abs_tol, if (na_equal) is.na(vec1) & is.na(vec2) else FALSE + # XXX ^ inconsistent with vec_equal treatment: NA vs. NaN comparison + # behavior with na_equal = TRUE is different; plus output with na_equal = + # FALSE on two NAs is different ) # `fifelse` inherits any unrecognized attributes; drop them instead: attributes(res) <- NULL @@ -45,7 +63,7 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N })) } } else { - # No special handling for any other types/situations. Makes sense for + # XXX No special handling for any other types/situations. Makes sense for # unclassed atomic things; bare lists and certain vctrs classes might want # recursion / specialization, though. if (!is.null(inds1)) { diff --git a/man/approx_equal.Rd b/man/approx_equal.Rd new file mode 100644 index 000000000..88fd8b63a --- /dev/null +++ b/man/approx_equal.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{approx_equal} +\alias{approx_equal} +\title{Test two vctrs vectors for equality with some tolerance in some cases} +\usage{ +approx_equal( + vec1, + vec2, + abs_tol, + na_equal, + .ptype = NULL, + inds1 = NULL, + inds2 = NULL +) +} +\arguments{ +\item{vec1, vec2}{vctrs vectors (includes data frames)} + +\item{abs_tol}{tolerance; will be used for bare numeric \code{vec1}, \code{vec2}, or +any such columns within \code{vec1}, \code{vec2} if they are data frames} + +\item{na_equal}{should \code{NA}s be considered equal to each other? (In +epiprocess, we usually want this to be \code{TRUE}, but that doesn't match the +\code{\link[vctrs:vec_equal]{vctrs::vec_equal()}} default, so this is mandatory.)} + +\item{.ptype}{as in \code{\link[vctrs:vec_equal]{vctrs::vec_equal()}}} + +\item{inds1, inds2}{optional (row) indices into vec1 and vec2; output should +be consistent with \code{vec_slice}-ing to these indices beforehand, but can +give faster computation if \code{vec1} and \code{vec2} are data frames.} +} +\value{ +logical vector; no nonmissing entries if \code{na_equal = TRUE}. Behavior +may differ from \code{vec_equal} with non-\code{NA} \code{NaN}s involved. +} +\description{ +Test two vctrs vectors for equality with some tolerance in some cases +} From e9ea2caf3b6bf871e10ab1fe2c1d5614059c3709 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Mar 2025 11:30:56 -0800 Subject: [PATCH 018/107] fix(approx_equal): consistency with vec_slice(na_equal=FALSE) --- R/patch.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/patch.R b/R/patch.R index 69de6d709..d67fba1cc 100644 --- a/R/patch.R +++ b/R/patch.R @@ -44,10 +44,9 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N res <- fifelse( !is.na(vec1) & !is.na(vec2), abs(vec1 - vec2) <= abs_tol, - if (na_equal) is.na(vec1) & is.na(vec2) else FALSE + if (na_equal) is.na(vec1) & is.na(vec2) else NA # XXX ^ inconsistent with vec_equal treatment: NA vs. NaN comparison - # behavior with na_equal = TRUE is different; plus output with na_equal = - # FALSE on two NAs is different + # behavior with na_equal = TRUE is different ) # `fifelse` inherits any unrecognized attributes; drop them instead: attributes(res) <- NULL From b13e82547558784292f4d0f94f6189af214cb053 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Mar 2025 11:32:41 -0800 Subject: [PATCH 019/107] fix(approx_equal): on bare numeric matrices --- R/patch.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/patch.R b/R/patch.R index d67fba1cc..074d3d427 100644 --- a/R/patch.R +++ b/R/patch.R @@ -37,10 +37,8 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N if (is_bare_numeric(vec1) && abs_tol != 0) { # perf: since we're working with bare numerics and logicals: we can use `[` # and `fifelse`. Matching vec_equal, we ignore names and other attributes. - - # FIXME matrices can make their way in here though... - if (!is.null(inds1)) vec1 <- vec1[inds1] - if (!is.null(inds2)) vec2 <- vec2[inds2] + if (!is.null(inds1)) vec1 <- vec_slice(vec1, inds1) + if (!is.null(inds2)) vec2 <- vec_slice(vec2, inds2) res <- fifelse( !is.na(vec1) & !is.na(vec2), abs(vec1 - vec2) <= abs_tol, @@ -48,6 +46,10 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N # XXX ^ inconsistent with vec_equal treatment: NA vs. NaN comparison # behavior with na_equal = TRUE is different ) + if (!is.null(dim(vec1))) { + dim(res) <- dim(vec1) + res <- rowSums(res) == ncol(res) + } # `fifelse` inherits any unrecognized attributes; drop them instead: attributes(res) <- NULL return(res) From 0c9f46227351ce05ab1352831ddea58c18a23c79 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Mar 2025 11:57:37 -0800 Subject: [PATCH 020/107] feat: approx_equal on lists --- NAMESPACE | 2 ++ R/epiprocess-package.R | 2 ++ R/patch.R | 22 +++++++++++++++++++--- man/approx_equal.Rd | 3 ++- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index aa936b4ae..e1b111c0a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -218,6 +218,7 @@ importFrom(rlang,expr_label) importFrom(rlang,f_env) importFrom(rlang,f_rhs) importFrom(rlang,is_bare_integerish) +importFrom(rlang,is_bare_list) importFrom(rlang,is_bare_numeric) importFrom(rlang,is_environment) importFrom(rlang,is_formula) @@ -265,6 +266,7 @@ importFrom(vctrs,vec_equal) importFrom(vctrs,vec_in) importFrom(vctrs,vec_match) importFrom(vctrs,vec_order) +importFrom(vctrs,vec_ptype) importFrom(vctrs,vec_rbind) importFrom(vctrs,vec_recycle) importFrom(vctrs,vec_recycle_common) diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index ab4d2ba26..89e785397 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -31,6 +31,7 @@ #' @importFrom rlang %||% #' @importFrom rlang arg_match0 #' @importFrom rlang is_bare_integerish +#' @importFrom rlang is_bare_list #' @importFrom rlang is_bare_numeric #' @importFrom tibble is_tibble #' @importFrom tidyr nest @@ -43,6 +44,7 @@ #' @importFrom vctrs vec_in #' @importFrom vctrs vec_match #' @importFrom vctrs vec_order +#' @importFrom vctrs vec_ptype #' @importFrom vctrs vec_rbind #' @importFrom vctrs vec_recycle #' @importFrom vctrs vec_recycle_common diff --git a/R/patch.R b/R/patch.R index 074d3d427..3bcddb383 100644 --- a/R/patch.R +++ b/R/patch.R @@ -12,7 +12,8 @@ #' give faster computation if `vec1` and `vec2` are data frames. #' #' @return logical vector; no nonmissing entries if `na_equal = TRUE`. Behavior -#' may differ from `vec_equal` with non-`NA` `NaN`s involved. +#' may differ from `vec_equal` with non-`NA` `NaN`s involved, or for bare +#' lists that contain named vectors. approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = NULL, inds2 = NULL) { # Recycle inds if provided; vecs if not: common_size <- vec_size_common( @@ -63,10 +64,25 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N approx_equal0(vec1[[col_i]], vec2[[col_i]], abs_tol, na_equal, inds1, inds2) })) } + } else if (is_bare_list(vec1)) { + vapply(seq_along(vec1), function(i) { + entry1 <- vec1[[i]] + entry2 <- vec2[[i]] + vec_size(entry1) == vec_size(entry2) && + # This is inconsistent with vec_equal on named vectors; to be + # consistently inconsistent, we avoid dispatching to vec_equal for bare + # lists even with abs_tol = 0: + identical(vec_ptype(entry1), vec_ptype(entry2)) && + all(approx_equal0(entry1, entry2, abs_tol, na_equal)) + }, logical(1L)) } else { # XXX No special handling for any other types/situations. Makes sense for - # unclassed atomic things; bare lists and certain vctrs classes might want - # recursion / specialization, though. + # unclassed atomic things; custom classes (e.g., distributions) might want + # recursion / specialization, though. approx_equal0 should probably be an S3 + # method. Also, abs_tol == 0 --> vec_equal logic should maybe be either be + # hoisted to approx_equal or we should manually recurse on data frames even + # with abs_tol = 0 when that's faster (might depend on presence of inds*), + # after some inconsistencies are ironed out. if (!is.null(inds1)) { vec1 <- vec_slice(vec1, inds1) vec2 <- vec_slice(vec2, inds2) diff --git a/man/approx_equal.Rd b/man/approx_equal.Rd index 88fd8b63a..d58f24bb9 100644 --- a/man/approx_equal.Rd +++ b/man/approx_equal.Rd @@ -32,7 +32,8 @@ give faster computation if \code{vec1} and \code{vec2} are data frames.} } \value{ logical vector; no nonmissing entries if \code{na_equal = TRUE}. Behavior -may differ from \code{vec_equal} with non-\code{NA} \code{NaN}s involved. +may differ from \code{vec_equal} with non-\code{NA} \code{NaN}s involved, or for bare +lists that contain named vectors. } \description{ Test two vctrs vectors for equality with some tolerance in some cases From 39816f58c93fab845a0ca0c65afd26cc5b22ccde Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Mar 2025 12:53:00 -0800 Subject: [PATCH 021/107] docs(approx_equal): iterate on @return + doc approx_equal0 --- R/patch.R | 11 ++++++++--- man/approx_equal.Rd | 8 +++++--- man/approx_equal0.Rd | 12 ++++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 man/approx_equal0.Rd diff --git a/R/patch.R b/R/patch.R index 3bcddb383..37af285f9 100644 --- a/R/patch.R +++ b/R/patch.R @@ -11,9 +11,11 @@ #' be consistent with `vec_slice`-ing to these indices beforehand, but can #' give faster computation if `vec1` and `vec2` are data frames. #' -#' @return logical vector; no nonmissing entries if `na_equal = TRUE`. Behavior -#' may differ from `vec_equal` with non-`NA` `NaN`s involved, or for bare -#' lists that contain named vectors. +#' @return logical vector, with length matching the result of recycling `vec1` +#' (at `inds1` if provided) and `vec2` (at `inds2` if provided); entries +#' should all be `TRUE` or `FALSE` if `na_equal = TRUE`. Behavior may differ +#' from `vec_equal` with non-`NA` `NaN`s involved, or for bare lists that +#' contain named vectors. approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = NULL, inds2 = NULL) { # Recycle inds if provided; vecs if not: common_size <- vec_size_common( @@ -34,6 +36,9 @@ approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = N approx_equal0(vecs[[1]], vecs[[2]], abs_tol, na_equal, inds1, inds2) } +#' Helper for [`approx_equal`] for vecs guaranteed to have the same ptype and size +#' +#' @keywords internal approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = NULL) { if (is_bare_numeric(vec1) && abs_tol != 0) { # perf: since we're working with bare numerics and logicals: we can use `[` diff --git a/man/approx_equal.Rd b/man/approx_equal.Rd index d58f24bb9..4eac410f0 100644 --- a/man/approx_equal.Rd +++ b/man/approx_equal.Rd @@ -31,9 +31,11 @@ be consistent with \code{vec_slice}-ing to these indices beforehand, but can give faster computation if \code{vec1} and \code{vec2} are data frames.} } \value{ -logical vector; no nonmissing entries if \code{na_equal = TRUE}. Behavior -may differ from \code{vec_equal} with non-\code{NA} \code{NaN}s involved, or for bare -lists that contain named vectors. +logical vector, with length matching the result of recycling \code{vec1} +(at \code{inds1} if provided) and \code{vec2} (at \code{inds2} if provided); entries +should all be \code{TRUE} or \code{FALSE} if \code{na_equal = TRUE}. Behavior may differ +from \code{vec_equal} with non-\code{NA} \code{NaN}s involved, or for bare lists that +contain named vectors. } \description{ Test two vctrs vectors for equality with some tolerance in some cases diff --git a/man/approx_equal0.Rd b/man/approx_equal0.Rd new file mode 100644 index 000000000..f7a69f1fe --- /dev/null +++ b/man/approx_equal0.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{approx_equal0} +\alias{approx_equal0} +\title{Helper for \code{\link{approx_equal}} for vecs guaranteed to have the same ptype and size} +\usage{ +approx_equal0(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = NULL) +} +\description{ +Helper for \code{\link{approx_equal}} for vecs guaranteed to have the same ptype and size +} +\keyword{internal} From ba40405bd8b127a53bb53664c2032a2d8f6e5a17 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Mar 2025 14:10:19 -0800 Subject: [PATCH 022/107] WIP docs(epix_epi_slide_opt_one_epikey): initial --- R/epix_epi_slide_opt.R | 48 ++++++++++++++++++ man/epix_epi_slide_opt_one_epikey.Rd | 74 ++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 man/epix_epi_slide_opt_one_epikey.Rd diff --git a/R/epix_epi_slide_opt.R b/R/epix_epi_slide_opt.R index 8fb9d5905..5e446cdca 100644 --- a/R/epix_epi_slide_opt.R +++ b/R/epix_epi_slide_opt.R @@ -1,4 +1,51 @@ +#' Core operation of `epix_epi_slide_opt` for a single epikey's history +#' +#' @param updates tibble with two columns: `version` and `subtbl`; `subtbl` is a +#' list of tibbles, each with a `time_value` column and measurement columns. +#' The epikey should not appear. +#' @param in_colnames chr; names of columns to which to apply `f_dots_baked` +#' @param f_dots_baked supported sliding function from `{data.table}` or +#' `{slider}`, potentially with some arguments baked in with +#' [`purrr::partial`] +#' @param f_from_package string; name of package from which `f_dots_baked` +#' (pre-`partial`) originates +#' @param before integerish >=0 or Inf; number of time steps before each +#' ref_time_value to include in the sliding window computation; Inf to include +#' all times beginning with the min `time_value` +#' @param after integerish >=0; number of time steps after each ref_time_value +#' to include in the sliding window computation +#' @param time_type as in `new_epi_archive` +#' @param out_colnames chr, same length as `in_colnames`; column names to use +#' for results +#' @return list of tibbles with same names as `subtbl`s plus: `c(out_colnames, +#' "version")`; (compactified) diff data to put into an `epi_archive` +#' +#' @examples +#' +#' library(dplyr) +#' updates <- bind_rows( +#' tibble( +#' version = 40, time_value = 1:10, value = 1:10 +#' ), +#' tibble( +#' version = 12, time_value = 2:3, value = 3:2 +#' ), +#' tibble( +#' version = 13, time_value = 6, value = 7, +#' ), +#' tibble( +#' version = 13, time_value = 7, value = NA, +#' ) +#' ) %>% +#' mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) %>% +#' tidyr::nest(.by = version, .key = "subtbl") +#' +#' updates %>% +#' epix_epi_slide_opt_one_epikey("value", data.table::frollmean, "data.table", 1L, 0L, "day", "slide_value") +#' +#' @keywords internal epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_from_package, before, after, time_type, out_colnames) { + # TODO check for col name clobbering unit_step <- epiprocess:::unit_time_delta(time_type) prev_inp_snapshot <- NULL prev_out_snapshot <- NULL @@ -148,6 +195,7 @@ epix_epi_slide_opt.epi_archive <- group_updates <- group_values %>% nest(.by = version, .key = "subtbl") %>% arrange(version) + # TODO move nesting inside the helper? res <- epix_epi_slide_opt_one_epikey(group_updates, names_info$input_col_names, .f_dots_baked, .f_info$from_package, window_args$before, window_args$after, time_type, names_info$output_col_names) %>% list_rbind() if (use_progress) cli::cli_progress_update(id = progress_bar_id) diff --git a/man/epix_epi_slide_opt_one_epikey.Rd b/man/epix_epi_slide_opt_one_epikey.Rd new file mode 100644 index 000000000..e8321616e --- /dev/null +++ b/man/epix_epi_slide_opt_one_epikey.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epix_epi_slide_opt.R +\name{epix_epi_slide_opt_one_epikey} +\alias{epix_epi_slide_opt_one_epikey} +\title{Core operation of \code{epix_epi_slide_opt} for a single epikey's history} +\usage{ +epix_epi_slide_opt_one_epikey( + updates, + in_colnames, + f_dots_baked, + f_from_package, + before, + after, + time_type, + out_colnames +) +} +\arguments{ +\item{updates}{tibble with two columns: \code{version} and \code{subtbl}; \code{subtbl} is a +list of tibbles, each with a \code{time_value} column and measurement columns. +The epikey should not appear.} + +\item{in_colnames}{chr; names of columns to which to apply \code{f_dots_baked}} + +\item{f_dots_baked}{supported sliding function from \code{{data.table}} or +\code{{slider}}, potentially with some arguments baked in with +\code{\link[purrr:partial]{purrr::partial}}} + +\item{f_from_package}{string; name of package from which \code{f_dots_baked} +(pre-\code{partial}) originates} + +\item{before}{integerish >=0 or Inf; number of time steps before each +ref_time_value to include in the sliding window computation; Inf to include +all times beginning with the min \code{time_value}} + +\item{after}{integerish >=0; number of time steps after each ref_time_value +to include in the sliding window computation} + +\item{time_type}{as in \code{new_epi_archive}} + +\item{out_colnames}{chr, same length as \code{in_colnames}; column names to use +for results} +} +\value{ +list of tibbles with same names as \code{subtbl}s plus: \code{c(out_colnames, "version")}; (compactified) diff data to put into an \code{epi_archive} +} +\description{ +Core operation of \code{epix_epi_slide_opt} for a single epikey's history +} +\examples{ + +library(dplyr) +updates <- bind_rows( + tibble( + version = 40, time_value = 1:10, value = 1:10 + ), + tibble( + version = 12, time_value = 2:3, value = 3:2 + ), + tibble( + version = 13, time_value = 6, value = 7, + ), + tibble( + version = 13, time_value = 7, value = NA, + ) +) \%>\% + mutate(across(c(version, time_value), ~as.Date("2020-01-01") - 1 + .x)) \%>\% + tidyr::nest(.by = version, .key = "subtbl") + +updates \%>\% + epix_epi_slide_opt_one_epikey("value", data.table::frollmean, "data.table", 1L, 0L, "day", "slide_value") + +} +\keyword{internal} From 764c624dc48e39d54a8d05951bc9e2cf768c63e2 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 6 Mar 2025 17:22:27 -0800 Subject: [PATCH 023/107] refactor: move epi_slide_opt & helpers to its own file --- DESCRIPTION | 1 + R/epi_slide_opt_edf.R | 664 +++++++++++++++++++++++++ R/slide.R | 666 -------------------------- man/across_ish_names_info.Rd | 2 +- man/epi_slide_opt.Rd | 2 +- man/epix_epi_slide_opt_one_epikey.Rd | 2 +- man/full_date_seq.Rd | 2 +- man/upstream_slide_f_info.Rd | 2 +- man/upstream_slide_f_possibilities.Rd | 2 +- 9 files changed, 671 insertions(+), 672 deletions(-) create mode 100644 R/epi_slide_opt_edf.R diff --git a/DESCRIPTION b/DESCRIPTION index 013d32600..1732bbc97 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -98,6 +98,7 @@ Collate: 'correlation.R' 'epi_df.R' 'epi_df_forbidden_methods.R' + 'epi_slide_opt_edf.R' 'epiprocess-package.R' 'epix_epi_slide_opt.R' 'group_by_epi_df_methods.R' diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R new file mode 100644 index 000000000..44eaa1470 --- /dev/null +++ b/R/epi_slide_opt_edf.R @@ -0,0 +1,664 @@ +#' Information about upstream (`{data.table}`/`{slider}`) slide functions +#' +#' Underlies [`upstream_slide_f_info`]. +#' +#' @keywords internal +upstream_slide_f_possibilities <- tibble::tribble( + ~f, ~package, ~namer, + frollmean, "data.table", ~ if (is.logical(.x)) "prop" else "av", + frollsum, "data.table", ~ if (is.logical(.x)) "count" else "sum", + frollapply, "data.table", ~"slide", + slide_sum, "slider", ~ if (is.logical(.x)) "count" else "sum", + slide_prod, "slider", ~"prod", + slide_mean, "slider", ~ if (is.logical(.x)) "prop" else "av", + slide_min, "slider", ~"min", + slide_max, "slider", ~"max", + slide_all, "slider", ~"all", + slide_any, "slider", ~"any", +) + +#' Validate & get information about an upstream slide function +#' +#' @param .f function such as `data.table::frollmean` or `slider::slide_mean`; +#' must appear in [`upstream_slide_f_possibilities`] +#' @return named list with two elements: `from_package`, a string containing the +#' upstream package name ("data.table" or "slider"), and `namer`, a function +#' that takes a column to call `.f` on and outputs a basic name or +#' abbreviation for what operation `.f` represents on that kind of column +#' (e.g., "sum", "av", "count"). +#' +#' @keywords internal +upstream_slide_f_info <- function(.f) { + assert_function(.f) + + # Check that slide function `.f` is one of those short-listed from + # `data.table` and `slider` (or a function that has the exact same definition, + # e.g. if the function has been reexported or defined locally). Extract some + # metadata. `namer` will be mapped over columns (.x will be a column, not the + # entire edf). + f_info_row <- upstream_slide_f_possibilities %>% + filter(map_lgl(.data$f, ~ identical(.f, .x))) + if (nrow(f_info_row) == 0L) { + # `f` is from somewhere else and not supported + cli_abort( + c( + "problem with {rlang::expr_label(rlang::caller_arg(f))}", + "i" = "`f` must be one of `data.table`'s rolling functions (`frollmean`, + `frollsum`, `frollapply`. See `?data.table::roll`) or one of + `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, + etc. See `?slider::\`summary-slide\`` for more options)." + ), + class = "epiprocess__epi_slide_opt__unsupported_slide_function", + epiprocess__f = .f + ) + } + if (nrow(f_info_row) > 1L) { + cli_abort('epiprocess internal error: looking up `.f` in table of possible + functions yielded multiple matches. Please report it using "New + issue" at https://github.com/cmu-delphi/epiprocess/issues, using + reprex::reprex to provide a minimal reproducible example.') + } + f_from_package <- f_info_row$package + list( + from_package = f_from_package, + namer = unwrap(f_info_row$namer) + ) +} + +#' Calculate input and output column names for an `{epiprocess}` [`dplyr::across`]-like operations +#' +#' @param .x data.frame to perform input column tidyselection on +#' @param time_type as in [`new_epi_df`] +#' @param col_names_quo enquosed input column tidyselect expression +#' @param .f_namer function taking an input column object and outputting a name +#' for a corresponding output column; see [`upstream_slide_f_info`] +#' @param .window_size as in [`epi_slide_opt`] +#' @param .align as in [`epi_slide_opt`] +#' @param .prefix as in [`epi_slide_opt`] +#' @param .suffix as in [`epi_slide_opt`] +#' @param .new_col_names as in [`epi_slide_opt`] +#' @return named list with two elements: `input_col_names`, chr, subset of +#' `names(.x)`; and `output_colnames`, chr, same length as `input_col_names` +#' +#' @keywords internal +across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, .window_size, .align, .prefix, .suffix, .new_col_names) { + # The position of a given column can be differ between input `.x` and + # `.data_group` since the grouping step by default drops grouping columns. + # To avoid rerunning `eval_select` for every `.data_group`, convert + # positions of user-provided `col_names` into string column names. We avoid + # using `names(pos)` directly for robustness and in case we later want to + # allow users to rename fields via tidyselection. + pos <- eval_select(col_names_quo, data = .x, allow_rename = FALSE) + input_col_names <- names(.x)[pos] + + # Handle output naming + if ((!is.null(.prefix) || !is.null(.suffix)) && !is.null(.new_col_names)) { + cli_abort( + "Can't use both .prefix/.suffix and .new_col_names at the same time.", + class = "epiprocess__epi_slide_opt_incompatible_naming_args" + ) + } + assert_string(.prefix, null.ok = TRUE) + assert_string(.suffix, null.ok = TRUE) + assert_character(.new_col_names, len = length(input_col_names), null.ok = TRUE) + if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) { + .suffix <- "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}" + # ^ does not account for any arguments specified to underlying functions via + # `...` such as `na.rm =`, nor does it distinguish between functions from + # different packages accomplishing the same type of computation. Those are + # probably only set one way per task, so this probably produces cleaner + # names without clashes (though maybe some confusion if switching between + # code with different settings). + } + if (!is.null(.prefix) || !is.null(.suffix)) { + .prefix <- .prefix %||% "" + .suffix <- .suffix %||% "" + if (identical(.window_size, Inf)) { + n <- "running_" + time_unit_abbr <- "" + align_abbr <- "" + } else { + n <- time_delta_to_n_steps(.window_size, time_type) + time_unit_abbr <- time_type_unit_abbr(time_type) + align_abbr <- c(right = "", center = "c", left = "l")[[.align]] + } + glue_env <- rlang::env( + .n = n, + .time_unit_abbr = time_unit_abbr, + .align_abbr = align_abbr, + .f_abbr = purrr::map_chr(.x[, c(input_col_names)], .f_namer), # compat between DT and tbl selection + quo_get_env(col_names_quo) + ) + .new_col_names <- unclass( + glue(.prefix, .envir = glue_env) + + input_col_names + + glue(.suffix, .envir = glue_env) + ) + } else { + # `.new_col_names` was provided by user; we don't need to do anything. + } + if (any(.new_col_names %in% names(.x))) { + cli_abort(c( + "Naming conflict between new columns and existing columns", + "x" = "Overlapping names: {format_varnames(intersect(.new_col_names, names(.x)))}" + ), class = "epiprocess__epi_slide_opt_old_new_name_conflict") + } + if (anyDuplicated(.new_col_names)) { + cli_abort(c( + "New column names contain duplicates", + "x" = "Duplicated names: {format_varnames(unique(.new_col_names[duplicated(.new_col_names)]))}" + ), class = "epiprocess__epi_slide_opt_new_name_duplicated") + } + output_col_names <- .new_col_names + + return(list( + input_col_names = input_col_names, + output_col_names = output_col_names + )) +} + +#' Optimized slide functions for common cases +#' +#' @description `epi_slide_opt` allows sliding an n-timestep [data.table::froll] +#' or [slider::summary-slide] function over variables in an `epi_df` object. +#' These functions tend to be much faster than `epi_slide()`. See +#' `vignette("epi_df")` for more examples. +#' +#' @template basic-slide-params +#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column +#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), +#' [other tidy-select expression][tidyselect::language], or a vector of +#' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if +#' they were positions in the data frame, so expressions like `x:y` can be +#' used to select a range of variables. +#' +#' The tidy-selection renaming interface is not supported, and cannot be used +#' to provide output column names; if you want to customize the output column +#' names, use [`dplyr::rename`] after the slide. +#' @param .f Function; together with `...` specifies the computation to slide. +#' `.f` must be one of `data.table`'s rolling functions +#' (`frollmean`, `frollsum`, `frollapply`. See [data.table::roll]) or one +#' of `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, +#' etc. See [slider::summary-slide]). +#' +#' The optimized `data.table` and `slider` functions can't be directly passed +#' as the computation function in `epi_slide` without careful handling to make +#' sure each computation group is made up of the `.window_size` dates rather +#' than `.window_size` points. `epi_slide_opt` (and wrapper functions +#' `epi_slide_mean` and `epi_slide_sum`) take care of window completion +#' automatically to prevent associated errors. +#' @param ... Additional arguments to pass to the slide computation `.f`, for +#' example, `algo` or `na.rm` in data.table functions. You don't need to +#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider +#' functions). +#' @param .prefix Optional [`glue::glue`] format string; name the slide result +#' column(s) by attaching this prefix to the corresponding input column(s). +#' Some shorthand is supported for basing the output names on `.window_size` +#' or other arguments; see "Prefix and suffix shorthand" below. +#' @param .suffix Optional [`glue::glue`] format string; like `.prefix`. The +#' default naming behavior is equivalent to `.suffix = +#' "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"`. Can be used in combination +#' with `.prefix`. +#' @param .new_col_names Optional character vector with length matching the +#' number of input columns from `.col_names`; name the slide result column(s) +#' with these names. Cannot be used in combination with `.prefix` and/or +#' `.suffix`. +#' +#' @section Prefix and suffix shorthand: +#' +#' [`glue::glue`] format strings specially interpret content within curly +#' braces. E.g., `glue::glue("ABC{2 + 2}")` evaluates to `"ABC4"`. For `.prefix` +#' and `.suffix`, we provide `glue` with some additional variable bindings: +#' +#' - `{.n}` will be the number of time steps in the computation +#' corresponding to the `.window_size`. +#' - `{.time_unit_abbr}` will be a lower-case letter corresponding to the +#' `time_type` of `.x` +#' - `{.align_abbr}` will be `""` if `.align` is the default of `"right"`; +#' otherwise, it will be the first letter of `.align` +#' - `{.f_abbr}` will be a character vector containing a short abbreviation +#' for `.f` factoring in the input column type(s) for `.col_names` +#' +#' @importFrom dplyr mutate %>% arrange tibble select all_of +#' @importFrom rlang enquo expr_label caller_arg quo_get_env +#' @importFrom tidyselect eval_select +#' @importFrom glue glue +#' @importFrom purrr map map_lgl +#' @importFrom data.table frollmean frollsum frollapply +#' @importFrom lubridate as.period +#' @importFrom slider slide_sum slide_prod slide_mean slide_min slide_max slide_all slide_any +#' @export +#' @seealso [`epi_slide`] for the more general slide function +#' @examples +#' library(dplyr) +#' +#' # Add a column (`cases_7dsum`) containing a 7-day trailing sum on `cases`: +#' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% +#' epi_slide_sum(cases, .window_size = 7) +#' +#' # Add a column (`cases_rate_7dav`) containing a 7-day trailing average on `case_rate`: +#' covid_case_death_rates_extended %>% +#' epi_slide_mean(case_rate, .window_size = 7) +#' +#' # Use a less common specialized slide function: +#' cases_deaths_subset %>% +#' epi_slide_opt(cases, slider::slide_min, .window_size = 7) +#' +#' # Specify output column names and/or a naming scheme: +#' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% +#' group_by(geo_value) %>% +#' epi_slide_sum(cases, .window_size = 7, .new_col_names = "case_sum") %>% +#' ungroup() +#' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% +#' group_by(geo_value) %>% +#' epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") %>% +#' ungroup() +#' +#' # Additional settings can be sent to the {data.table} and {slider} functions +#' # via `...`. This example passes some arguments to `frollmean` settings for +#' # speed, accuracy, and to allow partially-missing windows: +#' covid_case_death_rates_extended %>% +#' epi_slide_mean( +#' case_rate, +#' .window_size = 7, +#' na.rm = TRUE, algo = "exact", hasNA = TRUE +#' ) +#' +#' # If the more specialized possibilities for `.f` don't cover your needs, you +#' # can use `epi_slide_opt` with `.f = data.table::frollapply` to apply a +#' # custom function at the cost of more computation time. See also `epi_slide` +#' # if you need something even more general. +#' cases_deaths_subset %>% +#' select(geo_value, time_value, case_rate_7d_av, death_rate_7d_av) %>% +#' epi_slide_opt(c(case_rate_7d_av, death_rate_7d_av), +#' data.table::frollapply, +#' FUN = median, .window_size = 28, +#' .suffix = "_{.n}{.time_unit_abbr}_median" +#' ) %>% +#' print(n = 40) +epi_slide_opt <- function( + .x, .col_names, .f, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, + .ref_time_values = NULL, .all_rows = FALSE) { + assert_class(.x, "epi_df") + + # Deprecated argument handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_opt: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", + class = "epiprocess__epi_slide_opt__new_name_not_supported" + ) + } + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `names_sep` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `dplyr::rename` after the slide.", + class = "epiprocess__epi_slide_opt__name_sep_not_supported" + ) + } + + assert_class(.x, "epi_df") + .x_orig_groups <- groups(.x) + if (inherits(.x, "grouped_df")) { + expected_group_keys <- .x %>% + key_colnames(exclude = "time_value") %>% + sort() + if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) { + cli_abort( + "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, + we'll temporarily group by {expected_group_keys} for this operation. You may need + to aggregate your data first; see sum_groups_epi_df().", + class = "epiprocess__epi_slide_opt__invalid_grouping" + ) + } + } else { + .x <- group_epi_df(.x, exclude = "time_value") + } + if (nrow(.x) == 0L) { + cli_abort( + c( + "input data `.x` unexpectedly has 0 rows", + "i" = "If this computation is occuring within an `epix_slide` call, + check that `epix_slide` `.versions` argument was set appropriately + so that you don't get any completely-empty snapshots" + ), + class = "epiprocess__epi_slide_opt__0_row_input", + epiprocess__x = .x + ) + } + + # Check for duplicated time values within groups + assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) + + # Validate/process .col_names, .f: + col_names_quo <- enquo(.col_names) + f_info <- upstream_slide_f_info(.f) + f_from_package <- f_info$from_package + + # Validate/process .ref_time_values: + user_provided_rtvs <- !is.null(.ref_time_values) + if (!user_provided_rtvs) { + .ref_time_values <- unique(.x$time_value) + } else { + assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (!test_subset(.ref_time_values, unique(.x$time_value))) { + cli_abort( + "`ref_time_values` must be a unique subset of the time values in `x`.", + class = "epiprocess__epi_slide_opt_invalid_ref_time_values" + ) + } + if (anyDuplicated(.ref_time_values) != 0L) { + cli_abort( + "`ref_time_values` must not contain any duplicates; use `unique` if appropriate.", + class = "epiprocess__epi_slide_opt_invalid_ref_time_values" + ) + } + } + ref_time_values <- sort(.ref_time_values) + + # Handle window arguments + .align <- rlang::arg_match(.align) + time_type <- attr(.x, "metadata")$time_type + if (is.null(.window_size)) { + cli_abort("epi_slide_opt: `.window_size` must be specified.") + } + validate_slide_window_arg(.window_size, time_type) + window_args <- get_before_after_from_window(.window_size, .align, time_type) + + # Handle output naming: + names_info <- across_ish_names_info(.x, time_type, col_names_quo, f_info$namer, .window_size, .align, .prefix, .suffix, .new_col_names) + input_col_names <- names_info$input_col_names + output_col_names <- names_info$output_col_names + + # Make a complete date sequence between min(.x$time_value) and max(.x$time_value). + date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) + all_dates <- date_seq_list$all_dates + pad_early_dates <- date_seq_list$pad_early_dates + pad_late_dates <- date_seq_list$pad_late_dates + + slide_one_grp <- function(.data_group, .group_key, ...) { + missing_times <- all_dates[!vec_in(all_dates, .data_group$time_value)] + # `frollmean` requires a full window to compute a result. Add NA values + # to beginning and end of the group so that we get results for the + # first `before` and last `after` elements. + .data_group <- vec_rbind( + .data_group, # (tibble; epi_slide_opt uses .keep = FALSE) + new_tibble(vec_recycle_common( + time_value = c(missing_times, pad_early_dates, pad_late_dates), + .real = FALSE + )) + ) %>% + `[`(vec_order(.$time_value), ) + + if (f_from_package == "data.table") { + # Grouping should ensure that we don't have duplicate time values. + # Completion above should ensure we have at least .window_size rows. Check + # that we don't have more than .window_size rows (or fewer somehow): + if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { + cli_abort( + c( + "group contains an unexpected number of rows", + "i" = c("Input data may contain `time_values` closer together than the + expected `time_step` size") + ), + class = "epiprocess__epi_slide_opt__unexpected_row_number", + epiprocess__data_group = .data_group, + epiprocess__group_key = .group_key + ) + } + + # `frollmean` is 1-indexed, so create a new window width based on our + # `before` and `after` params. Right-aligned `frollmean` results' + # `ref_time_value`s will be `after` timesteps ahead of where they should + # be; shift results to the left by `after` timesteps. + if (window_args$before != Inf) { + window_size <- window_args$before + window_args$after + 1L + roll_output <- .f(x = .data_group[, input_col_names], n = window_size, ...) + } else { + window_size <- list(seq_along(.data_group$time_value)) + roll_output <- .f(x = .data_group[, input_col_names], n = window_size, adaptive = TRUE, ...) + } + if (window_args$after >= 1) { + .data_group[, output_col_names] <- purrr::map(roll_output, function(.x) { + c(.x[(window_args$after + 1L):length(.x)], rep(NA, window_args$after)) + }) + } else { + .data_group[, output_col_names] <- roll_output + } + } + if (f_from_package == "slider") { + for (i in seq_along(input_col_names)) { + .data_group[, output_col_names[i]] <- .f( + x = .data_group[[input_col_names[i]]], + before = as.numeric(window_args$before), + after = as.numeric(window_args$after), + ... + ) + } + } + + .data_group + } + + result <- .x %>% + `[[<-`(".real", value = TRUE) %>% + group_modify(slide_one_grp, ..., .keep = FALSE) %>% + `[`(.$.real, names(.) != ".real") %>% + arrange_col_canonical() %>% + group_by(!!!.x_orig_groups) + + if (.all_rows) { + result[!vec_in(result$time_value, ref_time_values), output_col_names] <- NA + } else if (user_provided_rtvs) { + result <- result[vec_in(result$time_value, ref_time_values), ] + } + + if (!is_epi_df(result)) { + # `.all_rows` handling strips epi_df format and metadata. + # Restore them. + result <- reclass(result, attributes(.x)$metadata) + } + + return(result) +} + +#' @rdname epi_slide_opt +#' @description `epi_slide_mean` is a wrapper around `epi_slide_opt` with `.f = +#' data.table::frollmean`. +#' +#' @export +epi_slide_mean <- function( + .x, .col_names, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, + .ref_time_values = NULL, .all_rows = FALSE) { + # Deprecated argument handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_mean: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", + class = "epiprocess__epi_slide_opt__new_name_not_supported" + ) + } + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: the argument `names_sep` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } + + epi_slide_opt( + .x = .x, + .col_names = {{ .col_names }}, + .f = data.table::frollmean, + ..., + .window_size = .window_size, + .align = .align, + .prefix = .prefix, + .suffix = .suffix, + .new_col_names = .new_col_names, + .ref_time_values = .ref_time_values, + .all_rows = .all_rows + ) +} + +#' @rdname epi_slide_opt +#' @description `epi_slide_sum` is a wrapper around `epi_slide_opt` with `.f = +#' data.table::frollsum`. +#' +#' @export +epi_slide_sum <- function( + .x, .col_names, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, + .ref_time_values = NULL, .all_rows = FALSE) { + # Deprecated argument handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_sum: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", + class = "epiprocess__epi_slide_opt__new_name_not_supported" + ) + } + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `names_sep` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } + epi_slide_opt( + .x = .x, + .col_names = {{ .col_names }}, + .f = data.table::frollsum, + ..., + .window_size = .window_size, + .align = .align, + .prefix = .prefix, + .suffix = .suffix, + .new_col_names = .new_col_names, + .ref_time_values = .ref_time_values, + .all_rows = .all_rows + ) +} + +#' Make a complete date sequence between min(x$time_value) and max +#' (x$time_value). Produce lists of dates before min(x$time_value) and after +#' max(x$time_value) for padding initial and final windows to size `n`. +#' +#' `before` and `after` args are assumed to have been validated by the calling +#' function (using `validate_slide_window_arg`). +#' +#' @keywords internal +full_date_seq <- function(x, before, after, time_type) { + if (!time_type %in% c("day", "week", "yearmonth", "integer")) { + cli_abort( + "time_type must be one of 'day', 'week', or 'integer'." + ) + } + + pad_early_dates <- c() + pad_late_dates <- c() + + # `tsibble` time types have their own behavior, where adding 1 corresponds to + # incrementing by a quantum (smallest resolvable unit) of the date class. For + # example, one step = 1 quarter for `yearquarter`. + if (time_type %in% c("yearmonth", "integer")) { + all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) + + if (before != 0 && before != Inf) { + pad_early_dates <- all_dates[1L] - before:1 + } + if (after != 0) { + pad_late_dates <- all_dates[length(all_dates)] + 1:after + } + } else { + by <- switch(time_type, + day = "days", + week = "weeks", + ) + + all_dates <- seq(min(x$time_value), max(x$time_value), by = by) + if (before != 0 && before != Inf) { + # The behavior is analogous to the branch with tsibble types above. For + # more detail, note that the function `seq.Date(from, ..., length.out = + # n)` returns `from + 0:n`. Since we want `from + 1:n`, we drop the first + # element. Adding "-1" to the `by` arg makes `seq.Date` go backwards in + # time. + pad_early_dates <- sort(seq(all_dates[1L], by = paste("-1", by), length.out = before + 1)[-1]) + } + if (after != 0) { + pad_late_dates <- seq(all_dates[length(all_dates)], by = by, length.out = after + 1)[-1] + } + } + + list( + all_dates = all_dates, + pad_early_dates = pad_early_dates, + pad_late_dates = pad_late_dates + ) +} diff --git a/R/slide.R b/R/slide.R index 0da569deb..c05fbe98f 100644 --- a/R/slide.R +++ b/R/slide.R @@ -548,669 +548,3 @@ get_before_after_from_window <- function(window_size, align, time_type) { } list(before = before, after = after) } - - -#' Information about upstream (`{data.table}`/`{slider}`) slide functions -#' -#' Underlies [`upstream_slide_f_info`]. -#' -#' @keywords internal -upstream_slide_f_possibilities <- tibble::tribble( - ~f, ~package, ~namer, - frollmean, "data.table", ~ if (is.logical(.x)) "prop" else "av", - frollsum, "data.table", ~ if (is.logical(.x)) "count" else "sum", - frollapply, "data.table", ~"slide", - slide_sum, "slider", ~ if (is.logical(.x)) "count" else "sum", - slide_prod, "slider", ~"prod", - slide_mean, "slider", ~ if (is.logical(.x)) "prop" else "av", - slide_min, "slider", ~"min", - slide_max, "slider", ~"max", - slide_all, "slider", ~"all", - slide_any, "slider", ~"any", -) - -#' Validate & get information about an upstream slide function -#' -#' @param .f function such as `data.table::frollmean` or `slider::slide_mean`; -#' must appear in [`upstream_slide_f_possibilities`] -#' @return named list with two elements: `from_package`, a string containing the -#' upstream package name ("data.table" or "slider"), and `namer`, a function -#' that takes a column to call `.f` on and outputs a basic name or -#' abbreviation for what operation `.f` represents on that kind of column -#' (e.g., "sum", "av", "count"). -#' -#' @keywords internal -upstream_slide_f_info <- function(.f) { - assert_function(.f) - - # Check that slide function `.f` is one of those short-listed from - # `data.table` and `slider` (or a function that has the exact same definition, - # e.g. if the function has been reexported or defined locally). Extract some - # metadata. `namer` will be mapped over columns (.x will be a column, not the - # entire edf). - f_info_row <- upstream_slide_f_possibilities %>% - filter(map_lgl(.data$f, ~ identical(.f, .x))) - if (nrow(f_info_row) == 0L) { - # `f` is from somewhere else and not supported - cli_abort( - c( - "problem with {rlang::expr_label(rlang::caller_arg(f))}", - "i" = "`f` must be one of `data.table`'s rolling functions (`frollmean`, - `frollsum`, `frollapply`. See `?data.table::roll`) or one of - `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, - etc. See `?slider::\`summary-slide\`` for more options)." - ), - class = "epiprocess__epi_slide_opt__unsupported_slide_function", - epiprocess__f = .f - ) - } - if (nrow(f_info_row) > 1L) { - cli_abort('epiprocess internal error: looking up `.f` in table of possible - functions yielded multiple matches. Please report it using "New - issue" at https://github.com/cmu-delphi/epiprocess/issues, using - reprex::reprex to provide a minimal reproducible example.') - } - f_from_package <- f_info_row$package - list( - from_package = f_from_package, - namer = unwrap(f_info_row$namer) - ) -} - -#' Calculate input and output column names for an `{epiprocess}` [`dplyr::across`]-like operations -#' -#' @param .x data.frame to perform input column tidyselection on -#' @param time_type as in [`new_epi_df`] -#' @param col_names_quo enquosed input column tidyselect expression -#' @param .f_namer function taking an input column object and outputting a name -#' for a corresponding output column; see [`upstream_slide_f_info`] -#' @param .window_size as in [`epi_slide_opt`] -#' @param .align as in [`epi_slide_opt`] -#' @param .prefix as in [`epi_slide_opt`] -#' @param .suffix as in [`epi_slide_opt`] -#' @param .new_col_names as in [`epi_slide_opt`] -#' @return named list with two elements: `input_col_names`, chr, subset of -#' `names(.x)`; and `output_colnames`, chr, same length as `input_col_names` -#' -#' @keywords internal -across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, .window_size, .align, .prefix, .suffix, .new_col_names) { - # The position of a given column can be differ between input `.x` and - # `.data_group` since the grouping step by default drops grouping columns. - # To avoid rerunning `eval_select` for every `.data_group`, convert - # positions of user-provided `col_names` into string column names. We avoid - # using `names(pos)` directly for robustness and in case we later want to - # allow users to rename fields via tidyselection. - pos <- eval_select(col_names_quo, data = .x, allow_rename = FALSE) - input_col_names <- names(.x)[pos] - - # Handle output naming - if ((!is.null(.prefix) || !is.null(.suffix)) && !is.null(.new_col_names)) { - cli_abort( - "Can't use both .prefix/.suffix and .new_col_names at the same time.", - class = "epiprocess__epi_slide_opt_incompatible_naming_args" - ) - } - assert_string(.prefix, null.ok = TRUE) - assert_string(.suffix, null.ok = TRUE) - assert_character(.new_col_names, len = length(input_col_names), null.ok = TRUE) - if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) { - .suffix <- "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}" - # ^ does not account for any arguments specified to underlying functions via - # `...` such as `na.rm =`, nor does it distinguish between functions from - # different packages accomplishing the same type of computation. Those are - # probably only set one way per task, so this probably produces cleaner - # names without clashes (though maybe some confusion if switching between - # code with different settings). - } - if (!is.null(.prefix) || !is.null(.suffix)) { - .prefix <- .prefix %||% "" - .suffix <- .suffix %||% "" - if (identical(.window_size, Inf)) { - n <- "running_" - time_unit_abbr <- "" - align_abbr <- "" - } else { - n <- time_delta_to_n_steps(.window_size, time_type) - time_unit_abbr <- time_type_unit_abbr(time_type) - align_abbr <- c(right = "", center = "c", left = "l")[[.align]] - } - glue_env <- rlang::env( - .n = n, - .time_unit_abbr = time_unit_abbr, - .align_abbr = align_abbr, - .f_abbr = purrr::map_chr(.x[, c(input_col_names)], .f_namer), # compat between DT and tbl selection - quo_get_env(col_names_quo) - ) - .new_col_names <- unclass( - glue(.prefix, .envir = glue_env) + - input_col_names + - glue(.suffix, .envir = glue_env) - ) - } else { - # `.new_col_names` was provided by user; we don't need to do anything. - } - if (any(.new_col_names %in% names(.x))) { - cli_abort(c( - "Naming conflict between new columns and existing columns", - "x" = "Overlapping names: {format_varnames(intersect(.new_col_names, names(.x)))}" - ), class = "epiprocess__epi_slide_opt_old_new_name_conflict") - } - if (anyDuplicated(.new_col_names)) { - cli_abort(c( - "New column names contain duplicates", - "x" = "Duplicated names: {format_varnames(unique(.new_col_names[duplicated(.new_col_names)]))}" - ), class = "epiprocess__epi_slide_opt_new_name_duplicated") - } - output_col_names <- .new_col_names - - return(list( - input_col_names = input_col_names, - output_col_names = output_col_names - )) -} - -#' Optimized slide functions for common cases -#' -#' @description `epi_slide_opt` allows sliding an n-timestep [data.table::froll] -#' or [slider::summary-slide] function over variables in an `epi_df` object. -#' These functions tend to be much faster than `epi_slide()`. See -#' `vignette("epi_df")` for more examples. -#' -#' @template basic-slide-params -#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column -#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), -#' [other tidy-select expression][tidyselect::language], or a vector of -#' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if -#' they were positions in the data frame, so expressions like `x:y` can be -#' used to select a range of variables. -#' -#' The tidy-selection renaming interface is not supported, and cannot be used -#' to provide output column names; if you want to customize the output column -#' names, use [`dplyr::rename`] after the slide. -#' @param .f Function; together with `...` specifies the computation to slide. -#' `.f` must be one of `data.table`'s rolling functions -#' (`frollmean`, `frollsum`, `frollapply`. See [data.table::roll]) or one -#' of `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, -#' etc. See [slider::summary-slide]). -#' -#' The optimized `data.table` and `slider` functions can't be directly passed -#' as the computation function in `epi_slide` without careful handling to make -#' sure each computation group is made up of the `.window_size` dates rather -#' than `.window_size` points. `epi_slide_opt` (and wrapper functions -#' `epi_slide_mean` and `epi_slide_sum`) take care of window completion -#' automatically to prevent associated errors. -#' @param ... Additional arguments to pass to the slide computation `.f`, for -#' example, `algo` or `na.rm` in data.table functions. You don't need to -#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider -#' functions). -#' @param .prefix Optional [`glue::glue`] format string; name the slide result -#' column(s) by attaching this prefix to the corresponding input column(s). -#' Some shorthand is supported for basing the output names on `.window_size` -#' or other arguments; see "Prefix and suffix shorthand" below. -#' @param .suffix Optional [`glue::glue`] format string; like `.prefix`. The -#' default naming behavior is equivalent to `.suffix = -#' "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"`. Can be used in combination -#' with `.prefix`. -#' @param .new_col_names Optional character vector with length matching the -#' number of input columns from `.col_names`; name the slide result column(s) -#' with these names. Cannot be used in combination with `.prefix` and/or -#' `.suffix`. -#' -#' @section Prefix and suffix shorthand: -#' -#' [`glue::glue`] format strings specially interpret content within curly -#' braces. E.g., `glue::glue("ABC{2 + 2}")` evaluates to `"ABC4"`. For `.prefix` -#' and `.suffix`, we provide `glue` with some additional variable bindings: -#' -#' - `{.n}` will be the number of time steps in the computation -#' corresponding to the `.window_size`. -#' - `{.time_unit_abbr}` will be a lower-case letter corresponding to the -#' `time_type` of `.x` -#' - `{.align_abbr}` will be `""` if `.align` is the default of `"right"`; -#' otherwise, it will be the first letter of `.align` -#' - `{.f_abbr}` will be a character vector containing a short abbreviation -#' for `.f` factoring in the input column type(s) for `.col_names` -#' -#' @importFrom dplyr mutate %>% arrange tibble select all_of -#' @importFrom rlang enquo expr_label caller_arg quo_get_env -#' @importFrom tidyselect eval_select -#' @importFrom glue glue -#' @importFrom purrr map map_lgl -#' @importFrom data.table frollmean frollsum frollapply -#' @importFrom lubridate as.period -#' @importFrom slider slide_sum slide_prod slide_mean slide_min slide_max slide_all slide_any -#' @export -#' @seealso [`epi_slide`] for the more general slide function -#' @examples -#' library(dplyr) -#' -#' # Add a column (`cases_7dsum`) containing a 7-day trailing sum on `cases`: -#' cases_deaths_subset %>% -#' select(geo_value, time_value, cases) %>% -#' epi_slide_sum(cases, .window_size = 7) -#' -#' # Add a column (`cases_rate_7dav`) containing a 7-day trailing average on `case_rate`: -#' covid_case_death_rates_extended %>% -#' epi_slide_mean(case_rate, .window_size = 7) -#' -#' # Use a less common specialized slide function: -#' cases_deaths_subset %>% -#' epi_slide_opt(cases, slider::slide_min, .window_size = 7) -#' -#' # Specify output column names and/or a naming scheme: -#' cases_deaths_subset %>% -#' select(geo_value, time_value, cases) %>% -#' group_by(geo_value) %>% -#' epi_slide_sum(cases, .window_size = 7, .new_col_names = "case_sum") %>% -#' ungroup() -#' cases_deaths_subset %>% -#' select(geo_value, time_value, cases) %>% -#' group_by(geo_value) %>% -#' epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") %>% -#' ungroup() -#' -#' # Additional settings can be sent to the {data.table} and {slider} functions -#' # via `...`. This example passes some arguments to `frollmean` settings for -#' # speed, accuracy, and to allow partially-missing windows: -#' covid_case_death_rates_extended %>% -#' epi_slide_mean( -#' case_rate, -#' .window_size = 7, -#' na.rm = TRUE, algo = "exact", hasNA = TRUE -#' ) -#' -#' # If the more specialized possibilities for `.f` don't cover your needs, you -#' # can use `epi_slide_opt` with `.f = data.table::frollapply` to apply a -#' # custom function at the cost of more computation time. See also `epi_slide` -#' # if you need something even more general. -#' cases_deaths_subset %>% -#' select(geo_value, time_value, case_rate_7d_av, death_rate_7d_av) %>% -#' epi_slide_opt(c(case_rate_7d_av, death_rate_7d_av), -#' data.table::frollapply, -#' FUN = median, .window_size = 28, -#' .suffix = "_{.n}{.time_unit_abbr}_median" -#' ) %>% -#' print(n = 40) -epi_slide_opt <- function( - .x, .col_names, .f, ..., - .window_size = NULL, .align = c("right", "center", "left"), - .prefix = NULL, .suffix = NULL, .new_col_names = NULL, - .ref_time_values = NULL, .all_rows = FALSE) { - assert_class(.x, "epi_df") - - # Deprecated argument handling - provided_args <- rlang::call_args_names(rlang::call_match()) - if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { - cli::cli_abort( - "epi_slide_opt: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, - or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, - `.ref_time_values`, `.all_rows`." - ) - } - if ("as_list_col" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. - If TRUE, have your given computation wrap its result using `list(result)` instead." - ) - } - if ("before" %in% provided_args || "after" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. - See the slide documentation for more details." - ) - } - if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize - the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", - class = "epiprocess__epi_slide_opt__new_name_not_supported" - ) - } - if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: the argument `names_sep` is not supported for `epi_slide_opt`. If you want to customize - the output column names, use `dplyr::rename` after the slide.", - class = "epiprocess__epi_slide_opt__name_sep_not_supported" - ) - } - - assert_class(.x, "epi_df") - .x_orig_groups <- groups(.x) - if (inherits(.x, "grouped_df")) { - expected_group_keys <- .x %>% - key_colnames(exclude = "time_value") %>% - sort() - if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) { - cli_abort( - "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, - we'll temporarily group by {expected_group_keys} for this operation. You may need - to aggregate your data first; see sum_groups_epi_df().", - class = "epiprocess__epi_slide_opt__invalid_grouping" - ) - } - } else { - .x <- group_epi_df(.x, exclude = "time_value") - } - if (nrow(.x) == 0L) { - cli_abort( - c( - "input data `.x` unexpectedly has 0 rows", - "i" = "If this computation is occuring within an `epix_slide` call, - check that `epix_slide` `.versions` argument was set appropriately - so that you don't get any completely-empty snapshots" - ), - class = "epiprocess__epi_slide_opt__0_row_input", - epiprocess__x = .x - ) - } - - # Check for duplicated time values within groups - assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) - - # Validate/process .col_names, .f: - col_names_quo <- enquo(.col_names) - f_info <- upstream_slide_f_info(.f) - f_from_package <- f_info$from_package - - # Validate/process .ref_time_values: - user_provided_rtvs <- !is.null(.ref_time_values) - if (!user_provided_rtvs) { - .ref_time_values <- unique(.x$time_value) - } else { - assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (!test_subset(.ref_time_values, unique(.x$time_value))) { - cli_abort( - "`ref_time_values` must be a unique subset of the time values in `x`.", - class = "epiprocess__epi_slide_opt_invalid_ref_time_values" - ) - } - if (anyDuplicated(.ref_time_values) != 0L) { - cli_abort( - "`ref_time_values` must not contain any duplicates; use `unique` if appropriate.", - class = "epiprocess__epi_slide_opt_invalid_ref_time_values" - ) - } - } - ref_time_values <- sort(.ref_time_values) - - # Handle window arguments - .align <- rlang::arg_match(.align) - time_type <- attr(.x, "metadata")$time_type - if (is.null(.window_size)) { - cli_abort("epi_slide_opt: `.window_size` must be specified.") - } - validate_slide_window_arg(.window_size, time_type) - window_args <- get_before_after_from_window(.window_size, .align, time_type) - - # Handle output naming: - names_info <- across_ish_names_info(.x, time_type, col_names_quo, f_info$namer, .window_size, .align, .prefix, .suffix, .new_col_names) - input_col_names <- names_info$input_col_names - output_col_names <- names_info$output_col_names - - # Make a complete date sequence between min(.x$time_value) and max(.x$time_value). - date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) - all_dates <- date_seq_list$all_dates - pad_early_dates <- date_seq_list$pad_early_dates - pad_late_dates <- date_seq_list$pad_late_dates - - slide_one_grp <- function(.data_group, .group_key, ...) { - missing_times <- all_dates[!vec_in(all_dates, .data_group$time_value)] - # `frollmean` requires a full window to compute a result. Add NA values - # to beginning and end of the group so that we get results for the - # first `before` and last `after` elements. - .data_group <- vec_rbind( - .data_group, # (tibble; epi_slide_opt uses .keep = FALSE) - new_tibble(vec_recycle_common( - time_value = c(missing_times, pad_early_dates, pad_late_dates), - .real = FALSE - )) - ) %>% - `[`(vec_order(.$time_value), ) - - if (f_from_package == "data.table") { - # Grouping should ensure that we don't have duplicate time values. - # Completion above should ensure we have at least .window_size rows. Check - # that we don't have more than .window_size rows (or fewer somehow): - if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { - cli_abort( - c( - "group contains an unexpected number of rows", - "i" = c("Input data may contain `time_values` closer together than the - expected `time_step` size") - ), - class = "epiprocess__epi_slide_opt__unexpected_row_number", - epiprocess__data_group = .data_group, - epiprocess__group_key = .group_key - ) - } - - # `frollmean` is 1-indexed, so create a new window width based on our - # `before` and `after` params. Right-aligned `frollmean` results' - # `ref_time_value`s will be `after` timesteps ahead of where they should - # be; shift results to the left by `after` timesteps. - if (window_args$before != Inf) { - window_size <- window_args$before + window_args$after + 1L - roll_output <- .f(x = .data_group[, input_col_names], n = window_size, ...) - } else { - window_size <- list(seq_along(.data_group$time_value)) - roll_output <- .f(x = .data_group[, input_col_names], n = window_size, adaptive = TRUE, ...) - } - if (window_args$after >= 1) { - .data_group[, output_col_names] <- purrr::map(roll_output, function(.x) { - c(.x[(window_args$after + 1L):length(.x)], rep(NA, window_args$after)) - }) - } else { - .data_group[, output_col_names] <- roll_output - } - } - if (f_from_package == "slider") { - for (i in seq_along(input_col_names)) { - .data_group[, output_col_names[i]] <- .f( - x = .data_group[[input_col_names[i]]], - before = as.numeric(window_args$before), - after = as.numeric(window_args$after), - ... - ) - } - } - - .data_group - } - - result <- .x %>% - `[[<-`(".real", value = TRUE) %>% - group_modify(slide_one_grp, ..., .keep = FALSE) %>% - `[`(.$.real, names(.) != ".real") %>% - arrange_col_canonical() %>% - group_by(!!!.x_orig_groups) - - if (.all_rows) { - result[!vec_in(result$time_value, ref_time_values), output_col_names] <- NA - } else if (user_provided_rtvs) { - result <- result[vec_in(result$time_value, ref_time_values), ] - } - - if (!is_epi_df(result)) { - # `.all_rows` handling strips epi_df format and metadata. - # Restore them. - result <- reclass(result, attributes(.x)$metadata) - } - - return(result) -} - -#' @rdname epi_slide_opt -#' @description `epi_slide_mean` is a wrapper around `epi_slide_opt` with `.f = -#' data.table::frollmean`. -#' -#' @export -epi_slide_mean <- function( - .x, .col_names, ..., - .window_size = NULL, .align = c("right", "center", "left"), - .prefix = NULL, .suffix = NULL, .new_col_names = NULL, - .ref_time_values = NULL, .all_rows = FALSE) { - # Deprecated argument handling - provided_args <- rlang::call_args_names(rlang::call_match()) - if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { - cli::cli_abort( - "epi_slide_mean: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, - or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, - `.ref_time_values`, `.all_rows`." - ) - } - if ("as_list_col" %in% provided_args) { - cli::cli_abort( - "epi_slide_mean: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. - If TRUE, have your given computation wrap its result using `list(result)` instead." - ) - } - if ("before" %in% provided_args || "after" %in% provided_args) { - cli::cli_abort( - "epi_slide_mean: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. - See the slide documentation for more details." - ) - } - if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize - the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", - class = "epiprocess__epi_slide_opt__new_name_not_supported" - ) - } - if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { - cli::cli_abort( - "epi_slide_mean: the argument `names_sep` is not supported. If you want to customize - the output column names, use `dplyr::rename` after the slide." - ) - } - - epi_slide_opt( - .x = .x, - .col_names = {{ .col_names }}, - .f = data.table::frollmean, - ..., - .window_size = .window_size, - .align = .align, - .prefix = .prefix, - .suffix = .suffix, - .new_col_names = .new_col_names, - .ref_time_values = .ref_time_values, - .all_rows = .all_rows - ) -} - -#' @rdname epi_slide_opt -#' @description `epi_slide_sum` is a wrapper around `epi_slide_opt` with `.f = -#' data.table::frollsum`. -#' -#' @export -epi_slide_sum <- function( - .x, .col_names, ..., - .window_size = NULL, .align = c("right", "center", "left"), - .prefix = NULL, .suffix = NULL, .new_col_names = NULL, - .ref_time_values = NULL, .all_rows = FALSE) { - # Deprecated argument handling - provided_args <- rlang::call_args_names(rlang::call_match()) - if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { - cli::cli_abort( - "epi_slide_sum: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, - or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, - `.ref_time_values`, `.all_rows`." - ) - } - if ("as_list_col" %in% provided_args) { - cli::cli_abort( - "epi_slide_sum: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. - If TRUE, have your given computation wrap its result using `list(result)` instead." - ) - } - if ("before" %in% provided_args || "after" %in% provided_args) { - cli::cli_abort( - "epi_slide_sum: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. - See the slide documentation for more details." - ) - } - if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize - the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", - class = "epiprocess__epi_slide_opt__new_name_not_supported" - ) - } - if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { - cli::cli_abort( - "epi_slide_sum: the argument `names_sep` is not supported. If you want to customize - the output column names, use `dplyr::rename` after the slide." - ) - } - epi_slide_opt( - .x = .x, - .col_names = {{ .col_names }}, - .f = data.table::frollsum, - ..., - .window_size = .window_size, - .align = .align, - .prefix = .prefix, - .suffix = .suffix, - .new_col_names = .new_col_names, - .ref_time_values = .ref_time_values, - .all_rows = .all_rows - ) -} - -#' Make a complete date sequence between min(x$time_value) and max -#' (x$time_value). Produce lists of dates before min(x$time_value) and after -#' max(x$time_value) for padding initial and final windows to size `n`. -#' -#' `before` and `after` args are assumed to have been validated by the calling -#' function (using `validate_slide_window_arg`). -#' -#' @keywords internal -full_date_seq <- function(x, before, after, time_type) { - if (!time_type %in% c("day", "week", "yearmonth", "integer")) { - cli_abort( - "time_type must be one of 'day', 'week', or 'integer'." - ) - } - - pad_early_dates <- c() - pad_late_dates <- c() - - # `tsibble` time types have their own behavior, where adding 1 corresponds to - # incrementing by a quantum (smallest resolvable unit) of the date class. For - # example, one step = 1 quarter for `yearquarter`. - if (time_type %in% c("yearmonth", "integer")) { - all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) - - if (before != 0 && before != Inf) { - pad_early_dates <- all_dates[1L] - before:1 - } - if (after != 0) { - pad_late_dates <- all_dates[length(all_dates)] + 1:after - } - } else { - by <- switch(time_type, - day = "days", - week = "weeks", - ) - - all_dates <- seq(min(x$time_value), max(x$time_value), by = by) - if (before != 0 && before != Inf) { - # The behavior is analogous to the branch with tsibble types above. For - # more detail, note that the function `seq.Date(from, ..., length.out = - # n)` returns `from + 0:n`. Since we want `from + 1:n`, we drop the first - # element. Adding "-1" to the `by` arg makes `seq.Date` go backwards in - # time. - pad_early_dates <- sort(seq(all_dates[1L], by = paste("-1", by), length.out = before + 1)[-1]) - } - if (after != 0) { - pad_late_dates <- seq(all_dates[length(all_dates)], by = by, length.out = after + 1)[-1] - } - } - - list( - all_dates = all_dates, - pad_early_dates = pad_early_dates, - pad_late_dates = pad_late_dates - ) -} diff --git a/man/across_ish_names_info.Rd b/man/across_ish_names_info.Rd index 70eb40c4f..c993c2bf5 100644 --- a/man/across_ish_names_info.Rd +++ b/man/across_ish_names_info.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/slide.R +% Please edit documentation in R/epi_slide_opt_edf.R \name{across_ish_names_info} \alias{across_ish_names_info} \title{Calculate input and output column names for an \code{{epiprocess}} \code{\link[dplyr:across]{dplyr::across}}-like operations} diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 4b75e9ffb..946e7c981 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/slide.R +% Please edit documentation in R/epi_slide_opt_edf.R \name{epi_slide_opt} \alias{epi_slide_opt} \alias{epi_slide_mean} diff --git a/man/epix_epi_slide_opt_one_epikey.Rd b/man/epix_epi_slide_opt_one_epikey.Rd index e8321616e..9b3650f42 100644 --- a/man/epix_epi_slide_opt_one_epikey.Rd +++ b/man/epix_epi_slide_opt_one_epikey.Rd @@ -64,7 +64,7 @@ updates <- bind_rows( version = 13, time_value = 7, value = NA, ) ) \%>\% - mutate(across(c(version, time_value), ~as.Date("2020-01-01") - 1 + .x)) \%>\% + mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) \%>\% tidyr::nest(.by = version, .key = "subtbl") updates \%>\% diff --git a/man/full_date_seq.Rd b/man/full_date_seq.Rd index eb36b2c17..1a29c5e92 100644 --- a/man/full_date_seq.Rd +++ b/man/full_date_seq.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/slide.R +% Please edit documentation in R/epi_slide_opt_edf.R \name{full_date_seq} \alias{full_date_seq} \title{Make a complete date sequence between min(x$time_value) and max diff --git a/man/upstream_slide_f_info.Rd b/man/upstream_slide_f_info.Rd index b0e928c4d..787790ca6 100644 --- a/man/upstream_slide_f_info.Rd +++ b/man/upstream_slide_f_info.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/slide.R +% Please edit documentation in R/epi_slide_opt_edf.R \name{upstream_slide_f_info} \alias{upstream_slide_f_info} \title{Validate & get information about an upstream slide function} diff --git a/man/upstream_slide_f_possibilities.Rd b/man/upstream_slide_f_possibilities.Rd index 9ba949ef1..0fae2cc4f 100644 --- a/man/upstream_slide_f_possibilities.Rd +++ b/man/upstream_slide_f_possibilities.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/slide.R +% Please edit documentation in R/epi_slide_opt_edf.R \docType{data} \name{upstream_slide_f_possibilities} \alias{upstream_slide_f_possibilities} From e9b96f12f02ee718b918029e26afff4f64aed5f8 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 6 Mar 2025 17:27:44 -0800 Subject: [PATCH 024/107] Actually turn epi_slide_opt into S3 method --- DESCRIPTION | 2 +- NAMESPACE | 6 +-- ...pi_slide_opt.R => epi_slide_opt_archive.R} | 39 ++++++++----------- R/epi_slide_opt_edf.R | 36 ++++++++++++++--- man/epi_slide_opt.Rd | 25 ++++++++++-- ...Rd => epi_slide_opt_archive_one_epikey.Rd} | 14 +++---- 6 files changed, 80 insertions(+), 42 deletions(-) rename R/{epix_epi_slide_opt.R => epi_slide_opt_archive.R} (87%) rename man/{epix_epi_slide_opt_one_epikey.Rd => epi_slide_opt_archive_one_epikey.Rd} (80%) diff --git a/DESCRIPTION b/DESCRIPTION index 1732bbc97..a746b53eb 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -98,9 +98,9 @@ Collate: 'correlation.R' 'epi_df.R' 'epi_df_forbidden_methods.R' + 'epi_slide_opt_archive.R' 'epi_slide_opt_edf.R' 'epiprocess-package.R' - 'epix_epi_slide_opt.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' 'grouped_epi_archive.R' diff --git a/NAMESPACE b/NAMESPACE index e1b111c0a..49b5a3acf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,8 +27,9 @@ S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) -S3method(epix_epi_slide_opt,epi_archive) -S3method(epix_epi_slide_opt,grouped_epi_archive) +S3method(epi_slide_opt,epi_archive) +S3method(epi_slide_opt,epi_df) +S3method(epi_slide_opt,grouped_epi_archive) S3method(epix_slide,epi_archive) S3method(epix_slide,grouped_epi_archive) S3method(epix_truncate_versions_after,epi_archive) @@ -75,7 +76,6 @@ export(epi_slide_mean) export(epi_slide_opt) export(epi_slide_sum) export(epix_as_of) -export(epix_epi_slide_opt) export(epix_fill_through_version) export(epix_merge) export(epix_slide) diff --git a/R/epix_epi_slide_opt.R b/R/epi_slide_opt_archive.R similarity index 87% rename from R/epix_epi_slide_opt.R rename to R/epi_slide_opt_archive.R index 5e446cdca..bf90313c2 100644 --- a/R/epix_epi_slide_opt.R +++ b/R/epi_slide_opt_archive.R @@ -1,4 +1,4 @@ -#' Core operation of `epix_epi_slide_opt` for a single epikey's history +#' Core operation of `epi_slide_opt.epi_archive` for a single epikey's history #' #' @param updates tibble with two columns: `version` and `subtbl`; `subtbl` is a #' list of tibbles, each with a `time_value` column and measurement columns. @@ -41,10 +41,10 @@ #' tidyr::nest(.by = version, .key = "subtbl") #' #' updates %>% -#' epix_epi_slide_opt_one_epikey("value", data.table::frollmean, "data.table", 1L, 0L, "day", "slide_value") +#' epi_slide_opt_one_epikey("value", data.table::frollmean, "data.table", 1L, 0L, "day", "slide_value") #' #' @keywords internal -epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_from_package, before, after, time_type, out_colnames) { +epi_slide_opt_archive_one_epikey <- function(updates, in_colnames, f_dots_baked, f_from_package, before, after, time_type, out_colnames) { # TODO check for col name clobbering unit_step <- epiprocess:::unit_time_delta(time_type) prev_inp_snapshot <- NULL @@ -124,21 +124,9 @@ epix_epi_slide_opt_one_epikey <- function(updates, in_colnames, f_dots_baked, f_ result } -# TODO just make this an epi_slide_opt impl? - +#' @method epi_slide_opt grouped_epi_archive #' @export -epix_epi_slide_opt <- - function(.x, .col_names, .f, ..., - .window_size = NULL, .align = c("right", "center", "left"), - .prefix = NULL, .suffix = NULL, .new_col_names = NULL # , - ## .ref_time_values = NULL, .all_rows = FALSE - ) { - UseMethod("epix_epi_slide_opt") - } - -#' @method epix_epi_slide_opt grouped_epi_archive -#' @export -epix_epi_slide_opt.grouped_epi_archive <- function(.x, ...) { +epi_slide_opt.grouped_epi_archive <- function(.x, ...) { assert_set_equal( group_vars(.x), key_colnames(.x, exclude = c("time_value", "version")) @@ -147,16 +135,17 @@ epix_epi_slide_opt.grouped_epi_archive <- function(.x, ...) { orig_drop <- .x$private$drop .x %>% ungroup() %>% - epix_epi_slide_opt(...) %>% + epi_slide_opt(...) %>% group_by(pick(all_of(orig_group_vars)), .drop = orig_drop) } -#' @method epix_epi_slide_opt epi_archive + +#' @method epi_slide_opt epi_archive #' @export -epix_epi_slide_opt.epi_archive <- +epi_slide_opt.epi_archive <- function(.x, .col_names, .f, ..., .window_size = NULL, .align = c("right", "center", "left"), .prefix = NULL, .suffix = NULL, .new_col_names = NULL, - ## , .ref_time_values = NULL, .all_rows = FALSE + .ref_time_values = NULL, .all_rows = FALSE, .progress = FALSE) { # Extract metadata: time_type <- .x$time_type @@ -175,6 +164,12 @@ epix_epi_slide_opt.epi_archive <- col_names_quo <- enquo(.col_names) names_info <- across_ish_names_info(.x$DT, time_type, col_names_quo, .f_info$namer, .window_size, .align, .prefix, .suffix, .new_col_names) window_args <- get_before_after_from_window(.window_size, .align, time_type) + if (!is.null(.ref_time_values)) { + cli_abort("epi_slide.epi_archive does not support the `.ref_time_values` argument") + } + if (!identical(.all_rows, FALSE)) { + cli_abort("epi_slide.epi_archive does not support the `.all_rows` argument") + } assert( checkmate::check_logical(.progress, any.missing = FALSE, len = 1L, names = "unnamed"), checkmate::check_string(.progress) @@ -196,7 +191,7 @@ epix_epi_slide_opt.epi_archive <- nest(.by = version, .key = "subtbl") %>% arrange(version) # TODO move nesting inside the helper? - res <- epix_epi_slide_opt_one_epikey(group_updates, names_info$input_col_names, .f_dots_baked, .f_info$from_package, window_args$before, window_args$after, time_type, names_info$output_col_names) %>% + res <- epi_slide_opt_archive_one_epikey(group_updates, names_info$input_col_names, .f_dots_baked, .f_info$from_package, window_args$before, window_args$after, time_type, names_info$output_col_names) %>% list_rbind() if (use_progress) cli::cli_progress_update(id = progress_bar_id) res diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 44eaa1470..9dd7190c8 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -159,10 +159,23 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, .windo #' Optimized slide functions for common cases #' -#' @description `epi_slide_opt` allows sliding an n-timestep [data.table::froll] -#' or [slider::summary-slide] function over variables in an `epi_df` object. -#' These functions tend to be much faster than `epi_slide()`. See -#' `vignette("epi_df")` for more examples. +#' @description +#' +#' `epi_slide_opt` calculates n-time-step rolling means&sums, +#' cumulative/"running" means&sums, or other operations supported by +#' [`data.table::froll`] or [`slider::summary-slide`] functions. +#' +#' * On `epi_df`s, it will take care of looping over `geo_value`s, temporarily +#' filling in time gaps with `NA`s and other work needed to ensure there are +#' exactly n consecutive time steps per computation, and has some other +#' convenience features. See `vignette("epi_df")` for more examples. +#' +#' * On `epi_archive`s, it will calculate the version history for these slide +#' computations and combine it with the version history for the rest of the +#' columns. +#' +#' This function tends to be much faster than using `epi_slide()` and +#' `epix_slide()` directly. #' #' @template basic-slide-params #' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column @@ -279,13 +292,26 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, .windo #' .suffix = "_{.n}{.time_unit_abbr}_median" #' ) %>% #' print(n = 40) +#' +#' # You can calculate entire version histories for the derived signals by +#' # calling `epi_slide_opt()` on an `epi_archive`: +#' case_death_rate_archive %>% +#' epi_slide_mean(case_rate, .window_size = 14) +#' +#' @export epi_slide_opt <- function( .x, .col_names, .f, ..., .window_size = NULL, .align = c("right", "center", "left"), .prefix = NULL, .suffix = NULL, .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE) { - assert_class(.x, "epi_df") + UseMethod("epi_slide_opt") +} +#' @export +epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, + .ref_time_values = NULL, .all_rows = FALSE) { # Deprecated argument handling provided_args <- rlang::call_args_names(rlang::call_match()) if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 946e7c981..1d5999578 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -134,10 +134,21 @@ added. It will be ungrouped if \code{.x} was ungrouped, and have the same groups as \code{.x} if \code{.x} was grouped. } \description{ -\code{epi_slide_opt} allows sliding an n-timestep \link[data.table:froll]{data.table::froll} -or \link[slider:summary-slide]{slider::summary-slide} function over variables in an \code{epi_df} object. -These functions tend to be much faster than \code{epi_slide()}. See -\code{vignette("epi_df")} for more examples. +\code{epi_slide_opt} calculates n-time-step rolling means&sums, +cumulative/"running" means&sums, or other operations supported by +\code{\link[data.table:froll]{data.table::froll}} or \code{\link[slider:summary-slide]{slider::summary-slide}} functions. +\itemize{ +\item On \code{epi_df}s, it will take care of looping over \code{geo_value}s, temporarily +filling in time gaps with \code{NA}s and other work needed to ensure there are +exactly n consecutive time steps per computation, and has some other +convenience features. See \code{vignette("epi_df")} for more examples. +\item On \code{epi_archive}s, it will calculate the version history for these slide +computations and combine it with the version history for the rest of the +columns. +} + +This function tends to be much faster than using \code{epi_slide()} and +\code{epix_slide()} directly. \code{epi_slide_mean} is a wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollmean}. @@ -211,6 +222,12 @@ cases_deaths_subset \%>\% .suffix = "_{.n}{.time_unit_abbr}_median" ) \%>\% print(n = 40) + +# You can calculate entire version histories for the derived signals by +# calling `epi_slide_opt()` on an `epi_archive`: +case_death_rate_archive \%>\% + epi_slide_mean(case_rate, .window_size = 14) + } \seealso{ \code{\link{epi_slide}} for the more general slide function diff --git a/man/epix_epi_slide_opt_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd similarity index 80% rename from man/epix_epi_slide_opt_one_epikey.Rd rename to man/epi_slide_opt_archive_one_epikey.Rd index 9b3650f42..95f97909f 100644 --- a/man/epix_epi_slide_opt_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epix_epi_slide_opt.R -\name{epix_epi_slide_opt_one_epikey} -\alias{epix_epi_slide_opt_one_epikey} -\title{Core operation of \code{epix_epi_slide_opt} for a single epikey's history} +% Please edit documentation in R/epi_slide_opt_archive.R +\name{epi_slide_opt_archive_one_epikey} +\alias{epi_slide_opt_archive_one_epikey} +\title{Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history} \usage{ -epix_epi_slide_opt_one_epikey( +epi_slide_opt_archive_one_epikey( updates, in_colnames, f_dots_baked, @@ -45,7 +45,7 @@ for results} list of tibbles with same names as \code{subtbl}s plus: \code{c(out_colnames, "version")}; (compactified) diff data to put into an \code{epi_archive} } \description{ -Core operation of \code{epix_epi_slide_opt} for a single epikey's history +Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history } \examples{ @@ -68,7 +68,7 @@ updates <- bind_rows( tidyr::nest(.by = version, .key = "subtbl") updates \%>\% - epix_epi_slide_opt_one_epikey("value", data.table::frollmean, "data.table", 1L, 0L, "day", "slide_value") + epi_slide_opt_one_epikey("value", data.table::frollmean, "data.table", 1L, 0L, "day", "slide_value") } \keyword{internal} From ddbd05d571a598ac685147d66b2d0976fc4748b6 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Mar 2025 17:56:34 -0800 Subject: [PATCH 025/107] Clean up unnecessary comments and unused helper functions, +@keywords internal --- R/patch.R | 35 +++++++++++------------------------ man/tbl_diff2.Rd | 1 + man/tbl_patch.Rd | 1 + 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/R/patch.R b/R/patch.R index 37af285f9..32b7dc96a 100644 --- a/R/patch.R +++ b/R/patch.R @@ -118,6 +118,8 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N #' `compactify_abs_tol`. #' @param compactify_abs_tol compactification tolerance; see `apply_compactify` #' @return a tibble in compact "update" (diff) format +#' +#' @keywords internal tbl_diff2 <- function(earlier_snapshot, later_tbl, ukey_names, later_format = c("snapshot", "update"), @@ -152,7 +154,6 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # More input validation: if (!identical(tbl_names, names(later_tbl))) { - # XXX is this check actually necessary? cli_abort(c("`earlier_snapshot` and `later_tbl` should have identical column names and ordering.", "*" = "`earlier_snapshot` colnames: {format_chr_deparse(tbl_names)}", @@ -200,7 +201,6 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, combined_compactify_away[combined_ukey_is_repeat] <- approx_equal0(combined_vals, combined_vals, - # TODO move inds closer to vals to not be as confusing? abs_tol = compactify_abs_tol, na_equal = TRUE, inds1 = combined_ukey_is_repeat, @@ -230,17 +230,6 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, combined_tbl } -epi_diff2 <- function(earlier_snapshot, later_edf, - later_format = c("snapshot", "update"), - compactify_abs_tol = 0) { - ukey_names <- key_colnames(later_edf) - dplyr_reconstruct(tbl_diff2(as_tibble(earlier_snapshot), as_tibble(later_edf), ukey_names, later_format, compactify_abs_tol), later_edf) -} - -# XXX vs. tbl_patch_apply? - - - #' Apply an update (e.g., from `tbl_diff2`) to a snapshot #' #' @param snapshot tibble or `NULL`; entire data set as of some version, or @@ -252,14 +241,12 @@ epi_diff2 <- function(earlier_snapshot, later_edf, #' for `snapshot` and for `update`. Uniqueness is unchecked; if you don't have #' this guaranteed, see [`check_ukey_unique()`]. #' @return tibble; snapshot of the data set with the update applied. +#' +#' @keywords internal tbl_patch <- function(snapshot, update, ukey_names) { # Most input validation. This is a small function so use faster validation # variants: if (!is_tibble(update)) { - # XXX debating about whether to have a specialized class for updates/diffs. - # Seems nice for type-based reasoning and might remove some args from - # interfaces, but would require constructor/converter functions for that - # type. cli_abort("`update` must be a tibble") } if (is.null(snapshot)) { @@ -271,6 +258,13 @@ tbl_patch <- function(snapshot, update, ukey_names) { if (!is.character(ukey_names) || !all(ukey_names %in% names(snapshot))) { cli_abort("`ukey_names` must be a subset of column names") } + if (!identical(names(snapshot), names(update))) { + cli_abort(c("`snapshot` and `update` should have identical column + names and ordering.", + "*" = "`snapshot` colnames: {format_chr_deparse(tbl_names)}", + "*" = "`update` colnames: {format_chr_deparse(names(update))}" + )) + } result_tbl <- vec_rbind(update, snapshot) @@ -278,12 +272,5 @@ tbl_patch <- function(snapshot, update, ukey_names) { not_overwritten <- dup_ids == vec_seq_along(result_tbl) result_tbl <- result_tbl[not_overwritten, ] - ## result_tbl <- arrange_canonical(result_tbl) - result_tbl } - -epi_patch <- function(snapshot, update) { - ukey_names <- key_colnames(update) - dplyr_reconstruct(tbl_patch(as_tibble(snapshot), as_tibble(update), ukey_names), update) -} diff --git a/man/tbl_diff2.Rd b/man/tbl_diff2.Rd index d9b652258..f4e393b95 100644 --- a/man/tbl_diff2.Rd +++ b/man/tbl_diff2.Rd @@ -42,3 +42,4 @@ a tibble in compact "update" (diff) format \description{ Calculate compact patch to move from one snapshot/update to another } +\keyword{internal} diff --git a/man/tbl_patch.Rd b/man/tbl_patch.Rd index 2b4e8288b..6d9eee270 100644 --- a/man/tbl_patch.Rd +++ b/man/tbl_patch.Rd @@ -24,3 +24,4 @@ tibble; snapshot of the data set with the update applied. \description{ Apply an update (e.g., from \code{tbl_diff2}) to a snapshot } +\keyword{internal} From 10b0de10a8d1c0d162f06453a5acb4253afe2bcd Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 6 Mar 2025 10:08:32 -0800 Subject: [PATCH 026/107] approx_equal: make "abs_tol=" mandatory, +validation, +docs --- NAMESPACE | 6 +++ R/archive.R | 6 +-- R/epiprocess-package.R | 5 ++ R/patch.R | 98 +++++++++++++++++++++++++++++------ man/approx_equal.Rd | 69 +++++++++++++++++++----- man/approx_equal0.Rd | 2 +- tests/testthat/test-archive.R | 2 +- 7 files changed, 154 insertions(+), 34 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 49b5a3acf..66532c7cb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -58,6 +58,7 @@ S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) S3method(unnest,epi_df) export("%>%") +export(approx_equal) export(arrange) export(arrange_canonical) export(as_epi_archive) @@ -126,8 +127,12 @@ importFrom(checkmate,assert_subset) importFrom(checkmate,assert_tibble) importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) +importFrom(checkmate,check_character) importFrom(checkmate,check_data_frame) +importFrom(checkmate,check_logical) importFrom(checkmate,check_names) +importFrom(checkmate,check_null) +importFrom(checkmate,check_numeric) importFrom(checkmate,expect_class) importFrom(checkmate,test_int) importFrom(checkmate,test_set_equal) @@ -257,6 +262,7 @@ importFrom(tsibble,as_tsibble) importFrom(utils,capture.output) importFrom(utils,tail) importFrom(vctrs,"vec_slice<-") +importFrom(vctrs,obj_is_vector) importFrom(vctrs,vec_cast) importFrom(vctrs,vec_cast_common) importFrom(vctrs,vec_data) diff --git a/R/archive.R b/R/archive.R index a9200a7ff..d27f7126b 100644 --- a/R/archive.R +++ b/R/archive.R @@ -493,12 +493,12 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) { FALSE, # first observation is not LOCF approx_equal0(ekts_tbl, inds1 = inds1, ekts_tbl, inds2 = inds2, - # check ekt cols without tolerance: - abs_tol = 0, na_equal = TRUE + # check ekt (key) cols with 0 tolerance: + na_equal = TRUE, abs_tol = 0 ) & approx_equal0(vals_tbl, inds1 = inds1, vals_tbl, inds2 = inds2, - abs_tol = abs_tol, na_equal = TRUE + na_equal = TRUE, abs_tol = abs_tol ) ) } diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 89e785397..f73f67a4d 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -12,7 +12,11 @@ #' @importFrom checkmate assert_subset #' @importFrom checkmate assert_tibble #' @importFrom checkmate check_atomic check_data_frame expect_class test_int +#' @importFrom checkmate check_character +#' @importFrom checkmate check_logical #' @importFrom checkmate check_names +#' @importFrom checkmate check_null +#' @importFrom checkmate check_numeric #' @importFrom checkmate test_subset test_set_equal vname #' @importFrom cli cli_abort cli_warn #' @importFrom cli pluralize @@ -36,6 +40,7 @@ #' @importFrom tibble is_tibble #' @importFrom tidyr nest #' @importFrom tools toTitleCase +#' @importFrom vctrs obj_is_vector #' @importFrom vctrs vec_cast #' @importFrom vctrs vec_cast_common #' @importFrom vctrs vec_data diff --git a/R/patch.R b/R/patch.R index 32b7dc96a..39f69417f 100644 --- a/R/patch.R +++ b/R/patch.R @@ -1,22 +1,88 @@ #' Test two vctrs vectors for equality with some tolerance in some cases #' -#' @param vec1,vec2 vctrs vectors (includes data frames) -#' @param abs_tol tolerance; will be used for bare numeric `vec1`, `vec2`, or -#' any such columns within `vec1`, `vec2` if they are data frames +#' Similar to [`vctrs::vec_equal`]. Behavior may differ from `vec_equal` with +#' non-`NA` `NaN`s involved, or for bare lists that contain named vectors, and +#' the precise behavior in these cases may change and should not be relied upon. +#' +#' @param vec1,vec2 vctrs vectors (includes data frames). Take care when using +#' on named vectors or "keyed" data frames; [`vec_names()`] are largely +#' ignored, and key columns are treated as normal value columns (when they +#' should probably generate an error if they are not lined up correctly, or be +#' tested for exact rather than approximate equality). #' @param na_equal should `NA`s be considered equal to each other? (In #' epiprocess, we usually want this to be `TRUE`, but that doesn't match the #' [`vctrs::vec_equal()`] default, so this is mandatory.) -#' @param .ptype as in [`vctrs::vec_equal()`] -#' @param inds1,inds2 optional (row) indices into vec1 and vec2; output should -#' be consistent with `vec_slice`-ing to these indices beforehand, but can -#' give faster computation if `vec1` and `vec2` are data frames. +#' @param abs_tol absolute tolerance; will be used for bare numeric `vec1`, +#' `vec2`, or any such columns within `vec1`, `vec2` if they are data frames. +#' @param .ptype as in [`vctrs::vec_equal()`]. +#' @param inds1,inds2 optional (row) indices into vec1 and vec2 compatible with +#' [`vctrs::vec_slice()`]; output should be consistent with `vec_slice`-ing to +#' these indices beforehand, but can give faster computation if `vec1` and +#' `vec2` are data frames. #' #' @return logical vector, with length matching the result of recycling `vec1` #' (at `inds1` if provided) and `vec2` (at `inds2` if provided); entries -#' should all be `TRUE` or `FALSE` if `na_equal = TRUE`. Behavior may differ -#' from `vec_equal` with non-`NA` `NaN`s involved, or for bare lists that -#' contain named vectors. -approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = NULL, inds2 = NULL) { +#' should all be `TRUE` or `FALSE` if `na_equal = TRUE`. +#' +#' @examples +#' +#' # On numeric vectors: +#' approx_equal( +#' c(1, 2, 3, NA), +#' c(1, 2 + 1e-10, NA, NA), +#' na_equal = TRUE, +#' abs_tol = 1e-8 +#' ) +#' +#' # On tibbles: +#' tbl1 <- tibble( +#' a = 1:5, +#' b = list(1:5, 1:4, 1:3, 1:2, 1:1) %>% lapply(as.numeric), +#' c = tibble( +#' c1 = 1:5 +#' ), +#' d = matrix(1:10, 5, 2) +#' ) +#' tbl2 <- tbl1 +#' tbl2$a[[2]] <- tbl1$a[[2]] + 1e-10 +#' tbl2$b[[3]][[1]] <- tbl1$b[[3]][[1]] + 1e-10 +#' tbl2$c$c1[[4]] <- tbl1$c$c1[[4]] + 1e-10 +#' tbl2$d[[5, 2]] <- tbl1$d[[5, 2]] + 1e-10 +#' vctrs::vec_equal(tbl1, tbl2, na_equal = TRUE) +#' approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-12) +#' approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8) +#' +#' +#' +#' +#' +#' # Type comparison within lists is stricter, matching vctrs: +#' vctrs::vec_equal(list(1:2), list(as.numeric(1:2))) +#' approx_equal(list(1:2), list(as.numeric(1:2)), FALSE, abs_tol = 0) +#' +#' @export +approx_equal <- function(vec1, vec2, na_equal, .ptype = NULL, ..., abs_tol, inds1 = NULL, inds2 = NULL) { + if (!obj_is_vector(vec1)) cli_abort("`vec1` must be recognized by vctrs as a vector") + if (!obj_is_vector(vec2)) cli_abort("`vec2` must be recognized by vctrs as a vector") + # Leave vec size checking to vctrs recycling ops. + assert_logical(na_equal, any.missing = FALSE, len = 1L) + # Leave .ptype checks to cast operation. + check_dots_empty() + assert_numeric(abs_tol, lower = 0, len = 1L) + assert( + check_null(inds1), + check_numeric(inds1), + check_logical(inds1), + check_character(inds1) + ) + assert( + check_null(inds2), + check_numeric(inds2), + check_logical(inds2), + check_character(inds2) + ) + # Leave heavier index validation to the vctrs recycling & indexing ops. + # Recycle inds if provided; vecs if not: common_size <- vec_size_common( if (is.null(inds1)) vec1 else inds1, @@ -33,13 +99,13 @@ approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = N inds2 <- vec_recycle(inds2, common_size) } vecs <- vec_cast_common(vec1, vec2, .to = .ptype) - approx_equal0(vecs[[1]], vecs[[2]], abs_tol, na_equal, inds1, inds2) + approx_equal0(vecs[[1]], vecs[[2]], na_equal, abs_tol, inds1, inds2) } #' Helper for [`approx_equal`] for vecs guaranteed to have the same ptype and size #' #' @keywords internal -approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = NULL) { +approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) { if (is_bare_numeric(vec1) && abs_tol != 0) { # perf: since we're working with bare numerics and logicals: we can use `[` # and `fifelse`. Matching vec_equal, we ignore names and other attributes. @@ -66,7 +132,7 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N rep(TRUE, nrow(vec1)) } else { Reduce(`&`, lapply(seq_len(ncol(vec1)), function(col_i) { - approx_equal0(vec1[[col_i]], vec2[[col_i]], abs_tol, na_equal, inds1, inds2) + approx_equal0(vec1[[col_i]], vec2[[col_i]], na_equal, abs_tol, inds1, inds2) })) } } else if (is_bare_list(vec1)) { @@ -78,7 +144,7 @@ approx_equal0 <- function(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = N # consistently inconsistent, we avoid dispatching to vec_equal for bare # lists even with abs_tol = 0: identical(vec_ptype(entry1), vec_ptype(entry2)) && - all(approx_equal0(entry1, entry2, abs_tol, na_equal)) + all(approx_equal0(entry1, entry2, na_equal, abs_tol)) }, logical(1L)) } else { # XXX No special handling for any other types/situations. Makes sense for @@ -201,8 +267,8 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, combined_compactify_away[combined_ukey_is_repeat] <- approx_equal0(combined_vals, combined_vals, - abs_tol = compactify_abs_tol, na_equal = TRUE, + abs_tol = compactify_abs_tol, inds1 = combined_ukey_is_repeat, inds2 = ukey_repeat_first_i ) diff --git a/man/approx_equal.Rd b/man/approx_equal.Rd index 4eac410f0..237b33be4 100644 --- a/man/approx_equal.Rd +++ b/man/approx_equal.Rd @@ -7,36 +7,79 @@ approx_equal( vec1, vec2, - abs_tol, na_equal, .ptype = NULL, + ..., + abs_tol, inds1 = NULL, inds2 = NULL ) } \arguments{ -\item{vec1, vec2}{vctrs vectors (includes data frames)} - -\item{abs_tol}{tolerance; will be used for bare numeric \code{vec1}, \code{vec2}, or -any such columns within \code{vec1}, \code{vec2} if they are data frames} +\item{vec1, vec2}{vctrs vectors (includes data frames). Take care when using +on named vectors or "keyed" data frames; \code{\link[=vec_names]{vec_names()}} are largely +ignored, and key columns are treated as normal value columns (when they +should probably generate an error if they are not lined up correctly, or be +tested for exact rather than approximate equality).} \item{na_equal}{should \code{NA}s be considered equal to each other? (In epiprocess, we usually want this to be \code{TRUE}, but that doesn't match the \code{\link[vctrs:vec_equal]{vctrs::vec_equal()}} default, so this is mandatory.)} -\item{.ptype}{as in \code{\link[vctrs:vec_equal]{vctrs::vec_equal()}}} +\item{.ptype}{as in \code{\link[vctrs:vec_equal]{vctrs::vec_equal()}}.} + +\item{abs_tol}{absolute tolerance; will be used for bare numeric \code{vec1}, +\code{vec2}, or any such columns within \code{vec1}, \code{vec2} if they are data frames.} -\item{inds1, inds2}{optional (row) indices into vec1 and vec2; output should -be consistent with \code{vec_slice}-ing to these indices beforehand, but can -give faster computation if \code{vec1} and \code{vec2} are data frames.} +\item{inds1, inds2}{optional (row) indices into vec1 and vec2 compatible with +\code{\link[vctrs:vec_slice]{vctrs::vec_slice()}}; output should be consistent with \code{vec_slice}-ing to +these indices beforehand, but can give faster computation if \code{vec1} and +\code{vec2} are data frames.} } \value{ logical vector, with length matching the result of recycling \code{vec1} (at \code{inds1} if provided) and \code{vec2} (at \code{inds2} if provided); entries -should all be \code{TRUE} or \code{FALSE} if \code{na_equal = TRUE}. Behavior may differ -from \code{vec_equal} with non-\code{NA} \code{NaN}s involved, or for bare lists that -contain named vectors. +should all be \code{TRUE} or \code{FALSE} if \code{na_equal = TRUE}. } \description{ -Test two vctrs vectors for equality with some tolerance in some cases +Similar to \code{\link[vctrs:vec_equal]{vctrs::vec_equal}}. Behavior may differ from \code{vec_equal} with +non-\code{NA} \code{NaN}s involved, or for bare lists that contain named vectors, and +the precise behavior in these cases may change and should not be relied upon. +} +\examples{ + +# On numeric vectors: +approx_equal( + c(1, 2, 3, NA), + c(1, 2 + 1e-10, NA, NA), + na_equal = TRUE, + abs_tol = 1e-8 +) + +# On tibbles: +tbl1 <- tibble( + a = 1:5, + b = list(1:5, 1:4, 1:3, 1:2, 1:1) \%>\% lapply(as.numeric), + c = tibble( + c1 = 1:5 + ), + d = matrix(1:10, 5, 2) +) +tbl2 <- tbl1 +tbl2$a[[2]] <- tbl1$a[[2]] + 1e-10 +tbl2$b[[3]][[1]] <- tbl1$b[[3]][[1]] + 1e-10 +tbl2$c$c1[[4]] <- tbl1$c$c1[[4]] + 1e-10 +tbl2$d[[5, 2]] <- tbl1$d[[5, 2]] + 1e-10 +vctrs::vec_equal(tbl1, tbl2, na_equal = TRUE) +approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-12) +approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8) + + + + + +# Type comparison within lists is stricter, matching vctrs: +vctrs::vec_equal(list(1:2), list(as.numeric(1:2))) +approx_equal(list(1:2), list(as.numeric(1:2)), FALSE, abs_tol = 0) + } diff --git a/man/approx_equal0.Rd b/man/approx_equal0.Rd index f7a69f1fe..9726f5013 100644 --- a/man/approx_equal0.Rd +++ b/man/approx_equal0.Rd @@ -4,7 +4,7 @@ \alias{approx_equal0} \title{Helper for \code{\link{approx_equal}} for vecs guaranteed to have the same ptype and size} \usage{ -approx_equal0(vec1, vec2, abs_tol, na_equal, inds1 = NULL, inds2 = NULL) +approx_equal0(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) } \description{ Helper for \code{\link{approx_equal}} for vecs guaranteed to have the same ptype and size diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 12d9ed413..4d1be4c52 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -220,5 +220,5 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns test_that("is_locf replacement works as expected", { vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1) - expect_equal(c(FALSE, approx_equal(head(vec, -1L), tail(vec, -1L), .Machine$double.eps^0.5, na_equal = TRUE)), as.logical(is_repeated)) + expect_equal(c(FALSE, approx_equal(head(vec, -1L), tail(vec, -1L), na_equal = TRUE, abs_tol = .Machine$double.eps^0.5)), as.logical(is_repeated)) }) From 2b7d786d9874b64f9e77959970bab71c1a2ed57a Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 6 Mar 2025 10:28:46 -0800 Subject: [PATCH 027/107] Expand epi_slide_opt_archive_one_epikey example --- R/epi_slide_opt_archive.R | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index bf90313c2..1a6faad26 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -24,24 +24,20 @@ #' #' library(dplyr) #' updates <- bind_rows( -#' tibble( -#' version = 40, time_value = 1:10, value = 1:10 -#' ), -#' tibble( -#' version = 12, time_value = 2:3, value = 3:2 -#' ), -#' tibble( -#' version = 13, time_value = 6, value = 7, -#' ), -#' tibble( -#' version = 13, time_value = 7, value = NA, -#' ) +#' tibble(version = 10, time_value = 1:20, value = 1:20), +#' tibble(version = 12, time_value = 4:5, value = 5:4), +#' tibble(version = 13, time_value = 8, value = 9), +#' tibble(version = 14, time_value = 11, value = NA), +#' tibble(version = 15, time_value = -10, value = -10), +#' tibble(version = 16, time_value = 50, value = 50) #' ) %>% #' mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) %>% #' tidyr::nest(.by = version, .key = "subtbl") #' +#' f <- purrr::partial(data.table::frollmean, algo = "exact") +#' #' updates %>% -#' epi_slide_opt_one_epikey("value", data.table::frollmean, "data.table", 1L, 0L, "day", "slide_value") +#' epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") #' #' @keywords internal epi_slide_opt_archive_one_epikey <- function(updates, in_colnames, f_dots_baked, f_from_package, before, after, time_type, out_colnames) { From a7bdd0d3f9fca77f07f28b169115bd36f7a4f51a Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 6 Mar 2025 11:22:16 -0800 Subject: [PATCH 028/107] WIP epi_slide_opt.epi_archive tests --- tests/testthat/test-epi_slide_opt_archive.R | 74 +++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 tests/testthat/test-epi_slide_opt_archive.R diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R new file mode 100644 index 000000000..0f9f0b82b --- /dev/null +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -0,0 +1,74 @@ +library(dplyr) + +test_that("epi_slide_opt_archive_one_epikey works as expected", { + start_date <- as.Date("2020-01-01") + + updates <- bind_rows( + tibble(version = 10, time_value = 0:20, value = 0:20), + tibble(version = 12, time_value = 4:5, value = 5:4), + tibble(version = 13, time_value = 8, value = 9), + tibble(version = 14, time_value = 11, value = NA), + tibble(version = 15, time_value = -10, value = -10), + tibble(version = 16, time_value = 50, value = 50) + ) %>% + mutate(across(c(version, time_value), ~ start_date - 1 + .x)) %>% + tidyr::nest(.by = version, .key = "subtbl") + + expected <- list( + vctrs::vec_cbind( + tibble(version = 10), + updates$subtbl[[1L]] %>% + mutate(time_value = as.numeric(time_value - start_date) + 1) %>% + mutate(slide_value = frollmean(value, 3, algo = "exact")) + ), + tibble( + version = 12, + time_value = c(4, 5, 7), # time 6 unchanged, compactified away + # time 7 `value` unchanged, but here because `slide_value` changed: + value = c(5, 4, 7), + slide_value = c( + mean(c(2, 3, 5)), + # time 5 `slide_value` unchanged, but here because `value` changed: + mean(c(3, 5, 4)), + mean(c(4, 6, 7)) + ) + ), + tibble( + version = 13, time_value = 8:10, value = c(9, 9, 10), + slide_value = frollmean(c(6, 7, 9, 9, 10), 3, algo = "exact")[-(1:2)] + ), + tibble( + version = 14, time_value = 11:13, value = c(NA, 12, 13), slide_value = rep(NA_real_, 3L) + ), + tibble( + version = 15, time_value = -10, value = -10, slide_value = NA_real_ + ), + tibble( + version = 16, time_value = 50, value = 50, slide_value = NA_real_ + ) + ) %>% + lapply(function(x) { + x %>% + mutate(across(c(version, time_value), ~ start_date - 1 + .x)) + }) + + f <- purrr::partial(data.table::frollmean, algo = "exact") + + result <- updates %>% + epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") + + expect_equal( + result %>% lapply(function(x) { + x %>% + arrange(time_value) %>% + select(version, time_value, everything()) + }) + , + expected + ) + + # TODO check about version nesting ordering + +}) + +# TODO tests on example data sets From eb5b020f46e46874164ea1affbf6a8e1cfbc280b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 7 Mar 2025 09:59:43 -0800 Subject: [PATCH 029/107] More WIP on tests --- tests/testthat/test-epi_slide_opt_archive.R | 78 ++++++++++++++++++--- 1 file changed, 69 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index 0f9f0b82b..3cea0891b 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -55,20 +55,80 @@ test_that("epi_slide_opt_archive_one_epikey works as expected", { f <- purrr::partial(data.table::frollmean, algo = "exact") result <- updates %>% - epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") - - expect_equal( - result %>% lapply(function(x) { + epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") %>% + lapply(function(x) { x %>% arrange(time_value) %>% select(version, time_value, everything()) }) - , - expected - ) - # TODO check about version nesting ordering + expect_equal(result, expected) +}) + +test_that("epi_slide_opt.epi_archive is not confused by unique(DT$version) unsorted", { + start_date <- as.Date("2020-01-01") + tibble( + geo_value = 1, + time_value = start_date - 1 + 1:4, + version = start_date - 1 + c(5, 5, 4, 4), + value = c(1, 2, 3, 4) + ) %>% + as_epi_archive() %>% + epi_slide_opt(value, frollmean, .window_size = 2L) %>% + expect_equal( + tibble( + geo_value = 1, + time_value = start_date - 1 + c(1, 2, 3, 3, 4), + version = start_date - 1 + c(5, 5, 4, 5, 4), + value = c(1, 2, 3, 3, 4), + value_2dav = c(NA, 1.5, NA, 2.5, 3.5) + ) %>% + as_epi_archive() + ) }) -# TODO tests on example data sets +test_that("epi_slide_opt.epi_archive is not confused by unique(DT$time_value) unsorted", { + + start_date <- as.Date("2020-01-01") + tibble( + geo_value = c(1, 1, 2, 2), + time_value = start_date - 1 + c(2, 3, 1, 2), + version = start_date - 1 + c(1, 2, 2, 2), + value = c(1, 2, 3, 4) + ) %>% + as_epi_archive() %>% + epi_slide_opt(value, frollmean, .window_size = 2L) %>% + expect_equal( + tibble( + geo_value = c(1, 1, 2, 2), + time_value = start_date - 1 + c(2, 3, 1, 2), + version = start_date - 1 + c(1, 2, 2, 2), + value = c(1, 2, 3, 4), + value_2dav = c(NA, 1.5, NA, 3.5) + ) %>% + as_epi_archive() + ) + +}) + +test_that("epi_slide_opt.epi_archive is equivalent to epix_slide reconversion on example data", { + + case_death_rate_archive %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7 + # , algo = "exact" + ) %>% + .$DT %>% + as.data.frame() %>% + as_tibble() %>% + filter(!approx_equal(case_rate_7dav, case_rate_7d_av, 1e-6, TRUE)) %>% + dplyr::transmute(version, geo_value, time_value, case_rate_7dav, case_rate_7d_av, + abs_diff = abs(case_rate_7dav - case_rate_7d_av)) %>% + {} + + # TODO finish tests on example data sets + + }) + + +# TODO grouped behavior checks From c674e973eb11615e7a928e2678e4c3ad2962f1de Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 7 Mar 2025 13:46:00 -0800 Subject: [PATCH 030/107] Mark renaming TODO on approx_equal --- R/patch.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/patch.R b/R/patch.R index 39f69417f..bf42e9250 100644 --- a/R/patch.R +++ b/R/patch.R @@ -340,3 +340,5 @@ tbl_patch <- function(snapshot, update, ukey_names) { result_tbl } + +# TODO rename approx_equal to vec_approx_equal From 3072b3343ab721a0fbf059ed6e7bc26752245f08 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 03:14:12 -0700 Subject: [PATCH 031/107] fix!: as_epi_archive(tibble) key setting; + distrust key if data.table --- NAMESPACE | 2 ++ NEWS.md | 9 +++++++ R/archive.R | 57 +++++++++++++++++++++--------------------- R/epiprocess-package.R | 2 ++ 4 files changed, 42 insertions(+), 28 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 66532c7cb..7ec1bf1e9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -125,6 +125,7 @@ importFrom(checkmate,assert_scalar) importFrom(checkmate,assert_string) importFrom(checkmate,assert_subset) importFrom(checkmate,assert_tibble) +importFrom(checkmate,assert_true) importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_character) @@ -161,6 +162,7 @@ importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(data.table,setDF) +importFrom(data.table,setDT) importFrom(data.table,setcolorder) importFrom(data.table,setkeyv) importFrom(dplyr,"%>%") diff --git a/NEWS.md b/NEWS.md index 39274ff4a..52330c483 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,15 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat ## New features - `is_epi_archive` function has been reintroduced. +## Breaking changes + +- `new_epi_archive()`'s `x` argument has been replaced with a `data_table` + argument, which must be a `data.table` with the key already set appropriately. + +## Bug fixes + +- `as_epi_archive()` no longer has issues setting its `DT`'s `key` on some + versions of `{data.table}` when `x` is a tibble. # epiprocess 0.11 diff --git a/R/archive.R b/R/archive.R index d27f7126b..a5bc02a27 100644 --- a/R/archive.R +++ b/R/archive.R @@ -278,41 +278,22 @@ next_after.Date <- function(x) x + 1L #' @order 3 #' @export new_epi_archive <- function( - x, + data_table, geo_type, time_type, other_keys, clobberable_versions_start, versions_end) { - assert_data_frame(x) + assert_class(data_table, "data.table") assert_string(geo_type) assert_string(time_type) assert_character(other_keys, any.missing = FALSE) if (any(c("geo_value", "time_value", "version") %in% other_keys)) { cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") } - validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) - validate_version_bound(versions_end, x, na_ok = FALSE) - - key_vars <- c("geo_value", "time_value", other_keys, "version") - if (!all(key_vars %in% names(x))) { - # Give a more tailored error message than as.data.table would: - cli_abort(c( - "`x` is missing the following expected columns: - {format_varnames(setdiff(key_vars, names(x)))}.", - ">" = "You might need to `dplyr::rename()` beforehand - or use `as_epi_archive()`'s renaming feature.", - ">" = if (!all(other_keys %in% names(x))) { - "Check also for typos in `other_keys`." - } - )) - } - - # Create the data table; if x was an un-keyed data.table itself, - # then the call to as.data.table() will fail to set keys, so we - # need to check this, then do it manually if needed - data_table <- as.data.table(x, key = key_vars) - if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars) + assert_true(identical(key(data_table), c("geo_value", "time_value", other_keys, "version"))) + validate_version_bound(clobberable_versions_start, data_table, na_ok = TRUE) + validate_version_bound(versions_end, data_table, na_ok = FALSE) structure( list( @@ -529,11 +510,32 @@ as_epi_archive <- function( .versions_end = max_version_with_row_in(x), ..., versions_end = .versions_end) { assert_data_frame(x) + # Convert first to data.frame to guard against data.table#6859 and potentially + # other things epiprocess#618: + x_already_copied <- identical(class(x), c("data.table", "data.frame")) + x <- as.data.frame(x) x <- rename(x, ...) - x <- guess_column_name(x, "time_value", time_column_names()) x <- guess_column_name(x, "geo_value", geo_column_names()) + if (!all(other_keys %in% names(x))) { + # Give a more tailored error message than as.data.table would: + cli_abort(c( + "`x` is missing the following expected columns: + {format_varnames(setdiff(other_keys, names(x)))}.", + ">" = "You might need to `dplyr::rename()` beforehand + or using `as_epi_archive()`'s renaming feature." + )) + } + x <- guess_column_name(x, "time_value", time_column_names()) x <- guess_column_name(x, "version", version_column_names()) + # Convert to data.table: + key_vars <- c("geo_value", "time_value", other_keys, "version") + if (x_already_copied) { + setDT(x, key = key_vars) + } else { + x <- as.data.table(x, key = key_vars) + } + if (lifecycle::is_present(geo_type)) { cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.") } @@ -554,11 +556,10 @@ as_epi_archive <- function( cli_abort('`compactify` must be `TRUE`, `FALSE`, or `"message"`') } - data_table <- result$DT - key_vars <- key(data_table) + data_table <- result$DT # probably just `x`, but take no chances nrow_before_compactify <- nrow(data_table) - # Runs compactify on data frame + # Runs compactify on data_table if (identical(compactify, TRUE) || identical(compactify, "message")) { compactified <- apply_compactify(data_table, key_vars, compactify_abs_tol) } else { diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index f73f67a4d..195bc0258 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -11,6 +11,7 @@ #' @importFrom checkmate assert_string #' @importFrom checkmate assert_subset #' @importFrom checkmate assert_tibble +#' @importFrom checkmate assert_true #' @importFrom checkmate check_atomic check_data_frame expect_class test_int #' @importFrom checkmate check_character #' @importFrom checkmate check_logical @@ -25,6 +26,7 @@ #' @importFrom data.table fifelse #' @importFrom data.table key #' @importFrom data.table setcolorder +#' @importFrom data.table setDT #' @importFrom data.table setkeyv #' @importFrom dplyr arrange #' @importFrom dplyr grouped_df From 279c842f67e3d2d9ccdf61554176c69b068bce53 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 03:28:52 -0700 Subject: [PATCH 032/107] Make epi_archive key order geo !!!other time version --- NEWS.md | 2 ++ R/archive.R | 6 +++--- tests/testthat/test-archive.R | 12 ++++++------ 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 52330c483..1218da12f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - `new_epi_archive()`'s `x` argument has been replaced with a `data_table` argument, which must be a `data.table` with the key already set appropriately. + The `key()` of its `DT` will also now place `other_keys` before rather than after + `"time_value"`. ## Bug fixes diff --git a/R/archive.R b/R/archive.R index a5bc02a27..2e6ae5497 100644 --- a/R/archive.R +++ b/R/archive.R @@ -291,7 +291,7 @@ new_epi_archive <- function( if (any(c("geo_value", "time_value", "version") %in% other_keys)) { cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") } - assert_true(identical(key(data_table), c("geo_value", "time_value", other_keys, "version"))) + assert_true(identical(key(data_table), c("geo_value", other_keys, "time_value", "version"))) validate_version_bound(clobberable_versions_start, data_table, na_ok = TRUE) validate_version_bound(versions_end, data_table, na_ok = FALSE) @@ -319,7 +319,7 @@ new_epi_archive <- function( validate_epi_archive <- function(x) { assert_class(x, "epi_archive") - ukey_vars1 <- c("geo_value", "time_value", x$other_keys, "version") + ukey_vars1 <- c("geo_value", x$other_keys, "time_value", "version") ukey_vars2 <- key(x$DT) if (!identical(ukey_vars1, ukey_vars2)) { cli_abort(c("`data.table::key(x$DT)` not as expected", @@ -529,7 +529,7 @@ as_epi_archive <- function( x <- guess_column_name(x, "version", version_column_names()) # Convert to data.table: - key_vars <- c("geo_value", "time_value", other_keys, "version") + key_vars <- c("geo_value", other_keys, "time_value", "version") if (x_already_copied) { setDT(x, key = key_vars) } else { diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 4d1be4c52..051503e05 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -92,7 +92,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) ea2 <- as_epi_archive(df, other_keys = "value", compactify = FALSE) - expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(key(ea2$DT), c("geo_value", "value", "time_value", "version")) # Tibble tib <- tibble::tibble(df, code = "x") @@ -101,7 +101,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) ea4 <- as_epi_archive(tib, other_keys = "code", compactify = FALSE) - expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(key(ea4$DT), c("geo_value", "code", "time_value", "version")) # Keyed data.table kdt <- data.table::data.table( @@ -119,7 +119,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea6 <- as_epi_archive(kdt, other_keys = "value", compactify = FALSE) # Mismatched keys, but the one from as_epi_archive overrides - expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(key(ea6$DT), c("geo_value", "value", "time_value", "version")) # Unkeyed data.table udt <- data.table::data.table( @@ -134,7 +134,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) ea8 <- as_epi_archive(udt, other_keys = "code", compactify = FALSE) - expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(key(ea8$DT), c("geo_value", "code", "time_value", "version")) # epi_df edf1 <- cases_deaths_subset %>% @@ -145,7 +145,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) ea10 <- as_epi_archive(edf1, other_keys = "code", compactify = FALSE) - expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(key(ea10$DT), c("geo_value", "code", "time_value", "version")) # Keyed epi_df edf2 <- data.frame( @@ -164,7 +164,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea11$DT), c("geo_value", "time_value", "version")) ea12 <- as_epi_archive(edf2, other_keys = "misc", compactify = FALSE) - expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) + expect_equal(key(ea12$DT), c("geo_value", "misc", "time_value", "version")) }) test_that("`epi_archive` rejects nonunique keys", { From c7704af0e2bab2f24b6ea49aa9e3aa76fc50b6c0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 03:34:01 -0700 Subject: [PATCH 033/107] docs(new_epi_archive): roxygen2 for new param requirements --- R/archive.R | 6 +++-- man/epi_archive.Rd | 36 +++++++++++++++++++++++-- man/epi_slide_opt_archive_one_epikey.Rd | 22 +++++++-------- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/R/archive.R b/R/archive.R index 2e6ae5497..3935709ac 100644 --- a/R/archive.R +++ b/R/archive.R @@ -186,8 +186,8 @@ next_after.Date <- function(x) x + 1L #' archive. Unexpected behavior may result from modifying the metadata #' directly. #' -#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, -#' `time_value`, `version`, and then any additional number of columns. +#' @param data_table a data.table with [`data.table::key()`] equal to +#' `c("geo_value", other_keys, "time_value", "version")`. #' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the #' location column and set to "custom" if not recognized. #' @param time_type DEPRECATED Has no effect. Time value type inferred from the time @@ -488,6 +488,8 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) { #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. #' +#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, +#' `time_value`, `version`, and then any additional number of columns. #' @param ... used for specifying column names, as in [`dplyr::rename`]. For #' example `version = release_date` #' @param .versions_end location based versions_end, used to avoid prefix diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index b92cd5057..fd6e8b39e 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -9,6 +9,17 @@ \title{\code{as_epi_archive} converts a data frame, data table, or tibble into an \code{epi_archive} object.} \usage{ +new_epi_archive( + data_table, + geo_type, + time_type, + other_keys, + clobberable_versions_start, + versions_end +) + +validate_epi_archive(x) + as_epi_archive( x, geo_type = deprecated(), @@ -36,8 +47,8 @@ new_epi_archive( validate_epi_archive(x) } \arguments{ -\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, -\code{time_value}, \code{version}, and then any additional number of columns.} +\item{data_table}{a data.table with \code{\link[data.table:setkey]{data.table::key()}} equal to +\code{c("geo_value", other_keys, "time_value", "version")}.} \item{geo_type}{DEPRECATED Has no effect. Geo value type is inferred from the location column and set to "custom" if not recognized.} @@ -94,6 +105,27 @@ beyond \code{max(x$version)}, but they all contained empty updates. (The default value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} + +\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, +\code{time_value}, \code{version}, and then any additional number of columns.} + +\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{"message"}. \code{TRUE} will +remove some redundant rows, \code{FALSE} will not. \code{"message"} is like \code{TRUE} +but will emit a message if anything was changed. Default is \code{TRUE}. See +more information below under "Compactification:".} + +\item{compactify_abs_tol}{Optional; double. A tolerance level used to detect +approximate equality for compactification. The default is 0, which +corresponds to exact equality. Consider using this if your value columns +undergo tiny nonmeaningful revisions and the archive object with the +default setting is too large.} + +\item{.versions_end}{location based versions_end, used to avoid prefix +\code{version = issue} from being assigned to \code{versions_end} instead of being +used to rename columns.} + +\item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For +example \code{version = release_date}} } \value{ \itemize{ diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd index 95f97909f..cec4b1d02 100644 --- a/man/epi_slide_opt_archive_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -51,24 +51,20 @@ Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history library(dplyr) updates <- bind_rows( - tibble( - version = 40, time_value = 1:10, value = 1:10 - ), - tibble( - version = 12, time_value = 2:3, value = 3:2 - ), - tibble( - version = 13, time_value = 6, value = 7, - ), - tibble( - version = 13, time_value = 7, value = NA, - ) + tibble(version = 10, time_value = 1:20, value = 1:20), + tibble(version = 12, time_value = 4:5, value = 5:4), + tibble(version = 13, time_value = 8, value = 9), + tibble(version = 14, time_value = 11, value = NA), + tibble(version = 15, time_value = -10, value = -10), + tibble(version = 16, time_value = 50, value = 50) ) \%>\% mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) \%>\% tidyr::nest(.by = version, .key = "subtbl") +f <- purrr::partial(data.table::frollmean, algo = "exact") + updates \%>\% - epi_slide_opt_one_epikey("value", data.table::frollmean, "data.table", 1L, 0L, "day", "slide_value") + epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") } \keyword{internal} From 3d4f167e94677a16af7be0924b2c4f881f06d62d Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 03:35:54 -0700 Subject: [PATCH 034/107] fix(epi_slide_opt.epi_archive): as.data.table(tibble) key setting --- R/epi_slide_opt_archive.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 1a6faad26..43387a2d0 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -193,6 +193,8 @@ epi_slide_opt.epi_archive <- res }) %>% ungroup() %>% + as.data.frame() %>% # data.table#6859 + as.data.table(key = key(.x$DT)) %>% new_epi_archive( .x$geo_type, .x$time_type, .x$other_keys, .x$clobberable_versions_start, .x$versions_end From 4801c7b45899c558e73e97fc175a2af34bf3158a Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 03:36:23 -0700 Subject: [PATCH 035/107] tests(epi_slide_opt): on example data sets --- tests/testthat/test-epi_slide_opt_archive.R | 92 ++++++++++++++++----- 1 file changed, 71 insertions(+), 21 deletions(-) diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index 3cea0891b..d300e0bcc 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -35,7 +35,7 @@ test_that("epi_slide_opt_archive_one_epikey works as expected", { ), tibble( version = 13, time_value = 8:10, value = c(9, 9, 10), - slide_value = frollmean(c(6, 7, 9, 9, 10), 3, algo = "exact")[-(1:2)] + slide_value = frollmean(c(6, 7, 9, 9, 10), 3, algo = "exact")[-(1:2)] ), tibble( version = 14, time_value = 11:13, value = c(NA, 12, 13), slide_value = rep(NA_real_, 3L) @@ -89,7 +89,6 @@ test_that("epi_slide_opt.epi_archive is not confused by unique(DT$version) unsor }) test_that("epi_slide_opt.epi_archive is not confused by unique(DT$time_value) unsorted", { - start_date <- as.Date("2020-01-01") tibble( geo_value = c(1, 1, 2, 2), @@ -109,26 +108,77 @@ test_that("epi_slide_opt.epi_archive is not confused by unique(DT$time_value) un ) %>% as_epi_archive() ) - }) -test_that("epi_slide_opt.epi_archive is equivalent to epix_slide reconversion on example data", { - - case_death_rate_archive %>% - epi_slide_opt(case_rate, frollmean, .window_size = 7 - # , algo = "exact" - ) %>% - .$DT %>% - as.data.frame() %>% - as_tibble() %>% - filter(!approx_equal(case_rate_7dav, case_rate_7d_av, 1e-6, TRUE)) %>% - dplyr::transmute(version, geo_value, time_value, case_rate_7dav, case_rate_7d_av, - abs_diff = abs(case_rate_7dav - case_rate_7d_av)) %>% - {} - - # TODO finish tests on example data sets - - }) - +test_that("epi_slide_opt.epi_archive gives expected results on example data", { + # vs. built-in case_rate_7d_av column. + # + # If we were to compare the keyset vs. + # the original, it changes, as the original contains some tiny deviations in + # values that don't seem achievable with available sliding functions. E.g., in + # the recomputed result, geo "ak" version "2020-11-01" changes time 2020-03-13 + # from 0 to 0.138 and time 2020-03-14 from a slightly different value of 0.138 + # to 0, while nearby times remained stable; in the original, this resulted in + # a tiny update to the 7d_av for 2020-03-14 but not following times somehow, + # while in the recomputation there are also minute updates to 2020-03-15 and + # 2020-03-16; 2020-03-17 onward have other case_rate changes factoring in. + # Compactifying and comparing with tolerances would help account for some of + # these differences, but only through writing this was it realized that both + # archives would need the recompactification with tolerance; it's not just + # epi_slide_opt.epi_archive's very rigid compactification that's the cause. + # (Side note: allowing configurable compactification tolerance in + # epi_slide_opt.epi_archive wasn't included due to either feeling strange + # applying the compactification tolerance to all columns rather than just + # computed columns, and a slowdown when using one approach to compactify just + # the new columns + also awkward not matching what's possible with just + # construction functions.) + # + # --> just compare essentially an epix_merge of the original & the recomputation: + case_death_rate_archive_time <- system.time( + case_death_rate_archive_result <- case_death_rate_archive %>% + epi_slide_opt(case_rate, frollmean, algo = "exact", .window_size = 7) + ) + expect_equal( + case_death_rate_archive_result$DT$case_rate_7dav, + case_death_rate_archive_result$DT$case_rate_7d_av + ) + + # vs. computing via epix_slide: + + mini_case_death_rate_archive <- case_death_rate_archive %>% + { + as_tibble(as.data.frame(.$DT)) + } %>% + filter(geo_value %in% head(unique(geo_value), 4L)) %>% + as_epi_archive() + + mini_case_death_rate_archive_time_opt <- system.time( + mini_case_death_rate_archive_result <- mini_case_death_rate_archive %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7) + ) + + mini_case_death_rate_archive_time_gen <- system.time( + mini_case_death_rate_archive_expected <- mini_case_death_rate_archive %>% + epix_slide(~ .x %>% epi_slide_opt(case_rate, frollmean, .window_size = 7)) %>% + select(names(mini_case_death_rate_archive$DT), everything()) %>% + as_epi_archive() + ) + + expect_equal(mini_case_death_rate_archive_result, mini_case_death_rate_archive_expected) + + archive_cases_dv_subset_time_opt <- system.time( + archive_cases_dv_subset_result <- archive_cases_dv_subset %>% + epi_slide_opt(percent_cli, frollmean, .window_size = 7) + ) + + archive_cases_dv_subset_time_gen <- system.time( + archive_cases_dv_subset_expected <- archive_cases_dv_subset %>% + epix_slide(~ .x %>% epi_slide_opt(percent_cli, frollmean, .window_size = 7)) %>% + select(geo_value, time_value, version, everything()) %>% + as_epi_archive() + ) + + expect_equal(archive_cases_dv_subset_result, archive_cases_dv_subset_expected) +}) # TODO grouped behavior checks From 8e7d5070c59630a24f8e7de51954f86fe8a37d3a Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 12:59:48 -0700 Subject: [PATCH 036/107] Fix & test epi_slide_opt.grouped_epi_archive behavior --- NAMESPACE | 1 + R/epi_slide_opt_archive.R | 2 +- R/epiprocess-package.R | 1 + tests/testthat/test-epi_slide_opt_archive.R | 36 +++++++++++++++++++-- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7ec1bf1e9..0963a9a6f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -122,6 +122,7 @@ importFrom(checkmate,assert_logical) importFrom(checkmate,assert_number) importFrom(checkmate,assert_numeric) importFrom(checkmate,assert_scalar) +importFrom(checkmate,assert_set_equal) importFrom(checkmate,assert_string) importFrom(checkmate,assert_subset) importFrom(checkmate,assert_tibble) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 43387a2d0..3d55bccca 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -125,7 +125,7 @@ epi_slide_opt_archive_one_epikey <- function(updates, in_colnames, f_dots_baked, epi_slide_opt.grouped_epi_archive <- function(.x, ...) { assert_set_equal( group_vars(.x), - key_colnames(.x, exclude = c("time_value", "version")) + key_colnames(.x$private$ungrouped, exclude = c("time_value", "version")) ) orig_group_vars <- group_vars(.x) orig_drop <- .x$private$drop diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 195bc0258..c5c76da8b 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -8,6 +8,7 @@ #' @importFrom checkmate assert_false #' @importFrom checkmate assert_function #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt +#' @importFrom checkmate assert_set_equal #' @importFrom checkmate assert_string #' @importFrom checkmate assert_subset #' @importFrom checkmate assert_tibble diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index d300e0bcc..c41072339 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -110,7 +110,7 @@ test_that("epi_slide_opt.epi_archive is not confused by unique(DT$time_value) un ) }) -test_that("epi_slide_opt.epi_archive gives expected results on example data", { +test_that("epi_slide_opt.epi_archive gives expected results on example data; also grouped behavior", { # vs. built-in case_rate_7d_av column. # # If we were to compare the keyset vs. @@ -166,6 +166,38 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data", { expect_equal(mini_case_death_rate_archive_result, mini_case_death_rate_archive_expected) + mini_case_death_rate_archive_result2 <- mini_case_death_rate_archive %>% + group_by(geo_value) %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7) + + expect_equal( + mini_case_death_rate_archive_result2, + mini_case_death_rate_archive_result %>% + group_by(geo_value) + ) + + mini_case_death_rate_archive_b <- mini_case_death_rate_archive %>% + { + as_tibble(as.data.frame(.$DT)) + } %>% + mutate(age_group = "overall") %>% + as_epi_archive(other_keys = "age_group") + + expect_equal( + mini_case_death_rate_archive_b %>% + group_by(geo_value, age_group) %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7), + mini_case_death_rate_archive_b %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7) %>% + group_by(geo_value, age_group) + ) + + expect_error( + mini_case_death_rate_archive_b %>% + group_by(age_group) %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7) + ) + archive_cases_dv_subset_time_opt <- system.time( archive_cases_dv_subset_result <- archive_cases_dv_subset %>% epi_slide_opt(percent_cli, frollmean, .window_size = 7) @@ -180,5 +212,3 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data", { expect_equal(archive_cases_dv_subset_result, archive_cases_dv_subset_expected) }) - -# TODO grouped behavior checks From 6fa2c3b718255b09dffd7ba27cad8ea697e93efe Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 13:02:35 -0700 Subject: [PATCH 037/107] Rename approx_equal -> vec_approx_equal --- NAMESPACE | 2 +- R/archive.R | 4 +-- R/patch.R | 28 +++++++++----------- man/approx_equal0.Rd | 12 --------- man/{approx_equal.Rd => vec_approx_equal.Rd} | 14 +++++----- man/vec_approx_equal0.Rd | 12 +++++++++ tests/testthat/test-archive.R | 2 +- 7 files changed, 36 insertions(+), 38 deletions(-) delete mode 100644 man/approx_equal0.Rd rename man/{approx_equal.Rd => vec_approx_equal.Rd} (90%) create mode 100644 man/vec_approx_equal0.Rd diff --git a/NAMESPACE b/NAMESPACE index 0963a9a6f..6fce796ca 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -58,7 +58,6 @@ S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) S3method(unnest,epi_df) export("%>%") -export(approx_equal) export(arrange) export(arrange_canonical) export(as_epi_archive) @@ -106,6 +105,7 @@ export(time_column_names) export(ungroup) export(unnest) export(validate_epi_archive) +export(vec_approx_equal) export(version_column_names) import(epidatasets) importFrom(checkmate,anyInfinite) diff --git a/R/archive.R b/R/archive.R index 3935709ac..5ce1938b9 100644 --- a/R/archive.R +++ b/R/archive.R @@ -472,12 +472,12 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) { inds2 <- 1L:(n_updates - 1L) c( FALSE, # first observation is not LOCF - approx_equal0(ekts_tbl, + vec_approx_equal0(ekts_tbl, inds1 = inds1, ekts_tbl, inds2 = inds2, # check ekt (key) cols with 0 tolerance: na_equal = TRUE, abs_tol = 0 ) & - approx_equal0(vals_tbl, + vec_approx_equal0(vals_tbl, inds1 = inds1, vals_tbl, inds2 = inds2, na_equal = TRUE, abs_tol = abs_tol ) diff --git a/R/patch.R b/R/patch.R index bf42e9250..035c0ad58 100644 --- a/R/patch.R +++ b/R/patch.R @@ -27,7 +27,7 @@ #' @examples #' #' # On numeric vectors: -#' approx_equal( +#' vec_approx_equal( #' c(1, 2, 3, NA), #' c(1, 2 + 1e-10, NA, NA), #' na_equal = TRUE, @@ -49,8 +49,8 @@ #' tbl2$c$c1[[4]] <- tbl1$c$c1[[4]] + 1e-10 #' tbl2$d[[5, 2]] <- tbl1$d[[5, 2]] + 1e-10 #' vctrs::vec_equal(tbl1, tbl2, na_equal = TRUE) -#' approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-12) -#' approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8) +#' vec_approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-12) +#' vec_approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8) #' #' #' @@ -58,10 +58,10 @@ #' #' # Type comparison within lists is stricter, matching vctrs: #' vctrs::vec_equal(list(1:2), list(as.numeric(1:2))) -#' approx_equal(list(1:2), list(as.numeric(1:2)), FALSE, abs_tol = 0) +#' vec_approx_equal(list(1:2), list(as.numeric(1:2)), FALSE, abs_tol = 0) #' #' @export -approx_equal <- function(vec1, vec2, na_equal, .ptype = NULL, ..., abs_tol, inds1 = NULL, inds2 = NULL) { +vec_approx_equal <- function(vec1, vec2, na_equal, .ptype = NULL, ..., abs_tol, inds1 = NULL, inds2 = NULL) { if (!obj_is_vector(vec1)) cli_abort("`vec1` must be recognized by vctrs as a vector") if (!obj_is_vector(vec2)) cli_abort("`vec2` must be recognized by vctrs as a vector") # Leave vec size checking to vctrs recycling ops. @@ -99,13 +99,13 @@ approx_equal <- function(vec1, vec2, na_equal, .ptype = NULL, ..., abs_tol, inds inds2 <- vec_recycle(inds2, common_size) } vecs <- vec_cast_common(vec1, vec2, .to = .ptype) - approx_equal0(vecs[[1]], vecs[[2]], na_equal, abs_tol, inds1, inds2) + vec_approx_equal0(vecs[[1]], vecs[[2]], na_equal, abs_tol, inds1, inds2) } -#' Helper for [`approx_equal`] for vecs guaranteed to have the same ptype and size +#' Helper for [`vec_approx_equal`] for vecs guaranteed to have the same ptype and size #' #' @keywords internal -approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) { +vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) { if (is_bare_numeric(vec1) && abs_tol != 0) { # perf: since we're working with bare numerics and logicals: we can use `[` # and `fifelse`. Matching vec_equal, we ignore names and other attributes. @@ -132,7 +132,7 @@ approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = N rep(TRUE, nrow(vec1)) } else { Reduce(`&`, lapply(seq_len(ncol(vec1)), function(col_i) { - approx_equal0(vec1[[col_i]], vec2[[col_i]], na_equal, abs_tol, inds1, inds2) + vec_approx_equal0(vec1[[col_i]], vec2[[col_i]], na_equal, abs_tol, inds1, inds2) })) } } else if (is_bare_list(vec1)) { @@ -144,14 +144,14 @@ approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = N # consistently inconsistent, we avoid dispatching to vec_equal for bare # lists even with abs_tol = 0: identical(vec_ptype(entry1), vec_ptype(entry2)) && - all(approx_equal0(entry1, entry2, na_equal, abs_tol)) + all(vec_approx_equal0(entry1, entry2, na_equal, abs_tol)) }, logical(1L)) } else { # XXX No special handling for any other types/situations. Makes sense for # unclassed atomic things; custom classes (e.g., distributions) might want - # recursion / specialization, though. approx_equal0 should probably be an S3 + # recursion / specialization, though. vec_approx_equal0 should probably be an S3 # method. Also, abs_tol == 0 --> vec_equal logic should maybe be either be - # hoisted to approx_equal or we should manually recurse on data frames even + # hoisted to vec_approx_equal or we should manually recurse on data frames even # with abs_tol = 0 when that's faster (might depend on presence of inds*), # after some inconsistencies are ironed out. if (!is.null(inds1)) { @@ -265,7 +265,7 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # Which rows from combined are in case 3.? combined_compactify_away <- rep(FALSE, combined_n) combined_compactify_away[combined_ukey_is_repeat] <- - approx_equal0(combined_vals, + vec_approx_equal0(combined_vals, combined_vals, na_equal = TRUE, abs_tol = compactify_abs_tol, @@ -340,5 +340,3 @@ tbl_patch <- function(snapshot, update, ukey_names) { result_tbl } - -# TODO rename approx_equal to vec_approx_equal diff --git a/man/approx_equal0.Rd b/man/approx_equal0.Rd deleted file mode 100644 index 9726f5013..000000000 --- a/man/approx_equal0.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/patch.R -\name{approx_equal0} -\alias{approx_equal0} -\title{Helper for \code{\link{approx_equal}} for vecs guaranteed to have the same ptype and size} -\usage{ -approx_equal0(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) -} -\description{ -Helper for \code{\link{approx_equal}} for vecs guaranteed to have the same ptype and size -} -\keyword{internal} diff --git a/man/approx_equal.Rd b/man/vec_approx_equal.Rd similarity index 90% rename from man/approx_equal.Rd rename to man/vec_approx_equal.Rd index 237b33be4..80e037923 100644 --- a/man/approx_equal.Rd +++ b/man/vec_approx_equal.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/patch.R -\name{approx_equal} -\alias{approx_equal} +\name{vec_approx_equal} +\alias{vec_approx_equal} \title{Test two vctrs vectors for equality with some tolerance in some cases} \usage{ -approx_equal( +vec_approx_equal( vec1, vec2, na_equal, @@ -49,7 +49,7 @@ the precise behavior in these cases may change and should not be relied upon. \examples{ # On numeric vectors: -approx_equal( +vec_approx_equal( c(1, 2, 3, NA), c(1, 2 + 1e-10, NA, NA), na_equal = TRUE, @@ -71,8 +71,8 @@ tbl2$b[[3]][[1]] <- tbl1$b[[3]][[1]] + 1e-10 tbl2$c$c1[[4]] <- tbl1$c$c1[[4]] + 1e-10 tbl2$d[[5, 2]] <- tbl1$d[[5, 2]] + 1e-10 vctrs::vec_equal(tbl1, tbl2, na_equal = TRUE) -approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-12) -approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8) +vec_approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-12) +vec_approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8) @@ -80,6 +80,6 @@ approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8) # Type comparison within lists is stricter, matching vctrs: vctrs::vec_equal(list(1:2), list(as.numeric(1:2))) -approx_equal(list(1:2), list(as.numeric(1:2)), FALSE, abs_tol = 0) +vec_approx_equal(list(1:2), list(as.numeric(1:2)), FALSE, abs_tol = 0) } diff --git a/man/vec_approx_equal0.Rd b/man/vec_approx_equal0.Rd new file mode 100644 index 000000000..01a1141ba --- /dev/null +++ b/man/vec_approx_equal0.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{vec_approx_equal0} +\alias{vec_approx_equal0} +\title{Helper for \code{\link{vec_approx_equal}} for vecs guaranteed to have the same ptype and size} +\usage{ +vec_approx_equal0(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) +} +\description{ +Helper for \code{\link{vec_approx_equal}} for vecs guaranteed to have the same ptype and size +} +\keyword{internal} diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 051503e05..510d2adfc 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -220,5 +220,5 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns test_that("is_locf replacement works as expected", { vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1) - expect_equal(c(FALSE, approx_equal(head(vec, -1L), tail(vec, -1L), na_equal = TRUE, abs_tol = .Machine$double.eps^0.5)), as.logical(is_repeated)) + expect_equal(c(FALSE, vec_approx_equal(head(vec, -1L), tail(vec, -1L), na_equal = TRUE, abs_tol = .Machine$double.eps^0.5)), as.logical(is_repeated)) }) From 1b893314f7fe6da1eff728fa62f9e1909cad6cd5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 13:30:46 -0700 Subject: [PATCH 038/107] Fix missing n_groups import + epiprocess::: CHECK lint --- NAMESPACE | 1 + R/epi_slide_opt_archive.R | 2 +- R/epiprocess-package.R | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 6fce796ca..114f77c1d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -188,6 +188,7 @@ importFrom(dplyr,if_any) importFrom(dplyr,if_else) importFrom(dplyr,is_grouped_df) importFrom(dplyr,mutate) +importFrom(dplyr,n_groups) importFrom(dplyr,pick) importFrom(dplyr,pull) importFrom(dplyr,relocate) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 3d55bccca..714ec0227 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -42,7 +42,7 @@ #' @keywords internal epi_slide_opt_archive_one_epikey <- function(updates, in_colnames, f_dots_baked, f_from_package, before, after, time_type, out_colnames) { # TODO check for col name clobbering - unit_step <- epiprocess:::unit_time_delta(time_type) + unit_step <- unit_time_delta(time_type) prev_inp_snapshot <- NULL prev_out_snapshot <- NULL result <- map(seq_len(nrow(updates)), function(update_i) { diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index c5c76da8b..7c7f54394 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -32,6 +32,7 @@ #' @importFrom dplyr arrange #' @importFrom dplyr grouped_df #' @importFrom dplyr is_grouped_df +#' @importFrom dplyr n_groups #' @importFrom dplyr select #' @importFrom lifecycle deprecated #' @importFrom purrr list_rbind From 0418b03cdaa53307e32c92e194b0f7431c195bd9 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 14:06:33 -0700 Subject: [PATCH 039/107] Address additional lints, CHECK issues --- R/archive.R | 2 +- R/epi_slide_opt_archive.R | 23 ++++++++++++++++----- R/epi_slide_opt_edf.R | 12 +++++++---- R/patch.R | 10 ++++++--- man/apply_compactify.Rd | 2 +- man/vec_approx_equal.Rd | 3 +++ tests/testthat/test-archive.R | 10 ++++++++- tests/testthat/test-epi_slide_opt_archive.R | 8 +++++-- 8 files changed, 53 insertions(+), 17 deletions(-) diff --git a/R/archive.R b/R/archive.R index 5ce1938b9..35a9e775c 100644 --- a/R/archive.R +++ b/R/archive.R @@ -386,7 +386,7 @@ validate_epi_archive <- function(x) { #' would be `key(DT)`. #' @param abs_tol numeric, >=0; absolute tolerance to use on numeric measurement #' columns when determining whether something can be compactified away; see -#' [`is_locf`] +#' [`vec_approx_equal`] #' #' @importFrom data.table is.data.table key #' @importFrom dplyr arrange filter diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 714ec0227..eaf5bbb88 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -40,7 +40,11 @@ #' epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") #' #' @keywords internal -epi_slide_opt_archive_one_epikey <- function(updates, in_colnames, f_dots_baked, f_from_package, before, after, time_type, out_colnames) { +epi_slide_opt_archive_one_epikey <- function( + updates, + in_colnames, + f_dots_baked, f_from_package, before, after, time_type, + out_colnames) { # TODO check for col name clobbering unit_step <- unit_time_delta(time_type) prev_inp_snapshot <- NULL @@ -61,7 +65,7 @@ epi_slide_opt_archive_one_epikey <- function(updates, in_colnames, f_dots_baked, # If the input had updates in the range t1..t2, this could produce changes # in slide outputs in the range t1-after..t2+before, and to compute those # slide values, we need to look at the input snapshot from - # t1-after-before..t2+before+after. + # t1-after-before..t2+before+after. nolint: commented_code_linter inp_update_t_min <- min(inp_update$time_value) inp_update_t_max <- max(inp_update$time_value) slide_t_min <- inp_update_t_min - (before + after) * unit_step @@ -78,7 +82,8 @@ epi_slide_opt_archive_one_epikey <- function(updates, in_colnames, f_dots_baked, if (f_from_package == "data.table") { for (col_i in seq_along(in_colnames)) { if (before == Inf) { - slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], seq_len(slide_nrow), adaptive = TRUE) + slide[[out_colnames[[col_i]]]] <- + f_dots_baked(slide[[in_colnames[[col_i]]]], seq_len(slide_nrow), adaptive = TRUE) } else { out_col <- f_dots_baked(slide[[in_colnames[[col_i]]]], before + after + 1L) if (after != 0L) { @@ -158,7 +163,10 @@ epi_slide_opt.epi_archive <- purrr::partial(.f, ...) } col_names_quo <- enquo(.col_names) - names_info <- across_ish_names_info(.x$DT, time_type, col_names_quo, .f_info$namer, .window_size, .align, .prefix, .suffix, .new_col_names) + names_info <- across_ish_names_info( + .x$DT, time_type, col_names_quo, .f_info$namer, + .window_size, .align, .prefix, .suffix, .new_col_names + ) window_args <- get_before_after_from_window(.window_size, .align, time_type) if (!is.null(.ref_time_values)) { cli_abort("epi_slide.epi_archive does not support the `.ref_time_values` argument") @@ -187,7 +195,12 @@ epi_slide_opt.epi_archive <- nest(.by = version, .key = "subtbl") %>% arrange(version) # TODO move nesting inside the helper? - res <- epi_slide_opt_archive_one_epikey(group_updates, names_info$input_col_names, .f_dots_baked, .f_info$from_package, window_args$before, window_args$after, time_type, names_info$output_col_names) %>% + res <- epi_slide_opt_archive_one_epikey( + group_updates, + names_info$input_col_names, + .f_dots_baked, .f_info$from_package, window_args$before, window_args$after, time_type, + names_info$output_col_names + ) %>% list_rbind() if (use_progress) cli::cli_progress_update(id = progress_bar_id) res diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 9dd7190c8..b6be49a1a 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -81,7 +81,8 @@ upstream_slide_f_info <- function(.f) { #' `names(.x)`; and `output_colnames`, chr, same length as `input_col_names` #' #' @keywords internal -across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, .window_size, .align, .prefix, .suffix, .new_col_names) { +across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, + .window_size, .align, .prefix, .suffix, .new_col_names) { # The position of a given column can be differ between input `.x` and # `.data_group` since the grouping step by default drops grouping columns. # To avoid rerunning `eval_select` for every `.data_group`, convert @@ -151,10 +152,10 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, .windo } output_col_names <- .new_col_names - return(list( + list( input_col_names = input_col_names, output_col_names = output_col_names - )) + ) } #' Optimized slide functions for common cases @@ -417,7 +418,10 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., window_args <- get_before_after_from_window(.window_size, .align, time_type) # Handle output naming: - names_info <- across_ish_names_info(.x, time_type, col_names_quo, f_info$namer, .window_size, .align, .prefix, .suffix, .new_col_names) + names_info <- across_ish_names_info( + .x, time_type, col_names_quo, f_info$namer, + .window_size, .align, .prefix, .suffix, .new_col_names + ) input_col_names <- names_info$input_col_names output_col_names <- names_info$output_col_names diff --git a/R/patch.R b/R/patch.R index 035c0ad58..f8384ad22 100644 --- a/R/patch.R +++ b/R/patch.R @@ -12,9 +12,11 @@ #' @param na_equal should `NA`s be considered equal to each other? (In #' epiprocess, we usually want this to be `TRUE`, but that doesn't match the #' [`vctrs::vec_equal()`] default, so this is mandatory.) +#' @param .ptype as in [`vctrs::vec_equal()`]. +#' @param ... should be empty (it's here to force later arguments to be passed +#' by name) #' @param abs_tol absolute tolerance; will be used for bare numeric `vec1`, #' `vec2`, or any such columns within `vec1`, `vec2` if they are data frames. -#' @param .ptype as in [`vctrs::vec_equal()`]. #' @param inds1,inds2 optional (row) indices into vec1 and vec2 compatible with #' [`vctrs::vec_slice()`]; output should be consistent with `vec_slice`-ing to #' these indices beforehand, but can give faster computation if `vec1` and @@ -205,7 +207,9 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, cli_abort("`ukey_names` must be a subset of column names") } later_format <- arg_match0(later_format, c("snapshot", "update")) - if (!(is.vector(compactify_abs_tol, mode = "numeric") && length(compactify_abs_tol) == 1L && compactify_abs_tol >= 0)) { + if (!(is.vector(compactify_abs_tol, mode = "numeric") && + length(compactify_abs_tol) == 1L && # nolint:indentation_linter + compactify_abs_tol >= 0)) { # Give a specific message: assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L) # Fallback e.g. for invalid classes not caught by assert_numeric: @@ -279,7 +283,7 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, if (later_format == "update") { # Cases 4. and 5.: combined_tbl <- combined_tbl[combined_from_later & !combined_compactify_away, ] - } else { # later_format == "snapshot" + } else { # later_format is "snapshot" # Which rows from combined are in case 1.? combined_is_deletion <- vec_rep_each(c(TRUE, FALSE), c(earlier_n, later_n)) combined_is_deletion[ukey_repeat_first_i] <- FALSE diff --git a/man/apply_compactify.Rd b/man/apply_compactify.Rd index e96108789..6bd6ce770 100644 --- a/man/apply_compactify.Rd +++ b/man/apply_compactify.Rd @@ -16,7 +16,7 @@ would be \code{key(DT)}.} \item{abs_tol}{numeric, >=0; absolute tolerance to use on numeric measurement columns when determining whether something can be compactified away; see -\code{\link{is_locf}}} +\code{\link{vec_approx_equal}}} } \description{ Works by shifting all rows except the version, then comparing values to see diff --git a/man/vec_approx_equal.Rd b/man/vec_approx_equal.Rd index 80e037923..f3612fa19 100644 --- a/man/vec_approx_equal.Rd +++ b/man/vec_approx_equal.Rd @@ -28,6 +28,9 @@ epiprocess, we usually want this to be \code{TRUE}, but that doesn't match the \item{.ptype}{as in \code{\link[vctrs:vec_equal]{vctrs::vec_equal()}}.} +\item{...}{should be empty (it's here to force later arguments to be passed +by name)} + \item{abs_tol}{absolute tolerance; will be used for bare numeric \code{vec1}, \code{vec2}, or any such columns within \code{vec1}, \code{vec2} if they are data frames.} diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 510d2adfc..336fe4baa 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -220,5 +220,13 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns test_that("is_locf replacement works as expected", { vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1) - expect_equal(c(FALSE, vec_approx_equal(head(vec, -1L), tail(vec, -1L), na_equal = TRUE, abs_tol = .Machine$double.eps^0.5)), as.logical(is_repeated)) + expect_equal( + c( + FALSE, + vec_approx_equal(head(vec, -1L), tail(vec, -1L), + na_equal = TRUE, abs_tol = .Machine$double.eps^0.5 + ) + ), + as.logical(is_repeated) + ) }) diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index c41072339..aa2a94905 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -159,7 +159,9 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data; als mini_case_death_rate_archive_time_gen <- system.time( mini_case_death_rate_archive_expected <- mini_case_death_rate_archive %>% - epix_slide(~ .x %>% epi_slide_opt(case_rate, frollmean, .window_size = 7)) %>% + epix_slide( + ~ .x %>% epi_slide_opt(case_rate, frollmean, .window_size = 7) + ) %>% select(names(mini_case_death_rate_archive$DT), everything()) %>% as_epi_archive() ) @@ -205,7 +207,9 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data; als archive_cases_dv_subset_time_gen <- system.time( archive_cases_dv_subset_expected <- archive_cases_dv_subset %>% - epix_slide(~ .x %>% epi_slide_opt(percent_cli, frollmean, .window_size = 7)) %>% + epix_slide( + ~ .x %>% epi_slide_opt(percent_cli, frollmean, .window_size = 7) + ) %>% select(geo_value, time_value, version, everything()) %>% as_epi_archive() ) From f2ecb3174c1cec551343d5c0d4e743f02ccdb63e Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 14:24:28 -0700 Subject: [PATCH 040/107] Address missing ::: in tests --- tests/testthat/test-epi_slide_opt_archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index aa2a94905..088452da5 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -55,7 +55,7 @@ test_that("epi_slide_opt_archive_one_epikey works as expected", { f <- purrr::partial(data.table::frollmean, algo = "exact") result <- updates %>% - epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") %>% + epiprocess:::epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") %>% lapply(function(x) { x %>% arrange(time_value) %>% From a737dbe175563e5174197fcf910986f8253ba74f Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 10 Mar 2025 16:03:07 -0700 Subject: [PATCH 041/107] Fix another missing `:::` --- R/epi_slide_opt_archive.R | 2 +- man/epi_slide_opt_archive_one_epikey.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index eaf5bbb88..1250d6fb2 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -37,7 +37,7 @@ #' f <- purrr::partial(data.table::frollmean, algo = "exact") #' #' updates %>% -#' epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") +#' epiprocess:::epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") #' #' @keywords internal epi_slide_opt_archive_one_epikey <- function( diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd index cec4b1d02..f1a992a77 100644 --- a/man/epi_slide_opt_archive_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -64,7 +64,7 @@ updates <- bind_rows( f <- purrr::partial(data.table::frollmean, algo = "exact") updates \%>\% - epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") + epiprocess:::epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") } \keyword{internal} From d1c1e1d428d5b74ed2e8848d9a728f3a16daa277 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 11 Mar 2025 09:34:19 -0700 Subject: [PATCH 042/107] Fix + add NEWS.md entries We actually coincidentally caught the tibble-to-DT key-setting issue with code intended for addressing a separate issue with `data.table`s passed in. --- NEWS.md | 14 +++++++------- R/archive.R | 20 ++++++++++++-------- man/epi_archive.Rd | 18 +++++++++++------- 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1218da12f..89435c9df 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,15 +9,15 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - `is_epi_archive` function has been reintroduced. ## Breaking changes -- `new_epi_archive()`'s `x` argument has been replaced with a `data_table` - argument, which must be a `data.table` with the key already set appropriately. - The `key()` of its `DT` will also now place `other_keys` before rather than after - `"time_value"`. +- The low-level `new_epi_archive()` function's `x` argument has been replaced + with a `data_table` argument, which now has extra requirements; see + `?new_epi_archive`. Users should still be using `as_epi_archive()` unless they + have a need for something lower-level. -## Bug fixes +## New features -- `as_epi_archive()` no longer has issues setting its `DT`'s `key` on some - versions of `{data.table}` when `x` is a tibble. +- `epi_slide_{mean,sum,opt}` now work on `epi_archive`s, preparing version + histories for 7-day-averages of signals, etc. # epiprocess 0.11 diff --git a/R/archive.R b/R/archive.R index 35a9e775c..80adbc379 100644 --- a/R/archive.R +++ b/R/archive.R @@ -186,17 +186,21 @@ next_after.Date <- function(x) x + 1L #' archive. Unexpected behavior may result from modifying the metadata #' directly. #' -#' @param data_table a data.table with [`data.table::key()`] equal to -#' `c("geo_value", other_keys, "time_value", "version")`. +#' @param data_table a `data.table` with [`data.table::key()`] equal to +#' `c("geo_value", other_keys, "time_value", "version")`. For `data.table` +#' users: this sets up an alias of `data_table`; if you plan to keep on +#' working with `data_table` or working directly with the archive's `$DT` +#' using mutating operations, you should `copy()` if appropriate. We will not +#' mutate the `DT` with any exported {epiprocess} functions, though. #' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the -#' location column and set to "custom" if not recognized. -#' @param time_type DEPRECATED Has no effect. Time value type inferred from the time -#' column and set to "custom" if not recognized. Unpredictable behavior may result -#' if the time type is not recognized. +#' location column and set to "custom" if not recognized. +#' @param time_type DEPRECATED Has no effect. Time value type inferred from the +#' time column and set to "custom" if not recognized. Unpredictable behavior +#' may result if the time type is not recognized. #' @param other_keys Character vector specifying the names of variables in `x` #' that should be considered key variables (in the language of `data.table`) -#' apart from "geo_value", "time_value", and "version". Typical examples -#' are "age" or more granular geographies. +#' apart from "geo_value", "time_value", and "version". Typical examples are +#' "age" or more granular geographies. #' @param compactify Optional; `TRUE`, `FALSE`, or `"message"`. `TRUE` will #' remove some redundant rows, `FALSE` will not. `"message"` is like `TRUE` #' but will emit a message if anything was changed. Default is `TRUE`. See diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index fd6e8b39e..1f0bf6bdf 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -47,20 +47,24 @@ new_epi_archive( validate_epi_archive(x) } \arguments{ -\item{data_table}{a data.table with \code{\link[data.table:setkey]{data.table::key()}} equal to -\code{c("geo_value", other_keys, "time_value", "version")}.} +\item{data_table}{a \code{data.table} with \code{\link[data.table:setkey]{data.table::key()}} equal to +\code{c("geo_value", other_keys, "time_value", "version")}. For \code{data.table} +users: this sets up an alias of \code{data_table}; if you plan to keep on +working with \code{data_table} or working directly with the archive's \verb{$DT} +using mutating operations, you should \code{copy()} if appropriate. We will not +mutate the \code{DT} with any exported {epiprocess} functions, though.} \item{geo_type}{DEPRECATED Has no effect. Geo value type is inferred from the location column and set to "custom" if not recognized.} -\item{time_type}{DEPRECATED Has no effect. Time value type inferred from the time -column and set to "custom" if not recognized. Unpredictable behavior may result -if the time type is not recognized.} +\item{time_type}{DEPRECATED Has no effect. Time value type inferred from the +time column and set to "custom" if not recognized. Unpredictable behavior +may result if the time type is not recognized.} \item{other_keys}{Character vector specifying the names of variables in \code{x} that should be considered key variables (in the language of \code{data.table}) -apart from "geo_value", "time_value", and "version". Typical examples -are "age" or more granular geographies.} +apart from "geo_value", "time_value", and "version". Typical examples are +"age" or more granular geographies.} \item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{"message"}. \code{TRUE} will remove some redundant rows, \code{FALSE} will not. \code{"message"} is like \code{TRUE} From f4447b08be4b7de544b9d5da0bd011ee82f2ba28 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 11 Mar 2025 09:54:35 -0700 Subject: [PATCH 043/107] Fix CHECK doc line length lint --- R/epi_slide_opt_archive.R | 4 +++- man/epi_slide_opt_archive_one_epikey.Rd | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 1250d6fb2..070dc431c 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -37,7 +37,9 @@ #' f <- purrr::partial(data.table::frollmean, algo = "exact") #' #' updates %>% -#' epiprocess:::epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") +#' epiprocess:::epi_slide_opt_archive_one_epikey( +#' "value", f, "data.table", 2L, 0L, "day", "slide_value" +#' ) #' #' @keywords internal epi_slide_opt_archive_one_epikey <- function( diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd index f1a992a77..78eec4fdf 100644 --- a/man/epi_slide_opt_archive_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -64,7 +64,9 @@ updates <- bind_rows( f <- purrr::partial(data.table::frollmean, algo = "exact") updates \%>\% - epiprocess:::epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") + epiprocess:::epi_slide_opt_archive_one_epikey( + "value", f, "data.table", 2L, 0L, "day", "slide_value" + ) } \keyword{internal} From 2c08e01e8f5ca033bcef2270b3859fd6437a7772 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 11 Mar 2025 09:56:00 -0700 Subject: [PATCH 044/107] Fix missing library(dplyr) in example --- R/patch.R | 2 ++ man/vec_approx_equal.Rd | 2 ++ 2 files changed, 4 insertions(+) diff --git a/R/patch.R b/R/patch.R index f8384ad22..0cd42f3cd 100644 --- a/R/patch.R +++ b/R/patch.R @@ -28,6 +28,8 @@ #' #' @examples #' +#' library(dplyr) +#' #' # On numeric vectors: #' vec_approx_equal( #' c(1, 2, 3, NA), diff --git a/man/vec_approx_equal.Rd b/man/vec_approx_equal.Rd index f3612fa19..ff403a1c6 100644 --- a/man/vec_approx_equal.Rd +++ b/man/vec_approx_equal.Rd @@ -51,6 +51,8 @@ the precise behavior in these cases may change and should not be relied upon. } \examples{ +library(dplyr) + # On numeric vectors: vec_approx_equal( c(1, 2, 3, NA), From 2aa030166120e51f9c9758ff085aba8ad77ecad1 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 11 Mar 2025 10:04:25 -0700 Subject: [PATCH 045/107] Fix {epiprocess} -> `{epiprocess}` in roxygen --- R/archive.R | 2 +- man/epi_archive.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index 80adbc379..5afabb13b 100644 --- a/R/archive.R +++ b/R/archive.R @@ -191,7 +191,7 @@ next_after.Date <- function(x) x + 1L #' users: this sets up an alias of `data_table`; if you plan to keep on #' working with `data_table` or working directly with the archive's `$DT` #' using mutating operations, you should `copy()` if appropriate. We will not -#' mutate the `DT` with any exported {epiprocess} functions, though. +#' mutate the `DT` with any exported `{epiprocess}` functions, though. #' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the #' location column and set to "custom" if not recognized. #' @param time_type DEPRECATED Has no effect. Time value type inferred from the diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 1f0bf6bdf..ab9da219f 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -52,7 +52,7 @@ validate_epi_archive(x) users: this sets up an alias of \code{data_table}; if you plan to keep on working with \code{data_table} or working directly with the archive's \verb{$DT} using mutating operations, you should \code{copy()} if appropriate. We will not -mutate the \code{DT} with any exported {epiprocess} functions, though.} +mutate the \code{DT} with any exported \code{{epiprocess}} functions, though.} \item{geo_type}{DEPRECATED Has no effect. Geo value type is inferred from the location column and set to "custom" if not recognized.} From 56cacaafde9f785e6cdeb72780dc28d22bb991d0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 11 Mar 2025 12:27:00 -0700 Subject: [PATCH 046/107] docs: add vec_approx_equal to pkgdown reference index --- _pkgdown.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 3742bd416..8d3a09d5c 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -93,6 +93,10 @@ reference: - epidatasets::covid_incidence_outliers - epidatasets::covid_case_death_rates_extended + - title: Other utilities + - contents: + - vec_approx_equal + - title: internal - contents: - starts_with("internal") From 77f7bc0a6dfda47ae1181a317f95694ff2f2c734 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 17 Mar 2025 17:18:00 -0700 Subject: [PATCH 047/107] docs(vec_approx_equal): mention vctrs::vec_proxy_equal --- R/patch.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/R/patch.R b/R/patch.R index 0cd42f3cd..af60eb1a1 100644 --- a/R/patch.R +++ b/R/patch.R @@ -153,11 +153,13 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 } else { # XXX No special handling for any other types/situations. Makes sense for # unclassed atomic things; custom classes (e.g., distributions) might want - # recursion / specialization, though. vec_approx_equal0 should probably be an S3 - # method. Also, abs_tol == 0 --> vec_equal logic should maybe be either be - # hoisted to vec_approx_equal or we should manually recurse on data frames even - # with abs_tol = 0 when that's faster (might depend on presence of inds*), - # after some inconsistencies are ironed out. + # recursion / specialization, though. vec_approx_equal0 should probably be + # an S3 method; see also `vctrs::vec_proxy_equal` though it's probably not + # sufficient (e.g., for keyed data frames such as `epi_df`s that should have + # strict & nonstrict columns). Also, abs_tol == 0 --> vec_equal logic should + # maybe be either be hoisted to vec_approx_equal or we should manually + # recurse on data frames even with abs_tol = 0 when that's faster (might + # depend on presence of inds*), after some inconsistencies are ironed out. if (!is.null(inds1)) { vec1 <- vec_slice(vec1, inds1) vec2 <- vec_slice(vec2, inds2) From 1cb15554f10f1f3ca707d81a80feef26922491e3 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 25 Mar 2025 16:24:37 -0700 Subject: [PATCH 048/107] Fix & test some vec_approx_equal vs. vec_equal behaviors --- NAMESPACE | 1 + R/epiprocess-package.R | 1 + R/patch.R | 38 ++++++++++------- tests/testthat/test-archive.R | 14 ------- tests/testthat/test-vec_approx_equal.R | 57 ++++++++++++++++++++++++++ 5 files changed, 82 insertions(+), 29 deletions(-) create mode 100644 tests/testthat/test-vec_approx_equal.R diff --git a/NAMESPACE b/NAMESPACE index 114f77c1d..62c3e7fc8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -283,6 +283,7 @@ importFrom(vctrs,vec_recycle_common) importFrom(vctrs,vec_rep) importFrom(vctrs,vec_rep_each) importFrom(vctrs,vec_seq_along) +importFrom(vctrs,vec_set_names) importFrom(vctrs,vec_size) importFrom(vctrs,vec_size_common) importFrom(vctrs,vec_slice) diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 7c7f54394..03b1838a6 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -60,6 +60,7 @@ #' @importFrom vctrs vec_rep #' @importFrom vctrs vec_rep_each #' @importFrom vctrs vec_seq_along +#' @importFrom vctrs vec_set_names #' @importFrom vctrs vec_size_common #' @importFrom vctrs vec_slice #' @importFrom vctrs vec_slice<- diff --git a/R/patch.R b/R/patch.R index af60eb1a1..fe9d1b6db 100644 --- a/R/patch.R +++ b/R/patch.R @@ -115,12 +115,18 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 # and `fifelse`. Matching vec_equal, we ignore names and other attributes. if (!is.null(inds1)) vec1 <- vec_slice(vec1, inds1) if (!is.null(inds2)) vec2 <- vec_slice(vec2, inds2) + na_or_nan1 <- is.na(vec1) + na_or_nan2 <- is.na(vec2) res <- fifelse( - !is.na(vec1) & !is.na(vec2), + !na_or_nan1 & !na_or_nan2, abs(vec1 - vec2) <= abs_tol, - if (na_equal) is.na(vec1) & is.na(vec2) else NA - # XXX ^ inconsistent with vec_equal treatment: NA vs. NaN comparison - # behavior with na_equal = TRUE is different + if (na_equal) { + na_or_nan1 & na_or_nan2 & (is.nan(vec1) == is.nan(vec2)) + } else { + # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else} + # to be NA. + NA + } ) if (!is.null(dim(vec1))) { dim(res) <- dim(vec1) @@ -139,27 +145,29 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 vec_approx_equal0(vec1[[col_i]], vec2[[col_i]], na_equal, abs_tol, inds1, inds2) })) } - } else if (is_bare_list(vec1)) { + } else if (is_bare_list(vec1) && abs_tol != 0) { vapply(seq_along(vec1), function(i) { entry1 <- vec1[[i]] entry2 <- vec2[[i]] vec_size(entry1) == vec_size(entry2) && - # This is inconsistent with vec_equal on named vectors; to be - # consistently inconsistent, we avoid dispatching to vec_equal for bare - # lists even with abs_tol = 0: - identical(vec_ptype(entry1), vec_ptype(entry2)) && + # Trying to follow vec_equal: strict on ptypes aside from vec_namedness: + identical( + vec_set_names(vec_ptype(entry1), NULL), + vec_set_names(vec_ptype(entry2), NULL) + ) && all(vec_approx_equal0(entry1, entry2, na_equal, abs_tol)) }, logical(1L)) } else { # XXX No special handling for any other types/situations. Makes sense for # unclassed atomic things; custom classes (e.g., distributions) might want # recursion / specialization, though. vec_approx_equal0 should probably be - # an S3 method; see also `vctrs::vec_proxy_equal` though it's probably not - # sufficient (e.g., for keyed data frames such as `epi_df`s that should have - # strict & nonstrict columns). Also, abs_tol == 0 --> vec_equal logic should - # maybe be either be hoisted to vec_approx_equal or we should manually - # recurse on data frames even with abs_tol = 0 when that's faster (might - # depend on presence of inds*), after some inconsistencies are ironed out. + # an S3 method; see also `vctrs::vec_proxy_{equal,compare}` though they + # might not be sufficient (e.g., for keyed data frames such as `epi_df`s + # that should have strict & nonstrict columns). Also, abs_tol == 0 --> + # vec_equal logic should maybe be either be hoisted to vec_approx_equal or + # we should manually recurse on data frames even with abs_tol = 0 when + # that's faster (might depend on presence of inds*), after some + # inconsistencies are ironed out. if (!is.null(inds1)) { vec1 <- vec_slice(vec1, inds1) vec2 <- vec_slice(vec2, inds2) diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 336fe4baa..4383272d2 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -216,17 +216,3 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns ) expect_error(as_epi_archive(tbl3), class = "epiprocess__time_value_version_mismatch") }) - -test_that("is_locf replacement works as expected", { - vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) - is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1) - expect_equal( - c( - FALSE, - vec_approx_equal(head(vec, -1L), tail(vec, -1L), - na_equal = TRUE, abs_tol = .Machine$double.eps^0.5 - ) - ), - as.logical(is_repeated) - ) -}) diff --git a/tests/testthat/test-vec_approx_equal.R b/tests/testthat/test-vec_approx_equal.R new file mode 100644 index 000000000..709f82bbf --- /dev/null +++ b/tests/testthat/test-vec_approx_equal.R @@ -0,0 +1,57 @@ +test_that("is_locf replacement works as expected", { + vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) + is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1) + expect_equal( + c( + FALSE, + vec_approx_equal( + head(vec, -1L), tail(vec, -1L), + na_equal = TRUE, abs_tol = .Machine$double.eps^0.5 + ) + ), + as.logical(is_repeated) + ) +}) + +test_that("vec_approx_equal is compatible with vec_equal on some edge cases", { + # Match (`==` and) `vec_equal` on NaN behavior: + tbl <- tibble::tribble( + ~x, ~y, + NaN, 5, + NaN, NA, + NA, NaN, + NaN, NaN, + ) + expect_identical( + vec_approx_equal(tbl$x, tbl$y, na_equal = FALSE, abs_tol = 1e-8), + vctrs::vec_equal(tbl$x, tbl$y, na_equal = FALSE) + ) + expect_identical( + vec_approx_equal(tbl$x, tbl$y, na_equal = TRUE, abs_tol = 1e-8), + vctrs::vec_equal(tbl$x, tbl$y, na_equal = TRUE) + ) + + # Match `vec_equal` behavior on namedness, including within elements: + unnamed_list <- list(5) + named_list <- list(a = 5) + expect_identical( + vec_approx_equal(unnamed_list, named_list, na_equal = TRUE, abs_tol = 1e-8), + vec_equal(unnamed_list, named_list, na_equal = TRUE) + ) + expect_identical( + vec_approx_equal(list(unnamed_list), list(named_list), na_equal = TRUE, abs_tol = 1e-8), + vec_equal(list(unnamed_list), list(named_list), na_equal = TRUE) + ) + + # Match `vec_equal` behavior on (p)types, including within elements: + dbl <- 5.0 + int <- 5L + expect_identical( + vec_approx_equal(dbl, int, na_equal = TRUE, abs_tol = 1e-8), + vec_equal(dbl, int, na_equal = TRUE) + ) + expect_identical( + vec_approx_equal(list(dbl), list(int), na_equal = TRUE, abs_tol = 1e-8), + vec_equal(list(dbl), list(int), na_equal = TRUE) + ) +}) From 5c8abd3b8b7c204206a6b5a418bf74004224779c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 25 Mar 2025 16:33:02 -0700 Subject: [PATCH 049/107] fix(vec_approx_equal): don't assume inds1 and inds2 non-NULL together --- R/patch.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/patch.R b/R/patch.R index fe9d1b6db..843bc1c10 100644 --- a/R/patch.R +++ b/R/patch.R @@ -170,6 +170,8 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 # inconsistencies are ironed out. if (!is.null(inds1)) { vec1 <- vec_slice(vec1, inds1) + } + if (!is.null(inds2)) { vec2 <- vec_slice(vec2, inds2) } res <- vec_equal(vec1, vec2, na_equal = na_equal) From 71104999edb8887eed5f98182d9d625e8c82bd45 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 25 Mar 2025 17:40:03 -0700 Subject: [PATCH 050/107] perf(vec_approx_equal): avoid vec_cast_common, ptype2 work --- R/patch.R | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/R/patch.R b/R/patch.R index 843bc1c10..189ae9615 100644 --- a/R/patch.R +++ b/R/patch.R @@ -102,7 +102,12 @@ vec_approx_equal <- function(vec1, vec2, na_equal, .ptype = NULL, ..., abs_tol, } else { inds2 <- vec_recycle(inds2, common_size) } - vecs <- vec_cast_common(vec1, vec2, .to = .ptype) + if (!identical(vec_ptype(vec1), vec_ptype(vec2)) || !is.null(.ptype)) { + # perf: this is slow, so try to avoid it if it's not needed + vecs <- vec_cast_common(vec1, vec2, .to = .ptype) + } else { + vecs <- list(vec1, vec2) + } vec_approx_equal0(vecs[[1]], vecs[[2]], na_equal, abs_tol, inds1, inds2) } @@ -174,7 +179,9 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 if (!is.null(inds2)) { vec2 <- vec_slice(vec2, inds2) } - res <- vec_equal(vec1, vec2, na_equal = na_equal) + # perf: vec1 and vec2 have already been cast to a common ptype; we can't + # disable casts, but can say to cast (again...) to that ptype + res <- vec_equal(vec1, vec2, na_equal = na_equal, vec_ptype(vec1)) return(res) } } From 90c9da7094e85b49f11235978c2e6f8a9a2a75aa Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 25 Mar 2025 17:42:04 -0700 Subject: [PATCH 051/107] docs(vec_approx_equal): convert comment to issue --- R/patch.R | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/R/patch.R b/R/patch.R index 189ae9615..7167bc8c0 100644 --- a/R/patch.R +++ b/R/patch.R @@ -163,16 +163,9 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 all(vec_approx_equal0(entry1, entry2, na_equal, abs_tol)) }, logical(1L)) } else { - # XXX No special handling for any other types/situations. Makes sense for - # unclassed atomic things; custom classes (e.g., distributions) might want - # recursion / specialization, though. vec_approx_equal0 should probably be - # an S3 method; see also `vctrs::vec_proxy_{equal,compare}` though they - # might not be sufficient (e.g., for keyed data frames such as `epi_df`s - # that should have strict & nonstrict columns). Also, abs_tol == 0 --> - # vec_equal logic should maybe be either be hoisted to vec_approx_equal or - # we should manually recurse on data frames even with abs_tol = 0 when - # that's faster (might depend on presence of inds*), after some - # inconsistencies are ironed out. + # No special handling for any other types/situations. We may want to allow + # S3 extension of this method or of a new appropriate vec_proxy_* variant. + # See Issue #640. if (!is.null(inds1)) { vec1 <- vec_slice(vec1, inds1) } From 2fcb00c94508aeb0dad1029e0dce5e4b80e34aa0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 25 Mar 2025 18:01:08 -0700 Subject: [PATCH 052/107] docs(vec_approx_equal): clean up some outdated/misleading comments --- R/patch.R | 7 ++++--- man/vec_approx_equal.Rd | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/patch.R b/R/patch.R index 7167bc8c0..28176366a 100644 --- a/R/patch.R +++ b/R/patch.R @@ -20,7 +20,8 @@ #' @param inds1,inds2 optional (row) indices into vec1 and vec2 compatible with #' [`vctrs::vec_slice()`]; output should be consistent with `vec_slice`-ing to #' these indices beforehand, but can give faster computation if `vec1` and -#' `vec2` are data frames. +#' `vec2` are data frames. Currently, any speedup is only by making sure that +#' `vec_slice` is used rather than `[` for data frames. #' #' @return logical vector, with length matching the result of recycling `vec1` #' (at `inds1` if provided) and `vec2` (at `inds2` if provided); entries @@ -116,12 +117,12 @@ vec_approx_equal <- function(vec1, vec2, na_equal, .ptype = NULL, ..., abs_tol, #' @keywords internal vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) { if (is_bare_numeric(vec1) && abs_tol != 0) { - # perf: since we're working with bare numerics and logicals: we can use `[` - # and `fifelse`. Matching vec_equal, we ignore names and other attributes. + # Matching vec_equal, we ignore names and other attributes. if (!is.null(inds1)) vec1 <- vec_slice(vec1, inds1) if (!is.null(inds2)) vec2 <- vec_slice(vec2, inds2) na_or_nan1 <- is.na(vec1) na_or_nan2 <- is.na(vec2) + # Since above are bare logical vectors, we can use `fifelse` res <- fifelse( !na_or_nan1 & !na_or_nan2, abs(vec1 - vec2) <= abs_tol, diff --git a/man/vec_approx_equal.Rd b/man/vec_approx_equal.Rd index ff403a1c6..0b80d9910 100644 --- a/man/vec_approx_equal.Rd +++ b/man/vec_approx_equal.Rd @@ -37,7 +37,8 @@ by name)} \item{inds1, inds2}{optional (row) indices into vec1 and vec2 compatible with \code{\link[vctrs:vec_slice]{vctrs::vec_slice()}}; output should be consistent with \code{vec_slice}-ing to these indices beforehand, but can give faster computation if \code{vec1} and -\code{vec2} are data frames.} +\code{vec2} are data frames. Currently, any speedup is only by making sure that +\code{vec_slice} is used rather than \code{[} for data frames.} } \value{ logical vector, with length matching the result of recycling \code{vec1} From 1b10cbe2113820e13dc45937b440aba402a95c70 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Mar 2025 11:33:57 -0700 Subject: [PATCH 053/107] Update is_locf replacement test for new NA vs. NaN results --- tests/testthat/test-vec_approx_equal.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-vec_approx_equal.R b/tests/testthat/test-vec_approx_equal.R index 709f82bbf..18235224c 100644 --- a/tests/testthat/test-vec_approx_equal.R +++ b/tests/testthat/test-vec_approx_equal.R @@ -1,6 +1,6 @@ test_that("is_locf replacement works as expected", { vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) - is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1) + is_repeated <- c(0, 1, 0, 1, 0, 1, 0, 1) expect_equal( c( FALSE, From 118d0edca71a31ba50d0b55d3829cc712b5140d9 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Mar 2025 11:34:16 -0700 Subject: [PATCH 054/107] docs(vec_approx_equal): update wrt compatibility fixes --- R/patch.R | 4 +--- man/vec_approx_equal.Rd | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/R/patch.R b/R/patch.R index 28176366a..51ae2486d 100644 --- a/R/patch.R +++ b/R/patch.R @@ -1,8 +1,6 @@ #' Test two vctrs vectors for equality with some tolerance in some cases #' -#' Similar to [`vctrs::vec_equal`]. Behavior may differ from `vec_equal` with -#' non-`NA` `NaN`s involved, or for bare lists that contain named vectors, and -#' the precise behavior in these cases may change and should not be relied upon. +#' Generalizes [`vctrs::vec_equal`]. #' #' @param vec1,vec2 vctrs vectors (includes data frames). Take care when using #' on named vectors or "keyed" data frames; [`vec_names()`] are largely diff --git a/man/vec_approx_equal.Rd b/man/vec_approx_equal.Rd index 0b80d9910..4bab9f77f 100644 --- a/man/vec_approx_equal.Rd +++ b/man/vec_approx_equal.Rd @@ -46,9 +46,7 @@ logical vector, with length matching the result of recycling \code{vec1} should all be \code{TRUE} or \code{FALSE} if \code{na_equal = TRUE}. } \description{ -Similar to \code{\link[vctrs:vec_equal]{vctrs::vec_equal}}. Behavior may differ from \code{vec_equal} with -non-\code{NA} \code{NaN}s involved, or for bare lists that contain named vectors, and -the precise behavior in these cases may change and should not be relied upon. +Generalizes \code{\link[vctrs:vec_equal]{vctrs::vec_equal}}. } \examples{ From 33414598c13412babf077d5f622722c3d432ec20 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Mar 2025 13:48:01 -0700 Subject: [PATCH 055/107] refactor(epi_slide_opt.epi_archive): address TODOs - Column name clobbering is already checked for / avoided in main function code. - Move input nesting and output rbinding inside of epi_slide_opt_archive_one_epikey. --- R/epi_slide_opt_archive.R | 48 ++++++++++----------- man/epi_slide_opt_archive_one_epikey.Rd | 21 +++++---- tests/testthat/test-epi_slide_opt_archive.R | 27 +++++------- 3 files changed, 47 insertions(+), 49 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 070dc431c..f9cc00f02 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -1,8 +1,9 @@ #' Core operation of `epi_slide_opt.epi_archive` for a single epikey's history #' -#' @param updates tibble with two columns: `version` and `subtbl`; `subtbl` is a -#' list of tibbles, each with a `time_value` column and measurement columns. -#' The epikey should not appear. +#' @param grp_updates tibble with a `version` column and measurement columns for +#' a single epikey, without the epikey labeling columns (e.g., from +#' `group_modify`). Interpretation is analogous to an `epi_archive` `DT`, but +#' a specific row order is not required. #' @param in_colnames chr; names of columns to which to apply `f_dots_baked` #' @param f_dots_baked supported sliding function from `{data.table}` or #' `{slider}`, potentially with some arguments baked in with @@ -17,13 +18,15 @@ #' @param time_type as in `new_epi_archive` #' @param out_colnames chr, same length as `in_colnames`; column names to use #' for results -#' @return list of tibbles with same names as `subtbl`s plus: `c(out_colnames, -#' "version")`; (compactified) diff data to put into an `epi_archive` +#' @return tibble with a `version` column, pre-existing measurement columns, and +#' new measurement columns; (compactified) diff data to put into an +#' `epi_archive`. May not match column ordering; may not ensure any row +#' ordering. #' #' @examples #' #' library(dplyr) -#' updates <- bind_rows( +#' grp_updates <- bind_rows( #' tibble(version = 10, time_value = 1:20, value = 1:20), #' tibble(version = 12, time_value = 4:5, value = 5:4), #' tibble(version = 13, time_value = 8, value = 9), @@ -31,29 +34,30 @@ #' tibble(version = 15, time_value = -10, value = -10), #' tibble(version = 16, time_value = 50, value = 50) #' ) %>% -#' mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) %>% -#' tidyr::nest(.by = version, .key = "subtbl") +#' mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) #' #' f <- purrr::partial(data.table::frollmean, algo = "exact") #' -#' updates %>% +#' grp_updates %>% #' epiprocess:::epi_slide_opt_archive_one_epikey( #' "value", f, "data.table", 2L, 0L, "day", "slide_value" #' ) #' #' @keywords internal epi_slide_opt_archive_one_epikey <- function( - updates, + grp_updates, in_colnames, f_dots_baked, f_from_package, before, after, time_type, out_colnames) { - # TODO check for col name clobbering + grp_updates_by_version <- grp_updates %>% + nest(.by = version, .key = "subtbl") %>% + arrange(version) unit_step <- unit_time_delta(time_type) prev_inp_snapshot <- NULL prev_out_snapshot <- NULL - result <- map(seq_len(nrow(updates)), function(update_i) { - version <- updates$version[[update_i]] - inp_update <- updates$subtbl[[update_i]] + result <- map(seq_len(nrow(grp_updates_by_version)), function(version_i) { + version <- grp_updates_by_version$version[[version_i]] + inp_update <- grp_updates_by_version$subtbl[[version_i]] inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value") if (before == Inf) { if (after != 0) { @@ -124,6 +128,7 @@ epi_slide_opt_archive_one_epikey <- function( out_diff$version <- version out_diff }) + result <- list_rbind(result) result } @@ -188,26 +193,21 @@ epi_slide_opt.epi_archive <- updates_grouped <- .x$DT %>% as.data.frame() %>% as_tibble(.name_repair = "minimal") %>% - # 0 rows input -> 0 rows output, so we can just say drop = TRUE: - grouped_df(epikey_names, TRUE) + # 0 rows input -> 0 rows output for any drop = FALSE groups with 0 rows, so + # we can just say drop = TRUE: + grouped_df(epikey_names, drop = TRUE) if (use_progress) progress_bar_id <- cli::cli_progress_bar(.progress, total = n_groups(updates_grouped)) result <- updates_grouped %>% group_modify(function(group_values, group_key) { - group_updates <- group_values %>% - nest(.by = version, .key = "subtbl") %>% - arrange(version) - # TODO move nesting inside the helper? res <- epi_slide_opt_archive_one_epikey( - group_updates, + group_values, names_info$input_col_names, .f_dots_baked, .f_info$from_package, window_args$before, window_args$after, time_type, names_info$output_col_names - ) %>% - list_rbind() + ) if (use_progress) cli::cli_progress_update(id = progress_bar_id) res }) %>% - ungroup() %>% as.data.frame() %>% # data.table#6859 as.data.table(key = key(.x$DT)) %>% new_epi_archive( diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd index 78eec4fdf..79d635c88 100644 --- a/man/epi_slide_opt_archive_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -5,7 +5,7 @@ \title{Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history} \usage{ epi_slide_opt_archive_one_epikey( - updates, + grp_updates, in_colnames, f_dots_baked, f_from_package, @@ -16,9 +16,10 @@ epi_slide_opt_archive_one_epikey( ) } \arguments{ -\item{updates}{tibble with two columns: \code{version} and \code{subtbl}; \code{subtbl} is a -list of tibbles, each with a \code{time_value} column and measurement columns. -The epikey should not appear.} +\item{grp_updates}{tibble with a \code{version} column and measurement columns for +a single epikey, without the epikey labeling columns (e.g., from +\code{group_modify}). Interpretation is analogous to an \code{epi_archive} \code{DT}, but +a specific row order is not required.} \item{in_colnames}{chr; names of columns to which to apply \code{f_dots_baked}} @@ -42,7 +43,10 @@ to include in the sliding window computation} for results} } \value{ -list of tibbles with same names as \code{subtbl}s plus: \code{c(out_colnames, "version")}; (compactified) diff data to put into an \code{epi_archive} +tibble with a \code{version} column, pre-existing measurement columns, and +new measurement columns; (compactified) diff data to put into an +\code{epi_archive}. May not match column ordering; may not ensure any row +ordering. } \description{ Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history @@ -50,7 +54,7 @@ Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history \examples{ library(dplyr) -updates <- bind_rows( +grp_updates <- bind_rows( tibble(version = 10, time_value = 1:20, value = 1:20), tibble(version = 12, time_value = 4:5, value = 5:4), tibble(version = 13, time_value = 8, value = 9), @@ -58,12 +62,11 @@ updates <- bind_rows( tibble(version = 15, time_value = -10, value = -10), tibble(version = 16, time_value = 50, value = 50) ) \%>\% - mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) \%>\% - tidyr::nest(.by = version, .key = "subtbl") + mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) f <- purrr::partial(data.table::frollmean, algo = "exact") -updates \%>\% +grp_updates \%>\% epiprocess:::epi_slide_opt_archive_one_epikey( "value", f, "data.table", 2L, 0L, "day", "slide_value" ) diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index 088452da5..768107a4d 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -3,7 +3,7 @@ library(dplyr) test_that("epi_slide_opt_archive_one_epikey works as expected", { start_date <- as.Date("2020-01-01") - updates <- bind_rows( + grp_updates <- bind_rows( tibble(version = 10, time_value = 0:20, value = 0:20), tibble(version = 12, time_value = 4:5, value = 5:4), tibble(version = 13, time_value = 8, value = 9), @@ -11,16 +11,13 @@ test_that("epi_slide_opt_archive_one_epikey works as expected", { tibble(version = 15, time_value = -10, value = -10), tibble(version = 16, time_value = 50, value = 50) ) %>% - mutate(across(c(version, time_value), ~ start_date - 1 + .x)) %>% - tidyr::nest(.by = version, .key = "subtbl") + mutate(across(c(version, time_value), ~ start_date - 1 + .x)) expected <- list( - vctrs::vec_cbind( - tibble(version = 10), - updates$subtbl[[1L]] %>% - mutate(time_value = as.numeric(time_value - start_date) + 1) %>% - mutate(slide_value = frollmean(value, 3, algo = "exact")) - ), + grp_updates %>% + slice_min(version) %>% + mutate(across(c(version, time_value), ~ as.numeric(.x - start_date) + 1)) %>% + mutate(slide_value = frollmean(value, 3, algo = "exact")), tibble( version = 12, time_value = c(4, 5, 7), # time 6 unchanged, compactified away @@ -50,17 +47,15 @@ test_that("epi_slide_opt_archive_one_epikey works as expected", { lapply(function(x) { x %>% mutate(across(c(version, time_value), ~ start_date - 1 + .x)) - }) + }) %>% + list_rbind() f <- purrr::partial(data.table::frollmean, algo = "exact") - result <- updates %>% + result <- grp_updates %>% epiprocess:::epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") %>% - lapply(function(x) { - x %>% - arrange(time_value) %>% - select(version, time_value, everything()) - }) + arrange(version, time_value) %>% + select(version, time_value, everything()) expect_equal(result, expected) }) From 06b4286e5ce44124338bb953437d43efe967ddf9 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Wed, 26 Mar 2025 14:16:54 -0700 Subject: [PATCH 056/107] Update R/epi_slide_opt_edf.R Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- R/epi_slide_opt_edf.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index b6be49a1a..24de63f2c 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -42,8 +42,8 @@ upstream_slide_f_info <- function(.f) { # `f` is from somewhere else and not supported cli_abort( c( - "problem with {rlang::expr_label(rlang::caller_arg(f))}", - "i" = "`f` must be one of `data.table`'s rolling functions (`frollmean`, + "problem with {rlang::expr_label(rlang::caller_arg(.f))}", + "i" = "`.f` must be one of `data.table`'s rolling functions (`frollmean`, `frollsum`, `frollapply`. See `?data.table::roll`) or one of `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, etc. See `?slider::\`summary-slide\`` for more options)." From 82cfbddb2965cfa89adf2fa5dedc55b23196b1e5 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Wed, 26 Mar 2025 14:17:14 -0700 Subject: [PATCH 057/107] Update R/epi_slide_opt_edf.R Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- R/epi_slide_opt_edf.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 24de63f2c..0444b857e 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -83,7 +83,7 @@ upstream_slide_f_info <- function(.f) { #' @keywords internal across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, .window_size, .align, .prefix, .suffix, .new_col_names) { - # The position of a given column can be differ between input `.x` and + # The position of a given column can differ between input `.x` and # `.data_group` since the grouping step by default drops grouping columns. # To avoid rerunning `eval_select` for every `.data_group`, convert # positions of user-provided `col_names` into string column names. We avoid From c2216a21e3dd1cb09fc03adc3bbb71c873bc8661 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Wed, 26 Mar 2025 14:17:26 -0700 Subject: [PATCH 058/107] Update R/epi_slide_opt_edf.R Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- R/epi_slide_opt_edf.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 0444b857e..5d28774f8 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -168,7 +168,7 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, #' #' * On `epi_df`s, it will take care of looping over `geo_value`s, temporarily #' filling in time gaps with `NA`s and other work needed to ensure there are -#' exactly n consecutive time steps per computation, and has some other +#' exactly `n` consecutive time steps per computation, and has some other #' convenience features. See `vignette("epi_df")` for more examples. #' #' * On `epi_archive`s, it will calculate the version history for these slide From 3296b2d5f40434f87b0536dd1d3afbbc37efb032 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Wed, 26 Mar 2025 14:17:45 -0700 Subject: [PATCH 059/107] Update R/epi_slide_opt_edf.R Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- R/epi_slide_opt_edf.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 5d28774f8..508d7e6b4 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -163,7 +163,7 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, #' @description #' #' `epi_slide_opt` calculates n-time-step rolling means&sums, -#' cumulative/"running" means&sums, or other operations supported by +#' cumulative/"running" means&sums, and other operations supported by #' [`data.table::froll`] or [`slider::summary-slide`] functions. #' #' * On `epi_df`s, it will take care of looping over `geo_value`s, temporarily From f14c26da978fd55810ae628c0a2e42544dc92a52 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Wed, 26 Mar 2025 21:19:21 +0000 Subject: [PATCH 060/107] docs: document (GHA) --- man/epi_slide_opt.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 1d5999578..bd213828f 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -140,7 +140,7 @@ cumulative/"running" means&sums, or other operations supported by \itemize{ \item On \code{epi_df}s, it will take care of looping over \code{geo_value}s, temporarily filling in time gaps with \code{NA}s and other work needed to ensure there are -exactly n consecutive time steps per computation, and has some other +exactly \code{n} consecutive time steps per computation, and has some other convenience features. See \code{vignette("epi_df")} for more examples. \item On \code{epi_archive}s, it will calculate the version history for these slide computations and combine it with the version history for the rest of the From 87a01d004cd3acd6e747d1ca5d124df8e8962016 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Wed, 26 Mar 2025 21:19:47 +0000 Subject: [PATCH 061/107] docs: document (GHA) --- man/epi_slide_opt.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index bd213828f..8c7d5ad49 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -135,7 +135,7 @@ as \code{.x} if \code{.x} was grouped. } \description{ \code{epi_slide_opt} calculates n-time-step rolling means&sums, -cumulative/"running" means&sums, or other operations supported by +cumulative/"running" means&sums, and other operations supported by \code{\link[data.table:froll]{data.table::froll}} or \code{\link[slider:summary-slide]{slider::summary-slide}} functions. \itemize{ \item On \code{epi_df}s, it will take care of looping over \code{geo_value}s, temporarily From 4bc5b234d438d62f5b61faf2d3336b2c83c077dc Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Mar 2025 14:35:00 -0700 Subject: [PATCH 062/107] refactor(epi_slide_opt_archive): add classes to errors --- R/epi_slide_opt_archive.R | 12 ++++++++---- R/patch.R | 25 ++++++++++++++++--------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index f9cc00f02..eba73e196 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -61,7 +61,8 @@ epi_slide_opt_archive_one_epikey <- function( inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value") if (before == Inf) { if (after != 0) { - cli_abort('.window_size = Inf is only supported with .align = "right"') + cli_abort('.window_size = Inf is only supported with .align = "right"', + class = "epiprocess__epi_slide_opt_archive__inf_window_invalid_align") } # We need to use the entire input snapshot range, filling in time gaps. We # shouldn't pad the ends. @@ -105,7 +106,8 @@ epi_slide_opt_archive_one_epikey <- function( slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after) } } else { - cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}") + cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}", + class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid") } rows_should_keep <- if (before == Inf) { @@ -176,10 +178,12 @@ epi_slide_opt.epi_archive <- ) window_args <- get_before_after_from_window(.window_size, .align, time_type) if (!is.null(.ref_time_values)) { - cli_abort("epi_slide.epi_archive does not support the `.ref_time_values` argument") + cli_abort("epi_slide.epi_archive does not support the `.ref_time_values` argument", + class = "epiprocess__epi_slide_opt_archive__ref_time_values_unsupported") } if (!identical(.all_rows, FALSE)) { - cli_abort("epi_slide.epi_archive does not support the `.all_rows` argument") + cli_abort("epi_slide.epi_archive does not support the `.all_rows` argument", + class = "epiprocess__epi_slide_opt_archive__all_rows_unsupported") } assert( checkmate::check_logical(.progress, any.missing = FALSE, len = 1L, names = "unnamed"), diff --git a/R/patch.R b/R/patch.R index 51ae2486d..ed09541c9 100644 --- a/R/patch.R +++ b/R/patch.R @@ -208,16 +208,19 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # Most input validation + handle NULL earlier_snapshot. This is a small function so # use faster validation variants: if (!is_tibble(later_tbl)) { - cli_abort("`later_tbl` must be a tibble") + cli_abort("`later_tbl` must be a tibble", + class = "epiprocess__tbl_diff2__later_tbl_invalid") } if (is.null(earlier_snapshot)) { return(later_tbl) } if (!is_tibble(earlier_snapshot)) { - cli_abort("`earlier_snapshot` must be a tibble or `NULL`") + cli_abort("`earlier_snapshot` must be a tibble or `NULL`", + class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid") } if (!is.character(ukey_names) || !all(ukey_names %in% names(earlier_snapshot))) { - cli_abort("`ukey_names` must be a subset of column names") + cli_abort("`ukey_names` must be a subset of column names", + class = "epiprocess__tbl_diff2__ukey_names_class_invalid") } later_format <- arg_match0(later_format, c("snapshot", "update")) if (!(is.vector(compactify_abs_tol, mode = "numeric") && @@ -226,7 +229,8 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # Give a specific message: assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L) # Fallback e.g. for invalid classes not caught by assert_numeric: - cli_abort("`compactify_abs_tol` must be a length-1 double/integer >= 0") + cli_abort("`compactify_abs_tol` must be a length-1 double/integer >= 0", + class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid") } # Extract metadata: @@ -241,7 +245,7 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, names and ordering.", "*" = "`earlier_snapshot` colnames: {format_chr_deparse(tbl_names)}", "*" = "`later_tbl` colnames: {format_chr_deparse(names(later_tbl))}" - )) + ), class = "epiprocess__tbl_diff2__tbl_names_differ") } combined_tbl <- vec_rbind(earlier_snapshot, later_tbl) @@ -330,23 +334,26 @@ tbl_patch <- function(snapshot, update, ukey_names) { # Most input validation. This is a small function so use faster validation # variants: if (!is_tibble(update)) { - cli_abort("`update` must be a tibble") + cli_abort("`update` must be a tibble", + class = "epiprocess__tbl_patch__update_class_invalid") } if (is.null(snapshot)) { return(update) } if (!is_tibble(snapshot)) { - cli_abort("`snapshot` must be a tibble") + cli_abort("`snapshot` must be a tibble", + class = "epiprocess__tbl_patch__snapshot_class_invalid") } if (!is.character(ukey_names) || !all(ukey_names %in% names(snapshot))) { - cli_abort("`ukey_names` must be a subset of column names") + cli_abort("`ukey_names` must be a subset of column names", + class = "epiprocess__tbl_patch__ukey_names_invalid") } if (!identical(names(snapshot), names(update))) { cli_abort(c("`snapshot` and `update` should have identical column names and ordering.", "*" = "`snapshot` colnames: {format_chr_deparse(tbl_names)}", "*" = "`update` colnames: {format_chr_deparse(names(update))}" - )) + ), class = "epiprocess__tbl_patch__tbl_names_invalid") } result_tbl <- vec_rbind(update, snapshot) From 711419a9f4c3f9499e7c90117bd2f258e89599cc Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Mar 2025 14:35:24 -0700 Subject: [PATCH 063/107] Skip malfunctioning test being updated separately --- tests/testthat/test-compactify.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 229af8453..e314b299f 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -124,7 +124,7 @@ quantile_pred_once <- function(estimates_vec, levels_vec) { hardhat::quantile_pred(t(as.matrix(estimates_vec)), levels_vec) } test_that("compactify works on distributions", { - skip("Until #611 is merged or hardhat/epipredict is patched") + skip("See #631.") forecasts <- tibble( ahead = 2L, geo_value = "ak", From 818f25a7dbb165eb112bb7b29160db4209e11cd2 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Wed, 26 Mar 2025 21:39:21 +0000 Subject: [PATCH 064/107] style: styler (GHA) --- R/epi_slide_opt_archive.R | 12 ++++++++---- R/patch.R | 23 +++++++++++++++-------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index eba73e196..f98b80fbb 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -62,7 +62,8 @@ epi_slide_opt_archive_one_epikey <- function( if (before == Inf) { if (after != 0) { cli_abort('.window_size = Inf is only supported with .align = "right"', - class = "epiprocess__epi_slide_opt_archive__inf_window_invalid_align") + class = "epiprocess__epi_slide_opt_archive__inf_window_invalid_align" + ) } # We need to use the entire input snapshot range, filling in time gaps. We # shouldn't pad the ends. @@ -107,7 +108,8 @@ epi_slide_opt_archive_one_epikey <- function( } } else { cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}", - class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid") + class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" + ) } rows_should_keep <- if (before == Inf) { @@ -179,11 +181,13 @@ epi_slide_opt.epi_archive <- window_args <- get_before_after_from_window(.window_size, .align, time_type) if (!is.null(.ref_time_values)) { cli_abort("epi_slide.epi_archive does not support the `.ref_time_values` argument", - class = "epiprocess__epi_slide_opt_archive__ref_time_values_unsupported") + class = "epiprocess__epi_slide_opt_archive__ref_time_values_unsupported" + ) } if (!identical(.all_rows, FALSE)) { cli_abort("epi_slide.epi_archive does not support the `.all_rows` argument", - class = "epiprocess__epi_slide_opt_archive__all_rows_unsupported") + class = "epiprocess__epi_slide_opt_archive__all_rows_unsupported" + ) } assert( checkmate::check_logical(.progress, any.missing = FALSE, len = 1L, names = "unnamed"), diff --git a/R/patch.R b/R/patch.R index ed09541c9..dad535683 100644 --- a/R/patch.R +++ b/R/patch.R @@ -209,18 +209,21 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # use faster validation variants: if (!is_tibble(later_tbl)) { cli_abort("`later_tbl` must be a tibble", - class = "epiprocess__tbl_diff2__later_tbl_invalid") + class = "epiprocess__tbl_diff2__later_tbl_invalid" + ) } if (is.null(earlier_snapshot)) { return(later_tbl) } if (!is_tibble(earlier_snapshot)) { cli_abort("`earlier_snapshot` must be a tibble or `NULL`", - class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid") + class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid" + ) } if (!is.character(ukey_names) || !all(ukey_names %in% names(earlier_snapshot))) { cli_abort("`ukey_names` must be a subset of column names", - class = "epiprocess__tbl_diff2__ukey_names_class_invalid") + class = "epiprocess__tbl_diff2__ukey_names_class_invalid" + ) } later_format <- arg_match0(later_format, c("snapshot", "update")) if (!(is.vector(compactify_abs_tol, mode = "numeric") && @@ -230,7 +233,8 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L) # Fallback e.g. for invalid classes not caught by assert_numeric: cli_abort("`compactify_abs_tol` must be a length-1 double/integer >= 0", - class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid") + class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid" + ) } # Extract metadata: @@ -335,25 +339,28 @@ tbl_patch <- function(snapshot, update, ukey_names) { # variants: if (!is_tibble(update)) { cli_abort("`update` must be a tibble", - class = "epiprocess__tbl_patch__update_class_invalid") + class = "epiprocess__tbl_patch__update_class_invalid" + ) } if (is.null(snapshot)) { return(update) } if (!is_tibble(snapshot)) { cli_abort("`snapshot` must be a tibble", - class = "epiprocess__tbl_patch__snapshot_class_invalid") + class = "epiprocess__tbl_patch__snapshot_class_invalid" + ) } if (!is.character(ukey_names) || !all(ukey_names %in% names(snapshot))) { cli_abort("`ukey_names` must be a subset of column names", - class = "epiprocess__tbl_patch__ukey_names_invalid") + class = "epiprocess__tbl_patch__ukey_names_invalid" + ) } if (!identical(names(snapshot), names(update))) { cli_abort(c("`snapshot` and `update` should have identical column names and ordering.", "*" = "`snapshot` colnames: {format_chr_deparse(tbl_names)}", "*" = "`update` colnames: {format_chr_deparse(names(update))}" - ), class = "epiprocess__tbl_patch__tbl_names_invalid") + ), class = "epiprocess__tbl_patch__tbl_names_invalid") } result_tbl <- vec_rbind(update, snapshot) From 916318a3b1e17a3cdac85c21a32effd58de211e0 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:47:43 -0700 Subject: [PATCH 065/107] Use data.table column looping in epi_slide_opt.epi_archive --- R/epi_slide_opt_archive.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index f98b80fbb..a5d373d89 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -88,19 +88,19 @@ epi_slide_opt_archive_one_epikey <- function( slide <- inp_snapshot[slide_inp_backrefs, ] slide$time_value <- slide_time_values if (f_from_package == "data.table") { - for (col_i in seq_along(in_colnames)) { - if (before == Inf) { - slide[[out_colnames[[col_i]]]] <- - f_dots_baked(slide[[in_colnames[[col_i]]]], seq_len(slide_nrow), adaptive = TRUE) - } else { - out_col <- f_dots_baked(slide[[in_colnames[[col_i]]]], before + after + 1L) - if (after != 0L) { - # data.table always puts NAs at tails, even with na.rm = TRUE; chop - # off extra NAs from beginning and place missing NAs at end: - out_col <- c(out_col[seq(after + 1L, slide_nrow)], rep(NA, after)) - } - slide[[out_colnames[[col_i]]]] <- out_col + if (before == Inf) { + slide[, out_colnames] <- + f_dots_baked(slide[, in_colnames], seq_len(slide_nrow), adaptive = TRUE) + } else { + out_cols <- f_dots_baked(slide[, in_colnames], before + after + 1L) + if (after != 0L) { + # data.table always puts NAs at tails, even with na.rm = TRUE; chop + # off extra NAs from beginning and place missing NAs at end: + out_cols <- purrr::map(out_cols, function(.x) { + c(.x[seq(after + 1L, slide_nrow)], rep(NA, after)) + }) } + slide[, out_colnames] <- out_cols } } else if (f_from_package == "slider") { for (col_i in seq_along(in_colnames)) { From 6575f3d485b0cda0226a3e18cf0ea4c8d7e30e29 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Mar 2025 14:56:32 -0700 Subject: [PATCH 066/107] Avoid `map` overhead in loop --- R/epi_slide_opt_archive.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index a5d373d89..818c273c1 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -96,8 +96,8 @@ epi_slide_opt_archive_one_epikey <- function( if (after != 0L) { # data.table always puts NAs at tails, even with na.rm = TRUE; chop # off extra NAs from beginning and place missing NAs at end: - out_cols <- purrr::map(out_cols, function(.x) { - c(.x[seq(after + 1L, slide_nrow)], rep(NA, after)) + out_cols <- lapply(out_cols, function(out_col) { + c(out_col[seq(after + 1L, slide_nrow)], rep(NA, after)) }) } slide[, out_colnames] <- out_cols From 9b30f46841c69544db7a454cf3061853d6c32a3e Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Mar 2025 15:23:43 -0700 Subject: [PATCH 067/107] Work toward proper `fill =` + `.align` support in `epi_slide_opt()` This approach to getting the `fill` value makes things make more sense internally, but we still may need to manually deal with the impact of our own padding and directly intercept or disallow `fill =`. Suggested-by: nmdefries --- R/epi_slide_opt_archive.R | 6 +++--- R/epi_slide_opt_edf.R | 6 ++++-- tests/testthat/test-epi_slide.R | 23 +++++++++++++++++++++ tests/testthat/test-epi_slide_opt_archive.R | 16 ++++++++++++++ 4 files changed, 46 insertions(+), 5 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 818c273c1..0cbea1277 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -94,10 +94,10 @@ epi_slide_opt_archive_one_epikey <- function( } else { out_cols <- f_dots_baked(slide[, in_colnames], before + after + 1L) if (after != 0L) { - # data.table always puts NAs at tails, even with na.rm = TRUE; chop - # off extra NAs from beginning and place missing NAs at end: + # data.table always puts `fill` arg (default NA) at the tails, even + # with na.rm = TRUE; chop off extra from beginning and place at end: out_cols <- lapply(out_cols, function(out_col) { - c(out_col[seq(after + 1L, slide_nrow)], rep(NA, after)) + c(out_col[(after + 1L):length(out_col)], out_col[seq_len(after)]) }) } slide[, out_colnames] <- out_cols diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 508d7e6b4..fc75d1eaf 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -474,8 +474,10 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., roll_output <- .f(x = .data_group[, input_col_names], n = window_size, adaptive = TRUE, ...) } if (window_args$after >= 1) { - .data_group[, output_col_names] <- purrr::map(roll_output, function(.x) { - c(.x[(window_args$after + 1L):length(.x)], rep(NA, window_args$after)) + .data_group[, output_col_names] <- lapply(roll_output, function(out_col) { + # data.table always puts `fill` arg (default NA) at the tails, even + # with na.rm = TRUE; chop off extra from beginning and place at end: + c(out_col[(window_args$after + 1L):length(out_col)], out_col[seq_len(window_args$after)]) }) } else { .data_group[, output_col_names] <- roll_output diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 0aa4aca7f..975e17121 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -937,3 +937,26 @@ test_that("epi_slide* output grouping matches input grouping", { character(0) ) }) + +test_that('`epi_slide_opt .align != "right"` respects `fill` arg', { + test_date <- as.Date("2020-01-01") + toy_edf <- tibble( + geo_value = 1, + time_value = test_date - 1 + 1:5, + value = c(1:3, NA, 5) + ) %>% + as_epi_df(as_of = test_date + 10) + + result <- toy_edf %>% + epi_slide_opt(value, frollmean, .window_size = 3, .align = "left", fill = -1000) + + expected <- tibble( + geo_value = 1, + time_value = as.Date("2020-01-01") - 1 + 1:5, + value = c(1:3, NA, 5), + value_3dlav = c(2, NA, NA, -1000, -1000) + ) %>% + as_epi_df(as_of = test_date + 10) + + expect_equal(result, expected) +}) diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index 768107a4d..d472a20c4 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -210,4 +210,20 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data; als ) expect_equal(archive_cases_dv_subset_result, archive_cases_dv_subset_expected) + + archive_cases_dv_subset_time_opt2 <- system.time( + archive_cases_dv_subset_result2 <- archive_cases_dv_subset %>% + epi_slide_opt(percent_cli, frollmean, .window_size = 7, .align = "left", fill = -1000) + ) + + archive_cases_dv_subset_time_gen2 <- system.time( + archive_cases_dv_subset_expected2 <- archive_cases_dv_subset %>% + epix_slide( + ~ .x %>% epi_slide_opt(percent_cli, frollmean, .window_size = 7, .align = "left", fill = -1000) + ) %>% + select(geo_value, time_value, version, everything()) %>% + as_epi_archive() + ) + + expect_equal(archive_cases_dv_subset_result2, archive_cases_dv_subset_expected2) }) From 21a0b38f4256657b1946aede0d809d2f8e14cb68 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Mar 2025 18:15:41 -0700 Subject: [PATCH 068/107] Don't try to support `epi_slide_opt(fill =)` --- R/epi_slide_opt_archive.R | 2 +- R/epi_slide_opt_edf.R | 12 ++++++++++-- tests/testthat/test-epi_slide.R | 19 ++++++------------- tests/testthat/test-epi_slide_opt_archive.R | 18 ++++-------------- 4 files changed, 21 insertions(+), 30 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 0cbea1277..1629bee9f 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -164,7 +164,7 @@ epi_slide_opt.epi_archive <- epikey_names <- key_colnames(.x, exclude = c("time_value", "version")) # Validation & pre-processing: .align <- arg_match(.align) - .f_info <- upstream_slide_f_info(.f) + .f_info <- upstream_slide_f_info(.f, ...) .f_dots_baked <- if (rlang::dots_n(...) == 0L) { # Leaving `.f` unchanged slightly improves computation speed and trims diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index fc75d1eaf..51d88e785 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -21,6 +21,8 @@ upstream_slide_f_possibilities <- tibble::tribble( #' #' @param .f function such as `data.table::frollmean` or `slider::slide_mean`; #' must appear in [`upstream_slide_f_possibilities`] +#' @param ... additional configuration args to `.f` (besides the data and window +#' size&alignment); used to validate `.f` is used in a supported way #' @return named list with two elements: `from_package`, a string containing the #' upstream package name ("data.table" or "slider"), and `namer`, a function #' that takes a column to call `.f` on and outputs a basic name or @@ -28,7 +30,7 @@ upstream_slide_f_possibilities <- tibble::tribble( #' (e.g., "sum", "av", "count"). #' #' @keywords internal -upstream_slide_f_info <- function(.f) { +upstream_slide_f_info <- function(.f, ...) { assert_function(.f) # Check that slide function `.f` is one of those short-listed from @@ -59,6 +61,12 @@ upstream_slide_f_info <- function(.f) { reprex::reprex to provide a minimal reproducible example.') } f_from_package <- f_info_row$package + if (f_from_package == "data.table" && "fill" %in% names(rlang::call_match(dots_expand = FALSE)[["..."]])) { + cli_abort("`epi_slide_opt` does not support `data.table::froll*()` with a + custom `fill =` arg", + class = "epiprocess__epi_slide_opt__fill_unsupported" + ) + } list( from_package = f_from_package, namer = unwrap(f_info_row$namer) @@ -384,7 +392,7 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., # Validate/process .col_names, .f: col_names_quo <- enquo(.col_names) - f_info <- upstream_slide_f_info(.f) + f_info <- upstream_slide_f_info(.f, ...) f_from_package <- f_info$from_package # Validate/process .ref_time_values: diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 975e17121..6fab55d44 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -938,7 +938,7 @@ test_that("epi_slide* output grouping matches input grouping", { ) }) -test_that('`epi_slide_opt .align != "right"` respects `fill` arg', { +test_that('`epi_slide_opt .align != "right"` errors on `fill` arg', { test_date <- as.Date("2020-01-01") toy_edf <- tibble( geo_value = 1, @@ -947,16 +947,9 @@ test_that('`epi_slide_opt .align != "right"` respects `fill` arg', { ) %>% as_epi_df(as_of = test_date + 10) - result <- toy_edf %>% - epi_slide_opt(value, frollmean, .window_size = 3, .align = "left", fill = -1000) - - expected <- tibble( - geo_value = 1, - time_value = as.Date("2020-01-01") - 1 + 1:5, - value = c(1:3, NA, 5), - value_3dlav = c(2, NA, NA, -1000, -1000) - ) %>% - as_epi_df(as_of = test_date + 10) - - expect_equal(result, expected) + expect_error( + toy_edf %>% + epi_slide_opt(value, frollmean, .window_size = 3, .align = "left", fill = -1000), + class = "epiprocess__epi_slide_opt__fill_unsupported" + ) }) diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index d472a20c4..6e6fd2479 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -211,19 +211,9 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data; als expect_equal(archive_cases_dv_subset_result, archive_cases_dv_subset_expected) - archive_cases_dv_subset_time_opt2 <- system.time( - archive_cases_dv_subset_result2 <- archive_cases_dv_subset %>% - epi_slide_opt(percent_cli, frollmean, .window_size = 7, .align = "left", fill = -1000) - ) - - archive_cases_dv_subset_time_gen2 <- system.time( - archive_cases_dv_subset_expected2 <- archive_cases_dv_subset %>% - epix_slide( - ~ .x %>% epi_slide_opt(percent_cli, frollmean, .window_size = 7, .align = "left", fill = -1000) - ) %>% - select(geo_value, time_value, version, everything()) %>% - as_epi_archive() + expect_error( + archive_cases_dv_subset %>% + epi_slide_opt(percent_cli, frollmean, .window_size = 7, .align = "left", fill = -1000), + class = "epiprocess__epi_slide_opt__fill_unsupported" ) - - expect_equal(archive_cases_dv_subset_result2, archive_cases_dv_subset_expected2) }) From c885bbac5a7737d5763f35b4d315e8a572269132 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Thu, 27 Mar 2025 01:18:18 +0000 Subject: [PATCH 069/107] docs: document (GHA) --- man/upstream_slide_f_info.Rd | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/man/upstream_slide_f_info.Rd b/man/upstream_slide_f_info.Rd index 787790ca6..6b8ba7683 100644 --- a/man/upstream_slide_f_info.Rd +++ b/man/upstream_slide_f_info.Rd @@ -4,11 +4,14 @@ \alias{upstream_slide_f_info} \title{Validate & get information about an upstream slide function} \usage{ -upstream_slide_f_info(.f) +upstream_slide_f_info(.f, ...) } \arguments{ \item{.f}{function such as \code{data.table::frollmean} or \code{slider::slide_mean}; must appear in \code{\link{upstream_slide_f_possibilities}}} + +\item{...}{additional configuration args to \code{.f} (besides the data and window +size&alignment); used to validate \code{.f} is used in a supported way} } \value{ named list with two elements: \code{from_package}, a string containing the From f0e495895bf4bac55f42e3eaf00c50cfd0a87a89 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Mar 2025 18:26:29 -0700 Subject: [PATCH 070/107] docs(epi_slide_opt): update comments re. after>=1 padding relocation --- R/epi_slide_opt_archive.R | 7 ++++--- R/epi_slide_opt_edf.R | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 1629bee9f..48d43cceb 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -94,10 +94,11 @@ epi_slide_opt_archive_one_epikey <- function( } else { out_cols <- f_dots_baked(slide[, in_colnames], before + after + 1L) if (after != 0L) { - # data.table always puts `fill` arg (default NA) at the tails, even - # with na.rm = TRUE; chop off extra from beginning and place at end: + # Shift an appropriate amount of NA padding from the start to the end. + # (This padding will later be cut off when we filter down to the + # original time_values.) out_cols <- lapply(out_cols, function(out_col) { - c(out_col[(after + 1L):length(out_col)], out_col[seq_len(after)]) + c(out_col[(after + 1L):length(out_col)], rep(NA, after)) }) } slide[, out_colnames] <- out_cols diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 51d88e785..cb80aac0c 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -483,9 +483,10 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., } if (window_args$after >= 1) { .data_group[, output_col_names] <- lapply(roll_output, function(out_col) { - # data.table always puts `fill` arg (default NA) at the tails, even - # with na.rm = TRUE; chop off extra from beginning and place at end: - c(out_col[(window_args$after + 1L):length(out_col)], out_col[seq_len(window_args$after)]) + # Shift an appropriate amount of NA padding from the start to the end. + # (This padding will later be cut off when we filter down to the + # original time_values.) + c(out_col[(window_args$after + 1L):length(out_col)], rep(NA, window_args$after)) }) } else { .data_group[, output_col_names] <- roll_output From 60563deb0f4c6ca0da9f3eb7a134110bfd516d39 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 26 Mar 2025 18:38:08 -0700 Subject: [PATCH 071/107] Address styler & linter indentation conflict --- R/patch.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/patch.R b/R/patch.R index dad535683..929075de3 100644 --- a/R/patch.R +++ b/R/patch.R @@ -245,8 +245,9 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # More input validation: if (!identical(tbl_names, names(later_tbl))) { - cli_abort(c("`earlier_snapshot` and `later_tbl` should have identical column - names and ordering.", + cli_abort(c( + "`earlier_snapshot` and `later_tbl` should have identical column + names and ordering.", "*" = "`earlier_snapshot` colnames: {format_chr_deparse(tbl_names)}", "*" = "`later_tbl` colnames: {format_chr_deparse(names(later_tbl))}" ), class = "epiprocess__tbl_diff2__tbl_names_differ") @@ -356,8 +357,9 @@ tbl_patch <- function(snapshot, update, ukey_names) { ) } if (!identical(names(snapshot), names(update))) { - cli_abort(c("`snapshot` and `update` should have identical column - names and ordering.", + cli_abort(c( + "`snapshot` and `update` should have identical column + names and ordering.", "*" = "`snapshot` colnames: {format_chr_deparse(tbl_names)}", "*" = "`update` colnames: {format_chr_deparse(names(update))}" ), class = "epiprocess__tbl_patch__tbl_names_invalid") From 09756df09d9cb0d0f21606b9644bf2927bb99086 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 2 Apr 2025 17:12:01 +0000 Subject: [PATCH 072/107] docs: document (GHA) --- man/epi_archive.Rd | 46 ++++++++-------------------------------------- 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index ab9da219f..5aa4ce785 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -9,17 +9,6 @@ \title{\code{as_epi_archive} converts a data frame, data table, or tibble into an \code{epi_archive} object.} \usage{ -new_epi_archive( - data_table, - geo_type, - time_type, - other_keys, - clobberable_versions_start, - versions_end -) - -validate_epi_archive(x) - as_epi_archive( x, geo_type = deprecated(), @@ -36,7 +25,7 @@ as_epi_archive( is_epi_archive(x) new_epi_archive( - x, + data_table, geo_type, time_type, other_keys, @@ -47,12 +36,7 @@ new_epi_archive( validate_epi_archive(x) } \arguments{ -\item{data_table}{a \code{data.table} with \code{\link[data.table:setkey]{data.table::key()}} equal to -\code{c("geo_value", other_keys, "time_value", "version")}. For \code{data.table} -users: this sets up an alias of \code{data_table}; if you plan to keep on -working with \code{data_table} or working directly with the archive's \verb{$DT} -using mutating operations, you should \code{copy()} if appropriate. We will not -mutate the \code{DT} with any exported \code{{epiprocess}} functions, though.} +\item{x}{An object.} \item{geo_type}{DEPRECATED Has no effect. Geo value type is inferred from the location column and set to "custom" if not recognized.} @@ -110,26 +94,12 @@ value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} -\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, -\code{time_value}, \code{version}, and then any additional number of columns.} - -\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{"message"}. \code{TRUE} will -remove some redundant rows, \code{FALSE} will not. \code{"message"} is like \code{TRUE} -but will emit a message if anything was changed. Default is \code{TRUE}. See -more information below under "Compactification:".} - -\item{compactify_abs_tol}{Optional; double. A tolerance level used to detect -approximate equality for compactification. The default is 0, which -corresponds to exact equality. Consider using this if your value columns -undergo tiny nonmeaningful revisions and the archive object with the -default setting is too large.} - -\item{.versions_end}{location based versions_end, used to avoid prefix -\code{version = issue} from being assigned to \code{versions_end} instead of being -used to rename columns.} - -\item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For -example \code{version = release_date}} +\item{data_table}{a \code{data.table} with \code{\link[data.table:setkey]{data.table::key()}} equal to +\code{c("geo_value", other_keys, "time_value", "version")}. For \code{data.table} +users: this sets up an alias of \code{data_table}; if you plan to keep on +working with \code{data_table} or working directly with the archive's \verb{$DT} +using mutating operations, you should \code{copy()} if appropriate. We will not +mutate the \code{DT} with any exported \code{{epiprocess}} functions, though.} } \value{ \itemize{ From 3ce86a00827e8eb031393213a10e13b4ba30b2a5 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 3 Apr 2025 17:20:40 -0500 Subject: [PATCH 073/107] comments and some nit rewrites --- R/archive.R | 12 +++++++-- R/epi_slide_opt_archive.R | 14 +++++----- R/epi_slide_opt_edf.R | 3 ++- R/patch.R | 30 ++++++++++++--------- tests/testthat/test-epi_slide_opt_archive.R | 9 ++++--- 5 files changed, 42 insertions(+), 26 deletions(-) diff --git a/R/archive.R b/R/archive.R index 5afabb13b..543cb4516 100644 --- a/R/archive.R +++ b/R/archive.R @@ -438,7 +438,8 @@ removed_by_compactify <- function(updates_df, ukey_names, abs_tol) { updates_df[update_is_locf(updates_df, ukey_names, abs_tol), ] } -#' Internal helper; lgl; which updates are LOCF +#' Internal helper; lgl; which updates are LOCF and should thus be dropped when +#' compactifying #' #' (Not validated:) Must be called inside certain dplyr data masking verbs (e.g., #' `filter` or `mutate`) being run on an `epi_archive`'s `DT` or a data frame @@ -470,12 +471,18 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) { } else { ekts_tbl <- new_tibble(updates_col_refs[ekt_names]) vals_tbl <- new_tibble(updates_col_refs[val_names]) + # grab the data and a shifted version of the data, and compute the + # entry-wise difference to see if the value has changed # n_updates >= 2L so we can use `:` naturally (this is the reason for # separating out n_updates == 1L from this case): inds1 <- 2L:n_updates inds2 <- 1L:(n_updates - 1L) c( FALSE, # first observation is not LOCF + # for the rest, check that both the keys are exactly the same, and the + # values are within abs_tol + # the key comparison effectively implements a group_by, so that when the + # key changes we're guaranteed the value is correct vec_approx_equal0(ekts_tbl, inds1 = inds1, ekts_tbl, inds2 = inds2, # check ekt (key) cols with 0 tolerance: @@ -493,7 +500,8 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) { #' `epi_archive` object. #' #' @param x A data.frame, data.table, or tibble, with columns `geo_value`, -#' `time_value`, `version`, and then any additional number of columns. +#' `time_value`, `version`, and then any additional number of columns, either +#' keys or values. #' @param ... used for specifying column names, as in [`dplyr::rename`]. For #' example `version = release_date` #' @param .versions_end location based versions_end, used to avoid prefix diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 48d43cceb..c28c56b90 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -27,12 +27,12 @@ #' #' library(dplyr) #' grp_updates <- bind_rows( -#' tibble(version = 10, time_value = 1:20, value = 1:20), -#' tibble(version = 12, time_value = 4:5, value = 5:4), -#' tibble(version = 13, time_value = 8, value = 9), -#' tibble(version = 14, time_value = 11, value = NA), -#' tibble(version = 15, time_value = -10, value = -10), -#' tibble(version = 16, time_value = 50, value = 50) +#' tibble(version = 20, time_value = 1:20, value = 1:20), +#' tibble(version = 22, time_value = 4:5, value = 5:4), +#' tibble(version = 23, time_value = 8, value = 9), +#' tibble(version = 24, time_value = 11, value = NA), +#' tibble(version = 25, time_value = -10, value = -10), +#' tibble(version = 26, time_value = 50, value = 50) #' ) %>% #' mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) #' @@ -108,7 +108,7 @@ epi_slide_opt_archive_one_epikey <- function( slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after) } } else { - cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}", + cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}, which is unsupported", class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" ) } diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index cb80aac0c..1e2401e2d 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -73,7 +73,8 @@ upstream_slide_f_info <- function(.f, ...) { ) } -#' Calculate input and output column names for an `{epiprocess}` [`dplyr::across`]-like operations +#' Calculate input and output column names for an `{epiprocess}` +#' [`dplyr::across`]-like operations #' #' @param .x data.frame to perform input column tidyselection on #' @param time_type as in [`new_epi_df`] diff --git a/R/patch.R b/R/patch.R index 929075de3..97d185569 100644 --- a/R/patch.R +++ b/R/patch.R @@ -121,17 +121,22 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 na_or_nan1 <- is.na(vec1) na_or_nan2 <- is.na(vec2) # Since above are bare logical vectors, we can use `fifelse` - res <- fifelse( - !na_or_nan1 & !na_or_nan2, - abs(vec1 - vec2) <= abs_tol, - if (na_equal) { + if (na_equal) { + res <- fifelse( + !na_or_nan1 & !na_or_nan2, + abs(vec1 - vec2) <= abs_tol, na_or_nan1 & na_or_nan2 & (is.nan(vec1) == is.nan(vec2)) - } else { - # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else} - # to be NA. + ) + } else { + # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else} + # to be NA. + res <- fifelse( + !na_or_nan1 & !na_or_nan2, + abs(vec1 - vec2) <= abs_tol, NA - } - ) + ) + } + if (!is.null(dim(vec1))) { dim(res) <- dim(vec1) res <- rowSums(res) == ncol(res) @@ -278,9 +283,9 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # ukey+val duplicates (cases 2. and 3.).) # Row indices of first occurrence of each ukey; will be the same as - # seq_len(combined_n) except for when that ukey has been re-reported in - # `later_tbl`, in which case (3. or 4.) it will point back to the row index of - # the same ukey in `earlier_snapshot`: + # seq_len(combined_n) (cases 1., 2., or 5.) except for when that ukey has been + # re-reported in `later_tbl`, in which case (3. or 4.) it will point back to + # the row index of the same ukey in `earlier_snapshot`: combined_ukey_firsts <- vec_duplicate_id(combined_ukeys) # Which rows from combined are cases 3. or 4.? @@ -368,6 +373,7 @@ tbl_patch <- function(snapshot, update, ukey_names) { result_tbl <- vec_rbind(update, snapshot) dup_ids <- vec_duplicate_id(result_tbl[ukey_names]) + # check that the index hasn't be reset to something lower that it duplicates not_overwritten <- dup_ids == vec_seq_along(result_tbl) result_tbl <- result_tbl[not_overwritten, ] diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index 6e6fd2479..999234217 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -173,13 +173,14 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data; als group_by(geo_value) ) - mini_case_death_rate_archive_b <- mini_case_death_rate_archive %>% - { - as_tibble(as.data.frame(.$DT)) - } %>% + mini_case_death_rate_archive_b <- + mini_case_death_rate_archive$DT %>% + as.data.frame() %>% + as_tibble() %>% mutate(age_group = "overall") %>% as_epi_archive(other_keys = "age_group") + # grouping shouldn't change the outcome expect_equal( mini_case_death_rate_archive_b %>% group_by(geo_value, age_group) %>% From fcbf695b48d2826d43e461b1c65bc44632985a8e Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 3 Apr 2025 22:23:28 +0000 Subject: [PATCH 074/107] docs: document (GHA) --- man/across_ish_names_info.Rd | 6 ++++-- man/epi_slide_opt_archive_one_epikey.Rd | 12 ++++++------ man/update_is_locf.Rd | 3 ++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/man/across_ish_names_info.Rd b/man/across_ish_names_info.Rd index c993c2bf5..36b9ed040 100644 --- a/man/across_ish_names_info.Rd +++ b/man/across_ish_names_info.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/epi_slide_opt_edf.R \name{across_ish_names_info} \alias{across_ish_names_info} -\title{Calculate input and output column names for an \code{{epiprocess}} \code{\link[dplyr:across]{dplyr::across}}-like operations} +\title{Calculate input and output column names for an \code{{epiprocess}} +\code{\link[dplyr:across]{dplyr::across}}-like operations} \usage{ across_ish_names_info( .x, @@ -41,6 +42,7 @@ named list with two elements: \code{input_col_names}, chr, subset of \code{names(.x)}; and \code{output_colnames}, chr, same length as \code{input_col_names} } \description{ -Calculate input and output column names for an \code{{epiprocess}} \code{\link[dplyr:across]{dplyr::across}}-like operations +Calculate input and output column names for an \code{{epiprocess}} +\code{\link[dplyr:across]{dplyr::across}}-like operations } \keyword{internal} diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd index 79d635c88..ca92c5cf5 100644 --- a/man/epi_slide_opt_archive_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -55,12 +55,12 @@ Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history library(dplyr) grp_updates <- bind_rows( - tibble(version = 10, time_value = 1:20, value = 1:20), - tibble(version = 12, time_value = 4:5, value = 5:4), - tibble(version = 13, time_value = 8, value = 9), - tibble(version = 14, time_value = 11, value = NA), - tibble(version = 15, time_value = -10, value = -10), - tibble(version = 16, time_value = 50, value = 50) + tibble(version = 20, time_value = 1:20, value = 1:20), + tibble(version = 22, time_value = 4:5, value = 5:4), + tibble(version = 23, time_value = 8, value = 9), + tibble(version = 24, time_value = 11, value = NA), + tibble(version = 25, time_value = -10, value = -10), + tibble(version = 26, time_value = 50, value = 50) ) \%>\% mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) diff --git a/man/update_is_locf.Rd b/man/update_is_locf.Rd index 722f3d5c4..aaa70bad2 100644 --- a/man/update_is_locf.Rd +++ b/man/update_is_locf.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/archive.R \name{update_is_locf} \alias{update_is_locf} -\title{Internal helper; lgl; which updates are LOCF} +\title{Internal helper; lgl; which updates are LOCF and should thus be dropped when +compactifying} \usage{ update_is_locf(arranged_updates_df, ukey_names, abs_tol) } From a7ff734ce3e484e73822ed26e32eeff8d3ae616a Mon Sep 17 00:00:00 2001 From: brookslogan Date: Fri, 4 Apr 2025 12:04:24 -0700 Subject: [PATCH 075/107] Update R/epi_slide_opt_archive.R Co-authored-by: David Weber --- R/epi_slide_opt_archive.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 48d43cceb..a731bd072 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -127,9 +127,8 @@ epi_slide_opt_archive_one_epikey <- function( } out_update <- slide[rows_should_keep, ] out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") - out_snapshot <- tbl_patch(prev_out_snapshot, out_diff, "time_value") prev_inp_snapshot <<- inp_snapshot - prev_out_snapshot <<- out_snapshot + prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") out_diff$version <- version out_diff }) From 5d2782dcad8621a7527fa93184ebe5979b9474f0 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Fri, 4 Apr 2025 19:06:16 +0000 Subject: [PATCH 076/107] style: styler (GHA) --- R/epi_slide_opt_archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index a731bd072..ca2e6a03f 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -128,7 +128,7 @@ epi_slide_opt_archive_one_epikey <- function( out_update <- slide[rows_should_keep, ] out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") prev_inp_snapshot <<- inp_snapshot - prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") + prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") out_diff$version <- version out_diff }) From cdd9fee46f1b572df09ed4cb45c9d9fa42119176 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Fri, 4 Apr 2025 12:41:43 -0700 Subject: [PATCH 077/107] Update tests/testthat/test-epi_slide_opt_archive.R Co-authored-by: David Weber --- tests/testthat/test-epi_slide_opt_archive.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index 6e6fd2479..66b15b1bd 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -173,10 +173,10 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data; als group_by(geo_value) ) - mini_case_death_rate_archive_b <- mini_case_death_rate_archive %>% - { - as_tibble(as.data.frame(.$DT)) - } %>% + mini_case_death_rate_archive_b <- + mini_case_death_rate_archive$DT %>% + as.data.frame() %>% + as_tibble() %>% mutate(age_group = "overall") %>% as_epi_archive(other_keys = "age_group") From 722bae8d1dce1e7853711dda7f7f77d265e46454 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Fri, 4 Apr 2025 16:58:45 -0700 Subject: [PATCH 078/107] Update R/epi_slide_opt_archive.R --- R/epi_slide_opt_archive.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index c28c56b90..d503483d0 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -27,12 +27,12 @@ #' #' library(dplyr) #' grp_updates <- bind_rows( -#' tibble(version = 20, time_value = 1:20, value = 1:20), -#' tibble(version = 22, time_value = 4:5, value = 5:4), -#' tibble(version = 23, time_value = 8, value = 9), -#' tibble(version = 24, time_value = 11, value = NA), -#' tibble(version = 25, time_value = -10, value = -10), -#' tibble(version = 26, time_value = 50, value = 50) +#' tibble(version = 30, time_value = 1:20, value = 1:20), +#' tibble(version = 32, time_value = 4:5, value = 5:4), +#' tibble(version = 33, time_value = 8, value = 9), +#' tibble(version = 34, time_value = 11, value = NA), +#' tibble(version = 35, time_value = -10, value = -10), +#' tibble(version = 56, time_value = 50, value = 50) #' ) %>% #' mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) #' From 0f2ee96819985824ed8af8ffe1573dcf4ccf9564 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Fri, 4 Apr 2025 16:58:51 -0700 Subject: [PATCH 079/107] Update R/patch.R --- R/patch.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/patch.R b/R/patch.R index 97d185569..69a7dd0ae 100644 --- a/R/patch.R +++ b/R/patch.R @@ -283,8 +283,8 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # ukey+val duplicates (cases 2. and 3.).) # Row indices of first occurrence of each ukey; will be the same as - # seq_len(combined_n) (cases 1., 2., or 5.) except for when that ukey has been - # re-reported in `later_tbl`, in which case (3. or 4.) it will point back to + # seq_len(combined_n) for each ukey's first appearance (cases 1., 2., or 5.); + # for re-reported ukeys in `later_tbl` (cases 3. or 4.), it will point back to # the row index of the same ukey in `earlier_snapshot`: combined_ukey_firsts <- vec_duplicate_id(combined_ukeys) From d20541562fbea5345ea3443d3521e6bdc781540d Mon Sep 17 00:00:00 2001 From: brookslogan Date: Fri, 4 Apr 2025 16:58:58 -0700 Subject: [PATCH 080/107] Update R/patch.R --- R/patch.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/patch.R b/R/patch.R index 69a7dd0ae..262914638 100644 --- a/R/patch.R +++ b/R/patch.R @@ -373,7 +373,11 @@ tbl_patch <- function(snapshot, update, ukey_names) { result_tbl <- vec_rbind(update, snapshot) dup_ids <- vec_duplicate_id(result_tbl[ukey_names]) - # check that the index hasn't be reset to something lower that it duplicates + # Find the "first" appearance of each ukey; since `update` is ordered before `snapshot`, + # this means favoring the rows from `update` over those in `snapshot`. + # This is like `!duplicated()` but faster, and like `vec_unique_loc()` but guaranteeing + # that we get the first appearance since `vec_duplicate_id()` guarantees that + # it points to the first appearance. not_overwritten <- dup_ids == vec_seq_along(result_tbl) result_tbl <- result_tbl[not_overwritten, ] From ae70f7847289e65c886a9a3f588c58f0009bacb3 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Sat, 5 Apr 2025 00:00:45 +0000 Subject: [PATCH 081/107] docs: document (GHA) --- man/epi_slide_opt_archive_one_epikey.Rd | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd index ca92c5cf5..073e684c5 100644 --- a/man/epi_slide_opt_archive_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -55,12 +55,12 @@ Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history library(dplyr) grp_updates <- bind_rows( - tibble(version = 20, time_value = 1:20, value = 1:20), - tibble(version = 22, time_value = 4:5, value = 5:4), - tibble(version = 23, time_value = 8, value = 9), - tibble(version = 24, time_value = 11, value = NA), - tibble(version = 25, time_value = -10, value = -10), - tibble(version = 26, time_value = 50, value = 50) + tibble(version = 30, time_value = 1:20, value = 1:20), + tibble(version = 32, time_value = 4:5, value = 5:4), + tibble(version = 33, time_value = 8, value = 9), + tibble(version = 34, time_value = 11, value = NA), + tibble(version = 35, time_value = -10, value = -10), + tibble(version = 56, time_value = 50, value = 50) ) \%>\% mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) From 71116cbe4c0cf0befecbc6b9fe82572ccd3d1a54 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 6 Apr 2025 15:32:19 -0700 Subject: [PATCH 082/107] Simplify `tbl_diff2` with `tbl_fast_anti_join` --- R/epi_slide_opt_archive.R | 2 +- R/patch.R | 139 +++++++++++++------------------------- 2 files changed, 48 insertions(+), 93 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index efd703186..e6cc9cdc3 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -126,7 +126,7 @@ epi_slide_opt_archive_one_epikey <- function( !is.na(slide_inp_backrefs) } out_update <- slide[rows_should_keep, ] - out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") + out_diff <- tbl_diff2_alt2(prev_out_snapshot, out_update, "time_value", "update") prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") out_diff$version <- version diff --git a/R/patch.R b/R/patch.R index 262914638..4b7c22d0e 100644 --- a/R/patch.R +++ b/R/patch.R @@ -183,6 +183,37 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 } } +#' Variation on [`dplyr::anti_join`] for speed + tolerance setting +#' +#' @param x tibble; `x[ukey_names]` must not have any duplicate rows +#' @param y tibble; `y[ukey_names]` must not have any duplicate rows +#' @param ukey_names chr; names of columns that form a unique key, for `x` and +#' for `y` +#' @param val_names chr; names of columns which should be treated as +#' value/measurement columns, and compared with a tolerance +#' @param abs_tol scalar non-negative numeric; absolute tolerance with which to +#' compare value columns; see [`vec_approx_equal`] +#' @return rows from `x` that either (a) don't have a (0-tolerance) matching +#' ukey in `y`, or (b) have a matching ukey in `y`, but don't have +#' approximately equal value column values +tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) { + x <- x[c(ukey_names, val_names)] + y <- y[c(ukey_names, val_names)] + xy <- vec_rbind(x, y) + if (abs_tol == 0) { + x_exclude <- vec_duplicate_detect(xy) + x_exclude <- vec_slice(x_exclude, seq_len(nrow(x))) + } else { + xy_dup_ids <- vec_duplicate_id(xy[ukey_names]) + xy_dup_inds2 <- which(xy_dup_ids != seq_along(xy_dup_ids)) + xy_dup_inds1 <- xy_dup_ids[xy_dup_inds2] + x_exclude <- rep(FALSE, nrow(x)) + xy_vals <- xy[val_names] + x_exclude[xy_dup_inds1] <- vec_approx_equal(xy_vals, inds1 = xy_dup_inds2, xy_vals, inds2 = xy_dup_inds1, na_equal = TRUE, abs_tol = abs_tol) + } + vec_slice(x, !x_exclude) +} + #' Calculate compact patch to move from one snapshot/update to another #' #' @param earlier_snapshot tibble or `NULL`; `NULL` represents that there was no @@ -214,117 +245,41 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # use faster validation variants: if (!is_tibble(later_tbl)) { cli_abort("`later_tbl` must be a tibble", - class = "epiprocess__tbl_diff2__later_tbl_invalid" - ) + class = "epiprocess__tbl_diff2__later_tbl_invalid" + ) } if (is.null(earlier_snapshot)) { return(later_tbl) } if (!is_tibble(earlier_snapshot)) { cli_abort("`earlier_snapshot` must be a tibble or `NULL`", - class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid" - ) + class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid" + ) } if (!is.character(ukey_names) || !all(ukey_names %in% names(earlier_snapshot))) { cli_abort("`ukey_names` must be a subset of column names", - class = "epiprocess__tbl_diff2__ukey_names_class_invalid" - ) + class = "epiprocess__tbl_diff2__ukey_names_class_invalid" + ) } later_format <- arg_match0(later_format, c("snapshot", "update")) if (!(is.vector(compactify_abs_tol, mode = "numeric") && - length(compactify_abs_tol) == 1L && # nolint:indentation_linter - compactify_abs_tol >= 0)) { + length(compactify_abs_tol) == 1L && # nolint:indentation_linter + compactify_abs_tol >= 0)) { # Give a specific message: assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L) # Fallback e.g. for invalid classes not caught by assert_numeric: cli_abort("`compactify_abs_tol` must be a length-1 double/integer >= 0", - class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid" - ) + class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid") } - # Extract metadata: - earlier_n <- nrow(earlier_snapshot) - later_n <- nrow(later_tbl) - tbl_names <- names(earlier_snapshot) - val_names <- tbl_names[!tbl_names %in% ukey_names] - - # More input validation: - if (!identical(tbl_names, names(later_tbl))) { - cli_abort(c( - "`earlier_snapshot` and `later_tbl` should have identical column - names and ordering.", - "*" = "`earlier_snapshot` colnames: {format_chr_deparse(tbl_names)}", - "*" = "`later_tbl` colnames: {format_chr_deparse(names(later_tbl))}" - ), class = "epiprocess__tbl_diff2__tbl_names_differ") + all_names <- names(later_tbl) + val_names <- all_names[! (all_names %in% ukey_names)] + updates <- tbl_fast_anti_join(later_tbl, earlier_snapshot, ukey_names, val_names, compactify_abs_tol) + if (later_format == "snapshot") { + deletions <- tbl_fast_anti_join(earlier_snapshot[ukey_names], later_tbl[ukey_names], ukey_names, character(), 0) + updates <- vec_rbind(updates, deletions) # fills vals with NAs } - - combined_tbl <- vec_rbind(earlier_snapshot, later_tbl) - combined_n <- nrow(combined_tbl) - - # We'll also need epikeytimes and value columns separately: - combined_ukeys <- combined_tbl[ukey_names] - combined_vals <- combined_tbl[val_names] - - # We have five types of rows in combined_tbl: - # 1. From earlier_snapshot, no matching ukey in later_tbl (deletion; turn vals to - # NAs to match epi_archive format) - # 2. From earlier_snapshot, with matching ukey in later_tbl (context; exclude from - # result) - # 3. From later_tbl, with matching ukey in earlier_snapshot, with value "close" (change - # that we'll compactify away) - # 4. From later_tbl, with matching ukey in earlier_snapshot, value not "close" (change - # that we'll record) - # 5. From later_tbl, with no matching ukey in later_tbl (addition) - - # For "snapshot" later_format, we need to filter to 1., 4., and 5., and alter - # values for 1. For "update" later_format, we need to filter to 4. and 5. - - # (For compactify_abs_tol = 0, we could potentially streamline things by dropping - # ukey+val duplicates (cases 2. and 3.).) - - # Row indices of first occurrence of each ukey; will be the same as - # seq_len(combined_n) for each ukey's first appearance (cases 1., 2., or 5.); - # for re-reported ukeys in `later_tbl` (cases 3. or 4.), it will point back to - # the row index of the same ukey in `earlier_snapshot`: - combined_ukey_firsts <- vec_duplicate_id(combined_ukeys) - - # Which rows from combined are cases 3. or 4.? - combined_ukey_is_repeat <- combined_ukey_firsts != seq_len(combined_n) - # For each row in 3. or 4., row numbers of the ukey appearance in earlier: - ukey_repeat_first_i <- combined_ukey_firsts[combined_ukey_is_repeat] - - # Which rows from combined are in case 3.? - combined_compactify_away <- rep(FALSE, combined_n) - combined_compactify_away[combined_ukey_is_repeat] <- - vec_approx_equal0(combined_vals, - combined_vals, - na_equal = TRUE, - abs_tol = compactify_abs_tol, - inds1 = combined_ukey_is_repeat, - inds2 = ukey_repeat_first_i - ) - - # Which rows from combined are in cases 3., 4., or 5.? - combined_from_later <- vec_rep_each(c(FALSE, TRUE), c(earlier_n, later_n)) - - if (later_format == "update") { - # Cases 4. and 5.: - combined_tbl <- combined_tbl[combined_from_later & !combined_compactify_away, ] - } else { # later_format is "snapshot" - # Which rows from combined are in case 1.? - combined_is_deletion <- vec_rep_each(c(TRUE, FALSE), c(earlier_n, later_n)) - combined_is_deletion[ukey_repeat_first_i] <- FALSE - # Which rows from combined are in cases 1., 4., or 5.? - combined_include <- combined_is_deletion | combined_from_later & !combined_compactify_away - combined_tbl <- combined_tbl[combined_include, ] - # Represent deletion in 1. with NA-ing of all value columns. (In some - # previous approaches to epi_diff2, this seemed to be faster than using - # vec_rbind(case_1_ukeys, cases_45_tbl) or bind_rows to fill with NAs, and more - # general than data.table's rbind(case_1_ukeys, cases_45_tbl, fill = TRUE).) - combined_tbl[combined_is_deletion[combined_include], val_names] <- NA - } - - combined_tbl + updates } #' Apply an update (e.g., from `tbl_diff2`) to a snapshot From 1c5f3c9fa6f42dcf24eb496d9d5d5868d1ab4f07 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 6 Apr 2025 16:23:10 -0700 Subject: [PATCH 083/107] fix(tbl_fast_anti_join): include non-ukey, non-val cols in result --- R/patch.R | 60 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/R/patch.R b/R/patch.R index 4b7c22d0e..00f2145f8 100644 --- a/R/patch.R +++ b/R/patch.R @@ -196,7 +196,10 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 #' @return rows from `x` that either (a) don't have a (0-tolerance) matching #' ukey in `y`, or (b) have a matching ukey in `y`, but don't have #' approximately equal value column values +#' +#' @keywords internal tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) { + x_orig <- x x <- x[c(ukey_names, val_names)] y <- y[c(ukey_names, val_names)] xy <- vec_rbind(x, y) @@ -211,7 +214,7 @@ tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) { xy_vals <- xy[val_names] x_exclude[xy_dup_inds1] <- vec_approx_equal(xy_vals, inds1 = xy_dup_inds2, xy_vals, inds2 = xy_dup_inds1, na_equal = TRUE, abs_tol = abs_tol) } - vec_slice(x, !x_exclude) + vec_slice(x_orig, !x_exclude) } #' Calculate compact patch to move from one snapshot/update to another @@ -244,41 +247,53 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # Most input validation + handle NULL earlier_snapshot. This is a small function so # use faster validation variants: if (!is_tibble(later_tbl)) { - cli_abort("`later_tbl` must be a tibble", - class = "epiprocess__tbl_diff2__later_tbl_invalid" - ) + cli_abort( + "`later_tbl` must be a tibble", + class = "epiprocess__tbl_diff2__later_tbl_invalid" + ) } if (is.null(earlier_snapshot)) { return(later_tbl) } if (!is_tibble(earlier_snapshot)) { - cli_abort("`earlier_snapshot` must be a tibble or `NULL`", - class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid" - ) + cli_abort( + "`earlier_snapshot` must be a tibble or `NULL`", + class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid" + ) } if (!is.character(ukey_names) || !all(ukey_names %in% names(earlier_snapshot))) { - cli_abort("`ukey_names` must be a subset of column names", - class = "epiprocess__tbl_diff2__ukey_names_class_invalid" - ) + cli_abort( + "`ukey_names` must be a subset of column names", + class = "epiprocess__tbl_diff2__ukey_names_class_invalid" + ) } later_format <- arg_match0(later_format, c("snapshot", "update")) if (!(is.vector(compactify_abs_tol, mode = "numeric") && - length(compactify_abs_tol) == 1L && # nolint:indentation_linter - compactify_abs_tol >= 0)) { + length(compactify_abs_tol) == 1L && # nolint:indentation_linter + compactify_abs_tol >= 0)) { # Give a specific message: assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L) # Fallback e.g. for invalid classes not caught by assert_numeric: - cli_abort("`compactify_abs_tol` must be a length-1 double/integer >= 0", - class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid") + cli_abort( + "`compactify_abs_tol` must be a length-1 double/integer >= 0", + class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid" + ) } all_names <- names(later_tbl) - val_names <- all_names[! (all_names %in% ukey_names)] + val_names <- all_names[!(all_names %in% ukey_names)] updates <- tbl_fast_anti_join(later_tbl, earlier_snapshot, ukey_names, val_names, compactify_abs_tol) if (later_format == "snapshot") { + # Interpret `later_tbl` as a full snapshot, rather than a diff / sparse + # update. That means that any ukeys in `earlier_snapshot` that don't appear + # in `later_tbl` were deleted in the later snapshot. deletions <- tbl_fast_anti_join(earlier_snapshot[ukey_names], later_tbl[ukey_names], ukey_names, character(), 0) - updates <- vec_rbind(updates, deletions) # fills vals with NAs + updates <- vec_rbind(updates, deletions) # fills val cols with NAs } + # If `later_format == "update"`, we don't need to do anything special about + # the above ukeys. The full snapshot for the later version would include the + # corresponding rows unchanged, and the diff for these unchanged rows would be + # empty. updates } @@ -299,7 +314,8 @@ tbl_patch <- function(snapshot, update, ukey_names) { # Most input validation. This is a small function so use faster validation # variants: if (!is_tibble(update)) { - cli_abort("`update` must be a tibble", + cli_abort( + "`update` must be a tibble", class = "epiprocess__tbl_patch__update_class_invalid" ) } @@ -307,12 +323,14 @@ tbl_patch <- function(snapshot, update, ukey_names) { return(update) } if (!is_tibble(snapshot)) { - cli_abort("`snapshot` must be a tibble", + cli_abort( + "`snapshot` must be a tibble", class = "epiprocess__tbl_patch__snapshot_class_invalid" ) } if (!is.character(ukey_names) || !all(ukey_names %in% names(snapshot))) { - cli_abort("`ukey_names` must be a subset of column names", + cli_abort( + "`ukey_names` must be a subset of column names", class = "epiprocess__tbl_patch__ukey_names_invalid" ) } @@ -333,8 +351,8 @@ tbl_patch <- function(snapshot, update, ukey_names) { # This is like `!duplicated()` but faster, and like `vec_unique_loc()` but guaranteeing # that we get the first appearance since `vec_duplicate_id()` guarantees that # it points to the first appearance. - not_overwritten <- dup_ids == vec_seq_along(result_tbl) - result_tbl <- result_tbl[not_overwritten, ] + is_only_or_favored_appearance <- dup_ids == vec_seq_along(result_tbl) + result_tbl <- result_tbl[is_only_or_favored_appearance, ] result_tbl } From 5fc1dc594e1c9f41d0bfe6cd0675086f3b503a6f Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 6 Apr 2025 16:40:05 -0700 Subject: [PATCH 084/107] Comment, clean, style, fix @keywords in archive opt slide & helpers --- R/epi_slide_opt_archive.R | 3 ++- R/patch.R | 39 +++++++++++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index e6cc9cdc3..066fc3178 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -108,7 +108,8 @@ epi_slide_opt_archive_one_epikey <- function( slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after) } } else { - cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}, which is unsupported", + cli_abort( + "epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}, which is unsupported", class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" ) } diff --git a/R/patch.R b/R/patch.R index 00f2145f8..2191c42b0 100644 --- a/R/patch.R +++ b/R/patch.R @@ -199,20 +199,39 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 #' #' @keywords internal tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) { - x_orig <- x - x <- x[c(ukey_names, val_names)] - y <- y[c(ukey_names, val_names)] - xy <- vec_rbind(x, y) + x_keyvals <- x[c(ukey_names, val_names)] + y_keyvals <- y[c(ukey_names, val_names)] + xy_keyvals <- vec_rbind(x, y) if (abs_tol == 0) { - x_exclude <- vec_duplicate_detect(xy) + # perf: 0 tolerance is just like a normal `anti_join` by both ukey_names and + # val_names together. We can do that more quickly than `anti_join` with + # `vctrs` by checking for keyvals of `x` that are not duplicated in `y`. + # (`vec_duplicate_detect` will mark those, unlike `duplicated`.) + x_exclude <- vec_duplicate_detect(xy_keyvals) x_exclude <- vec_slice(x_exclude, seq_len(nrow(x))) } else { - xy_dup_ids <- vec_duplicate_id(xy[ukey_names]) - xy_dup_inds2 <- which(xy_dup_ids != seq_along(xy_dup_ids)) - xy_dup_inds1 <- xy_dup_ids[xy_dup_inds2] + xy_ukeys <- xy_keyvals[ukey_names] + # Locate ukeys in `y` that match ukeys in `x` and where in `x` they map back + # to. It's faster to do this with `vec_duplicate_id` on `xy_ukeys` than to + # perform a `inner_join`. + xy_ukey_dup_ids <- vec_duplicate_id(xy_ukeys) + xy_ukey_dup_inds2 <- which(xy_ukey_dup_ids != seq_along(xy_ukey_dup_ids)) + # ^ these should point to rows from y that had a ukey match in x + xy_ukey_dup_inds1 <- xy_ukey_dup_ids[xy_ukey_dup_inds2] + # ^ these should point to the respectively corresponding rows from x + + # Anything in `x` without a ukey match in `y` should be kept; start off with + # `FALSE` for everything and just fill in `TRUE`/`FALSE` results for the + # ukeys with matches in `y`: x_exclude <- rep(FALSE, nrow(x)) xy_vals <- xy[val_names] - x_exclude[xy_dup_inds1] <- vec_approx_equal(xy_vals, inds1 = xy_dup_inds2, xy_vals, inds2 = xy_dup_inds1, na_equal = TRUE, abs_tol = abs_tol) + x_exclude[xy_ukey_dup_inds1] <- vec_approx_equal( + xy_vals, + inds1 = xy_ukey_dup_inds2, + xy_vals, + inds2 = xy_ukey_dup_inds1, + na_equal = TRUE, abs_tol = abs_tol + ) } vec_slice(x_orig, !x_exclude) } @@ -269,7 +288,7 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, } later_format <- arg_match0(later_format, c("snapshot", "update")) if (!(is.vector(compactify_abs_tol, mode = "numeric") && - length(compactify_abs_tol) == 1L && # nolint:indentation_linter + length(compactify_abs_tol) == 1L && # nolint: indentation_linter compactify_abs_tol >= 0)) { # Give a specific message: assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L) From 436fbbac82787bc12b9d461d95dc2ac5df934ece Mon Sep 17 00:00:00 2001 From: brookslogan Date: Sun, 6 Apr 2025 23:43:54 +0000 Subject: [PATCH 085/107] docs: document (GHA) --- man/tbl_fast_anti_join.Rd | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 man/tbl_fast_anti_join.Rd diff --git a/man/tbl_fast_anti_join.Rd b/man/tbl_fast_anti_join.Rd new file mode 100644 index 000000000..91104b2a1 --- /dev/null +++ b/man/tbl_fast_anti_join.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{tbl_fast_anti_join} +\alias{tbl_fast_anti_join} +\title{Variation on \code{\link[dplyr:filter-joins]{dplyr::anti_join}} for speed + tolerance setting} +\usage{ +tbl_fast_anti_join(x, y, ukey_names, val_names, abs_tol = 0) +} +\arguments{ +\item{x}{tibble; \code{x[ukey_names]} must not have any duplicate rows} + +\item{y}{tibble; \code{y[ukey_names]} must not have any duplicate rows} + +\item{ukey_names}{chr; names of columns that form a unique key, for \code{x} and +for \code{y}} + +\item{val_names}{chr; names of columns which should be treated as +value/measurement columns, and compared with a tolerance} + +\item{abs_tol}{scalar non-negative numeric; absolute tolerance with which to +compare value columns; see \code{\link{vec_approx_equal}}} +} +\value{ +rows from \code{x} that either (a) don't have a (0-tolerance) matching +ukey in \code{y}, or (b) have a matching ukey in \code{y}, but don't have +approximately equal value column values +} +\description{ +Variation on \code{\link[dplyr:filter-joins]{dplyr::anti_join}} for speed + tolerance setting +} +\keyword{internal} From 54223639a3a40e8aab672244fb8b42ec5b44b191 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 7 Apr 2025 10:50:02 -0700 Subject: [PATCH 086/107] Fix incomplete rename refactor --- R/epi_slide_opt_archive.R | 2 +- R/patch.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 066fc3178..8da76d413 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -127,7 +127,7 @@ epi_slide_opt_archive_one_epikey <- function( !is.na(slide_inp_backrefs) } out_update <- slide[rows_should_keep, ] - out_diff <- tbl_diff2_alt2(prev_out_snapshot, out_update, "time_value", "update") + out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") out_diff$version <- version diff --git a/R/patch.R b/R/patch.R index 2191c42b0..c9cbc3f91 100644 --- a/R/patch.R +++ b/R/patch.R @@ -233,7 +233,7 @@ tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) { na_equal = TRUE, abs_tol = abs_tol ) } - vec_slice(x_orig, !x_exclude) + vec_slice(x, !x_exclude) } #' Calculate compact patch to move from one snapshot/update to another From f48fbf04d28fe867521a87173aeb8556e23d3b22 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 7 Apr 2025 11:14:16 -0700 Subject: [PATCH 087/107] perf(epi_slide_opt.epi_archive): more `[` -> `vec_slice` changes 14% or 17% time reduction for 7daving on a couple of test archives. --- R/epi_slide_opt_archive.R | 4 ++-- R/patch.R | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 8da76d413..fd4b9f6bb 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -85,7 +85,7 @@ epi_slide_opt_archive_one_epikey <- function( # Get additional values needed from inp_snapshot + perform any NA # tail-padding needed to make slider results a fixed window size rather than # adaptive at tails + perform any NA gap-filling needed: - slide <- inp_snapshot[slide_inp_backrefs, ] + slide <- vec_slice(inp_snapshot, slide_inp_backrefs) slide$time_value <- slide_time_values if (f_from_package == "data.table") { if (before == Inf) { @@ -126,7 +126,7 @@ epi_slide_opt_archive_one_epikey <- function( # Only include time_values that appeared in the input snapshot: !is.na(slide_inp_backrefs) } - out_update <- slide[rows_should_keep, ] + out_update <- vec_slice(slide, rows_should_keep) out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") diff --git a/R/patch.R b/R/patch.R index c9cbc3f91..307ce0a2b 100644 --- a/R/patch.R +++ b/R/patch.R @@ -217,7 +217,7 @@ tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) { xy_ukey_dup_ids <- vec_duplicate_id(xy_ukeys) xy_ukey_dup_inds2 <- which(xy_ukey_dup_ids != seq_along(xy_ukey_dup_ids)) # ^ these should point to rows from y that had a ukey match in x - xy_ukey_dup_inds1 <- xy_ukey_dup_ids[xy_ukey_dup_inds2] + xy_ukey_dup_inds1 <- vec_slice(xy_ukey_dup_ids, xy_ukey_dup_inds2) # ^ these should point to the respectively corresponding rows from x # Anything in `x` without a ukey match in `y` should be kept; start off with @@ -371,7 +371,7 @@ tbl_patch <- function(snapshot, update, ukey_names) { # that we get the first appearance since `vec_duplicate_id()` guarantees that # it points to the first appearance. is_only_or_favored_appearance <- dup_ids == vec_seq_along(result_tbl) - result_tbl <- result_tbl[is_only_or_favored_appearance, ] + result_tbl <- vec_slice(result_tbl, is_only_or_favored_appearance) result_tbl } From ad0ec0f56f342951e4051f55148f402decfdc591 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 7 Apr 2025 12:53:15 -0700 Subject: [PATCH 088/107] docs(patch.R): fix comment grammar --- R/patch.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/patch.R b/R/patch.R index 307ce0a2b..0a5efa014 100644 --- a/R/patch.R +++ b/R/patch.R @@ -211,9 +211,9 @@ tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) { x_exclude <- vec_slice(x_exclude, seq_len(nrow(x))) } else { xy_ukeys <- xy_keyvals[ukey_names] - # Locate ukeys in `y` that match ukeys in `x` and where in `x` they map back - # to. It's faster to do this with `vec_duplicate_id` on `xy_ukeys` than to - # perform a `inner_join`. + # Locate ukeys in `y` that match ukeys in `x`, and where in `x` they map + # back to. It's faster to do this with `vec_duplicate_id` on `xy_ukeys` than + # to perform an `inner_join`. xy_ukey_dup_ids <- vec_duplicate_id(xy_ukeys) xy_ukey_dup_inds2 <- which(xy_ukey_dup_ids != seq_along(xy_ukey_dup_ids)) # ^ these should point to rows from y that had a ukey match in x From 172c3c10c4b1853a79726fb6d2db38ae178901d6 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 7 Apr 2025 15:34:33 -0700 Subject: [PATCH 089/107] fix: add missing importFrom --- NAMESPACE | 1 + R/epiprocess-package.R | 1 + 2 files changed, 2 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 62c3e7fc8..920422a00 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -271,6 +271,7 @@ importFrom(vctrs,vec_cast) importFrom(vctrs,vec_cast_common) importFrom(vctrs,vec_data) importFrom(vctrs,vec_duplicate_any) +importFrom(vctrs,vec_duplicate_detect) importFrom(vctrs,vec_duplicate_id) importFrom(vctrs,vec_equal) importFrom(vctrs,vec_in) diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 03b1838a6..97221a9fc 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -48,6 +48,7 @@ #' @importFrom vctrs vec_cast #' @importFrom vctrs vec_cast_common #' @importFrom vctrs vec_data +#' @importFrom vctrs vec_duplicate_detect #' @importFrom vctrs vec_duplicate_id #' @importFrom vctrs vec_equal #' @importFrom vctrs vec_in From 7a5708f11b7ddc847697724547eec8a56652578b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 8 Apr 2025 09:10:24 -0700 Subject: [PATCH 090/107] Refactor and speed up branch of `vec_approx_equal` NA/NaN testing --- R/patch.R | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/R/patch.R b/R/patch.R index 0a5efa014..44077d2bb 100644 --- a/R/patch.R +++ b/R/patch.R @@ -118,10 +118,10 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 # Matching vec_equal, we ignore names and other attributes. if (!is.null(inds1)) vec1 <- vec_slice(vec1, inds1) if (!is.null(inds2)) vec2 <- vec_slice(vec2, inds2) - na_or_nan1 <- is.na(vec1) - na_or_nan2 <- is.na(vec2) - # Since above are bare logical vectors, we can use `fifelse` if (na_equal) { + na_or_nan1 <- is.na(vec1) + na_or_nan2 <- is.na(vec2) + # Since above are bare logical vectors, we can use `fifelse` res <- fifelse( !na_or_nan1 & !na_or_nan2, abs(vec1 - vec2) <= abs_tol, @@ -129,12 +129,8 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 ) } else { # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else} - # to be NA. - res <- fifelse( - !na_or_nan1 & !na_or_nan2, - abs(vec1 - vec2) <= abs_tol, - NA - ) + # to be NA. That logic is actually baked into the basic formula: + res <- abs(vec1 - vec2) <= abs_tol } if (!is.null(dim(vec1))) { From fdee9de47837af70dd78a24aeba704a79b6fc42b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 8 Apr 2025 14:40:48 -0700 Subject: [PATCH 091/107] Test unifying epi_slide_opt inner comps between edf and archive --- R/epi_slide_opt_archive.R | 71 +---------------- R/epi_slide_opt_edf.R | 162 ++++++++++++++++++-------------------- 2 files changed, 80 insertions(+), 153 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index fd4b9f6bb..8ff1710c7 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -52,81 +52,14 @@ epi_slide_opt_archive_one_epikey <- function( grp_updates_by_version <- grp_updates %>% nest(.by = version, .key = "subtbl") %>% arrange(version) - unit_step <- unit_time_delta(time_type) + unit_step <- unit_time_delta(time_type, format = "fast") prev_inp_snapshot <- NULL prev_out_snapshot <- NULL result <- map(seq_len(nrow(grp_updates_by_version)), function(version_i) { version <- grp_updates_by_version$version[[version_i]] inp_update <- grp_updates_by_version$subtbl[[version_i]] inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value") - if (before == Inf) { - if (after != 0) { - cli_abort('.window_size = Inf is only supported with .align = "right"', - class = "epiprocess__epi_slide_opt_archive__inf_window_invalid_align" - ) - } - # We need to use the entire input snapshot range, filling in time gaps. We - # shouldn't pad the ends. - slide_t_min <- min(inp_snapshot$time_value) - slide_t_max <- max(inp_snapshot$time_value) - } else { - # If the input had updates in the range t1..t2, this could produce changes - # in slide outputs in the range t1-after..t2+before, and to compute those - # slide values, we need to look at the input snapshot from - # t1-after-before..t2+before+after. nolint: commented_code_linter - inp_update_t_min <- min(inp_update$time_value) - inp_update_t_max <- max(inp_update$time_value) - slide_t_min <- inp_update_t_min - (before + after) * unit_step - slide_t_max <- inp_update_t_max + (before + after) * unit_step - } - slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L - slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step - slide_inp_backrefs <- vec_match(slide_time_values, inp_snapshot$time_value) - # Get additional values needed from inp_snapshot + perform any NA - # tail-padding needed to make slider results a fixed window size rather than - # adaptive at tails + perform any NA gap-filling needed: - slide <- vec_slice(inp_snapshot, slide_inp_backrefs) - slide$time_value <- slide_time_values - if (f_from_package == "data.table") { - if (before == Inf) { - slide[, out_colnames] <- - f_dots_baked(slide[, in_colnames], seq_len(slide_nrow), adaptive = TRUE) - } else { - out_cols <- f_dots_baked(slide[, in_colnames], before + after + 1L) - if (after != 0L) { - # Shift an appropriate amount of NA padding from the start to the end. - # (This padding will later be cut off when we filter down to the - # original time_values.) - out_cols <- lapply(out_cols, function(out_col) { - c(out_col[(after + 1L):length(out_col)], rep(NA, after)) - }) - } - slide[, out_colnames] <- out_cols - } - } else if (f_from_package == "slider") { - for (col_i in seq_along(in_colnames)) { - slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after) - } - } else { - cli_abort( - "epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}, which is unsupported", - class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" - ) - } - rows_should_keep <- - if (before == Inf) { - # Re-introduce time gaps: - !is.na(slide_inp_backrefs) - } else { - # Get back to t1-after..t2+before; times outside this range were included - # only so those inside would have enough context for their slide - # computations, but these "context" rows may contain invalid slide - # computation outputs: - vec_rep_each(c(FALSE, TRUE, FALSE), c(before, slide_nrow - before - after, after)) & - # Only include time_values that appeared in the input snapshot: - !is.na(slide_inp_backrefs) - } - out_update <- vec_slice(slide, rows_should_keep) + out_update <- epi_slide_opt_one_epikey(inp_snapshot, f_dots_baked, f_from_package, before, after, unit_step, time_type, inp_update$time_value, in_colnames, out_colnames) out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 1e2401e2d..ec0e017b4 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -167,6 +167,67 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, ) } +epi_slide_opt_one_epikey <- function(inp_snapshot, f_dots_baked, f_from_package, before, after, unit_step, time_type, ref_time_values, in_colnames, out_colnames) { + # TODO try converting time values to reals, do work on reals, convert back at very end? + if (before == Inf) { + if (after != 0L) { + cli_abort('.window_size = Inf is only supported with .align = "right"', + class = "epiprocess__epi_slide_opt_archive__inf_window_invalid_align" + ) + } + # We need to use the entire input snapshot range, filling in time gaps. We + # shouldn't pad the ends. + slide_t_min <- min(inp_snapshot$time_value) + slide_t_max <- max(inp_snapshot$time_value) + } else { + # If the input had updates in the range t1..t2, this could produce changes + # in slide outputs in the range t1-after..t2+before, and to compute those + # slide values, we need to look at the input snapshot from + # t1-after-before..t2+before+after. nolint: commented_code_linter + inp_update_t_min <- min(ref_time_values) + inp_update_t_max <- max(ref_time_values) + slide_t_min <- inp_update_t_min - (before + after) * unit_step + slide_t_max <- inp_update_t_max + (before + after) * unit_step + } + slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L + slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step + slide_inp_backrefs <- vec_match(slide_time_values, inp_snapshot$time_value) + # Get additional values needed from inp_snapshot + perform any NA + # tail-padding needed to make slider results a fixed window size rather than + # adaptive at tails + perform any NA gap-filling needed: + slide <- vec_slice(inp_snapshot, slide_inp_backrefs) + slide$time_value <- slide_time_values + if (f_from_package == "data.table") { + if (before == Inf) { + slide[, out_colnames] <- + f_dots_baked(slide[, in_colnames], seq_len(slide_nrow), adaptive = TRUE) + } else { + out_cols <- f_dots_baked(slide[, in_colnames], before + after + 1L) + if (after != 0L) { + # Shift an appropriate amount of NA padding from the start to the end. + # (This padding will later be cut off when we filter down to the + # original time_values.) + out_cols <- lapply(out_cols, function(out_col) { + c(out_col[(after + 1L):length(out_col)], rep(NA, after)) + }) + } + slide[, out_colnames] <- out_cols + } + } else if (f_from_package == "slider") { + for (col_i in seq_along(in_colnames)) { + slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after) + } + } else { + cli_abort( + "epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}, which is unsupported", + class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" + ) + } + rows_should_keep <- vec_match(ref_time_values, slide_time_values) + out_update <- vec_slice(slide, rows_should_keep) + out_update +} + #' Optimized slide functions for common cases #' #' @description @@ -425,6 +486,9 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., } validate_slide_window_arg(.window_size, time_type) window_args <- get_before_after_from_window(.window_size, .align, time_type) + before <- time_delta_to_n_steps(window_args$before, time_type) + after <- time_delta_to_n_steps(window_args$after, time_type) + unit_step <- unit_time_delta(time_type, format = "fast") # Handle output naming: names_info <- across_ish_names_info( @@ -434,97 +498,27 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., input_col_names <- names_info$input_col_names output_col_names <- names_info$output_col_names - # Make a complete date sequence between min(.x$time_value) and max(.x$time_value). - date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) - all_dates <- date_seq_list$all_dates - pad_early_dates <- date_seq_list$pad_early_dates - pad_late_dates <- date_seq_list$pad_late_dates - - slide_one_grp <- function(.data_group, .group_key, ...) { - missing_times <- all_dates[!vec_in(all_dates, .data_group$time_value)] - # `frollmean` requires a full window to compute a result. Add NA values - # to beginning and end of the group so that we get results for the - # first `before` and last `after` elements. - .data_group <- vec_rbind( - .data_group, # (tibble; epi_slide_opt uses .keep = FALSE) - new_tibble(vec_recycle_common( - time_value = c(missing_times, pad_early_dates, pad_late_dates), - .real = FALSE - )) - ) %>% - `[`(vec_order(.$time_value), ) - - if (f_from_package == "data.table") { - # Grouping should ensure that we don't have duplicate time values. - # Completion above should ensure we have at least .window_size rows. Check - # that we don't have more than .window_size rows (or fewer somehow): - if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { - cli_abort( - c( - "group contains an unexpected number of rows", - "i" = c("Input data may contain `time_values` closer together than the - expected `time_step` size") - ), - class = "epiprocess__epi_slide_opt__unexpected_row_number", - epiprocess__data_group = .data_group, - epiprocess__group_key = .group_key - ) - } - - # `frollmean` is 1-indexed, so create a new window width based on our - # `before` and `after` params. Right-aligned `frollmean` results' - # `ref_time_value`s will be `after` timesteps ahead of where they should - # be; shift results to the left by `after` timesteps. - if (window_args$before != Inf) { - window_size <- window_args$before + window_args$after + 1L - roll_output <- .f(x = .data_group[, input_col_names], n = window_size, ...) - } else { - window_size <- list(seq_along(.data_group$time_value)) - roll_output <- .f(x = .data_group[, input_col_names], n = window_size, adaptive = TRUE, ...) - } - if (window_args$after >= 1) { - .data_group[, output_col_names] <- lapply(roll_output, function(out_col) { - # Shift an appropriate amount of NA padding from the start to the end. - # (This padding will later be cut off when we filter down to the - # original time_values.) - c(out_col[(window_args$after + 1L):length(out_col)], rep(NA, window_args$after)) - }) - } else { - .data_group[, output_col_names] <- roll_output - } - } - if (f_from_package == "slider") { - for (i in seq_along(input_col_names)) { - .data_group[, output_col_names[i]] <- .f( - x = .data_group[[input_col_names[i]]], - before = as.numeric(window_args$before), - after = as.numeric(window_args$after), - ... - ) - } + f_dots_baked <- + if (rlang::dots_n(...) == 0L) { + # Leaving `.f` unchanged slightly improves computation speed and trims + # debug stack traces: + .f + } else { + purrr::partial(.f, ...) } - .data_group - } - result <- .x %>% - `[[<-`(".real", value = TRUE) %>% - group_modify(slide_one_grp, ..., .keep = FALSE) %>% - `[`(.$.real, names(.) != ".real") %>% - arrange_col_canonical() %>% - group_by(!!!.x_orig_groups) + group_modify(function(grp_data, grp_key) { + epi_slide_opt_one_epikey(grp_data, f_dots_baked, f_from_package, before, after, unit_step, time_type, vctrs::vec_set_intersect(ref_time_values, grp_data$time_value), names_info$input_col_names, names_info$output_col_names) + }) %>% + arrange_col_canonical() if (.all_rows) { - result[!vec_in(result$time_value, ref_time_values), output_col_names] <- NA - } else if (user_provided_rtvs) { - result <- result[vec_in(result$time_value, ref_time_values), ] + ekt_names <- key_colnames(.x) + result <- left_join(ungroup(.x), result[c(ekt_names, output_col_names)], by = ekt_names) } - if (!is_epi_df(result)) { - # `.all_rows` handling strips epi_df format and metadata. - # Restore them. - result <- reclass(result, attributes(.x)$metadata) - } + result <- group_by(result, !!!.x_orig_groups) return(result) } From 68065c6dd0a8ee8749dfc3280424040b4d52895f Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Apr 2025 10:51:08 -0700 Subject: [PATCH 092/107] Refactor unified epi_slide_opt_one_epikey for clarity --- R/epi_slide_opt_archive.R | 11 ++++++++++- R/epi_slide_opt_edf.R | 38 +++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 8ff1710c7..543881f90 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -59,7 +59,16 @@ epi_slide_opt_archive_one_epikey <- function( version <- grp_updates_by_version$version[[version_i]] inp_update <- grp_updates_by_version$subtbl[[version_i]] inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value") - out_update <- epi_slide_opt_one_epikey(inp_snapshot, f_dots_baked, f_from_package, before, after, unit_step, time_type, inp_update$time_value, in_colnames, out_colnames) + inp_update_min_t <- min(inp_update$time_value) + inp_update_max_t <- max(inp_update$time_value) + # Time inp_update_max_t + before should have an output update, since it + # depends on inp_update_max_t + before - before = inp_update_max_t, which + # has an input update. Similarly, we could have updates beginning with + # inp_update_min_t - after, or anything in between these two bounds. + out_update_min_t <- time_minus_n_steps(inp_update_min_t, after, time_type) + out_update_max_t <- time_plus_n_steps(inp_update_max_t, before, time_type) + out_update_ts <- vec_slice(inp_snapshot$time_value, between(inp_snapshot$time_value, out_update_min_t, out_update_max_t)) + out_update <- epi_slide_opt_one_epikey(inp_snapshot, f_dots_baked, f_from_package, before, after, unit_step, time_type, out_update_ts, in_colnames, out_colnames) out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index ec0e017b4..8b2945f0c 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -167,7 +167,10 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, ) } -epi_slide_opt_one_epikey <- function(inp_snapshot, f_dots_baked, f_from_package, before, after, unit_step, time_type, ref_time_values, in_colnames, out_colnames) { +epi_slide_opt_one_epikey <- function(inp_tbl, + f_dots_baked, f_from_package, before, after, unit_step, time_type, + out_time_values, + in_colnames, out_colnames) { # TODO try converting time values to reals, do work on reals, convert back at very end? if (before == Inf) { if (after != 0L) { @@ -175,27 +178,23 @@ epi_slide_opt_one_epikey <- function(inp_snapshot, f_dots_baked, f_from_package, class = "epiprocess__epi_slide_opt_archive__inf_window_invalid_align" ) } - # We need to use the entire input snapshot range, filling in time gaps. We - # shouldn't pad the ends. - slide_t_min <- min(inp_snapshot$time_value) - slide_t_max <- max(inp_snapshot$time_value) + # We need to use the entire input range, filling in time gaps. We shouldn't + # pad the ends. + slide_t_min <- min(inp_tbl$time_value) # FIXME match existing behavior or complete change + slide_t_max <- max(inp_tbl$time_value) } else { - # If the input had updates in the range t1..t2, this could produce changes - # in slide outputs in the range t1-after..t2+before, and to compute those - # slide values, we need to look at the input snapshot from - # t1-after-before..t2+before+after. nolint: commented_code_linter - inp_update_t_min <- min(ref_time_values) - inp_update_t_max <- max(ref_time_values) - slide_t_min <- inp_update_t_min - (before + after) * unit_step - slide_t_max <- inp_update_t_max + (before + after) * unit_step + slide_t_min <- min(out_time_values) - before + slide_t_max <- max(out_time_values) + after } slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step - slide_inp_backrefs <- vec_match(slide_time_values, inp_snapshot$time_value) - # Get additional values needed from inp_snapshot + perform any NA + slide_inp_backrefs <- vec_match(slide_time_values, inp_tbl$time_value) + # Get additional values needed from inp_tbl + perform any NA # tail-padding needed to make slider results a fixed window size rather than # adaptive at tails + perform any NA gap-filling needed: - slide <- vec_slice(inp_snapshot, slide_inp_backrefs) + slide <- vec_slice(inp_tbl, slide_inp_backrefs) + # TODO refactor to use a join if not using backrefs later anymore? or perf: + # try removing time_value column before slice? slide$time_value <- slide_time_values if (f_from_package == "data.table") { if (before == Inf) { @@ -223,9 +222,10 @@ epi_slide_opt_one_epikey <- function(inp_snapshot, f_dots_baked, f_from_package, class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" ) } - rows_should_keep <- vec_match(ref_time_values, slide_time_values) - out_update <- vec_slice(slide, rows_should_keep) - out_update + # TODO remove NAs from the match result to make `out_time_values` easier to use? Or rename to `out_time_values`? + rows_should_keep <- vec_match(out_time_values, slide_time_values) + out_tbl <- vec_slice(slide, rows_should_keep) + out_tbl } #' Optimized slide functions for common cases From f0645782f4ba1e5cf3a019c8f8cb808155c84d09 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Apr 2025 11:13:47 -0700 Subject: [PATCH 093/107] Clean up slide range logic --- R/epi_slide_opt_edf.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 8b2945f0c..77b501455 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -180,8 +180,8 @@ epi_slide_opt_one_epikey <- function(inp_tbl, } # We need to use the entire input range, filling in time gaps. We shouldn't # pad the ends. - slide_t_min <- min(inp_tbl$time_value) # FIXME match existing behavior or complete change - slide_t_max <- max(inp_tbl$time_value) + slide_t_min <- min(inp_tbl$time_value) # FIXME match existing behavior, or complete changeover + slide_t_max <- max(out_time_values) } else { slide_t_min <- min(out_time_values) - before slide_t_max <- max(out_time_values) + after @@ -189,9 +189,9 @@ epi_slide_opt_one_epikey <- function(inp_tbl, slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step slide_inp_backrefs <- vec_match(slide_time_values, inp_tbl$time_value) - # Get additional values needed from inp_tbl + perform any NA - # tail-padding needed to make slider results a fixed window size rather than - # adaptive at tails + perform any NA gap-filling needed: + # Get values needed from inp_tbl + perform any NA tail-padding needed to make + # slider results a fixed window size rather than adaptive at tails, and + # perform any NA gap-filling needed: slide <- vec_slice(inp_tbl, slide_inp_backrefs) # TODO refactor to use a join if not using backrefs later anymore? or perf: # try removing time_value column before slice? @@ -222,7 +222,6 @@ epi_slide_opt_one_epikey <- function(inp_tbl, class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" ) } - # TODO remove NAs from the match result to make `out_time_values` easier to use? Or rename to `out_time_values`? rows_should_keep <- vec_match(out_time_values, slide_time_values) out_tbl <- vec_slice(slide, rows_should_keep) out_tbl From 5295f1b225f485ce1680cd9e0b5a4c8cb16006a2 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Apr 2025 12:24:27 -0700 Subject: [PATCH 094/107] perf(unit_time_delta): faster arg matching --- R/time-utils.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/time-utils.R b/R/time-utils.R index 73fbc8a56..5ab8aaabf 100644 --- a/R/time-utils.R +++ b/R/time-utils.R @@ -110,6 +110,8 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU } } +time_delta_formats <- c("friendly", "fast") + #' Object that, added to time_values of time_type, advances by one time step/interval #' #' @param time_type string; `epi_df`'s or `epi_archive`'s `time_type` @@ -129,7 +131,7 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU #' #' @keywords internal unit_time_delta <- function(time_type, format = c("friendly", "fast")) { - format <- rlang::arg_match(format) + format <- rlang::arg_match0(format, time_delta_formats) switch(format, friendly = switch(time_type, day = as.difftime(1, units = "days"), From c568774c9bb04c76adaf678f17c07401b4dda1f0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Apr 2025 16:13:27 -0700 Subject: [PATCH 095/107] refactor: handle time +/- Inf in helper function --- R/epi_slide_opt_archive.R | 8 ++++--- R/epi_slide_opt_edf.R | 18 ++++----------- R/time-utils.R | 38 ++++++++++++++++++++++++++++--- man/time_minus_time_in_n_steps.Rd | 4 ++-- man/time_plus_n_steps.Rd | 9 ++++++-- man/time_plus_slide_window_arg.Rd | 33 +++++++++++++++++++++++++++ 6 files changed, 86 insertions(+), 24 deletions(-) create mode 100644 man/time_plus_slide_window_arg.Rd diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 543881f90..e40e87079 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -64,9 +64,11 @@ epi_slide_opt_archive_one_epikey <- function( # Time inp_update_max_t + before should have an output update, since it # depends on inp_update_max_t + before - before = inp_update_max_t, which # has an input update. Similarly, we could have updates beginning with - # inp_update_min_t - after, or anything in between these two bounds. - out_update_min_t <- time_minus_n_steps(inp_update_min_t, after, time_type) - out_update_max_t <- time_plus_n_steps(inp_update_max_t, before, time_type) + # inp_update_min_t - after, or anything in between these two bounds. If + # before == Inf, we need to update outputs all the way to the end of the + # input *snapshot*. + out_update_min_t <- time_minus_slide_window_arg(inp_update_min_t, after, time_type) + out_update_max_t <- time_plus_slide_window_arg(inp_update_max_t, before, time_type, max(inp_snapshot$time_value)) out_update_ts <- vec_slice(inp_snapshot$time_value, between(inp_snapshot$time_value, out_update_min_t, out_update_max_t)) out_update <- epi_slide_opt_one_epikey(inp_snapshot, f_dots_baked, f_from_package, before, after, unit_step, time_type, out_update_ts, in_colnames, out_colnames) out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 77b501455..f0585543d 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -172,20 +172,10 @@ epi_slide_opt_one_epikey <- function(inp_tbl, out_time_values, in_colnames, out_colnames) { # TODO try converting time values to reals, do work on reals, convert back at very end? - if (before == Inf) { - if (after != 0L) { - cli_abort('.window_size = Inf is only supported with .align = "right"', - class = "epiprocess__epi_slide_opt_archive__inf_window_invalid_align" - ) - } - # We need to use the entire input range, filling in time gaps. We shouldn't - # pad the ends. - slide_t_min <- min(inp_tbl$time_value) # FIXME match existing behavior, or complete changeover - slide_t_max <- max(out_time_values) - } else { - slide_t_min <- min(out_time_values) - before - slide_t_max <- max(out_time_values) + after - } + # + # FIXME min time_value for this epikey vs. entire edf; match existing behavior, or complete changeover + slide_t_min <- time_minus_slide_window_arg(min(out_time_values), before, time_type, min(inp_tbl$time_value)) + slide_t_max <- time_plus_slide_window_arg(max(out_time_values), after, time_type) slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step slide_inp_backrefs <- vec_match(slide_time_values, inp_tbl$time_value) diff --git a/R/time-utils.R b/R/time-utils.R index 5ab8aaabf..e2eba6807 100644 --- a/R/time-utils.R +++ b/R/time-utils.R @@ -339,7 +339,7 @@ difftime_approx_ceiling_time_delta <- function(difftime, time_type) { ) } -#' Difference between two time value vectors in terms of number of time "steps" +#' Difference between two finite `time_value` vectors in terms of number of time "steps" #' #' @param x a time_value (vector) of time type `time_type` #' @param y a time_value (vector) of time type `time_type` @@ -352,15 +352,18 @@ time_minus_time_in_n_steps <- function(x, y, time_type) { time_delta_to_n_steps(x - y, time_type) } -#' Advance/retreat time_values by specified number of time "steps" +#' Advance/retreat time_value(s) by bare-integerish number(s) of time "steps" #' #' Here, a "step" is based on the `time_type`, not just the class of `x`. #' #' @param x a time_value (vector) of time type `time_type` -#' @param y integerish (vector) +#' @param y bare integerish (vector) #' @param time_type as in [`validate_slide_window_arg()`] #' @return a time_value (vector) of time type `time_type` #' +#' @seealso [`time_plus_slide_window_arg`] if you're working with a `y` that is +#' a slide window arg, which is scalar but otherwise more general (class-wise, +#' Inf-wise) than an integerish vector. #' @keywords internal time_plus_n_steps <- function(x, y, time_type) { x + y * unit_time_delta(time_type, "fast") @@ -370,3 +373,32 @@ time_plus_n_steps <- function(x, y, time_type) { time_minus_n_steps <- function(x, y, time_type) { x - y * unit_time_delta(time_type, "fast") } + +#' Advance/retreat time_value(s) by specified amount (slide window arg) +#' +#' @param x a time_value (vector) of time type `time_type` +#' @param y a (scalar) slide window arg; should pass [`validate_slide_window_arg()`] +#' @param time_type as in [`validate_slide_window_arg()`] +#' @param max_time_value when `y == Inf`, what should be the result of adding `y`? +#' @param min_time_value when `y == Inf`, what should be the result of subtracting `y`? +#' @return a time_value (vector) of time type `time_type` +#' +#' @keywords internal +#' @seealso [`time_plus_n_steps`], if you're working with an integerish vector +#' number of time steps `y` (output from other `*n_steps` functions) instead. +time_plus_slide_window_arg <- function(x, y, time_type, max_time_value) { + if (y == Inf) { + rep(max_time_value, vec_size(x)) + } else { + time_plus_n_steps(x, y, time_type) + } +} + +#' @rdname time_plus_slide_window_arg +time_minus_slide_window_arg <- function(x, y, time_type, min_time_value) { + if (y == Inf) { + rep(min_time_value, vec_size(x)) + } else { + time_minus_n_steps(x, y, time_type) + } +} diff --git a/man/time_minus_time_in_n_steps.Rd b/man/time_minus_time_in_n_steps.Rd index aab030dea..926b79034 100644 --- a/man/time_minus_time_in_n_steps.Rd +++ b/man/time_minus_time_in_n_steps.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/time-utils.R \name{time_minus_time_in_n_steps} \alias{time_minus_time_in_n_steps} -\title{Difference between two time value vectors in terms of number of time "steps"} +\title{Difference between two finite \code{time_value} vectors in terms of number of time "steps"} \usage{ time_minus_time_in_n_steps(x, y, time_type) } @@ -18,6 +18,6 @@ integerish vector such that \code{x + n_steps_to_time_delta_fast(result)} should equal \code{y}. } \description{ -Difference between two time value vectors in terms of number of time "steps" +Difference between two finite \code{time_value} vectors in terms of number of time "steps" } \keyword{internal} diff --git a/man/time_plus_n_steps.Rd b/man/time_plus_n_steps.Rd index f7071c132..26edf9053 100644 --- a/man/time_plus_n_steps.Rd +++ b/man/time_plus_n_steps.Rd @@ -3,7 +3,7 @@ \name{time_plus_n_steps} \alias{time_plus_n_steps} \alias{time_minus_n_steps} -\title{Advance/retreat time_values by specified number of time "steps"} +\title{Advance/retreat time_value(s) by bare-integerish number(s) of time "steps"} \usage{ time_plus_n_steps(x, y, time_type) @@ -12,7 +12,7 @@ time_minus_n_steps(x, y, time_type) \arguments{ \item{x}{a time_value (vector) of time type \code{time_type}} -\item{y}{integerish (vector)} +\item{y}{bare integerish (vector)} \item{time_type}{as in \code{\link[=validate_slide_window_arg]{validate_slide_window_arg()}}} } @@ -22,4 +22,9 @@ a time_value (vector) of time type \code{time_type} \description{ Here, a "step" is based on the \code{time_type}, not just the class of \code{x}. } +\seealso{ +\code{\link{time_plus_slide_window_arg}} if you're working with a \code{y} that is +a slide window arg, which is scalar but otherwise more general (class-wise, +Inf-wise) than an integerish vector. +} \keyword{internal} diff --git a/man/time_plus_slide_window_arg.Rd b/man/time_plus_slide_window_arg.Rd new file mode 100644 index 000000000..1aa117f47 --- /dev/null +++ b/man/time_plus_slide_window_arg.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{time_plus_slide_window_arg} +\alias{time_plus_slide_window_arg} +\alias{time_minus_slide_window_arg} +\title{Advance/retreat time_value(s) by specified amount (slide window arg)} +\usage{ +time_plus_slide_window_arg(x, y, time_type, max_time_value) + +time_minus_slide_window_arg(x, y, time_type, min_time_value) +} +\arguments{ +\item{x}{a time_value (vector) of time type \code{time_type}} + +\item{y}{a (scalar) slide window arg; should pass \code{\link[=validate_slide_window_arg]{validate_slide_window_arg()}}} + +\item{time_type}{as in \code{\link[=validate_slide_window_arg]{validate_slide_window_arg()}}} + +\item{max_time_value}{when \code{y == Inf}, what should be the result of adding \code{y}?} + +\item{min_time_value}{when \code{y == Inf}, what should be the result of subtracting \code{y}?} +} +\value{ +a time_value (vector) of time type \code{time_type} +} +\description{ +Advance/retreat time_value(s) by specified amount (slide window arg) +} +\seealso{ +\code{\link{time_plus_n_steps}}, if you're working with an integerish vector +number of time steps \code{y} (output from other \verb{*n_steps} functions) instead. +} +\keyword{internal} From 80966e21f29910cd7a2f010755b39151844312d1 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Apr 2025 18:56:09 -0700 Subject: [PATCH 096/107] Attempt rewriting combined opt slide logic to match old archive perf chars Something still seems different; some Date arithmetic stuck out, and attempts to avoid helped somewhat, but this still seems slower for archives than pre-unified approach. --- NAMESPACE | 1 + R/epi_slide_opt_archive.R | 3 +-- R/epi_slide_opt_edf.R | 38 ++++++++++++++++++++++++++++++++------ R/epiprocess-package.R | 1 + 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 920422a00..c596f67a6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -284,6 +284,7 @@ importFrom(vctrs,vec_recycle_common) importFrom(vctrs,vec_rep) importFrom(vctrs,vec_rep_each) importFrom(vctrs,vec_seq_along) +importFrom(vctrs,vec_set_intersect) importFrom(vctrs,vec_set_names) importFrom(vctrs,vec_size) importFrom(vctrs,vec_size_common) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index e40e87079..2351db8d9 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -69,8 +69,7 @@ epi_slide_opt_archive_one_epikey <- function( # input *snapshot*. out_update_min_t <- time_minus_slide_window_arg(inp_update_min_t, after, time_type) out_update_max_t <- time_plus_slide_window_arg(inp_update_max_t, before, time_type, max(inp_snapshot$time_value)) - out_update_ts <- vec_slice(inp_snapshot$time_value, between(inp_snapshot$time_value, out_update_min_t, out_update_max_t)) - out_update <- epi_slide_opt_one_epikey(inp_snapshot, f_dots_baked, f_from_package, before, after, unit_step, time_type, out_update_ts, in_colnames, out_colnames) + out_update <- epi_slide_opt_one_epikey(inp_snapshot, f_dots_baked, f_from_package, before, after, unit_step, time_type, c(out_update_min_t, out_update_max_t), NULL, in_colnames, out_colnames) out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index f0585543d..673227550 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -169,13 +169,26 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, epi_slide_opt_one_epikey <- function(inp_tbl, f_dots_baked, f_from_package, before, after, unit_step, time_type, - out_time_values, + out_filter_time_range, out_filter_time_set, in_colnames, out_colnames) { # TODO try converting time values to reals, do work on reals, convert back at very end? # - # FIXME min time_value for this epikey vs. entire edf; match existing behavior, or complete changeover - slide_t_min <- time_minus_slide_window_arg(min(out_time_values), before, time_type, min(inp_tbl$time_value)) - slide_t_max <- time_plus_slide_window_arg(max(out_time_values), after, time_type) + # TODO loosen restrictions here. each filter optional? + if (!is.null(out_filter_time_range) && is.null(out_filter_time_set)) { + out_filter_time_style <- "range" + out_t_min <- out_filter_time_range[[1L]] + out_t_max <- out_filter_time_range[[2L]] + } else if (is.null(out_filter_time_range) && !is.null(out_filter_time_set)) { + # FIXME min time_value for this epikey vs. entire edf; match existing behavior, or complete changeover + out_filter_time_style <- "set" + out_time_values <- vec_set_intersect(inp_tbl$time_value, out_filter_time_set) + out_t_min <- min(out_time_values) + out_t_max <- max(out_time_values) + } else { + cli_abort("Exactly one of `out_filter_time_range` and `out_filter_time_set` must be non-`NULL`.") + } + slide_t_min <- time_minus_slide_window_arg(out_t_min, before, time_type, min(inp_tbl$time_value)) + slide_t_max <- time_plus_slide_window_arg(out_t_max, after, time_type) slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step slide_inp_backrefs <- vec_match(slide_time_values, inp_tbl$time_value) @@ -212,7 +225,20 @@ epi_slide_opt_one_epikey <- function(inp_tbl, class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" ) } - rows_should_keep <- vec_match(out_time_values, slide_time_values) + rows_should_keep1 <- !is.na(slide_inp_backrefs) + rows_should_keep2 <- switch( + out_filter_time_style, + range = vec_rep_each( + c(FALSE, TRUE, FALSE), + time_minus_time_in_n_steps( + vctrs::vec_c(out_t_min, out_t_max, slide_t_max), + vctrs::vec_c(slide_t_min, out_t_min, out_t_max), + time_type + ) + c(0L, 1L, 0L) + ), + set = vec_in(slide_time_values, out_time_values) + ) + rows_should_keep <- rows_should_keep1 & rows_should_keep2 out_tbl <- vec_slice(slide, rows_should_keep) out_tbl } @@ -498,7 +524,7 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., result <- .x %>% group_modify(function(grp_data, grp_key) { - epi_slide_opt_one_epikey(grp_data, f_dots_baked, f_from_package, before, after, unit_step, time_type, vctrs::vec_set_intersect(ref_time_values, grp_data$time_value), names_info$input_col_names, names_info$output_col_names) + epi_slide_opt_one_epikey(grp_data, f_dots_baked, f_from_package, before, after, unit_step, time_type, NULL, ref_time_values, names_info$input_col_names, names_info$output_col_names) }) %>% arrange_col_canonical() diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 97221a9fc..c5a975a5f 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -61,6 +61,7 @@ #' @importFrom vctrs vec_rep #' @importFrom vctrs vec_rep_each #' @importFrom vctrs vec_seq_along +#' @importFrom vctrs vec_set_intersect #' @importFrom vctrs vec_set_names #' @importFrom vctrs vec_size_common #' @importFrom vctrs vec_slice From db2f274556603758ded088f2cd90f913fcbaf313 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 10 Apr 2025 08:41:40 -0700 Subject: [PATCH 097/107] perf: try `vec_c` -> `c` on 3 time_values As `vec_c` is slower on 3 scalar `Date`s, and it's probably pretty uncommon to use `time_value`s with `c` incompatible with `vec_c` (though possible... maybe tibbles with year, week, wday, like from MMWRweek; though these would probably break inside archive DT). --- R/epi_slide_opt_edf.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 673227550..74ff62753 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -231,8 +231,8 @@ epi_slide_opt_one_epikey <- function(inp_tbl, range = vec_rep_each( c(FALSE, TRUE, FALSE), time_minus_time_in_n_steps( - vctrs::vec_c(out_t_min, out_t_max, slide_t_max), - vctrs::vec_c(slide_t_min, out_t_min, out_t_max), + c(out_t_min, out_t_max, slide_t_max), + c(slide_t_min, out_t_min, out_t_max), time_type ) + c(0L, 1L, 0L) ), From 4ec96bee3a49dd7ad3a65e75e95a773c0981b548 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 10 Apr 2025 08:45:17 -0700 Subject: [PATCH 098/107] perf: avoid `c.Date` when possible Revert earlier change to try to reduce time inside `time_minus_time_in_n_steps`, as `c.Date` is more costly. This also works back towards generality from the previous `vec_c` -> `c` change. --- R/epi_slide_opt_edf.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 74ff62753..3247c7116 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -230,11 +230,9 @@ epi_slide_opt_one_epikey <- function(inp_tbl, out_filter_time_style, range = vec_rep_each( c(FALSE, TRUE, FALSE), - time_minus_time_in_n_steps( - c(out_t_min, out_t_max, slide_t_max), - c(slide_t_min, out_t_min, out_t_max), - time_type - ) + c(0L, 1L, 0L) + c(time_minus_time_in_n_steps(out_t_min, slide_t_min, time_type), + time_minus_time_in_n_steps(out_t_max, out_t_min, time_type) + 1L, + time_minus_time_in_n_steps(slide_t_max, out_t_max, time_type)) ), set = vec_in(slide_time_values, out_time_values) ) From cc1a5634a07498a30b4ac166cb4ecf4777f21972 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 10 Apr 2025 14:04:45 -0700 Subject: [PATCH 099/107] perf: old, faster slide edge-trimming args are actually usable --- R/epi_slide_opt_edf.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 3247c7116..850c7ffe4 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -230,9 +230,7 @@ epi_slide_opt_one_epikey <- function(inp_tbl, out_filter_time_style, range = vec_rep_each( c(FALSE, TRUE, FALSE), - c(time_minus_time_in_n_steps(out_t_min, slide_t_min, time_type), - time_minus_time_in_n_steps(out_t_max, out_t_min, time_type) + 1L, - time_minus_time_in_n_steps(slide_t_max, out_t_max, time_type)) + c(before, slide_nrow - before - after, after), ), set = vec_in(slide_time_values, out_time_values) ) From 4792aeadbc25b6753a65a5696213e44cdebdaaa9 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 10 Apr 2025 14:23:07 -0700 Subject: [PATCH 100/107] fix(time_plus_slide_window_arg): on non-integerish, non-Inf `y` --- R/time-utils.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/time-utils.R b/R/time-utils.R index e2eba6807..26ca803de 100644 --- a/R/time-utils.R +++ b/R/time-utils.R @@ -390,7 +390,7 @@ time_plus_slide_window_arg <- function(x, y, time_type, max_time_value) { if (y == Inf) { rep(max_time_value, vec_size(x)) } else { - time_plus_n_steps(x, y, time_type) + x + y } } @@ -399,6 +399,6 @@ time_minus_slide_window_arg <- function(x, y, time_type, min_time_value) { if (y == Inf) { rep(min_time_value, vec_size(x)) } else { - time_minus_n_steps(x, y, time_type) + x - y } } From 62375f8f49b82fcbe4457c65ab9f92ff63551d3d Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 10 Apr 2025 15:25:28 -0700 Subject: [PATCH 101/107] fix: output filter w/ before=Inf, wrong before/after types Also change a map -> lapply so we can get srcref to internal error immediately, add a missed .window_size-missingness check, delete some helper functions that were sort of helpful but also tacked on significant run time when used in a natural way. --- R/epi_slide_opt_archive.R | 25 ++++++++++---- R/epi_slide_opt_edf.R | 46 +++++++++++++++++-------- R/time-utils.R | 29 ---------------- man/epi_slide_opt_archive_one_epikey.Rd | 14 ++++---- man/time_plus_slide_window_arg.Rd | 33 ------------------ 5 files changed, 57 insertions(+), 90 deletions(-) delete mode 100644 man/time_plus_slide_window_arg.Rd diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 2351db8d9..6fe0c1da4 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -47,7 +47,8 @@ epi_slide_opt_archive_one_epikey <- function( grp_updates, in_colnames, - f_dots_baked, f_from_package, before, after, time_type, + f_dots_baked, f_from_package, + before_steps, after_steps, time_type, out_colnames) { grp_updates_by_version <- grp_updates %>% nest(.by = version, .key = "subtbl") %>% @@ -55,7 +56,7 @@ epi_slide_opt_archive_one_epikey <- function( unit_step <- unit_time_delta(time_type, format = "fast") prev_inp_snapshot <- NULL prev_out_snapshot <- NULL - result <- map(seq_len(nrow(grp_updates_by_version)), function(version_i) { + result <- lapply(seq_len(nrow(grp_updates_by_version)), function(version_i) { version <- grp_updates_by_version$version[[version_i]] inp_update <- grp_updates_by_version$subtbl[[version_i]] inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value") @@ -67,9 +68,13 @@ epi_slide_opt_archive_one_epikey <- function( # inp_update_min_t - after, or anything in between these two bounds. If # before == Inf, we need to update outputs all the way to the end of the # input *snapshot*. - out_update_min_t <- time_minus_slide_window_arg(inp_update_min_t, after, time_type) - out_update_max_t <- time_plus_slide_window_arg(inp_update_max_t, before, time_type, max(inp_snapshot$time_value)) - out_update <- epi_slide_opt_one_epikey(inp_snapshot, f_dots_baked, f_from_package, before, after, unit_step, time_type, c(out_update_min_t, out_update_max_t), NULL, in_colnames, out_colnames) + out_update_min_t <- inp_update_min_t - after_steps * unit_step + if (before_steps == Inf) { + out_update_max_t <- max(inp_snapshot$time_value) + } else { + out_update_max_t <- inp_update_max_t + before_steps * unit_step + } + out_update <- epi_slide_opt_one_epikey(inp_snapshot, f_dots_baked, f_from_package, before_steps, after_steps, unit_step, time_type, c(out_update_min_t, out_update_max_t), NULL, in_colnames, out_colnames) out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") @@ -118,11 +123,19 @@ epi_slide_opt.epi_archive <- purrr::partial(.f, ...) } col_names_quo <- enquo(.col_names) + if (is.null(.window_size)) { + cli_abort( + "epi_slide_opt: `.window_size` must be specified.", + class = "epiprocess__epi_slide_opt__window_size_missing" + ) + } names_info <- across_ish_names_info( .x$DT, time_type, col_names_quo, .f_info$namer, .window_size, .align, .prefix, .suffix, .new_col_names ) window_args <- get_before_after_from_window(.window_size, .align, time_type) + before_steps <- time_delta_to_n_steps(window_args$before, time_type) + after_steps <- time_delta_to_n_steps(window_args$after, time_type) if (!is.null(.ref_time_values)) { cli_abort("epi_slide.epi_archive does not support the `.ref_time_values` argument", class = "epiprocess__epi_slide_opt_archive__ref_time_values_unsupported" @@ -154,7 +167,7 @@ epi_slide_opt.epi_archive <- res <- epi_slide_opt_archive_one_epikey( group_values, names_info$input_col_names, - .f_dots_baked, .f_info$from_package, window_args$before, window_args$after, time_type, + .f_dots_baked, .f_info$from_package, before_steps, after_steps, time_type, names_info$output_col_names ) if (use_progress) cli::cli_progress_update(id = progress_bar_id) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 850c7ffe4..b5c99d9b4 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -168,9 +168,12 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, } epi_slide_opt_one_epikey <- function(inp_tbl, - f_dots_baked, f_from_package, before, after, unit_step, time_type, + f_dots_baked, f_from_package, + before_steps, after_steps, unit_step, time_type, out_filter_time_range, out_filter_time_set, in_colnames, out_colnames) { + # TODO rename function, reorder args, roxygen2 + # # TODO try converting time values to reals, do work on reals, convert back at very end? # # TODO loosen restrictions here. each filter optional? @@ -187,8 +190,14 @@ epi_slide_opt_one_epikey <- function(inp_tbl, } else { cli_abort("Exactly one of `out_filter_time_range` and `out_filter_time_set` must be non-`NULL`.") } - slide_t_min <- time_minus_slide_window_arg(out_t_min, before, time_type, min(inp_tbl$time_value)) - slide_t_max <- time_plus_slide_window_arg(out_t_max, after, time_type) + if (before_steps == Inf) { + slide_t_min <- min(inp_tbl$time_value) + slide_start_padding_n <- time_minus_time_in_n_steps(out_t_min, slide_t_min, time_type) + } else { + slide_t_min <- out_t_min - before_steps * unit_step + slide_start_padding_n <- before_steps # perf: avoid time_minus_time_in_n_steps + } + slide_t_max <- out_t_max + after_steps * unit_step slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step slide_inp_backrefs <- vec_match(slide_time_values, inp_tbl$time_value) @@ -200,24 +209,24 @@ epi_slide_opt_one_epikey <- function(inp_tbl, # try removing time_value column before slice? slide$time_value <- slide_time_values if (f_from_package == "data.table") { - if (before == Inf) { + if (before_steps == Inf) { slide[, out_colnames] <- f_dots_baked(slide[, in_colnames], seq_len(slide_nrow), adaptive = TRUE) } else { - out_cols <- f_dots_baked(slide[, in_colnames], before + after + 1L) - if (after != 0L) { + out_cols <- f_dots_baked(slide[, in_colnames], before_steps + after_steps + 1L) + if (after_steps != 0L) { # Shift an appropriate amount of NA padding from the start to the end. # (This padding will later be cut off when we filter down to the # original time_values.) out_cols <- lapply(out_cols, function(out_col) { - c(out_col[(after + 1L):length(out_col)], rep(NA, after)) + c(out_col[(after_steps + 1L):length(out_col)], rep(NA, after_steps)) }) } slide[, out_colnames] <- out_cols } } else if (f_from_package == "slider") { for (col_i in seq_along(in_colnames)) { - slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after) + slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before_steps, after = after_steps) } } else { cli_abort( @@ -225,12 +234,16 @@ epi_slide_opt_one_epikey <- function(inp_tbl, class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" ) } + # We should filter down the slide time values to ones in the input time values + # when preparing the output: rows_should_keep1 <- !is.na(slide_inp_backrefs) - rows_should_keep2 <- switch( - out_filter_time_style, + # We also need to apply the out_filter. + # + # TODO comments + test vs. just using inequality + rows_should_keep2 <- switch(out_filter_time_style, range = vec_rep_each( c(FALSE, TRUE, FALSE), - c(before, slide_nrow - before - after, after), + c(slide_start_padding_n, slide_nrow - slide_start_padding_n - after_steps, after_steps), ), set = vec_in(slide_time_values, out_time_values) ) @@ -493,12 +506,15 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., .align <- rlang::arg_match(.align) time_type <- attr(.x, "metadata")$time_type if (is.null(.window_size)) { - cli_abort("epi_slide_opt: `.window_size` must be specified.") + cli_abort( + "epi_slide_opt: `.window_size` must be specified.", + class = "epiprocess__epi_slide_opt__window_size_missing" + ) } validate_slide_window_arg(.window_size, time_type) window_args <- get_before_after_from_window(.window_size, .align, time_type) - before <- time_delta_to_n_steps(window_args$before, time_type) - after <- time_delta_to_n_steps(window_args$after, time_type) + before_steps <- time_delta_to_n_steps(window_args$before, time_type) + after_steps <- time_delta_to_n_steps(window_args$after, time_type) unit_step <- unit_time_delta(time_type, format = "fast") # Handle output naming: @@ -520,7 +536,7 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., result <- .x %>% group_modify(function(grp_data, grp_key) { - epi_slide_opt_one_epikey(grp_data, f_dots_baked, f_from_package, before, after, unit_step, time_type, NULL, ref_time_values, names_info$input_col_names, names_info$output_col_names) + epi_slide_opt_one_epikey(grp_data, f_dots_baked, f_from_package, before_steps, after_steps, unit_step, time_type, NULL, ref_time_values, names_info$input_col_names, names_info$output_col_names) }) %>% arrange_col_canonical() diff --git a/R/time-utils.R b/R/time-utils.R index 26ca803de..a5740f7f5 100644 --- a/R/time-utils.R +++ b/R/time-utils.R @@ -373,32 +373,3 @@ time_plus_n_steps <- function(x, y, time_type) { time_minus_n_steps <- function(x, y, time_type) { x - y * unit_time_delta(time_type, "fast") } - -#' Advance/retreat time_value(s) by specified amount (slide window arg) -#' -#' @param x a time_value (vector) of time type `time_type` -#' @param y a (scalar) slide window arg; should pass [`validate_slide_window_arg()`] -#' @param time_type as in [`validate_slide_window_arg()`] -#' @param max_time_value when `y == Inf`, what should be the result of adding `y`? -#' @param min_time_value when `y == Inf`, what should be the result of subtracting `y`? -#' @return a time_value (vector) of time type `time_type` -#' -#' @keywords internal -#' @seealso [`time_plus_n_steps`], if you're working with an integerish vector -#' number of time steps `y` (output from other `*n_steps` functions) instead. -time_plus_slide_window_arg <- function(x, y, time_type, max_time_value) { - if (y == Inf) { - rep(max_time_value, vec_size(x)) - } else { - x + y - } -} - -#' @rdname time_plus_slide_window_arg -time_minus_slide_window_arg <- function(x, y, time_type, min_time_value) { - if (y == Inf) { - rep(min_time_value, vec_size(x)) - } else { - x - y - } -} diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd index 073e684c5..6d140ba23 100644 --- a/man/epi_slide_opt_archive_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -9,8 +9,8 @@ epi_slide_opt_archive_one_epikey( in_colnames, f_dots_baked, f_from_package, - before, - after, + before_steps, + after_steps, time_type, out_colnames ) @@ -30,17 +30,17 @@ a specific row order is not required.} \item{f_from_package}{string; name of package from which \code{f_dots_baked} (pre-\code{partial}) originates} +\item{time_type}{as in \code{new_epi_archive}} + +\item{out_colnames}{chr, same length as \code{in_colnames}; column names to use +for results} + \item{before}{integerish >=0 or Inf; number of time steps before each ref_time_value to include in the sliding window computation; Inf to include all times beginning with the min \code{time_value}} \item{after}{integerish >=0; number of time steps after each ref_time_value to include in the sliding window computation} - -\item{time_type}{as in \code{new_epi_archive}} - -\item{out_colnames}{chr, same length as \code{in_colnames}; column names to use -for results} } \value{ tibble with a \code{version} column, pre-existing measurement columns, and diff --git a/man/time_plus_slide_window_arg.Rd b/man/time_plus_slide_window_arg.Rd deleted file mode 100644 index 1aa117f47..000000000 --- a/man/time_plus_slide_window_arg.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/time-utils.R -\name{time_plus_slide_window_arg} -\alias{time_plus_slide_window_arg} -\alias{time_minus_slide_window_arg} -\title{Advance/retreat time_value(s) by specified amount (slide window arg)} -\usage{ -time_plus_slide_window_arg(x, y, time_type, max_time_value) - -time_minus_slide_window_arg(x, y, time_type, min_time_value) -} -\arguments{ -\item{x}{a time_value (vector) of time type \code{time_type}} - -\item{y}{a (scalar) slide window arg; should pass \code{\link[=validate_slide_window_arg]{validate_slide_window_arg()}}} - -\item{time_type}{as in \code{\link[=validate_slide_window_arg]{validate_slide_window_arg()}}} - -\item{max_time_value}{when \code{y == Inf}, what should be the result of adding \code{y}?} - -\item{min_time_value}{when \code{y == Inf}, what should be the result of subtracting \code{y}?} -} -\value{ -a time_value (vector) of time type \code{time_type} -} -\description{ -Advance/retreat time_value(s) by specified amount (slide window arg) -} -\seealso{ -\code{\link{time_plus_n_steps}}, if you're working with an integerish vector -number of time steps \code{y} (output from other \verb{*n_steps} functions) instead. -} -\keyword{internal} From 68de47be2dc59d3a2d8244bb9350aecc8b9de27c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 10 Apr 2025 17:11:50 -0700 Subject: [PATCH 102/107] refactor(epi_slide_opt): helper function&variable naming, arg ordering --- R/epi_slide_opt_archive.R | 16 +++++----- R/epi_slide_opt_edf.R | 39 ++++++++++++------------- man/epi_slide_opt_archive_one_epikey.Rd | 4 +-- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 6fe0c1da4..0d21aa58f 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -48,7 +48,7 @@ epi_slide_opt_archive_one_epikey <- function( grp_updates, in_colnames, f_dots_baked, f_from_package, - before_steps, after_steps, time_type, + before_n_steps, after_n_steps, time_type, out_colnames) { grp_updates_by_version <- grp_updates %>% nest(.by = version, .key = "subtbl") %>% @@ -68,13 +68,13 @@ epi_slide_opt_archive_one_epikey <- function( # inp_update_min_t - after, or anything in between these two bounds. If # before == Inf, we need to update outputs all the way to the end of the # input *snapshot*. - out_update_min_t <- inp_update_min_t - after_steps * unit_step - if (before_steps == Inf) { + out_update_min_t <- inp_update_min_t - after_n_steps * unit_step + if (before_n_steps == Inf) { out_update_max_t <- max(inp_snapshot$time_value) } else { - out_update_max_t <- inp_update_max_t + before_steps * unit_step + out_update_max_t <- inp_update_max_t + before_n_steps * unit_step } - out_update <- epi_slide_opt_one_epikey(inp_snapshot, f_dots_baked, f_from_package, before_steps, after_steps, unit_step, time_type, c(out_update_min_t, out_update_max_t), NULL, in_colnames, out_colnames) + out_update <- epi_slide_opt_edf_one_epikey(inp_snapshot, in_colnames, f_dots_baked, f_from_package, before_n_steps, after_n_steps, unit_step, time_type, out_colnames, c(out_update_min_t, out_update_max_t), NULL) out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") @@ -134,8 +134,8 @@ epi_slide_opt.epi_archive <- .window_size, .align, .prefix, .suffix, .new_col_names ) window_args <- get_before_after_from_window(.window_size, .align, time_type) - before_steps <- time_delta_to_n_steps(window_args$before, time_type) - after_steps <- time_delta_to_n_steps(window_args$after, time_type) + before_n_steps <- time_delta_to_n_steps(window_args$before, time_type) + after_n_steps <- time_delta_to_n_steps(window_args$after, time_type) if (!is.null(.ref_time_values)) { cli_abort("epi_slide.epi_archive does not support the `.ref_time_values` argument", class = "epiprocess__epi_slide_opt_archive__ref_time_values_unsupported" @@ -167,7 +167,7 @@ epi_slide_opt.epi_archive <- res <- epi_slide_opt_archive_one_epikey( group_values, names_info$input_col_names, - .f_dots_baked, .f_info$from_package, before_steps, after_steps, time_type, + .f_dots_baked, .f_info$from_package, before_n_steps, after_n_steps, time_type, names_info$output_col_names ) if (use_progress) cli::cli_progress_update(id = progress_bar_id) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index b5c99d9b4..98f659237 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -167,13 +167,12 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, ) } -epi_slide_opt_one_epikey <- function(inp_tbl, - f_dots_baked, f_from_package, - before_steps, after_steps, unit_step, time_type, - out_filter_time_range, out_filter_time_set, - in_colnames, out_colnames) { - # TODO rename function, reorder args, roxygen2 - # +epi_slide_opt_edf_one_epikey <- function(inp_tbl, + in_colnames, + f_dots_baked, f_from_package, + before_n_steps, after_n_steps, unit_step, time_type, + out_colnames, + out_filter_time_range, out_filter_time_set) { # TODO try converting time values to reals, do work on reals, convert back at very end? # # TODO loosen restrictions here. each filter optional? @@ -190,14 +189,14 @@ epi_slide_opt_one_epikey <- function(inp_tbl, } else { cli_abort("Exactly one of `out_filter_time_range` and `out_filter_time_set` must be non-`NULL`.") } - if (before_steps == Inf) { + if (before_n_steps == Inf) { slide_t_min <- min(inp_tbl$time_value) slide_start_padding_n <- time_minus_time_in_n_steps(out_t_min, slide_t_min, time_type) } else { - slide_t_min <- out_t_min - before_steps * unit_step - slide_start_padding_n <- before_steps # perf: avoid time_minus_time_in_n_steps + slide_t_min <- out_t_min - before_n_steps * unit_step + slide_start_padding_n <- before_n_steps # perf: avoid time_minus_time_in_n_steps } - slide_t_max <- out_t_max + after_steps * unit_step + slide_t_max <- out_t_max + after_n_steps * unit_step slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step slide_inp_backrefs <- vec_match(slide_time_values, inp_tbl$time_value) @@ -209,24 +208,24 @@ epi_slide_opt_one_epikey <- function(inp_tbl, # try removing time_value column before slice? slide$time_value <- slide_time_values if (f_from_package == "data.table") { - if (before_steps == Inf) { + if (before_n_steps == Inf) { slide[, out_colnames] <- f_dots_baked(slide[, in_colnames], seq_len(slide_nrow), adaptive = TRUE) } else { - out_cols <- f_dots_baked(slide[, in_colnames], before_steps + after_steps + 1L) - if (after_steps != 0L) { + out_cols <- f_dots_baked(slide[, in_colnames], before_n_steps + after_n_steps + 1L) + if (after_n_steps != 0L) { # Shift an appropriate amount of NA padding from the start to the end. # (This padding will later be cut off when we filter down to the # original time_values.) out_cols <- lapply(out_cols, function(out_col) { - c(out_col[(after_steps + 1L):length(out_col)], rep(NA, after_steps)) + c(out_col[(after_n_steps + 1L):length(out_col)], rep(NA, after_n_steps)) }) } slide[, out_colnames] <- out_cols } } else if (f_from_package == "slider") { for (col_i in seq_along(in_colnames)) { - slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before_steps, after = after_steps) + slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before_n_steps, after = after_n_steps) } } else { cli_abort( @@ -243,7 +242,7 @@ epi_slide_opt_one_epikey <- function(inp_tbl, rows_should_keep2 <- switch(out_filter_time_style, range = vec_rep_each( c(FALSE, TRUE, FALSE), - c(slide_start_padding_n, slide_nrow - slide_start_padding_n - after_steps, after_steps), + c(slide_start_padding_n, slide_nrow - slide_start_padding_n - after_n_steps, after_n_steps), ), set = vec_in(slide_time_values, out_time_values) ) @@ -513,8 +512,8 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., } validate_slide_window_arg(.window_size, time_type) window_args <- get_before_after_from_window(.window_size, .align, time_type) - before_steps <- time_delta_to_n_steps(window_args$before, time_type) - after_steps <- time_delta_to_n_steps(window_args$after, time_type) + before_n_steps <- time_delta_to_n_steps(window_args$before, time_type) + after_n_steps <- time_delta_to_n_steps(window_args$after, time_type) unit_step <- unit_time_delta(time_type, format = "fast") # Handle output naming: @@ -536,7 +535,7 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., result <- .x %>% group_modify(function(grp_data, grp_key) { - epi_slide_opt_one_epikey(grp_data, f_dots_baked, f_from_package, before_steps, after_steps, unit_step, time_type, NULL, ref_time_values, names_info$input_col_names, names_info$output_col_names) + epi_slide_opt_edf_one_epikey(grp_data, names_info$input_col_names, f_dots_baked, f_from_package, before_n_steps, after_n_steps, unit_step, time_type, names_info$output_col_names, NULL, ref_time_values) }) %>% arrange_col_canonical() diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd index 6d140ba23..795b46ce4 100644 --- a/man/epi_slide_opt_archive_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -9,8 +9,8 @@ epi_slide_opt_archive_one_epikey( in_colnames, f_dots_baked, f_from_package, - before_steps, - after_steps, + before_n_steps, + after_n_steps, time_type, out_colnames ) From 4763e367af30f03a35e93805ae357d0a577b0781 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 10 Apr 2025 17:43:18 -0700 Subject: [PATCH 103/107] More internal renames and documentation --- R/epi_slide_opt_archive.R | 31 +++----- R/epi_slide_opt_edf.R | 65 ++++++++++++++++ man/epi_slide_opt_archive_one_epikey.Rd | 22 +++--- man/epi_slide_opt_edf_one_epikey.Rd | 98 +++++++++++++++++++++++++ 4 files changed, 183 insertions(+), 33 deletions(-) create mode 100644 man/epi_slide_opt_edf_one_epikey.Rd diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 0d21aa58f..3869639c7 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -1,23 +1,10 @@ #' Core operation of `epi_slide_opt.epi_archive` for a single epikey's history #' -#' @param grp_updates tibble with a `version` column and measurement columns for +#' @param inp_updates tibble with a `version` column and measurement columns for #' a single epikey, without the epikey labeling columns (e.g., from #' `group_modify`). Interpretation is analogous to an `epi_archive` `DT`, but #' a specific row order is not required. -#' @param in_colnames chr; names of columns to which to apply `f_dots_baked` -#' @param f_dots_baked supported sliding function from `{data.table}` or -#' `{slider}`, potentially with some arguments baked in with -#' [`purrr::partial`] -#' @param f_from_package string; name of package from which `f_dots_baked` -#' (pre-`partial`) originates -#' @param before integerish >=0 or Inf; number of time steps before each -#' ref_time_value to include in the sliding window computation; Inf to include -#' all times beginning with the min `time_value` -#' @param after integerish >=0; number of time steps after each ref_time_value -#' to include in the sliding window computation -#' @param time_type as in `new_epi_archive` -#' @param out_colnames chr, same length as `in_colnames`; column names to use -#' for results +#' @inheritParams epi_slide_opt_edf_one_epikey #' @return tibble with a `version` column, pre-existing measurement columns, and #' new measurement columns; (compactified) diff data to put into an #' `epi_archive`. May not match column ordering; may not ensure any row @@ -26,7 +13,7 @@ #' @examples #' #' library(dplyr) -#' grp_updates <- bind_rows( +#' inp_updates <- bind_rows( #' tibble(version = 30, time_value = 1:20, value = 1:20), #' tibble(version = 32, time_value = 4:5, value = 5:4), #' tibble(version = 33, time_value = 8, value = 9), @@ -38,27 +25,27 @@ #' #' f <- purrr::partial(data.table::frollmean, algo = "exact") #' -#' grp_updates %>% +#' inp_updates %>% #' epiprocess:::epi_slide_opt_archive_one_epikey( #' "value", f, "data.table", 2L, 0L, "day", "slide_value" #' ) #' #' @keywords internal epi_slide_opt_archive_one_epikey <- function( - grp_updates, + inp_updates, in_colnames, f_dots_baked, f_from_package, before_n_steps, after_n_steps, time_type, out_colnames) { - grp_updates_by_version <- grp_updates %>% + inp_updates_by_version <- inp_updates %>% nest(.by = version, .key = "subtbl") %>% arrange(version) unit_step <- unit_time_delta(time_type, format = "fast") prev_inp_snapshot <- NULL prev_out_snapshot <- NULL - result <- lapply(seq_len(nrow(grp_updates_by_version)), function(version_i) { - version <- grp_updates_by_version$version[[version_i]] - inp_update <- grp_updates_by_version$subtbl[[version_i]] + result <- lapply(seq_len(nrow(inp_updates_by_version)), function(version_i) { + version <- inp_updates_by_version$version[[version_i]] + inp_update <- inp_updates_by_version$subtbl[[version_i]] inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value") inp_update_min_t <- min(inp_update$time_value) inp_update_max_t <- max(inp_update$time_value) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 98f659237..49096697b 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -167,6 +167,71 @@ across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, ) } +#' Run a specialized slide computation on a single `epi_df` epikey, with temporary completion +#' +#' @param inp_tbl tibble; should have a `time_value` column and columns named in +#' `in_colnames`; should not contain any columns named in `out_colnames` +#' @param in_colnames chr; names of columns to which to apply `f_dots_baked` +#' @param f_dots_baked supported sliding function from `{data.table}` or +#' `{slider}`, potentially with some arguments baked in with +#' [`purrr::partial`] +#' @param f_from_package string; name of package from which `f_dots_baked` +#' (pre-`partial`) originates +#' @param before_n_steps integerish `>=0` or `Inf`; number of time steps before +#' each `ref_time_value` to include in the sliding window computation; `Inf` +#' to include all times beginning with the min `time_value` +#' @param after_n_steps integerish `>=0`; number of time steps after each +#' `ref_time_value` to include in the sliding window computation +#' @param time_type as in `new_epi_archive` +#' @param out_colnames chr, same length as `in_colnames`; column names to use +#' for results +#' @param out_filter_time_range,out_filter_time_set `time_value` filter; +#' `time_values` in the output should match the result of applying this filter +#' to `inp_tbl$time_value`. Exactly one of the two must be provided +#' (non-`NULL`) and the other must be `NULL`. `out_filter_time_range`, if +#' provided, should be a length-2 vector/list containing the minimum and +#' maximum `time_value` to allow in the output. `out_filter_time_set`, if +#' provided, should be a vector of `time_values` to intersect with the input +#' `time_value`s. +#' @return tibble; like `inp_tbl` with addition of `out_colnames` holding the +#' slide computation results, with times filtered down as specified +#' +#' @examples +#' +#' library(dplyr) +#' tbl <- tibble( +#' time_value = c(11:12, 15:18) + 0, +#' value = c(c(1, 2), c(4, 8, 16, 32)) +#' ) +#' +#' tbl %>% +#' epi_slide_opt_edf_one_epikey( +#' "value", +#' frollmean, "data.table", +#' 1L, 0L, 1L, "integer", +#' "slide_value", +#' c(11L, 16L), NULL +#' ) +#' +#' tbl %>% +#' epi_slide_opt_edf_one_epikey( +#' "value", +#' frollmean, "data.table", +#' 0L, 1L, 1L, "integer", +#' "slide_value", +#' NULL, c(11, 15, 16, 17, 18) +#' ) +#' +#' tbl %>% +#' epi_slide_opt_edf_one_epikey( +#' "value", +#' frollmean, "data.table", +#' Inf, 0L, 1L, "integer", +#' "slide_value", +#' NULL, c(12, 17) +#' ) +#' +#' @keywords internal epi_slide_opt_edf_one_epikey <- function(inp_tbl, in_colnames, f_dots_baked, f_from_package, diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd index 795b46ce4..847d0d162 100644 --- a/man/epi_slide_opt_archive_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -5,7 +5,7 @@ \title{Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history} \usage{ epi_slide_opt_archive_one_epikey( - grp_updates, + inp_updates, in_colnames, f_dots_baked, f_from_package, @@ -16,7 +16,7 @@ epi_slide_opt_archive_one_epikey( ) } \arguments{ -\item{grp_updates}{tibble with a \code{version} column and measurement columns for +\item{inp_updates}{tibble with a \code{version} column and measurement columns for a single epikey, without the epikey labeling columns (e.g., from \code{group_modify}). Interpretation is analogous to an \code{epi_archive} \code{DT}, but a specific row order is not required.} @@ -30,17 +30,17 @@ a specific row order is not required.} \item{f_from_package}{string; name of package from which \code{f_dots_baked} (pre-\code{partial}) originates} +\item{before_n_steps}{integerish \verb{>=0} or \code{Inf}; number of time steps before +each \code{ref_time_value} to include in the sliding window computation; \code{Inf} +to include all times beginning with the min \code{time_value}} + +\item{after_n_steps}{integerish \verb{>=0}; number of time steps after each +\code{ref_time_value} to include in the sliding window computation} + \item{time_type}{as in \code{new_epi_archive}} \item{out_colnames}{chr, same length as \code{in_colnames}; column names to use for results} - -\item{before}{integerish >=0 or Inf; number of time steps before each -ref_time_value to include in the sliding window computation; Inf to include -all times beginning with the min \code{time_value}} - -\item{after}{integerish >=0; number of time steps after each ref_time_value -to include in the sliding window computation} } \value{ tibble with a \code{version} column, pre-existing measurement columns, and @@ -54,7 +54,7 @@ Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history \examples{ library(dplyr) -grp_updates <- bind_rows( +inp_updates <- bind_rows( tibble(version = 30, time_value = 1:20, value = 1:20), tibble(version = 32, time_value = 4:5, value = 5:4), tibble(version = 33, time_value = 8, value = 9), @@ -66,7 +66,7 @@ grp_updates <- bind_rows( f <- purrr::partial(data.table::frollmean, algo = "exact") -grp_updates \%>\% +inp_updates \%>\% epiprocess:::epi_slide_opt_archive_one_epikey( "value", f, "data.table", 2L, 0L, "day", "slide_value" ) diff --git a/man/epi_slide_opt_edf_one_epikey.Rd b/man/epi_slide_opt_edf_one_epikey.Rd new file mode 100644 index 000000000..b3cfad5b3 --- /dev/null +++ b/man/epi_slide_opt_edf_one_epikey.Rd @@ -0,0 +1,98 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_slide_opt_edf.R +\name{epi_slide_opt_edf_one_epikey} +\alias{epi_slide_opt_edf_one_epikey} +\title{Run a specialized slide computation on a single \code{epi_df} epikey, with temporary completion} +\usage{ +epi_slide_opt_edf_one_epikey( + inp_tbl, + in_colnames, + f_dots_baked, + f_from_package, + before_n_steps, + after_n_steps, + unit_step, + time_type, + out_colnames, + out_filter_time_range, + out_filter_time_set +) +} +\arguments{ +\item{inp_tbl}{tibble; should have a \code{time_value} column and columns named in +\code{in_colnames}; should not contain any columns named in \code{out_colnames}} + +\item{in_colnames}{chr; names of columns to which to apply \code{f_dots_baked}} + +\item{f_dots_baked}{supported sliding function from \code{{data.table}} or +\code{{slider}}, potentially with some arguments baked in with +\code{\link[purrr:partial]{purrr::partial}}} + +\item{f_from_package}{string; name of package from which \code{f_dots_baked} +(pre-\code{partial}) originates} + +\item{before_n_steps}{integerish \verb{>=0} or \code{Inf}; number of time steps before +each \code{ref_time_value} to include in the sliding window computation; \code{Inf} +to include all times beginning with the min \code{time_value}} + +\item{after_n_steps}{integerish \verb{>=0}; number of time steps after each +\code{ref_time_value} to include in the sliding window computation} + +\item{time_type}{as in \code{new_epi_archive}} + +\item{out_colnames}{chr, same length as \code{in_colnames}; column names to use +for results} + +\item{out_filter_time_range, out_filter_time_set}{\code{time_value} filter; +\code{time_values} in the output should match the result of applying this filter +to \code{inp_tbl$time_value}. Exactly one of the two must be provided +(non-\code{NULL}) and the other must be \code{NULL}. \code{out_filter_time_range}, if +provided, should be a length-2 vector/list containing the minimum and +maximum \code{time_value} to allow in the output. \code{out_filter_time_set}, if +provided, should be a vector of \code{time_values} to intersect with the input +\code{time_value}s.} +} +\value{ +tibble; like \code{inp_tbl} with addition of \code{out_colnames} holding the +slide computation results, with times filtered down as specified +} +\description{ +Run a specialized slide computation on a single \code{epi_df} epikey, with temporary completion +} +\examples{ + +library(dplyr) +tbl <- tibble( + time_value = c(11:12, 15:18) + 0, + value = c(c(1, 2), c(4, 8, 16, 32)) +) + +tbl \%>\% + epi_slide_opt_edf_one_epikey( + "value", + frollmean, "data.table", + 1L, 0L, 1L, "integer", + "slide_value", + c(11L, 16L), NULL + ) + +tbl \%>\% + epi_slide_opt_edf_one_epikey( + "value", + frollmean, "data.table", + 0L, 1L, 1L, "integer", + "slide_value", + NULL, c(11, 15, 16, 17, 18) + ) + +tbl \%>\% + epi_slide_opt_edf_one_epikey( + "value", + frollmean, "data.table", + Inf, 0L, 1L, "integer", + "slide_value", + NULL, c(12, 17) + ) + +} +\keyword{internal} From 184f98a325a07a1df4b1bd76477647ddf7cf1912 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 10 Apr 2025 20:21:34 -0700 Subject: [PATCH 104/107] perf: c -> list when specifying slide out date min, max --- R/epi_slide_opt_archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 3869639c7..d0a16749a 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -61,7 +61,7 @@ epi_slide_opt_archive_one_epikey <- function( } else { out_update_max_t <- inp_update_max_t + before_n_steps * unit_step } - out_update <- epi_slide_opt_edf_one_epikey(inp_snapshot, in_colnames, f_dots_baked, f_from_package, before_n_steps, after_n_steps, unit_step, time_type, out_colnames, c(out_update_min_t, out_update_max_t), NULL) + out_update <- epi_slide_opt_edf_one_epikey(inp_snapshot, in_colnames, f_dots_baked, f_from_package, before_n_steps, after_n_steps, unit_step, time_type, out_colnames, list(out_update_min_t, out_update_max_t), NULL) out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") prev_inp_snapshot <<- inp_snapshot prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") From a82d48d03969f514a682ae175f3b6c80fd147eff Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 11 Apr 2025 16:30:28 -0700 Subject: [PATCH 105/107] fix(epi_slide_opt): on `out_filter_time_set` narrowing to 0 --- R/epi_slide_opt_edf.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index 49096697b..de3ed45a5 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -249,8 +249,13 @@ epi_slide_opt_edf_one_epikey <- function(inp_tbl, # FIXME min time_value for this epikey vs. entire edf; match existing behavior, or complete changeover out_filter_time_style <- "set" out_time_values <- vec_set_intersect(inp_tbl$time_value, out_filter_time_set) - out_t_min <- min(out_time_values) - out_t_max <- max(out_time_values) + if (vec_size(out_time_values) == 0L) { + out_t_min <- inp_tbl$time_value[[1L]] + out_t_max <- inp_tbl$time_value[[1L]] - 1L * unit_step + } else { + out_t_min <- min(out_time_values) + out_t_max <- max(out_time_values) + } } else { cli_abort("Exactly one of `out_filter_time_range` and `out_filter_time_set` must be non-`NULL`.") } From 4aadc9504361bebdd59a09524ba80904c1f44a33 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sat, 12 Apr 2025 16:26:16 -0700 Subject: [PATCH 106/107] docs(slide.R): correct error message regarding data.frame rownames The error message says that slide computations can't be rownamed data frames. This is both bad wording (having rownames != having non-automatic rownames) and we're not actually enforcing any restriction of the sort anyway. We drop/ignore the rownames though (via dplyr/tibble). In `epix_slide`, the allowance of non-automatic rownames was deliberate; in `epi_slide`, it also seems likely the way to go; one might convert to `data.frame` and have some filter-like operation return non-automatic rownames. --- R/slide.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index c05fbe98f..e4ea80792 100644 --- a/R/slide.R +++ b/R/slide.R @@ -410,7 +410,7 @@ epi_slide_one_group <- function( # Returned values must be data.frame or vector. if ("other" %in% return_types) { cli_abort( - "epi_slide: slide computations must always return either data frames without rownames + "epi_slide: slide computations must always return either data frames or unnamed vectors (as determined by the vctrs package).", class = "epiprocess__invalid_slide_comp_value" ) From d9621015b88773ff0f1ac1d02685bd4ecd93bb0b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 13 Apr 2025 01:11:01 -0700 Subject: [PATCH 107/107] fix: `purrr::partial` missing future arg placement Use `... =` to specify this. Prevents some confusing error messages on ``` archive_cases_dv_subset %>% epi_slide_opt(percent_cli, frollmean, 0, .window_size = 7) ``` but does not solve all issues that can happen with unnamed `...`. --- R/epi_slide_opt_archive.R | 2 +- R/epi_slide_opt_edf.R | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index d0a16749a..ab930af61 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -107,7 +107,7 @@ epi_slide_opt.epi_archive <- # debug stack traces: .f } else { - purrr::partial(.f, ...) + purrr::partial(.f, ... = , ...) # `... =` stands in for future args } col_names_quo <- enquo(.col_names) if (is.null(.window_size)) { diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index de3ed45a5..a97fef7a7 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -62,6 +62,7 @@ upstream_slide_f_info <- function(.f, ...) { } f_from_package <- f_info_row$package if (f_from_package == "data.table" && "fill" %in% names(rlang::call_match(dots_expand = FALSE)[["..."]])) { + # XXX this doesn't detect with `fill` is passed positionally through dots... cli_abort("`epi_slide_opt` does not support `data.table::froll*()` with a custom `fill =` arg", class = "epiprocess__epi_slide_opt__fill_unsupported" @@ -600,7 +601,7 @@ epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., # debug stack traces: .f } else { - purrr::partial(.f, ...) + purrr::partial(.f, ... = , ...) # `... =` stands in for future args } result <- .x %>%