@@ -984,3 +984,118 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
984984 attr(data , " epiprocess::col_modify_recorder_df::cols" ) <- cols
985985 data
986986}
987+
988+
989+
990+ # ' [`dplyr::filter`] for `epi_archive`s
991+ # '
992+ # ' @param .data an `epi_archive`
993+ # ' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed
994+ # ' unless you use `.format_aware = TRUE`; see details.
995+ # ' @param .by as in [`dplyr::filter`]
996+ # ' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See
997+ # ' details.
998+ # '
999+ # ' @details
1000+ # '
1001+ # ' By default, using the `version` column is disabled as it's easy to
1002+ # ' get unexpected results. See if either [`epix_as_of`] or [`epix_slide`]
1003+ # ' works as an alternative. If they don't cover your use case, then you can
1004+ # ' set `.format_aware = TRUE` to enable usage of the `version` column, but be
1005+ # ' careful to:
1006+ # ' * Factor in that `.data$DT` may be using a "compact" format based on diffing
1007+ # ' consecutive versions; see details of [`as_epi_archive`]
1008+ # ' * Set `clobberable_versions_start` and `versions_end` of the result
1009+ # ' appropriately after the `filter` call. They will be initialized with the
1010+ # ' same values as in `.data`.
1011+ # '
1012+ # ' `dplyr::filter` also has an optional argument `.preserve`, which should not
1013+ # ' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
1014+ # ' not currently support `dplyr::filter`.
1015+ # '
1016+ # ' @examples
1017+ # '
1018+ # ' # Filter to one location and a particular time range:
1019+ # ' archive_cases_dv_subset %>%
1020+ # ' filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
1021+ # '
1022+ # ' # Convert to weekly by taking the Saturday data for each week, so that
1023+ # ' # `case_rate_7d_av` represents a Sun--Sat average:
1024+ # ' archive_cases_dv_subset %>%
1025+ # ' filter(as.POSIXlt(time_value)$wday == 6L)
1026+ # '
1027+ # ' # Filtering involving versions requires extra care. See epix_as_of and
1028+ # ' # epix_slide instead for some common operations. One semi-common operation
1029+ # ' # that ends up being fairly simple is treating observations as finalized
1030+ # ' # after some amount of time, and ignoring any revisions that were made after
1031+ # ' # that point:
1032+ # ' archive_cases_dv_subset %>%
1033+ # ' filter(version <= time_value + as.difftime(60, units = "days"),
1034+ # ' .format_aware = TRUE)
1035+ # '
1036+ # ' @export
1037+ filter.epi_archive <- function (.data , ... , .by = NULL , .format_aware = FALSE ) {
1038+ in_tbl <- tibble :: as_tibble(as.list(.data $ DT ), .name_repair = " minimal" )
1039+ if (.format_aware ) {
1040+ out_tbl <- in_tbl %> %
1041+ filter(... , .by = .by )
1042+ } else {
1043+ out_tbl <- in_tbl %> %
1044+ filter(
1045+ # Add our own fake filter arg to the user's ..., to update the data mask
1046+ # to prevent `version` column usage.
1047+ {
1048+ # We should be evaluating inside the data mask. To disable both
1049+ # `version` and `.data$version`, we need to go to the data mask's
1050+ # ------
1051+ e <- environment()
1052+ while (! identical(e , globalenv()) && ! identical(e , emptyenv())) {
1053+ if (" version" %in% names(e )) {
1054+ # "version" is expected to be an active binding, and directly
1055+ # assigning over it has issues; explicitly `rm` first.
1056+ rm(list = " version" , envir = e )
1057+ delayedAssign(" version" , cli :: cli_abort(c(
1058+ " Using `version` in `filter` may produce unexpected results." ,
1059+ " >" = " See if `epix_as_of` or `epix_slide` would work instead." ,
1060+ " >" = " If not, see `?filter.epi_archive` details for how to proceed."
1061+ )), assign.env = e )
1062+ break
1063+ }
1064+ e <- parent.env(e )
1065+ }
1066+ TRUE
1067+ },
1068+ ... ,
1069+ .by = .by
1070+ )
1071+ }
1072+ out_geo_type <-
1073+ if (.data $ geo_type == " custom" ) {
1074+ # We might be going from a multi-resolution to single-resolution archive;
1075+ # e.g. national+state -> state; try to re-infer:
1076+ guess_geo_type(out_tbl $ geo_value )
1077+ } else {
1078+ # We risk less-understandable inference failures such as inferring "hhs"
1079+ # from selecting hrr 10 data; just use the old geo_type:
1080+ .data $ geo_type
1081+ }
1082+ # We might be going from daily to weekly; re-infer:
1083+ out_time_type <- guess_time_type(out_tbl $ time_value )
1084+ # Even if they narrow down to just a single value of an other_keys column,
1085+ # it's probably still better (& simpler) to treat it as an other_keys column
1086+ # since it still exists in the result:
1087+ out_other_keys <- .data $ other_keys
1088+ # `filter` makes no guarantees about not aliasing columns in its result when
1089+ # the filter condition is all TRUE, so don't setDT.
1090+ out_dtbl <- as.data.table(out_tbl , key = out_other_keys )
1091+ result <- new_epi_archive(
1092+ out_dtbl ,
1093+ out_geo_type , out_time_type , out_other_keys ,
1094+ # Assume version-related metadata unchanged; part of why we want to push
1095+ # back on filter expressions like `.data$version <= .env$as_of`:
1096+ .data $ clobberable_versions_start , .data $ versions_end
1097+ )
1098+ # Filtering down rows while keeping all (ukey) columns should preserve ukey
1099+ # uniqueness.
1100+ result
1101+ }
0 commit comments