@@ -984,3 +984,119 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
984984 attr(data , " epiprocess::col_modify_recorder_df::cols" ) <- cols
985985 data
986986}
987+
988+
989+
990+ # ' [`dplyr::filter`] for `epi_archive`s
991+ # '
992+ # ' @param .data an `epi_archive`
993+ # ' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed
994+ # ' unless you use `.format_aware = TRUE`; see details.
995+ # ' @param .by as in [`dplyr::filter`]
996+ # ' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See
997+ # ' details.
998+ # '
999+ # ' @details
1000+ # '
1001+ # ' By default, using the `version` column is disabled as it's easy to
1002+ # ' get unexpected results. See if either [`epix_as_of`] or [`epix_slide`]
1003+ # ' works as an alternative. If they don't cover your use case, then you can
1004+ # ' set `.format_aware = TRUE` to enable usage of the `version` column, but be
1005+ # ' careful to:
1006+ # ' * Factor in that `.data$DT` may be using a "compact" format based on diffing
1007+ # ' consecutive versions; see details of [`as_epi_archive`]
1008+ # ' * Set `clobberable_versions_start` and `versions_end` of the result
1009+ # ' appropriately after the `filter` call. They will be initialized with the
1010+ # ' same values as in `.data`.
1011+ # '
1012+ # ' `dplyr::filter` also has an optional argument `.preserve`, which should not
1013+ # ' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
1014+ # ' not currently support `dplyr::filter`.
1015+ # '
1016+ # ' @examples
1017+ # '
1018+ # ' # Filter to one location and a particular time range:
1019+ # ' archive_cases_dv_subset %>%
1020+ # ' filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
1021+ # '
1022+ # ' # Convert to weekly by taking the Saturday data for each week, so that
1023+ # ' # `case_rate_7d_av` represents a Sun--Sat average:
1024+ # ' archive_cases_dv_subset %>%
1025+ # ' filter(as.POSIXlt(time_value)$wday == 6L)
1026+ # '
1027+ # ' # Filtering involving versions requires extra care. See epix_as_of and
1028+ # ' # epix_slide instead for some common operations. One semi-common operation
1029+ # ' # that ends up being fairly simple is treating observations as finalized
1030+ # ' # after some amount of time, and ignoring any revisions that were made after
1031+ # ' # that point:
1032+ # ' archive_cases_dv_subset %>%
1033+ # ' filter(version <= time_value + as.difftime(60, units = "days"),
1034+ # ' .format_aware = TRUE
1035+ # ' )
1036+ # '
1037+ # ' @export
1038+ filter.epi_archive <- function (.data , ... , .by = NULL , .format_aware = FALSE ) {
1039+ in_tbl <- tibble :: as_tibble(as.list(.data $ DT ), .name_repair = " minimal" )
1040+ if (.format_aware ) {
1041+ out_tbl <- in_tbl %> %
1042+ filter(... , .by = .by )
1043+ } else {
1044+ out_tbl <- in_tbl %> %
1045+ filter(
1046+ # Add our own fake filter arg to the user's ..., to update the data mask
1047+ # to prevent `version` column usage.
1048+ {
1049+ # We should be evaluating inside the data mask. To disable both
1050+ # `version` and `.data$version`, we need to go to the data mask's
1051+ # ------
1052+ e <- environment()
1053+ while (! identical(e , globalenv()) && ! identical(e , emptyenv())) {
1054+ if (" version" %in% names(e )) {
1055+ # "version" is expected to be an active binding, and directly
1056+ # assigning over it has issues; explicitly `rm` first.
1057+ rm(list = " version" , envir = e )
1058+ delayedAssign(" version" , cli :: cli_abort(c(
1059+ " Using `version` in `filter` may produce unexpected results." ,
1060+ " >" = " See if `epix_as_of` or `epix_slide` would work instead." ,
1061+ " >" = " If not, see `?filter.epi_archive` details for how to proceed."
1062+ )), assign.env = e )
1063+ break
1064+ }
1065+ e <- parent.env(e )
1066+ }
1067+ TRUE
1068+ },
1069+ ... ,
1070+ .by = .by
1071+ )
1072+ }
1073+ out_geo_type <-
1074+ if (.data $ geo_type == " custom" ) {
1075+ # We might be going from a multi-resolution to single-resolution archive;
1076+ # e.g. national+state -> state; try to re-infer:
1077+ guess_geo_type(out_tbl $ geo_value )
1078+ } else {
1079+ # We risk less-understandable inference failures such as inferring "hhs"
1080+ # from selecting hrr 10 data; just use the old geo_type:
1081+ .data $ geo_type
1082+ }
1083+ # We might be going from daily to weekly; re-infer:
1084+ out_time_type <- guess_time_type(out_tbl $ time_value )
1085+ # Even if they narrow down to just a single value of an other_keys column,
1086+ # it's probably still better (& simpler) to treat it as an other_keys column
1087+ # since it still exists in the result:
1088+ out_other_keys <- .data $ other_keys
1089+ # `filter` makes no guarantees about not aliasing columns in its result when
1090+ # the filter condition is all TRUE, so don't setDT.
1091+ out_dtbl <- as.data.table(out_tbl , key = out_other_keys )
1092+ result <- new_epi_archive(
1093+ out_dtbl ,
1094+ out_geo_type , out_time_type , out_other_keys ,
1095+ # Assume version-related metadata unchanged; part of why we want to push
1096+ # back on filter expressions like `.data$version <= .env$as_of`:
1097+ .data $ clobberable_versions_start , .data $ versions_end
1098+ )
1099+ # Filtering down rows while keeping all (ukey) columns should preserve ukey
1100+ # uniqueness.
1101+ result
1102+ }
0 commit comments