@@ -80,19 +80,13 @@ epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE,
8080 " `version` must have the same `class` vector as `epi_archive$DT$version`."
8181 )
8282 }
83- if (! identical(typeof(version ), typeof(x $ DT $ version ))) {
84- cli_abort(
85- " `version` must have the same `typeof` as `epi_archive$DT$version`."
86- )
87- }
8883 assert_scalar(version , na.ok = FALSE )
8984 if (version > x $ versions_end ) {
9085 cli_abort(" `version` must be at most `epi_archive$versions_end`." )
9186 }
9287 assert_scalar(min_time_value , na.ok = FALSE )
9388 min_time_value_inf <- is.infinite(min_time_value ) && min_time_value < 0
94- min_time_value_same_type <- typeof(min_time_value ) == typeof(x $ DT $ time_value ) &
95- class(min_time_value ) == class(x $ DT $ time_value )
89+ min_time_value_same_type <- identical(class(min_time_value ), class(x $ DT $ time_value ))
9690 if (! min_time_value_inf && ! min_time_value_same_type ) {
9791 cli_abort(" `min_time_value` must be either -Inf or a time_value of the same type and
9892 class as `epi_archive$time_value`." )
@@ -941,9 +935,6 @@ epix_truncate_versions_after.epi_archive <- function(x, max_version) {
941935 if (! identical(class(max_version ), class(x $ DT $ version ))) {
942936 cli_abort(" `max_version` must have the same `class` as `epi_archive$DT$version`." )
943937 }
944- if (! identical(typeof(max_version ), typeof(x $ DT $ version ))) {
945- cli_abort(" `max_version` must have the same `typeof` as `epi_archive$DT$version`." )
946- }
947938 assert_scalar(max_version , na.ok = FALSE )
948939 if (max_version > x $ versions_end ) {
949940 cli_abort(" `max_version` must be at most `epi_archive$versions_end`." )
@@ -1020,3 +1011,163 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
10201011 attr(data , " epiprocess::col_modify_recorder_df::cols" ) <- cols
10211012 data
10221013}
1014+
1015+
1016+
1017+ # ' [`dplyr::filter`] for `epi_archive`s
1018+ # '
1019+ # ' @param .data an `epi_archive`
1020+ # ' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed
1021+ # ' unless you use `.format_aware = TRUE`; see details.
1022+ # ' @param .by as in [`dplyr::filter`]
1023+ # ' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See
1024+ # ' details.
1025+ # '
1026+ # ' @details
1027+ # '
1028+ # ' By default, using the `version` column or measurement columns is disabled as
1029+ # ' it's easy to get unexpected results. See if either [`epix_as_of`] or
1030+ # ' [`epix_slide`] works for any version selection you have in mind: for version
1031+ # ' selection, see the `version` or `.versions` args, respectively; for
1032+ # ' measurement column-based filtering, try `filter`ing after `epix_as_of` or
1033+ # ' inside the `.f` in `epix_slide()`. If they don't cover your use case, then
1034+ # ' you can set `.format_aware = TRUE` to enable usage of these columns, but be
1035+ # ' careful to:
1036+ # ' * Factor in that `.data$DT` may have been converted into a compact format
1037+ # ' based on diffing consecutive versions, and the last version of each
1038+ # ' observation in `.data$DT` will always be carried forward to future
1039+ # ' `version`s`; see details of [`as_epi_archive`].
1040+ # ' * Set `clobberable_versions_start` and `versions_end` of the result
1041+ # ' appropriately after the `filter` call. They will be initialized with the
1042+ # ' same values as in `.data`.
1043+ # '
1044+ # ' `dplyr::filter` also has an optional argument `.preserve`, which should not
1045+ # ' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
1046+ # ' not currently support `dplyr::filter`.
1047+ # '
1048+ # ' @examples
1049+ # '
1050+ # ' # Filter to one location and a particular time range:
1051+ # ' archive_cases_dv_subset %>%
1052+ # ' filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
1053+ # '
1054+ # ' # Convert to weekly by taking the Saturday data for each week, so that
1055+ # ' # `case_rate_7d_av` represents a Sun--Sat average:
1056+ # ' archive_cases_dv_subset %>%
1057+ # ' filter(as.POSIXlt(time_value)$wday == 6L)
1058+ # '
1059+ # ' # Filtering involving the `version` column or measurement columns requires
1060+ # ' # extra care. See epix_as_of and epix_slide instead for some common
1061+ # ' # operations. One semi-common operation that ends up being fairly simple is
1062+ # ' # treating observations as finalized after some amount of time, and ignoring
1063+ # ' # any revisions that were made after that point:
1064+ # ' archive_cases_dv_subset %>%
1065+ # ' filter(
1066+ # ' version <= time_value + as.difftime(60, units = "days"),
1067+ # ' .format_aware = TRUE
1068+ # ' )
1069+ # '
1070+ # ' @export
1071+ filter.epi_archive <- function (.data , ... , .by = NULL , .format_aware = FALSE ) {
1072+ in_tbl <- tibble :: as_tibble(as.list(.data $ DT ), .name_repair = " minimal" )
1073+ if (.format_aware ) {
1074+ out_tbl <- in_tbl %> %
1075+ filter(... , .by = {{ .by }})
1076+ } else {
1077+ measurement_colnames <- setdiff(names(.data $ DT ), key_colnames(.data ))
1078+ forbidden_colnames <- c(" version" , measurement_colnames )
1079+ out_tbl <- in_tbl %> %
1080+ filter(
1081+ # Add our own fake filter arg to the user's ..., to update the data mask
1082+ # to prevent `version` column usage.
1083+ {
1084+ # We should be evaluating inside the data mask. To disable both
1085+ # `version` and `.data$version` etc., we need to go to the ancestor
1086+ # environment containing the data mask's column bindings. This is
1087+ # likely just the parent env, but search to make sure, in a way akin
1088+ # to `<<-`:
1089+ e <- environment()
1090+ while (! identical(e , globalenv()) && ! identical(e , emptyenv())) { # nolint:vector_logic_linter
1091+ if (" version" %in% names(e )) {
1092+ # This is where the column bindings are. Replace the forbidden ones.
1093+ # They are expected to be active bindings, so directly
1094+ # assigning has issues; `rm` first.
1095+ rm(list = forbidden_colnames , envir = e )
1096+ eval_env <- new.env(parent = asNamespace(" epiprocess" )) # see (2) below
1097+ delayedAssign(
1098+ " version" ,
1099+ cli_abort(c(
1100+ " Using `version` in `filter.epi_archive` may produce unexpected results." ,
1101+ " >" = " See if `epix_as_of` or `epix_slide` would work instead." ,
1102+ " >" = " If not, see `?filter.epi_archive` details for how to proceed."
1103+ ), class = " epiprocess__filter_archive__used_version" ),
1104+ eval.env = eval_env ,
1105+ assign.env = e
1106+ )
1107+ for (measurement_colname in measurement_colnames ) {
1108+ # Record current `measurement_colname` and set up execution for
1109+ # the promise for the error in its own dedicated environment, so
1110+ # that (1) `for` loop updating its value and `rm` cleanup don't
1111+ # mess things up. We can also (2) prevent changes to data mask
1112+ # ancestry (to involve user's quosure env rather than our
1113+ # quosure env) or contents (from edge case of user binding
1114+ # functions inside the mask) from potentially interfering by
1115+ # setting the promise's execution environment to skip over the
1116+ # data mask.
1117+ eval_env <- new.env(parent = asNamespace(" epiprocess" ))
1118+ eval_env [[" local_measurement_colname" ]] <- measurement_colname
1119+ delayedAssign(
1120+ measurement_colname ,
1121+ cli_abort(c(
1122+ " Using `{format_varname(local_measurement_colname)}`
1123+ in `filter.epi_archive` may produce unexpected results." ,
1124+ " >" = " See `?filter.epi_archive` details for how to proceed."
1125+ ), class = " epiprocess__filter_archive__used_measurement" ),
1126+ eval.env = eval_env ,
1127+ assign.env = e
1128+ )
1129+ }
1130+ break
1131+ }
1132+ e <- parent.env(e )
1133+ }
1134+ # Don't mask similarly-named user objects in ancestor envs:
1135+ rm(list = c(" e" , " measurement_colname" , " eval_env" ))
1136+ TRUE
1137+ },
1138+ ... ,
1139+ .by = {{ .by }}
1140+ )
1141+ }
1142+ # We could try to re-infer the geo_type, e.g., when filtering from
1143+ # national+state to just state. However, we risk inference failures such as
1144+ # "hrr" -> "hhs" from filtering to hrr 10, or "custom" -> USA-related when
1145+ # working with non-USA data:
1146+ out_geo_type <- .data $ geo_type
1147+ if (.data $ time_type == " day" ) {
1148+ # We might be going from daily to weekly; re-infer:
1149+ out_time_type <- guess_time_type(out_tbl $ time_value )
1150+ } else {
1151+ # We might be filtering weekly to a single time_value; avoid re-inferring to
1152+ # stay "week". Or in other cases, just skip inferring, as re-inferring is
1153+ # expected to match the input time_type:
1154+ out_time_type <- .data $ time_type
1155+ }
1156+ # Even if they narrow down to just a single value of an other_keys column,
1157+ # it's probably still better (& simpler) to treat it as an other_keys column
1158+ # since it still exists in the result:
1159+ out_other_keys <- .data $ other_keys
1160+ # `filter` makes no guarantees about not aliasing columns in its result when
1161+ # the filter condition is all TRUE, so don't setDT.
1162+ out_dtbl <- as.data.table(out_tbl , key = out_other_keys )
1163+ result <- new_epi_archive(
1164+ out_dtbl ,
1165+ out_geo_type , out_time_type , out_other_keys ,
1166+ # Assume version-related metadata unchanged; part of why we want to push
1167+ # back on filter expressions like `.data$version <= .env$as_of`:
1168+ .data $ clobberable_versions_start , .data $ versions_end
1169+ )
1170+ # Filtering down rows while keeping all (ukey) columns should preserve ukey
1171+ # uniqueness.
1172+ result
1173+ }
0 commit comments