3434# ' `NA`'s compactify is run again if `compactify` is `TRUE` to make
3535# ' sure there are no duplicate values from occasions when the signal is
3636# ' revised to `NA`, and then back to its immediately-preceding value.
37- # ' @param print_inform bool, determines whether to print summary information, or
38- # ' only return the full summary tibble
3937# ' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any
4038# ' time_values that have not had at least `min_waiting_period` to stabilize as
4139# ' of the `versions_end` are removed. `min_waiting_period` should characterize
4240# ' the typical time during which most significant revisions occur. The default
4341# ' of 60 days corresponds to a typical near-final value for case counts as
4442# ' reported in the context of insurance. To avoid this filtering, either set
45- # ' to `NULL` or 0.
43+ # ' to `NULL` or 0. A `difftime` will be rounded up to the appropriate `time_type` if
44+ # ' necessary (that is 5 days will be rounded to 1 week if the data is weekly).
4645# ' @param within_latest double between 0 and 1. Determines the threshold
4746# ' used for the `lag_to`
48- # ' @param quick_revision difftime or integer (integer is treated as days), for
49- # ' the printed summary, the amount of time between the final revision and the
50- # ' actual time_value to consider the revision quickly resolved. Default of 3
51- # ' days
52- # ' @param few_revisions integer, for the printed summary, the upper bound on the
53- # ' number of revisions to consider "few". Default is 3.
54- # ' @param abs_spread_threshold length-1 numeric, for the printed summary, the
55- # ' maximum spread used to characterize revisions which don't actually change
56- # ' very much. Default is 5% of the maximum value in the dataset, but this is
57- # ' the most unit dependent of values, and likely needs to be chosen
58- # ' appropriate for the scale of the dataset.
59- # ' @param rel_spread_threshold length-1 double between 0 and 1, for the printed
60- # ' summary, the relative spread fraction used to characterize revisions which
61- # ' don't actually change very much. Default is .1, or 10% of the final value
6247# ' @param compactify bool. If `TRUE`, we will compactify after the signal
6348# ' requested in `...` has been selected on its own and the `drop_nas` step.
6449# ' This helps, for example, to give similar results when called on
6752# ' requested signal. The default is `TRUE`.
6853# ' @param compactify_abs_tol length-1 double, used if `compactify` is `TRUE`, it
6954# ' determines the threshold for when two doubles are considered identical.
55+ # ' @param return_only_tibble boolean to return only the simple `tibble` of
56+ # ' computational results rather than the complete S3 object.
7057# '
7158# ' @details Applies to `epi_archive`s with `time_type`s of `"day"`, `"week"`,
7259# ' and `"yearmonth"`. It can also work with a `time_type` of `"integer"` if
7663# ' produce incorrect results for some calculations, since week numbering
7764# ' contains jumps at year boundaries.
7865# '
66+ # ' @return An S3 object with class `revision_behavior`. This function is typically
67+ # ' called for the purposes of inspecting the printed output. The
68+ # ' results of the computations are available in
69+ # ' `revision_analysis(...)$revision_behavior`. If you only want to access
70+ # ' the internal computations, use `return_only_tibble = TRUE`.
71+ # '
7972# ' @examples
80- # ' revision_example <- revision_summary (archive_cases_dv_subset, percent_cli)
81- # ' revision_example %>% arrange(desc(spread))
73+ # ' revision_example <- revision_analysis (archive_cases_dv_subset, percent_cli)
74+ # ' revision_example$revision_behavior %>% arrange(desc(spread))
8275# '
8376# ' @export
8477# ' @importFrom cli cli_inform cli_abort cli_li
8578# ' @importFrom rlang list2 syms dots_n
8679# ' @importFrom vctrs vec_cast
8780# ' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across
8881# ' everything ungroup summarize if_else %>%
89- revision_summary <- function (epi_arch ,
90- ... ,
91- drop_nas = TRUE ,
92- print_inform = TRUE ,
93- min_waiting_period = as.difftime(60 , units = " days" ) %> %
94- difftime_approx_ceiling_time_delta(epi_arch $ time_type ),
95- within_latest = 0.2 ,
96- quick_revision = as.difftime(3 , units = " days" ) %> %
97- difftime_approx_ceiling_time_delta(epi_arch $ time_type ),
98- few_revisions = 3 ,
99- abs_spread_threshold = NULL ,
100- rel_spread_threshold = 0.1 ,
101- compactify = TRUE ,
102- compactify_abs_tol = 0 ) {
82+ revision_analysis <- function (epi_arch ,
83+ ... ,
84+ drop_nas = TRUE ,
85+ min_waiting_period = as.difftime(60 , units = " days" ),
86+ within_latest = 0.2 ,
87+ compactify = TRUE ,
88+ compactify_abs_tol = 0 ,
89+ return_only_tibble = FALSE ) {
10390 assert_class(epi_arch , " epi_archive" )
91+ if (methods :: is(min_waiting_period , " difftime" )) {
92+ min_waiting_period <- min_waiting_period %> %
93+ difftime_approx_ceiling_time_delta(epi_arch $ time_type )
94+ }
10495 # if the column to summarize isn't specified, use the only one if there is only one
10596 if (dots_n(... ) == 0 ) {
10697 # Choose the first column that's not a key:
@@ -126,11 +117,6 @@ revision_summary <- function(epi_arch,
126117 cli_abort(" Not currently implementing more than one column at a time. Run each separately." )
127118 }
128119 }
129- if (is.null(abs_spread_threshold )) {
130- abs_spread_threshold <- .05 * epi_arch $ DT %> %
131- pull(!! arg ) %> %
132- max(na.rm = TRUE )
133- }
134120 # for each time_value, get
135121 # the number of revisions
136122 # the maximum spread in value (both absolute and relative)
@@ -193,63 +179,113 @@ revision_summary <- function(epi_arch,
193179 time_value , geo_value , all_of(epikey_names ), n_revisions , min_lag , max_lag , # nolint: object_usage_linter
194180 lag_near_latest , spread , rel_spread , min_value , max_value , median_value # nolint: object_usage_linter
195181 )
196- if (print_inform ) {
197- cli_inform(" Min lag (time to first version):" )
198- time_delta_summary(revision_behavior $ min_lag , time_type ) %> % print()
199- if (! drop_nas ) {
200- total_na <- epi_arch $ DT %> %
201- filter(is.na(c_across(!! arg ))) %> % # nolint: object_usage_linter
202- nrow()
203- cli_inform(" Fraction of all versions that are `NA`:" )
204- cli_li(num_percent(total_na , nrow(epi_arch $ DT ), " " ))
205- cli_inform(" " )
206- }
207- cli_inform(" Fraction of epi_key+time_values with" )
208- total_num <- nrow(revision_behavior ) # nolint: object_usage_linter
209- total_num_unrevised <- sum(revision_behavior $ n_revisions == 0 ) # nolint: object_usage_linter
210- cli_inform(" No revisions:" )
211- cli_li(num_percent(total_num_unrevised , total_num , " " ))
212- total_quickly_revised <- sum( # nolint: object_usage_linter
213- time_delta_to_n_steps(revision_behavior $ max_lag , time_type ) < =
214- time_delta_to_n_steps(quick_revision , time_type )
215- )
216- cli_inform(" Quick revisions (last revision within {format_time_delta(quick_revision, time_type)}
217- of the `time_value`):" )
218- cli_li(num_percent(total_quickly_revised , total_num , " " ))
219- total_barely_revised <- sum( # nolint: object_usage_linter
220- revision_behavior $ n_revisions < =
221- few_revisions
222- )
223- cli_inform(" Few revisions (At most {few_revisions} revisions for that `time_value`):" )
224- cli_li(num_percent(total_barely_revised , total_num , " " ))
225- cli_inform(" " )
226- cli_inform(" Fraction of revised epi_key+time_values which have:" )
182+ total_na <- epi_arch $ DT %> %
183+ filter(is.na(c_across(!! arg ))) %> % # nolint: object_usage_linter
184+ nrow()
185+ if (! return_only_tibble ) {
186+ revision_behavior <- structure(list (
187+ revision_behavior = revision_behavior ,
188+ range_time_values = range(epi_arch $ DT $ time_value ),
189+ signal_variable = arg ,
190+ drop_nas = drop_nas ,
191+ time_type = time_type ,
192+ total_na = total_na ,
193+ max_val = max(epi_arch $ DT [[arg ]], na.rm = TRUE ),
194+ n_obs = nrow(epi_arch $ DT ),
195+ within_latest = within_latest
196+ ), class = " revision_analysis" )
197+ }
198+ return (revision_behavior )
199+ }
200+
201+
227202
228- real_revisions <- revision_behavior %> % filter(n_revisions > 0 ) # nolint: object_usage_linter
229- n_real_revised <- nrow(real_revisions ) # nolint: object_usage_linter
230- rel_spread <- sum( # nolint: object_usage_linter
231- real_revisions $ rel_spread <
232- rel_spread_threshold ,
233- na.rm = TRUE
234- ) + sum(is.na(real_revisions $ rel_spread ))
235- cli_inform(" Less than {rel_spread_threshold} spread in relative value:" )
236- cli_li(num_percent(rel_spread , n_real_revised , " " ))
237- abs_spread <- sum( # nolint: object_usage_linter
238- real_revisions $ spread >
239- abs_spread_threshold
240- ) # nolint: object_usage_linter
241- cli_inform(" Spread of more than {abs_spread_threshold} in actual value (when revised):" )
242- cli_li(num_percent(abs_spread , n_real_revised , " " ))
243203
244- # time_type_unit_pluralizer[[time_type]] is a format string controlled by us
245- # and/or downstream devs, so we can paste it onto our format string safely:
246- units_plural <- pluralize(paste0(" {qty(2)}" , time_type_unit_pluralizer [[time_type ]])) # nolint: object_usage_linter
247- cli_inform(" {toTitleCase(units_plural)} until within {within_latest*100}% of the latest value:" )
248- time_delta_summary(revision_behavior [[" lag_near_latest" ]], time_type ) %> % print()
204+ # ' Print a `revision_analysis` object
205+ # '
206+ # ' @param x a `revision_analysis` object
207+ # ' @param quick_revision Difftime or integer (integer is treated as days).
208+ # ' The amount of time between the final revision and the
209+ # ' actual time_value to consider the revision quickly resolved. Default of 3
210+ # ' days. This will be rounded up to the appropriate `time_type` if
211+ # ' necessary (that is 5 days will be rounded to 1 week if the data is weekly).
212+ # ' @param few_revisions Integer. The upper bound on the
213+ # ' number of revisions to consider "few". Default is 3.
214+ # ' @param abs_spread_threshold Scalar numeric. The
215+ # ' maximum spread used to characterize revisions which don't actually change
216+ # ' very much. Default is 5% of the maximum value in the dataset, but this is
217+ # ' the most unit dependent of values, and likely needs to be chosen
218+ # ' appropriate for the scale of the dataset.
219+ # ' @param rel_spread_threshold Scalar between 0 and 1. The relative spread fraction used to characterize revisions which
220+ # ' don't actually change very much. Default is .1, or 10% of the final value
221+ # '
222+ # ' @rdname revision_analysis
223+ # ' @export
224+ print.revision_analysis <- function (x ,
225+ quick_revision = as.difftime(3 , units = " days" ),
226+ few_revisions = 3 ,
227+ abs_spread_threshold = NULL ,
228+ rel_spread_threshold = 0.1 ,
229+ ... ) {
230+ if (methods :: is(quick_revision , " difftime" )) {
231+ quick_revision <- quick_revision %> %
232+ difftime_approx_ceiling_time_delta(x $ time_type )
249233 }
250- return (revision_behavior )
234+ if (is.null(abs_spread_threshold )) abs_spread_threshold <- .05 * x $ max_val
235+ rev_beh <- x $ revision_behavior
236+ cli :: cli_h2(" An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[1]}}." )
237+ cli :: cli_h3(" Min lag (time to first version):" )
238+ time_delta_summary(rev_beh $ min_lag , x $ time_type ) %> % print()
239+ if (! x $ drop_nas ) {
240+ cli_inform(" Fraction of all versions that are `NA`:" )
241+ cli_li(num_percent(x $ total_na , x $ n_obs , " " ))
242+ cli_inform(" " )
243+ }
244+ cli :: cli_h3(" Fraction of epi_key + time_values with" )
245+ total_num <- nrow(rev_beh ) # nolint: object_usage_linter
246+ total_num_unrevised <- sum(rev_beh $ n_revisions == 0 ) # nolint: object_usage_linter
247+ cli_inform(" No revisions:" )
248+ cli_li(num_percent(total_num_unrevised , total_num , " " ))
249+ total_quickly_revised <- sum( # nolint: object_usage_linter
250+ time_delta_to_n_steps(rev_beh $ max_lag , x $ time_type ) < =
251+ time_delta_to_n_steps(quick_revision , x $ time_type )
252+ )
253+ cli_inform(" Quick revisions (last revision within {format_time_delta(quick_revision, x$time_type)}
254+ of the `time_value`):" )
255+ cli_li(num_percent(total_quickly_revised , total_num , " " ))
256+ total_barely_revised <- sum(rev_beh $ n_revisions < = few_revisions )
257+ cli_inform(" Few revisions (At most {.val {few_revisions}} revisions for that `time_value`):" )
258+ cli_li(num_percent(total_barely_revised , total_num , " " ))
259+
260+ cli :: cli_h3(" Fraction of revised epi_key + time_values which have:" )
261+
262+ real_revisions <- rev_beh %> % filter(n_revisions > 0 ) # nolint: object_usage_linter
263+ n_real_revised <- nrow(real_revisions ) # nolint: object_usage_linter
264+ rel_spread <- sum( # nolint: object_usage_linter
265+ real_revisions $ rel_spread < rel_spread_threshold ,
266+ na.rm = TRUE
267+ ) + sum(is.na(real_revisions $ rel_spread ))
268+ cli_inform(" Less than {.val {rel_spread_threshold}} spread in relative value:" )
269+ cli_li(num_percent(rel_spread , n_real_revised , " " ))
270+ abs_spread <- sum( # nolint: object_usage_linter
271+ real_revisions $ spread > abs_spread_threshold
272+ ) # nolint: object_usage_linter
273+ divid <- cli :: cli_div(theme = list (.val = list (digits = 3 )))
274+ cli_inform(" Spread of more than {.val {abs_spread_threshold}} in actual value (when revised):" )
275+ cli :: cli_end(divid )
276+ cli_li(num_percent(abs_spread , n_real_revised , " " ))
277+
278+ # time_type_unit_pluralizer[[time_type]] is a format string controlled by us
279+ # and/or downstream devs, so we can paste it onto our format string safely:
280+ units_plural <- pluralize(paste0(" {qty(2)}" , time_type_unit_pluralizer [[x $ time_type ]])) # nolint: object_usage_linter
281+ cli :: cli_h3(" {toTitleCase(units_plural)} until within {.val {x$within_latest*100}}% of the latest value:" )
282+ time_delta_summary(rev_beh [[" lag_near_latest" ]], x $ time_type ) %> % print()
251283}
252284
285+ # ' @export
286+ # ' @rdname revision_analysis
287+ revision_summary <- revision_analysis
288+
253289# ' pull the value from lags when values starts indefinitely being within prop of its latest value.
254290# ' @param lags vector of lags; should be sorted
255291# ' @param values this should be a vector (e.g., a column) with length matching that of `lags`
0 commit comments