2424# ' @return none
2525# '
2626# ' @import data.table
27- # ' @importFrom dplyr full_join %>%
27+ # ' @importFrom dplyr full_join %>% select all_of
2828# ' @importFrom purrr reduce
2929# '
3030# ' @export
@@ -44,9 +44,18 @@ produce_aggregates <- function(df, aggregations, cw_list, params) {
4444 df <- output [[1 ]]
4545 aggregations <- output [[2 ]]
4646
47+ # # Keep only columns used in indicators, plus supporting columns.
48+ group_vars <- unique( unlist(aggregations $ group_by ) )
49+ df <- select(df ,
50+ all_of(unique(aggregations $ metric )),
51+ all_of(unique(aggregations $ var_weight )),
52+ all_of( group_vars [group_vars != " geo_id" ] ),
53+ zip5 ,
54+ start_dt )
55+
4756 agg_groups <- unique(aggregations [c(" group_by" , " geo_level" )])
4857
49- # For each unique combination of groupby_vars and geo level, run aggregation process once
58+ # For each unique combination of group_vars and geo level, run aggregation process once
5059 # and calculate all desired aggregations on the grouping. Rename columns. Save
5160 # to individual files
5261 for (group_ind in seq_along(agg_groups $ group_by )) {
@@ -158,37 +167,43 @@ post_process_aggs <- function(df, aggregations, cw_list) {
158167 # - multi-select items are converted to a series of binary columns, one for
159168 # each unique level/response code; multi-select used for grouping are left as-is.
160169 # - multiple choice items are left as-is
161-
170+
162171 # ### TODO: How do we want to handle multi-select items when used for grouping?
163- agg_groups <- unique(aggregations $ group_by )
164- group_cols_to_convert <- unique(do.call(c , agg_groups ))
165- group_cols_to_convert <- group_cols_to_convert [startsWith(group_cols_to_convert , " b_" )]
166-
167- metric_cols_to_convert <- unique(aggregations $ metric )
168-
169- for (col_var in c(group_cols_to_convert , metric_cols_to_convert )) {
170- if ( is.null(df [[col_var ]]) ) {
171- aggregations <- aggregations [aggregations $ metric != col_var &
172- ! mapply(aggregations $ group_by ,
173- FUN = function (x ) {col_var %in% x }), ]
174- msg_plain(
175- paste0(
176- col_var , " is not defined. Removing all aggregations that use it. " ,
177- nrow(aggregations ), " remaining" )
178- )
172+ group_vars <- unique( unlist(aggregations $ group_by ) )
173+ group_vars <- group_vars [group_vars != " geo_id" ]
174+
175+ metric_cols <- unique(aggregations $ metric )
176+
177+ cols_check_available <- unique(c(group_vars , metric_cols ))
178+ available <- cols_check_available %in% names(df )
179+ cols_not_available <- cols_check_available [ ! available ]
180+ for (col_var in cols_not_available ) {
181+ # Remove from aggregations
182+ aggregations <- aggregations [aggregations $ metric != col_var &
183+ ! mapply(aggregations $ group_by ,
184+ FUN = function (x ) {col_var %in% x }), ]
185+ msg_plain(paste0(
186+ col_var , " is not defined. Removing all aggregations that use it. " ,
187+ nrow(aggregations ), " remaining" )
188+ )
189+ }
190+
191+ cols_available <- cols_check_available [ available ]
192+ for (col_var in cols_available ) {
193+ if ( col_var %in% group_vars & ! (col_var %in% metric_cols ) & ! startsWith(col_var , " b_" ) ) {
179194 next
180195 }
181196
182197 if (startsWith(col_var , " b_" )) { # Binary
183198 output <- code_binary(df , aggregations , col_var )
184- } else if (startsWith(col_var , " ms_" )) { # Multiselect
185- output <- code_multiselect(df , aggregations , col_var )
186199 } else if (startsWith(col_var , " n_" )) { # Numeric free response
187200 output <- code_numeric_freeresponse(df , aggregations , col_var )
188- } else if (startsWith(col_var , " mc_" )) { # Multiple choice
201+ } else if (startsWith(col_var , " ms_" )) { # Multi-select
202+ output <- code_multiselect(df , aggregations , col_var )
203+ } else {
204+ # Multiple choice and variables that are formatted differently
189205 output <- list (df , aggregations )
190206 }
191-
192207 df <- output [[1 ]]
193208 aggregations <- output [[2 ]]
194209 }
@@ -233,28 +248,27 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
233248 # # inefficient; profiling shows the cost to be negligible, so shut it up
234249 df <- suppressWarnings(inner_join(df , crosswalk_data , by = " zip5" ))
235250
236- groupby_vars <- aggregations $ group_by [[1 ]]
251+ group_vars <- aggregations $ group_by [[1 ]]
237252
238- if (all(groupby_vars %in% names(df ))) {
239- unique_group_combos <- unique(df [, groupby_vars , with = FALSE ])
253+ if (all(group_vars %in% names(df ))) {
254+ unique_group_combos <- unique(df [, group_vars , with = FALSE ])
240255 unique_group_combos <- unique_group_combos [complete.cases(unique_group_combos )]
241256 } else {
242257 msg_plain(
243258 sprintf(
244259 " not all of groupby columns %s available in data; skipping aggregation" ,
245- paste(groupby_vars , collapse = " , " )
260+ paste(group_vars , collapse = " , " )
246261 ))
247262 }
248263
249264 if ( ! exists(" unique_group_combos" ) || nrow(unique_group_combos ) == 0 ) {
250265 return (list ())
251266 }
252267
253-
254268 # # Set an index on the groupby var columns so that the groupby step can be
255269 # # faster; data.table stores the sort order of the column and
256270 # # uses a binary search to find matching values, rather than a linear scan.
257- setindexv(df , groupby_vars )
271+ setindexv(df , group_vars )
258272
259273 calculate_group <- function (ii ) {
260274 target_group <- unique_group_combos [ii ]
@@ -287,15 +301,15 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
287301 # # Do post-processing.
288302 for (row in seq_len(nrow(aggregations ))) {
289303 aggregation <- aggregations $ id [row ]
290- groupby_vars <- aggregations $ group_by [[row ]]
304+ group_vars <- aggregations $ group_by [[row ]]
291305 post_fn <- aggregations $ post_fn [[row ]]
292306
293307 dfs_out [[aggregation ]] <- dfs_out [[aggregation ]][
294- rowSums(is.na(dfs_out [[aggregation ]][, c(" val" , " sample_size" , groupby_vars )])) == 0 ,
308+ rowSums(is.na(dfs_out [[aggregation ]][, c(" val" , " sample_size" , group_vars )])) == 0 ,
295309 ]
296310
297311 if (geo_level == " county" ) {
298- df_megacounties <- megacounty(dfs_out [[aggregation ]], params $ num_filter , groupby_vars )
312+ df_megacounties <- megacounty(dfs_out [[aggregation ]], params $ num_filter , group_vars )
299313 dfs_out [[aggregation ]] <- bind_rows(dfs_out [[aggregation ]], df_megacounties )
300314 }
301315
0 commit comments