From 713dde68d503fed9bd1ee38e1a20735416da8db5 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 3 Mar 2021 14:57:25 -0500 Subject: [PATCH 01/23] reduce memory usage by subsetting columns --- facebook/delphiFacebook/NAMESPACE | 1 + .../delphiFacebook/R/contingency_aggregate.R | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/facebook/delphiFacebook/NAMESPACE b/facebook/delphiFacebook/NAMESPACE index 9ef255f84..07b9c5a32 100644 --- a/facebook/delphiFacebook/NAMESPACE +++ b/facebook/delphiFacebook/NAMESPACE @@ -63,6 +63,7 @@ importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,all_of) importFrom(dplyr,anti_join) +importFrom(dplyr,any_of) importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index 00e9b065e..8188fbcc7 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -24,11 +24,23 @@ #' @return none #' #' @import data.table -#' @importFrom dplyr full_join %>% +#' @importFrom dplyr full_join %>% select all_of any_of #' @importFrom purrr reduce #' #' @export produce_aggregates <- function(df, aggregations, cw_list, params) { + output <- post_process_aggs(df, aggregations, cw_list) + df <- output[[1]] + aggregations <- output[[2]] + + ## Keep only columns used in indicators, plus supporting columns. + df <- select(df, + all_of(unique(aggregations$metric)), + all_of(unique(aggregations$var_weight)), + any_of( unique( unlist(aggregations$group_by) ) ), + zip5, + start_dt) + msg_plain(paste0("Producing aggregates...")) ## For the date range lookups we do on df, use a data.table key. This puts the ## table in sorted order so data.table can use a binary search to find @@ -40,10 +52,6 @@ produce_aggregates <- function(df, aggregations, cw_list, params) { # Keep only obs in desired date range. df <- df[start_dt >= params$start_time & start_dt <= params$end_time] - output <- post_process_aggs(df, aggregations, cw_list) - df <- output[[1]] - aggregations <- output[[2]] - agg_groups <- unique(aggregations[c("group_by", "geo_level")]) # For each unique combination of groupby_vars and geo level, run aggregation process once From 5a3124934694ff73b25775af8884fea1a5bdfb3b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 3 Mar 2021 17:48:35 -0500 Subject: [PATCH 02/23] switch to all_of to trigger error for missing columns --- facebook/delphiFacebook/R/contingency_aggregate.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index 8188fbcc7..b5b975ee3 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -24,7 +24,7 @@ #' @return none #' #' @import data.table -#' @importFrom dplyr full_join %>% select all_of any_of +#' @importFrom dplyr full_join %>% select all_of #' @importFrom purrr reduce #' #' @export @@ -34,10 +34,11 @@ produce_aggregates <- function(df, aggregations, cw_list, params) { aggregations <- output[[2]] ## Keep only columns used in indicators, plus supporting columns. 
+ group_vars <- unique( unlist(aggregations$group_by) ) df <- select(df, all_of(unique(aggregations$metric)), all_of(unique(aggregations$var_weight)), - any_of( unique( unlist(aggregations$group_by) ) ), + all_of( group_vars[group_vars != "geo_id"] ), zip5, start_dt) From 486c7fad73f89cd6da95d2cb475d88e9a62fa320 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 11 Mar 2021 10:24:42 -0500 Subject: [PATCH 03/23] fix hesitant_and_* variables to be defined as among hesitant people --- facebook/delphiFacebook/NAMESPACE | 1 - .../delphiFacebook/R/contingency_variables.R | 79 +++++++++++++------ 2 files changed, 57 insertions(+), 23 deletions(-) diff --git a/facebook/delphiFacebook/NAMESPACE b/facebook/delphiFacebook/NAMESPACE index 07b9c5a32..9ef255f84 100644 --- a/facebook/delphiFacebook/NAMESPACE +++ b/facebook/delphiFacebook/NAMESPACE @@ -63,7 +63,6 @@ importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,all_of) importFrom(dplyr,anti_join) -importFrom(dplyr,any_of) importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) diff --git a/facebook/delphiFacebook/R/contingency_variables.R b/facebook/delphiFacebook/R/contingency_variables.R index 47aabc7ff..c95a554a6 100644 --- a/facebook/delphiFacebook/R/contingency_variables.R +++ b/facebook/delphiFacebook/R/contingency_variables.R @@ -345,14 +345,14 @@ create_derivative_columns <- function(df) { } else { df$b_65_or_older <- NA_real_ } - - if ("mc_accept_cov_vaccine" %in% names(df)) { - df$b_hesitant_cov_vaccine <- as.numeric( - df$mc_accept_cov_vaccine == "prob not vaccinate" | df$mc_accept_cov_vaccine == "def not vaccinate" - ) - } else { - df$b_hesitant_cov_vaccine <- NA_real_ - } + + if ("mc_accept_cov_vaccine" %in% names(df)) { + df$b_hesitant_cov_vaccine <- as.numeric( + df$mc_accept_cov_vaccine == "prob not vaccinate" | df$mc_accept_cov_vaccine == "def not vaccinate" + ) + } else { + df$b_hesitant_cov_vaccine <- NA_real_ + } if ("mc_concerned_sideeffects" %in% names(df)) { df$b_concerned_sideeffects <- as.numeric( @@ -362,31 +362,67 @@ create_derivative_columns <- function(df) { df$b_concerned_sideeffects <- NA_real_ } - df$b_hesitant_sideeffects <- as.numeric( - df$b_hesitant_cov_vaccine & df$b_concerned_sideeffects + df$b_hesitant_sideeffects <- case_when( + is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, + is.na(df$b_concerned_sideeffects) == TRUE ~ NA, + df$wave < 7 ~ NA, + df$b_hesitant_cov_vaccine == 0 ~ NA, + df$b_hesitant_cov_vaccine == 1 & df$b_concerned_sideeffects == 1 ~ TRUE, + df$b_hesitant_cov_vaccine == 1 & df$b_concerned_sideeffects == 0 ~ FALSE, + TRUE ~ NA ) - df$b_hesitant_sideeffects[df$wave < 7] <- NA_real_ if ( "b_vaccine_likely_friends" %in% names(df) & "b_vaccine_likely_local_health" %in% names(df) & "b_vaccine_likely_who" %in% names(df) & "b_vaccine_likely_govt_health" %in% names(df) & "b_vaccine_likely_politicians" %in% names(df) ) { - df$b_hesitant_trust_fam <- as.numeric( - df$b_hesitant_cov_vaccine & df$b_vaccine_likely_friends + df$b_hesitant_trust_fam <- case_when( + is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, + is.na(df$b_vaccine_likely_friends) == TRUE ~ NA, + df$wave < 7 ~ NA, + df$b_hesitant_cov_vaccine == 0 ~ NA, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_friends == 1 ~ TRUE, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_friends == 0 ~ FALSE, + TRUE ~ NA ) - df$b_hesitant_trust_healthcare <- as.numeric( - df$b_hesitant_cov_vaccine & df$b_vaccine_likely_local_health + 
df$b_hesitant_trust_healthcare <- case_when( + is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, + is.na(df$b_vaccine_likely_local_health) == TRUE ~ NA, + df$wave < 7 ~ NA, + df$b_hesitant_cov_vaccine == 0 ~ NA, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_local_health == 1 ~ TRUE, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_local_health == 0 ~ FALSE, + TRUE ~ NA ) - df$b_hesitant_trust_who <- as.numeric( - df$b_hesitant_cov_vaccine & df$b_vaccine_likely_who + df$b_hesitant_trust_who <- case_when( + is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, + is.na(df$b_vaccine_likely_who) == TRUE ~ NA, + df$wave < 7 ~ NA, + df$b_hesitant_cov_vaccine == 0 ~ NA, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_who == 1 ~ TRUE, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_who == 0 ~ FALSE, + TRUE ~ NA ) - df$b_hesitant_trust_govt <- as.numeric( - df$b_hesitant_cov_vaccine & df$b_vaccine_likely_govt_health + df$b_hesitant_trust_govt <- case_when( + is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, + is.na(df$b_vaccine_likely_govt_health) == TRUE ~ NA, + df$wave < 7 ~ NA, + df$b_hesitant_cov_vaccine == 0 ~ NA, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_govt_health == 1 ~ TRUE, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_govt_health == 0 ~ FALSE, + TRUE ~ NA ) - df$b_hesitant_trust_politicians <- as.numeric( - df$b_hesitant_cov_vaccine & df$b_vaccine_likely_politicians + df$b_hesitant_trust_politicians <- case_when( + is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, + is.na(df$b_vaccine_likely_politicians) == TRUE ~ NA, + df$wave < 7 ~ NA, + df$b_hesitant_cov_vaccine == 0 ~ NA, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_politicians == 1 ~ TRUE, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_politicians == 0 ~ FALSE, + TRUE ~ NA ) + } else { df$b_hesitant_trust_fam <- NA_real_ df$b_hesitant_trust_healthcare <- NA_real_ @@ -429,7 +465,6 @@ remap_response <- function(df, col_var, map_old_new, default=NULL, response_type df[[col_var]] <- recode(df[[col_var]], !!!map_old_new, .default=default) } else if (response_type == "ms") { split_col <- split_options(df[[col_var]]) - map_fn <- ifelse( is.null(getOption("mc.cores")) , mapply, mcmapply) df[[col_var]] <- map_fn(split_col, FUN=function(row) { if ( length(row) == 1 && all(is.na(row)) ) { From c20a4a5a936f9eb1d60f4f2544d1f346a733a67f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 12 Mar 2021 10:41:48 -0500 Subject: [PATCH 04/23] fix failing tests --- .../delphiFacebook/R/contingency_aggregate.R | 64 +++++++++++++------ .../delphiFacebook/R/contingency_variables.R | 1 + 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index b5b975ee3..0af4ddbff 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -29,6 +29,17 @@ #' #' @export produce_aggregates <- function(df, aggregations, cw_list, params) { + msg_plain(paste0("Producing aggregates...")) + ## For the date range lookups we do on df, use a data.table key. This puts the + ## table in sorted order so data.table can use a binary search to find + ## matching dates, rather than a linear scan, and is important for very large + ## input files. + df <- as.data.table(df) + setkeyv(df, "start_dt") + + # Keep only obs in desired date range. 
+ df <- df[start_dt >= params$start_time & start_dt <= params$end_time] + output <- post_process_aggs(df, aggregations, cw_list) df <- output[[1]] aggregations <- output[[2]] @@ -42,17 +53,6 @@ produce_aggregates <- function(df, aggregations, cw_list, params) { zip5, start_dt) - msg_plain(paste0("Producing aggregates...")) - ## For the date range lookups we do on df, use a data.table key. This puts the - ## table in sorted order so data.table can use a binary search to find - ## matching dates, rather than a linear scan, and is important for very large - ## input files. - df <- as.data.table(df) - setkeyv(df, "start_dt") - - # Keep only obs in desired date range. - df <- df[start_dt >= params$start_time & start_dt <= params$end_time] - agg_groups <- unique(aggregations[c("group_by", "geo_level")]) # For each unique combination of groupby_vars and geo level, run aggregation process once @@ -168,14 +168,11 @@ post_process_aggs <- function(df, aggregations, cw_list) { # each unique level/response code; multi-select used for grouping are left as-is. # - multiple choice items are left as-is - #### TODO: How do we want to handle multi-select items when used for grouping? + #### TODO: How do we want to handle multi-select items used for grouping? agg_groups <- unique(aggregations$group_by) group_cols_to_convert <- unique(do.call(c, agg_groups)) - group_cols_to_convert <- group_cols_to_convert[startsWith(group_cols_to_convert, "b_")] - - metric_cols_to_convert <- unique(aggregations$metric) - - for (col_var in c(group_cols_to_convert, metric_cols_to_convert)) { + for (col_var in group_cols_to_convert) { + if (col_var == "geo_id") { next } if ( is.null(df[[col_var]]) ) { aggregations <- aggregations[aggregations$metric != col_var & !mapply(aggregations$group_by, @@ -190,14 +187,41 @@ post_process_aggs <- function(df, aggregations, cw_list) { if (startsWith(col_var, "b_")) { # Binary output <- code_binary(df, aggregations, col_var) - } else if (startsWith(col_var, "ms_")) { # Multiselect - output <- code_multiselect(df, aggregations, col_var) } else if (startsWith(col_var, "n_")) { # Numeric free response output <- code_numeric_freeresponse(df, aggregations, col_var) - } else if (startsWith(col_var, "mc_")) { # Multiple choice + } else { + # Multiple choice, multi-select, and variables that are formatted differently output <- list(df, aggregations) } + df <- output[[1]] + aggregations <- output[[2]] + } + + metric_cols_to_convert <- unique(aggregations$metric) + for (col_var in metric_cols_to_convert) { + if ( is.null(df[[col_var]]) ) { + aggregations <- aggregations[aggregations$metric != col_var & + !mapply(aggregations$group_by, + FUN=function(x) {col_var %in% x}), ] + msg_plain( + paste0( + col_var, " is not defined. Removing all aggregations that use it. 
", + nrow(aggregations), " remaining") + ) + next + } + if (startsWith(col_var, "b_")) { # Binary + output <- code_binary(df, aggregations, col_var) + } else if (startsWith(col_var, "n_")) { # Numeric free response + output <- code_numeric_freeresponse(df, aggregations, col_var) + } + else if (startsWith(col_var, "ms_")) { # Multi-select + output <- code_multiselect(df, aggregations, col_var) + } else { + # Multiple choice and variables that are formatted differently + output <- list(df, aggregations) + } df <- output[[1]] aggregations <- output[[2]] } diff --git a/facebook/delphiFacebook/R/contingency_variables.R b/facebook/delphiFacebook/R/contingency_variables.R index c95a554a6..3d419de3a 100644 --- a/facebook/delphiFacebook/R/contingency_variables.R +++ b/facebook/delphiFacebook/R/contingency_variables.R @@ -545,6 +545,7 @@ code_multiselect <- function(df, aggregations, col_var) { sep="_") } )) + #### TODO: eval(parse()) here is not the best approach, but I can't find another # way to get col_var (a string) to be used as a var that references a column # rather than as an actual string. This approach causes a shallow copy to be From 65f158f8269642347f26d010fe737e4ebdf4f21d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 12 Mar 2021 11:02:51 -0500 Subject: [PATCH 05/23] reduce logic complexity in input file choice --- facebook/delphiFacebook/NAMESPACE | 1 - facebook/delphiFacebook/R/contingency_utils.R | 89 +++++-------------- .../man/get_date_range_from_filenames.Rd | 17 ---- 3 files changed, 21 insertions(+), 86 deletions(-) delete mode 100644 facebook/delphiFacebook/man/get_date_range_from_filenames.Rd diff --git a/facebook/delphiFacebook/NAMESPACE b/facebook/delphiFacebook/NAMESPACE index 9ef255f84..00c0dc09b 100644 --- a/facebook/delphiFacebook/NAMESPACE +++ b/facebook/delphiFacebook/NAMESPACE @@ -17,7 +17,6 @@ export(filter_complete_responses) export(filter_data_for_aggregation) export(filter_responses) export(floor_epiweek) -export(get_date_range_from_filenames) export(get_filenames_in_range) export(get_range_prev_full_month) export(get_range_prev_full_period) diff --git a/facebook/delphiFacebook/R/contingency_utils.R b/facebook/delphiFacebook/R/contingency_utils.R index a8cc22a38..f52dac9f8 100644 --- a/facebook/delphiFacebook/R/contingency_utils.R +++ b/facebook/delphiFacebook/R/contingency_utils.R @@ -52,49 +52,25 @@ read_contingency_params <- function(path = "params.json", template_path = "param #' #' @export update_params <- function(params) { + # Fill in end_time, if missing, with current time. + if (is.null(params$end_time)) { + params$end_time <- Sys.time() + } + if ( !is.null(params$start_date) ) { - # Use all data within the date range, either as explicitly set or assuming - # "now" is the end_date. - if (is.null(params$end_time)) { - params$end_time <- Sys.time() - } date_range <- list(params$start_time, params$end_time) - - if ( is.null(params$input) ) { - params$input <- get_filenames_in_range(date_range[[1]], date_range[[2]], params) - } - - } else if ( is.null(params$end_date) & (is.null(params$input) | length(params$input) == 0) ) { - # Neither end_date nor list of input files is provided, assume want to use - # most current full time period and data. 
- date_range <- get_range_prev_full_period(Sys.Date(), params$aggregate_range) - params$input <- get_filenames_in_range(date_range[[1]], date_range[[2]], params) - - } else if ( !is.null(params$end_date) & (is.null(params$input) | length(params$input) == 0) ) { - # List of input files is not provided, assume want to use the full period - # preceding the provided end_date - date_range <- get_range_prev_full_period( - as_date(params$end_date), params$aggregate_range) - params$input <- get_filenames_in_range(date_range[[1]], date_range[[2]], params) - - } else if ( is.null(params$end_date) & !is.null(params$input) & length(params$input) != 0 ) { - # Use list of input files provided, even if it does not constitute a full - # period. Use dates in input files to select range. - date_range <- get_date_range_from_filenames(params) - - } else if ( !is.null(params$end_date) & !is.null(params$input) & length(params$input) != 0 ) { - # Use the full period preceding the provided end_date AND use only the list - # of input files provided, even if they don't span the full period. - date_range <- get_range_prev_full_period( - as_date(params$end_date), params$aggregate_range) + } else { + # If start_date is not provided, assume want to use preceding full time period. + date_range <- get_range_prev_full_period(as_date(params$end_date), params$aggregate_range) } - + + params$input <- get_filenames_in_range(date_range[[1]], date_range[[2]], params) if (length(params$input) == 0) { stop("no input files to read in") } + params$start_time <- date_range[[1]] params$end_time <- date_range[[2]] - params$start_date <- as_date(date_range[[1]]) params$end_date <- as_date(date_range[[2]]) @@ -115,12 +91,17 @@ update_params <- function(params) { get_filenames_in_range <- function(start_date, end_date, params) { start_date <- as_date(start_date) - days(params$backfill_days) end_date <- as_date(end_date) - date_pattern <- "^[0-9]{4}-[0-9]{2}-[0-9]{2}.*[.]csv$" - youtube_pattern <- ".*YouTube[.]csv$" - - filenames <- list.files(path=params$input_dir) - filenames <- filenames[grepl(date_pattern, filenames) & !grepl(youtube_pattern, filenames)] + if ( is.null(params$input) | length(params$input) == 0 ) { + date_pattern <- "^[0-9]{4}-[0-9]{2}-[0-9]{2}.*[.]csv$" + youtube_pattern <- ".*YouTube[.]csv$" + + filenames <- list.files(path=params$input_dir) + filenames <- filenames[grepl(date_pattern, filenames) & !grepl(youtube_pattern, filenames)] + } else { + filenames <- params$input + } + file_end_dates <- as_date(substr(filenames, 1, 10)) file_start_dates <- as_date(substr(filenames, 12, 21)) @@ -133,34 +114,6 @@ get_filenames_in_range <- function(start_date, end_date, params) { return(filenames) } - -#' Get date range based on list of input files provided. 
-#' -#' @param params Params object produced by read_params -#' -#' @return Unnamed list of two dates -#' -#' @export -get_date_range_from_filenames <- function(params) { - date_pattern <- "^[0-9]{4}-[0-9]{2}-[0-9]{2}.*[.]csv$" - youtube_pattern <- ".*YouTube[.]csv$" - - filenames <- params$input - filenames <- filenames[grepl(date_pattern, filenames) & !grepl(youtube_pattern, filenames)] - - dates <- as.Date(unlist(lapply(filenames, function(filename) { - file_end_date <- as_date(substr(filenames, 1, 10)) - file_start_date <- as_date(substr(filenames, 12, 21)) - - return(c(file_end_date, file_start_date)) - })), origin="1970-01-01") - - date_range <- list(min(dates), max(dates)) - - return(date_range) -} - - #' Check user-set aggregations for basic validity and add a few necessary cols. #' #' @param aggregations Data frame with columns `name`, `var_weight`, `metric`, diff --git a/facebook/delphiFacebook/man/get_date_range_from_filenames.Rd b/facebook/delphiFacebook/man/get_date_range_from_filenames.Rd deleted file mode 100644 index 3727e5cd9..000000000 --- a/facebook/delphiFacebook/man/get_date_range_from_filenames.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/contingency_utils.R -\name{get_date_range_from_filenames} -\alias{get_date_range_from_filenames} -\title{Get date range based on list of input files provided.} -\usage{ -get_date_range_from_filenames(params) -} -\arguments{ -\item{params}{Params object produced by read_params} -} -\value{ -Unnamed list of two dates -} -\description{ -Get date range based on list of input files provided. -} From 1f10a1b2bacccf3fbe92e82fde7c8b69b6085020 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 12 Mar 2021 15:18:27 -0500 Subject: [PATCH 06/23] fix relevant tests. 
support with new params setting --- .../delphiFacebook/R/contingency_aggregate.R | 3 +- facebook/delphiFacebook/R/contingency_utils.R | 13 +++++++-- .../tests/testthat/params-full.json | 1 + .../tests/testthat/params-test.json | 1 + .../tests/testthat/test-contingency-utils.R | 29 ++++--------------- 5 files changed, 19 insertions(+), 28 deletions(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index 0af4ddbff..bebf92786 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -215,8 +215,7 @@ post_process_aggs <- function(df, aggregations, cw_list) { output <- code_binary(df, aggregations, col_var) } else if (startsWith(col_var, "n_")) { # Numeric free response output <- code_numeric_freeresponse(df, aggregations, col_var) - } - else if (startsWith(col_var, "ms_")) { # Multi-select + } else if (startsWith(col_var, "ms_")) { # Multi-select output <- code_multiselect(df, aggregations, col_var) } else { # Multiple choice and variables that are formatted differently diff --git a/facebook/delphiFacebook/R/contingency_utils.R b/facebook/delphiFacebook/R/contingency_utils.R index f52dac9f8..8e2edbdde 100644 --- a/facebook/delphiFacebook/R/contingency_utils.R +++ b/facebook/delphiFacebook/R/contingency_utils.R @@ -33,6 +33,9 @@ read_contingency_params <- function(path = "params.json", template_path = "param contingency_params$num_filter <- if_else(contingency_params$debug, 2L, 100L) contingency_params$s_weight <- if_else(contingency_params$debug, 1.00, 0.01) contingency_params$s_mix_coef <- if_else(contingency_params$debug, 0.05, 0.05) + contingency_params$use_input_asis <- if_else( + is.null(contingency_params$use_input_asis), FALSE, contingency_params$use_input_asis + ) return(contingency_params) } @@ -57,15 +60,19 @@ update_params <- function(params) { params$end_time <- Sys.time() } + # Construct aggregate date range. if ( !is.null(params$start_date) ) { date_range <- list(params$start_time, params$end_time) } else { # If start_date is not provided, assume want to use preceding full time period. 
- date_range <- get_range_prev_full_period(as_date(params$end_date), params$aggregate_range) + date_range <- get_range_prev_full_period( + as_date(params$end_date) + , params$aggregate_range + ) } params$input <- get_filenames_in_range(date_range[[1]], date_range[[2]], params) - if (length(params$input) == 0) { + if ( length(params[["input"]]) == 0 || all(is.na(params[["input"]])) ) { stop("no input files to read in") } @@ -89,6 +96,8 @@ update_params <- function(params) { #' #' @export get_filenames_in_range <- function(start_date, end_date, params) { + if (params$use_input_asis) { return(params$input) } + start_date <- as_date(start_date) - days(params$backfill_days) end_date <- as_date(end_date) diff --git a/facebook/delphiFacebook/tests/testthat/params-full.json b/facebook/delphiFacebook/tests/testthat/params-full.json index 0d0b7f009..295827f08 100644 --- a/facebook/delphiFacebook/tests/testthat/params-full.json +++ b/facebook/delphiFacebook/tests/testthat/params-full.json @@ -27,6 +27,7 @@ "input": [ "full_synthetic.csv" ], + "use_input_asis": true, "aggregate_range": "month", "debug": false } diff --git a/facebook/delphiFacebook/tests/testthat/params-test.json b/facebook/delphiFacebook/tests/testthat/params-test.json index 7b73cf159..052ba4539 100644 --- a/facebook/delphiFacebook/tests/testthat/params-test.json +++ b/facebook/delphiFacebook/tests/testthat/params-test.json @@ -29,6 +29,7 @@ "input": [ "responses.csv" ], + "use_input_asis": true, "aggregate_range": "month", "debug": false } diff --git a/facebook/delphiFacebook/tests/testthat/test-contingency-utils.R b/facebook/delphiFacebook/tests/testthat/test-contingency-utils.R index c81729a25..165017dc9 100644 --- a/facebook/delphiFacebook/tests/testthat/test-contingency-utils.R +++ b/facebook/delphiFacebook/tests/testthat/test-contingency-utils.R @@ -7,6 +7,7 @@ test_that("testing update_params command", { # Empty input list params <- list( input = c(), + use_input_asis = TRUE, aggregate_range = "month", end_date = "2020-02-01", input_dir = "./input" @@ -17,6 +18,7 @@ test_that("testing update_params command", { # Use specified end date input_params <- list( input = c("full_response.csv"), + use_input_asis = TRUE, aggregate_range = "month", end_date = "2020-02-01" ) @@ -25,15 +27,15 @@ test_that("testing update_params command", { expected_output <- list( input = c("full_response.csv"), + use_input_asis = TRUE, aggregate_range = "month", end_date = ymd("2020-01-31"), - start_time = ymd_hms("2020-01-01 00:00:00", tz=timezone), end_time = ymd_hms("2020-01-31 23:59:59", tz=timezone), + start_time = ymd_hms("2020-01-01 00:00:00", tz=timezone), start_date = ymd("2020-01-01") ) out <- update_params(input_params) - expect_identical(out, expected_output) }) @@ -56,6 +58,7 @@ test_that("testing get_filenames_in_range command", { params <- list( input = c(), + use_input_asis = FALSE, backfill_days = 4, input_dir = tdir ) @@ -73,28 +76,6 @@ test_that("testing get_filenames_in_range command", { expect_equal(out, expected_output) }) -test_that("testing get_date_range_from_filenames command", { - files <- c( - "2019-11-06.2019-10-30.2020-11-06.Survey_of_COVID-Like_Illness_-_TODEPLOY_......_-_US_Expansion.csv", - "2019-12-31.2019-12-24_With_Translations.csv", - "2020-01-06.2019-12-31_Wave_4.csv", - "2020-01-16.2020-01-09_YouTube.csv", - "2020-01-16.2020-01-09_Wave_4.csv", - "2020-02-06.2020-01-31_Wave_4.csv", - "2020-02-16.2020-02-09_Wave_3.csv" - ) - - params <- list( - backfill_days = 4, - input = files - ) - - expected_output <- 
list(ymd("2019-10-30"), ymd("2020-02-16")) - out <- get_date_range_from_filenames(params) - - expect_equal(out, expected_output) -}) - test_that("testing verify_aggs command", { # Duplicate rows input_aggs <- tribble( From 7328a4f0da7eabfc2cc78b1a9f2b04131a91ddae Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 12 Mar 2021 16:40:28 -0500 Subject: [PATCH 07/23] simplify hesitant_* definitions --- .../delphiFacebook/R/contingency_aggregate.R | 4 +- .../delphiFacebook/R/contingency_variables.R | 76 +++++++------------ 2 files changed, 28 insertions(+), 52 deletions(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index bebf92786..036c95009 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -39,7 +39,7 @@ produce_aggregates <- function(df, aggregations, cw_list, params) { # Keep only obs in desired date range. df <- df[start_dt >= params$start_time & start_dt <= params$end_time] - + output <- post_process_aggs(df, aggregations, cw_list) df <- output[[1]] aggregations <- output[[2]] @@ -168,7 +168,7 @@ post_process_aggs <- function(df, aggregations, cw_list) { # each unique level/response code; multi-select used for grouping are left as-is. # - multiple choice items are left as-is - #### TODO: How do we want to handle multi-select items used for grouping? + #### TODO: How do we want to handle multi-select items when used for grouping? agg_groups <- unique(aggregations$group_by) group_cols_to_convert <- unique(do.call(c, agg_groups)) for (col_var in group_cols_to_convert) { diff --git a/facebook/delphiFacebook/R/contingency_variables.R b/facebook/delphiFacebook/R/contingency_variables.R index 3d419de3a..1f26a6ce3 100644 --- a/facebook/delphiFacebook/R/contingency_variables.R +++ b/facebook/delphiFacebook/R/contingency_variables.R @@ -345,14 +345,14 @@ create_derivative_columns <- function(df) { } else { df$b_65_or_older <- NA_real_ } - - if ("mc_accept_cov_vaccine" %in% names(df)) { - df$b_hesitant_cov_vaccine <- as.numeric( - df$mc_accept_cov_vaccine == "prob not vaccinate" | df$mc_accept_cov_vaccine == "def not vaccinate" - ) - } else { - df$b_hesitant_cov_vaccine <- NA_real_ - } + + if ("mc_accept_cov_vaccine" %in% names(df)) { + df$b_hesitant_cov_vaccine <- as.numeric( + df$mc_accept_cov_vaccine == "prob not vaccinate" | df$mc_accept_cov_vaccine == "def not vaccinate" + ) + } else { + df$b_hesitant_cov_vaccine <- NA_real_ + } if ("mc_concerned_sideeffects" %in% names(df)) { df$b_concerned_sideeffects <- as.numeric( @@ -363,13 +363,9 @@ create_derivative_columns <- function(df) { } df$b_hesitant_sideeffects <- case_when( - is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, - is.na(df$b_concerned_sideeffects) == TRUE ~ NA, - df$wave < 7 ~ NA, - df$b_hesitant_cov_vaccine == 0 ~ NA, - df$b_hesitant_cov_vaccine == 1 & df$b_concerned_sideeffects == 1 ~ TRUE, - df$b_hesitant_cov_vaccine == 1 & df$b_concerned_sideeffects == 0 ~ FALSE, - TRUE ~ NA + df$b_hesitant_cov_vaccine == 1 & df$b_concerned_sideeffects == 1 ~ 1, + df$b_hesitant_cov_vaccine == 1 & df$b_concerned_sideeffects == 0 ~ 0, + TRUE ~ NA_real_ ) if ( "b_vaccine_likely_friends" %in% names(df) & @@ -378,49 +374,29 @@ create_derivative_columns <- function(df) { "b_vaccine_likely_govt_health" %in% names(df) & "b_vaccine_likely_politicians" %in% names(df) ) { df$b_hesitant_trust_fam <- case_when( - is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, - 
is.na(df$b_vaccine_likely_friends) == TRUE ~ NA, - df$wave < 7 ~ NA, - df$b_hesitant_cov_vaccine == 0 ~ NA, - df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_friends == 1 ~ TRUE, - df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_friends == 0 ~ FALSE, - TRUE ~ NA + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_friends == 1 ~ 1, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_friends == 0 ~ 0, + TRUE ~ NA_real_ ) df$b_hesitant_trust_healthcare <- case_when( - is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, - is.na(df$b_vaccine_likely_local_health) == TRUE ~ NA, - df$wave < 7 ~ NA, - df$b_hesitant_cov_vaccine == 0 ~ NA, - df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_local_health == 1 ~ TRUE, - df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_local_health == 0 ~ FALSE, - TRUE ~ NA + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_local_health == 1 ~ 1, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_local_health == 0 ~ 0, + TRUE ~ NA_real_ ) df$b_hesitant_trust_who <- case_when( - is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, - is.na(df$b_vaccine_likely_who) == TRUE ~ NA, - df$wave < 7 ~ NA, - df$b_hesitant_cov_vaccine == 0 ~ NA, - df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_who == 1 ~ TRUE, - df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_who == 0 ~ FALSE, - TRUE ~ NA + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_who == 1 ~ 1, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_who == 0 ~ 0, + TRUE ~ NA_real_ ) df$b_hesitant_trust_govt <- case_when( - is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, - is.na(df$b_vaccine_likely_govt_health) == TRUE ~ NA, - df$wave < 7 ~ NA, - df$b_hesitant_cov_vaccine == 0 ~ NA, - df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_govt_health == 1 ~ TRUE, - df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_govt_health == 0 ~ FALSE, - TRUE ~ NA + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_govt_health == 1 ~ 1, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_govt_health == 0 ~ 0, + TRUE ~ NA_real_ ) df$b_hesitant_trust_politicians <- case_when( - is.na(df$b_hesitant_cov_vaccine) == TRUE ~ NA, - is.na(df$b_vaccine_likely_politicians) == TRUE ~ NA, - df$wave < 7 ~ NA, - df$b_hesitant_cov_vaccine == 0 ~ NA, - df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_politicians == 1 ~ TRUE, - df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_politicians == 0 ~ FALSE, - TRUE ~ NA + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_politicians == 1 ~ 1, + df$b_hesitant_cov_vaccine == 1 & df$b_vaccine_likely_politicians == 0 ~ 0, + TRUE ~ NA_real_ ) } else { From c59b3f4de3f444f69537308ecccd4ac1d4ba3e20 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 12 Mar 2021 17:03:28 -0500 Subject: [PATCH 08/23] set weekly aggs to same as monthly aggs --- facebook/delphiFacebook/R/contingency_run.R | 121 ++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/facebook/delphiFacebook/R/contingency_run.R b/facebook/delphiFacebook/R/contingency_run.R index 1e97da99c..d079c9dff 100644 --- a/facebook/delphiFacebook/R/contingency_run.R +++ b/facebook/delphiFacebook/R/contingency_run.R @@ -23,6 +23,127 @@ set_aggs <- function() { weekly_aggs <- tribble( ~name, ~metric, ~group_by, ~compute_fn, ~post_fn, + #### Cut 1: side effects if hesitant about getting vaccine and generally + # National + "pct_concerned_sideeffects", "b_concerned_sideeffects", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, 
jeffreys_binary, + "pct_hesitant_sideeffects", "b_hesitant_sideeffects", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + + # State + "pct_concerned_sideeffects", "b_concerned_sideeffects", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_sideeffects", "b_hesitant_sideeffects", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + + # State marginal + "pct_concerned_sideeffects", "b_concerned_sideeffects", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_concerned_sideeffects", "b_concerned_sideeffects", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_concerned_sideeffects", "b_concerned_sideeffects", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_sideeffects", "b_hesitant_sideeffects", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_sideeffects", "b_hesitant_sideeffects", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_sideeffects", "b_hesitant_sideeffects", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + + + + #### Cut 2: trust various institutions if hesitant about getting vaccine + # National + "pct_hesitant_trust_fam", "b_hesitant_trust_fam", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_healthcare", "b_hesitant_trust_healthcare", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_who", "b_hesitant_trust_who", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_govt", "b_hesitant_trust_govt", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_politicians", "b_hesitant_trust_politicians", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + + # State + "pct_hesitant_trust_fam", "b_hesitant_trust_fam", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_healthcare", "b_hesitant_trust_healthcare", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_who", "b_hesitant_trust_who", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_govt", "b_hesitant_trust_govt", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_politicians", "b_hesitant_trust_politicians", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + + # State marginal + "pct_hesitant_trust_fam", "b_hesitant_trust_fam", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_fam", "b_hesitant_trust_fam", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_fam", "b_hesitant_trust_fam", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_healthcare", "b_hesitant_trust_healthcare", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_healthcare", "b_hesitant_trust_healthcare", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_healthcare", "b_hesitant_trust_healthcare", c("mc_race", "b_hispanic", "state"), 
compute_binary, jeffreys_binary, + "pct_hesitant_trust_who", "b_hesitant_trust_who", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_who", "b_hesitant_trust_who", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_who", "b_hesitant_trust_who", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_govt", "b_hesitant_trust_govt", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_govt", "b_hesitant_trust_govt", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_govt", "b_hesitant_trust_govt", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_politicians", "b_hesitant_trust_politicians", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_politicians", "b_hesitant_trust_politicians", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_hesitant_trust_politicians", "b_hesitant_trust_politicians", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + + + + #### Cut 3: trust various institutions + # National + "pct_trust_fam", "b_vaccine_likely_friends", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_trust_healthcare", "b_vaccine_likely_local_health", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_trust_who", "b_vaccine_likely_who", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_trust_govt", "b_vaccine_likely_govt_health", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_trust_politicians", "b_vaccine_likely_politicians", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + + # State + "pct_trust_fam", "b_vaccine_likely_friends", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_trust_healthcare", "b_vaccine_likely_local_health", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_trust_who", "b_vaccine_likely_who", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_trust_govt", "b_vaccine_likely_govt_health", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_trust_politicians", "b_vaccine_likely_politicians", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + + # State marginal + "pct_trust_fam", "b_vaccine_likely_friends", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_trust_fam", "b_vaccine_likely_friends", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_trust_fam", "b_vaccine_likely_friends", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_trust_healthcare", "b_vaccine_likely_local_health", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_trust_healthcare", "b_vaccine_likely_local_health", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_trust_healthcare", "b_vaccine_likely_local_health", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_trust_who", "b_vaccine_likely_who", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_trust_who", "b_vaccine_likely_who", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_trust_who", 
"b_vaccine_likely_who", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_trust_govt", "b_vaccine_likely_govt_health", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_trust_govt", "b_vaccine_likely_govt_health", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_trust_govt", "b_vaccine_likely_govt_health", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_trust_politicians", "b_vaccine_likely_politicians", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_trust_politicians", "b_vaccine_likely_politicians", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_trust_politicians", "b_vaccine_likely_politicians", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + + + + #### Cuts 4, 5, 6: vaccinated and accepting if senior, in healthcare, or generally + # National + "pct_vaccinated", "b_had_cov_vaccine", c("b_work_in_healthcare", "mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("b_65_or_older", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("b_work_in_healthcare", "mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("b_65_or_older", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary, + + # State + "pct_vaccinated", "b_had_cov_vaccine", c("b_work_in_healthcare", "mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("b_65_or_older", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("b_work_in_healthcare", "mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("b_65_or_older", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + + # State marginal + "pct_vaccinated", "b_had_cov_vaccine", c("b_work_in_healthcare", "mc_age", "state"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("b_work_in_healthcare","mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("b_work_in_healthcare", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("b_65_or_older", "mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("b_65_or_older", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_vaccinated", "b_had_cov_vaccine", c("mc_race", 
"b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("b_work_in_healthcare", "mc_age", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("b_work_in_healthcare","mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("b_work_in_healthcare", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("b_65_or_older", "mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("b_65_or_older", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("mc_age", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("mc_gender", "state"), compute_binary, jeffreys_binary, + "pct_accepting", "b_accept_cov_vaccine", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary, ) monthly_aggs <- tribble( From eb6874ed945c2a0a612f7e427067a0025f0e4001 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 12 Mar 2021 17:23:01 -0500 Subject: [PATCH 09/23] deduplicate process of checking column existence and format --- .../delphiFacebook/R/contingency_aggregate.R | 57 ++++++------------- 1 file changed, 18 insertions(+), 39 deletions(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index 036c95009..9d34355f8 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -170,47 +170,26 @@ post_process_aggs <- function(df, aggregations, cw_list) { #### TODO: How do we want to handle multi-select items when used for grouping? agg_groups <- unique(aggregations$group_by) - group_cols_to_convert <- unique(do.call(c, agg_groups)) - for (col_var in group_cols_to_convert) { - if (col_var == "geo_id") { next } - if ( is.null(df[[col_var]]) ) { - aggregations <- aggregations[aggregations$metric != col_var & - !mapply(aggregations$group_by, - FUN=function(x) {col_var %in% x}), ] - msg_plain( - paste0( - col_var, " is not defined. Removing all aggregations that use it. ", - nrow(aggregations), " remaining") - ) - next - } - - if (startsWith(col_var, "b_")) { # Binary - output <- code_binary(df, aggregations, col_var) - } else if (startsWith(col_var, "n_")) { # Numeric free response - output <- code_numeric_freeresponse(df, aggregations, col_var) - } else { - # Multiple choice, multi-select, and variables that are formatted differently - output <- list(df, aggregations) - } - df <- output[[1]] - aggregations <- output[[2]] - } + group_cols <- unique(do.call(c, agg_groups)) + group_cols <- group_cols[group_cols != "geo_id"] metric_cols_to_convert <- unique(aggregations$metric) - for (col_var in metric_cols_to_convert) { - if ( is.null(df[[col_var]]) ) { - aggregations <- aggregations[aggregations$metric != col_var & - !mapply(aggregations$group_by, - FUN=function(x) {col_var %in% x}), ] - msg_plain( - paste0( - col_var, " is not defined. Removing all aggregations that use it. 
", - nrow(aggregations), " remaining") - ) - next - } - + + cols_check_available <- c(group_cols, metric_cols_to_convert) + cols_not_available <- cols_check_available[ !(cols_check_available %in% names(df)) ] + for (col_var in cols_not_available) { + # Remove from aggregations + aggregations <- aggregations[aggregations$metric != col_var & + !mapply(aggregations$group_by, + FUN=function(x) {col_var %in% x}), ] + msg_plain( + paste0( + col_var, " is not defined. Removing all aggregations that use it. ", + nrow(aggregations), " remaining") + } + + group_cols_to_convert <- group_cols[startsWith(group_cols, "b_")] + for (col_var in c(group_cols_to_convert, metric_cols_to_convert) ) { if (startsWith(col_var, "b_")) { # Binary output <- code_binary(df, aggregations, col_var) } else if (startsWith(col_var, "n_")) { # Numeric free response From 0257d95282a8654a78cdddaf39c75472cfcc700b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Sat, 13 Mar 2021 10:03:34 -0500 Subject: [PATCH 10/23] add missing paren --- facebook/delphiFacebook/R/contingency_aggregate.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index 9d34355f8..c08fee77c 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -182,10 +182,10 @@ post_process_aggs <- function(df, aggregations, cw_list) { aggregations <- aggregations[aggregations$metric != col_var & !mapply(aggregations$group_by, FUN=function(x) {col_var %in% x}), ] - msg_plain( - paste0( + msg_plain(paste0( col_var, " is not defined. Removing all aggregations that use it. ", nrow(aggregations), " remaining") + ) } group_cols_to_convert <- group_cols[startsWith(group_cols, "b_")] From 1aaf82696ab5ec43c9b2ba47e51dcb2576e99f1c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 16 Mar 2021 17:19:38 -0400 Subject: [PATCH 11/23] create local wave variable --- facebook/delphiFacebook/R/variables.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/facebook/delphiFacebook/R/variables.R b/facebook/delphiFacebook/R/variables.R index 49d0080a1..632190b57 100644 --- a/facebook/delphiFacebook/R/variables.R +++ b/facebook/delphiFacebook/R/variables.R @@ -291,6 +291,9 @@ code_testing <- function(input_data) { #' #' @importFrom dplyr coalesce code_vaccines <- function(input_data) { + wave <- unique(input_data$wave) + assert(length(wave) == 1, "can only code one wave at a time") + if ("V1" %in% names(input_data)) { # coded as 1 = Yes, 2 = No, 3 = don't know. We assume that don't know = no, # because, well, you'd know. 
From 972aa8979aee3cf47c19cb8e4eec377f5b8a6889 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 16 Mar 2021 17:50:58 -0400 Subject: [PATCH 12/23] update tests to match --- facebook/delphiFacebook/tests/testthat/test-variables.R | 1 + 1 file changed, 1 insertion(+) diff --git a/facebook/delphiFacebook/tests/testthat/test-variables.R b/facebook/delphiFacebook/tests/testthat/test-variables.R index 0b76f231a..bc8bf60ac 100644 --- a/facebook/delphiFacebook/tests/testthat/test-variables.R +++ b/facebook/delphiFacebook/tests/testthat/test-variables.R @@ -149,6 +149,7 @@ test_that("household size correctly imputes zeros", { test_that("vaccine acceptance is correctly coded", { input_data <- data.frame( + wave = 1, V1 = c(2, 3, 2, NA, 1, NA), V3 = c(1, 2, 3, 4, NA, NA) ) From c0a59e206816df3a10af3bcc6dfb8bab8a5c80ef Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Sun, 14 Mar 2021 10:27:33 -0400 Subject: [PATCH 13/23] remove trailing whitespace. add all_of for explicitness --- .../delphiFacebook/R/contingency_aggregate.R | 25 +++++++++---------- facebook/delphiFacebook/R/contingency_write.R | 20 +++++++-------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index c08fee77c..49a11009b 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -43,16 +43,16 @@ produce_aggregates <- function(df, aggregations, cw_list, params) { output <- post_process_aggs(df, aggregations, cw_list) df <- output[[1]] aggregations <- output[[2]] - + ## Keep only columns used in indicators, plus supporting columns. group_vars <- unique( unlist(aggregations$group_by) ) - df <- select(df, - all_of(unique(aggregations$metric)), - all_of(unique(aggregations$var_weight)), - all_of( group_vars[group_vars != "geo_id"] ), + df <- select(df, + all_of(unique(aggregations$metric)), + all_of(unique(aggregations$var_weight)), + all_of( group_vars[group_vars != "geo_id"] ), zip5, start_dt) - + agg_groups <- unique(aggregations[c("group_by", "geo_level")]) # For each unique combination of groupby_vars and geo level, run aggregation process once @@ -167,14 +167,14 @@ post_process_aggs <- function(df, aggregations, cw_list) { # - multi-select items are converted to a series of binary columns, one for # each unique level/response code; multi-select used for grouping are left as-is. # - multiple choice items are left as-is - + #### TODO: How do we want to handle multi-select items when used for grouping? agg_groups <- unique(aggregations$group_by) group_cols <- unique(do.call(c, agg_groups)) group_cols <- group_cols[group_cols != "geo_id"] - + metric_cols_to_convert <- unique(aggregations$metric) - + cols_check_available <- c(group_cols, metric_cols_to_convert) cols_not_available <- cols_check_available[ !(cols_check_available %in% names(df)) ] for (col_var in cols_not_available) { @@ -183,11 +183,11 @@ post_process_aggs <- function(df, aggregations, cw_list) { !mapply(aggregations$group_by, FUN=function(x) {col_var %in% x}), ] msg_plain(paste0( - col_var, " is not defined. Removing all aggregations that use it. ", + col_var, " is not defined. Removing all aggregations that use it. 
", nrow(aggregations), " remaining") ) } - + group_cols_to_convert <- group_cols[startsWith(group_cols, "b_")] for (col_var in c(group_cols_to_convert, metric_cols_to_convert) ) { if (startsWith(col_var, "b_")) { # Binary @@ -226,7 +226,7 @@ post_process_aggs <- function(df, aggregations, cw_list) { #' @param params a named list with entries "s_weight", "s_mix_coef", #' "num_filter" #' -#' @importFrom dplyr inner_join bind_rows +#' @importFrom dplyr inner_join bind_rows filter group_by summarize across all_of #' @importFrom parallel mclapply #' @importFrom stats complete.cases #' @@ -261,7 +261,6 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params) return(list()) } - ## Set an index on the groupby var columns so that the groupby step can be ## faster; data.table stores the sort order of the column and ## uses a binary search to find matching values, rather than a linear scan. diff --git a/facebook/delphiFacebook/R/contingency_write.R b/facebook/delphiFacebook/R/contingency_write.R index 97dc07ba9..8f0fd671f 100644 --- a/facebook/delphiFacebook/R/contingency_write.R +++ b/facebook/delphiFacebook/R/contingency_write.R @@ -1,5 +1,5 @@ #' Write csv file for sharing with researchers. -#' +#' #' CSV name includes date specifying start of time period aggregated, geo level, #' and grouping variables. #' @@ -15,36 +15,36 @@ #' @importFrom readr write_csv #' @importFrom dplyr arrange across #' @importFrom stringi stri_trim -#' +#' #' @export write_contingency_tables <- function(data, params, geo_level, groupby_vars) { if (!is.null(data) && nrow(data) != 0) { - data <- arrange(data, across(groupby_vars)) - + data <- arrange(data, across(all_of(groupby_vars))) + # Format reported columns. - data <- mutate_at(data, vars(-c(groupby_vars)), + data <- mutate_at(data, vars(-c(groupby_vars)), function(x) { stri_trim( formatC(as.numeric(x), digits=7, format="f", drop0trailing=TRUE) ) }) - + # Reduce verbosity of grouping vars for output purposes groupby_vars <- gsub("_", "", sub( ".+?_", "", groupby_vars[groupby_vars != "geo_id"])) filename <- sprintf("%s_%s.csv", format(params$start_date, "%Y%m%d"), paste(c(geo_level, groupby_vars), collapse="_")) file_out <- file.path(params$export_dir, filename) - + create_dir_not_exist(params$export_dir) - + msg_df(sprintf("saving contingency table data to %-35s", filename), data) write_csv(data, file_out) - + } else { msg_plain(sprintf( - "no aggregations produced for grouping variables %s (%s); CSV will not be saved", + "no aggregations produced for grouping variables %s (%s); CSV will not be saved", paste(groupby_vars, collapse=", "), geo_level )) } From 15df269991df585c22adf0d74f498c64097cc4e6 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Sun, 14 Mar 2021 11:07:37 -0400 Subject: [PATCH 14/23] more column-existence/format deduplication --- .../delphiFacebook/R/contingency_aggregate.R | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index 49a11009b..3cdab7448 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -169,14 +169,14 @@ post_process_aggs <- function(df, aggregations, cw_list) { # - multiple choice items are left as-is #### TODO: How do we want to handle multi-select items when used for grouping? 
- agg_groups <- unique(aggregations$group_by) - group_cols <- unique(do.call(c, agg_groups)) + group_cols <- unique(do.call(c, aggregations$group_by)) group_cols <- group_cols[group_cols != "geo_id"] - metric_cols_to_convert <- unique(aggregations$metric) - - cols_check_available <- c(group_cols, metric_cols_to_convert) - cols_not_available <- cols_check_available[ !(cols_check_available %in% names(df)) ] + metric_cols <- unique(aggregations$metric) + + cols_check_available <- unique(c(group_cols, metric_cols)) + available <- cols_check_available %in% names(df) + cols_not_available <- cols_check_available[ !available ] for (col_var in cols_not_available) { # Remove from aggregations aggregations <- aggregations[aggregations$metric != col_var & @@ -188,8 +188,12 @@ post_process_aggs <- function(df, aggregations, cw_list) { ) } - group_cols_to_convert <- group_cols[startsWith(group_cols, "b_")] - for (col_var in c(group_cols_to_convert, metric_cols_to_convert) ) { + cols_available <- cols_check_available[ available ] + for (col_var in cols_available) { + if ( col_var %in% group_cols & !(col_var %in% metric_cols) & !startsWith(col_var, "b_") ) { + next + } + if (startsWith(col_var, "b_")) { # Binary output <- code_binary(df, aggregations, col_var) } else if (startsWith(col_var, "n_")) { # Numeric free response From cdc5c3ecda4edcabb2e4157e83ef8162eed8de6a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 17 Mar 2021 15:12:00 -0400 Subject: [PATCH 15/23] remove unused imports --- facebook/delphiFacebook/R/contingency_aggregate.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index 3cdab7448..eeeb8e75c 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -230,7 +230,7 @@ post_process_aggs <- function(df, aggregations, cw_list) { #' @param params a named list with entries "s_weight", "s_mix_coef", #' "num_filter" #' -#' @importFrom dplyr inner_join bind_rows filter group_by summarize across all_of +#' @importFrom dplyr inner_join bind_rows #' @importFrom parallel mclapply #' @importFrom stats complete.cases #' From 2ee7cd5e882883f7f9562463fb9985ed6144ab4a Mon Sep 17 00:00:00 2001 From: andrew Date: Wed, 9 Dec 2020 02:02:16 -0800 Subject: [PATCH 16/23] Add hhs and nation --- .../delphi_doctor_visits/geo_maps.py | 34 +++++++++++++++++++ doctor_visits/delphi_doctor_visits/run.py | 2 +- .../delphi_doctor_visits/update_sensor.py | 11 ++++-- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doctor_visits/delphi_doctor_visits/geo_maps.py b/doctor_visits/delphi_doctor_visits/geo_maps.py index 9ba935683..3bc0cc2b1 100644 --- a/doctor_visits/delphi_doctor_visits/geo_maps.py +++ b/doctor_visits/delphi_doctor_visits/geo_maps.py @@ -61,6 +61,40 @@ def county_to_state(self, data): return data.groupby("state_id"), "state_id" + def county_to_hhs(self, data): + """Aggregate county data to the HHS region resolution. 
+ + Args: + data: dataframe aggregated to the daily-county resolution (all 7 cols expected) + + Returns: tuple of dataframe at the daily-HHS resolution, and geo_id column name + """ + data = self.gmpr.add_geocode(data, + "fips", + "hhs", + from_col="PatCountyFIPS") + data.drop(columns="PatCountyFIPS", inplace=True) + data = data.groupby(["ServiceDate", "hhs"]).sum().reset_index() + + return data.groupby("hhs"), "hhs" + + def county_to_nation(self, data): + """Aggregate county data to the nation resolution. + + Args: + data: dataframe aggregated to the daily-county resolution (all 7 cols expected) + + Returns: tuple of dataframe at the daily-nation resolution, and geo_id column name + """ + data = self.gmpr.add_geocode(data, + "fips", + "nation", + from_col="PatCountyFIPS") + data.drop(columns="PatCountyFIPS", inplace=True) + data = data.groupby(["ServiceDate", "nation"]).sum().reset_index() + + return data.groupby("nation"), "nation" + def county_to_hrr(self, data): """Aggregate county data to the HRR resolution. diff --git a/doctor_visits/delphi_doctor_visits/run.py b/doctor_visits/delphi_doctor_visits/run.py index d85e0b632..1feb6c945 100644 --- a/doctor_visits/delphi_doctor_visits/run.py +++ b/doctor_visits/delphi_doctor_visits/run.py @@ -68,7 +68,7 @@ def run_module(params): logging.info("n_waiting_days:\t{n_waiting_days}") ## geographies - geos = ["state", "msa", "hrr", "county"] + geos = ["state", "msa", "hrr", "county", "hhs", "nation"] ## print out other vars diff --git a/doctor_visits/delphi_doctor_visits/update_sensor.py b/doctor_visits/delphi_doctor_visits/update_sensor.py index 725d4ca4f..b2981ad32 100644 --- a/doctor_visits/delphi_doctor_visits/update_sensor.py +++ b/doctor_visits/delphi_doctor_visits/update_sensor.py @@ -78,7 +78,7 @@ def update_sensor( startdate: first sensor date (YYYY-mm-dd) enddate: last sensor date (YYYY-mm-dd) dropdate: data drop date (YYYY-mm-dd) - geo: geographic resolution, one of ["county", "state", "msa", "hrr"] + geo: geographic resolution, one of ["county", "state", "msa", "hrr", "nation", "hhs"] parallel: boolean to run the sensor update in parallel weekday: boolean to adjust for weekday effects se: boolean to write out standard errors, if true, use an obfuscated name @@ -142,8 +142,15 @@ def update_sensor( data_groups, _ = geo_map.county_to_msa(data) elif geo.lower() == "hrr": data_groups, _ = geo_map.county_to_hrr(data) + elif geo.lower() == "hhs": + data_groups, _ = geo_map.county_to_hhs(data) + elif geo.lower() == "nation": + data_groups, _ = geo_map.county_to_nation(data) else: - logging.error(f"{geo} is invalid, pick one of 'county', 'state', 'msa', 'hrr'") + + logging.error( + f"{geo} is invalid, pick one of 'county', 'state', 'msa', 'hrr', 'hhs', 'nation'" + ) return {} unique_geo_ids = list(data_groups.groups.keys()) From 5a0b2712391dcb30efbf74ae937f4b80367537ed Mon Sep 17 00:00:00 2001 From: andrew Date: Wed, 17 Mar 2021 14:44:23 -0700 Subject: [PATCH 17/23] Change ifelse block to a dict of functions --- .../delphi_doctor_visits/geo_maps.py | 9 ++++++++ .../delphi_doctor_visits/update_sensor.py | 22 ++----------------- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/doctor_visits/delphi_doctor_visits/geo_maps.py b/doctor_visits/delphi_doctor_visits/geo_maps.py index 3bc0cc2b1..716e8899d 100644 --- a/doctor_visits/delphi_doctor_visits/geo_maps.py +++ b/doctor_visits/delphi_doctor_visits/geo_maps.py @@ -7,6 +7,7 @@ Created: 2020-04-18 Last modified: 2020-04-30 by Aaron Rumack (add megacounty code) """ +from functools 
import partial import pandas as pd from delphi_utils.geomap import GeoMapper @@ -20,6 +21,14 @@ class GeoMaps: def __init__(self): """Create the underlying GeoMapper.""" self.gmpr = GeoMapper() + self.geo_func = {"county": partial(self.county_to_megacounty, + threshold_visits=Config.MIN_RECENT_VISITS, + threshold_len=Config.RECENT_LENGTH), + "state": self.county_to_state, + "msa": self.county_to_msa, + "hrr": self.county_to_hrr, + "hhs": self.county_to_hhs, + "nation": self.county_to_nation} @staticmethod def convert_fips(x): diff --git a/doctor_visits/delphi_doctor_visits/update_sensor.py b/doctor_visits/delphi_doctor_visits/update_sensor.py index b2981ad32..01e1647fe 100644 --- a/doctor_visits/delphi_doctor_visits/update_sensor.py +++ b/doctor_visits/delphi_doctor_visits/update_sensor.py @@ -132,26 +132,8 @@ def update_sensor( # get right geography geo_map = GeoMaps() - if geo.lower() == "county": - data_groups, _ = geo_map.county_to_megacounty( - data, Config.MIN_RECENT_VISITS, Config.RECENT_LENGTH - ) - elif geo.lower() == "state": - data_groups, _ = geo_map.county_to_state(data) - elif geo.lower() == "msa": - data_groups, _ = geo_map.county_to_msa(data) - elif geo.lower() == "hrr": - data_groups, _ = geo_map.county_to_hrr(data) - elif geo.lower() == "hhs": - data_groups, _ = geo_map.county_to_hhs(data) - elif geo.lower() == "nation": - data_groups, _ = geo_map.county_to_nation(data) - else: - - logging.error( - f"{geo} is invalid, pick one of 'county', 'state', 'msa', 'hrr', 'hhs', 'nation'" - ) - return {} + mapping_func = geo_map.geo_func[geo.lower()] + data_groups, _ = mapping_func(data) unique_geo_ids = list(data_groups.groups.keys()) # run sensor fitting code (maybe in parallel) From 65ab02db24ed036da7d01adf672a14212c59890d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 18 Mar 2021 13:12:46 -0400 Subject: [PATCH 18/23] unify group_vars definition between funcs --- facebook/delphiFacebook/R/contingency_aggregate.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index eeeb8e75c..1c091f437 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -169,12 +169,12 @@ post_process_aggs <- function(df, aggregations, cw_list) { # - multiple choice items are left as-is #### TODO: How do we want to handle multi-select items when used for grouping? 
- group_cols <- unique(do.call(c, aggregations$group_by)) - group_cols <- group_cols[group_cols != "geo_id"] + group_vars <- unique( unlist(aggregations$group_by) ) + group_vars <- group_vars[group_vars != "geo_id"] metric_cols <- unique(aggregations$metric) - cols_check_available <- unique(c(group_cols, metric_cols)) + cols_check_available <- unique(c(group_vars, metric_cols)) available <- cols_check_available %in% names(df) cols_not_available <- cols_check_available[ !available ] for (col_var in cols_not_available) { @@ -190,7 +190,7 @@ post_process_aggs <- function(df, aggregations, cw_list) { cols_available <- cols_check_available[ available ] for (col_var in cols_available) { - if ( col_var %in% group_cols & !(col_var %in% metric_cols) & !startsWith(col_var, "b_") ) { + if ( col_var %in% group_vars & !(col_var %in% metric_cols) & !startsWith(col_var, "b_") ) { next } From 572819fef3d040049e4c172c3f6b4563b69633d9 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 18 Mar 2021 14:08:04 -0400 Subject: [PATCH 19/23] unify group_vars name between funcs --- .../delphiFacebook/R/contingency_aggregate.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R index 7aa8e5ddc..0fec6e66f 100644 --- a/facebook/delphiFacebook/R/contingency_aggregate.R +++ b/facebook/delphiFacebook/R/contingency_aggregate.R @@ -55,7 +55,7 @@ produce_aggregates <- function(df, aggregations, cw_list, params) { agg_groups <- unique(aggregations[c("group_by", "geo_level")]) - # For each unique combination of groupby_vars and geo level, run aggregation process once + # For each unique combination of group_vars and geo level, run aggregation process once # and calculate all desired aggregations on the grouping. Rename columns. Save # to individual files for (group_ind in seq_along(agg_groups$group_by)) { @@ -248,16 +248,16 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params) ## inefficient; profiling shows the cost to be negligible, so shut it up df <- suppressWarnings(inner_join(df, crosswalk_data, by = "zip5")) - groupby_vars <- aggregations$group_by[[1]] + group_vars <- aggregations$group_by[[1]] - if (all(groupby_vars %in% names(df))) { - unique_group_combos <- unique(df[, groupby_vars, with=FALSE]) + if (all(group_vars %in% names(df))) { + unique_group_combos <- unique(df[, group_vars, with=FALSE]) unique_group_combos <- unique_group_combos[complete.cases(unique_group_combos)] } else { msg_plain( sprintf( "not all of groupby columns %s available in data; skipping aggregation", - paste(groupby_vars, collapse=", ") + paste(group_vars, collapse=", ") )) } @@ -268,7 +268,7 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params) ## Set an index on the groupby var columns so that the groupby step can be ## faster; data.table stores the sort order of the column and ## uses a binary search to find matching values, rather than a linear scan. - setindexv(df, groupby_vars) + setindexv(df, group_vars) calculate_group <- function(ii) { target_group <- unique_group_combos[ii] @@ -301,15 +301,15 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params) ## Do post-processing. 
for (row in seq_len(nrow(aggregations))) { aggregation <- aggregations$id[row] - groupby_vars <- aggregations$group_by[[row]] + group_vars <- aggregations$group_by[[row]] post_fn <- aggregations$post_fn[[row]] dfs_out[[aggregation]] <- dfs_out[[aggregation]][ - rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size", groupby_vars)])) == 0, + rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size", group_vars)])) == 0, ] if (geo_level == "county") { - df_megacounties <- megacounty(dfs_out[[aggregation]], params$num_filter, groupby_vars) + df_megacounties <- megacounty(dfs_out[[aggregation]], params$num_filter, group_vars) dfs_out[[aggregation]] <- bind_rows(dfs_out[[aggregation]], df_megacounties) } From 64883a1667a8903323a01831f17e2fecca5804cf Mon Sep 17 00:00:00 2001 From: Maria Jahja Date: Fri, 19 Mar 2021 13:57:36 -0400 Subject: [PATCH 20/23] update n_backfill_days to 70, fix logging format --- claims_hosp/params.json.template | 2 +- doctor_visits/delphi_doctor_visits/run.py | 10 +++++----- doctor_visits/params.json.template | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/claims_hosp/params.json.template b/claims_hosp/params.json.template index dc0dfe024..e93fb1d2b 100644 --- a/claims_hosp/params.json.template +++ b/claims_hosp/params.json.template @@ -8,7 +8,7 @@ "start_date": "2020-02-01", "end_date": null, "drop_date": null, - "n_backfill_days": 60, + "n_backfill_days": 70, "n_waiting_days": 3, "write_se": false, "obfuscated_prefix": "foo_obfuscated", diff --git a/doctor_visits/delphi_doctor_visits/run.py b/doctor_visits/delphi_doctor_visits/run.py index 1feb6c945..f3417d283 100644 --- a/doctor_visits/delphi_doctor_visits/run.py +++ b/doctor_visits/delphi_doctor_visits/run.py @@ -61,11 +61,11 @@ def run_module(params): startdate_dt = enddate_dt - timedelta(days=n_backfill_days) enddate = str(enddate_dt.date()) startdate = str(startdate_dt.date()) - logging.info("drop date:\t\t{dropdate}") - logging.info("first sensor date:\t{startdate}") - logging.info("last sensor date:\t{enddate}") - logging.info("n_backfill_days:\t{n_backfill_days}") - logging.info("n_waiting_days:\t{n_waiting_days}") + logging.info("drop date:\t\t%s", dropdate) + logging.info("first sensor date:\t%s", startdate) + logging.info("last sensor date:\t%s", enddate) + logging.info("n_backfill_days:\t%s", n_backfill_days) + logging.info("n_waiting_days:\t%s", n_waiting_days) ## geographies geos = ["state", "msa", "hrr", "county", "hhs", "nation"] diff --git a/doctor_visits/params.json.template b/doctor_visits/params.json.template index 9fc5b5aed..98b958926 100644 --- a/doctor_visits/params.json.template +++ b/doctor_visits/params.json.template @@ -5,7 +5,7 @@ "indicator": { "input_file": "./input/SYNEDI_AGG_OUTPATIENT_18052020_1455CDT.csv.gz", "drop_date": "", - "n_backfill_days": 60, + "n_backfill_days": 70, "n_waiting_days": 3, "weekday": [true, false], "se": false, From db86563d8e2de576646970c8a78726a822f38c32 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Fri, 19 Mar 2021 16:21:18 -0400 Subject: [PATCH 21/23] validation params for safegraph --- .../templates/safegraph-params-prod.json.j2 | 28 +++++++++++++++++-- safegraph/params.json.template | 28 +++++++++++++++++-- safegraph/run-safegraph.sh | 1 + 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/ansible/templates/safegraph-params-prod.json.j2 b/ansible/templates/safegraph-params-prod.json.j2 index 41550a8ae..4fc9d4785 100644 --- a/ansible/templates/safegraph-params-prod.json.j2 +++ 
b/ansible/templates/safegraph-params-prod.json.j2 @@ -14,7 +14,31 @@ "sync": true, "wip_signal" : [] }, - "archive": { - "cache_dir": "./cache" + "validation": { + "common": { + "data_source": "safegraph", + "span_length": 14, + "end_date": "today", + "suppressed_errors": [ + {"signal": "bars_visit_num"}, + {"signal": "bars_visit_prop"}, + {"signal": "restaurants_visit_num"}, + {"signal": "restaurants_visit_prop"} + ] + }, + "static": { + "minimum_sample_size": 100, + "missing_se_allowed": false, + "missing_sample_size_allowed": false + }, + "dynamic": { + "ref_window_size": 7, + "smoothed_signals": [ + "completely_home_prop_7dav", + "full_time_work_prop_7dav", + "part_time_work_prop_7dav", + "median_home_dwell_time_7dav" + ] + } } } diff --git a/safegraph/params.json.template b/safegraph/params.json.template index 0c830c41c..3649cacbd 100644 --- a/safegraph/params.json.template +++ b/safegraph/params.json.template @@ -17,7 +17,31 @@ "part_time_work_prop_7dav", "full_time_work_prop_7dav"] }, - "archive": { - "cache_dir": "./cache" + "validation": { + "common": { + "data_source": "safegraph", + "span_length": 14, + "end_date": "today", + "suppressed_errors": [ + {"signal": "bars_visit_num"}, + {"signal": "bars_visit_prop"}, + {"signal": "restaurants_visit_num"}, + {"signal": "restaurants_visit_prop"} + ] + }, + "static": { + "minimum_sample_size": 100, + "missing_se_allowed": false, + "missing_sample_size_allowed": false + }, + "dynamic": { + "ref_window_size": 7, + "smoothed_signals": [ + "completely_home_prop_7dav", + "full_time_work_prop_7dav", + "part_time_work_prop_7dav", + "median_home_dwell_time_7dav" + ] + } } } diff --git a/safegraph/run-safegraph.sh b/safegraph/run-safegraph.sh index 01c54f0e6..a8ba8b90e 100644 --- a/safegraph/run-safegraph.sh +++ b/safegraph/run-safegraph.sh @@ -12,6 +12,7 @@ rm -f ./receiving/* # Run the indicator code. echo "Running the indicator..." env/bin/python -m delphi_safegraph +env/bin/python -m delphi_utils.validator # Copy the files to the ingestion directory. echo "Copying files to the ingestion directory..." 
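The validation block added above lists four safegraph signals whose known failures are muted ("suppressed_errors"), static checks (minimum sample size, whether missing SE / sample size are allowed), and dynamic checks against a 7-day reference window for the smoothed signals; run-safegraph.sh then invokes the validator right after the indicator run. As a rough illustration of how a suppression list of this shape could be applied — using a hypothetical apply_suppressions helper and made-up failure records, not the actual delphi_utils.validator internals:

    # Illustrative sketch only: apply_suppressions is a hypothetical helper,
    # not part of the real delphi_utils.validator API.
    def apply_suppressions(failures, suppressed_errors):
        """Drop validation failures that match any suppression entry.

        A failure (a dict of attributes) is suppressed when every key/value
        pair in some suppression entry, e.g. {"signal": "bars_visit_num"},
        matches it.
        """
        def is_suppressed(failure):
            return any(
                all(failure.get(key) == value for key, value in entry.items())
                for entry in suppressed_errors
            )

        return [failure for failure in failures if not is_suppressed(failure)]


    if __name__ == "__main__":
        # Suppression list mirroring the "suppressed_errors" block above.
        suppressed = [
            {"signal": "bars_visit_num"},
            {"signal": "restaurants_visit_prop"},
        ]
        # Hypothetical failure records for demonstration.
        failures = [
            {"signal": "bars_visit_num", "check": "min_sample_size"},
            {"signal": "completely_home_prop_7dav", "check": "smooth_diff"},
        ]
        # Only the completely_home_prop_7dav failure should survive.
        print(apply_suppressions(failures, suppressed))

Note that the same validation block is added to both the Ansible prod template and safegraph/params.json.template, so local and production runs would suppress the same four bars/restaurants visit signals.
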
From 797ec0a9fbf73e04f81b52d9c001b1cc631b5b33 Mon Sep 17 00:00:00 2001 From: Maria Jahja Date: Sun, 21 Mar 2021 01:11:21 -0400 Subject: [PATCH 22/23] add vi/gu to state count --- claims_hosp/delphi_claims_hosp/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/config.py b/claims_hosp/delphi_claims_hosp/config.py index d21b46c21..0ed640ca1 100644 --- a/claims_hosp/delphi_claims_hosp/config.py +++ b/claims_hosp/delphi_claims_hosp/config.py @@ -63,7 +63,7 @@ class GeoConstants: NUM_COUNTIES = 3141 + 52 NUM_HRRS = 308 NUM_MSAS = 392 + 52 # MSA + States - NUM_STATES = 52 # including DC and PR + NUM_STATES = 54 # including DC, PR, VI, GU NUM_HHSS = 10 NUM_NATIONS = 1 From 6eabe7fdcb4e9860d72f72d3bfa909599553388b Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Mon, 22 Mar 2021 10:50:40 -0400 Subject: [PATCH 23/23] Drop combo nmf and fb vaccine_likely_local_health from sirCAL --- ansible/templates/sir_complainsalot-params-prod.json.j2 | 4 ++-- sir_complainsalot/params.json.template | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ansible/templates/sir_complainsalot-params-prod.json.j2 b/ansible/templates/sir_complainsalot-params-prod.json.j2 index b605c536f..a15c7d70f 100644 --- a/ansible/templates/sir_complainsalot-params-prod.json.j2 +++ b/ansible/templates/sir_complainsalot-params-prod.json.j2 @@ -34,12 +34,12 @@ "fb-survey": { "max_age": 3, "maintainers": ["U01069KCRS7"], - "retired-signals": ["smoothed_anxious_5d", "smoothed_wanxious_5d", "smoothed_depressed_5d", "smoothed_wdepressed_5d", "smoothed_felt_isolated_5d", "smoothed_wfelt_isolated_5d", "smoothed_large_event_1d", "smoothed_wlarge_event_1d", "smoothed_restaurant_1d", "smoothed_wrestaurant_1d", "smoothed_shop_1d", "smoothed_wshop_1d", "smoothed_spent_time_1d", "smoothed_wspent_time_1d", "smoothed_travel_outside_state_5d", "smoothed_wtravel_outside_state_5d", "smoothed_work_outside_home_1d", "smoothed_wwork_outside_home_1d", "smoothed_wearing_mask", "smoothed_wwearing_mask"] + "retired-signals": ["smoothed_anxious_5d", "smoothed_wanxious_5d", "smoothed_depressed_5d", "smoothed_wdepressed_5d", "smoothed_felt_isolated_5d", "smoothed_wfelt_isolated_5d", "smoothed_large_event_1d", "smoothed_wlarge_event_1d", "smoothed_restaurant_1d", "smoothed_wrestaurant_1d", "smoothed_shop_1d", "smoothed_wshop_1d", "smoothed_spent_time_1d", "smoothed_wspent_time_1d", "smoothed_travel_outside_state_5d", "smoothed_wtravel_outside_state_5d", "smoothed_work_outside_home_1d", "smoothed_wwork_outside_home_1d", "smoothed_wearing_mask", "smoothed_wwearing_mask", "smoothed_vaccine_likely_local_health", "smoothed_wvaccine_likely_local_health"] }, "indicator-combination": { "max_age": 4, "maintainers": ["U01AP8GSWG3","U01069KCRS7"], - "retired-signals": ["nmf_day_doc_fbs_ght"] + "retired-signals": ["nmf_day_doc_fbs_ght", "nmf_day_doc_fbc_fbs_ght"] }, "quidel": { "max_age":6, diff --git a/sir_complainsalot/params.json.template b/sir_complainsalot/params.json.template index 140e31538..1e2ee3d1a 100644 --- a/sir_complainsalot/params.json.template +++ b/sir_complainsalot/params.json.template @@ -35,12 +35,12 @@ "fb-survey": { "max_age": 3, "maintainers": ["U01069KCRS7"], - "retired-signals": ["smoothed_anxious_5d", "smoothed_wanxious_5d", "smoothed_depressed_5d", "smoothed_wdepressed_5d", "smoothed_felt_isolated_5d", "smoothed_wfelt_isolated_5d", "smoothed_large_event_1d", "smoothed_wlarge_event_1d", "smoothed_restaurant_1d", "smoothed_wrestaurant_1d", "smoothed_shop_1d", 
"smoothed_wshop_1d", "smoothed_spent_time_1d", "smoothed_wspent_time_1d", "smoothed_travel_outside_state_5d", "smoothed_wtravel_outside_state_5d", "smoothed_work_outside_home_1d", "smoothed_wwork_outside_home_1d", "smoothed_wearing_mask", "smoothed_wwearing_mask"] + "retired-signals": ["smoothed_anxious_5d", "smoothed_wanxious_5d", "smoothed_depressed_5d", "smoothed_wdepressed_5d", "smoothed_felt_isolated_5d", "smoothed_wfelt_isolated_5d", "smoothed_large_event_1d", "smoothed_wlarge_event_1d", "smoothed_restaurant_1d", "smoothed_wrestaurant_1d", "smoothed_shop_1d", "smoothed_wshop_1d", "smoothed_spent_time_1d", "smoothed_wspent_time_1d", "smoothed_travel_outside_state_5d", "smoothed_wtravel_outside_state_5d", "smoothed_work_outside_home_1d", "smoothed_wwork_outside_home_1d", "smoothed_wearing_mask", "smoothed_wwearing_mask", "smoothed_vaccine_likely_local_health", "smoothed_wvaccine_likely_local_health"] }, "indicator-combination": { "max_age": 3, "maintainers": ["U01AP8GSWG3","U01069KCRS7"], - "retired-signals": ["nmf_day_doc_fbs_ght"] + "retired-signals": ["nmf_day_doc_fbs_ght", "nmf_day_doc_fbc_fbs_ght"] }, "quidel": { "max_age":6,