@@ -105,63 +105,64 @@ normalize <- function(x) {
105105# and prints to the console the days on which each gap starts
106106# (annotating the plot directly got a little messy)
107107# also returns the number of gaps in the signal
108- plot_signals <- function(x, version= NULL, verbose= TRUE){
109- title = "Normalized Signals Comparison"
110- if (!is.null(version)){
111- if(verbose){
108+ plot_signals <- function(x, version = NULL, verbose = TRUE) {
109+ title <- "Normalized Signals Comparison"
110+ if (!is.null(version)) {
111+ if (verbose) {
112112 print(paste0("Version: ", version))
113113 }
114- title = paste0("Normalized Signals Comparison\tVersion ", version)
114+ title <- paste0("Normalized Signals Comparison\tVersion ", version)
115115 }
116-
116+
117117 # x <- x %>% filter(geo_value == "ca")
118118 num_signals <- length(colnames(x)) - 2
119119 first_non_nas <- list()
120120 na_regions_list <- list()
121-
122- for (col in colnames(x)){
121+
122+ for (col in colnames(x)) {
123123 if (col == "geo_value" | col == "time_value") {
124124 next
125- }
126- else{
125+ } else {
127126 x[[col]] <- normalize(x[[col]])
128127 pair <- list(col, na.omit(x[, c("time_value", col)])[1, ]$time_value)
129128 first_non_nas <- c(first_non_nas, list(pair))
130-
129+
131130 na_regions <- x %>%
132131 select(time_value, !!sym(col))
133-
134- na_regions <- na_regions %>%
135- mutate(is_na = is.na(!!sym(col)))
136-
137- na_regions <- na_regions %>%
132+
133+ na_regions <- na_regions %>%
134+ mutate(is_na = is.na(!!sym(col)))
135+
136+ na_regions <- na_regions %>%
138137 group_by(group = cumsum(!is_na))
139-
140- na_regions <- na_regions %>%
138+
139+ na_regions <- na_regions %>%
141140 filter(is_na == TRUE)
142-
141+
143142 na_regions <- na_regions %>%
144- summarize(start = min(time_value), end = max(time_value), signal = col)
145-
143+ summarize(start = min(time_value), end = max(time_value), signal = col)
144+
146145 na_regions <- na_regions %>%
147146 ungroup()
148-
147+
149148 na_regions_list[[col]] <- na_regions
150149 }
151150 }
152-
153- x <- x %>% select(-geo_value) %>% gather(key = "signal", value = "value", -time_value)
154-
155- if(verbose){
151+
152+ x <- x %>%
153+ select(-geo_value) %>%
154+ gather(key = "signal", value = "value", -time_value)
155+
156+ if (verbose) {
156157 plot <- ggplot(x, aes(x = time_value, y = value, color = signal)) +
157158 geom_line() +
158159 labs(title = title, x = "Time", y = "Normalized Value") +
159160 theme_minimal()
160-
161+
161162 for (col in names(na_regions_list)) {
162163 cat(paste0("Signal: ", col, "\n"))
163164 na_regions <- na_regions_list[[col]]
164-
165+
165166 if (nrow(na_regions) > 0) {
166167 for (i in 1:nrow(na_regions)) {
167168 cat(paste0(" Gap: ", na_regions$start[i], " to ", na_regions$end[i], "\n"))
@@ -172,25 +173,29 @@ plot_signals <- function(x, version=NULL, verbose=TRUE){
172173 }
173174 for (col in names(na_regions_list)) {
174175 plot <- plot +
175- geom_rect(data = na_regions_list[[col]],
176- aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf, fill = signal),
177- color = NA, alpha = 0.2, inherit.aes = FALSE)
178-
176+ geom_rect(
177+ data = na_regions_list[[col]],
178+ aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf, fill = signal),
179+ color = NA, alpha = 0.2, inherit.aes = FALSE
180+ )
181+
179182 lines <- na_regions_list[[col]] %>% filter(start == end)
180183 plot <- plot +
181184 geom_vline(data = lines, aes(xintercept = as.numeric(start)), color = alpha("red", 0.2), linetype = "solid")
182185 }
183186 cat("\n")
184187 print(plot)
185188 }
186-
189+
187190 max_num_gaps <- max(sapply(na_regions_list, function(x) length(x[["start"]])))
188191 max_num_gaps
189192}
190193```
191194
192195``` {r visualize_signal, warning=FALSE}
193- aux_signal_latest_ca <- aux_signal %>% latest() %>% filter(geo_value == 'ca')
196+ aux_signal_latest_ca <- aux_signal %>%
197+ latest() %>%
198+ filter(geo_value == "ca")
194199plot_signals(aux_signal_latest_ca)
195200```
196201
@@ -202,10 +207,10 @@ However, NAs can arise in different settings as well. For example if we look at
202207
203208``` {r, warning=FALSE, echo=FALSE}
204209# Generating False Data
205- # initially thought to be real data, however issue dates were incorrect, and
210+ # initially thought to be real data, however issue dates were incorrect, and
206211# once fixed, there were not as many NA values as I had hoped for
207212
208- generate_signal <- function(){
213+ generate_signal <- function() {
209214 cov_adm <- pub_covidcast(
210215 source = "hhs",
211216 signals = "confirmed_admissions_covid_1d",
@@ -217,27 +222,28 @@ generate_signal <- function(){
217222 ) %>%
218223 select(geo_value, time_value, version = issue, confirmed_cov = value) %>%
219224 as_epi_archive()
220-
225+
221226 versions <- unique(cov_adm[["DT"]][["version"]])
222227 # random_dates <- sample(versions, 100, replace = FALSE)
223228 max_gaps <- 0
224229 max_df <- NULL
225230 max_version <- NULL
226231 for (date in sort(versions)) {
227232 date <- as.Date(date)
228- for(geo_val in unlist(strsplit(states, split = ","))){
229- current <- epix_as_of(cov_adm, max_version = date) %>% filter(geo_value == geo_val) %>%
233+ for (geo_val in unlist(strsplit(states, split = ","))) {
234+ current <- epix_as_of(cov_adm, max_version = date) %>%
235+ filter(geo_value == geo_val) %>%
230236 na.omit()
231237 daily <- tibble(time_value = seq(
232238 from = as.Date(min(current$time_value)),
233239 to = as.Date(max(current$time_value)),
234240 by = "1 day"
235241 ))
236-
242+
237243 current <- daily %>% left_join(current, by = "time_value")
238244 current$geo_value <- geo_val
239- num_gaps <- plot_signals(current, version= date, verbose= FALSE)
240-
245+ num_gaps <- plot_signals(current, version = date, verbose = FALSE)
246+
241247 if (num_gaps > max_gaps) {
242248 max_gaps <- num_gaps
243249 max_df <- current
@@ -269,7 +275,7 @@ data <- tibble(
269275
270276data_completed <- data %>%
271277 complete(
272- geo,
278+ geo,
273279 time = seq.Date(from = min(time), to = max(time), by = "day")
274280 )
275281
@@ -315,10 +321,10 @@ No more NAs in our final signal! But this a little bit of a naive approach. Rath
315321impute_moving_average <- function(col, window_size = 3) {
316322 n <- length(col)
317323 for (i in seq(window_size, n - 1)) {
318- curr_sum <- sum(col[max(1, i- window_size+ 1):i])
324+ curr_sum <- sum(col[max(1, i - window_size + 1):i])
319325 average <- curr_sum / window_size
320326 if (col[i + 1] %>% is.na()) {
321- col[i + 1] = average
327+ col[i + 1] <- average
322328 }
323329 }
324330 col
@@ -329,7 +335,7 @@ We will plot it so you can see how the NAs were filled in. The red background is
329335
330336``` {r, warning=FALSE}
331337cov_NAs_subset_ma <- cov_NAs_subset
332- cov_NAs_subset_ma$confirmed_cov_ma <- cov_NAs_subset_ma$confirmed_cov %>% impute_moving_average
338+ cov_NAs_subset_ma$confirmed_cov_ma <- cov_NAs_subset_ma$confirmed_cov %>% impute_moving_average()
333339plot_signals(cov_NAs_subset_ma)
334340```
335341
@@ -342,14 +348,13 @@ cov_NAs_subset
342348Let's do this by hand to verify that our results are correct, taking the previous 3 values (the specified window size) and computing their average.
343349
344350``` {r}
345- jul_31 = (944+ 848+ 863)/ 3
346- aug_01 = (848+ 863+ jul_31)/ 3
347- aug_04 = (aug_01+ 704+ 761)/ 3
351+ jul_31 <- (944 + 848 + 863) / 3
352+ aug_01 <- (848 + 863 + jul_31) / 3
353+ aug_04 <- (aug_01 + 704 + 761) / 3
348354
349355paste0("Value for Jul 31st: ", jul_31)
350356paste0("Value for Aug 1st: ", aug_01)
351357paste0("Value for Aug 4th: ", aug_04)
352-
353358```
354359
355360``` {r}
@@ -359,12 +364,12 @@ cov_NAs_subset_ma
359364Note that we can take this exact same approach using multiple calls to `epi_slide_mean()`, because we have to continually update the average each time we fill in an NA, especially for NAs that occur within the window range.
360365
361366``` {r}
362- cov_NAs_subset_epi_ma <- cov_NAs_subset %>% as_epi_df()
367+ cov_NAs_subset_epi_ma <- cov_NAs_subset %>% as_epi_df()
363368first <- TRUE
364369curr_pass <- cov_NAs_subset_epi_ma %>% epi_slide_mean(confirmed_cov, before = 3, na.rm = TRUE)
365370
366371for (i in 1:length(cov_NAs_subset_epi_ma$confirmed_cov)) {
367- curr_val = cov_NAs_subset_epi_ma$confirmed_cov[i]
372+ curr_val <- cov_NAs_subset_epi_ma$confirmed_cov[i]
368373 if (is.na(curr_val)) {
369374 cov_NAs_subset_epi_ma$confirmed_cov[i] <- curr_pass$slide_value_confirmed_cov[i]
370375 curr_pass <- cov_NAs_subset_epi_ma %>% epi_slide_mean(confirmed_cov, before = 3, na.rm = TRUE)
@@ -387,7 +392,7 @@ The next step is linear interpolation. Here we can think of this as using linear
387392# second passes goes through and runs the regressions between each end point
388393linear_interpolate_2pass <- function(values) {
389394 interpolated_values <- values
390-
395+
391396 # first pass
392397 na_gaps <- list()
393398 in_gap <- FALSE
@@ -407,7 +412,7 @@ linear_interpolate_2pass <- function(values) {
407412 }
408413 }
409414 }
410-
415+
411416 # second pass
412417 for (gap in na_gaps) {
413418 start <- gap[1]
@@ -419,7 +424,7 @@ linear_interpolate_2pass <- function(values) {
419424 interpolated_values[(start + 1):(end - 1)] <- interpolated_section[-c(1, length(interpolated_section))]
420425 }
421426 }
422-
427+
423428 return(interpolated_values)
424429}
425430```
@@ -509,9 +514,8 @@ example_archive_y <- tribble(
509514 version = as.Date(version)
510515 ) %>%
511516 as_epi_archive(compactify = FALSE)
512-
513517```
514518
515519``` {r}
516- epix_merge(example_archive_x, example_archive_y, sync='locf')
517- ```
520+ epix_merge(example_archive_x, example_archive_y, sync = "locf")
521+ ```
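
With `sync = "locf"`, the archives' differing version ranges are reconciled by carrying each archive's last observed version forward, rather than padding the unobserved versions with NAs (which `sync = "na"` would do). Below is a minimal sketch of inspecting the merged result, assuming the merged archive exposes the same `DT` field and `epix_as_of()` interface used earlier in this document:

``` {r}
# merge the two example archives, carrying the last observed version forward
merged <- epix_merge(example_archive_x, example_archive_y, sync = "locf")

# snapshot of the merged data as of the latest version present in either archive
epix_as_of(merged, max_version = max(merged[["DT"]][["version"]]))
```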