Skip to content

Commit 98620f7

Browse files
committed
style: styler (GHA)
1 parent b9d4226 commit 98620f7

File tree

1 file changed

+62
-58
lines changed

1 file changed

+62
-58
lines changed

vignettes/na-notebook.Rmd

Lines changed: 62 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -105,63 +105,64 @@ normalize <- function(x) {
105105
# and prints out the corresponding days (using annotations got a little messy)
106106
# that they start as well in the console
107107
# also returns the number of gaps in the signal
108-
plot_signals <- function(x, version=NULL, verbose=TRUE){
109-
title = "Normalized Signals Comparison"
110-
if (!is.null(version)){
111-
if(verbose){
108+
plot_signals <- function(x, version = NULL, verbose = TRUE) {
109+
title <- "Normalized Signals Comparison"
110+
if (!is.null(version)) {
111+
if (verbose) {
112112
print(paste0("Version: ", version))
113113
}
114-
title = paste0("Normalized Signals Comparison\tVersion ", version)
114+
title <- paste0("Normalized Signals Comparison\tVersion ", version)
115115
}
116-
116+
117117
# x <- x %>% filter(geo_value == "ca")
118118
num_signals <- length(colnames(x)) - 2
119119
first_non_nas <- list()
120120
na_regions_list <- list()
121-
122-
for (col in colnames(x)){
121+
122+
for (col in colnames(x)) {
123123
if (col == "geo_value" | col == "time_value") {
124124
next
125-
}
126-
else{
125+
} else {
127126
x[[col]] <- normalize(x[[col]])
128127
pair <- list(col, na.omit(x[, c("time_value", col)])[1, ]$time_value)
129128
first_non_nas <- c(first_non_nas, list(pair))
130-
129+
131130
na_regions <- x %>%
132131
select(time_value, !!sym(col))
133-
134-
na_regions <- na_regions %>%
135-
mutate(is_na = is.na(!!sym(col)))
136-
137-
na_regions <- na_regions %>%
132+
133+
na_regions <- na_regions %>%
134+
mutate(is_na = is.na(!!sym(col)))
135+
136+
na_regions <- na_regions %>%
138137
group_by(group = cumsum(!is_na))
139-
140-
na_regions <- na_regions %>%
138+
139+
na_regions <- na_regions %>%
141140
filter(is_na == TRUE)
142-
141+
143142
na_regions <- na_regions %>%
144-
summarize(start = min(time_value), end = max(time_value), signal = col)
145-
143+
summarize(start = min(time_value), end = max(time_value), signal = col)
144+
146145
na_regions <- na_regions %>%
147146
ungroup()
148-
147+
149148
na_regions_list[[col]] <- na_regions
150149
}
151150
}
152-
153-
x <- x %>% select(-geo_value) %>% gather(key = "signal", value = "value", -time_value)
154-
155-
if(verbose){
151+
152+
x <- x %>%
153+
select(-geo_value) %>%
154+
gather(key = "signal", value = "value", -time_value)
155+
156+
if (verbose) {
156157
plot <- ggplot(x, aes(x = time_value, y = value, color = signal)) +
157158
geom_line() +
158159
labs(title = title, x = "Time", y = "Normalized Value") +
159160
theme_minimal()
160-
161+
161162
for (col in names(na_regions_list)) {
162163
cat(paste0("Signal: ", col, "\n"))
163164
na_regions <- na_regions_list[[col]]
164-
165+
165166
if (nrow(na_regions) > 0) {
166167
for (i in 1:nrow(na_regions)) {
167168
cat(paste0(" Gap: ", na_regions$start[i], " to ", na_regions$end[i], "\n"))
@@ -172,25 +173,29 @@ plot_signals <- function(x, version=NULL, verbose=TRUE){
172173
}
173174
for (col in names(na_regions_list)) {
174175
plot <- plot +
175-
geom_rect(data = na_regions_list[[col]],
176-
aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf, fill = signal),
177-
color = NA, alpha = 0.2, inherit.aes = FALSE)
178-
176+
geom_rect(
177+
data = na_regions_list[[col]],
178+
aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf, fill = signal),
179+
color = NA, alpha = 0.2, inherit.aes = FALSE
180+
)
181+
179182
lines <- na_regions_list[[col]] %>% filter(start == end)
180183
plot <- plot +
181184
geom_vline(data = lines, aes(xintercept = as.numeric(start)), color = alpha("red", 0.2), linetype = "solid")
182185
}
183186
cat("\n")
184187
print(plot)
185188
}
186-
189+
187190
max_num_gaps <- max(sapply(na_regions_list, function(x) length(x[["start"]])))
188191
max_num_gaps
189192
}
190193
```
191194

192195
```{r visualize_signal, warning=FALSE}
193-
aux_signal_latest_ca <- aux_signal %>% latest() %>% filter(geo_value == 'ca')
196+
aux_signal_latest_ca <- aux_signal %>%
197+
latest() %>%
198+
filter(geo_value == "ca")
194199
plot_signals(aux_signal_latest_ca)
195200
```
196201

@@ -202,10 +207,10 @@ However, NAs can arise in different settings as well. For example if we look at
202207

203208
```{r, warning=FALSE, echo=FALSE}
204209
# Generating False Data
205-
# initially thought to be real data, however issue dates were incorrect, and
210+
# initially thought to be real data, however issue dates were incorrect, and
206211
# once fixed, there were not many NA values as I had hoped for
207212
208-
generate_signal <- function(){
213+
generate_signal <- function() {
209214
cov_adm <- pub_covidcast(
210215
source = "hhs",
211216
signals = "confirmed_admissions_covid_1d",
@@ -217,27 +222,28 @@ generate_signal <- function(){
217222
) %>%
218223
select(geo_value, time_value, version = issue, confirmed_cov = value) %>%
219224
as_epi_archive()
220-
225+
221226
versions <- unique(cov_adm[["DT"]][["version"]])
222227
# random_dates <- sample(versions, 100, replace = FALSE)
223228
max_gaps <- 0
224229
max_df <- NULL
225230
max_version <- NULL
226231
for (date in sort(versions)) {
227232
date <- as.Date(date)
228-
for(geo_val in unlist(strsplit(states, split = ","))){
229-
current <- epix_as_of(cov_adm, max_version = date) %>% filter(geo_value == geo_val) %>%
233+
for (geo_val in unlist(strsplit(states, split = ","))) {
234+
current <- epix_as_of(cov_adm, max_version = date) %>%
235+
filter(geo_value == geo_val) %>%
230236
na.omit()
231237
daily <- tibble(time_value = seq(
232238
from = as.Date(min(current$time_value)),
233239
to = as.Date(max(current$time_value)),
234240
by = "1 day"
235241
))
236-
242+
237243
current <- daily %>% left_join(current, by = "time_value")
238244
current$geo_value <- geo_val
239-
num_gaps <- plot_signals(current, version=date, verbose=FALSE)
240-
245+
num_gaps <- plot_signals(current, version = date, verbose = FALSE)
246+
241247
if (num_gaps > max_gaps) {
242248
max_gaps <- num_gaps
243249
max_df <- current
@@ -269,7 +275,7 @@ data <- tibble(
269275
270276
data_completed <- data %>%
271277
complete(
272-
geo,
278+
geo,
273279
time = seq.Date(from = min(time), to = max(time), by = "day")
274280
)
275281
@@ -315,10 +321,10 @@ No more NAs in our final signal! But this a little bit of a naive approach. Rath
315321
impute_moving_average <- function(col, window_size = 3) {
316322
n <- length(col)
317323
for (i in seq(window_size, n - 1)) {
318-
curr_sum <- sum(col[max(1, i-window_size+1):i])
324+
curr_sum <- sum(col[max(1, i - window_size + 1):i])
319325
average <- curr_sum / window_size
320326
if (col[i + 1] %>% is.na()) {
321-
col[i + 1] = average
327+
col[i + 1] <- average
322328
}
323329
}
324330
col
@@ -329,7 +335,7 @@ We will plot it so you can see how the NAs were filled in. The red background is
329335

330336
```{r, warning=FALSE}
331337
cov_NAs_subset_ma <- cov_NAs_subset
332-
cov_NAs_subset_ma$confirmed_cov_ma <- cov_NAs_subset_ma$confirmed_cov %>% impute_moving_average
338+
cov_NAs_subset_ma$confirmed_cov_ma <- cov_NAs_subset_ma$confirmed_cov %>% impute_moving_average()
333339
plot_signals(cov_NAs_subset_ma)
334340
```
335341

@@ -342,14 +348,13 @@ cov_NAs_subset
342348
Let's do this by hand to verify that our results are correct, by taking the previous 3 values (window size specified) and computing the average
343349

344350
```{r}
345-
jul_31 = (944+848+863)/3
346-
aug_01 = (848+863+jul_31)/3
347-
aug_04 = (aug_01+704+761)/3
351+
jul_31 <- (944 + 848 + 863) / 3
352+
aug_01 <- (848 + 863 + jul_31) / 3
353+
aug_04 <- (aug_01 + 704 + 761) / 3
348354
349355
paste0("Value for Jul 31st: ", jul_31)
350356
paste0("Value for Aug 1st: ", aug_01)
351357
paste0("Value for Aug 4th: ", aug_04)
352-
353358
```
354359

355360
```{r}
@@ -359,12 +364,12 @@ cov_NAs_subset_ma
359364
Note that we can do this exact same appraoch also using multiple calls to `epi_slide_mean()`. Because we have to continually update the average each time we update an NA, especially for NAs that occur within the window range.
360365

361366
```{r}
362-
cov_NAs_subset_epi_ma <- cov_NAs_subset %>% as_epi_df()
367+
cov_NAs_subset_epi_ma <- cov_NAs_subset %>% as_epi_df()
363368
first <- TRUE
364369
curr_pass <- cov_NAs_subset_epi_ma %>% epi_slide_mean(confirmed_cov, before = 3, na.rm = TRUE)
365370
366371
for (i in 1:length(cov_NAs_subset_epi_ma$confirmed_cov)) {
367-
curr_val = cov_NAs_subset_epi_ma$confirmed_cov[i]
372+
curr_val <- cov_NAs_subset_epi_ma$confirmed_cov[i]
368373
if (is.na(curr_val)) {
369374
cov_NAs_subset_epi_ma$confirmed_cov[i] <- curr_pass$slide_value_confirmed_cov[i]
370375
curr_pass <- cov_NAs_subset_epi_ma %>% epi_slide_mean(confirmed_cov, before = 3, na.rm = TRUE)
@@ -387,7 +392,7 @@ The next step is linear interpolation. Here we can think of this as using linear
387392
# second passes goes through and runs the regressions between each end point
388393
linear_interpolate_2pass <- function(values) {
389394
interpolated_values <- values
390-
395+
391396
# first pass
392397
na_gaps <- list()
393398
in_gap <- FALSE
@@ -407,7 +412,7 @@ linear_interpolate_2pass <- function(values) {
407412
}
408413
}
409414
}
410-
415+
411416
# second pass
412417
for (gap in na_gaps) {
413418
start <- gap[1]
@@ -419,7 +424,7 @@ linear_interpolate_2pass <- function(values) {
419424
interpolated_values[(start + 1):(end - 1)] <- interpolated_section[-c(1, length(interpolated_section))]
420425
}
421426
}
422-
427+
423428
return(interpolated_values)
424429
}
425430
```
@@ -509,9 +514,8 @@ example_archive_y <- tribble(
509514
version = as.Date(version)
510515
) %>%
511516
as_epi_archive(compactify = FALSE)
512-
513517
```
514518

515519
```{r}
516-
epix_merge(example_archive_x, example_archive_y, sync='locf')
517-
```
520+
epix_merge(example_archive_x, example_archive_y, sync = "locf")
521+
```

0 commit comments

Comments
 (0)