11"""Collects and reads covidcast data from a set of local CSV files."""
22
33# standard library
4- from dataclasses import dataclass
5- from datetime import date
6- import glob
74import os
85import re
6+ from dataclasses import dataclass
7+ from datetime import date
8+ from glob import glob
9+ from typing import Iterator , NamedTuple , Optional , Tuple
910
1011# third party
11- import pandas as pd
1212import epiweeks as epi
13+ import pandas as pd
1314
1415# first party
1516from delphi_utils import Nans
1617from delphi .utils .epiweek import delta_epiweeks
17- from .logger import get_structured_logger
18+ from delphi .epidata .acquisition .covidcast .database import CovidcastRow
19+ from delphi .epidata .acquisition .covidcast .logger import get_structured_logger
20+
class DFRow(NamedTuple):
    """A single data row as read from a covidcast CSV via pandas."""

    geo_id: str
    value: float
    stderr: float
    sample_size: float
    missing_value: int
    missing_stderr: int
    missing_sample_size: int


class PathDetails(NamedTuple):
    """Metadata extracted from a covidcast CSV file's path."""

    issue: int
    lag: int
    source: str
    signal: str
    time_type: str
    time_value: int
    geo_type: str
23+
1824
1925@dataclass
2026class CsvRowValue :
@@ -27,6 +33,7 @@ class CsvRowValue:
2733 missing_stderr : int
2834 missing_sample_size : int
2935
36+
3037class CsvImporter :
3138 """Finds and parses covidcast CSV files."""
3239
@@ -60,6 +67,7 @@ class CsvImporter:
6067 "missing_sample_size" : "Int64"
6168 }
6269
70+
6371 @staticmethod
6472 def is_sane_day (value ):
6573 """Return whether `value` is a sane (maybe not valid) YYYYMMDD date.
@@ -76,6 +84,7 @@ def is_sane_day(value):
7684 return False
7785 return date (year = year ,month = month ,day = day )
7886
87+
7988 @staticmethod
8089 def is_sane_week (value ):
8190 """Return whether `value` is a sane (maybe not valid) YYYYWW epiweek.
@@ -91,22 +100,24 @@ def is_sane_week(value):
91100 return False
92101 return value
93102
103+
94104 @staticmethod
95- def find_issue_specific_csv_files (scan_dir , glob = glob ):
105+ def find_issue_specific_csv_files (scan_dir ):
96106 logger = get_structured_logger ('find_issue_specific_csv_files' )
97- for path in sorted (glob . glob (os .path .join (scan_dir , '*' ))):
107+ for path in sorted (glob (os .path .join (scan_dir , '*' ))):
98108 issuedir_match = CsvImporter .PATTERN_ISSUE_DIR .match (path .lower ())
99109 if issuedir_match and os .path .isdir (path ):
100110 issue_date_value = int (issuedir_match .group (2 ))
101111 issue_date = CsvImporter .is_sane_day (issue_date_value )
102112 if issue_date :
103113 logger .info (event = 'processing csv files from issue' , detail = issue_date , file = path )
104- yield from CsvImporter .find_csv_files (path , issue = (issue_date , epi .Week .fromdate (issue_date )), glob = glob )
114+ yield from CsvImporter .find_csv_files (path , issue = (issue_date , epi .Week .fromdate (issue_date )))
105115 else :
106116 logger .warning (event = 'invalid issue directory day' , detail = issue_date_value , file = path )
107117
118+
108119 @staticmethod
109- def find_csv_files (scan_dir , issue = (date .today (), epi .Week .fromdate (date .today ())), glob = glob ):
120+ def find_csv_files (scan_dir , issue = (date .today (), epi .Week .fromdate (date .today ()))):
110121 """Recursively search for and yield covidcast-format CSV files.
111122
112123 scan_dir: the directory to scan (recursively)
@@ -122,11 +133,11 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
122133 issue_value = - 1
123134 lag_value = - 1
124135
125- for path in sorted (glob . glob (os .path .join (scan_dir , '*' , '*' ))):
126-
136+ for path in sorted (glob (os .path .join (scan_dir , '*' , '*' ))):
137+ # safe to ignore this file
127138 if not path .lower ().endswith ('.csv' ):
128- # safe to ignore this file
129139 continue
140+
130141 # match a daily or weekly naming pattern
131142 daily_match = CsvImporter .PATTERN_DAILY .match (path .lower ())
132143 weekly_match = CsvImporter .PATTERN_WEEKLY .match (path .lower ())
@@ -174,14 +185,16 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
174185 yield (path , None )
175186 continue
176187
177- yield (path , (source , signal , time_type , geo_type , time_value , issue_value , lag_value ))
188+ yield (path , PathDetails (issue_value , lag_value , source , signal , time_type , time_value , geo_type ))
189+
178190
179191 @staticmethod
180192 def is_header_valid (columns ):
181193 """Return whether the given pandas columns contains the required fields."""
182194
183195 return set (columns ) >= CsvImporter .REQUIRED_COLUMNS
184196
197+
185198 @staticmethod
186199 def floaty_int (value : str ) -> int :
187200 """Cast a string to an int, even if it looks like a float.
@@ -195,6 +208,7 @@ def floaty_int(value: str) -> int:
195208 raise ValueError ('not an int: "%s"' % str (value ))
196209 return int (float_value )
197210
211+
198212 @staticmethod
199213 def maybe_apply (func , quantity ):
200214 """Apply the given function to the given quantity if not null-ish."""
@@ -205,6 +219,7 @@ def maybe_apply(func, quantity):
205219 else :
206220 return func (quantity )
207221
222+
208223 @staticmethod
209224 def validate_quantity (row , attr_quantity ):
210225 """Take a row and validate a given associated quantity (e.g., val, se, stderr).
@@ -218,6 +233,7 @@ def validate_quantity(row, attr_quantity):
218233 # val was a string or another data
219234 return "Error"
220235
236+
221237 @staticmethod
222238 def validate_missing_code (row , attr_quantity , attr_name , filepath = None , logger = None ):
223239 """Take a row and validate the missing code associated with
@@ -250,8 +266,9 @@ def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=N
250266
251267 return missing_entry
252268
269+
253270 @staticmethod
254- def extract_and_check_row (row , geo_type , filepath = None ):
271+ def extract_and_check_row (row : DFRow , geo_type : str , filepath : Optional [ str ] = None ) -> Tuple [ Optional [ CsvRowValue ], Optional [ str ]] :
255272 """Extract and return `CsvRowValue` from a CSV row, with sanity checks.
256273
257274 Also returns the name of the field which failed sanity check, or None.
@@ -331,8 +348,9 @@ def extract_and_check_row(row, geo_type, filepath=None):
331348 # return extracted and validated row values
332349 return (CsvRowValue (geo_id , value , stderr , sample_size , missing_value , missing_stderr , missing_sample_size ), None )
333350
351+
334352 @staticmethod
335- def load_csv (filepath , geo_type ) :
353+ def load_csv (filepath : str , details : PathDetails ) -> Iterator [ Optional [ CovidcastRow ]] :
336354 """Load, validate, and yield data as `RowValues` from a CSV file.
337355
338356 filepath: the CSV file to be loaded
@@ -357,9 +375,32 @@ def load_csv(filepath, geo_type):
357375 table .rename (columns = {"val" : "value" , "se" : "stderr" , "missing_val" : "missing_value" , "missing_se" : "missing_stderr" }, inplace = True )
358376
359377 for row in table .itertuples (index = False ):
360- row_values , error = CsvImporter .extract_and_check_row (row , geo_type , filepath )
378+ csv_row_values , error = CsvImporter .extract_and_check_row (row , details .geo_type , filepath )
379+
361380 if error :
362381 logger .warning (event = 'invalid value for row' , detail = (str (row ), error ), file = filepath )
363382 yield None
364383 continue
365- yield row_values
384+
385+ yield CovidcastRow (
386+ details .source ,
387+ details .signal ,
388+ details .time_type ,
389+ details .geo_type ,
390+ details .time_value ,
391+ csv_row_values .geo_value ,
392+ csv_row_values .value ,
393+ csv_row_values .stderr ,
394+ csv_row_values .sample_size ,
395+ csv_row_values .missing_value ,
396+ csv_row_values .missing_stderr ,
397+ csv_row_values .missing_sample_size ,
398+ details .issue ,
399+ details .lag ,
400+ # These four fields are unused by database acquisition
401+ # TODO: These will be used when CovidcastRow is updated.
402+ # id=None,
403+ # direction=None,
404+ # direction_updated_timestamp=0,
405+ # value_updated_timestamp=0,
406+ )
0 commit comments