Skip to content

Commit 7e2229a

Browse files
committed
base changes
1 parent 57368e9 commit 7e2229a

File tree

3 files changed

+127
-12
lines changed

3 files changed

+127
-12
lines changed

nssp/delphi_nssp/constants.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,29 @@
4141
"fips": str,
4242
}
4343
)
44+
45+
# Column renames applied to the raw secondary (2023 Respiratory Virus Response)
# NSSP dataset to match our internal schema.
SECONDARY_COLS_MAP = {
    "week_end": "timestamp",
    "geography": "geo_value",
    "percent_visits": "val",
    "pathogen": "signal",
}

# Maps the dataset's pathogen labels to our published signal names.
SECONDARY_SIGNALS_MAP = {
    "COVID-19": "pct_ed_visits_covid_secondary",
    "INFLUENZA": "pct_ed_visits_influenza_secondary",
    "RSV": "pct_ed_visits_rsv_secondary",
    "Combined": "pct_ed_visits_combined_secondary",
}

# Published signal names, in the same order as SECONDARY_SIGNALS_MAP.
SECONDARY_SIGNALS = list(SECONDARY_SIGNALS_MAP.values())

# Geographic resolutions available in the secondary dataset.
SECONDARY_GEOS = ["state", "nation", "hhs"]

# Target dtypes for the cleaned secondary dataframe.
SECONDARY_TYPE_DICT = {
    "timestamp": "datetime64[ns]",
    "geo_value": str,
    "val": float,
    "geo_type": str,
    "signal": str,
}

# Columns retained in the cleaned secondary dataframe, in output order.
SECONDARY_KEEP_COLS = list(SECONDARY_TYPE_DICT.keys())

nssp/delphi_nssp/pull.py

Lines changed: 58 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pandas as pd
77
from sodapy import Socrata
88

9-
from .constants import NEWLINE, SIGNALS, SIGNALS_MAP, TYPE_DICT
9+
from .constants import *
1010

1111

1212
def warn_string(df, type_dict):
@@ -28,19 +28,13 @@ def warn_string(df, type_dict):
2828

2929

3030
def pull_nssp_data(socrata_token: str):
31-
"""Pull the latest NSSP ER visits data, and conforms it into a dataset.
32-
33-
The output dataset has:
34-
35-
- Each row corresponds to a single observation
36-
- Each row additionally has columns for the signals in SIGNALS
31+
"""Pull the latest NSSP ER visits primary dataset
32+
https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview
3733
3834
Parameters
3935
----------
4036
socrata_token: str
41-
My App Token for pulling the NWSS data (could be the same as the nchs data)
42-
test_file: Optional[str]
43-
When not null, name of file from which to read test data
37+
My App Token for pulling the NSSP data (could be the same as the nchs data)
4438
4539
Returns
4640
-------
@@ -72,3 +66,57 @@ def pull_nssp_data(socrata_token: str):
7266

7367
keep_columns = ["timestamp", "geography", "county", "fips"]
7468
return df_ervisits[SIGNALS + keep_columns]
69+
70+
71+
def secondary_pull_nssp_data(socrata_token: str):
    """Pull the latest NSSP ER visits secondary dataset and conform it.

    Source:
    https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview

    The output dataset has:

    - Each row corresponds to a single observation

    Parameters
    ----------
    socrata_token: str
        My App Token for pulling the NSSP data (could be the same as the nchs data)

    Returns
    -------
    pd.DataFrame
        Dataframe as described above.
    """
    # Pull data from the Socrata API page by page; SODA 2.0 caps a single
    # request at 50000 rows, so keep fetching until an empty page comes back.
    client = Socrata("data.cdc.gov", socrata_token)
    results = []
    offset = 0
    limit = 50000  # maximum limit allowed by SODA 2.0
    while True:
        page = client.get("7mra-9cq9", limit=limit, offset=offset)
        if not page:
            break  # exit the loop if no more results
        results.extend(page)
        offset += limit
    df_ervisits = pd.DataFrame.from_records(results)
    df_ervisits = df_ervisits.rename(columns=SECONDARY_COLS_MAP)

    # geo_type is not provided in the dataset, so we infer it from the geo_value,
    # which is either a state name, "National", or an hhs region number
    # (formatted "Region N").
    df_ervisits["geo_type"] = "state"
    df_ervisits.loc[df_ervisits["geo_value"] == "National", "geo_type"] = "nation"
    hhs_region_mask = df_ervisits["geo_value"].str.startswith("Region ")
    df_ervisits.loc[hhs_region_mask, "geo_value"] = df_ervisits.loc[
        hhs_region_mask, "geo_value"
    ].str.replace("Region ", "")
    df_ervisits.loc[hhs_region_mask, "geo_type"] = "hhs"

    # Map pathogen labels to our signal names; pathogens not present in
    # SECONDARY_SIGNALS_MAP become NaN.
    df_ervisits["signal"] = df_ervisits["signal"].map(SECONDARY_SIGNALS_MAP)

    # Keep the column selection inside the try so that a missing column raises
    # the diagnostic ValueError (via warn_string) rather than a bare KeyError.
    try:
        df_ervisits = df_ervisits[SECONDARY_KEEP_COLS]
        df_ervisits = df_ervisits.astype(SECONDARY_TYPE_DICT)
    except KeyError as exc:
        raise ValueError(warn_string(df_ervisits, SECONDARY_TYPE_DICT)) from exc

    return df_ervisits

nssp/delphi_nssp/run.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@
3131
from delphi_utils.geomap import GeoMapper
3232
from delphi_utils.nancodes import add_default_nancodes
3333

34-
from .constants import AUXILIARY_COLS, CSV_COLS, GEOS, SIGNALS
35-
from .pull import pull_nssp_data
34+
from .constants import *
35+
from .pull import pull_nssp_data, secondary_pull_nssp_data
3636

3737

3838
def add_needed_columns(df, col_names=None):
@@ -81,6 +81,7 @@ def run_module(params):
8181
socrata_token = params["indicator"]["socrata_token"]
8282

8383
run_stats = []
84+
8485
## build the base version of the signal at the most detailed geo level you can get.
8586
## compute stuff here or farm out to another function or file
8687
df_pull = pull_nssp_data(socrata_token)
@@ -137,5 +138,45 @@ def run_module(params):
137138
if len(dates) > 0:
138139
run_stats.append((max(dates), len(dates)))
139140

141+
secondary_df_pull = secondary_pull_nssp_data(socrata_token)


def _state_name_to_geo_id(name):
    """Convert a full state name to its lowercase two-letter abbreviation.

    Returns the input unchanged when it cannot be converted, so the caller
    can detect unexpected names. us.states.lookup does not resolve
    "District of Columbia", so it is special-cased to "dc".
    """
    state = us.states.lookup(name)
    if state:
        return state.abbr.lower()
    return "dc" if name == "District of Columbia" else name


## aggregate
for signal in SECONDARY_SIGNALS:
    for geo in SECONDARY_GEOS:
        df = secondary_df_pull.copy()
        logger.info("Generating signal and exporting to CSV", geo_type=geo, signal=signal)
        if geo == "state":
            df = df[df["geo_type"] == "state"]
            df["geo_id"] = df["geo_value"].apply(_state_name_to_geo_id)
            # Names that failed to convert are left unchanged by the helper;
            # treat any of those as fatal rather than exporting bad geo ids.
            unexpected_state_names = df[df["geo_id"] == df["geo_value"]]
            if unexpected_state_names.shape[0] > 0:
                logger.error("Unexpected state names", df=unexpected_state_names)
                raise ValueError("Unexpected state names in secondary NSSP dataset")
        elif geo == "nation":
            df = df[df["geo_type"] == "nation"]
            df["geo_id"] = "us"
        elif geo == "hhs":
            df = df[df["geo_type"] == "hhs"]
            # BUG FIX: geo_id must be the region number carried in geo_value;
            # the previous code assigned geo_type, i.e. the literal string
            # "hhs", to every row.
            df["geo_id"] = df["geo_value"]
        # add se, sample_size, and na codes
        missing_cols = set(CSV_COLS) - set(df.columns)
        df = add_needed_columns(df, col_names=list(missing_cols))
        df_csv = df[CSV_COLS + ["timestamp"]]
        # actual export
        dates = create_export_csv(
            df_csv,
            geo_res=geo,
            export_dir=export_dir,
            sensor=signal,
            weekly_dates=True,
        )
        if len(dates) > 0:
            run_stats.append((max(dates), len(dates)))
140181
## log this indicator run
141182
logging(start_time, run_stats, logger)

0 commit comments

Comments
 (0)