Skip to content

Commit 57e8305

Browse files
minhkhul and dshemetov authored
Add hsanci to nssp (#2162)
* add hsanci code logic
* lint: make happy
* hsanci -> hsa-nci + add comment about min() for dedupe
* use drop_duplicates
* adjust test for new timeout

---------

Co-authored-by: Dmitry Shemetov <dshemeto@andrew.cmu.edu>
1 parent 166ea2c commit 57e8305

File tree

4 files changed

+13
-3
lines changed

4 files changed

+13
-3
lines changed

nssp/delphi_nssp/constants.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
"state",
1010
"county",
1111
"hhs",
12+
"hsa-nci",
1213
]
1314

1415
SIGNALS_MAP = {

nssp/delphi_nssp/pull.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -113,7 +113,7 @@ def pull_with_socrata_api(socrata_token: str, dataset_id: str):
113113
-------
114114
list of dictionaries, each representing a row in the dataset
115115
"""
116-
client = Socrata("data.cdc.gov", socrata_token)
116+
client = Socrata("data.cdc.gov", socrata_token, timeout=50) # set timeout to avoid read timed out error
117117
results = []
118118
offset = 0
119119
limit = 50000 # maximum limit allowed by SODA 2.0
@@ -177,5 +177,5 @@ def pull_nssp_data(
177177
# Format county fips to all be 5 digits with leading zeros
178178
df_ervisits["fips"] = df_ervisits["fips"].apply(lambda x: str(x).zfill(5) if str(x) != "0" else "0")
179179

180-
keep_columns = ["timestamp", "geography", "county", "fips"]
180+
keep_columns = ["timestamp", "geography", "county", "fips", "hsa_nci_id"]
181181
return df_ervisits[SIGNALS + keep_columns]

nssp/delphi_nssp/run.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@ def add_needed_columns(df, col_names=None):
4545
df = add_default_nancodes(df)
4646
return df
4747

48+
4849
def logging(start_time, run_stats, logger):
4950
"""Boilerplate making logs."""
5051
elapsed_time_in_seconds = round(time.time() - start_time, 2)
@@ -137,6 +138,14 @@ def run_module(params, logger=None):
137138
df = geo_mapper.add_geocode(df, "state_code", "hhs", from_col="state_code", new_col="geo_id")
138139
df = geo_mapper.aggregate_by_weighted_sum(df, "geo_id", "val", "timestamp", "population")
139140
df = df.rename(columns={"weighted_val": "val"})
141+
elif geo == "hsa-nci":
142+
df = df[["hsa_nci_id", "val", "timestamp"]]
143+
df = df[df["hsa_nci_id"] != "All"]
144+
# We use drop_duplicates below just to pick a representative value,
145+
# since all the values in a given HSA-NCI level are the same
146+
# (the data is reported at the HSA-NCI level).
147+
df.drop_duplicates(["hsa_nci_id", "timestamp", "val"], inplace=True)
148+
df = df.rename(columns={"hsa_nci_id": "geo_id"})
140149
else:
141150
df = df[df["county"] != "All"]
142151
df["geo_id"] = df["fips"]

nssp/tests/test_pull.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,7 @@ def test_normal_pull_nssp_data(self, mock_socrata, params, caplog):
7878
pd.testing.assert_frame_equal(expected_data, actual_data)
7979

8080
# Check that Socrata client was initialized with correct arguments
81-
mock_socrata.assert_called_once_with("data.cdc.gov", test_token)
81+
mock_socrata.assert_called_once_with("data.cdc.gov", test_token, timeout=50)
8282

8383
# Check that get method was called with correct arguments
8484
mock_client.get.assert_any_call("rdmq-nq56", limit=50000, offset=0)

0 commit comments

Comments
 (0)