Skip to content

Commit 63e5ae7

Browse files
committed
NY = Upstate + NYC
This is part of a cross-repo update that splits NY ("ny") into Upstate ("ny_minus_jfk") and NYC ("jfk"). Also, replace embedded locations with utils/geo/locations.
1 parent c32fe15 commit 63e5ae7

File tree

4 files changed

+106
-122
lines changed

4 files changed

+106
-122
lines changed

src/acquisition/fluview/fluview_locations.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@
7474
'new hampshire': 'nh',
7575
'new jersey': 'nj',
7676
'new mexico': 'nm',
77-
'new york': 'ny',
77+
# Even though it's called "New York", this location doesn't include New
78+
# York City ("jfk"). New York ("ny") is actually this *plus* jfk.
79+
'new york': 'ny_minus_jfk',
7880
'north carolina': 'nc',
7981
'north dakota': 'nd',
8082
'ohio': 'oh',

src/acquisition/fluview/fluview_update.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
issue: the epiweek of publication (e.g. issue 201453 includes epiweeks up to
5050
and including 2014w53, but not 2015w01 or following)
5151
epiweek: the epiweek during which the data was collected
52-
region: the name of the location (e.g. 'nat', 'hhs1', 'cen9', 'pa', 'nyc')
52+
region: the name of the location (e.g. 'nat', 'hhs1', 'cen9', 'pa', 'jfk')
5353
lag: number of weeks between `epiweek` and `issue`
5454
num_ili: the number of ILI cases (numerator)
5555
num_patients: the total number of patients (denominator)

src/acquisition/fluview/impute_missing_values.py

Lines changed: 54 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
issue: the epiweek of publication (e.g. issue 201453 includes epiweeks up to
3838
and including 2014w53, but not 2015w01 or following)
3939
epiweek: the epiweek during which the data was collected
40-
region: the name of the location (e.g. 'fl', 'la', 'ms', 'pr', 'vi')
40+
region: the name of the location (e.g. 'fl', 'la', 'ms', 'pr', 'vi', 'ny')
4141
lag: number of weeks between `epiweek` and `issue`
4242
num_ili: the number of ILI cases (numerator)
4343
num_patients: the total number of patients (denominator)
@@ -56,6 +56,7 @@
5656
from delphi.epidata.acquisition.fluview import fluview_locations
5757
import delphi.operations.secrets as secrets
5858
from delphi.utils.epiweek import delta_epiweeks
59+
from delphi.utils.geo.locations import Locations
5960

6061

6162
class Database:
@@ -192,115 +193,69 @@ def add_imputed_values(self, issue, epiweek, imputed):
192193
self.cur.execute(Database.Sql.add_imputed_values, args)
193194

194195

195-
class Locations:
196+
class StatespaceException(Exception):
197+
"""Used to indicate that imputation is not possible with the given inputs."""
198+
199+
200+
def get_location_graph():
196201
"""
197-
A class that encodes the hierarchy of US locations and provides utility
198-
functions for imputing ILI in those locations.
202+
Return a matrix where rows represent regions, columns represent atoms, and
203+
each entry is a 1 if the region contains the atom, otherwise 0. The
204+
corresponding lists of regions and atoms are also returned.
199205
"""
200206

201-
# Atomic regions for ILINet data.
202-
atoms = [
203-
# entire states
204-
'ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'de', 'fl', 'ga', 'hi', 'ia',
205-
'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me', 'mi', 'mn', 'mo',
206-
'ms', 'mt', 'nc', 'nd', 'ne', 'nh', 'nj', 'nm', 'nv', 'oh', 'ok', 'or',
207-
'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'va', 'vt', 'wa', 'wi', 'wv',
208-
'wy',
209-
# partial states
210-
'ny',
211-
# territories
212-
'dc', 'pr', 'vi',
213-
# cities
214-
'jfk',
215-
]
216-
217-
# National, HHS, and Census regions since we have ILINet data for those.
218-
regions = {
219-
'nat': atoms,
220-
'hhs1': ['ct', 'ma', 'me', 'nh', 'ri', 'vt'],
221-
'hhs2': ['jfk', 'nj', 'ny', 'pr', 'vi'],
222-
'hhs3': ['dc', 'de', 'md', 'pa', 'va', 'wv'],
223-
'hhs4': ['al', 'fl', 'ga', 'ky', 'ms', 'nc', 'sc', 'tn'],
224-
'hhs5': ['il', 'in', 'mi', 'mn', 'oh', 'wi'],
225-
'hhs6': ['ar', 'la', 'nm', 'ok', 'tx'],
226-
'hhs7': ['ia', 'ks', 'mo', 'ne'],
227-
'hhs8': ['co', 'mt', 'nd', 'sd', 'ut', 'wy'],
228-
'hhs9': ['az', 'ca', 'hi', 'nv'],
229-
'hhs10': ['ak', 'id', 'or', 'wa'],
230-
'cen1': ['ct', 'ma', 'me', 'nh', 'ri', 'vt'],
231-
'cen2': ['jfk', 'nj', 'ny', 'pa', 'pr', 'vi'],
232-
'cen3': ['il', 'in', 'mi', 'oh', 'wi'],
233-
'cen4': ['ia', 'ks', 'mn', 'mo', 'nd', 'ne', 'sd'],
234-
'cen5': ['dc', 'de', 'fl', 'ga', 'md', 'nc', 'sc', 'va', 'wv'],
235-
'cen6': ['al', 'ky', 'ms', 'tn'],
236-
'cen7': ['ar', 'la', 'ok', 'tx'],
237-
'cen8': ['az', 'co', 'id', 'mt', 'nm', 'nv', 'ut', 'wy'],
238-
'cen9': ['ak', 'ca', 'hi', 'or', 'wa'],
239-
}
240-
241-
# Atomic locations are like regions containing only themselves.
242-
regions.update(dict([(a, [a]) for a in atoms]))
243-
244-
@staticmethod
245-
def get_location_graph():
246-
"""
247-
Return a matrix where rows represent regions, columns represent atoms, and
248-
each entry is a 1 if the region contains the atom, otherwise 0. The
249-
corresponding lists of regions and atoms are also returned.
250-
"""
207+
regions = sorted(Locations.region_list)
208+
atoms = sorted(Locations.atom_list)
209+
graph = np.zeros((len(regions), len(atoms)))
210+
for i, r in enumerate(regions):
211+
for a in Locations.region_map[r]:
212+
j = atoms.index(a)
213+
graph[i, j] = 1
214+
return graph, regions, atoms
251215

252-
regions = sorted(Locations.regions.keys())
253-
atoms = sorted(Locations.atoms)
254-
graph = np.zeros((len(regions), len(atoms)))
255-
for i, r in enumerate(regions):
256-
for a in Locations.regions[r]:
257-
j = atoms.index(a)
258-
graph[i, j] = 1
259-
return graph, regions, atoms
260-
261-
@staticmethod
262-
def get_fusion_parameters(known_locations):
263-
"""
264-
Return a matrix that fuses known ILI values into unknown ILI values. The
265-
corresponding lists of known and unknown locations are also returned.
266216

267-
The goal is to infer ILI data in all locations, given ILI data in some
268-
partial set of locations. This function takes a sensor fusion approach.
217+
def get_fusion_parameters(known_locations):
218+
"""
219+
Return a matrix that fuses known ILI values into unknown ILI values. The
220+
corresponding lists of known and unknown locations are also returned.
269221
270-
Let $z$ be a column vector of values in reported locations. Let $y$ be the
271-
desired column vector of values in unreported locations. With matrices $H$
272-
(mapping from latent state to reported values), $W$ (mapping from latent
273-
state to unreported values), and $R = I$ (covariance, which is identity):
222+
The goal is to infer ILI data in all locations, given ILI data in some
223+
partial set of locations. This function takes a sensor fusion approach.
274224
275-
$y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$
276-
$y = W (H^T H)^{-1} H^T z$
225+
Let $z$ be a column vector of values in reported locations. Let $y$ be the
226+
desired column vector of values in unreported locations. With matrices $H$
227+
(mapping from latent state to reported values), $W$ (mapping from latent
228+
state to unreported values), and $R = I$ (covariance, which is identity):
277229
278-
This is equivalent to OLS regression with an added translation from atomic
279-
locations to missing locations. Unknown values are computed as a linear
280-
combination of known values.
281-
"""
230+
$y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$
231+
$y = W (H^T H)^{-1} H^T z$
232+
233+
This is equivalent to OLS regression with an added translation from atomic
234+
locations to missing locations. Unknown values are computed as a linear
235+
combination of known values.
236+
"""
282237

283-
graph, regions, atoms = Locations.get_location_graph()
284-
is_known = np.array([r in known_locations for r in regions])
285-
is_unknown = np.logical_not(is_known)
286-
if not np.any(is_known):
287-
raise Exception('no values are known')
288-
if not np.any(is_unknown):
289-
raise Exception('no values are unknown')
238+
graph, regions, atoms = get_location_graph()
239+
is_known = np.array([r in known_locations for r in regions])
240+
is_unknown = np.logical_not(is_known)
241+
if not np.any(is_known):
242+
raise StatespaceException('no values are known')
243+
if not np.any(is_unknown):
244+
raise StatespaceException('no values are unknown')
290245

291-
H = graph[is_known, :]
292-
W = graph[is_unknown, :]
293-
if np.linalg.matrix_rank(H) != len(atoms):
294-
raise Exception('system is underdetermined')
246+
H = graph[is_known, :]
247+
W = graph[is_unknown, :]
248+
if np.linalg.matrix_rank(H) != len(atoms):
249+
raise StatespaceException('system is underdetermined')
295250

296-
HtH = np.dot(H.T, H)
297-
HtH_inv = np.linalg.inv(HtH)
298-
H_pseudo_inv = np.dot(HtH_inv, H.T)
299-
fuser = np.dot(W, H_pseudo_inv)
251+
HtH = np.dot(H.T, H)
252+
HtH_inv = np.linalg.inv(HtH)
253+
H_pseudo_inv = np.dot(HtH_inv, H.T)
254+
fuser = np.dot(W, H_pseudo_inv)
300255

301-
locations = np.array(regions)
302-
filter_locations = lambda selected: list(map(str, locations[selected]))
303-
return fuser, filter_locations(is_known), filter_locations(is_unknown)
256+
locations = np.array(regions)
257+
filter_locations = lambda selected: list(map(str, locations[selected]))
258+
return fuser, filter_locations(is_known), filter_locations(is_unknown)
304259

305260

306261
def get_lag_and_ili(issue, epiweek, num_ili, num_patients):
@@ -344,7 +299,7 @@ def impute_missing_values(database, test_mode=False):
344299
known_values['pr'] = (0, 0, 0)
345300

346301
# get the imputation matrix and lists of known and unknown locations
347-
F, known, unknown = Locations.get_fusion_parameters(known_values.keys())
302+
F, known, unknown = get_fusion_parameters(known_values.keys())
348303

349304
# finally, impute the missing values
350305
z = np.array([known_values[k] for k in known])

tests/acquisition/fluview/test_impute_missing_values.py

Lines changed: 48 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
import unittest
66
from unittest.mock import MagicMock
77

8+
# first party
9+
from delphi.utils.geo.locations import Locations
10+
811
# py3tester coverage target
912
__test_target__ = 'delphi.epidata.acquisition.fluview.impute_missing_values'
1013

@@ -13,6 +16,7 @@ class FunctionTests(unittest.TestCase):
1316
"""Tests each function individually."""
1417

1518
def test_get_argument_parser(self):
19+
"""An ArgumentParser is returned."""
1620
self.assertIsInstance(get_argument_parser(), argparse.ArgumentParser)
1721

1822
def test_get_lag_and_ili(self):
@@ -28,20 +32,19 @@ def test_get_lag_and_ili(self):
2832
self.assertEquals(actual, expected)
2933

3034
def test_impute_missing_values(self):
35+
"""Atoms are imputed and stored."""
36+
3137
unknown_set = set(['pa', 'tx'])
32-
known_set = set(['nat', 'hhs6'] + Locations.atoms) - unknown_set
38+
known_set = set(['nat', 'hhs6'] + Locations.atom_list) - unknown_set
3339
known_data = {}
3440
for loc in known_set:
35-
n = len(Locations.regions[loc])
41+
n = len(Locations.region_map[loc])
3642
known_data[loc] = (n, n, n)
3743

3844
db = MagicMock()
39-
db.connect = MagicMock()
40-
db.close = MagicMock()
41-
db.count_rows = MagicMock(return_value=123)
42-
db.find_missing_rows = MagicMock(return_value=[(201740, 201740)])
43-
db.get_known_values = MagicMock(return_value=known_data)
44-
db.add_imputed_values = MagicMock()
45+
db.count_rows.return_value = 123
46+
db.find_missing_rows.return_value = [(201740, 201740)]
47+
db.get_known_values.return_value = known_data
4548

4649
impute_missing_values(db, test_mode=True)
4750

@@ -56,47 +59,71 @@ def test_impute_missing_values(self):
5659
self.assertTrue(unknown_set <= set(imputed.keys()))
5760
for loc, (lag, n_ili, n_pat, n_prov, ili) in imputed.items():
5861
with self.subTest(loc=loc):
59-
num = len(Locations.regions[loc])
62+
num = len(Locations.region_map[loc])
6063
self.assertEquals(lag, 0)
6164
self.assertEquals(n_ili, num)
6265
self.assertEquals(n_pat, num)
6366
self.assertEquals(n_prov, num)
6467
self.assertEquals(ili, 100)
6568

6669
def test_impute_missing_values_vipr(self):
70+
"""PR and VI are imputed only when appropriate."""
71+
6772
unknown_set = set(['vi', 'pr'])
68-
known_set = set(['nat'] + Locations.atoms) - unknown_set
73+
known_set = set(['nat'] + Locations.atom_list) - unknown_set
6974
known_data = {}
7075
for loc in known_set:
71-
n = len(Locations.regions[loc])
76+
n = len(Locations.region_map[loc])
7277
known_data[loc] = (n, n, n)
7378

7479
db = MagicMock()
75-
db.count_rows = MagicMock(return_value=123)
76-
db.get_known_values = MagicMock(return_value=known_data)
80+
db.count_rows.return_value = 123
81+
db.get_known_values.return_value = known_data
7782

78-
db.find_missing_rows = MagicMock(return_value=[(201340, 201340)])
83+
db.find_missing_rows.return_value = [(201340, 201340)]
7984
with self.assertRaises(Exception):
8085
impute_missing_values(db, test_mode=True)
8186

82-
db.find_missing_rows = MagicMock(return_value=[(201339, 201339)])
87+
db.find_missing_rows.return_value = [(201339, 201339)]
8388
impute_missing_values(db, test_mode=True)
8489

8590
imputed = db.add_imputed_values.call_args[0][-1]
8691
self.assertIn('pr', set(imputed.keys()))
8792

93+
def test_impute_missing_values_regions(self):
94+
"""Regions are imputed in addition to atoms."""
95+
96+
known_set = set(Locations.atom_list)
97+
known_data = {}
98+
for loc in known_set:
99+
known_data[loc] = (1, 2, 3)
100+
101+
db = MagicMock()
102+
db.count_rows.return_value = 123
103+
db.find_missing_rows.return_value = [(201740, 201740)]
104+
db.get_known_values.return_value = known_data
105+
106+
impute_missing_values(db, test_mode=True)
107+
imputed = db.add_imputed_values.call_args[0][-1]
108+
self.assertIn('nat', set(imputed.keys()))
109+
self.assertIn('hhs2', set(imputed.keys()))
110+
self.assertIn('cen3', set(imputed.keys()))
111+
self.assertIn('ny', set(imputed.keys()))
112+
88113
def test_impute_missing_values_underdetermined(self):
114+
"""Fail when the system is underdetermined."""
115+
89116
unknown_set = set(['pa', 'tx'])
90-
known_set = set(Locations.atoms) - unknown_set
117+
known_set = set(Locations.atom_list) - unknown_set
91118
known_data = {}
92119
for loc in known_set:
93-
n = len(Locations.regions[loc])
120+
n = len(Locations.region_map[loc])
94121
known_data[loc] = (n, n, n)
95122

96123
db = MagicMock()
97-
db.count_rows = MagicMock(return_value=123)
98-
db.find_missing_rows = MagicMock(return_value=[(201740, 201740)])
99-
db.get_known_values = MagicMock(return_value=known_data)
124+
db.count_rows.return_value = 123
125+
db.find_missing_rows.return_value = [(201740, 201740)]
126+
db.get_known_values.return_value = known_data
100127

101-
with self.assertRaises(Exception):
128+
with self.assertRaises(StatespaceException):
102129
impute_missing_values(db, test_mode=True)

0 commit comments

Comments
 (0)