NY = Upstate + NYC

undefx · undefx · commit 63e5ae746220 · 2018-06-15T17:26:47.000-05:00
This is part of a cross-repo update that splits NY ("ny") into Upstate
("ny_minus_jfk") and NYC ("jfk").

Also, replace the Locations class embedded in impute_missing_values.py with
the shared delphi.utils.geo.locations.Locations.
diff --git a/src/acquisition/fluview/fluview_locations.py b/src/acquisition/fluview/fluview_locations.py
@@ -74,7 +74,9 @@
     'new hampshire': 'nh',
     'new jersey': 'nj',
     'new mexico': 'nm',
-    'new york': 'ny',
+    # Even though it's called "New York", this location doesn't include New
+    # York City ("jfk"). New York ("ny") is actually this *plus* jfk.
+    'new york': 'ny_minus_jfk',
     'north carolina': 'nc',
     'north dakota': 'nd',
     'ohio': 'oh',
diff --git a/src/acquisition/fluview/fluview_update.py b/src/acquisition/fluview/fluview_update.py
@@ -49,7 +49,7 @@
 issue: the epiweek of publication (e.g. issue 201453 includes epiweeks up to
   and including 2014w53, but not 2015w01 or following)
 epiweek: the epiweek during which the data was collected
-region: the name of the location (e.g. 'nat', 'hhs1', 'cen9', 'pa', 'nyc')
+region: the name of the location (e.g. 'nat', 'hhs1', 'cen9', 'pa', 'jfk')
 lag: number of weeks between `epiweek` and `issue`
 num_ili: the number of ILI cases (numerator)
 num_patients: the total number of patients (denominator)
diff --git a/src/acquisition/fluview/impute_missing_values.py b/src/acquisition/fluview/impute_missing_values.py
@@ -37,7 +37,7 @@
 issue: the epiweek of publication (e.g. issue 201453 includes epiweeks up to
   and including 2014w53, but not 2015w01 or following)
 epiweek: the epiweek during which the data was collected
-region: the name of the location (e.g. 'fl', 'la', 'ms', 'pr', 'vi')
+region: the name of the location (e.g. 'fl', 'la', 'ms', 'pr', 'vi', 'ny')
 lag: number of weeks between `epiweek` and `issue`
 num_ili: the number of ILI cases (numerator)
 num_patients: the total number of patients (denominator)
@@ -56,6 +56,7 @@
 from delphi.epidata.acquisition.fluview import fluview_locations
 import delphi.operations.secrets as secrets
 from delphi.utils.epiweek import delta_epiweeks
+from delphi.utils.geo.locations import Locations
 
 
 class Database:
@@ -192,115 +193,69 @@ def add_imputed_values(self, issue, epiweek, imputed):
       self.cur.execute(Database.Sql.add_imputed_values, args)
 
 
-class Locations:
+class StatespaceException(Exception):
+  """Used to indicate that imputation is not possible with the given inputs."""
+
+
+def get_location_graph():
   """
-  A class that encodes the hierarchy of US locations and provides utility
-  functions for imputing ILI in those locations.
+  Return a matrix where rows represent regions, columns represent atoms, and
+  each entry is a 1 if the region contains the atom, otherwise 0. The
+  corresponding lists of regions and atoms are also returned.
   """
 
-  # Atomic regions for ILINet data.
-  atoms = [
-    # entire states
-    'ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'de', 'fl', 'ga', 'hi', 'ia',
-    'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me', 'mi', 'mn', 'mo',
-    'ms', 'mt', 'nc', 'nd', 'ne', 'nh', 'nj', 'nm', 'nv', 'oh', 'ok', 'or',
-    'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'va', 'vt', 'wa', 'wi', 'wv',
-    'wy',
-    # partial states
-    'ny',
-    # territories
-    'dc', 'pr', 'vi',
-    # cities
-    'jfk',
-  ]
-
-  # National, HHS, and Census regions since we have ILINet data for those.
-  regions = {
-    'nat': atoms,
-    'hhs1': ['ct', 'ma', 'me', 'nh', 'ri', 'vt'],
-    'hhs2': ['jfk', 'nj', 'ny', 'pr', 'vi'],
-    'hhs3': ['dc', 'de', 'md', 'pa', 'va', 'wv'],
-    'hhs4': ['al', 'fl', 'ga', 'ky', 'ms', 'nc', 'sc', 'tn'],
-    'hhs5': ['il', 'in', 'mi', 'mn', 'oh', 'wi'],
-    'hhs6': ['ar', 'la', 'nm', 'ok', 'tx'],
-    'hhs7': ['ia', 'ks', 'mo', 'ne'],
-    'hhs8': ['co', 'mt', 'nd', 'sd', 'ut', 'wy'],
-    'hhs9': ['az', 'ca', 'hi', 'nv'],
-    'hhs10': ['ak', 'id', 'or', 'wa'],
-    'cen1': ['ct', 'ma', 'me', 'nh', 'ri', 'vt'],
-    'cen2': ['jfk', 'nj', 'ny', 'pa', 'pr', 'vi'],
-    'cen3': ['il', 'in', 'mi', 'oh', 'wi'],
-    'cen4': ['ia', 'ks', 'mn', 'mo', 'nd', 'ne', 'sd'],
-    'cen5': ['dc', 'de', 'fl', 'ga', 'md', 'nc', 'sc', 'va', 'wv'],
-    'cen6': ['al', 'ky', 'ms', 'tn'],
-    'cen7': ['ar', 'la', 'ok', 'tx'],
-    'cen8': ['az', 'co', 'id', 'mt', 'nm', 'nv', 'ut', 'wy'],
-    'cen9': ['ak', 'ca', 'hi', 'or', 'wa'],
-  }
-
-  # Atomic locations are like regions containing only themselves.
-  regions.update(dict([(a, [a]) for a in atoms]))
-
-  @staticmethod
-  def get_location_graph():
-    """
-    Return a matrix where rows represent regions, columns represent atoms, and
-    each entry is a 1 if the region contains the atom, otherwise 0. The
-    corresponding lists of regions and atoms are also returned.
-    """
+  regions = sorted(Locations.region_list)
+  atoms = sorted(Locations.atom_list)
+  graph = np.zeros((len(regions), len(atoms)))
+  for i, r in enumerate(regions):
+    for a in Locations.region_map[r]:
+      j = atoms.index(a)
+      graph[i, j] = 1
+  return graph, regions, atoms
 
-    regions = sorted(Locations.regions.keys())
-    atoms = sorted(Locations.atoms)
-    graph = np.zeros((len(regions), len(atoms)))
-    for i, r in enumerate(regions):
-      for a in Locations.regions[r]:
-        j = atoms.index(a)
-        graph[i, j] = 1
-    return graph, regions, atoms
-
-  @staticmethod
-  def get_fusion_parameters(known_locations):
-    """
-    Return a matrix that fuses known ILI values into unknown ILI values. The
-    corresponding lists of known and unknown locations are also returned.
 
-    The goal is to infer ILI data in all locations, given ILI data in some
-    partial set of locations. This function takes a sensor fusion approach.
+def get_fusion_parameters(known_locations):
+  """
+  Return a matrix that fuses known ILI values into unknown ILI values. The
+  corresponding lists of known and unknown locations are also returned.
 
-    Let $z$ be a column vector of values in reported locations. Let $y$ be the
-    desired column vector of values in unreported locations. With matrices $H$
-    (mapping from latent state to reported values), $W$ (mapping from latent
-    state to unreported values), and $R = I$ (covariance, which is identity):
+  The goal is to infer ILI data in all locations, given ILI data in some
+  partial set of locations. This function takes a sensor fusion approach.
 
-      $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$
-      $y = W (H^T H)^{-1} H^T z$
+  Let $z$ be a column vector of values in reported locations. Let $y$ be the
+  desired column vector of values in unreported locations. With matrices $H$
+  (mapping from latent state to reported values), $W$ (mapping from latent
+  state to unreported values), and $R = I$ (covariance, which is identity):
 
-    This is equavalent to OLS regression with an added translation from atomic
-    locations to missing locations. Unknown values are computed as a linear
-    combination of known values.
-    """
+    $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$
+    $y = W (H^T H)^{-1} H^T z$
+
+  This is equivalent to OLS regression with an added translation from atomic
+  locations to missing locations. Unknown values are computed as a linear
+  combination of known values.
+  """
 
-    graph, regions, atoms = Locations.get_location_graph()
-    is_known = np.array([r in known_locations for r in regions])
-    is_unknown = np.logical_not(is_known)
-    if not np.any(is_known):
-      raise Exception('no values are known')
-    if not np.any(is_unknown):
-      raise Exception('no values are unknown')
+  graph, regions, atoms = get_location_graph()
+  is_known = np.array([r in known_locations for r in regions])
+  is_unknown = np.logical_not(is_known)
+  if not np.any(is_known):
+    raise StatespaceException('no values are known')
+  if not np.any(is_unknown):
+    raise StatespaceException('no values are unknown')
 
-    H = graph[is_known, :]
-    W = graph[is_unknown, :]
-    if np.linalg.matrix_rank(H) != len(atoms):
-      raise Exception('system is underdetermined')
+  H = graph[is_known, :]
+  W = graph[is_unknown, :]
+  if np.linalg.matrix_rank(H) != len(atoms):
+    raise StatespaceException('system is underdetermined')
 
-    HtH = np.dot(H.T, H)
-    HtH_inv = np.linalg.inv(HtH)
-    H_pseudo_inv = np.dot(HtH_inv, H.T)
-    fuser = np.dot(W, H_pseudo_inv)
+  HtH = np.dot(H.T, H)
+  HtH_inv = np.linalg.inv(HtH)
+  H_pseudo_inv = np.dot(HtH_inv, H.T)
+  fuser = np.dot(W, H_pseudo_inv)
 
-    locations = np.array(regions)
-    filter_locations = lambda selected: list(map(str, locations[selected]))
-    return fuser, filter_locations(is_known), filter_locations(is_unknown)
+  locations = np.array(regions)
+  filter_locations = lambda selected: list(map(str, locations[selected]))
+  return fuser, filter_locations(is_known), filter_locations(is_unknown)
 
 
 def get_lag_and_ili(issue, epiweek, num_ili, num_patients):
@@ -344,7 +299,7 @@ def impute_missing_values(database, test_mode=False):
       known_values['pr'] = (0, 0, 0)
 
     # get the imputation matrix and lists of known and unknown locations
-    F, known, unknown = Locations.get_fusion_parameters(known_values.keys())
+    F, known, unknown = get_fusion_parameters(known_values.keys())
 
     # finally, impute the missing values
     z = np.array([known_values[k] for k in known])
diff --git a/tests/acquisition/fluview/test_impute_missing_values.py b/tests/acquisition/fluview/test_impute_missing_values.py
@@ -5,6 +5,9 @@
 import unittest
 from unittest.mock import MagicMock
 
+# first party
+from delphi.utils.geo.locations import Locations
+
 # py3tester coverage target
 __test_target__ = 'delphi.epidata.acquisition.fluview.impute_missing_values'
 
@@ -13,6 +16,7 @@ class FunctionTests(unittest.TestCase):
   """Tests each function individually."""
 
   def test_get_argument_parser(self):
+    """An ArgumentParser is returned."""
     self.assertIsInstance(get_argument_parser(), argparse.ArgumentParser)
 
   def test_get_lag_and_ili(self):
@@ -28,20 +32,19 @@ def test_get_lag_and_ili(self):
         self.assertEquals(actual, expected)
 
   def test_impute_missing_values(self):
+    """Atoms are imputed and stored."""
+
     unknown_set = set(['pa', 'tx'])
-    known_set = set(['nat', 'hhs6'] + Locations.atoms) - unknown_set
+    known_set = set(['nat', 'hhs6'] + Locations.atom_list) - unknown_set
     known_data = {}
     for loc in known_set:
-      n = len(Locations.regions[loc])
+      n = len(Locations.region_map[loc])
       known_data[loc] = (n, n, n)
 
     db = MagicMock()
-    db.connect = MagicMock()
-    db.close = MagicMock()
-    db.count_rows = MagicMock(return_value=123)
-    db.find_missing_rows = MagicMock(return_value=[(201740, 201740)])
-    db.get_known_values = MagicMock(return_value=known_data)
-    db.add_imputed_values = MagicMock()
+    db.count_rows.return_value = 123
+    db.find_missing_rows.return_value = [(201740, 201740)]
+    db.get_known_values.return_value = known_data
 
     impute_missing_values(db, test_mode=True)
 
@@ -56,47 +59,71 @@ def test_impute_missing_values(self):
     self.assertTrue(unknown_set <= set(imputed.keys()))
     for loc, (lag, n_ili, n_pat, n_prov, ili) in imputed.items():
       with self.subTest(loc=loc):
-        num = len(Locations.regions[loc])
+        num = len(Locations.region_map[loc])
         self.assertEquals(lag, 0)
         self.assertEquals(n_ili, num)
         self.assertEquals(n_pat, num)
         self.assertEquals(n_prov, num)
         self.assertEquals(ili, 100)
 
   def test_impute_missing_values_vipr(self):
+    """PR and VI are imputed only when appropriate."""
+
     unknown_set = set(['vi', 'pr'])
-    known_set = set(['nat'] + Locations.atoms) - unknown_set
+    known_set = set(['nat'] + Locations.atom_list) - unknown_set
     known_data = {}
     for loc in known_set:
-      n = len(Locations.regions[loc])
+      n = len(Locations.region_map[loc])
       known_data[loc] = (n, n, n)
 
     db = MagicMock()
-    db.count_rows = MagicMock(return_value=123)
-    db.get_known_values = MagicMock(return_value=known_data)
+    db.count_rows.return_value = 123
+    db.get_known_values.return_value = known_data
 
-    db.find_missing_rows = MagicMock(return_value=[(201340, 201340)])
+    db.find_missing_rows.return_value = [(201340, 201340)]
     with self.assertRaises(Exception):
       impute_missing_values(db, test_mode=True)
 
-    db.find_missing_rows = MagicMock(return_value=[(201339, 201339)])
+    db.find_missing_rows.return_value = [(201339, 201339)]
     impute_missing_values(db, test_mode=True)
 
     imputed = db.add_imputed_values.call_args[0][-1]
     self.assertIn('pr', set(imputed.keys()))
 
+  def test_impute_missing_values_regions(self):
+    """Regions are imputed in addition to atoms."""
+
+    known_set = set(Locations.atom_list)
+    known_data = {}
+    for loc in known_set:
+      known_data[loc] = (1, 2, 3)
+
+    db = MagicMock()
+    db.count_rows.return_value = 123
+    db.find_missing_rows.return_value = [(201740, 201740)]
+    db.get_known_values.return_value = known_data
+
+    impute_missing_values(db, test_mode=True)
+    imputed = db.add_imputed_values.call_args[0][-1]
+    self.assertIn('nat', set(imputed.keys()))
+    self.assertIn('hhs2', set(imputed.keys()))
+    self.assertIn('cen3', set(imputed.keys()))
+    self.assertIn('ny', set(imputed.keys()))
+
   def test_impute_missing_values_underdetermined(self):
+    """Fail when the system is underdetermined."""
+
     unknown_set = set(['pa', 'tx'])
-    known_set = set(Locations.atoms) - unknown_set
+    known_set = set(Locations.atom_list) - unknown_set
     known_data = {}
     for loc in known_set:
-      n = len(Locations.regions[loc])
+      n = len(Locations.region_map[loc])
       known_data[loc] = (n, n, n)
 
     db = MagicMock()
-    db.count_rows = MagicMock(return_value=123)
-    db.find_missing_rows = MagicMock(return_value=[(201740, 201740)])
-    db.get_known_values = MagicMock(return_value=known_data)
+    db.count_rows.return_value = 123
+    db.find_missing_rows.return_value = [(201740, 201740)]
+    db.get_known_values.return_value = known_data
 
-    with self.assertRaises(Exception):
+    with self.assertRaises(StatespaceException):
       impute_missing_values(db, test_mode=True)