|
37 | 37 | issue: the epiweek of publication (e.g. issue 201453 includes epiweeks up to |
38 | 38 | and including 2014w53, but not 2015w01 or following) |
39 | 39 | epiweek: the epiweek during which the data was collected |
40 | | -region: the name of the location (e.g. 'fl', 'la', 'ms', 'pr', 'vi') |
| 40 | +region: the name of the location (e.g. 'fl', 'la', 'ms', 'pr', 'vi', 'ny') |
41 | 41 | lag: number of weeks between `epiweek` and `issue` |
42 | 42 | num_ili: the number of ILI cases (numerator) |
43 | 43 | num_patients: the total number of patients (denominator) |
|
56 | 56 | from delphi.epidata.acquisition.fluview import fluview_locations |
57 | 57 | import delphi.operations.secrets as secrets |
58 | 58 | from delphi.utils.epiweek import delta_epiweeks |
| 59 | +from delphi.utils.geo.locations import Locations |
59 | 60 |
|
60 | 61 |
|
61 | 62 | class Database: |
@@ -192,115 +193,69 @@ def add_imputed_values(self, issue, epiweek, imputed): |
192 | 193 | self.cur.execute(Database.Sql.add_imputed_values, args) |
193 | 194 |
|
194 | 195 |
|
195 | | -class Locations: |
| 196 | +class StatespaceException(Exception): |
| 197 | + """Used to indicate that imputation is not possible with the given inputs.""" |
| 198 | + |
| 199 | + |
| 200 | +def get_location_graph(): |
196 | 201 | """ |
197 | | - A class that encodes the hierarchy of US locations and provides utility |
198 | | - functions for imputing ILI in those locations. |
| 202 | + Return a matrix where rows represent regions, columns represent atoms, and |
| 203 | + each entry is a 1 if the region contains the atom, otherwise 0. The |
| 204 | + corresponding lists of regions and atoms are also returned. |
199 | 205 | """ |
200 | 206 |
|
201 | | - # Atomic regions for ILINet data. |
202 | | - atoms = [ |
203 | | - # entire states |
204 | | - 'ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'de', 'fl', 'ga', 'hi', 'ia', |
205 | | - 'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me', 'mi', 'mn', 'mo', |
206 | | - 'ms', 'mt', 'nc', 'nd', 'ne', 'nh', 'nj', 'nm', 'nv', 'oh', 'ok', 'or', |
207 | | - 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'va', 'vt', 'wa', 'wi', 'wv', |
208 | | - 'wy', |
209 | | - # partial states |
210 | | - 'ny', |
211 | | - # territories |
212 | | - 'dc', 'pr', 'vi', |
213 | | - # cities |
214 | | - 'jfk', |
215 | | - ] |
216 | | - |
217 | | - # National, HHS, and Census regions since we have ILINet data for those. |
218 | | - regions = { |
219 | | - 'nat': atoms, |
220 | | - 'hhs1': ['ct', 'ma', 'me', 'nh', 'ri', 'vt'], |
221 | | - 'hhs2': ['jfk', 'nj', 'ny', 'pr', 'vi'], |
222 | | - 'hhs3': ['dc', 'de', 'md', 'pa', 'va', 'wv'], |
223 | | - 'hhs4': ['al', 'fl', 'ga', 'ky', 'ms', 'nc', 'sc', 'tn'], |
224 | | - 'hhs5': ['il', 'in', 'mi', 'mn', 'oh', 'wi'], |
225 | | - 'hhs6': ['ar', 'la', 'nm', 'ok', 'tx'], |
226 | | - 'hhs7': ['ia', 'ks', 'mo', 'ne'], |
227 | | - 'hhs8': ['co', 'mt', 'nd', 'sd', 'ut', 'wy'], |
228 | | - 'hhs9': ['az', 'ca', 'hi', 'nv'], |
229 | | - 'hhs10': ['ak', 'id', 'or', 'wa'], |
230 | | - 'cen1': ['ct', 'ma', 'me', 'nh', 'ri', 'vt'], |
231 | | - 'cen2': ['jfk', 'nj', 'ny', 'pa', 'pr', 'vi'], |
232 | | - 'cen3': ['il', 'in', 'mi', 'oh', 'wi'], |
233 | | - 'cen4': ['ia', 'ks', 'mn', 'mo', 'nd', 'ne', 'sd'], |
234 | | - 'cen5': ['dc', 'de', 'fl', 'ga', 'md', 'nc', 'sc', 'va', 'wv'], |
235 | | - 'cen6': ['al', 'ky', 'ms', 'tn'], |
236 | | - 'cen7': ['ar', 'la', 'ok', 'tx'], |
237 | | - 'cen8': ['az', 'co', 'id', 'mt', 'nm', 'nv', 'ut', 'wy'], |
238 | | - 'cen9': ['ak', 'ca', 'hi', 'or', 'wa'], |
239 | | - } |
240 | | - |
241 | | - # Atomic locations are like regions containing only themselves. |
242 | | - regions.update(dict([(a, [a]) for a in atoms])) |
243 | | - |
244 | | - @staticmethod |
245 | | - def get_location_graph(): |
246 | | - """ |
247 | | - Return a matrix where rows represent regions, columns represent atoms, and |
248 | | - each entry is a 1 if the region contains the atom, otherwise 0. The |
249 | | - corresponding lists of regions and atoms are also returned. |
250 | | - """ |
| 207 | + regions = sorted(Locations.region_list) |
| 208 | + atoms = sorted(Locations.atom_list) |
| 209 | + graph = np.zeros((len(regions), len(atoms))) |
| 210 | + for i, r in enumerate(regions): |
| 211 | + for a in Locations.region_map[r]: |
| 212 | + j = atoms.index(a) |
| 213 | + graph[i, j] = 1 |
| 214 | + return graph, regions, atoms |
251 | 215 |
|
252 | | - regions = sorted(Locations.regions.keys()) |
253 | | - atoms = sorted(Locations.atoms) |
254 | | - graph = np.zeros((len(regions), len(atoms))) |
255 | | - for i, r in enumerate(regions): |
256 | | - for a in Locations.regions[r]: |
257 | | - j = atoms.index(a) |
258 | | - graph[i, j] = 1 |
259 | | - return graph, regions, atoms |
260 | | - |
261 | | - @staticmethod |
262 | | - def get_fusion_parameters(known_locations): |
263 | | - """ |
264 | | - Return a matrix that fuses known ILI values into unknown ILI values. The |
265 | | - corresponding lists of known and unknown locations are also returned. |
266 | 216 |
|
267 | | - The goal is to infer ILI data in all locations, given ILI data in some |
268 | | - partial set of locations. This function takes a sensor fusion approach. |
| 217 | +def get_fusion_parameters(known_locations): |
| 218 | + """ |
| 219 | + Return a matrix that fuses known ILI values into unknown ILI values. The |
| 220 | + corresponding lists of known and unknown locations are also returned. |
269 | 221 |
|
270 | | - Let $z$ be a column vector of values in reported locations. Let $y$ be the |
271 | | - desired column vector of values in unreported locations. With matrices $H$ |
272 | | - (mapping from latent state to reported values), $W$ (mapping from latent |
273 | | - state to unreported values), and $R = I$ (covariance, which is identity): |
| 222 | + The goal is to infer ILI data in all locations, given ILI data in some |
| 223 | + partial set of locations. This function takes a sensor fusion approach. |
274 | 224 |
|
275 | | - $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$ |
276 | | - $y = W (H^T H)^{-1} H^T z$ |
| 225 | + Let $z$ be a column vector of values in reported locations. Let $y$ be the |
| 226 | + desired column vector of values in unreported locations. With matrices $H$ |
| 227 | + (mapping from latent state to reported values), $W$ (mapping from latent |
| 228 | + state to unreported values), and $R = I$ (covariance, which is identity): |
277 | 229 |
|
278 | | - This is equivalent to OLS regression with an added translation from atomic |
279 | | - locations to missing locations. Unknown values are computed as a linear |
280 | | - combination of known values. |
281 | | - """ |
| 230 | + $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$ |
| 231 | + $y = W (H^T H)^{-1} H^T z$ |
| 232 | +
|
| 233 | + This is equivalent to OLS regression with an added translation from atomic |
| 234 | + locations to missing locations. Unknown values are computed as a linear |
| 235 | + combination of known values. |
| 236 | + """ |
282 | 237 |
|
283 | | - graph, regions, atoms = Locations.get_location_graph() |
284 | | - is_known = np.array([r in known_locations for r in regions]) |
285 | | - is_unknown = np.logical_not(is_known) |
286 | | - if not np.any(is_known): |
287 | | - raise Exception('no values are known') |
288 | | - if not np.any(is_unknown): |
289 | | - raise Exception('no values are unknown') |
| 238 | + graph, regions, atoms = get_location_graph() |
| 239 | + is_known = np.array([r in known_locations for r in regions]) |
| 240 | + is_unknown = np.logical_not(is_known) |
| 241 | + if not np.any(is_known): |
| 242 | + raise StatespaceException('no values are known') |
| 243 | + if not np.any(is_unknown): |
| 244 | + raise StatespaceException('no values are unknown') |
290 | 245 |
|
291 | | - H = graph[is_known, :] |
292 | | - W = graph[is_unknown, :] |
293 | | - if np.linalg.matrix_rank(H) != len(atoms): |
294 | | - raise Exception('system is underdetermined') |
| 246 | + H = graph[is_known, :] |
| 247 | + W = graph[is_unknown, :] |
| 248 | + if np.linalg.matrix_rank(H) != len(atoms): |
| 249 | + raise StatespaceException('system is underdetermined') |
295 | 250 |
|
296 | | - HtH = np.dot(H.T, H) |
297 | | - HtH_inv = np.linalg.inv(HtH) |
298 | | - H_pseudo_inv = np.dot(HtH_inv, H.T) |
299 | | - fuser = np.dot(W, H_pseudo_inv) |
| 251 | + HtH = np.dot(H.T, H) |
| 252 | + HtH_inv = np.linalg.inv(HtH) |
| 253 | + H_pseudo_inv = np.dot(HtH_inv, H.T) |
| 254 | + fuser = np.dot(W, H_pseudo_inv) |
300 | 255 |
|
301 | | - locations = np.array(regions) |
302 | | - filter_locations = lambda selected: list(map(str, locations[selected])) |
303 | | - return fuser, filter_locations(is_known), filter_locations(is_unknown) |
| 256 | + locations = np.array(regions) |
| 257 | + filter_locations = lambda selected: list(map(str, locations[selected])) |
| 258 | + return fuser, filter_locations(is_known), filter_locations(is_unknown) |
304 | 259 |
|
305 | 260 |
|
306 | 261 | def get_lag_and_ili(issue, epiweek, num_ili, num_patients): |
@@ -344,7 +299,7 @@ def impute_missing_values(database, test_mode=False): |
344 | 299 | known_values['pr'] = (0, 0, 0) |
345 | 300 |
|
346 | 301 | # get the imputation matrix and lists of known and unknown locations |
347 | | - F, known, unknown = Locations.get_fusion_parameters(known_values.keys()) |
| 302 | + F, known, unknown = get_fusion_parameters(known_values.keys()) |
348 | 303 |
|
349 | 304 | # finally, impute the missing values |
350 | 305 | z = np.array([known_values[k] for k in known]) |
|
0 commit comments