From bea3b1ef2f70d31e006709e83485fb89e307fae1 Mon Sep 17 00:00:00 2001 From: sastoudt Date: Wed, 14 Aug 2019 16:34:15 -0700 Subject: [PATCH 01/13] moe and jam adds, need to work out edge cases --- census_data_aggregator/__init__.py | 270 +++++++++++++++++---------- census_data_aggregator/exceptions.py | 8 + 2 files changed, 182 insertions(+), 96 deletions(-) diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index 6c682a2..5643e5b 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -4,7 +4,7 @@ import math import numpy import warnings -from .exceptions import DataError, SamplingPercentageWarning +from .exceptions import DataError, SamplingPercentageWarning, JamValueWarning def approximate_sum(*pairs): @@ -61,7 +61,7 @@ def approximate_sum(*pairs): return total, margin_of_error -def approximate_median(range_list, design_factor=1, sampling_percentage=None): +def approximate_median(range_list, design_factor=1, sampling_percentage=None, jam_values=None, simulations=50): """ Estimate a median and approximate the margin of error. @@ -130,99 +130,176 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None): # Sort the list range_list.sort(key=lambda x: x['min']) - # For each range calculate its min and max value along the universe's scale - cumulative_n = 0 - for range_ in range_list: - range_['n_min'] = cumulative_n - cumulative_n += range_['n'] - range_['n_max'] = cumulative_n - - # What is the total number of observations in the universe? - n = sum([d['n'] for d in range_list]) - - # What is the estimated midpoint of the n? - n_midpoint = n / 2.0 - - # Now use those to determine which group contains the midpoint. - n_midpoint_range = next(d for d in range_list if n_midpoint >= d['n_min'] and n_midpoint <= d['n_max']) - - # How many households in the midrange are needed to reach the midpoint? - n_midrange_gap = n_midpoint - n_midpoint_range['n_min'] - - # What is the proportion of the group that would be needed to get the midpoint? - n_midrange_gap_percent = n_midrange_gap / n_midpoint_range['n'] - - # Apply this proportion to the width of the midrange - n_midrange_gap_adjusted = (n_midpoint_range['max'] - n_midpoint_range['min']) * n_midrange_gap_percent - - # Estimate the median - estimated_median = n_midpoint_range['min'] + n_midrange_gap_adjusted - - # If there's no sampling percentage, we can't calculate a margin of error - if not sampling_percentage: - # Let's throw a warning, but still return the median - warnings.warn("", SamplingPercentageWarning) - return estimated_median, None - - # Get the standard error for this dataset - standard_error = (design_factor * math.sqrt(((100 - sampling_percentage) / (n * sampling_percentage)) * (50**2))) / 100 - - # Use the standard error to calculate the p values - p_lower = .5 - standard_error - p_upper = .5 + standard_error - - # Estimate the p_lower and p_upper n values - p_lower_n = n * p_lower - p_upper_n = n * p_upper - - # Find the ranges the p values fall within - try: - p_lower_range_i, p_lower_range = next( - (i, d) for i, d in enumerate(range_list) - if p_lower_n >= d['n_min'] and p_lower_n <= d['n_max'] - ) - except StopIteration: - raise DataError(f"The n's lower p value {p_lower_n} does not fall within a data range.") - - try: - p_upper_range_i, p_upper_range = next( - (i, d) for i, d in enumerate(range_list) - if p_upper_n >= d['n_min'] and p_upper_n <= d['n_max'] - ) - except StopIteration: - raise DataError(f"The n's upper p value {p_upper_n} does not fall within a data range.") - - # Use these values to estimate the lower bound of the confidence interval - p_lower_a1 = p_lower_range['min'] - try: - p_lower_a2 = range_list[p_lower_range_i + 1]['min'] - except IndexError: - p_lower_a2 = p_lower_range['max'] - p_lower_c1 = p_lower_range['n_min'] / n - try: - p_lower_c2 = range_list[p_lower_range_i + 1]['n_min'] / n - except IndexError: - p_lower_c2 = p_lower_range['n_max'] / n - lower_bound = ((p_lower - p_lower_c1) / (p_lower_c2 - p_lower_c1)) * (p_lower_a2 - p_lower_a1) + p_lower_a1 - - # Same for the upper bound - p_upper_a1 = p_upper_range['min'] - try: - p_upper_a2 = range_list[p_upper_range_i + 1]['min'] - except IndexError: - p_upper_a2 = p_upper_range['max'] - p_upper_c1 = p_upper_range['n_min'] / n - try: - p_upper_c2 = range_list[p_upper_range_i + 1]['n_min'] / n - except IndexError: - p_upper_c2 = p_upper_range['n_max'] / n - upper_bound = ((p_upper - p_upper_c1) / (p_upper_c2 - p_upper_c1)) * (p_upper_a2 - p_upper_a1) + p_upper_a1 - - # Calculate the standard error of the median - standard_error_median = 0.5 * (upper_bound - lower_bound) - - # Calculate the margin of error at the 90% confidence level - margin_of_error = 1.645 * standard_error_median + # if moe is included, can use simulation to estimate margin of error for median + if "moe" in list(range_list[0].keys()): + simulation_results = [] + for i in range(simulations): + # For each range calculate its min and max value along the universe's scale + simulated_n = [] + cumulative_n = 0 + for range_ in range_list: + range_['n_min'] = cumulative_n + se = range_['moe'] / 1.645 # convert moe to se + nn = round(numpy.random.normal(range_['n'], se)) # use moe to introduce randomness into number in bin + nn = int(nn) # clean it up + cumulative_n += nn + range_['n_max'] = cumulative_n + range_['n_new'] = nn + simulated_n.append(nn) + + # What is the total number of observations in the universe? + n = sum([d['n'] for d in range_list]) + + # What is the estimated midpoint of the n? + n_midpoint = n / 2.0 + + # Now use those to determine which group contains the midpoint. + n_midpoint_range = next(d for d in range_list if n_midpoint >= d['n_min'] and n_midpoint <= d['n_max']) + + # How many households in the midrange are needed to reach the midpoint? + n_midrange_gap = n_midpoint - n_midpoint_range['n_min'] + + # What is the proportion of the group that would be needed to get the midpoint? + n_midrange_gap_percent = n_midrange_gap / n_midpoint_range['n_new'] # n_midpoint_range['n'] + + # Apply this proportion to the width of the midrange + n_midrange_gap_adjusted = (n_midpoint_range['max'] - n_midpoint_range['min']) * n_midrange_gap_percent + + # Estimate the median + estimated_median = n_midpoint_range['min'] + n_midrange_gap_adjusted + + if math.isnan(n_midpoint_range['max']) and not jam_values: + # Let's throw a warning + warnings.warn("", JamValueWarning) + simulation_results.append(math.nan) # does it make sense to average these? + elif math.isnan(n_midpoint_range['min']) and not jam_values: + # Let's throw a warning + warnings.warn("", JamValueWarning) + simulation_results.append(math.nan) # does it make sense to average these? + elif math.isnan(n_midpoint_range['max']): # already exhausted the no jam value case + estimated_median = jam_values[1] + simulation_results.append(estimated_median) + elif math.isnan(n_midpoint_range['min']): # already exhausted the no jam value case + estimated_median = jam_values[0] + simulation_results.append(estimated_median) + else: + simulation_results.append(estimated_median) + + estimated_median = numpy.nanmean(simulation_results) # should this be median of medians instead? + t1 = numpy.nanquantile(simulation_results, 0.95) - estimated_median # go from confidence interval to margin of error + t2 = estimated_median - numpy.nanquantile(simulation_results, 0.05) # go from confidence interval to margin of error + margin_of_error = max(t1, t2) # if asymmetrical take bigger one, conservative + # get the median and moe via census approximation + else: + # For each range calculate its min and max value along the universe's scale + cumulative_n = 0 + for range_ in range_list: + range_['n_min'] = cumulative_n + cumulative_n += range_['n'] + range_['n_max'] = cumulative_n + + # What is the total number of observations in the universe? + n = sum([d['n'] for d in range_list]) + + # What is the estimated midpoint of the n? + n_midpoint = n / 2.0 + + # Now use those to determine which group contains the midpoint. + n_midpoint_range = next(d for d in range_list if n_midpoint >= d['n_min'] and n_midpoint <= d['n_max']) + + # How many households in the midrange are needed to reach the midpoint? + n_midrange_gap = n_midpoint - n_midpoint_range['n_min'] + + # What is the proportion of the group that would be needed to get the midpoint? + n_midrange_gap_percent = n_midrange_gap / n_midpoint_range['n'] + + # Apply this proportion to the width of the midrange + n_midrange_gap_adjusted = (n_midpoint_range['max'] - n_midpoint_range['min']) * n_midrange_gap_percent + + # Estimate the median + estimated_median = n_midpoint_range['min'] + n_midrange_gap_adjusted + + if not jam_values and math.isnan(n_midpoint_range['max']): + # Let's throw a warning + warnings.warn("", JamValueWarning) + return None, None + + if not jam_values and math.isnan(n_midpoint_range['min']): + # Let's throw a warning + warnings.warn("", JamValueWarning) + return None, None + + if math.isnan(n_midpoint_range['max']): + estimated_median = jam_values[1] + + elif math.isnan(n_midpoint_range['min']): + estimated_median = jam_values[0] + + # If there's no sampling percentage, we can't calculate a margin of error + if not sampling_percentage: + # Let's throw a warning, but still return the median + warnings.warn("", SamplingPercentageWarning) + return estimated_median, None + + # Get the standard error for this dataset + standard_error = (design_factor * math.sqrt(((100 - sampling_percentage) / (n * sampling_percentage)) * (50**2))) / 100 + + # Use the standard error to calculate the p values + p_lower = .5 - standard_error + p_upper = .5 + standard_error + + # Estimate the p_lower and p_upper n values + p_lower_n = n * p_lower + p_upper_n = n * p_upper + + # Find the ranges the p values fall within + try: + p_lower_range_i, p_lower_range = next( + (i, d) for i, d in enumerate(range_list) + if p_lower_n >= d['n_min'] and p_lower_n <= d['n_max'] + ) + except StopIteration: + raise DataError(f"The n's lower p value {p_lower_n} does not fall within a data range.") + + try: + p_upper_range_i, p_upper_range = next( + (i, d) for i, d in enumerate(range_list) + if p_upper_n >= d['n_min'] and p_upper_n <= d['n_max'] + ) + except StopIteration: + raise DataError(f"The n's upper p value {p_upper_n} does not fall within a data range.") + + # Use these values to estimate the lower bound of the confidence interval + p_lower_a1 = p_lower_range['min'] + try: + p_lower_a2 = range_list[p_lower_range_i + 1]['min'] + except IndexError: + p_lower_a2 = p_lower_range['max'] + p_lower_c1 = p_lower_range['n_min'] / n + try: + p_lower_c2 = range_list[p_lower_range_i + 1]['n_min'] / n + except IndexError: + p_lower_c2 = p_lower_range['n_max'] / n + lower_bound = ((p_lower - p_lower_c1) / (p_lower_c2 - p_lower_c1)) * (p_lower_a2 - p_lower_a1) + p_lower_a1 + + # Same for the upper bound + p_upper_a1 = p_upper_range['min'] + try: + p_upper_a2 = range_list[p_upper_range_i + 1]['min'] + except IndexError: + p_upper_a2 = p_upper_range['max'] + p_upper_c1 = p_upper_range['n_min'] / n + try: + p_upper_c2 = range_list[p_upper_range_i + 1]['n_min'] / n + except IndexError: + p_upper_c2 = p_upper_range['n_max'] / n + upper_bound = ((p_upper - p_upper_c1) / (p_upper_c2 - p_upper_c1)) * (p_upper_a2 - p_upper_a1) + p_upper_a1 + + # Calculate the standard error of the median + standard_error_median = 0.5 * (upper_bound - lower_bound) + + # Calculate the margin of error at the 90% confidence level + margin_of_error = 1.645 * standard_error_median # Return the result return estimated_median, margin_of_error @@ -433,7 +510,8 @@ def approximate_mean(range_list, simulations=50, pareto=False): * n (int): The number of people, households or other units in the range * moe (float): The margin of error for n simulations (int): number of simulations to run, used to estimate margin of error. Defaults to 50. - pareto (logical): Set True to use the Pareto distribution to simulate values in upper bin. Set False to assume a uniform distribution. Pareto is often appropriate for income. Defaults to False. + pareto (logical): Set True to use the Pareto distribution to simulate values in upper bin. + Set False to assume a uniform distribution. Pareto is often appropriate for income. Defaults to False. Returns: A two-item tuple with the mean followed by the approximated margin of error. diff --git a/census_data_aggregator/exceptions.py b/census_data_aggregator/exceptions.py index 3639ac5..afae929 100644 --- a/census_data_aggregator/exceptions.py +++ b/census_data_aggregator/exceptions.py @@ -15,3 +15,11 @@ class SamplingPercentageWarning(Warning): """ def __str__(self): return """A margin of error cannot be calculated unless you provide a sampling percentage.""" + + +class JamValueWarning(Warning): + """ + Warns that you have not provided jam values when they are needed. + """ + def __str__(self): + return """The median falls in the upper or lower bracket. Provide a jam value for more information.""" From 5b0b35c4087af4d8e932c8b2cf4d93599f450342 Mon Sep 17 00:00:00 2001 From: sastoudt Date: Thu, 15 Aug 2019 16:29:36 -0700 Subject: [PATCH 02/13] nan to none and tests for cases that are working --- census_data_aggregator/__init__.py | 3 ++ test.py | 76 +++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index 5643e5b..468bf40 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -301,6 +301,9 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja # Calculate the margin of error at the 90% confidence level margin_of_error = 1.645 * standard_error_median + if math.isnan(margin_of_error): + margin_of_error = None + # Return the result return estimated_median, margin_of_error diff --git a/test.py b/test.py index 44b5e2b..e170a49 100644 --- a/test.py +++ b/test.py @@ -4,6 +4,7 @@ import unittest import census_data_aggregator import numpy +import math from census_data_aggregator.exceptions import ( DataError, SamplingPercentageWarning @@ -115,7 +116,80 @@ def test_median(self): census_data_aggregator.approximate_median(household_income_la_2013_acs1, sampling_percentage=2.5), (70065.84266055046, 3850.680465234964) ) - + + household_income_2013_acs5 = [ + dict(min=2499, max=9999, n=186), + dict(min=10000, max=14999, n=78), + dict(min=15000, max=19999, n=98), + dict(min=20000, max=24999, n=287), + dict(min=25000, max=29999, n=142), + dict(min=30000, max=34999, n=90), + dict(min=35000, max=39999, n=107), + dict(min=40000, max=44999, n=104), + dict(min=45000, max=49999, n=178), + dict(min=50000, max=59999, n=106), + dict(min=60000, max=74999, n=177), + dict(min=75000, max=99999, n=262), + dict(min=100000, max=124999, n=77), + dict(min=125000, max=149999, n=100), + dict(min=150000, max=199999, n=58), + dict(min=200000, max=250001, n=18) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, jam_values=None, simulations=50) + self.assertAlmostEqual(estimate, 42211.096153846156) + self.assertAlmostEqual(moe, 4706.522752733644) + + household_income_2013_acs5 = [ + dict(min=math.nan, max=9999, n=186), + dict(min=10000, max=14999, n=1), + dict(min=15000, max=19999, n=8), + dict(min=20000, max=24999, n=7), + dict(min=25000, max=29999, n=2), + dict(min=30000, max=34999, n=90), + dict(min=35000, max=39999, n=7), + dict(min=40000, max=44999, n=4), + dict(min=45000, max=49999, n=8), + dict(min=50000, max=59999, n=6), + dict(min=60000, max=74999, n=7), + dict(min=75000, max=99999, n=2), + dict(min=100000, max=124999, n=7), + dict(min=125000, max=149999, n=10), + dict(min=150000, max=199999, n=8), + dict(min=200000, max=250001, n=8) + ] + + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, jam_values=None, simulations=50) + self.assertEqual(estimate, None) + self.assertEqual(moe, None) + + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599]) + self.assertEqual(estimate, 2599) + self.assertEqual(moe, None) + + household_income_2013_acs5 = [ + dict(min=math.nan, max=9999, n=6), + dict(min=10000, max=14999, n=1), + dict(min=15000, max=19999, n=8), + dict(min=20000, max=24999, n=7), + dict(min=25000, max=29999, n=2), + dict(min=30000, max=34999, n=90), + dict(min=35000, max=39999, n=7), + dict(min=40000, max=44999, n=4), + dict(min=45000, max=49999, n=8), + dict(min=50000, max=59999, n=6), + dict(min=60000, max=74999, n=7), + dict(min=75000, max=99999, n=2), + dict(min=100000, max=124999, n=7), + dict(min=125000, max=149999, n=10), + dict(min=150000, max=199999, n=8), + dict(min=200000, max=math.nan, n=186) + ] + + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599, 200001]) + self.assertEqual(estimate, 200001) + self.assertEqual(moe, None) + + with self.assertWarns(SamplingPercentageWarning): m, moe = census_data_aggregator.approximate_median(household_income_Los_Angeles_County_2013_acs5, design_factor=1.5) self.assertTrue(moe == None) From 0669165d68752d15e386004ea615af0ccae6852a Mon Sep 17 00:00:00 2001 From: sastoudt Date: Thu, 15 Aug 2019 17:21:04 -0700 Subject: [PATCH 03/13] stop averaging jam values in moe, tests that reflect this change --- census_data_aggregator/__init__.py | 22 ++++++++++---- test.py | 49 ++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index 468bf40..7b7c2c5 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -170,11 +170,11 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja if math.isnan(n_midpoint_range['max']) and not jam_values: # Let's throw a warning - warnings.warn("", JamValueWarning) + # warnings.warn("", JamValueWarning) simulation_results.append(math.nan) # does it make sense to average these? elif math.isnan(n_midpoint_range['min']) and not jam_values: # Let's throw a warning - warnings.warn("", JamValueWarning) + # warnings.warn("", JamValueWarning) simulation_results.append(math.nan) # does it make sense to average these? elif math.isnan(n_midpoint_range['max']): # already exhausted the no jam value case estimated_median = jam_values[1] @@ -185,10 +185,20 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja else: simulation_results.append(estimated_median) - estimated_median = numpy.nanmean(simulation_results) # should this be median of medians instead? - t1 = numpy.nanquantile(simulation_results, 0.95) - estimated_median # go from confidence interval to margin of error - t2 = estimated_median - numpy.nanquantile(simulation_results, 0.05) # go from confidence interval to margin of error - margin_of_error = max(t1, t2) # if asymmetrical take bigger one, conservative + if math.nan in simulation_results: + estimated_median = None + margin_of_error = None + elif jam_values and jam_values[0] in simulation_results: + estimated_median = None + margin_of_error = None + elif jam_values and jam_values[1] in simulation_results: + estimated_median = None + margin_of_error = None + else: + estimated_median = numpy.nanmean(simulation_results) # mean of medians + t1 = numpy.nanquantile(simulation_results, 0.95) - estimated_median # go from confidence interval to margin of error + t2 = estimated_median - numpy.nanquantile(simulation_results, 0.05) # go from confidence interval to margin of error + margin_of_error = max(t1, t2) # if asymmetrical take bigger one, conservative # get the median and moe via census approximation else: # For each range calculate its min and max value along the universe's scale diff --git a/test.py b/test.py index e170a49..2a38f4b 100644 --- a/test.py +++ b/test.py @@ -189,7 +189,56 @@ def test_median(self): self.assertEqual(estimate, 200001) self.assertEqual(moe, None) + household_income_2013_acs5 = [ + dict(min=math.nan, max=9999, n=6, moe=1), + dict(min=10000, max=14999, n=1, moe=1), + dict(min=15000, max=19999, n=8, moe=1), + dict(min=20000, max=24999, n=7, moe=1), + dict(min=25000, max=29999, n=2, moe=1), + dict(min=30000, max=34999, n=90, moe=8), + dict(min=35000, max=39999, n=7, moe=1), + dict(min=40000, max=44999, n=4, moe=1), + dict(min=45000, max=49999, n=8, moe=1), + dict(min=50000, max=59999, n=6, moe=1), + dict(min=60000, max=74999, n=7, moe=1), + dict(min=75000, max=99999, n=2, moe=0.25), + dict(min=100000, max=124999, n=7, moe=1), + dict(min=125000, max=149999, n=10, moe=1), + dict(min=150000, max=199999, n=8, moe=1), + dict(min=200000, max=math.nan, n=186, moe=10) + ] + + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599, 200001]) + self.assertEqual(estimate, None) + self.assertEqual(moe, None) + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=None) + self.assertEqual(estimate, None) + self.assertEqual(moe, None) + + household_income_2013_acs5 = [ + dict(min=math.nan, max=9999, n=6, moe=1), + dict(min=10000, max=14999, n=1, moe=1), + dict(min=15000, max=19999, n=8, moe=1), + dict(min=20000, max=24999, n=7, moe=1), + dict(min=25000, max=29999, n=2, moe=1), + dict(min=30000, max=34999, n=900, moe=8), + dict(min=35000, max=39999, n=7, moe=1), + dict(min=40000, max=44999, n=4, moe=1), + dict(min=45000, max=49999, n=8, moe=1), + dict(min=50000, max=59999, n=6, moe=1), + dict(min=60000, max=74999, n=7, moe=1), + dict(min=75000, max=99999, n=2, moe=0.25), + dict(min=100000, max=124999, n=7, moe=1), + dict(min=125000, max=149999, n=10, moe=1), + dict(min=150000, max=199999, n=8, moe=1), + dict(min=200000, max=math.nan, n=18, moe=10) + ] + numpy.random.seed(711355) + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=None) + self.assertAlmostEqual(estimate, 32644.851568840597) + self.assertAlmostEqual(moe, 33.0019114324823) + with self.assertWarns(SamplingPercentageWarning): m, moe = census_data_aggregator.approximate_median(household_income_Los_Angeles_County_2013_acs5, design_factor=1.5) self.assertTrue(moe == None) From 83cb8958a0352d61bcd52d3fdd470a95ba7189c0 Mon Sep 17 00:00:00 2001 From: sastoudt Date: Thu, 15 Aug 2019 17:25:59 -0700 Subject: [PATCH 04/13] add comments for simulation jam choices --- census_data_aggregator/__init__.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index 7b7c2c5..8010fbf 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -168,23 +168,28 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja # Estimate the median estimated_median = n_midpoint_range['min'] + n_midrange_gap_adjusted + # median in last bin but no jam value input if math.isnan(n_midpoint_range['max']) and not jam_values: - # Let's throw a warning + # don't need warning because jam value won't appear # warnings.warn("", JamValueWarning) - simulation_results.append(math.nan) # does it make sense to average these? + simulation_results.append(math.nan) # return nan + # median in first bin but not jam value input elif math.isnan(n_midpoint_range['min']) and not jam_values: - # Let's throw a warning + # don't need warning because jam value won't appear # warnings.warn("", JamValueWarning) - simulation_results.append(math.nan) # does it make sense to average these? + simulation_results.append(math.nan) # return nan + # median in last bin and jam value given elif math.isnan(n_midpoint_range['max']): # already exhausted the no jam value case estimated_median = jam_values[1] simulation_results.append(estimated_median) + # median in first bin and jam value given elif math.isnan(n_midpoint_range['min']): # already exhausted the no jam value case estimated_median = jam_values[0] simulation_results.append(estimated_median) + # median is fine else: simulation_results.append(estimated_median) - + # if jam values are involved, doesn't make sense to take a mean, return None if math.nan in simulation_results: estimated_median = None margin_of_error = None @@ -194,6 +199,7 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja elif jam_values and jam_values[1] in simulation_results: estimated_median = None margin_of_error = None + # if jam values aren't involved, proceed as normal else: estimated_median = numpy.nanmean(simulation_results) # mean of medians t1 = numpy.nanquantile(simulation_results, 0.95) - estimated_median # go from confidence interval to margin of error From 6128fe18f0cfe9a16a5a671ae4bd351b49976ff2 Mon Sep 17 00:00:00 2001 From: sastoudt Date: Thu, 15 Aug 2019 18:16:06 -0700 Subject: [PATCH 05/13] make different jam value warnings, tests for warnings --- census_data_aggregator/__init__.py | 11 ++-- census_data_aggregator/exceptions.py | 18 ++++++- test.py | 78 ++++++++++++++++++---------- 3 files changed, 76 insertions(+), 31 deletions(-) diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index 8010fbf..a33aeec 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -4,7 +4,7 @@ import math import numpy import warnings -from .exceptions import DataError, SamplingPercentageWarning, JamValueWarning +from .exceptions import DataError, SamplingPercentageWarning, JamValueMissingWarning, JamValueResultWarning, JamValueResultMOEWarning def approximate_sum(*pairs): @@ -191,12 +191,15 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja simulation_results.append(estimated_median) # if jam values are involved, doesn't make sense to take a mean, return None if math.nan in simulation_results: + warnings.warn("", JamValueMissingWarning) estimated_median = None margin_of_error = None elif jam_values and jam_values[0] in simulation_results: + warnings.warn("", JamValueResultMOEWarning) estimated_median = None margin_of_error = None elif jam_values and jam_values[1] in simulation_results: + warnings.warn("", JamValueResultMOEWarning) estimated_median = None margin_of_error = None # if jam values aren't involved, proceed as normal @@ -237,18 +240,20 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja if not jam_values and math.isnan(n_midpoint_range['max']): # Let's throw a warning - warnings.warn("", JamValueWarning) + warnings.warn("", JamValueMissingWarning) return None, None if not jam_values and math.isnan(n_midpoint_range['min']): # Let's throw a warning - warnings.warn("", JamValueWarning) + warnings.warn("", JamValueMissingWarning) return None, None if math.isnan(n_midpoint_range['max']): + warnings.warn("", JamValueResultWarning) estimated_median = jam_values[1] elif math.isnan(n_midpoint_range['min']): + warnings.warn("", JamValueResultWarning) estimated_median = jam_values[0] # If there's no sampling percentage, we can't calculate a margin of error diff --git a/census_data_aggregator/exceptions.py b/census_data_aggregator/exceptions.py index afae929..21c566f 100644 --- a/census_data_aggregator/exceptions.py +++ b/census_data_aggregator/exceptions.py @@ -17,9 +17,25 @@ def __str__(self): return """A margin of error cannot be calculated unless you provide a sampling percentage.""" -class JamValueWarning(Warning): +class JamValueMissingWarning(Warning): """ Warns that you have not provided jam values when they are needed. """ def __str__(self): return """The median falls in the upper or lower bracket. Provide a jam value for more information.""" + + +class JamValueResultWarning(Warning): + """ + Warns that the estiamte is a jam value. + """ + def __str__(self): + return """The median falls in the upper or lower bracket. A jam value is returned as the median estimate.""" + + +class JamValueResultMOEWarning(Warning): + """ + Warns that the estiamte is not provided because averaging jam values is inappropriate. + """ + def __str__(self): + return """The median falls in the upper or lower bracket. No estimate is returned since an average over jam values is inappropriate.""" diff --git a/test.py b/test.py index 2a38f4b..1cf317f 100644 --- a/test.py +++ b/test.py @@ -7,7 +7,10 @@ import math from census_data_aggregator.exceptions import ( DataError, - SamplingPercentageWarning + SamplingPercentageWarning, + JamValueMissingWarning, + JamValueResultWarning, + JamValueResultMOEWarning ) @@ -117,6 +120,7 @@ def test_median(self): (70065.84266055046, 3850.680465234964) ) + household_income_2013_acs5 = [ dict(min=2499, max=9999, n=186), dict(min=10000, max=14999, n=78), @@ -139,7 +143,8 @@ def test_median(self): self.assertAlmostEqual(estimate, 42211.096153846156) self.assertAlmostEqual(moe, 4706.522752733644) - household_income_2013_acs5 = [ + with self.assertWarns(JamValueMissingWarning): + household_income_2013_acs5 = [ dict(min=math.nan, max=9999, n=186), dict(min=10000, max=14999, n=1), dict(min=15000, max=19999, n=8), @@ -156,17 +161,36 @@ def test_median(self): dict(min=125000, max=149999, n=10), dict(min=150000, max=199999, n=8), dict(min=200000, max=250001, n=8) - ] - - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, jam_values=None, simulations=50) - self.assertEqual(estimate, None) - self.assertEqual(moe, None) - - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599]) - self.assertEqual(estimate, 2599) - self.assertEqual(moe, None) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, jam_values=None, simulations=50) + self.assertEqual(estimate, None) + self.assertEqual(moe, None) + + with self.assertWarns(JamValueResultWarning): + household_income_2013_acs5 = [ + dict(min=math.nan, max=9999, n=186), + dict(min=10000, max=14999, n=1), + dict(min=15000, max=19999, n=8), + dict(min=20000, max=24999, n=7), + dict(min=25000, max=29999, n=2), + dict(min=30000, max=34999, n=90), + dict(min=35000, max=39999, n=7), + dict(min=40000, max=44999, n=4), + dict(min=45000, max=49999, n=8), + dict(min=50000, max=59999, n=6), + dict(min=60000, max=74999, n=7), + dict(min=75000, max=99999, n=2), + dict(min=100000, max=124999, n=7), + dict(min=125000, max=149999, n=10), + dict(min=150000, max=199999, n=8), + dict(min=200000, max=250001, n=8) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599]) + self.assertEqual(estimate, 2599) + self.assertEqual(moe, None) - household_income_2013_acs5 = [ + with self.assertWarns(JamValueResultWarning): + household_income_2013_acs5 = [ dict(min=math.nan, max=9999, n=6), dict(min=10000, max=14999, n=1), dict(min=15000, max=19999, n=8), @@ -183,13 +207,13 @@ def test_median(self): dict(min=125000, max=149999, n=10), dict(min=150000, max=199999, n=8), dict(min=200000, max=math.nan, n=186) - ] - - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599, 200001]) - self.assertEqual(estimate, 200001) - self.assertEqual(moe, None) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599, 200001]) + self.assertEqual(estimate, 200001) + self.assertEqual(moe, None) - household_income_2013_acs5 = [ + with self.assertWarns(JamValueResultMOEWarning): + household_income_2013_acs5 = [ dict(min=math.nan, max=9999, n=6, moe=1), dict(min=10000, max=14999, n=1, moe=1), dict(min=15000, max=19999, n=8, moe=1), @@ -206,15 +230,15 @@ def test_median(self): dict(min=125000, max=149999, n=10, moe=1), dict(min=150000, max=199999, n=8, moe=1), dict(min=200000, max=math.nan, n=186, moe=10) - ] - - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599, 200001]) - self.assertEqual(estimate, None) - self.assertEqual(moe, None) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599, 200001]) + self.assertEqual(estimate, None) + self.assertEqual(moe, None) - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=None) - self.assertEqual(estimate, None) - self.assertEqual(moe, None) + with self.assertWarns(JamValueMissingWarning): + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=None) + self.assertEqual(estimate, None) + self.assertEqual(moe, None) household_income_2013_acs5 = [ dict(min=math.nan, max=9999, n=6, moe=1), @@ -238,7 +262,7 @@ def test_median(self): estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=None) self.assertAlmostEqual(estimate, 32644.851568840597) self.assertAlmostEqual(moe, 33.0019114324823) - + with self.assertWarns(SamplingPercentageWarning): m, moe = census_data_aggregator.approximate_median(household_income_Los_Angeles_County_2013_acs5, design_factor=1.5) self.assertTrue(moe == None) From 02af6613f0b98f65c318e4179f7d65fc8e09ee14 Mon Sep 17 00:00:00 2001 From: sastoudt Date: Fri, 16 Aug 2019 09:07:27 -0700 Subject: [PATCH 06/13] error if only one jam value given and two are needed, plus test --- census_data_aggregator/__init__.py | 7 +++++-- census_data_aggregator/exceptions.py | 7 +++++++ test.py | 22 ++++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index a33aeec..cb0e762 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -4,7 +4,7 @@ import math import numpy import warnings -from .exceptions import DataError, SamplingPercentageWarning, JamValueMissingWarning, JamValueResultWarning, JamValueResultMOEWarning +from .exceptions import DataError, InputError, SamplingPercentageWarning, JamValueMissingWarning, JamValueResultWarning, JamValueResultMOEWarning def approximate_sum(*pairs): @@ -250,7 +250,10 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja if math.isnan(n_midpoint_range['max']): warnings.warn("", JamValueResultWarning) - estimated_median = jam_values[1] + if len(jam_values) < 2: + raise InputError(f"An upper jam value input is needed") + else: + estimated_median = jam_values[1] elif math.isnan(n_midpoint_range['min']): warnings.warn("", JamValueResultWarning) diff --git a/census_data_aggregator/exceptions.py b/census_data_aggregator/exceptions.py index 21c566f..39a626f 100644 --- a/census_data_aggregator/exceptions.py +++ b/census_data_aggregator/exceptions.py @@ -9,6 +9,13 @@ class DataError(Exception): pass +class InputError(Exception): + """ + Raised if jam value input is invalid. + """ + pass + + class SamplingPercentageWarning(Warning): """ Warns that you have not provided a design factor. diff --git a/test.py b/test.py index 1cf317f..2b83bdd 100644 --- a/test.py +++ b/test.py @@ -7,6 +7,7 @@ import math from census_data_aggregator.exceptions import ( DataError, + InputError, SamplingPercentageWarning, JamValueMissingWarning, JamValueResultWarning, @@ -212,6 +213,27 @@ def test_median(self): self.assertEqual(estimate, 200001) self.assertEqual(moe, None) + with self.assertRaises(InputError): + household_income_2013_acs5 = [ + dict(min=math.nan, max=9999, n=6), + dict(min=10000, max=14999, n=1), + dict(min=15000, max=19999, n=8), + dict(min=20000, max=24999, n=7), + dict(min=25000, max=29999, n=2), + dict(min=30000, max=34999, n=90), + dict(min=35000, max=39999, n=7), + dict(min=40000, max=44999, n=4), + dict(min=45000, max=49999, n=8), + dict(min=50000, max=59999, n=6), + dict(min=60000, max=74999, n=7), + dict(min=75000, max=99999, n=2), + dict(min=100000, max=124999, n=7), + dict(min=125000, max=149999, n=10), + dict(min=150000, max=199999, n=8), + dict(min=200000, max=math.nan, n=186) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599]) + with self.assertWarns(JamValueResultMOEWarning): household_income_2013_acs5 = [ dict(min=math.nan, max=9999, n=6, moe=1), From 0a31f3c2b3bd8a2a979e5452be1626224b44beee Mon Sep 17 00:00:00 2001 From: sastoudt Date: Fri, 16 Aug 2019 10:51:32 -0700 Subject: [PATCH 07/13] documentation --- README.rst | 17 +++++++++++++---- census_data_aggregator/__init__.py | 15 +++++++++------ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index 38689bb..c87dafe 100644 --- a/README.rst +++ b/README.rst @@ -131,7 +131,7 @@ Approximating medians Estimate a median and approximate the margin of error. Follows the U.S. Census Bureau's official guidelines for estimation. Useful for generating medians for measures like household income and age when aggregating census geographies. -Expects a list of dictionaries that divide the full range of data values into continuous categories. Each dictionary should have three keys: +Expects a list of dictionaries that divide the full range of data values into continuous categories. Each dictionary should have three keys with an optional fourth key for margin of error inputs: .. list-table:: :header-rows: 1 @@ -139,11 +139,13 @@ Expects a list of dictionaries that divide the full range of data values into co * - key - value * - min - - The minimum value of the range + - The minimum value of the range (if unknown use `math.nan`) * - max - - The maximum value of the range + - The maximum value of the range (if unknown use `math.nan`) * - n - The number of people, households or other units in the range + * - moe (optional) + - The `n` value's associated margin of error .. code-block:: python @@ -195,8 +197,15 @@ If you do not provide the value to the function, no margin of error will be retu >>> census_data_aggregator.approximate_median(household_income_Los_Angeles_County_2013_acs1) 70065.84266055046, None -If the data being approximated comes from PUMS, an additional design factor must also be provided. The design factor is a statistical input used to tailor the estimate to the variance of the dataset. Find the value for the dataset you are estimating by referring to `the bureau's reference material `_. +If the data being approximated comes from PUMS, an additional design factor must also be provided. +The design factor is a statistical input used to tailor the estimate to the variance of the dataset. +Find the value for the dataset you are estimating by referring to `the bureau's reference material `_. +If you have an associated "jam values" for your dataset provided in the `American Community Survey's technical documentation `_, input the pair as a list to the `jam_values` keyword argument. +Then if the median falls in the first or last bin, the jam value will be returned instead of `None`. + +If the `n` values have an associated margin of error, a simulation based approach will be used to estimate the new margin of error. The `simulations` keyword argument controls the number of simulations to run and defaults to 50. +Jam values will not be used in the simulation approach. If the estimated median falls in the lower or upper bin, the estimate returned will be `None`. Approximating percent change ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index cb0e762..ff83d05 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -71,12 +71,11 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja Args: range_list (list): A list of dictionaries that divide the full range of data values into continuous categories. Each dictionary should have three keys: - * min (int): The minimum value of the range - * max (int): The maximum value of the range + * min (int): The minimum value of the range. If unknown use `math.nan`. + * max (int): The maximum value of the range. If unknown use `math.nan`. * n (int): The number of people, households or other unit in the range - The minimum value in the first range and the maximum value in the last range - can be tailored to the dataset by using the "jam values" provided in - the `American Community Survey's technical documentation`_. + * moe (float, optional): If the `n` value has an associated margin of error, include it to contribute + to the new margin of error calculation. design_factor (float, optional): A statistical input used to tailor the standard error to the variance of the dataset. This is only needed for data coming from public use microdata sample, also known as PUMS. You do not need to provide this input if you are approximating @@ -89,7 +88,11 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja * Three-year ACS: 7.5 * Five-year ACS: 12.5 If you do not provide this input, a margin of error will not be returned. - + jam_values (list, optional): If you have an associated "jam values" for your dataset provided in + the `American Community Survey's technical documentation`_ input the pair as a list. If the median falls + in the first or last bin, the jam value will be returned instead of `None`. + simulations (integer, optional): If the `n` values have an associated margin of error, a simulation based approach + will be used to estimate the new margin of error. This input controls the number of simulations to run. Defaults to 50. Returns: A two-item tuple with the median followed by the approximated margin of error. From 742ad1ede9cd83192eb6b5d12da52ad48089aa0a Mon Sep 17 00:00:00 2001 From: sastoudt Date: Fri, 16 Aug 2019 10:56:49 -0700 Subject: [PATCH 08/13] add example to readme, spacing tests --- README.rst | 26 ++++++++++++++++++++++++++ test.py | 12 ++++++------ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index c87dafe..a347273 100644 --- a/README.rst +++ b/README.rst @@ -207,6 +207,32 @@ Then if the median falls in the first or last bin, the jam value will be returne If the `n` values have an associated margin of error, a simulation based approach will be used to estimate the new margin of error. The `simulations` keyword argument controls the number of simulations to run and defaults to 50. Jam values will not be used in the simulation approach. If the estimated median falls in the lower or upper bin, the estimate returned will be `None`. + +.. code-block:: python + + >>> moe_example = [ + dict(min=math.nan, max=9999, n=6, moe=1), + dict(min=10000, max=14999, n=1, moe=1), + dict(min=15000, max=19999, n=8, moe=1), + dict(min=20000, max=24999, n=7, moe=1), + dict(min=25000, max=29999, n=2, moe=1), + dict(min=30000, max=34999, n=900, moe=8), + dict(min=35000, max=39999, n=7, moe=1), + dict(min=40000, max=44999, n=4, moe=1), + dict(min=45000, max=49999, n=8, moe=1), + dict(min=50000, max=59999, n=6, moe=1), + dict(min=60000, max=74999, n=7, moe=1), + dict(min=75000, max=99999, n=2, moe=0.25), + dict(min=100000, max=124999, n=7, moe=1), + dict(min=125000, max=149999, n=10, moe=1), + dict(min=150000, max=199999, n=8, moe=1), + dict(min=200000, max=math.nan, n=18, moe=10) + ] + >>> import numpy + >>> numpy.random.seed(711355) + >>> approximate_median(moe_example, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2499, 200001]) + (32644.851568840597, 33.0019114324823) + Approximating percent change ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/test.py b/test.py index 2b83bdd..e250b06 100644 --- a/test.py +++ b/test.py @@ -186,7 +186,7 @@ def test_median(self): dict(min=150000, max=199999, n=8), dict(min=200000, max=250001, n=8) ] - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599]) + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599]) self.assertEqual(estimate, 2599) self.assertEqual(moe, None) @@ -209,7 +209,7 @@ def test_median(self): dict(min=150000, max=199999, n=8), dict(min=200000, max=math.nan, n=186) ] - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599, 200001]) + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599, 200001]) self.assertEqual(estimate, 200001) self.assertEqual(moe, None) @@ -232,7 +232,7 @@ def test_median(self): dict(min=150000, max=199999, n=8), dict(min=200000, max=math.nan, n=186) ] - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599]) + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599]) with self.assertWarns(JamValueResultMOEWarning): household_income_2013_acs5 = [ @@ -253,12 +253,12 @@ def test_median(self): dict(min=150000, max=199999, n=8, moe=1), dict(min=200000, max=math.nan, n=186, moe=10) ] - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=[2599, 200001]) + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599, 200001]) self.assertEqual(estimate, None) self.assertEqual(moe, None) with self.assertWarns(JamValueMissingWarning): - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=None) + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=None) self.assertEqual(estimate, None) self.assertEqual(moe, None) @@ -281,7 +281,7 @@ def test_median(self): dict(min=200000, max=math.nan, n=18, moe=10) ] numpy.random.seed(711355) - estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50,jam_values=None) + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=None) self.assertAlmostEqual(estimate, 32644.851568840597) self.assertAlmostEqual(moe, 33.0019114324823) From 438f8f8ee6113b174474298b5495b89f0e880bd9 Mon Sep 17 00:00:00 2001 From: sastoudt Date: Fri, 16 Aug 2019 11:05:23 -0700 Subject: [PATCH 09/13] update examples, consistent call of functions --- README.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index a347273..e6fc9af 100644 --- a/README.rst +++ b/README.rst @@ -95,7 +95,7 @@ Expects a list of dictionaries that divide the full range of data values into co dict(min=150000, max=199999, n=6931136, moe=37236), dict(min=200000, max=1000000, n=7465517, moe=42206) ] - >>> approximate_mean(income) + >>> census_data_aggregator.approximate_mean(income) (98045.44530685373, 194.54892406267754) Note that this function expects you to submit a lower bound for the smallest bin and an upper bound for the largest bin. This is often not available for ACS datasets like income. We recommend experimenting with different lower and upper bounds to assess its effect on the resulting mean. @@ -110,7 +110,7 @@ The simulation assumes a uniform distribution of values within each bin. In some .. code-block:: python - >>> approximate_mean(income, pareto=True) + >>> census_data_aggregator.approximate_mean(income, pareto=True) (60364.96525340687, 58.60735554621351) Also, due to the stochastic nature of the simulation approach, you will need to set a seed before running this function to ensure replicability. @@ -119,10 +119,10 @@ Also, due to the stochastic nature of the simulation approach, you will need to >>> import numpy >>> numpy.random.seed(711355) - >>> approximate_mean(income, pareto=True) + >>> census_data_aggregator.approximate_mean(income, pareto=True) (60364.96525340687, 58.60735554621351) >>> numpy.random.seed(711355) - >>> approximate_mean(income, pareto=True) + >>> census_data_aggregator.approximate_mean(income, pareto=True) (60364.96525340687, 58.60735554621351) @@ -151,7 +151,7 @@ Expects a list of dictionaries that divide the full range of data values into co .. code-block:: python >>> household_income_la_2013_acs1 = [ - dict(min=2499, max=9999, n=1382), + dict(min=0, max=9999, n=1382), dict(min=10000, max=14999, n=2377), dict(min=15000, max=19999, n=1332), dict(min=20000, max=24999, n=3129), @@ -166,7 +166,7 @@ Expects a list of dictionaries that divide the full range of data values into co dict(min=100000, max=124999, n=5257), dict(min=125000, max=149999, n=3485), dict(min=150000, max=199999, n=2926), - dict(min=200000, max=250001, n=4215) + dict(min=200000, max=500000, n=4215) ] For a margin of error to be returned, a sampling percentage must be provided to calculate the standard error. The sampling percentage represents what proportion of the population that participated in the survey. Here are the values for some common census surveys. @@ -187,14 +187,14 @@ For a margin of error to be returned, a sampling percentage must be provided to .. code-block:: python - >>> census_data_aggregator.approximate_median(household_income_Los_Angeles_County_2013_acs1, sampling_percentage=2.5) + >>> census_data_aggregator.approximate_median(household_income_la_2013_acs1, sampling_percentage=2.5) 70065.84266055046, 3850.680465234964 If you do not provide the value to the function, no margin of error will be returned. .. code-block:: python - >>> census_data_aggregator.approximate_median(household_income_Los_Angeles_County_2013_acs1) + >>> census_data_aggregator.approximate_median(household_income_la_2013_acs1) 70065.84266055046, None If the data being approximated comes from PUMS, an additional design factor must also be provided. @@ -230,7 +230,7 @@ Jam values will not be used in the simulation approach. If the estimated median ] >>> import numpy >>> numpy.random.seed(711355) - >>> approximate_median(moe_example, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2499, 200001]) + >>> census_data_aggregator.approximate_median(moe_example, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2499, 200001]) (32644.851568840597, 33.0019114324823) Approximating percent change From c910c0c7e6de67aae2c5612132ec4094efe578a6 Mon Sep 17 00:00:00 2001 From: sastoudt Date: Tue, 27 Aug 2019 10:22:37 -0700 Subject: [PATCH 10/13] take max of zero and simulated n to avoid negative numbers --- census_data_aggregator/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index ff83d05..0c15c48 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -593,6 +593,7 @@ def approximate_mean(range_list, simulations=50, pareto=False): se = range_['moe'] / 1.645 # convert moe to se nn = round(numpy.random.normal(range_['n'], se)) # use moe to introduce randomness into number in bin nn = int(nn) # clean it up + nn = max(0, nn) # don't allow negative values simulated_values.append(numpy.random.uniform(range_['min'], range_['max'], size=(1, nn)).sum()) # draw random values within the bin, assume uniform simulated_n.append(nn) # a special case to handle the last bin @@ -601,6 +602,7 @@ def approximate_mean(range_list, simulations=50, pareto=False): se = last['moe'] / 1.645 # convert moe to se nn = round(numpy.random.normal(last['n'], se)) # use moe to introduce randomness into number in bin nn = int(nn) # clean it up + nn = max(0, nn) # don't allow negative values simulated_values.append(numpy.random.pareto(a=alpha_hat, size=(1, nn)).sum()) # draw random values within the bin, assume uniform simulated_n.append(nn) # use uniform otherwise From 7e211a67611d83fb1f3dc9a593a11f8945a64c05 Mon Sep 17 00:00:00 2001 From: sastoudt Date: Wed, 28 Aug 2019 17:16:01 -0700 Subject: [PATCH 11/13] fix none v. nan --- census_data_aggregator/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index 0c15c48..88ac6dd 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -130,9 +130,15 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja ... _the bureau's reference material: https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html """ + # need to replace before sort + for i in range(len(range_list)): + for k, v in range_list[i].items(): + if v is None: + range_list[i][v] = math.nan + # Sort the list range_list.sort(key=lambda x: x['min']) - + # if moe is included, can use simulation to estimate margin of error for median if "moe" in list(range_list[0].keys()): simulation_results = [] From c23809d2fafbbda10bb5e3fc3a177be3cc962d9b Mon Sep 17 00:00:00 2001 From: sastoudt Date: Tue, 3 Sep 2019 09:25:17 -0700 Subject: [PATCH 12/13] documentation tweaks --- README.rst | 120 ++++++++++++++++++++--------- census_data_aggregator/__init__.py | 47 ++++++----- 2 files changed, 105 insertions(+), 62 deletions(-) diff --git a/README.rst b/README.rst index e6fc9af..55feb7a 100644 --- a/README.rst +++ b/README.rst @@ -98,7 +98,7 @@ Expects a list of dictionaries that divide the full range of data values into co >>> census_data_aggregator.approximate_mean(income) (98045.44530685373, 194.54892406267754) -Note that this function expects you to submit a lower bound for the smallest bin and an upper bound for the largest bin. This is often not available for ACS datasets like income. We recommend experimenting with different lower and upper bounds to assess its effect on the resulting mean. +Note that, unlike `approximate_median` this function expects you to submit a lower bound for the smallest bin and an upper bound for the largest bin. This is because the Census's jam value approach is only used for median calculations. We recommend experimenting with different lower and upper bounds to assess its effect on the resulting mean. By default the simulation is run 50 times, which can take as long as a minute. The number of simulations can be changed by setting the `simulation` keyword argument. @@ -131,7 +131,7 @@ Approximating medians Estimate a median and approximate the margin of error. Follows the U.S. Census Bureau's official guidelines for estimation. Useful for generating medians for measures like household income and age when aggregating census geographies. -Expects a list of dictionaries that divide the full range of data values into continuous categories. Each dictionary should have three keys with an optional fourth key for margin of error inputs: +Expects a list of dictionaries that divide the full range of data values into continuous categories. The first `min` and the last `max` should be `None` since we typically do not know the boundaries for the top and bottom bins (e.g. income). If these values are actually known (e.g. lower bound for age), the known value can replace `None.` Each dictionary should have three keys with an optional fourth key for margin of error inputs: .. list-table:: :header-rows: 1 @@ -145,31 +145,38 @@ Expects a list of dictionaries that divide the full range of data values into co * - n - The number of people, households or other units in the range * - moe (optional) - - The `n` value's associated margin of error + - The `n` value's associated margin of error. If given as an input, a simulation approach will be used to estimate the new margin of error. + .. code-block:: python - >>> household_income_la_2013_acs1 = [ - dict(min=0, max=9999, n=1382), - dict(min=10000, max=14999, n=2377), - dict(min=15000, max=19999, n=1332), - dict(min=20000, max=24999, n=3129), - dict(min=25000, max=29999, n=1927), - dict(min=30000, max=34999, n=1825), - dict(min=35000, max=39999, n=1567), - dict(min=40000, max=44999, n=1996), - dict(min=45000, max=49999, n=1757), - dict(min=50000, max=59999, n=3523), - dict(min=60000, max=74999, n=4360), - dict(min=75000, max=99999, n=6424), - dict(min=100000, max=124999, n=5257), - dict(min=125000, max=149999, n=3485), - dict(min=150000, max=199999, n=2926), - dict(min=200000, max=500000, n=4215) - ] - -For a margin of error to be returned, a sampling percentage must be provided to calculate the standard error. The sampling percentage represents what proportion of the population that participated in the survey. Here are the values for some common census surveys. + >>> median_with_moe_example = [ + dict(min=None, max=9999, n=6, moe=1), + dict(min=10000, max=14999, n=1, moe=1), + dict(min=15000, max=19999, n=8, moe=1), + dict(min=20000, max=24999, n=7, moe=1), + dict(min=25000, max=29999, n=2, moe=1), + dict(min=30000, max=34999, n=900, moe=8), + dict(min=35000, max=39999, n=7, moe=1), + dict(min=40000, max=44999, n=4, moe=1), + dict(min=45000, max=49999, n=8, moe=1), + dict(min=50000, max=59999, n=6, moe=1), + dict(min=60000, max=74999, n=7, moe=1), + dict(min=75000, max=99999, n=2, moe=0.25), + dict(min=100000, max=124999, n=7, moe=1), + dict(min=125000, max=149999, n=10, moe=1), + dict(min=150000, max=199999, n=8, moe=1), + dict(min=200000, max=None, n=18, moe=10) + ] + + +.. code-block:: python + + >>> census_data_aggregator.approximate_median(median_with_moe_example, sampling_percentage=2.5) + (32646.07020990552, 26.638686513280845) + +In the case without margin of error inputs, a sampling percentage must be provided to in order for a margin of error to be returned. The sampling percentage represents what proportion of the population that participated in the survey. Here are the values for some common census surveys. .. list-table:: :header-rows: 1 @@ -185,17 +192,31 @@ For a margin of error to be returned, a sampling percentage must be provided to * - Five-year ACS - 12.5 -.. code-block:: python - - >>> census_data_aggregator.approximate_median(household_income_la_2013_acs1, sampling_percentage=2.5) - 70065.84266055046, 3850.680465234964 - -If you do not provide the value to the function, no margin of error will be returned. +If you do not provide the sampling percentage value to the function, no margin of error will be returned. .. code-block:: python - >>> census_data_aggregator.approximate_median(household_income_la_2013_acs1) - 70065.84266055046, None + >>> median_without_moe_example = [ + dict(min=None, max=9999, n=6), + dict(min=10000, max=14999, n=1), + dict(min=15000, max=19999, n=8), + dict(min=20000, max=24999, n=7), + dict(min=25000, max=29999, n=2), + dict(min=30000, max=34999, n=900), + dict(min=35000, max=39999, n=7), + dict(min=40000, max=44999, n=4), + dict(min=45000, max=49999, n=8), + dict(min=50000, max=59999, n=6), + dict(min=60000, max=74999, n=7), + dict(min=75000, max=99999, n=2), + dict(min=100000, max=124999, n=7), + dict(min=125000, max=149999, n=10), + dict(min=150000, max=199999, n=8), + dict(min=200000, max=None, n=18) + ] + + >>> census_data_aggregator.approximate_median(median_without_moe_example) + 32646.69277777778, None If the data being approximated comes from PUMS, an additional design factor must also be provided. The design factor is a statistical input used to tailor the estimate to the variance of the dataset. @@ -203,6 +224,30 @@ Find the value for the dataset you are estimating by referring to `the bureau's If you have an associated "jam values" for your dataset provided in the `American Community Survey's technical documentation `_, input the pair as a list to the `jam_values` keyword argument. Then if the median falls in the first or last bin, the jam value will be returned instead of `None`. + +.. code-block:: python + + >>> jam_without_simulation = [ + dict(min=None, max=9999, n=6), + dict(min=10000, max=14999, n=1), + dict(min=15000, max=19999, n=8), + dict(min=20000, max=24999, n=7), + dict(min=25000, max=29999, n=2), + dict(min=30000, max=34999, n=9), + dict(min=35000, max=39999, n=7), + dict(min=40000, max=44999, n=4), + dict(min=45000, max=49999, n=8), + dict(min=50000, max=59999, n=6), + dict(min=60000, max=74999, n=7), + dict(min=75000, max=99999, n=2), + dict(min=100000, max=124999, n=7), + dict(min=125000, max=149999, n=10), + dict(min=150000, max=199999, n=8), + dict(min=200000, max=None, n=186) + ] + >>> import numpy + >>> census_data_aggregator.approximate_median(jam_without_simulation, sampling_percentage=5*2.5,jam_values=[2599, 200001]) + (200001, None) If the `n` values have an associated margin of error, a simulation based approach will be used to estimate the new margin of error. The `simulations` keyword argument controls the number of simulations to run and defaults to 50. Jam values will not be used in the simulation approach. If the estimated median falls in the lower or upper bin, the estimate returned will be `None`. @@ -210,13 +255,13 @@ Jam values will not be used in the simulation approach. If the estimated median .. code-block:: python - >>> moe_example = [ - dict(min=math.nan, max=9999, n=6, moe=1), + >>> simulation_with_jam = [ + dict(min=None, max=9999, n=6, moe=1), dict(min=10000, max=14999, n=1, moe=1), dict(min=15000, max=19999, n=8, moe=1), dict(min=20000, max=24999, n=7, moe=1), dict(min=25000, max=29999, n=2, moe=1), - dict(min=30000, max=34999, n=900, moe=8), + dict(min=30000, max=34999, n=90, moe=8), dict(min=35000, max=39999, n=7, moe=1), dict(min=40000, max=44999, n=4, moe=1), dict(min=45000, max=49999, n=8, moe=1), @@ -226,12 +271,11 @@ Jam values will not be used in the simulation approach. If the estimated median dict(min=100000, max=124999, n=7, moe=1), dict(min=125000, max=149999, n=10, moe=1), dict(min=150000, max=199999, n=8, moe=1), - dict(min=200000, max=math.nan, n=18, moe=10) + dict(min=200000, max=None, n=186, moe=10) ] >>> import numpy - >>> numpy.random.seed(711355) - >>> census_data_aggregator.approximate_median(moe_example, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2499, 200001]) - (32644.851568840597, 33.0019114324823) + >>> census_data_aggregator.approximate_median(simulation_with_jam, simulations=50, jam_values=[2499, 200001]) + (None, None) Approximating percent change ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index 88ac6dd..74f9f98 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -71,8 +71,8 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja Args: range_list (list): A list of dictionaries that divide the full range of data values into continuous categories. Each dictionary should have three keys: - * min (int): The minimum value of the range. If unknown use `math.nan`. - * max (int): The maximum value of the range. If unknown use `math.nan`. + * min (int): The minimum value of the range. If unknown use `None`. + * max (int): The maximum value of the range. If unknown use `None`. * n (int): The number of people, households or other unit in the range * moe (float, optional): If the `n` value has an associated margin of error, include it to contribute to the new margin of error calculation. @@ -88,7 +88,7 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja * Three-year ACS: 7.5 * Five-year ACS: 12.5 If you do not provide this input, a margin of error will not be returned. - jam_values (list, optional): If you have an associated "jam values" for your dataset provided in + jam_values (list, optional): If you have associated "jam values" for your dataset provided in the `American Community Survey's technical documentation`_ input the pair as a list. If the median falls in the first or last bin, the jam value will be returned instead of `None`. simulations (integer, optional): If the `n` values have an associated margin of error, a simulation based approach @@ -101,27 +101,28 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja Examples: Estimating the median for a range of household incomes. - >>> household_income_2013_acs5 = [ - dict(min=2499, max=9999, n=186), - dict(min=10000, max=14999, n=78), - dict(min=15000, max=19999, n=98), - dict(min=20000, max=24999, n=287), - dict(min=25000, max=29999, n=142), - dict(min=30000, max=34999, n=90), - dict(min=35000, max=39999, n=107), - dict(min=40000, max=44999, n=104), - dict(min=45000, max=49999, n=178), - dict(min=50000, max=59999, n=106), - dict(min=60000, max=74999, n=177), - dict(min=75000, max=99999, n=262), - dict(min=100000, max=124999, n=77), - dict(min=125000, max=149999, n=100), - dict(min=150000, max=199999, n=58), - dict(min=200000, max=250001, n=18) + >>> median_with_moe_example = [ + dict(min=None, max=9999, n=6, moe=1), + dict(min=10000, max=14999, n=1, moe=1), + dict(min=15000, max=19999, n=8, moe=1), + dict(min=20000, max=24999, n=7, moe=1), + dict(min=25000, max=29999, n=2, moe=1), + dict(min=30000, max=34999, n=900, moe=8), + dict(min=35000, max=39999, n=7, moe=1), + dict(min=40000, max=44999, n=4, moe=1), + dict(min=45000, max=49999, n=8, moe=1), + dict(min=50000, max=59999, n=6, moe=1), + dict(min=60000, max=74999, n=7, moe=1), + dict(min=75000, max=99999, n=2, moe=0.25), + dict(min=100000, max=124999, n=7, moe=1), + dict(min=125000, max=149999, n=10, moe=1), + dict(min=150000, max=199999, n=8, moe=1), + dict(min=200000, max=None, n=18, moe=10) ] - >>> approximate_median(household_income_2013_acs5, sampling_percentage=5*2.5) - (42211.096153846156, 4706.522752733644) + + >>> approximate_median(median_with_moe_example, sampling_percentage=2.5) + (32646.07020990552, 26.638686513280845) ... _official guidelines: https://www.documentcloud.org/documents/6165603-2013-2017AccuracyPUMS.html#document/p18 @@ -135,10 +136,8 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja for k, v in range_list[i].items(): if v is None: range_list[i][v] = math.nan - # Sort the list range_list.sort(key=lambda x: x['min']) - # if moe is included, can use simulation to estimate margin of error for median if "moe" in list(range_list[0].keys()): simulation_results = [] From 5cb87b9e2d4d3a696093419e38e07c02f5a1287d Mon Sep 17 00:00:00 2001 From: sastoudt Date: Tue, 3 Sep 2019 11:50:01 -0700 Subject: [PATCH 13/13] fix tests and bug in none handling --- census_data_aggregator/__init__.py | 2 +- test.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index 74f9f98..d9a687e 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -135,7 +135,7 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None, ja for i in range(len(range_list)): for k, v in range_list[i].items(): if v is None: - range_list[i][v] = math.nan + range_list[i][k] = math.nan # Sort the list range_list.sort(key=lambda x: x['min']) # if moe is included, can use simulation to estimate margin of error for median diff --git a/test.py b/test.py index e250b06..c683c5e 100644 --- a/test.py +++ b/test.py @@ -146,7 +146,7 @@ def test_median(self): with self.assertWarns(JamValueMissingWarning): household_income_2013_acs5 = [ - dict(min=math.nan, max=9999, n=186), + dict(min=None, max=9999, n=186), dict(min=10000, max=14999, n=1), dict(min=15000, max=19999, n=8), dict(min=20000, max=24999, n=7), @@ -169,7 +169,7 @@ def test_median(self): with self.assertWarns(JamValueResultWarning): household_income_2013_acs5 = [ - dict(min=math.nan, max=9999, n=186), + dict(min=None, max=9999, n=186), dict(min=10000, max=14999, n=1), dict(min=15000, max=19999, n=8), dict(min=20000, max=24999, n=7), @@ -192,7 +192,7 @@ def test_median(self): with self.assertWarns(JamValueResultWarning): household_income_2013_acs5 = [ - dict(min=math.nan, max=9999, n=6), + dict(min=None, max=9999, n=6), dict(min=10000, max=14999, n=1), dict(min=15000, max=19999, n=8), dict(min=20000, max=24999, n=7), @@ -207,7 +207,7 @@ def test_median(self): dict(min=100000, max=124999, n=7), dict(min=125000, max=149999, n=10), dict(min=150000, max=199999, n=8), - dict(min=200000, max=math.nan, n=186) + dict(min=200000, max=None, n=186) ] estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599, 200001]) self.assertEqual(estimate, 200001) @@ -215,7 +215,7 @@ def test_median(self): with self.assertRaises(InputError): household_income_2013_acs5 = [ - dict(min=math.nan, max=9999, n=6), + dict(min=None, max=9999, n=6), dict(min=10000, max=14999, n=1), dict(min=15000, max=19999, n=8), dict(min=20000, max=24999, n=7), @@ -230,13 +230,13 @@ def test_median(self): dict(min=100000, max=124999, n=7), dict(min=125000, max=149999, n=10), dict(min=150000, max=199999, n=8), - dict(min=200000, max=math.nan, n=186) + dict(min=200000, max=None, n=186) ] estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599]) with self.assertWarns(JamValueResultMOEWarning): household_income_2013_acs5 = [ - dict(min=math.nan, max=9999, n=6, moe=1), + dict(min=None, max=9999, n=6, moe=1), dict(min=10000, max=14999, n=1, moe=1), dict(min=15000, max=19999, n=8, moe=1), dict(min=20000, max=24999, n=7, moe=1), @@ -251,7 +251,7 @@ def test_median(self): dict(min=100000, max=124999, n=7, moe=1), dict(min=125000, max=149999, n=10, moe=1), dict(min=150000, max=199999, n=8, moe=1), - dict(min=200000, max=math.nan, n=186, moe=10) + dict(min=200000, max=None, n=186, moe=10) ] estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599, 200001]) self.assertEqual(estimate, None) @@ -263,7 +263,7 @@ def test_median(self): self.assertEqual(moe, None) household_income_2013_acs5 = [ - dict(min=math.nan, max=9999, n=6, moe=1), + dict(min=None, max=9999, n=6, moe=1), dict(min=10000, max=14999, n=1, moe=1), dict(min=15000, max=19999, n=8, moe=1), dict(min=20000, max=24999, n=7, moe=1), @@ -278,7 +278,7 @@ def test_median(self): dict(min=100000, max=124999, n=7, moe=1), dict(min=125000, max=149999, n=10, moe=1), dict(min=150000, max=199999, n=8, moe=1), - dict(min=200000, max=math.nan, n=18, moe=10) + dict(min=200000, max=None, n=18, moe=10) ] numpy.random.seed(711355) estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=None)