diff --git a/README.rst b/README.rst
index 38689bb..55feb7a 100644
--- a/README.rst
+++ b/README.rst
@@ -95,10 +95,10 @@ Expects a list of dictionaries that divide the full range of data values into co
         dict(min=150000, max=199999, n=6931136, moe=37236),
         dict(min=200000, max=1000000, n=7465517, moe=42206)
     ]
-    >>> approximate_mean(income)
+    >>> census_data_aggregator.approximate_mean(income)
     (98045.44530685373, 194.54892406267754)
 
-Note that this function expects you to submit a lower bound for the smallest bin and an upper bound for the largest bin. This is often not available for ACS datasets like income. We recommend experimenting with different lower and upper bounds to assess its effect on the resulting mean.
+Note that, unlike `approximate_median`, this function expects you to submit a lower bound for the smallest bin and an upper bound for the largest bin. This is because the Census's jam value approach is only used for median calculations. We recommend experimenting with different lower and upper bounds to assess their effect on the resulting mean.
 
 By default the simulation is run 50 times, which can take as long as a minute. The number of simulations can be changed by setting the `simulation` keyword argument.
 
@@ -110,7 +110,7 @@ The simulation assumes a uniform distribution of values within each bin. In some
 
 .. code-block:: python
 
-    >>> approximate_mean(income, pareto=True)
+    >>> census_data_aggregator.approximate_mean(income, pareto=True)
     (60364.96525340687, 58.60735554621351)
 
 Also, due to the stochastic nature of the simulation approach, you will need to set a seed before running this function to ensure replicability.
@@ -119,10 +119,10 @@ Also, due to the stochastic nature of the simulation approach, you will need to
 
     >>> import numpy
    >>> numpy.random.seed(711355)
-    >>> approximate_mean(income, pareto=True)
+    >>> census_data_aggregator.approximate_mean(income, pareto=True)
     (60364.96525340687, 58.60735554621351)
     >>> numpy.random.seed(711355)
-    >>> approximate_mean(income, pareto=True)
+    >>> census_data_aggregator.approximate_mean(income, pareto=True)
     (60364.96525340687, 58.60735554621351)
 
@@ -131,7 +131,7 @@
 Approximating medians
 ~~~~~~~~~~~~~~~~~~~~~
 
 Estimate a median and approximate the margin of error. Follows the U.S. Census Bureau's official guidelines for estimation. Useful for generating medians for measures like household income and age when aggregating census geographies.
 
-Expects a list of dictionaries that divide the full range of data values into continuous categories. Each dictionary should have three keys:
+Expects a list of dictionaries that divide the full range of data values into continuous categories. The first `min` and the last `max` should be `None` since we typically do not know the boundaries for the top and bottom bins (e.g. income). If these values are actually known (e.g. the lower bound for age), the known value can replace `None`. Each dictionary should have three keys, plus an optional fourth key for a margin of error input:
 
 .. list-table::
     :header-rows: 1
@@ -139,35 +139,44 @@ Expects a list of dictionaries that divide the full range of data values into co
     * - key
       - value
     * - min
-      - The minimum value of the range
+      - The minimum value of the range (if unknown use `None`)
     * - max
-      - The maximum value of the range
+      - The maximum value of the range (if unknown use `None`)
     * - n
       - The number of people, households or other units in the range
+    * - moe (optional)
+      - The `n` value's associated margin of error.
+        If given as an input, a simulation approach will be used to estimate the new margin of error.
+
 
 .. code-block:: python
 
-    >>> household_income_la_2013_acs1 = [
-        dict(min=2499, max=9999, n=1382),
-        dict(min=10000, max=14999, n=2377),
-        dict(min=15000, max=19999, n=1332),
-        dict(min=20000, max=24999, n=3129),
-        dict(min=25000, max=29999, n=1927),
-        dict(min=30000, max=34999, n=1825),
-        dict(min=35000, max=39999, n=1567),
-        dict(min=40000, max=44999, n=1996),
-        dict(min=45000, max=49999, n=1757),
-        dict(min=50000, max=59999, n=3523),
-        dict(min=60000, max=74999, n=4360),
-        dict(min=75000, max=99999, n=6424),
-        dict(min=100000, max=124999, n=5257),
-        dict(min=125000, max=149999, n=3485),
-        dict(min=150000, max=199999, n=2926),
-        dict(min=200000, max=250001, n=4215)
-    ]
-
-For a margin of error to be returned, a sampling percentage must be provided to calculate the standard error. The sampling percentage represents what proportion of the population that participated in the survey. Here are the values for some common census surveys.
+    >>> median_with_moe_example = [
+        dict(min=None, max=9999, n=6, moe=1),
+        dict(min=10000, max=14999, n=1, moe=1),
+        dict(min=15000, max=19999, n=8, moe=1),
+        dict(min=20000, max=24999, n=7, moe=1),
+        dict(min=25000, max=29999, n=2, moe=1),
+        dict(min=30000, max=34999, n=900, moe=8),
+        dict(min=35000, max=39999, n=7, moe=1),
+        dict(min=40000, max=44999, n=4, moe=1),
+        dict(min=45000, max=49999, n=8, moe=1),
+        dict(min=50000, max=59999, n=6, moe=1),
+        dict(min=60000, max=74999, n=7, moe=1),
+        dict(min=75000, max=99999, n=2, moe=0.25),
+        dict(min=100000, max=124999, n=7, moe=1),
+        dict(min=125000, max=149999, n=10, moe=1),
+        dict(min=150000, max=199999, n=8, moe=1),
+        dict(min=200000, max=None, n=18, moe=10)
+    ]
+
+
+.. code-block:: python
+
+    >>> census_data_aggregator.approximate_median(median_with_moe_example, sampling_percentage=2.5)
+    (32646.07020990552, 26.638686513280845)
+
+In the case without margin of error inputs, a sampling percentage must be provided in order for a margin of error to be returned. The sampling percentage represents what proportion of the population participated in the survey. Here are the values for some common census surveys.
 
 .. list-table::
     :header-rows: 1
@@ -183,20 +192,90 @@ For a margin of error to be returned, a sampling percentage must be provided to
     * - One-year ACS
       - 2.5
     * - Three-year ACS
       - 7.5
     * - Five-year ACS
       - 12.5
 
-.. code-block:: python
+If you do not provide the sampling percentage value to the function, no margin of error will be returned.
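(For reference, the margin of error in that case still comes from the bureau's standard-error formula that `approximate_median` already uses internally; a minimal sketch of the calculation follows, with an illustrative helper name and inputs that are not part of the library.)

.. code-block:: python

    import math

    def median_standard_error(n, sampling_percentage, design_factor=1):
        # The bureau's formula used by approximate_median: the standard error of a
        # 50 percent proportion, scaled by the design factor and the share of the
        # population that was sampled.
        return (design_factor * math.sqrt(((100 - sampling_percentage) / (n * sampling_percentage)) * (50**2))) / 100

    # e.g. a five-year ACS table (sampling percentage 12.5) with 1,000 households
    se = median_standard_error(n=1000, sampling_percentage=12.5)
    p_lower, p_upper = 0.5 - se, 0.5 + se  # later interpolated back into the bins to get the interval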
 
-    >>> census_data_aggregator.approximate_median(household_income_Los_Angeles_County_2013_acs1, sampling_percentage=2.5)
-    70065.84266055046, 3850.680465234964
+.. code-block:: python
 
-If you do not provide the value to the function, no margin of error will be returned.
 
+    >>> median_without_moe_example = [
+        dict(min=None, max=9999, n=6),
+        dict(min=10000, max=14999, n=1),
+        dict(min=15000, max=19999, n=8),
+        dict(min=20000, max=24999, n=7),
+        dict(min=25000, max=29999, n=2),
+        dict(min=30000, max=34999, n=900),
+        dict(min=35000, max=39999, n=7),
+        dict(min=40000, max=44999, n=4),
+        dict(min=45000, max=49999, n=8),
+        dict(min=50000, max=59999, n=6),
+        dict(min=60000, max=74999, n=7),
+        dict(min=75000, max=99999, n=2),
+        dict(min=100000, max=124999, n=7),
+        dict(min=125000, max=149999, n=10),
+        dict(min=150000, max=199999, n=8),
+        dict(min=200000, max=None, n=18)
+    ]
+
+    >>> census_data_aggregator.approximate_median(median_without_moe_example)
+    32646.69277777778, None
+
+If the data being approximated comes from PUMS, an additional design factor must also be provided.
+The design factor is a statistical input used to tailor the estimate to the variance of the dataset.
+Find the value for the dataset you are estimating by referring to `the bureau's reference material `_.
+
+If you have associated "jam values" for your dataset provided in the `American Community Survey's technical documentation `_, input the pair as a list to the `jam_values` keyword argument.
+Then if the median falls in the first or last bin, the jam value will be returned instead of `None`.
 
 .. code-block:: python
 
-    >>> census_data_aggregator.approximate_median(household_income_Los_Angeles_County_2013_acs1)
-    70065.84266055046, None
+    >>> jam_without_simulation = [
+        dict(min=None, max=9999, n=6),
+        dict(min=10000, max=14999, n=1),
+        dict(min=15000, max=19999, n=8),
+        dict(min=20000, max=24999, n=7),
+        dict(min=25000, max=29999, n=2),
+        dict(min=30000, max=34999, n=9),
+        dict(min=35000, max=39999, n=7),
+        dict(min=40000, max=44999, n=4),
+        dict(min=45000, max=49999, n=8),
+        dict(min=50000, max=59999, n=6),
+        dict(min=60000, max=74999, n=7),
+        dict(min=75000, max=99999, n=2),
+        dict(min=100000, max=124999, n=7),
+        dict(min=125000, max=149999, n=10),
+        dict(min=150000, max=199999, n=8),
+        dict(min=200000, max=None, n=186)
+    ]
+    >>> import numpy
+    >>> census_data_aggregator.approximate_median(jam_without_simulation, sampling_percentage=5*2.5, jam_values=[2599, 200001])
+    (200001, None)
+
+If the `n` values have an associated margin of error, a simulation-based approach will be used to estimate the new margin of error. The `simulations` keyword argument controls the number of simulations to run and defaults to 50.
+Jam values will not be used in the simulation approach. If the estimated median falls in the lower or upper bin, the estimate returned will be `None`.
+
+
-If the data being approximated comes from PUMS, an additional design factor must also be provided. The design factor is a statistical input used to tailor the estimate to the variance of the dataset. Find the value for the dataset you are estimating by referring to `the bureau's reference material `_.
+..
code-block:: python + >>> simulation_with_jam = [ + dict(min=None, max=9999, n=6, moe=1), + dict(min=10000, max=14999, n=1, moe=1), + dict(min=15000, max=19999, n=8, moe=1), + dict(min=20000, max=24999, n=7, moe=1), + dict(min=25000, max=29999, n=2, moe=1), + dict(min=30000, max=34999, n=90, moe=8), + dict(min=35000, max=39999, n=7, moe=1), + dict(min=40000, max=44999, n=4, moe=1), + dict(min=45000, max=49999, n=8, moe=1), + dict(min=50000, max=59999, n=6, moe=1), + dict(min=60000, max=74999, n=7, moe=1), + dict(min=75000, max=99999, n=2, moe=0.25), + dict(min=100000, max=124999, n=7, moe=1), + dict(min=125000, max=149999, n=10, moe=1), + dict(min=150000, max=199999, n=8, moe=1), + dict(min=200000, max=None, n=186, moe=10) + ] + >>> import numpy + >>> census_data_aggregator.approximate_median(simulation_with_jam, simulations=50, jam_values=[2499, 200001]) + (None, None) Approximating percent change ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/census_data_aggregator/__init__.py b/census_data_aggregator/__init__.py index 6c682a2..d9a687e 100644 --- a/census_data_aggregator/__init__.py +++ b/census_data_aggregator/__init__.py @@ -4,7 +4,7 @@ import math import numpy import warnings -from .exceptions import DataError, SamplingPercentageWarning +from .exceptions import DataError, InputError, SamplingPercentageWarning, JamValueMissingWarning, JamValueResultWarning, JamValueResultMOEWarning def approximate_sum(*pairs): @@ -61,7 +61,7 @@ def approximate_sum(*pairs): return total, margin_of_error -def approximate_median(range_list, design_factor=1, sampling_percentage=None): +def approximate_median(range_list, design_factor=1, sampling_percentage=None, jam_values=None, simulations=50): """ Estimate a median and approximate the margin of error. @@ -71,12 +71,11 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None): Args: range_list (list): A list of dictionaries that divide the full range of data values into continuous categories. Each dictionary should have three keys: - * min (int): The minimum value of the range - * max (int): The maximum value of the range + * min (int): The minimum value of the range. If unknown use `None`. + * max (int): The maximum value of the range. If unknown use `None`. * n (int): The number of people, households or other unit in the range - The minimum value in the first range and the maximum value in the last range - can be tailored to the dataset by using the "jam values" provided in - the `American Community Survey's technical documentation`_. + * moe (float, optional): If the `n` value has an associated margin of error, include it to contribute + to the new margin of error calculation. design_factor (float, optional): A statistical input used to tailor the standard error to the variance of the dataset. This is only needed for data coming from public use microdata sample, also known as PUMS. You do not need to provide this input if you are approximating @@ -89,7 +88,11 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None): * Three-year ACS: 7.5 * Five-year ACS: 12.5 If you do not provide this input, a margin of error will not be returned. - + jam_values (list, optional): If you have associated "jam values" for your dataset provided in + the `American Community Survey's technical documentation`_ input the pair as a list. If the median falls + in the first or last bin, the jam value will be returned instead of `None`. 
+ simulations (integer, optional): If the `n` values have an associated margin of error, a simulation based approach + will be used to estimate the new margin of error. This input controls the number of simulations to run. Defaults to 50. Returns: A two-item tuple with the median followed by the approximated margin of error. @@ -98,27 +101,28 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None): Examples: Estimating the median for a range of household incomes. - >>> household_income_2013_acs5 = [ - dict(min=2499, max=9999, n=186), - dict(min=10000, max=14999, n=78), - dict(min=15000, max=19999, n=98), - dict(min=20000, max=24999, n=287), - dict(min=25000, max=29999, n=142), - dict(min=30000, max=34999, n=90), - dict(min=35000, max=39999, n=107), - dict(min=40000, max=44999, n=104), - dict(min=45000, max=49999, n=178), - dict(min=50000, max=59999, n=106), - dict(min=60000, max=74999, n=177), - dict(min=75000, max=99999, n=262), - dict(min=100000, max=124999, n=77), - dict(min=125000, max=149999, n=100), - dict(min=150000, max=199999, n=58), - dict(min=200000, max=250001, n=18) + >>> median_with_moe_example = [ + dict(min=None, max=9999, n=6, moe=1), + dict(min=10000, max=14999, n=1, moe=1), + dict(min=15000, max=19999, n=8, moe=1), + dict(min=20000, max=24999, n=7, moe=1), + dict(min=25000, max=29999, n=2, moe=1), + dict(min=30000, max=34999, n=900, moe=8), + dict(min=35000, max=39999, n=7, moe=1), + dict(min=40000, max=44999, n=4, moe=1), + dict(min=45000, max=49999, n=8, moe=1), + dict(min=50000, max=59999, n=6, moe=1), + dict(min=60000, max=74999, n=7, moe=1), + dict(min=75000, max=99999, n=2, moe=0.25), + dict(min=100000, max=124999, n=7, moe=1), + dict(min=125000, max=149999, n=10, moe=1), + dict(min=150000, max=199999, n=8, moe=1), + dict(min=200000, max=None, n=18, moe=10) ] - >>> approximate_median(household_income_2013_acs5, sampling_percentage=5*2.5) - (42211.096153846156, 4706.522752733644) + + >>> approximate_median(median_with_moe_example, sampling_percentage=2.5) + (32646.07020990552, 26.638686513280845) ... _official guidelines: https://www.documentcloud.org/documents/6165603-2013-2017AccuracyPUMS.html#document/p18 @@ -127,102 +131,210 @@ def approximate_median(range_list, design_factor=1, sampling_percentage=None): ... _the bureau's reference material: https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html """ + # need to replace before sort + for i in range(len(range_list)): + for k, v in range_list[i].items(): + if v is None: + range_list[i][k] = math.nan # Sort the list range_list.sort(key=lambda x: x['min']) - - # For each range calculate its min and max value along the universe's scale - cumulative_n = 0 - for range_ in range_list: - range_['n_min'] = cumulative_n - cumulative_n += range_['n'] - range_['n_max'] = cumulative_n - - # What is the total number of observations in the universe? - n = sum([d['n'] for d in range_list]) - - # What is the estimated midpoint of the n? - n_midpoint = n / 2.0 - - # Now use those to determine which group contains the midpoint. - n_midpoint_range = next(d for d in range_list if n_midpoint >= d['n_min'] and n_midpoint <= d['n_max']) - - # How many households in the midrange are needed to reach the midpoint? - n_midrange_gap = n_midpoint - n_midpoint_range['n_min'] - - # What is the proportion of the group that would be needed to get the midpoint? 
- n_midrange_gap_percent = n_midrange_gap / n_midpoint_range['n'] - - # Apply this proportion to the width of the midrange - n_midrange_gap_adjusted = (n_midpoint_range['max'] - n_midpoint_range['min']) * n_midrange_gap_percent - - # Estimate the median - estimated_median = n_midpoint_range['min'] + n_midrange_gap_adjusted - - # If there's no sampling percentage, we can't calculate a margin of error - if not sampling_percentage: - # Let's throw a warning, but still return the median - warnings.warn("", SamplingPercentageWarning) - return estimated_median, None - - # Get the standard error for this dataset - standard_error = (design_factor * math.sqrt(((100 - sampling_percentage) / (n * sampling_percentage)) * (50**2))) / 100 - - # Use the standard error to calculate the p values - p_lower = .5 - standard_error - p_upper = .5 + standard_error - - # Estimate the p_lower and p_upper n values - p_lower_n = n * p_lower - p_upper_n = n * p_upper - - # Find the ranges the p values fall within - try: - p_lower_range_i, p_lower_range = next( - (i, d) for i, d in enumerate(range_list) - if p_lower_n >= d['n_min'] and p_lower_n <= d['n_max'] - ) - except StopIteration: - raise DataError(f"The n's lower p value {p_lower_n} does not fall within a data range.") - - try: - p_upper_range_i, p_upper_range = next( - (i, d) for i, d in enumerate(range_list) - if p_upper_n >= d['n_min'] and p_upper_n <= d['n_max'] - ) - except StopIteration: - raise DataError(f"The n's upper p value {p_upper_n} does not fall within a data range.") - - # Use these values to estimate the lower bound of the confidence interval - p_lower_a1 = p_lower_range['min'] - try: - p_lower_a2 = range_list[p_lower_range_i + 1]['min'] - except IndexError: - p_lower_a2 = p_lower_range['max'] - p_lower_c1 = p_lower_range['n_min'] / n - try: - p_lower_c2 = range_list[p_lower_range_i + 1]['n_min'] / n - except IndexError: - p_lower_c2 = p_lower_range['n_max'] / n - lower_bound = ((p_lower - p_lower_c1) / (p_lower_c2 - p_lower_c1)) * (p_lower_a2 - p_lower_a1) + p_lower_a1 - - # Same for the upper bound - p_upper_a1 = p_upper_range['min'] - try: - p_upper_a2 = range_list[p_upper_range_i + 1]['min'] - except IndexError: - p_upper_a2 = p_upper_range['max'] - p_upper_c1 = p_upper_range['n_min'] / n - try: - p_upper_c2 = range_list[p_upper_range_i + 1]['n_min'] / n - except IndexError: - p_upper_c2 = p_upper_range['n_max'] / n - upper_bound = ((p_upper - p_upper_c1) / (p_upper_c2 - p_upper_c1)) * (p_upper_a2 - p_upper_a1) + p_upper_a1 - - # Calculate the standard error of the median - standard_error_median = 0.5 * (upper_bound - lower_bound) - - # Calculate the margin of error at the 90% confidence level - margin_of_error = 1.645 * standard_error_median + # if moe is included, can use simulation to estimate margin of error for median + if "moe" in list(range_list[0].keys()): + simulation_results = [] + for i in range(simulations): + # For each range calculate its min and max value along the universe's scale + simulated_n = [] + cumulative_n = 0 + for range_ in range_list: + range_['n_min'] = cumulative_n + se = range_['moe'] / 1.645 # convert moe to se + nn = round(numpy.random.normal(range_['n'], se)) # use moe to introduce randomness into number in bin + nn = int(nn) # clean it up + cumulative_n += nn + range_['n_max'] = cumulative_n + range_['n_new'] = nn + simulated_n.append(nn) + + # What is the total number of observations in the universe? + n = sum([d['n'] for d in range_list]) + + # What is the estimated midpoint of the n? 
+ n_midpoint = n / 2.0 + + # Now use those to determine which group contains the midpoint. + n_midpoint_range = next(d for d in range_list if n_midpoint >= d['n_min'] and n_midpoint <= d['n_max']) + + # How many households in the midrange are needed to reach the midpoint? + n_midrange_gap = n_midpoint - n_midpoint_range['n_min'] + + # What is the proportion of the group that would be needed to get the midpoint? + n_midrange_gap_percent = n_midrange_gap / n_midpoint_range['n_new'] # n_midpoint_range['n'] + + # Apply this proportion to the width of the midrange + n_midrange_gap_adjusted = (n_midpoint_range['max'] - n_midpoint_range['min']) * n_midrange_gap_percent + + # Estimate the median + estimated_median = n_midpoint_range['min'] + n_midrange_gap_adjusted + + # median in last bin but no jam value input + if math.isnan(n_midpoint_range['max']) and not jam_values: + # don't need warning because jam value won't appear + # warnings.warn("", JamValueWarning) + simulation_results.append(math.nan) # return nan + # median in first bin but not jam value input + elif math.isnan(n_midpoint_range['min']) and not jam_values: + # don't need warning because jam value won't appear + # warnings.warn("", JamValueWarning) + simulation_results.append(math.nan) # return nan + # median in last bin and jam value given + elif math.isnan(n_midpoint_range['max']): # already exhausted the no jam value case + estimated_median = jam_values[1] + simulation_results.append(estimated_median) + # median in first bin and jam value given + elif math.isnan(n_midpoint_range['min']): # already exhausted the no jam value case + estimated_median = jam_values[0] + simulation_results.append(estimated_median) + # median is fine + else: + simulation_results.append(estimated_median) + # if jam values are involved, doesn't make sense to take a mean, return None + if math.nan in simulation_results: + warnings.warn("", JamValueMissingWarning) + estimated_median = None + margin_of_error = None + elif jam_values and jam_values[0] in simulation_results: + warnings.warn("", JamValueResultMOEWarning) + estimated_median = None + margin_of_error = None + elif jam_values and jam_values[1] in simulation_results: + warnings.warn("", JamValueResultMOEWarning) + estimated_median = None + margin_of_error = None + # if jam values aren't involved, proceed as normal + else: + estimated_median = numpy.nanmean(simulation_results) # mean of medians + t1 = numpy.nanquantile(simulation_results, 0.95) - estimated_median # go from confidence interval to margin of error + t2 = estimated_median - numpy.nanquantile(simulation_results, 0.05) # go from confidence interval to margin of error + margin_of_error = max(t1, t2) # if asymmetrical take bigger one, conservative + # get the median and moe via census approximation + else: + # For each range calculate its min and max value along the universe's scale + cumulative_n = 0 + for range_ in range_list: + range_['n_min'] = cumulative_n + cumulative_n += range_['n'] + range_['n_max'] = cumulative_n + + # What is the total number of observations in the universe? + n = sum([d['n'] for d in range_list]) + + # What is the estimated midpoint of the n? + n_midpoint = n / 2.0 + + # Now use those to determine which group contains the midpoint. + n_midpoint_range = next(d for d in range_list if n_midpoint >= d['n_min'] and n_midpoint <= d['n_max']) + + # How many households in the midrange are needed to reach the midpoint? 
+ n_midrange_gap = n_midpoint - n_midpoint_range['n_min'] + + # What is the proportion of the group that would be needed to get the midpoint? + n_midrange_gap_percent = n_midrange_gap / n_midpoint_range['n'] + + # Apply this proportion to the width of the midrange + n_midrange_gap_adjusted = (n_midpoint_range['max'] - n_midpoint_range['min']) * n_midrange_gap_percent + + # Estimate the median + estimated_median = n_midpoint_range['min'] + n_midrange_gap_adjusted + + if not jam_values and math.isnan(n_midpoint_range['max']): + # Let's throw a warning + warnings.warn("", JamValueMissingWarning) + return None, None + + if not jam_values and math.isnan(n_midpoint_range['min']): + # Let's throw a warning + warnings.warn("", JamValueMissingWarning) + return None, None + + if math.isnan(n_midpoint_range['max']): + warnings.warn("", JamValueResultWarning) + if len(jam_values) < 2: + raise InputError(f"An upper jam value input is needed") + else: + estimated_median = jam_values[1] + + elif math.isnan(n_midpoint_range['min']): + warnings.warn("", JamValueResultWarning) + estimated_median = jam_values[0] + + # If there's no sampling percentage, we can't calculate a margin of error + if not sampling_percentage: + # Let's throw a warning, but still return the median + warnings.warn("", SamplingPercentageWarning) + return estimated_median, None + + # Get the standard error for this dataset + standard_error = (design_factor * math.sqrt(((100 - sampling_percentage) / (n * sampling_percentage)) * (50**2))) / 100 + + # Use the standard error to calculate the p values + p_lower = .5 - standard_error + p_upper = .5 + standard_error + + # Estimate the p_lower and p_upper n values + p_lower_n = n * p_lower + p_upper_n = n * p_upper + + # Find the ranges the p values fall within + try: + p_lower_range_i, p_lower_range = next( + (i, d) for i, d in enumerate(range_list) + if p_lower_n >= d['n_min'] and p_lower_n <= d['n_max'] + ) + except StopIteration: + raise DataError(f"The n's lower p value {p_lower_n} does not fall within a data range.") + + try: + p_upper_range_i, p_upper_range = next( + (i, d) for i, d in enumerate(range_list) + if p_upper_n >= d['n_min'] and p_upper_n <= d['n_max'] + ) + except StopIteration: + raise DataError(f"The n's upper p value {p_upper_n} does not fall within a data range.") + + # Use these values to estimate the lower bound of the confidence interval + p_lower_a1 = p_lower_range['min'] + try: + p_lower_a2 = range_list[p_lower_range_i + 1]['min'] + except IndexError: + p_lower_a2 = p_lower_range['max'] + p_lower_c1 = p_lower_range['n_min'] / n + try: + p_lower_c2 = range_list[p_lower_range_i + 1]['n_min'] / n + except IndexError: + p_lower_c2 = p_lower_range['n_max'] / n + lower_bound = ((p_lower - p_lower_c1) / (p_lower_c2 - p_lower_c1)) * (p_lower_a2 - p_lower_a1) + p_lower_a1 + + # Same for the upper bound + p_upper_a1 = p_upper_range['min'] + try: + p_upper_a2 = range_list[p_upper_range_i + 1]['min'] + except IndexError: + p_upper_a2 = p_upper_range['max'] + p_upper_c1 = p_upper_range['n_min'] / n + try: + p_upper_c2 = range_list[p_upper_range_i + 1]['n_min'] / n + except IndexError: + p_upper_c2 = p_upper_range['n_max'] / n + upper_bound = ((p_upper - p_upper_c1) / (p_upper_c2 - p_upper_c1)) * (p_upper_a2 - p_upper_a1) + p_upper_a1 + + # Calculate the standard error of the median + standard_error_median = 0.5 * (upper_bound - lower_bound) + + # Calculate the margin of error at the 90% confidence level + margin_of_error = 1.645 * standard_error_median + + if 
math.isnan(margin_of_error):
+        margin_of_error = None
 
     # Return the result
     return estimated_median, margin_of_error
@@ -433,7 +545,8 @@ def approximate_mean(range_list, simulations=50, pareto=False):
             * n (int): The number of people, households or other units in the range
             * moe (float): The margin of error for n
         simulations (int): number of simulations to run, used to estimate margin of error. Defaults to 50.
-        pareto (logical): Set True to use the Pareto distribution to simulate values in upper bin. Set False to assume a uniform distribution. Pareto is often appropriate for income. Defaults to False.
+        pareto (logical): Set True to use the Pareto distribution to simulate values in upper bin.
+            Set False to assume a uniform distribution. Pareto is often appropriate for income. Defaults to False.
 
     Returns:
         A two-item tuple with the mean followed by the approximated margin of error.
@@ -485,6 +598,7 @@ def approximate_mean(range_list, simulations=50, pareto=False):
                 se = range_['moe'] / 1.645  # convert moe to se
                 nn = round(numpy.random.normal(range_['n'], se))  # use moe to introduce randomness into number in bin
                 nn = int(nn)  # clean it up
+                nn = max(0, nn)  # don't allow negative values
                 simulated_values.append(numpy.random.uniform(range_['min'], range_['max'], size=(1, nn)).sum())  # draw random values within the bin, assume uniform
                 simulated_n.append(nn)
         # a special case to handle the last bin
@@ -493,6 +607,7 @@ def approximate_mean(range_list, simulations=50, pareto=False):
                 se = last['moe'] / 1.645  # convert moe to se
                 nn = round(numpy.random.normal(last['n'], se))  # use moe to introduce randomness into number in bin
                 nn = int(nn)  # clean it up
+                nn = max(0, nn)  # don't allow negative values
                 simulated_values.append(numpy.random.pareto(a=alpha_hat, size=(1, nn)).sum())  # draw random values within the bin, assume uniform
                 simulated_n.append(nn)
             # use uniform otherwise
diff --git a/census_data_aggregator/exceptions.py b/census_data_aggregator/exceptions.py
index 3639ac5..39a626f 100644
--- a/census_data_aggregator/exceptions.py
+++ b/census_data_aggregator/exceptions.py
@@ -9,9 +9,40 @@ class DataError(Exception):
     pass
 
 
+class InputError(Exception):
+    """
+    Raised if jam value input is invalid.
+    """
+    pass
+
+
 class SamplingPercentageWarning(Warning):
     """
     Warns that you have not provided a design factor.
     """
     def __str__(self):
         return """A margin of error cannot be calculated unless you provide a sampling percentage."""
+
+
+class JamValueMissingWarning(Warning):
+    """
+    Warns that you have not provided jam values when they are needed.
+    """
+    def __str__(self):
+        return """The median falls in the upper or lower bracket. Provide a jam value for more information."""
+
+
+class JamValueResultWarning(Warning):
+    """
+    Warns that the estimate is a jam value.
+    """
+    def __str__(self):
+        return """The median falls in the upper or lower bracket. A jam value is returned as the median estimate."""
+
+
+class JamValueResultMOEWarning(Warning):
+    """
+    Warns that the estimate is not provided because averaging jam values is inappropriate.
+    """
+    def __str__(self):
+        return """The median falls in the upper or lower bracket.
No estimate is returned since an average over jam values is inappropriate.""" diff --git a/test.py b/test.py index 44b5e2b..c683c5e 100644 --- a/test.py +++ b/test.py @@ -4,9 +4,14 @@ import unittest import census_data_aggregator import numpy +import math from census_data_aggregator.exceptions import ( DataError, - SamplingPercentageWarning + InputError, + SamplingPercentageWarning, + JamValueMissingWarning, + JamValueResultWarning, + JamValueResultMOEWarning ) @@ -115,7 +120,171 @@ def test_median(self): census_data_aggregator.approximate_median(household_income_la_2013_acs1, sampling_percentage=2.5), (70065.84266055046, 3850.680465234964) ) + + household_income_2013_acs5 = [ + dict(min=2499, max=9999, n=186), + dict(min=10000, max=14999, n=78), + dict(min=15000, max=19999, n=98), + dict(min=20000, max=24999, n=287), + dict(min=25000, max=29999, n=142), + dict(min=30000, max=34999, n=90), + dict(min=35000, max=39999, n=107), + dict(min=40000, max=44999, n=104), + dict(min=45000, max=49999, n=178), + dict(min=50000, max=59999, n=106), + dict(min=60000, max=74999, n=177), + dict(min=75000, max=99999, n=262), + dict(min=100000, max=124999, n=77), + dict(min=125000, max=149999, n=100), + dict(min=150000, max=199999, n=58), + dict(min=200000, max=250001, n=18) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, jam_values=None, simulations=50) + self.assertAlmostEqual(estimate, 42211.096153846156) + self.assertAlmostEqual(moe, 4706.522752733644) + + with self.assertWarns(JamValueMissingWarning): + household_income_2013_acs5 = [ + dict(min=None, max=9999, n=186), + dict(min=10000, max=14999, n=1), + dict(min=15000, max=19999, n=8), + dict(min=20000, max=24999, n=7), + dict(min=25000, max=29999, n=2), + dict(min=30000, max=34999, n=90), + dict(min=35000, max=39999, n=7), + dict(min=40000, max=44999, n=4), + dict(min=45000, max=49999, n=8), + dict(min=50000, max=59999, n=6), + dict(min=60000, max=74999, n=7), + dict(min=75000, max=99999, n=2), + dict(min=100000, max=124999, n=7), + dict(min=125000, max=149999, n=10), + dict(min=150000, max=199999, n=8), + dict(min=200000, max=250001, n=8) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, jam_values=None, simulations=50) + self.assertEqual(estimate, None) + self.assertEqual(moe, None) + + with self.assertWarns(JamValueResultWarning): + household_income_2013_acs5 = [ + dict(min=None, max=9999, n=186), + dict(min=10000, max=14999, n=1), + dict(min=15000, max=19999, n=8), + dict(min=20000, max=24999, n=7), + dict(min=25000, max=29999, n=2), + dict(min=30000, max=34999, n=90), + dict(min=35000, max=39999, n=7), + dict(min=40000, max=44999, n=4), + dict(min=45000, max=49999, n=8), + dict(min=50000, max=59999, n=6), + dict(min=60000, max=74999, n=7), + dict(min=75000, max=99999, n=2), + dict(min=100000, max=124999, n=7), + dict(min=125000, max=149999, n=10), + dict(min=150000, max=199999, n=8), + dict(min=200000, max=250001, n=8) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599]) + self.assertEqual(estimate, 2599) + self.assertEqual(moe, None) + + with self.assertWarns(JamValueResultWarning): + household_income_2013_acs5 = [ + dict(min=None, max=9999, n=6), + dict(min=10000, max=14999, n=1), + dict(min=15000, max=19999, n=8), + dict(min=20000, max=24999, n=7), + 
dict(min=25000, max=29999, n=2), + dict(min=30000, max=34999, n=90), + dict(min=35000, max=39999, n=7), + dict(min=40000, max=44999, n=4), + dict(min=45000, max=49999, n=8), + dict(min=50000, max=59999, n=6), + dict(min=60000, max=74999, n=7), + dict(min=75000, max=99999, n=2), + dict(min=100000, max=124999, n=7), + dict(min=125000, max=149999, n=10), + dict(min=150000, max=199999, n=8), + dict(min=200000, max=None, n=186) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599, 200001]) + self.assertEqual(estimate, 200001) + self.assertEqual(moe, None) + + with self.assertRaises(InputError): + household_income_2013_acs5 = [ + dict(min=None, max=9999, n=6), + dict(min=10000, max=14999, n=1), + dict(min=15000, max=19999, n=8), + dict(min=20000, max=24999, n=7), + dict(min=25000, max=29999, n=2), + dict(min=30000, max=34999, n=90), + dict(min=35000, max=39999, n=7), + dict(min=40000, max=44999, n=4), + dict(min=45000, max=49999, n=8), + dict(min=50000, max=59999, n=6), + dict(min=60000, max=74999, n=7), + dict(min=75000, max=99999, n=2), + dict(min=100000, max=124999, n=7), + dict(min=125000, max=149999, n=10), + dict(min=150000, max=199999, n=8), + dict(min=200000, max=None, n=186) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599]) + + with self.assertWarns(JamValueResultMOEWarning): + household_income_2013_acs5 = [ + dict(min=None, max=9999, n=6, moe=1), + dict(min=10000, max=14999, n=1, moe=1), + dict(min=15000, max=19999, n=8, moe=1), + dict(min=20000, max=24999, n=7, moe=1), + dict(min=25000, max=29999, n=2, moe=1), + dict(min=30000, max=34999, n=90, moe=8), + dict(min=35000, max=39999, n=7, moe=1), + dict(min=40000, max=44999, n=4, moe=1), + dict(min=45000, max=49999, n=8, moe=1), + dict(min=50000, max=59999, n=6, moe=1), + dict(min=60000, max=74999, n=7, moe=1), + dict(min=75000, max=99999, n=2, moe=0.25), + dict(min=100000, max=124999, n=7, moe=1), + dict(min=125000, max=149999, n=10, moe=1), + dict(min=150000, max=199999, n=8, moe=1), + dict(min=200000, max=None, n=186, moe=10) + ] + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=[2599, 200001]) + self.assertEqual(estimate, None) + self.assertEqual(moe, None) + + with self.assertWarns(JamValueMissingWarning): + estimate, moe = census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=None) + self.assertEqual(estimate, None) + self.assertEqual(moe, None) + + household_income_2013_acs5 = [ + dict(min=None, max=9999, n=6, moe=1), + dict(min=10000, max=14999, n=1, moe=1), + dict(min=15000, max=19999, n=8, moe=1), + dict(min=20000, max=24999, n=7, moe=1), + dict(min=25000, max=29999, n=2, moe=1), + dict(min=30000, max=34999, n=900, moe=8), + dict(min=35000, max=39999, n=7, moe=1), + dict(min=40000, max=44999, n=4, moe=1), + dict(min=45000, max=49999, n=8, moe=1), + dict(min=50000, max=59999, n=6, moe=1), + dict(min=60000, max=74999, n=7, moe=1), + dict(min=75000, max=99999, n=2, moe=0.25), + dict(min=100000, max=124999, n=7, moe=1), + dict(min=125000, max=149999, n=10, moe=1), + dict(min=150000, max=199999, n=8, moe=1), + dict(min=200000, max=None, n=18, moe=10) + ] + numpy.random.seed(711355) + estimate, moe = 
census_data_aggregator.approximate_median(household_income_2013_acs5, design_factor=1, sampling_percentage=5*2.5, simulations=50, jam_values=None) + self.assertAlmostEqual(estimate, 32644.851568840597) + self.assertAlmostEqual(moe, 33.0019114324823) + with self.assertWarns(SamplingPercentageWarning): m, moe = census_data_aggregator.approximate_median(household_income_Los_Angeles_County_2013_acs5, design_factor=1.5) self.assertTrue(moe == None)
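For readers following the new simulation branch in `approximate_median`, this is roughly how the per-simulation medians are condensed into an estimate and margin of error. A minimal sketch using the same numpy calls as the patch; the input list here is illustrative, not real data.

.. code-block:: python

    import numpy

    # hypothetical medians collected from each simulation run
    simulation_results = [32640.2, 32652.7, 32648.3, 32644.9, 32643.1]

    estimate = numpy.nanmean(simulation_results)  # mean of the simulated medians
    upper_gap = numpy.nanquantile(simulation_results, 0.95) - estimate
    lower_gap = estimate - numpy.nanquantile(simulation_results, 0.05)
    margin_of_error = max(upper_gap, lower_gap)  # if asymmetrical, keep the wider side as a conservative choice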