import numpy as np
import svy
svy.Estimate.PRINT_WIDTH = 95
from rich import print as rprint
rng = np.random.default_rng(12345)
hld_data = svy.load_dataset(name="hld_sample_wb_2023", limit=None)
hld_design = svy.Design(stratum=("geo1", "urbrur"), psu="ea", wgt="hhweight")
hld_sample = svy.Sample(data=hld_data, design=hld_design)Survey Parameter Estimation
Means, totals, proportions, ratios, and medians with proper variance estimation
survey estimation Python, Taylor linearization, replicate weights variance, survey mean estimation, survey proportion, design effect deff, complex survey analysis, svy library
The purpose of a survey is to draw inferences about the target population from a sampled subset. Because most surveys employ some combination of stratification, clustering, unequal probabilities of selection, and post-collection adjustments, treating observations as independent and identically distributed (i.i.d.) can misstate uncertainty.
Proper variance estimation uses design information (weights, strata, PSUs) via Taylor linearization or replication methods (BRR, jackknife, bootstrap) to produce accurate standard errors and confidence intervals.
This tutorial demonstrates how to use the svy library to produce point estimates for means, totals, proportions, ratios, and medians, along with corresponding Taylor-based and replication-based measures of uncertainty.
Setting Up the Sample
We’ll use the imaginary country household dataset from World Bank (2023):
Taylor-Based Estimation
Taylor linearization is the standard approach for variance estimation in complex surveys. It accounts for stratification, clustering, and unequal weights without requiring replicate weight columns.
svy
- Use
estimation.mean()to estimate means - Use
estimation.prop()to estimate proportions - Use
estimation.total()to estimate totals - Use
estimation.ratio()to estimate ratios - Use
estimation.median()to estimate medians
Estimating Means
Estimate the average total household expenditure:
tot_exp_mean = hld_sample.estimation.mean(y="tot_exp")
print(tot_exp_mean)╭─────────────────── Estimate: MEAN (TAYLOR) ───────────────────╮ │ │ │ est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 12,048.9638 229.9865 11,596.3788 12,501.5488 1.91 │ │ │ ╰───────────────────────────────────────────────────────────────╯
Estimate means by region using the by parameter:
tot_exp_mean_admin1 = hld_sample.estimation.mean(y="tot_exp", by="geo1")
print(tot_exp_mean_admin1)╭──────────────────────── Estimate: MEAN (TAYLOR) ─────────────────────────╮ │ │ │ geo1 est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ geo_01 14,288.0552 675.5344 12,958.6869 15,617.4235 4.73 │ │ geo_02 11,332.3640 661.5220 10,030.5703 12,634.1577 5.84 │ │ geo_03 12,892.2285 716.3554 11,482.5295 14,301.9274 5.56 │ │ geo_04 12,508.1484 609.8082 11,308.1212 13,708.1757 4.88 │ │ geo_05 10,288.6744 623.8749 9,060.9657 11,516.3831 6.06 │ │ geo_06 11,866.6574 726.9371 10,436.1349 13,297.1798 6.13 │ │ geo_07 11,255.0069 924.1644 9,436.3654 13,073.6484 8.21 │ │ geo_08 11,633.2305 1,290.4820 9,093.7212 14,172.7397 11.09 │ │ geo_09 12,265.4656 690.6409 10,906.3695 13,624.5616 5.63 │ │ geo_10 10,460.2557 672.6037 9,136.6547 11,783.8567 6.43 │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯
Estimate means by multiple grouping variables (geo1 and urbrur):
tot_exp_mean_admin1_urbrur = hld_sample.estimation.mean(y="tot_exp", by=("geo1", "urbrur"))
print(tot_exp_mean_admin1_urbrur)╭───────────────────────────── Estimate: MEAN (TAYLOR) ─────────────────────────────╮ │ │ │ geo1 urbrur est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ geo_01 Urban 14,288.0552 675.5344 12,958.6869 15,617.4235 4.73 │ │ geo_02 Rural 9,260.2908 589.2201 8,100.7783 10,419.8032 6.36 │ │ geo_02 Urban 17,755.5353 1,590.1787 14,626.2600 20,884.8105 8.96 │ │ geo_03 Rural 8,419.1458 681.3013 7,078.4289 9,759.8627 8.09 │ │ geo_03 Urban 15,249.3535 990.8078 13,299.5661 17,199.1409 6.50 │ │ geo_04 Rural 8,513.7156 1,182.0717 6,187.5445 10,839.8868 13.88 │ │ geo_04 Urban 14,265.1253 732.6696 12,823.3220 15,706.9285 5.14 │ │ geo_05 Rural 9,128.2380 650.8814 7,847.3837 10,409.0924 7.13 │ │ geo_05 Urban 13,146.8476 1,394.8279 10,401.9985 15,891.6966 10.61 │ │ geo_06 Rural 9,806.3231 994.6903 7,848.8953 11,763.7509 10.14 │ │ geo_06 Urban 14,786.6872 1,050.4005 12,719.6287 16,853.7456 7.10 │ │ geo_07 Rural 7,756.9065 1,206.5539 5,382.5573 10,131.2556 15.55 │ │ geo_07 Urban 14,353.0549 1,163.4578 12,063.5135 16,642.5962 8.11 │ │ geo_08 Rural 10,640.8358 1,451.9525 7,783.5725 13,498.0991 13.65 │ │ geo_08 Urban 17,918.1376 3,553.5066 10,925.2753 24,910.9999 19.83 │ │ geo_09 Rural 9,436.2621 743.0018 7,974.1263 10,898.3980 7.87 │ │ geo_09 Urban 16,115.2087 1,126.7975 13,897.8102 18,332.6071 6.99 │ │ geo_10 Rural 8,817.8009 1,021.0338 6,808.5323 10,827.0694 11.58 │ │ geo_10 Urban 11,776.0426 893.9162 10,016.9259 13,535.1593 7.59 │ │ │ ╰───────────────────────────────────────────────────────────────────────────────────╯
Estimating Proportions
We’ll estimate the share of households living below the poverty line.
In this imaginary country, assume the person-level poverty threshold is 1,800 local currency units. We convert it to a household threshold by multiplying by household size—a household is classified as poor if its welfare measure falls below this threshold.
This household scaling is for illustration only. Applied analyses commonly use adult-equivalence scales, economies-of-scale adjustments, and regional price indices.
First, create the poverty status variable using mutate():
# Create household poverty line and binary poverty status
hld_sample = hld_sample.wrangling.mutate(
{
"hhpovline": svy.col("hhsize") * 1800,
"pov_status": svy.when(svy.col("tot_exp") < svy.col("hhpovline")).then(1).otherwise(0),
}
)
rprint(
hld_sample.show_data(
columns=[
"hid",
"geo1",
"urbrur",
"hhsize",
"tot_exp",
"hhpovline",
"pov_status",
"hhweight",
],
how="sample",
n=9,
rstate=rng,
),
)shape: (9, 8) ┌─────────────┬────────┬────────┬────────┬─────────┬───────────┬────────────┬────────────┐ │ hid ┆ geo1 ┆ urbrur ┆ hhsize ┆ tot_exp ┆ hhpovline ┆ pov_status ┆ hhweight │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i32 ┆ f64 │ ╞═════════════╪════════╪════════╪════════╪═════════╪═══════════╪════════════╪════════════╡ │ c3c6fb8128b ┆ geo_02 ┆ Rural ┆ 5 ┆ 6629 ┆ 9000 ┆ 1 ┆ 351.538056 │ │ 48e50c183b8 ┆ geo_09 ┆ Rural ┆ 5 ┆ 7878 ┆ 9000 ┆ 1 ┆ 237.332226 │ │ a627906a158 ┆ geo_09 ┆ Urban ┆ 2 ┆ 7786 ┆ 3600 ┆ 0 ┆ 297.582281 │ │ a3ca4cb8183 ┆ geo_07 ┆ Rural ┆ 7 ┆ 8685 ┆ 12600 ┆ 1 ┆ 260.100388 │ │ 7a4dbad563f ┆ geo_08 ┆ Urban ┆ 7 ┆ 12135 ┆ 12600 ┆ 1 ┆ 333.592341 │ │ 2ecbbf32149 ┆ geo_07 ┆ Rural ┆ 12 ┆ 13366 ┆ 21600 ┆ 1 ┆ 262.554165 │ │ 1a442ef36ce ┆ geo_09 ┆ Rural ┆ 6 ┆ 12220 ┆ 10800 ┆ 0 ┆ 296.665283 │ │ 179a0a159d1 ┆ geo_04 ┆ Rural ┆ 4 ┆ 5080 ┆ 7200 ┆ 1 ┆ 279.200835 │ │ f0ab4534b79 ┆ geo_10 ┆ Rural ┆ 2 ┆ 13831 ┆ 3600 ┆ 0 ┆ 389.558197 │ └─────────────┴────────┴────────┴────────┴─────────┴───────────┴────────────┴────────────┘
Estimate poverty rates with the design effect (deff):
hld_pov_ratio = hld_sample.estimation.prop(y="pov_status", deff=True, drop_nulls=True)
print(hld_pov_ratio)╭───────────────────── Estimate: PROP (TAYLOR) ──────────────────────╮ │ │ │ pov_status est se lci uci cv (%) deff │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 1 0.2288 0.0140 0.2023 0.2575 6.13 8.9479 │ │ 0 0.7712 0.0140 0.7425 0.7977 1.82 8.9479 │ │ │ ╰────────────────────────────────────────────────────────────────────╯
Estimating Totals
Estimate the total count of poor households:
hld_pov_count = hld_sample.estimation.total(y="pov_status", deff=True, drop_nulls=True)
print(hld_pov_count)╭────────────────────────── Estimate: TOTAL (TAYLOR) ──────────────────────────╮ │ │ │ est se lci uci cv (%) deff │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 572,308.8749 36,062.1842 501,342.9490 643,274.8008 6.30 9.4508 │ │ │ ╰──────────────────────────────────────────────────────────────────────────────╯
Estimating Medians
Estimate the median household total expenditure:
hld_median_exp = hld_sample.estimation.median(y="tot_exp")
print(hld_median_exp)╭───────────────── Estimate: MEDIAN (TAYLOR) ──────────────────╮ │ │ │ est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 10,268.0000 220.7964 9,844.0000 10,712.1489 2.15 │ │ │ ╰──────────────────────────────────────────────────────────────╯
Replicate-Based Estimation
Replicate-based variance estimation uses replicate weights (bootstrap, BRR, or jackknife) instead of Taylor linearization. This approach is especially useful for non-linear statistics where linearization may be inaccurate.
Coming Soon: Examples of replicate-based estimation will be added in a future update.
Next Steps
Continue to Categorical Data Analysis to learn how to create weighted cross-tabulations and perform chi-square tests.
Ready for more analysis?
Learn categorical analysis in Categorical Data Analysis →