import numpy as np
import svy
from rich import print as rprint
from svy import load_dataset, Sample, Design
rng = np.random.default_rng(12345)
hld_data = load_dataset(name="hld_sample_wb_2023", limit=None)
hld_design = Design(stratum=("geo1", "urbrur"), psu="ea", wgt="hhweight")
hld_sample = Sample(data=hld_data, design=hld_design)Parameters Estimation
The purpose of a survey is to draw inferences about the target population from a sampled subset. Because most surveys employ some combination of stratification, clustering, unequal probabilities of selection, and post-collection adjustments, treating observations as independent and identically distributed (i.i.d.) can misstate uncertainty. Proper variance estimation therefore uses the design information (weights, strata, PSUs) via Taylor linearization or replication methods (e.g., BRR, jackknife, bootstrap) to produce accurate standard errors and confidence intervals.
This tutorial demonstrates how to use svy’s APIs to produce point estimates for means, totals, proportions, and ratios, along with corresponding Taylor-based and replication-based measures of uncertainty.
Let’s define the sample using the imaginary country household dataset, World Bank (2023).
Taylor-based Estimation
svy
- Use
Sample.estimation.mean()to estimate means. - Use
Sample.estimation.prop()to estimate proportions. - Use
Sample.estimation.total()to estimate totals. - Use
Sample.estimation.ratio()to estimate ratios. - Use
Sample.estimation.median()to estimate medians.
Mean
Let’s estimate the average total household expenditure
tot_exp_mean = hld_sample.estimation.mean(y="tot_exp")
print(tot_exp_mean)╭────────────────────────────────────── Estimate ──────────────────────────────────────╮ │ Estimation of MEAN │ │ Method: TAYLOR │ │ Y: tot_exp │ │ Number of strata: 19 │ │ Number of PSUs: 320 │ │ Degrees of freedom: 301 │ │ Alpha: 0.05 │ │ │ │ est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 12,049 230 11,596 12,502 1.91 │ │ │ ╰──────────────────────────────────────────────────────────────────────────────────────╯
If we need the estimate by region
tot_exp_mean_admin1 = hld_sample.estimation.mean(y="tot_exp", by="geo1")
print(tot_exp_mean_admin1)╭────────────────────────────────────── Estimate ──────────────────────────────────────╮ │ Estimation of MEAN │ │ Method: TAYLOR │ │ Y: tot_exp │ │ By: geo1 │ │ Number of strata: 19 │ │ Number of PSUs: 320 │ │ Degrees of freedom: 301 │ │ Alpha: 0.05 │ │ │ │ geo1 est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ geo_01 14,288 675.5 12,959 15,617 4.73 │ │ geo_02 11,332 661.5 10,031 12,634 5.84 │ │ geo_03 12,892 716.4 11,483 14,302 5.56 │ │ geo_04 12,508 609.8 11,308 13,708 4.88 │ │ geo_05 10,289 623.9 9,061 11,516 6.06 │ │ geo_06 11,867 726.9 10,436 13,297 6.13 │ │ geo_07 11,255 924.2 9,436 13,074 8.21 │ │ geo_08 11,633 1,290.5 9,094 14,173 11.09 │ │ geo_09 12,265 690.6 10,906 13,625 5.63 │ │ geo_10 10,460 672.6 9,137 11,784 6.43 │ │ │ ╰──────────────────────────────────────────────────────────────────────────────────────╯
And by admin 1 and urbrur
tot_exp_mean_admin1_urbrur = hld_sample.estimation.mean(y="tot_exp", by=("geo1", "urbrur"))
print(tot_exp_mean_admin1_urbrur)╭────────────────────────────────────── Estimate ──────────────────────────────────────╮ │ Estimation of MEAN │ │ Method: TAYLOR │ │ Y: tot_exp │ │ By: geo1, urbrur │ │ Number of strata: 19 │ │ Number of PSUs: 320 │ │ Degrees of freedom: 301 │ │ Alpha: 0.05 │ │ │ │ geo1 urbrur est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ geo_01 Urban 14,288 675.5 12,959 15,617 4.73 │ │ geo_02 Rural 9,260 589.2 8,101 10,420 6.36 │ │ geo_02 Urban 17,756 1,590.2 14,626 20,885 8.96 │ │ geo_03 Rural 8,419 681.3 7,078 9,760 8.09 │ │ geo_03 Urban 15,249 990.8 13,300 17,199 6.50 │ │ geo_04 Rural 8,514 1,182.1 6,188 10,840 13.88 │ │ geo_04 Urban 14,265 732.7 12,823 15,707 5.14 │ │ geo_05 Rural 9,128 650.9 7,847 10,409 7.13 │ │ geo_05 Urban 13,147 1,394.8 10,402 15,892 10.61 │ │ geo_06 Rural 9,806 994.7 7,849 11,764 10.14 │ │ geo_06 Urban 14,787 1,050.4 12,720 16,854 7.10 │ │ geo_07 Rural 7,757 1,206.6 5,383 10,131 15.55 │ │ geo_07 Urban 14,353 1,163.5 12,064 16,643 8.11 │ │ geo_08 Rural 10,641 1,452.0 7,784 13,498 13.65 │ │ geo_08 Urban 17,918 3,553.5 10,925 24,911 19.83 │ │ geo_09 Rural 9,436 743 7,974 10,898 7.87 │ │ geo_09 Urban 16,115 1,126.8 13,898 18,333 6.99 │ │ geo_10 Rural 8,818 1,021.0 6,809 10,827 11.58 │ │ geo_10 Urban 11,776 893.9 10,017 13,535 7.59 │ │ │ ╰──────────────────────────────────────────────────────────────────────────────────────╯
Proportion
We now estimate the share of households living below the poverty line.
In this imaginary country, we assume thata the person-level poverty threshold is 1,800 (local currency units). For this tutorial, we convert it to a household threshold by multiplying 1,800 by the household size; a household is classified as poor if its welfare measure (e.g., total consumption or income) is below size × 1,800.
This household scaling is for illustration only. Applied analyses commonly use adult-equivalence scales, economies-of-scale adjustments, and sometimes regional price indices.
As shown below, we use mutate() to add the household poverty line and a binary poverty status column.
# Mutate the dataset to create household level poverty line
hld_sample = hld_sample.wrangling.mutate(
{
"hhpovline": svy.col("hhsize") * 1800,
"pov_status": svy.when(svy.col("tot_exp") < svy.col("hhpovline")).then(1).otherwise(0),
}
)
rprint(
hld_sample.show_data(
columns=[
"hid",
"geo1",
"urbrur",
"hhsize",
"tot_exp",
"hhpovline",
"pov_status",
"hhweight",
],
how="sample",
n=9,
rstate=rng,
),
)shape: (9, 8) ┌─────────────┬────────┬────────┬────────┬─────────┬───────────┬────────────┬────────────┐ │ hid ┆ geo1 ┆ urbrur ┆ hhsize ┆ tot_exp ┆ hhpovline ┆ pov_status ┆ hhweight │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i32 ┆ f64 │ ╞═════════════╪════════╪════════╪════════╪═════════╪═══════════╪════════════╪════════════╡ │ d64d5e077e1 ┆ geo_01 ┆ Urban ┆ 3 ┆ 34395 ┆ 5400 ┆ 0 ┆ 223.409342 │ │ 2107ab85eaf ┆ geo_09 ┆ Rural ┆ 10 ┆ 8923 ┆ 18000 ┆ 1 ┆ 335.179723 │ │ 48fd6bcf810 ┆ geo_01 ┆ Urban ┆ 5 ┆ 9280 ┆ 9000 ┆ 0 ┆ 358.183456 │ │ 3ecc96596f2 ┆ geo_10 ┆ Rural ┆ 2 ┆ 3341 ┆ 3600 ┆ 1 ┆ 163.528826 │ │ 1beaea460b2 ┆ geo_04 ┆ Urban ┆ 2 ┆ 9147 ┆ 3600 ┆ 0 ┆ 231.314029 │ │ 27690299d12 ┆ geo_03 ┆ Rural ┆ 3 ┆ 10006 ┆ 5400 ┆ 0 ┆ 326.828958 │ │ ed1ddf41362 ┆ geo_09 ┆ Rural ┆ 3 ┆ 6769 ┆ 5400 ┆ 0 ┆ 416.372327 │ │ 91b3bb7f73e ┆ geo_02 ┆ Rural ┆ 6 ┆ 7897 ┆ 10800 ┆ 1 ┆ 219.140606 │ │ 8127162c639 ┆ geo_04 ┆ Urban ┆ 7 ┆ 15050 ┆ 12600 ┆ 0 ┆ 269.028273 │ └─────────────┴────────┴────────┴────────┴─────────┴───────────┴────────────┴────────────┘
Let’s now estimate the poverty rates
hld_pov_ratio = hld_sample.estimation.prop(y="pov_status", deff=True, drop_nulls=True)
print(hld_pov_ratio)╭────────────────────────────────────── Estimate ──────────────────────────────────────╮ │ Estimation of PROP │ │ Method: TAYLOR │ │ Y: pov_status │ │ Number of strata: 19 │ │ Number of PSUs: 320 │ │ Degrees of freedom: 301 │ │ Alpha: 0.05 │ │ │ │ pov_status est se lci uci cv (%) deff │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 1 0.2288 0.01403 0.2023 0.2575 6.13 8.948 │ │ 0 0.7712 0.01403 0.7425 0.7977 1.82 8.948 │ │ │ ╰──────────────────────────────────────────────────────────────────────────────────────╯
Total
Let’s estimate the poverty counts
hld_pov_count = hld_sample.estimation.total(y="pov_status", deff=True, drop_nulls=True)
print(hld_pov_count)╭────────────────────────────────────── Estimate ──────────────────────────────────────╮ │ Estimation of TOTAL │ │ Method: TAYLOR │ │ Y: pov_status │ │ Number of strata: 19 │ │ Number of PSUs: 320 │ │ Degrees of freedom: 301 │ │ Alpha: 0.05 │ │ │ │ est se lci uci cv (%) deff │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 572,309 36,062.2 501,343 643,275 6.30 9.451 │ │ │ ╰──────────────────────────────────────────────────────────────────────────────────────╯
Median
Let’s estimate the median household total expenditure
hld_pov_count = hld_sample.estimation.median(y="tot_exp")
print(hld_pov_count)╭────────────────────────────────────── Estimate ──────────────────────────────────────╮ │ Estimation of MEDIAN │ │ Method: TAYLOR │ │ Y: tot_exp │ │ Number of strata: 19 │ │ Number of PSUs: 320 │ │ Degrees of freedom: 301 │ │ Alpha: 0.05 │ │ │ │ est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 10,268 220.8 9,844 10,712 2.15 │ │ │ ╰──────────────────────────────────────────────────────────────────────────────────────╯
Replicate-based Estimation
svy
- Use
Sample.estimation.rmean()to estimate means. - Use
Sample.estimation.rprop()to estimate proportions. - Use
Sample.estimation.rtotal()to estimate totals. - Use
Sample.estimation.rratio()to estimate ratios. - Use
Sample.estimation.rmedian()to estimate medians.
COMING SOON!!!
Next Steps
Next, we discuss the categorical data analysis methods: Categorical Data Analysis.