import numpy as np
import svy
from rich import print as rprint
from svy import load_dataset, Sample, Design
# Load the 2023 imaginary-country household sample
hld_data = load_dataset(name="hld_sample_wb_2023", limit=None)
# Design: strata from admin-1 area and urban/rural, enumeration areas as PSUs, household weights
hld_design = Design(stratum=("geo1", "urbrur"), psu="ea", wgt="hhweight")
hld_sample = Sample(data=hld_data, design=hld_design)
# Derive the household-level poverty line and a poverty-status indicator
hld_sample = hld_sample.wrangling.mutate(
    {
        # Poverty line: 1,800 per household member
        "hhpovline": svy.col("hhsize") * 1800,
        # 1 if total expenditure is below the household poverty line, else 0
        "pov_status": svy.when(svy.col("tot_exp") < svy.col("hhpovline")).then(1).otherwise(0),
    }
)

Categorical Analysis
Categorical data analysis spans descriptive techniques—contingency tables and cross-tabulations with design-adjusted tests—and model-based approaches for categorical outcomes (logistic, multinomial, loglinear, and mixed-effects GLMs; see Agresti (2013)).
In complex survey applications, ignoring stratification, clustering, or unequal weights can misstate uncertainty. This tutorial shows how to use svy to produce design-aware tabulations, cross-tabulations, and a t-test for group differences, with standard errors that respect the sample design.
The sample used throughout is defined in the setup code above, using the imaginary country household dataset (World Bank, 2023).
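The derived pov_status variable is simply an indicator for total expenditure falling below the household poverty line. As an unweighted sanity check outside svy, the same rule can be written directly with numpy (a minimal sketch with made-up values; the survey weights only come into play in the design-based analyses below):
import numpy as np

# Made-up values standing in for the hhsize and tot_exp columns
hhsize = np.array([3, 5, 2, 6])
tot_exp = np.array([6500, 8000, 4200, 9500])

hhpovline = hhsize * 1800                       # household-level poverty line
pov_status = (tot_exp < hhpovline).astype(int)  # 1 = below the line, 0 = at or above
print(pov_status)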
Tabulation
Let’s tabulate admin-1 (first-level administrative unit) for the imaginary country.
hld_admin1_tab = hld_sample.categorical.tabulate(rowvar="geo1")
print(hld_admin1_tab)
Table (Type=One-Way, Alpha=0.05)

Row      Estimate   Std Err       CV    Lower    Upper
──────────────────────────────────────────────────────
geo_01     0.1370    0.0041   0.0296   0.1292   0.1452
geo_02     0.0956    0.0038   0.0402   0.0883   0.1034
geo_03     0.1126    0.0033   0.0296   0.1062   0.1193
geo_04     0.1481    0.0047   0.0319   0.1391   0.1577
geo_05     0.0843    0.0031   0.0373   0.0783   0.0907
geo_06     0.0747    0.0032   0.0425   0.0687   0.0812
geo_07     0.0727    0.0040   0.0557   0.0651   0.0810
geo_08     0.0411    0.0039   0.0938   0.0342   0.0494
geo_09     0.1148    0.0040   0.0351   0.1071   0.1229
geo_10     0.1191    0.0047   0.0398   0.1101   0.1287
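The Estimate column is the design-weighted share of households in each admin-1 area. Leaving aside the standard errors, which require the strata and PSU information, the point estimate itself is just a weighted proportion; a minimal numpy sketch with made-up weights:
import numpy as np

# Made-up illustration of the weighted share for one admin-1 area
geo1 = np.array(["geo_01", "geo_02", "geo_01", "geo_03"])
hhweight = np.array([120.5, 98.2, 110.0, 131.7])

share_geo_01 = hhweight[geo1 == "geo_01"].sum() / hhweight.sum()
print(share_geo_01)  # point estimate only; tabulate() also reports design-based SEs and CIs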
By default, tabulate() produces proportions. You can change the output scale via units to get counts or percentages:
from svy import TableUnits
# Counts
hld_admin1_tab_count = hld_sample.categorical.tabulate(rowvar="geo1", units=TableUnits.COUNT)
print("Table with counts")
print(hld_admin1_tab_count)
# Percentages
hld_admin1_tab_percent = hld_sample.categorical.tabulate(rowvar="geo1", units=TableUnits.PERCENT)
print("Table with percentages:")
print(hld_admin1_tab_percent)
Table with counts
Table (Type=One-Way, Alpha=0.05)

Row         Estimate      Std Err       CV         Lower         Upper
──────────────────────────────────────────────────────────────────────
geo_01   342733.0000   10680.8501   0.0312   321714.4057   363751.5943
geo_02   239113.0000   10113.6242   0.0423   219210.6363   259015.3637
geo_03   281600.0000    8476.1378   0.0301   264920.0074   298279.9926
geo_04   370596.0000   12852.9351   0.0347   345303.0107   395888.9893
geo_05   210960.0000    8080.7698   0.0383   195058.0427   226861.9573
geo_06   186992.0000    8198.4041   0.0438   170858.5530   203125.4470
geo_07   181766.0000   10639.1062   0.0585   160829.5526   202702.4474
geo_08   102927.0000    9977.5362   0.0969    83292.4407   122561.5593
geo_09   287141.0000   10651.1400   0.0371   266180.8715   308101.1285
geo_10   297927.0000   12817.4855   0.0430   272703.7711   323150.2289
Table with percentages:
Table (Type=One-Way, Alpha=0.05)

Row      Estimate   Std Err       CV     Lower     Upper
────────────────────────────────────────────────────────
geo_01    13.6997    0.4269   0.0312   12.8595   14.5399
geo_02     9.5578    0.4043   0.0423    8.7623   10.3533
geo_03    11.2561    0.3388   0.0301   10.5894   11.9228
geo_04    14.8134    0.5138   0.0347   13.8024   15.8245
geo_05     8.4325    0.3230   0.0383    7.7968    9.0681
geo_06     7.4744    0.3277   0.0438    6.8295    8.1193
geo_07     7.2655    0.4253   0.0585    6.4287    8.1024
geo_08     4.1142    0.3988   0.0969    3.3294    4.8990
geo_09    11.4776    0.4257   0.0371   10.6398   12.3154
geo_10    11.9087    0.5123   0.0430   10.9005   12.9169
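The three scales carry the same information: percentages are the proportions multiplied by 100, and each weighted count divided by the total weighted count reproduces the proportion. A quick check using the geo_01 row from the tables above:
import numpy as np

prop = 0.1370        # proportion (default scale)
pct = 13.6997        # percentage (units=TableUnits.PERCENT)
count = 342733.0     # weighted count (units=TableUnits.COUNT)
total = 2_501_755.0  # sum of the ten weighted counts in the table above

print(np.isclose(pct, prop * 100, atol=0.01))       # percent = 100 * proportion
print(np.isclose(count / total, prop, atol=0.001))  # count share = proportion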
To express counts on an arbitrary total, use count_total (useful for scaled headcounts while preserving shares):
# Scale counts so the total sums to 1,000
hld_admin1_tab_n = hld_sample.categorical.tabulate(rowvar="geo1", count_total=1_000)
print(hld_admin1_tab_n)
Table (Type=One-Way, Alpha=0.05)

Row      Estimate   Std Err       CV      Lower      Upper
──────────────────────────────────────────────────────────
geo_01   136.9970    4.2693   0.0312   128.5955   145.3986
geo_02    95.5781    4.0426   0.0423    87.6227   103.5335
geo_03   112.5610    3.3881   0.0301   105.8937   119.2283
geo_04   148.1344    5.1376   0.0347   138.0243   158.2445
geo_05    84.3248    3.2300   0.0383    77.9685    90.6811
geo_06    74.7443    3.2771   0.0438    68.2955    81.1932
geo_07    72.6554    4.2527   0.0585    64.2867    81.0241
geo_08    41.1419    3.9882   0.0969    33.2936    48.9902
geo_09   114.7758    4.2575   0.0371   106.3977   123.1540
geo_10   119.0872    5.1234   0.0430   109.0050   129.1694
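Each scaled estimate is the corresponding proportion multiplied by count_total, so the shares across areas are unchanged; for geo_01:
prop_geo_01 = 0.1370
count_total = 1_000
print(prop_geo_01 * count_total)  # 137.0, matching the 136.9970 above up to rounding of the proportion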
T-test
To test whether the design-based poverty rate differs from a hypothesized value of 25 percent, run a one-sample t-test on pov_status:
pov_status_mean_h0 = hld_sample.categorical.ttest(y="pov_status", mean_h0=0.25)
print(pov_status_mean_h0)
t-test results
One-sample t-test
Y = 'pov_status'
H₀: μ = 0.2500

Estimate   Std Err       CV    Lower    Upper
─────────────────────────────────────────────
  0.2288    0.0140   0.0613   0.2012   0.2564

Test statistic
      t          df     p(<)     p(>)     p(≠)
──────────────────────────────────────────────
-1.5141   7999.0000   0.0650   0.9350   0.1300
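The reported statistic follows the usual one-sample construction: t is the difference between the estimate and the hypothesized mean divided by the design-based standard error, and the two-sided p-value doubles the smaller tail probability. A quick check with scipy, using the rounded figures printed above:
from scipy.stats import t as t_dist

est, se, mu0, df = 0.2288, 0.0140, 0.25, 7999
t_stat = (est - mu0) / se                  # ≈ -1.514, close to the -1.5141 above
p_less = t_dist.cdf(t_stat, df)            # ≈ 0.065, the one-sided p(<)
p_two_sided = 2 * min(p_less, 1 - p_less)  # ≈ 0.130, the p(≠) column
print(round(t_stat, 4), round(p_less, 4), round(p_two_sided, 4))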