import polars as pl
import svy
svy.Estimate.PRINT_WIDTH = 89
svy.Estimate.DECIMALS = 42025 Medical Expenditure Panel Survey, Household Component (MEPS-HC)
Reproducing the 2025 MEPS Workshop in Python (svy)
In this document, we use Python and the svy library to reproduce the 2025 MEPS Workshop (originally conducted in R, see GitHub Repository).
To follow along and run the code locally, download the following 2023 MEPS public-use files:
- 2023 Full-Year Consolidated (HC-251)
- 2023 Office-Based Medical Provider Visits (HC-248G)
- 2023 Medical Conditions (HC-249)
- 2023 CLNK: Condition–Event Link (HC-248I)
uv
The steps below give you a fast, reproducible setup for running the MEPS workshop with svy.
- If you do not have
uv, install it. See instructions at https://docs.astral.sh/uv/getting-started/installation/ - restart your shell so
uvis on PATH, and from your root project run:uv venv -p 3.13 - Initialize the environment with
uv init - Add the requirements:
uv add svy[report]
Since it’s a simple analysis, you can store the datasets and code in the root project folder.
Imports used throughout the workshop and some general settings.
Part I — Estimates for National Health Care for the U.S. Civilian Non-Institutionalized Population, 2023
Exploration of the Relevant Data from the 2023 MEPS-HC
First, we read the 2023 Full-Year Consolidated file from local storage using svy.read_stata. Then we subset the variables needed for this tutorial, derive two helper variables, and run quick QC checks.
Read the Stata file via svy
fyc23 = svy.read_stata(path="./h251.dta")
print(fyc23)shape: (18_919, 1_374)
┌────────────┬───────┬────────────┬───────┬───┬──────────────┬──────────────┬────────┬────────┐
│ DUID ┆ PID ┆ DUPERSID ┆ PANEL ┆ … ┆ FAMWT23C ┆ SAQWT23F ┆ VARSTR ┆ VARPSU │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ str ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞════════════╪═══════╪════════════╪═══════╪═══╪══════════════╪══════════════╪════════╪════════╡
│ 2.790002e6 ┆ 101.0 ┆ 2790002101 ┆ 27.0 ┆ … ┆ 11158.817826 ┆ 13221.315673 ┆ 2019.0 ┆ 1.0 │
│ 2.790002e6 ┆ 102.0 ┆ 2790002102 ┆ 27.0 ┆ … ┆ 11158.817826 ┆ 0.0 ┆ 2019.0 ┆ 1.0 │
│ 2.790004e6 ┆ 101.0 ┆ 2790004101 ┆ 27.0 ┆ … ┆ 28540.745942 ┆ 29999.277476 ┆ 2084.0 ┆ 1.0 │
│ 2.790006e6 ┆ 101.0 ┆ 2790006101 ┆ 27.0 ┆ … ┆ 10821.040689 ┆ 11144.513916 ┆ 2113.0 ┆ 1.0 │
│ 2.790006e6 ┆ 102.0 ┆ 2790006102 ┆ 27.0 ┆ … ┆ 10821.040689 ┆ 0.0 ┆ 2113.0 ┆ 1.0 │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ 2.819784e6 ┆ 105.0 ┆ 2819784105 ┆ 28.0 ┆ … ┆ 5126.030033 ┆ 0.0 ┆ 2015.0 ┆ 1.0 │
│ 2.819788e6 ┆ 101.0 ┆ 2819788101 ┆ 28.0 ┆ … ┆ 3608.933864 ┆ 4902.751776 ┆ 2005.0 ┆ 1.0 │
│ 2.819792e6 ┆ 101.0 ┆ 2819792101 ┆ 28.0 ┆ … ┆ 26238.599825 ┆ 0.0 ┆ 2012.0 ┆ 3.0 │
│ 2.819793e6 ┆ 101.0 ┆ 2819793101 ┆ 28.0 ┆ … ┆ 15780.213332 ┆ 0.0 ┆ 2004.0 ┆ 1.0 │
│ 2.819793e6 ┆ 102.0 ┆ 2819793102 ┆ 28.0 ┆ … ┆ 15780.213332 ┆ 0.0 ┆ 2004.0 ┆ 1.0 │
└────────────┴───────┴────────────┴───────┴───┴──────────────┴──────────────┴────────┴────────┘
Subset to columns used in the tutorial using polars.
fyc23_sub = fyc23.select(
["AGELAST", "TOTEXP23", "DUPERSID", "VARSTR", "VARPSU", "PERWT23F"]
)
print(fyc23_sub.head())shape: (5, 6)
┌─────────┬──────────┬────────────┬────────┬────────┬──────────────┐
│ AGELAST ┆ TOTEXP23 ┆ DUPERSID ┆ VARSTR ┆ VARPSU ┆ PERWT23F │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 │
╞═════════╪══════════╪════════════╪════════╪════════╪══════════════╡
│ 58.0 ┆ 646.0 ┆ 2790002101 ┆ 2019.0 ┆ 1.0 ┆ 11664.426815 │
│ 27.0 ┆ 1894.0 ┆ 2790002102 ┆ 2019.0 ┆ 1.0 ┆ 32212.113596 │
│ 49.0 ┆ 986.0 ┆ 2790004101 ┆ 2084.0 ┆ 1.0 ┆ 21944.142826 │
│ 75.0 ┆ 1312.0 ┆ 2790006101 ┆ 2113.0 ┆ 1.0 ┆ 10328.00953 │
│ 23.0 ┆ 0.0 ┆ 2790006102 ┆ 2113.0 ┆ 1.0 ┆ 17430.521357 │
└─────────┴──────────┴────────────┴────────┴────────┴──────────────┘
Derive helper variables:
has_exp: indicator for any total expenditureage_cat: <65 vs 65+
fyc23x = fyc23_sub.with_columns(
has_exp=pl.col("TOTEXP23").gt(pl.lit(0)),
age_cat=pl.when(pl.col("AGELAST") < 65)
.then(pl.lit("<65"))
.otherwise(pl.lit("65+")),
)
print(fyc23x.head())shape: (5, 8)
┌─────────┬──────────┬────────────┬────────┬────────┬──────────────┬─────────┬─────────┐
│ AGELAST ┆ TOTEXP23 ┆ DUPERSID ┆ VARSTR ┆ VARPSU ┆ PERWT23F ┆ has_exp ┆ age_cat │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ bool ┆ str │
╞═════════╪══════════╪════════════╪════════╪════════╪══════════════╪═════════╪═════════╡
│ 58.0 ┆ 646.0 ┆ 2790002101 ┆ 2019.0 ┆ 1.0 ┆ 11664.426815 ┆ true ┆ <65 │
│ 27.0 ┆ 1894.0 ┆ 2790002102 ┆ 2019.0 ┆ 1.0 ┆ 32212.113596 ┆ true ┆ <65 │
│ 49.0 ┆ 986.0 ┆ 2790004101 ┆ 2084.0 ┆ 1.0 ┆ 21944.142826 ┆ true ┆ <65 │
│ 75.0 ┆ 1312.0 ┆ 2790006101 ┆ 2113.0 ┆ 1.0 ┆ 10328.00953 ┆ true ┆ 65+ │
│ 23.0 ┆ 0.0 ┆ 2790006102 ┆ 2113.0 ┆ 1.0 ┆ 17430.521357 ┆ false ┆ <65 │
└─────────┴──────────┴────────────┴────────┴────────┴──────────────┴─────────┴─────────┘
QC 1: Two-way counts of derived variables
qc_exp_by_age = fyc23x.group_by(["has_exp", "age_cat"]).agg(pl.len())
print(qc_exp_by_age)shape: (4, 3)
┌─────────┬─────────┬───────┐
│ has_exp ┆ age_cat ┆ len │
│ --- ┆ --- ┆ --- │
│ bool ┆ str ┆ u32 │
╞═════════╪═════════╪═══════╡
│ true ┆ 65+ ┆ 4486 │
│ false ┆ 65+ ┆ 158 │
│ true ┆ <65 ┆ 11778 │
│ false ┆ <65 ┆ 2497 │
└─────────┴─────────┴───────┘
QC 2: Expenditure ranges by has_exp
qc_exp = fyc23x.group_by(["has_exp"]).agg(
pl.col("TOTEXP23").min().alias("min"),
pl.col("TOTEXP23").max().alias("max"),
)
print(qc_exp)shape: (2, 3)
┌─────────┬─────┬──────────┐
│ has_exp ┆ min ┆ max │
│ --- ┆ --- ┆ --- │
│ bool ┆ f64 ┆ f64 │
╞═════════╪═════╪══════════╡
│ true ┆ 1.0 ┆ 574675.0 │
│ false ┆ 0.0 ┆ 0.0 │
└─────────┴─────┴──────────┘
QC 3: Age ranges by age_cat
qc_age = fyc23x.group_by(["age_cat"]).agg(
pl.col("AGELAST").min().alias("min"),
pl.col("AGELAST").max().alias("max"),
)
print(qc_age)shape: (2, 3)
┌─────────┬──────┬──────┐
│ age_cat ┆ min ┆ max │
│ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 │
╞═════════╪══════╪══════╡
│ <65 ┆ 0.0 ┆ 64.0 │
│ 65+ ┆ 65.0 ┆ 85.0 │
└─────────┴──────┴──────┘
Estimation of Expenses
Sample design
First, we are going to define the sample
from svy import Sample, Design
fyc23_design = Design(stratum="VARSTR", psu="VARPSU", wgt="PERWT23F")
fyc23_sample = Sample(data=fyc23x, design=fyc23_design)
print(fyc23_sample)╭─────────────────────────── Sample ────────────────────────────╮ │ Survey Data: │ │ Number of rows: 18919 │ │ Number of columns: 11 │ │ Number of strata: 105 │ │ Number of PSUs: 262 │ │ │ │ Survey Design: │ │ │ │ Field Value │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ Row index svy_row_index │ │ Stratum VARSTR │ │ PSU VARPSU │ │ SSU None │ │ Weight PERWT23F │ │ With replacement False │ │ Prob None │ │ Hit None │ │ MOS None │ │ Population size None │ │ Replicate weights None │ │ │ ╰───────────────────────────────────────────────────────────────╯
Checking for singletons
If some strata only have one PSU, the estimation will fail. We can check for singletons (strata with one PSU) as follows
# List of strata with only one PSU
fyc23_sample.singleton.detected()[]
The sample does not have singletons.
Overall expenses (national totals)
tot_exp = fyc23_sample.estimation.total(y="TOTEXP23")
print(tot_exp)╭────────────────────────────────────── Estimate ───────────────────────────────────────╮ │ Estimation of TOTAL │ │ Method: TAYLOR │ │ Y: TOTEXP23 │ │ Number of strata: 105 │ │ Number of PSUs: 262 │ │ Degrees of freedom: 157 │ │ Alpha: 0.05 │ │ │ │ Field Value │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ est 2,504,715,663,922.5674 │ │ se 72,782,531,974.9032 │ │ lci 2,360,956,395,505.0254 │ │ uci 2,648,474,932,340.1094 │ │ cv (%) 2.9058 │ │ │ ╰───────────────────────────────────────────────────────────────────────────────────────╯
Percentage of persons with an expense
has_exp = fyc23_sample.estimation.prop(y="has_exp")
has_exp.set_decimals(6)
print(has_exp)╭────────────────────────────────────── Estimate ───────────────────────────────────────╮ │ Estimation of PROP │ │ Method: TAYLOR │ │ Y: has_exp │ │ Number of strata: 105 │ │ Number of PSUs: 262 │ │ Degrees of freedom: 157 │ │ Alpha: 0.05 │ │ │ │ has_exp est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 1 0.855509 0.005029 0.845289 0.865162 0.587875 │ │ 0 0.144491 0.005029 0.134838 0.154711 3.480716 │ │ │ ╰───────────────────────────────────────────────────────────────────────────────────────╯
Mean expense per person
avg_exp = fyc23_sample.estimation.mean(y="TOTEXP23")
print(avg_exp)╭────────────────────────────────────── Estimate ───────────────────────────────────────╮ │ Estimation of MEAN │ │ Method: TAYLOR │ │ Y: TOTEXP23 │ │ Number of strata: 105 │ │ Number of PSUs: 262 │ │ Degrees of freedom: 157 │ │ Alpha: 0.05 │ │ │ │ est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 7,487.2616 173.3692 7,144.8246 7,829.6986 2.3155 │ │ │ ╰───────────────────────────────────────────────────────────────────────────────────────╯
Mean expense per person, for people with expenditures)
avg_exp1 = fyc23_sample.estimation.mean(y="TOTEXP23")
print(avg_exp1)╭────────────────────────────────────── Estimate ───────────────────────────────────────╮ │ Estimation of MEAN │ │ Method: TAYLOR │ │ Y: TOTEXP23 │ │ Number of strata: 105 │ │ Number of PSUs: 262 │ │ Degrees of freedom: 157 │ │ Alpha: 0.05 │ │ │ │ est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 7,487.2616 173.3692 7,144.8246 7,829.6986 2.3155 │ │ │ ╰───────────────────────────────────────────────────────────────────────────────────────╯
Subset to people with income
has_exp_sample = fyc23_sample.wrangling.filter_records(svy.col("has_exp"))
print(has_exp_sample)╭─────────────────────────── Sample ────────────────────────────╮ │ Survey Data: │ │ Number of rows: 16264 │ │ Number of columns: 11 │ │ Number of strata: 105 │ │ Number of PSUs: 262 │ │ │ │ Survey Design: │ │ │ │ Field Value │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ Row index svy_row_index │ │ Stratum VARSTR │ │ PSU VARPSU │ │ SSU None │ │ Weight PERWT23F │ │ With replacement False │ │ Prob None │ │ Hit None │ │ MOS None │ │ Population size None │ │ Replicate weights None │ │ │ ╰───────────────────────────────────────────────────────────────╯
Mean expense per person with an expense
avg_exp1 = has_exp_sample.estimation.mean(y="TOTEXP23")
print(avg_exp1)╭────────────────────────────────────── Estimate ───────────────────────────────────────╮ │ Estimation of MEAN │ │ Method: TAYLOR │ │ Y: TOTEXP23 │ │ Number of strata: 105 │ │ Number of PSUs: 262 │ │ Degrees of freedom: 157 │ │ Alpha: 0.05 │ │ │ │ est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ 8,751.8209 187.2480 8,381.9707 9,121.6712 2.1395 │ │ │ ╰───────────────────────────────────────────────────────────────────────────────────────╯
Mean expense per person with an expense, by age category
avg_exp1_by_age_cat = has_exp_sample.estimation.mean(
y="TOTEXP23", by="age_cat"
)
print(avg_exp1_by_age_cat)╭────────────────────────────────────── Estimate ───────────────────────────────────────╮ │ Estimation of MEAN │ │ Method: TAYLOR │ │ Y: TOTEXP23 │ │ By: age_cat │ │ Number of strata: 105 │ │ Number of PSUs: 262 │ │ Degrees of freedom: 157 │ │ Alpha: 0.05 │ │ │ │ age_cat est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ <65 6,853.3919 196.1686 6,465.9217 7,240.8620 2.8624 │ │ 65+ 16,000.1324 517.8130 14,977.3538 17,022.9111 3.2363 │ │ │ ╰───────────────────────────────────────────────────────────────────────────────────────╯
Median expense per person with an expense, by age category
from svy import QuantileMethod
median_exp1_by_age_cat = has_exp_sample.estimation.median(
y="TOTEXP23", by="age_cat", q_method=QuantileMethod.HIGHER
)
print(median_exp1_by_age_cat)╭────────────────────────────────────── Estimate ───────────────────────────────────────╮ │ Estimation of MEDIAN │ │ Method: TAYLOR │ │ Y: TOTEXP23 │ │ By: age_cat │ │ Number of strata: 105 │ │ Number of PSUs: 262 │ │ Degrees of freedom: 157 │ │ Alpha: 0.05 │ │ │ │ age_cat est se lci uci cv (%) │ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │ <65 1,751.0000 55.6909 1,653.0000 1,873.0000 3.1805 │ │ 65+ 6,080.0000 223.0166 5,759.6119 6,644.0598 3.6680 │ │ │ ╰───────────────────────────────────────────────────────────────────────────────────────╯
Part II - Link the MEPS-HC Medical Conditions File to the Office-Based Medical Visits File for Estimation
Merging Relevant Files
Load Stata data files using svy.read_dta
ob23 = svy.read_stata(path="./h248g.dta")
cond23 = svy.read_stata(path="./h249.dta")
clnk23 = svy.read_stata(path="./h248if1.dta")
fyc23 = svy.read_stata(path="./h251.dta")Keep only needed variables
ob23x = ob23.select(["DUPERSID", "EVNTIDX", "OBXP23X"])
cond23x = cond23.select(
["DUPERSID", "CONDIDX", "ICD10CDX", "CCSR1X", "CCSR2X", "CCSR3X", "CCSR4X"]
)
fyc23_sub = fyc23.select(
["DUPERSID", "AGELAST", "VARSTR", "VARPSU", "PERWT23F"]
)Prepare data for estimation
cond_combos = cond23.group_by(["CCSR1X", "CCSR2X", "CCSR3X", "CCSR4X"]).agg(
pl.len()
)
print(cond_combos.sort(by=["CCSR1X", "CCSR2X", "CCSR3X", "CCSR4X"]))shape: (229, 5)
┌────────┬────────┬────────┬────────┬─────┐
│ CCSR1X ┆ CCSR2X ┆ CCSR3X ┆ CCSR4X ┆ len │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ u32 │
╞════════╪════════╪════════╪════════╪═════╡
│ -15 ┆ -1 ┆ -1 ┆ -1 ┆ 396 │
│ BLD000 ┆ -1 ┆ -1 ┆ -1 ┆ 127 │
│ BLD000 ┆ CIR000 ┆ -1 ┆ -1 ┆ 1 │
│ BLD000 ┆ NEO000 ┆ -1 ┆ -1 ┆ 5 │
│ BLD001 ┆ -1 ┆ -1 ┆ -1 ┆ 71 │
│ … ┆ … ┆ … ┆ … ┆ … │
│ SYM013 ┆ -1 ┆ -1 ┆ -1 ┆ 561 │
│ SYM014 ┆ -1 ┆ -1 ┆ -1 ┆ 313 │
│ SYM015 ┆ -1 ┆ -1 ┆ -1 ┆ 223 │
│ SYM016 ┆ -1 ┆ -1 ┆ -1 ┆ 681 │
│ SYM017 ┆ -1 ┆ -1 ┆ -1 ┆ 460 │
└────────┴────────┴────────┴────────┴─────┘
cols = ["CCSR1X", "CCSR2X", "CCSR3X", "CCSR4X"]
cancer = cond23x.filter(
(
pl.any_horizontal(pl.col(cols).str.contains("NEO"))
| pl.any_horizontal(pl.col(cols) == "FAC006")
)
& pl.all_horizontal(pl.col(cols) != "NEO073")
)cancers = cancer.group_by(
["ICD10CDX", "CCSR1X", "CCSR2X", "CCSR3X", "CCSR4X"]
).agg(pl.len())
print(cancers.sort(by=["ICD10CDX", "CCSR1X", "CCSR2X", "CCSR3X", "CCSR4X"]))shape: (16, 6)
┌──────────┬────────┬────────┬────────┬────────┬─────┐
│ ICD10CDX ┆ CCSR1X ┆ CCSR2X ┆ CCSR3X ┆ CCSR4X ┆ len │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ str ┆ u32 │
╞══════════╪════════╪════════╪════════╪════════╪═════╡
│ -15 ┆ BLD000 ┆ NEO000 ┆ -1 ┆ -1 ┆ 5 │
│ -15 ┆ NEO000 ┆ -1 ┆ -1 ┆ -1 ┆ 273 │
│ C18 ┆ NEO015 ┆ -1 ┆ -1 ┆ -1 ┆ 44 │
│ C34 ┆ NEO022 ┆ -1 ┆ -1 ┆ -1 ┆ 42 │
│ C43 ┆ NEO025 ┆ -1 ┆ -1 ┆ -1 ┆ 119 │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ C73 ┆ NEO050 ┆ -1 ┆ -1 ┆ -1 ┆ 49 │
│ C95 ┆ NEO064 ┆ -1 ┆ -1 ┆ -1 ┆ 46 │
│ D04 ┆ NEO028 ┆ -1 ┆ -1 ┆ -1 ┆ 97 │
│ D48 ┆ NEO072 ┆ -1 ┆ -1 ┆ -1 ┆ 58 │
│ D49 ┆ NEO072 ┆ -1 ┆ -1 ┆ -1 ┆ 78 │
└──────────┴────────┴────────┴────────┴────────┴─────┘
Many-to-many joins via CLNK crosswalk, then OB events
cancer_merged = (
cancer
.join(clnk23, on=["DUPERSID", "CONDIDX"], how="inner") # m:m join ok
.join(ob23x, on=["DUPERSID", "EVNTIDX"], how="inner") # m:m join ok
.with_columns(pl.lit(1).alias("ob_visit")) # indicator
)QC: Check EVENTYPE distribution
clnk_counts = (
clnk23.group_by("EVENTYPE").agg(pl.len().alias("n")).sort("EVENTYPE")
)
print(clnk_counts)
cm_counts = (
cancer_merged.group_by("EVENTYPE")
.agg(pl.len().alias("n"))
.sort("EVENTYPE")
)
print(cm_counts)shape: (6, 2)
┌──────────┬────────┐
│ EVENTYPE ┆ n │
│ --- ┆ --- │
│ f64 ┆ u32 │
╞══════════╪════════╡
│ 1.0 ┆ 145548 │
│ 2.0 ┆ 22177 │
│ 3.0 ┆ 4294 │
│ 4.0 ┆ 2153 │
│ 7.0 ┆ 9045 │
│ 8.0 ┆ 97941 │
└──────────┴────────┘
shape: (1, 2)
┌──────────┬──────┐
│ EVENTYPE ┆ n │
│ --- ┆ --- │
│ f64 ┆ u32 │
╞══════════╪══════╡
│ 1.0 ┆ 4362 │
└──────────┴──────┘
Example: Same event treating multiple cancers for same person
example_evts = cancer_merged.filter(pl.col("DUPERSID") == "2790405102")
print(example_evts)shape: (3, 13)
┌────────────┬───────────────┬──────────┬────────┬───┬──────────┬───────┬─────────┬──────────┐
│ DUPERSID ┆ CONDIDX ┆ ICD10CDX ┆ CCSR1X ┆ … ┆ EVENTYPE ┆ PANEL ┆ OBXP23X ┆ ob_visit │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ ┆ f64 ┆ f64 ┆ f64 ┆ i32 │
╞════════════╪═══════════════╪══════════╪════════╪═══╪══════════╪═══════╪═════════╪══════════╡
│ 2790405102 ┆ 2790405102007 ┆ C61 ┆ NEO039 ┆ … ┆ 1.0 ┆ 27.0 ┆ 184.59 ┆ 1 │
│ 2790405102 ┆ 2790405102005 ┆ D04 ┆ NEO028 ┆ … ┆ 1.0 ┆ 27.0 ┆ 715.29 ┆ 1 │
│ 2790405102 ┆ 2790405102006 ┆ D48 ┆ NEO072 ┆ … ┆ 1.0 ┆ 27.0 ┆ 715.29 ┆ 1 │
└────────────┴───────────────┴──────────┴────────┴───┴──────────┴───────┴─────────┴──────────┘
De-Duplicate by EVNTIDX Per Person
cancer_unique = cancer_merged.unique(
subset=["DUPERSID", "EVNTIDX"], keep="first", maintain_order=True
)Check the example person after de-duplication
print(cancer_unique.filter(pl.col("DUPERSID") == "2790405102"))shape: (2, 13)
┌────────────┬───────────────┬──────────┬────────┬───┬──────────┬───────┬─────────┬──────────┐
│ DUPERSID ┆ CONDIDX ┆ ICD10CDX ┆ CCSR1X ┆ … ┆ EVENTYPE ┆ PANEL ┆ OBXP23X ┆ ob_visit │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ ┆ f64 ┆ f64 ┆ f64 ┆ i32 │
╞════════════╪═══════════════╪══════════╪════════╪═══╪══════════╪═══════╪═════════╪══════════╡
│ 2790405102 ┆ 2790405102007 ┆ C61 ┆ NEO039 ┆ … ┆ 1.0 ┆ 27.0 ┆ 184.59 ┆ 1 │
│ 2790405102 ┆ 2790405102005 ┆ D04 ┆ NEO028 ┆ … ┆ 1.0 ┆ 27.0 ┆ 715.29 ┆ 1 │
└────────────┴───────────────┴──────────┴────────┴───┴──────────┴───────┴─────────┴──────────┘
Aggregate to person-Level
pers = (
cancer_unique.group_by("DUPERSID")
.agg(
pl.col("OBXP23X")
.sum()
.alias("pers_ob_exp"), # total cancer office-visit exp
pl.col("ob_visit")
.sum()
.alias("ob_visits"), # total number of cancer office visits
)
.with_columns(any_OB=pl.lit(1)) # indicator (all 1's in this subset)
)Revisit the example person
print(pers.filter(pl.col("DUPERSID") == "2790405102"))shape: (1, 4)
┌────────────┬─────────────┬───────────┬────────┐
│ DUPERSID ┆ pers_ob_exp ┆ ob_visits ┆ any_OB │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ i32 ┆ i32 │
╞════════════╪═════════════╪═══════════╪════════╡
│ 2790405102 ┆ 899.88 ┆ 2 ┆ 1 │
└────────────┴─────────────┴───────────┴────────┘
Left-join person-level cancer statistics onto FYC
fyc23_cancer = (
fyc23x.join(pers, on="DUPERSID", how="left")
# replace NA with 0 for the joined measures
.with_columns(pl.col(["ob_visits", "any_OB", "pers_ob_exp"]).fill_null(0))
)
print(fyc23_cancer)shape: (18_919, 11)
┌─────────┬──────────┬────────────┬────────┬───┬─────────┬─────────────┬───────────┬────────┐
│ AGELAST ┆ TOTEXP23 ┆ DUPERSID ┆ VARSTR ┆ … ┆ age_cat ┆ pers_ob_exp ┆ ob_visits ┆ any_OB │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ str ┆ f64 ┆ ┆ str ┆ f64 ┆ i32 ┆ i32 │
╞═════════╪══════════╪════════════╪════════╪═══╪═════════╪═════════════╪═══════════╪════════╡
│ 58.0 ┆ 646.0 ┆ 2790002101 ┆ 2019.0 ┆ … ┆ <65 ┆ 0.0 ┆ 0 ┆ 0 │
│ 27.0 ┆ 1894.0 ┆ 2790002102 ┆ 2019.0 ┆ … ┆ <65 ┆ 0.0 ┆ 0 ┆ 0 │
│ 49.0 ┆ 986.0 ┆ 2790004101 ┆ 2084.0 ┆ … ┆ <65 ┆ 0.0 ┆ 0 ┆ 0 │
│ 75.0 ┆ 1312.0 ┆ 2790006101 ┆ 2113.0 ┆ … ┆ 65+ ┆ 283.28 ┆ 3 ┆ 1 │
│ 23.0 ┆ 0.0 ┆ 2790006102 ┆ 2113.0 ┆ … ┆ <65 ┆ 0.0 ┆ 0 ┆ 0 │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ 11.0 ┆ 4585.0 ┆ 2819784105 ┆ 2015.0 ┆ … ┆ <65 ┆ 0.0 ┆ 0 ┆ 0 │
│ 37.0 ┆ 6820.0 ┆ 2819788101 ┆ 2005.0 ┆ … ┆ <65 ┆ 0.0 ┆ 0 ┆ 0 │
│ 24.0 ┆ 2549.0 ┆ 2819792101 ┆ 2012.0 ┆ … ┆ <65 ┆ 0.0 ┆ 0 ┆ 0 │
│ 22.0 ┆ 1680.0 ┆ 2819793101 ┆ 2004.0 ┆ … ┆ <65 ┆ 0.0 ┆ 0 ┆ 0 │
│ 50.0 ┆ 7802.0 ┆ 2819793102 ┆ 2004.0 ┆ … ┆ <65 ┆ 0.0 ┆ 0 ┆ 0 │
└─────────┴──────────┴────────────┴────────┴───┴─────────┴─────────────┴───────────┴────────┘
QC: Should match original FYC row count
fyc23x.height == fyc23_cancer.heightTrue