Download the notebook here! Interactive online version: colab

Parameter Sensitivity

Every measurement model has tuning parameters that influence the treatment effect estimate. Subclassification requires choosing the number of strata; nearest neighbour matching requires setting a caliper distance. How sensitive are results to these choices?

This notebook answers two questions:

  1. Single-seed sensitivity — How does the estimate change as we sweep a tuning parameter?

  2. Sensitivity with uncertainty — Are the observed patterns robust to sampling variation, or just noise?

We reuse the A/A test design from the model-selection demo configuration (true effect = 0), so that any deviation from 0 reflects estimator behavior, not a real treatment effect.

Initial setup

[1]:
import copy
import os
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from impact_engine_measure import measure_impact, load_results
from impact_engine_measure.core.validation import load_config
from online_retail_simulator import simulate
[2]:
# Number of Monte Carlo replications used in Step 4.
# CI can lower it through the N_REPS environment variable to speed up execution;
# when the variable is unset the notebook runs the full 25 replications.
N_REPS = int(os.environ.get("N_REPS", "25"))
if N_REPS < 1:
    raise ValueError(f"N_REPS must be a positive integer, got {N_REPS}")

# Every artifact written by this notebook (temporary configs, job results) lands here.
output_path = Path("output/demo_parameter_sensitivity")
output_path.mkdir(parents=True, exist_ok=True)

Step 1 — Product catalog

All parameter sweeps use the same product catalog.

[3]:
# Load the shared catalog configuration and write a copy into the output
# directory; the simulator is then pointed at that copy.
catalog_source = Path("configs/demo_model_selection_catalog.yaml")
with catalog_source.open() as f:
    catalog_config = yaml.safe_load(f)

tmp_catalog = output_path / "catalog_config.yaml"
with tmp_catalog.open("w") as f:
    yaml.dump(catalog_config, f, default_flow_style=False)

# Run the catalog job once and pull its "products" table.
catalog_job = simulate(str(tmp_catalog), job_id="catalog")
products = catalog_job.load_df("products")

print(f"Generated {len(products)} products")
products.head()
Generated 1000 products
[3]:
product_identifier category price
0 B1P4DZHDS9 Electronics 686.37
1 B1SE4QSNG7 Toys & Games 80.75
2 BXTPQIDT5C Food & Beverage 42.02
3 B3F1ZMC8Q6 Food & Beverage 33.42
4 B2NQRBTF0Y Toys & Games 27.52

Step 2 — Configuration

[4]:
# A/A design: there is no treatment effect by construction, so the reference value is 0.
true_te = 0

# Base configuration shared by every run; individual runs override its MEASUREMENT section.
config_path = "configs/demo_model_selection.yaml"
base_config = load_config(config_path)
[5]:
def run_with_override(base_config, measurement_override, storage_url, job_id, source_seed=None):
    """Run ``measure_impact()`` on a copy of ``base_config`` with MEASUREMENT replaced.

    The base configuration is deep-copied, so the caller's object is left untouched.
    The modified copy is written as ``config_<job_id>.yaml`` under ``storage_url`` and
    that file is what gets passed to ``measure_impact()``.

    Args:
        base_config: Configuration mapping to start from.
        measurement_override: Value stored under the ``"MEASUREMENT"`` key.
        storage_url: Directory for the temporary YAML; also forwarded to ``measure_impact()``.
        job_id: Identifier used in the YAML file name and forwarded to ``measure_impact()``.
        source_seed: When not ``None``, written to ``DATA.SOURCE.CONFIG.seed`` so that
            Monte Carlo replications use a different data-generating seed.

    Returns:
        The full MeasureJobResult, giving access to both ``impact_results`` and
        ``transformed_metrics``.
    """
    run_config = copy.deepcopy(base_config)
    run_config["MEASUREMENT"] = measurement_override
    if source_seed is not None:
        run_config["DATA"]["SOURCE"]["CONFIG"]["seed"] = source_seed

    run_config_file = Path(storage_url) / f"config_{job_id}.yaml"
    # Create the file's own parent (not just storage_url) before writing.
    run_config_file.parent.mkdir(parents=True, exist_ok=True)
    with run_config_file.open("w") as handle:
        yaml.dump(run_config, handle, default_flow_style=False)

    return load_results(measure_impact(str(run_config_file), storage_url, job_id=job_id))

Step 3 — Parameter sensitivity (single seed)

For a given model and data, how sensitive is the treatment effect estimate to tuning parameters? We sweep one parameter at a time while keeping everything else fixed.

3a. Subclassification: n_strata

More strata means finer partitioning of the covariate space. This can improve precision but may leave strata without common support.

[6]:
# Requested numbers of strata to sweep, with everything else held fixed.
n_strata_values = [2, 3, 5, 10, 20, 50, 100]
subclass_estimates = []
strata_used = []
strata_dropped = []
mean_revenue = None  # filled from the first run; denominator for relative error

for n in n_strata_values:
    subclass_params = {
        "treatment_column": "enriched",
        "covariate_columns": ["price"],
        "n_strata": n,
        "estimand": "att",
        "dependent_variable": "revenue",
    }
    result = run_with_override(
        base_config,
        {"MODEL": "subclassification", "PARAMS": subclass_params},
        str(output_path),
        f"subclass_strata_{n}",
    )
    estimates = result.impact_results["data"]["impact_estimates"]

    # Mean revenue is taken once, from the first run's transformed metrics.
    if mean_revenue is None:
        mean_revenue = result.transformed_metrics["revenue"].mean()

    subclass_estimates.append(estimates["treatment_effect"])
    strata_used.append(estimates["n_strata"])
    strata_dropped.append(estimates["n_strata_dropped"])

# Errors are measured against the known true effect of the A/A design.
subclass_abs_errors = [abs(est - true_te) for est in subclass_estimates]
subclass_rel_errors = [err / mean_revenue * 100 for err in subclass_abs_errors]

subclass_sensitivity = pd.DataFrame(
    {
        "n_strata (requested)": n_strata_values,
        "Strata Used": strata_used,
        "Strata Dropped": strata_dropped,
        "Treatment Effect": subclass_estimates,
        "Absolute Error": subclass_abs_errors,
        "Relative Error (%)": subclass_rel_errors,
    }
)

print("Subclassification: n_strata Sensitivity")
print(f"Mean revenue: {mean_revenue:.2f} (used as denominator for relative error)")
print("-" * 90)
print(subclass_sensitivity.to_string(index=False, float_format="{:.4f}".format))
Subclassification: n_strata Sensitivity
Mean revenue: 36.35 (used as denominator for relative error)
------------------------------------------------------------------------------------------
 n_strata (requested)  Strata Used  Strata Dropped  Treatment Effect  Absolute Error  Relative Error (%)
                    2            2               0            4.3435          4.3435             11.9484
                    3            3               0            3.2505          3.2505              8.9417
                    5            5               0            6.8515          6.8515             18.8474
                   10           10               0            4.1249          4.1249             11.3469
                   20           20               0            3.9686          3.9686             10.9170
                   50           50               0            6.7603          6.7603             18.5965
                  100          100               0            9.8478          9.8478             27.0899
[7]:
from notebook_support import plot_parameter_sensitivity

# Single-seed sweep: one estimate per requested n_strata, plotted against the true effect.
subclass_plot_kwargs = {
    "param_values": n_strata_values,
    "estimates": subclass_estimates,
    "true_effect": true_te,
    "xlabel": "Number of Strata (n_strata)",
    "ylabel": "Treatment Effect",
    "title": "Subclassification: Sensitivity to n_strata",
}
plot_parameter_sensitivity(**subclass_plot_kwargs)
../_images/methodology_demo_parameter_sensitivity_12_0.png

3b. Nearest neighbour matching: caliper

The caliper controls the maximum allowed distance between a treated unit and its matched control. Smaller values enforce tighter matches but may discard units, while larger values allow more matches with worse balance.

[8]:
# Caliper values to sweep, with everything else held fixed.
caliper_values = [0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0]
matching_estimates = []
n_matched_att_list = []

for cal in caliper_values:
    matching_params = {
        "treatment_column": "enriched",
        "covariate_columns": ["price"],
        "dependent_variable": "revenue",
        "caliper": cal,
        "replace": True,
        "ratio": 1,
    }
    result = run_with_override(
        base_config,
        {"MODEL": "nearest_neighbour_matching", "PARAMS": matching_params},
        str(output_path),
        f"matching_caliper_{cal}",
    )
    run_data = result.impact_results["data"]
    estimates = run_data["impact_estimates"]
    summary = run_data["model_summary"]

    matching_estimates.append(estimates["att"])
    n_matched_att_list.append(summary["n_matched_att"])

# Errors are measured against the known true effect; relative error reuses the
# mean revenue computed in the subclassification sweep.
matching_abs_errors = [abs(est - true_te) for est in matching_estimates]
matching_rel_errors = [err / mean_revenue * 100 for err in matching_abs_errors]

matching_sensitivity = pd.DataFrame(
    {
        "Caliper": caliper_values,
        "N Matched (ATT)": n_matched_att_list,
        "Treatment Effect (ATT)": matching_estimates,
        "Absolute Error": matching_abs_errors,
        "Relative Error (%)": matching_rel_errors,
    }
)

print("Nearest Neighbour Matching: Caliper Sensitivity")
print(f"Mean revenue: {mean_revenue:.2f} (used as denominator for relative error)")
print("-" * 90)
print(matching_sensitivity.to_string(index=False, float_format="{:.4f}".format))
Nearest Neighbour Matching: Caliper Sensitivity
Mean revenue: 36.35 (used as denominator for relative error)
------------------------------------------------------------------------------------------
 Caliper  N Matched (ATT)  Treatment Effect (ATT)  Absolute Error  Relative Error (%)
  0.0100              900                 12.5041         12.5041             34.3969
  0.0500              978                 19.5498         19.5498             53.7784
  0.1000              990                 16.7830         16.7830             46.1674
  0.2000              996                 14.1673         14.1673             38.9720
  0.5000             1000                 14.1106         14.1106             38.8162
  1.0000             1000                 14.1106         14.1106             38.8162
  2.0000             1000                 14.1106         14.1106             38.8162
[9]:
# Single-seed sweep: one ATT estimate per caliper value, plotted against the true effect.
matching_plot_kwargs = {
    "param_values": caliper_values,
    "estimates": matching_estimates,
    "true_effect": true_te,
    "xlabel": "Caliper",
    "ylabel": "Treatment Effect (ATT)",
    "title": "Nearest Neighbour Matching: Sensitivity to Caliper",
}
plot_parameter_sensitivity(**matching_plot_kwargs)
../_images/methodology_demo_parameter_sensitivity_15_0.png

Step 4 — Parameter sensitivity with uncertainty

Step 3 showed how estimates change with tuning parameters using a single seed. Here we add uncertainty bands by running each parameter value across multiple replications. This reveals whether apparent sensitivity is real or just noise.

[10]:
# Fixed-seed generator, so the list of replication seeds is the same on every run.
rng = np.random.default_rng(2024)
# One data-generating seed per replication, drawn from [0, 2**31) as plain Python ints.
mc_seeds = [int(draw) for draw in rng.integers(0, 2**31, size=N_REPS)]

4a. Subclassification: n_strata

[11]:
# Estimates collected per requested n_strata, one entry per replication.
n_strata_mc = {}
for n in n_strata_values:
    n_strata_mc[n] = []

for completed, seed in enumerate(mc_seeds, start=1):
    rep_index = completed - 1  # zero-based index used in the job id
    for n in n_strata_values:
        subclass_params = {
            "treatment_column": "enriched",
            "covariate_columns": ["price"],
            "n_strata": n,
            "estimand": "att",
            "dependent_variable": "revenue",
        }
        result = run_with_override(
            base_config,
            {"MODEL": "subclassification", "PARAMS": subclass_params},
            str(output_path),
            f"mc_subclass_{n}_rep{rep_index}",
            source_seed=seed,
        )
        rep_estimates = result.impact_results["data"]["impact_estimates"]
        n_strata_mc[n].append(rep_estimates["treatment_effect"])

    # Report progress after every fifth replication.
    if completed % 5 == 0:
        print(f"Subclassification sweep: {completed}/{N_REPS} replications")
Subclassification sweep: 5/25 replications
Subclassification sweep: 10/25 replications
Subclassification sweep: 15/25 replications
Subclassification sweep: 20/25 replications
Subclassification sweep: 25/25 replications
[12]:
from notebook_support import plot_parameter_sensitivity_mc

# Summarise the replications per n_strata: mean, sample standard deviation
# (ddof=1), and a band of one standard deviation either side of the mean.
strata_means = []
strata_stds = []
strata_lower = []
strata_upper = []
for n in n_strata_values:
    rep_estimates = n_strata_mc[n]
    center = np.mean(rep_estimates)
    spread = np.std(rep_estimates, ddof=1)
    strata_means.append(center)
    strata_stds.append(spread)
    strata_lower.append(center - spread)
    strata_upper.append(center + spread)

subclass_mc_plot_kwargs = {
    "param_values": n_strata_values,
    "mean_estimates": strata_means,
    "lower_band": strata_lower,
    "upper_band": strata_upper,
    "true_effect": true_te,
    "xlabel": "Number of Strata (n_strata)",
    "ylabel": "Treatment Effect",
    "title": f"Subclassification: n_strata Sensitivity ({N_REPS} replications)",
}
plot_parameter_sensitivity_mc(**subclass_mc_plot_kwargs)
../_images/methodology_demo_parameter_sensitivity_20_0.png

4b. Nearest Neighbour Matching: caliper

[13]:
# Estimates collected per caliper value, one entry per replication.
caliper_mc = {}
for cal in caliper_values:
    caliper_mc[cal] = []

for completed, seed in enumerate(mc_seeds, start=1):
    rep_index = completed - 1  # zero-based index used in the job id
    for cal in caliper_values:
        matching_params = {
            "treatment_column": "enriched",
            "covariate_columns": ["price"],
            "dependent_variable": "revenue",
            "caliper": cal,
            "replace": True,
            "ratio": 1,
        }
        result = run_with_override(
            base_config,
            {"MODEL": "nearest_neighbour_matching", "PARAMS": matching_params},
            str(output_path),
            f"mc_matching_{cal}_rep{rep_index}",
            source_seed=seed,
        )
        rep_estimates = result.impact_results["data"]["impact_estimates"]
        caliper_mc[cal].append(rep_estimates["att"])

    # Report progress after every fifth replication.
    if completed % 5 == 0:
        print(f"Matching sweep: {completed}/{N_REPS} replications")
Matching sweep: 5/25 replications
Matching sweep: 10/25 replications
Matching sweep: 15/25 replications
Matching sweep: 20/25 replications
Matching sweep: 25/25 replications
[14]:
# Summarise the replications per caliper: mean, sample standard deviation
# (ddof=1), and a band of one standard deviation either side of the mean.
cal_means = []
cal_stds = []
cal_lower = []
cal_upper = []
for c in caliper_values:
    rep_estimates = caliper_mc[c]
    center = np.mean(rep_estimates)
    spread = np.std(rep_estimates, ddof=1)
    cal_means.append(center)
    cal_stds.append(spread)
    cal_lower.append(center - spread)
    cal_upper.append(center + spread)

matching_mc_plot_kwargs = {
    "param_values": caliper_values,
    "mean_estimates": cal_means,
    "lower_band": cal_lower,
    "upper_band": cal_upper,
    "true_effect": true_te,
    "xlabel": "Caliper",
    "ylabel": "Treatment Effect (ATT)",
    "title": f"Nearest Neighbour Matching: Caliper Sensitivity ({N_REPS} replications)",
}
plot_parameter_sensitivity_mc(**matching_mc_plot_kwargs)
../_images/methodology_demo_parameter_sensitivity_23_0.png