diff --git a/CHANGELOG.md b/CHANGELOG.md index 35297a3..e5288e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Full parity with the R `moderndive` and `infer` packages. +- **R argument parity (moderndive)**: `get_correlation(method=)` adds Spearman and Kendall rank correlations (plus `na_rm`); `get_regression_points(newdata=, ID=)` predicts on a held-out set and labels rows by a column; `get_regression_table(default_categorical_levels=)` can keep raw factor-level term names; `gg_parallel_slopes(alpha=)` sets point transparency. - **R argument parity (infer)**: `hypothesize(med=)`/`observe(med=)` add a median point null; `prop_test()` gains `z`, `correct` (Yates), `conf_int`, and `conf_level` (now matching R's `prop.test` — chi-square by default, with a Wilson-score CI for one proportion); `rep_slice_sample(prop=, weight_by=)` and `rep_sample_n(prob=)` add fractional and weighted sampling; `generate(variables=)` chooses which column to permute; `shade_p_value`/`shade_confidence_interval` gain `fill`; `visualize(dens_color=)` sets the theoretical-curve color. (`shade_p_value` now also honors `color`.) - **Chi-square goodness-of-fit** (closes the last `infer` vignette gap): diff --git a/doc/guides/regression.md b/doc/guides/regression.md index a50c81f..bc860dd 100644 --- a/doc/guides/regression.md +++ b/doc/guides/regression.md @@ -52,6 +52,16 @@ get_regression_points(model).head() # columns: ID, price, living_area, bedrooms, price_hat, residual ``` +Pass `ID=` to label rows by a column, and `newdata=` to predict on a held-out set +(a `residual` is included when the outcome is present in `newdata`): + +```{code-cell} python +train = houses.head(800) +test = houses.tail(200) +model_train = smf.ols("price ~ living_area + bedrooms", data=train.to_pandas()).fit() +get_regression_points(model_train, newdata=test).head() +``` + ## Model-fit summaries ```{code-cell} python @@ -66,6 +76,12 @@ get_correlation(houses, "price ~ living_area") # ≈ 0.759 # or: get_correlation(houses, x="living_area", y="price") ``` +Use `method=` for rank correlations (`"spearman"` or `"kendall"`): + +```{code-cell} python +get_correlation(houses, "price ~ living_area", method="spearman") +``` + Pass several predictors on the right-hand side to get one correlation per predictor (long by default; `wide=True` for one column each): diff --git a/moderndive/correlation.py b/moderndive/correlation.py index baf699a..07db223 100644 --- a/moderndive/correlation.py +++ b/moderndive/correlation.py @@ -35,29 +35,56 @@ def _parse_formula(formula: str) -> tuple[str, list[str]]: return lhs, predictors +_CORR_METHODS = ("pearson", "spearman", "kendall") + + +def _correlate(x_vals, y_vals, method: str) -> float: + """Correlation coefficient between two numpy arrays for the chosen method.""" + if method == "pearson": + return float(np.corrcoef(x_vals, y_vals)[0, 1]) + from scipy import stats + + fn = stats.spearmanr if method == "spearman" else stats.kendalltau + return float(fn(x_vals, y_vals).statistic) + + def get_correlation( data, formula: str | None = None, *, x: str | None = None, y: str | None = None, + method: str = "pearson", + na_rm: bool = True, wide: bool = False, quiet: bool = False, ) -> pl.DataFrame: - """Pearson correlation between an outcome and one or more predictors. + """Correlation between an outcome and one or more predictors. Mirrors ``moderndive::get_correlation``. Give the variables either as a formula (``"y ~ x"`` or ``"y ~ x1 + x2 + x3"``) or, for a single predictor, via ``x=`` and ``y=``. + ``method`` is ``"pearson"`` (default), ``"spearman"`` (rank correlation), or + ``"kendall"`` (rank concordance). ``na_rm`` drops rows with a null in either + column before computing (per predictor pair); set ``na_rm=False`` to keep + them (yielding ``nan`` if any are present). + With **one** predictor the result is a 1-row frame with a ``cor`` column. With **multiple** predictors the result is long by default — columns ``predictor`` and ``cor`` (one row each) — or pass ``wide=True`` for one - column per predictor. Rows with a null in either column are dropped per pair. + column per predictor. A short note points to a full pairwise correlation matrix when there are multiple predictors; silence it with ``quiet=True``. """ + if method not in _CORR_METHODS: + raise ValueError( + helpful_error( + f"method must be one of {_CORR_METHODS}, got {method!r}.", + "Use 'pearson' (linear), 'spearman' (rank), or 'kendall'.", + ) + ) df = data if isinstance(data, pl.DataFrame) else pl.from_pandas(data) if formula is not None: @@ -90,10 +117,10 @@ def get_correlation( cors: dict[str, float] = {} for predictor in predictors: - pair = df.select(predictor, outcome).drop_nulls() - cors[predictor] = float( - np.corrcoef(pair[predictor].to_numpy(), pair[outcome].to_numpy())[0, 1] - ) + pair = df.select(predictor, outcome) + if na_rm: + pair = pair.drop_nulls() + cors[predictor] = _correlate(pair[predictor].to_numpy(), pair[outcome].to_numpy(), method) if len(predictors) == 1: return pl.DataFrame({"cor": [cors[predictors[0]]]}) diff --git a/moderndive/modeling.py b/moderndive/modeling.py index d6b3bb4..c4834f5 100644 --- a/moderndive/modeling.py +++ b/moderndive/modeling.py @@ -125,11 +125,17 @@ def _predictor_vars(model) -> list[str]: return ordered +def _intercept_only_term(term: str) -> str: + """Map only the intercept term to ``intercept``; leave everything else raw.""" + return "intercept" if term in ("Intercept", "const") else term + + def get_regression_table( model, digits: int = 3, conf_level: float = 0.95, exponentiate: bool = False, + default_categorical_levels: bool = False, ) -> pl.DataFrame: """Tidy regression table: term, estimate, std_error, statistic, p_value, lower/upper_ci. @@ -140,6 +146,10 @@ def get_regression_table( coefficient estimate and its confidence interval as rate / odds ratios (``std_error``, ``statistic``, and ``p_value`` stay on the model's link scale, matching ``broom::tidy``). + + By default, categorical-predictor terms are prettified (e.g. + ``income[T.High income]`` → ``income: High income``). Pass + ``default_categorical_levels=True`` to keep the raw statsmodels term names. """ _check_model(model) conf = np.asarray(model.conf_int(alpha=1 - conf_level), dtype=float) @@ -147,9 +157,10 @@ def get_regression_table( lower, upper = conf[:, 0], conf[:, 1] if exponentiate: estimate, lower, upper = np.exp(estimate), np.exp(lower), np.exp(upper) + name_fn = _intercept_only_term if default_categorical_levels else _clean_term table = pl.DataFrame( { - "term": [_clean_term(t) for t in _term_names(model)], + "term": [name_fn(t) for t in _term_names(model)], "estimate": estimate, "std_error": np.asarray(model.bse, dtype=float), "statistic": np.asarray(model.tvalues, dtype=float), @@ -162,7 +173,9 @@ def get_regression_table( return table.with_columns(pl.col(numeric).round(digits)) -def get_regression_points(model, digits: int = 3) -> pl.DataFrame: +def get_regression_points( + model, digits: int = 3, *, newdata=None, ID: str | None = None +) -> pl.DataFrame: """Fitted values + residuals per observation (~ ``broom::augment``). Columns: ``ID``, the outcome, each original predictor, ``_hat``, @@ -172,8 +185,16 @@ def get_regression_points(model, digits: int = 3) -> pl.DataFrame: (``poly()``, ``scale()``, ``I()``) are shown as their original columns rather than leaking basis matrices. For GLMs, fitted values and residuals are on the response scale (e.g. probabilities for logistic regression). + + Pass ``newdata`` (a polars/pandas frame) to apply the model to new + observations: predictions are returned, plus a ``residual`` if the outcome is + present in ``newdata``. ``ID`` names a column to use as the identifier (placed + first); without it, ``ID`` is ``1..n``. """ _check_model(model) + if newdata is not None: + return _points_newdata(model, newdata, digits, ID) + name, name_hat, out = ( _points_columns_formula(model) if _has_formula_frame(model) @@ -187,8 +208,74 @@ def get_regression_points(model, digits: int = 3) -> pl.DataFrame: out[name_hat] = fitted out["residual"] = residual - df = pl.DataFrame(out).drop_nulls().with_row_index("ID", offset=1) - df = df.with_columns(pl.col("ID").cast(pl.Int64)) + if ID is not None: + out = {ID: _id_values(model, ID), **out} + df = pl.DataFrame(out).drop_nulls() + if ID is None: + df = df.with_row_index("ID", offset=1).with_columns(pl.col("ID").cast(pl.Int64)) + df = df.select("ID", *[c for c in df.columns if c != "ID"]) + float_cols = [c for c, dt in df.schema.items() if dt.is_float()] + return df.with_columns(pl.col(float_cols).round(digits)) + + +def _id_values(model, ID: str): + """Pull the ID column from the model's source frame (formula API only).""" + if not _has_formula_frame(model): + raise TypeError( + helpful_error( + f"ID={ID!r} needs a formula-API model so the source data is available.", + 'Refit with smf.ols("y ~ x", data).fit(), or pass newdata=.', + ) + ) + frame = model.model.data.frame + if ID not in frame.columns: + raise ValueError( + helpful_error( + f"ID column {ID!r} is not in the model's data.", + f"Available columns: {', '.join(map(str, frame.columns))}.", + ) + ) + return np.asarray(frame[ID]) + + +def _points_newdata(model, newdata, digits: int, ID: str | None) -> pl.DataFrame: + """Apply ``model`` to ``newdata``: predictions (+ residual if outcome present).""" + nd = _to_pandas(newdata) + name, name_hat, _ = _outcome_info(model) + pvars = _predictor_vars(model) + missing = [c for c in pvars if c not in nd.columns] + if missing: + raise ValueError( + helpful_error( + f"newdata is missing predictor column(s): {', '.join(missing)}.", + f"newdata has: {', '.join(map(str, nd.columns))}.", + ) + ) + fitted = np.asarray(model.predict(nd), dtype=float) + + out = {} + if ID is not None: + if ID not in nd.columns: + raise ValueError( + helpful_error( + f"ID column {ID!r} is not in newdata.", + f"newdata has: {', '.join(map(str, nd.columns))}.", + ) + ) + out[ID] = np.asarray(nd[ID]) + has_outcome = name in nd.columns + if has_outcome: + out[name] = np.asarray(nd[name]) + for predictor in pvars: + out[predictor] = np.asarray(nd[predictor]) + out[name_hat] = fitted + if has_outcome: + out["residual"] = np.asarray(out[name], dtype=float) - fitted + + df = pl.DataFrame(out).drop_nulls() + if ID is None: + df = df.with_row_index("ID", offset=1).with_columns(pl.col("ID").cast(pl.Int64)) + df = df.select("ID", *[c for c in df.columns if c != "ID"]) float_cols = [c for c, dt in df.schema.items() if dt.is_float()] return df.with_columns(pl.col(float_cols).round(digits)) diff --git a/moderndive/plots/models.py b/moderndive/plots/models.py index 7b7c3d9..934919c 100644 --- a/moderndive/plots/models.py +++ b/moderndive/plots/models.py @@ -65,11 +65,14 @@ def _parallel_slopes_fit(pdf, response: str, explanatory: str, by: str): return intercepts, slope, levels -def gg_parallel_slopes(data, response: str, explanatory: str, by: str, *, engine: str = "plotly"): +def gg_parallel_slopes( + data, response: str, explanatory: str, by: str, *, alpha: float = 1.0, engine: str = "plotly" +): """Scatterplot with a parallel-slopes regression model overlaid. Fits ``response ~ explanatory + C(by)`` (one common slope, a separate intercept per level of ``by``) and draws one fitted line per group over the data. + ``alpha`` sets the point transparency (0–1), useful when points overlap. """ engine = _resolve_engine(engine) pdf = _to_pandas(data, [response, explanatory, by]) @@ -80,7 +83,7 @@ def gg_parallel_slopes(data, response: str, explanatory: str, by: str, *, engine if engine == "plotly": import plotly.express as px - fig = px.scatter(pdf, x=explanatory, y=response, color=by) + fig = px.scatter(pdf, x=explanatory, y=response, color=by, opacity=alpha) for i, lvl in enumerate(levels): b = intercepts[lvl] fig.add_scatter( @@ -98,7 +101,7 @@ def gg_parallel_slopes(data, response: str, explanatory: str, by: str, *, engine return ( ggplot(pdf, aes(x=explanatory, y=response, color=by)) - + geom_point() + + geom_point(alpha=alpha) + geom_parallel_slopes(pdf, response, explanatory, by) + labs(x=explanatory, y=response, title="Parallel slopes model") + theme_light() diff --git a/tests/test_arg_parity_moderndive.py b/tests/test_arg_parity_moderndive.py new file mode 100644 index 0000000..18da442 --- /dev/null +++ b/tests/test_arg_parity_moderndive.py @@ -0,0 +1,158 @@ +"""Tests for the moderndive R argument-parity additions: + +- get_correlation(method=, na_rm=) +- get_regression_points(newdata=, ID=) +- get_regression_table(default_categorical_levels=) +- gg_parallel_slopes(alpha=) +""" + +from __future__ import annotations + +import math + +import numpy as np +import plotly.graph_objects as go +import polars as pl +import pytest +import statsmodels.formula.api as smf +from plotnine import ggplot +from scipy import stats + +import moderndive as md +from moderndive import get_correlation, get_regression_points, get_regression_table + + +@pytest.fixture(scope="module") +def houses(): + return md.load_saratoga_houses() + + +# ============================ get_correlation ============================ + + +@pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) +def test_get_correlation_methods_match_scipy(houses, method): + x = houses["living_area"].to_numpy() + y = houses["price"].to_numpy() + got = get_correlation(houses, "price ~ living_area", method=method).item() + if method == "pearson": + expected = float(np.corrcoef(x, y)[0, 1]) + elif method == "spearman": + expected = float(stats.spearmanr(x, y).statistic) + else: + expected = float(stats.kendalltau(x, y).statistic) + assert got == pytest.approx(expected) + + +def test_get_correlation_method_applies_to_multi(houses): + out = get_correlation(houses, "price ~ living_area + bedrooms", method="spearman", quiet=True) + expected = float( + stats.spearmanr(houses["bedrooms"].to_numpy(), houses["price"].to_numpy()).statistic + ) + assert out.filter(pl.col("predictor") == "bedrooms")["cor"].item() == pytest.approx(expected) + + +def test_get_correlation_bad_method(houses): + with pytest.raises(ValueError, match="method must be one of"): + get_correlation(houses, "price ~ living_area", method="bogus") + + +def test_get_correlation_na_rm_false_yields_nan(): + df = pl.DataFrame({"y": [1.0, 2.0, None, 4.0], "x": [1.0, 2.0, 3.0, 4.0]}) + # default drops nulls and gives a real number + assert math.isfinite(get_correlation(df, "y ~ x").item()) + # na_rm=False keeps the null → nan + assert math.isnan(get_correlation(df, "y ~ x", na_rm=False).item()) + + +# ============================ get_regression_points ===================== + + +@pytest.fixture(scope="module") +def fit_train_test(houses): + train = houses.head(800) + test = houses.tail(200) + model = smf.ols("price ~ living_area + bedrooms", data=train.to_pandas()).fit() + return model, test + + +def test_points_newdata_with_outcome(fit_train_test): + model, test = fit_train_test + out = get_regression_points(model, newdata=test) + assert out.columns == ["ID", "price", "living_area", "bedrooms", "price_hat", "residual"] + assert out.height == test.height + + +def test_points_newdata_without_outcome(fit_train_test): + model, test = fit_train_test + out = get_regression_points(model, newdata=test.drop("price")) + # no outcome → predictions only, no residual + assert out.columns == ["ID", "living_area", "bedrooms", "price_hat"] + + +def test_points_newdata_with_id(fit_train_test): + model, test = fit_train_test + test_id = test.with_row_index("house_id") + out = get_regression_points(model, newdata=test_id, ID="house_id") + assert out.columns[0] == "house_id" + + +def test_points_newdata_missing_predictor(fit_train_test): + model, test = fit_train_test + with pytest.raises(ValueError, match="missing predictor"): + get_regression_points(model, newdata=test.select("price")) + + +def test_points_newdata_bad_id(fit_train_test): + model, test = fit_train_test + with pytest.raises(ValueError, match="ID column"): + get_regression_points(model, newdata=test, ID="nope") + + +def test_points_id_from_source_column(houses): + h = houses.with_row_index("home_id") + model = smf.ols("price ~ living_area", data=h.to_pandas()).fit() + out = get_regression_points(model, ID="home_id") + assert out.columns[0] == "home_id" + assert out.height == h.height + + +def test_points_id_not_in_source(houses): + model = smf.ols("price ~ living_area", data=houses.to_pandas()).fit() + with pytest.raises(ValueError, match="ID column"): + get_regression_points(model, ID="nope") + + +def test_points_id_requires_formula_model(): + import statsmodels.api as sm + + X = sm.add_constant(np.arange(1.0, 11.0)) + model = sm.OLS(np.arange(1.0, 11.0) * 2, X).fit() + with pytest.raises(TypeError, match="formula-API"): + get_regression_points(model, ID="whatever") + + +# ============================ get_regression_table ====================== + + +def test_default_categorical_levels(houses): + un = md.load_un_member_states_2024().to_pandas() + model = smf.ols("life_expectancy_2022 ~ gdp_per_capita + C(continent)", data=un).fit() + pretty = get_regression_table(model)["term"].to_list() + raw = get_regression_table(model, default_categorical_levels=True)["term"].to_list() + assert any(t.startswith("continent: ") for t in pretty) + assert any("C(continent)[T." in t for t in raw) + # intercept is mapped in both + assert "intercept" in pretty and "intercept" in raw + + +# ============================ gg_parallel_slopes alpha ================== + + +@pytest.mark.parametrize("engine", ["plotly", "plotnine"]) +def test_gg_parallel_slopes_alpha(engine): + evals = md.load_evals() + fig = md.gg_parallel_slopes( + evals, response="score", explanatory="age", by="gender", alpha=0.3, engine=engine + ) + assert isinstance(fig, go.Figure if engine == "plotly" else ggplot)