moderndive · ismayc · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 Full parity with the R `moderndive` and `infer` packages.
 
+- **R argument parity (moderndive)**: `get_correlation(method=)` adds Spearman and Kendall rank correlations (plus `na_rm`); `get_regression_points(newdata=, ID=)` predicts on a held-out set and labels rows by a column; `get_regression_table(default_categorical_levels=)` can keep raw factor-level term names; `gg_parallel_slopes(alpha=)` sets point transparency.
 - **R argument parity (infer)**: `hypothesize(med=)`/`observe(med=)` add a median point null; `prop_test()` gains `z`, `correct` (Yates), `conf_int`, and `conf_level` (now matching R's `prop.test` — chi-square by default, with a Wilson-score CI for one proportion); `rep_slice_sample(prop=, weight_by=)` and `rep_sample_n(prob=)` add fractional and weighted sampling; `generate(variables=)` chooses which column to permute; `shade_p_value`/`shade_confidence_interval` gain `fill`; `visualize(dens_color=)` sets the theoretical-curve color. (`shade_p_value` now also honors `color`.)
 
 - **Chi-square goodness-of-fit** (closes the last `infer` vignette gap):

diff --git a/doc/guides/regression.md b/doc/guides/regression.md
@@ -52,6 +52,16 @@ get_regression_points(model).head()
 # columns: ID, price, living_area, bedrooms, price_hat, residual
 ```
 
+Pass `ID=` to label rows by a column, and `newdata=` to predict on a held-out set
+(a `residual` is included when the outcome is present in `newdata`):
+
+```{code-cell} python
+train = houses.head(800)
+test = houses.tail(200)
+model_train = smf.ols("price ~ living_area + bedrooms", data=train.to_pandas()).fit()
+get_regression_points(model_train, newdata=test).head()
+```
+
 ## Model-fit summaries
 
 ```{code-cell} python
@@ -66,6 +76,12 @@ get_correlation(houses, "price ~ living_area")   # ≈ 0.759
 # or: get_correlation(houses, x="living_area", y="price")
 ```
 
+Use `method=` for rank correlations (`"spearman"` or `"kendall"`):
+
+```{code-cell} python
+get_correlation(houses, "price ~ living_area", method="spearman")
+```
+
 Pass several predictors on the right-hand side to get one correlation per
 predictor (long by default; `wide=True` for one column each):
 

diff --git a/moderndive/correlation.py b/moderndive/correlation.py
@@ -35,29 +35,56 @@ def _parse_formula(formula: str) -> tuple[str, list[str]]:
     return lhs, predictors
 
 
+_CORR_METHODS = ("pearson", "spearman", "kendall")
+
+
+def _correlate(x_vals, y_vals, method: str) -> float:
+    """Correlation coefficient between two numpy arrays for the chosen method."""
+    if method == "pearson":
+        return float(np.corrcoef(x_vals, y_vals)[0, 1])
+    from scipy import stats
+
+    fn = stats.spearmanr if method == "spearman" else stats.kendalltau
+    return float(fn(x_vals, y_vals).statistic)
+
+
 def get_correlation(
     data,
     formula: str | None = None,
     *,
     x: str | None = None,
     y: str | None = None,
+    method: str = "pearson",
+    na_rm: bool = True,
     wide: bool = False,
     quiet: bool = False,
 ) -> pl.DataFrame:
-    """Pearson correlation between an outcome and one or more predictors.
+    """Correlation between an outcome and one or more predictors.
 
     Mirrors ``moderndive::get_correlation``. Give the variables either as a
     formula (``"y ~ x"`` or ``"y ~ x1 + x2 + x3"``) or, for a single predictor,
     via ``x=`` and ``y=``.
 
+    ``method`` is ``"pearson"`` (default), ``"spearman"`` (rank correlation), or
+    ``"kendall"`` (rank concordance). ``na_rm`` drops rows with a null in either
+    column before computing (per predictor pair); set ``na_rm=False`` to keep
+    them (yielding ``nan`` if any are present).
+
     With **one** predictor the result is a 1-row frame with a ``cor`` column.
     With **multiple** predictors the result is long by default — columns
     ``predictor`` and ``cor`` (one row each) — or pass ``wide=True`` for one
-    column per predictor. Rows with a null in either column are dropped per pair.
+    column per predictor.
 
     A short note points to a full pairwise correlation matrix when there are
     multiple predictors; silence it with ``quiet=True``.
     """
+    if method not in _CORR_METHODS:
+        raise ValueError(
+            helpful_error(
+                f"method must be one of {_CORR_METHODS}, got {method!r}.",
+                "Use 'pearson' (linear), 'spearman' (rank), or 'kendall'.",
+            )
+        )
     df = data if isinstance(data, pl.DataFrame) else pl.from_pandas(data)
 
     if formula is not None:
@@ -90,10 +117,10 @@ def get_correlation(
 
     cors: dict[str, float] = {}
     for predictor in predictors:
-        pair = df.select(predictor, outcome).drop_nulls()
-        cors[predictor] = float(
-            np.corrcoef(pair[predictor].to_numpy(), pair[outcome].to_numpy())[0, 1]
-        )
+        pair = df.select(predictor, outcome)
+        if na_rm:
+            pair = pair.drop_nulls()
+        cors[predictor] = _correlate(pair[predictor].to_numpy(), pair[outcome].to_numpy(), method)
 
     if len(predictors) == 1:
         return pl.DataFrame({"cor": [cors[predictors[0]]]})

diff --git a/moderndive/modeling.py b/moderndive/modeling.py
@@ -125,11 +125,17 @@ def _predictor_vars(model) -> list[str]:
     return ordered
 
 
+def _intercept_only_term(term: str) -> str:
+    """Map only the intercept term to ``intercept``; leave everything else raw."""
+    return "intercept" if term in ("Intercept", "const") else term
+
+
 def get_regression_table(
     model,
     digits: int = 3,
     conf_level: float = 0.95,
     exponentiate: bool = False,
+    default_categorical_levels: bool = False,
 ) -> pl.DataFrame:
     """Tidy regression table: term, estimate, std_error, statistic, p_value, lower/upper_ci.
 
@@ -140,16 +146,21 @@ def get_regression_table(
     coefficient estimate and its confidence interval as rate / odds ratios
     (``std_error``, ``statistic``, and ``p_value`` stay on the model's link scale,
     matching ``broom::tidy``).
+
+    By default, categorical-predictor terms are prettified (e.g.
+    ``income[T.High income]`` → ``income: High income``). Pass
+    ``default_categorical_levels=True`` to keep the raw statsmodels term names.
     """
     _check_model(model)
     conf = np.asarray(model.conf_int(alpha=1 - conf_level), dtype=float)
     estimate = np.asarray(model.params, dtype=float)
     lower, upper = conf[:, 0], conf[:, 1]
     if exponentiate:
         estimate, lower, upper = np.exp(estimate), np.exp(lower), np.exp(upper)
+    name_fn = _intercept_only_term if default_categorical_levels else _clean_term
     table = pl.DataFrame(
         {
-            "term": [_clean_term(t) for t in _term_names(model)],
+            "term": [name_fn(t) for t in _term_names(model)],
             "estimate": estimate,
             "std_error": np.asarray(model.bse, dtype=float),
             "statistic": np.asarray(model.tvalues, dtype=float),
@@ -162,7 +173,9 @@ def get_regression_table(
     return table.with_columns(pl.col(numeric).round(digits))
 
 
-def get_regression_points(model, digits: int = 3) -> pl.DataFrame:
+def get_regression_points(
+    model, digits: int = 3, *, newdata=None, ID: str | None = None
+) -> pl.DataFrame:
     """Fitted values + residuals per observation (~ ``broom::augment``).
 
     Columns: ``ID``, the outcome, each original predictor, ``<outcome>_hat``,
@@ -172,8 +185,16 @@ def get_regression_points(model, digits: int = 3) -> pl.DataFrame:
     (``poly()``, ``scale()``, ``I()``) are shown as their original columns rather
     than leaking basis matrices. For GLMs, fitted values and residuals are on the
     response scale (e.g. probabilities for logistic regression).
+
+    Pass ``newdata`` (a polars/pandas frame) to apply the model to new
+    observations: predictions are returned, plus a ``residual`` if the outcome is
+    present in ``newdata``. ``ID`` names a column to use as the identifier (placed
+    first); without it, ``ID`` is ``1..n``.
     """
     _check_model(model)
+    if newdata is not None:
+        return _points_newdata(model, newdata, digits, ID)
+
     name, name_hat, out = (
         _points_columns_formula(model)
         if _has_formula_frame(model)
@@ -187,8 +208,74 @@ def get_regression_points(model, digits: int = 3) -> pl.DataFrame:
     out[name_hat] = fitted
     out["residual"] = residual
 
-    df = pl.DataFrame(out).drop_nulls().with_row_index("ID", offset=1)
-    df = df.with_columns(pl.col("ID").cast(pl.Int64))
+    if ID is not None:
+        out = {ID: _id_values(model, ID), **out}
+    df = pl.DataFrame(out).drop_nulls()
+    if ID is None:
+        df = df.with_row_index("ID", offset=1).with_columns(pl.col("ID").cast(pl.Int64))
+        df = df.select("ID", *[c for c in df.columns if c != "ID"])
+    float_cols = [c for c, dt in df.schema.items() if dt.is_float()]
+    return df.with_columns(pl.col(float_cols).round(digits))
+
+
+def _id_values(model, ID: str):
+    """Pull the ID column from the model's source frame (formula API only)."""
+    if not _has_formula_frame(model):
+        raise TypeError(
+            helpful_error(
+                f"ID={ID!r} needs a formula-API model so the source data is available.",
+                'Refit with smf.ols("y ~ x", data).fit(), or pass newdata=.',
+            )
+        )
+    frame = model.model.data.frame
+    if ID not in frame.columns:
+        raise ValueError(
+            helpful_error(
+                f"ID column {ID!r} is not in the model's data.",
+                f"Available columns: {', '.join(map(str, frame.columns))}.",
+            )
+        )
+    return np.asarray(frame[ID])
+
+
+def _points_newdata(model, newdata, digits: int, ID: str | None) -> pl.DataFrame:
+    """Apply ``model`` to ``newdata``: predictions (+ residual if outcome present)."""
+    nd = _to_pandas(newdata)
+    name, name_hat, _ = _outcome_info(model)
+    pvars = _predictor_vars(model)
+    missing = [c for c in pvars if c not in nd.columns]
+    if missing:
+        raise ValueError(
+            helpful_error(
+                f"newdata is missing predictor column(s): {', '.join(missing)}.",
+                f"newdata has: {', '.join(map(str, nd.columns))}.",
+            )
+        )
+    fitted = np.asarray(model.predict(nd), dtype=float)
+
+    out = {}
+    if ID is not None:
+        if ID not in nd.columns:
+            raise ValueError(
+                helpful_error(
+                    f"ID column {ID!r} is not in newdata.",
+                    f"newdata has: {', '.join(map(str, nd.columns))}.",
+                )
+            )
+        out[ID] = np.asarray(nd[ID])
+    has_outcome = name in nd.columns
+    if has_outcome:
+        out[name] = np.asarray(nd[name])
+    for predictor in pvars:
+        out[predictor] = np.asarray(nd[predictor])
+    out[name_hat] = fitted
+    if has_outcome:
+        out["residual"] = np.asarray(out[name], dtype=float) - fitted
+
+    df = pl.DataFrame(out).drop_nulls()
+    if ID is None:
+        df = df.with_row_index("ID", offset=1).with_columns(pl.col("ID").cast(pl.Int64))
+        df = df.select("ID", *[c for c in df.columns if c != "ID"])
     float_cols = [c for c, dt in df.schema.items() if dt.is_float()]
     return df.with_columns(pl.col(float_cols).round(digits))
 

diff --git a/moderndive/plots/models.py b/moderndive/plots/models.py
@@ -65,11 +65,14 @@ def _parallel_slopes_fit(pdf, response: str, explanatory: str, by: str):
     return intercepts, slope, levels
 
 
-def gg_parallel_slopes(data, response: str, explanatory: str, by: str, *, engine: str = "plotly"):
+def gg_parallel_slopes(
+    data, response: str, explanatory: str, by: str, *, alpha: float = 1.0, engine: str = "plotly"
+):
     """Scatterplot with a parallel-slopes regression model overlaid.
 
     Fits ``response ~ explanatory + C(by)`` (one common slope, a separate intercept
     per level of ``by``) and draws one fitted line per group over the data.
+    ``alpha`` sets the point transparency (0–1), useful when points overlap.
     """
     engine = _resolve_engine(engine)
     pdf = _to_pandas(data, [response, explanatory, by])
@@ -80,7 +83,7 @@ def gg_parallel_slopes(data, response: str, explanatory: str, by: str, *, engine
     if engine == "plotly":
         import plotly.express as px
 
-        fig = px.scatter(pdf, x=explanatory, y=response, color=by)
+        fig = px.scatter(pdf, x=explanatory, y=response, color=by, opacity=alpha)
         for i, lvl in enumerate(levels):
             b = intercepts[lvl]
             fig.add_scatter(
@@ -98,7 +101,7 @@ def gg_parallel_slopes(data, response: str, explanatory: str, by: str, *, engine
 
     return (
         ggplot(pdf, aes(x=explanatory, y=response, color=by))
-        + geom_point()
+        + geom_point(alpha=alpha)
         + geom_parallel_slopes(pdf, response, explanatory, by)
         + labs(x=explanatory, y=response, title="Parallel slopes model")
         + theme_light()