Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

Full parity with the R `moderndive` and `infer` packages.

- **R argument parity (moderndive)**: `get_correlation(method=)` adds Spearman and Kendall rank correlations (plus `na_rm`); `get_regression_points(newdata=, ID=)` predicts on a held-out set and labels rows by a column; `get_regression_table(default_categorical_levels=)` can keep raw factor-level term names; `gg_parallel_slopes(alpha=)` sets point transparency.
- **R argument parity (infer)**: `hypothesize(med=)`/`observe(med=)` add a median point null; `prop_test()` gains `z`, `correct` (Yates), `conf_int`, and `conf_level` (now matching R's `prop.test` — chi-square by default, with a Wilson-score CI for one proportion); `rep_slice_sample(prop=, weight_by=)` and `rep_sample_n(prob=)` add fractional and weighted sampling; `generate(variables=)` chooses which column to permute; `shade_p_value`/`shade_confidence_interval` gain `fill`; `visualize(dens_color=)` sets the theoretical-curve color. (`shade_p_value` now also honors `color`.)

- **Chi-square goodness-of-fit** (closes the last `infer` vignette gap):
Expand Down
16 changes: 16 additions & 0 deletions doc/guides/regression.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,16 @@ get_regression_points(model).head()
# columns: ID, price, living_area, bedrooms, price_hat, residual
```

Pass `ID=` to label rows by a column, and `newdata=` to predict on a held-out set
(a `residual` is included when the outcome is present in `newdata`):

```{code-cell} python
train = houses.head(800)
test = houses.tail(200)
model_train = smf.ols("price ~ living_area + bedrooms", data=train.to_pandas()).fit()
get_regression_points(model_train, newdata=test).head()
```

## Model-fit summaries

```{code-cell} python
Expand All @@ -66,6 +76,12 @@ get_correlation(houses, "price ~ living_area") # ≈ 0.759
# or: get_correlation(houses, x="living_area", y="price")
```

Use `method=` for rank correlations (`"spearman"` or `"kendall"`):

```{code-cell} python
get_correlation(houses, "price ~ living_area", method="spearman")
```

Pass several predictors on the right-hand side to get one correlation per
predictor (long by default; `wide=True` for one column each):

Expand Down
39 changes: 33 additions & 6 deletions moderndive/correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,29 +35,56 @@ def _parse_formula(formula: str) -> tuple[str, list[str]]:
return lhs, predictors


_CORR_METHODS = ("pearson", "spearman", "kendall")


def _correlate(x_vals, y_vals, method: str) -> float:
"""Correlation coefficient between two numpy arrays for the chosen method."""
if method == "pearson":
return float(np.corrcoef(x_vals, y_vals)[0, 1])
from scipy import stats

fn = stats.spearmanr if method == "spearman" else stats.kendalltau
return float(fn(x_vals, y_vals).statistic)


def get_correlation(
data,
formula: str | None = None,
*,
x: str | None = None,
y: str | None = None,
method: str = "pearson",
na_rm: bool = True,
wide: bool = False,
quiet: bool = False,
) -> pl.DataFrame:
"""Pearson correlation between an outcome and one or more predictors.
"""Correlation between an outcome and one or more predictors.

Mirrors ``moderndive::get_correlation``. Give the variables either as a
formula (``"y ~ x"`` or ``"y ~ x1 + x2 + x3"``) or, for a single predictor,
via ``x=`` and ``y=``.

``method`` is ``"pearson"`` (default), ``"spearman"`` (rank correlation), or
``"kendall"`` (rank concordance). ``na_rm`` drops rows with a null in either
column before computing (per predictor pair); set ``na_rm=False`` to keep
them (yielding ``nan`` if any are present).

With **one** predictor the result is a 1-row frame with a ``cor`` column.
With **multiple** predictors the result is long by default — columns
``predictor`` and ``cor`` (one row each) — or pass ``wide=True`` for one
column per predictor. Rows with a null in either column are dropped per pair.
column per predictor.

A short note points to a full pairwise correlation matrix when there are
multiple predictors; silence it with ``quiet=True``.
"""
if method not in _CORR_METHODS:
raise ValueError(
helpful_error(
f"method must be one of {_CORR_METHODS}, got {method!r}.",
"Use 'pearson' (linear), 'spearman' (rank), or 'kendall'.",
)
)
df = data if isinstance(data, pl.DataFrame) else pl.from_pandas(data)

if formula is not None:
Expand Down Expand Up @@ -90,10 +117,10 @@ def get_correlation(

cors: dict[str, float] = {}
for predictor in predictors:
pair = df.select(predictor, outcome).drop_nulls()
cors[predictor] = float(
np.corrcoef(pair[predictor].to_numpy(), pair[outcome].to_numpy())[0, 1]
)
pair = df.select(predictor, outcome)
if na_rm:
pair = pair.drop_nulls()
cors[predictor] = _correlate(pair[predictor].to_numpy(), pair[outcome].to_numpy(), method)

if len(predictors) == 1:
return pl.DataFrame({"cor": [cors[predictors[0]]]})
Expand Down
95 changes: 91 additions & 4 deletions moderndive/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,17 @@ def _predictor_vars(model) -> list[str]:
return ordered


def _intercept_only_term(term: str) -> str:
"""Map only the intercept term to ``intercept``; leave everything else raw."""
return "intercept" if term in ("Intercept", "const") else term


def get_regression_table(
model,
digits: int = 3,
conf_level: float = 0.95,
exponentiate: bool = False,
default_categorical_levels: bool = False,
) -> pl.DataFrame:
"""Tidy regression table: term, estimate, std_error, statistic, p_value, lower/upper_ci.

Expand All @@ -140,16 +146,21 @@ def get_regression_table(
coefficient estimate and its confidence interval as rate / odds ratios
(``std_error``, ``statistic``, and ``p_value`` stay on the model's link scale,
matching ``broom::tidy``).

By default, categorical-predictor terms are prettified (e.g.
``income[T.High income]`` → ``income: High income``). Pass
``default_categorical_levels=True`` to keep the raw statsmodels term names.
"""
_check_model(model)
conf = np.asarray(model.conf_int(alpha=1 - conf_level), dtype=float)
estimate = np.asarray(model.params, dtype=float)
lower, upper = conf[:, 0], conf[:, 1]
if exponentiate:
estimate, lower, upper = np.exp(estimate), np.exp(lower), np.exp(upper)
name_fn = _intercept_only_term if default_categorical_levels else _clean_term
table = pl.DataFrame(
{
"term": [_clean_term(t) for t in _term_names(model)],
"term": [name_fn(t) for t in _term_names(model)],
"estimate": estimate,
"std_error": np.asarray(model.bse, dtype=float),
"statistic": np.asarray(model.tvalues, dtype=float),
Expand All @@ -162,7 +173,9 @@ def get_regression_table(
return table.with_columns(pl.col(numeric).round(digits))


def get_regression_points(model, digits: int = 3) -> pl.DataFrame:
def get_regression_points(
model, digits: int = 3, *, newdata=None, ID: str | None = None
) -> pl.DataFrame:
"""Fitted values + residuals per observation (~ ``broom::augment``).

Columns: ``ID``, the outcome, each original predictor, ``<outcome>_hat``,
Expand All @@ -172,8 +185,16 @@ def get_regression_points(model, digits: int = 3) -> pl.DataFrame:
(``poly()``, ``scale()``, ``I()``) are shown as their original columns rather
than leaking basis matrices. For GLMs, fitted values and residuals are on the
response scale (e.g. probabilities for logistic regression).

Pass ``newdata`` (a polars/pandas frame) to apply the model to new
observations: predictions are returned, plus a ``residual`` if the outcome is
present in ``newdata``. ``ID`` names a column to use as the identifier (placed
first); without it, ``ID`` is ``1..n``.
"""
_check_model(model)
if newdata is not None:
return _points_newdata(model, newdata, digits, ID)

name, name_hat, out = (
_points_columns_formula(model)
if _has_formula_frame(model)
Expand All @@ -187,8 +208,74 @@ def get_regression_points(model, digits: int = 3) -> pl.DataFrame:
out[name_hat] = fitted
out["residual"] = residual

df = pl.DataFrame(out).drop_nulls().with_row_index("ID", offset=1)
df = df.with_columns(pl.col("ID").cast(pl.Int64))
if ID is not None:
out = {ID: _id_values(model, ID), **out}
df = pl.DataFrame(out).drop_nulls()
if ID is None:
df = df.with_row_index("ID", offset=1).with_columns(pl.col("ID").cast(pl.Int64))
df = df.select("ID", *[c for c in df.columns if c != "ID"])
float_cols = [c for c, dt in df.schema.items() if dt.is_float()]
return df.with_columns(pl.col(float_cols).round(digits))


def _id_values(model, ID: str):
"""Pull the ID column from the model's source frame (formula API only)."""
if not _has_formula_frame(model):
raise TypeError(
helpful_error(
f"ID={ID!r} needs a formula-API model so the source data is available.",
'Refit with smf.ols("y ~ x", data).fit(), or pass newdata=.',
)
)
frame = model.model.data.frame
if ID not in frame.columns:
raise ValueError(
helpful_error(
f"ID column {ID!r} is not in the model's data.",
f"Available columns: {', '.join(map(str, frame.columns))}.",
)
)
return np.asarray(frame[ID])


def _points_newdata(model, newdata, digits: int, ID: str | None) -> pl.DataFrame:
"""Apply ``model`` to ``newdata``: predictions (+ residual if outcome present)."""
nd = _to_pandas(newdata)
name, name_hat, _ = _outcome_info(model)
pvars = _predictor_vars(model)
missing = [c for c in pvars if c not in nd.columns]
if missing:
raise ValueError(
helpful_error(
f"newdata is missing predictor column(s): {', '.join(missing)}.",
f"newdata has: {', '.join(map(str, nd.columns))}.",
)
)
fitted = np.asarray(model.predict(nd), dtype=float)

out = {}
if ID is not None:
if ID not in nd.columns:
raise ValueError(
helpful_error(
f"ID column {ID!r} is not in newdata.",
f"newdata has: {', '.join(map(str, nd.columns))}.",
)
)
out[ID] = np.asarray(nd[ID])
has_outcome = name in nd.columns
if has_outcome:
out[name] = np.asarray(nd[name])
for predictor in pvars:
out[predictor] = np.asarray(nd[predictor])
out[name_hat] = fitted
if has_outcome:
out["residual"] = np.asarray(out[name], dtype=float) - fitted

df = pl.DataFrame(out).drop_nulls()
if ID is None:
df = df.with_row_index("ID", offset=1).with_columns(pl.col("ID").cast(pl.Int64))
df = df.select("ID", *[c for c in df.columns if c != "ID"])
float_cols = [c for c, dt in df.schema.items() if dt.is_float()]
return df.with_columns(pl.col(float_cols).round(digits))

Expand Down
9 changes: 6 additions & 3 deletions moderndive/plots/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,14 @@ def _parallel_slopes_fit(pdf, response: str, explanatory: str, by: str):
return intercepts, slope, levels


def gg_parallel_slopes(data, response: str, explanatory: str, by: str, *, engine: str = "plotly"):
def gg_parallel_slopes(
data, response: str, explanatory: str, by: str, *, alpha: float = 1.0, engine: str = "plotly"
):
"""Scatterplot with a parallel-slopes regression model overlaid.

Fits ``response ~ explanatory + C(by)`` (one common slope, a separate intercept
per level of ``by``) and draws one fitted line per group over the data.
``alpha`` sets the point transparency (0–1), useful when points overlap.
"""
engine = _resolve_engine(engine)
pdf = _to_pandas(data, [response, explanatory, by])
Expand All @@ -80,7 +83,7 @@ def gg_parallel_slopes(data, response: str, explanatory: str, by: str, *, engine
if engine == "plotly":
import plotly.express as px

fig = px.scatter(pdf, x=explanatory, y=response, color=by)
fig = px.scatter(pdf, x=explanatory, y=response, color=by, opacity=alpha)
for i, lvl in enumerate(levels):
b = intercepts[lvl]
fig.add_scatter(
Expand All @@ -98,7 +101,7 @@ def gg_parallel_slopes(data, response: str, explanatory: str, by: str, *, engine

return (
ggplot(pdf, aes(x=explanatory, y=response, color=by))
+ geom_point()
+ geom_point(alpha=alpha)
+ geom_parallel_slopes(pdf, response, explanatory, by)
+ labs(x=explanatory, y=response, title="Parallel slopes model")
+ theme_light()
Expand Down
Loading