Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 72 additions & 4 deletions src/winml/modelkit/commands/perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@ class BenchmarkResult:
samples_per_sec: float = 0.0
batches_per_sec: float = 0.0

# Batch dimension the session actually ran. Equals config.batch_size when
# the model's leading input dim is dynamic; falls back to the model's
# static batch (often 1) otherwise. samples_per_sec is scaled by this, not
# by the requested config.batch_size.
effective_batch_size: int = 1

# Actual values used (after auto-detection)
actual_device: str = ""
actual_task: str = ""
Expand Down Expand Up @@ -159,6 +165,7 @@ def to_dict(self) -> dict[str, Any]:
"iterations": self.config.iterations,
"warmup": self.config.warmup,
"batch_size": self.config.batch_size,
"effective_batch_size": self.effective_batch_size,
"timestamp": self.timestamp,
},
"model_info": {
Expand Down Expand Up @@ -279,6 +286,37 @@ def _resolve_shape(
return tuple(resolved)


def effective_batch_size(
    inputs: dict[str, np.ndarray],
    input_names: list[str],
    requested: int,
) -> int:
    """Return the batch dimension actually present in the generated inputs.

    The requested ``--batch-size`` only lands on inputs whose leading
    dimension is dynamic; a model with a statically-fixed batch dim ignores
    it (see :func:`_resolve_shape`). Throughput (samples/sec) must be scaled
    by what the session actually ran, not by what was asked, or a static-batch
    model reports ``requested / latency`` while only processing one batch per
    call -- inflating samples/sec by ``requested``.

    Reads the leading dim back from the first batched (rank >= 1) input,
    matching the "first dim is batch" convention used throughout this module.
    Falls back to ``requested`` when no batched input exists (e.g. all-scalar
    inputs), which preserves the prior behavior for that edge case.

    Single-input assumption: only the first batched input is inspected. For
    multimodal or encoder-decoder models whose batched inputs disagree on the
    leading dim (e.g. an image batch of 4 alongside a differently batched
    tensor), the reported value reflects only the first batched input.

    Args:
        inputs: Generated input arrays keyed by input name.
        input_names: Input names in the order they should be inspected; names
            missing from ``inputs`` are skipped.
        requested: The batch size that was asked for, used as the fallback.

    Returns:
        The leading dimension of the first rank >= 1 array found while walking
        ``input_names``, or ``requested`` if there is none.
    """
    for name in input_names:
        arr = inputs.get(name)
        # Rank-0 (scalar) arrays carry no batch axis, so keep looking.
        if arr is not None and arr.ndim >= 1:
            # shape entries may be numpy ints; normalize to a plain int.
            return int(arr.shape[0])
    return requested


# =============================================================================
# Benchmark Engine
# =============================================================================
Expand All @@ -302,6 +340,7 @@ def __init__(self, config: BenchmarkConfig) -> None:
self.config = config
self._model: WinMLPreTrainedModel | WinMLCompositeModel | None = None
self._inputs: dict[str, np.ndarray] | None = None
self._effective_batch: int = config.batch_size
self._memory: dict[str, float] | None = None

@property
Expand Down Expand Up @@ -525,10 +564,25 @@ def _load_model(self) -> None:

def _generate_inputs(self) -> None:
    """Generate random inputs based on model io_config.

    Stores the generated arrays on ``self._inputs`` and records the batch
    dimension those arrays actually carry on ``self._effective_batch``. When
    that differs from the requested ``config.batch_size``, a warning is
    logged so the user knows throughput is scaled by the actual batch.
    """
    # Read io_config once; it feeds both input generation and the name order
    # used to look the batch dimension back up.
    io_config = self._single.io_config
    self._inputs = generate_random_inputs(
        io_config=io_config,
        batch_size=self.config.batch_size,
    )
    self._effective_batch = effective_batch_size(
        self._inputs,
        io_config["input_names"],
        self.config.batch_size,
    )
    if self._effective_batch != self.config.batch_size:
        logger.warning(
            "Requested --batch-size %d could not be applied: the model's "
            "leading input dimension is statically %d. Throughput is scaled "
            "by the actual batch (%d), not the requested value.",
            self.config.batch_size,
            self._effective_batch,
            self._effective_batch,
        )

def _resolve_adapter_luid(self) -> str | None:
"""Resolve adapter LUID for VRAM queries."""
Expand Down Expand Up @@ -645,9 +699,11 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
"""Collect benchmark results from PerfStats."""
io_config = self._single.io_config

# Calculate throughput
# Calculate throughput. Scale by the batch the session actually ran
# (self._effective_batch), not the requested config.batch_size, which a
# static-batch model silently ignores during input generation.
mean_latency_sec = stats.mean_ms / 1000.0
samples_per_sec = self.config.batch_size / mean_latency_sec if mean_latency_sec > 0 else 0
samples_per_sec = self._effective_batch / mean_latency_sec if mean_latency_sec > 0 else 0
batches_per_sec = 1.0 / mean_latency_sec if mean_latency_sec > 0 else 0

# Calculate standard deviation
Expand Down Expand Up @@ -681,6 +737,7 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
# Throughput
samples_per_sec=samples_per_sec,
batches_per_sec=batches_per_sec,
effective_batch_size=self._effective_batch,
# Actual values (resolved after build + compile)
actual_device=self._single.device,
actual_task=self._single.task or self.config.task or "auto-detected",
Expand Down Expand Up @@ -1040,7 +1097,18 @@ def display_console_report(result: BenchmarkResult, console: Console) -> None:

# Throughput
console.print()
console.print(f"[bold]Throughput:[/bold] {result.samples_per_sec:.2f} samples/sec")
throughput_line = f"[bold]Throughput:[/bold] {result.samples_per_sec:.2f} samples/sec"
if result.effective_batch_size != 1:
throughput_line += f" [dim](batch {result.effective_batch_size})[/dim]"
console.print(throughput_line)
# Flag when the requested batch couldn't be honored so a static-batch model
# doesn't look like it silently ran the requested batch.
if result.config.batch_size != result.effective_batch_size:
console.print(
f" [yellow]Note:[/yellow] requested batch {result.config.batch_size} "
f"could not be applied (model has a static batch of "
f"{result.effective_batch_size})."
)

# Hardware section (only when monitoring was active)
if result.hw_monitor:
Expand Down
133 changes: 133 additions & 0 deletions tests/unit/commands/test_perf_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,139 @@ def test_ep_options_none_when_not_set_in_to_dict(self) -> None:
assert result.to_dict()["benchmark_info"]["ep_options"] is None


class TestEffectiveBatchSize:
    """Throughput must scale by the batch the session actually ran.

    ``--batch-size`` only lands on inputs whose leading dim is dynamic, so a
    static-batch model silently runs a different batch than requested. The
    reported ``samples_per_sec`` must reflect the actual batch, not the request.
    """

    def test_helper_reads_dynamic_batch_from_inputs(self) -> None:
        import numpy as np

        from winml.modelkit.commands.perf import effective_batch_size

        inputs = {"pixel_values": np.zeros((8, 3, 224, 224), dtype=np.float32)}
        assert effective_batch_size(inputs, ["pixel_values"], requested=8) == 8

    def test_helper_reads_static_batch_not_requested(self) -> None:
        import numpy as np

        from winml.modelkit.commands.perf import effective_batch_size

        # Model has a static batch of 1; the requested 8 never reached the input.
        inputs = {"pixel_values": np.zeros((1, 3, 224, 224), dtype=np.float32)}
        assert effective_batch_size(inputs, ["pixel_values"], requested=8) == 1

    def test_helper_skips_scalar_inputs(self) -> None:
        import numpy as np

        from winml.modelkit.commands.perf import effective_batch_size

        # First input is a rank-0 scalar (no batch dim); fall through to the
        # first batched input for the batch reading.
        inputs = {
            "scalar": np.array(3, dtype=np.int64),
            "tokens": np.zeros((4, 128), dtype=np.int64),
        }
        assert effective_batch_size(inputs, ["scalar", "tokens"], requested=4) == 4

    def test_helper_falls_back_when_all_scalar(self) -> None:
        import numpy as np

        from winml.modelkit.commands.perf import effective_batch_size

        inputs = {"scalar": np.array(3, dtype=np.int64)}
        assert effective_batch_size(inputs, ["scalar"], requested=8) == 8

    def _fake_stats(self) -> MagicMock:
        """Build a stand-in stats object with a 10 ms mean latency."""
        stats = MagicMock()
        stats.mean_ms = 10.0  # 0.01 s -> 100 batches/sec
        stats.min_ms = 9.0
        stats.max_ms = 11.0
        stats.p50_ms = 10.0
        stats.p90_ms = 10.5
        stats.p95_ms = 10.8
        stats.p99_ms = 11.0
        stats.samples_ms = [10.0, 10.0]
        stats.all_samples_ms = [10.0, 10.0]
        return stats

    def _benchmark_with_single(self, *, batch_size: int, effective_batch: int) -> PerfBenchmark:
        """Build a PerfBenchmark around a mocked model with a preset effective batch."""
        config = BenchmarkConfig(model_id="m", batch_size=batch_size, warmup=0)
        benchmark = PerfBenchmark(config)
        single = MagicMock()
        single.io_config = {
            "input_names": ["pixel_values"],
            "input_shapes": [[effective_batch, 3, 224, 224]],
            "input_types": ["float32"],
            "output_names": ["logits"],
            "output_shapes": [[effective_batch, 1000]],
        }
        single.device = "cpu"
        single.ep_name = None
        single.task = "image-classification"
        single.running_model_path = "model.onnx"
        benchmark._model = single
        # Set directly so _collect_results can be exercised without running
        # input generation.
        benchmark._effective_batch = effective_batch
        return benchmark

    def test_throughput_scales_by_effective_not_requested(self) -> None:
        # Requested batch 8, but model ran batch 1: 100 batches/sec -> 100 sps,
        # NOT 800. This is the bug guard.
        benchmark = self._benchmark_with_single(batch_size=8, effective_batch=1)
        result = benchmark._collect_results(self._fake_stats())

        assert result.effective_batch_size == 1
        assert result.batches_per_sec == pytest.approx(100.0)
        assert result.samples_per_sec == pytest.approx(100.0)

    def test_throughput_scales_when_batch_applied(self) -> None:
        # Dynamic batch honored: 100 batches/sec * 8 = 800 samples/sec.
        benchmark = self._benchmark_with_single(batch_size=8, effective_batch=8)
        result = benchmark._collect_results(self._fake_stats())

        assert result.effective_batch_size == 8
        assert result.batches_per_sec == pytest.approx(100.0)
        assert result.samples_per_sec == pytest.approx(800.0)

    def test_generate_inputs_warns_on_static_batch(self) -> None:
        import numpy as np

        config = BenchmarkConfig(model_id="m", batch_size=8)
        benchmark = PerfBenchmark(config)
        single = MagicMock()
        single.io_config = {
            "input_names": ["pixel_values"],
            "input_shapes": [[1, 3, 224, 224]],
            "input_types": ["float32"],
        }
        benchmark._model = single

        # Static batch of 1: generate_random_inputs ignores the requested 8.
        static_inputs = {"pixel_values": np.zeros((1, 3, 224, 224), dtype=np.float32)}
        with (
            patch(
                "winml.modelkit.commands.perf.generate_random_inputs",
                return_value=static_inputs,
            ),
            patch("winml.modelkit.commands.perf.logger") as mock_logger,
        ):
            benchmark._generate_inputs()

        assert benchmark._effective_batch == 1
        mock_logger.warning.assert_called_once()

    def test_generate_inputs_silent_when_batch_applied(self) -> None:
        import numpy as np

        config = BenchmarkConfig(model_id="m", batch_size=8)
        benchmark = PerfBenchmark(config)
        single = MagicMock()
        single.io_config = {
            "input_names": ["pixel_values"],
            "input_shapes": [["batch", 3, 224, 224]],
            "input_types": ["float32"],
        }
        benchmark._model = single

        # Requested batch reached the input: the effective batch matches the
        # request, so no warning should be logged.
        dynamic_inputs = {"pixel_values": np.zeros((8, 3, 224, 224), dtype=np.float32)}
        with (
            patch(
                "winml.modelkit.commands.perf.generate_random_inputs",
                return_value=dynamic_inputs,
            ),
            patch("winml.modelkit.commands.perf.logger") as mock_logger,
        ):
            benchmark._generate_inputs()

        assert benchmark._effective_batch == 8
        mock_logger.warning.assert_not_called()

    def test_to_dict_emits_effective_batch_size(self) -> None:
        config = BenchmarkConfig(model_id="m", batch_size=8)
        result = BenchmarkResult(config=config, effective_batch_size=1)

        info = result.to_dict()["benchmark_info"]
        assert info["batch_size"] == 8
        assert info["effective_batch_size"] == 1


# =============================================================================
# --FORMAT JSON TESTS
# =============================================================================
Expand Down
Loading