diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 2f63e5652..54ef28aba 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -130,6 +130,12 @@ class BenchmarkResult:
     samples_per_sec: float = 0.0
     batches_per_sec: float = 0.0
 
+    # Batch dimension the session actually ran: config.batch_size when the
+    # model's leading input dim is dynamic, otherwise the model's static batch
+    # (often 1). samples_per_sec is scaled by this, not by config.batch_size.
+    # NOTE(review): defaults to 1 if a caller never sets it -- confirm all do.
+    effective_batch_size: int = 1
+
     # Actual values used (after auto-detection)
     actual_device: str = ""
     actual_task: str = ""
@@ -159,6 +165,7 @@ def to_dict(self) -> dict[str, Any]:
                 "iterations": self.config.iterations,
                 "warmup": self.config.warmup,
                 "batch_size": self.config.batch_size,
+                "effective_batch_size": self.effective_batch_size,
                 "timestamp": self.timestamp,
             },
             "model_info": {
@@ -279,6 +286,37 @@ def _resolve_shape(
     return tuple(resolved)
 
 
+def effective_batch_size(
+    inputs: dict[str, np.ndarray],
+    input_names: list[str],
+    requested: int,
+) -> int:
+    """Return the batch dimension actually present in the generated inputs.
+
+    The requested ``--batch-size`` only lands on inputs whose leading
+    dimension is dynamic; a model with a statically-fixed batch dim ignores
+    it (see :func:`_resolve_shape`). Throughput (samples/sec) must be scaled
+    by what the session actually ran, not by what was asked, or a static-batch
+    model reports ``requested / latency`` while only processing its static
+    batch per call -- inflating samples/sec by ``requested / static_batch``.
+
+    Reads the leading dim back from the first batched (rank >= 1) input,
+    matching the "first dim is batch" convention used throughout this module.
+    Falls back to ``requested`` when no batched input exists (e.g. all-scalar
+    inputs), which preserves the prior behavior for that edge case.
+
+    Single-input assumption: only the first batched input is inspected. For
+    multimodal or encoder-decoder models whose batched inputs disagree on the
+    leading dim (e.g. an image batch of 4 alongside a differently batched
+    tensor), the reported value reflects only the first batched input.
+    """
+    for name in input_names:
+        arr = inputs.get(name)
+        if arr is not None and arr.ndim >= 1:
+            return int(arr.shape[0])
+    return requested
+
+
 # =============================================================================
 # Benchmark Engine
 # =============================================================================
@@ -302,6 +340,7 @@ def __init__(self, config: BenchmarkConfig) -> None:
         self.config = config
         self._model: WinMLPreTrainedModel | WinMLCompositeModel | None = None
         self._inputs: dict[str, np.ndarray] | None = None
+        self._effective_batch: int = config.batch_size
         self._memory: dict[str, float] | None = None
 
     @property
@@ -525,10 +564,25 @@ def _load_model(self) -> None:
 
     def _generate_inputs(self) -> None:
         """Generate random inputs based on model io_config."""
+        io_config = self._single.io_config
         self._inputs = generate_random_inputs(
-            io_config=self._single.io_config,
+            io_config=io_config,
             batch_size=self.config.batch_size,
         )
+        self._effective_batch = effective_batch_size(
+            self._inputs,
+            io_config["input_names"],
+            self.config.batch_size,
+        )
+        if self._effective_batch != self.config.batch_size:
+            logger.warning(
+                "Requested --batch-size %d could not be applied: the model's "
+                "leading input dimension is statically %d. Throughput is scaled "
+                "by the actual batch (%d), not the requested value.",
+                self.config.batch_size,
+                self._effective_batch,
+                self._effective_batch,
+            )
 
     def _resolve_adapter_luid(self) -> str | None:
         """Resolve adapter LUID for VRAM queries."""
@@ -645,9 +699,11 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
         """Collect benchmark results from PerfStats."""
         io_config = self._single.io_config
 
-        # Calculate throughput
+        # Calculate throughput. Scale by the batch the session actually ran
+        # (self._effective_batch), not the requested config.batch_size, which a
+        # static-batch model silently ignores during input generation.
         mean_latency_sec = stats.mean_ms / 1000.0
-        samples_per_sec = self.config.batch_size / mean_latency_sec if mean_latency_sec > 0 else 0
+        samples_per_sec = self._effective_batch / mean_latency_sec if mean_latency_sec > 0 else 0
         batches_per_sec = 1.0 / mean_latency_sec if mean_latency_sec > 0 else 0
 
         # Calculate standard deviation
@@ -681,6 +737,7 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
             # Throughput
             samples_per_sec=samples_per_sec,
             batches_per_sec=batches_per_sec,
+            effective_batch_size=self._effective_batch,
             # Actual values (resolved after build + compile)
             actual_device=self._single.device,
             actual_task=self._single.task or self.config.task or "auto-detected",
@@ -1040,7 +1097,18 @@ def display_console_report(result: BenchmarkResult, console: Console) -> None:
 
     # Throughput
     console.print()
-    console.print(f"[bold]Throughput:[/bold] {result.samples_per_sec:.2f} samples/sec")
+    throughput_line = f"[bold]Throughput:[/bold] {result.samples_per_sec:.2f} samples/sec"
+    if result.effective_batch_size != 1:
+        throughput_line += f" [dim](batch {result.effective_batch_size})[/dim]"
+    console.print(throughput_line)
+    # Flag when the requested batch couldn't be honored so a static-batch model
+    # doesn't look like it silently ran the requested batch.
+    if result.config.batch_size != result.effective_batch_size:
+        console.print(
+            f" [yellow]Note:[/yellow] requested batch {result.config.batch_size} "
+            f"could not be applied (model has a static batch of "
+            f"{result.effective_batch_size})."
+        )
 
     # Hardware section (only when monitoring was active)
     if result.hw_monitor:
diff --git a/tests/unit/commands/test_perf_cli.py b/tests/unit/commands/test_perf_cli.py
index 9e05c3c57..6e7055d89 100644
--- a/tests/unit/commands/test_perf_cli.py
+++ b/tests/unit/commands/test_perf_cli.py
@@ -516,6 +516,139 @@ def test_ep_options_none_when_not_set_in_to_dict(self) -> None:
         assert result.to_dict()["benchmark_info"]["ep_options"] is None
 
 
+class TestEffectiveBatchSize:
+    """Throughput must scale by the batch the session actually ran.
+
+    ``--batch-size`` only lands on inputs whose leading dim is dynamic, so a
+    static-batch model silently runs a different batch than requested. The
+    reported ``samples_per_sec`` must reflect the actual batch, not the request.
+    """
+
+    def test_helper_reads_dynamic_batch_from_inputs(self) -> None:
+        import numpy as np
+
+        from winml.modelkit.commands.perf import effective_batch_size
+
+        inputs = {"pixel_values": np.zeros((8, 3, 224, 224), dtype=np.float32)}
+        assert effective_batch_size(inputs, ["pixel_values"], requested=8) == 8
+
+    def test_helper_reads_static_batch_not_requested(self) -> None:
+        import numpy as np
+
+        from winml.modelkit.commands.perf import effective_batch_size
+
+        # Model has a static batch of 1; the requested 8 never reached the input.
+        inputs = {"pixel_values": np.zeros((1, 3, 224, 224), dtype=np.float32)}
+        assert effective_batch_size(inputs, ["pixel_values"], requested=8) == 1
+
+    def test_helper_skips_scalar_inputs(self) -> None:
+        import numpy as np
+
+        from winml.modelkit.commands.perf import effective_batch_size
+
+        # First input is a rank-0 scalar (no batch dim); fall through to the
+        # first batched input for the batch reading.
+        inputs = {
+            "scalar": np.array(3, dtype=np.int64),
+            "tokens": np.zeros((4, 128), dtype=np.int64),
+        }
+        assert effective_batch_size(inputs, ["scalar", "tokens"], requested=4) == 4
+
+    def test_helper_falls_back_when_all_scalar(self) -> None:
+        import numpy as np
+
+        from winml.modelkit.commands.perf import effective_batch_size
+
+        inputs = {"scalar": np.array(3, dtype=np.int64)}
+        assert effective_batch_size(inputs, ["scalar"], requested=8) == 8
+
+    def _fake_stats(self) -> MagicMock:
+        stats = MagicMock()
+        stats.mean_ms = 10.0  # 0.01 s -> 100 batches/sec
+        stats.min_ms = 9.0
+        stats.max_ms = 11.0
+        stats.p50_ms = 10.0
+        stats.p90_ms = 10.5
+        stats.p95_ms = 10.8
+        stats.p99_ms = 11.0
+        stats.samples_ms = [10.0, 10.0]
+        stats.all_samples_ms = [10.0, 10.0]
+        return stats
+
+    def _benchmark_with_single(self, *, batch_size: int, effective_batch: int) -> PerfBenchmark:
+        config = BenchmarkConfig(model_id="m", batch_size=batch_size, warmup=0)
+        benchmark = PerfBenchmark(config)
+        single = MagicMock()
+        single.io_config = {
+            "input_names": ["pixel_values"],
+            "input_shapes": [[effective_batch, 3, 224, 224]],
+            "input_types": ["float32"],
+            "output_names": ["logits"],
+            "output_shapes": [[effective_batch, 1000]],
+        }
+        single.device = "cpu"
+        single.ep_name = None
+        single.task = "image-classification"
+        single.running_model_path = "model.onnx"
+        benchmark._model = single
+        benchmark._effective_batch = effective_batch
+        return benchmark
+
+    def test_throughput_scales_by_effective_not_requested(self) -> None:
+        # Requested batch 8, but model ran batch 1: 100 batches/sec -> 100 sps,
+        # NOT 800. This is the bug guard.
+        benchmark = self._benchmark_with_single(batch_size=8, effective_batch=1)
+        result = benchmark._collect_results(self._fake_stats())
+
+        assert result.effective_batch_size == 1
+        assert result.batches_per_sec == pytest.approx(100.0)
+        assert result.samples_per_sec == pytest.approx(100.0)
+
+    def test_throughput_scales_when_batch_applied(self) -> None:
+        # Dynamic batch honored: 100 batches/sec * 8 = 800 samples/sec.
+        benchmark = self._benchmark_with_single(batch_size=8, effective_batch=8)
+        result = benchmark._collect_results(self._fake_stats())
+
+        assert result.effective_batch_size == 8
+        assert result.batches_per_sec == pytest.approx(100.0)
+        assert result.samples_per_sec == pytest.approx(800.0)
+
+    def test_generate_inputs_warns_on_static_batch(self) -> None:
+        import numpy as np
+
+        config = BenchmarkConfig(model_id="m", batch_size=8)
+        benchmark = PerfBenchmark(config)
+        single = MagicMock()
+        single.io_config = {
+            "input_names": ["pixel_values"],
+            "input_shapes": [[1, 3, 224, 224]],
+            "input_types": ["float32"],
+        }
+        benchmark._model = single
+
+        # Static batch of 1: generate_random_inputs ignores the requested 8.
+        static_inputs = {"pixel_values": np.zeros((1, 3, 224, 224), dtype=np.float32)}
+        with (
+            patch(
+                "winml.modelkit.commands.perf.generate_random_inputs",
+                return_value=static_inputs,
+            ),
+            patch("winml.modelkit.commands.perf.logger") as mock_logger,
+        ):
+            benchmark._generate_inputs()
+
+        assert benchmark._effective_batch == 1
+        mock_logger.warning.assert_called_once()
+
+    def test_to_dict_emits_effective_batch_size(self) -> None:
+        config = BenchmarkConfig(model_id="m", batch_size=8)
+        result = BenchmarkResult(config=config, effective_batch_size=1)
+
+        info = result.to_dict()["benchmark_info"]
+        assert info["batch_size"] == 8
+        assert info["effective_batch_size"] == 1
+
+
 # =============================================================================
 # --FORMAT JSON TESTS
 # =============================================================================