microsoft · xieofxie · Jun 22, 2026 · Jun 22, 2026 · Jun 23, 2026
@@ -130,6 +130,12 @@ class BenchmarkResult:
     samples_per_sec: float = 0.0
     batches_per_sec: float = 0.0
 
+    # Batch dimension the session actually ran. Equals config.batch_size when
+    # the model's leading input dim is dynamic; falls back to the model's
+    # static batch (often 1) otherwise. samples_per_sec is scaled by this, not
+    # by the requested config.batch_size.
+    effective_batch_size: int = 1
+
     # Actual values used (after auto-detection)
     actual_device: str = ""
     actual_task: str = ""
@@ -159,6 +165,7 @@ def to_dict(self) -> dict[str, Any]:
                 "iterations": self.config.iterations,
                 "warmup": self.config.warmup,
                 "batch_size": self.config.batch_size,
+                "effective_batch_size": self.effective_batch_size,
                 "timestamp": self.timestamp,
             },
             "model_info": {
@@ -279,6 +286,37 @@ def _resolve_shape(
     return tuple(resolved)
 
 
+def effective_batch_size(
+    inputs: dict[str, np.ndarray],
+    input_names: list[str],
+    requested: int,
+) -> int:
+    """Return the batch dimension carried by the generated inputs.
+
+    ``--batch-size`` is only applied to inputs whose leading dimension is
+    dynamic; an input with a fixed leading dimension keeps the model's own
+    value (see :func:`_resolve_shape`). Samples/sec has to be derived from
+    the batch the session really ran: using the requested value for a
+    static-batch model would overstate throughput by the ratio of the
+    requested batch to the static one.
+
+    Following this module's "first dim is batch" convention, the value is
+    the leading dimension of the first input, in ``input_names`` order, that
+    is present in ``inputs`` and has rank >= 1. If there is no such input
+    (for example when every input is a scalar), ``requested`` is returned
+    unchanged, which keeps the previous behavior for that case.
+
+    Limitation: only the first batched input is consulted. When batched
+    inputs disagree on their leading dimension (e.g. multimodal or
+    encoder-decoder models), the result describes that first input only.
+    """
+    tensors = (inputs.get(input_name) for input_name in input_names)
+    leading_dims = (
+        int(tensor.shape[0]) for tensor in tensors if tensor is not None and tensor.ndim >= 1
+    )
+    return next(leading_dims, requested)
+
+
 # =============================================================================
 # Benchmark Engine
 # =============================================================================
@@ -302,6 +340,7 @@ def __init__(self, config: BenchmarkConfig) -> None:
         self.config = config
         self._model: WinMLPreTrainedModel | WinMLCompositeModel | None = None
         self._inputs: dict[str, np.ndarray] | None = None
+        self._effective_batch: int = config.batch_size
         self._memory: dict[str, float] | None = None
 
     @property
@@ -525,10 +564,25 @@ def _load_model(self) -> None:
 
     def _generate_inputs(self) -> None:
         """Generate random inputs based on model io_config."""
+        # Fetch io_config once; it feeds both input generation and the
+        # batch read-back below.
+        io_config = self._single.io_config
         self._inputs = generate_random_inputs(
-            io_config=self._single.io_config,
+            io_config=io_config,
             batch_size=self.config.batch_size,
         )
+        # Read the batch back from the generated arrays instead of trusting
+        # the requested value; the two can differ.
+        self._effective_batch = effective_batch_size(
+            self._inputs,
+            io_config["input_names"],
+            self.config.batch_size,
+        )
+        # Surface a mismatch so the requested batch is not mistaken for the
+        # batch that was actually benchmarked.
+        if self._effective_batch != self.config.batch_size:
+            logger.warning(
+                "Requested --batch-size %d could not be applied: the model's "
+                "leading input dimension is statically %d. Throughput is scaled "
+                "by the actual batch (%d), not the requested value.",
+                self.config.batch_size,
+                self._effective_batch,
+                self._effective_batch,
+            )
 
     def _resolve_adapter_luid(self) -> str | None:
         """Resolve adapter LUID for VRAM queries."""
@@ -645,9 +699,11 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
         """Collect benchmark results from PerfStats."""
         io_config = self._single.io_config
 
-        # Calculate throughput
+        # Calculate throughput. Scale by the batch the session actually ran
+        # (self._effective_batch), not the requested config.batch_size, which a
+        # static-batch model silently ignores during input generation.
         mean_latency_sec = stats.mean_ms / 1000.0
-        samples_per_sec = self.config.batch_size / mean_latency_sec if mean_latency_sec > 0 else 0
+        samples_per_sec = self._effective_batch / mean_latency_sec if mean_latency_sec > 0 else 0
         batches_per_sec = 1.0 / mean_latency_sec if mean_latency_sec > 0 else 0
 
         # Calculate standard deviation
@@ -681,6 +737,7 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult:
             # Throughput
             samples_per_sec=samples_per_sec,
             batches_per_sec=batches_per_sec,
+            effective_batch_size=self._effective_batch,
             # Actual values (resolved after build + compile)
             actual_device=self._single.device,
             actual_task=self._single.task or self.config.task or "auto-detected",
@@ -1040,7 +1097,18 @@ def display_console_report(result: BenchmarkResult, console: Console) -> None:
 
     # Throughput
     console.print()
-    console.print(f"[bold]Throughput:[/bold] {result.samples_per_sec:.2f} samples/sec")
+    throughput_line = f"[bold]Throughput:[/bold] {result.samples_per_sec:.2f} samples/sec"
+    if result.effective_batch_size != 1:
+        throughput_line += f" [dim](batch {result.effective_batch_size})[/dim]"
+    console.print(throughput_line)
+    # Flag when the requested batch couldn't be honored so a static-batch model
+    # doesn't look like it silently ran the requested batch.
+    if result.config.batch_size != result.effective_batch_size:
+        console.print(
+            f"  [yellow]Note:[/yellow] requested batch {result.config.batch_size} "
+            f"could not be applied (model has a static batch of "
+            f"{result.effective_batch_size})."
+        )
 
     # Hardware section (only when monitoring was active)
     if result.hw_monitor:

@@ -516,6 +516,139 @@ def test_ep_options_none_when_not_set_in_to_dict(self) -> None:
         assert result.to_dict()["benchmark_info"]["ep_options"] is None
 
 
+class TestEffectiveBatchSize:
+    """Throughput must scale by the batch the session actually ran.
+
+    ``--batch-size`` only lands on inputs whose leading dim is dynamic, so a
+    static-batch model silently runs a different batch than requested. The
+    reported ``samples_per_sec`` must reflect the actual batch, not the request.
+    """
+
+    def test_helper_reads_dynamic_batch_from_inputs(self) -> None:
+        import numpy as np
+
+        from winml.modelkit.commands.perf import effective_batch_size
+
+        # Leading dim equals the request, so the helper reports it unchanged.
+        inputs = {"pixel_values": np.zeros((8, 3, 224, 224), dtype=np.float32)}
+        assert effective_batch_size(inputs, ["pixel_values"], requested=8) == 8
+
+    def test_helper_reads_static_batch_not_requested(self) -> None:
+        import numpy as np
+
+        from winml.modelkit.commands.perf import effective_batch_size
+
+        # Model has a static batch of 1; the requested 8 never reached the input.
+        inputs = {"pixel_values": np.zeros((1, 3, 224, 224), dtype=np.float32)}
+        assert effective_batch_size(inputs, ["pixel_values"], requested=8) == 1
+
+    def test_helper_skips_scalar_inputs(self) -> None:
+        import numpy as np
+
+        from winml.modelkit.commands.perf import effective_batch_size
+
+        # First input is a rank-0 scalar (no batch dim); fall through to the
+        # first batched input for the batch reading.
+        inputs = {
+            "scalar": np.array(3, dtype=np.int64),
+            "tokens": np.zeros((4, 128), dtype=np.int64),
+        }
+        assert effective_batch_size(inputs, ["scalar", "tokens"], requested=4) == 4
+
+    def test_helper_falls_back_when_all_scalar(self) -> None:
+        import numpy as np
+
+        from winml.modelkit.commands.perf import effective_batch_size
+
+        # No rank >= 1 input to read from, so the requested value is returned.
+        inputs = {"scalar": np.array(3, dtype=np.int64)}
+        assert effective_batch_size(inputs, ["scalar"], requested=8) == 8
+
+    def _fake_stats(self) -> MagicMock:
+        # Stand-in for the stats object passed to _collect_results, fixed at
+        # a 10 ms mean latency so expected throughput is easy to compute.
+        stats = MagicMock()
+        stats.mean_ms = 10.0  # 0.01 s -> 100 batches/sec
+        stats.min_ms = 9.0
+        stats.max_ms = 11.0
+        stats.p50_ms = 10.0
+        stats.p90_ms = 10.5
+        stats.p95_ms = 10.8
+        stats.p99_ms = 11.0
+        stats.samples_ms = [10.0, 10.0]
+        stats.all_samples_ms = [10.0, 10.0]
+        return stats
+
+    def _benchmark_with_single(self, *, batch_size: int, effective_batch: int) -> PerfBenchmark:
+        # Build a PerfBenchmark around a mocked model and preset
+        # _effective_batch directly, without loading a model or generating
+        # inputs. batch_size is the requested value; effective_batch is what
+        # the mocked model's shapes and the benchmark state report.
+        config = BenchmarkConfig(model_id="m", batch_size=batch_size, warmup=0)
+        benchmark = PerfBenchmark(config)
+        single = MagicMock()
+        single.io_config = {
+            "input_names": ["pixel_values"],
+            "input_shapes": [[effective_batch, 3, 224, 224]],
+            "input_types": ["float32"],
+            "output_names": ["logits"],
+            "output_shapes": [[effective_batch, 1000]],
+        }
+        single.device = "cpu"
+        single.ep_name = None
+        single.task = "image-classification"
+        single.running_model_path = "model.onnx"
+        benchmark._model = single
+        benchmark._effective_batch = effective_batch
+        return benchmark
+
+    def test_throughput_scales_by_effective_not_requested(self) -> None:
+        # Requested batch 8, but model ran batch 1: 100 batches/sec -> 100 sps,
+        # NOT 800. This is the bug guard.
+        benchmark = self._benchmark_with_single(batch_size=8, effective_batch=1)
+        result = benchmark._collect_results(self._fake_stats())
+
+        assert result.effective_batch_size == 1
+        assert result.batches_per_sec == pytest.approx(100.0)
+        assert result.samples_per_sec == pytest.approx(100.0)
+
+    def test_throughput_scales_when_batch_applied(self) -> None:
+        # Dynamic batch honored: 100 batches/sec * 8 = 800 samples/sec.
+        benchmark = self._benchmark_with_single(batch_size=8, effective_batch=8)
+        result = benchmark._collect_results(self._fake_stats())
+
+        assert result.effective_batch_size == 8
+        assert result.batches_per_sec == pytest.approx(100.0)
+        assert result.samples_per_sec == pytest.approx(800.0)
+
+    def test_generate_inputs_warns_on_static_batch(self) -> None:
+        import numpy as np
+
+        config = BenchmarkConfig(model_id="m", batch_size=8)
+        benchmark = PerfBenchmark(config)
+        single = MagicMock()
+        single.io_config = {
+            "input_names": ["pixel_values"],
+            "input_shapes": [[1, 3, 224, 224]],
+            "input_types": ["float32"],
+        }
+        benchmark._model = single
+
+        # Static batch of 1: generate_random_inputs ignores the requested 8.
+        static_inputs = {"pixel_values": np.zeros((1, 3, 224, 224), dtype=np.float32)}
+        with (
+            patch(
+                "winml.modelkit.commands.perf.generate_random_inputs",
+                return_value=static_inputs,
+            ),
+            patch("winml.modelkit.commands.perf.logger") as mock_logger,
+        ):
+            benchmark._generate_inputs()
+
+        # The batch is read back from the generated inputs, and the mismatch
+        # with the request is logged exactly once.
+        assert benchmark._effective_batch == 1
+        mock_logger.warning.assert_called_once()
+
+    def test_to_dict_emits_effective_batch_size(self) -> None:
+        # The requested and the effective batch are both serialized, under
+        # separate keys of benchmark_info.
+        config = BenchmarkConfig(model_id="m", batch_size=8)
+        result = BenchmarkResult(config=config, effective_batch_size=1)
+
+        info = result.to_dict()["benchmark_info"]
+        assert info["batch_size"] == 8
+        assert info["effective_batch_size"] == 1
+
+
 # =============================================================================
 # --FORMAT JSON TESTS
 # =============================================================================