From 91ea07800e4f6c96a5eea3b72c86eb03e81c2e11 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Mon, 8 Jun 2026 17:18:23 -0500 Subject: [PATCH 01/20] initial roadmap for codes refactoring --- doc/refactor-plan/new-config-format.md | 668 ++++++++++++++++++++ doc/refactor-plan/refactor-roadmap.md | 818 +++++++++++++++++++++++++ 2 files changed, 1486 insertions(+) create mode 100644 doc/refactor-plan/new-config-format.md create mode 100644 doc/refactor-plan/refactor-roadmap.md diff --git a/doc/refactor-plan/new-config-format.md b/doc/refactor-plan/new-config-format.md new file mode 100644 index 00000000..405990ba --- /dev/null +++ b/doc/refactor-plan/new-config-format.md @@ -0,0 +1,668 @@ +# CODES Topology & Config Contract + +**Status:** Draft v0 + +This document defines the **file format that flows between NetMaestro's topology +editor (or a hand-author) and the CODES configuration front-end**. It is the +interface contract: NetMaestro's editor *exports* it, hand-authors *write* it, and +the CODES "config compiler" (see [refactor-roadmap.md](refactor-roadmap.md) §7) +*consumes* it. It is versioned (`schema_version`); breaking changes bump the version. + +It replaces the legacy `.conf` (`LPGROUPS` + `PARAMS` + `@annotation`) **and** eventually +other files, like the +separate latency-bandwidth matrix files, with a single +format read by one parser. + +--- + +## 1. Principles + +1. **Topology and parameters are separate concerns.** The topology graph answers + *what connects to what* and *what each node is*. It does **not** carry the bulk + of model parameters — those live in **custom components** referenced by name. A + node tags a component; it does not repeat the component's parameters. Exception: HPC + specific fabrics will be handled differently +2. **One parser.** Topology is expressed in [Cytoscape.js][cyto] element form, which + is JSON. 
Because JSON is a subset of YAML, the same parser (RapidYAML) reads both + an editor-exported `.json` and a hand-written `.yaml`. +3. **CODES reads an allowlist.** Of each element's `data`, CODES consumes only the + documented keys below. Everything editor/visual — `position`, `classes`, `style`, + `selected`, `locked`, `scratch`, … — is ignored, so the editor may add visual + state freely without affecting the simulation. +4. **Implementation details never appear.** Anything CODES can derive (the ROSS + event size, network ordering, repetition counts) is *computed by the compiler*, + not written by the user. See §9. +5. **Units should be explicit.** Physical quantities can carry their unit (`"100Gbps"`, + `"5ms"`); the compiler converts to whatever each model uses internally. If no + unit is specified, then a default will be assumed. + +[cyto]: https://js.cytoscape.org/#notation/elements-json + +--- + +## 2. Vocabulary + +| Domain term | NetMaestro DB model | In this file | Meaning | +|---|---|---|---| +| component model | `ComponentModel` | `model:` | a CODES simulation model from the catalog (e.g. `nw-lp`, `simplep2p`) | +| component type | `ComponentType` | `type:` | `host` \| `router` \| `switch`; usually inferred from the model | +| custom component | `ComponentConfig` | an entry under `components:` | a model paired with configured parameters — **what a node references** | +| node | `TopologyNode` | a Cytoscape node | a placed component (graph vertex) | +| link | `TopologyLink` | a Cytoscape edge | a graph edge (a physical link or an attachment) | +| group | `TopologyGroup` | `data.parent` / a `groups:` entry | optional grouping of nodes (e.g. a site) — the multi-network seam | +| topology | `Topology` | the `topology:` block | the saved layout (nodes + links + groups) | + +The PascalCase names are NetMaestro-internal DB model names; the file uses the +lowercase `snake_case` keys. + +--- + +## 3. 
File shape + +A complete configuration has these top-level blocks: + +```yaml +schema_version: 1 + +simulation: # run-level settings (mostly derived; a few user knobs) +components: # the component configs (ComponentConfigs) referenced by the topology +topology: # the Topology: an enumerated graph (nodes + links + groups) or a parametric fabric +jobs: # optional: what workloads run and where — multi-job; see §6 +surrogate: # RESERVED: fast network surrogate; see §7 +``` + +When NetMaestro exports a `Topology` for a run, it serializes the **components its +nodes reference** into `components:`, the run settings into `simulation:`, and the +graph into `topology:`. `jobs:` and `surrogate:` are **authored separately from the +topology graph** — they are run concerns, not something drawn in the topology editor — +and `jobs:` may be omitted in favour of a single inline component workload (§6.1). + +The topology graph may be inline or in a separate file: + +```yaml +topology: + format: cytoscape + file: my-network.json # OR inline: + # elements: { nodes: [...], edges: [...] } +``` + +Both Cytoscape forms are accepted: the object form `{ nodes: [...], edges: [...] }` +and the flat array form `[ { group: "nodes", ... }, ... ]`. + +--- + +## 4. Custom components (`components:`) + +A custom component pairs a `ComponentModel` with a configuration. Nodes reference it +by key. 
+ +```yaml +components: + compute_host: # the component's key (referenced by nodes) + model: nw-lp # ComponentModel (required) + type: host # ComponentType (optional; inferred from model) + workload: # model-specific parameter blocks + traffic: uniform + num_messages: 30 # count — always a bare number + arrival_time: 1000 # bare uses the default unit (ns); write "1us" to set it explicitly + payload_size: 2048 # bare uses the default unit (bytes); write "2KiB" to set it explicitly + edge_router: + model: simplep2p + type: router + # routing: minimal # user-facing knob with a model default — override here + # chunk_size, vc_size, ... # advanced knobs with defaults — override here +``` + +- **Required:** `model`. +- **Optional:** `type` (inferred from the model when omitted), plus any of the + model's parameters. Omitted parameters take the model's default. +- A parameter is *user-facing* (documented, prominent) or *advanced* (defaulted, + reachable) — both are settable here. Only the **derived** set (§9) can never be set. + +--- + +## 5. Topology (`topology:`) + +### 5.1 Nodes (`TopologyNode`) + +| `data` field | Required | Meaning | +|---|:---:|---| +| `id` | ✅ | unique, stable node name; the compiler maps it to an LP id | +| `component` | ✅ | key of an entry in `components:` — every node references one | +| *(any component param)* | – | per-node **override** of the component's value (e.g. `num_messages: 100`) | +| `parent` | – | this node's `TopologyGroup` (a compound-node parent id) — **reserved**, see §10 | + +A node **always** references a `component`. + +Ignored by CODES: `position`, `classes`, `style`, `selected`, `grabbable`, +`locked`, `scratch`, and any other non-allowlisted key. + +### 5.2 Links (`TopologyLink`) + +| `data` field | Required | Meaning | +|---|:---:|---| +| `id` | conventional | link name | +| `source`, `target` | ✅ | node `id`s; both must exist | +| `bandwidth` | model-dependent | link capacity (unit-bearing, e.g. `"100Gbps"`). 
**Per-edge** for explicit-topology/WAN models (e.g. simplep2p); for HPC/regular models it's set **per link-class** as a model parameter (`local_bandwidth`/`global_bandwidth`/`cn_bandwidth`), not per edge | +| `latency` | model-dependent | propagation delay (unit-bearing, e.g. `"5ms"`). **Required** for long-haul/WAN models where propagation dominates (e.g. simplep2p); **derived**, not a per-link input, for HPC/datacenter models (latency falls out of bandwidth + per-hop delays) | +| `directed` | – | marks a one-way edge (used for per-direction asymmetric links); see §5.3 | +| *(advanced)* | – | **reserved**: per-link VC/buffer overrides, routing `weight` | + +**Link meaning is determined by endpoint `ComponentType`** (the compiler's +interpretation, not stated in the file): + +- `(router|switch) — (router|switch)` → a **network link**; its `bandwidth`/`latency` + populate the model's link table (what the simplep2p matrix files used to do). +- `host — (router|switch)` → an **attachment**; binds the host to its injection + point. `bandwidth`/`latency` are optional here. + +By default a link is **symmetric** (same `bandwidth`/`latency` both ways if specified), and +`source`/`target` are just its two endpoints — not an origin and destination. + +### 5.3 Link bandwidth & latency directionality + +> **The format supports all three levels below**, with **symmetric the +> default**. The others stay available even though most links won't use them — the +> format shouldn't express *less* than the models it feeds (simplep2p's matrix has +> per-direction *and* per-side entries). Presented here for team feedback on +> **representation** (the per-direction form) and on **when** per-side is actually used. + +A `TopologyLink`'s `bandwidth`/`latency` (if needed per link for the model) can be specified at three levels of detail. 
+ +#### Case A — Symmetric link *(default; clearly needed)* + +Same bandwidth and/or latency in both directions — LAN, datacenter fabric, backbone +fiber (full-duplex). One undirected edge, one value each: + +```yaml +- data: { id: e0, source: router0, target: router1, bandwidth: "10Gbps", latency: "5ms" } +``` + +#### Case B — Per-direction asymmetry: source→target ≠ target→source *(supported)* + +Real situations: +- **Asymmetric access links** — ADSL/cable/satellite/cellular, e.g. 100 Mbps down / + 10 Mbps up (asymmetric *by design*). + +Two candidate forms (`to_target` = the `source`→`target` direction): + +**B1 — one link, directional values:** +```yaml +- data: { id: e0, source: home, target: isp, + bandwidth: { to_target: "10Mbps", to_source: "100Mbps" }, # up / down + latency: "12ms" } # scalar = same both ways +``` + +**B2 — two directed edges, one per direction:** +```yaml +- data: { id: e0, source: home, target: isp, directed: true, bandwidth: "10Mbps", latency: "12ms" } +- data: { id: e1, source: isp, target: home, directed: true, bandwidth: "100Mbps", latency: "12ms" } +``` + +Trade-off: **B1** keeps a single physical-link object (natural for an asymmetric +access link); **B2** makes each direction a first-class edge. + +> **Open (representation):** which form is canonical — B1, B2, or both? Primarily a +> **NetMaestro** editor call (how an asymmetric link is drawn and edited). The choice +> **carries through to Case C** (per-side mirrors the same form). + +#### Case C — Per-side egress/ingress split: sender-rate ≠ receiver-rate *within one direction* *(supported; advanced, rarely used)* + +The legacy simplep2p matrix stores, *for each direction*, a separate **egress** +(sender-side) and **ingress** (receiver-side) value — so a single transfer is charged +serialization at two different rates. 
Per-side refines *within* a direction, so it +**mirrors B1/B2 and must match whichever B form is chosen**: + +**C1 — one link, directional + per-side values** (extends B1): +```yaml +# egress = charged at the sender, ingress = charged at the receiver; latency uses the same nested shape +- data: { id: e0, source: host0, target: router0, + bandwidth: { to_target: { egress: "50Mbps", ingress: "25Mbps" }, + to_source: { egress: "50Mbps", ingress: "45Mbps" } } } +``` + +**C2 — two directed edges, each with per-side values** (extends B2): +```yaml +- data: { id: e0, source: host0, target: router0, directed: true, egress_bandwidth: "50Mbps", ingress_bandwidth: "25Mbps" } +- data: { id: e1, source: router0, target: host0, directed: true, egress_bandwidth: "50Mbps", ingress_bandwidth: "45Mbps" } +``` + +Per-side is where the nested form bloats — C2 (directed edges) stays noticeably flatter +than C1's nested maps, which is worth weighing in the B1-vs-B2 call. + +This is the only place CODES exposes it, and we are not sure how to think of it in a real-world scenario. + +> **Supported** as advanced fields (so the format can fully express simplep2p's +> matrix), but off the common path — omit them and the compiler uses one rate per +> direction (`egress == ingress`). **Domain feedback wanted:** when, if ever, do you +> use an independent sender-side vs receiver-side rate? It's hard to tie to a physical +> link, so we want to hear the real scenarios.
+ +#### Summary + +| Case | Situation | Config sketch | Status | +|---|---|---|---| +| **A** symmetric | LAN / DC / backbone (full-duplex) | one undirected edge, one value each | **supported — default** | +| **B** per-direction | asymmetric access links (ADSL/cable/sat/cellular) | B1 directional values · or B2 two directed edges | **supported** — form TBD (NetMaestro) | +| **C** per-side | sender vs receiver rate (simplep2p matrix) | C1 nested egress/ingress · or C2 directed edges w/ `egress_*`/`ingress_*` | **supported** — advanced, rare; **form follows B** | + +### 5.4 Groups (`TopologyGroup`) + +A group is an optional set of related nodes (e.g. a site). It is expressed natively +as a Cytoscape compound node: a node is a group when other nodes name it via +`data.parent`. Groups are **reserved** in Phase 3 (parsed, not yet acted on beyond +single-network) and become the seam for **multi-network / multi-site composition** — +the modern replacement for legacy `@annotation` scoping (see §7 of the roadmap). + +### 5.5 Parametric topology source (HPC fabrics) + +Regular HPC fabrics (dragonfly, fattree, torus, slimfly) are **not** drawn +node-by-node. Their connectivity is fully determined by a handful of **shape +parameters**, so they use a different topology *source*: `format: parametric`. +Instead of `elements:` (an explicit node/edge graph), the block carries a single +**fabric** description. + +```yaml +topology: + format: parametric + fabric: + model: # the fabric's network model; the topology KIND is fixed by it + shape: { ... } # the model's shape parameters (a sufficient, non-redundant set) + links: { ... } # per-link-CLASS bandwidth / vc_size + routing: { ... } # fabric-global routing + packet_size: ... + chunk_size: ... + connections: { intra: ..., inter: ... 
} # ONLY for file-enumerated models — see below + hosts: + component: # the workload component that runs on every compute-node slot +``` + +**Why a `fabric` block instead of per-node `components:`.** A fabric's parameters +are per **link-class** (`local`/`global`/`cn`, or `link`/`cn` for fattree) and +fabric-global (routing) — they do not attach to individual nodes. And the topology +*kind* is inseparable from the `model`: a fattree layout only exists with the fattree +model; you cannot place arbitrary components into it. So the fabric names its network +`model` directly (from the same catalog `components:` draw from) rather than wrapping +a single-use config in a named component. The **host workload is still a `components:` +entry** — it genuinely repeats across every terminal — referenced by `hosts.component`. + +**Shape: a sufficient, non-redundant input set.** Each model defines a canonical +minimal input set; the user supplies a *sufficient* set and nothing redundant. For +dragonfly-dally the Dally construction derives every count from two numbers: + +```yaml +topology: + format: parametric + fabric: + model: dragonfly-dally + shape: + router_radix: 7 + conn_between_groups: 1 + links: + local: { bandwidth: "2GiBps", vc_size: "16KiB" } + global: { bandwidth: "2GiBps", vc_size: "16KiB" } + cn: { bandwidth: "2GiBps", vc_size: "32KiB" } + routing: { algorithm: minimal, minimal_bias: 1 } + packet_size: "4KiB" + chunk_size: "4KiB" + connections: + intra: "conf/dragonfly-dally/dfdally-72-intra" # current binary files (see below) + inter: "conf/dragonfly-dally/dfdally-72-inter" + hosts: + component: compute_host +``` + +`router_radix: 7, conn_between_groups: 1` is the *entire* shape of the 72-terminal +fabric: `num_routers_per_group = (radix+1)/2 = 4`, `num_cns_per_router = 2`, +`num_global_channels = 2`, `num_groups = 9` all fall out of it (the same derivation the +generator script does). 
Writing those derived counts **as well** is over-specification — +a second value can only agree (noise) or conflict (a silent bug). The compiler +**rejects/warns on redundant or conflicting shape values**. A user may pin a different +*single* knob where it's still sufficient (e.g. `num_groups` instead of `router_radix`), +but never two names for the same quantity. + +Models whose natural inputs are already minimal (fattree, torus, slimfly) have only one +form. Note the link classes are **model-specific** — dragonfly has `local`/`global`/`cn`, +fattree has `link`/`cn` — another reason the fabric is tied to its model: + +```yaml +topology: + format: parametric + fabric: + model: fattree + shape: + num_levels: 3 + switch_count: [32, 32, 16] # per level + switch_radix: [8, 8, 8] + # tapering: 1.0 # advanced; default 1.0 + links: + link: { bandwidth: "12.5GiBps", vc_size: "64KiB" } + cn: { bandwidth: "12.5GiBps", vc_size: "64KiB" } + routing: { algorithm: adaptive } + packet_size: "512B" + chunk_size: "512B" + hosts: + component: compute_host +``` + +**Connectivity generation (current scope).** Of the four families, only the +**dragonfly-custom / -dally / -plus** variants read an explicit wiring from external +**binary connection files** (`intra` / `inter`, directed router→router edge lists); +the others generate connectivity internally from the shape parameters. Near-term CODES +keeps that split unchanged: + +- **Internally-generated** (torus, fattree, slimfly, regular dragonfly): the compiler + emits the shape parameters; the model generates as today. No external files, no + `connections:` block. +- **File-enumerated** (dragonfly-custom/-dally/-plus): the user still produces the + binary files with the existing generator scripts (`scripts/dragonfly-*/`) and + references them via `fabric.connections.{intra,inter}`. The compiler passes the paths + through to the model, which `fread`s them at init as it does today. 
+ +A future **shared generator utility** (shape → Cytoscape elements, runnable *outside* +CODES so NetMaestro can both visualize a fabric and feed the simulation from one source) +is **reserved, not precluded** (§10, §13). It is deliberately out of near-term scope +because HPC is not this SBIR's focus; the parametric format above is designed so that +utility can be added later without changing the user-facing schema. + +**Reducing the file-drift footgun now (recommended, not build-now).** The one real +hazard today is that a file-enumerated model needs the shape counts in *both* the +config *and* the binary files, kept consistent by hand. Cheapest mitigation that +doesn't build the full utility: have the existing generator script **also emit the +`fabric.shape` block** (the counts it already computes) and the `connections:` paths it +just wrote — so a single command produces the binary files *and* a matching fabric +snippet, nothing hand-copied. A compiler-side validation pass (read the files, check the +implied router/group counts against the declared shape) is a cheaper-still backstop. + +--- + +## 6. Jobs & workloads (`jobs:`) + +What a host *runs* is separate from what a host *is*. A component answers "what kind of +endpoint" (`model: nw-lp`); a **job** answers "what workload runs, and where." Jobs are +a distinct top-level block because real runs are **multi-job** — different workloads on +different subsets of nodes (a trace replay beside synthetic background traffic, several +MPI apps co-scheduled) — which a single per-component workload cannot express. + +A **job** = a **workload source** (what runs) + a **placement** (which nodes) + a rank +count. 
+ +```yaml +jobs: + - id: production + workload: + type: dumpi # trace replay + trace: "traces/app1.dumpi" + ranks: 256 + placement: { policy: contiguous } # 256 contiguous compute-node slots + - id: background + workload: + type: synthetic + traffic: uniform + num_messages: 30 + payload_size: "2KiB" + arrival_time: "1us" + ranks: 128 + placement: { policy: random } + qos: 1 # optional priority; maps to num_qos_levels +``` + +### 6.1 The single-workload shortcut + +For the common case — one workload on every endpoint — a component may carry an inline +`workload:` instead, and the compiler desugars it to a single job placed on all of that +component's nodes. This keeps simple configs simple (it is the form used in §11's worked +example): + +```yaml +components: + compute_host: + model: nw-lp + workload: { traffic: uniform, num_messages: 30, payload_size: "2KiB" } + # ≡ jobs: [ { workload: , ranks: , placement: all } ] +``` + +A component carries an inline `workload:` **or** the config has a `jobs:` block — not +both. + +### 6.2 Workload sources + +One `type:` discriminator selects the source; the rest of the block is that source's +parameters (the same knobs the workload generators read today). The catalog: + +| `type:` | Key parameters | Backed by | +|---|---|---| +| `synthetic` | `traffic` (uniform / nearest-neighbor / …), `num_messages`, `arrival_time`, `payload_size` | the synthetic traffic LPs | +| `dumpi` | `trace` (file / prefix) | offline MPI trace replay | +| `swm` | app name + `config` (JSON) | SWM online | +| `union` | `config` | UNION online | +| `darshan` | `log` | Darshan I/O trace | +| `checkpoint` | `checkpoint_sz`, `wr_bw`, `total_checkpoints`, `mtti` | checkpoint/restart synthetic | +| `iomock` / `iolang` | `num_requests` / `request_size` / `type`, or `kernel_meta` | I/O mocks | + +These replace today's `workload_type` PARAMS key plus the scattered trace-file and +command-line plumbing. 
+ +### 6.3 Placement + +`placement` says which compute-node slots a job's ranks occupy. Two forms: + +- **Policy (common):** `{ policy: contiguous | scatter | random }` — the compiler/jobmap + generates the allocation, the way real allocators do (the legacy + `allocation-cont.conf` is the contiguous policy). Large jobs don't enumerate thousands + of ids. +- **Explicit (escape hatch):** `{ nodes: [host3, host7, …] }` — an exact node list, + mirroring the legacy `alloc_file`. + +### 6.4 Phasing + +The `jobs:` **schema is defined now** so the plan is visible and NetMaestro can build +around it, but it lands in stages: + +- **Phase 3 (near-term / SBIR):** the inline-workload shortcut and the `synthetic` + source; simple explicit multi-job. Enough for the WAN / simplep2p work. +- **Later stage:** rich placement policies and trace-driven multi-job at scale — the + compiler generates allocations (the policy forms of §6.3) onto the existing jobmap, + the way the legacy `allocation-cont.conf` / `alloc_file` flow does, rather than + introducing a second allocator. + +--- + +## 7. Surrogate (`surrogate:`) — reserved + +CODES can swap the detailed network model for a fast **surrogate** (an average-latency +or learned predictor) over part of a run. This maps to a top-level `surrogate:` block, +the successor to the legacy `NETWORK_SURROGATE` section: + +```yaml +surrogate: # RESERVED — schema tracks active development + enable: true + predictor: average # average | torch-jit + director_mode: at-fixed-virtual-times + switch_timestamps: [ "10ms", "89ms" ] +``` + +**Status: reserved placeholder.** The surrogate is on the SBIR path and needed soon, but +it is still being taken beyond a prototype, so the field set above is **indicative, not +pinned** — it will firm up as the surrogate productionizes. It is captured here so the +format reserves a home for it; until then a surrogate run may still be configured the +legacy way. 
(There is also an `APPLICATION_SURROGATE` counterpart, similarly reserved.) + +--- + +## 8. Units + +Every dimensioned parameter has a **documented default unit**. A bare number is +interpreted in that unit; a **unit-bearing string** (`"1us"`, `"2KiB"`) overrides it +and is converted. Explicit units are **recommended** — especially for bandwidth, +where there is no safe convention — but not required. Only dimensionless **counts** +(`num_messages`, `num_routers`, repetitions) are inherently unitless. + +| Quantity | Bare number means | Explicit forms | Internal target | +|---|---|---|---| +| latency / time | ns | `"5ms"`, `"10us"`, `"1.5ns"` | ns | +| size | bytes | `"2KiB"`, `"1500B"`, `"4MiB"` | bytes | +| bandwidth | *(units recommended)* | `"100Gbps"`, `"10Gbps"`, `"2.5GBps"` | per-model (CODES mixes GiB/s and MiB/s today) | + +--- + +## 9. Derived values — never written in this file + +The compiler computes these from the models and topology; a hand value would be +wrong, so they are rejected (or ignored with a warning) if present: + +| Value | Derived from | +|---|---| +| `message_size` (ROSS event-blob size) | the size of the models' message union | +| `modelnet_order` | the set of network models present | +| `pe_mem_factor` | defaulted from the run | +| repetition / group counts | the topology and component placement | + +For a **parametric fabric** (§5.5), `modelnet_order` is derived from the fabric +`model`, and the repetition / group / router counts come from the fabric `shape`. (For +the file-enumerated dragonflies the shape counts are genuine *inputs* the model needs — +not derived — and must stay consistent with the connection files; that consistency is +the footgun §5.5's generator-emits-the-shape recommendation removes.) + +> Note: Initially these values will still be accepted in the YAML format while we transition away from writing them by hand. + +--- + +## 10. What the new format covers vs. reserves vs.
defers + +The format replaces the whole legacy `.conf` (LPGROUPS + PARAMS + `@annotation`) plus +the matrix/connection files — but in stages. This is the scope map for the *entire* +config surface, not just topology. + +**Covered now** (Phase 3, hand-authored, verified by equivalence): +- *Enumerated WAN / single-network topology* (simplep2p / simplenet / synthetic): + `id`, `component`, per-node overrides, `source`/`target`, `bandwidth`, `latency` — + symmetric, per-direction, and per-side egress/ingress all supported (§5.3), mapped + onto simplep2p's matrix; component `model`/`type`/params. +- *Parametric HPC fabrics* (§5.5): the `fabric` block — `model`, `shape`, per-class + `links`, `routing`, `packet_size`/`chunk_size`, and (for the file-enumerated + dragonflies) the `connections:` paths to the **existing** binary files. Compiled to + the PARAMS the HPC models already read; generation stays with the current scripts. +- *Single-workload jobs* (§6): the inline-workload shortcut and the `synthetic` source. +- *`@annotation`*: not user-facing — subsumed by components (the compiler may emit + annotations internally to drive the existing mapping). +- *Advanced network knobs* slot in as component/fabric params with no new concept: + `modelnet_scheduler` (incl. `priority` + its sub-options), `num_qos_levels` / + `qos_bandwidth`, multi-rail / multi-plane (`num_rails`, `rail_select`, `tapering`, + `rail_routing`) — prominent-vs-advanced as elsewhere. + +**Defined, lands in a later phase:** +- *Multi-job / trace-driven workloads* (§6.4) → rich placement policies on the + **existing jobmap** (a model-level concern, independent of the LP-mapper / + connectivity work). +- *`parent` / `groups:` (`TopologyGroup`)* → multi-network / multi-site composition — the + modern replacement for cross-cluster `@annotation` scoping (the `forwarder`-bridged + heterogeneous configs live here). 
+- *Storage / I/O models* (`lsm`, `resource`) → additional `ComponentModel`s + (`model: lsm` + params); the abstraction already fits — unaddressed scope, driven by + the I/O workload sources in §6.2. +- *Surrogate* (§7) → reserved `surrogate:` block; schema firms up as it productionizes. + +**Reserved** (parsed/known, not yet acted on): +- the **shared generator → Cytoscape elements utility** (§5.5) — near-term HPC + connectivity comes from the current generators + binary files; **not precluded**. +- the **very-large / Internet-scale** generated topology source. A generator's *output* + is these same Cytoscape elements. +- advanced per-link parameters (VC/buffer overrides, routing weights). + +**Explicitly deferred (a decision, not an oversight):** +- *Output / sampling / instrumentation* (`lp-io`, `cn_sample_file` / `rt_sample_file`, + sampling intervals, ROSS instrumentation) — a thin pass-through for now; unifying + CODES-direct stats vs. the ROSS instrumentation callbacks is its own future work + (roadmap §7.4 / §10). + +--- + +## 11. 
Worked example — simplep2p-style (2 routers, 4 hosts) + +```yaml +schema_version: 1 + +simulation: + end_time: "1ms" + +components: + compute_host: + model: nw-lp + type: host + workload: + traffic: uniform + num_messages: 30 + arrival_time: "1us" + payload_size: "2KiB" + edge_router: + model: simplep2p + type: router + +topology: + format: cytoscape + elements: + nodes: + - data: { id: host0, component: compute_host } + - data: { id: host1, component: compute_host } + - data: { id: host2, component: compute_host } + - data: { id: host3, component: compute_host } + - data: { id: router0, component: edge_router } + - data: { id: router1, component: edge_router } + edges: + - data: { id: e0, source: host0, target: router0, bandwidth: "100Gbps", latency: "1us" } + - data: { id: e1, source: host1, target: router0, bandwidth: "100Gbps", latency: "1us" } + - data: { id: e2, source: host2, target: router1, bandwidth: "100Gbps", latency: "1us" } + - data: { id: e3, source: host3, target: router1, bandwidth: "100Gbps", latency: "1us" } + - data: { id: e4, source: router0, target: router1, bandwidth: "10Gbps", latency: "5ms" } +``` + +This replaces the prototype's YAML + DOT + two matrix files, with no +`node_ids` lists, no `message_size`, and no separately-indexed latency/bandwidth +matrices. + +--- + +## 12. 
Validation (compiler-side) + +- node `id`s are unique; +- every link `source`/`target` resolves to a node; +- every node `component` resolves to a `components:` entry; +- every component `model` is a registered `ComponentModel`; +- required component parameters are present; units parse; +- (warning) a `router`/`switch` with no network link; a `host` with no attachment; +- (rejected/warned) any derived value (§9) written explicitly; +- a component carries an inline `workload:` **xor** the config has a `jobs:` block (§6.1); +- every job `workload.type` is a registered source; each job's `ranks` fit its placement; + an explicit-`nodes` placement resolves to existing nodes of a workload-capable component. + +--- + +## 13. Open items + +- **Units strictness:** lenient (chosen) — every dimensioned parameter has a + documented default unit; a bare number uses it, an explicit unit string overrides. + Explicit units recommended (especially bandwidth). +- **Per-direction link form (§5.3):** all three directionality levels are supported; + **open:** the canonical form for per-direction — B1 (one link, directional values) + vs B2 (two directed edges), or both — a NetMaestro representation call. +- **Group semantics:** exact mapping of `TopologyGroup` → network/annotation scope + (settled when multi-network lands). +- **HPC parametric source (§5.5):** drafted — `fabric` block, sufficient/non-redundant + shape, per-class links, current generators + binary files kept. **Open:** the + per-model canonical input sets (which single knobs are accepted); and the future + **shared generator → elements utility** — *where it runs and in what language* (a + standalone shared lib/service vs. one spec with native NetMaestro + CODES + implementations). Deferred because HPC is not this SBIR's focus. +- **Jobs & workloads (§6):** schema sketched — `jobs:` block (workload source + + placement + ranks), inline-workload shortcut, source taxonomy. 
**Open:** the placement + policy set and its jobmap binding (on the existing jobmap, decoupled from the + LP-mapper / connectivity work — §6.4); per-source parameter + schemas beyond `synthetic`; whether per-job timing (start/pause/stop, the legacy + timer/period files) is a job field or a separate block. +- **Surrogate (§7):** reserved `surrogate:` placeholder; field set indicative. Firms up + as the surrogate moves beyond prototype — needed soon (SBIR), not yet pinned. +- **Model self-description / import:** how a third-party model (separate repo) + advertises its `ComponentType`, parameters (name, type, default, unit, user-facing?) + and connectivity so NetMaestro can ingest it into the catalog. Tracked with the + Phase 4 model-framework work (see roadmap §10). diff --git a/doc/refactor-plan/refactor-roadmap.md b/doc/refactor-plan/refactor-roadmap.md new file mode 100644 index 00000000..8d9d7245 --- /dev/null +++ b/doc/refactor-plan/refactor-roadmap.md @@ -0,0 +1,818 @@ +# CODES Rewrite Roadmap + +**Status:** Planning + +This document is the canonical plan for the staged rewrite of CODES. It captures +*why* the rewrite is happening, the sequence of phases (in dependency/risk order), +the decisions made so far (with rationale), and the open questions still to settle. +It is meant to be read top-to-bottom once, then used as a reference and updated as +phases complete. + +The rewrite is deliberately **waved**: each phase keeps the simulator working, +leans on equivalence tests for safety, and keeps legacy paths alive in parallel +until their replacements are proven. + +--- + +## 1. Background & motivation + +CODES is built on ROSS (a C-based optimistic parallel discrete-event simulator). +Because ROSS is C, CODES started in C, with C++ being inconsistently bolted on in places. +There are two main structural problems that make it difficult not only to add new models, +but also to maintain the code: + +1. 
**Copy-and-edit modeling.** To write a new model, the established workflow is: + find the most similar existing model, copy its file, and edit the copy. The same + logic then lives in many files, so a single bug must be fixed in every copy. +2. **Inconsistent, ad-hoc C++.** Where C++ exists, it is uneven in style and rarely + uses composition/abstraction to remove the duplication above. + +Concrete evidence (paths/lines indicative, as of this writing): + +- **Synthetic traffic drivers** — `src/network-workloads/model-net-synthetic.c`, + `-fattree.c`, `-slimfly.c`, `-dragonfly-all.c` are near-identical. Each + re-implements send/recv counters, `issue_event()`, RNG + reverse-RNG handling, + the kickoff event, and its own `main()`. They differ mostly in the *traffic + pattern* (destination selection). +- **Network models** — `dragonfly.c` (~3,800 lines), `fattree.c`, `torus.c`, + `slimfly.c` each duplicate chunking math, per-network message-list types and + their init/delete, buffer/credit/virtual-channel accounting, statistics, and + bandwidth conversion. `bytes_to_ns` already exists in + `src/networks/model-net/common-net.c` yet is re-copied into individual models. +- **simplep2p.c** even comments a helper as *"more or less copied from + model_net_find_stats"*. + +**Goal:** restructure CODES so shared logic lives in one place (kill the +copy-and-edit duplication), modern C++ is used consistently, and adding a new model +means writing only the parts that are genuinely new — done in safe, verifiable +waves inside the existing CODES repository. + +--- + +## 2. How we sequence the work + +Each candidate piece of work is ranked by four questions: + +1. **Does it gate other work?** Foundational changes (build, configuration, core + APIs, the ROSS interface) land before things built on top, so we don't refactor + twice. +2. **Is it on the SBIR critical path?** The WAN / `simplep2p` + surrogate work has + deadlines; anything blocking it moves up. +3. 
**Blast radius / risk.** Touching core (mapping, the message union, ROSS + integration) destabilizes everything; isolated changes are safe anytime. +4. **Reversibility.** Cheap-to-undo experiments go early to learn; hard-to-undo + commitments (a new config format, a language-wide C→C++ move) need more upfront + agreement. + +Three practices apply throughout: + +- **Parallel legacy paths.** Replacements run alongside what they replace (e.g. the + old `.conf` format) for a couple of releases, with a clear deprecation window. +- **Verify by equivalence.** Every migration is checked by comparing outputs: old + vs. new config, sequential vs. optimistic runs, and refactored vs. original model. +- **Existing prototype code.** Some C++ prototypes already exist in the `digital-twin` + repo (branch `initial-code`) — a config-driven driver (`Orchestrator`), a RapidYAML + `ConfigParser`, YAML/DOT example configs, and a `Mapper`. These are pulled into CODES + **incrementally, in the phase that consumes each** (the config pieces in Phase 3, the + `Mapper`'s connectivity *idea* harvested during the Wave 4 WAN-model work, §8), not + all at once. + +--- + +## 3. Hard constraints from ROSS (shape every phase) + +These are non-negotiable properties of the engine and drive much of the design, +especially the C++ class work (Phase 4): + +1. **ROSS owns LP state and event messages as fixed-size POD blobs** of `state_sz` + bytes (`tw_lptype.state_sz`); it does **not** run C++ constructors on them. + → A polymorphic LP object with a vtable pointer embedded in the state blob is a + footgun (and interacts badly with the `crv_checkpointer` state-compare + machinery in `model-net-lp.c`). +2. **ROSS dispatches via C function pointers** (`init_f`/`event_f`/`revent_f`/…). + → C++ must expose C-compatible callbacks (a trampoline). +3. **Optimistic PDES ⇒ reverse computation.** Every forward event handler needs an + exact undo. 
`tw_bf` bits record which branches were taken; every `tw_rand_*` + draw needs a matching `tw_rand_reverse_unif`; non-recomputable state is saved + into the message and restored in reverse. This is the single biggest source of + fragile, duplicated code. +4. **Messages are fixed-size** (`model_net_wrap_msg` is a union of every network's + message type). Undo data must fit in the message or live in `rc_stack` (heap, + GVT-tracked). No unbounded per-event journals. + +--- + +## 4. Phase overview + +| # | Phase | Gates? | Risk | Summary | +|---|-------|:---:|:---:|---------| +| 1a | Safety net: CI & equivalence | ✅ | low | CI on MRs (GitHub Actions), pinned-clone ROSS built in-CI, equivalence harness, C++17 baseline, unit-test framework | +| 1b | Build hygiene | – | low | CMake modernization (target-based, find_package ROSS, tri-state deps), co-location reorg, portable presets, heavy-deps images, install/export — must **not** block Phase 3; may run in parallel with it | +| 2 | Shared C++ foundation | ✅ | low | Conventions + automated style enforcement (clang-format/clang-tidy in CI) — one idiom for config + classes | +| 3 | YAML config front-end | ◐ | med | Friendly YAML "config compiler" → **existing** mapping (narrow `ConfigVTable` seam); Cytoscape topology + explicit units + custom components; old `.conf` in parallel | +| 4 | C++ class application | – | med | Hybrid base+composition LP classes; Wave 1 synthetic (incl. ROSS trampoline) → Wave 2 `simplep2p` → Wave 3 others → Wave 4 **new WAN model** (routing+congestion) + the explicit-connectivity workstream (§8) | +| 5 | Far future | – | – | Decouple the message union, modernize workload API, surrogate integration cleanup, broader C→C++, performance baseline | + +A cross-cutting **config-driven driver** (the `Orchestrator`, see §11) runs through +all phases as the single entry point. + +--- + +## 5. 
Phase 1 — Build, packaging & CI + +**Goal:** a correctly-engineered CMake build, shareable presets, a pinned ROSS, and +a CI safety net strong enough to protect every later phase. This phase also does a +deliberate **CMake cleanup** — the current build works but uses dated, non-idiomatic +patterns (global flag appends, an `ENV{PKG_CONFIG_PATH}` hack, a legacy MPI module, +no off-switch for found deps) that should be fixed before more is layered on. + +**Phase 1 is split in two** — by the §2 criteria, much of this phase's scope +doesn't actually gate anything downstream, and the SBIR-critical Phase 3 shouldn't +queue behind build hygiene: + +- **1a — the safety net (gates Phase 3):** CI on every MR (Workstream 4), the pinned + ROSS (Workstream 3), the equivalence harness (Workstream 5), the unit-test framework + (Workstream 6), and the C++17 baseline (a one-line change, listed with Workstream 1). +- **1b — build hygiene (gates nothing on the critical path):** the CMake modernization + including the co-location reorg (Workstream 1), portable presets (Workstream 2), + tri-state optional deps, install/export, and the heavy-deps/python2 images. Valuable, + but it may run **in parallel with (or after) Phase 3** rather than in front of it. + +**Decisions made** + +- **Repo structure:** the rewrite happens **entirely in the CODES repo** (§2). +- **Optional-dependency hygiene:** Every optional feature + gets a user-facing `CODES_USE_<FEATURE>` cache variable with values **AUTO / ON / OFF**: + - `AUTO` (default): probe; enable if found, silently disable if not. + - `ON`: probe; **hard error** if not found (so a `full`/CI build fails loudly + instead of silently producing a lesser binary). + - `OFF`: don't probe at all. + + Keep the user-facing string (`CODES_USE_SWM`) separate from the internal resolved + boolean (`USE_SWM`) the build already uses, so downstream code is untouched. + Respect dependency chains (UNION ⇒ SWM ⇒ argobots). 
Applies to SWM, UNION, DUMPI, + RECORDER, TORCH (and the stubbed DARSHAN). Today only TORCH is close to this + (and its `ON` silently downgrades); SWM/UNION/DUMPI can't be turned off when found. +- **Consume ROSS correctly via `find_package(ROSS CONFIG)`.** ROSS now ships a + proper CMake package (`ross/cmake/ROSSConfig.cmake.in` → `ROSSTargets.cmake`, + installed at `<prefix>/lib/cmake/ROSS/`). CODES links the imported ROSS target and + **drops** the current pkg-config probe **and** the `set(ENV{PKG_CONFIG_PATH} …)` + hack. +- **Drop DAMARIS *build* support, keep the code.** ROSS master recently removed the + damaris build path; CODES follows. Remove `DAMARIS_PKG_CONFIG_PATH` and the + commented DAMARIS block; leave the `#ifdef USE_RDAMARIS` code paths in place (never + defined) pending the eventual RISA rewrite. + +**Workstream 1 (1b) — CMake modernization (the cleanup)** + +- **Co-location reorg first.** Execute the big-bang layout move (§6) right after 1a's + CI lands and before the rest of this workstream, so the modern CMake below is written once + against the final `codes/<subsystem>/` tree (and harvested/new C++ lands there from + day one). Purely mechanical (`git mv` + include rewrite); forwarding-header shims for + downstream; time it for low branch activity. +- **Target-based everything.** Replace global `include_directories`, + string-appended `CMAKE_C_FLAGS`, and `add_definitions` with `target_link_libraries` + / `target_include_directories` / `target_compile_definitions` / + `target_compile_options` with correct PUBLIC/PRIVATE/INTERFACE scoping. +- **ROSS** via `find_package(ROSS CONFIG REQUIRED)` + imported target (above). +- **MPI** via `find_package(MPI REQUIRED)` + `MPI::MPI_C` / `MPI::MPI_CXX` imported + targets; delete the legacy BLT `src/cmake/SetupMPI.cmake` and the manual include/lib + plumbing. +- **Optional deps** linked as imported targets (`PkgConfig::SWM`, …) on the `codes` + target, gated by the tri-state resolver — not global flag appends. 
+- **Install + export** a `codesConfig.cmake` (imported `codes::codes` target) so + in-repo consumers (driver, tests) and any external user link CODES the modern way. +- **C++17 baseline** set unconditionally (drop the Torch-only bump), with a quick + compile-clean check of the existing "C with classes" code at 17. +- **Housekeeping:** settle one CMake minimum version — **≥3.23** (needed for + `FILE_SET HEADERS`). Use generator + expressions for build-type logic (e.g. the test gate) instead of string compares; + purge dead autotools `.gitignore` entries and add `install/`, `test/` artifacts. + +**Workstream 2 (1b) — Portable presets** + +- Committed `CMakePresets.json` with environment-independent base presets: `debug` + (tests + RC verifier), `release`, `core` (heavy optional deps OFF), `full` (all ON, + strict). Dependency *locations* come from `$env{…}`. +- Personal/machine paths live in a git-ignored **`CMakeUserPresets.json`** that + `inherits` a base preset. + +**Workstream 3 (1a) — ROSS pinning (decided: pinned clone, built in CI)** + +Pin via a **pinned clone**: each CI job clones ROSS at +a fixed commit/tag, builds + installs it, and CODES consumes it via `find_package`. +ROSS is **built in CI, not baked into an image** — it's quick to build and it's the +pin that changes most often, so baking it would force an image rebuild on every bump. +The **ROSS pin is a workflow variable**; bumping it is a one-line reviewed change with +no image rebuild. Cache the built ROSS keyed on its commit so unchanged pins skip the +rebuild. + +**Workstream 4 (1a; full-matrix images are 1b) — CI (decided: GitHub Actions; ROSS in-CI, heavy deps imaged)** + +- Host: **GitHub Actions**. The **whole build matrix runs on every MR/PR** (and on + push to master) — debug/release × feature sets. +- **Docker images bake most deps** (SWM/UNION/argobots/ + conceptual + Torch), rebuilt rarely (only when those pins change). ROSS is built + in-CI on top (Workstream 3). 
+- **Core** job needs **no custom image** — a stock runner with apt MPI/cmake/ninja + + the in-CI ROSS build covers Phase 3 and Phase 4 waves 1–2. The **full** job uses the + heavy-deps image (plus the same in-CI ROSS build). Derive the full image from + `CODES-compile-instructions.sh`. +- **python2 is confined to the UNION feature** and lives only in the full image: + conceptual (which UNION depends on) needs python2 *at image-build time* for code + generation; SWM-only online and everything else do **not**. Contain + python2 in the image (e.g. `ubuntu:20.04` base, or build 2.7.18 from source) — it + isn't needed at simulation runtime. + +**Workstream 5 (1a) — Equivalence / golden-output harness (decided)** + +Extend the existing determinism tests (`example-ping-pong-determinism.sh`) into the +equivalence checks the migration leans on: +- seq (`--sync=1`) vs. optimistic (`--sync=3`) — proves reverse computation (Phase 4), +- old `.conf` vs. new YAML — proves the config migration (Phase 3), +- refactored vs. original model — proves Phase 4 refactors. + +Two layers of check: +- **Aggregate determinism** — diff `Net Events Processed` (robust, format-stable). + Keep this for **all** models as a cheap fast-path. +- **Per-LP result equivalence** — diff **`lp-io` output** (structured, per-LP, + sortable), normalized and sorted. Use it **now wherever a model supports `lp-io`**. + +Do **not** scrape fragile per-LP **stdout** stats. Models that only print stats today +get **no per-LP equivalence coverage for now** (they still get the aggregate check); +when such a model is rewritten in Phase 4, add a small deterministic **results digest** +to it — the long-term canonical form. So: `lp-io` now → results digest as models are +rewritten; stdout-only models are an explicit, documented coverage gap until then. + +Never diff raw `ross.csv` for seq-vs-optimistic — its engine counters (rollbacks, etc.) +legitimately differ. 
Prefer equivalence (self-comparison) over stored golden files; +keep a few golden snapshots only as a both-paths-drift backstop. + +**Workstream 6 (1a) — Unit-test framework** + +Equivalence testing protects *refactors*; it cannot exercise *new* code's error paths. +Pick a C++ unit-test framework (Catch2 vs GoogleTest — settle at implementation time, +§13) and wire it into CTest + the 1a CI jobs. First real consumer: the Phase 3 config +compiler's validation / derivation / unit-conversion logic — especially the negative +paths (rejecting redundant shape values, bad units, hand-written derived values; +contract §12), which equivalence against a golden `.conf` can never reach. + +--- + +## 6. Phase 2 — Shared C++ foundation + +**Goal:** one C++ idiom used by *both* the config/orchestrator subsystem and the LP +classes, so the two bodies of C++ don't drift apart. + +**Why now (pulled up from the class work):** the config subsystem pulled in during +Phase 3 is already C++ (RapidYAML parser, `Orchestrator`, `Mapper`). If the LP classes +introduce a *different* C++ style, we end up maintaining two idioms. Establishing the +foundation here lets both reuse it. (C++17 itself is set in Phase 1.) + +**Decisions made** + +- **Rich, placement-new'd LP state.** The `make_lptype()` trampoline (§9.2) + placement-news the state object in `init` and runs its destructor in `final`, so the + state blob *is* a properly-constructed C++ object (STL members allowed) — fixing the + current pattern (e.g. `congestion-controller.C` *assigns* `s->output_ports = + set()` into never-constructed, zeroed memory: technically UB). 
Safe because + CODES uses reverse computation (in-place, no state copy); the one path that copies + state is ROSS's opt-in `SEQUENTIAL_ROLLBACK_CHECK` mode (`crv_checkpointer`, in + `ross/core/check-revent/crv-state.h`), for which we supply a **C++-aware + checkpointer** (save = copy-construct, clean = destruct, check = `==`/field-compare) + instead of the default whole-struct byte copy. **Messages stay POD always** (ROSS + `memcpy`s events). +- **Naming: `snake_case`**, matching existing CODES C style (not the CamelCase of the + prototype config code) — so the **harvested `Orchestrator`/`ConfigParser`/… get + renamed to snake_case** when pulled in (Phase 3). +- **File extensions: `.cxx` for C++ sources** (rename the current `.C` files — which + also removes the `.C`/`.c` collision on case-insensitive macOS filesystems; update + the source lists in `src/CMakeLists.txt`). `.c` for C sources; `.h` headers with + `extern "C"` guards where C-includable (as today). +- **Layout: fully co-located, big-bang (decided).** Every header sits beside its + source in one `codes/`-rooted tree (`codes/<subsystem>/<name>.{h,cxx,c}`), so + includes become `#include "codes/<subsystem>/…"`; public headers are designated for + install via CMake `target_sources(FILE_SET HEADERS)` (needs CMake ≥3.23 → the Phase 1 + minimum). Done **all at once**, not incrementally — a layout move is mechanical and + CI-verifiable, and one clean break beats a long two-conventions-at-once period that + smears churn across releases. Executed **in Phase 1b, after 1a's CI is up and before + the target-based CMake rewrite** (so new CMake is written once against the final tree and + all new/harvested C++ lands in the final layout). 
Guardrails: keep the move **purely + mechanical** (`git mv` + include rewrite + CMake path updates — *no* logic changes, + designed target tree decided first); ship **forwarding-header shims** at old + `codes/foo.h` paths (deprecation `#warning`) for a release or two; and **time it for + low in-flight-branch activity** (it touches nearly every file — a rebase grenade + otherwise). It's also the moment to define CODES's **public API surface**. + +**Work** + +- **Automated style enforcement** — `clang-format` enforced in CI plus a starter + `clang-tidy` profile. The diagnosed root cause of the codebase's drift (§1) is + rotating contributors, and a conventions doc alone doesn't survive them — CI gates + do. This is the enforcement arm of the conventions doc below. +- **Conventions doc** — captures the naming/layout/extension rules above; the + **reverse-computation discipline** guide is stubbed now and filled in Phase 4 when + the RC helpers exist. + +The **Layer-0 ROSS trampoline** (§9.2) and the C++-aware `crv_checkpointer` are +*designed* by the decisions above but **implemented in Phase 4 Wave 1, against their +first real consumer** — the same "primitives born from real consumers" principle as +the Layer-1 helpers (§9.2). Building them cold, ahead of any LP, risks an API that +gets redesigned on first contact. + +--- + +## 7. Phase 3 — YAML configuration front-end (the "config compiler") + +**Goal:** introduce a **user-friendly YAML configuration** (the basis for NetMaestro +and wide-area networks) that feeds the **existing `codes_mapping`** — **without** +rewriting mapping yet. The wire-format contract for NetMaestro's editor lives in a +companion doc, [new-config-format.md](new-config-format.md); this section is the +design rationale and the implementation plan. + +### 7.1 Why a new format (not just a nicer `.conf`) + +The current `.conf` (`LPGROUPS` + `PARAMS` + `@annotation`) is HPC-oriented and +**hostile to non-PDES experts**. 
Configuration today is scattered across **four** +places: + +- `.conf` `PARAMS` (e.g. `packet_size`, bandwidths), +- separate files (simplep2p's two **latency/bandwidth matrices**; + dragonfly's intra/inter-group connection files), +- ROSS **command-line options** (synthetic's `traffic`, `num_messages`, + `arrival_time` — `model-net-synthetic.c:156`), +- `#define`s in source (`PAYLOAD_SZ` is `#define 2048` in + `model-net-synthetic.c:20`). + +It also leaks implementation details into the user's file — the worst offender is +`message_size`, which is the **ROSS event-blob size** (`codes_mapping.c:577` → +`tw_define_lps`), not a simulated message size at all. A user should never have to +know implementation details to fill out a config file. + +### 7.2 The lesson from the abandoned prototype (why this is now low-risk) + +The earlier prototype (`digital-twin`, branch `initial-code`) changed **the format and the mapping at the same +time** — it went straight to a graph-based `Mapper` that *replaced* `codes_mapping`, +then spent itself trying to re-derive `codes_mapping`'s +`(group, repetition, lp_type, annotation, offset)` math from a flat DOT graph. The +scars: `offset = 0` hardcoded with FIXMEs, annotations dropped entirely, the +GROUP_RATIO routing path commented out, restricted to a single network type. So +**feeding the existing `codes_mapping` and deferring the connectivity harvest to the +Wave 4 workstream (§8) is the correction**. + +### 7.3 Approach (decided): a config compiler feeding the existing pipeline + +Treat the front-end as a **config compiler**. The friendly YAML is the *source*; it +**compiles down** to the full, explicit internal configuration that `codes_mapping` +and every model already consume — filling in defaults, *deriving* implementation +details, and expanding compact descriptions into the group/rep structure. Downstream +is untouched. 
+ +- **Narrow seam.** The whole legacy query surface — `configuration_get_lpgroups` + (which builds `lpconf`, which `codes_mapping` reads) **and** every model's + `configuration_get_value_*("PARAMS", …, anno)` — funnels through one abstract + interface, `struct ConfigVTable` (`codes/configfile.h:42`); the `.conf` text parser + is just one implementation of it. The compiler's *output* plugs in at that seam (a + programmatically-built config tree / a YAML-backed `ConfigVTable`), so + **`codes_mapping` and the models need no changes**. +- **Two seam edges, bridged (decided).** The seam delivers *strings*; two of the + target models do work *behind* it that the YAML must respect: + - **simplep2p's matrices:** the model receives the matrix file **paths** through the + seam (`configuration_get_value_relpath`, `simplep2p.c:871`) and `fread`s the files + itself. So initially the YAML simply **references the existing matrix files by + path** — the same treatment the parametric fabrics give their `connections:` + binaries (contract §5.5). The friendly per-edge `bandwidth`/`latency` form can + later be compiled down to *emitted* matrix files without touching simplep2p; + whether simplep2p is modernized at all is revisited once the WAN model exists + (it may not be worth it then). + - **The synthetic drivers' parameters** (`traffic`, `num_messages`, `arrival_time`) + are ROSS **command-line options** (`model-net-synthetic.c:156`), not config — + initially they stay CLI. Eventually they + become settable either way, with **CLI taking precedence** over the config file — + a parameter sweep shouldn't require near-identical config copies that differ in + one knob. To keep that reproducible, every run **dumps the fully-resolved config + including CLI overrides**, so a result is always traceable to one complete + configuration. +- **Keep `.conf` in parallel** for a deprecation window (length TBD, §13). 
+- **Verify by equivalence.** Because the YAML is intentionally *not* a 1:1 mirror of + `.conf`, verify two ways: (a) compile the YAML, dump the resolved config tree, and + `cf_equal()` it (`configfile.h:106`) against a golden `.conf` — this checks the + defaulting/derivation directly; and (b) **behavioral** equivalence (Net Events / + `lp-io`) via the Phase 1 harness. + +### 7.4 The friendly schema + +The schema design is settled around one principle: **a knob stays user-facing only +if it is a physical property of the modeled system or a deliberate experimental +variable; anything that is an artifact of how CODES is implemented is derived or +defaulted.** Concretely: + +- **Derived (never written; a hand value would be wrong):** `message_size` (from the + models' message-union size), `modelnet_order` (from the models present), + `pe_mem_factor`, repetition/group counts (from the topology). +- **Parameter with a default (always overridable; prominence varies):** + - *prominent* (physical/experimental): bandwidth & latency, packet/payload size, + network scale & shape, traffic pattern, workload intensity, end time, and + **routing** (defaulted but front-and-center — it is a studied variable). + - *advanced but reachable*: `chunk_size`, VC/buffer sizes, scheduler, low-level + timing/queue internals. +- **Output / instrumentation is dual-owned and a pass-through.** NetMaestro owns + results/visualization in its flow; a direct-run user configures collection + themselves. The schema must be able to **enable `lp-io`** (the Phase 1 equivalence + harness depends on it). Phase 3 does **not** unify CODES-direct stats vs. the ROSS + instrumentation-layer callbacks — that standardization is explicitly deferred + (§10); the output section is a thin pass-through for now. + +- **Topology = Cytoscape elements; one parser.** Topology is expressed in + [Cytoscape.js][cyto] element form (nodes/edges with `data`). 
Because JSON ⊂ YAML, + RapidYAML reads both an editor-exported `.json` and a hand-written `.yaml` — **no + DOT, no `libcgraph`, no matrix files.** Physical quantities are **unit-bearing** + (`"100Gbps"`, `"5ms"`), with a documented default unit when written bare; the + compiler converts. Per-edge `bandwidth`/`latency` **replace the + simplep2p matrices** (`simplep2p.c:315` `parse_mat`); the compiler builds the link + table from edges. +- **Custom components + per-node overrides (this is NetMaestro's model).** A node + references a **custom component** (`ComponentConfig`: a model + configured params), + and may override individual params per node. This *is* the modern replacement for + `@annotation` (see below). +- **Topology source is a first-class choice.** *Enumerated* (Cytoscape elements — + hand-drawn, editor-exported, irregular WAN) is implemented in Phase 3. + *Parametric* (a `fabric` block of shape params — regular HPC fabrics like dragonfly, + fattree, torus, slimfly) is now **specified** as a hand-authored source + (new-config-format.md §5.5): the compiler compiles the fabric's shape + per-class + links + routing down to the PARAMS the existing models read. Crucially, **connectivity + generation stays where it is** — internally-generated models (torus/fattree/slimfly/ + regular dragonfly) generate as today; the file-enumerated dragonflies (custom/dally/ + plus) keep their current external Python generators + binary `intra`/`inter` files, + referenced by path. A **shared generator → elements utility** (so NetMaestro can + *visualize* HPC fabrics and feed the sim from one source) is **future, not + precluded** — deferred because HPC is not the SBIR focus, and the parametric schema + is designed to accept it later unchanged. A generator's *output* is the same + enumerated elements. The very-large WAN generated source remains reserved. 
+ +[cyto]: https://js.cytoscape.org/#notation/elements-json + +### 7.5 Annotations are subsumed by custom components + +The legacy `@annotation` mechanism (canonical example: +`doc/example_heterogeneous/example.conf`) exists to run **one model type with +different parameter sets in different regions of one simulation** — e.g. a `foo` +cluster `simplenet` at 10 Gb/s and a `bar` cluster `simplenet` at 15 Gb/s, the +annotation string selecting which `PARAMS` apply. A **custom component is exactly +this, named and defaulted** (`@foo` ⇒ component `foo_net`). So: + +- Annotations are **not a user-facing concept** in the new format — nobody writes `@`. +- The compiler may **generate annotations internally** (one per custom component that + shares a model) to drive the *existing, annotation-aware* `codes_mapping` for + heterogeneous configs. The legacy annotation machinery thus becomes an **asset**. +- Phase 3's single-network target configs share no model across components, so the + compiler emits **zero** annotations at first; they appear only with multi-network + composition (`TopologyGroup`s / sites), which is deferred to future multi-network + work (the `groups:` seam stays reserved — contract §5.4). + +### 7.6 Harvest & scope + +- **Harvest from `digital-twin` (`initial-code`):** the RapidYAML config parser and + the `Orchestrator` entry point (renamed to `snake_case`, §6). **Decouple the + `Orchestrator` from the prototype `Mapper`** — it must drive `codes_mapping_setup()` + (the compiled config feeding the existing mapping), *not* install ROSS `CUSTOM` + mapping via the prototype `Mapper`. The prototype `Mapper` and its DOT/`node_ids` + data model are **left behind**; the Wave 4 connectivity workstream harvests their + connectivity *idea*, not the code (§8). +- **Scope narrowly:** target `simplep2p` / `simplenet` / synthetic, verified by + equivalence — not the entire legacy API on day one. 
+- **Unit tests for the compiler** (on the Workstream 6 framework): the validation / + derivation / unit-conversion rules, especially the negative paths (contract §12) — + they are not reachable by equivalence against a golden `.conf`. + +--- + +## 8. Explicit connectivity — a Wave 4 workstream + +**Goal:** give mapping a first-class, queryable model of **connectivity** (who is +connected to whom) for irregular topologies, **without** discarding the proven +`(group, repetition, lp_type, annotation, offset)` semantics the existing +`codes_mapping` is built on. This is an **extension of mapping, not a wholesale +replacement** of `codes_mapping`. + +**This is no longer a standalone phase (decided).** It was originally sequenced as its +own phase between the config work and the classes, but its only real consumer is the +Wave 4 WAN model (§9.3): Waves 1–2 need nothing from it — the ESnet 2-site path runs +entirely off the Phase 3 compiled link table — and the HPC models keep deriving their +own neighbors. So the connectivity service is built **inside Wave 4, against the WAN +model's actual queries** — the same "primitives born from real consumers" principle as +the trampoline (§6, §9.2) — and built **incrementally**: the SBIR-minimum queries +first, the rest time-pending. The §8.5 guardrails still apply; see there for how the +service stays verifiable now that it lands in the same wave as its consumer. + +### 8.1 What actually motivates this workstream + +It is **not** the new format. Phase 3 already makes the YAML run by compiling down to +the existing `codes_mapping` at the `ConfigVTable` seam (§7.3), so the format needs no +mapper changes at all. 
The one capability `codes_mapping` genuinely lacks is **explicit +connectivity**: its entire API (`codes/codes_mapping.h`) is projections of the +group/rep/type/anno/offset coordinate system, and **nothing in it answers "who is node +A connected to."** Today connectivity is *implicit* — each network model re-derives its +own neighbors from topology PARAMS (dragonfly's connection files, torus's algorithmic +neighbors, simplep2p's latency/bandwidth matrix). A hand-drawn WAN (Cytoscape elements, +§7.4) has no algorithm to derive neighbors from — the graph *is* the source of truth — +so something must carry "A ↔ B" as first-class data. That gap is the whole reason this +workstream exists. + +### 8.2 Harvest the idea, not the code + +The prototype `Mapper` (`digital-twin`, `initial-code`) proved out exactly this one +thing: an explicit `Node`/edge graph with `GetDestinationLPId` / +`GetDestinationLPCount`. **That connectivity model is what we keep.** But the prototype +gets there by **throwing away** group/rep/annotation/offset — global ids are +DOT-traversal order, `offset` is just an index into a `NodeNames` list, annotations +don't exist, and it is built on `graphviz/cgraph`. That is precisely the §7.2 failure +mode (offset=0 FIXMEs, annotations dropped, GROUP_RATIO commented out, single network +type), and it is now also obsolete: Phase 3 chose Cytoscape elements over DOT, so the +connectivity comes from the Cytoscape graph, **not** `libcgraph`. + +So we **do not** mature the prototype `Mapper` into a drop-in `codes_mapping` +replacement. Doing so would reintroduce those regressions *and* pay the +highest-blast-radius cost in the codebase (every LP and model calls `codes_mapping_*`; +a different API means rewriting every call site) in order to *lose* proven semantics. 
+ +### 8.3 Decision: extend, scoped narrow-first + +Keep `codes_mapping`'s API and its group/rep/annotation/offset semantics intact; **add +an explicit-connectivity layer** (the prototype's `Node`/edge model + `GetDestination*` +queries) **fed from the Cytoscape graph**, alongside the existing mapping. Regular +models keep deriving neighbors as they do; irregular WAN models get to *ask* the mapper +for theirs. + +- **Narrow (do first):** an additive connectivity service **beside** `codes_mapping`, + consumed by the new WAN model (Wave 4) and grown query-by-query as that model needs + them. Tiny blast radius, directly serves the SBIR FABRIC/WAN path, and can be built + and verified behind the Phase 1 equivalence tests on a 2-site config. +- **Broad (defer):** actually routing *all* models through one new mapper and retiring + `codes_mapping`. Only worth it if a model genuinely needs it — and if it ever + happens, group/rep/annotation are **re-implemented** on the new mapper, not discarded. + This is the original "replace `codes_mapping`" idea, demoted to optional/future. + +### 8.4 Parity, not new capability + +The prototype's `MappingSetup` also does engine setup — `pe_mem_factor`, RNG-offset +seeding, `tw_define_lps`, the `g_tw_nRNG_per_lp` bumps, and installing ROSS `CUSTOM` +mapping. `codes_mapping_setup` already does all of this; it is parity work that has to +live somewhere, **not** a reason to switch mappers. + +### 8.5 Guardrails + +Behind the Phase 1 equivalence tests; the old mapping stays available throughout; +verify connectivity queries against a known small topology before any model depends on +them. That verification step is an **explicit early Wave 4 task** — since the service +and its consumer now land in the same wave, it is what keeps WAN-model bugs and +connectivity bugs distinguishable. + +--- + +## 9. Phase 4 — C++ class application (the LP class design) + +This is the most fully-planned phase. 
It removes the copy-and-edit duplication in +the LPs themselves. It comes after the config work because the LP classes consume +configuration, and doing it earlier would mean building against an API that's about +to change. + +### 9.1 Chosen approach + +- **Variation → Hybrid: thin concrete base + composition.** A base/framework class + owns the shared LP lifecycle and ROSS glue; the parts that actually differ + (routing, topology, traffic pattern) are injected as **stateless strategy + singletons referenced by pointer**. This is the natural C++ formalization of a + pattern CODES already uses — `struct model_net_method` is effectively a strategy + vtable. LP state objects stay **non-virtual / POD-friendly** (no vtable in the + blob); only the shared strategy singletons are polymorphic. Templates/CRTP are + held in reserve as a surgical optimization for a single proven-hot tiny function + (per-hop routing) *if* profiling ever demands it — not a default. (Rationale: the + dispatch cost equals the C function-pointer call ROSS already pays; the real cost + of CRTP is readability/onboarding for rotating contributors.) + +- **Reverse computation → targeted helpers, not full automation.** Keep reverse + handlers hand-written but *safe by construction*: + 1. **`ReversibleRng`** — wraps the RNG, counts draws, auto-reverses them + (eliminates hand-counted `tw_rand_reverse_unif`). + 2. **RC verification harness** — debug-only snapshot → forward → reverse → + assert-equal per event, built on the existing `crv_checkpointer` seams; zero + release cost. *This is what makes hand-written reverse handlers safe.* + 3. **`RcStack`** — RAII wrapper over the existing `rc_stack`. + 4. **`RcJournal`** (typed field save/restore in the message) — **prototype only**, + adopted selectively where it clearly wins (bounded by fixed message size). 
+ Explicitly **avoided:** mandating the journal everywhere, and switching to full + state-saving rollback (both trade away the performance reverse computation buys). + +### 9.2 Class architecture + +**Layer 0 — ROSS adapter (trampoline).** `make_lptype()` + callback trampolines +(header-only): generates a `tw_lptype` whose C function pointers cast `void* sv` → +`T*` and call fixed methods; does **placement-new in init** and **destructor in +final** so C++ members in the state object are correctly constructed/destroyed. LP +classes are concrete and `final` (no vtable in state). + +**Layer 1 — Reusable primitives** born from real +consumers: +- `ReversibleRng`, `RcStack`, `RcVerifier`, `RcJournal` (prototype) — the RC bundle. +- `WorkloadSource` — thin C++ wrapper over the `codes_workload_method` vtable. +- `NetworkStats` — wraps `mn_stats` + `model_net_find_stats` with `record_send/recv` + and RC twins (consumed from Wave 2). +- `Bandwidth` — one home for `bytes_to_ns` (GiB/s) and `rate_to_ns` (MiB/s) + (Wave 2). +- *(Wave 2/3)* `Chunker`/`Packet`, `MessageQueue`, `VirtualChannelSet`/ + `BufferPool`, `LinkScheduler` (idle-time tracking). + +**Layer 2 — LP role bases** (concrete; hold strategies): +- `WorkloadDrivenLp` *(Wave 1)* — base for app/traffic LPs: lifecycle, counters, + `issue_next_event` (+RC), completion tracking, RC scaffolding; holds a + `TrafficPattern` (synthetic) or `WorkloadSource` (replay) strategy. +- `TrafficPattern` *(Wave 1)* — strategy `choose_destination(...)` (+RC twin); + impls `UniformRandom`, `NearestGroup`, `NearestNeighbor`. +- `NetworkModelLp` + `RoutingStrategy` + `Topology` *(Wave 2/3)* — endpoint/router + lifecycle with injected routing + topology. + +### 9.3 Waves (risk order) + +- **Wave 1 — Synthetic app LPs.** Safe seam: application LPs (`nw-lp`) that only call + `model_net_event`, no network internals touched. Proves the trampoline, the hybrid + base+strategy, the RC bundle, and the C++17 build end-to-end on a small target. 
+- **Wave 2 — `simplep2p`** (prioritized). Refactor onto Layer-1 network primitives + (`NetworkStats`, `Bandwidth`, `LinkScheduler` idle-tracking; its `*_saved` fields are + the first `RcJournal` candidate), validated against the current model's output as a + golden test. Designs those primitives against a real consumer. Now consumes the new + YAML. **simplep2p's role:** it is an abstract direct-link delay model — *no runtime + routing*; it delivers source→dest in one hop using a pre-set per-pair + latency/bandwidth table (proven in `simplep2p.c:670`, `handle_msg_start_event`: it + reads the direct `(src,dest)` cell and aborts if absent — it never transits an + intermediate node). That is **exactly enough for the SBIR's actual deliverable — the + ESnet testbed (2 sites, one router ≈ a point-to-point link)** — and it + bootstraps the config compiler + the C++ architecture. It is **not** a general WAN + model (see Wave 4). +- **Wave 3 — remaining network models one at a time** (dragonfly/fattree/torus/ + slimfly) as time allows, onto shared primitives + `RoutingStrategy`. Biggest + line-count payoff; lower current priority. +- **Wave 4 — a new WAN model (routing + congestion)** — the real SBIR payoff for + FABRIC and other WANs. Beyond the ESnet testbed, simplep2p's static per-pair pipes + cannot model routing, congestion, loss, or programmable behavior — the things WAN + research is about (and pre-computing an all-pairs matrix over a full network is both + impractical and scientifically frozen). **This is the concrete justification for the + whole refactor:** the WAN model is built by *composing* the Phase 4 primitives + (`RoutingStrategy`, `Topology`, `VirtualChannelSet`/`BufferPool`, `LinkScheduler`) + instead of copy-pasting a 3,800-line model. 
CODES already does routed, + congestion-aware simulation at scale (the HPC fabric models), so the engine machinery + is proven; the genuinely new part is the **congestion/transport** model — the + Internet is lossy with end-to-end TCP-like control, vs HPC's lossless credit-based + backpressure. Sequenced **after** Waves 1–3 harden the primitives. Wave 4 also + carries the **explicit-connectivity workstream** (§8): the mapper-side service is + designed against this model's actual queries and built incrementally alongside it, + with the §8.5 verify-against-a-known-topology step done early. A staged + simple→sophisticated congestion ladder, each rung useful on its own: + 1. **per-link queueing, open-loop** — packets route hop-by-hop and queue at busy + links → delay under load. Reuses the existing per-port queue/buffer machinery and + reverse-computes cleanly (local state). Already far beyond simplep2p. *(queueing + congestion, not yet transport dynamics)* + 2. **finite buffers + drop** (drop-tail / simple AQM) — models loss, the defining + Internet behavior HPC lacks. Still open-loop. + 3. **closed-loop transport** (TCP-like AIMD/CUBIC, RTT, retransmit) — real flow + dynamics (fairness, sawtooth, incast). The big new piece; **no HPC analog**. + 4. **programmable routing/protocols** for FABRIC-style custom behavior. + Rungs 1–2 ride on the Phase 4 primitives; rung 3 is where the new modeling + concentrates. Open design questions in §13. + +### 9.4 Wave 1 concrete steps + +1. **Build:** confirm `CMAKE_CXX_STANDARD 17`; add new C++ sources to the `codes` + library in `src/CMakeLists.txt`. +2. **Layer 0:** `codes/cpp/ross_lp.h` — `make_lptype()` + trampolines + + placement-new/dtor lifecycle (designed by the Phase 2 decisions, implemented here + with its first consumer), plus the C++-aware `crv_checkpointer`. +3. **Layer 1 (lean):** `codes/cpp/reversible_rng.h`, `codes/cpp/rc_verifier.h`. 
+ (`RcStack`/`NetworkStats`/`Bandwidth` deferred to Wave 2 — synthetic LPs use + plain int counters.) +4. **Layer 2:** `codes/cpp/workload_driven_lp.h` + `traffic_pattern.h` with the + three patterns. Capture the real variation: destination selection + the small + topology queries each synthetic file hardcodes (e.g. the dragonfly `num_nodes` + formula). +5. **Run under the config-driven driver** (`Orchestrator`, *not* a new `main()`): + register the new C++ LP types with the registry; let the driver instantiate them + + their `TrafficPattern` from config. Retire the per-model `main()` boilerplate. +6. **Pilot then fold in:** port `model-net-synthetic.c` (dragonfly) first; verify; + then re-express `-fattree`/`-slimfly`/`-dragonfly-all` as configurations of the + same code. Keep existing executable names so downstream scripts keep working. + +### 9.5 Reuse (do not reinvent) + +`model_net_event` / `model_net_event_rc2`; `codes_local_latency` / +`codes_local_latency_reverse`; `codes_mapping_*`; `configuration_*`; `rc_stack` + +`crv_checkpointer` seams; `mn_stats` + `model_net_find_stats`; `bytes_to_ns` +(`common-net.c`); `codes_workload_*` vtable. + +### 9.6 Verification + +1. Build with C++17; the synthetic executables still link. +2. Golden output: run `tests/modelnet-test-{dragonfly,fattree,slimfly}-synthetic.sh` + before vs. after; diff per-server send/recv counts and MiB/s. +3. RC correctness: `--sync=1` vs. `--sync=3` must match. +4. RC drift: enable the `RcVerifier` in a debug build on a small run. +5. The headline win: 4 near-duplicate `.c` files collapse to one shared + implementation + small strategy/config files. + +--- + +## 10. Phase 5 — Far future + +- Decouple the `model_net_wrap_msg` union (adding a network currently edits a central + union and sizes every event to the largest member). +- Modernize the workload API (`codes_workload_method`). +- Surrogate / ML integration **cleanup** (`zmqml`, `torch-jit`). 
*Note:* the surrogate's + *config* home is not far-future — it is already reserved in the new format and firms + up near-term on the SBIR path as the surrogate productionizes (contract §7); only the + broader integration cleanup lives here. +- **Establish a performance baseline + tracking** (e.g. events/sec on fixed configs, + recorded in CI). Deliberately deferred: no baseline exists today to regress against, + and the equivalence harness checks outputs, not speed. Capture before/after numbers + once the Wave 2/3 refactors of hot LP code begin in earnest. +- Broader, deliberate C→C++ migration of remaining subsystems. +- **Standardize data collection** onto a single mechanism — CODES-direct stats + (`lp-io`, `counting_*`, sample files) and the ROSS instrumentation-layer callbacks + are duplicative today (called out in Phase 3 as out of scope there). +- **Model self-description / manifest** so a third-party model (in a separate repo) + can advertise its `ComponentType`, parameters (name, type, default, unit, + user-facing?), and connectivity for NetMaestro to **ingest into its catalog**. The + parameter-metadata piece dovetails with the Phase 4 model framework — a model that + declares its parameters serves both the config compiler *and* NetMaestro. + +--- + +## 11. Cross-cutting: the config-driven driver + +An earlier prototyping effort produced a single **config-driven driver** +(`Orchestrator`) that runs any simulation from config + an LP-type registry, +replacing the per-model `main()` file that used to be copied for every new +experiment. It is pulled into CODES with the config work (Phase 3) and retained as +**the** entry point. In Phase 3 it is **decoupled from the prototype `Mapper`** and +drives `codes_mapping_setup()` (the compiled config feeding the existing mapping); +harvesting the prototype `Mapper`'s connectivity model is deferred to the Wave 4 +connectivity workstream (§8). 
+ +It is also the natural plug-in seam for the Phase 4 hybrid design: the driver +constructs an LP plus its injected strategy from the `model:` name + YAML properties. +So Wave 3's "knock off another network model" becomes "register another LP class with +the registry" — never a new driver file again. + +--- + +## 12. Decision log + +| Decision | Rationale | +|---|---| +| Rewrite **in the CODES repo** (not a separate one) | Single build, single source of truth; `digital-twin` was only ever a personal prototype to harvest from | +| Variation via **hybrid (thin base + composition)** | Matches existing `model_net_method` pattern; POD-friendly state; testable; readable for rotating contributors. CRTP held in reserve for proven hot paths only | +| RC bundle: **RNG wrapper + verify harness + RcStack** solid; **RcJournal** prototype | High-value/low-risk; verification makes hand-written reverse handlers safe; avoid perf-killing full state-saving | +| **Incremental** config adapter (YAML → existing mapping) | Mapping is highest-risk; decouple format adoption from the mapping rewrite; keep old `.conf` in parallel | +| Adapter **scoped narrowly** | Bridge only the configs the YAML examples target; verify by equivalence | +| First class wave = **synthetic LPs**; Wave 2 = **simplep2p** | Synthetic = safest proof of the architecture; simplep2p = clean self-contained model that also delivers the SBIR's ESnet-testbed artifact (2 sites ≈ point-to-point) | +| **simplep2p ≠ general WAN model; a new routing+congestion WAN model is Wave 4** | simplep2p has no runtime routing (direct per-pair pipes, `simplep2p.c:670`); fine for the ESnet testbed, but FABRIC/WAN research needs routing, congestion, loss — built by composing Phase 4 primitives. 
The WAN model is the refactor's concrete payoff | +| **C++17** baseline | Low risk; already used with Torch; enables modern idioms | +| Keep the **config-driven driver** | Eliminates per-model `main()` copying; is the plug-in seam for new LP classes | +| Optional deps use **tri-state style** (AUTO/ON/OFF; strict ON errors) | Reproducible configs; can disable a found dep; CI fails loudly when a requested dep is missing | +| Consume ROSS via **`find_package(ROSS CONFIG)`**; drop pkg-config + `ENV{PKG_CONFIG_PATH}` hack | ROSS now correctly exports a CMake package; target-based is the correct idiom | +| Phase 1 includes a **CMake modernization pass** (target-based, modern MPI, install/export) | Current build is dated/non-idiomatic; fix the foundation before layering on | +| **Drop DAMARIS build support, keep the code** | ROSS master removed the damaris build path; RISA rewrite is future — keep `#ifdef USE_RDAMARIS` code, remove only the build machinery | +| ROSS pinned via a **pinned clone built in CI** (not a submodule, not imaged) | ROSS is quick to build and the most-bumped pin — keep it a workflow variable so bumps need no image rebuild (cache by commit) | +| CI = **GitHub Actions**; ROSS in-CI, **heavy deps in Docker images**; full matrix on every MR | Separates the fast-moving pin (ROSS) from slow stable deps (imaged); core job needs no custom image; full builds stay cheap, no nightly tier | +| **python2 confined to the UNION feature** (full image, build-time only) | conceptual needs py2 for codegen; SWM-only online + everything else don't; UNION is optional/off the critical path | +| Equivalence diff: **`lp-io` now → results digest as models are rewritten**; skip fragile stdout | `lp-io` is structured/per-LP; stdout scraping is brittle; stdout-only models keep the aggregate event-count check until they get a digest | +| **Rich, placement-new'd LP state** (trampoline constructs/destructs; C++-aware `crv_checkpointer`) | Fixes current UB (assigning into 
unconstructed STL members); clean "state is the object" model; safe under reverse computation | +| C++ **snake_case** names, **`.cxx`** sources | Match existing CODES C style; `.cxx` avoids `.C`/`.c` clash on case-insensitive filesystems; harvested CamelCase config code gets renamed | +| **Fully co-located header+impl, big-bang reorg** (in Phase 1, after CI) | One clean break beats incremental smear; mechanical + CI-verifiable; mechanical-only + forwarding shims + timed for low branch activity; defines the public API surface | +| Phase 3 front-end is a **config compiler** (friendly YAML → resolved internal config → existing pipeline) | YAML intentionally omits implementation details; downstream `codes_mapping`/models unchanged; verify via compiled-tree `cf_equal` + behavioral equivalence | +| Plug in at the **`ConfigVTable` seam**; defer **connectivity** to the Wave 4 workstream | The prototype failed by changing format **and** mapping together; the seam keeps blast radius minimal and reuses the whole existing query pipeline | +| Explicit connectivity = **extend mapping** (harvest the prototype's graph idea), **not** replace `codes_mapping`; additive, narrow-first | The format already runs via Phase 3; the only real gap is explicit connectivity for irregular WANs. `codes_mapping` has no connectivity API; models derive neighbors themselves. Maturing the prototype wholesale would re-drop group/rep/annotation (§7.2) and force an every-call-site migration to *lose* proven semantics. An additive connectivity service beside `codes_mapping` keeps blast radius low; full replacement demoted to optional/future | +| The connectivity work is a **Wave 4 workstream, not a standalone phase** (the former mapping phase is dissolved) | Its only real consumer is the Wave 4 WAN model — Waves 1–2 run off the Phase 3 compiled link table and HPC models derive their own neighbors. 
Designing the query API against its real consumer ("born from real consumers") beats building it in a vacuum, and it can grow incrementally: SBIR-minimum first, rest time-pending | +| **Cytoscape**-compatible topology for WANs (JSON ⊂ YAML, one parser), explicit units, per-edge bw/latency | Established format NetMaestro's editor exports natively; drops DOT/`libcgraph` from the prototype and the error-prone latency/bandwidth matrices | +| **Custom components + per-node overrides**; users never write annotations | Matches NetMaestro's catalog→instance→place flow; subsumes `@annotation` (the compiler may emit annotations internally to drive the existing mapping) | +| Hide engine knobs (`message_size`, `modelnet_order`, `pe_mem_factor`, rep counts) as **derived**; everything else defaulted + overridable | Friendly to non-PDES users; e.g. `message_size` is the ROSS event-blob size, which should be an implementation detail and never a user concept | +| Phase 1 **split: 1a safety net** (CI, ROSS pin, equivalence, C++17, unit-test framework) **gates Phase 3; 1b build hygiene** (CMake modernization + reorg, presets, tri-state, images, install/export) doesn't | The SBIR-critical Phase 3 shouldn't queue behind build hygiene; by the §2 criteria most of 1b gates nothing — it may run in parallel with Phase 3 | +| **Unit-test framework in 1a**; compiler-validation tests are a Phase 3 deliverable | Equivalence protects refactors, not new code — the config compiler's negative paths (redundant shape, bad units, hand-written derived values) need real unit tests | +| **Automated style enforcement in Phase 2** (clang-format CI gate + clang-tidy starter profile) | The diagnosed root cause is style drift from rotating contributors; a conventions doc alone doesn't survive them — CI gates do | +| **Trampoline implemented in Wave 1**, not Phase 2 (design decisions stay in Phase 2) | Born from its first real consumer; built cold it risks redesign on first contact | +| simplep2p YAML initially 
**references the existing matrix files by path** | The seam delivers paths and the model `fread`s them itself (`simplep2p.c:871`); zero model changes. The per-edge form can later compile to *emitted* matrix files; whether to modernize simplep2p at all is revisited once the WAN model exists | +| Synthetic params stay **CLI-only initially → later config-or-CLI, CLI takes precedence**; every run dumps the resolved config incl. overrides | They're ROSS options today, not config; parameter sweeps shouldn't need near-identical config copies; the resolved-config dump keeps every result traceable | +| **Job placement (jobmap) ≠ LP mapping** — the format drives the existing `codes-jobmap` as-is | Placing a job's ranks on *simulated* nodes is a model concern; placing/routing LPs is an engine concern. Multi-job support doesn't wait on the connectivity/mapper work (contract §6.4) | +| **Performance baseline deferred** to far future (§10) | No baseline exists today to regress against; recorded as a decision, not an oversight | + +--- + +## 13. Open questions + +- **Config-format specifics** (Phase 3): largely settled — see §7 and + [new-config-format.md](new-config-format.md). + **Remaining:** the per-model canonical input sets; placement-policy set + jobmap + binding; per-source workload schemas beyond `synthetic`; the future **shared generator + → elements utility** (where it runs / what language) and NetMaestro HPC visualization; + the **very-large** generated source; the wire-format micro-decisions in + new-config-format.md §13. +- **Unit-test framework choice** — Catch2 vs GoogleTest (Phase 1a, Workstream 6). +- **Deprecation window** for the old `.conf` (how many releases?). 
\ No newline at end of file From e6a2d9e6f6acdf2a18283b295c84f615288e0850 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Mon, 22 Jun 2026 16:02:14 -0500 Subject: [PATCH 02/20] update refactor roadmap --- doc/refactor-plan/refactor-roadmap.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/refactor-plan/refactor-roadmap.md b/doc/refactor-plan/refactor-roadmap.md index 8d9d7245..1cb989b9 100644 --- a/doc/refactor-plan/refactor-roadmap.md +++ b/doc/refactor-plan/refactor-roadmap.md @@ -313,6 +313,17 @@ foundation here lets both reuse it. (C++17 itself is set in Phase 1.) - **Conventions doc** — captures the naming/layout/extension rules above; the **reverse-computation discipline** guide is stubbed now and filled in Phase 4 when the RC helpers exist. +- **Retire the intrusive-list C macros** (candidate, surfaced in Phase 1 B1). + `codes/quicklist.h` is a Linux-kernel-style intrusive list — `qlist_entry` is + `container_of`, and `typeof(*pos)` recovers the entry's struct type — the classic + template-free, type-generic-C idiom. It predates the C++ work and is `#include`d today + by *both* C and C++ translation units (1 `.C`, 7 `.c`). Phase 1 B1 changed its + `typeof` → `__typeof__` so it survives strict `-std=c++17` (`CMAKE_CXX_EXTENSIONS OFF`); + that's a portable stopgap, **not** the end state. The C++ replacement is a `std::` + container or a small templated intrusive list, done when the owning data structures are + modernized (Phase 4 waves) — at which point `quicklist.h` and the `typeof`/`__typeof__` + question delete themselves. A standalone `typeof` → `decltype` swap isn't worth doing + before then, and `decltype` can't land while the header is still shared with C sources. 
The **Layer-0 ROSS trampoline** (§9.2) and the C++-aware `crv_checkpointer` are *designed* by the decisions above but **implemented in Phase 4 Wave 1, against their From 268170f01321d2c3d24f76155ceefd02394aed8f Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Mon, 22 Jun 2026 14:52:06 -0500 Subject: [PATCH 03/20] ci: add mac build --- .github/workflows/build.yml | 46 ++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 52ab0520..1d9af0b5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,25 +8,37 @@ on: - master schedule: # Daily at 07:00 UTC. Catches drift between pin bumps — runner image - # updates, apt package bumps, and ROSS @ master moving forward. The + # updates, apt/brew package bumps, and ROSS @ master moving forward. The # scheduled run intentionally ignores ROSS_REF and builds against # ROSS-org/ROSS@master so a stale pin still surfaces problems. - cron: "0 7 * * *" -# Bootstrap CI for CODES +# CI for CODES. # -# Today: one minimal Linux job — ubuntu-24.04 + system MPICH + a freshly -# built ROSS pinned to a SHA. Heavy optional deps (SWM, UNION, DUMPI, -# TORCH, ZMQML) stay OFF so this job runs on a stock runner with no -# custom image. macOS, OpenMPI, the compiler matrix, coverage tracking, -# and the heavy-deps "full" job are tracked separately as 1b followups. +# Today: an OS matrix — ubuntu-24.04 (apt MPICH) and macos-14 / Apple Silicon +# (Homebrew MPICH) — each building a freshly built ROSS pinned to a SHA, then +# CODES, then ctest. Heavy optional deps (SWM, UNION, DUMPI, TORCH, ZMQML) +# stay OFF so both legs run on stock runners with no custom image. +# +# The macOS leg is the safety net for Mac-specific link/include breakage, which +# has historically reached master and had to be hand-fixed on a developer Mac. 
+# +# Still deferred (1b follow-ups): OpenMPI alongside MPICH, ubuntu-22.04, a +# gcc/clang compiler matrix, coverage tracking, and a heavy-deps "full" job. # # Symmetric with ROSS's codes-contract.yml: ROSS pins CODES, CODES pins # ROSS, each catches consumer-API regressions in the other. jobs: - build-linux: - runs-on: ubuntu-24.04 + build: + name: build (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + # Don't let one platform's failure cancel the other — we want to see + # both Linux and macOS results on every run. + fail-fast: false + matrix: + os: [ubuntu-24.04, macos-14] env: # ross-org/ROSS @ master as of 2026-06-17. # Bump this SHA whenever CODES needs to track ROSS forward — that @@ -49,7 +61,8 @@ jobs: ref: ${{ github.event_name == 'schedule' && 'master' || env.ROSS_REF }} path: ross - - name: Install system dependencies + - name: Install system dependencies (Linux) + if: runner.os == 'Linux' run: | sudo apt-get update sudo apt-get install -y \ @@ -57,6 +70,16 @@ jobs: cmake ninja-build pkg-config \ flex bison + - name: Install system dependencies (macOS) + if: runner.os == 'macOS' + run: | + brew install mpich cmake ninja pkg-config flex bison + # Homebrew flex/bison are keg-only and the macOS system ones are + # too old for flex_target/bison_target — put the brew versions + # ahead of /usr/bin on PATH for subsequent steps. + echo "$(brew --prefix bison)/bin" >> "$GITHUB_PATH" + echo "$(brew --prefix flex)/bin" >> "$GITHUB_PATH" + - name: Configure ROSS run: > cmake -S ross -B ross/build -G Ninja @@ -88,7 +111,8 @@ jobs: if: failure() uses: actions/upload-artifact@v4 with: - name: build-logs + # Artifact names must be unique across matrix legs (upload-artifact@v4). 
+ name: build-logs-${{ matrix.os }} path: | codes/build/Testing/Temporary/LastTest.log codes/build/Testing/Temporary/LastTestsFailed.log From 5d9ef37f398ee897e17e8df101484071cc216e5e Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Mon, 22 Jun 2026 16:01:45 -0500 Subject: [PATCH 04/20] cmake: set c++17 and turn off compiler extensions for portability --- CMakeLists.txt | 7 +++++-- codes/quicklist.h | 10 +++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 20afe8d1..8521d2cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,8 +11,12 @@ cmake_print_variables(CMAKE_MODULE_PATH) set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}" CACHE PATH "Where to install CODES") -set(CMAKE_CXX_STANDARD 11) +# C++17 is the project-wide baseline (was previously only enabled when Torch +# was found). Required, no compiler extensions, so the build is portable +# across gcc/clang/AppleClang. +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) +set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED True) @@ -123,7 +127,6 @@ endif() if((NOT DEFINED USE_TORCH) OR USE_TORCH) find_package(Torch) if(Torch_FOUND) - set(CMAKE_CXX_STANDARD 17) add_definitions(-DUSE_TORCH) set(USE_TORCH true) message(STATUS "Loading TORCH models enabled.") diff --git a/codes/quicklist.h b/codes/quicklist.h index 440312e5..02a96ba9 100644 --- a/codes/quicklist.h +++ b/codes/quicklist.h @@ -213,8 +213,8 @@ static __inline__ void qlist_splice(struct qlist_head* qlist, struct qlist_head* * @member: the name of the list_struct within the struct. 
*/ #define qlist_for_each_entry(pos, head, member) \ - for (pos = qlist_entry((head)->next, typeof(*pos), member); &pos->member != (head); \ - pos = qlist_entry(pos->member.next, typeof(*pos), member)) + for (pos = qlist_entry((head)->next, __typeof__(*pos), member); &pos->member != (head); \ + pos = qlist_entry(pos->member.next, __typeof__(*pos), member)) /** * qlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry @@ -224,9 +224,9 @@ static __inline__ void qlist_splice(struct qlist_head* qlist, struct qlist_head* * @member: the name of the list_struct within the struct. */ #define qlist_for_each_entry_safe(pos, n, head, member) \ - for (pos = qlist_entry((head)->next, typeof(*pos), member), \ - n = qlist_entry(pos->member.next, typeof(*pos), member); \ - &pos->member != (head); pos = n, n = qlist_entry(n->member.next, typeof(*n), member)) + for (pos = qlist_entry((head)->next, __typeof__(*pos), member), \ + n = qlist_entry(pos->member.next, __typeof__(*pos), member); \ + &pos->member != (head); pos = n, n = qlist_entry(n->member.next, __typeof__(*n), member)) static inline int qlist_exists(struct qlist_head* list, struct qlist_head* qlink) { struct qlist_head* pos; From 2c1b93496f3e9464a8d4271db51ba737b0c0fbc5 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Mon, 22 Jun 2026 16:24:19 -0500 Subject: [PATCH 05/20] cmake: modernize find_package(mpi) --- .github/workflows/build.yml | 2 - CMakeLists.txt | 19 +++------ CODES-compile-instructions.sh | 5 +-- README.md | 7 +++- src/CMakeLists.txt | 6 ++- src/cmake/SetupMPI.cmake | 77 ----------------------------------- 6 files changed, 17 insertions(+), 99 deletions(-) delete mode 100644 src/cmake/SetupMPI.cmake diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1d9af0b5..fddfd0ab 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -97,8 +97,6 @@ jobs: -DBUILD_TESTING=ON -DUSE_TORCH=OFF -DUSE_ZMQML=OFF - -DCMAKE_C_COMPILER=mpicc - 
-DCMAKE_CXX_COMPILER=mpicxx -DROSS_PKG_CONFIG_PATH=$PWD/ross-install/lib/pkgconfig - name: Build CODES diff --git a/CMakeLists.txt b/CMakeLists.txt index 8521d2cb..28a62494 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,9 +6,6 @@ project(codes LANGUAGES C CXX VERSION 2.0) include(CMakePrintHelpers) cmake_print_variables(CMAKE_CURRENT_SOURCE_DIR) -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/src/cmake/") -cmake_print_variables(CMAKE_MODULE_PATH) - set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}" CACHE PATH "Where to install CODES") # C++17 is the project-wide baseline (was previously only enabled when Torch @@ -35,16 +32,12 @@ find_package(PkgConfig REQUIRED) set(ENV{PKG_CONFIG_PATH} "${ROSS_PKG_CONFIG_PATH}:${SWM_PKG_CONFIG_PATH}:${UNION_PKG_CONFIG_PATH}:${ARGOBOTS_PKG_CONFIG_PATH}") pkg_check_modules(ROSS REQUIRED IMPORTED_TARGET ross) -# MPI -include(SetupMPI) -if(MPI_C_FOUND) - include_directories(${MPI_C_INCLUDE_PATH}) - list(APPEND CODES_EXTERNAL_LIBS ${MPI_C_LIBRARIES}) -else(MPI_C_FOUND) - message("WARNING: Could not find MPI!") - message(" Either add an MPI compiler to your path (using modules)") - message(" Or force CMake to build using the correct compiler (`export CC=mpicc`)") -endif(MPI_C_FOUND) +# MPI — modern imported-target discovery. find_package(MPI) provides the +# MPI::MPI_C / MPI::MPI_CXX imported targets, which carry the include dirs, +# compile flags, and link libraries; targets link them directly (see +# src/CMakeLists.txt). No need to set CC=mpicc — FindMPI locates the wrapper +# compiler automatically and derives the flags from it. +find_package(MPI REQUIRED) ## DUMPI diff --git a/CODES-compile-instructions.sh b/CODES-compile-instructions.sh index 23f862ae..bf2ad326 100644 --- a/CODES-compile-instructions.sh +++ b/CODES-compile-instructions.sh @@ -133,8 +133,8 @@ else fi # ---- end CODES CUDA arch autodetection ---- -cmake .. 
-DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DROSS_BUILD_MODELS=ON -DCMAKE_INSTALL_PREFIX="$(realpath ./bin)" \ - -DCMAKE_C_COMPILER=mpicc -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS="-g -Wall" +cmake .. -DROSS_BUILD_MODELS=ON -DCMAKE_INSTALL_PREFIX="$(realpath ./bin)" \ + -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS="-g -Wall" #make VERBOSE=1 make install -j4 err=$? @@ -358,7 +358,6 @@ fi make_args_codes=( -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" - -DCMAKE_CXX_COMPILER=mpicxx -DCMAKE_C_COMPILER=mpicc -DCMAKE_C_FLAGS="-g -Wall" -DCMAKE_CXX_FLAGS="-g -Wall" -DTHREADS_PREFER_PTHREAD_FLAG=ON diff --git a/README.md b/README.md index 92b61cf4..4d909dc3 100644 --- a/README.md +++ b/README.md @@ -53,8 +53,6 @@ cd codes && mkdir build && cd build # 3. Configure with CMake cmake .. \ -DCMAKE_PREFIX_PATH=$HOME/ross \ - -DCMAKE_C_COMPILER=mpicc \ - -DCMAKE_CXX_COMPILER=mpicxx \ -DCMAKE_BUILD_TYPE=Debug \ -DBUILD_TESTING=ON @@ -63,6 +61,11 @@ make -j ctest ``` +MPI is auto-discovered via `find_package(MPI)` — do **not** set `CC=mpicc` or +`-DCMAKE_C_COMPILER=mpicc`. Just make sure an MPI implementation is installed and +its wrapper (`mpicc`) is on your `PATH` (e.g. `module load mpich`). For a +non-standard install, hint with `-DMPI_HOME=/path/to/mpi`. + ## Testing Check your installation with: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 99430538..44fd74dc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -143,8 +143,10 @@ endif() add_library(codes STATIC ${SRCS}) -list(APPEND LIBS_TO_LINK ${MPI_C_LIBRARIES}) -target_include_directories(codes INTERFACE ${MPI_C_INCLUDE_PATH}) +# MPI imported targets carry the include dirs, compile flags, and link libs. +# Linked PUBLIC (via LIBS_TO_LINK below) so codes' own sources and every +# consumer (executables, tests) inherit them. 
+list(APPEND LIBS_TO_LINK MPI::MPI_C MPI::MPI_CXX) # set(LIBS_TO_LINK # PkgConfig::ROSS diff --git a/src/cmake/SetupMPI.cmake b/src/cmake/SetupMPI.cmake deleted file mode 100644 index c257cefa..00000000 --- a/src/cmake/SetupMPI.cmake +++ /dev/null @@ -1,77 +0,0 @@ -############################################################################### -# Copyright (c) 2017, Lawrence Livermore National Security, LLC. -# -# Produced at the Lawrence Livermore National Laboratory -# -# LLNL-CODE-725085 -# -# All rights reserved. -# -# This file is part of BLT. -# -# For additional details, please also read BLT/LICENSE. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the disclaimer below. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the disclaimer (as noted below) in the -# documentation and/or other materials provided with the distribution. -# -# * Neither the name of the LLNS/LLNL nor the names of its contributors may -# be used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, -# LLC, THE U.S. 
DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -############################################################################### - -################################ -# MPI -################################ - -find_package(MPI) -message(STATUS "MPI C Compile Flags: ${MPI_C_COMPILE_FLAGS}") -message(STATUS "MPI C Include Path: ${MPI_C_INCLUDE_PATH}") -message(STATUS "MPI C Link Flags: ${MPI_C_LINK_FLAGS}") -message(STATUS "MPI C Libraries: ${MPI_C_LIBRARIES}") - -message(STATUS "MPI CXX Compile Flags: ${MPI_CXX_COMPILE_FLAGS}") -message(STATUS "MPI CXX Include Path: ${MPI_CXX_INCLUDE_PATH}") -message(STATUS "MPI CXX Link Flags: ${MPI_CXX_LINK_FLAGS}") -message(STATUS "MPI CXX Libraries: ${MPI_CXX_LIBRARIES}") - -message(STATUS "MPI Executable: ${MPIEXEC}") -message(STATUS "MPI Num Proc Flag: ${MPIEXEC_NUMPROC_FLAG}") - - -if (ENABLE_FORTRAN) - # Determine if we should use fortran mpif.h header or fortran mpi module - find_path(mpif_path - NAMES "mpif.h" - PATHS ${MPI_Fortran_INCLUDE_PATH} - NO_DEFAULT_PATH - ) - - if(mpif_path) - set(MPI_Fortran_USE_MPIF ON CACHE PATH "") - message(STATUS "Using MPI Fortran header: mpif.h") - else() - set(MPI_Fortran_USE_MPIF OFF CACHE PATH "") - message(STATUS "Using MPI Fortran module: mpi.mod") - endif() -endif() From 94ecfd1e741e71496068599b7b6072affc5bb743 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Mon, 22 Jun 2026 16:46:07 -0500 Subject: [PATCH 06/20] cmake: find_package(ROSS) --- .github/workflows/build.yml | 2 +- CMakeLists.txt | 13 ++++++++++--- 
src/CMakeLists.txt | 5 ++--- tests/CMakeLists.txt | 4 +++- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fddfd0ab..704a299d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -97,7 +97,7 @@ jobs: -DBUILD_TESTING=ON -DUSE_TORCH=OFF -DUSE_ZMQML=OFF - -DROSS_PKG_CONFIG_PATH=$PWD/ross-install/lib/pkgconfig + -DCMAKE_PREFIX_PATH=$PWD/ross-install - name: Build CODES run: cmake --build codes/build -j diff --git a/CMakeLists.txt b/CMakeLists.txt index 28a62494..3ec9800f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,16 +21,23 @@ set(CMAKE_C_STANDARD_REQUIRED True) #prevent cmake from stripping the runtime path (important if shared libraries are imported) SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) -set(ROSS_PKG_CONFIG_PATH "" CACHE PATH "Where is ROSS PKG_CONFIG is installed?") set(SWM_PKG_CONFIG_PATH "" CACHE PATH "Where is the SWM PKG_CONFIG installed?") set(UNION_PKG_CONFIG_PATH "" CACHE PATH "Where is the Union PKG_CONFIG installed?") set(ARGOBOTS_PKG_CONFIG_PATH "" CACHE PATH "Where is argobots PKG_COPNFIG installed? Necessary for SWM") set(DAMARIS_PKG_CONFIG_PATH "" CACHE PATH "Where is the damaris PKG_CONFIG installed?") +# ROSS — modern CMake-config discovery. find_package(ROSS) resolves +# /lib/cmake/ROSS/ROSSConfig.cmake; point CMAKE_PREFIX_PATH at the +# ROSS install prefix. The ROSS::ROSS imported target carries the include +# dirs, the MPI dependency, and link libraries (linked in src/CMakeLists.txt). +find_package(ROSS CONFIG REQUIRED) + +# PkgConfig still discovers the optional SWM/UNION/ARGOBOTS deps below +# (modernized to imported targets in a later step); their *_PKG_CONFIG_PATH +# cache vars feed PKG_CONFIG_PATH here. 
find_package(PkgConfig REQUIRED) -set(ENV{PKG_CONFIG_PATH} "${ROSS_PKG_CONFIG_PATH}:${SWM_PKG_CONFIG_PATH}:${UNION_PKG_CONFIG_PATH}:${ARGOBOTS_PKG_CONFIG_PATH}") -pkg_check_modules(ROSS REQUIRED IMPORTED_TARGET ross) +set(ENV{PKG_CONFIG_PATH} "${SWM_PKG_CONFIG_PATH}:${UNION_PKG_CONFIG_PATH}:${ARGOBOTS_PKG_CONFIG_PATH}") # MPI — modern imported-target discovery. find_package(MPI) provides the # MPI::MPI_C / MPI::MPI_CXX imported targets, which carry the include dirs, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 44fd74dc..485b43e7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,7 +89,7 @@ list(APPEND SRCS modelconfig/txt_configfile.c ) -list(APPEND LIBS_TO_LINK PkgConfig::ROSS) +list(APPEND LIBS_TO_LINK ROSS::ROSS) if(USE_DUMPI) list(APPEND SRCS workload/methods/codes-dumpi-trace-nw-wrkld.c) @@ -190,7 +190,6 @@ endif() # target_link_libraries(codes PUBLIC PkgConfig::ROSS) target_include_directories(codes PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} - ${ROSS_INCLUDE_DIRS} ${PROJECT_BINARY_DIR} ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/codes @@ -238,7 +237,7 @@ if(USE_ZMQML) endif() foreach(tar IN LISTS CODES_TARGETS) - target_include_directories(${tar} PUBLIC ${CODES_INCLUDE_DIRS} ${ROSS_INCLUDE_DIRS}) + target_include_directories(${tar} PUBLIC ${CODES_INCLUDE_DIRS}) target_link_libraries(${tar} PUBLIC codes ${LIBS_TO_LINK}) if(USE_ZMQML) target_link_libraries(${tar} PUBLIC zmqmlrequester ${ZeroMQ_LIBRARY}) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6390965c..58105aff 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,7 +2,9 @@ enable_testing() configure_file(run-test.sh.in run-test.sh) -include_directories("${ROSS_INCLUDE_DIRS}" "${CODES_SOURCE_DIR}") +# ROSS include dirs come transitively from linking codes (PUBLIC ROSS::ROSS); +# CODES_SOURCE_DIR is kept so test sources resolve in-tree codes headers. 
+include_directories("${CODES_SOURCE_DIR}") # Unfortunatelly, CMake doesn't support iteration of a key-pair structure, # otherwise the following lists could be easily compressed into a single From 466868ba63f7b707f32228c182e91058132b0142 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Mon, 22 Jun 2026 17:07:25 -0500 Subject: [PATCH 07/20] cmake: make the codes build target-based - convert global add_definitions(-DUSE_*) to target_compile_definitions on codes - drop redundant CMAKE_C_FLAGS include appends (already covered by target_include_directories) - executables/tests inherit includes + libs transitively by linking codes --- CMakeLists.txt | 22 ++++++---------------- src/CMakeLists.txt | 27 +++++++++------------------ tests/CMakeLists.txt | 4 ---- 3 files changed, 15 insertions(+), 38 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ec9800f..ca815c7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,9 +56,7 @@ if(NOT DUMPI_LIB) else(DUMPI_LIB) message(STATUS "Undumpi library found ${DUMPI_LIB}") set(DUMPI_INCLUDE "${DUMPI_BUILD_PATH}/include" CACHE PATH "Dumpi library include") - set(DUMPI_CFLAGS "-I${DUMPI_INCLUDE}") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${DUMPI_CFLAGS}") - add_definitions(-DUSE_DUMPI=1) + list(APPEND CODES_COMPILE_DEFINITIONS USE_DUMPI=1) set(USE_DUMPI true) endif() @@ -75,31 +73,24 @@ else(SWM_FOUND) else(ARGOBOTS_FOUND) message(STATUS "Argobots Library Found: ${ARGOBOTS_LIBRARIES}") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARGOBOTS_CFLAGS} -I${ARGOBOTS_INCLUDE}") pkg_get_variable(SWM_DATAROOTDIR swm datarootdir) cmake_print_variables(SWM_DATAROOTDIR) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SWM_CFLAGS} -I${SWM_INCLUDE}") - add_definitions(-DUSE_ONLINE=1) + list(APPEND CODES_COMPILE_DEFINITIONS USE_ONLINE=1) set(USE_ONLINE true) pkg_check_modules(UNION IMPORTED_TARGET union) if(NOT UNION_FOUND) message(STATUS "UNION Library Not Found, SWM-only online workloads enabled") - add_definitions(-DUSE_SWM=1) + list(APPEND 
CODES_COMPILE_DEFINITIONS USE_SWM=1) set(USE_SWM true) else(UNION_FOUND) message(STATUS "UNION Library Found: ${UNION_LIBRARIES}") pkg_get_variable(UNION_DATAROOTDIR union datarootdir) cmake_print_variables(UNION_DATAROOTDIR) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${UNION_INCLUDE}") - foreach(INCLUDE_OPT ${UNION_CFLAGS}) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${INCLUDE_OPT}") - endforeach() - - add_definitions(-DUSE_UNION=1) + list(APPEND CODES_COMPILE_DEFINITIONS USE_UNION=1) set(USE_UNION true) endif() endif() @@ -109,7 +100,7 @@ endif() ## RECORDER option(USE_RECORDER "use recorder io workload" ON) if(USE_RECORDER) - add_definitions(-DUSE_RECORDER=1) + list(APPEND CODES_COMPILE_DEFINITIONS USE_RECORDER=1) endif() ## DARSHAN @@ -127,7 +118,7 @@ endif() if((NOT DEFINED USE_TORCH) OR USE_TORCH) find_package(Torch) if(Torch_FOUND) - add_definitions(-DUSE_TORCH) + list(APPEND CODES_COMPILE_DEFINITIONS USE_TORCH) set(USE_TORCH true) message(STATUS "Loading TORCH models enabled.") else() @@ -138,7 +129,6 @@ else() message(STATUS "Loading TORCH models NOT enabled.") endif() -cmake_print_variables(CMAKE_C_FLAGS) add_subdirectory(src) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 485b43e7..97a177d5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -143,20 +143,16 @@ endif() add_library(codes STATIC ${SRCS}) +# Compile definitions accumulated from optional-dep detection in the top-level +# CMakeLists (USE_DUMPI, USE_ONLINE, USE_SWM, USE_UNION, USE_RECORDER, USE_TORCH). +# PUBLIC so codes' sources and every consumer (executables, tests) see them. +target_compile_definitions(codes PUBLIC ${CODES_COMPILE_DEFINITIONS}) + # MPI imported targets carry the include dirs, compile flags, and link libs. # Linked PUBLIC (via LIBS_TO_LINK below) so codes' own sources and every # consumer (executables, tests) inherit them. 
list(APPEND LIBS_TO_LINK MPI::MPI_C MPI::MPI_CXX) -# set(LIBS_TO_LINK -# PkgConfig::ROSS -# ${DUMPI_LIB} -# PkgConfig::ARGOBOTS -# PkgConfig::SWM -# ) - -#LINK DUMPI -# target_link_libraries(codes PUBLIC ${DUPMI_LIB}) if(USE_DUMPI) target_include_directories(codes PUBLIC ${DUMPI_INCLUDE}) endif() @@ -185,9 +181,6 @@ if(USE_ZMQML) target_compile_definitions(codes PUBLIC USE_ZMQML) endif() -#LINK ROSS -# target_link_libraries(codes PUBLIC #{pkgcfg_lib_ROSS_ROSS}) -# target_link_libraries(codes PUBLIC PkgConfig::ROSS) target_include_directories(codes PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR} @@ -200,9 +193,6 @@ target_include_directories(codes PUBLIC target_link_libraries(codes PUBLIC ${LIBS_TO_LINK}) -get_target_property(CODES_INCLUDE_DIRS codes INCLUDE_DIRECTORIES) -cmake_print_variables(CODES_INCLUDE_DIRS) - add_executable(topology-test networks/model-net/topology-test.c) add_executable(model-net-mpi-replay network-workloads/model-net-mpi-replay.c network-workloads/model-net-mpi-replay-main.c) if(USE_DUMPI) @@ -236,11 +226,12 @@ if(USE_ZMQML) find_library(ZeroMQ_LIBRARY NAMES zmq PATHS ${PC_ZeroMQ_LIBRARY_DIRS}) endif() +# Each executable links codes; its include dirs, MPI, ROSS, and optional-dep +# libs all propagate transitively through codes' PUBLIC usage requirements. 
foreach(tar IN LISTS CODES_TARGETS) - target_include_directories(${tar} PUBLIC ${CODES_INCLUDE_DIRS}) - target_link_libraries(${tar} PUBLIC codes ${LIBS_TO_LINK}) + target_link_libraries(${tar} PRIVATE codes) if(USE_ZMQML) - target_link_libraries(${tar} PUBLIC zmqmlrequester ${ZeroMQ_LIBRARY}) + target_link_libraries(${tar} PRIVATE zmqmlrequester ${ZeroMQ_LIBRARY}) endif() endforeach() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 58105aff..ec0acdeb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,10 +2,6 @@ enable_testing() configure_file(run-test.sh.in run-test.sh) -# ROSS include dirs come transitively from linking codes (PUBLIC ROSS::ROSS); -# CODES_SOURCE_DIR is kept so test sources resolve in-tree codes headers. -include_directories("${CODES_SOURCE_DIR}") - # Unfortunatelly, CMake doesn't support iteration of a key-pair structure, # otherwise the following lists could be easily compressed into a single # list/dictionary/structure. Instead each C file name **MUST** match each From 303cb4465186b019d6646c4e842d161a40c8a2f3 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Tue, 23 Jun 2026 12:48:06 -0500 Subject: [PATCH 08/20] cmake: improve options for finding and using dependencies The various dependencies had some inconsistencies in how they were handled. Some, if found in PATH, could not be turned off. Now all have three possible states: AUTO, ON, OFF. All are AUTO by default, meaning that we will attempt to find them and, if found, build with them. If a dependency is not found, there is no error or warning; we just build without it. If a dependency is set to OFF, it won't be looked for at all, and if a dependency is set to ON and not found, an error will be given. Also changed naming from USE_* to CODES_USE_* to prevent any collisions with other projects that may use CODES.
--- .github/workflows/build.yml | 6 +- CMakeLists.txt | 186 +++++++++++++------ CODES-compile-instructions.sh | 6 +- src/CMakeLists.txt | 34 ++-- src/network-workloads/model-net-mpi-replay.c | 2 +- src/networks/model-net/dragonfly-dally.C | 16 +- 6 files changed, 158 insertions(+), 92 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 704a299d..c837f9a3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,7 +17,7 @@ on: # # Today: an OS matrix — ubuntu-24.04 (apt MPICH) and macos-14 / Apple Silicon # (Homebrew MPICH) — each building a freshly built ROSS pinned to a SHA, then -# CODES, then ctest. Heavy optional deps (SWM, UNION, DUMPI, TORCH, ZMQML) +# CODES, then ctest. Heavy optional deps (SWM, UNION, DUMPI, TORCH, ZEROMQ) # stay OFF so both legs run on stock runners with no custom image. # # The macOS leg is the safety net for Mac-specific link/include breakage, which @@ -95,8 +95,8 @@ jobs: cmake -S codes -B codes/build -G Ninja -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON - -DUSE_TORCH=OFF - -DUSE_ZMQML=OFF + -DCODES_USE_TORCH=OFF + -DCODES_USE_ZEROMQ=OFF -DCMAKE_PREFIX_PATH=$PWD/ross-install - name: Build CODES diff --git a/CMakeLists.txt b/CMakeLists.txt index ca815c7e..a30e507d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,66 +47,125 @@ set(ENV{PKG_CONFIG_PATH} "${SWM_PKG_CONFIG_PATH}:${UNION_PKG_CONFIG_PATH}:${ARGO find_package(MPI REQUIRED) -## DUMPI +# ============================================================================ +# Optional dependencies — tri-state CODES_USE_ (AUTO / ON / OFF) +# ---------------------------------------------------------------------------- +# AUTO : probe; enable if found, disable quietly if not. +# ON : probe; hard error if not found. +# OFF : don't probe; feature disabled. +# Each resolves to an internal USE_ bool (names unchanged) consumed by +# src/CMakeLists.txt and the C sources' #ifdef sites. 
+# ============================================================================ + set(DUMPI_BUILD_PATH "" CACHE PATH "Directory where dumpi include and lib are installed") -find_library(DUMPI_LIB undumpi PATHS ${DUMPI_BUILD_PATH}/lib) -if(NOT DUMPI_LIB) - message(STATUS "Undumpi library not found, DUMPI trace workloads disabled") - unset(USE_DUMPI) -else(DUMPI_LIB) - message(STATUS "Undumpi library found ${DUMPI_LIB}") - set(DUMPI_INCLUDE "${DUMPI_BUILD_PATH}/include" CACHE PATH "Dumpi library include") - list(APPEND CODES_COMPILE_DEFINITIONS USE_DUMPI=1) - set(USE_DUMPI true) -endif() +set(ZEROMQ_BUILD_PATH "" CACHE PATH "Directory containing libzmqmlrequester.so (for CODES_USE_ZEROMQ)") + +set(CODES_USE_DUMPI AUTO CACHE STRING "DUMPI trace workloads (AUTO/ON/OFF)") +set(CODES_USE_SWM AUTO CACHE STRING "SWM online workloads; requires argobots (AUTO/ON/OFF)") +set(CODES_USE_UNION AUTO CACHE STRING "UNION online workloads; implies SWM (AUTO/ON/OFF)") +set(CODES_USE_RECORDER AUTO CACHE STRING "Recorder I/O workload, no external dep (AUTO/ON/OFF)") +set(CODES_USE_TORCH AUTO CACHE STRING "Torch ML models (AUTO/ON/OFF)") +set(CODES_USE_DARSHAN OFF CACHE STRING "Darshan I/O workload — not yet wired (AUTO/ON/OFF)") +set(CODES_USE_ZEROMQ AUTO CACHE STRING "ZeroMQ director-client surrogate; needs ZEROMQ_BUILD_PATH (AUTO/ON/OFF)") +foreach(_dep DUMPI SWM UNION RECORDER TORCH DARSHAN ZEROMQ) + set_property(CACHE CODES_USE_${_dep} PROPERTY STRINGS AUTO ON OFF) +endforeach() + +# Back-compat: the pre-rename option names map onto CODES_USE_* for one +# deprecation cycle. Remove once downstream invocations have migrated. 
+foreach(_old TORCH RECORDER) + if(DEFINED USE_${_old}) + message(WARNING "USE_${_old} is deprecated; use CODES_USE_${_old} (AUTO/ON/OFF).") + if(USE_${_old}) + set(CODES_USE_${_old} ON CACHE STRING "" FORCE) + else() + set(CODES_USE_${_old} OFF CACHE STRING "" FORCE) + endif() + endif() +endforeach() + +# Resolve a tri-state CODES_USE_ against a probe result (found = a path, +# a *_FOUND var, or a bool). Sets the internal ${out_var} bool in the caller. +function(codes_resolve_dep name found out_var) + if(CODES_USE_${name} STREQUAL "OFF") + set(${out_var} FALSE PARENT_SCOPE) + elseif(found) + set(${out_var} TRUE PARENT_SCOPE) + elseif(CODES_USE_${name} STREQUAL "ON") + message(FATAL_ERROR "CODES_USE_${name}=ON but ${name} could not be found/enabled.") + else() + set(${out_var} FALSE PARENT_SCOPE) + endif() +endfunction() -# SWM and UNION (both require ARGOBOTS to function) -pkg_check_modules(SWM IMPORTED_TARGET swm) -if(NOT SWM_FOUND) - message(STATUS "SWM Library Not Found, Online workloads disabled") -else(SWM_FOUND) - message(STATUS "SWM Library Found: ${SWM_LIBRARIES}") - pkg_check_modules(ARGOBOTS REQUIRED IMPORTED_TARGET argobots) - if(NOT ARGOBOTS_FOUND) - message(STATUS "Argobots Library Not Found, Online workloads disabled") +## DUMPI — trace workloads +if(NOT CODES_USE_DUMPI STREQUAL "OFF") + find_library(DUMPI_LIB undumpi PATHS ${DUMPI_BUILD_PATH}/lib) +endif() +codes_resolve_dep(DUMPI "${DUMPI_LIB}" USE_DUMPI) +if(USE_DUMPI) + message(STATUS "DUMPI trace workloads enabled (${DUMPI_LIB})") + set(DUMPI_INCLUDE "${DUMPI_BUILD_PATH}/include" CACHE PATH "Dumpi library include") + list(APPEND CODES_COMPILE_DEFINITIONS USE_DUMPI=1) +else() + message(STATUS "DUMPI trace workloads disabled") +endif() - else(ARGOBOTS_FOUND) - message(STATUS "Argobots Library Found: ${ARGOBOTS_LIBRARIES}") - pkg_get_variable(SWM_DATAROOTDIR swm datarootdir) - cmake_print_variables(SWM_DATAROOTDIR) +## SWM / UNION — online workloads (UNION ⇒ SWM ⇒ argobots) +set(_swm_ok FALSE) 
+if(NOT CODES_USE_SWM STREQUAL "OFF") + pkg_check_modules(SWM IMPORTED_TARGET swm) + if(SWM_FOUND) + pkg_check_modules(ARGOBOTS IMPORTED_TARGET argobots) + if(ARGOBOTS_FOUND) + set(_swm_ok TRUE) + endif() + endif() +endif() +codes_resolve_dep(SWM "${_swm_ok}" USE_ONLINE) - list(APPEND CODES_COMPILE_DEFINITIONS USE_ONLINE=1) - set(USE_ONLINE true) +if(USE_ONLINE) + message(STATUS "SWM online workloads enabled (${SWM_LIBRARIES})") + list(APPEND CODES_COMPILE_DEFINITIONS USE_ONLINE=1) + pkg_get_variable(SWM_DATAROOTDIR swm datarootdir) + set(_union_found FALSE) + if(NOT CODES_USE_UNION STREQUAL "OFF") pkg_check_modules(UNION IMPORTED_TARGET union) - if(NOT UNION_FOUND) - message(STATUS "UNION Library Not Found, SWM-only online workloads enabled") - list(APPEND CODES_COMPILE_DEFINITIONS USE_SWM=1) - set(USE_SWM true) - else(UNION_FOUND) - message(STATUS "UNION Library Found: ${UNION_LIBRARIES}") - pkg_get_variable(UNION_DATAROOTDIR union datarootdir) - cmake_print_variables(UNION_DATAROOTDIR) - - list(APPEND CODES_COMPILE_DEFINITIONS USE_UNION=1) - set(USE_UNION true) - endif() + set(_union_found ${UNION_FOUND}) + endif() + codes_resolve_dep(UNION "${_union_found}" USE_UNION) + if(USE_UNION) + message(STATUS "UNION online workloads enabled (${UNION_LIBRARIES})") + list(APPEND CODES_COMPILE_DEFINITIONS USE_UNION=1) + pkg_get_variable(UNION_DATAROOTDIR union datarootdir) + else() + list(APPEND CODES_COMPILE_DEFINITIONS USE_SWM=1) + set(USE_SWM TRUE) + endif() +else() + message(STATUS "SWM/UNION online workloads disabled") + if(CODES_USE_UNION STREQUAL "ON") + message(FATAL_ERROR "CODES_USE_UNION=ON requires SWM/online (set CODES_USE_SWM and provide SWM+argobots).") endif() endif() -## RECORDER -option(USE_RECORDER "use recorder io workload" ON) +## RECORDER — built-in I/O workload, no external dependency +codes_resolve_dep(RECORDER TRUE USE_RECORDER) if(USE_RECORDER) - list(APPEND CODES_COMPILE_DEFINITIONS USE_RECORDER=1) + list(APPEND CODES_COMPILE_DEFINITIONS 
USE_RECORDER=1) endif() -## DARSHAN + +## DARSHAN — stub: the probe + the commented darshan source in src/ aren't +## wired up yet, so this resolves OFF today (CODES_USE_DARSHAN=ON errors). +codes_resolve_dep(DARSHAN FALSE USE_DARSHAN) -## DAMARIS +## DAMARIS — build support intentionally absent (cleanup tracked separately). # pkg_check_modules(DAMARIS IMPORTED_TARGET) # if(NOT DAMARIS_FOUND) # message(STATUS "DAMARIS Library not found, Damaris disabled") @@ -114,19 +173,36 @@ endif() # set(USE_DAMARIS true) # endif() -## TORCH loading ML models -if((NOT DEFINED USE_TORCH) OR USE_TORCH) - find_package(Torch) - if(Torch_FOUND) - list(APPEND CODES_COMPILE_DEFINITIONS USE_TORCH) - set(USE_TORCH true) - message(STATUS "Loading TORCH models enabled.") - else() - set(USE_TORCH false) - message(STATUS "Torch library not found. Loading TORCH models disabled.") - endif() + +## TORCH — ML models +if(NOT CODES_USE_TORCH STREQUAL "OFF") + find_package(Torch QUIET) +endif() +codes_resolve_dep(TORCH "${Torch_FOUND}" USE_TORCH) +if(USE_TORCH) + message(STATUS "Torch ML models enabled") + list(APPEND CODES_COMPILE_DEFINITIONS USE_TORCH) else() - message(STATUS "Loading TORCH models NOT enabled.") + message(STATUS "Torch ML models disabled") +endif() + + +## ZeroMQ — director-client surrogate (opt-in; needs ZEROMQ_BUILD_PATH). +## "found" = a build path was provided; the imported target + linkage live in +## src/CMakeLists.txt, gated on the internal USE_ZEROMQ bool. +if(CODES_USE_ZEROMQ STREQUAL "ON" AND NOT ZEROMQ_BUILD_PATH) + message(FATAL_ERROR + "CODES_USE_ZEROMQ=ON requires ZEROMQ_BUILD_PATH. 
Build " + "src/surrogate/zmqml/libzmqmlrequester.so first, then reconfigure with " + "-DZEROMQ_BUILD_PATH=.") +endif() +set(_zeromq_found FALSE) +if(ZEROMQ_BUILD_PATH) + set(_zeromq_found TRUE) +endif() +codes_resolve_dep(ZEROMQ "${_zeromq_found}" USE_ZEROMQ) +if(USE_ZEROMQ) + message(STATUS "ZeroMQ director-client surrogate enabled (${ZEROMQ_BUILD_PATH})") endif() add_subdirectory(src) diff --git a/CODES-compile-instructions.sh b/CODES-compile-instructions.sh index bf2ad326..1f285f18 100644 --- a/CODES-compile-instructions.sh +++ b/CODES-compile-instructions.sh @@ -367,7 +367,7 @@ make_args_codes=( -DCMAKE_USE_WIN32_THREADS_INIT=0 -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DCMAKE_INSTALL_PREFIX="$(realpath bin)" - -DZMQML_BUILD_PATH="$(realpath "$CUR_DIR/codes/src/surrogate/zmqml")" + -DZEROMQ_BUILD_PATH="$(realpath "$CUR_DIR/codes/src/surrogate/zmqml")" -DZeroMQ_INCLUDE_DIR=/usr/include -DZeroMQ_LIBRARY=/usr/lib/x86_64-linux-gnu/libzmq.so ) @@ -387,7 +387,7 @@ fi if [ "$torch_enable" = 1 ]; then make_args_codes=( "${make_args_codes[@]}" - -DUSE_TORCH=true + -DCODES_USE_TORCH=ON -DTorch_DIR="${torch_dir}" ) @@ -411,7 +411,7 @@ if [ "$torch_enable" = 1 ]; then ) fi else - make_args_codes=("${make_args_codes[@]}" -DUSE_TORCH=false) + make_args_codes=("${make_args_codes[@]}" -DCODES_USE_TORCH=OFF) fi cmake .. "${make_args_codes[@]}" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 97a177d5..e7baa1ee 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -124,20 +124,10 @@ if(USE_TORCH) list(APPEND LIBS_TO_LINK ${TORCH_LIBRARIES}) endif() -# ZMQML / director-client (opt-in). When USE_ZMQML=ON, callers must -# point ZMQML_BUILD_PATH at a directory containing libzmqmlrequester.so -# (build it via src/surrogate/zmqml/Makefile, or set ZMQML_BUILD_PATH to -# wherever you installed it). 
When OFF (the default), CODES builds with -# no surrogate/director-client linkage; configs that reference -# "dir-nw-lp" will fail at runtime because the LP type isn't registered. -option(USE_ZMQML "Build the director-client + zmqml surrogate integration" OFF) -if(USE_ZMQML) - if(NOT ZMQML_BUILD_PATH) - message(FATAL_ERROR - "USE_ZMQML=ON but ZMQML_BUILD_PATH is unset.\n" - "Build src/surrogate/zmqml/libzmqmlrequester.so first, then " - "reconfigure with -DZMQML_BUILD_PATH=.") - endif() +# ZeroMQ / director-client (opt-in via CODES_USE_ZEROMQ; resolved to the internal +# USE_ZEROMQ bool in the top-level CMakeLists, which also enforces that +# ZEROMQ_BUILD_PATH is set). The imported target + linkage below are gated on it. +if(USE_ZEROMQ) list(APPEND SRCS surrogate/director-client.C) endif() @@ -173,12 +163,12 @@ if(USE_ONLINE) endif() endif() -if(USE_ZMQML) +if(USE_ZEROMQ) add_library(zmqmlrequester SHARED IMPORTED GLOBAL) set_target_properties(zmqmlrequester PROPERTIES - IMPORTED_LOCATION "${ZMQML_BUILD_PATH}/libzmqmlrequester.so" - INTERFACE_INCLUDE_DIRECTORIES "${ZMQML_BUILD_PATH}") - target_compile_definitions(codes PUBLIC USE_ZMQML) + IMPORTED_LOCATION "${ZEROMQ_BUILD_PATH}/libzmqmlrequester.so" + INTERFACE_INCLUDE_DIRECTORIES "${ZEROMQ_BUILD_PATH}") + target_compile_definitions(codes PUBLIC USE_ZEROMQ) endif() target_include_directories(codes PUBLIC @@ -188,7 +178,7 @@ target_include_directories(codes PUBLIC ${PROJECT_SOURCE_DIR}/codes ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/src/modelconfig - $<$:$> + $<$:$> ) target_link_libraries(codes PUBLIC ${LIBS_TO_LINK}) @@ -218,9 +208,9 @@ if(USE_DUMPI) list(APPEND CODES_TARGETS model-net-dumpi-traces-dump) endif() -# ZMQ — only resolved + linked when USE_ZMQML is on; otherwise nothing +# ZMQ — only resolved + linked when USE_ZEROMQ is on; otherwise nothing # in the codes library calls into libzmq. 
-if(USE_ZMQML) +if(USE_ZEROMQ) pkg_check_modules(PC_ZeroMQ QUIET zmq) find_path(ZeroMQ_INCLUDE_DIR NAMES zmq.hpp PATHS ${PC_ZeroMQ_INCLUDE_DIRS}) find_library(ZeroMQ_LIBRARY NAMES zmq PATHS ${PC_ZeroMQ_LIBRARY_DIRS}) @@ -230,7 +220,7 @@ endif() # libs all propagate transitively through codes' PUBLIC usage requirements. foreach(tar IN LISTS CODES_TARGETS) target_link_libraries(${tar} PRIVATE codes) - if(USE_ZMQML) + if(USE_ZEROMQ) target_link_libraries(${tar} PRIVATE zmqmlrequester ${ZeroMQ_LIBRARY}) endif() endforeach() diff --git a/src/network-workloads/model-net-mpi-replay.c b/src/network-workloads/model-net-mpi-replay.c index 95f2c48b..8c7d3988 100644 --- a/src/network-workloads/model-net-mpi-replay.c +++ b/src/network-workloads/model-net-mpi-replay.c @@ -4194,7 +4194,7 @@ int modelnet_mpi_replay(MPI_Comm comm, int* argc, char*** argv) { if (g_st_ev_trace || g_st_model_stats || g_st_use_analysis_lps) nw_lp_register_model(); -#ifdef USE_ZMQML +#ifdef USE_ZEROMQ director_lp_register_model("dir-nw-lp"); #endif diff --git a/src/networks/model-net/dragonfly-dally.C b/src/networks/model-net/dragonfly-dally.C index b4c064cf..49475fc7 100644 --- a/src/networks/model-net/dragonfly-dally.C +++ b/src/networks/model-net/dragonfly-dally.C @@ -47,8 +47,8 @@ /* * Optional ZeroMQ Director requester. * - * These symbols are defined only when CODES is built with USE_ZMQML=ON - * (src/surrogate/director-client.C + libzmqmlrequester). USE_ZMQML is + * These symbols are defined only when CODES is built with USE_ZEROMQ=ON + * (src/surrogate/director-client.C + libzmqmlrequester). USE_ZEROMQ is * all-or-nothing for a given build: src/CMakeLists.txt links libzmqmlrequester * into *every* CODES executable when ON and into none when OFF. So whether the * requester is available is a compile-time fact, not a runtime one — the @@ -56,14 +56,14 @@ * checks could only ever take their "available" branch under ON and their * "null" branch under OFF. 
* - * The declarations and every reference are therefore #ifdef USE_ZMQML-guarded: + * The declarations and every reference are therefore #ifdef USE_ZEROMQ-guarded: * the ON build calls the requester directly, the OFF build compiles in only * the original-PDES fallback and emits no reference to the symbol. (Without * the guard the OFF build fails to link on macOS/Mach-O, where ld64 rejects an * undefined weak symbol with no providing library; Linux/ELF happens to * resolve it to null.) */ -#ifdef USE_ZMQML +#ifdef USE_ZEROMQ extern std::vector zmqml_director_request(const std::string& surrogate_family, const std::string& surrogate_backend, const std::string& operation, @@ -285,7 +285,7 @@ static std::vector dfdally_event_time_director_request_with_latency struct timespec start, finish; clock_gettime(CLOCK_MONOTONIC, &start); -#ifdef USE_ZMQML +#ifdef USE_ZEROMQ ret = zmqml_director_request("event-time", "dragonfly-dally", op, args, bindata); #else ret.push_back("failed"); @@ -296,7 +296,7 @@ static std::vector dfdally_event_time_director_request_with_latency double local_latency_sec = (double)(finish.tv_sec - start.tv_sec) + (double)(finish.tv_nsec - start.tv_nsec) / 1000000000.0; -#ifdef USE_ZMQML +#ifdef USE_ZEROMQ director_record_zmq_latency_stats(label, ret, local_latency_sec); #endif @@ -3569,7 +3569,7 @@ static void dfdally_event_time_zmq_flush(void) { return; } -#ifndef USE_ZMQML +#ifndef USE_ZEROMQ if (dfdally_surrogate_debug_prints) { fprintf(stderr, "[event-time records] zmqml_director_request unavailable; dropping %llu " @@ -3786,7 +3786,7 @@ static double dfdally_event_time_predict_or_original(tw_lp* lp, int current_lp_t std::vector args; args.push_back("1"); -#ifndef USE_ZMQML +#ifndef USE_ZEROMQ if (dfdally_surrogate_debug_prints) { fprintf(stderr, "[event-time inference] zmqml_director_request unavailable; " From e3e5604358ccfd38ea0ca0869c24c21c85b6759d Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Tue, 23 Jun 2026 14:23:53 -0500 Subject: [PATCH 
09/20] Move optional-feature flags into a namespaced codes_config.h Optional-dependency state (DUMPI, ONLINE, SWM, UNION, RECORDER, TORCH, ZEROMQ, DARSHAN) was carried as bare -DUSE_<X> defines added PUBLIC to the codes target, so every one of those names leaked onto the command line of everything that links codes. That approach had two real problems: - The names are generic and unnamespaced. USE_ONLINE / USE_TORCH and the like can collide with whatever a downstream project defines, and USE_ONLINE was already baked into an installed public header (codes-workload.h) -- so the leak was part of our public surface. - #ifdef tests existence, not value. A stray -DUSE_X=0 would *enable* the feature, and any source that forgot the define or typo'd it silently compiled the feature off, with no diagnostic at all. Instead, generate a single namespaced header (codes_config.h) defining CODES_HAVE_<X> to 0/1 for each optional subsystem, install it alongside the public headers, and query it with `#if CODES_HAVE_<X>`. Build CODES's own targets with -Wundef. This is strictly better than the old scheme: feature availability is namespaced (no clashes), it lives in one installed header consumers can actually query instead of an invisible command-line contract, and nothing leaks onto consumer compile lines. The `#if` + -Wundef combination turns the classic preprocessor foot-guns -- forgotten include, typo'd macro, -DFOO=0 -- into compile-time warnings rather than a silently mis-built feature.
--- CMakeLists.txt | 30 +++++++++---- codes/codes-workload.h | 3 +- codes_config.h.cmake.in | 42 +++++++++++-------- src/CMakeLists.txt | 12 +++--- .../model-net-mpi-replay-main.c | 7 ++-- src/network-workloads/model-net-mpi-replay.c | 3 +- src/networks/model-net/dragonfly-dally.C | 21 +++++----- src/surrogate/init.c | 7 ++-- src/workload/codes-workload-dump.c | 5 ++- src/workload/codes-workload.c | 20 ++++----- .../methods/codes-conc-online-comm-wrkld.C | 4 +- 11 files changed, 91 insertions(+), 63 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a30e507d..6bbe947b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,13 @@ set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED True) +# Build CODES's own targets with -Wundef so the generated config-header feature +# macros (CODES_HAVE_, always 0/1 — see codes_config.h.cmake.in) flag a +# forgotten `#include "codes_config.h"` or a typo'd name as a warning instead of +# a silently-disabled feature. Applies only to this project's compiles, not to +# consumers; ROSS/MPI headers arrive via -isystem and are exempt. +add_compile_options(-Wundef) + #prevent cmake from stripping the runtime path (important if shared libraries are imported) SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) @@ -54,7 +61,7 @@ find_package(MPI REQUIRED) # ON : probe; hard error if not found. # OFF : don't probe; feature disabled. # Each resolves to an internal USE_ bool (names unchanged) consumed by -# src/CMakeLists.txt and the C sources' #ifdef sites. +# src/CMakeLists.txt and the C sources' #if sites. 
# ============================================================================ set(DUMPI_BUILD_PATH "" CACHE PATH "Directory where dumpi include and lib are installed") @@ -107,7 +114,6 @@ codes_resolve_dep(DUMPI "${DUMPI_LIB}" USE_DUMPI) if(USE_DUMPI) message(STATUS "DUMPI trace workloads enabled (${DUMPI_LIB})") set(DUMPI_INCLUDE "${DUMPI_BUILD_PATH}/include" CACHE PATH "Dumpi library include") - list(APPEND CODES_COMPILE_DEFINITIONS USE_DUMPI=1) else() message(STATUS "DUMPI trace workloads disabled") endif() @@ -128,7 +134,6 @@ codes_resolve_dep(SWM "${_swm_ok}" USE_ONLINE) if(USE_ONLINE) message(STATUS "SWM online workloads enabled (${SWM_LIBRARIES})") - list(APPEND CODES_COMPILE_DEFINITIONS USE_ONLINE=1) pkg_get_variable(SWM_DATAROOTDIR swm datarootdir) set(_union_found FALSE) @@ -139,10 +144,8 @@ if(USE_ONLINE) codes_resolve_dep(UNION "${_union_found}" USE_UNION) if(USE_UNION) message(STATUS "UNION online workloads enabled (${UNION_LIBRARIES})") - list(APPEND CODES_COMPILE_DEFINITIONS USE_UNION=1) pkg_get_variable(UNION_DATAROOTDIR union datarootdir) else() - list(APPEND CODES_COMPILE_DEFINITIONS USE_SWM=1) set(USE_SWM TRUE) endif() else() @@ -155,9 +158,6 @@ endif() ## RECORDER — built-in I/O workload, no external dependency codes_resolve_dep(RECORDER TRUE USE_RECORDER) -if(USE_RECORDER) - list(APPEND CODES_COMPILE_DEFINITIONS USE_RECORDER=1) -endif() ## DARSHAN — stub: the probe + the commented darshan source in src/ aren't @@ -181,7 +181,6 @@ endif() codes_resolve_dep(TORCH "${Torch_FOUND}" USE_TORCH) if(USE_TORCH) message(STATUS "Torch ML models enabled") - list(APPEND CODES_COMPILE_DEFINITIONS USE_TORCH) else() message(STATUS "Torch ML models disabled") endif() @@ -208,6 +207,19 @@ endif() add_subdirectory(src) +# Feature-availability macros consumed by codes_config.h.cmake.in. CODES_HAVE_ +# mirrors the resolved USE_ bool; #cmakedefine01 always emits the #define, as 0 or 1.
+# This replaces the old -DUSE_ command-line defines: feature state now lives in +# the namespaced, installed codes_config.h instead of leaking onto compile lines. +set(CODES_HAVE_DUMPI ${USE_DUMPI}) +set(CODES_HAVE_ONLINE ${USE_ONLINE}) +set(CODES_HAVE_SWM ${USE_SWM}) +set(CODES_HAVE_UNION ${USE_UNION}) +set(CODES_HAVE_RECORDER ${USE_RECORDER}) +set(CODES_HAVE_TORCH ${USE_TORCH}) +set(CODES_HAVE_ZEROMQ ${USE_ZEROMQ}) +set(CODES_HAVE_DARSHAN ${USE_DARSHAN}) + configure_file(codes_config.h.cmake.in codes_config.h) add_subdirectory(doc/example) diff --git a/codes/codes-workload.h b/codes/codes-workload.h index 1ebb00b5..7988d71e 100644 --- a/codes/codes-workload.h +++ b/codes/codes-workload.h @@ -18,8 +18,9 @@ extern "C" { #include #include "configuration.h" +#include "codes_config.h" -#ifdef USE_ONLINE +#if CODES_HAVE_ONLINE #include "abt.h" #endif #define MAX_NAME_LENGTH_WKLD 512 diff --git a/codes_config.h.cmake.in b/codes_config.h.cmake.in index 1a687a23..8782c586 100644 --- a/codes_config.h.cmake.in +++ b/codes_config.h.cmake.in @@ -1,20 +1,28 @@ - - - -// ross - -// dumpi - -// swm +#ifndef CODES_CONFIG_H +#define CODES_CONFIG_H + +// codes_config.h — generated from codes_config.h.cmake.in by CMake. +// +// Optional-feature availability: CODES_HAVE_ is ALWAYS defined — to 1 if that +// optional subsystem was enabled at configure time (resolved from the +// CODES_USE_ option) and 0 otherwise. Query it with `#if CODES_HAVE_` (not +// `#ifdef`): the value-based form, combined with the -Wundef build flag, turns a +// forgotten `#include "codes_config.h"` or a typo'd name into a compiler warning +// instead of a silently-disabled feature. This is the supported, namespaced way +// for CODES sources and external consumers to query which subsystems a build +// includes — it replaces the old bare -DUSE_ command-line defines. 
+ +#cmakedefine01 CODES_HAVE_DUMPI +#cmakedefine01 CODES_HAVE_ONLINE +#cmakedefine01 CODES_HAVE_SWM +#cmakedefine01 CODES_HAVE_UNION +#cmakedefine01 CODES_HAVE_RECORDER +#cmakedefine01 CODES_HAVE_TORCH +#cmakedefine01 CODES_HAVE_ZEROMQ +#cmakedefine01 CODES_HAVE_DARSHAN + +// Data-install paths used by the SWM / UNION online-workload backends. #define SWM_DATAROOTDIR "${SWM_DATAROOTDIR}" -// union #define UNION_DATADIR "${UNION_DATAROOTDIR}" - -// damaris - - -// darshan - - -// cortex +#endif /* CODES_CONFIG_H */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e7baa1ee..3b2be169 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -133,10 +133,9 @@ endif() add_library(codes STATIC ${SRCS}) -# Compile definitions accumulated from optional-dep detection in the top-level -# CMakeLists (USE_DUMPI, USE_ONLINE, USE_SWM, USE_UNION, USE_RECORDER, USE_TORCH). -# PUBLIC so codes' sources and every consumer (executables, tests) see them. -target_compile_definitions(codes PUBLIC ${CODES_COMPILE_DEFINITIONS}) +# Optional-feature state is exposed through the generated codes_config.h +# (CODES_HAVE_), included by sources that need it — no -DUSE_ compile +# definitions, so nothing leaks onto consumers' command lines. # MPI imported targets carry the include dirs, compile flags, and link libs. # Linked PUBLIC (via LIBS_TO_LINK below) so codes' own sources and every @@ -168,7 +167,6 @@ if(USE_ZEROMQ) set_target_properties(zmqmlrequester PROPERTIES IMPORTED_LOCATION "${ZEROMQ_BUILD_PATH}/libzmqmlrequester.so" INTERFACE_INCLUDE_DIRECTORIES "${ZEROMQ_BUILD_PATH}") - target_compile_definitions(codes PUBLIC USE_ZEROMQ) endif() target_include_directories(codes PUBLIC @@ -234,6 +232,10 @@ endforeach() install(DIRECTORY "${CMAKE_SOURCE_DIR}/codes" DESTINATION include) +# The generated config header is public now — codes-workload.h includes it for +# CODES_HAVE_* feature queries — so install it next to the other codes/ headers. 
+install(FILES "${PROJECT_BINARY_DIR}/codes_config.h" DESTINATION include/codes) + install(TARGETS ${CODES_TARGETS} DESTINATION bin) install(TARGETS codes ARCHIVE DESTINATION lib LIBRARY DESTINATION lib) diff --git a/src/network-workloads/model-net-mpi-replay-main.c b/src/network-workloads/model-net-mpi-replay-main.c index cd494f7a..f73ead64 100644 --- a/src/network-workloads/model-net-mpi-replay-main.c +++ b/src/network-workloads/model-net-mpi-replay-main.c @@ -4,8 +4,9 @@ * */ #include +#include "codes_config.h" -#ifdef USE_ONLINE +#if CODES_HAVE_ONLINE #include #endif @@ -14,7 +15,7 @@ int main(int argc, char** argv) { MPI_Init(&argc, &argv); -#ifdef USE_ONLINE +#if CODES_HAVE_ONLINE ABT_init(argc, argv); #endif // int rank, size; @@ -29,7 +30,7 @@ int main(int argc, char** argv) { modelnet_mpi_replay(MPI_COMM_WORLD, &argc, &argv); int flag; -#ifdef USE_ONLINE +#if CODES_HAVE_ONLINE ABT_finalize(); #endif diff --git a/src/network-workloads/model-net-mpi-replay.c b/src/network-workloads/model-net-mpi-replay.c index 8c7d3988..e93b5efa 100644 --- a/src/network-workloads/model-net-mpi-replay.c +++ b/src/network-workloads/model-net-mpi-replay.c @@ -11,6 +11,7 @@ #include #include "codes/codes-workload.h" #include "codes/codes.h" +#include "codes_config.h" #include "codes/configuration.h" #include "codes/codes_mapping.h" #include "codes/model-net.h" @@ -4194,7 +4195,7 @@ int modelnet_mpi_replay(MPI_Comm comm, int* argc, char*** argv) { if (g_st_ev_trace || g_st_model_stats || g_st_use_analysis_lps) nw_lp_register_model(); -#ifdef USE_ZEROMQ +#if CODES_HAVE_ZEROMQ director_lp_register_model("dir-nw-lp"); #endif diff --git a/src/networks/model-net/dragonfly-dally.C b/src/networks/model-net/dragonfly-dally.C index 49475fc7..de305309 100644 --- a/src/networks/model-net/dragonfly-dally.C +++ b/src/networks/model-net/dragonfly-dally.C @@ -15,6 +15,7 @@ */ #include +#include "codes_config.h" #include "codes/jenkins-hash.h" #include "codes/codes_mapping.h" @@ -22,7 +23,7 @@ 
#include "codes/model-net-method.h" #include "codes/model-net-lp.h" #include "codes/surrogate/init.h" -#ifdef USE_TORCH +#if CODES_HAVE_TORCH #include "codes/surrogate/packet-latency-predictor/torch-jit.h" #endif #include "codes/net/dragonfly-dally.h" @@ -47,8 +48,8 @@ /* * Optional ZeroMQ Director requester. * - * These symbols are defined only when CODES is built with USE_ZEROMQ=ON - * (src/surrogate/director-client.C + libzmqmlrequester). USE_ZEROMQ is + * These symbols are defined only when CODES is built with CODES_HAVE_ZEROMQ=ON + * (src/surrogate/director-client.C + libzmqmlrequester). CODES_HAVE_ZEROMQ is * all-or-nothing for a given build: src/CMakeLists.txt links libzmqmlrequester * into *every* CODES executable when ON and into none when OFF. So whether the * requester is available is a compile-time fact, not a runtime one — the @@ -56,14 +57,14 @@ * checks could only ever take their "available" branch under ON and their * "null" branch under OFF. * - * The declarations and every reference are therefore #ifdef USE_ZEROMQ-guarded: + * The declarations and every reference are therefore #if CODES_HAVE_ZEROMQ-guarded: * the ON build calls the requester directly, the OFF build compiles in only * the original-PDES fallback and emits no reference to the symbol. (Without * the guard the OFF build fails to link on macOS/Mach-O, where ld64 rejects an * undefined weak symbol with no providing library; Linux/ELF happens to * resolve it to null.) 
*/ -#ifdef USE_ZEROMQ +#if CODES_HAVE_ZEROMQ extern std::vector zmqml_director_request(const std::string& surrogate_family, const std::string& surrogate_backend, const std::string& operation, @@ -285,7 +286,7 @@ static std::vector dfdally_event_time_director_request_with_latency struct timespec start, finish; clock_gettime(CLOCK_MONOTONIC, &start); -#ifdef USE_ZEROMQ +#if CODES_HAVE_ZEROMQ ret = zmqml_director_request("event-time", "dragonfly-dally", op, args, bindata); #else ret.push_back("failed"); @@ -296,7 +297,7 @@ static std::vector dfdally_event_time_director_request_with_latency double local_latency_sec = (double)(finish.tv_sec - start.tv_sec) + (double)(finish.tv_nsec - start.tv_nsec) / 1000000000.0; -#ifdef USE_ZEROMQ +#if CODES_HAVE_ZEROMQ director_record_zmq_latency_stats(label, ret, local_latency_sec); #endif @@ -3569,7 +3570,7 @@ static void dfdally_event_time_zmq_flush(void) { return; } -#ifndef USE_ZEROMQ +#if !CODES_HAVE_ZEROMQ if (dfdally_surrogate_debug_prints) { fprintf(stderr, "[event-time records] zmqml_director_request unavailable; dropping %llu " @@ -3786,7 +3787,7 @@ static double dfdally_event_time_predict_or_original(tw_lp* lp, int current_lp_t std::vector args; args.push_back("1"); -#ifndef USE_ZEROMQ +#if !CODES_HAVE_ZEROMQ if (dfdally_surrogate_debug_prints) { fprintf(stderr, "[event-time inference] zmqml_director_request unavailable; " @@ -7595,7 +7596,7 @@ static void router_packet_send(router_state* s, tw_bf* bf, terminal_dally_messag maxd(0.0, s->next_output_available_time[output_port] - cur_entry->msg.this_router_arrival) + propagation_delay; bool router_timing_prediction_used = false; -#ifdef USE_TORCH +#if CODES_HAVE_TORCH if (is_dally_surrogate_on && surrogate_torch_router_timing_model_enabled()) { struct router_timing_prediction_start timing_start = { .router_id = (float)s->router_id, diff --git a/src/surrogate/init.c b/src/surrogate/init.c index e14c09c8..73d59f90 100644 --- a/src/surrogate/init.c +++ b/src/surrogate/init.c 
@@ -1,10 +1,11 @@ +#include "codes_config.h" #include #include #include #include #include -#ifdef USE_TORCH +#if CODES_HAVE_TORCH #include #endif @@ -111,7 +112,7 @@ bool network_surrogate_configure(char const* const anno, struct network_surrogat current_net_predictor = average_latency_predictor(sc->total_terminals); *pl_pred = ¤t_net_predictor; -#ifdef USE_TORCH +#if CODES_HAVE_TORCH } else if (strcmp(latency_pred_name, "torch-jit") == 0) { char torch_jit_mode[MAX_NAME_LENGTH]; torch_jit_mode[0] = '\0'; @@ -171,7 +172,7 @@ bool network_surrogate_configure(char const* const anno, struct network_surrogat tw_error(TW_LOC, "Unknown predictor for packet latency `%s` " "(possibilities include: average" -#ifdef USE_TORCH +#if CODES_HAVE_TORCH ", torch-jit" #endif ")", diff --git a/src/workload/codes-workload-dump.c b/src/workload/codes-workload-dump.c index e17b54d3..7f2f5fe7 100644 --- a/src/workload/codes-workload-dump.c +++ b/src/workload/codes-workload-dump.c @@ -9,6 +9,7 @@ #include #include #include +#include "codes_config.h" #include static char type[128] = {'\0'}; @@ -76,7 +77,7 @@ void usage() { } int main(int argc, char* argv[]) { -#ifdef USE_ONLINE +#if CODES_HAVE_ONLINE ABT_init(argc, argv); #endif int print_stats = 0; @@ -463,7 +464,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "NUM_TESTALLS: %" PRId64 "\n", num_testalls); } -#ifdef USE_ONLINE +#if CODES_HAVE_ONLINE ABT_finalize(); #endif return 0; diff --git a/src/workload/codes-workload.c b/src/workload/codes-workload.c index 7cf37610..754b9b3a 100644 --- a/src/workload/codes-workload.c +++ b/src/workload/codes-workload.c @@ -20,11 +20,11 @@ extern struct codes_workload_method test_workload_method; extern struct codes_workload_method iolang_workload_method; -#ifdef USE_DUMPI +#if CODES_HAVE_DUMPI extern struct codes_workload_method dumpi_trace_workload_method; #endif -#ifdef USE_DARSHAN +#if CODES_HAVE_DARSHAN #if DARSHAN_POSIX_IO extern struct codes_workload_method 
darshan_posix_io_workload_method; #elif DARSHAN_MPI_IO @@ -32,14 +32,14 @@ extern struct codes_workload_method darshan_mpi_io_workload_method; #endif #endif -#ifdef USE_RECORDER +#if CODES_HAVE_RECORDER extern struct codes_workload_method recorder_io_workload_method; #endif -#ifdef USE_SWM +#if CODES_HAVE_SWM extern struct codes_workload_method swm_online_comm_workload_method; #endif -#ifdef USE_UNION +#if CODES_HAVE_UNION extern struct codes_workload_method conc_online_comm_workload_method; #endif @@ -49,11 +49,11 @@ extern struct codes_workload_method iomock_workload_method; static struct codes_workload_method const* method_array_default[] = { &test_workload_method, &iolang_workload_method, -#ifdef USE_DUMPI +#if CODES_HAVE_DUMPI &dumpi_trace_workload_method, #endif -#ifdef USE_DARSHAN +#if CODES_HAVE_DARSHAN /* added by pj: posix and mpi io */ #if DARSHAN_POSIX_IO &darshan_posix_io_workload_method, @@ -63,13 +63,13 @@ static struct codes_workload_method const* method_array_default[] = { #endif #endif -#ifdef USE_SWM +#if CODES_HAVE_SWM &swm_online_comm_workload_method, #endif -#ifdef USE_UNION +#if CODES_HAVE_UNION &conc_online_comm_workload_method, #endif -#ifdef USE_RECORDER +#if CODES_HAVE_RECORDER &recorder_io_workload_method, #endif &checkpoint_workload_method, diff --git a/src/workload/methods/codes-conc-online-comm-wrkld.C b/src/workload/methods/codes-conc-online-comm-wrkld.C index fc579881..f6b7e7d9 100644 --- a/src/workload/methods/codes-conc-online-comm-wrkld.C +++ b/src/workload/methods/codes-conc-online-comm-wrkld.C @@ -23,7 +23,7 @@ #include "codes_config.h" #include "union_util.h" -//#ifdef USE_SWM +//#if CODES_HAVE_SWM #include "lammps.h" #include "nekbone_swm_user_code.h" #include "nearest_neighbor_swm_user_code.h" @@ -959,7 +959,7 @@ void UNION_MPI_Alltoall(const void* sendbuf, int sendcount, UNION_Datatype sendt } -//#ifdef USE_SWM +//#if CODES_HAVE_SWM void SWM_Pass_app_data(struct swm_app_data* app_data) { /* Retreive the shared context state 
*/ From d476cd1db40b3fa08879776f0788bd58e1299c87 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Tue, 23 Jun 2026 14:32:44 -0500 Subject: [PATCH 10/20] cmake: drop damaris build support, keep inert c paths for now --- CMakeLists.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bbe947b..5a28b474 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) set(SWM_PKG_CONFIG_PATH "" CACHE PATH "Where is the SWM PKG_CONFIG installed?") set(UNION_PKG_CONFIG_PATH "" CACHE PATH "Where is the Union PKG_CONFIG installed?") set(ARGOBOTS_PKG_CONFIG_PATH "" CACHE PATH "Where is argobots PKG_COPNFIG installed? Necessary for SWM") -set(DAMARIS_PKG_CONFIG_PATH "" CACHE PATH "Where is the damaris PKG_CONFIG installed?") # ROSS — modern CMake-config discovery. find_package(ROSS) resolves @@ -165,15 +164,6 @@ codes_resolve_dep(RECORDER TRUE USE_RECORDER) codes_resolve_dep(DARSHAN FALSE USE_DARSHAN) -## DAMARIS — build support intentionally absent (cleanup tracked separately). -# pkg_check_modules(DAMARIS IMPORTED_TARGET) -# if(NOT DAMARIS_FOUND) -# message(STATUS "DAMARIS Library not found, Damaris disabled") -# else(DAMARIS_FOUND) -# set(USE_DAMARIS true) -# endif() - - ## TORCH — ML models if(NOT CODES_USE_TORCH STREQUAL "OFF") find_package(Torch QUIET) From 11a54d969f7065194834730b748e46ba604bfc1e Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Tue, 23 Jun 2026 14:58:26 -0500 Subject: [PATCH 11/20] Export a codesConfig.cmake package with a codes::codes target CODES installed a static library and headers but no CMake package, so a downstream model had no way to find_package(codes) -- it had to hard-code CODES's include and library paths (and ROSS's, and MPI's) by hand, or vendor the tree. That is exactly the fragile, manual coupling the rest of this effort has been removing everywhere else. 
This adds a namespaced install(EXPORT) that generates codesConfig.cmake and a codes::codes imported target, mirroring how ROSS already exposes ROSS::ROSS (and how CODES now consumes ROSS). A consumer does find_package(codes REQUIRED) target_link_libraries(mymodel PRIVATE codes::codes) and inherits CODES's public include dirs plus its ROSS / MPI usage requirements transitively -- no bespoke CODES_INCLUDE_DIRS / CODES_LIBRARIES variables, no knowing where ROSS lives. The config re-resolves the dependencies CODES links (find_dependency for ROSS and MPI C/CXX). To keep the exported target relocatable, the library's public include interface moves to BUILD_INTERFACE / INSTALL_INTERFACE generator expressions: in-tree builds see the source/binary dirs, installed consumers get /include and /include/codes, and no build-machine path leaks into the installed codesTargets.cmake. Verified with a standalone find_package(codes) consumer that links codes::codes and compiles a CODES header. --- cmake/codesConfig.cmake.in | 28 ++++++++++++ src/CMakeLists.txt | 89 ++++++++++++++++++++++++++++++-------- 2 files changed, 98 insertions(+), 19 deletions(-) create mode 100644 cmake/codesConfig.cmake.in diff --git a/cmake/codesConfig.cmake.in b/cmake/codesConfig.cmake.in new file mode 100644 index 00000000..f490d661 --- /dev/null +++ b/cmake/codesConfig.cmake.in @@ -0,0 +1,28 @@ +@PACKAGE_INIT@ + +# codesConfig.cmake — CODES package configuration for find_package(codes) +# +# Usage: +# find_package(codes CONFIG REQUIRED) +# target_link_libraries(mymodel PRIVATE codes::codes) +# +# The codes::codes imported target carries the public include dirs and the +# propagated ROSS / MPI usage requirements — there are no legacy +# CODES_LIBRARIES / CODES_INCLUDE_DIRS variables to read. + +include(CMakeFindDependencyMacro) + +# CODES links ROSS (find_package(ROSS CONFIG)) and MPI (C + CXX) PUBLIC, so a +# consumer must be able to resolve both. 
Point CMAKE_PREFIX_PATH at the ROSS +# install prefix the same way the CODES build itself did. +find_dependency(ROSS CONFIG REQUIRED) +find_dependency(MPI REQUIRED COMPONENTS C CXX) + +include("${CMAKE_CURRENT_LIST_DIR}/codesTargets.cmake") + +# NOTE: heavy optional deps a CODES build may have linked (SWM, UNION, DUMPI, +# ZeroMQ) are NOT re-resolved here — a consumer of such a build must supply those +# include/link paths itself. The common deps-off build needs only ROSS + MPI; +# full optional-dep re-export is deferred. + +check_required_components(codes) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3b2be169..3ec8a286 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -142,23 +142,27 @@ add_library(codes STATIC ${SRCS}) # consumer (executables, tests) inherit them. list(APPEND LIBS_TO_LINK MPI::MPI_C MPI::MPI_CXX) +# Optional-dep include paths are absolute build-machine paths, so they are +# BUILD_INTERFACE-only — they keep install(EXPORT) relocatable. (Re-exposing +# them to installed consumers of a heavy-deps build is deferred; see +# cmake/codesConfig.cmake.in.) 
if(USE_DUMPI) - target_include_directories(codes PUBLIC ${DUMPI_INCLUDE}) + target_include_directories(codes PUBLIC $) endif() #LINK ARGOBOTS, SWM and UNION # target_link_libraries(codes PUBLIC PkgConfig::ARGOBOTS) if(USE_ONLINE) if(USE_SWM) - target_include_directories(codes PUBLIC ${ARGOBOTS_INCLUDE_DIRS}) + target_include_directories(codes PUBLIC $) # target_link_libraries(codes PUBLIC PkgConfig::SWM) - target_include_directories(codes PUBLIC ${SWM_INCLUDE_DIRS}) + target_include_directories(codes PUBLIC $) endif() if(USE_UNION) - target_include_directories(codes PUBLIC ${ARGOBOTS_INCLUDE_DIRS}) + target_include_directories(codes PUBLIC $) # target_link_libraries(codes PUBLIC PkgConfig::SWM) - target_include_directories(codes PUBLIC ${SWM_INCLUDE_DIRS}) - target_include_directories(codes PUBLIC ${UNION_INCLUDE_DIRS}) + target_include_directories(codes PUBLIC $) + target_include_directories(codes PUBLIC $) endif() endif() @@ -169,14 +173,21 @@ if(USE_ZEROMQ) INTERFACE_INCLUDE_DIRECTORIES "${ZEROMQ_BUILD_PATH}") endif() +# Public include dirs travel with the target. BUILD_INTERFACE entries serve +# CODES's own compilation and in-tree consumers; INSTALL_INTERFACE entries serve +# installed consumers — include/ for `` and include/codes/ for the +# sibling codes_config.h and bare codes headers. The src/ paths are build-only: +# they are CODES-internal and not part of the installed surface. 
target_include_directories(codes PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR} - ${PROJECT_BINARY_DIR} - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/codes - ${PROJECT_SOURCE_DIR}/src - ${PROJECT_SOURCE_DIR}/src/modelconfig - $<$:$> + $ + $ + $ + $ + $ + $ + $ + $ + $:${ZEROMQ_BUILD_PATH}>> ) target_link_libraries(codes PUBLIC ${LIBS_TO_LINK}) @@ -230,13 +241,53 @@ endforeach() # configure_file(modelconfig/configparser.c ${CMAKE_CURRENT_BINARY_DIR}/modelconfig/configparser.c COPYONLY) # configure_file(modelconfig/configparser.h ${CMAKE_CURRENT_BINARY_DIR}/modelconfig/configparser.h COPYONLY) -install(DIRECTORY "${CMAKE_SOURCE_DIR}/codes" DESTINATION include) +# --------------------------------------------------------------------------- +# Install + CMake package export (find_package(codes) -> codes::codes) +# --------------------------------------------------------------------------- +include(CMakePackageConfigHelpers) -# The generated config header is public now — codes-workload.h includes it for -# CODES_HAVE_* feature queries — so install it next to the other codes/ headers. +# Public headers under /include/codes/. The generated config header is +# public (codes-workload.h includes it for CODES_HAVE_* queries), so it ships +# alongside them. +install(DIRECTORY "${CMAKE_SOURCE_DIR}/codes" DESTINATION include) install(FILES "${PROJECT_BINARY_DIR}/codes_config.h" DESTINATION include/codes) -install(TARGETS ${CODES_TARGETS} DESTINATION bin) - -install(TARGETS codes ARCHIVE DESTINATION lib LIBRARY DESTINATION lib) +# Application binaries — not part of the exported package. +install(TARGETS ${CODES_TARGETS} RUNTIME DESTINATION bin) + +# Library target + namespaced export. codes::codes carries the public include +# dirs (INSTALL_INTERFACE) and the ROSS / MPI usage requirements. 
+install(TARGETS codes + EXPORT codesTargets + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib + RUNTIME DESTINATION bin) + +install(EXPORT codesTargets + FILE codesTargets.cmake + NAMESPACE codes:: + DESTINATION lib/cmake/codes) + +# codesConfig.cmake re-finds ROSS + MPI via find_dependency and pulls in the +# targets file; the version file gates on SameMajorVersion. +configure_package_config_file( + ${CMAKE_SOURCE_DIR}/cmake/codesConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/codesConfig.cmake + INSTALL_DESTINATION lib/cmake/codes) + +write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/codesConfigVersion.cmake + VERSION ${PROJECT_VERSION} + COMPATIBILITY SameMajorVersion) + +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/codesConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/codesConfigVersion.cmake + DESTINATION lib/cmake/codes) + +# Build-tree export so a superbuild / FetchContent consumer can +# find_package(codes) against the build dir without installing. +export(EXPORT codesTargets + FILE ${CMAKE_CURRENT_BINARY_DIR}/codesTargets.cmake + NAMESPACE codes::) From c64953fb1f58d2080a1a96e7abb9f8125981cafe Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Tue, 23 Jun 2026 15:39:05 -0500 Subject: [PATCH 12/20] ci: mpich and openmpi builds --- .github/workflows/build.yml | 50 ++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c837f9a3..9d54fcf5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,30 +15,33 @@ on: # CI for CODES. # -# Today: an OS matrix — ubuntu-24.04 (apt MPICH) and macos-14 / Apple Silicon -# (Homebrew MPICH) — each building a freshly built ROSS pinned to a SHA, then -# CODES, then ctest. Heavy optional deps (SWM, UNION, DUMPI, TORCH, ZEROMQ) -# stay OFF so both legs run on stock runners with no custom image. +# Matrix: {ubuntu-24.04, macos-14 / Apple Silicon} x {MPICH, OpenMPI}. 
Each leg +# builds a freshly built ROSS pinned to a SHA, then CODES, then ctest. MPI is +# auto-discovered via find_package(MPI), so the same CMake works for both impls +# — the matrix just installs one MPI per leg (apt/brew). Heavy optional deps +# (SWM, UNION, DUMPI, TORCH, ZEROMQ) stay OFF so every leg runs on a stock +# runner with no custom image. # # The macOS leg is the safety net for Mac-specific link/include breakage, which # has historically reached master and had to be hand-fixed on a developer Mac. # -# Still deferred (1b follow-ups): OpenMPI alongside MPICH, ubuntu-22.04, a -# gcc/clang compiler matrix, coverage tracking, and a heavy-deps "full" job. +# Still deferred (1b follow-ups): ubuntu-22.04, a gcc/clang compiler matrix, +# coverage tracking, and a heavy-deps "full" job. # # Symmetric with ROSS's codes-contract.yml: ROSS pins CODES, CODES pins # ROSS, each catches consumer-API regressions in the other. jobs: build: - name: build (${{ matrix.os }}) + name: build (${{ matrix.os }}, ${{ matrix.mpi }}) runs-on: ${{ matrix.os }} strategy: - # Don't let one platform's failure cancel the other — we want to see - # both Linux and macOS results on every run. + # Don't let one leg's failure cancel the others — we want to see every + # OS x MPI result on every run. fail-fast: false matrix: os: [ubuntu-24.04, macos-14] + mpi: [mpich, openmpi] env: # ross-org/ROSS @ master as of 2026-06-17. # Bump this SHA whenever CODES needs to track ROSS forward — that @@ -46,6 +49,14 @@ jobs: # TODO: promote to a workflow variable + cache the # built ROSS by SHA so unchanged pins skip the rebuild. ROSS_REF: 9b6ccb18f9b9db438bf41b5b221d0ef16a4dac48 + # OpenMPI refuses to launch more ranks than detected slots. CODES test + # scripts invoke `mpirun -np N` directly (up to np=3 in the core config), + # so --oversubscribe can't be routed through CMake's MPIEXEC_PREFLAGS; + # these env vars enable oversubscription for OpenMPI 4 (OMPI_MCA_*, apt) + # and 5 (PRTE_MCA_*, brew). 
MPICH ignores them, so they're harmless on the + # MPICH legs. + OMPI_MCA_rmaps_base_oversubscribe: "1" + PRTE_MCA_rmaps_default_mapping_policy: ":oversubscribe" steps: - name: Checkout CODES uses: actions/checkout@v4 @@ -65,15 +76,24 @@ jobs: if: runner.os == 'Linux' run: | sudo apt-get update - sudo apt-get install -y \ - mpich libmpich-dev \ - cmake ninja-build pkg-config \ - flex bison + sudo apt-get install -y cmake ninja-build pkg-config flex bison + if [ "${{ matrix.mpi }}" = "openmpi" ]; then + sudo apt-get install -y libopenmpi-dev openmpi-bin + else + sudo apt-get install -y mpich libmpich-dev + fi - name: Install system dependencies (macOS) if: runner.os == 'macOS' run: | - brew install mpich cmake ninja pkg-config flex bison + brew install cmake ninja pkg-config flex bison + # mpich and open-mpi conflict (both provide mpicc/mpirun), but each + # matrix leg runs on its own runner and installs only one. + if [ "${{ matrix.mpi }}" = "openmpi" ]; then + brew install open-mpi + else + brew install mpich + fi # Homebrew flex/bison are keg-only and the macOS system ones are # too old for flex_target/bison_target — put the brew versions # ahead of /usr/bin on PATH for subsequent steps. @@ -110,7 +130,7 @@ jobs: uses: actions/upload-artifact@v4 with: # Artifact names must be unique across matrix legs (upload-artifact@v4). 
- name: build-logs-${{ matrix.os }} + name: build-logs-${{ matrix.os }}-${{ matrix.mpi }} path: | codes/build/Testing/Temporary/LastTest.log codes/build/Testing/Temporary/LastTestsFailed.log From 50227d6b26a87085ff584dd0f36ffe333f5a12d1 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Tue, 23 Jun 2026 15:44:44 -0500 Subject: [PATCH 13/20] ci: add older ubuntu build --- .github/workflows/build.yml | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9d54fcf5..53ab6316 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,18 +15,20 @@ on: # CI for CODES. # -# Matrix: {ubuntu-24.04, macos-14 / Apple Silicon} x {MPICH, OpenMPI}. Each leg -# builds a freshly built ROSS pinned to a SHA, then CODES, then ctest. MPI is -# auto-discovered via find_package(MPI), so the same CMake works for both impls -# — the matrix just installs one MPI per leg (apt/brew). Heavy optional deps -# (SWM, UNION, DUMPI, TORCH, ZEROMQ) stay OFF so every leg runs on a stock -# runner with no custom image. +# Matrix: {ubuntu-22.04, ubuntu-24.04, macos-14 / Apple Silicon} x {MPICH, +# OpenMPI}. Each leg builds a freshly built ROSS pinned to a SHA, then CODES, +# then ctest. MPI is auto-discovered via find_package(MPI), so the same CMake +# works for both impls — the matrix just installs one MPI per leg (apt/brew). +# Heavy optional deps (SWM, UNION, DUMPI, TORCH, ZEROMQ) stay OFF so every leg +# runs on a stock runner with no custom image. ubuntu-22.04 covers the older +# glibc / gcc-11 / cmake-3.22 toolchain (both minimums — CODES 3.17, ROSS 3.16 — +# are satisfied). # # The macOS leg is the safety net for Mac-specific link/include breakage, which # has historically reached master and had to be hand-fixed on a developer Mac. # -# Still deferred (1b follow-ups): ubuntu-22.04, a gcc/clang compiler matrix, -# coverage tracking, and a heavy-deps "full" job. 
+# Still deferred (1b follow-ups): a gcc/clang compiler matrix, coverage +# tracking, and a heavy-deps "full" job. # # Symmetric with ROSS's codes-contract.yml: ROSS pins CODES, CODES pins # ROSS, each catches consumer-API regressions in the other. @@ -40,7 +42,7 @@ jobs: # OS x MPI result on every run. fail-fast: false matrix: - os: [ubuntu-24.04, macos-14] + os: [ubuntu-22.04, ubuntu-24.04, macos-14] mpi: [mpich, openmpi] env: # ross-org/ROSS @ master as of 2026-06-17. From 42a31f58c0e5077e1b9e3ec3a89703e9332843b9 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Tue, 23 Jun 2026 15:59:30 -0500 Subject: [PATCH 14/20] ci: add clang build to linux build --- .github/workflows/build.yml | 58 ++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 53ab6316..6b9aa9ec 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,35 +15,49 @@ on: # CI for CODES. # -# Matrix: {ubuntu-22.04, ubuntu-24.04, macos-14 / Apple Silicon} x {MPICH, -# OpenMPI}. Each leg builds a freshly built ROSS pinned to a SHA, then CODES, -# then ctest. MPI is auto-discovered via find_package(MPI), so the same CMake -# works for both impls — the matrix just installs one MPI per leg (apt/brew). -# Heavy optional deps (SWM, UNION, DUMPI, TORCH, ZEROMQ) stay OFF so every leg -# runs on a stock runner with no custom image. ubuntu-22.04 covers the older -# glibc / gcc-11 / cmake-3.22 toolchain (both minimums — CODES 3.17, ROSS 3.16 — -# are satisfied). +# Matrix: OS x MPI x compiler, pruned via `exclude` to 8 legs: +# - ubuntu-24.04 : gcc + clang (full compiler coverage on the modern toolchain) +# - ubuntu-22.04 : gcc only (its job is the older glibc / gcc-11 / cmake-3.22 +# toolchain; clang divergence is covered on 24.04) +# - macos-14 : Apple clang (a "gcc" leg there is just clang again) +# ...each crossed with {MPICH, OpenMPI}. 
Each leg builds a freshly built ROSS +# pinned to a SHA, then CODES, then ctest. MPI is auto-discovered via +# find_package(MPI) and the base compiler comes from CC/CXX, so the same CMake +# serves every leg — the matrix only changes which packages get installed. Heavy +# optional deps (SWM, UNION, DUMPI, TORCH, ZEROMQ) stay OFF so every leg runs on +# a stock runner with no custom image. Both cmake minimums (CODES 3.17, ROSS +# 3.16) are satisfied on 22.04's cmake 3.22. # # The macOS leg is the safety net for Mac-specific link/include breakage, which # has historically reached master and had to be hand-fixed on a developer Mac. # -# Still deferred (1b follow-ups): a gcc/clang compiler matrix, coverage -# tracking, and a heavy-deps "full" job. +# Still deferred (1b follow-ups): coverage tracking and a heavy-deps "full" job. # # Symmetric with ROSS's codes-contract.yml: ROSS pins CODES, CODES pins # ROSS, each catches consumer-API regressions in the other. jobs: build: - name: build (${{ matrix.os }}, ${{ matrix.mpi }}) + name: build (${{ matrix.os }}, ${{ matrix.mpi }}, ${{ matrix.cc }}) runs-on: ${{ matrix.os }} strategy: # Don't let one leg's failure cancel the others — we want to see every - # OS x MPI result on every run. + # OS x MPI x compiler result on every run. fail-fast: false matrix: os: [ubuntu-22.04, ubuntu-24.04, macos-14] mpi: [mpich, openmpi] + cc: [gcc, clang] + exclude: + # macOS only ships (Apple) clang — a "gcc" leg there is just clang + # again under a different name. + - os: macos-14 + cc: gcc + # ubuntu-22.04 exists to test the older glibc / gcc-11 toolchain; the + # gcc-vs-clang divergence signal comes from the newest, strictest + # toolchain, so clang is exercised on ubuntu-24.04 instead. + - os: ubuntu-22.04 + cc: clang env: # ross-org/ROSS @ master as of 2026-06-17. 
# Bump this SHA whenever CODES needs to track ROSS forward — that @@ -84,6 +98,9 @@ jobs: else sudo apt-get install -y mpich libmpich-dev fi + if [ "${{ matrix.cc }}" = "clang" ]; then + sudo apt-get install -y clang + fi - name: Install system dependencies (macOS) if: runner.os == 'macOS' @@ -102,6 +119,21 @@ jobs: echo "$(brew --prefix bison)/bin" >> "$GITHUB_PATH" echo "$(brew --prefix flex)/bin" >> "$GITHUB_PATH" + - name: Select compiler + # Drive the base compiler for both ROSS and CODES via CC/CXX. + # find_package(MPI) layers the MPI include/link flags on top (the mpicc + # wrapper is only probed for flags, never used as the compiler), so this + # is what actually makes a gcc vs clang build. On the macOS leg CC=clang + # resolves to Apple clang. + run: | + if [ "${{ matrix.cc }}" = "clang" ]; then + echo "CC=clang" >> "$GITHUB_ENV" + echo "CXX=clang++" >> "$GITHUB_ENV" + else + echo "CC=gcc" >> "$GITHUB_ENV" + echo "CXX=g++" >> "$GITHUB_ENV" + fi + - name: Configure ROSS run: > cmake -S ross -B ross/build -G Ninja @@ -132,7 +164,7 @@ jobs: uses: actions/upload-artifact@v4 with: # Artifact names must be unique across matrix legs (upload-artifact@v4). 
- name: build-logs-${{ matrix.os }}-${{ matrix.mpi }} + name: build-logs-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.cc }} path: | codes/build/Testing/Temporary/LastTest.log codes/build/Testing/Temporary/LastTestsFailed.log From b34d27927e146896166ddd073dfe9d37bff0ed8e Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Tue, 23 Jun 2026 16:24:02 -0500 Subject: [PATCH 15/20] cmake/ci: add codecov support for coverage testing --- .github/workflows/build.yml | 126 ++++++++++++++++++++++++++++++++++-- CMakeLists.txt | 5 ++ codecov.yml | 40 ++++++++++++ src/CMakeLists.txt | 8 +++ 4 files changed, 172 insertions(+), 7 deletions(-) create mode 100644 codecov.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6b9aa9ec..9ef25e50 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -31,11 +31,20 @@ on: # The macOS leg is the safety net for Mac-specific link/include breakage, which # has historically reached master and had to be hand-fixed on a developer Mac. # -# Still deferred (1b follow-ups): coverage tracking and a heavy-deps "full" job. +# A separate `coverage` job (below) builds CODES with --coverage on +# ubuntu-24.04/gcc and uploads to Codecov. Still deferred (1b follow-up): a +# heavy-deps "full" job. # # Symmetric with ROSS's codes-contract.yml: ROSS pins CODES, CODES pins # ROSS, each catches consumer-API regressions in the other. +# Shared by every job: the pinned ROSS commit. Bump when CODES needs to track +# ROSS forward — a one-line reviewed change. The nightly scheduled run ignores +# this and builds ROSS @ master so pin drift surfaces within a day. +# TODO: cache the built ROSS by SHA so unchanged pins skip the rebuild. +env: + ROSS_REF: 9b6ccb18f9b9db438bf41b5b221d0ef16a4dac48 + jobs: build: name: build (${{ matrix.os }}, ${{ matrix.mpi }}, ${{ matrix.cc }}) @@ -59,12 +68,6 @@ jobs: - os: ubuntu-22.04 cc: clang env: - # ross-org/ROSS @ master as of 2026-06-17. 
- # Bump this SHA whenever CODES needs to track ROSS forward — that - # bump is a one-line reviewed change. - # TODO: promote to a workflow variable + cache the - # built ROSS by SHA so unchanged pins skip the rebuild. - ROSS_REF: 9b6ccb18f9b9db438bf41b5b221d0ef16a4dac48 # OpenMPI refuses to launch more ranks than detected slots. CODES test # scripts invoke `mpirun -np N` directly (up to np=3 in the core config), # so --oversubscribe can't be routed through CMake's MPIEXEC_PREFLAGS; @@ -176,3 +179,112 @@ jobs: ross/build/CMakeCache.txt if-no-files-found: ignore retention-days: 14 + + # Dedicated coverage build: instrument the codes library (--coverage), run the + # test suite, capture with lcov, and upload to Codecov. Single config + # (ubuntu-24.04 / gcc / MPICH) — coverage is about which lines run, not + # platform/MPI breakage (that's the matrix above). + # + # Uses lcov (like ROSS's coverage job), not gcovr: lcov 2.x reads gcov's JSON + # intermediate format and its --ignore-errors skips the generated iokernellang + # parser (codesparser.c, whose .y isn't committed) and other gcov hiccups — + # where gcovr's text parser aborts. + # + # Requires a CODECOV_TOKEN repo secret (Settings -> Secrets and variables -> + # Actions). Until that's set the upload is a no-op: fail_ci_if_error is false, + # so this job stays green and never gates a PR. + coverage: + name: coverage (ubuntu-24.04, gcc) + runs-on: ubuntu-24.04 + steps: + - name: Checkout CODES + uses: actions/checkout@v4 + # No path: check out at the workspace root so the source paths lcov + # records are repo-relative and Codecov maps them onto the repo. 
+ + - name: Checkout ROSS + uses: actions/checkout@v4 + with: + repository: ROSS-org/ROSS + ref: ${{ github.event_name == 'schedule' && 'master' || env.ROSS_REF }} + path: ross + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y \ + mpich libmpich-dev \ + cmake ninja-build pkg-config \ + flex bison lcov + + - name: Configure ROSS + run: > + cmake -S ross -B ross/build -G Ninja + -DCMAKE_BUILD_TYPE=Debug + -DROSS_BUILD_MODELS=ON + -DCMAKE_INSTALL_PREFIX=$PWD/ross-install + + - name: Build and install ROSS + run: cmake --build ross/build --target install -j + + - name: Configure CODES (with coverage) + run: > + cmake -S . -B build -G Ninja + -DCMAKE_BUILD_TYPE=Debug + -DBUILD_TESTING=ON + -DCODES_USE_TORCH=OFF + -DCODES_USE_ZEROMQ=OFF + -DCODES_ENABLE_COVERAGE=ON + -DCMAKE_PREFIX_PATH=$PWD/ross-install + + - name: Build CODES + run: cmake --build build -j + + - name: Run CODES tests + run: ctest --test-dir build --output-on-failure + + - name: Collect coverage (lcov) + # --ignore-errors mismatch,gcov,source,unused tolerates the generated + # iokernellang parser whose .y/.l aren't committed (source/gcov), any + # gcov data/version mismatch, and unused --remove patterns. Then strip + # system + ROSS headers, the test harness, examples, and the generated + # flex/bison lexers + parsers from the metric. network-workloads mains DO + # run in the synthetic tests, so they're kept. 
+ run: | + lcov --capture --directory build --output-file coverage.info \ + --ignore-errors mismatch,gcov,source,unused + lcov --remove coverage.info \ + '/usr/*' \ + '*/ross-install/*' \ + '*/tests/*' \ + '*/doc/*' \ + '*configlex*' '*configparser*' \ + '*codeslexer*' '*codesparser*' \ + --output-file coverage.info \ + --ignore-errors unused + lcov --list coverage.info + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + files: coverage.info + # Upload only the lcov report; don't let Codecov re-discover the raw + # .gcda and run its own gcov pass over them. + plugins: noop + disable_search: true + flags: ctest + # Coverage is informational — never fail the job on an upload hiccup + # (including before the token is configured). + fail_ci_if_error: false + + - name: Upload coverage report on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: coverage-report + path: | + coverage.info + build/Testing/Temporary/LastTest.log + if-no-files-found: ignore + retention-days: 14 diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a28b474..26279aab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,11 @@ set(CMAKE_C_STANDARD_REQUIRED True) # consumers; ROSS/MPI headers arrive via -isystem and are exempt. add_compile_options(-Wundef) +# Optional gcov/--coverage instrumentation, applied to the `codes` target (and +# propagated to the binaries/tests that link it) in src/CMakeLists.txt. Off by +# default; the coverage CI job turns it on. gcc/clang only. +option(CODES_ENABLE_COVERAGE "Instrument the codes library with --coverage (gcov)" OFF) + #prevent cmake from stripping the runtime path (important if shared libraries are imported) SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000..71cb1f7e --- /dev/null +++ b/codecov.yml @@ -0,0 +1,40 @@ +# Codecov configuration for CODES.
https://docs.codecov.com/docs/codecov-yaml +# +# Coverage is reported on PRs and commits but is INFORMATIONAL for now — it +# annotates without failing checks, so it can't block a merge before the project +# has an established baseline. Flip `informational: false` once coverage is +# trusted and you want it to gate. +coverage: + status: + project: + default: + informational: true + patch: + default: + informational: true + +# Files/paths excluded from the coverage metric. The coverage CI job also strips +# matching paths via `lcov --remove`; this is the belt-and-suspenders side so the +# Codecov UI agrees with the uploaded report. +ignore: + - "tests/" # the test harness itself + - "**/configlex.*" # generated flex lexer (modelconfig) + - "**/configparser.*" # generated bison parser (modelconfig) + - "**/codeslexer.*" # generated flex lexer (iokernellang) + - "**/codesparser.*" # generated bison parser (iokernellang; .y not committed) + - "doc/" # examples/tutorials (the ping-pong example IS tested, + # but it's teaching code, not the product — revisit if + # you'd rather measure it) + +# PR comment from the Codecov bot. The defaults already post a comment; this just +# makes it explicit. require_base/require_changes:false mean it still comments on +# the first PRs, before master has an established coverage baseline to diff +# against. NOTE: posting also needs the Codecov GitHub App installed on the org +# with access to this repo — the upload token alone does NOT grant comment/status +# permissions. +comment: + layout: "reach, diff, flags, files" + behavior: default + require_changes: false + require_base: false + require_head: true diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3ec8a286..6152d8cf 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -133,6 +133,14 @@ endif() add_library(codes STATIC ${SRCS}) +# Coverage instrumentation (opt-in via CODES_ENABLE_COVERAGE). 
PUBLIC so the +# library objects are instrumented AND every binary/test that links codes pulls +# in the gcov runtime, so `ctest` emits .gcda files for them. +if(CODES_ENABLE_COVERAGE) + target_compile_options(codes PUBLIC --coverage) + target_link_options(codes PUBLIC --coverage) +endif() + # Optional-feature state is exposed through the generated codes_config.h # (CODES_HAVE_<FEATURE>), included by sources that need it — no -DUSE_<FEATURE> compile # definitions, so nothing leaks onto consumers' command lines. From 4146dabf26f3d8e25a8c4cdb0bd47cdbb34212aa Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Thu, 25 Jun 2026 13:23:12 -0500 Subject: [PATCH 16/20] cmake fix --- src/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6152d8cf..54bb27da 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -155,22 +155,22 @@ list(APPEND LIBS_TO_LINK MPI::MPI_C MPI::MPI_CXX) # them to installed consumers of a heavy-deps build is deferred; see # cmake/codesConfig.cmake.in.)
if(USE_DUMPI) - target_include_directories(codes PUBLIC $) + target_include_directories(codes PUBLIC "$") endif() #LINK ARGOBOTS, SWM and UNION # target_link_libraries(codes PUBLIC PkgConfig::ARGOBOTS) if(USE_ONLINE) if(USE_SWM) - target_include_directories(codes PUBLIC $) + target_include_directories(codes PUBLIC "$") # target_link_libraries(codes PUBLIC PkgConfig::SWM) - target_include_directories(codes PUBLIC $) + target_include_directories(codes PUBLIC "$") endif() if(USE_UNION) - target_include_directories(codes PUBLIC $) + target_include_directories(codes PUBLIC "$") # target_link_libraries(codes PUBLIC PkgConfig::SWM) - target_include_directories(codes PUBLIC $) - target_include_directories(codes PUBLIC $) + target_include_directories(codes PUBLIC "$") + target_include_directories(codes PUBLIC "$") endif() endif() From ce51b225ee80a80b532b07b5405a670370f0678b Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Thu, 25 Jun 2026 17:49:20 -0500 Subject: [PATCH 17/20] cmake: link the zmqml requester PUBLIC on codes so all consumers resolve zmqml_* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The director-client.C / dragonfly-dally.C objects in libcodes.a reference the zmqml_* symbols defined in libzmqmlrequester.so, so the requester is a usage requirement of the codes library itself. Linking it PUBLIC on `codes` (rather than PRIVATE per-executable in the CODES_TARGETS loop) makes every consumer inherit it transitively, at the end of the link line after libcodes.a + ROSS / SWM / UNION. This fixes two faults on the heavy (UNION+ZeroMQ) build: the doc/example and tests targets — which link codes but aren't in CODES_TARGETS — previously linked no requester at all, and even the CODES_TARGETS exes had it ahead of codes' transitive deps where the linker dropped it, leaving zmqml_* undefined. zmqmlrequester is an IMPORTED target, so like PkgConfig::SWM it's exempt from the install(EXPORT) "not in export set" error. 
--- src/CMakeLists.txt | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 54bb27da..edd6ff86 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -231,15 +231,29 @@ if(USE_ZEROMQ) pkg_check_modules(PC_ZeroMQ QUIET zmq) find_path(ZeroMQ_INCLUDE_DIR NAMES zmq.hpp PATHS ${PC_ZeroMQ_INCLUDE_DIRS}) find_library(ZeroMQ_LIBRARY NAMES zmq PATHS ${PC_ZeroMQ_LIBRARY_DIRS}) + + # The requester defines the zmqml_* symbols that codes' OWN objects + # (director-client.C, dragonfly-dally.C) reference, so it is a usage + # requirement of the codes library itself. Linking it PUBLIC here — rather + # than PRIVATE per-executable in the CODES_TARGETS loop below — fixes two + # things the old per-exe approach got wrong: + # * Coverage: every consumer of codes inherits it, including the + # doc/example targets and the tests targets, which are NOT in + # CODES_TARGETS and so linked no requester at all (undefined zmqml_*). + # * Order: it lands at the END of each consumer's link line, after + # libcodes.a and codes' other transitive deps (ROSS, SWM, UNION, ...), + # which is the order the references need. Linked ahead of those deps it + # was dropped and zmqml_* went undefined on the heavy (UNION+ZeroMQ) + # build. zmqmlrequester is an IMPORTED target, so — like PkgConfig::SWM + # above — it's exempt from the install(EXPORT) "not in export set" error. + target_link_libraries(codes PUBLIC zmqmlrequester ${ZeroMQ_LIBRARY}) endif() -# Each executable links codes; its include dirs, MPI, ROSS, and optional-dep -# libs all propagate transitively through codes' PUBLIC usage requirements. +# Each executable links codes; its include dirs, MPI, ROSS, the optional-dep +# libs, and (under USE_ZEROMQ) the zmqml requester all propagate transitively +# through codes' PUBLIC usage requirements. 
foreach(tar IN LISTS CODES_TARGETS) target_link_libraries(${tar} PRIVATE codes) - if(USE_ZEROMQ) - target_link_libraries(${tar} PRIVATE zmqmlrequester ${ZeroMQ_LIBRARY}) - endif() endforeach() From fe907d1b6618de059ad9d1f54da964a81f6807a4 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Thu, 25 Jun 2026 17:49:20 -0500 Subject: [PATCH 18/20] ci: add the full heavy-deps CODES build job Add a `full` job to build.yml that runs inside ghcr.io/codes-org/codes-ci-full and builds ROSS + the zmqml requester + CODES with every optional subsystem on (UNION, ZEROMQ, DUMPI, TORCH), then runs ctest excluding union-surrogate director tests. The heavy dependency compiles live in the prebuilt image, so this job only builds ROSS + the zmqml requester + CODES. --- .github/workflows/build.yml | 97 +++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9ef25e50..7909c496 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -288,3 +288,100 @@ jobs: build/Testing/Temporary/LastTest.log if-no-files-found: ignore retention-days: 14 + + # Heavy-dependency "full" build: every optional subsystem ON (SWM, UNION, + # DUMPI, Torch, ZeroMQ). Runs INSIDE the prebuilt ghcr.io/codes-org/codes-ci-full + # image (built + published by full-ci-image.yml) so the slow dependency + # compiles happen once in that image instead of on every PR — this job only + # builds ROSS + the zmqml requester + CODES. The image ships MPICH, cmake, + # ninja, flex/bison, and each dep installed under /opt/<dep>. + # + # Bootstrap: the image must exist in GHCR before this job can pull it. Run the + # "Full CI image" workflow once (workflow_dispatch) to publish :latest. + full: + name: full (ubuntu-22.04 image, all deps) + runs-on: ubuntu-24.04 + # The codes-ci-full GHCR package is public, so the image pulls anonymously — + # no credentials needed.
This is also what lets fork-PR runs use it: a + # fork-PR GITHUB_TOKEN can't read the org's private packages, but a public + # image needs no token at all. + container: + image: ghcr.io/codes-org/codes-ci-full:latest + permissions: + contents: read + steps: + - name: Checkout CODES + uses: actions/checkout@v4 + with: + path: codes + + - name: Checkout ROSS + uses: actions/checkout@v4 + with: + repository: ROSS-org/ROSS + ref: ${{ github.event_name == 'schedule' && 'master' || env.ROSS_REF }} + path: ross + + - name: Configure ROSS + # No CC=mpicc: the image's MPICH is auto-discovered by find_package(MPI). + run: > + cmake -S ross -B ross/build -G Ninja + -DCMAKE_BUILD_TYPE=Debug + -DCMAKE_INSTALL_PREFIX=$PWD/ross-install + + - name: Build and install ROSS + run: cmake --build ross/build --target install -j + + - name: Build the zmqml requester + # libzmqmlrequester.so is built from CODES source (the image only ships + # libzmq + rapidjson headers); ZEROMQ_BUILD_PATH points the CODES + # configure at it below. + run: make -C codes/src/surrogate/zmqml + + - name: Configure CODES (all deps ON) + # Torch is discovered through the pip package's cmake prefix, matching + # CODES-compile-instructions.sh. 
+ run: | + export Torch_DIR="$(python3 -c 'import torch; print(torch.utils.cmake_prefix_path)')/Torch" + cmake -S codes -B codes/build -G Ninja \ + -DCMAKE_BUILD_TYPE=Debug \ + -DBUILD_TESTING=ON \ + -DCODES_USE_UNION=ON \ + -DCODES_USE_ZEROMQ=ON \ + -DCODES_USE_DUMPI=ON \ + -DCODES_USE_TORCH=ON \ + -DSWM_PKG_CONFIG_PATH=/opt/swm/lib/pkgconfig \ + -DUNION_PKG_CONFIG_PATH=/opt/union/lib/pkgconfig \ + -DARGOBOTS_PKG_CONFIG_PATH=/opt/argobots/lib/pkgconfig \ + -DDUMPI_BUILD_PATH=/opt/dumpi \ + -DTorch_DIR="$Torch_DIR" \ + -DZEROMQ_BUILD_PATH="$PWD/codes/src/surrogate/zmqml" \ + -DCMAKE_PREFIX_PATH="$PWD/ross-install" + + - name: Build CODES + run: cmake --build codes/build -j + + - name: Run CODES tests + # Exclude the union-surrogate director tests: their *.sh harnesses set + # only ~8 of the ~20 config-template vars (DIRECTOR_* etc.), so CODES + # rejects the rendered config regardless of build flags. Tracked as a + # follow-up pending the intended director env from the surrogate's + # author; the ZeroMQ *build* is exercised by this job, just not those + # tests' runtime config. 
+ run: ctest --test-dir codes/build --output-on-failure -E "union-workload-test-surrogate" + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: build-logs-full + path: | + codes/build/Testing/Temporary/LastTest.log + codes/build/Testing/Temporary/LastTestsFailed.log + codes/build/CMakeFiles/CMakeError.log + codes/build/CMakeFiles/CMakeOutput.log + codes/build/CMakeCache.txt + ross/build/CMakeFiles/CMakeError.log + ross/build/CMakeFiles/CMakeOutput.log + if-no-files-found: ignore + retention-days: 14 From 147e3ca42a849d5fc866f1fdb5c29ce13134057e Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Thu, 25 Jun 2026 18:42:41 -0500 Subject: [PATCH 19/20] rename .C sources to .cxx --- .github/workflows/format.yml | 2 +- codes/surrogate/network-surrogate.h | 6 ++--- doc/GETTING_STARTED | 2 +- doc/codes-vis-readme.md | 2 +- doc/workload/union_online_workload.txt | 8 +++---- src/CMakeLists.txt | 22 +++++++++---------- src/Makefile.subdir | 12 +++++----- .../model-net/doc/README.dragonfly-custom.txt | 2 +- ...ragonfly-custom.C => dragonfly-custom.cxx} | 0 ...{dragonfly-dally.C => dragonfly-dally.cxx} | 8 +++---- .../{dragonfly-plus.C => dragonfly-plus.cxx} | 0 .../{express-mesh.C => express-mesh.cxx} | 0 .../{net-template.C => net-template.cxx} | 0 ...anager.C => dragonfly-network-manager.cxx} | 0 ...{director-client.C => director-client.cxx} | 2 +- .../director/train_packet_latency_torchjit.py | 4 ++-- .../{torch-jit.C => torch-jit.cxx} | 0 ...controller.C => congestion-controller.cxx} | 0 ...kld.C => codes-conc-online-comm-wrkld.cxx} | 0 ...mm-wrkld.C => codes-online-comm-wrkld.cxx} | 0 20 files changed, 35 insertions(+), 35 deletions(-) rename src/networks/model-net/{dragonfly-custom.C => dragonfly-custom.cxx} (100%) rename src/networks/model-net/{dragonfly-dally.C => dragonfly-dally.cxx} (99%) rename src/networks/model-net/{dragonfly-plus.C => dragonfly-plus.cxx} (100%) rename src/networks/model-net/{express-mesh.C => 
express-mesh.cxx} (100%) rename src/networks/model-net/{net-template.C => net-template.cxx} (100%) rename src/networks/model-net/network-managers/{dragonfly-network-manager.C => dragonfly-network-manager.cxx} (100%) rename src/surrogate/{director-client.C => director-client.cxx} (99%) rename src/surrogate/packet-latency-predictor/{torch-jit.C => torch-jit.cxx} (100%) rename src/util/{congestion-controller.C => congestion-controller.cxx} (100%) rename src/workload/methods/{codes-conc-online-comm-wrkld.C => codes-conc-online-comm-wrkld.cxx} (100%) rename src/workload/methods/{codes-online-comm-wrkld.C => codes-online-comm-wrkld.cxx} (100%) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index bb38dcdf..d37e0e8b 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -41,7 +41,7 @@ jobs: # when paths are passed explicitly (as `find ... -exec` does). run: | find src codes doc tests \ - -type f \( -name "*.c" -o -name "*.h" -o -name "*.C" \ + -type f \( -name "*.c" -o -name "*.h" \ -o -name "*.cpp" -o -name "*.hpp" -o -name "*.cxx" \) \ -not -path 'src/modelconfig/configlex.*' \ -not -path 'src/modelconfig/configparser.*' \ diff --git a/codes/surrogate/network-surrogate.h b/codes/surrogate/network-surrogate.h index c2bb86ef..df58038c 100644 --- a/codes/surrogate/network-surrogate.h +++ b/codes/surrogate/network-surrogate.h @@ -18,9 +18,9 @@ extern "C" { // Functions that director should have access to typedef void (*switch_surrogate_f)( - void); // Switches back and forth from surrogate mode as defined by network model (e.g, by dragonfly-dally.C) + void); // Switches back and forth from surrogate mode as defined by network model (e.g, by dragonfly-dally.cxx) typedef bool (*is_surrogate_on_f)( - void); // Switches back and forth from surrogate mode as defined by network model (e.g, by dragonfly-dally.C) + void); // Switches back and forth from surrogate mode as defined by network model (e.g, by dragonfly-dally.cxx) struct 
network_model_surrogate { switch_surrogate_f @@ -30,7 +30,7 @@ struct network_model_surrogate { // Switches back and forth from surrogate mode as defined by network model -// (e.g, by dragonfly-dally.C) +// (e.g, by dragonfly-dally.cxx) // Parameters: `data` corresponds to the lp sub-state, lp is the lp pointer, and the array of events in queue (to be processed) typedef void (*model_switch_f)(void* data, tw_lp* lp, tw_event**); typedef bool (*model_ask_if_freeze_f)( diff --git a/doc/GETTING_STARTED b/doc/GETTING_STARTED index 482340c8..cfa6a9d5 100644 --- a/doc/GETTING_STARTED +++ b/doc/GETTING_STARTED @@ -204,7 +204,7 @@ form "dumpi-YYYY.MM.DD.HH.MM.SS-XXXX.bin", then the input should be === Quality of Service -Two models (dragonfly-dally.C and dragonfly-plus.C) can now support traffic +Two models (dragonfly-dally.cxx and dragonfly-plus.cxx) can now support traffic differentiation and prioritization. The models support quality of service by directing the network traffic on separate class of virtual channels. 
Additional documentation on using traffic classes can be found at the wiki link: diff --git a/doc/codes-vis-readme.md b/doc/codes-vis-readme.md index b60bd973..b18d0ebc 100644 --- a/doc/codes-vis-readme.md +++ b/doc/codes-vis-readme.md @@ -140,7 +140,7 @@ If you're using any of the following CODES models, you don't have to add anythin - fat tree server LP (model-net-synthetic-fattree.c) - slimfly server LP (model-net-synthetic-slimfly.c) - original dragonfly router and terminal LPs (dragonfly.c) -- dragonfly custom router and terminal LPs (dragonfly-custom.C) +- dragonfly custom router and terminal LPs (dragonfly-custom.cxx) - slimfly router and terminal LPs (slimfly.c) - fat tree switch and terminal LPs (fat-tree.c) - model-net-base-lp (model-net-lp.c) diff --git a/doc/workload/union_online_workload.txt b/doc/workload/union_online_workload.txt index fcd9938f..d5d6d9b7 100644 --- a/doc/workload/union_online_workload.txt +++ b/doc/workload/union_online_workload.txt @@ -18,17 +18,17 @@ Added parameters for collecting router traffic data, including: == Makefile Added checking for Union installation in the autoconf configure script configure.ac -Added src/workload/methods/codes-conc-online-comm-wrkld.C to code base if compile with Union in Makefile.am +Added src/workload/methods/codes-conc-online-comm-wrkld.cxx to code base if compile with Union in Makefile.am == Union online workload -We add a pluggable workload module "src/workload/methods/codes-conc-online-comm-wrkld.C" into CODES workload generator to hold the actual implementation of Union communication events, such that the messages from Union skeletons can be emitted as simulation events in CODES. +We add a pluggable workload module "src/workload/methods/codes-conc-online-comm-wrkld.cxx" into CODES workload generator to hold the actual implementation of Union communication events, such that the messages from Union skeletons can be emitted as simulation events in CODES. 
== Router status collection for dragonfly custom and dragonfly dally Added supportive functions for collecting traffic data on router port on the following network models: -* dragonfly custom at src/networks/model-net/dragonfly-custom.C -* dragonfly dally at src/networks/model-net/dragonfly-dally.C +* dragonfly custom at src/networks/model-net/dragonfly-custom.cxx +* dragonfly dally at src/networks/model-net/dragonfly-dally.cxx == Updates in MPI replay diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index edd6ff86..1f27f5ea 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,17 +19,17 @@ list(APPEND SRCS networks/model-net/common-net.c networks/model-net/simplenet-upd.c networks/model-net/torus.c - networks/model-net/express-mesh.C + networks/model-net/express-mesh.cxx networks/model-net/dragonfly.c - networks/model-net/dragonfly-custom.C - networks/model-net/dragonfly-plus.C - networks/model-net/dragonfly-dally.C + networks/model-net/dragonfly-custom.cxx + networks/model-net/dragonfly-plus.cxx + networks/model-net/dragonfly-dally.cxx networks/model-net/slimfly.c networks/model-net/fattree.c networks/model-net/loggp.c networks/model-net/simplep2p.c - networks/model-net/network-managers/dragonfly-network-manager.C + networks/model-net/network-managers/dragonfly-network-manager.cxx workload/codes-workload.c workload/methods/codes-iolang-wrkld.c @@ -53,7 +53,7 @@ list(APPEND SRCS util/codes-mapping-context.c util/codes-comm.c util/rc-stack.c - util/congestion-controller.C + util/congestion-controller.cxx surrogate/init.c surrogate/application-surrogate.c @@ -98,12 +98,12 @@ endif() if(USE_ONLINE) if(USE_SWM) - list(APPEND SRCS workload/methods/codes-online-comm-wrkld.C) + list(APPEND SRCS workload/methods/codes-online-comm-wrkld.cxx) list(APPEND LIBS_TO_LINK PkgConfig::SWM) list(APPEND LIBS_TO_LINK PkgConfig::ARGOBOTS) endif() if(USE_UNION) - list(APPEND SRCS workload/methods/codes-conc-online-comm-wrkld.C) + list(APPEND SRCS 
workload/methods/codes-conc-online-comm-wrkld.cxx) list(APPEND LIBS_TO_LINK PkgConfig::SWM) list(APPEND LIBS_TO_LINK PkgConfig::UNION) list(APPEND LIBS_TO_LINK PkgConfig::ARGOBOTS) @@ -120,7 +120,7 @@ endif() # endif() if(USE_TORCH) - list(APPEND SRCS surrogate/packet-latency-predictor/torch-jit.C) + list(APPEND SRCS surrogate/packet-latency-predictor/torch-jit.cxx) list(APPEND LIBS_TO_LINK ${TORCH_LIBRARIES}) endif() @@ -128,7 +128,7 @@ endif() # USE_ZEROMQ bool in the top-level CMakeLists, which also enforces that # ZEROMQ_BUILD_PATH is set). The imported target + linkage below are gated on it. if(USE_ZEROMQ) - list(APPEND SRCS surrogate/director-client.C) + list(APPEND SRCS surrogate/director-client.cxx) endif() add_library(codes STATIC ${SRCS}) @@ -233,7 +233,7 @@ if(USE_ZEROMQ) find_library(ZeroMQ_LIBRARY NAMES zmq PATHS ${PC_ZeroMQ_LIBRARY_DIRS}) # The requester defines the zmqml_* symbols that codes' OWN objects - # (director-client.C, dragonfly-dally.C) reference, so it is a usage + # (director-client.cxx, dragonfly-dally.cxx) reference, so it is a usage # requirement of the codes library itself. 
Linking it PUBLIC here — rather # than PRIVATE per-executable in the CODES_TARGETS loop below — fixes two # things the old per-exe approach got wrong: diff --git a/src/Makefile.subdir b/src/Makefile.subdir index 5721f7df..1254e9a7 100644 --- a/src/Makefile.subdir +++ b/src/Makefile.subdir @@ -154,7 +154,7 @@ src_libcodes_la_SOURCES = \ src/util/jobmap-impl/jobmap-identity.c\ src/util/codes-mapping-context.c \ src/util/codes-comm.c \ - src/util/congestion-controller.C \ + src/util/congestion-controller.cxx \ src/workload/codes-workload.c \ src/workload/methods/codes-iolang-wrkld.c \ src/workload/methods/codes-checkpoint-wrkld.c \ @@ -163,16 +163,16 @@ src_libcodes_la_SOURCES = \ codes/rc-stack.h \ src/util/rc-stack.c \ src/util/surrogate.c \ - src/networks/model-net/network-managers/dragonfly-network-manager.C \ + src/networks/model-net/network-managers/dragonfly-network-manager.cxx \ src/networks/model-net/core/model-net.c \ src/networks/model-net/common-net.c \ src/networks/model-net/simplenet-upd.c \ src/networks/model-net/torus.c \ - src/networks/model-net/express-mesh.C \ + src/networks/model-net/express-mesh.cxx \ src/networks/model-net/dragonfly.c \ - src/networks/model-net/dragonfly-custom.C \ - src/networks/model-net/dragonfly-plus.C \ - src/networks/model-net/dragonfly-dally.C \ + src/networks/model-net/dragonfly-custom.cxx \ + src/networks/model-net/dragonfly-plus.cxx \ + src/networks/model-net/dragonfly-dally.cxx \ src/networks/model-net/slimfly.c \ src/networks/model-net/fattree.c \ src/networks/model-net/loggp.c \ diff --git a/src/networks/model-net/doc/README.dragonfly-custom.txt b/src/networks/model-net/doc/README.dragonfly-custom.txt index d7b8d9de..f12a1e76 100644 --- a/src/networks/model-net/doc/README.dragonfly-custom.txt +++ b/src/networks/model-net/doc/README.dragonfly-custom.txt @@ -102,4 +102,4 @@ ../src/network-workloads/conf/dragonfly-custom/modelnet-test-dragonfly- --------- Debugging Tips ------------ - Set DUMP_CONNECTIONS debugging 
option to see the detailed local and global - channel connectivity of routers in src/networks/model-net/dragonfly-custom.C + channel connectivity of routers in src/networks/model-net/dragonfly-custom.cxx diff --git a/src/networks/model-net/dragonfly-custom.C b/src/networks/model-net/dragonfly-custom.cxx similarity index 100% rename from src/networks/model-net/dragonfly-custom.C rename to src/networks/model-net/dragonfly-custom.cxx diff --git a/src/networks/model-net/dragonfly-dally.C b/src/networks/model-net/dragonfly-dally.cxx similarity index 99% rename from src/networks/model-net/dragonfly-dally.C rename to src/networks/model-net/dragonfly-dally.cxx index de305309..abf1e012 100644 --- a/src/networks/model-net/dragonfly-dally.C +++ b/src/networks/model-net/dragonfly-dally.cxx @@ -5,12 +5,12 @@ * Originally written by Misbah Mubarak * Updated by Neil McGlohon and Elkin Cruz-Camacho * - * A 1D specific dragonfly custom model - diverged from dragonfly-custom.C + * A 1D specific dragonfly custom model - diverged from dragonfly-custom.cxx * Differs from dragonfly.C in that it allows for the custom features typically found in - * dragonfly-custom.C. + * dragonfly-custom.cxx. * * This was not intended to be a long term solution, but enough changes had been made that merging - * into dragonfly-custom.C wasn't feasible at the time of creation. Today, there is enough differences + * into dragonfly-custom.cxx wasn't feasible at the time of creation. Today, there is enough differences * in the two models that there is currently no plan to re-merge the two. */ @@ -49,7 +49,7 @@ * Optional ZeroMQ Director requester. * * These symbols are defined only when CODES is built with CODES_HAVE_ZEROMQ=ON - * (src/surrogate/director-client.C + libzmqmlrequester). CODES_HAVE_ZEROMQ is + * (src/surrogate/director-client.cxx + libzmqmlrequester). 
CODES_HAVE_ZEROMQ is * all-or-nothing for a given build: src/CMakeLists.txt links libzmqmlrequester * into *every* CODES executable when ON and into none when OFF. So whether the * requester is available is a compile-time fact, not a runtime one — the diff --git a/src/networks/model-net/dragonfly-plus.C b/src/networks/model-net/dragonfly-plus.cxx similarity index 100% rename from src/networks/model-net/dragonfly-plus.C rename to src/networks/model-net/dragonfly-plus.cxx diff --git a/src/networks/model-net/express-mesh.C b/src/networks/model-net/express-mesh.cxx similarity index 100% rename from src/networks/model-net/express-mesh.C rename to src/networks/model-net/express-mesh.cxx diff --git a/src/networks/model-net/net-template.C b/src/networks/model-net/net-template.cxx similarity index 100% rename from src/networks/model-net/net-template.C rename to src/networks/model-net/net-template.cxx diff --git a/src/networks/model-net/network-managers/dragonfly-network-manager.C b/src/networks/model-net/network-managers/dragonfly-network-manager.cxx similarity index 100% rename from src/networks/model-net/network-managers/dragonfly-network-manager.C rename to src/networks/model-net/network-managers/dragonfly-network-manager.cxx diff --git a/src/surrogate/director-client.C b/src/surrogate/director-client.cxx similarity index 99% rename from src/surrogate/director-client.C rename to src/surrogate/director-client.cxx index 3b115dc0..f865b0f1 100644 --- a/src/surrogate/director-client.C +++ b/src/surrogate/director-client.cxx @@ -173,7 +173,7 @@ static const char* director_iteration_records_command(void) { * the original zmqmlserver.py command name. * * Event-time records use the unified director-request API from - * dragonfly-dally.C and should not affect this iteration-time path. + * dragonfly-dally.cxx and should not affect this iteration-time path. 
*/ return "send-records"; } diff --git a/src/surrogate/ml_models/director/train_packet_latency_torchjit.py b/src/surrogate/ml_models/director/train_packet_latency_torchjit.py index bd20713f..d2104e2e 100644 --- a/src/surrogate/ml_models/director/train_packet_latency_torchjit.py +++ b/src/surrogate/ml_models/director/train_packet_latency_torchjit.py @@ -22,7 +22,7 @@ class PacketLatencyModel(nn.Module): """ - TorchScript-compatible model for CODES torch-jit.C. + TorchScript-compatible model for CODES torch-jit.cxx. C++ passes a LongTensor shaped [1, 4]: @@ -232,7 +232,7 @@ def main() -> None: model.eval() - # Exact shape/dtype sanity check for src/surrogate/packet-latency-predictor/torch-jit.C + # Exact shape/dtype sanity check for src/surrogate/packet-latency-predictor/torch-jit.cxx dummy = torch.zeros((1, 4), dtype=torch.long) with torch.no_grad(): out = model(dummy) diff --git a/src/surrogate/packet-latency-predictor/torch-jit.C b/src/surrogate/packet-latency-predictor/torch-jit.cxx similarity index 100% rename from src/surrogate/packet-latency-predictor/torch-jit.C rename to src/surrogate/packet-latency-predictor/torch-jit.cxx diff --git a/src/util/congestion-controller.C b/src/util/congestion-controller.cxx similarity index 100% rename from src/util/congestion-controller.C rename to src/util/congestion-controller.cxx diff --git a/src/workload/methods/codes-conc-online-comm-wrkld.C b/src/workload/methods/codes-conc-online-comm-wrkld.cxx similarity index 100% rename from src/workload/methods/codes-conc-online-comm-wrkld.C rename to src/workload/methods/codes-conc-online-comm-wrkld.cxx diff --git a/src/workload/methods/codes-online-comm-wrkld.C b/src/workload/methods/codes-online-comm-wrkld.cxx similarity index 100% rename from src/workload/methods/codes-online-comm-wrkld.C rename to src/workload/methods/codes-online-comm-wrkld.cxx From 3230eecce2897cc241f16c8ff1847ff159f20867 Mon Sep 17 00:00:00 2001 From: Caitlin Ross Date: Thu, 25 Jun 2026 19:24:46 -0500 
Subject: [PATCH 20/20] cmake: build the zmqml requester + demo as CMake targets, drop the side Makefile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The zmqml requester shared lib was built out-of-band by a hand-rolled Makefile (src/surrogate/zmqml/Makefile) and consumed via a SHARED IMPORTED target pointed at by -DZEROMQ_BUILD_PATH. That Makefile linked the .so with no -soname, so consumers linking it by absolute path baked a CMake- relativized "../src/surrogate/zmqml/libzmqmlrequester.so" into DT_NEEDED. ld.so resolves a slash-containing DT_NEEDED against the process CWD rather than RUNPATH, so every binary run from ctest's testing-output/test-* subdir failed with "error while loading shared libraries" — turning the heavy-deps `full` CI lane red across the modelnet and ping-pong tests. Replace the arrangement with real CMake targets: - New src/surrogate/zmqml/CMakeLists.txt defines a STATIC zmqmlrequester target (add_subdirectory under USE_ZEROMQ) carrying libzmq + its own public header dir as usage requirements. zmqml_* is compiled straight into libcodes, so there is no runtime .so to locate. - codes links it PUBLIC and exports it in codesTargets; its imported PkgConfig::ZeroMQ dep stays export-exempt, like the SWM/UNION targets. - ZeroMQ is now AUTO-probed like the other optional deps: pkg_check_modules (libzmq) + find zmq.hpp + find rapidjson/document.h. Drop the ZEROMQ_BUILD_PATH cache var and its FATAL_ERROR. - Drop the "Build the zmqml requester" make step and -DZEROMQ_BUILD_PATH from build.yml and CODES-compile-instructions.sh. - Delete the Makefile and runcppdemo.sh; the standalone demozmqmlrequester demo becomes an optional CMake target (-DCODES_BUILD_ZMQML_DEMO=ON, default OFF). Drop the stale .gitignore entries; update NOTES.txt. Verified in the codes-ci-full image: all previously-failing modelnet and ping-pong tests pass; the requester and demo build with only libzmq.so.5 in DT_NEEDED. 
--- .github/workflows/build.yml | 7 --- .gitignore | 2 - CMakeLists.txt | 32 +++++++------ CODES-compile-instructions.sh | 29 ++---------- ci/full/Dockerfile | 4 +- src/CMakeLists.txt | 54 +++++++++------------- src/networks/model-net/dragonfly-dally.cxx | 6 +-- src/surrogate/zmqml/CMakeLists.txt | 38 +++++++++++++++ src/surrogate/zmqml/Makefile | 27 ----------- src/surrogate/zmqml/NOTES.txt | 11 ++--- src/surrogate/zmqml/runcppdemo.sh | 2 - 11 files changed, 91 insertions(+), 121 deletions(-) create mode 100644 src/surrogate/zmqml/CMakeLists.txt delete mode 100644 src/surrogate/zmqml/Makefile delete mode 100755 src/surrogate/zmqml/runcppdemo.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7909c496..1a1f8a34 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -332,12 +332,6 @@ jobs: - name: Build and install ROSS run: cmake --build ross/build --target install -j - - name: Build the zmqml requester - # libzmqmlrequester.so is built from CODES source (the image only ships - # libzmq + rapidjson headers); ZEROMQ_BUILD_PATH points the CODES - # configure at it below. - run: make -C codes/src/surrogate/zmqml - - name: Configure CODES (all deps ON) # Torch is discovered through the pip package's cmake prefix, matching # CODES-compile-instructions.sh. 
@@ -355,7 +349,6 @@ jobs: -DARGOBOTS_PKG_CONFIG_PATH=/opt/argobots/lib/pkgconfig \ -DDUMPI_BUILD_PATH=/opt/dumpi \ -DTorch_DIR="$Torch_DIR" \ - -DZEROMQ_BUILD_PATH="$PWD/codes/src/surrogate/zmqml" \ -DCMAKE_PREFIX_PATH="$PWD/ross-install" - name: Build CODES diff --git a/.gitignore b/.gitignore index 66e2d514..a81f0b86 100644 --- a/.gitignore +++ b/.gitignore @@ -20,8 +20,6 @@ /maint/codes.pc /test-driver .deps -src/surrogate/zmqml/demozmqmlrequester -src/surrogate/zmqml/libzmqmlrequester.so # make generated artifacts .dirstamp diff --git a/CMakeLists.txt b/CMakeLists.txt index 26279aab..b078d34a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,6 @@ find_package(MPI REQUIRED) # ============================================================================ set(DUMPI_BUILD_PATH "" CACHE PATH "Directory where dumpi include and lib are installed") -set(ZEROMQ_BUILD_PATH "" CACHE PATH "Directory containing libzmqmlrequester.so (for CODES_USE_ZEROMQ)") set(CODES_USE_DUMPI AUTO CACHE STRING "DUMPI trace workloads (AUTO/ON/OFF)") set(CODES_USE_SWM AUTO CACHE STRING "SWM online workloads; requires argobots (AUTO/ON/OFF)") @@ -77,7 +76,7 @@ set(CODES_USE_UNION AUTO CACHE STRING "UNION online workloads; implies SWM (A set(CODES_USE_RECORDER AUTO CACHE STRING "Recorder I/O workload, no external dep (AUTO/ON/OFF)") set(CODES_USE_TORCH AUTO CACHE STRING "Torch ML models (AUTO/ON/OFF)") set(CODES_USE_DARSHAN OFF CACHE STRING "Darshan I/O workload — not yet wired (AUTO/ON/OFF)") -set(CODES_USE_ZEROMQ AUTO CACHE STRING "ZeroMQ director-client surrogate; needs ZEROMQ_BUILD_PATH (AUTO/ON/OFF)") +set(CODES_USE_ZEROMQ AUTO CACHE STRING "ZeroMQ director-client/zmqml surrogate; needs libzmq + zmq.hpp + rapidjson (AUTO/ON/OFF)") foreach(_dep DUMPI SWM UNION RECORDER TORCH DARSHAN ZEROMQ) set_property(CACHE CODES_USE_${_dep} PROPERTY STRINGS AUTO ON OFF) endforeach() @@ -181,22 +180,27 @@ else() endif() -## ZeroMQ — director-client surrogate (opt-in; needs ZEROMQ_BUILD_PATH). 
-## "found" = a build path was provided; the imported target + linkage live in -## src/CMakeLists.txt, gated on the internal USE_ZEROMQ bool. -if(CODES_USE_ZEROMQ STREQUAL "ON" AND NOT ZEROMQ_BUILD_PATH) - message(FATAL_ERROR - "CODES_USE_ZEROMQ=ON requires ZEROMQ_BUILD_PATH. Build " - "src/surrogate/zmqml/libzmqmlrequester.so first, then reconfigure with " - "-DZEROMQ_BUILD_PATH=.") -endif() +## ZeroMQ — director-client / zmqml surrogate (the CLIENT half; the ML server is +## a separate process reached over a socket). The requester needs the libzmq C++ +## binding (zmq.hpp) and the header-only rapidjson, both build-time only. AUTO- +## probed like the other optional deps; the requester target + linkage live in +## src/surrogate/zmqml + src/CMakeLists.txt, gated on the internal USE_ZEROMQ bool. set(_zeromq_found FALSE) -if(ZEROMQ_BUILD_PATH) - set(_zeromq_found TRUE) +if(NOT CODES_USE_ZEROMQ STREQUAL "OFF") + pkg_check_modules(ZeroMQ IMPORTED_TARGET libzmq) + # zmq.hpp (cppzmq) ships with libzmq-dev on some distros, separately on others. + find_path(ZMQ_CPP_INCLUDE_DIR zmq.hpp HINTS ${ZeroMQ_INCLUDE_DIRS}) + find_path(RapidJSON_INCLUDE_DIR rapidjson/document.h) + if(ZeroMQ_FOUND AND ZMQ_CPP_INCLUDE_DIR AND RapidJSON_INCLUDE_DIR) + set(_zeromq_found TRUE) + endif() endif() codes_resolve_dep(ZEROMQ "${_zeromq_found}" USE_ZEROMQ) if(USE_ZEROMQ) - message(STATUS "ZeroMQ director-client surrogate enabled (${ZEROMQ_BUILD_PATH})") + message(STATUS "ZeroMQ director-client/zmqml surrogate enabled " + "(${ZeroMQ_LIBRARIES}; rapidjson at ${RapidJSON_INCLUDE_DIR})") +else() + message(STATUS "ZeroMQ director-client/zmqml surrogate disabled") endif() add_subdirectory(src) diff --git a/CODES-compile-instructions.sh b/CODES-compile-instructions.sh index 1f285f18..b380f6ed 100644 --- a/CODES-compile-instructions.sh +++ b/CODES-compile-instructions.sh @@ -217,28 +217,10 @@ if ! 
pkg-config --exists libzmq 2>/dev/null; then echo " or set PKG_CONFIG_PATH to the directory containing libzmq.pc." >&2 fi -# Build local ZMQML requester library required by director-client.C -pushd codes/src/surrogate/zmqml -make clean -make -test -f libzmqmlrequester.so -test -f zmqmlrequester.h -popd - -# Make imported zmqmlrequester target visible to doc/example and tests. -python3 - <<'INNERPY' -from pathlib import Path -cm = Path("codes/src/CMakeLists.txt") -text = cm.read_text() -old = "add_library(zmqmlrequester SHARED IMPORTED )" -new = "add_library(zmqmlrequester SHARED IMPORTED GLOBAL)" -if old in text: - cm.write_text(text.replace(old, new)) -elif new in text: - pass -else: - raise SystemExit("Could not find zmqmlrequester imported target line in codes/src/CMakeLists.txt") -INNERPY +# The zmqml requester is built by CODES' own CMake (src/surrogate/zmqml) when +# CODES_USE_ZEROMQ resolves on — no separate make step. The pkg-config setup +# above is what lets CMake find libzmq; rapidjson is picked up from the system +# include path. mkdir -p codes/build pushd codes/build @@ -367,9 +349,6 @@ make_args_codes=( -DCMAKE_USE_WIN32_THREADS_INIT=0 -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DCMAKE_INSTALL_PREFIX="$(realpath bin)" - -DZEROMQ_BUILD_PATH="$(realpath "$CUR_DIR/codes/src/surrogate/zmqml")" - -DZeroMQ_INCLUDE_DIR=/usr/include - -DZeroMQ_LIBRARY=/usr/lib/x86_64-linux-gnu/libzmq.so ) if [ $swm_enable = 1 ]; then make_args_codes=( diff --git a/ci/full/Dockerfile b/ci/full/Dockerfile index e51c8720..ba505878 100644 --- a/ci/full/Dockerfile +++ b/ci/full/Dockerfile @@ -102,8 +102,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # --- ZeroMQ + RapidJSON: back the optional director-client / zmqml surrogate # (USE_ZEROMQ). On jammy libzmq3-dev ships zmq.h AND the zmq.hpp C++ binding; # rapidjson-dev is the header-only JSON parser the requester + director-client -# use. 
Both are headers/libs only — libzmqmlrequester.so is built from CODES -# source in the build job. --- +# use. Both are headers/libs only — the zmqml requester itself is built from +# CODES source by CODES' own CMake (src/surrogate/zmqml) in the build job. --- RUN apt-get update && apt-get install -y --no-install-recommends \ libzmq3-dev rapidjson-dev \ && rm -rf /var/lib/apt/lists/* diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1f27f5ea..1c07eb74 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -125,10 +125,12 @@ if(USE_TORCH) endif() # ZeroMQ / director-client (opt-in via CODES_USE_ZEROMQ; resolved to the internal -# USE_ZEROMQ bool in the top-level CMakeLists, which also enforces that -# ZEROMQ_BUILD_PATH is set). The imported target + linkage below are gated on it. +# USE_ZEROMQ bool in the top-level CMakeLists). director-client.cxx is part of +# libcodes; the zmqml requester it calls is its own static lib (built in the +# surrogate/zmqml subdir and linked into codes below). if(USE_ZEROMQ) list(APPEND SRCS surrogate/director-client.cxx) + add_subdirectory(surrogate/zmqml) endif() add_library(codes STATIC ${SRCS}) @@ -174,13 +176,6 @@ if(USE_ONLINE) endif() endif() -if(USE_ZEROMQ) - add_library(zmqmlrequester SHARED IMPORTED GLOBAL) - set_target_properties(zmqmlrequester PROPERTIES - IMPORTED_LOCATION "${ZEROMQ_BUILD_PATH}/libzmqmlrequester.so" - INTERFACE_INCLUDE_DIRECTORIES "${ZEROMQ_BUILD_PATH}") -endif() - # Public include dirs travel with the target. 
BUILD_INTERFACE entries serve # CODES's own compilation and in-tree consumers; INSTALL_INTERFACE entries serve # installed consumers — include/ for `` and include/codes/ for the @@ -195,7 +190,6 @@ target_include_directories(codes PUBLIC $ $ $ - $:${ZEROMQ_BUILD_PATH}>> ) target_link_libraries(codes PUBLIC ${LIBS_TO_LINK}) @@ -225,28 +219,15 @@ if(USE_DUMPI) list(APPEND CODES_TARGETS model-net-dumpi-traces-dump) endif() -# ZMQ — only resolved + linked when USE_ZEROMQ is on; otherwise nothing -# in the codes library calls into libzmq. +# The zmqml requester defines the zmqml_* symbols that codes' OWN objects +# (director-client.cxx, dragonfly-dally.cxx) reference, so it is a usage +# requirement of the codes library itself — link it PUBLIC so every consumer of +# codes (the CODES_TARGETS executables AND the doc/example + tests targets, which +# are not in CODES_TARGETS) inherits it and resolves zmqml_*. As a real target it +# lands at the end of each consumer's link line after libcodes.a, the order the +# references need, and carries libzmq + its include dirs transitively. if(USE_ZEROMQ) - pkg_check_modules(PC_ZeroMQ QUIET zmq) - find_path(ZeroMQ_INCLUDE_DIR NAMES zmq.hpp PATHS ${PC_ZeroMQ_INCLUDE_DIRS}) - find_library(ZeroMQ_LIBRARY NAMES zmq PATHS ${PC_ZeroMQ_LIBRARY_DIRS}) - - # The requester defines the zmqml_* symbols that codes' OWN objects - # (director-client.cxx, dragonfly-dally.cxx) reference, so it is a usage - # requirement of the codes library itself. Linking it PUBLIC here — rather - # than PRIVATE per-executable in the CODES_TARGETS loop below — fixes two - # things the old per-exe approach got wrong: - # * Coverage: every consumer of codes inherits it, including the - # doc/example targets and the tests targets, which are NOT in - # CODES_TARGETS and so linked no requester at all (undefined zmqml_*). 
- # * Order: it lands at the END of each consumer's link line, after - # libcodes.a and codes' other transitive deps (ROSS, SWM, UNION, ...), - # which is the order the references need. Linked ahead of those deps it - # was dropped and zmqml_* went undefined on the heavy (UNION+ZeroMQ) - # build. zmqmlrequester is an IMPORTED target, so — like PkgConfig::SWM - # above — it's exempt from the install(EXPORT) "not in export set" error. - target_link_libraries(codes PUBLIC zmqmlrequester ${ZeroMQ_LIBRARY}) + target_link_libraries(codes PUBLIC zmqmlrequester) endif() # Each executable links codes; its include dirs, MPI, ROSS, the optional-dep @@ -278,8 +259,15 @@ install(FILES "${PROJECT_BINARY_DIR}/codes_config.h" DESTINATION include/codes) install(TARGETS ${CODES_TARGETS} RUNTIME DESTINATION bin) # Library target + namespaced export. codes::codes carries the public include -# dirs (INSTALL_INTERFACE) and the ROSS / MPI usage requirements. -install(TARGETS codes +# dirs (INSTALL_INTERFACE) and the ROSS / MPI usage requirements. Under +# USE_ZEROMQ the zmqml requester is a real PUBLIC dependency of codes, so it must +# join the export set too (its imported PkgConfig::ZeroMQ dep stays exempt, like +# the SWM/UNION imported targets codes already exports). +set(CODES_EXPORT_TARGETS codes) +if(USE_ZEROMQ) + list(APPEND CODES_EXPORT_TARGETS zmqmlrequester) +endif() +install(TARGETS ${CODES_EXPORT_TARGETS} EXPORT codesTargets ARCHIVE DESTINATION lib LIBRARY DESTINATION lib diff --git a/src/networks/model-net/dragonfly-dally.cxx b/src/networks/model-net/dragonfly-dally.cxx index abf1e012..47a36c7f 100644 --- a/src/networks/model-net/dragonfly-dally.cxx +++ b/src/networks/model-net/dragonfly-dally.cxx @@ -49,9 +49,9 @@ * Optional ZeroMQ Director requester. * * These symbols are defined only when CODES is built with CODES_HAVE_ZEROMQ=ON - * (src/surrogate/director-client.cxx + libzmqmlrequester). 
CODES_HAVE_ZEROMQ is - * all-or-nothing for a given build: src/CMakeLists.txt links libzmqmlrequester - * into *every* CODES executable when ON and into none when OFF. So whether the + * (src/surrogate/director-client.cxx + the zmqml requester lib). CODES_HAVE_ZEROMQ + * is all-or-nothing for a given build: src/CMakeLists.txt links the zmqmlrequester + * target into *every* CODES executable when ON and into none when OFF. So whether the * requester is available is a compile-time fact, not a runtime one — the * original __attribute__((weak)) + runtime `if (!zmqml_director_request)` * checks could only ever take their "available" branch under ON and their diff --git a/src/surrogate/zmqml/CMakeLists.txt b/src/surrogate/zmqml/CMakeLists.txt new file mode 100644 index 00000000..e74d1628 --- /dev/null +++ b/src/surrogate/zmqml/CMakeLists.txt @@ -0,0 +1,38 @@ +# The zmqml requester: the CLIENT half of the director / ML surrogate. It opens +# a ZeroMQ REQ socket and exchanges JSON (rapidjson) with a *separate* ML server +# process (zmqmlserver.py). The "many sims, one ML server" topology lives at that +# socket boundary, not here — every consumer statically links this small client +# and dials the server independently. +# +# Built STATIC so it stays a self-contained, reusable unit (CODES, the demo, any +# future client) with no runtime .so to locate. A future SHARED build is a +# one-line change; CMake stamps the SONAME automatically. (A hand-rolled Makefile +# that forgot the SONAME is what made consumers bake a relative DT_NEEDED and +# fail to load from ctest's working dirs — never an issue for a real CMake target.) +# +# Reached only when USE_ZEROMQ resolved true, so libzmq + zmq.hpp + rapidjson are +# all known present (probed in the top-level CMakeLists). +add_library(zmqmlrequester STATIC zmqmlrequester.cpp) + +# PUBLIC libzmq: consumers (codes, and the executables that link codes) need the +# zmq_* symbols on their final link line. 
zmq.hpp travels with PkgConfig::ZeroMQ. +target_link_libraries(zmqmlrequester PUBLIC PkgConfig::ZeroMQ) + +target_include_directories(zmqmlrequester + # zmqmlrequester.h is the public surface — director-client.cxx includes it. + PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}> + # rapidjson is header-only and used only inside zmqmlrequester.cpp. + PRIVATE ${RapidJSON_INCLUDE_DIR}) + +# codes may be built as a shared lib in a superbuild; keep the requester PIC so +# it can be archived into one. (The old Makefile compiled -fPIC too.) +set_target_properties(zmqmlrequester PROPERTIES POSITION_INDEPENDENT_CODE ON) + +# Standalone dev tool: drives the requester against a running zmqmlserver.py to +# smoke-test the wire protocol. Not part of CODES — off by default; turn on with +# -DCODES_BUILD_ZMQML_DEMO=ON. (Replaces the old hand-rolled Makefile + runcppdemo.sh.) +option(CODES_BUILD_ZMQML_DEMO "Build the standalone zmqml requester demo tool" OFF) +if(CODES_BUILD_ZMQML_DEMO) + add_executable(demozmqmlrequester demozmqmlrequester.cpp) + target_link_libraries(demozmqmlrequester PRIVATE zmqmlrequester) +endif() diff --git a/src/surrogate/zmqml/Makefile b/src/surrogate/zmqml/Makefile deleted file mode 100644 index b4abcfab..00000000 --- a/src/surrogate/zmqml/Makefile +++ /dev/null @@ -1,27 +0,0 @@ - -CXX=g++ -CXXFLAGS=-g -Wall -O2 -std=c++11 $(shell pkg-config --cflags libzmq) -Wdeprecated-declarations -LDFLAGS=$(shell pkg-config --libs libzmq) -lm -TARGETS=libzmqmlrequester.so demozmqmlrequester - -all: $(TARGETS) - -libzmqmlrequester.so: zmqmlrequester.o - $(CXX) -shared -o $@ $^ $(LDFLAGS) - -zmqmlrequester.o: zmqmlrequester.cpp zmqmlrequester.h - $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ - -demozmqmlrequester: demozmqmlrequester.cpp libzmqmlrequester.so - $(CXX) $(CXXFLAGS) -o $@ $< $(LDFLAGS) -L./ -lzmqmlrequester - -# doxygen -# sphinx - -clean: - rm -f $(TARGETS) - rm -f *.o - rm -f tmptestsend.dat - -distclean: clean - rm -f *~ diff --git a/src/surrogate/zmqml/NOTES.txt b/src/surrogate/zmqml/NOTES.txt index
5e82b2eb..eeba2ab7 100644 --- a/src/surrogate/zmqml/NOTES.txt +++ b/src/surrogate/zmqml/NOTES.txt @@ -7,9 +7,8 @@ You need to open two terminals for this demo. In the first terminal, $ ./zmqmlserver.py -In the second terminal, -$ ./runcppdemo.sh - - - - +In the second terminal, build the demo (off by default) and run it from this +directory so it finds model/ and the server socket: +$ cmake -S . -B build -DCODES_USE_ZEROMQ=ON -DCODES_BUILD_ZMQML_DEMO=ON ... +$ cmake --build build --target demozmqmlrequester +$ cd src/surrogate/zmqml && ../../../build/src/surrogate/zmqml/demozmqmlrequester diff --git a/src/surrogate/zmqml/runcppdemo.sh b/src/surrogate/zmqml/runcppdemo.sh deleted file mode 100755 index fe460392..00000000 --- a/src/surrogate/zmqml/runcppdemo.sh +++ /dev/null @@ -1,2 +0,0 @@ -make -LD_LIBRARY_PATH=$LD_LIBRARY_PATH:`pwd` ./demozmqmlrequester