From 987a1a951512e3e39bac9c9023806474caa747ac Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 18 Jun 2026 04:05:16 +0000
Subject: [PATCH] Rebuild the live voice cascade on a deepagents brain and
 rename to `assembly live`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cascade's LLM leg is now a deepagents graph (web search, URL fetch, and
AssemblyAI docs tools) instead of a single completion, so a spoken turn can
transparently use tools — a "talk to Gemini Live" experience. Tools are
read-only and auto-approved (a voice turn can't pause for keyboard approval),
and the system prompt keeps replies short and speakable for low latency.

The command is renamed from `agent-cascade` to `live`. The internal slice keeps
its `agent_cascade` name; only the user-facing command, help text, examples,
docs, and the show-code teaching snippet change. `assembly agent`'s help now
says it uses the Voice Agent API to distinguish the two.

- aai_cli/agent_cascade/brain.py: build the gateway-bound deepagents graph and a
  stateless per-turn completer; the graph is the only network seam, so the
  wiring is unit-tested against a fake graph / fake chat model.
- code_agent/model.build_model gains max_tokens + extra (extra_body) so the
  cascade's --max-tokens/--llm-config thread through; the coding agent's call is
  unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01Hiovdq7aoXtbfg9juPzwQa
---
 README.md                                     |   8 +-
 REFERENCE.md                                  |   2 +-
 aai_cli/agent_cascade/brain.py                | 136 ++++++++++
 aai_cli/agent_cascade/engine.py               |  15 +-
 aai_cli/code_agent/model.py                   |  17 +-
 aai_cli/code_gen/agent_cascade.py             |   4 +-
 aai_cli/commands/agent/__init__.py            |   2 +-
 aai_cli/commands/agent_cascade/__init__.py    |  34 +--
 aai_cli/commands/agent_cascade/_exec.py       |   4 +-
 pyproject.toml                                |   5 +-
 pyrightconfig.tests.json                      |   3 +-
 scripts/generated_code_compile_gate.py        |   4 +-
 .../test_snapshots_help_root.ambr             |  83 +++----
 .../test_snapshots_help_run.ambr              | 233 +++++++++---------
 tests/test_agent_cascade_brain.py             | 186 ++++++++++++++
 tests/test_agent_cascade_command.py           |  51 ++--
 tests/test_agent_cascade_show_code.py         |  24 +-
 tests/test_sandbox_access.py                  |   6 +-
 tests/test_smoke.py                           |   2 +-
 19 files changed, 577 insertions(+), 242 deletions(-)
 create mode 100644 aai_cli/agent_cascade/brain.py
 create mode 100644 tests/test_agent_cascade_brain.py

diff --git a/README.md b/README.md
index ed70dd87..c534497e 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins
 - **🎯 One command for everything**: transcription, real-time streaming, voice agents, LLM prompts, and WER benchmarking — no SDK boilerplate.
 - **🔌 Built for pipelines**: data goes to stdout, errors to stderr, `--json` gives stable machine-readable output, and `-` reads audio from stdin.
 - **🔐 Secure by default**: your API key lives in the OS keyring, never in a dotfile — and run commands have no `--api-key` flag, so keys can't leak into `ps` or shell history.
-- **🛠️ From demo to deployed app**: `assembly init` scaffolds a runnable FastAPI starter, `assembly dev` / `share` / `deploy` run, tunnel, and ship it, and `--show-code` prints the equivalent Python SDK script for any run command (`transcribe` / `stream` / `agent` / `agent-cascade`).
+- **🛠️ From demo to deployed app**: `assembly init` scaffolds a runnable FastAPI starter, `assembly dev` / `share` / `deploy` run, tunnel, and ship it, and `--show-code` prints the equivalent Python SDK script for any run command (`transcribe` / `stream` / `agent` / `live`).
 - **🤖 Agent-ready**: `assembly setup install` wires your coding agent up with the AssemblyAI docs MCP server and skills.
 - **📖 Open source**: MIT licensed.
 
@@ -48,7 +48,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins
 | `assembly stream` | Real-time transcription from your microphone, a file, or a URL — on macOS it can capture system audio too |
 | `assembly dictate` | Signal-driven dictation: records immediately, send SIGTERM for instant text — scriptable from hotkey tools like Hammerspoon (Sync STT API, up to 120 s per utterance) |
 | `assembly agent` | Full-duplex spoken conversation with a voice agent, right in your terminal |
-| `assembly agent-cascade` | Same live conversation, but wired client-side from Streaming STT + the LLM Gateway + streaming TTS, like the `agent-cascade` starter (sandbox-only) |
+| `assembly live` | Talk live to a tool-using voice agent, wired client-side from Streaming STT + a deepagents brain on the LLM Gateway + streaming TTS — it can web-search, fetch URLs, and read the docs mid-conversation, like the `agent-cascade` starter (sandbox-only) |
 | `assembly speak` | Synthesize text to speech over the streaming-TTS WebSocket (sandbox-only) |
 | `assembly llm` | Prompt the LLM Gateway over a transcript, files, stdin, or a live stream |
 | `assembly code` | Terminal coding agent (deepagents SDK) backed only by the LLM Gateway — reads/writes/edits files, runs shell, searches the docs MCP, and can invoke the `assembly` CLI itself; mutating actions ask for approval. Defaults to voice in a terminal (speak your request, replies read back via streaming TTS in the sandbox); pass `--no-voice` for the keyboard TUI |
@@ -63,7 +63,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins
 | `assembly transcripts` / `sessions` | Browse and fetch past transcripts and streaming sessions |
 | `assembly keys` / `balance` / `usage` / `limits` / `audit` | Account self-service via browser login |
 
-Add `--show-code` to `transcribe` / `stream` / `agent` / `agent-cascade` to print the equivalent Python SDK script instead of running — the built-in path from CLI experiment to SDK code.
+Add `--show-code` to `transcribe` / `stream` / `agent` / `live` to print the equivalent Python SDK script instead of running — the built-in path from CLI experiment to SDK code.
 
 ## ✨ Things you can do with it
 
@@ -194,7 +194,7 @@ assembly transcripts list --json --limit 5 \
 assembly agent --voice ivy --system-prompt "you're a helpful interviewer"
 ```
 
-**Graduate to the SDK** — `--show-code` prints the equivalent Python script for any `transcribe`/`stream`/`agent`/`agent-cascade` run instead of executing it:
+**Graduate to the SDK** — `--show-code` prints the equivalent Python script for any `transcribe`/`stream`/`agent`/`live` run instead of executing it:
 
 ```sh
 assembly agent --system-prompt "you're a story generator" --show-code > story.py
diff --git a/REFERENCE.md b/REFERENCE.md
index bf9f3d8e..09216ef4 100644
--- a/REFERENCE.md
+++ b/REFERENCE.md
@@ -94,7 +94,7 @@ each carrying a `"type"` field to dispatch on:
 | ------- | ----------- |
 | `assembly stream --json` | `begin`, `turn`, `termination` (with `--from-stdin`, a `source` event precedes each file's events) |
 | `assembly agent --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` |
-| `assembly agent-cascade --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` |
+| `assembly live --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` |
 | `assembly dictate --json` | `utterance` |
 | `assembly llm --follow --json` | `answer` |
 | `assembly transcribe <batch> --json` | `result` (one per source), then `reduce` if `--llm-reduce` is set |
diff --git a/aai_cli/agent_cascade/brain.py b/aai_cli/agent_cascade/brain.py
new file mode 100644
index 00000000..7be2297c
--- /dev/null
+++ b/aai_cli/agent_cascade/brain.py
@@ -0,0 +1,136 @@
+"""Deepagents-powered reply brain for the live voice cascade.
+
+`assembly live` answers each spoken turn with a deepagents graph instead of a single
+LLM completion, so the agent can transparently reach for tools — web search, URL
+fetch, the AssemblyAI docs — mid-conversation, mimicking a live multimodal assistant
+(the "talk to Gemini Live" experience). The graph is built once per session
+(:func:`build_graph`) and invoked statelessly per turn with the running history the
+cascade already keeps (:func:`build_completer`); tools are read-only and auto-approved,
+because a spoken turn can't pause for a keyboard confirmation, and the system prompt
+keeps every reply short and speakable.
+
+The graph is the only network seam: :func:`build_completer` accepts an injected graph,
+so the per-turn orchestration is unit-tested against a fake with no sockets — the same
+seam the rest of the cascade uses for its STT/LLM/TTS legs.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING
+
+from aai_cli.agent_cascade.config import CascadeConfig
+from aai_cli.code_agent.agent import CompiledAgent
+
+if TYPE_CHECKING:
+    from langchain_core.tools import BaseTool
+    from openai.types.chat import ChatCompletionMessageParam
+
+# Tool guidance that build_system_prompt() appends to the user's persona: it tells the
+# model which tools it may call and that every reply is read aloud. The persona alone
+# (CascadeConfig.system_prompt) is a plain-LLM prompt that says nothing about tools.
+_TOOL_GUIDANCE = (
+    "You can use tools to help answer: search the web for current or unfamiliar facts, "
+    "fetch a specific URL, and look up the AssemblyAI documentation. Reach for a tool "
+    "when a question needs fresh or external information; answer directly and instantly "
+    "when you already know. Your reply is read aloud, so keep it short and spoken — no "
+    "markdown, lists, code, or raw URLs."
+)
+
+
+def build_system_prompt(persona: str) -> str:
+    """Join the caller's persona and the shared tool guidance, blank-line separated."""
+    return "\n\n".join((persona, _TOOL_GUIDANCE))
+
+
+def build_live_tools() -> list[BaseTool]:
+    """Assemble the read-only tools the live agent may call without approval.
+
+    The builders come from the coding agent's tool modules, but nothing here is
+    approval-gated: a spoken turn cannot stop for a keyboard confirmation, so only
+    read-only tools are offered and they run automatically. The list always starts with
+    URL fetch; web search follows only when its builder returns a tool (it needs
+    ``TAVILY_API_KEY``); whatever docs tools load — possibly none — come last.
+    """
+    from aai_cli.code_agent.docs_mcp import load_docs_tools
+    from aai_cli.code_agent.fetch_tool import build_fetch_tool
+    from aai_cli.code_agent.web_search import build_web_search_tool
+
+    # Build in a fixed order so the toolset is stable: fetch, optional search, docs.
+    fetch = build_fetch_tool()
+    web_search = build_web_search_tool()
+    optional: list[BaseTool] = [] if web_search is None else [web_search]
+    docs = load_docs_tools()
+    return [fetch, *optional, *docs]
+
+
+def build_graph(
+    api_key: str, config: CascadeConfig, *, tools: Sequence[BaseTool] | None = None
+) -> CompiledAgent:
+    """Build one session's deepagents graph on top of the gateway chat model.
+
+    The model comes from the coding agent's ``build_model`` (the gateway-bound
+    ``ChatOpenAI``), with the cascade's ``--max-tokens``/``--llm-config`` values passed
+    along. When ``tools`` is ``None`` the toolset is :func:`build_live_tools`; any other
+    value — an empty list included — is copied and used as-is, which lets tests avoid
+    the network-touching docs probe.
+    """
+    from deepagents import create_deep_agent
+
+    from aai_cli.code_agent.model import build_model
+
+    chat_model = build_model(
+        api_key, model=config.model, max_tokens=config.max_tokens, extra=config.llm_extra
+    )
+    toolset = list(tools) if tools is not None else build_live_tools()
+    prompt = build_system_prompt(config.system_prompt)
+    return create_deep_agent(model=chat_model, tools=toolset, system_prompt=prompt)
+
+
+def build_completer(
+    api_key: str, config: CascadeConfig, *, graph: CompiledAgent | None = None
+) -> Callable[[list[ChatCompletionMessageParam]], str]:
+    """Return the cascade engine's ``complete_reply`` callable, backed by the graph.
+
+    Each call strips ``system``-role entries from the history (the graph carries its
+    own system prompt), invokes the graph on what remains, and returns the final spoken
+    text. Pass ``graph`` to reuse a prebuilt or fake graph; otherwise one is compiled
+    here, once, via :func:`build_graph`.
+    """
+    agent = graph if graph is not None else build_graph(api_key, config)
+
+    def complete_reply(messages: list[ChatCompletionMessageParam]) -> str:
+        turns = [entry for entry in messages if entry.get("role") != "system"]
+        return _reply_text(agent.invoke({"messages": turns}))
+
+    return complete_reply
+
+
+def _reply_text(result: dict[str, object]) -> str:
+    """The agent's spoken reply for this turn: its last assistant message with text.
+
+    Tool-call ``AIMessage``\\s carry no text, so scan backwards for the last one that
+    does — but stop at the newest ``HumanMessage``: the result may include the history
+    passed in, and a turn that produced no text must not replay an earlier answer.
+    """
+    messages = result.get("messages")
+    for message in reversed(messages if isinstance(messages, list) else []):
+        kind = type(message).__name__
+        if kind == "HumanMessage":
+            break  # everything older belongs to previous turns
+        if kind != "AIMessage":
+            continue
+        text = _content_text(getattr(message, "content", "")).strip()
+        if text:
+            return text
+    return ""
+
+
+def _content_text(content: object) -> str:
+    """Coerce message content (a string, a list of blocks, or ``None``) to plain text."""
+    if content is None or isinstance(content, str):
+        return content or ""
+    if not isinstance(content, list):
+        return str(content)
+    parts = (b.get("text") if isinstance(b, dict) else str(b) for b in content)
+    return "".join(part for part in parts if isinstance(part, str))
diff --git a/aai_cli/agent_cascade/engine.py b/aai_cli/agent_cascade/engine.py
index 9c400657..af52f15a 100644
--- a/aai_cli/agent_cascade/engine.py
+++ b/aai_cli/agent_cascade/engine.py
@@ -18,9 +18,10 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Protocol
 
+from aai_cli.agent_cascade import brain
 from aai_cli.agent_cascade.config import CascadeConfig
 from aai_cli.agent_cascade.text import split_sentences, trim_history
-from aai_cli.core import client, llm
+from aai_cli.core import client
 from aai_cli.core.errors import CLIError
 from aai_cli.tts import session as tts_session
 from aai_cli.tts.session import SpeakConfig
@@ -121,15 +122,9 @@ def real(
         def run_stt(on_turn: Callable[[object], None]) -> None:
             client.stream_audio(api_key, audio, params=stt_params, on_turn=on_turn)
 
-        def complete_reply(messages: list[ChatCompletionMessageParam]) -> str:
-            response = llm.complete(
-                api_key,
-                model=config.model,
-                messages=messages,
-                max_tokens=config.max_tokens,
-                extra=dict(config.llm_extra) or None,
-            )
-            return llm.content_of(response)
+        # The LLM leg is a deepagents graph (web search / URL fetch / docs tools), not a
+        # single completion, so a spoken turn can transparently use tools.
+        complete_reply = brain.build_completer(api_key, config)
 
         def synthesize(text: str) -> bytes:
             spec = SpeakConfig(
diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py
index bdb6a4a2..716af2fc 100644
--- a/aai_cli/code_agent/model.py
+++ b/aai_cli/code_agent/model.py
@@ -8,6 +8,7 @@
 
 from __future__ import annotations
 
+from collections.abc import Mapping
 from typing import TYPE_CHECKING
 
 from aai_cli.core import environments
@@ -37,7 +38,13 @@ def _flatten_content(messages: object) -> None:
             )
 
 
-def build_model(api_key: str, *, model: str) -> BaseChatModel:
+def build_model(
+    api_key: str,
+    *,
+    model: str,
+    max_tokens: int | None = None,
+    extra: Mapping[str, object] | None = None,
+) -> BaseChatModel:
     """A ChatOpenAI bound to the active environment's LLM Gateway.
 
     ``use_responses_api=False`` keeps it on the chat-completions endpoint the gateway
@@ -45,6 +52,12 @@ def build_model(api_key: str, *, model: str) -> BaseChatModel:
     Responses API that langchain would otherwise prefer for ``openai:`` models. The
     subclass also flattens content-parts arrays the gateway rejects (see
     :func:`_flatten_content`).
+
+    ``max_tokens`` caps the per-reply length (the live voice agent passes a small cap to
+    keep spoken replies short and fast); ``extra`` passes any additional gateway request
+    fields through as ``extra_body`` (so they reach the request body verbatim, like
+    `aai_cli.core.llm`'s ``extra``). Both default to off so the coding agent's call is
+    unchanged.
     """
     from langchain_openai import ChatOpenAI
     from pydantic import SecretStr
@@ -64,4 +77,6 @@ def _get_request_payload(
         base_url=environments.active().llm_gateway_base,
         api_key=SecretStr(api_key),
         use_responses_api=False,
+        max_tokens=max_tokens,
+        extra_body=dict(extra) if extra else None,
     )
diff --git a/aai_cli/code_gen/agent_cascade.py b/aai_cli/code_gen/agent_cascade.py
index 0a861911..5f5306f0 100644
--- a/aai_cli/code_gen/agent_cascade.py
+++ b/aai_cli/code_gen/agent_cascade.py
@@ -16,9 +16,11 @@
 # which is never formatted — so no brace has to be doubled.
 _HEADER = """\
 # Live voice cascade: Streaming STT -> LLM Gateway -> streaming TTS, wired client-side.
-# This is what `assembly --sandbox agent-cascade` runs: it transcribes your speech,
+# The basic cascade behind `assembly --sandbox live`: it transcribes your speech,
 # sends each finalized turn to the LLM Gateway, and speaks the reply through streaming
 # TTS — the same three primitives the agent-cascade init template wires server-side.
+# (The `live` command adds a tool-using agent on the LLM leg; this snippet is the
+# plain single-completion version to build from.)
 # Requires audio + websockets:  pip install sounddevice websockets openai
 # Tip: use headphones — the mic stays open while the agent speaks, so on speakers it
 # would hear itself and loop.
diff --git a/aai_cli/commands/agent/__init__.py b/aai_cli/commands/agent/__init__.py
index f535b54c..b20dfc2a 100644
--- a/aai_cli/commands/agent/__init__.py
+++ b/aai_cli/commands/agent/__init__.py
@@ -84,7 +84,7 @@ def agent(
         help="Print the equivalent Python SDK code and exit (does not start a session)",
     ),
 ) -> None:
-    """Hold a live two-way voice conversation with a voice agent
+    """Hold a live two-way voice conversation with the Voice Agent API
 
     Use headphones: the mic stays open while the agent speaks, so on
     speakers it would hear itself and loop. Pass an audio file/URL (or
diff --git a/aai_cli/commands/agent_cascade/__init__.py b/aai_cli/commands/agent_cascade/__init__.py
index 3e99f146..b17e85e8 100644
--- a/aai_cli/commands/agent_cascade/__init__.py
+++ b/aai_cli/commands/agent_cascade/__init__.py
@@ -31,7 +31,7 @@
 SPEC = command_registry.CommandModuleSpec(
     panel=help_panels.TRANSCRIPTION,
     order=45,  # pragma: no mutate -- sparse rank; a +-1 shift is order-equivalent
-    commands=("agent-cascade",),
+    commands=("live",),
 )
 
 
@@ -43,28 +43,28 @@ def _emit_voice_list(_state: AppState, json_mode: bool) -> None:
 
 
 @app.command(
-    name="agent-cascade",
+    name="live",
     rich_help_panel=help_panels.TRANSCRIPTION,
     epilog=examples_epilog(
         [
-            ("Start a live cascade conversation", "assembly --sandbox agent-cascade"),
+            ("Start a live voice conversation", "assembly --sandbox live"),
             (
                 "Pick a voice and opening line",
-                'assembly --sandbox agent-cascade --voice michael --greeting "Hi there"',
+                'assembly --sandbox live --voice michael --greeting "Hi there"',
             ),
             (
                 "Give the agent a persona",
-                'assembly --sandbox agent-cascade --system-prompt "You are a terse pirate."',
+                'assembly --sandbox live --system-prompt "You are a terse pirate."',
             ),
-            ("See available voices", "assembly --sandbox agent-cascade --list-voices"),
+            ("See available voices", "assembly --sandbox live --list-voices"),
             (
                 "Print equivalent Python instead of running",
-                "assembly --sandbox agent-cascade --show-code",
+                "assembly --sandbox live --show-code",
             ),
         ]
     ),
 )
-def agent_cascade(
+def live(
     ctx: typer.Context,
     source: str | None = typer.Argument(
         None, help="Audio file path or URL to speak to the agent. Omit to use the microphone."
@@ -169,14 +169,15 @@ def agent_cascade(
         help="Print the equivalent Python SDK code and exit (does not start a session)",
     ),
 ) -> None:
-    """\\[sandbox] Hold a live voice conversation through a self-wired cascade
+    """\\[sandbox] Talk live to a tool-using voice agent
 
-    Like 'assembly agent', but instead of AssemblyAI's Voice Agent endpoint this
-    wires the three primitives together itself — Streaming STT, the LLM Gateway,
-    and streaming TTS — exactly like the 'agent-cascade' init template does
-    server-side. Because it uses streaming TTS it only runs in the sandbox: run
-    it as 'assembly --sandbox agent-cascade' (--sandbox goes before the
-    subcommand).
+    A real-time spoken conversation, wired client-side from three primitives —
+    Streaming STT, a deepagents brain on the LLM Gateway, and streaming TTS. Unlike
+    'assembly agent' (the Voice Agent API), the brain here is an agent that can use
+    tools mid-conversation — web search, URL fetch, and the AssemblyAI docs — so it
+    answers like a live multimodal assistant. Because it uses streaming TTS it only
+    runs in the sandbox: run it as 'assembly --sandbox live' (--sandbox goes before
+    the subcommand).
 
     Use headphones: the mic stays open while the agent speaks, so on speakers it
     would hear itself and loop. Pass an audio file/URL (or --sample) to speak a
@@ -185,6 +186,9 @@ def agent_cascade(
 
     This only runs a conversation in the terminal — it writes no code. To build
     an agent-cascade app, run 'assembly init agent-cascade' instead.
+
+    Web search needs a TAVILY_API_KEY in the environment; without it the agent
+    keeps its URL-fetch and docs tools.
     """
 
     if list_voices:
diff --git a/aai_cli/commands/agent_cascade/_exec.py b/aai_cli/commands/agent_cascade/_exec.py
index 0b97e230..af466c56 100644
--- a/aai_cli/commands/agent_cascade/_exec.py
+++ b/aai_cli/commands/agent_cascade/_exec.py
@@ -169,9 +169,9 @@ def _print_show_code(opts: AgentCascadeOptions, system_prompt_text: str) -> None
 def run_agent_cascade(opts: AgentCascadeOptions, state: AppState, *, json_mode: bool) -> None:
     """Execute one `assembly agent-cascade` cascade from already-parsed flags."""
     text_mode, json_mode = resolve_output_modes(opts.output_field, json_mode=json_mode)
-    validate_voice(opts.voice, voices.VOICE_NAMES, command="agent-cascade")
+    validate_voice(opts.voice, voices.VOICE_NAMES, command="live")
     # Streaming TTS has no production host, so the whole cascade is sandbox-only.
-    tts_session.require_available("agent-cascade")
+    tts_session.require_available("live")
     system_prompt_text = _resolve_system_prompt(opts.system_prompt, opts.system_prompt_file)
 
     if opts.show_code:
diff --git a/pyproject.toml b/pyproject.toml
index a7eb9e4e..6ca42df6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -258,9 +258,10 @@ module = [
     "aai_cli.code_agent.store",
     "aai_cli.code_agent.model",
     "aai_cli.commands.code._exec",
+    "aai_cli.agent_cascade.brain",
 ]
 disallow_any_generics = false
-disable_error_code = ["return-value", "arg-type", "type-arg"]
+disable_error_code = ["return-value", "arg-type", "type-arg", "call-arg"]
 
 [tool.pyright]
 # Second type checker alongside mypy: pyright catches a different class of
@@ -279,7 +280,7 @@ exclude = ["**/node_modules", "**/__pycache__", "**/.*"]
 # Unknown*/invariance diagnostics our precise signatures can't satisfy. mypy still
 # type-checks these modules (with the targeted overrides above) as the safety net, so
 # we suppress pyright diagnostics here rather than littering per-line `# pyright: ignore`.
-ignore = ["aai_cli/code_agent", "aai_cli/commands/code"]
+ignore = ["aai_cli/code_agent", "aai_cli/commands/code", "aai_cli/agent_cascade/brain.py"]
 pythonVersion = "3.12"
 typeCheckingMode = "strict"
 # Third-party deps (assemblyai, sounddevice) ship no type stubs.
diff --git a/pyrightconfig.tests.json b/pyrightconfig.tests.json
index 1ea7be4a..f9dbdf0e 100644
--- a/pyrightconfig.tests.json
+++ b/pyrightconfig.tests.json
@@ -3,7 +3,8 @@
   "ignore": [
     "tests/test_code_agent.py",
     "tests/test_code_command.py",
-    "tests/test_code_tui.py"
+    "tests/test_code_tui.py",
+    "tests/test_agent_cascade_brain.py"
   ],
   "pythonVersion": "3.12",
   "typeCheckingMode": "standard",
diff --git a/scripts/generated_code_compile_gate.py b/scripts/generated_code_compile_gate.py
index 8d258efe..bd71efdf 100644
--- a/scripts/generated_code_compile_gate.py
+++ b/scripts/generated_code_compile_gate.py
@@ -118,10 +118,10 @@ def main() -> int:
         ),
         (
             # Sandbox-only: streaming TTS has no prod host, so --sandbox makes the URLs valid.
-            "agent-cascade-basic",
+            "live-basic",
             (
                 "--sandbox",
-                "agent-cascade",
+                "live",
                 "--voice",
                 "jane",
                 "--greeting",
diff --git a/tests/__snapshots__/test_snapshots_help_root.ambr b/tests/__snapshots__/test_snapshots_help_root.ambr
index 82cc9dc9..2bb0f987 100644
--- a/tests/__snapshots__/test_snapshots_help_root.ambr
+++ b/tests/__snapshots__/test_snapshots_help_root.ambr
@@ -32,60 +32,59 @@
   │                                                    exit.                     │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Quick Start ────────────────────────────────────────────────────────────────╮
-  │ onboard        Guided setup: sign in and run your first transcription        │
+  │ onboard      Guided setup: sign in and run your first transcription          │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Coding Agent ───────────────────────────────────────────────────────────────╮
-  │ code           Run a terminal coding agent backed by the AssemblyAI LLM      │
-  │                Gateway                                                       │
+  │ code         Run a terminal coding agent backed by the AssemblyAI LLM        │
+  │              Gateway                                                         │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Build an App ───────────────────────────────────────────────────────────────╮
-  │ init           Scaffold a new app from a template and launch it              │
-  │ dev            Run the dev server for the app in the current directory       │
-  │ share          Expose the local app on a public URL via a cloudflared tunnel │
-  │ deploy         Deploy the current project to Vercel, Railway, or Fly.io      │
+  │ init         Scaffold a new app from a template and launch it                │
+  │ dev          Run the dev server for the app in the current directory         │
+  │ share        Expose the local app on a public URL via a cloudflared tunnel   │
+  │ deploy       Deploy the current project to Vercel, Railway, or Fly.io        │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Run AssemblyAI ─────────────────────────────────────────────────────────────╮
-  │ transcribe     Transcribe a file, URL, or YouTube/podcast link — or a whole  │
-  │                batch                                                         │
-  │ stream         Transcribe live audio in real time from a mic, file, URL, or  │
-  │                pipe                                                          │
-  │ dictate        Signal-driven dictation: record the mic, get the transcript   │
-  │                back                                                          │
-  │ agent          Hold a live two-way voice conversation with a voice agent     │
-  │ agent-cascade  [sandbox] Hold a live voice conversation through a self-wired │
-  │                cascade                                                       │
-  │ speak          [sandbox] Synthesize speech from text with AssemblyAI         │
-  │                streaming TTS                                                 │
-  │ llm            Send a prompt to AssemblyAI's LLM Gateway and print the reply │
-  │ clip           Cut clips from media by speaker, text match, LLM pick, or     │
-  │                time range                                                    │
-  │ dub            [sandbox] Dub a video or audio file into another language     │
-  │ caption        Burn always-visible captions into a video                     │
-  │ eval           Transcribe one or more datasets and score WER against their   │
-  │                reference texts                                               │
-  │ webhooks       Receive webhook deliveries on a public dev URL                │
+  │ transcribe   Transcribe a file, URL, or YouTube/podcast link — or a whole    │
+  │              batch                                                           │
+  │ stream       Transcribe live audio in real time from a mic, file, URL, or    │
+  │              pipe                                                            │
+  │ dictate      Signal-driven dictation: record the mic, get the transcript     │
+  │              back                                                            │
+  │ agent        Hold a live two-way voice conversation with the Voice Agent API │
+  │ live         [sandbox] Talk live to a tool-using voice agent                 │
+  │ speak        [sandbox] Synthesize speech from text with AssemblyAI streaming │
+  │              TTS                                                             │
+  │ llm          Send a prompt to AssemblyAI's LLM Gateway and print the reply   │
+  │ clip         Cut clips from media by speaker, text match, LLM pick, or time  │
+  │              range                                                           │
+  │ dub          [sandbox] Dub a video or audio file into another language       │
+  │ caption      Burn always-visible captions into a video                       │
+  │ eval         Transcribe one or more datasets and score WER against their     │
+  │              reference texts                                                 │
+  │ webhooks     Receive webhook deliveries on a public dev URL                  │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Setup & Tools ──────────────────────────────────────────────────────────────╮
-  │ doctor         Check that your environment is ready for AssemblyAI           │
-  │ setup          Set up your coding agent for AssemblyAI (docs MCP + skills)   │
-  │ config         Inspect and edit persisted CLI settings (profiles, env,       │
-  │                telemetry)                                                    │
-  │ update         Update the CLI to the latest release (brew/pipx/uv)           │
-  │ telemetry      Anonymous usage telemetry: status, enable, disable            │
+  │ doctor       Check that your environment is ready for AssemblyAI             │
+  │ setup        Set up your coding agent for AssemblyAI (docs MCP + skills)     │
+  │ config       Inspect and edit persisted CLI settings (profiles, env,         │
+  │              telemetry)                                                      │
+  │ update       Update the CLI to the latest release (brew/pipx/uv)             │
+  │ telemetry    Anonymous usage telemetry: status, enable, disable              │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ History ────────────────────────────────────────────────────────────────────╮
-  │ transcripts    Browse and fetch past transcripts                             │
-  │ sessions       Browse your past streaming (real-time) sessions               │
+  │ transcripts  Browse and fetch past transcripts                               │
+  │ sessions     Browse your past streaming (real-time) sessions                 │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Account ────────────────────────────────────────────────────────────────────╮
-  │ login          Authenticate via your browser and store a CLI API key         │
-  │ logout         Clear stored credentials for the active profile               │
-  │ whoami         Show the active profile and whether its key works             │
-  │ balance        Show your remaining account balance                           │
-  │ usage          Show usage over a date range (default: last 30 days)          │
-  │ limits         Show your account's rate limits per service                   │
-  │ keys           List, create, and rename your AssemblyAI API keys             │
-  │ audit          List recent audit-log entries for your account                │
+  │ login        Authenticate via your browser and store a CLI API key           │
+  │ logout       Clear stored credentials for the active profile                 │
+  │ whoami       Show the active profile and whether its key works               │
+  │ balance      Show your remaining account balance                             │
+  │ usage        Show usage over a date range (default: last 30 days)            │
+  │ limits       Show your account's rate limits per service                     │
+  │ keys         List, create, and rename your AssemblyAI API keys               │
+  │ audit        List recent audit-log entries for your account                  │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   
    Examples
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index 2879f6f9..cdc75e01 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -1,121 +1,10 @@
 # serializer version: 1
-# name: test_command_help_matches_snapshot[agent-cascade]
-  '''
-  
-   Usage: assembly agent-cascade [OPTIONS] [SOURCE]
-  
-   [sandbox] Hold a live voice conversation through a self-wired cascade
-  
-   Like 'assembly agent', but instead of AssemblyAI's Voice Agent endpoint this
-   wires the three primitives together itself — Streaming STT, the LLM Gateway,
-   and streaming TTS — exactly like the 'agent-cascade' init template does
-   server-side. Because it uses streaming TTS it only runs in the sandbox: run
-   it as 'assembly --sandbox agent-cascade' (--sandbox goes before the
-   subcommand).
-  
-   Use headphones: the mic stays open while the agent speaks, so on speakers it
-   would hear itself and loop. Pass an audio file/URL (or --sample) to speak a
-   recorded clip instead of the microphone; the session then ends after the
-   agent's reply.
-  
-   This only runs a conversation in the terminal — it writes no code. To build
-   an agent-cascade app, run 'assembly init agent-cascade' instead.
-  
-  ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
-  │   source      [SOURCE]  Audio file path or URL to speak to the agent. Omit   │
-  │                         to use the microphone.                               │
-  ╰──────────────────────────────────────────────────────────────────────────────╯
-  ╭─ Options ────────────────────────────────────────────────────────────────────╮
-  │ --sample                                   Speak the hosted wildfires.mp3    │
-  │                                            sample to the agent               │
-  │ --system-prompt               TEXT         System prompt (the agent's        │
-  │                                            persona)                          │
-  │                                            [default: You are a friendly,     │
-  │                                            concise voice assistant. Keep     │
-  │                                            replies short and conversational. │
-  │                                            Your reply is read aloud by a     │
-  │                                            text-to-speech engine, so write   │
-  │                                            plain spoken prose — no markdown, │
-  │                                            emoji, bullet lists, or code.]    │
-  │ --system-prompt-file          FILE         Read the system prompt from a     │
-  │                                            file (overrides --system-prompt)  │
-  │ --greeting                    TEXT         Spoken greeting                   │
-  │                                            [default: Hi! I'm your AssemblyAI │
-  │                                            voice agent. What can I help you  │
-  │                                            with?]                            │
-  │ --device                      INTEGER      Microphone device index           │
-  │ --list-voices                              Print known voices and exit       │
-  │ --json                -j                   Emit newline-delimited JSON       │
-  │                                            events                            │
-  │ --output              -o      [text|json]  Output mode: text (you:/agent:    │
-  │                                            lines as plain stdout,            │
-  │                                            pipe-friendly) or json            │
-  │ --show-code                                Print the equivalent Python SDK   │
-  │                                            code and exit (does not start a   │
-  │                                            session)                          │
-  │ --help                                     Show this message and exit.       │
-  ╰──────────────────────────────────────────────────────────────────────────────╯
-  ╭─ Text-to-speech ─────────────────────────────────────────────────────────────╮
-  │ --voice             TEXT  TTS voice. See --list-voices. [default: jane]      │
-  │ --language          TEXT  TTS language (defaults to the voice's language)    │
-  │ --tts-config        TEXT  Set any extra streaming-TTS query field as         │
-  │                           KEY=VALUE (repeatable)                             │
-  ╰──────────────────────────────────────────────────────────────────────────────╯
-  ╭─ Language model ─────────────────────────────────────────────────────────────╮
-  │ --model             TEXT                  LLM Gateway model that powers the  │
-  │                                           agent's replies                    │
-  │                                           [default:                          │
-  │                                           claude-haiku-4-5-20251001]         │
-  │ --max-tokens        INTEGER RANGE [x>=1]  Max tokens per reply               │
-  │                                           [default: 8192]                    │
-  │ --llm-config        TEXT                  Set any LLM Gateway request field  │
-  │                                           as KEY=VALUE (repeatable)          │
-  ╰──────────────────────────────────────────────────────────────────────────────╯
-  ╭─ Speech-to-text ─────────────────────────────────────────────────────────────╮
-  │ --speech-model                            TEXT              Streaming speech │
-  │                                                             model            │
-  │                                                             [default:        │
-  │                                                             u3-rt-pro]       │
-  │ --format-turns       --no-format-turns                      Format           │
-  │                                                             (punctuate)      │
-  │                                                             finalized turns  │
-  │                                                             before replying  │
-  │                                                             [default:        │
-  │                                                             format-turns]    │
-  │ --turn-detection                          [aggressive|bala  Turn-detection   │
-  │                                           nced|conservativ  sensitivity      │
-  │                                           e]                preset           │
-  │ --stt-config                              TEXT              Set any          │
-  │                                                             StreamingParame… │
-  │                                                             field as         │
-  │                                                             KEY=VALUE        │
-  │                                                             (repeatable)     │
-  │ --stt-config-file                         FILE              JSON file of     │
-  │                                                             streaming fields │
-  ╰──────────────────────────────────────────────────────────────────────────────╯
-  
-   Examples
-   Start a live cascade conversation
-   $ assembly --sandbox agent-cascade
-   Pick a voice and opening line
-   $ assembly --sandbox agent-cascade --voice michael --greeting "Hi there"
-   Give the agent a persona
-   $ assembly --sandbox agent-cascade --system-prompt "You are a terse pirate."
-   See available voices
-   $ assembly --sandbox agent-cascade --list-voices
-   Print equivalent Python instead of running
-   $ assembly --sandbox agent-cascade --show-code
-  
-  
-  
-  '''
-# ---
 # name: test_command_help_matches_snapshot[agent]
   '''
   
    Usage: assembly agent [OPTIONS] [SOURCE]
   
-   Hold a live two-way voice conversation with a voice agent
+   Hold a live two-way voice conversation with the Voice Agent API
   
    Use headphones: the mic stays open while the agent speaks, so on
    speakers it would hear itself and loop. Pass an audio file/URL (or
@@ -699,6 +588,126 @@
   
   
   
+  '''
+# ---
+# name: test_command_help_matches_snapshot[live]
+  '''
+  
+   Usage: assembly live [OPTIONS] [SOURCE]
+  
+   [sandbox] Talk live to a tool-using voice agent
+  
+   A real-time spoken conversation, wired client-side from three primitives —
+   Streaming STT, a deepagents brain on the LLM Gateway, and streaming TTS.
+   Unlike
+   'assembly agent' (the Voice Agent API), the brain here is an agent that can
+   use
+   tools mid-conversation — web search, URL fetch, and the AssemblyAI docs — so
+   it
+   answers like a live multimodal assistant. Because it uses streaming TTS it
+   only
+   runs in the sandbox: run it as 'assembly --sandbox live' (--sandbox goes
+   before
+   the subcommand).
+  
+   Use headphones: the mic stays open while the agent speaks, so on speakers it
+   would hear itself and loop. Pass an audio file/URL (or --sample) to speak a
+   recorded clip instead of the microphone; the session then ends after the
+   agent's reply.
+  
+   This only runs a conversation in the terminal — it writes no code. To build
+   an agent-cascade app, run 'assembly init agent-cascade' instead.
+  
+   Web search needs a TAVILY_API_KEY in the environment; without it the agent
+   keeps its URL-fetch and docs tools.
+  
+  ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
+  │   source      [SOURCE]  Audio file path or URL to speak to the agent. Omit   │
+  │                         to use the microphone.                               │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Options ────────────────────────────────────────────────────────────────────╮
+  │ --sample                                   Speak the hosted wildfires.mp3    │
+  │                                            sample to the agent               │
+  │ --system-prompt               TEXT         System prompt (the agent's        │
+  │                                            persona)                          │
+  │                                            [default: You are a friendly,     │
+  │                                            concise voice assistant. Keep     │
+  │                                            replies short and conversational. │
+  │                                            Your reply is read aloud by a     │
+  │                                            text-to-speech engine, so write   │
+  │                                            plain spoken prose — no markdown, │
+  │                                            emoji, bullet lists, or code.]    │
+  │ --system-prompt-file          FILE         Read the system prompt from a     │
+  │                                            file (overrides --system-prompt)  │
+  │ --greeting                    TEXT         Spoken greeting                   │
+  │                                            [default: Hi! I'm your AssemblyAI │
+  │                                            voice agent. What can I help you  │
+  │                                            with?]                            │
+  │ --device                      INTEGER      Microphone device index           │
+  │ --list-voices                              Print known voices and exit       │
+  │ --json                -j                   Emit newline-delimited JSON       │
+  │                                            events                            │
+  │ --output              -o      [text|json]  Output mode: text (you:/agent:    │
+  │                                            lines as plain stdout,            │
+  │                                            pipe-friendly) or json            │
+  │ --show-code                                Print the equivalent Python SDK   │
+  │                                            code and exit (does not start a   │
+  │                                            session)                          │
+  │ --help                                     Show this message and exit.       │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Text-to-speech ─────────────────────────────────────────────────────────────╮
+  │ --voice             TEXT  TTS voice. See --list-voices. [default: jane]      │
+  │ --language          TEXT  TTS language (defaults to the voice's language)    │
+  │ --tts-config        TEXT  Set any extra streaming-TTS query field as         │
+  │                           KEY=VALUE (repeatable)                             │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Language model ─────────────────────────────────────────────────────────────╮
+  │ --model             TEXT                  LLM Gateway model that powers the  │
+  │                                           agent's replies                    │
+  │                                           [default:                          │
+  │                                           claude-haiku-4-5-20251001]         │
+  │ --max-tokens        INTEGER RANGE [x>=1]  Max tokens per reply               │
+  │                                           [default: 8192]                    │
+  │ --llm-config        TEXT                  Set any LLM Gateway request field  │
+  │                                           as KEY=VALUE (repeatable)          │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Speech-to-text ─────────────────────────────────────────────────────────────╮
+  │ --speech-model                            TEXT              Streaming speech │
+  │                                                             model            │
+  │                                                             [default:        │
+  │                                                             u3-rt-pro]       │
+  │ --format-turns       --no-format-turns                      Format           │
+  │                                                             (punctuate)      │
+  │                                                             finalized turns  │
+  │                                                             before replying  │
+  │                                                             [default:        │
+  │                                                             format-turns]    │
+  │ --turn-detection                          [aggressive|bala  Turn-detection   │
+  │                                           nced|conservativ  sensitivity      │
+  │                                           e]                preset           │
+  │ --stt-config                              TEXT              Set any          │
+  │                                                             StreamingParame… │
+  │                                                             field as         │
+  │                                                             KEY=VALUE        │
+  │                                                             (repeatable)     │
+  │ --stt-config-file                         FILE              JSON file of     │
+  │                                                             streaming fields │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  
+   Examples
+   Start a live voice conversation
+   $ assembly --sandbox live
+   Pick a voice and opening line
+   $ assembly --sandbox live --voice michael --greeting "Hi there"
+   Give the agent a persona
+   $ assembly --sandbox live --system-prompt "You are a terse pirate."
+   See available voices
+   $ assembly --sandbox live --list-voices
+   Print equivalent Python instead of running
+   $ assembly --sandbox live --show-code
+  
+  
+  
   '''
 # ---
 # name: test_command_help_matches_snapshot[llm]
diff --git a/tests/test_agent_cascade_brain.py b/tests/test_agent_cascade_brain.py
new file mode 100644
index 00000000..acb98013
--- /dev/null
+++ b/tests/test_agent_cascade_brain.py
@@ -0,0 +1,186 @@
+"""Tests for the deepagents reply brain behind `assembly live`.
+
+The brain's only network seam is the compiled graph, so `build_completer` is driven
+against the *real* deepagents graph wired to a fake chat model (pytest-socket stays
+armed) — no sockets. `build_live_tools` and `build_model`'s new knobs are unit-tested
+directly.
+"""
+
+from __future__ import annotations
+
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.messages import AIMessage
+from langchain_core.outputs import ChatGeneration, ChatResult
+
+from aai_cli.agent_cascade import brain
+from aai_cli.agent_cascade.config import CascadeConfig
+from aai_cli.code_agent import model as model_mod
+
+
+class FakeChatModel(BaseChatModel):
+    """A chat model that replays a scripted list of AIMessages (mirrors the code agent's)."""
+
+    responses: list[AIMessage]
+    index: int = 0
+
+    @property
+    def _llm_type(self) -> str:
+        return "fake-live-model"
+
+    def bind_tools(self, tools, **kwargs):
+        del tools, kwargs
+        return self
+
+    def _generate(self, messages, stop=None, run_manager=None, **kwargs):
+        del messages, stop, run_manager, kwargs
+        message = self.responses[self.index]
+        self.index += 1
+        return ChatResult(generations=[ChatGeneration(message=message)])
+
+
+def _graph(model: BaseChatModel):
+    from deepagents import create_deep_agent
+
+    return create_deep_agent(model=model, tools=[], system_prompt="be a friendly live agent")
+
+
+# --- build_system_prompt -----------------------------------------------------
+
+
+def test_system_prompt_appends_tool_guidance():
+    prompt = brain.build_system_prompt("You are a pirate.")
+    # The persona is preserved, and the tool guidance is appended so the model knows it
+    # can search the web (the plain cascade persona never mentions tools).
+    assert prompt.startswith("You are a pirate.")
+    assert "search the web" in prompt
+
+
+# --- build_completer (driving the real graph with a fake model) --------------
+
+
+def test_completer_returns_final_spoken_text():
+    graph = _graph(FakeChatModel(responses=[AIMessage(content="Hello there.")]))
+    completer = brain.build_completer("k", CascadeConfig(), graph=graph)
+    reply = completer([{"role": "system", "content": "x"}, {"role": "user", "content": "hi"}])
+    assert reply == "Hello there."
+
+
+def test_completer_strips_system_message_before_invoking():
+    # The cascade prepends its own system message each turn, but the graph already owns
+    # the system prompt — so the completer must drop it before invoking, leaving only the
+    # conversation. We capture what the graph received to prove the system line is gone.
+    captured = {}
+
+    class _CapturingGraph:
+        def invoke(self, value):
+            captured["messages"] = value["messages"]
+            return {"messages": [AIMessage(content="ok")]}
+
+    completer = brain.build_completer("k", CascadeConfig(), graph=_CapturingGraph())
+    completer([{"role": "system", "content": "persona"}, {"role": "user", "content": "hi"}])
+    roles = [m["role"] for m in captured["messages"]]
+    assert roles == ["user"]
+
+
+# --- _reply_text / _content_text ---------------------------------------------
+
+
+def test_reply_text_skips_empty_ai_messages_and_takes_last_text():
+    # Scanning from the end, a trailing empty AIMessage (a tool-call request with no
+    # spoken text) is skipped so the reply falls back to the prior AIMessage's text,
+    # rather than coming back blank.
+    result = {
+        "messages": [
+            AIMessage(content="The answer is 42."),
+            AIMessage(content=""),
+        ]
+    }
+    assert brain._reply_text(result) == "The answer is 42."
+
+
+def test_reply_text_joins_list_content_blocks():
+    result = {"messages": [AIMessage(content=[{"type": "text", "text": "Hello "}, "world"])]}
+    assert brain._reply_text(result) == "Hello world"
+
+
+def test_reply_text_skips_non_assistant_messages():
+    from langchain_core.messages import ToolMessage
+
+    # Scanning from the end, a trailing non-assistant message (e.g. a tool result) is
+    # skipped — the spoken reply is the AIMessage before it.
+    result = {
+        "messages": [
+            AIMessage(content="hello there"),
+            ToolMessage(content="tool output", tool_call_id="c1"),
+        ]
+    }
+    assert brain._reply_text(result) == "hello there"
+
+
+def test_content_text_coerces_unexpected_content():
+    # Content that is neither a string nor a list of blocks is stringified (defensive fallback).
+    assert brain._content_text(123) == "123"
+
+
+def test_reply_text_is_empty_without_an_assistant_message():
+    assert brain._reply_text({"messages": []}) == ""
+    assert brain._reply_text({}) == ""
+
+
+# --- build_live_tools --------------------------------------------------------
+
+
+def test_build_live_tools_includes_search_when_keyed(monkeypatch):
+    search = object()
+    monkeypatch.setattr("aai_cli.code_agent.fetch_tool.build_fetch_tool", lambda: "fetch")
+    monkeypatch.setattr("aai_cli.code_agent.web_search.build_web_search_tool", lambda: search)
+    monkeypatch.setattr("aai_cli.code_agent.docs_mcp.load_docs_tools", lambda: ["docs"])
+    tools = brain.build_live_tools()
+    # Fetch + the keyed search + the docs tools, in that order.
+    assert tools == ["fetch", search, "docs"]
+
+
+def test_build_live_tools_omits_search_when_unkeyed(monkeypatch):
+    monkeypatch.setattr("aai_cli.code_agent.fetch_tool.build_fetch_tool", lambda: "fetch")
+    monkeypatch.setattr("aai_cli.code_agent.web_search.build_web_search_tool", lambda: None)
+    monkeypatch.setattr("aai_cli.code_agent.docs_mcp.load_docs_tools", list)
+    tools = brain.build_live_tools()
+    # No TAVILY_API_KEY -> no search tool; docs are stubbed empty, so only fetch remains.
+    assert tools == ["fetch"]
+
+
+# --- build_graph (model construction + compile, with the docs probe skipped) -
+
+
+def test_build_graph_uses_gateway_model_and_runs_offline(monkeypatch):
+    captured = {}
+
+    def fake_build_model(api_key, *, model, max_tokens, extra):
+        captured["model"] = model
+        captured["max_tokens"] = max_tokens
+        captured["extra"] = dict(extra)
+        return FakeChatModel(responses=[AIMessage(content="hi from the agent")])
+
+    monkeypatch.setattr(model_mod, "build_model", fake_build_model)
+    cfg = CascadeConfig(model="claude-x", max_tokens=128, llm_extra={"temperature": 0.2})
+    graph = brain.build_graph("k", cfg, tools=[])
+    # The cascade's model + knobs are threaded into the gateway model build.
+    assert captured == {"model": "claude-x", "max_tokens": 128, "extra": {"temperature": 0.2}}
+    # The compiled graph is a real deepagents graph that answers offline via the fake model.
+    completer = brain.build_completer("k", cfg, graph=graph)
+    assert completer([{"role": "user", "content": "hi"}]) == "hi from the agent"
+
+
+# --- build_model new knobs ---------------------------------------------------
+
+
+def test_build_model_threads_max_tokens_and_extra():
+    model = model_mod.build_model("k", model="claude-x", max_tokens=222, extra={"top_k": 5})
+    assert model.max_tokens == 222
+    assert model.extra_body == {"top_k": 5}
+
+
+def test_build_model_defaults_have_no_extra():
+    model = model_mod.build_model("k", model="claude-x")
+    assert model.max_tokens is None
+    assert model.extra_body is None
diff --git a/tests/test_agent_cascade_command.py b/tests/test_agent_cascade_command.py
index 513dc1cc..93d25a4e 100644
--- a/tests/test_agent_cascade_command.py
+++ b/tests/test_agent_cascade_command.py
@@ -1,4 +1,4 @@
-"""Command + wiring tests for `assembly agent-cascade`.
+"""Command + wiring tests for `assembly live`.
 
 Covers the argv -> options seam, the validation guards, _open_audio source
 selection, and CascadeDeps.real's three live legs (all driven against fakes).
@@ -60,14 +60,14 @@ def _opts(**overrides) -> AgentCascadeOptions:
 
 
 def test_list_voices_human_lists_catalog():
-    result = runner.invoke(app, ["agent-cascade", "--list-voices"])
+    result = runner.invoke(app, ["live", "--list-voices"])
     assert result.exit_code == 0
     assert "jane" in result.output
     assert "English:" in result.output
 
 
 def test_list_voices_json_emits_array():
-    result = runner.invoke(app, ["agent-cascade", "--list-voices", "--json"])
+    result = runner.invoke(app, ["live", "--list-voices", "--json"])
     assert result.exit_code == 0
     assert result.output.lstrip().startswith("[")
     assert '"jane"' in result.output
@@ -92,14 +92,14 @@ def test_missing_system_prompt_file_is_rejected_by_typer():
     # so the sandbox guard (the other exit-2 path) never runs. Asserting the guard's
     # message is absent kills the exists=True mutant without depending on the Rich error
     # text, which CI renders with ANSI + width ellipsis.
-    result = runner.invoke(app, ["agent-cascade", "--system-prompt-file", "/no/such/file"])
+    result = runner.invoke(app, ["live", "--system-prompt-file", "/no/such/file"])
     assert result.exit_code == 2
     assert "sandbox" not in result.output.lower()
 
 
 def test_production_env_is_rejected_with_sandbox_hint():
     # Default env is production, which has no streaming-TTS host.
-    result = runner.invoke(app, ["agent-cascade", "--voice", "jane"])
+    result = runner.invoke(app, ["live", "--voice", "jane"])
     assert result.exit_code == 2
     assert "only available in the sandbox" in result.output
 
@@ -126,7 +126,7 @@ def fake_run(opts, state, *, json_mode):
         captured["opts"] = opts
 
     monkeypatch.setattr(_exec, "run_agent_cascade", fake_run)
-    result = runner.invoke(app, ["agent-cascade", *argv])
+    result = runner.invoke(app, ["live", *argv])
     assert result.exit_code == 0
     assert captured["opts"].format_turns is expected
 
@@ -137,7 +137,7 @@ def test_stt_config_file_must_exist():
     # terminal so the "does not exist" message isn't wrapped by the 80-col error box.
     result = runner.invoke(
         app,
-        ["agent-cascade", "--stt-config-file", "/no/such/file.json"],
+        ["live", "--stt-config-file", "/no/such/file.json"],
         env={"COLUMNS": "300"},
     )
     assert result.exit_code == 2
@@ -418,36 +418,23 @@ def fake_stream_audio(api_key, source, *, params, on_turn):
     assert captured["params"] is params
 
 
-def test_deps_real_complete_reply_threads_model_tokens_and_extra(monkeypatch):
+def test_deps_real_complete_reply_is_built_by_the_deepagents_brain(monkeypatch):
+    # The LLM leg is now a deepagents graph: .real delegates to brain.build_completer,
+    # passing the api key + config, and uses whatever completer it returns. We assert the
+    # exact wiring so the brain swap (not a plain llm.complete) can't silently regress.
     captured = {}
 
-    def fake_complete(api_key, **kwargs):
-        captured.update(kwargs)
-        return "raw-response"
+    def fake_build_completer(api_key, config):
+        captured["api_key"] = api_key
+        captured["config"] = config
+        return lambda messages: f"reply to {messages[-1]['content']}"
 
-    monkeypatch.setattr(engine.llm, "complete", fake_complete)
-    monkeypatch.setattr(engine.llm, "content_of", lambda response: response.upper())
+    monkeypatch.setattr(engine.brain, "build_completer", fake_build_completer)
     cfg = CascadeConfig(model="m", max_tokens=222, llm_extra={"temperature": 0.5})
     deps = CascadeDeps.real("k", cfg, audio=[], stt_params=_stt_params())
-    assert deps.complete_reply([{"role": "user", "content": "hi"}]) == "RAW-RESPONSE"
-    assert captured["model"] == "m"
-    assert captured["max_tokens"] == 222
-    assert captured["extra"] == {"temperature": 0.5}
-
-
-def test_deps_real_complete_reply_sends_no_extra_when_unset(monkeypatch):
-    captured = {}
-
-    def fake_complete(api_key, **kwargs):
-        captured.update(kwargs)
-        return "x"
-
-    monkeypatch.setattr(engine.llm, "complete", fake_complete)
-    monkeypatch.setattr(engine.llm, "content_of", lambda response: response)
-    deps = CascadeDeps.real("k", CascadeConfig(), audio=[], stt_params=_stt_params())
-    deps.complete_reply([{"role": "user", "content": "hi"}])
-    # Empty overrides collapse to None, not an empty dict, so the gateway sees no extra body.
-    assert captured["extra"] is None
+    assert deps.complete_reply([{"role": "user", "content": "hi"}]) == "reply to hi"
+    assert captured["api_key"] == "k"
+    assert captured["config"] is cfg
 
 
 def test_deps_real_synthesize_threads_voice_language_and_extra(monkeypatch):
diff --git a/tests/test_agent_cascade_show_code.py b/tests/test_agent_cascade_show_code.py
index d05b5874..97bbe0ff 100644
--- a/tests/test_agent_cascade_show_code.py
+++ b/tests/test_agent_cascade_show_code.py
@@ -1,4 +1,4 @@
-"""`assembly agent-cascade --show-code` tests.
+"""`assembly live --show-code` tests.
 
 Split from test_agent_cascade_command.py (which holds the run-path wiring) so the
 print-only path's many invocations live in their own file. The cascade is
@@ -33,7 +33,7 @@ def _boom(**kwargs):
     )
     result = runner.invoke(
         app,
-        ["--sandbox", "agent-cascade", "--voice", "jane", "--greeting", "Hi there", "--show-code"],
+        ["--sandbox", "live", "--voice", "jane", "--greeting", "Hi there", "--show-code"],
     )
     assert result.exit_code == 0
     # Targets the sandbox the key was minted for — all three legs.
@@ -54,25 +54,23 @@ def fake_run(opts, state, *, json_mode):
         captured["opts"] = opts
 
     monkeypatch.setattr(_exec, "run_agent_cascade", fake_run)
-    assert runner.invoke(app, ["agent-cascade"]).exit_code == 0
+    assert runner.invoke(app, ["live"]).exit_code == 0
     assert captured["opts"].show_code is False
-    assert runner.invoke(app, ["agent-cascade", "--show-code"]).exit_code == 0
+    assert runner.invoke(app, ["live", "--show-code"]).exit_code == 0
     assert captured["opts"].show_code is True
 
 
 def test_show_code_injects_speech_model(monkeypatch):
     monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None)
-    result = runner.invoke(
-        app, ["--sandbox", "agent-cascade", "--speech-model", "u3-rt-pro", "--show-code"]
-    )
+    result = runner.invoke(app, ["--sandbox", "live", "--speech-model", "u3-rt-pro", "--show-code"])
     assert result.exit_code == 0
     assert "speech_model=u3-rt-pro" in result.stdout
 
 
 def test_show_code_reflects_no_format_turns(monkeypatch):
     monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None)
-    formatted = runner.invoke(app, ["--sandbox", "agent-cascade", "--show-code"])
-    bare = runner.invoke(app, ["--sandbox", "agent-cascade", "--no-format-turns", "--show-code"])
+    formatted = runner.invoke(app, ["--sandbox", "live", "--show-code"])
+    bare = runner.invoke(app, ["--sandbox", "live", "--no-format-turns", "--show-code"])
     # With formatting on the cue waits for the punctuated turn; off, a bare end-of-turn fires.
     assert "turn_is_formatted" in formatted.stdout
     assert "turn_is_formatted" not in bare.stdout
@@ -83,7 +81,7 @@ def test_show_code_threads_model_and_max_tokens(monkeypatch):
     monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None)
     result = runner.invoke(
         app,
-        ["--sandbox", "agent-cascade", "--model", "claude-x", "--max-tokens", "321", "--show-code"],
+        ["--sandbox", "live", "--model", "claude-x", "--max-tokens", "321", "--show-code"],
     )
     assert result.exit_code == 0
     assert "claude-x" in result.stdout
@@ -95,7 +93,7 @@ def test_show_code_file_source_warns_on_stderr(monkeypatch):
     monkeypatch.setattr(
         _exec.engine, "run_cascade", lambda **kw: (_ for _ in ()).throw(AssertionError("no run"))
     )
-    result = runner.invoke(app, ["--sandbox", "agent-cascade", "clip.wav", "--show-code"])
+    result = runner.invoke(app, ["--sandbox", "live", "clip.wav", "--show-code"])
     assert result.exit_code == 0
     assert "uses the microphone" in result.stderr
     assert "uses the microphone" not in result.stdout  # stdout stays a clean script
@@ -104,13 +102,13 @@ def test_show_code_file_source_warns_on_stderr(monkeypatch):
 
 def test_show_code_mic_emits_no_warning(monkeypatch):
     monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None)
-    result = runner.invoke(app, ["--sandbox", "agent-cascade", "--show-code"])
+    result = runner.invoke(app, ["--sandbox", "live", "--show-code"])
     assert result.exit_code == 0
     assert "uses the microphone" not in result.stderr  # mic script matches the run, nothing to warn
 
 
 def test_show_code_in_production_is_rejected_with_sandbox_hint():
     # --show-code still honors the sandbox-only guard, so the generated URLs are valid.
-    result = runner.invoke(app, ["agent-cascade", "--show-code"])
+    result = runner.invoke(app, ["live", "--show-code"])
     assert result.exit_code == 2
     assert "only available in the sandbox" in result.output
diff --git a/tests/test_sandbox_access.py b/tests/test_sandbox_access.py
index ce947ec4..6fe112de 100644
--- a/tests/test_sandbox_access.py
+++ b/tests/test_sandbox_access.py
@@ -241,7 +241,9 @@ def test_help_hides_the_sandbox_surface_from_external_accounts_and_restores_it(m
     assert "--sandbox" not in external
     assert "--env" not in external
     assert "[sandbox]" not in external
-    assert "agent-cascade" not in external
+    # The [sandbox]-only `live` command's summary is hidden too. Assert on "tool-using", a token
+    # unique to that summary: the bare word "live" also appears in other commands' descriptions.
+    assert "tool-using" not in external
     # …but the filter is surgical: non-sandbox flags and commands stay visible (this
     # also kills the mutant that would treat every option/command as sandbox).
     assert "--profile" in external
@@ -255,4 +257,4 @@ def test_help_hides_the_sandbox_surface_from_external_accounts_and_restores_it(m
     assert "--sandbox" in internal
     assert "--env" in internal
     assert "[sandbox]" in internal
-    assert "agent-cascade" in internal
+    assert "tool-using" in internal
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index b9ba17ff..a66e2929 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -162,7 +162,7 @@ def test_help_lists_commands_in_workflow_order():
         "stream",
         "dictate",
         "agent",
-        "agent-cascade",
+        "live",
         "speak",
         "llm",
         "clip",