From 987a1a951512e3e39bac9c9023806474caa747ac Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 04:05:16 +0000 Subject: [PATCH] Rebuild the live voice cascade on a deepagents brain and rename to `assembly live` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cascade's LLM leg is now a deepagents graph (web search, URL fetch, and AssemblyAI docs tools) instead of a single completion, so a spoken turn can transparently use tools — a "talk to Gemini Live" experience. Tools are read-only and auto-approved (a voice turn can't pause for keyboard approval), and the system prompt keeps replies short and speakable for low latency. The command is renamed from `agent-cascade` to `live`. The internal slice keeps its `agent_cascade` name; only the user-facing command, help text, examples, docs, and the show-code teaching snippet change. `assembly agent`'s help now says it uses the Voice Agent API to distinguish the two. - aai_cli/agent_cascade/brain.py: build the gateway-bound deepagents graph and a stateless per-turn completer; the graph is the only network seam, so the wiring is unit-tested against a fake graph / fake chat model. - code_agent/model.build_model gains max_tokens + extra (extra_body) so the cascade's --max-tokens/--llm-config thread through; the coding agent's call is unchanged. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01Hiovdq7aoXtbfg9juPzwQa --- README.md | 8 +- REFERENCE.md | 2 +- aai_cli/agent_cascade/brain.py | 136 ++++++++++ aai_cli/agent_cascade/engine.py | 15 +- aai_cli/code_agent/model.py | 17 +- aai_cli/code_gen/agent_cascade.py | 4 +- aai_cli/commands/agent/__init__.py | 2 +- aai_cli/commands/agent_cascade/__init__.py | 34 +-- aai_cli/commands/agent_cascade/_exec.py | 4 +- pyproject.toml | 5 +- pyrightconfig.tests.json | 3 +- scripts/generated_code_compile_gate.py | 4 +- .../test_snapshots_help_root.ambr | 83 +++---- .../test_snapshots_help_run.ambr | 233 +++++++++--------- tests/test_agent_cascade_brain.py | 186 ++++++++++++++ tests/test_agent_cascade_command.py | 51 ++-- tests/test_agent_cascade_show_code.py | 24 +- tests/test_sandbox_access.py | 6 +- tests/test_smoke.py | 2 +- 19 files changed, 577 insertions(+), 242 deletions(-) create mode 100644 aai_cli/agent_cascade/brain.py create mode 100644 tests/test_agent_cascade_brain.py diff --git a/README.md b/README.md index ed70dd87..c534497e 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins - **๐ŸŽฏ One command for everything**: transcription, real-time streaming, voice agents, LLM prompts, and WER benchmarking โ€” no SDK boilerplate. - **๐Ÿ”Œ Built for pipelines**: data goes to stdout, errors to stderr, `--json` gives stable machine-readable output, and `-` reads audio from stdin. - **๐Ÿ” Secure by default**: your API key lives in the OS keyring, never in a dotfile โ€” and run commands have no `--api-key` flag, so keys can't leak into `ps` or shell history. -- **๐Ÿ› ๏ธ From demo to deployed app**: `assembly init` scaffolds a runnable FastAPI starter, `assembly dev` / `share` / `deploy` run, tunnel, and ship it, and `--show-code` prints the equivalent Python SDK script for any run command (`transcribe` / `stream` / `agent` / `agent-cascade`). 
+- **๐Ÿ› ๏ธ From demo to deployed app**: `assembly init` scaffolds a runnable FastAPI starter, `assembly dev` / `share` / `deploy` run, tunnel, and ship it, and `--show-code` prints the equivalent Python SDK script for any run command (`transcribe` / `stream` / `agent` / `live`). - **๐Ÿค– Agent-ready**: `assembly setup install` wires your coding agent up with the AssemblyAI docs MCP server and skills. - **๐Ÿ“– Open source**: MIT licensed. @@ -48,7 +48,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins | `assembly stream` | Real-time transcription from your microphone, a file, or a URL โ€” on macOS it can capture system audio too | | `assembly dictate` | Signal-driven dictation: records immediately, send SIGTERM for instant text โ€” scriptable from hotkey tools like Hammerspoon (Sync STT API, up to 120 s per utterance) | | `assembly agent` | Full-duplex spoken conversation with a voice agent, right in your terminal | -| `assembly agent-cascade` | Same live conversation, but wired client-side from Streaming STT + the LLM Gateway + streaming TTS, like the `agent-cascade` starter (sandbox-only) | +| `assembly live` | Talk live to a tool-using voice agent, wired client-side from Streaming STT + a deepagents brain on the LLM Gateway + streaming TTS โ€” it can web-search, fetch URLs, and read the docs mid-conversation, like the `agent-cascade` starter (sandbox-only) | | `assembly speak` | Synthesize text to speech over the streaming-TTS WebSocket (sandbox-only) | | `assembly llm` | Prompt the LLM Gateway over a transcript, files, stdin, or a live stream | | `assembly code` | Terminal coding agent (deepagents SDK) backed only by the LLM Gateway โ€” reads/writes/edits files, runs shell, searches the docs MCP, and can invoke the `assembly` CLI itself; mutating actions ask for approval. 
Defaults to voice in a terminal (speak your request, replies read back via streaming TTS in the sandbox); pass `--no-voice` for the keyboard TUI | @@ -63,7 +63,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins | `assembly transcripts` / `sessions` | Browse and fetch past transcripts and streaming sessions | | `assembly keys` / `balance` / `usage` / `limits` / `audit` | Account self-service via browser login | -Add `--show-code` to `transcribe` / `stream` / `agent` / `agent-cascade` to print the equivalent Python SDK script instead of running โ€” the built-in path from CLI experiment to SDK code. +Add `--show-code` to `transcribe` / `stream` / `agent` / `live` to print the equivalent Python SDK script instead of running โ€” the built-in path from CLI experiment to SDK code. ## โœจ Things you can do with it @@ -194,7 +194,7 @@ assembly transcripts list --json --limit 5 \ assembly agent --voice ivy --system-prompt "you're a helpful interviewer" ``` -**Graduate to the SDK** โ€” `--show-code` prints the equivalent Python script for any `transcribe`/`stream`/`agent`/`agent-cascade` run instead of executing it: +**Graduate to the SDK** โ€” `--show-code` prints the equivalent Python script for any `transcribe`/`stream`/`agent`/`live` run instead of executing it: ```sh assembly agent --system-prompt "you're a story generator" --show-code > story.py diff --git a/REFERENCE.md b/REFERENCE.md index bf9f3d8e..09216ef4 100644 --- a/REFERENCE.md +++ b/REFERENCE.md @@ -94,7 +94,7 @@ each carrying a `"type"` field to dispatch on: | ------- | ----------- | | `assembly stream --json` | `begin`, `turn`, `termination` (with `--from-stdin`, a `source` event precedes each file's events) | | `assembly agent --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` | -| `assembly agent-cascade --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, 
`transcript.agent`, `reply.done` | +| `assembly live --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` | | `assembly dictate --json` | `utterance` | | `assembly llm --follow --json` | `answer` | | `assembly transcribe --json` | `result` (one per source), then `reduce` if `--llm-reduce` is set | diff --git a/aai_cli/agent_cascade/brain.py b/aai_cli/agent_cascade/brain.py new file mode 100644 index 00000000..7be2297c --- /dev/null +++ b/aai_cli/agent_cascade/brain.py @@ -0,0 +1,136 @@ +"""Deepagents-powered reply brain for the live voice cascade. + +`assembly live` answers each spoken turn with a deepagents graph instead of a single +LLM completion, so the agent can transparently reach for tools — web search, URL +fetch, the AssemblyAI docs — mid-conversation, mimicking a live multimodal assistant +(the "talk to Gemini Live" experience). The graph is built once per session +(:func:`build_graph`) and invoked statelessly per turn with the running history the +cascade already keeps (:func:`build_completer`); tools are read-only and auto-approved, +because a spoken turn can't pause for a keyboard confirmation, and the system prompt +keeps every reply short and speakable. + +The graph is the only network seam: :func:`build_completer` accepts an injected graph, +so the per-turn orchestration is unit-tested against a fake with no sockets — the same +seam the rest of the cascade uses for its STT/LLM/TTS legs. +""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from typing import TYPE_CHECKING + +from aai_cli.agent_cascade.config import CascadeConfig +from aai_cli.code_agent.agent import CompiledAgent + +if TYPE_CHECKING: + from langchain_core.tools import BaseTool + from openai.types.chat import ChatCompletionMessageParam + +# Appended to the user's persona so the model knows it has tools and must keep replies +# spoken. 
The cascade's plain-LLM persona (CascadeConfig.system_prompt) says nothing +# about tools, so without this the agent would never reach for web search. +_TOOL_GUIDANCE = ( + "You can use tools to help answer: search the web for current or unfamiliar facts, " + "fetch a specific URL, and look up the AssemblyAI documentation. Reach for a tool " + "when a question needs fresh or external information; answer directly and instantly " + "when you already know. Your reply is read aloud, so keep it short and spoken — no " + "markdown, lists, code, or raw URLs." +) + + +def build_system_prompt(persona: str) -> str: + """The live agent's system prompt: the user's persona plus the tool guidance.""" + return f"{persona}\n\n{_TOOL_GUIDANCE}" + + +def build_live_tools() -> list[BaseTool]: + """The live agent's read-only toolset: URL fetch, web search (if keyed), and docs. + + All three are reused from the coding agent's tool modules. Unlike there they are + *not* approval-gated — a spoken turn can't wait for a keyboard confirmation, so the + live agent only gets read-only tools and runs them automatically. Web search is + present only when ``TAVILY_API_KEY`` is set; the docs MCP is best-effort (an empty + list when the host is unreachable), so neither blocks a session. + """ + from aai_cli.code_agent.docs_mcp import load_docs_tools + from aai_cli.code_agent.fetch_tool import build_fetch_tool + from aai_cli.code_agent.web_search import build_web_search_tool + + tools: list[BaseTool] = [build_fetch_tool()] + search = build_web_search_tool() + if search is not None: + tools.append(search) + tools.extend(load_docs_tools()) + return tools + + +def build_graph( + api_key: str, config: CascadeConfig, *, tools: Sequence[BaseTool] | None = None +) -> CompiledAgent: + """Compile the deepagents graph for one live session over the gateway model. 
+ + Reuses the coding agent's gateway-bound ``ChatOpenAI`` (so the live agent can only + ever reach AssemblyAI), threading the cascade's ``--max-tokens``/``--llm-config`` + through it. ``tools`` defaults to :func:`build_live_tools`; tests pass an explicit + (possibly empty) list to skip the network-touching docs probe. + """ + from deepagents import create_deep_agent + + from aai_cli.code_agent.model import build_model + + model = build_model( + api_key, model=config.model, max_tokens=config.max_tokens, extra=config.llm_extra + ) + resolved = build_live_tools() if tools is None else list(tools) + return create_deep_agent( + model=model, tools=resolved, system_prompt=build_system_prompt(config.system_prompt) + ) + + +def build_completer( + api_key: str, config: CascadeConfig, *, graph: CompiledAgent | None = None +) -> Callable[[list[ChatCompletionMessageParam]], str]: + """A ``complete_reply`` for the cascade engine backed by the deepagents graph. + + The cascade prepends its own ``system`` message to the history each turn; the graph + already owns the system prompt, so we drop it before invoking. The graph runs the + full tool loop and we return its final spoken text. ``graph`` is injected in tests + so the per-turn wiring runs against a fake with no network. + """ + resolved = build_graph(api_key, config) if graph is None else graph + + def complete_reply(messages: list[ChatCompletionMessageParam]) -> str: + conversation = [message for message in messages if message.get("role") != "system"] + return _reply_text(resolved.invoke({"messages": conversation})) + + return complete_reply + + +def _reply_text(result: dict[str, object]) -> str: + """The agent's final spoken reply: the last assistant message that carries text. + + A tool-using turn ends in an ``AIMessage`` whose ``content`` is the spoken answer, + but earlier ``AIMessage``\\s in the same turn (the tool-call requests) have empty + text — so we scan from the end for the last one with non-empty content. 
+ """ + messages = result.get("messages") + if not isinstance(messages, list): + return "" + for message in reversed(messages): + if type(message).__name__ != "AIMessage": + continue + text = _content_text(getattr(message, "content", "")).strip() + if text: + return text + return "" + + +def _content_text(content: object) -> str: + """Coerce a message's content (a string, or a list of content blocks) to plain text.""" + if isinstance(content, str): + return content + if isinstance(content, list): + return "".join( + block.get("text", "") if isinstance(block, dict) else str(block) for block in content + ) + return str(content) diff --git a/aai_cli/agent_cascade/engine.py b/aai_cli/agent_cascade/engine.py index 9c400657..af52f15a 100644 --- a/aai_cli/agent_cascade/engine.py +++ b/aai_cli/agent_cascade/engine.py @@ -18,9 +18,10 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Protocol +from aai_cli.agent_cascade import brain from aai_cli.agent_cascade.config import CascadeConfig from aai_cli.agent_cascade.text import split_sentences, trim_history -from aai_cli.core import client, llm +from aai_cli.core import client from aai_cli.core.errors import CLIError from aai_cli.tts import session as tts_session from aai_cli.tts.session import SpeakConfig @@ -121,15 +122,9 @@ def real( def run_stt(on_turn: Callable[[object], None]) -> None: client.stream_audio(api_key, audio, params=stt_params, on_turn=on_turn) - def complete_reply(messages: list[ChatCompletionMessageParam]) -> str: - response = llm.complete( - api_key, - model=config.model, - messages=messages, - max_tokens=config.max_tokens, - extra=dict(config.llm_extra) or None, - ) - return llm.content_of(response) + # The LLM leg is a deepagents graph (web search / URL fetch / docs tools), not a + # single completion, so a spoken turn can transparently use tools. 
+ complete_reply = brain.build_completer(api_key, config) def synthesize(text: str) -> bytes: spec = SpeakConfig( diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py index bdb6a4a2..716af2fc 100644 --- a/aai_cli/code_agent/model.py +++ b/aai_cli/code_agent/model.py @@ -8,6 +8,7 @@ from __future__ import annotations +from collections.abc import Mapping from typing import TYPE_CHECKING from aai_cli.core import environments @@ -37,7 +38,13 @@ def _flatten_content(messages: object) -> None: ) -def build_model(api_key: str, *, model: str) -> BaseChatModel: +def build_model( + api_key: str, + *, + model: str, + max_tokens: int | None = None, + extra: Mapping[str, object] | None = None, +) -> BaseChatModel: """A ChatOpenAI bound to the active environment's LLM Gateway. ``use_responses_api=False`` keeps it on the chat-completions endpoint the gateway @@ -45,6 +52,12 @@ def build_model(api_key: str, *, model: str) -> BaseChatModel: Responses API that langchain would otherwise prefer for ``openai:`` models. The subclass also flattens content-parts arrays the gateway rejects (see :func:`_flatten_content`). + + ``max_tokens`` caps the per-reply length (the live voice agent passes a small cap to + keep spoken replies short and fast); ``extra`` passes any additional gateway request + fields through as ``extra_body`` (so they reach the request body verbatim, like + `aai_cli.core.llm`'s ``extra``). Both default to off so the coding agent's call is + unchanged. 
""" from langchain_openai import ChatOpenAI from pydantic import SecretStr @@ -64,4 +77,6 @@ def _get_request_payload( base_url=environments.active().llm_gateway_base, api_key=SecretStr(api_key), use_responses_api=False, + max_tokens=max_tokens, + extra_body=dict(extra) if extra else None, ) diff --git a/aai_cli/code_gen/agent_cascade.py b/aai_cli/code_gen/agent_cascade.py index 0a861911..5f5306f0 100644 --- a/aai_cli/code_gen/agent_cascade.py +++ b/aai_cli/code_gen/agent_cascade.py @@ -16,9 +16,11 @@ # which is never formatted — so no brace has to be doubled. _HEADER = """\ # Live voice cascade: Streaming STT -> LLM Gateway -> streaming TTS, wired client-side. -# This is what `assembly --sandbox agent-cascade` runs: it transcribes your speech, +# The basic cascade behind `assembly --sandbox live`: it transcribes your speech, # sends each finalized turn to the LLM Gateway, and speaks the reply through streaming # TTS — the same three primitives the agent-cascade init template wires server-side. +# (The `live` command adds a tool-using agent on the LLM leg; this snippet is the +# plain single-completion version to build from.) # Requires audio + websockets: pip install sounddevice websockets openai # Tip: use headphones — the mic stays open while the agent speaks, so on speakers it # would hear itself and loop. diff --git a/aai_cli/commands/agent/__init__.py b/aai_cli/commands/agent/__init__.py index f535b54c..b20dfc2a 100644 --- a/aai_cli/commands/agent/__init__.py +++ b/aai_cli/commands/agent/__init__.py @@ -84,7 +84,7 @@ def agent( help="Print the equivalent Python SDK code and exit (does not start a session)", ), ) -> None: - """Hold a live two-way voice conversation with a voice agent + """Hold a live two-way voice conversation with the Voice Agent API Use headphones: the mic stays open while the agent speaks, so on speakers it would hear itself and loop. 
Pass an audio file/URL (or diff --git a/aai_cli/commands/agent_cascade/__init__.py b/aai_cli/commands/agent_cascade/__init__.py index 3e99f146..b17e85e8 100644 --- a/aai_cli/commands/agent_cascade/__init__.py +++ b/aai_cli/commands/agent_cascade/__init__.py @@ -31,7 +31,7 @@ SPEC = command_registry.CommandModuleSpec( panel=help_panels.TRANSCRIPTION, order=45, # pragma: no mutate -- sparse rank; a +-1 shift is order-equivalent - commands=("agent-cascade",), + commands=("live",), ) @@ -43,28 +43,28 @@ def _emit_voice_list(_state: AppState, json_mode: bool) -> None: @app.command( - name="agent-cascade", + name="live", rich_help_panel=help_panels.TRANSCRIPTION, epilog=examples_epilog( [ - ("Start a live cascade conversation", "assembly --sandbox agent-cascade"), + ("Start a live voice conversation", "assembly --sandbox live"), ( "Pick a voice and opening line", - 'assembly --sandbox agent-cascade --voice michael --greeting "Hi there"', + 'assembly --sandbox live --voice michael --greeting "Hi there"', ), ( "Give the agent a persona", - 'assembly --sandbox agent-cascade --system-prompt "You are a terse pirate."', + 'assembly --sandbox live --system-prompt "You are a terse pirate."', ), - ("See available voices", "assembly --sandbox agent-cascade --list-voices"), + ("See available voices", "assembly --sandbox live --list-voices"), ( "Print equivalent Python instead of running", - "assembly --sandbox agent-cascade --show-code", + "assembly --sandbox live --show-code", ), ] ), ) -def agent_cascade( +def live( ctx: typer.Context, source: str | None = typer.Argument( None, help="Audio file path or URL to speak to the agent. Omit to use the microphone." 
@@ -169,14 +169,15 @@ def agent_cascade( help="Print the equivalent Python SDK code and exit (does not start a session)", ), ) -> None: - """\\[sandbox] Hold a live voice conversation through a self-wired cascade + """\\[sandbox] Talk live to a tool-using voice agent - Like 'assembly agent', but instead of AssemblyAI's Voice Agent endpoint this - wires the three primitives together itself — Streaming STT, the LLM Gateway, - and streaming TTS — exactly like the 'agent-cascade' init template does - server-side. Because it uses streaming TTS it only runs in the sandbox: run - it as 'assembly --sandbox agent-cascade' (--sandbox goes before the - subcommand). + A real-time spoken conversation, wired client-side from three primitives — + Streaming STT, a deepagents brain on the LLM Gateway, and streaming TTS. Unlike + 'assembly agent' (the Voice Agent API), the brain here is an agent that can use + tools mid-conversation — web search, URL fetch, and the AssemblyAI docs — so it + answers like a live multimodal assistant. Because it uses streaming TTS it only + runs in the sandbox: run it as 'assembly --sandbox live' (--sandbox goes before + the subcommand). Use headphones: the mic stays open while the agent speaks, so on speakers it would hear itself and loop. Pass an audio file/URL (or --sample) to speak a @@ -185,6 +186,9 @@ def agent_cascade( This only runs a conversation in the terminal — it writes no code. To build an agent-cascade app, run 'assembly init agent-cascade' instead. + + Web search needs a TAVILY_API_KEY in the environment; without it the agent + keeps its URL-fetch and docs tools. 
""" if list_voices: diff --git a/aai_cli/commands/agent_cascade/_exec.py b/aai_cli/commands/agent_cascade/_exec.py index 0b97e230..af466c56 100644 --- a/aai_cli/commands/agent_cascade/_exec.py +++ b/aai_cli/commands/agent_cascade/_exec.py @@ -169,9 +169,9 @@ def _print_show_code(opts: AgentCascadeOptions, system_prompt_text: str) -> None def run_agent_cascade(opts: AgentCascadeOptions, state: AppState, *, json_mode: bool) -> None: """Execute one `assembly agent-cascade` cascade from already-parsed flags.""" text_mode, json_mode = resolve_output_modes(opts.output_field, json_mode=json_mode) - validate_voice(opts.voice, voices.VOICE_NAMES, command="agent-cascade") + validate_voice(opts.voice, voices.VOICE_NAMES, command="live") # Streaming TTS has no production host, so the whole cascade is sandbox-only. - tts_session.require_available("agent-cascade") + tts_session.require_available("live") system_prompt_text = _resolve_system_prompt(opts.system_prompt, opts.system_prompt_file) if opts.show_code: diff --git a/pyproject.toml b/pyproject.toml index a7eb9e4e..6ca42df6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -258,9 +258,10 @@ module = [ "aai_cli.code_agent.store", "aai_cli.code_agent.model", "aai_cli.commands.code._exec", + "aai_cli.agent_cascade.brain", ] disallow_any_generics = false -disable_error_code = ["return-value", "arg-type", "type-arg"] +disable_error_code = ["return-value", "arg-type", "type-arg", "call-arg"] [tool.pyright] # Second type checker alongside mypy: pyright catches a different class of @@ -279,7 +280,7 @@ exclude = ["**/node_modules", "**/__pycache__", "**/.*"] # Unknown*/invariance diagnostics our precise signatures can't satisfy. mypy still # type-checks these modules (with the targeted overrides above) as the safety net, so # we suppress pyright diagnostics here rather than littering per-line `# pyright: ignore`. 
-ignore = ["aai_cli/code_agent", "aai_cli/commands/code"] +ignore = ["aai_cli/code_agent", "aai_cli/commands/code", "aai_cli/agent_cascade/brain.py"] pythonVersion = "3.12" typeCheckingMode = "strict" # Third-party deps (assemblyai, sounddevice) ship no type stubs. diff --git a/pyrightconfig.tests.json b/pyrightconfig.tests.json index 1ea7be4a..f9dbdf0e 100644 --- a/pyrightconfig.tests.json +++ b/pyrightconfig.tests.json @@ -3,7 +3,8 @@ "ignore": [ "tests/test_code_agent.py", "tests/test_code_command.py", - "tests/test_code_tui.py" + "tests/test_code_tui.py", + "tests/test_agent_cascade_brain.py" ], "pythonVersion": "3.12", "typeCheckingMode": "standard", diff --git a/scripts/generated_code_compile_gate.py b/scripts/generated_code_compile_gate.py index 8d258efe..bd71efdf 100644 --- a/scripts/generated_code_compile_gate.py +++ b/scripts/generated_code_compile_gate.py @@ -118,10 +118,10 @@ def main() -> int: ), ( # Sandbox-only: streaming TTS has no prod host, so --sandbox makes the URLs valid. - "agent-cascade-basic", + "live-basic", ( "--sandbox", - "agent-cascade", + "live", "--voice", "jane", "--greeting", diff --git a/tests/__snapshots__/test_snapshots_help_root.ambr b/tests/__snapshots__/test_snapshots_help_root.ambr index 82cc9dc9..2bb0f987 100644 --- a/tests/__snapshots__/test_snapshots_help_root.ambr +++ b/tests/__snapshots__/test_snapshots_help_root.ambr @@ -32,60 +32,59 @@ โ”‚ exit. 
โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ญโ”€ Quick Start โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ onboard Guided setup: sign in and run your first transcription โ”‚ + โ”‚ onboard Guided setup: sign in and run your first transcription โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ญโ”€ Coding Agent โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ code Run a terminal coding agent backed by the AssemblyAI LLM โ”‚ - โ”‚ Gateway โ”‚ + โ”‚ code Run a terminal coding agent backed by the AssemblyAI LLM โ”‚ + โ”‚ Gateway โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ญโ”€ Build an App โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ init Scaffold a new app from a template and launch it โ”‚ - โ”‚ dev Run the dev server for the app in the current directory โ”‚ - โ”‚ share Expose the local app on a public URL via a cloudflared tunnel โ”‚ - โ”‚ deploy Deploy the current project to Vercel, Railway, or Fly.io โ”‚ + โ”‚ init Scaffold 
a new app from a template and launch it โ”‚ + โ”‚ dev Run the dev server for the app in the current directory โ”‚ + โ”‚ share Expose the local app on a public URL via a cloudflared tunnel โ”‚ + โ”‚ deploy Deploy the current project to Vercel, Railway, or Fly.io โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ญโ”€ Run AssemblyAI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ transcribe Transcribe a file, URL, or YouTube/podcast link โ€” or a whole โ”‚ - โ”‚ batch โ”‚ - โ”‚ stream Transcribe live audio in real time from a mic, file, URL, or โ”‚ - โ”‚ pipe โ”‚ - โ”‚ dictate Signal-driven dictation: record the mic, get the transcript โ”‚ - โ”‚ back โ”‚ - โ”‚ agent Hold a live two-way voice conversation with a voice agent โ”‚ - โ”‚ agent-cascade [sandbox] Hold a live voice conversation through a self-wired โ”‚ - โ”‚ cascade โ”‚ - โ”‚ speak [sandbox] Synthesize speech from text with AssemblyAI โ”‚ - โ”‚ streaming TTS โ”‚ - โ”‚ llm Send a prompt to AssemblyAI's LLM Gateway and print the reply โ”‚ - โ”‚ clip Cut clips from media by speaker, text match, LLM pick, or โ”‚ - โ”‚ time range โ”‚ - โ”‚ dub [sandbox] Dub a video or audio file into another language โ”‚ - โ”‚ caption Burn always-visible captions into a video โ”‚ - โ”‚ eval Transcribe one or more datasets and score WER against their โ”‚ - โ”‚ reference texts โ”‚ - โ”‚ webhooks Receive webhook deliveries on a public dev URL โ”‚ + โ”‚ transcribe Transcribe a file, URL, or YouTube/podcast link โ€” or a whole โ”‚ + โ”‚ batch โ”‚ + โ”‚ stream Transcribe live audio in real time from a mic, file, URL, or โ”‚ + โ”‚ pipe โ”‚ + โ”‚ dictate Signal-driven dictation: record the mic, get the 
transcript โ”‚ + โ”‚ back โ”‚ + โ”‚ agent Hold a live two-way voice conversation with the Voice Agent API โ”‚ + โ”‚ live [sandbox] Talk live to a tool-using voice agent โ”‚ + โ”‚ speak [sandbox] Synthesize speech from text with AssemblyAI streaming โ”‚ + โ”‚ TTS โ”‚ + โ”‚ llm Send a prompt to AssemblyAI's LLM Gateway and print the reply โ”‚ + โ”‚ clip Cut clips from media by speaker, text match, LLM pick, or time โ”‚ + โ”‚ range โ”‚ + โ”‚ dub [sandbox] Dub a video or audio file into another language โ”‚ + โ”‚ caption Burn always-visible captions into a video โ”‚ + โ”‚ eval Transcribe one or more datasets and score WER against their โ”‚ + โ”‚ reference texts โ”‚ + โ”‚ webhooks Receive webhook deliveries on a public dev URL โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ญโ”€ Setup & Tools โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ doctor Check that your environment is ready for AssemblyAI โ”‚ - โ”‚ setup Set up your coding agent for AssemblyAI (docs MCP + skills) โ”‚ - โ”‚ config Inspect and edit persisted CLI settings (profiles, env, โ”‚ - โ”‚ telemetry) โ”‚ - โ”‚ update Update the CLI to the latest release (brew/pipx/uv) โ”‚ - โ”‚ telemetry Anonymous usage telemetry: status, enable, disable โ”‚ + โ”‚ doctor Check that your environment is ready for AssemblyAI โ”‚ + โ”‚ setup Set up your coding agent for AssemblyAI (docs MCP + skills) โ”‚ + โ”‚ config Inspect and edit persisted CLI settings (profiles, env, โ”‚ + โ”‚ telemetry) โ”‚ + โ”‚ update Update the CLI to the latest release (brew/pipx/uv) โ”‚ + โ”‚ telemetry Anonymous usage telemetry: status, enable, disable โ”‚ 
โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ญโ”€ History โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ transcripts Browse and fetch past transcripts โ”‚ - โ”‚ sessions Browse your past streaming (real-time) sessions โ”‚ + โ”‚ transcripts Browse and fetch past transcripts โ”‚ + โ”‚ sessions Browse your past streaming (real-time) sessions โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ญโ”€ Account โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ login Authenticate via your browser and store a CLI API key โ”‚ - โ”‚ logout Clear stored credentials for the active profile โ”‚ - โ”‚ whoami Show the active profile and whether its key works โ”‚ - โ”‚ balance Show your remaining account balance โ”‚ - โ”‚ usage Show usage over a date range (default: last 30 days) โ”‚ - โ”‚ limits Show your account's rate limits per service โ”‚ - โ”‚ keys List, create, and rename your AssemblyAI API keys โ”‚ - โ”‚ audit List recent audit-log entries for your account โ”‚ + โ”‚ login Authenticate via your browser and store a CLI API key โ”‚ + โ”‚ logout Clear stored credentials for the active profile โ”‚ + โ”‚ whoami Show the active profile and whether its key works โ”‚ + โ”‚ balance Show your remaining account balance โ”‚ + โ”‚ usage Show usage over a date range (default: last 
30 days) โ”‚ + โ”‚ limits Show your account's rate limits per service โ”‚ + โ”‚ keys List, create, and rename your AssemblyAI API keys โ”‚ + โ”‚ audit List recent audit-log entries for your account โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ Examples diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index 2879f6f9..cdc75e01 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -1,121 +1,10 @@ # serializer version: 1 -# name: test_command_help_matches_snapshot[agent-cascade] - ''' - - Usage: assembly agent-cascade [OPTIONS] [SOURCE] - - [sandbox] Hold a live voice conversation through a self-wired cascade - - Like 'assembly agent', but instead of AssemblyAI's Voice Agent endpoint this - wires the three primitives together itself โ€” Streaming STT, the LLM Gateway, - and streaming TTS โ€” exactly like the 'agent-cascade' init template does - server-side. Because it uses streaming TTS it only runs in the sandbox: run - it as 'assembly --sandbox agent-cascade' (--sandbox goes before the - subcommand). - - Use headphones: the mic stays open while the agent speaks, so on speakers it - would hear itself and loop. Pass an audio file/URL (or --sample) to speak a - recorded clip instead of the microphone; the session then ends after the - agent's reply. - - This only runs a conversation in the terminal โ€” it writes no code. To build - an agent-cascade app, run 'assembly init agent-cascade' instead. 
- - โ•ญโ”€ Arguments โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ source [SOURCE] Audio file path or URL to speak to the agent. Omit โ”‚ - โ”‚ to use the microphone. โ”‚ - โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ - โ•ญโ”€ Options โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ --sample Speak the hosted wildfires.mp3 โ”‚ - โ”‚ sample to the agent โ”‚ - โ”‚ --system-prompt TEXT System prompt (the agent's โ”‚ - โ”‚ persona) โ”‚ - โ”‚ [default: You are a friendly, โ”‚ - โ”‚ concise voice assistant. Keep โ”‚ - โ”‚ replies short and conversational. โ”‚ - โ”‚ Your reply is read aloud by a โ”‚ - โ”‚ text-to-speech engine, so write โ”‚ - โ”‚ plain spoken prose โ€” no markdown, โ”‚ - โ”‚ emoji, bullet lists, or code.] โ”‚ - โ”‚ --system-prompt-file FILE Read the system prompt from a โ”‚ - โ”‚ file (overrides --system-prompt) โ”‚ - โ”‚ --greeting TEXT Spoken greeting โ”‚ - โ”‚ [default: Hi! I'm your AssemblyAI โ”‚ - โ”‚ voice agent. What can I help you โ”‚ - โ”‚ with?] โ”‚ - โ”‚ --device INTEGER Microphone device index โ”‚ - โ”‚ --list-voices Print known voices and exit โ”‚ - โ”‚ --json -j Emit newline-delimited JSON โ”‚ - โ”‚ events โ”‚ - โ”‚ --output -o [text|json] Output mode: text (you:/agent: โ”‚ - โ”‚ lines as plain stdout, โ”‚ - โ”‚ pipe-friendly) or json โ”‚ - โ”‚ --show-code Print the equivalent Python SDK โ”‚ - โ”‚ code and exit (does not start a โ”‚ - โ”‚ session) โ”‚ - โ”‚ --help Show this message and exit. 
โ”‚ - โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ - โ•ญโ”€ Text-to-speech โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ --voice TEXT TTS voice. See --list-voices. [default: jane] โ”‚ - โ”‚ --language TEXT TTS language (defaults to the voice's language) โ”‚ - โ”‚ --tts-config TEXT Set any extra streaming-TTS query field as โ”‚ - โ”‚ KEY=VALUE (repeatable) โ”‚ - โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ - โ•ญโ”€ Language model โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ --model TEXT LLM Gateway model that powers the โ”‚ - โ”‚ agent's replies โ”‚ - โ”‚ [default: โ”‚ - โ”‚ claude-haiku-4-5-20251001] โ”‚ - โ”‚ --max-tokens INTEGER RANGE [x>=1] Max tokens per reply โ”‚ - โ”‚ [default: 8192] โ”‚ - โ”‚ --llm-config TEXT Set any LLM Gateway request field โ”‚ - โ”‚ as KEY=VALUE (repeatable) โ”‚ - โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ - โ•ญโ”€ Speech-to-text โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ - โ”‚ --speech-model TEXT Streaming speech โ”‚ - โ”‚ 
model โ”‚ - โ”‚ [default: โ”‚ - โ”‚ u3-rt-pro] โ”‚ - โ”‚ --format-turns --no-format-turns Format โ”‚ - โ”‚ (punctuate) โ”‚ - โ”‚ finalized turns โ”‚ - โ”‚ before replying โ”‚ - โ”‚ [default: โ”‚ - โ”‚ format-turns] โ”‚ - โ”‚ --turn-detection [aggressive|bala Turn-detection โ”‚ - โ”‚ nced|conservativ sensitivity โ”‚ - โ”‚ e] preset โ”‚ - โ”‚ --stt-config TEXT Set any โ”‚ - โ”‚ StreamingParameโ€ฆ โ”‚ - โ”‚ field as โ”‚ - โ”‚ KEY=VALUE โ”‚ - โ”‚ (repeatable) โ”‚ - โ”‚ --stt-config-file FILE JSON file of โ”‚ - โ”‚ streaming fields โ”‚ - โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ - - Examples - Start a live cascade conversation - $ assembly --sandbox agent-cascade - Pick a voice and opening line - $ assembly --sandbox agent-cascade --voice michael --greeting "Hi there" - Give the agent a persona - $ assembly --sandbox agent-cascade --system-prompt "You are a terse pirate." - See available voices - $ assembly --sandbox agent-cascade --list-voices - Print equivalent Python instead of running - $ assembly --sandbox agent-cascade --show-code - - - - ''' -# --- # name: test_command_help_matches_snapshot[agent] ''' Usage: assembly agent [OPTIONS] [SOURCE] - Hold a live two-way voice conversation with a voice agent + Hold a live two-way voice conversation with the Voice Agent API Use headphones: the mic stays open while the agent speaks, so on speakers it would hear itself and loop. Pass an audio file/URL (or @@ -699,6 +588,126 @@ + ''' +# --- +# name: test_command_help_matches_snapshot[live] + ''' + + Usage: assembly live [OPTIONS] [SOURCE] + + [sandbox] Talk live to a tool-using voice agent + + A real-time spoken conversation, wired client-side from three primitives โ€” + Streaming STT, a deepagents brain on the LLM Gateway, and streaming TTS. 
+ Unlike + 'assembly agent' (the Voice Agent API), the brain here is an agent that can + use + tools mid-conversation โ€” web search, URL fetch, and the AssemblyAI docs โ€” so + it + answers like a live multimodal assistant. Because it uses streaming TTS it + only + runs in the sandbox: run it as 'assembly --sandbox live' (--sandbox goes + before + the subcommand). + + Use headphones: the mic stays open while the agent speaks, so on speakers it + would hear itself and loop. Pass an audio file/URL (or --sample) to speak a + recorded clip instead of the microphone; the session then ends after the + agent's reply. + + This only runs a conversation in the terminal โ€” it writes no code. To build + an agent-cascade app, run 'assembly init agent-cascade' instead. + + Web search needs a TAVILY_API_KEY in the environment; without it the agent + keeps its URL-fetch and docs tools. + + โ•ญโ”€ Arguments โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ source [SOURCE] Audio file path or URL to speak to the agent. Omit โ”‚ + โ”‚ to use the microphone. โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ•ญโ”€ Options โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ --sample Speak the hosted wildfires.mp3 โ”‚ + โ”‚ sample to the agent โ”‚ + โ”‚ --system-prompt TEXT System prompt (the agent's โ”‚ + โ”‚ persona) โ”‚ + โ”‚ [default: You are a friendly, โ”‚ + โ”‚ concise voice assistant. Keep โ”‚ + โ”‚ replies short and conversational. 
โ”‚ + โ”‚ Your reply is read aloud by a โ”‚ + โ”‚ text-to-speech engine, so write โ”‚ + โ”‚ plain spoken prose โ€” no markdown, โ”‚ + โ”‚ emoji, bullet lists, or code.] โ”‚ + โ”‚ --system-prompt-file FILE Read the system prompt from a โ”‚ + โ”‚ file (overrides --system-prompt) โ”‚ + โ”‚ --greeting TEXT Spoken greeting โ”‚ + โ”‚ [default: Hi! I'm your AssemblyAI โ”‚ + โ”‚ voice agent. What can I help you โ”‚ + โ”‚ with?] โ”‚ + โ”‚ --device INTEGER Microphone device index โ”‚ + โ”‚ --list-voices Print known voices and exit โ”‚ + โ”‚ --json -j Emit newline-delimited JSON โ”‚ + โ”‚ events โ”‚ + โ”‚ --output -o [text|json] Output mode: text (you:/agent: โ”‚ + โ”‚ lines as plain stdout, โ”‚ + โ”‚ pipe-friendly) or json โ”‚ + โ”‚ --show-code Print the equivalent Python SDK โ”‚ + โ”‚ code and exit (does not start a โ”‚ + โ”‚ session) โ”‚ + โ”‚ --help Show this message and exit. โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ•ญโ”€ Text-to-speech โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ --voice TEXT TTS voice. See --list-voices. 
[default: jane] โ”‚ + โ”‚ --language TEXT TTS language (defaults to the voice's language) โ”‚ + โ”‚ --tts-config TEXT Set any extra streaming-TTS query field as โ”‚ + โ”‚ KEY=VALUE (repeatable) โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ•ญโ”€ Language model โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ --model TEXT LLM Gateway model that powers the โ”‚ + โ”‚ agent's replies โ”‚ + โ”‚ [default: โ”‚ + โ”‚ claude-haiku-4-5-20251001] โ”‚ + โ”‚ --max-tokens INTEGER RANGE [x>=1] Max tokens per reply โ”‚ + โ”‚ [default: 8192] โ”‚ + โ”‚ --llm-config TEXT Set any LLM Gateway request field โ”‚ + โ”‚ as KEY=VALUE (repeatable) โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ•ญโ”€ Speech-to-text โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ --speech-model TEXT Streaming speech โ”‚ + โ”‚ model โ”‚ + โ”‚ [default: โ”‚ + โ”‚ u3-rt-pro] โ”‚ + โ”‚ --format-turns --no-format-turns Format โ”‚ + โ”‚ (punctuate) โ”‚ + โ”‚ finalized turns โ”‚ + โ”‚ before replying โ”‚ + โ”‚ [default: โ”‚ + โ”‚ format-turns] โ”‚ + โ”‚ --turn-detection [aggressive|bala Turn-detection โ”‚ + โ”‚ nced|conservativ sensitivity โ”‚ + โ”‚ e] preset โ”‚ + โ”‚ --stt-config TEXT Set any โ”‚ + โ”‚ StreamingParameโ€ฆ โ”‚ + โ”‚ field as โ”‚ + โ”‚ KEY=VALUE โ”‚ + โ”‚ (repeatable) โ”‚ + โ”‚ --stt-config-file FILE JSON file of โ”‚ + 
โ”‚ streaming fields โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + + Examples + Start a live voice conversation + $ assembly --sandbox live + Pick a voice and opening line + $ assembly --sandbox live --voice michael --greeting "Hi there" + Give the agent a persona + $ assembly --sandbox live --system-prompt "You are a terse pirate." + See available voices + $ assembly --sandbox live --list-voices + Print equivalent Python instead of running + $ assembly --sandbox live --show-code + + + ''' # --- # name: test_command_help_matches_snapshot[llm] diff --git a/tests/test_agent_cascade_brain.py b/tests/test_agent_cascade_brain.py new file mode 100644 index 00000000..acb98013 --- /dev/null +++ b/tests/test_agent_cascade_brain.py @@ -0,0 +1,186 @@ +"""Tests for the deepagents reply brain behind `assembly live`. + +The brain's only network seam is the compiled graph, so `build_completer` is driven +against the *real* deepagents graph wired to a fake chat model (pytest-socket stays +armed) โ€” no sockets. `build_live_tools` and `build_model`'s new knobs are unit-tested +directly. 
+""" + +from __future__ import annotations + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import AIMessage +from langchain_core.outputs import ChatGeneration, ChatResult + +from aai_cli.agent_cascade import brain +from aai_cli.agent_cascade.config import CascadeConfig +from aai_cli.code_agent import model as model_mod + + +class FakeChatModel(BaseChatModel): + """A chat model that replays a scripted list of AIMessages (mirrors the code agent's).""" + + responses: list[AIMessage] + index: int = 0 + + @property + def _llm_type(self) -> str: + return "fake-live-model" + + def bind_tools(self, tools, **kwargs): + del tools, kwargs + return self + + def _generate(self, messages, stop=None, run_manager=None, **kwargs): + del messages, stop, run_manager, kwargs + message = self.responses[self.index] + self.index += 1 + return ChatResult(generations=[ChatGeneration(message=message)]) + + +def _graph(model: BaseChatModel): + from deepagents import create_deep_agent + + return create_deep_agent(model=model, tools=[], system_prompt="be a friendly live agent") + + +# --- build_system_prompt ----------------------------------------------------- + + +def test_system_prompt_appends_tool_guidance(): + prompt = brain.build_system_prompt("You are a pirate.") + # The persona is preserved, and the tool guidance is appended so the model knows it + # can search the web (the plain cascade persona never mentions tools). + assert prompt.startswith("You are a pirate.") + assert "search the web" in prompt + + +# --- build_completer (driving the real graph with a fake model) -------------- + + +def test_completer_returns_final_spoken_text(): + graph = _graph(FakeChatModel(responses=[AIMessage(content="Hello there.")])) + completer = brain.build_completer("k", CascadeConfig(), graph=graph) + reply = completer([{"role": "system", "content": "x"}, {"role": "user", "content": "hi"}]) + assert reply == "Hello there." 
+ + +def test_completer_strips_system_message_before_invoking(): + # The cascade prepends its own system message each turn, but the graph already owns + # the system prompt — so the completer must drop it before invoking, leaving only the + # conversation. We capture what the graph received to prove the system line is gone. + captured = {} + + class _CapturingGraph: + def invoke(self, value): + captured["messages"] = value["messages"] + return {"messages": [AIMessage(content="ok")]} + + completer = brain.build_completer("k", CascadeConfig(), graph=_CapturingGraph()) + completer([{"role": "system", "content": "persona"}, {"role": "user", "content": "hi"}]) + roles = [m["role"] for m in captured["messages"]] + assert roles == ["user"] + + +# --- _reply_text / _content_text --------------------------------------------- + + +def test_reply_text_skips_empty_ai_messages_and_takes_last_text(): + # Scanning from the end, a trailing empty AIMessage (a tool-call request with no + # spoken text) is skipped so the reply falls back to the prior AIMessage's text, + # rather than coming back blank. + result = { + "messages": [ + AIMessage(content="The answer is 42."), + AIMessage(content=""), + ] + } + assert brain._reply_text(result) == "The answer is 42." + + +def test_reply_text_joins_list_content_blocks(): + result = {"messages": [AIMessage(content=[{"type": "text", "text": "Hello "}, "world"])]} + assert brain._reply_text(result) == "Hello world" + + +def test_reply_text_skips_non_assistant_messages(): + from langchain_core.messages import ToolMessage + + # Scanning from the end, a trailing non-assistant message (e.g. a tool result) is + # skipped — the spoken reply is the AIMessage before it.
+ result = { + "messages": [ + AIMessage(content="hello there"), + ToolMessage(content="tool output", tool_call_id="c1"), + ] + } + assert brain._reply_text(result) == "hello there" + + +def test_content_text_coerces_unexpected_content(): + # A content that is neither a string nor a list of blocks (defensive fallback). + assert brain._content_text(123) == "123" + + +def test_reply_text_is_empty_without_an_assistant_message(): + assert brain._reply_text({"messages": []}) == "" + assert brain._reply_text({}) == "" + + +# --- build_live_tools -------------------------------------------------------- + + +def test_build_live_tools_includes_search_when_keyed(monkeypatch): + search = object() + monkeypatch.setattr("aai_cli.code_agent.fetch_tool.build_fetch_tool", lambda: "fetch") + monkeypatch.setattr("aai_cli.code_agent.web_search.build_web_search_tool", lambda: search) + monkeypatch.setattr("aai_cli.code_agent.docs_mcp.load_docs_tools", lambda: ["docs"]) + tools = brain.build_live_tools() + # Fetch + the keyed search + the docs tools, in that order. + assert tools == ["fetch", search, "docs"] + + +def test_build_live_tools_omits_search_when_unkeyed(monkeypatch): + monkeypatch.setattr("aai_cli.code_agent.fetch_tool.build_fetch_tool", lambda: "fetch") + monkeypatch.setattr("aai_cli.code_agent.web_search.build_web_search_tool", lambda: None) + monkeypatch.setattr("aai_cli.code_agent.docs_mcp.load_docs_tools", list) + tools = brain.build_live_tools() + # No TAVILY_API_KEY -> no search tool, just the fetch tool. 
+ assert tools == ["fetch"] + + +# --- build_graph (model construction + compile, with the docs probe skipped) - + + +def test_build_graph_uses_gateway_model_and_runs_offline(monkeypatch): + captured = {} + + def fake_build_model(api_key, *, model, max_tokens, extra): + captured["model"] = model + captured["max_tokens"] = max_tokens + captured["extra"] = dict(extra) + return FakeChatModel(responses=[AIMessage(content="hi from the agent")]) + + monkeypatch.setattr(model_mod, "build_model", fake_build_model) + cfg = CascadeConfig(model="claude-x", max_tokens=128, llm_extra={"temperature": 0.2}) + graph = brain.build_graph("k", cfg, tools=[]) + # The cascade's model + knobs are threaded into the gateway model build. + assert captured == {"model": "claude-x", "max_tokens": 128, "extra": {"temperature": 0.2}} + # The compiled graph is a real deepagents graph that answers offline via the fake model. + completer = brain.build_completer("k", cfg, graph=graph) + assert completer([{"role": "user", "content": "hi"}]) == "hi from the agent" + + +# --- build_model new knobs --------------------------------------------------- + + +def test_build_model_threads_max_tokens_and_extra(): + model = model_mod.build_model("k", model="claude-x", max_tokens=222, extra={"top_k": 5}) + assert model.max_tokens == 222 + assert model.extra_body == {"top_k": 5} + + +def test_build_model_defaults_have_no_extra(): + model = model_mod.build_model("k", model="claude-x") + assert model.max_tokens is None + assert model.extra_body is None diff --git a/tests/test_agent_cascade_command.py b/tests/test_agent_cascade_command.py index 513dc1cc..93d25a4e 100644 --- a/tests/test_agent_cascade_command.py +++ b/tests/test_agent_cascade_command.py @@ -1,4 +1,4 @@ -"""Command + wiring tests for `assembly agent-cascade`. +"""Command + wiring tests for `assembly live`. 
Covers the argv -> options seam, the validation guards, _open_audio source selection, and CascadeDeps.real's three live legs (all driven against fakes). @@ -60,14 +60,14 @@ def _opts(**overrides) -> AgentCascadeOptions: def test_list_voices_human_lists_catalog(): - result = runner.invoke(app, ["agent-cascade", "--list-voices"]) + result = runner.invoke(app, ["live", "--list-voices"]) assert result.exit_code == 0 assert "jane" in result.output assert "English:" in result.output def test_list_voices_json_emits_array(): - result = runner.invoke(app, ["agent-cascade", "--list-voices", "--json"]) + result = runner.invoke(app, ["live", "--list-voices", "--json"]) assert result.exit_code == 0 assert result.output.lstrip().startswith("[") assert '"jane"' in result.output @@ -92,14 +92,14 @@ def test_missing_system_prompt_file_is_rejected_by_typer(): # so the sandbox guard (the other exit-2 path) never runs. Asserting the guard's # message is absent kills the exists=True mutant without depending on the Rich error # text, which CI renders with ANSI + width ellipsis. - result = runner.invoke(app, ["agent-cascade", "--system-prompt-file", "/no/such/file"]) + result = runner.invoke(app, ["live", "--system-prompt-file", "/no/such/file"]) assert result.exit_code == 2 assert "sandbox" not in result.output.lower() def test_production_env_is_rejected_with_sandbox_hint(): # Default env is production, which has no streaming-TTS host. 
- result = runner.invoke(app, ["agent-cascade", "--voice", "jane"]) + result = runner.invoke(app, ["live", "--voice", "jane"]) assert result.exit_code == 2 assert "only available in the sandbox" in result.output @@ -126,7 +126,7 @@ def fake_run(opts, state, *, json_mode): captured["opts"] = opts monkeypatch.setattr(_exec, "run_agent_cascade", fake_run) - result = runner.invoke(app, ["agent-cascade", *argv]) + result = runner.invoke(app, ["live", *argv]) assert result.exit_code == 0 assert captured["opts"].format_turns is expected @@ -137,7 +137,7 @@ def test_stt_config_file_must_exist(): # terminal so the "does not exist" message isn't wrapped by the 80-col error box. result = runner.invoke( app, - ["agent-cascade", "--stt-config-file", "/no/such/file.json"], + ["live", "--stt-config-file", "/no/such/file.json"], env={"COLUMNS": "300"}, ) assert result.exit_code == 2 @@ -418,36 +418,23 @@ def fake_stream_audio(api_key, source, *, params, on_turn): assert captured["params"] is params -def test_deps_real_complete_reply_threads_model_tokens_and_extra(monkeypatch): +def test_deps_real_complete_reply_is_built_by_the_deepagents_brain(monkeypatch): + # The LLM leg is now a deepagents graph: .real delegates to brain.build_completer, + # passing the api key + config, and uses whatever completer it returns. We assert the + # exact wiring so the brain swap (not a plain llm.complete) can't silently regress. 
captured = {} - def fake_complete(api_key, **kwargs): - captured.update(kwargs) - return "raw-response" + def fake_build_completer(api_key, config): + captured["api_key"] = api_key + captured["config"] = config + return lambda messages: f"reply to {messages[-1]['content']}" - monkeypatch.setattr(engine.llm, "complete", fake_complete) - monkeypatch.setattr(engine.llm, "content_of", lambda response: response.upper()) + monkeypatch.setattr(engine.brain, "build_completer", fake_build_completer) cfg = CascadeConfig(model="m", max_tokens=222, llm_extra={"temperature": 0.5}) deps = CascadeDeps.real("k", cfg, audio=[], stt_params=_stt_params()) - assert deps.complete_reply([{"role": "user", "content": "hi"}]) == "RAW-RESPONSE" - assert captured["model"] == "m" - assert captured["max_tokens"] == 222 - assert captured["extra"] == {"temperature": 0.5} - - -def test_deps_real_complete_reply_sends_no_extra_when_unset(monkeypatch): - captured = {} - - def fake_complete(api_key, **kwargs): - captured.update(kwargs) - return "x" - - monkeypatch.setattr(engine.llm, "complete", fake_complete) - monkeypatch.setattr(engine.llm, "content_of", lambda response: response) - deps = CascadeDeps.real("k", CascadeConfig(), audio=[], stt_params=_stt_params()) - deps.complete_reply([{"role": "user", "content": "hi"}]) - # Empty overrides collapse to None, not an empty dict, so the gateway sees no extra body. - assert captured["extra"] is None + assert deps.complete_reply([{"role": "user", "content": "hi"}]) == "reply to hi" + assert captured["api_key"] == "k" + assert captured["config"] is cfg def test_deps_real_synthesize_threads_voice_language_and_extra(monkeypatch): diff --git a/tests/test_agent_cascade_show_code.py b/tests/test_agent_cascade_show_code.py index d05b5874..97bbe0ff 100644 --- a/tests/test_agent_cascade_show_code.py +++ b/tests/test_agent_cascade_show_code.py @@ -1,4 +1,4 @@ -"""`assembly agent-cascade --show-code` tests. +"""`assembly live --show-code` tests. 
Split from test_agent_cascade_command.py (which holds the run-path wiring) so the print-only path's many invocations live in their own file. The cascade is @@ -33,7 +33,7 @@ def _boom(**kwargs): ) result = runner.invoke( app, - ["--sandbox", "agent-cascade", "--voice", "jane", "--greeting", "Hi there", "--show-code"], + ["--sandbox", "live", "--voice", "jane", "--greeting", "Hi there", "--show-code"], ) assert result.exit_code == 0 # Targets the sandbox the key was minted for — all three legs. @@ -54,25 +54,23 @@ def fake_run(opts, state, *, json_mode): captured["opts"] = opts monkeypatch.setattr(_exec, "run_agent_cascade", fake_run) - assert runner.invoke(app, ["agent-cascade"]).exit_code == 0 + assert runner.invoke(app, ["live"]).exit_code == 0 assert captured["opts"].show_code is False - assert runner.invoke(app, ["agent-cascade", "--show-code"]).exit_code == 0 + assert runner.invoke(app, ["live", "--show-code"]).exit_code == 0 assert captured["opts"].show_code is True def test_show_code_injects_speech_model(monkeypatch): monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None) - result = runner.invoke( - app, ["--sandbox", "agent-cascade", "--speech-model", "u3-rt-pro", "--show-code"] - ) + result = runner.invoke(app, ["--sandbox", "live", "--speech-model", "u3-rt-pro", "--show-code"]) assert result.exit_code == 0 assert "speech_model=u3-rt-pro" in result.stdout def test_show_code_reflects_no_format_turns(monkeypatch): monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None) - formatted = runner.invoke(app, ["--sandbox", "agent-cascade", "--show-code"]) - bare = runner.invoke(app, ["--sandbox", "agent-cascade", "--no-format-turns", "--show-code"]) + formatted = runner.invoke(app, ["--sandbox", "live", "--show-code"]) + bare = runner.invoke(app, ["--sandbox", "live", "--no-format-turns", "--show-code"]) # With formatting on the cue waits for the punctuated turn; off, a bare end-of-turn fires.
assert "turn_is_formatted" in formatted.stdout assert "turn_is_formatted" not in bare.stdout @@ -83,7 +81,7 @@ def test_show_code_threads_model_and_max_tokens(monkeypatch): monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None) result = runner.invoke( app, - ["--sandbox", "agent-cascade", "--model", "claude-x", "--max-tokens", "321", "--show-code"], + ["--sandbox", "live", "--model", "claude-x", "--max-tokens", "321", "--show-code"], ) assert result.exit_code == 0 assert "claude-x" in result.stdout @@ -95,7 +93,7 @@ def test_show_code_file_source_warns_on_stderr(monkeypatch): monkeypatch.setattr( _exec.engine, "run_cascade", lambda **kw: (_ for _ in ()).throw(AssertionError("no run")) ) - result = runner.invoke(app, ["--sandbox", "agent-cascade", "clip.wav", "--show-code"]) + result = runner.invoke(app, ["--sandbox", "live", "clip.wav", "--show-code"]) assert result.exit_code == 0 assert "uses the microphone" in result.stderr assert "uses the microphone" not in result.stdout # stdout stays a clean script @@ -104,13 +102,13 @@ def test_show_code_file_source_warns_on_stderr(monkeypatch): def test_show_code_mic_emits_no_warning(monkeypatch): monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None) - result = runner.invoke(app, ["--sandbox", "agent-cascade", "--show-code"]) + result = runner.invoke(app, ["--sandbox", "live", "--show-code"]) assert result.exit_code == 0 assert "uses the microphone" not in result.stderr # mic script matches the run, nothing to warn def test_show_code_in_production_is_rejected_with_sandbox_hint(): # --show-code still honors the sandbox-only guard, so the generated URLs are valid. 
- result = runner.invoke(app, ["agent-cascade", "--show-code"]) + result = runner.invoke(app, ["live", "--show-code"]) assert result.exit_code == 2 assert "only available in the sandbox" in result.output diff --git a/tests/test_sandbox_access.py b/tests/test_sandbox_access.py index ce947ec4..6fe112de 100644 --- a/tests/test_sandbox_access.py +++ b/tests/test_sandbox_access.py @@ -241,7 +241,9 @@ def test_help_hides_the_sandbox_surface_from_external_accounts_and_restores_it(m assert "--sandbox" not in external assert "--env" not in external assert "[sandbox]" not in external - assert "agent-cascade" not in external + # The [sandbox]-only `live` command's summary is hidden too (a token unique to it, + # since the bare word "live" also appears in other commands' descriptions). + assert "tool-using" not in external # …but the filter is surgical: non-sandbox flags and commands stay visible (this # also kills the mutant that would treat every option/command as sandbox). assert "--profile" in external @@ -255,4 +257,4 @@ def test_help_hides_the_sandbox_surface_from_external_accounts_and_restores_it(m assert "--sandbox" in internal assert "--env" in internal assert "[sandbox]" in internal - assert "agent-cascade" in internal + assert "tool-using" in internal diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b9ba17ff..a66e2929 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -162,7 +162,7 @@ def test_help_lists_commands_in_workflow_order(): "stream", "dictate", "agent", - "agent-cascade", + "live", "speak", "llm", "clip",