diff --git a/.vitepress/config.ts b/.vitepress/config.ts index 5429c9e..df6e667 100644 --- a/.vitepress/config.ts +++ b/.vitepress/config.ts @@ -76,6 +76,10 @@ export default { text: '7. Advanced Agent Patterns', link: '/chapters/07-advanced-patterns', }, + { + text: '8. Production and Deployment', + link: '/chapters/08-production', + }, ], }, ], diff --git a/README.md b/README.md index 166ca47..8818bcd 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ Published with VitePress at https://yagop.github.io/coding-agents-tutorial/. Eac 5. [Implementing Tools and Function Calling](chapters/05-tools.md) 6. [Building Tool Chains and Complex Workflows](chapters/06-tool-chains.md) 7. [Advanced Agent Patterns](chapters/07-advanced-patterns.md) +8. [Production and Deployment](chapters/08-production.md) More chapters are tracked as issues and land here as they are written. diff --git a/chapters/08-production.md b/chapters/08-production.md new file mode 100644 index 0000000..5854fcf --- /dev/null +++ b/chapters/08-production.md @@ -0,0 +1,61 @@ +# Production and Deployment + +🚢 The agent works on your machine; now you ship it. Production is where the happy path stops being the only path - the network blips, the rate limit bites, the bill creeps up, and the process has to live somewhere other than your terminal. This chapter wraps the agent in production armor: a real error strategy, retries and timeouts, rate-limit-aware concurrency, observability, cost control, and a way to deploy. + +You already have a working agent from Chapters 1-7, so here you only harden it. As always the key comes from the environment - never hardcode it - and Bun auto-loads `.env`. + +## Handling errors and retries + +Every API call can fail, and the SDK hands you a typed hierarchy to tell failures apart. 
`Anthropic.APIError` is the base; the subclasses you branch on are `RateLimitError`, `APIConnectionError` (with `APIConnectionTimeoutError` beneath it), `AuthenticationError`, and `BadRequestError`. The type tells you what to do: retry connection and rate-limit failures, surface auth and bad-request ones - those will not fix themselves on a retry. + +<<< @/examples/08-production/error-handling.ts + +Run it to watch the ladder catch a deliberately malformed call: + +```sh +bun run examples/08-production/error-handling.ts +``` + +::: warning The retry gotcha +The SDK already retries connection errors and `408/409/429/5xx` with exponential backoff (tune it with `maxRetries`, default 2). So do NOT wrap every call in your own retry loop - you would double up. Set `maxRetries: 0` only when you take over the retries yourself, or for a non-idempotent call where a retry would repeat a side effect. +::: + +When you genuinely own the retries, turn the SDK's off and back off yourself, honoring the `retry-after` header that rides on a 429. + +<<< @/examples/08-production/retry-backoff.ts + +The per-request `{ maxRetries: 0 }` option hands the retry decision to your loop; everything else stays the SDK default. + +## Rate limits and concurrency + +Fan out too many requests at once and you will hit the limit. The fix is a small queue with a concurrency cap, watching the `anthropic-ratelimit-requests-remaining` response header to know how much headroom is left. + +<<< @/examples/08-production/concurrency-limiter.ts + +A fixed pool of workers drains the queue, so no more than `maxConcurrent` calls are ever in flight. Read the remaining-requests header to slow down before you hit a wall, and respect `retry-after` when you do. + +## Observability and cost + +You cannot debug or budget what you cannot see. Pair every call's `request_id` (from `.withResponse()`) with its `usage`, and accumulate the token counts - including `cache_read_input_tokens` - to turn an opaque agent into a line-item bill. 
+ +<<< @/examples/08-production/cost-tracker.ts + +Multiply each token count by your model's price for a running total. Three levers keep that number down: pick a cheaper model for easy steps, set `max_tokens` conservatively, and put `cache_control: { type: 'ephemeral' }` on stable system prompts and large tool schemas so repeated input is billed at the much cheaper cache-read rate instead of full price. + +::: details Going deeper: layered cache breakpoints +On a long multi-turn conversation you can place several `cache_control` breakpoints - one on the system prompt, one after the tool schemas, and one on the most recent turn - so the cached prefix grows with the chat and each turn re-reads as much as possible from cache. +::: + +## Deploying + +Finally, the agent needs a home. A long-running process that polls is the simplest; for real traffic a webhook server is leaner - Telegram POSTs each `Update` to your HTTPS endpoint, and you validate its secret header before doing any work. + +<<< @/examples/08-production/webhook-server.ts + +Bun's built-in `Bun.serve` is the entire server, and the secret-token check rejects anything that is not Telegram. To run it anywhere, containerize it. + +<<< @/examples/08-production/Dockerfile + +A multi-stage build keeps the image small, `USER bun` drops root, and the secrets arrive at runtime through `docker run -e` - never copied into a layer, provided `.env` is listed in `.dockerignore` so that `COPY . .` cannot pick it up. + +What's next: Chapter 9 - Advanced Topics: RAG, Prompt Engineering, and Fine-Tuning. diff --git a/examples/08-production/Dockerfile b/examples/08-production/Dockerfile new file mode 100644 index 0000000..cbd12e0 --- /dev/null +++ b/examples/08-production/Dockerfile @@ -0,0 +1,17 @@ +# docker build -t coding-agent -f examples/08-production/Dockerfile . + +FROM oven/bun:1 AS build +WORKDIR /app +COPY package.json bun.lock ./ +RUN bun install --frozen-lockfile --production +COPY . . + +FROM oven/bun:1-slim +WORKDIR /app +# The official Bun image already ships an unprivileged "bun" user. 
+USER bun +COPY --from=build --chown=bun:bun /app /app +# Secrets are read from the environment at runtime, never baked into the image: +# pass ANTHROPIC_API_KEY, TELEGRAM_BOT_TOKEN and TELEGRAM_WEBHOOK_SECRET with `docker run -e`. +ENV NODE_ENV=production +CMD ["bun", "run", "examples/08-production/webhook-server.ts"] diff --git a/examples/08-production/concurrency-limiter.ts b/examples/08-production/concurrency-limiter.ts new file mode 100644 index 0000000..2f8adca --- /dev/null +++ b/examples/08-production/concurrency-limiter.ts @@ -0,0 +1,33 @@ +// bun run examples/08-production/concurrency-limiter.ts + +import Anthropic from '@anthropic-ai/sdk'; + +const client = new Anthropic(); +const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6'; + +const words = ['cat', 'dog', 'bird', 'fish', 'frog']; +const maxConcurrent = 2; +let remaining = Infinity; + +async function ask(word: string): Promise { + const { data, response } = await client.messages + .create({ model, max_tokens: 64, messages: [{ role: 'user', content: `One fun fact about a ${word}.` }] }) + .withResponse(); + const header = response.headers.get('anthropic-ratelimit-requests-remaining'); + if (header !== null) remaining = Number(header); + const text = data.content.find((b) => b.type === 'text'); + console.log(`${word}: ${text?.text?.slice(0, 60) ?? 
// Illustrative per-million-token prices; use your model's real numbers.
const inputPerMillion = 3;
const outputPerMillion = 15;
// Cached input is reported in its own usage counters rather than in
// `input_tokens`, so it needs its own rates. Illustrative again: cache reads
// at 0.1x and cache writes at 1.25x the base input price.
const cacheReadPerMillion = inputPerMillion * 0.1;
const cacheWritePerMillion = inputPerMillion * 1.25;

/** Running token counts and estimated spend (USD) across every recorded call. */
const totals = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, cost: 0 };

/**
 * Folds one response's usage into `totals`.
 *
 * Every token bucket is priced, not just `input_tokens` and `output_tokens`;
 * otherwise the estimate drifts low as soon as prompt caching is switched on.
 *
 * @param usage - The `usage` object of a Messages API response. The cache
 *   counters may be `null` or absent and are then counted as zero.
 */
function record(usage: Anthropic.Usage): void {
  const cacheRead = usage.cache_read_input_tokens ?? 0;
  const cacheWrite = usage.cache_creation_input_tokens ?? 0;

  totals.input += usage.input_tokens;
  totals.output += usage.output_tokens;
  totals.cacheRead += cacheRead;
  totals.cacheWrite += cacheWrite;

  // Sum the weighted token counts first and scale once to dollars.
  totals.cost +=
    (usage.input_tokens * inputPerMillion +
      usage.output_tokens * outputPerMillion +
      cacheRead * cacheReadPerMillion +
      cacheWrite * cacheWritePerMillion) /
    1e6;
}
// Total tries, including the first one.
const maxAttempts = 4;

/**
 * Sends a single request with the SDK's automatic retries switched off, so the
 * loop below is the only thing that decides whether to try again.
 *
 * @param content - The user message to send.
 * @returns The model's reply.
 */
async function createOnce(content: string): Promise<Anthropic.Message> {
  return client.messages.create(
    { model, max_tokens: 256, messages: [{ role: 'user', content }] },
    { maxRetries: 0 },
  );
}

/**
 * Works out how long to wait before the next attempt.
 *
 * `retry-after` may be either a number of seconds or an HTTP-date; anything
 * missing or unparseable falls back to exponential backoff (1s, 2s, 4s, ...).
 *
 * @param retryAfter - Raw `retry-after` header value, if the response had one.
 * @param attempt - Zero-based index of the attempt that just failed.
 * @returns A finite, non-negative delay in seconds.
 */
function retryDelaySeconds(retryAfter: string | null | undefined, attempt: number): number {
  const fallback = 2 ** attempt;
  if (!retryAfter) return fallback;

  const seconds = Number(retryAfter);
  if (Number.isFinite(seconds)) return Math.max(0, seconds);

  // Not a plain number: try the HTTP-date form and convert it to a delta.
  const when = Date.parse(retryAfter);
  if (Number.isNaN(when)) return fallback;
  return Math.max(0, (when - Date.now()) / 1000);
}

for (let attempt = 0; attempt < maxAttempts; attempt++) {
  try {
    const message = await createOnce('Name one benefit of idempotent tools.');
    const text = message.content.find((b) => b.type === 'text');
    console.log(text?.text ?? '');
    break;
  } catch (error) {
    // Only rate limits are retried here. Everything else is rethrown, and with
    // `maxRetries: 0` nothing further down will retry it either.
    const isLastAttempt = attempt === maxAttempts - 1;
    if (!(error instanceof Anthropic.RateLimitError) || isLastAttempt) throw error;
    const wait = retryDelaySeconds(error.headers?.get('retry-after'), attempt);
    console.log(`rate limited; waiting ${wait}s`);
    await new Promise((resolve) => setTimeout(resolve, wait * 1000));
  }
}
/** The slice of a Telegram `Update` this server reads. */
type Update = { message?: { chat: { id: number }; text?: string } };

// `secret` falls back to a well-known development value when the environment
// variable is unset. Refuse to start like that in production, where anyone who
// has read this example could otherwise forge requests.
if (process.env.NODE_ENV === 'production' && !process.env.TELEGRAM_WEBHOOK_SECRET) {
  throw new Error('TELEGRAM_WEBHOOK_SECRET must be set when NODE_ENV=production');
}

/**
 * Webhook endpoint: authenticates each POST by its secret-token header, then
 * sends the message text to the model and logs the reply.
 *
 * Responses: `ok` for non-POST requests and handled updates, 403 when the
 * secret header does not match, 400 when the body is not valid JSON.
 */
const server = Bun.serve({
  port: 3000,
  async fetch(req) {
    if (req.method !== 'POST') return new Response('ok');
    if (req.headers.get('x-telegram-bot-api-secret-token') !== secret) {
      return new Response('forbidden', { status: 403 });
    }

    // The body is untrusted input: a parse failure is the caller's error (400),
    // not an unhandled exception in the handler.
    let update: Update | null;
    try {
      update = (await req.json()) as Update | null;
    } catch {
      return new Response('bad request', { status: 400 });
    }

    // Valid JSON can still be `null` or carry a non-string `text`; only a
    // non-empty string is worth a model call.
    const text = update?.message?.text;
    if (typeof text === 'string' && text !== '') {
      const reply = await client.messages.create({
        model,
        max_tokens: 256,
        messages: [{ role: 'user', content: text }],
      });
      const block = reply.content.find((b) => b.type === 'text');
      console.log(`chat ${update?.message?.chat?.id}: ${block?.text ?? ''}`);
    }
    return new Response('ok');
  },
});

console.log(`listening on :${server.port}`);