yagop · yagop · Jun 17, 2026
diff --git a/.vitepress/config.ts b/.vitepress/config.ts
@@ -76,6 +76,10 @@ export default {
             text: '7. Advanced Agent Patterns',
             link: '/chapters/07-advanced-patterns',
           },
+          {
+            text: '8. Production and Deployment',
+            link: '/chapters/08-production',
+          },
         ],
       },
     ],

diff --git a/README.md b/README.md
@@ -13,6 +13,7 @@ Published with VitePress at https://yagop.github.io/coding-agents-tutorial/. Eac
 5. [Implementing Tools and Function Calling](chapters/05-tools.md)
 6. [Building Tool Chains and Complex Workflows](chapters/06-tool-chains.md)
 7. [Advanced Agent Patterns](chapters/07-advanced-patterns.md)
+8. [Production and Deployment](chapters/08-production.md)
 
 More chapters are tracked as issues and land here as they are written.
 

diff --git a/chapters/08-production.md b/chapters/08-production.md
@@ -0,0 +1,61 @@
+# Production and Deployment
+
+🚢 The agent works on your machine; now you ship it. Production is where the happy path stops being the only path - the network blips, the rate limit bites, the bill creeps up, and the process has to live somewhere other than your terminal. This chapter wraps the agent in production armor: a real error strategy, retries and timeouts, rate-limit-aware concurrency, observability, cost control, and a way to deploy.
+
+You already have a working agent from Chapters 1-7, so here you only harden it. As always the key comes from the environment - never hardcode it - and Bun auto-loads `.env`.
+
+## Handling errors and retries
+
+Every API call can fail, and the SDK hands you a typed hierarchy to tell failures apart. `Anthropic.APIError` is the base; the subclasses you branch on are `RateLimitError`, `APIConnectionError` (with `APIConnectionTimeoutError` beneath it), `AuthenticationError`, and `BadRequestError`. The type tells you what to do: retry connection and rate-limit failures, surface auth and bad-request ones - those will not fix themselves on a retry.
+
+<<< @/examples/08-production/error-handling.ts
+
+Run it to watch the ladder catch a deliberately malformed call:
+
+```sh
+bun run examples/08-production/error-handling.ts
+```
+
+::: warning The retry gotcha
+The SDK already retries connection errors and `408/409/429/5xx` responses with exponential backoff (tune it with `maxRetries`, default 2). So do NOT wrap every call in your own retry loop - you would double up. Set `maxRetries: 0` only when a retry would be harmful - a non-idempotent call whose side effect would happen twice - or when you take over the retries yourself, as shown next.
+:::
+
+When you genuinely own the retries, turn the SDK's off and back off yourself, honoring the `retry-after` header that rides on a 429.
+
+<<< @/examples/08-production/retry-backoff.ts
+
+The per-request `{ maxRetries: 0 }` option hands the retry decision to your loop; everything else stays the SDK default.
+
+## Rate limits and concurrency
+
+Fan out too many requests at once and you will hit the limit. The fix is a small queue with a concurrency cap, watching the `anthropic-ratelimit-requests-remaining` response header to know how much headroom is left.
+
+<<< @/examples/08-production/concurrency-limiter.ts
+
+A fixed pool of workers drains the queue, so no more than `maxConcurrent` calls are ever in flight. The example only records the remaining-requests header; in a real agent, read it to slow down before you hit a wall, and respect `retry-after` when you do.
+
+## Observability and cost
+
+You cannot debug or budget what you cannot see. Pair every call's `request_id` (from `.withResponse()`) with its `usage`, and accumulate the token counts - including `cache_read_input_tokens` - to turn an opaque agent into a line-item bill.
+
+<<< @/examples/08-production/cost-tracker.ts
+
+Multiply each token count by your model's price for a running total. Three levers keep that number down: pick a cheaper model for easy steps, set `max_tokens` conservatively, and put `cache_control: { type: 'ephemeral' }` on stable system prompts and large tool schemas so repeated input is read back from cache at a reduced rate instead of being billed in full on every call.
+
+::: details Going deeper: layered cache breakpoints
+On a long multi-turn conversation you can place several `cache_control` breakpoints - one after the tool schemas, one on the system prompt, and one on the most recent turn - so the cached prefix grows with the chat and each turn re-reads as much as possible from cache.
+:::
+
+## Deploying
+
+Finally, the agent needs a home. A long-running process that polls is the simplest; for real traffic a webhook server is leaner - Telegram POSTs each `Update` to your HTTPS endpoint, and you validate its secret header before doing any work.
+
+<<< @/examples/08-production/webhook-server.ts
+
+Bun's built-in `Bun.serve` is the entire server, and the secret-token check rejects anything that is not Telegram. To run it anywhere, containerize it.
+
+<<< @/examples/08-production/Dockerfile
+
+A multi-stage build keeps the image small, `USER bun` drops root, and the secrets arrive at runtime through `docker run -e` - never copied into a layer.
+
+What's next: Chapter 9 - Advanced Topics: RAG, Prompt Engineering, and Fine-Tuning.
diff --git a/examples/08-production/Dockerfile b/examples/08-production/Dockerfile
@@ -0,0 +1,17 @@
+# docker build -t coding-agent -f examples/08-production/Dockerfile .
+
+FROM oven/bun:1 AS build
+WORKDIR /app
+COPY package.json bun.lock ./
+RUN bun install --frozen-lockfile --production
+COPY . .
+
+FROM oven/bun:1-slim
+WORKDIR /app
+# The official Bun image already ships an unprivileged "bun" user.
+USER bun
+COPY --from=build --chown=bun:bun /app /app
+# Secrets are read from the environment at runtime, never baked into the image:
+# pass ANTHROPIC_API_KEY and TELEGRAM_WEBHOOK_SECRET with `docker run -e`.
+ENV NODE_ENV=production
+CMD ["bun", "run", "examples/08-production/webhook-server.ts"]
diff --git a/examples/08-production/concurrency-limiter.ts b/examples/08-production/concurrency-limiter.ts
@@ -0,0 +1,38 @@
+// bun run examples/08-production/concurrency-limiter.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+const words = ['cat', 'dog', 'bird', 'fish', 'frog'];
+const maxConcurrent = 2;
+// Last value seen in the requests-remaining header; Infinity until a response carries one.
+let remaining = Infinity;
+
+/** Asks for one fact about `word`, records the rate-limit headroom, and prints the first 60 characters. */
+async function ask(word: string): Promise<void> {
+  const prompt = `One fun fact about a ${word}.`;
+  const pending = client.messages.create({ model, max_tokens: 64, messages: [{ role: 'user', content: prompt }] });
+  // withResponse() returns the parsed message together with the raw HTTP response.
+  const { data: message, response: raw } = await pending.withResponse();
+  const headroom = raw.headers.get('anthropic-ratelimit-requests-remaining');
+  if (headroom !== null) remaining = Number(headroom);
+  const firstText = message.content.find((block) => block.type === 'text');
+  console.log(`${word}: ${firstText?.text?.slice(0, 60) ?? ''}`);
+}
+
+// Shared work list: every worker takes from the front of the same array.
+const queue = [...words];
+
+/** Processes queued words one at a time until none are left. */
+async function worker(): Promise<void> {
+  for (let next = queue.shift(); next !== undefined; next = queue.shift()) {
+    await ask(next);
+  }
+}
+
+// Starting exactly maxConcurrent workers is what caps the number of calls in flight.
+const pool = Array.from({ length: maxConcurrent }, () => worker());
+await Promise.all(pool);
+console.log(`requests remaining per headers: ${remaining}`);
diff --git a/examples/08-production/cost-tracker.ts b/examples/08-production/cost-tracker.ts
@@ -0,0 +1,47 @@
+// bun run examples/08-production/cost-tracker.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+// Illustrative per-million-token prices; use your model's real numbers.
+// Cached input is priced separately from regular input: reads are cheaper,
+// writes cost a little more.
+const inputPerMillion = 3;
+const outputPerMillion = 15;
+const cacheReadPerMillion = 0.3;
+const cacheWritePerMillion = 3.75;
+
+const totals = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, cost: 0 };
+
+/**
+ * Adds one response's token usage to the running totals and cost.
+ *
+ * `input_tokens` does not include cached tokens, so cache reads and writes are
+ * counted and priced on their own; both are treated as 0 when the API reports null.
+ */
+function record(usage: Anthropic.Usage): void {
+  const cacheRead = usage.cache_read_input_tokens ?? 0;
+  const cacheWrite = usage.cache_creation_input_tokens ?? 0;
+  totals.input += usage.input_tokens;
+  totals.output += usage.output_tokens;
+  totals.cacheRead += cacheRead;
+  totals.cacheWrite += cacheWrite;
+  totals.cost += (usage.input_tokens / 1e6) * inputPerMillion;
+  totals.cost += (usage.output_tokens / 1e6) * outputPerMillion;
+  totals.cost += (cacheRead / 1e6) * cacheReadPerMillion;
+  totals.cost += (cacheWrite / 1e6) * cacheWritePerMillion;
+}
+
+for (const prompt of ['Define an LLM agent in one sentence.', 'Now in a single word.']) {
+  // withResponse() exposes the request id so each usage line can be matched to a call.
+  const { data: message, request_id } = await client.messages
+    .create({ model, max_tokens: 256, messages: [{ role: 'user', content: prompt }] })
+    .withResponse();
+  record(message.usage);
+  console.log(`request_id=${request_id ?? 'n/a'} input=${message.usage.input_tokens} output=${message.usage.output_tokens}`);
+}
+
+console.log(`input=${totals.input} output=${totals.output} cache_read=${totals.cacheRead} cache_write=${totals.cacheWrite}`);
+console.log(`estimated cost: $${totals.cost.toFixed(5)}`);
diff --git a/examples/08-production/error-handling.ts b/examples/08-production/error-handling.ts
@@ -0,0 +1,38 @@
+// bun run examples/08-production/error-handling.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+/**
+ * Maps an SDK error to the line this example prints for it, or returns
+ * undefined when the value is not an Anthropic API error at all.
+ * Subclasses are tested before their parents so the most specific match wins.
+ */
+function describeFailure(error: unknown): string | undefined {
+  if (error instanceof Anthropic.RateLimitError) return 'rate limited - back off and retry';
+  if (error instanceof Anthropic.APIConnectionTimeoutError) return 'timed out - retry';
+  if (error instanceof Anthropic.APIConnectionError) return 'connection failed - retry';
+  if (error instanceof Anthropic.AuthenticationError) return 'bad credentials - surface, do not retry';
+  if (error instanceof Anthropic.BadRequestError) return `bad request (${error.status}) - fix the call`;
+  if (error instanceof Anthropic.APIError) return `api error ${error.status} request_id=${error.requestID ?? 'n/a'}`;
+  return undefined;
+}
+
+/** Sends one request and prints either the reply text or a one-line diagnosis. */
+async function ask(body: Anthropic.MessageCreateParamsNonStreaming): Promise<void> {
+  try {
+    const { data: message, request_id: requestId } = await client.messages.create(body).withResponse();
+    const firstText = message.content.find((block) => block.type === 'text');
+    console.log(`ok request_id=${requestId ?? 'n/a'}: ${firstText?.text ?? ''}`);
+  } catch (error) {
+    const diagnosis = describeFailure(error);
+    // Anything that is not an SDK error is not ours to classify; let it propagate.
+    if (diagnosis === undefined) throw error;
+    console.log(diagnosis);
+  }
+}
+
+await ask({ model, max_tokens: 64, messages: [{ role: 'user', content: 'Say hello in three words.' }] });
+await ask({ model, max_tokens: 64, messages: [] });
diff --git a/examples/08-production/retry-backoff.ts b/examples/08-production/retry-backoff.ts
@@ -0,0 +1,45 @@
+// bun run examples/08-production/retry-backoff.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+const maxAttempts = 4;
+
+/** Sends one request with the SDK's built-in retries disabled, so the loop below owns every retry. */
+async function createOnce(content: string): Promise<Anthropic.Message> {
+  return client.messages.create(
+    { model, max_tokens: 256, messages: [{ role: 'user', content }] },
+    { maxRetries: 0 },
+  );
+}
+
+/**
+ * Picks how many seconds to wait before the next attempt.
+ *
+ * `retry-after` may be a number of seconds or an HTTP date; anything missing,
+ * unparsable, or negative falls back to exponential backoff (1s, 2s, 4s, ...).
+ */
+function backoffSeconds(retryAfter: string | null | undefined, attempt: number): number {
+  const fallback = 2 ** attempt;
+  if (!retryAfter) return fallback;
+  const seconds = Number(retryAfter);
+  if (Number.isFinite(seconds)) return seconds >= 0 ? seconds : fallback;
+  const untilDate = (Date.parse(retryAfter) - Date.now()) / 1000;
+  return Number.isFinite(untilDate) && untilDate >= 0 ? untilDate : fallback;
+}
+
+for (let attempt = 0; attempt < maxAttempts; attempt++) {
+  try {
+    const message = await createOnce('Name one benefit of idempotent tools.');
+    const text = message.content.find((b) => b.type === 'text');
+    console.log(text?.text ?? '');
+    break;
+  } catch (error) {
+    // Only rate limits are retried here; anything else, or the final attempt, propagates.
+    if (!(error instanceof Anthropic.RateLimitError) || attempt === maxAttempts - 1) throw error;
+    const wait = backoffSeconds(error.headers?.get('retry-after'), attempt);
+    console.log(`rate limited; waiting ${wait}s`);
+    await new Promise((resolve) => setTimeout(resolve, wait * 1000));
+  }
+}
diff --git a/examples/08-production/webhook-server.ts b/examples/08-production/webhook-server.ts
@@ -0,0 +1,43 @@
+// bun run examples/08-production/webhook-server.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+// The fallback secret is public (it is in this file), so it is only acceptable
+// for local runs. Refuse to start in production without a real one.
+const configuredSecret = process.env.TELEGRAM_WEBHOOK_SECRET;
+if (!configuredSecret && process.env.NODE_ENV === 'production') {
+  throw new Error('TELEGRAM_WEBHOOK_SECRET must be set when NODE_ENV=production');
+}
+const secret = configuredSecret || 'dev-secret';
+
+// The subset of a Telegram Update this handler reads.
+type Update = { message?: { chat: { id: number }; text?: string } };
+
+const server = Bun.serve({
+  port: 3000,
+  async fetch(req) {
+    // Non-POST requests are answered without touching the model.
+    if (req.method !== 'POST') return new Response('ok');
+    // Check the shared secret before reading the body or doing any work.
+    if (req.headers.get('x-telegram-bot-api-secret-token') !== secret) {
+      return new Response('forbidden', { status: 403 });
+    }
+    // A body that is not a JSON object is rejected with 400 instead of
+    // throwing out of the handler.
+    const update = (await req.json().catch(() => null)) as Update | null;
+    if (typeof update !== 'object' || update === null) return new Response('bad request', { status: 400 });
+    const text = update.message?.text;
+    if (text) {
+      const reply = await client.messages.create({ model, max_tokens: 256, messages: [{ role: 'user', content: text }] });
+      const block = reply.content.find((b) => b.type === 'text');
+      // This example only logs the reply; it does not send it back to the chat.
+      console.log(`chat ${update.message?.chat.id}: ${block?.text ?? ''}`);
+    }
+    return new Response('ok');
+  },
+});
+
+console.log(`listening on :${server.port}`);