diff --git a/.claude/skills/write-chapter/SKILL.md b/.claude/skills/write-chapter/SKILL.md
index e7af83d..8b6531e 100644
--- a/.claude/skills/write-chapter/SKILL.md
+++ b/.claude/skills/write-chapter/SKILL.md
@@ -79,7 +79,7 @@ Add a comment on the issue linking the PR (`gh issue comment <N> --body ...`, or
 ### 7. Verify
 - Run every example end-to-end - do not just typecheck. Bun auto-loads a `.env` at the repo root for `.ts` files, so after `bun install` (once) run `bun run <file>` for each sample and paste the real output into the PR. Shell samples do not get `.env` auto-loaded - source it first: `set -a; . ./.env; set +a; bash <file>`.
 - `bunx tsc --noEmit` must be clean, and the docs must build so snippet imports resolve: `bun x vitepress@2.0.0-alpha.17 build` (a broken `<<< @/examples/...` path fails the build).
-- Check the chapter is within budget and paste the numbers in the PR: `wc -l chapters/NN-slug.md` (<=150) and `grep -c '^## ' chapters/NN-slug.md` (<=4 main-line H2s plus an optional What's next closer). Spot-check that each sample is <=35 lines with comment:code <=0.30.
+- Check the chapter is within budget and paste the numbers in the PR: `wc -l chapters/NN-slug.md` (<=150) and `grep -c '^## ' chapters/NN-slug.md` (<=4 main-line H2s plus an optional What's next closer). Spot-check that each sample is readable (one statement per line, no golfed one-liners or comma-operator sequencing) and within budget - aim <=70 lines, hard cap 100 (`wc -l`), comment:code <=0.40.
 - Only if no API credentials are available may you skip the live run - say so explicitly in the PR, and never claim the code runs if it was not executed.
 
 ## Special cases (accuracy)
@@ -186,6 +186,7 @@ Find and FIX every instance of:
 - Anything that would fail under bun run (syntax, type, import errors, missing await).
 - Placeholders, TODOs, or incomplete logic.
 - Non-ASCII punctuation.
+- Golfed/compressed code: any line that stacks multiple statements, sequences side effects with the comma operator, or inlines a multi-step expression to save a line. Re-expand to one statement per line (a multi-line \`async function\` over a dense one-line arrow), even if that grows the file - up to the 100-line hard cap. A correct-but-dense one-liner is a defect here, not a pass.
 
 For cutoff-sensitive APIs (extended thinking, citations source schema, fine-tuning), verify the exact shape against current Anthropic docs; if you cannot verify, set ok to false and explain in issues.
 
@@ -218,6 +219,7 @@ Scale the run to the issue: a small chapter is a handful of agents; a large one
 - Provider-agnostic samples: the same file must run unchanged against Anthropic direct OR any Anthropic-compatible gateway (for example Z.ai). Never hardcode a model id or a base URL. Read the model from the environment with a Claude fallback - `process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6'` (and `ANTHROPIC_DEFAULT_OPUS_MODEL` / `ANTHROPIC_DEFAULT_HAIKU_MODEL` for the other tiers) - and let `new Anthropic()` pick up the key, token, and base URL. Shell/curl samples read `ANTHROPIC_BASE_URL` and send `Authorization: Bearer $ANTHROPIC_AUTH_TOKEN` when that token is set, otherwise `x-api-key: $ANTHROPIC_API_KEY`.
 - Auth and runtime env vars are introduced and explained once, in Chapter 1: `ANTHROPIC_API_KEY` (sent as `x-api-key`, Anthropic direct), `ANTHROPIC_AUTH_TOKEN` (sent as `Authorization: Bearer`, used by some compatible providers), and `ANTHROPIC_BASE_URL` (which endpoint to call). Later chapters assume them; only mention an env var a sample actually uses.
 - TypeScript style: prefer `type` over `interface`; never use `unknown` or index signatures; reuse SDK-exported types (`Anthropic.MessageParam`, `Anthropic.Tool`, `Anthropic.ToolUseBlock`, etc.).
+- Loop idioms: for an unbounded produce-then-consume loop (long-polling an API, draining an event stream), prefer an async generator - `async function* poll()` that `yield`s, consumed with `for await (const x of poll())` - which separates transport from handling and matches the `for await...of` style used elsewhere. Where a generator does not fit, use `while (true)`. Never use `for (;;)`.
 - ASCII punctuation only: `-`, `->`, `...`. No em dashes, no smart quotes. (This governs punctuation; emoji are allowed in chapter prose per the visual-aids rule, but code and example files stay ASCII-only.)
 - Each example is standalone and runnable on its own. Begin each file with a short header comment giving the run command (for example: `// bun run examples/05-tools/define-tool.ts`).
 - Chapters are rendered by VitePress and published to GitHub Pages. In chapter prose, show a sample's full source with a VitePress snippet import (`<<< @/examples/NN-slug/file.ts`) on its own line, NOT by pasting the code into a fenced block - this keeps the rendered docs in lockstep with the runnable file. Inline fenced blocks are only for short illustrative fragments. The runnable file under `examples/` is the single source of truth; prose must not contradict it. After editing chapters, the site builds with `bun x vitepress@2.0.0-alpha.17 build`.
@@ -233,5 +235,6 @@ Scale the run to the issue: a small chapter is a handful of agents; a large one
 - Going-deeper asides: secondary material (extra providers, full configs, full taxonomies) goes in a `::: details` block (or a `::: tip`/`::: info` callout), never a main-line H2. The main line must read complete if every aside is collapsed.
 - Visual aids, used sparingly (seasoning, not structure): VitePress callout containers (`::: tip`, `::: info`, `::: warning`, `::: details`) for asides; AT MOST one small ASCII diagram per chapter, and only where a picture genuinely beats a sentence; a small Markdown table when comparing a short list of options (for example env vars or model tiers).
 - Config lives in the repo, not the prose: no `package.json`/`tsconfig.json` JSON dumps in a chapter - one sentence plus the run command, and note the repo already ships the scaffold so a follow-along reader can just run the file.
-- Example code budget: each sample <=35 lines and comment:code ratio <=0.30 (a comment line's first token is `//` or `#`; an end-of-line comment counts as code). The header comment is the run command and nothing else. No numbered `// 1) ... // 2) ...` blocks over `console.log` groups, and no reference tables inside code files.
+- Example code budget: keep each sample focused and single-concept; verbose, explicit code is welcome - favor clarity over brevity. Aim for <=70 lines (HARD CAP 100, `wc -l`) with comment:code ratio <=0.40 (a comment line's first token is `//` or `#`; an end-of-line comment counts as code). The header comment is the run command and nothing else. No numbered `// 1) ... // 2) ...` blocks over `console.log` groups, and no reference tables inside code files.
+- Readability outranks the line count - never golf a sample to hit the budget. One statement per line: do NOT join multiple statements with `;` on one line, do NOT use the comma operator to sequence side effects (`(last = now), edit(...)`), and do NOT inline a multi-step expression (for example `(await fetch(...)).json()` with method/headers/body options) purely to save a line. A normal multi-line `async function` helper beats a dense one-line arrow. The standalone-file rule means each Telegram/`fetch` sample re-pastes its own helper, and verbose, explicit style is encouraged - that is exactly why the budget is generous (up to 100 lines) for samples that need it. If a sample still cannot fit while staying readable, cut its scope (or, when writing the issue, split it into two samples) - compression is never the answer.
 - Friendliness floor: address the reader as "you" (never "the user" or "one"); the intro and at least one section open with a warm, second-person sentence. Terse is not the same as friendly.
diff --git a/.vitepress/config.ts b/.vitepress/config.ts
index 720a7bb..a632bc5 100644
--- a/.vitepress/config.ts
+++ b/.vitepress/config.ts
@@ -60,6 +60,10 @@ export default {
             text: '3. Handling User Requests: REPL and Telegram Bot',
             link: '/chapters/03-repl-telegram',
           },
+          {
+            text: '4. Context and Conversation Management',
+            link: '/chapters/04-context',
+          },
         ],
       },
     ],
diff --git a/README.md b/README.md
index 0334593..c5309f2 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ Published with VitePress at https://yagop.github.io/coding-agents-tutorial/. Eac
 1. [The Claude SDK and Your First API Request](chapters/01-sdk-first-request.md)
 2. [Streaming Responses and Message Types](chapters/02-streaming.md)
 3. [Handling User Requests: REPL and Telegram Bot](chapters/03-repl-telegram.md)
+4. [Context and Conversation Management](chapters/04-context.md)
 
 More chapters are tracked as issues and land here as they are written.
 
diff --git a/chapters/04-context.md b/chapters/04-context.md
new file mode 100644
index 0000000..3111c9c
--- /dev/null
+++ b/chapters/04-context.md
@@ -0,0 +1,68 @@
+# Context and Conversation Management
+
+🧠 Your bot from Chapter 3 already remembers a conversation - but only because you happened to keep one `messages` array alive in a single process. Pull the plug and the memory is gone; open a second chat and the two talk over each other. The reason is worth saying plainly: the Messages API is *stateless*. Every `client.messages.create` call must carry the entire conversation with it, because the server keeps nothing between requests. So the memory has to live in your code, and this chapter is about holding it well - across turns, across the model's context-window ceiling, across users, and across restarts.
+
+You already have `new Anthropic()`, the env vars, and content-block narrowing from Chapter 1, plus the REPL loop and Telegram token from Chapter 3, so here you only add the stateful layer on top. As always the key comes from the environment - never hardcode it - and Bun auto-loads `.env`, so there is nothing to import.
+
+## History and the system prompt
+
+The shape of memory is a list of turns: a `messages` array of `Anthropic.MessageParam`, strictly alternating `role: 'user'` and `role: 'assistant'`, where each `content` is a string or an array of content blocks (`text`, `tool_use`, `tool_result`). The one rule that keeps it valid is alternation, and the one move that maintains it is this: after each call, push `response.content` straight back as an `assistant` turn before you add the next `user` message.
+
+<<< @/examples/04-context/multi-turn.ts
+
+Notice that the assistant turn is `response.content` *unchanged* - the same block array the model returned - so the next call sees the full, faithful history. The history grows by two messages each round, one `user` and one `assistant`, so the printed turn number (`messages.length / 2`) climbs by one - the alternation made visible. Run the first sample to watch it grow:
+
+```sh
+bun run examples/04-context/multi-turn.ts
+```
+
+Persona and standing instructions do not belong in a turn - they belong in `system`, which sits outside the alternation and is sent with every request. You can pass `system` as a plain string or as an array of `text` blocks; the array form is what you will want the moment caching enters the picture.
+
+<<< @/examples/04-context/system-prompt.ts
+
+The persona rides along on both turns - once as a plain string, once as a block array - so it holds without you ever restating it inside a `user` message. String or block array, the model reads them identically - the array just gives you a place to attach `cache_control` later.
+
+## Counting tokens and trimming
+
+Here is the wall you will eventually hit: every model has a finite context window, and a conversation that runs long enough will overflow it. You get ahead of that by measuring before you send. `client.messages.countTokens` takes the same `model`, `system`, `messages`, and `tools` you are about to pass to `create`, and returns the input-token count - so you can branch *before* spending a request.
+
+<<< @/examples/04-context/token-counter.ts
+
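+The sample calls `client.models.retrieve` only to confirm the model and print its display name; the threshold itself is a deliberately tiny constant so these short demo turns actually trip the trim. In a real bot, set the threshold per model, because a window that is generous on one tier is tight on another. When the count crosses it, a rolling window drops the oldest turns, one `user`/`assistant` pair at a time, until the count fits or only the newest turn is left.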
+
+::: warning Trim in pairs, never one side alone
+Every trim must remove a `user` and its `assistant` reply *together*. Drop one side and you break alternation - two `user` turns in a row, or an `assistant` with nothing before it - and the next `create` call rejects the whole array. The window slides by two, always.
+:::
+
+A rolling window is cheap but forgetful: it throws away the early turns wholesale. When those early turns still matter, summarize instead. You call `create` once with a summarize instruction over the old turns, then replace that whole stretch with a single injected `user`/`assistant` pair carrying the summary - history compressed, alternation intact.
+
+<<< @/examples/04-context/summarize-history.ts
+
+The injected pair *is* the new beginning of `messages`: one short `user` turn that announces the summary, one `assistant` turn that holds it. Everything before it is gone, but its meaning rides forward in far fewer tokens.
+
+## Prompt caching
+
+When a large, stable chunk of context rides along on every request - a long system prompt, a file you keep referencing - you are paying to re-process the same tokens each time. Prompt caching fixes that: add `cache_control: { type: 'ephemeral' }` to the final `text` block of your `system` array (or to a large stable `user` turn), and the model caches everything up to that point. Later requests that share the prefix read it from cache instead of reprocessing it, which cuts both latency and cost.
+
+<<< @/examples/04-context/prompt-cache.ts
+
+The first request reports `cache_creation_input_tokens` as it writes the cache; the second, identical request reports a non-zero `cache_read_input_tokens` - the prefix served from cache, paid for once. Two conditions make or break this:
+
+| Requirement | Detail |
+| --- | --- |
+| Minimum size | The cached prefix must exceed the model's floor: ~4096 tokens on Opus 4.x and Haiku 4.5, ~2048 on Sonnet 4.6. Smaller prefixes are never cached. |
+| Byte-stability | The prefix must be byte-for-byte identical across requests. Slip a changing value - a timestamp, a counter - into the cached block and it silently no-ops, charging full price with no warning. |
+
+::: tip Verify, don't trust
+Caching fails quietly, so always confirm it by reading `usage.cache_read_input_tokens` on the second request. Zero where you expected a hit means your prefix changed or fell under the minimum.
+:::
+
+## Per-user sessions that survive restarts
+
+Now back to the bot, with everything above in hand. One process serves many chats, so one shared `messages` array will not do - each Telegram `chat.id` needs its own history. You key an in-memory `Map` by `chat.id`, and to outlast a restart you serialize that `Map` to a JSON file and load it back on startup.
+
+<<< @/examples/04-context/telegram-sessions.ts
+
+Each chat's `Anthropic.MessageParam[]` lives under its own key, so two people never bleed into each other's context. Writing `sessions.json` after each turn means a crash or a redeploy costs you nothing - the histories are read back the next time the bot wakes, exactly where they left off.
+
+What's next: Chapter 5 - Implementing Tools and Function Calling.
diff --git a/examples/03-repl-telegram/telegram-bot.ts b/examples/03-repl-telegram/telegram-bot.ts
index df7dc2b..cbff2b1 100644
--- a/examples/03-repl-telegram/telegram-bot.ts
+++ b/examples/03-repl-telegram/telegram-bot.ts
@@ -3,33 +3,59 @@
 import Anthropic from '@anthropic-ai/sdk';
 
 const token = process.env.TELEGRAM_BOT_TOKEN;
-if (!token) throw new Error('Set TELEGRAM_BOT_TOKEN in your .env (from BotFather).');
+if (!token) {
+  throw new Error('Set TELEGRAM_BOT_TOKEN in your .env (from BotFather).');
+}
+
 const base = `https://api.telegram.org/bot${token}`;
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+type Update = {
+  update_id: number;
+  message?: {
+    chat: { id: number };
+    text?: string;
+  };
+};
 
-type Update = { update_id: number; message?: { chat: { id: number }; text?: string } };
-type TgResult<T> = { ok: boolean; result?: T; description?: string };
+type TgResult<T> = {
+  ok: boolean;
+  result?: T;
+  description?: string;
+};
 
-async function call<T>(method: string, body: object): Promise<TgResult<T>> {
-  const res = await fetch(`${base}/${method}`, { method: 'POST', headers: { 'content-type': 'application/json' }, body: JSON.stringify(body) });
-  return res.json() as Promise<TgResult<T>>;
+async function tg<T>(method: string, body: object): Promise<TgResult<T>> {
+  const response = await fetch(`${base}/${method}`, {
+    method: 'POST',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify(body),
+  });
+  return response.json() as Promise<TgResult<T>>;
 }
 
-const client = new Anthropic();
-let offset = 0; // last update_id + 1, so each update arrives exactly once
-
-while (true) {
-  const { result } = await call<Update[]>('getUpdates', { offset, timeout: 30 });
-  for (const update of result ?? []) {
-    offset = update.update_id + 1;
-    const message = update.message;
-    if (!message?.text) continue;
-    const reply = await client.messages.create({
-      model: process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6',
-      max_tokens: 1024,
-      messages: [{ role: 'user', content: message.text }],
-    });
-    const first = reply.content[0];
-    const answer = first?.type === 'text' ? first.text : '...';
-    await call('sendMessage', { chat_id: message.chat.id, text: answer });
+async function* pollUpdates(): AsyncGenerator<Update> {
+  let offset = 0;
+  while (true) {
+    const { result = [] } = await tg<Update[]>('getUpdates', { offset, timeout: 30 });
+    for (const update of result) {
+      offset = update.update_id + 1;
+      yield update;
+    }
+  }
+}
+
+for await (const update of pollUpdates()) {
+  const message = update.message;
+  if (!message?.text) {
+    continue;
   }
+  const reply = await client.messages.create({
+    model,
+    max_tokens: 1024,
+    messages: [{ role: 'user', content: message.text }],
+  });
+  const first = reply.content[0];
+  const answer = first?.type === 'text' ? first.text : '...';
+  await tg('sendMessage', { chat_id: message.chat.id, text: answer });
 }
diff --git a/examples/03-repl-telegram/telegram-stream.ts b/examples/03-repl-telegram/telegram-stream.ts
index 91a5809..131712a 100644
--- a/examples/03-repl-telegram/telegram-stream.ts
+++ b/examples/03-repl-telegram/telegram-stream.ts
@@ -1,35 +1,93 @@
 // bun run examples/03-repl-telegram/telegram-stream.ts
+
 import Anthropic from '@anthropic-ai/sdk';
 
 const token = process.env.TELEGRAM_BOT_TOKEN;
-if (!token) throw new Error('Set TELEGRAM_BOT_TOKEN in your .env');
+if (!token) {
+  throw new Error('Set TELEGRAM_BOT_TOKEN in your .env');
+}
+
 const base = `https://api.telegram.org/bot${token}`;
 const client = new Anthropic();
-type Update = { update_id: number; message?: { chat: { id: number }; text?: string } };
-type TgResult<T> = { ok: boolean; result?: T; error_code: number; description: string; parameters?: { retry_after?: number } };
-const tg = async <T>(method: string, body: object): Promise<TgResult<T>> =>
-  (await fetch(`${base}/${method}`, { method: 'POST', headers: { 'content-type': 'application/json' }, body: JSON.stringify(body) })).json() as Promise<TgResult<T>>;
-async function edit(chat_id: number, message_id: number, text: string) {
-  const res = await tg('editMessageText', { chat_id, message_id, text });
-  if (res.ok || res.description?.includes('not modified') || res.error_code !== 429) return;
-  await new Promise((r) => setTimeout(r, (res.parameters?.retry_after ?? 1) * 1000));
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+type Update = {
+  update_id: number;
+  message?: {
+    chat: { id: number };
+    text?: string;
+  };
+};
+
+type TgResult<T> = {
+  ok: boolean;
+  result?: T;
+  error_code: number;
+  description: string;
+  parameters?: { retry_after?: number };
+};
+
+async function tg<T>(method: string, body: object): Promise<TgResult<T>> {
+  const response = await fetch(`${base}/${method}`, {
+    method: 'POST',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify(body),
+  });
+  return response.json() as Promise<TgResult<T>>;
+}
+
+async function* pollUpdates(): AsyncGenerator<Update> {
+  let offset = 0;
+  while (true) {
+    const { result = [] } = await tg<Update[]>('getUpdates', { offset, timeout: 30 });
+    for (const update of result) {
+      offset = update.update_id + 1;
+      yield update;
+    }
+  }
+}
+
+async function edit(chat_id: number, message_id: number, text: string): Promise<void> {
+  const result = await tg('editMessageText', { chat_id, message_id, text });
+  if (result.ok || result.description?.includes('not modified') || result.error_code !== 429) {
+    return;
+  }
+  const retryAfter = result.parameters?.retry_after ?? 1;
+  await new Promise((resolve) => setTimeout(resolve, retryAfter * 1000));
   await edit(chat_id, message_id, text);
 }
-let offset = 0;
-for (;;) {
-  const { result = [] } = await tg<Update[]>('getUpdates', { offset, timeout: 30 });
-  for (const u of result) {
-    offset = u.update_id + 1;
-    const chat_id = u.message?.chat.id;
-    const prompt = u.message?.text?.trim();
-    if (chat_id === undefined || !prompt) continue;
-    const sent = await tg<{ message_id: number }>('sendMessage', { chat_id, text: '...' });
-    const message_id = sent.result!.message_id;
-    const stream = client.messages.stream({ model: process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6', max_tokens: 1024, messages: [{ role: 'user', content: prompt }] });
-    let text = '', last = 0;
-    // Editing per token trips Telegram's flood limit instantly - batch deltas, edit at most ~1/sec.
-    stream.on('text', (delta) => { text += delta; if (Date.now() - last >= 1000) (last = Date.now()), edit(chat_id, message_id, text); });
-    await stream.finalMessage();
-    await edit(chat_id, message_id, text);
+
+for await (const update of pollUpdates()) {
+  const chat_id = update.message?.chat.id;
+  const prompt = update.message?.text?.trim();
+  if (chat_id === undefined || !prompt) {
+    continue;
   }
+
+  const placeholder = await tg<{ message_id: number }>('sendMessage', {
+    chat_id,
+    text: '...',
+  });
+  const message_id = placeholder.result!.message_id;
+
+  const stream = client.messages.stream({
+    model,
+    max_tokens: 1024,
+    messages: [{ role: 'user', content: prompt }],
+  });
+
+  let text = '';
+  let lastEdit = 0;
+  // Editing on every token trips the flood limit; batch deltas and edit at most ~1/sec. These edits are not awaited, so log failures instead of leaving the rejection unhandled.
+  stream.on('text', (delta) => {
+    text += delta;
+    if (Date.now() - lastEdit < 1000) {
+      return;
+    }
+    lastEdit = Date.now();
+    edit(chat_id, message_id, text).catch(console.error);
+  });
+
+  await stream.finalMessage();
+  await edit(chat_id, message_id, text);
 }
diff --git a/examples/04-context/multi-turn.ts b/examples/04-context/multi-turn.ts
new file mode 100644
index 0000000..d6be76c
--- /dev/null
+++ b/examples/04-context/multi-turn.ts
@@ -0,0 +1,30 @@
+// bun run examples/04-context/multi-turn.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+// The API is stateless: this array is the entire memory you resend every call.
+const messages: Anthropic.MessageParam[] = [];
+
+const turns = [
+  'My favorite color is teal. Remember it.',
+  'What is 12 times 11?',
+  'What was my favorite color again?',
+];
+
+for (const text of turns) {
+  messages.push({ role: 'user', content: text });
+  const message = await client.messages.create({ model, max_tokens: 256, messages });
+
+  // Push the response.content block array straight back as the assistant turn.
+  messages.push({ role: 'assistant', content: message.content });
+
+  const first = message.content[0];
+  const reply = first?.type === 'text' ? first.text : '';
+  console.log(`turn ${messages.length / 2} | you: ${text}`);
+  console.log(`claude: ${reply}\n`);
+}
+
+console.log(`history holds ${messages.length} messages across ${messages.length / 2} turns`);
diff --git a/examples/04-context/prompt-cache.ts b/examples/04-context/prompt-cache.ts
new file mode 100644
index 0000000..aba4c63
--- /dev/null
+++ b/examples/04-context/prompt-cache.ts
@@ -0,0 +1,21 @@
+// bun run examples/04-context/prompt-cache.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+// A byte-stable prefix past Sonnet's ~2048-token minimum: no timestamp or random value, or the cache silently misses.
+const stable = 'You are a meticulous code reviewer. '.repeat(900);
+const system: Anthropic.TextBlockParam[] = [{ type: 'text', text: stable, cache_control: { type: 'ephemeral' } }];
+const messages: Anthropic.MessageParam[] = [{ role: 'user', content: 'Reply with the single word: ok.' }];
+
+async function ask(label: string) {
+  const message = await client.messages.create({ model, max_tokens: 16, system, messages });
+  const { cache_creation_input_tokens, cache_read_input_tokens } = message.usage;
+  console.log(`${label} created=${cache_creation_input_tokens ?? 0} read=${cache_read_input_tokens ?? 0}`);
+}
+
+// First request writes the cache; the second, identical request reads it back.
+await ask('request 1');
+await ask('request 2');
diff --git a/examples/04-context/summarize-history.ts b/examples/04-context/summarize-history.ts
new file mode 100644
index 0000000..f8d2911
--- /dev/null
+++ b/examples/04-context/summarize-history.ts
@@ -0,0 +1,35 @@
+// bun run examples/04-context/summarize-history.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+// A pretend-long history that in a real bot grows turn by turn.
+const messages: Anthropic.MessageParam[] = [
+  { role: 'user', content: 'I am Ada, planning a five-night Lisbon trip in early May on a tight budget.' },
+  { role: 'assistant', content: 'Got it: Ada, Lisbon, early May, five nights, budget-conscious.' },
+  { role: 'user', content: 'Vegetarian, and I want to be near the tram lines.' },
+  { role: 'assistant', content: 'Noted: vegetarian, lodging close to the historic tram routes.' },
+];
+// Replace the old turns with one user/assistant pair so roles still alternate.
+async function summarize(history: Anthropic.MessageParam[]): Promise<Anthropic.MessageParam[]> {
+  const transcript = history.map((m) => `${m.role}: ${m.content}`).join('\n');
+  const summary = await client.messages.create({
+    model,
+    max_tokens: 512,
+    messages: [{ role: 'user', content: `Summarize this conversation as durable memory:\n\n${transcript}` }],
+  });
+  const first = summary.content[0];
+  return [
+    { role: 'user', content: 'Here is a summary of our earlier conversation.' },
+    { role: 'assistant', content: first?.type === 'text' ? first.text : '' },
+  ];
+}
+const threshold = 30;
+const { input_tokens } = await client.messages.countTokens({ model, messages });
+console.log(`history is ${input_tokens} tokens; threshold ${threshold}`);
+if (input_tokens > threshold) {
+  const compacted = await summarize(messages);
+  console.log(`compacted ${messages.length} turns into ${compacted.length}:`);
+  for (const m of compacted) console.log(`  ${m.role}: ${m.content}`);
+}
diff --git a/examples/04-context/system-prompt.ts b/examples/04-context/system-prompt.ts
new file mode 100644
index 0000000..543986c
--- /dev/null
+++ b/examples/04-context/system-prompt.ts
@@ -0,0 +1,28 @@
+// bun run examples/04-context/system-prompt.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+// A bare string and a block array set the same persona; pick whichever reads cleaner.
+const asString = 'You are Captain Reef, a pirate. Answer in one sentence and end with "Arr!".';
+const asBlocks: Anthropic.TextBlockParam[] = [
+  { type: 'text', text: 'You are Captain Reef, a pirate.' },
+  { type: 'text', text: 'Answer in one sentence and end with "Arr!".' },
+];
+
+const messages: Anthropic.MessageParam[] = [];
+
+async function turn(system: string | Anthropic.TextBlockParam[], question: string) {
+  messages.push({ role: 'user', content: question });
+  const message = await client.messages.create({ model, max_tokens: 256, system, messages });
+  const first = message.content[0];
+  const reply = first?.type === 'text' ? first.text : '';
+  messages.push({ role: 'assistant', content: reply });
+  console.log(`> ${question}\n${reply}\n`);
+}
+
+// Same system on both turns: the persona and the "Arr!" constraint should survive the follow-up.
+await turn(asString, 'What is a variable?');
+await turn(asBlocks, 'And a function?');
diff --git a/examples/04-context/telegram-sessions.ts b/examples/04-context/telegram-sessions.ts
new file mode 100644
index 0000000..0fd90a2
--- /dev/null
+++ b/examples/04-context/telegram-sessions.ts
@@ -0,0 +1,83 @@
+// bun run examples/04-context/telegram-sessions.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const token = process.env.TELEGRAM_BOT_TOKEN;
+if (!token) {
+  throw new Error('Set TELEGRAM_BOT_TOKEN in your .env');
+}
+
+const base = `https://api.telegram.org/bot${token}`;
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+type Update = {
+  update_id: number;
+  message?: {
+    chat: { id: number };
+    text?: string;
+  };
+};
+
+type Entry = [number, Anthropic.MessageParam[]];
+
+// POST a JSON body to one Bot API method and return the parsed response.
+async function tg<T>(method: string, body: object): Promise<{ result?: T }> {
+  const response = await fetch(`${base}/${method}`, {
+    method: 'POST',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify(body),
+  });
+  return response.json() as Promise<{ result?: T }>;
+}
+
+// Long-poll getUpdates forever, yielding one update at a time.
+async function* pollUpdates(): AsyncGenerator<Update> {
+  let offset = 0;
+  while (true) {
+    const { result = [] } = await tg<Update[]>('getUpdates', { offset, timeout: 30 });
+    for (const update of result) {
+      offset = update.update_id + 1;
+      yield update;
+    }
+  }
+}
+
+const file = `${import.meta.dir}/sessions.json`;
+const sessions = new Map<number, Anthropic.MessageParam[]>();
+if (await Bun.file(file).exists()) {
+  const saved = (await Bun.file(file).json()) as Entry[];
+  for (const [chatId, history] of saved) {
+    sessions.set(chatId, history);
+  }
+}
+
+async function persist(): Promise<void> {
+  const entries: Entry[] = [...sessions];
+  await Bun.write(file, JSON.stringify(entries, null, 2));
+}
+
+for await (const update of pollUpdates()) {
+  const chatId = update.message?.chat.id;
+  const prompt = update.message?.text?.trim();
+  if (chatId === undefined || !prompt) {
+    continue;
+  }
+
+  const history = sessions.get(chatId) ?? [];
+  history.push({ role: 'user', content: prompt });
+
+  const reply = await client.messages.create({
+    model,
+    max_tokens: 1024,
+    messages: history,
+  });
+  history.push({ role: 'assistant', content: reply.content });
+
+  sessions.set(chatId, history);
+  await persist();
+
+  const first = reply.content[0];
+  const answer = first?.type === 'text' ? first.text : '...';
+  await tg('sendMessage', { chat_id: chatId, text: answer });
+}
diff --git a/examples/04-context/token-counter.ts b/examples/04-context/token-counter.ts
new file mode 100644
index 0000000..f2f6347
--- /dev/null
+++ b/examples/04-context/token-counter.ts
@@ -0,0 +1,31 @@
+// bun run examples/04-context/token-counter.ts
+
+import Anthropic from '@anthropic-ai/sdk';
+
+const client = new Anthropic();
+const model = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL ?? 'claude-sonnet-4-6';
+
+// retrieve confirms the model and gives you a label to log; it is not a budget source.
+const info = await client.models.retrieve(model);
+
+// A deliberately small threshold so these short demo turns actually trip the trim.
+const THRESHOLD = 40;
+const system = 'You are a terse assistant. Reply in one short sentence.';
+console.log(`counting against ${info.display_name}, trimming above ${THRESHOLD} tokens`);
+
+const messages: Anthropic.MessageParam[] = [];
+for (const turn of ['Name a planet.', 'Another?', 'And one more?', 'Last one?']) {
+  messages.push({ role: 'user', content: turn });
+
+  // Count the exact payload create will send, then roll the window under threshold.
+  let { input_tokens } = await client.messages.countTokens({ model, system, messages });
+  while (input_tokens > THRESHOLD && messages.length > 2) {
+    messages.splice(0, 2);
+    ({ input_tokens } = await client.messages.countTokens({ model, system, messages }));
+  }
+  console.log(`tokens=${input_tokens} window=${messages.length} msgs`);
+
+  const reply = await client.messages.create({ model, max_tokens: 64, system, messages });
+  const first = reply.content[0];
+  messages.push({ role: 'assistant', content: first?.type === 'text' ? first.text : '' });
+}