diff --git a/.gitignore b/.gitignore index a989c721f5..3f81c99239 100644 --- a/.gitignore +++ b/.gitignore @@ -101,6 +101,10 @@ out/ wp-files/* !wp-files/blueprints/ +# Data Liberation engine prepared at build time (scripts/prepare-data-liberation.ts). +# Lives under packages/ (its future workspace-package home); gitignored until migrated. +packages/data-liberation-agent/ + # Release Tooling vendor/* /fastlane/report.xml diff --git a/apps/cli/ai/skills/liberate/SKILL.md b/apps/cli/ai/skills/liberate/SKILL.md index 9dedbffa30..ca3c5ee1e6 100644 --- a/apps/cli/ai/skills/liberate/SKILL.md +++ b/apps/cli/ai/skills/liberate/SKILL.md @@ -7,14 +7,9 @@ description: Import and rebuild a website from a closed platform (Wix, Squarespa -This skill is only a **redirect**. The real, always-up-to-date pipeline lives in the Data Liberation engine's own skill, which Studio downloads on first use. Do NOT re-plan or summarize the steps here — defer to the engine skill so its updates take effect automatically. Your job is just to (1) stand the engine up and (2) follow its skill, translating its tool calls into Studio's bridge. +This skill is only a **redirect**. The real, always-up-to-date pipeline lives in the Data Liberation engine's own skill, which ships prebuilt with Studio. Do NOT re-plan or summarize the steps here — defer to the engine skill so its updates take effect automatically. Your job is just to (1) locate the engine and (2) follow its skill, translating its tool calls into Studio's bridge. -## Step 1 — Prepare and locate the engine +## Step 1 — Locate the engine -Tell the user one concise line first, e.g. *"Checking the Data Liberation engine — installing it if this is the first run."* Then call the `data_liberation` tool with **no arguments** (setup mode). It returns `{ ready, alreadyInstalled, engineDir, liberateSkill, skillsDir }`. - -Report accurately based on `alreadyInstalled`: - -- `alreadyInstalled: false` → this run performed the **one-time** install. -- `alreadyInstalled: true` → the engine was already set up; just say it's ready and proceed immediately. +Call the `data_liberation` tool with **no arguments** (setup mode). It returns `{ ready, engineDir, liberateSkill, skillsDir }`. The engine ships prebuilt with Studio, so this is instant — just proceed. 
## Step 2 — Load the engine's tool catalog diff --git a/apps/cli/ai/tools/data-liberation.ts b/apps/cli/ai/tools/data-liberation.ts index 7aee4f7fd5..cfaf6b0bf7 100644 --- a/apps/cli/ai/tools/data-liberation.ts +++ b/apps/cli/ai/tools/data-liberation.ts @@ -1,98 +1,78 @@ -import { spawn } from 'child_process'; +import { execFile } from 'child_process'; import { existsSync } from 'fs'; -import fs from 'fs/promises'; import path from 'path'; +import { promisify } from 'util'; import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js'; import { Type } from 'typebox'; -import { STUDIO_SITES_ROOT } from 'cli/lib/site-paths'; import { defineTool } from './define-tool'; import { textResult } from './utils'; -const ENGINE_DIR = path.join( STUDIO_SITES_ROOT, '_liberations', 'data-liberation' ); -const ENGINE_REPO = 'https://github.com/Automattic/data-liberation-agent.git'; - // Engine operations (extract, screenshot, reconstruct) drive Playwright and // routinely run for minutes — far past the MCP SDK's 60s default request // timeout, which otherwise surfaces as `MCP error -32001: Request timed out`. -// Use a generous, env-overridable per-call timeout. NOTE: this is effectively a -// FLAT cap — the engine emits no MCP progress notifications, so the -// `resetTimeoutOnProgress` flag we pass is currently inert (kept as a no-op -// in case the engine adds notifications later). A single op that exceeds the -// cap still returns -32001 while the engine keeps running in the background; -// the `/liberate` skill handles that by polling `liberate_status` rather than -// re-invoking. +// Use a generous per-call timeout. NOTE: this is effectively a FLAT cap — the +// engine emits no MCP progress notifications, so the `resetTimeoutOnProgress` +// flag we pass is currently inert (kept as a no-op in case the engine adds +// notifications later). 
A single op that exceeds the cap still returns -32001 +// while the engine keeps running in the background; the `/liberate` skill +// handles that by polling `liberate_status` rather than re-invoking. const ENGINE_CALL_TIMEOUT_MS = 600_000; -function appendBoundedOutput( current: string, chunk: unknown ): string { - const PROCESS_OUTPUT_LIMIT = 2000; - - return ( current + String( chunk ) ).slice( -PROCESS_OUTPUT_LIMIT ); -} - -function runProcess( command: string, args: string[], cwd: string ): Promise< void > { - return new Promise( ( resolve, reject ) => { - const child = spawn( command, args, { cwd, stdio: [ 'ignore', 'pipe', 'pipe' ] } ); - let stdout = ''; - let stderr = ''; - child.stdout?.on( 'data', ( chunk ) => { - stdout = appendBoundedOutput( stdout, chunk ); - } ); - child.stderr?.on( 'data', ( chunk ) => { - stderr = appendBoundedOutput( stderr, chunk ); - } ); - child.on( 'error', reject ); - child.on( 'close', ( code ) => { - if ( code === 0 ) { - resolve(); - } else { - reject( - new Error( - `\`${ command } ${ args.join( - ' ' - ) }\` exited with code ${ code }.\nstdout:\n${ stdout }\nstderr:\n${ stderr }` - ) - ); - } - } ); - } ); +function hasCompiledServer( dir: string ): boolean { + return existsSync( path.join( dir, 'dist', 'mcp-server.js' ) ); } -function isEngineInstalled(): boolean { - return ( - existsSync( path.join( ENGINE_DIR, 'node_modules' ) ) && - existsSync( path.join( ENGINE_DIR, 'src', 'mcp-server.ts' ) ) - ); +// Locate the build-time prepared engine. The CLI always runs bundled — both in the +// packaged app and locally via `npm run cli:build` — and the build copies the engine +// in next to the chunks as `data-liberation-agent`, so it's always a sibling of this +// module (`dist/cli`). 
We deliberately do NOT fall back to the source `packages/` dir: +// that exists only in a dev checkout, so relying on it would mask a missing/stale +// bundle locally while the packaged app still failed — keeping the single bundled +// path makes local resolution faithful to production. +function resolveEngineDir(): string { + const dir = path.join( import.meta.dirname, 'data-liberation-agent' ); + if ( ! hasCompiledServer( dir ) ) { + throw new Error( + 'Data Liberation engine is not prepared. Run `npm install` (prepares the engine ' + + 'into `packages/`) then `npm run cli:build` (bundles it into `dist/cli`).' + ); + } + return dir; } -let enginePromise: Promise< boolean > | null = null; - -function ensureEngine(): Promise< boolean > { - if ( ! enginePromise ) { - enginePromise = installEngine().catch( ( error ) => { - enginePromise = null; - throw error; +const execFileAsync = promisify( execFile ); + +// The engine drives Playwright (extract / screenshot / reconstruct) but ships +// no browser binary — `prepare-data-liberation` skips the download, and the +// engine's playwright (1.58.x) pins a different chromium revision than Studio's +// own (1.60.x), so they can't share one. Install the engine's matching chromium +// once, into the shared OS cache, using the engine's OWN playwright installer +// (mirrors apps/cli/ai/browser-utils.ts). Best-effort + memoized: `playwright +// install` is a fast no-op when already present, and a failure here must NOT +// block the non-browser tools (detect/discover/paths) — those still work. +let chromiumPromise: Promise< void > | null = null; + +function ensureEngineChromium( engineDir: string ): Promise< void > { + if ( ! chromiumPromise ) { + chromiumPromise = installEngineChromium( engineDir ).catch( ( error ) => { + chromiumPromise = null; // allow a retry on the next connect + console.error( + `[data_liberation] Could not install the engine's Playwright Chromium: ${ + error instanceof Error ? 
error.message : String( error ) + }. Browser-dependent steps (extract/screenshot/reconstruct) may fail until it is installed.` + ); } ); } - return enginePromise; + return chromiumPromise; } -async function installEngine(): Promise< boolean > { - if ( isEngineInstalled() ) { - return true; - } - - await fs.mkdir( path.dirname( ENGINE_DIR ), { recursive: true } ); - - await runProcess( - 'git', - [ 'clone', '--depth', '1', '--branch', 'main', ENGINE_REPO, ENGINE_DIR ], - STUDIO_SITES_ROOT - ); - - await runProcess( 'npm', [ 'ci' ], ENGINE_DIR ); - - return false; +async function installEngineChromium( engineDir: string ): Promise< void > { + const cli = path.join( engineDir, 'node_modules', 'playwright', 'cli.js' ); + await execFileAsync( process.execPath, [ cli, 'install', 'chromium' ], { + env: { ...process.env, CI: process.env.CI ?? '1' }, + maxBuffer: 10 * 1024 * 1024, + } ); } let clientPromise: Promise< Client > | null = null; @@ -108,18 +88,14 @@ function getClient( engineDir: string ): Promise< Client > { } async function connectClient( engineDir: string ): Promise< Client > { - // Launch via the engine's own local `tsx` bin — matches how the engine runs - // its MCP server (`npx tsx src/mcp-server.ts`) and avoids depending on `npx` - // being on PATH inside the packaged app. - const tsxBin = path.join( - engineDir, - 'node_modules', - '.bin', - process.platform === 'win32' ? 'tsx.cmd' : 'tsx' - ); + // Ensure the engine's Chromium is present before any tool runs (best-effort; + // a one-time ~150MB download on the first connect per machine, then cached). 
+ await ensureEngineChromium( engineDir ); + + // Run the compiled MCP server with the current Node binary const transport = new StdioClientTransport( { - command: tsxBin, - args: [ 'src/mcp-server.ts' ], + command: process.execPath, + args: [ path.join( engineDir, 'dist', 'mcp-server.js' ) ], cwd: engineDir, stderr: 'pipe', } ); @@ -176,8 +152,7 @@ export const dataLiberationTool = defineTool( '(GoDaddy, Hostinger, HubSpot, Shopify, Squarespace, Webflow, Weebly, Wix) ' + 'and reconstructs it into a WordPress site. This tool forwards a single call to the engine; ' + "the `/liberate` skill orchestrates the full sequence. Omit `tool` (or pass 'setup') to " + - 'install/locate the engine and learn where its skill files live. The FIRST call downloads ' + - 'the engine (git clone + npm install + a headless-browser download) and can take several minutes.', + 'locate the engine and learn where its skill files live (the engine ships prebuilt with Studio).', { tool: Type.Optional( Type.String( { @@ -198,21 +173,20 @@ export const dataLiberationTool = defineTool( ), }, async ( args ) => { - if ( ! args.tool || args.tool === 'setup' ) { - const wasEngineInstalled = await ensureEngine(); + const engineDir = resolveEngineDir(); + if ( ! 
args.tool || args.tool === 'setup' ) { return textResult( JSON.stringify( { ready: true, - alreadyInstalled: wasEngineInstalled, - engineDir: ENGINE_DIR, - skillsDir: path.join( ENGINE_DIR, 'skills' ), - liberateSkill: path.join( ENGINE_DIR, 'skills', 'liberate', 'SKILL.md' ), + engineDir, + skillsDir: path.join( engineDir, 'skills' ), + liberateSkill: path.join( engineDir, 'skills', 'liberate', 'SKILL.md' ), } ) ); } - const client = await getClient( ENGINE_DIR ); + const client = await getClient( engineDir ); if ( args.tool === 'list' ) { const listed = await client.listTools(); diff --git a/apps/cli/vite.config.base.ts b/apps/cli/vite.config.base.ts index da26c71082..ebdbe9c5f2 100644 --- a/apps/cli/vite.config.base.ts +++ b/apps/cli/vite.config.base.ts @@ -31,6 +31,16 @@ const phpSourceCodePath = resolve( __dirname, 'php' ); // The Skill tool loads skills from `/skills` at runtime (see // `ai/skills.ts`), so they must sit directly next to the built chunks. const skillsSourcePath = resolve( __dirname, 'ai/skills' ); +// The data_liberation bridge spawns the compiled engine from `/data-liberation-agent` +// (see `ai/tools/data-liberation.ts`); prepared by `scripts/prepare-data-liberation.ts` into +// the repo-root `packages/` dir (where it will migrate to as a real workspace package). 
+const dataLiberationSourcePath = resolve( + __dirname, + '..', + '..', + 'packages', + 'data-liberation-agent' +); export const baseConfig = defineConfig( { oxc: { @@ -56,6 +66,11 @@ export const baseConfig = defineConfig( { if ( existsSync( skillsSourcePath ) ) { cpSync( skillsSourcePath, resolve( outDir, 'skills' ), { recursive: true } ); } + if ( existsSync( dataLiberationSourcePath ) ) { + cpSync( dataLiberationSourcePath, resolve( outDir, 'data-liberation-agent' ), { + recursive: true, + } ); + } }, }, ], diff --git a/package.json b/package.json index 2949cf4cf2..69cc5fd457 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,7 @@ "start": "npm -w studio-app run start", "start:test": "DEV_CONFIG_DIR=/tmp/studio-test npm -w studio-app run start", "start-wayland": "npm -w studio-app run start-wayland", - "postinstall": "patch-package --patch-dir apps/studio/patches && node ./scripts/remove-fs-ext-other-platform-binaries.mjs && tsx ./scripts/download-wp-server-files.ts && node ./scripts/download-available-site-translations.mjs && tsx ./scripts/download-agent-skills.ts && tsx ./scripts/download-php-binary.ts", + "postinstall": "patch-package --patch-dir apps/studio/patches && node ./scripts/remove-fs-ext-other-platform-binaries.mjs && tsx ./scripts/download-wp-server-files.ts && node ./scripts/download-available-site-translations.mjs && tsx ./scripts/download-agent-skills.ts && tsx ./scripts/download-php-binary.ts && tsx ./scripts/prepare-data-liberation.ts", "package": "tsx ./scripts/package-in-isolation.ts package", "make": "tsx ./scripts/package-in-isolation.ts make", "make:windows-x64": "tsx ./scripts/package-in-isolation.ts make:windows-x64", @@ -58,6 +58,7 @@ "test:metrics": "npx playwright test --config=./tools/metrics/playwright.metrics.config.ts", "download-language-packs": "tsx ./scripts/download-language-packs.ts", "download-agent-skills": "tsx ./scripts/download-agent-skills.ts", + "prepare-data-liberation": "tsx 
./scripts/prepare-data-liberation.ts", "download:php-binary": "tsx ./scripts/download-php-binary.ts", "eval": "npm run cli:build --silent && npx promptfoo@0.121.4 eval -c scripts/eval/promptfoo.config.yaml", "eval:view": "npx promptfoo@0.121.4 view" diff --git a/scripts/package-in-isolation.ts b/scripts/package-in-isolation.ts index b227086424..c4045f05ae 100644 --- a/scripts/package-in-isolation.ts +++ b/scripts/package-in-isolation.ts @@ -82,7 +82,9 @@ function ensureBuildToolchain( stagingRoot: string ) { function hasBundledServerFiles( repoRoot: string ): boolean { // Marker paths for artifacts produced by download-wp-server-files.ts, - // download-available-site-translations.mjs, and download-agent-skills.ts. + // download-available-site-translations.mjs, download-agent-skills.ts, and + // prepare-data-liberation.ts. The packaging install uses `--ignore-scripts`, + // so these don't run via the root postinstall and must be triggered here. const requiredPaths = [ 'wp-files/latest/wordpress/wp-includes/version.php', 'wp-files/latest/available-site-translations.json', @@ -92,6 +94,7 @@ function hasBundledServerFiles( repoRoot: string ): boolean { 'wp-files/phpmyadmin/index.php', 'wp-files/reprint/reprint.phar', 'wp-files/skills/wp-plugin-development/SKILL.md', + 'packages/data-liberation-agent/dist/mcp-server.js', ]; return requiredPaths.every( ( requiredPath ) => @@ -108,6 +111,9 @@ function ensureBundledServerFiles( stagingRoot: string ) { runOrFail( 'npx', [ 'tsx', './scripts/download-wp-server-files.ts' ], stagingRoot ); runOrFail( 'node', [ './scripts/download-available-site-translations.mjs' ], stagingRoot ); runOrFail( 'npx', [ 'tsx', './scripts/download-agent-skills.ts' ], stagingRoot ); + // Builds the Data Liberation engine into packages/data-liberation-agent (self-gates + // if already present); write-dist-extras then bundles it into dist/cli. 
+	runOrFail( 'npx', [ 'tsx', './scripts/prepare-data-liberation.ts' ], stagingRoot );
 }
 
 function shouldCopyToStaging( sourcePath: string ): boolean {
diff --git a/scripts/prepare-data-liberation.ts b/scripts/prepare-data-liberation.ts
new file mode 100644
index 0000000000..18535c294d
--- /dev/null
+++ b/scripts/prepare-data-liberation.ts
@@ -0,0 +1,193 @@
+// Prepares the Data Liberation engine (https://github.com/Automattic/data-liberation-agent)
+// as a BUILD-TIME asset so the packaged Studio app never has to clone/install it
+// on the user's machine (no runtime git/npm/Node-version dependency).
+//
+// It downloads the source archive of `ENGINE_REF` (currently the `main` branch head, so
+// NOT a pinned release), `npm ci`s it, and compiles it to `dist/`
+// (the engine ships a `build` script + `copy-runtime-assets.mjs` precisely so its
+// MCP server runs from compiled JS — `node dist/mcp-server.js` — instead of tsx).
+// The result lands in `packages/data-liberation-agent` (the location it will
+// migrate to as a real workspace package), which `write-dist-extras`
+// (apps/cli/vite.config.base.ts) copies into `dist/cli/data-liberation-agent`,
+// like `ai/skills`. The bridge tool (apps/cli/ai/tools/data-liberation.ts) then
+// spawns the compiled server with the bundled Node.
+//
+// The browser is intentionally NOT downloaded here (PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD):
+// the bridge tool installs the engine's own matching chromium into the shared OS cache
+// on first connect, because the engine's playwright pins a different chromium revision
+// than Studio's own and the two cannot share one.
+//
+// Idempotent + gated (skip if already built); delete the target dir to rebuild. A failed
+// run removes the partially prepared target dir so the gate never accepts a half-built engine.
+import { spawnSync } from 'child_process';
+import os from 'os';
+import path from 'path';
+import fs from 'fs-extra';
+import { downloadFile } from '../tools/common/lib/download-file';
+import { extractZip } from '../tools/common/lib/extract-zip';
+
+const ENGINE_REF = 'main';
+const ENGINE_ARCHIVE_URL = `https://github.com/Automattic/data-liberation-agent/archive/refs/heads/${ ENGINE_REF }.zip`;
+const ENGINE_DIR = path.join( import.meta.dirname, '..', 'packages', 'data-liberation-agent' );
+
+/**
+ * Runs `command` synchronously in `cwd` with inherited stdio, with
+ * PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 added to the environment.
+ *
+ * @throws Error naming the command and the reason (spawn error, signal, or exit code)
+ *         whenever the process does not exit with status 0.
+ */
+function run( command: string, args: string[], cwd: string ): void {
+	const result = spawnSync( command, args, {
+		cwd,
+		stdio: 'inherit',
+		shell: process.platform === 'win32',
+		env: { ...process.env, PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: '1' },
+	} );
+	if ( result.status !== 0 ) {
+		// `status` is null when the process failed to spawn or was killed by a signal;
+		// report the actual cause rather than a bare "Command failed".
+		const reason =
+			result.error?.message ??
+			( result.signal ? `signal ${ result.signal }` : `exit code ${ result.status }` );
+		throw new Error( `Command failed (${ reason }): ${ [ command, ...args ].join( ' ' ) }` );
+	}
+}
+
+// `tsc` emits only `.js`; non-TS runtime assets (e.g. `core-block-attrs.json`)
+// that the compiled server reads at load time must be mirrored from `src/` into
+// `dist/`. The engine's own copy step covers only `.php`, so this generalizes it
+// (and is resilient to new asset types the engine adds).
+async function mirrorRuntimeAssets( engineDir: string ): Promise< void > {
+	const srcRoot = path.join( engineDir, 'src' );
+	// Test the path relative to `src/`, so the checkout's own location (e.g. a parent
+	// dir whose name contains `.test.`) can never filter out the whole tree.
+	const isAsset = ( src: string ): boolean => {
+		const relative = path.relative( srcRoot, src );
+		return (
+			! /\.(ts|tsx)$/.test( relative ) &&
+			! /(\.test\.|__fixtures__|__snapshots__)/.test( relative )
+		);
+	};
+	// `overwrite: false` keeps whatever the engine's build already wrote to `dist/`.
+	await fs.copy( srcRoot, path.join( engineDir, 'dist' ), {
+		overwrite: false,
+		filter: isAsset,
+	} );
+}
+
+// The engine ships as a raw `npm install` (it runs from its own node_modules as a
+// child process), so it carries ~34MB of files that never execute at runtime. Strip
+// them after the build to keep the packaged app lean:
+// - non-runtime top-level dirs/files (sources, tests, build scripts, docs, lockfile);
+//   the engine runs from `dist/`, and `mirrorRuntimeAssets` already copied any data
+//   assets out of `src/` into `dist/`.
+// - sourcemaps, type declarations, and markdown inside node_modules — pure dev cruft.
+// Conservative on purpose: keep `skills/` (the bridge tool reads it), `prompts/`, and
+// every package's actual code. Idempotent — `fs.remove` no-ops on missing paths.
+const NON_RUNTIME_ENTRIES = [
+	'src',
+	'test',
+	'tests',
+	'scripts',
+	'docs',
+	'DISCOVERIES.md',
+	'package-lock.json',
+	'tsconfig.json',
+	'tsconfig.build.json',
+];
+
+// Removes NON_RUNTIME_ENTRIES from the engine root, then walks `node_modules` and deletes
+// sourcemaps, type declarations, and markdown files. Only regular files are deleted.
+async function pruneEngine( engineDir: string ): Promise< void > {
+	await Promise.all(
+		NON_RUNTIME_ENTRIES.map( ( entry ) => fs.remove( path.join( engineDir, entry ) ) )
+	);
+
+	const nodeModules = path.join( engineDir, 'node_modules' );
+	if ( ! fs.existsSync( nodeModules ) ) {
+		return;
+	}
+	const isCruft = ( file: string ): boolean =>
+		/\.(map|d\.ts|d\.mts|d\.cts)$/.test( file ) || /\.md$/i.test( file );
+	const walk = async ( dir: string ): Promise< void > => {
+		const dirents = await fs.readdir( dir, { withFileTypes: true } );
+		await Promise.all(
+			dirents.map( async ( dirent ) => {
+				const full = path.join( dir, dirent.name );
+				if ( dirent.isDirectory() ) {
+					await walk( full );
+				} else if ( dirent.isFile() && isCruft( dirent.name ) ) {
+					await fs.remove( full );
+				}
+			} )
+		);
+	};
+	await walk( nodeModules );
+}
+
+// Downloads the engine archive, extracts it, and moves the single repo dir to `targetDir`.
+// Works in a unique temp dir (so concurrent runs cannot clobber each other's archive) that
+// is always removed afterwards, whether or not the download succeeded.
+async function fetchEngineSource( targetDir: string ): Promise< void > {
+	const tmpDir = await fs.mkdtemp( path.join( os.tmpdir(), 'studio-data-liberation-' ) );
+	const zipPath = path.join( tmpDir, 'archive.zip' );
+	const extractDir = path.join( tmpDir, 'extract' );
+
+	try {
+		await downloadFile( ENGINE_ARCHIVE_URL, zipPath );
+		await extractZip( zipPath, extractDir );
+
+		const entries = await fs.readdir( extractDir );
+		const repoDir = entries.find( ( name ) => name.startsWith( 'data-liberation-agent-' ) );
+		if ( ! repoDir ) {
+			throw new Error(
+				`Unexpected archive layout: no data-liberation-agent-* dir in ${ extractDir }`
+			);
+		}
+		await fs.move( path.join( extractDir, repoDir ), targetDir );
+	} finally {
+		await fs.remove( tmpDir );
+	}
+}
+
+// Entry point: skips when the compiled server already exists, otherwise fetches, installs,
+// builds, and prunes the engine in ENGINE_DIR.
+async function prepareDataLiberation(): Promise< void > {
+	if ( fs.existsSync( path.join( ENGINE_DIR, 'dist', 'mcp-server.js' ) ) ) {
+		console.log( `[data-liberation] Already built at ${ ENGINE_DIR }. Delete it to rebuild.` );
+		return;
+	}
+
+	console.log( `[data-liberation] Preparing engine (${ ENGINE_REF })…` );
+	await fs.remove( ENGINE_DIR );
+
+	try {
+		await fetchEngineSource( ENGINE_DIR );
+
+		// Install (incl. devDeps for the build) + compile to dist/, then drop devDeps.
+		console.log( '[data-liberation] npm ci…' );
+		run( 'npm', [ 'ci' ], ENGINE_DIR );
+		console.log( '[data-liberation] npm run build…' );
+		run( 'npm', [ 'run', 'build' ], ENGINE_DIR );
+		console.log( '[data-liberation] mirroring runtime assets to dist/…' );
+		await mirrorRuntimeAssets( ENGINE_DIR );
+		console.log( '[data-liberation] pruning devDependencies…' );
+		run( 'npm', [ 'prune', '--omit=dev' ], ENGINE_DIR );
+		console.log( '[data-liberation] stripping non-runtime files…' );
+		await pruneEngine( ENGINE_DIR );
+	} catch ( error ) {
+		// `dist/mcp-server.js` (the "already built" gate) exists as soon as the build step
+		// finishes, so a failure in a later step would otherwise leave a half-prepared engine
+		// that every following run skips over. Best-effort: never mask the original error.
+		await fs.remove( ENGINE_DIR ).catch( () => undefined );
+		throw error;
+	}
+
+	console.log( `[data-liberation] Done → ${ ENGINE_DIR }` );
+}
+
+prepareDataLiberation().catch( ( err ) => {
+	console.error( err );
+	process.exit( 1 );
+} );