Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ out/
wp-files/*
!wp-files/blueprints/

# Data Liberation engine prepared at build time (scripts/prepare-data-liberation.ts).
# Lives under packages/ (its future workspace-package home); gitignored until migrated.
packages/data-liberation-agent/

# Release Tooling
vendor/*
/fastlane/report.xml
Expand Down
9 changes: 2 additions & 7 deletions apps/cli/ai/skills/liberate/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,9 @@ description: Import and rebuild a website from a closed platform (Wix, Squarespa

This skill is only a **redirect**. The real, always-up-to-date pipeline lives in the Data Liberation engine's own skill, which ships prebuilt with Studio. Do NOT re-plan or summarize the steps here — defer to the engine skill so its updates take effect automatically. Your job is just to (1) locate the engine and (2) follow its skill, translating its tool calls into Studio's bridge.

## Step 1 — Prepare and locate the engine
## Step 1 — Locate the engine

Tell the user one concise line first, e.g. *"Checking the Data Liberation engine — installing it if this is the first run."* Then call the `data_liberation` tool with **no arguments** (setup mode). It returns `{ ready, alreadyInstalled, engineDir, liberateSkill, skillsDir }`.

Report accurately based on `alreadyInstalled`:

- `alreadyInstalled: false` → this run performed the **one-time** install.
- `alreadyInstalled: true` → the engine was already set up; just say it's ready and proceed immediately.
Call the `data_liberation` tool with **no arguments** (setup mode). It returns `{ ready, engineDir, liberateSkill, skillsDir }`. The engine ships prebuilt with Studio, so this is instant — just proceed.

## Step 2 — Load the engine's tool catalog

Expand Down
162 changes: 68 additions & 94 deletions apps/cli/ai/tools/data-liberation.ts
Original file line number Diff line number Diff line change
@@ -1,98 +1,78 @@
import { spawn } from 'child_process';
import { execFile } from 'child_process';
import { existsSync } from 'fs';
import fs from 'fs/promises';
import path from 'path';
import { promisify } from 'util';
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
import { Type } from 'typebox';
import { STUDIO_SITES_ROOT } from 'cli/lib/site-paths';
import { defineTool } from './define-tool';
import { textResult } from './utils';

const ENGINE_DIR = path.join( STUDIO_SITES_ROOT, '_liberations', 'data-liberation' );
const ENGINE_REPO = 'https://github.com/Automattic/data-liberation-agent.git';

// Engine operations (extract, screenshot, reconstruct) drive Playwright and
// routinely run for minutes — far past the MCP SDK's 60s default request
// timeout, which otherwise surfaces as `MCP error -32001: Request timed out`.
// Use a generous, env-overridable per-call timeout. NOTE: this is effectively a
// FLAT cap — the engine emits no MCP progress notifications, so the
// `resetTimeoutOnProgress` flag we pass is currently inert (kept as a no-op
// in case the engine adds notifications later). A single op that exceeds the
// cap still returns -32001 while the engine keeps running in the background;
// the `/liberate` skill handles that by polling `liberate_status` rather than
// re-invoking.
// Use a generous per-call timeout. NOTE: this is effectively a FLAT cap — the
// engine emits no MCP progress notifications, so the `resetTimeoutOnProgress`
// flag we pass is currently inert (kept as a no-op in case the engine adds
// notifications later). A single op that exceeds the cap still returns -32001
// while the engine keeps running in the background; the `/liberate` skill
// handles that by polling `liberate_status` rather than re-invoking.
const ENGINE_CALL_TIMEOUT_MS = 600_000;

// Appends a chunk of process output to the text gathered so far and keeps only
// the trailing 2000 characters, so the captured output stays bounded no matter
// how much a child process prints. `chunk` is stringified with `String()`.
function appendBoundedOutput( current: string, chunk: unknown ): string {
	const maxChars = 2000;
	const combined = `${ current }${ String( chunk ) }`;

	// Keep the tail: the most recent output is the useful part of a failure log.
	return combined.length > maxChars ? combined.slice( combined.length - maxChars ) : combined;
}

function runProcess( command: string, args: string[], cwd: string ): Promise< void > {
return new Promise( ( resolve, reject ) => {
const child = spawn( command, args, { cwd, stdio: [ 'ignore', 'pipe', 'pipe' ] } );
let stdout = '';
let stderr = '';
child.stdout?.on( 'data', ( chunk ) => {
stdout = appendBoundedOutput( stdout, chunk );
} );
child.stderr?.on( 'data', ( chunk ) => {
stderr = appendBoundedOutput( stderr, chunk );
} );
child.on( 'error', reject );
child.on( 'close', ( code ) => {
if ( code === 0 ) {
resolve();
} else {
reject(
new Error(
`\`${ command } ${ args.join(
' '
) }\` exited with code ${ code }.\nstdout:\n${ stdout }\nstderr:\n${ stderr }`
)
);
}
} );
} );
// Reports whether `dir` contains the engine's compiled MCP server entry point,
// i.e. a `dist/mcp-server.js` file beneath it.
function hasCompiledServer( dir: string ): boolean {
	const serverEntry = path.join( dir, 'dist', 'mcp-server.js' );
	return existsSync( serverEntry );
}

function isEngineInstalled(): boolean {
return (
existsSync( path.join( ENGINE_DIR, 'node_modules' ) ) &&
existsSync( path.join( ENGINE_DIR, 'src', 'mcp-server.ts' ) )
);
// Resolves the directory of the build-time prepared engine. The CLI always runs
// bundled — in the packaged app and locally via `npm run cli:build` — and the
// build copies the engine next to the chunks as `data-liberation-agent`, so it is
// always a sibling of this module (`dist/cli`). There is intentionally NO fallback
// to the source `packages/` dir: it exists only in a dev checkout, so using it
// would hide a missing/stale bundle locally while the packaged app still failed.
// Sticking to the single bundled path keeps local resolution faithful to
// production. Throws when the compiled server is missing from that directory.
function resolveEngineDir(): string {
	const bundledDir = path.join( import.meta.dirname, 'data-liberation-agent' );
	if ( hasCompiledServer( bundledDir ) ) {
		return bundledDir;
	}

	// Nothing usable was bundled: tell the developer exactly how to prepare it.
	const message =
		'Data Liberation engine is not prepared. Run `npm install` (prepares the engine ' +
		'into `packages/`) then `npm run cli:build` (bundles it into `dist/cli`).';
	throw new Error( message );
}

let enginePromise: Promise< boolean > | null = null;

function ensureEngine(): Promise< boolean > {
if ( ! enginePromise ) {
enginePromise = installEngine().catch( ( error ) => {
enginePromise = null;
throw error;
const execFileAsync = promisify( execFile );

// The engine drives Playwright (extract / screenshot / reconstruct) but ships
// no browser binary — `prepare-data-liberation` skips the download, and the
// engine's playwright (1.58.x) pins a different chromium revision than Studio's
// own (1.60.x), so they can't share one. Install the engine's matching chromium
// once, into the shared OS cache, using the engine's OWN playwright installer
// (mirrors apps/cli/ai/browser-utils.ts). Best-effort + memoized: `playwright
// install` is a fast no-op when already present, and a failure here must NOT
// block the non-browser tools (detect/discover/paths) — those still work.
let chromiumPromise: Promise< void > | null = null;

function ensureEngineChromium( engineDir: string ): Promise< void > {
if ( ! chromiumPromise ) {
chromiumPromise = installEngineChromium( engineDir ).catch( ( error ) => {
chromiumPromise = null; // allow a retry on the next connect
console.error(
`[data_liberation] Could not install the engine's Playwright Chromium: ${
error instanceof Error ? error.message : String( error )
}. Browser-dependent steps (extract/screenshot/reconstruct) may fail until it is installed.`
);
} );
}
return enginePromise;
return chromiumPromise;
}

async function installEngine(): Promise< boolean > {
if ( isEngineInstalled() ) {
return true;
}

await fs.mkdir( path.dirname( ENGINE_DIR ), { recursive: true } );

await runProcess(
'git',
[ 'clone', '--depth', '1', '--branch', 'main', ENGINE_REPO, ENGINE_DIR ],
STUDIO_SITES_ROOT
);

await runProcess( 'npm', [ 'ci' ], ENGINE_DIR );

return false;
// Runs the engine's own Playwright CLI (`node_modules/playwright/cli.js` under
// `engineDir`) with the current Node binary to `install chromium`. The child
// inherits this process's environment, with `CI` defaulting to '1' when it is
// not already set. Rejects if the installer process fails.
async function installEngineChromium( engineDir: string ): Promise< void > {
	const playwrightCli = path.join( engineDir, 'node_modules', 'playwright', 'cli.js' );
	const installArgs = [ playwrightCli, 'install', 'chromium' ];
	const childEnv = { ...process.env, CI: process.env.CI ?? '1' };

	await execFileAsync( process.execPath, installArgs, {
		env: childEnv,
		// Allow up to 10 MiB of installer output before execFile aborts the child.
		maxBuffer: 10 * 1024 * 1024,
	} );
}

let clientPromise: Promise< Client > | null = null;
Expand All @@ -108,18 +88,14 @@ function getClient( engineDir: string ): Promise< Client > {
}

async function connectClient( engineDir: string ): Promise< Client > {
// Launch via the engine's own local `tsx` bin — matches how the engine runs
// its MCP server (`npx tsx src/mcp-server.ts`) and avoids depending on `npx`
// being on PATH inside the packaged app.
const tsxBin = path.join(
engineDir,
'node_modules',
'.bin',
process.platform === 'win32' ? 'tsx.cmd' : 'tsx'
);
// Ensure the engine's Chromium is present before any tool runs (best-effort;
// a one-time ~150MB download on the first connect per machine, then cached).
await ensureEngineChromium( engineDir );

// Run the compiled MCP server with the current Node binary
const transport = new StdioClientTransport( {
command: tsxBin,
args: [ 'src/mcp-server.ts' ],
command: process.execPath,
args: [ path.join( engineDir, 'dist', 'mcp-server.js' ) ],
cwd: engineDir,
stderr: 'pipe',
} );
Expand Down Expand Up @@ -176,8 +152,7 @@ export const dataLiberationTool = defineTool(
'(GoDaddy, Hostinger, HubSpot, Shopify, Squarespace, Webflow, Weebly, Wix) ' +
'and reconstructs it into a WordPress site. This tool forwards a single call to the engine; ' +
"the `/liberate` skill orchestrates the full sequence. Omit `tool` (or pass 'setup') to " +
'install/locate the engine and learn where its skill files live. The FIRST call downloads ' +
'the engine (git clone + npm install + a headless-browser download) and can take several minutes.',
'locate the engine and learn where its skill files live (the engine ships prebuilt with Studio).',
{
tool: Type.Optional(
Type.String( {
Expand All @@ -198,21 +173,20 @@ export const dataLiberationTool = defineTool(
),
},
async ( args ) => {
if ( ! args.tool || args.tool === 'setup' ) {
const wasEngineInstalled = await ensureEngine();
const engineDir = resolveEngineDir();

if ( ! args.tool || args.tool === 'setup' ) {
return textResult(
JSON.stringify( {
ready: true,
alreadyInstalled: wasEngineInstalled,
engineDir: ENGINE_DIR,
skillsDir: path.join( ENGINE_DIR, 'skills' ),
liberateSkill: path.join( ENGINE_DIR, 'skills', 'liberate', 'SKILL.md' ),
engineDir,
skillsDir: path.join( engineDir, 'skills' ),
liberateSkill: path.join( engineDir, 'skills', 'liberate', 'SKILL.md' ),
} )
);
}

const client = await getClient( ENGINE_DIR );
const client = await getClient( engineDir );

if ( args.tool === 'list' ) {
const listed = await client.listTools();
Expand Down
15 changes: 15 additions & 0 deletions apps/cli/vite.config.base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ const phpSourceCodePath = resolve( __dirname, 'php' );
// The Skill tool loads skills from `<chunk dir>/skills` at runtime (see
// `ai/skills.ts`), so they must sit directly next to the built chunks.
const skillsSourcePath = resolve( __dirname, 'ai/skills' );
// The data_liberation bridge spawns the compiled engine from `<chunk dir>/data-liberation-agent`
// (see `ai/tools/data-liberation.ts`); prepared by `scripts/prepare-data-liberation.ts` into
// the repo-root `packages/` dir (where it will migrate to as a real workspace package).
const dataLiberationSourcePath = resolve(
__dirname,
'..',
'..',
'packages',
'data-liberation-agent'
);

export const baseConfig = defineConfig( {
oxc: {
Expand All @@ -56,6 +66,11 @@ export const baseConfig = defineConfig( {
if ( existsSync( skillsSourcePath ) ) {
cpSync( skillsSourcePath, resolve( outDir, 'skills' ), { recursive: true } );
}
if ( existsSync( dataLiberationSourcePath ) ) {
cpSync( dataLiberationSourcePath, resolve( outDir, 'data-liberation-agent' ), {
recursive: true,
} );
}
},
},
],
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"start": "npm -w studio-app run start",
"start:test": "DEV_CONFIG_DIR=/tmp/studio-test npm -w studio-app run start",
"start-wayland": "npm -w studio-app run start-wayland",
"postinstall": "patch-package --patch-dir apps/studio/patches && node ./scripts/remove-fs-ext-other-platform-binaries.mjs && tsx ./scripts/download-wp-server-files.ts && node ./scripts/download-available-site-translations.mjs && tsx ./scripts/download-agent-skills.ts && tsx ./scripts/download-php-binary.ts",
"postinstall": "patch-package --patch-dir apps/studio/patches && node ./scripts/remove-fs-ext-other-platform-binaries.mjs && tsx ./scripts/download-wp-server-files.ts && node ./scripts/download-available-site-translations.mjs && tsx ./scripts/download-agent-skills.ts && tsx ./scripts/download-php-binary.ts && tsx ./scripts/prepare-data-liberation.ts",
"package": "tsx ./scripts/package-in-isolation.ts package",
"make": "tsx ./scripts/package-in-isolation.ts make",
"make:windows-x64": "tsx ./scripts/package-in-isolation.ts make:windows-x64",
Expand Down Expand Up @@ -58,6 +58,7 @@
"test:metrics": "npx playwright test --config=./tools/metrics/playwright.metrics.config.ts",
"download-language-packs": "tsx ./scripts/download-language-packs.ts",
"download-agent-skills": "tsx ./scripts/download-agent-skills.ts",
"prepare-data-liberation": "tsx ./scripts/prepare-data-liberation.ts",
"download:php-binary": "tsx ./scripts/download-php-binary.ts",
"eval": "npm run cli:build --silent && npx promptfoo@0.121.4 eval -c scripts/eval/promptfoo.config.yaml",
"eval:view": "npx promptfoo@0.121.4 view"
Expand Down
8 changes: 7 additions & 1 deletion scripts/package-in-isolation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@ function ensureBuildToolchain( stagingRoot: string ) {

function hasBundledServerFiles( repoRoot: string ): boolean {
// Marker paths for artifacts produced by download-wp-server-files.ts,
// download-available-site-translations.mjs, and download-agent-skills.ts.
// download-available-site-translations.mjs, download-agent-skills.ts, and
// prepare-data-liberation.ts. The packaging install uses `--ignore-scripts`,
// so these don't run via the root postinstall and must be triggered here.
const requiredPaths = [
'wp-files/latest/wordpress/wp-includes/version.php',
'wp-files/latest/available-site-translations.json',
Expand All @@ -92,6 +94,7 @@ function hasBundledServerFiles( repoRoot: string ): boolean {
'wp-files/phpmyadmin/index.php',
'wp-files/reprint/reprint.phar',
'wp-files/skills/wp-plugin-development/SKILL.md',
'packages/data-liberation-agent/dist/mcp-server.js',
];

return requiredPaths.every( ( requiredPath ) =>
Expand All @@ -108,6 +111,9 @@ function ensureBundledServerFiles( stagingRoot: string ) {
runOrFail( 'npx', [ 'tsx', './scripts/download-wp-server-files.ts' ], stagingRoot );
runOrFail( 'node', [ './scripts/download-available-site-translations.mjs' ], stagingRoot );
runOrFail( 'npx', [ 'tsx', './scripts/download-agent-skills.ts' ], stagingRoot );
// Builds the Data Liberation engine into packages/data-liberation-agent (self-gates
// if already present); write-dist-extras then bundles it into dist/cli.
runOrFail( 'npx', [ 'tsx', './scripts/prepare-data-liberation.ts' ], stagingRoot );
}

function shouldCopyToStaging( sourcePath: string ): boolean {
Expand Down
Loading