Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
<script setup lang="ts">
import { Button } from '#theme/components/ui/button';
import { Label } from '#theme/components/ui/label';

const props = defineProps<{ knowledgeId: string }>();
const emit = defineEmits<{ added: [] }>();

const store = useKnowledgeStore();
const open = ref(false);
const submitting = ref(false);
const errorMessage = ref<string | null>(null);
const successMessage = ref<string | null>(null);
const file = ref<File | null>(null);

function onFileChange(e: Event) {
const target = e.target as HTMLInputElement;
file.value = target.files?.[0] ?? null;
}

async function submit() {
if (!file.value) {
errorMessage.value = 'Pick a .zip archive first';
return;
}
submitting.value = true;
errorMessage.value = null;
successMessage.value = null;
try {
const result = await store.addSourcesFromArchive(
props.knowledgeId,
file.value,
);
successMessage.value = `Detected ${result.detected} file${result.detected === 1 ? '' : 's'}. Importing in the background - refresh the list to watch sources appear, then run Index.`;
emit('added');
file.value = null;
} catch (err: unknown) {
const e = err as {
response?: { data?: { message?: string } };
message?: string;
};
errorMessage.value =
e?.response?.data?.message ?? e?.message ?? 'Archive import failed';
} finally {
submitting.value = false;
}
}

function cancel() {
file.value = null;
errorMessage.value = null;
successMessage.value = null;
open.value = false;
}
</script>

<template>
<div class="rounded-md border bg-card p-4">
<div v-if="!open" class="flex items-center justify-between">
<p class="text-sm text-muted-foreground">
Bulk-import a .zip of documents (pdf, docx, xlsx, txt, html, ...) as
file sources.
</p>
<Button size="sm" variant="outline" @click="open = true">
Upload archive
</Button>
</div>

<form v-else class="flex flex-col gap-4" @submit.prevent="submit">
<div class="grid gap-2">
<Label for="archive-file">Zip archive</Label>
<input
id="archive-file"
type="file"
accept=".zip,application/zip"
class="text-sm"
@change="onFileChange"
/>
<p class="text-xs text-muted-foreground">
Every supported file inside becomes a source. Unsupported files
(images, video) and macOS metadata are skipped automatically.
</p>
</div>

<p v-if="errorMessage" class="text-xs text-destructive">
{{ errorMessage }}
</p>
<p v-if="successMessage" class="text-xs text-emerald-600">
{{ successMessage }}
</p>

<div class="flex items-center gap-2">
<Button type="submit" :disabled="submitting">
{{ submitting ? 'Uploading…' : 'Upload' }}
</Button>
<Button type="button" variant="ghost" @click="cancel">Close</Button>
</div>
</form>
</div>
</template>
5 changes: 5 additions & 0 deletions admin/slices/reins/components/knowledgeSources/Provider.vue
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ async function onAdded() {
@added="onAdded"
/>

<KnowledgeSourcesAddFromArchiveForm
:knowledge-id="(route.params.id as string)"
@added="onAdded"
/>

<div v-if="loading" class="text-sm text-muted-foreground">Loading…</div>

<div v-else-if="sources.length" class="rounded-md border bg-card">
Expand Down
46 changes: 37 additions & 9 deletions admin/slices/reins/stores/knowledge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ function isSitemapResult(
return typeof obj.added === 'number' && typeof obj.discovered === 'number';
}

function isArchiveResult(
value: unknown,
): value is { detected: number; started: boolean } {
if (typeof value !== 'object' || value === null) return false;
const obj = value as Record<string, unknown>;
return typeof obj.detected === 'number' && typeof obj.started === 'boolean';
}

export type IndexStatus = 'idle' | 'indexing' | 'ready' | 'failed';
export type SourceType = 'file' | 'url' | 'text';

Expand Down Expand Up @@ -229,11 +237,30 @@ export const useKnowledgeStore = defineStore('reins-knowledge', () => {
form.append('type', 'file');
form.append('name', file.name);
form.append('file', file);
const res = await $fetch<unknown>(`/api/knowledges/${id}/sources`, {
method: 'POST',
body: form,
});
return unwrap<ISource>(res);
// Multipart can't go through the generated SDK, so post on its axios
// instance directly: that reuses the configured apiUrl base and the
// Bearer-token interceptor (setup/api apiBaseUrl.ts). A bare '/api/...'
// URL would instead hit the admin origin with no auth.
const res = await apiClient.instance.post<unknown>(
`/knowledges/${id}/sources`,
form,
);
return unwrap<ISource>(res.data);
}

async function addSourcesFromArchive(
id: string,
file: File,
): Promise<{ detected: number; started: boolean }> {
const form = new FormData();
form.append('file', file);
const res = await apiClient.instance.post<unknown>(
`/knowledges/${id}/sources/from-archive`,
form,
);
const data = unwrap<unknown>(res.data);
if (isArchiveResult(data)) return data;
return { detected: 0, started: false };
}

async function addSourcesFromSitemap(
Expand All @@ -243,11 +270,11 @@ export const useKnowledgeStore = defineStore('reins-knowledge', () => {
): Promise<{ added: number; discovered: number }> {
const body: { sitemapUrl: string; urlPrefix?: string } = { sitemapUrl };
if (urlPrefix) body.urlPrefix = urlPrefix;
const res = await $fetch<unknown>(
`/api/knowledges/${id}/sources/from-sitemap`,
{ method: 'POST', body },
const res = await apiClient.instance.post<unknown>(
`/knowledges/${id}/sources/from-sitemap`,
body,
);
const data = unwrap<unknown>(res);
const data = unwrap<unknown>(res.data);
if (isSitemapResult(data)) return data;
return { added: 0, discovered: 0 };
}
Expand Down Expand Up @@ -297,6 +324,7 @@ export const useKnowledgeStore = defineStore('reins-knowledge', () => {
addUrlSource,
addFileSource,
addSourcesFromSitemap,
addSourcesFromArchive,
removeSource,
getGraphLabels,
getGraph,
Expand Down
10 changes: 7 additions & 3 deletions api/src/slices/reins/lightrag/data/lightragHttp.client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,17 @@ export class LightragHttpClient extends ILightragClient {
new Blob([new Uint8Array(input.content)], { type: input.mimeType }),
input.filename,
);
const res = await this.fetchImpl(`${cfg.baseUrl}/documents/file`, {
// LightRAG renamed /documents/file -> /documents/upload (the old path
// now 404s, same drift that killed /documents/url). Upload saves the
// file to the input dir and processes it in the background, returning a
// track_id like the text endpoints.
const res = await this.fetchImpl(`${cfg.baseUrl}/documents/upload`, {
method: 'POST',
headers: this.headers(cfg.apiKey),
body: form,
});
await this.ensureOk(res, '/documents/file');
return this.extractDocId(res, '/documents/file');
await this.ensureOk(res, '/documents/upload');
return this.extractDocId(res, '/documents/upload');
}

async query(input: IQueryInput): Promise<IQueryResult> {
Expand Down
111 changes: 111 additions & 0 deletions api/src/slices/reins/source/data/archive.extractor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// @scope:api
// @slice:reins/source
// @layer:data
// @type:utility

import * as path from 'path';
import * as unzipper from 'unzipper';
import { Readable } from 'stream';

export interface ArchiveEntry {
path: string;
size: number;
openStream: () => Readable;
}

/**
* Reads the central directory of a zip (cheap, no extraction) and returns
* the entries worth ingesting. openStream is lazy - the actual bytes are
* only read when the caller pipes the stream, so memory stays flat while
* iterating a 500MB archive. The zip file on disk must outlive all
* openStream calls.
*/
export async function listIngestableEntries(
zipPath: string,
): Promise<ArchiveEntry[]> {
const directory = await unzipper.Open.file(zipPath);
return directory.files
.filter(
(f) => f.type === 'File' && isIngestableEntry(f.path, f.uncompressedSize),
)
.map((f) => ({
path: f.path,
size: f.uncompressedSize,
openStream: (): Readable => f.stream(),
}));
}

// Extensions we can extract usable text from downstream (LightRAG ingest).
// Everything else in an archive (images, video, fonts, audio) is skipped.
const SUPPORTED_EXTENSIONS = new Set([
'.pdf',
'.docx',
'.doc',
'.pptx',
'.ppt',
'.xlsx',
'.xls',
'.txt',
'.md',
'.markdown',
'.html',
'.htm',
'.xml',
'.csv',
'.rtf',
'.json',
]);

const CONTENT_TYPE_BY_EXT: Record<string, string> = {
'.pdf': 'application/pdf',
'.docx':
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.doc': 'application/msword',
'.pptx':
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.ppt': 'application/vnd.ms-powerpoint',
'.xlsx':
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.xls': 'application/vnd.ms-excel',
'.txt': 'text/plain',
'.md': 'text/markdown',
'.markdown': 'text/markdown',
'.html': 'text/html',
'.htm': 'text/html',
'.xml': 'application/xml',
'.csv': 'text/csv',
'.rtf': 'application/rtf',
'.json': 'application/json',
};

/**
* Decide whether an archive entry should become a knowledge source.
* Filters out macOS resource forks, directories, dot-files, and any
* extension we can't extract text from. `size` is the uncompressed size;
* zero-byte entries are skipped (usually directory placeholders).
*/
export function isIngestableEntry(entryPath: string, size: number): boolean {
if (size <= 0) return false;
if (entryPath.endsWith('/')) return false;

const base = path.basename(entryPath);
if (entryPath.includes('__MACOSX/')) return false;
if (base === '.DS_Store') return false;
if (base.startsWith('._')) return false;

const ext = path.extname(base).toLowerCase();
return SUPPORTED_EXTENSIONS.has(ext);
}

export function contentTypeForEntry(entryPath: string): string {
const ext = path.extname(entryPath).toLowerCase();
return CONTENT_TYPE_BY_EXT[ext] ?? 'application/octet-stream';
}

// Archive entries keep their folder structure (e.g. "Content/foo.pdf").
// Source names use just the basename for readability, but two files with the
// same basename in different folders would collide on dedup - so callers
// dedup on the full entry path, not this display name.
export function displayNameForEntry(entryPath: string): string {
return path.basename(entryPath);
}
28 changes: 28 additions & 0 deletions api/src/slices/reins/source/data/source.gateway.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import {
ISourceData,
ICreateSourceData,
IUploadSourceFileInput,
IUploadSourceStreamInput,
IUploadedSourceFile,
} from '../domain/source.types';
import { SourceMapper } from './source.mapper';
Expand Down Expand Up @@ -113,6 +114,33 @@ export class SourceGateway extends ISourceGateway {
return { url: stored.uri };
}

async uploadFileStream(
input: IUploadSourceStreamInput,
): Promise<IUploadedSourceFile> {
const bucket = await this.requireBucket();
const key = `${input.knowledgeId}/${crypto.randomUUID()}-${input.filename}`;
// S3Repository uploads buffers via PutObject; the custom/MinIO endpoint
// doesn't accept unbounded streaming bodies. Archive entries are
// processed one at a time, so materializing a single entry keeps peak
// memory bounded to that one file - the same profile as uploadFile.
const chunks: Buffer[] = [];
for await (const chunk of input.body) {
if (!(chunk instanceof Uint8Array)) {
throw new BadRequestException(
'archive entry stream emitted a non-binary chunk',
);
}
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
}
const stored = await this.s3.upload({
bucket,
key,
body: Buffer.concat(chunks),
contentType: input.contentType,
});
return { url: stored.uri };
}

async deleteFile(url: string): Promise<void> {
const location = S3Repository.parseUri(url);
await this.s3.delete(location);
Expand Down
4 changes: 4 additions & 0 deletions api/src/slices/reins/source/domain/source.gateway.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {
ISourceData,
ICreateSourceData,
IUploadSourceFileInput,
IUploadSourceStreamInput,
IUploadedSourceFile,
} from './source.types';

Expand All @@ -15,6 +16,9 @@ export abstract class ISourceGateway {
abstract uploadFile(
input: IUploadSourceFileInput,
): Promise<IUploadedSourceFile>;
abstract uploadFileStream(
input: IUploadSourceStreamInput,
): Promise<IUploadedSourceFile>;
abstract deleteFile(url: string): Promise<void>;

abstract indexSource(source: ISourceData): Promise<void>;
Expand Down
Loading
Loading