diff --git a/apps/sim/tools/knowledge/create_document.ts b/apps/sim/tools/knowledge/create_document.ts index 5b82d504a7..e209a0e9bd 100644 --- a/apps/sim/tools/knowledge/create_document.ts +++ b/apps/sim/tools/knowledge/create_document.ts @@ -1,4 +1,7 @@ -import type { KnowledgeCreateDocumentResponse } from '@/tools/knowledge/types' +import { + inferDocumentFileInfo, + type KnowledgeCreateDocumentResponse, +} from '@/tools/knowledge/types' import { enrichKBTagsSchema } from '@/tools/schema-enrichers' import { formatDocumentTagsForAPI, parseDocumentTags } from '@/tools/shared/tags' import type { ToolConfig } from '@/tools/types' @@ -63,30 +66,36 @@ export const knowledgeCreateDocumentTool: ToolConfig 1000000) { + const utf8Bytes = new TextEncoder().encode(textContent) + const contentBytes = utf8Bytes.length + + if (contentBytes > 1_000_000) { throw new Error('Document content exceeds maximum size of 1MB') } - const contentBytes = new TextEncoder().encode(textContent).length - - const utf8Bytes = new TextEncoder().encode(textContent) - const base64Content = - typeof Buffer !== 'undefined' - ? Buffer.from(textContent, 'utf8').toString('base64') - : btoa(String.fromCharCode(...utf8Bytes)) + let base64Content: string + if (typeof Buffer !== 'undefined') { + base64Content = Buffer.from(textContent, 'utf8').toString('base64') + } else { + let binary = '' + for (let i = 0; i < utf8Bytes.length; i++) { + binary += String.fromCharCode(utf8Bytes[i]) + } + base64Content = btoa(binary) + } - const dataUri = `data:text/plain;base64,${base64Content}` + const { filename, mimeType } = inferDocumentFileInfo(documentName) + const dataUri = `data:${mimeType};base64,${base64Content}` - // Parse document tags from various formats (object, array, JSON string) const parsedTags = parseDocumentTags(params.documentTags) const tagData = formatDocumentTagsForAPI(parsedTags) const documents = [ { - filename: documentName.endsWith('.txt') ? documentName : `${documentName}.txt`, + filename, fileUrl: dataUri, fileSize: contentBytes, - mimeType: 'text/plain', + mimeType, ...tagData, }, ] diff --git a/apps/sim/tools/knowledge/types.ts b/apps/sim/tools/knowledge/types.ts index 49fb6d8c33..3fa87ccaad 100644 --- a/apps/sim/tools/knowledge/types.ts +++ b/apps/sim/tools/knowledge/types.ts @@ -1,3 +1,38 @@ +import { + getFileExtension, + getMimeTypeFromExtension as getUploadMimeType, +} from '@/lib/uploads/utils/file-utils' + +const TEXT_COMPATIBLE_MIME_TYPES = new Set([ + 'text/plain', + 'text/html', + 'text/markdown', + 'text/csv', + 'application/json', + 'application/xml', + 'application/x-yaml', +]) + +/** + * Extracts extension from a filename and returns the normalized filename and MIME type. + * If the extension maps to a recognized text-compatible MIME type, it is preserved. + * Otherwise, the filename is normalized to `.txt` with `text/plain`. + */ +export function inferDocumentFileInfo(documentName: string): { + filename: string + mimeType: string +} { + const ext = getFileExtension(documentName) + if (ext) { + const mimeType = getUploadMimeType(ext) + if (TEXT_COMPATIBLE_MIME_TYPES.has(mimeType)) { + return { filename: documentName, mimeType } + } + } + const base = ext ? documentName.slice(0, documentName.lastIndexOf('.')) : documentName + return { filename: `${base || documentName}.txt`, mimeType: 'text/plain' } +} + export interface KnowledgeSearchResult { documentId: string documentName: string diff --git a/apps/sim/tools/knowledge/upsert_document.ts b/apps/sim/tools/knowledge/upsert_document.ts index 0314350a0d..5d1155f78a 100644 --- a/apps/sim/tools/knowledge/upsert_document.ts +++ b/apps/sim/tools/knowledge/upsert_document.ts @@ -1,6 +1,7 @@ -import type { - KnowledgeUpsertDocumentParams, - KnowledgeUpsertDocumentResponse, +import { + inferDocumentFileInfo, + type KnowledgeUpsertDocumentParams, + type KnowledgeUpsertDocumentResponse, } from '@/tools/knowledge/types' import { enrichKBTagsSchema } from '@/tools/schema-enrichers' import { formatDocumentTagsForAPI, parseDocumentTags } from '@/tools/shared/tags' @@ -94,18 +95,17 @@ export const knowledgeUpsertDocumentTool: ToolConfig< base64Content = btoa(binary) } - const dataUri = `data:text/plain;base64,${base64Content}` + const { filename, mimeType } = inferDocumentFileInfo(documentName) + const dataUri = `data:${mimeType};base64,${base64Content}` const parsedTags = parseDocumentTags(params.documentTags) const tagData = formatDocumentTagsForAPI(parsedTags) - const filename = documentName.endsWith('.txt') ? documentName : `${documentName}.txt` - const requestBody: Record = { filename, fileUrl: dataUri, fileSize: contentBytes, - mimeType: 'text/plain', + mimeType, ...tagData, processingOptions: { chunkSize: 1024,