simstudioai
diff --git a/apps/sim/app/api/knowledge/route.ts b/apps/sim/app/api/knowledge/route.ts
Lines changed: 30 additions & 12 deletions
diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx
Lines changed: 2 additions & 1 deletion
diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx
Lines changed: 117 additions & 6 deletions
diff --git a/apps/sim/hooks/queries/kb/knowledge.ts b/apps/sim/hooks/queries/kb/knowledge.ts
Lines changed: 4 additions & 8 deletions
@@ -15,14 +15,6 @@ import { captureServerEvent } from '@/lib/posthog/server'
 
 const logger = createLogger('KnowledgeBaseAPI')
 
-/**
- * Schema for creating a knowledge base
- *
- * Chunking config units:
- * - maxSize: tokens (1 token ≈ 4 characters)
- * - minSize: characters
- * - overlap: tokens (1 token ≈ 4 characters)
- */
 const CreateKnowledgeBaseSchema = z.object({
   name: z.string().min(1, 'Name is required'),
   description: z.string().optional(),
@@ -31,12 +23,20 @@ const CreateKnowledgeBaseSchema = z.object({
   embeddingDimension: z.literal(1536).default(1536),
   chunkingConfig: z
     .object({
-      /** Maximum chunk size in tokens (1 token ≈ 4 characters) */
       maxSize: z.number().min(100).max(4000).default(1024),
-      /** Minimum chunk size in characters */
       minSize: z.number().min(1).max(2000).default(100),
-      /** Overlap between chunks in tokens (1 token ≈ 4 characters) */
       overlap: z.number().min(0).max(500).default(200),
+      strategy: z
+        .enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token'])
+        .default('auto')
+        .optional(),
+      strategyOptions: z
+        .object({
+          pattern: z.string().max(500).optional(),
+          separators: z.array(z.string()).optional(),
+          recipe: z.enum(['plain', 'markdown', 'code']).optional(),
+        })
+        .optional(),
     })
     .default({
       maxSize: 1024,
@@ -45,13 +45,31 @@ const CreateKnowledgeBaseSchema = z.object({
     })
     .refine(
       (data) => {
-        // Convert maxSize from tokens to characters for comparison (1 token ≈ 4 chars)
         const maxSizeInChars = data.maxSize * 4
         return data.minSize < maxSizeInChars
       },
       {
         message: 'Min chunk size (characters) must be less than max chunk size (tokens × 4)',
       }
+    )
+    .refine(
+      (data) => {
+        return data.overlap < data.maxSize
+      },
+      {
+        message: 'Overlap must be less than max chunk size',
+      }
+    )
+    .refine(
+      (data) => {
+        if (data.strategy === 'regex' && !data.strategyOptions?.pattern) {
+          return false
+        }
+        return true
+      },
+      {
+        message: 'Regex pattern is required when using the regex chunking strategy',
+      }
     ),
 })
 
 
@@ -263,7 +263,8 @@ export function AddDocumentsModal({
                       {isDragging ? 'Drop files here' : 'Drop files here or click to browse'}
                     </span>
                     <span className='text-[var(--text-tertiary)] text-xs'>
-                      PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML (max 100MB each)
+                      PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSONL (max 100MB
+                      each)
                     </span>
                   </div>
                 </Button>
 
@@ -9,6 +9,8 @@ import { useForm } from 'react-hook-form'
 import { z } from 'zod'
 import {
   Button,
+  Combobox,
+  type ComboboxOption,
   Input,
   Label,
   Modal,
@@ -18,6 +20,7 @@ import {
   ModalHeader,
   Textarea,
 } from '@/components/emcn'
+import type { StrategyOptions } from '@/lib/chunkers/types'
 import { cn } from '@/lib/core/utils/cn'
 import { formatFileSize, validateKnowledgeBaseFile } from '@/lib/uploads/utils/file-utils'
 import { ACCEPT_ATTRIBUTE } from '@/lib/uploads/utils/validation'
@@ -35,6 +38,20 @@ interface CreateBaseModalProps {
   onOpenChange: (open: boolean) => void
 }
 
+/** Chunking strategies selectable in the create-base form, in declaration order. */
+const STRATEGY_OPTIONS = [
+  { value: 'auto', label: 'Auto (detect from content)' },
+  { value: 'text', label: 'Text (word boundary splitting)' },
+  { value: 'recursive', label: 'Recursive (configurable separators)' },
+  { value: 'sentence', label: 'Sentence' },
+  { value: 'token', label: 'Token (fixed-size)' },
+  { value: 'regex', label: 'Regex (custom pattern)' },
+] as const
+
+/** Entry-by-entry copy of `STRATEGY_OPTIONS`, typed as the `ComboboxOption[]` handed to the Combobox. */
+const STRATEGY_COMBOBOX_OPTIONS: ComboboxOption[] = STRATEGY_OPTIONS.map(({ value, label }) => {
+  return { label, value }
+})
+
 const FormSchema = z
   .object({
     name: z
@@ -43,25 +60,24 @@ const FormSchema = z
       .max(100, 'Name must be less than 100 characters')
       .refine((value) => value.trim().length > 0, 'Name cannot be empty'),
     description: z.string().max(500, 'Description must be less than 500 characters').optional(),
-    /** Minimum chunk size in characters */
     minChunkSize: z
       .number()
       .min(1, 'Min chunk size must be at least 1 character')
       .max(2000, 'Min chunk size must be less than 2000 characters'),
-    /** Maximum chunk size in tokens (1 token ≈ 4 characters) */
     maxChunkSize: z
       .number()
       .min(100, 'Max chunk size must be at least 100 tokens')
       .max(4000, 'Max chunk size must be less than 4000 tokens'),
-    /** Overlap between chunks in tokens */
     overlapSize: z
       .number()
       .min(0, 'Overlap must be non-negative')
       .max(500, 'Overlap must be less than 500 tokens'),
+    strategy: z.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token']).default('auto'),
+    regexPattern: z.string().optional(),
+    customSeparators: z.string().optional(),
   })
   .refine(
     (data) => {
-      // Convert maxChunkSize from tokens to characters for comparison (1 token ≈ 4 chars)
       const maxChunkSizeInChars = data.maxChunkSize * 4
       return data.minChunkSize < maxChunkSizeInChars
     },
@@ -70,6 +86,27 @@ const FormSchema = z
       path: ['minChunkSize'],
     }
   )
+  .refine(
+    (data) => {
+      return data.overlapSize < data.maxChunkSize
+    },
+    {
+      message: 'Overlap must be less than max chunk size',
+      path: ['overlapSize'],
+    }
+  )
+  .refine(
+    (data) => {
+      if (data.strategy === 'regex' && !data.regexPattern?.trim()) {
+        return false
+      }
+      return true
+    },
+    {
+      message: 'Regex pattern is required when using the regex strategy',
+      path: ['regexPattern'],
+    }
+  )
 
 type FormValues = z.infer<typeof FormSchema>
 
@@ -124,6 +161,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({
     handleSubmit,
     reset,
     watch,
+    setValue,
     formState: { errors },
   } = useForm<FormValues>({
     resolver: zodResolver(FormSchema),
@@ -133,11 +171,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({
       minChunkSize: 100,
       maxChunkSize: 1024,
       overlapSize: 200,
+      strategy: 'auto',
+      regexPattern: '',
+      customSeparators: '',
     },
     mode: 'onSubmit',
   })
 
   const nameValue = watch('name')
+  const strategyValue = watch('strategy')
 
   useEffect(() => {
     if (open) {
@@ -153,6 +195,9 @@ export const CreateBaseModal = memo(function CreateBaseModal({
         minChunkSize: 100,
         maxChunkSize: 1024,
         overlapSize: 200,
+        strategy: 'auto',
+        regexPattern: '',
+        customSeparators: '',
       })
     }
   }, [open, reset])
@@ -255,6 +300,17 @@ export const CreateBaseModal = memo(function CreateBaseModal({
     setSubmitStatus(null)
 
     try {
+      // Strategy-specific options for the create request. Only `regex` (pattern) and
+      // `recursive` (separators) carry options; every other strategy sends none.
+      let strategyOptions: StrategyOptions | undefined
+      if (data.strategy === 'regex' && data.regexPattern) {
+        strategyOptions = { pattern: data.regexPattern }
+      } else if (data.strategy === 'recursive' && data.customSeparators?.trim()) {
+        // Entries are delimited by a comma followed by at most one space. All other
+        // whitespace is kept as typed so separators such as '. ' or ' ' can be expressed;
+        // trimming every entry turned them into '.' and ''. Empty entries left behind by
+        // stray or trailing commas are dropped, and the literal escapes \n and \t are
+        // expanded to a real newline and tab.
+        const separators = data.customSeparators
+          .split(/, ?/)
+          .filter((entry) => entry.length > 0)
+          .map((entry) => entry.replace(/\\n/g, '\n').replace(/\\t/g, '\t'))
+        // With nothing usable left (e.g. the input was only commas), send no options.
+        if (separators.length > 0) {
+          strategyOptions = { separators }
+        }
+      }
+
       const newKnowledgeBase = await createKnowledgeBaseMutation.mutateAsync({
         name: data.name,
         description: data.description || undefined,
@@ -263,6 +319,8 @@ export const CreateBaseModal = memo(function CreateBaseModal({
           maxSize: data.maxChunkSize,
           minSize: data.minChunkSize,
           overlap: data.overlapSize,
+          ...(data.strategy !== 'auto' && { strategy: data.strategy }),
+          ...(strategyOptions && { strategyOptions }),
         },
       })
 
@@ -312,7 +370,6 @@ export const CreateBaseModal = memo(function CreateBaseModal({
               <div className='space-y-3'>
                 <div className='flex flex-col gap-2'>
                   <Label htmlFor='kb-name'>Name</Label>
-                  {/* Hidden decoy fields to prevent browser autofill */}
                   <input
                     type='text'
                     name='fakeusernameremembered'
@@ -403,6 +460,59 @@ export const CreateBaseModal = memo(function CreateBaseModal({
                   </p>
                 </div>
 
+                <div className='flex flex-col gap-2'>
+                  <Label>Chunking Strategy</Label>
+                  <Combobox
+                    options={STRATEGY_COMBOBOX_OPTIONS}
+                    value={strategyValue}
+                    onChange={(value) => setValue('strategy', value as FormValues['strategy'])}
+                    dropdownWidth='trigger'
+                    align='start'
+                  />
+                  <p className='text-[var(--text-muted)] text-xs'>
+                    Auto detects the best strategy based on file content type.
+                  </p>
+                </div>
+
+                {/* Regex pattern input, rendered only while the regex strategy is selected.
+                    JSX string attributes do not process backslash escapes, so the placeholder
+                    uses single backslashes to display a pattern users can copy as-is. */}
+                {strategyValue === 'regex' && (
+                  <div className='flex flex-col gap-2'>
+                    <Label htmlFor='regexPattern'>Regex Pattern</Label>
+                    <Input
+                      id='regexPattern'
+                      placeholder='e.g. \n\n or (?<=\})\s*(?=\{)'
+                      {...register('regexPattern')}
+                      className={cn(errors.regexPattern && 'border-[var(--text-error)]')}
+                      autoComplete='off'
+                      data-form-type='other'
+                    />
+                    {errors.regexPattern && (
+                      <p className='text-[var(--text-error)] text-xs'>
+                        {errors.regexPattern.message}
+                      </p>
+                    )}
+                    <p className='text-[var(--text-muted)] text-xs'>
+                      Text will be split at each match of this regex pattern.
+                    </p>
+                  </div>
+                )}
+
+                {strategyValue === 'recursive' && (
+                  <div className='flex flex-col gap-2'>
+                    <Label htmlFor='customSeparators'>Custom Separators (optional)</Label>
+                    <Input
+                      id='customSeparators'
+                      placeholder='e.g. \n\n, \n, . ,  '
+                      {...register('customSeparators')}
+                      autoComplete='off'
+                      data-form-type='other'
+                    />
+                    <p className='text-[var(--text-muted)] text-xs'>
+                      Comma-separated list of delimiters in priority order. Leave empty for default
+                      separators.
+                    </p>
+                  </div>
+                )}
+
                 <div className='flex flex-col gap-2'>
                   <Label>Upload Documents</Label>
                   <Button
@@ -431,7 +541,8 @@ export const CreateBaseModal = memo(function CreateBaseModal({
                         {isDragging ? 'Drop files here' : 'Drop files here or click to browse'}
                       </span>
                       <span className='text-[var(--text-tertiary)] text-xs'>
-                        PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML (max 100MB each)
+                        PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSONL (max 100MB
+                        each)
                       </span>
                     </div>
                   </Button>
 
@@ -1,6 +1,7 @@
 import { createLogger } from '@sim/logger'
 import { keepPreviousData, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
 import { toast } from '@/components/emcn'
+import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types'
 import type {
   ChunkData,
   ChunksPagination,
@@ -338,10 +339,7 @@ export interface DocumentChunkSearchParams {
   search: string
 }
 
-/**
- * Fetches all chunks matching a search query by paginating through results.
- * This is used for search functionality where we need all matching chunks.
- */
+/** Paginates through all matching chunks rather than returning a single page. */
 export async function fetchAllDocumentChunks(
   { knowledgeBaseId, documentId, search }: DocumentChunkSearchParams,
   signal?: AbortSignal
@@ -376,10 +374,6 @@ export const serializeSearchParams = (params: DocumentChunkSearchParams) =>
     search: params.search,
   })
 
-/**
- * Hook to search for chunks in a document.
- * Fetches all matching chunks and returns them for client-side pagination.
- */
 export function useDocumentChunkSearchQuery(
   params: DocumentChunkSearchParams,
   options?: {
@@ -707,6 +701,8 @@ export interface CreateKnowledgeBaseParams {
     maxSize: number
     minSize: number
     overlap: number
+    strategy?: ChunkingStrategy
+    strategyOptions?: StrategyOptions
   }
 }