Sfoglia il codice sorgente

Fix: add check for maximum chunk length (#9837)

KVOJJJin 6 mesi fa
parent
commit
7a0d0d9b96

+ 9 - 0
web/app/components/datasets/create/step-two/index.tsx

@@ -212,6 +212,10 @@ const StepTwo = ({
   }
 
   const confirmChangeCustomConfig = () => {
+    if (segmentationType === SegmentType.CUSTOM && max > 4000) {
+      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
+      return
+    }
     setCustomFileIndexingEstimate(null)
     setShowPreview()
     fetchFileIndexingEstimate()
@@ -339,6 +343,10 @@ const StepTwo = ({
       Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
       return
     }
+    if (segmentationType === SegmentType.CUSTOM && max > 4000) {
+      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
+      return
+    }
     if (isSetting) {
       params = {
         original_document_id: documentDetail?.id,
@@ -663,6 +671,7 @@ const StepTwo = ({
                         className='h-9'
                         placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
                         value={max}
+                        max={4000}
                         min={1}
                         onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
                       />

+ 1 - 0
web/i18n/en-US/dataset-creation.ts

@@ -103,6 +103,7 @@ const translation = {
     separatorTip: 'A delimiter is the character used to separate text. \\n\\n and \\n are commonly used delimiters for separating paragraphs and lines. Combined with commas (\\n\\n,\\n), paragraphs will be segmented by lines when exceeding the maximum chunk length. You can also use special delimiters defined by yourself (e.g. ***).',
     separatorPlaceholder: '\\n\\n for separating paragraphs; \\n for separating lines',
     maxLength: 'Maximum chunk length',
+    maxLengthCheck: 'Maximum chunk length should be less than 4000',
     overlap: 'Chunk overlap',
     overlapTip: 'Setting the chunk overlap can maintain the semantic relevance between them, enhancing the retrieve effect. It is recommended to set 10%-25% of the maximum chunk size.',
     overlapCheck: 'chunk overlap should not bigger than maximum chunk length',

+ 1 - 0
web/i18n/zh-Hans/dataset-creation.ts

@@ -103,6 +103,7 @@ const translation = {
     separatorTip: '分隔符是用于分隔文本的字符。\\n\\n 和 \\n 是常用于分隔段落和行的分隔符。用逗号连接分隔符(\\n\\n,\\n),当段落超过最大块长度时,会按行进行分割。你也可以使用自定义的特殊分隔符(例如 ***)。',
     separatorPlaceholder: '\\n\\n 用于分段;\\n 用于分行',
     maxLength: '分段最大长度',
+    maxLengthCheck: '分段最大长度不能大于 4000',
     overlap: '分段重叠长度',
     overlapTip: '设置分段之间的重叠长度可以保留分段之间的语义关系,提升召回效果。建议设置为最大分段长度的10%-25%',
     overlapCheck: '分段重叠长度不能大于分段最大长度',