před 1 rokem · 89fcf4ea7c
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -562,7 +562,7 @@ class IndexingRunner:
 
				 
			
 
				             character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
			
 
				                 chunk_size=segmentation["max_tokens"],
			
 
				-                chunk_overlap=0,
			
 
				+                chunk_overlap=segmentation.get('chunk_overlap', 0),
			
 
				                 fixed_separator=separator,
			
 
				                 separators=["\n\n", "。", ".", " ", ""],
			
 
				                 embedding_model_instance=embedding_model_instance
			
@@ -571,7 +571,7 @@ class IndexingRunner:
 
				             # Automatic segmentation
			
 
				             character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
			
 
				                 chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
			
 
				-                chunk_overlap=0,
			
 
				+                chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
			
 
				                 separators=["\n\n", "。", ".", " ", ""],
			
 
				                 embedding_model_instance=embedding_model_instance
			
 
				             )
			
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@@ -134,7 +134,8 @@ class DatasetProcessRule(db.Model):
 
				         ],
			
 
				         'segmentation': {
			
 
				             'delimiter': '\n',
			
 
				-            'max_tokens': 1000
			
 
				+            'max_tokens': 500,
			
 
				+            'chunk_overlap': 50
			
 
				         }
			
 
				     }
			
 
				 
			
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -241,7 +241,8 @@ class DocumentService:
 
				             ],
			
 
				             'segmentation': {
			
 
				                 'delimiter': '\n',
			
 
				-                'max_tokens': 500
			
 
				+                'max_tokens': 500,
			
 
				+                'chunk_overlap': 50
			
 
				             }
			
 
				         }
			
 
				     }
			
--- a/web/app/components/datasets/create/step-two/index.module.css
+++ b/web/app/components/datasets/create/step-two/index.module.css
@@ -18,7 +18,7 @@
 
				 }
			
 
				 
			
 
				 .form .label {
			
 
				-  @apply pt-6 pb-2;
			
 
				+  @apply pt-6 pb-2 flex items-center;
			
 
				   font-weight: 500;
			
 
				   font-size: 16px;
			
 
				   line-height: 24px;
			
--- a/web/app/components/datasets/create/step-two/index.tsx
+++ b/web/app/components/datasets/create/step-two/index.tsx
@@ -33,13 +33,14 @@ import { DataSourceType, DocForm } from '@/models/datasets'
 
				 import NotionIcon from '@/app/components/base/notion-icon'
			
 
				 import Switch from '@/app/components/base/switch'
			
 
				 import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
			
 
				-import { XClose } from '@/app/components/base/icons/src/vender/line/general'
			
 
				+import { HelpCircle, XClose } from '@/app/components/base/icons/src/vender/line/general'
			
 
				 import { useDatasetDetailContext } from '@/context/dataset-detail'
			
 
				 import I18n from '@/context/i18n'
			
 
				 import { IS_CE_EDITION } from '@/config'
			
 
				 import { RETRIEVE_METHOD } from '@/types/app'
			
 
				 import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
			
 
				 import Tooltip from '@/app/components/base/tooltip'
			
 
				+import TooltipPlus from '@/app/components/base/tooltip-plus'
			
 
				 import { useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
			
 
				 import { LanguagesSupportedUnderscore, getModelRuntimeSupported } from '@/utils/language'
			
 
				 
			
@@ -99,7 +100,8 @@ const StepTwo = ({
 
				   const [previewScrolled, setPreviewScrolled] = useState(false)
			
 
				   const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
			
 
				   const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
			
 
				-  const [max, setMax] = useState(1000)
			
 
				+  const [max, setMax] = useState(500)
			
 
				+  const [overlap, setOverlap] = useState(50)
			
 
				   const [rules, setRules] = useState<PreProcessingRule[]>([])
			
 
				   const [defaultConfig, setDefaultConfig] = useState<Rules>()
			
 
				   const hasSetIndexType = !!indexingType
			
@@ -171,6 +173,7 @@ const StepTwo = ({
 
				     if (defaultConfig) {
			
 
				       setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n')
			
 
				       setMax(defaultConfig.segmentation.max_tokens)
			
 
				+      setOverlap(defaultConfig.segmentation.chunk_overlap)
			
 
				       setRules(defaultConfig.pre_processing_rules)
			
 
				     }
			
 
				   }
			
@@ -207,6 +210,7 @@ const StepTwo = ({
 
				         segmentation: {
			
 
				           separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier,
			
 
				           max_tokens: max,
			
 
				+          chunk_overlap: overlap,
			
 
				         },
			
 
				       }
			
 
				       processRule.rules = ruleObj
			
@@ -275,6 +279,10 @@ const StepTwo = ({
 
				   } = useModelListAndDefaultModelAndCurrentProviderAndModel(3)
			
 
				   const getCreationParams = () => {
			
 
				     let params
			
 
				+    if (segmentationType === SegmentType.CUSTOM && overlap > max) {
			
 
				+      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
			
 
				+      return
			
 
				+    }
			
 
				     if (isSetting) {
			
 
				       params = {
			
 
				         original_document_id: documentDetail?.id,
			
@@ -337,6 +345,7 @@ const StepTwo = ({
 
				       const separator = res.rules.segmentation.separator
			
 
				       setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
			
 
				       setMax(res.rules.segmentation.max_tokens)
			
 
				+      setOverlap(res.rules.segmentation.chunk_overlap)
			
 
				       setRules(res.rules.pre_processing_rules)
			
 
				       setDefaultConfig(res.rules)
			
 
				     }
			
@@ -350,8 +359,10 @@ const StepTwo = ({
 
				       const rules = documentDetail.dataset_process_rule.rules
			
 
				       const separator = rules.segmentation.separator
			
 
				       const max = rules.segmentation.max_tokens
			
 
				+      const overlap = rules.segmentation.chunk_overlap
			
 
				       setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
			
 
				       setMax(max)
			
 
				+      setOverlap(overlap)
			
 
				       setRules(rules.pre_processing_rules)
			
 
				       setDefaultConfig(rules)
			
 
				     }
			
@@ -569,13 +580,35 @@ const StepTwo = ({
 
				                       <input
			
 
				                         type="number"
			
 
				                         className={s.input}
			
 
				-                        placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
			
 
				+                        placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
			
 
				                         value={max}
			
 
				                         min={1}
			
 
				                         onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
			
 
				                       />
			
 
				                     </div>
			
 
				                   </div>
			
 
				+                  <div className={s.formRow}>
			
 
				+                    <div className='w-full'>
			
 
				+                      <div className={s.label}>
			
 
				+                        {t('datasetCreation.stepTwo.overlap')}
			
 
				+                        <TooltipPlus popupContent={
			
 
				+                          <div className='max-w-[200px]'>
			
 
				+                            {t('datasetCreation.stepTwo.overlapTip')}
			
 
				+                          </div>
			
 
				+                        }>
			
 
				+                          <HelpCircle className='ml-1 w-3.5 h-3.5 text-gray-400' />
			
 
				+                        </TooltipPlus>
			
 
				+                      </div>
			
 
				+                      <input
			
 
				+                        type="number"
			
 
				+                        className={s.input}
			
 
				+                        placeholder={t('datasetCreation.stepTwo.overlap') || ''}
			
 
				+                        value={overlap}
			
 
				+                        min={1}
			
 
				+                        onChange={e => setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))}
			
 
				+                      />
			
 
				+                    </div>
			
 
				+                  </div>
			
 
				                   <div className={s.formRow}>
			
 
				                     <div className='w-full flex flex-col gap-1'>
			
 
				                       <div className={s.label}>{t('datasetCreation.stepTwo.rules')}</div>
			
--- a/web/i18n/lang/dataset-creation.en.ts
+++ b/web/i18n/lang/dataset-creation.en.ts
@@ -59,6 +59,9 @@ const translation = {
 
				     separator: 'Segment identifier',
			
 
				     separatorPlaceholder: 'For example, newline (\\\\n) or special separator (such as "***")',
			
 
				     maxLength: 'Maximum chunk length',
			
 
				+    overlap: 'Chunk overlap',
			
 
				+    overlapTip: 'Setting the chunk overlap can maintain the semantic relevance between chunks, enhancing the retrieval effect. It is recommended to set it to 10%-25% of the maximum chunk size.',
			
 
				+    overlapCheck: 'chunk overlap should not be larger than the maximum chunk length',
			
 
				     rules: 'Text preprocessing rules',
			
 
				     removeExtraSpaces: 'Replace consecutive spaces, newlines and tabs',
			
 
				     removeUrlEmails: 'Delete all URLs and email addresses',
			
--- a/web/i18n/lang/dataset-creation.pt.ts
+++ b/web/i18n/lang/dataset-creation.pt.ts
@@ -59,6 +59,9 @@ const translation = {
 
				     separator: 'Identificador de segmento',
			
 
				     separatorPlaceholder: 'Por exemplo, nova linha (\\\\n) ou separador especial (como "***")',
			
 
				     maxLength: 'Comprimento máximo do fragmento',
			
 
				+    overlap: 'Sobreposição de blocos',
			
 
				+    overlapTip: 'Configurar a sobreposição de blocos pode manter a relevância semântica entre eles, melhorando o efeito de recuperação. É recomendado definir de 10% a 25% do tamanho máximo do bloco.',
			
 
				+    overlapCheck: 'a sobreposição de blocos não deve ser maior que o comprimento máximo do bloco',
			
 
				     rules: 'Regras de pré-processamento de texto',
			
 
				     removeExtraSpaces: 'Substituir espaços consecutivos, quebras de linha e tabulações',
			
 
				     removeUrlEmails: 'Excluir todos os URLs e endereços de e-mail',
			
--- a/web/i18n/lang/dataset-creation.zh.ts
+++ b/web/i18n/lang/dataset-creation.zh.ts
@@ -59,6 +59,9 @@ const translation = {
 
				     separator: '分段标识符',
			
 
				     separatorPlaceholder: '例如换行符（\n）或特定的分隔符（如 "***"）',
			
 
				     maxLength: '分段最大长度',
			
 
				+    overlap: '分段重叠长度',
			
 
				+    overlapTip: '设置分段之间的重叠长度可以保留分段之间的语义关系，提升召回效果。建议设置为最大分段长度的10%-25%',
			
 
				+    overlapCheck: '分段重叠长度不能大于分段最大长度',
			
 
				     rules: '文本预处理规则',
			
 
				     removeExtraSpaces: '替换掉连续的空格、换行符和制表符',
			
 
				     removeUrlEmails: '删除所有 URL 和电子邮件地址',
			
--- a/web/models/datasets.ts
+++ b/web/models/datasets.ts
@@ -108,6 +108,7 @@ export type PreProcessingRule = {
 
				 export type Segmentation = {
			
 
				   separator: string
			
 
				   max_tokens: number
			
 
				+  chunk_overlap: number
			
 
				 }
			
 
				 
			
 
				 export const DocumentIndexingStatusList = [