Jelajahi Sumber

feat: support config chunk length by env (#12925)

Joel 3 bulan lalu
induk
melakukan
e09f6e4987

+ 2 - 1
docker/docker-compose-template.yaml

@@ -1,4 +1,4 @@
-x-shared-env: &shared-api-worker-env
+x-shared-env: &shared-api-worker-env 
 services:
   # API service
   api:
@@ -57,6 +57,7 @@ services:
       TEXT_GENERATION_TIMEOUT_MS: ${TEXT_GENERATION_TIMEOUT_MS:-60000}
       CSP_WHITELIST: ${CSP_WHITELIST:-}
       TOP_K_MAX_VALUE: ${TOP_K_MAX_VALUE:-}
+      INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-}
 
   # The postgres database.
   db:

+ 1 - 0
docker/docker-compose.yaml

@@ -448,6 +448,7 @@ services:
       TEXT_GENERATION_TIMEOUT_MS: ${TEXT_GENERATION_TIMEOUT_MS:-60000}
       CSP_WHITELIST: ${CSP_WHITELIST:-}
       TOP_K_MAX_VALUE: ${TOP_K_MAX_VALUE:-}
+      INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-}
 
   # The postgres database.
   db:

+ 3 - 0
web/.env.example

@@ -28,3 +28,6 @@ NEXT_PUBLIC_CSP_WHITELIST=
 
 # The maximum top-k value allowed for RAG.
 NEXT_PUBLIC_TOP_K_MAX_VALUE=10
+
+# Upper limit, in tokens, for the maximum chunk length used in segmentation (the web UI falls back to 4000 when unset).
+NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=4000

+ 5 - 4
web/app/components/datasets/create/step-two/index.tsx

@@ -98,6 +98,7 @@ export enum IndexingType {
 const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
 const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
 const DEFAULT_OVERLAP = 50
+const MAXIMUM_CHUNK_TOKEN_LENGTH = parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)
 
 type ParentChildConfig = {
   chunkForContext: ParentMode
@@ -163,7 +164,7 @@ const StepTwo = ({
     doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
   }, [])
   const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
-  const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000)
+  const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
   const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
   const [rules, setRules] = useState<PreProcessingRule[]>([])
   const [defaultConfig, setDefaultConfig] = useState<Rules>()
@@ -342,8 +343,8 @@ const StepTwo = ({
   }
 
   const updatePreview = () => {
-    if (segmentationType === ProcessMode.general && maxChunkLength > 4000) {
-      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
+    if (segmentationType === ProcessMode.general && maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) {
+      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) })
       return
     }
     fetchEstimate()
@@ -393,7 +394,7 @@ const StepTwo = ({
       score_threshold_enabled: false,
       score_threshold: 0.5,
     })
-  // eslint-disable-next-line react-hooks/exhaustive-deps
+    // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [rerankDefaultModel, isRerankDefaultModelValid])
 
   const getCreationParams = () => {

+ 4 - 2
web/app/components/datasets/create/step-two/inputs.tsx

@@ -39,6 +39,8 @@ export const DelimiterInput: FC<InputProps & { tooltip?: string }> = (props) =>
 }
 
 export const MaxLengthInput: FC<InputNumberProps> = (props) => {
+  const maxValue = parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)
+
   const { t } = useTranslation()
   return <FormField label={<div className='system-sm-semibold mb-1'>
     {t('datasetCreation.stepTwo.maxLength')}
@@ -46,8 +48,8 @@ export const MaxLengthInput: FC<InputNumberProps> = (props) => {
     <InputNumber
       type="number"
       className='h-9'
-      placeholder={'≤ 4000'}
-      max={4000}
+      placeholder={`≤ ${maxValue}`}
+      max={maxValue}
       min={1}
       {...props}
     />

+ 1 - 0
web/app/layout.tsx

@@ -45,6 +45,7 @@ const LocaleLayout = ({
         data-public-site-about={process.env.NEXT_PUBLIC_SITE_ABOUT}
         data-public-text-generation-timeout-ms={process.env.NEXT_PUBLIC_TEXT_GENERATION_TIMEOUT_MS}
         data-public-top-k-max-value={process.env.NEXT_PUBLIC_TOP_K_MAX_VALUE}
+        data-public-indexing-max-segmentation-tokens-length={process.env.NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}
       >
         <BrowserInitor>
           <SentryInitor>

+ 1 - 0
web/docker/entrypoint.sh

@@ -24,5 +24,6 @@ export NEXT_TELEMETRY_DISABLED=${NEXT_TELEMETRY_DISABLED}
 export NEXT_PUBLIC_TEXT_GENERATION_TIMEOUT_MS=${TEXT_GENERATION_TIMEOUT_MS}
 export NEXT_PUBLIC_CSP_WHITELIST=${CSP_WHITELIST}
 export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE}
+export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}
 
 pm2 start ./pm2.json --no-daemon