Browse Source

Feature/mutil embedding model (#908)

Co-authored-by: JzoNg <jzongcode@gmail.com>
Co-authored-by: jyong <jyong@dify.ai>
Co-authored-by: StyleZhang <jasonapring2015@outlook.com>
Jyong 1 year ago
parent
commit
db7156dafd
54 changed files with 1697 additions and 271 deletions
  1. api/controllers/console/datasets/datasets.py (+31 -5)
  2. api/controllers/console/datasets/datasets_document.py (+37 -6)
  3. api/controllers/console/datasets/datasets_segments.py (+168 -5)
  4. api/controllers/console/datasets/hit_testing.py (+6 -1)
  5. api/core/docstore/dataset_docstore.py (+9 -7)
  6. api/core/generator/llm_generator.py (+2 -2)
  7. api/core/index/index.py (+3 -1)
  8. api/core/indexing_runner.py (+79 -34)
  9. api/core/prompt/prompts.py (+2 -2)
  10. api/core/tool/dataset_retriever_tool.py (+11 -3)
  11. api/migrations/versions/2c8af9671032_add_qa_document_language.py (+32 -0)
  12. api/migrations/versions/e8883b0148c9_add_dataset_model_name.py (+34 -0)
  13. api/models/dataset.py (+5 -0)
  14. api/requirements.txt (+2 -1)
  15. api/services/dataset_service.py (+108 -59)
  16. api/services/hit_testing_service.py (+3 -1)
  17. api/services/vector_service.py (+69 -0)
  18. api/tasks/batch_create_segment_to_index_task.py (+95 -0)
  19. api/tasks/delete_segment_from_index_task.py (+58 -0)
  20. api/tasks/disable_segment_from_index_task.py (+4 -4)
  21. api/tasks/update_segment_keyword_index_task.py (+0 -11)
  22. web/app/(commonLayout)/datasets/DatasetCard.tsx (+21 -10)
  23. web/app/(commonLayout)/datasets/page.tsx (+0 -6)
  24. web/app/(commonLayout)/list.module.css (+8 -0)
  25. web/app/components/app/configuration/dataset-config/card-item/index.tsx (+16 -3)
  26. web/app/components/app/configuration/dataset-config/select-dataset/index.tsx (+15 -6)
  27. web/app/components/app/configuration/dataset-config/select-dataset/style.module.css (+5 -1)
  28. web/app/components/base/icons/assets/vender/line/general/dots-horizontal.svg (+4 -4)
  29. web/app/components/base/icons/src/vender/line/general/DotsHorizontal.json (+7 -7)
  30. web/app/components/base/popover/index.tsx (+13 -8)
  31. web/app/components/datasets/create/file-uploader/index.tsx (+1 -1)
  32. web/app/components/datasets/create/step-two/index.tsx (+44 -19)
  33. web/app/components/datasets/create/step-two/language-select/index.tsx (+38 -0)
  34. web/app/components/datasets/documents/detail/batch-modal/csv-downloader.tsx (+108 -0)
  35. web/app/components/datasets/documents/detail/batch-modal/csv-uploader.tsx (+126 -0)
  36. web/app/components/datasets/documents/detail/batch-modal/index.tsx (+65 -0)
  37. web/app/components/datasets/documents/detail/completed/InfiniteVirtualList.tsx (+7 -0)
  38. web/app/components/datasets/documents/detail/completed/SegmentCard.tsx (+41 -3)
  39. web/app/components/datasets/documents/detail/completed/index.tsx (+61 -32)
  40. web/app/components/datasets/documents/detail/completed/style.module.css (+21 -0)
  41. web/app/components/datasets/documents/detail/index.tsx (+65 -16)
  42. web/app/components/datasets/documents/detail/segment-add/index.tsx (+84 -0)
  43. web/app/components/datasets/documents/list.tsx (+71 -7)
  44. web/app/components/datasets/settings/form/index.tsx (+36 -1)
  45. web/i18n/lang/dataset-creation.en.ts (+1 -0)
  46. web/i18n/lang/dataset-creation.zh.ts (+1 -0)
  47. web/i18n/lang/dataset-documents.en.ts (+23 -1)
  48. web/i18n/lang/dataset-documents.zh.ts (+22 -0)
  49. web/i18n/lang/dataset-settings.en.ts (+3 -0)
  50. web/i18n/lang/dataset-settings.zh.ts (+3 -0)
  51. web/i18n/lang/dataset.en.ts (+2 -0)
  52. web/i18n/lang/dataset.zh.ts (+2 -0)
  53. web/models/datasets.ts (+9 -0)
  54. web/service/datasets.ts (+16 -4)

+ 31 - 5
api/controllers/console/datasets/datasets.py

@@ -10,13 +10,15 @@ from controllers.console.datasets.error import DatasetNameDuplicateError
 from controllers.console.setup import setup_required
 from controllers.console.wraps import account_initialization_required
 from core.indexing_runner import IndexingRunner
-from core.model_providers.error import LLMBadRequestError
+from core.model_providers.error import LLMBadRequestError, ProviderTokenNotInitError
 from core.model_providers.model_factory import ModelFactory
+from core.model_providers.models.entity.model_params import ModelType
 from libs.helper import TimestampField
 from extensions.ext_database import db
 from models.dataset import DocumentSegment, Document
 from models.model import UploadFile
 from services.dataset_service import DatasetService, DocumentService
+from services.provider_service import ProviderService
 
 dataset_detail_fields = {
     'id': fields.String,
@@ -33,6 +35,9 @@ dataset_detail_fields = {
     'created_at': TimestampField,
     'updated_by': fields.String,
     'updated_at': TimestampField,
+    'embedding_model': fields.String,
+    'embedding_model_provider': fields.String,
+    'embedding_available': fields.Boolean
 }
 
 dataset_query_detail_fields = {
@@ -74,8 +79,22 @@ class DatasetListApi(Resource):
             datasets, total = DatasetService.get_datasets(page, limit, provider,
                                                           current_user.current_tenant_id, current_user)
 
+        # check embedding setting
+        provider_service = ProviderService()
+        valid_model_list = provider_service.get_valid_model_list(current_user.current_tenant_id, ModelType.EMBEDDINGS.value)
+        # if len(valid_model_list) == 0:
+        #     raise ProviderNotInitializeError(
+        #         f"No Embedding Model available. Please configure a valid provider "
+        #         f"in the Settings -> Model Provider.")
+        model_names = [item['model_name'] for item in valid_model_list]
+        data = marshal(datasets, dataset_detail_fields)
+        for item in data:
+            if item['embedding_model'] in model_names:
+                item['embedding_available'] = True
+            else:
+                item['embedding_available'] = False
         response = {
-            'data': marshal(datasets, dataset_detail_fields),
+            'data': data,
             'has_more': len(datasets) == limit,
             'limit': limit,
             'total': total,
@@ -99,7 +118,6 @@ class DatasetListApi(Resource):
         # The role of the current user in the ta table must be admin or owner
         if current_user.current_tenant.current_role not in ['admin', 'owner']:
             raise Forbidden()
-
         try:
             ModelFactory.get_embedding_model(
                 tenant_id=current_user.current_tenant_id
@@ -233,6 +251,8 @@ class DatasetIndexingEstimateApi(Resource):
         parser.add_argument('info_list', type=dict, required=True, nullable=True, location='json')
         parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
         parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False, location='json')
+        parser.add_argument('dataset_id', type=str, required=False, nullable=False, location='json')
+        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False, location='json')
         args = parser.parse_args()
         # validate args
         DocumentService.estimate_args_validate(args)
@@ -250,11 +270,14 @@ class DatasetIndexingEstimateApi(Resource):
 
             try:
                 response = indexing_runner.file_indexing_estimate(current_user.current_tenant_id, file_details,
-                                                                  args['process_rule'], args['doc_form'])
+                                                                  args['process_rule'], args['doc_form'],
+                                                                  args['doc_language'], args['dataset_id'])
             except LLMBadRequestError:
                 raise ProviderNotInitializeError(
                     f"No Embedding Model available. Please configure a valid provider "
                     f"in the Settings -> Model Provider.")
+            except ProviderTokenNotInitError as ex:
+                raise ProviderNotInitializeError(ex.description)
         elif args['info_list']['data_source_type'] == 'notion_import':
 
             indexing_runner = IndexingRunner()
@@ -262,11 +285,14 @@ class DatasetIndexingEstimateApi(Resource):
             try:
                 response = indexing_runner.notion_indexing_estimate(current_user.current_tenant_id,
                                                                     args['info_list']['notion_info_list'],
-                                                                    args['process_rule'], args['doc_form'])
+                                                                    args['process_rule'], args['doc_form'],
+                                                                    args['doc_language'], args['dataset_id'])
             except LLMBadRequestError:
                 raise ProviderNotInitializeError(
                     f"No Embedding Model available. Please configure a valid provider "
                     f"in the Settings -> Model Provider.")
+            except ProviderTokenNotInitError as ex:
+                raise ProviderNotInitializeError(ex.description)
         else:
             raise ValueError('Data source type not support')
         return response, 200
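
Note: the embedding_available loop in DatasetListApi.get above can be collapsed into a
single boolean assignment. A minimal sketch, assuming data and valid_model_list are the
plain-dict list and model list already built in the hunk:

    # mark a dataset's embedding model available only when the tenant still has
    # a valid provider offering that model
    model_names = [item['model_name'] for item in valid_model_list]
    for item in data:
        item['embedding_available'] = item['embedding_model'] in model_names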

+ 37 - 6
api/controllers/console/datasets/datasets_document.py

@@ -274,6 +274,7 @@ class DatasetDocumentListApi(Resource):
         parser.add_argument('duplicate', type=bool, nullable=False, location='json')
         parser.add_argument('original_document_id', type=str, required=False, location='json')
         parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False, location='json')
+        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False, location='json')
         args = parser.parse_args()
 
         if not dataset.indexing_technique and not args['indexing_technique']:
@@ -282,14 +283,19 @@ class DatasetDocumentListApi(Resource):
         # validate args
         DocumentService.document_create_args_validate(args)
 
+        # check embedding model setting
         try:
             ModelFactory.get_embedding_model(
-                tenant_id=current_user.current_tenant_id
+                tenant_id=current_user.current_tenant_id,
+                model_provider_name=dataset.embedding_model_provider,
+                model_name=dataset.embedding_model
             )
         except LLMBadRequestError:
             raise ProviderNotInitializeError(
                 f"No Embedding Model available. Please configure a valid provider "
                 f"in the Settings -> Model Provider.")
+        except ProviderTokenNotInitError as ex:
+            raise ProviderNotInitializeError(ex.description)
 
         try:
             documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
@@ -328,6 +334,7 @@ class DatasetInitApi(Resource):
         parser.add_argument('data_source', type=dict, required=True, nullable=True, location='json')
         parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
         parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False, location='json')
+        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False, location='json')
         args = parser.parse_args()
 
         try:
@@ -406,11 +413,13 @@ class DocumentIndexingEstimateApi(DocumentResource):
 
                 try:
                     response = indexing_runner.file_indexing_estimate(current_user.current_tenant_id, [file],
-                                                                      data_process_rule_dict)
+                                                                      data_process_rule_dict, None, dataset_id)
                 except LLMBadRequestError:
                     raise ProviderNotInitializeError(
                         f"No Embedding Model available. Please configure a valid provider "
                         f"in the Settings -> Model Provider.")
+                except ProviderTokenNotInitError as ex:
+                    raise ProviderNotInitializeError(ex.description)
 
         return response
 
@@ -473,22 +482,27 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
             indexing_runner = IndexingRunner()
             try:
                 response = indexing_runner.file_indexing_estimate(current_user.current_tenant_id, file_details,
-                                                                  data_process_rule_dict)
+                                                                  data_process_rule_dict, None, dataset_id)
             except LLMBadRequestError:
                 raise ProviderNotInitializeError(
                     f"No Embedding Model available. Please configure a valid provider "
                     f"in the Settings -> Model Provider.")
-        elif dataset.data_source_type:
+            except ProviderTokenNotInitError as ex:
+                raise ProviderNotInitializeError(ex.description)
+        elif dataset.data_source_type == 'notion_import':
 
             indexing_runner = IndexingRunner()
             try:
                 response = indexing_runner.notion_indexing_estimate(current_user.current_tenant_id,
                                                                     info_list,
                                                                     data_process_rule_dict,
+                                                                    None, dataset_id)
             except LLMBadRequestError:
                 raise ProviderNotInitializeError(
                     f"No Embedding Model available. Please configure a valid provider "
                     f"in the Settings -> Model Provider.")
+            except ProviderTokenNotInitError as ex:
+                raise ProviderNotInitializeError(ex.description)
         else:
             raise ValueError('Data source type not support')
         return response
@@ -575,7 +589,8 @@ class DocumentIndexingStatusApi(DocumentResource):
 
         document.completed_segments = completed_segments
         document.total_segments = total_segments
-
+        if document.is_paused:
+            document.indexing_status = 'paused'
         return marshal(document, self.document_status_fields)
 
 
@@ -832,6 +847,22 @@ class DocumentStatusApi(DocumentResource):
 
                 remove_document_from_index_task.delay(document_id)
 
+            return {'result': 'success'}, 200
+        elif action == "un_archive":
+            if not document.archived:
+                raise InvalidActionError('Document is not archived.')
+
+            document.archived = False
+            document.archived_at = None
+            document.archived_by = None
+            document.updated_at = datetime.utcnow()
+            db.session.commit()
+
+            # Set cache to prevent indexing the same document multiple times
+            redis_client.setex(indexing_cache_key, 600, 1)
+
+            add_document_to_index_task.delay(document_id)
+
             return {'result': 'success'}, 200
         else:
             raise InvalidActionError()
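
Note: the new un_archive action mirrors the archive flow: clear the archived fields,
commit, set a 10-minute Redis guard key, then re-index asynchronously. A hedged client
sketch in Python; the host, token, and exact route are assumptions, since this diff shows
only the handler body, not the URL registration:

    import requests  # hypothetical console-API client

    resp = requests.patch(
        'http://localhost:5001/console/api/datasets/<dataset-uuid>'
        '/documents/<document-uuid>/status/un_archive',  # assumed route pattern
        headers={'Authorization': 'Bearer <console-token>'},
    )
    print(resp.json())  # expected: {'result': 'success'}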

+ 168 - 5
api/controllers/console/datasets/datasets_segments.py

@@ -1,15 +1,20 @@
 # -*- coding:utf-8 -*-
+import uuid
 from datetime import datetime
 
+from flask import request
 from flask_login import login_required, current_user
 from flask_restful import Resource, reqparse, fields, marshal
 from werkzeug.exceptions import NotFound, Forbidden
 
 import services
 from controllers.console import api
-from controllers.console.datasets.error import InvalidActionError
+from controllers.console.app.error import ProviderNotInitializeError
+from controllers.console.datasets.error import InvalidActionError, NoFileUploadedError, TooManyFilesError
 from controllers.console.setup import setup_required
 from controllers.console.wraps import account_initialization_required
+from core.model_providers.error import LLMBadRequestError, ProviderTokenNotInitError
+from core.model_providers.model_factory import ModelFactory
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.dataset import DocumentSegment
@@ -17,7 +22,9 @@ from models.dataset import DocumentSegment
 from libs.helper import TimestampField
 from services.dataset_service import DatasetService, DocumentService, SegmentService
 from tasks.enable_segment_to_index_task import enable_segment_to_index_task
-from tasks.remove_segment_from_index_task import remove_segment_from_index_task
+from tasks.disable_segment_from_index_task import disable_segment_from_index_task
+from tasks.batch_create_segment_to_index_task import batch_create_segment_to_index_task
+import pandas as pd
 
 segment_fields = {
     'id': fields.String,
@@ -152,6 +159,20 @@ class DatasetDocumentSegmentApi(Resource):
         except services.errors.account.NoPermissionError as e:
             raise Forbidden(str(e))
 
+        # check embedding model setting
+        try:
+            ModelFactory.get_embedding_model(
+                tenant_id=current_user.current_tenant_id,
+                model_provider_name=dataset.embedding_model_provider,
+                model_name=dataset.embedding_model
+            )
+        except LLMBadRequestError:
+            raise ProviderNotInitializeError(
+                f"No Embedding Model available. Please configure a valid provider "
+                f"in the Settings -> Model Provider.")
+        except ProviderTokenNotInitError as ex:
+            raise ProviderNotInitializeError(ex.description)
+
         segment = DocumentSegment.query.filter(
             DocumentSegment.id == str(segment_id),
             DocumentSegment.tenant_id == current_user.current_tenant_id
@@ -197,7 +218,7 @@ class DatasetDocumentSegmentApi(Resource):
             # Set cache to prevent indexing the same segment multiple times
             redis_client.setex(indexing_cache_key, 600, 1)
 
-            remove_segment_from_index_task.delay(segment.id)
+            disable_segment_from_index_task.delay(segment.id)
 
             return {'result': 'success'}, 200
         else:
@@ -222,6 +243,19 @@ class DatasetDocumentSegmentAddApi(Resource):
         # The role of the current user in the ta table must be admin or owner
         if current_user.current_tenant.current_role not in ['admin', 'owner']:
             raise Forbidden()
+        # check embedding model setting
+        try:
+            ModelFactory.get_embedding_model(
+                tenant_id=current_user.current_tenant_id,
+                model_provider_name=dataset.embedding_model_provider,
+                model_name=dataset.embedding_model
+            )
+        except LLMBadRequestError:
+            raise ProviderNotInitializeError(
+                f"No Embedding Model available. Please configure a valid provider "
+                f"in the Settings -> Model Provider.")
+        except ProviderTokenNotInitError as ex:
+            raise ProviderNotInitializeError(ex.description)
         try:
             DatasetService.check_dataset_permission(dataset, current_user)
         except services.errors.account.NoPermissionError as e:
@@ -233,7 +267,7 @@ class DatasetDocumentSegmentAddApi(Resource):
         parser.add_argument('keywords', type=list, required=False, nullable=True, location='json')
         args = parser.parse_args()
         SegmentService.segment_create_args_validate(args, document)
-        segment = SegmentService.create_segment(args, document)
+        segment = SegmentService.create_segment(args, document, dataset)
         return {
             'data': marshal(segment, segment_fields),
             'doc_form': document.doc_form
@@ -255,6 +289,19 @@ class DatasetDocumentSegmentUpdateApi(Resource):
         document = DocumentService.get_document(dataset_id, document_id)
         if not document:
             raise NotFound('Document not found.')
+        # check embedding model setting
+        try:
+            ModelFactory.get_embedding_model(
+                tenant_id=current_user.current_tenant_id,
+                model_provider_name=dataset.embedding_model_provider,
+                model_name=dataset.embedding_model
+            )
+        except LLMBadRequestError:
+            raise ProviderNotInitializeError(
+                f"No Embedding Model available. Please configure a valid provider "
+                f"in the Settings -> Model Provider.")
+        except ProviderTokenNotInitError as ex:
+            raise ProviderNotInitializeError(ex.description)
         # check segment
         segment_id = str(segment_id)
         segment = DocumentSegment.query.filter(
@@ -277,12 +324,125 @@ class DatasetDocumentSegmentUpdateApi(Resource):
         parser.add_argument('keywords', type=list, required=False, nullable=True, location='json')
         args = parser.parse_args()
         SegmentService.segment_create_args_validate(args, document)
-        segment = SegmentService.update_segment(args, segment, document)
+        segment = SegmentService.update_segment(args, segment, document, dataset)
         return {
             'data': marshal(segment, segment_fields),
             'doc_form': document.doc_form
         }, 200
 
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def delete(self, dataset_id, document_id, segment_id):
+        # check dataset
+        dataset_id = str(dataset_id)
+        dataset = DatasetService.get_dataset(dataset_id)
+        if not dataset:
+            raise NotFound('Dataset not found.')
+        # check document
+        document_id = str(document_id)
+        document = DocumentService.get_document(dataset_id, document_id)
+        if not document:
+            raise NotFound('Document not found.')
+        # check segment
+        segment_id = str(segment_id)
+        segment = DocumentSegment.query.filter(
+            DocumentSegment.id == str(segment_id),
+            DocumentSegment.tenant_id == current_user.current_tenant_id
+        ).first()
+        if not segment:
+            raise NotFound('Segment not found.')
+        # The role of the current user in the ta table must be admin or owner
+        if current_user.current_tenant.current_role not in ['admin', 'owner']:
+            raise Forbidden()
+        try:
+            DatasetService.check_dataset_permission(dataset, current_user)
+        except services.errors.account.NoPermissionError as e:
+            raise Forbidden(str(e))
+        SegmentService.delete_segment(segment, document, dataset)
+        return {'result': 'success'}, 200
+
+
+class DatasetDocumentSegmentBatchImportApi(Resource):
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def post(self, dataset_id, document_id):
+        # check dataset
+        dataset_id = str(dataset_id)
+        dataset = DatasetService.get_dataset(dataset_id)
+        if not dataset:
+            raise NotFound('Dataset not found.')
+        # check document
+        document_id = str(document_id)
+        document = DocumentService.get_document(dataset_id, document_id)
+        if not document:
+            raise NotFound('Document not found.')
+        try:
+            ModelFactory.get_embedding_model(
+                tenant_id=current_user.current_tenant_id,
+                model_provider_name=dataset.embedding_model_provider,
+                model_name=dataset.embedding_model
+            )
+        except LLMBadRequestError:
+            raise ProviderNotInitializeError(
+                f"No Embedding Model available. Please configure a valid provider "
+                f"in the Settings -> Model Provider.")
+        except ProviderTokenNotInitError as ex:
+            raise ProviderNotInitializeError(ex.description)
+        # check file presence before touching request.files['file']
+        if 'file' not in request.files:
+            raise NoFileUploadedError()
+        if len(request.files) > 1:
+            raise TooManyFilesError()
+        # get file from request
+        file = request.files['file']
+        # check file type
+        if not file.filename.endswith('.csv'):
+            raise ValueError("Invalid file type. Only CSV files are allowed")
+
+        try:
+            # the first CSV row is treated as the header, so data rows start at row two
+            df = pd.read_csv(file)
+            result = []
+            for index, row in df.iterrows():
+                if document.doc_form == 'qa_model':
+                    data = {'content': row[0], 'answer': row[1]}
+                else:
+                    data = {'content': row[0]}
+                result.append(data)
+            if len(result) == 0:
+                raise ValueError("The CSV file is empty.")
+            # async job
+            job_id = str(uuid.uuid4())
+            indexing_cache_key = 'segment_batch_import_{}'.format(str(job_id))
+            # send batch add segments task
+            redis_client.setnx(indexing_cache_key, 'waiting')
+            batch_create_segment_to_index_task.delay(str(job_id), result, dataset_id, document_id,
+                                                     current_user.current_tenant_id, current_user.id)
+        except Exception as e:
+            return {'error': str(e)}, 500
+        return {
+            'job_id': job_id,
+            'job_status': 'waiting'
+        }, 200
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, job_id):
+        job_id = str(job_id)
+        indexing_cache_key = 'segment_batch_import_{}'.format(job_id)
+        cache_result = redis_client.get(indexing_cache_key)
+        if cache_result is None:
+            raise ValueError("The job does not exist.")
+
+        return {
+            'job_id': job_id,
+            'job_status': cache_result.decode()
+        }, 200
+
 
 api.add_resource(DatasetDocumentSegmentListApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments')
@@ -292,3 +452,6 @@ api.add_resource(DatasetDocumentSegmentAddApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segment')
 api.add_resource(DatasetDocumentSegmentUpdateApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/<uuid:segment_id>')
+api.add_resource(DatasetDocumentSegmentBatchImportApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/batch_import',
                 '/datasets/batch_import_status/<uuid:job_id>')
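
Note: a usage sketch for the two routes registered above. The CSV needs one content
column for text_model documents, or content plus answer columns for qa_model; the first
row is consumed as the header. Base URL and token are placeholders, but the paths are
the ones added in this diff:

    import time
    import requests  # hypothetical client-side helper

    base = 'http://localhost:5001/console/api'
    headers = {'Authorization': 'Bearer <console-token>'}
    dataset_id, document_id = '<dataset-uuid>', '<document-uuid>'

    # upload the CSV to the batch_import route
    with open('segments.csv', 'rb') as f:
        r = requests.post(f'{base}/datasets/{dataset_id}/documents/{document_id}'
                          f'/segments/batch_import',
                          headers=headers, files={'file': ('segments.csv', f, 'text/csv')})
    job_id = r.json()['job_id']

    # poll the companion status route until the Celery task updates the Redis key
    while requests.get(f'{base}/datasets/batch_import_status/{job_id}',
                       headers=headers).json()['job_status'] == 'waiting':
        time.sleep(2)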

+ 6 - 1
api/controllers/console/datasets/hit_testing.py

@@ -11,7 +11,8 @@ from controllers.console.app.error import ProviderNotInitializeError, ProviderQu
 from controllers.console.datasets.error import HighQualityDatasetOnlyError, DatasetNotInitializedError
 from controllers.console.setup import setup_required
 from controllers.console.wraps import account_initialization_required
-from core.model_providers.error import ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
+from core.model_providers.error import ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError, \
+    LLMBadRequestError
 from libs.helper import TimestampField
 from services.dataset_service import DatasetService
 from services.hit_testing_service import HitTestingService
@@ -102,6 +103,10 @@ class HitTestingApi(Resource):
             raise ProviderQuotaExceededError()
         except ModelCurrentlyNotSupportError:
             raise ProviderModelCurrentlyNotSupportError()
+        except LLMBadRequestError:
+            raise ProviderNotInitializeError(
+                f"No Embedding Model available. Please configure a valid provider "
+                f"in the Settings -> Model Provider.")
         except ValueError as e:
             raise ValueError(str(e))
         except Exception as e:

+ 9 - 7
api/core/docstore/dataset_docstore.py

@@ -10,10 +10,10 @@ from models.dataset import Dataset, DocumentSegment
 
 class DatesetDocumentStore:
     def __init__(
-        self,
-        dataset: Dataset,
-        user_id: str,
-        document_id: Optional[str] = None,
+            self,
+            dataset: Dataset,
+            user_id: str,
+            document_id: Optional[str] = None,
     ):
         self._dataset = dataset
         self._user_id = user_id
@@ -59,7 +59,7 @@ class DatesetDocumentStore:
         return output
 
     def add_documents(
-        self, docs: Sequence[Document], allow_update: bool = True
+            self, docs: Sequence[Document], allow_update: bool = True
    ) -> None:
         max_position = db.session.query(func.max(DocumentSegment.position)).filter(
             DocumentSegment.document_id == self._document_id
@@ -69,7 +69,9 @@ class DatesetDocumentStore:
             max_position = 0
 
         embedding_model = ModelFactory.get_embedding_model(
-            tenant_id=self._dataset.tenant_id
+            tenant_id=self._dataset.tenant_id,
+            model_provider_name=self._dataset.embedding_model_provider,
+            model_name=self._dataset.embedding_model
         )
 
         for doc in docs:
@@ -123,7 +125,7 @@ class DatesetDocumentStore:
         return result is not None
 
     def get_document(
-        self, doc_id: str, raise_error: bool = True
+            self, doc_id: str, raise_error: bool = True
     ) -> Optional[Document]:
         document_segment = self.get_document_segment(doc_id)

+ 2 - 2
api/core/generator/llm_generator.py

@@ -179,8 +179,8 @@ class LLMGenerator:
         return rule_config
 
     @classmethod
-    def generate_qa_document(cls, tenant_id: str, query):
-        prompt = GENERATOR_QA_PROMPT
+    def generate_qa_document(cls, tenant_id: str, query, document_language: str):
+        prompt = GENERATOR_QA_PROMPT.format(language=document_language)
 
         model_instance = ModelFactory.get_text_generation_model(
             tenant_id=tenant_id,

+ 3 - 1
api/core/index/index.py

@@ -15,7 +15,9 @@ class IndexBuilder:
                 return None
 
             embedding_model = ModelFactory.get_embedding_model(
-                tenant_id=dataset.tenant_id
+                tenant_id=dataset.tenant_id,
+                model_provider_name=dataset.embedding_model_provider,
+                model_name=dataset.embedding_model
             )
 
             embeddings = CacheEmbedding(embedding_model)

+ 79 - 34
api/core/indexing_runner.py

@@ -67,14 +67,6 @@ class IndexingRunner:
                     dataset_document=dataset_document,
                     processing_rule=processing_rule
                 )
-                # new_documents = []
-                # for document in documents:
-                #     response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
-                #     document_qa_list = self.format_split_text(response)
-                #     for result in document_qa_list:
-                #         document = Document(page_content=result['question'], metadata={'source': result['answer']})
-                #         new_documents.append(document)
-                # build index
                 self._build_index(
                     dataset=dataset,
                     dataset_document=dataset_document,
@@ -225,14 +217,25 @@ class IndexingRunner:
             db.session.commit()
 
     def file_indexing_estimate(self, tenant_id: str, file_details: List[UploadFile], tmp_processing_rule: dict,
-                               doc_form: str = None) -> dict:
+                               doc_form: str = None, doc_language: str = 'English', dataset_id: str = None) -> dict:
         """
         Estimate the indexing for the document.
         """
-        embedding_model = ModelFactory.get_embedding_model(
-            tenant_id=tenant_id
-        )
-
+        if dataset_id:
+            dataset = Dataset.query.filter_by(
+                id=dataset_id
+            ).first()
+            if not dataset:
+                raise ValueError('Dataset not found.')
+            embedding_model = ModelFactory.get_embedding_model(
+                tenant_id=dataset.tenant_id,
+                model_provider_name=dataset.embedding_model_provider,
+                model_name=dataset.embedding_model
+            )
+        else:
+            embedding_model = ModelFactory.get_embedding_model(
+                tenant_id=tenant_id
+            )
         tokens = 0
         preview_texts = []
         total_segments = 0
@@ -263,14 +266,13 @@ class IndexingRunner:
 
                 tokens += embedding_model.get_num_tokens(self.filter_string(document.page_content))
 
-        text_generation_model = ModelFactory.get_text_generation_model(
-            tenant_id=tenant_id
-        )
-
         if doc_form and doc_form == 'qa_model':
+            text_generation_model = ModelFactory.get_text_generation_model(
+                tenant_id=tenant_id
+            )
             if len(preview_texts) > 0:
                 # qa model document
-                response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0])
+                response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0], doc_language)
                 document_qa_list = self.format_split_text(response)
                 return {
                     "total_segments": total_segments * 20,
@@ -289,13 +291,26 @@ class IndexingRunner:
             "preview": preview_texts
         }
 
-    def notion_indexing_estimate(self, tenant_id: str, notion_info_list: list, tmp_processing_rule: dict, doc_form: str = None) -> dict:
+    def notion_indexing_estimate(self, tenant_id: str, notion_info_list: list, tmp_processing_rule: dict,
+                                 doc_form: str = None, doc_language: str = 'English', dataset_id: str = None) -> dict:
         """
         Estimate the indexing for the document.
         """
-        embedding_model = ModelFactory.get_embedding_model(
-            tenant_id=tenant_id
-        )
+        if dataset_id:
+            dataset = Dataset.query.filter_by(
+                id=dataset_id
+            ).first()
+            if not dataset:
+                raise ValueError('Dataset not found.')
+            embedding_model = ModelFactory.get_embedding_model(
+                tenant_id=dataset.tenant_id,
+                model_provider_name=dataset.embedding_model_provider,
+                model_name=dataset.embedding_model
+            )
+        else:
+            embedding_model = ModelFactory.get_embedding_model(
+                tenant_id=tenant_id
+            )
 
         # load data from notion
         tokens = 0
@@ -344,14 +359,13 @@ class IndexingRunner:
 
                     tokens += embedding_model.get_num_tokens(document.page_content)
 
-        text_generation_model = ModelFactory.get_text_generation_model(
-            tenant_id=tenant_id
-        )
-
         if doc_form and doc_form == 'qa_model':
+            text_generation_model = ModelFactory.get_text_generation_model(
+                tenant_id=tenant_id
+            )
             if len(preview_texts) > 0:
                 # qa model document
-                response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0])
+                response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0], doc_language)
                 document_qa_list = self.format_split_text(response)
                 return {
                     "total_segments": total_segments * 20,
@@ -458,7 +472,8 @@ class IndexingRunner:
             splitter=splitter,
             processing_rule=processing_rule,
             tenant_id=dataset.tenant_id,
-            document_form=dataset_document.doc_form
+            document_form=dataset_document.doc_form,
+            document_language=dataset_document.doc_language
         )
 
         # save node to document segment
@@ -494,7 +509,8 @@ class IndexingRunner:
         return documents
 
     def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
-                            processing_rule: DatasetProcessRule, tenant_id: str, document_form: str) -> List[Document]:
+                            processing_rule: DatasetProcessRule, tenant_id: str,
+                            document_form: str, document_language: str) -> List[Document]:
         """
         Split the text documents into nodes.
         """
@@ -523,8 +539,9 @@ class IndexingRunner:
                 sub_documents = all_documents[i:i + 10]
                 for doc in sub_documents:
                     document_format_thread = threading.Thread(target=self.format_qa_document, kwargs={
-                        'flask_app': current_app._get_current_object(), 'tenant_id': tenant_id, 'document_node': doc,
-                        'all_qa_documents': all_qa_documents})
+                        'flask_app': current_app._get_current_object(),
+                        'tenant_id': tenant_id, 'document_node': doc, 'all_qa_documents': all_qa_documents,
+                        'document_language': document_language})
                     threads.append(document_format_thread)
                     document_format_thread.start()
                 for thread in threads:
@@ -532,14 +549,14 @@ class IndexingRunner:
             return all_qa_documents
         return all_documents
 
-    def format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, all_qa_documents):
+    def format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, all_qa_documents, document_language):
         format_documents = []
         if document_node.page_content is None or not document_node.page_content.strip():
             return
         with flask_app.app_context():
             try:
                 # qa model document
-                response = LLMGenerator.generate_qa_document(tenant_id, document_node.page_content)
+                response = LLMGenerator.generate_qa_document(tenant_id, document_node.page_content, document_language)
                 document_qa_list = self.format_split_text(response)
                 qa_documents = []
                 for result in document_qa_list:
@@ -641,7 +658,9 @@ class IndexingRunner:
         keyword_table_index = IndexBuilder.get_index(dataset, 'economy')
 
         embedding_model = ModelFactory.get_embedding_model(
-            tenant_id=dataset.tenant_id
+            tenant_id=dataset.tenant_id,
+            model_provider_name=dataset.embedding_model_provider,
+            model_name=dataset.embedding_model
         )
 
         # chunk nodes by chunk size
@@ -722,6 +741,32 @@ class IndexingRunner:
         DocumentSegment.query.filter_by(document_id=dataset_document_id).update(update_params)
         db.session.commit()
 
+    def batch_add_segments(self, segments: List[DocumentSegment], dataset: Dataset):
+        """
+        Batch add segments index processing
+        """
+        documents = []
+        for segment in segments:
+            document = Document(
+                page_content=segment.content,
+                metadata={
+                    "doc_id": segment.index_node_id,
+                    "doc_hash": segment.index_node_hash,
+                    "document_id": segment.document_id,
+                    "dataset_id": segment.dataset_id,
+                }
+            )
+            documents.append(document)
+        # save vector index
+        index = IndexBuilder.get_index(dataset, 'high_quality')
+        if index:
+            index.add_texts(documents, duplicate_check=True)
+
+        # save keyword index
+        index = IndexBuilder.get_index(dataset, 'economy')
+        if index:
+            index.add_texts(documents)
+
 
 class DocumentIsPausedException(Exception):
     pass
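
Note: file_indexing_estimate and notion_indexing_estimate now repeat the dataset-aware
model lookup verbatim. It could be factored into one helper; a sketch under the same
imports as this file (the helper name is ours, not part of the PR):

    def _estimate_embedding_model(tenant_id: str, dataset_id: str = None):
        # fall back to the tenant default when no dataset is pinned yet
        if not dataset_id:
            return ModelFactory.get_embedding_model(tenant_id=tenant_id)
        dataset = Dataset.query.filter_by(id=dataset_id).first()
        if not dataset:
            raise ValueError('Dataset not found.')
        return ModelFactory.get_embedding_model(
            tenant_id=dataset.tenant_id,
            model_provider_name=dataset.embedding_model_provider,
            model_name=dataset.embedding_model
        )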

+ 2 - 2
api/core/prompt/prompts.py

@@ -44,13 +44,13 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
 )
 
 GENERATOR_QA_PROMPT = (
-    "Please respond according to the language of the user's input text. If the text is in language [A], you must also reply in language [A].\n"
+    'The user will send a long text. Please think step by step.'
     'Step 1: Understand and summarize the main content of this text.\n'
     'Step 2: What key information or concepts are mentioned in this text?\n'
     'Step 3: Decompose or combine multiple pieces of information and concepts.\n'
     'Step 4: Generate 20 questions and answers based on these key information and concepts.'
     'The questions should be clear and detailed, and the answers should be detailed and complete.\n'
-    "Answer in the following format: Q1:\nA1:\nQ2:\nA2:...\n"
+    "Answer must be the language:{language} and in the following format: Q1:\nA1:\nQ2:\nA2:...\n"
 )
 
 RULE_CONFIG_GENERATE_TEMPLATE = """Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select \
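
Note: GENERATOR_QA_PROMPT now carries a {language} placeholder, so it must go through
str.format before use, as LLMGenerator.generate_qa_document does above. A quick
illustration:

    # an unformatted prompt would leak the literal '{language}' token to the model
    prompt = GENERATOR_QA_PROMPT.format(language='Chinese')
    assert 'Answer must be the language:Chinese' in prompt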

+ 11 - 3
api/core/tool/dataset_retriever_tool.py

@@ -9,6 +9,7 @@ from core.callback_handler.index_tool_callback_handler import DatasetIndexToolCa
 from core.embedding.cached_embedding import CacheEmbedding
 from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
 from core.index.vector_index.vector_index import VectorIndex
+from core.model_providers.error import LLMBadRequestError, ProviderTokenNotInitError
 from core.model_providers.model_factory import ModelFactory
 from extensions.ext_database import db
 from models.dataset import Dataset, DocumentSegment
@@ -70,10 +71,17 @@ class DatasetRetrieverTool(BaseTool):
             documents = kw_table_index.search(query, search_kwargs={'k': self.k})
             return str("\n".join([document.page_content for document in documents]))
         else:
-            embedding_model = ModelFactory.get_embedding_model(
-                tenant_id=dataset.tenant_id
-            )

+            try:
+                embedding_model = ModelFactory.get_embedding_model(
+                    tenant_id=dataset.tenant_id,
+                    model_provider_name=dataset.embedding_model_provider,
+                    model_name=dataset.embedding_model
+                )
+            except LLMBadRequestError:
+                return ''
+            except ProviderTokenNotInitError:
+                return ''
             embeddings = CacheEmbedding(embedding_model)

             vector_index = VectorIndex(
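
The two except arms both degrade to an empty tool result; for reference, a minimal equivalent sketch with the provider errors collapsed into a single handler (same imports as above):

    try:
        embedding_model = ModelFactory.get_embedding_model(
            tenant_id=dataset.tenant_id,
            model_provider_name=dataset.embedding_model_provider,
            model_name=dataset.embedding_model,
        )
    except (LLMBadRequestError, ProviderTokenNotInitError):
        # A missing or misconfigured provider yields an empty retrieval result
        # instead of failing the whole agent run.
        return ''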

+ 32 - 0
api/migrations/versions/2c8af9671032_add_qa_document_language.py

@@ -0,0 +1,32 @@
+"""add_qa_document_language
+
+Revision ID: 2c8af9671032
+Revises: 8d2d099ceb74
+Create Date: 2023-08-01 18:57:27.294973
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '2c8af9671032'
+down_revision = '5022897aaceb'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('documents', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('doc_language', sa.String(length=255), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('documents', schema=None) as batch_op:
+        batch_op.drop_column('doc_language')
+
+    # ### end Alembic commands ###

+ 34 - 0
api/migrations/versions/e8883b0148c9_add_dataset_model_name.py

@@ -0,0 +1,34 @@
+"""add_dataset_model_name
+
+Revision ID: e8883b0148c9
+Revises: 2c8af9671032
+Create Date: 2023-08-15 20:54:58.936787
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'e8883b0148c9'
+down_revision = '2c8af9671032'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('datasets', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('embedding_model', sa.String(length=255), server_default=sa.text("'text-embedding-ada-002'::character varying"), nullable=False))
+        batch_op.add_column(sa.Column('embedding_model_provider', sa.String(length=255), server_default=sa.text("'openai'::character varying"), nullable=False))
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('datasets', schema=None) as batch_op:
+        batch_op.drop_column('embedding_model_provider')
+        batch_op.drop_column('embedding_model')
+
+    # ### end Alembic commands ###
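
The down_revision pointers, not the docstrings, define the upgrade order here (the "Revises: 8d2d099ceb74" line in 2c8af9671032 looks stale): 5022897aaceb → 2c8af9671032 → e8883b0148c9. Because both new columns declare server defaults, rows created before the upgrade are backfilled automatically; a quick post-migration sanity check, as a sketch run inside the API app context:

    from extensions.ext_database import db
    from models.dataset import Dataset

    # A dataset created before this migration should report the defaults.
    dataset = db.session.query(Dataset).first()
    assert dataset.embedding_model == 'text-embedding-ada-002'
    assert dataset.embedding_model_provider == 'openai'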

+ 5 - 0
api/models/dataset.py

@@ -36,6 +36,10 @@ class Dataset(db.Model):
     updated_by = db.Column(UUID, nullable=True)
     updated_at = db.Column(db.DateTime, nullable=False,
                            server_default=db.text('CURRENT_TIMESTAMP(0)'))
+    embedding_model = db.Column(db.String(
+        255), nullable=False, server_default=db.text("'text-embedding-ada-002'::character varying"))
+    embedding_model_provider = db.Column(db.String(
+        255), nullable=False, server_default=db.text("'openai'::character varying"))

     @property
     def dataset_keyword_table(self):
@@ -209,6 +213,7 @@
     doc_metadata = db.Column(db.JSON, nullable=True)
     doc_form = db.Column(db.String(
         255), nullable=False, server_default=db.text("'text_model'::character varying"))
+    doc_language = db.Column(db.String(255), nullable=True)

     DATA_SOURCES = ['upload_file', 'notion_import']


+ 2 - 1
api/requirements.txt

@@ -47,4 +47,5 @@ websocket-client~=1.6.1
 dashscope~=1.5.0
 huggingface_hub~=0.16.4
 transformers~=4.31.0
-stripe~=5.5.0
+stripe~=5.5.0
+pandas==1.5.3
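
pandas is presumably pulled in for the new CSV batch import (the csv-uploader component and batch_create_segment_to_index_task below). A minimal sketch of turning a two-column Q&A CSV into the segment payload that task consumes (file name and column layout are assumptions):

    import pandas as pd

    df = pd.read_csv('segments.csv', names=['content', 'answer'], header=0)
    segments = [
        {'content': row.content, 'answer': row.answer}
        for row in df.itertuples(index=False)
    ]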

+ 108 - 59
api/services/dataset_service.py

@@ -9,6 +9,7 @@ from typing import Optional, List
 from flask import current_app
 from sqlalchemy import func

+from core.index.index import IndexBuilder
 from core.model_providers.model_factory import ModelFactory
 from extensions.ext_redis import redis_client
 from flask_login import current_user
@@ -25,14 +26,16 @@ from services.errors.account import NoPermissionError
 from services.errors.dataset import DatasetNameDuplicateError
 from services.errors.document import DocumentIndexingError
 from services.errors.file import FileNotExistsError
+from services.vector_service import VectorService
 from tasks.clean_notion_document_task import clean_notion_document_task
 from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
 from tasks.document_indexing_task import document_indexing_task
 from tasks.document_indexing_update_task import document_indexing_update_task
 from tasks.create_segment_to_index_task import create_segment_to_index_task
 from tasks.update_segment_index_task import update_segment_index_task
-from tasks.update_segment_keyword_index_task\
-    import update_segment_keyword_index_task
+from tasks.recover_document_indexing_task import recover_document_indexing_task
+from tasks.update_segment_keyword_index_task import update_segment_keyword_index_task
+from tasks.delete_segment_from_index_task import delete_segment_from_index_task


 class DatasetService:
@@ -88,12 +91,16 @@ class DatasetService:
         if Dataset.query.filter_by(name=name, tenant_id=tenant_id).first():
             raise DatasetNameDuplicateError(
                 f'Dataset with name {name} already exists.')
-
+        embedding_model = ModelFactory.get_embedding_model(
+            tenant_id=current_user.current_tenant_id
+        )
         dataset = Dataset(name=name, indexing_technique=indexing_technique)
         # dataset = Dataset(name=name, provider=provider, config=config)
         dataset.created_by = account.id
         dataset.updated_by = account.id
         dataset.tenant_id = tenant_id
+        dataset.embedding_model_provider = embedding_model.model_provider.provider_name
+        dataset.embedding_model = embedding_model.name
         db.session.add(dataset)
         db.session.commit()
         return dataset
@@ -372,7 +379,7 @@ class DocumentService:
         indexing_cache_key = 'document_{}_is_paused'.format(document.id)
         redis_client.delete(indexing_cache_key)
         # trigger async task
-        document_indexing_task.delay(document.dataset_id, document.id)
+        recover_document_indexing_task.delay(document.dataset_id, document.id)

     @staticmethod
     def get_documents_position(dataset_id):
@@ -450,6 +457,7 @@ class DocumentService:
                     document = DocumentService.save_document(dataset, dataset_process_rule.id,
                                                              document_data["data_source"]["type"],
                                                              document_data["doc_form"],
+                                                             document_data["doc_language"],
                                                              data_source_info, created_from, position,
                                                              account, file_name, batch)
                     db.session.add(document)
@@ -495,20 +503,11 @@ class DocumentService:
                             document = DocumentService.save_document(dataset, dataset_process_rule.id,
                                                                      document_data["data_source"]["type"],
                                                                      document_data["doc_form"],
+                                                                     document_data["doc_language"],
                                                                      data_source_info, created_from, position,
                                                                      account, page['page_name'], batch)
-                            # if page['type'] == 'database':
-                            #     document.splitting_completed_at = datetime.datetime.utcnow()
-                            #     document.cleaning_completed_at = datetime.datetime.utcnow()
-                            #     document.parsing_completed_at = datetime.datetime.utcnow()
-                            #     document.completed_at = datetime.datetime.utcnow()
-                            #     document.indexing_status = 'completed'
-                            #     document.word_count = 0
-                            #     document.tokens = 0
-                            #     document.indexing_latency = 0
                             db.session.add(document)
                             db.session.flush()
-                            # if page['type'] != 'database':
                             document_ids.append(document.id)
                             documents.append(document)
                             position += 1
@@ -520,15 +519,15 @@ class DocumentService:
             db.session.commit()

             # trigger async task
-            #document_index_created.send(dataset.id, document_ids=document_ids)
             document_indexing_task.delay(dataset.id, document_ids)

         return documents, batch

     @staticmethod
     def save_document(dataset: Dataset, process_rule_id: str, data_source_type: str, document_form: str,
-                      data_source_info: dict, created_from: str, position: int, account: Account, name: str,
-                      batch: str):
+                      document_language: str, data_source_info: dict, created_from: str, position: int,
+                      account: Account,
+                      name: str, batch: str):
         document = Document(
             tenant_id=dataset.tenant_id,
             dataset_id=dataset.id,
@@ -540,7 +539,8 @@ class DocumentService:
             name=name,
             created_from=created_from,
             created_by=account.id,
-            doc_form=document_form
+            doc_form=document_form,
+            doc_language=document_language
         )
         return document

@@ -654,13 +654,18 @@ class DocumentService:
             tenant_document_count = int(current_app.config['TENANT_DOCUMENT_COUNT'])
             if documents_count > tenant_document_count:
                 raise ValueError(f"over document limit {tenant_document_count}.")
+        embedding_model = ModelFactory.get_embedding_model(
+            tenant_id=tenant_id
+        )
         # save dataset
         dataset = Dataset(
             tenant_id=tenant_id,
             name='',
             data_source_type=document_data["data_source"]["type"],
             indexing_technique=document_data["indexing_technique"],
-            created_by=account.id
+            created_by=account.id,
+            embedding_model=embedding_model.name,
+            embedding_model_provider=embedding_model.model_provider.provider_name
         )

         db.session.add(dataset)
@@ -870,13 +875,15 @@ class SegmentService:
                 raise ValueError("Answer is required")
                 raise ValueError("Answer is required")
 
 
     @classmethod
     @classmethod
-    def create_segment(cls, args: dict, document: Document):
+    def create_segment(cls, args: dict, document: Document, dataset: Dataset):
         content = args['content']
         content = args['content']
         doc_id = str(uuid.uuid4())
         doc_id = str(uuid.uuid4())
         segment_hash = helper.generate_text_hash(content)
         segment_hash = helper.generate_text_hash(content)
 
 
         embedding_model = ModelFactory.get_embedding_model(
         embedding_model = ModelFactory.get_embedding_model(
-            tenant_id=document.tenant_id
+            tenant_id=dataset.tenant_id,
+            model_provider_name=dataset.embedding_model_provider,
+            model_name=dataset.embedding_model
         )
         )
 
 
         # calc embedding use tokens
         # calc embedding use tokens
@@ -894,6 +901,9 @@ class SegmentService:
             content=content,
             word_count=len(content),
             tokens=tokens,
+            status='completed',
+            indexing_at=datetime.datetime.utcnow(),
+            completed_at=datetime.datetime.utcnow(),
             created_by=current_user.id
         )
         if document.doc_form == 'qa_model':
@@ -901,49 +911,88 @@ class SegmentService:
 
 
         db.session.add(segment_document)
         db.session.commit()
-        indexing_cache_key = 'segment_{}_indexing'.format(segment_document.id)
-        redis_client.setex(indexing_cache_key, 600, 1)
-        create_segment_to_index_task.delay(segment_document.id, args['keywords'])
-        return segment_document
+
+        # save vector index
+        try:
+            VectorService.create_segment_vector(args['keywords'], segment_document, dataset)
+        except Exception as e:
+            logging.exception("create segment index failed")
+            segment_document.enabled = False
+            segment_document.disabled_at = datetime.datetime.utcnow()
+            segment_document.status = 'error'
+            segment_document.error = str(e)
+            db.session.commit()
+        segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_document.id).first()
+        return segment
 
 
     @classmethod
-    def update_segment(cls, args: dict, segment: DocumentSegment, document: Document):
+    def update_segment(cls, args: dict, segment: DocumentSegment, document: Document, dataset: Dataset):
         indexing_cache_key = 'segment_{}_indexing'.format(segment.id)
         cache_result = redis_client.get(indexing_cache_key)
         if cache_result is not None:
             raise ValueError("Segment is indexing, please try again later")
-        content = args['content']
-        if segment.content == content:
-            if document.doc_form == 'qa_model':
-                segment.answer = args['answer']
-            if args['keywords']:
-                segment.keywords = args['keywords']
-            db.session.add(segment)
-            db.session.commit()
-            # update segment index task
-            redis_client.setex(indexing_cache_key, 600, 1)
-            update_segment_keyword_index_task.delay(segment.id)
-        else:
-            segment_hash = helper.generate_text_hash(content)
-
-            embedding_model = ModelFactory.get_embedding_model(
-                tenant_id=document.tenant_id
-            )
-
-            # calc embedding use tokens
-            tokens = embedding_model.get_num_tokens(content)
-            segment.content = content
-            segment.index_node_hash = segment_hash
-            segment.word_count = len(content)
-            segment.tokens = tokens
-            segment.status = 'updating'
-            segment.updated_by = current_user.id
-            segment.updated_at = datetime.datetime.utcnow()
-            if document.doc_form == 'qa_model':
-                segment.answer = args['answer']
-            db.session.add(segment)
+        try:
+            content = args['content']
+            if segment.content == content:
+                if document.doc_form == 'qa_model':
+                    segment.answer = args['answer']
+                if args['keywords']:
+                    segment.keywords = args['keywords']
+                db.session.add(segment)
+                db.session.commit()
+                # update segment index task
+                if args['keywords']:
+                    kw_index = IndexBuilder.get_index(dataset, 'economy')
+                    # delete from keyword index
+                    kw_index.delete_by_ids([segment.index_node_id])
+                    # save keyword index
+                    kw_index.update_segment_keywords_index(segment.index_node_id, segment.keywords)
+            else:
+                segment_hash = helper.generate_text_hash(content)
+
+                embedding_model = ModelFactory.get_embedding_model(
+                    tenant_id=dataset.tenant_id,
+                    model_provider_name=dataset.embedding_model_provider,
+                    model_name=dataset.embedding_model
+                )
+
+                # calc embedding use tokens
+                tokens = embedding_model.get_num_tokens(content)
+                segment.content = content
+                segment.index_node_hash = segment_hash
+                segment.word_count = len(content)
+                segment.tokens = tokens
+                segment.status = 'completed'
+                segment.indexing_at = datetime.datetime.utcnow()
+                segment.completed_at = datetime.datetime.utcnow()
+                segment.updated_by = current_user.id
+                segment.updated_at = datetime.datetime.utcnow()
+                if document.doc_form == 'qa_model':
+                    segment.answer = args['answer']
+                db.session.add(segment)
+                db.session.commit()
+                # update segment vector index
+                VectorService.create_segment_vector(args['keywords'], segment, dataset)
+        except Exception as e:
+            logging.exception("update segment index failed")
+            segment.enabled = False
+            segment.disabled_at = datetime.datetime.utcnow()
+            segment.status = 'error'
+            segment.error = str(e)
             db.session.commit()
-            # update segment index task
-            redis_client.setex(indexing_cache_key, 600, 1)
-            update_segment_index_task.delay(segment.id, args['keywords'])
+        segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment.id).first()
         return segment
+
+    @classmethod
+    def delete_segment(cls, segment: DocumentSegment, document: Document, dataset: Dataset):
+        indexing_cache_key = 'segment_{}_delete_indexing'.format(segment.id)
+        cache_result = redis_client.get(indexing_cache_key)
+        if cache_result is not None:
+            raise ValueError("Segment is deleting.")
+        # send delete segment index task
+        redis_client.setex(indexing_cache_key, 600, 1)
+        # enabled segment need to delete index
+        if segment.enabled:
+            delete_segment_from_index_task.delay(segment.id, segment.index_node_id, dataset.id, document.id)
+        db.session.delete(segment)
+        db.session.commit()
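
Every SegmentService entry point now takes the owning dataset so the dataset-specific embedding model can be resolved. A sketch of how a controller might drive the reworked API (lookups shown inline; the args keys match the validation above):

    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    document = db.session.query(Document).filter(Document.id == document_id).first()

    # Creates the row, writes both indexes synchronously, returns the refreshed segment.
    segment = SegmentService.create_segment(
        {'content': 'Example text', 'answer': None, 'keywords': []},
        document, dataset,
    )
    # Sets a Redis lock, removes index entries asynchronously, deletes the row.
    SegmentService.delete_segment(segment, document, dataset)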

+ 3 - 1
api/services/hit_testing_service.py

@@ -29,7 +29,9 @@ class HitTestingService:
             }

         embedding_model = ModelFactory.get_embedding_model(
-            tenant_id=dataset.tenant_id
+            tenant_id=dataset.tenant_id,
+            model_provider_name=dataset.embedding_model_provider,
+            model_name=dataset.embedding_model
         )

         embeddings = CacheEmbedding(embedding_model)
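
Hit testing now embeds queries with the dataset's own model rather than the tenant default, keeping query vectors in the same space as the stored document vectors. A sketch of the query path, assuming CacheEmbedding implements langchain's Embeddings interface:

    embeddings = CacheEmbedding(embedding_model)
    query_vector = embeddings.embed_query('How do refunds work?')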

+ 69 - 0
api/services/vector_service.py

@@ -0,0 +1,69 @@
+
+from typing import Optional, List
+
+from langchain.schema import Document
+
+from core.index.index import IndexBuilder
+
+from models.dataset import Dataset, DocumentSegment
+
+
+class VectorService:
+
+    @classmethod
+    def create_segment_vector(cls, keywords: Optional[List[str]], segment: DocumentSegment, dataset: Dataset):
+        document = Document(
+            page_content=segment.content,
+            metadata={
+                "doc_id": segment.index_node_id,
+                "doc_hash": segment.index_node_hash,
+                "document_id": segment.document_id,
+                "dataset_id": segment.dataset_id,
+            }
+        )
+
+        # save vector index
+        index = IndexBuilder.get_index(dataset, 'high_quality')
+        if index:
+            index.add_texts([document], duplicate_check=True)
+
+        # save keyword index
+        index = IndexBuilder.get_index(dataset, 'economy')
+        if index:
+            if keywords and len(keywords) > 0:
+                index.create_segment_keywords(segment.index_node_id, keywords)
+            else:
+                index.add_texts([document])
+
+    @classmethod
+    def update_segment_vector(cls, keywords: Optional[List[str]], segment: DocumentSegment, dataset: Dataset):
+        # update segment index task
+        vector_index = IndexBuilder.get_index(dataset, 'high_quality')
+        kw_index = IndexBuilder.get_index(dataset, 'economy')
+        # delete from vector index
+        if vector_index:
+            vector_index.delete_by_ids([segment.index_node_id])
+
+        # delete from keyword index
+        kw_index.delete_by_ids([segment.index_node_id])
+
+        # add new index
+        document = Document(
+            page_content=segment.content,
+            metadata={
+                "doc_id": segment.index_node_id,
+                "doc_hash": segment.index_node_hash,
+                "document_id": segment.document_id,
+                "dataset_id": segment.dataset_id,
+            }
+        )
+
+        # save vector index
+        if vector_index:
+            vector_index.add_texts([document], duplicate_check=True)
+
+        # save keyword index
+        if keywords and len(keywords) > 0:
+            kw_index.create_segment_keywords(segment.index_node_id, keywords)
+        else:
+            kw_index.add_texts([document])
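
VectorService centralizes the dual-write that individual tasks used to duplicate: the vector index may be None (IndexBuilder appears to return None for economy datasets, hence the guard), while the keyword index is always written. A minimal usage sketch, with segment and dataset as live ORM objects:

    # Index a freshly created segment, with explicit keywords if the user set any...
    VectorService.create_segment_vector(['pricing', 'refunds'], segment, dataset)
    # ...or drop and rebuild both indexes after the segment content changed.
    VectorService.update_segment_vector(None, segment, dataset)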

+ 95 - 0
api/tasks/batch_create_segment_to_index_task.py

@@ -0,0 +1,95 @@
+import datetime
+import logging
+import time
+import uuid
+from typing import Optional, List
+
+import click
+from celery import shared_task
+from sqlalchemy import func
+from werkzeug.exceptions import NotFound
+
+from core.index.index import IndexBuilder
+from core.indexing_runner import IndexingRunner
+from core.model_providers.model_factory import ModelFactory
+from extensions.ext_database import db
+from extensions.ext_redis import redis_client
+from libs import helper
+from models.dataset import DocumentSegment, Dataset, Document
+
+
+@shared_task(queue='dataset')
+def batch_create_segment_to_index_task(job_id: str, content: List, dataset_id: str, document_id: str,
+                                       tenant_id: str, user_id: str):
+    """
+    Async batch create segment to index
+    :param job_id:
+    :param content:
+    :param dataset_id:
+    :param document_id:
+    :param tenant_id:
+    :param user_id:
+
+    Usage: batch_create_segment_to_index_task.delay(segment_id)
+    """
+    logging.info(click.style('Start batch create segment jobId: {}'.format(job_id), fg='green'))
+    start_at = time.perf_counter()
+
+    indexing_cache_key = 'segment_batch_import_{}'.format(job_id)
+
+    try:
+        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
+        if not dataset:
+            raise ValueError('Dataset not exist.')
+
+        dataset_document = db.session.query(Document).filter(Document.id == document_id).first()
+        if not dataset_document:
+            raise ValueError('Document not exist.')
+
+        if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != 'completed':
+            raise ValueError('Document is not available.')
+        document_segments = []
+        for segment in content:
+            content = segment['content']
+            doc_id = str(uuid.uuid4())
+            segment_hash = helper.generate_text_hash(content)
+            embedding_model = ModelFactory.get_embedding_model(
+                tenant_id=dataset.tenant_id,
+                model_provider_name=dataset.embedding_model_provider,
+                model_name=dataset.embedding_model
+            )
+
+            # calc embedding use tokens
+            tokens = embedding_model.get_num_tokens(content)
+            max_position = db.session.query(func.max(DocumentSegment.position)).filter(
+                DocumentSegment.document_id == dataset_document.id
+            ).scalar()
+            segment_document = DocumentSegment(
+                tenant_id=tenant_id,
+                dataset_id=dataset_id,
+                document_id=document_id,
+                index_node_id=doc_id,
+                index_node_hash=segment_hash,
+                position=max_position + 1 if max_position else 1,
+                content=content,
+                word_count=len(content),
+                tokens=tokens,
+                created_by=user_id,
+                indexing_at=datetime.datetime.utcnow(),
+                status='completed',
+                completed_at=datetime.datetime.utcnow()
+            )
+            if dataset_document.doc_form == 'qa_model':
+                segment_document.answer = segment['answer']
+            db.session.add(segment_document)
+            document_segments.append(segment_document)
+        # add index to db
+        indexing_runner = IndexingRunner()
+        indexing_runner.batch_add_segments(document_segments, dataset)
+        db.session.commit()
+        redis_client.setex(indexing_cache_key, 600, 'completed')
+        end_at = time.perf_counter()
+        logging.info(click.style('Segment batch created job: {} latency: {}'.format(job_id, end_at - start_at), fg='green'))
+    except Exception as e:
+        logging.exception("Segments batch created index failed:{}".format(str(e)))
+        redis_client.setex(indexing_cache_key, 600, 'error')
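
The producer can poll the Redis key this task writes to drive the import UI. A sketch of the enqueueing side (IDs hypothetical; the payload shape mirrors what the loop above reads):

    job_id = str(uuid.uuid4())
    batch_create_segment_to_index_task.delay(
        job_id,
        [{'content': 'What is a dataset?', 'answer': 'A collection of documents.'}],
        dataset_id, document_id, tenant_id, user_id,
    )
    # None while the task is still running, then 'completed' or 'error'.
    status = redis_client.get('segment_batch_import_{}'.format(job_id))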

+ 58 - 0
api/tasks/delete_segment_from_index_task.py

@@ -0,0 +1,58 @@
+import logging
+import time
+
+import click
+from celery import shared_task
+from werkzeug.exceptions import NotFound
+
+from core.index.index import IndexBuilder
+from extensions.ext_database import db
+from extensions.ext_redis import redis_client
+from models.dataset import DocumentSegment, Dataset, Document
+
+
+@shared_task(queue='dataset')
+def delete_segment_from_index_task(segment_id: str, index_node_id: str, dataset_id: str, document_id: str):
+    """
+    Async Remove segment from index
+    :param segment_id:
+    :param index_node_id:
+    :param dataset_id:
+    :param document_id:
+
+    Usage: delete_segment_from_index_task.delay(segment_id)
+    """
+    logging.info(click.style('Start delete segment from index: {}'.format(segment_id), fg='green'))
+    start_at = time.perf_counter()
+    indexing_cache_key = 'segment_{}_delete_indexing'.format(segment_id)
+    try:
+        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
+        if not dataset:
+            logging.info(click.style('Segment {} has no dataset, pass.'.format(segment_id), fg='cyan'))
+            return
+
+        dataset_document = db.session.query(Document).filter(Document.id == document_id).first()
+        if not dataset_document:
+            logging.info(click.style('Segment {} has no document, pass.'.format(segment_id), fg='cyan'))
+            return
+
+        if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != 'completed':
+            logging.info(click.style('Segment {} document status is invalid, pass.'.format(segment_id), fg='cyan'))
+            return
+
+        vector_index = IndexBuilder.get_index(dataset, 'high_quality')
+        kw_index = IndexBuilder.get_index(dataset, 'economy')
+
+        # delete from vector index
+        if vector_index:
+            vector_index.delete_by_ids([index_node_id])
+
+        # delete from keyword index
+        kw_index.delete_by_ids([index_node_id])
+
+        end_at = time.perf_counter()
+        logging.info(click.style('Segment deleted from index: {} latency: {}'.format(segment_id, end_at - start_at), fg='green'))
+    except Exception:
+        logging.exception("delete segment from index failed")
+    finally:
+        redis_client.delete(indexing_cache_key)
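
This task is the async half of SegmentService.delete_segment above: the service sets the 'segment_{id}_delete_indexing' lock and enqueues, and the finally block here releases that lock even when index cleanup fails. The enqueue call, for reference:

    delete_segment_from_index_task.delay(segment.id, segment.index_node_id, dataset.id, document.id)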

+ 4 - 4
api/tasks/remove_segment_from_index_task.py → api/tasks/disable_segment_from_index_task.py

@@ -12,14 +12,14 @@ from models.dataset import DocumentSegment
 
 
 
 
 @shared_task(queue='dataset')
-def remove_segment_from_index_task(segment_id: str):
+def disable_segment_from_index_task(segment_id: str):
     """
-    Async Remove segment from index
+    Async disable segment from index
     :param segment_id:

-    Usage: remove_segment_from_index.delay(segment_id)
+    Usage: disable_segment_from_index_task.delay(segment_id)
     """
-    logging.info(click.style('Start remove segment from index: {}'.format(segment_id), fg='green'))
+    logging.info(click.style('Start disable segment from index: {}'.format(segment_id), fg='green'))
     start_at = time.perf_counter()

     segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first()

+ 0 - 11
api/tasks/update_segment_keyword_index_task.py

@@ -52,17 +52,6 @@ def update_segment_keyword_index_task(segment_id: str):
         # delete from keyword index
         kw_index.delete_by_ids([segment.index_node_id])

-        # add new index
-        document = Document(
-            page_content=segment.content,
-            metadata={
-                "doc_id": segment.index_node_id,
-                "doc_hash": segment.index_node_hash,
-                "document_id": segment.document_id,
-                "dataset_id": segment.dataset_id,
-            }
-        )
-
         # save keyword index
         index = IndexBuilder.get_index(dataset, 'economy')
         if index:

+ 21 - 10
web/app/(commonLayout)/datasets/DatasetCard.tsx

@@ -5,13 +5,14 @@ import Link from 'next/link'
 import type { MouseEventHandler } from 'react'
 import { useCallback, useState } from 'react'
 import { useTranslation } from 'react-i18next'
-import classNames from 'classnames'
+import cn from 'classnames'
 import style from '../list.module.css'
 import Confirm from '@/app/components/base/confirm'
 import { ToastContext } from '@/app/components/base/toast'
 import { deleteDataset } from '@/service/datasets'
 import AppIcon from '@/app/components/base/app-icon'
 import type { DataSet } from '@/models/datasets'
+import Tooltip from '@/app/components/base/tooltip'

 export type DatasetCardProps = {
   dataset: DataSet
@@ -45,26 +46,36 @@ const DatasetCard = ({
 
 
   return (
     <>
-      <Link href={`/datasets/${dataset.id}/documents`} className={style.listItem}>
+      <Link href={`/datasets/${dataset.id}/documents`} className={cn(style.listItem)}>
         <div className={style.listItemTitle}>
-          <AppIcon size='small' />
-          <div className={style.listItemHeading}>
-            <div className={style.listItemHeadingContent}>{dataset.name}</div>
+          <AppIcon size='small' className={cn(!dataset.embedding_available && style.unavailable)} />
+          <div className={cn(style.listItemHeading, !dataset.embedding_available && style.unavailable)}>
+            <div className={style.listItemHeadingContent}>
+              {dataset.name}
+            </div>
           </div>
+          {!dataset.embedding_available && (
+            <Tooltip
+              selector={`dataset-tag-${dataset.id}`}
+              htmlContent={t('dataset.unavailableTip')}
+            >
+              <span className='px-1 border boder-gray-200 rounded-md text-gray-500 text-xs font-normal leading-[18px]'>{t('dataset.unavailable')}</span>
+            </Tooltip>
+          )}
           <span className={style.deleteDatasetIcon} onClick={onDeleteClick} />
         </div>
-        <div className={style.listItemDescription}>{dataset.description}</div>
-        <div className={classNames(style.listItemFooter, style.datasetCardFooter)}>
+        <div className={cn(style.listItemDescription, !dataset.embedding_available && style.unavailable)}>{dataset.description}</div>
+        <div className={cn(style.listItemFooter, style.datasetCardFooter, !dataset.embedding_available && style.unavailable)}>
           <span className={style.listItemStats}>
-            <span className={classNames(style.listItemFooterIcon, style.docIcon)} />
+            <span className={cn(style.listItemFooterIcon, style.docIcon)} />
             {dataset.document_count}{t('dataset.documentCount')}
           </span>
           <span className={style.listItemStats}>
-            <span className={classNames(style.listItemFooterIcon, style.textIcon)} />
+            <span className={cn(style.listItemFooterIcon, style.textIcon)} />
             {Math.round(dataset.word_count / 1000)}{t('dataset.wordCount')}
           </span>
           <span className={style.listItemStats}>
-            <span className={classNames(style.listItemFooterIcon, style.applicationIcon)} />
+            <span className={cn(style.listItemFooterIcon, style.applicationIcon)} />
             {dataset.app_count}{t('dataset.appCount')}
           </span>
         </div>

+ 0 - 6
web/app/(commonLayout)/datasets/page.tsx

@@ -1,13 +1,7 @@
-import classNames from 'classnames'
-import { getLocaleOnServer } from '@/i18n/server'
-import { useTranslation } from '@/i18n/i18next-serverside-config'
 import Datasets from './Datasets'
 import DatasetFooter from './DatasetFooter'

 const AppList = async () => {
-  const locale = getLocaleOnServer()
-  const { t } = await useTranslation(locale, 'dataset')
-
   return (
     <div className='flex flex-col overflow-auto bg-gray-100 shrink-0 grow'>
       <Datasets />

+ 8 - 0
web/app/(commonLayout)/list.module.css

@@ -192,3 +192,11 @@
   @apply inline-flex items-center mb-2 text-sm font-medium;
 }
 /* #endregion new app dialog */
+
+.unavailable {
+  @apply opacity-50;
+}
+
+.listItem:hover .unavailable {
+  @apply opacity-100;
+}

+ 16 - 3
web/app/components/app/configuration/dataset-config/card-item/index.tsx

@@ -7,6 +7,7 @@ import TypeIcon from '../type-icon'
 import RemoveIcon from '../../base/icons/remove-icon'
 import s from './style.module.css'
 import { formatNumber } from '@/utils/format'
+import Tooltip from '@/app/components/base/tooltip'

 export type ICardItemProps = {
   className?: string
@@ -36,10 +37,22 @@ const CardItem: FC<ICardItemProps> = ({
           'flex items-center justify-between rounded-xl  px-3 py-2.5 bg-white border border-gray-200  cursor-pointer')
       }>
       <div className='shrink-0 flex items-center space-x-2'>
-        <TypeIcon type="upload_file" />
+        <div className={cn(!config.embedding_available && 'opacity-50')}>
+          <TypeIcon type="upload_file" />
+        </div>
         <div>
-          <div className='w-[160px] text-[13px] leading-[18px] font-medium text-gray-800 overflow-hidden text-ellipsis whitespace-nowrap'>{config.name}</div>
-          <div className='flex text-xs text-gray-500'>
+          <div className='flex items-center w-[160px] mr-1'>
+            <div className={cn('text-[13px] leading-[18px] font-medium text-gray-800 overflow-hidden text-ellipsis whitespace-nowrap', !config.embedding_available && 'opacity-50')}>{config.name}</div>
+            {!config.embedding_available && (
+              <Tooltip
+                selector={`unavailable-tag-${config.id}`}
+                htmlContent={t('dataset.unavailableTip')}
+              >
+                <span className='shrink-0 px-1 border boder-gray-200 rounded-md text-gray-500 text-xs font-normal leading-[18px]'>{t('dataset.unavailable')}</span>
+              </Tooltip>
+            )}
+          </div>
+          <div className={cn('flex text-xs text-gray-500', !config.embedding_available && 'opacity-50')}>
             {formatNumber(config.word_count)} {t('appDebug.feature.dataSet.words')} · {formatNumber(config.document_count)} {t('appDebug.feature.dataSet.textBlocks')}
           </div>
         </div>

+ 15 - 6
web/app/components/app/configuration/dataset-config/select-dataset/index.tsx

@@ -120,15 +120,24 @@ const SelectDataSet: FC<ISelectDataSetProps> = ({
             {datasets.map(item => (
               <div
                 key={item.id}
-                className={cn(s.item, selected.some(i => i.id === item.id) && s.selected, 'flex justify-between items-center h-10 px-2 rounded-lg bg-white border border-gray-200  cursor-pointer')}
-                onClick={() => toggleSelect(item)}
+                className={cn(s.item, selected.some(i => i.id === item.id) && s.selected, 'flex justify-between items-center h-10 px-2 rounded-lg bg-white border border-gray-200  cursor-pointer', !item.embedding_available && s.disabled)}
+                onClick={() => {
+                  if (!item.embedding_available)
+                    return
+                  toggleSelect(item)
+                }}
               >
-                <div className='flex items-center space-x-2'>
-                  <TypeIcon type="upload_file" size='md' />
-                  <div className='max-w-[200px] text-[13px] font-medium text-gray-800 overflow-hidden text-ellipsis whitespace-nowrap'>{item.name}</div>
+                <div className='mr-1 flex items-center'>
+                  <div className={cn('mr-2', !item.embedding_available && 'opacity-50')}>
+                    <TypeIcon type="upload_file" size='md' />
+                  </div>
+                  <div className={cn('max-w-[200px] text-[13px] font-medium text-gray-800 overflow-hidden text-ellipsis whitespace-nowrap', !item.embedding_available && 'opacity-50 !max-w-[120px]')}>{item.name}</div>
+                  {!item.embedding_available && (
+                    <span className='ml-1 shrink-0 px-1 border boder-gray-200 rounded-md text-gray-500 text-xs font-normal leading-[18px]'>{t('dataset.unavailable')}</span>
+                  )}
                 </div>

-                <div className='flex text-xs text-gray-500 overflow-hidden whitespace-nowrap'>
+                <div className={cn('shrink-0 flex text-xs text-gray-500 overflow-hidden whitespace-nowrap', !item.embedding_available && 'opacity-50')}>
                   <span className='max-w-[100px] overflow-hidden text-ellipsis whitespace-nowrap'>{formatNumber(item.word_count)}</span>
                   {t('appDebug.feature.dataSet.words')}
                   <span className='px-0.5'>·</span>

+ 5 - 1
web/app/components/app/configuration/dataset-config/select-dataset/style.module.css

@@ -6,4 +6,8 @@
 .item.selected {
   background: #F5F8FF;
   border-color: #528BFF;
-}
+}
+
+.item.disabled {
+  @apply bg-white border-gray-200 cursor-default;
+}

+ 4 - 4
web/app/components/base/icons/assets/vender/line/general/dots-horizontal.svg

@@ -1,9 +1,9 @@
-<svg width="12" height="12" viewBox="0 0 12 12" fill="none" xmlns="http://www.w3.org/2000/svg">
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
 <g id="Icon">
 <g id="Icon">
 <g id="Icon_2">
 <g id="Icon_2">
-<path d="M6 6.5C6.27614 6.5 6.5 6.27614 6.5 6C6.5 5.72386 6.27614 5.5 6 5.5C5.72386 5.5 5.5 5.72386 5.5 6C5.5 6.27614 5.72386 6.5 6 6.5Z" stroke="#344054" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M9.5 6.5C9.77614 6.5 10 6.27614 10 6C10 5.72386 9.77614 5.5 9.5 5.5C9.22386 5.5 9 5.72386 9 6C9 6.27614 9.22386 6.5 9.5 6.5Z" stroke="#344054" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
-<path d="M2.5 6.5C2.77614 6.5 3 6.27614 3 6C3 5.72386 2.77614 5.5 2.5 5.5C2.22386 5.5 2 5.72386 2 6C2 6.27614 2.22386 6.5 2.5 6.5Z" stroke="#344054" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M8.00008 8.66634C8.36827 8.66634 8.66675 8.36786 8.66675 7.99967C8.66675 7.63148 8.36827 7.33301 8.00008 7.33301C7.63189 7.33301 7.33341 7.63148 7.33341 7.99967C7.33341 8.36786 7.63189 8.66634 8.00008 8.66634Z" stroke="#344054" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M12.6667 8.66634C13.0349 8.66634 13.3334 8.36786 13.3334 7.99967C13.3334 7.63148 13.0349 7.33301 12.6667 7.33301C12.2986 7.33301 12.0001 7.63148 12.0001 7.99967C12.0001 8.36786 12.2986 8.66634 12.6667 8.66634Z" stroke="#344054" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+<path d="M3.33341 8.66634C3.7016 8.66634 4.00008 8.36786 4.00008 7.99967C4.00008 7.63148 3.7016 7.33301 3.33341 7.33301C2.96522 7.33301 2.66675 7.63148 2.66675 7.99967C2.66675 8.36786 2.96522 8.66634 3.33341 8.66634Z" stroke="#344054" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
 </g>
 </g>
 </g>
 </g>
 </svg>
 </svg>

+ 7 - 7
web/app/components/base/icons/src/vender/line/general/DotsHorizontal.json

@@ -4,9 +4,9 @@
 		"isRootNode": true,
 		"isRootNode": true,
 		"name": "svg",
 		"name": "svg",
 		"attributes": {
 		"attributes": {
-			"width": "12",
-			"height": "12",
-			"viewBox": "0 0 12 12",
+			"width": "16",
+			"height": "16",
+			"viewBox": "0 0 16 16",
 			"fill": "none",
 			"fill": "none",
 			"xmlns": "http://www.w3.org/2000/svg"
 			"xmlns": "http://www.w3.org/2000/svg"
 		},
 		},
@@ -29,7 +29,7 @@
 								"type": "element",
 								"type": "element",
 								"name": "path",
 								"name": "path",
 								"attributes": {
 								"attributes": {
-									"d": "M6 6.5C6.27614 6.5 6.5 6.27614 6.5 6C6.5 5.72386 6.27614 5.5 6 5.5C5.72386 5.5 5.5 5.72386 5.5 6C5.5 6.27614 5.72386 6.5 6 6.5Z",
+									"d": "M8.00008 8.66634C8.36827 8.66634 8.66675 8.36786 8.66675 7.99967C8.66675 7.63148 8.36827 7.33301 8.00008 7.33301C7.63189 7.33301 7.33341 7.63148 7.33341 7.99967C7.33341 8.36786 7.63189 8.66634 8.00008 8.66634Z",
 									"stroke": "currentColor",
 									"stroke": "currentColor",
 									"stroke-width": "1.5",
 									"stroke-width": "1.5",
 									"stroke-linecap": "round",
 									"stroke-linecap": "round",
@@ -41,7 +41,7 @@
 								"type": "element",
 								"type": "element",
 								"name": "path",
 								"name": "path",
 								"attributes": {
 								"attributes": {
-									"d": "M9.5 6.5C9.77614 6.5 10 6.27614 10 6C10 5.72386 9.77614 5.5 9.5 5.5C9.22386 5.5 9 5.72386 9 6C9 6.27614 9.22386 6.5 9.5 6.5Z",
+									"d": "M12.6667 8.66634C13.0349 8.66634 13.3334 8.36786 13.3334 7.99967C13.3334 7.63148 13.0349 7.33301 12.6667 7.33301C12.2986 7.33301 12.0001 7.63148 12.0001 7.99967C12.0001 8.36786 12.2986 8.66634 12.6667 8.66634Z",
 									"stroke": "currentColor",
 									"stroke": "currentColor",
 									"stroke-width": "1.5",
 									"stroke-width": "1.5",
 									"stroke-linecap": "round",
 									"stroke-linecap": "round",
@@ -53,7 +53,7 @@
 								"type": "element",
 								"type": "element",
 								"name": "path",
 								"name": "path",
 								"attributes": {
 								"attributes": {
-									"d": "M2.5 6.5C2.77614 6.5 3 6.27614 3 6C3 5.72386 2.77614 5.5 2.5 5.5C2.22386 5.5 2 5.72386 2 6C2 6.27614 2.22386 6.5 2.5 6.5Z",
+									"d": "M3.33341 8.66634C3.7016 8.66634 4.00008 8.36786 4.00008 7.99967C4.00008 7.63148 3.7016 7.33301 3.33341 7.33301C2.96522 7.33301 2.66675 7.63148 2.66675 7.99967C2.66675 8.36786 2.96522 8.66634 3.33341 8.66634Z",
 									"stroke": "currentColor",
 									"stroke": "currentColor",
 									"stroke-width": "1.5",
 									"stroke-width": "1.5",
 									"stroke-linecap": "round",
 									"stroke-linecap": "round",
@@ -68,4 +68,4 @@
 		]
 	},
 	"name": "DotsHorizontal"
-}
+}

+ 13 - 8
web/app/components/base/popover/index.tsx

@@ -9,6 +9,7 @@ type IPopover = {
   position?: 'bottom' | 'br'
   btnElement?: string | React.ReactNode
   btnClassName?: string | ((open: boolean) => string)
+  manualClose?: boolean
 }

 const timeoutDuration = 100
@@ -20,6 +21,7 @@ export default function CustomPopover({
   btnElement,
   className,
   btnClassName,
+  manualClose,
 }: IPopover) {
   const buttonRef = useRef<HTMLButtonElement>(null)
   const timeOutRef = useRef<NodeJS.Timeout | null>(null)
@@ -62,17 +64,14 @@ export default function CustomPopover({
               </Popover.Button>
               </Popover.Button>
               <Transition as={Fragment}>
               <Transition as={Fragment}>
                 <Popover.Panel
                 <Popover.Panel
-                  className={`${s.popupPanel} ${
-                    position === 'br'
-                      ? 'right-0'
-                      : 'transform -translate-x-1/2 left-1/2'
-                  } ${className}`}
+                  className={`${s.popupPanel} ${position === 'br' ? 'right-0' : 'translate-x-1/2 left-1/2'} ${className}`}
                   {...(trigger !== 'hover'
                   {...(trigger !== 'hover'
                     ? {}
                     ? {}
                     : {
                     : {
                       onMouseLeave: () => onMouseLeave(open),
                       onMouseLeave: () => onMouseLeave(open),
                       onMouseEnter: () => onMouseEnter(open),
                       onMouseEnter: () => onMouseEnter(open),
-                    })}
+                    })
+                  }
                 >
                 >
                   {({ close }) => (
                   {({ close }) => (
                     <div
                     <div
@@ -82,10 +81,16 @@ export default function CustomPopover({
                         : {
                           onMouseLeave: () => onMouseLeave(open),
                           onMouseEnter: () => onMouseEnter(open),
-                        })}
+                        })
+                      }
                     >
                       {cloneElement(htmlContent as React.ReactElement, {
-                        onClose: () => close(),
+                        onClose: () => onMouseLeave(open),
+                        ...(manualClose
+                          ? {
+                            onClick: close,
+                          }
+                          : {}),
                       })}
                     </div>
                   )}
+ 1 - 1
web/app/components/datasets/create/file-uploader/index.tsx

@@ -29,7 +29,7 @@ const ACCEPTS = [
   '.txt',
   '.txt',
   // '.xls',
   '.xlsx',
-  '.csv',
+  // '.csv',
 ]

 const FileUploader = ({
+ 44 - 19
web/app/components/datasets/create/step-two/index.tsx

@@ -2,12 +2,14 @@
 'use client'
 'use client'
 import React, { useEffect, useLayoutEffect, useRef, useState } from 'react'
 import { useTranslation } from 'react-i18next'
+import { useContext } from 'use-context-selector'
 import { useBoolean } from 'ahooks'
 import { XMarkIcon } from '@heroicons/react/20/solid'
 import cn from 'classnames'
 import Link from 'next/link'
 import { groupBy } from 'lodash-es'
 import PreviewItem, { PreviewType } from './preview-item'
+import LanguageSelect from './language-select'
 import s from './index.module.css'
 import type { CreateDocumentReq, CustomFile, FullDocumentDetail, FileIndexingEstimateResponse as IndexingEstimateResponse, NotionInfo, PreProcessingRule, Rules, createDocumentResponse } from '@/models/datasets'
 import {
@@ -22,11 +24,13 @@ import Loading from '@/app/components/base/loading'
 import Toast from '@/app/components/base/toast'
 import { formatNumber } from '@/utils/format'
 import type { DataSourceNotionPage } from '@/models/common'
-import { DataSourceType } from '@/models/datasets'
+import { DataSourceType, DocForm } from '@/models/datasets'
 import NotionIcon from '@/app/components/base/notion-icon'
 import Switch from '@/app/components/base/switch'
 import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
+import { XClose } from '@/app/components/base/icons/src/vender/line/general'
 import { useDatasetDetailContext } from '@/context/dataset-detail'
+import I18n from '@/context/i18n'
 import { IS_CE_EDITION } from '@/config'

 type Page = DataSourceNotionPage & { workspace_id: string }
@@ -56,10 +60,6 @@ enum IndexingType {
   QUALIFIED = 'high_quality',
   QUALIFIED = 'high_quality',
   ECONOMICAL = 'economy',
   ECONOMICAL = 'economy',
 }
 }
-enum DocForm {
-  TEXT = 'text_model',
-  QA = 'qa_model',
-}
 
 
 const StepTwo = ({
 const StepTwo = ({
   isSetting,
   isSetting,
@@ -78,6 +78,8 @@ const StepTwo = ({
   onCancel,
   onCancel,
 }: StepTwoProps) => {
 }: StepTwoProps) => {
   const { t } = useTranslation()
   const { t } = useTranslation()
+  const { locale } = useContext(I18n)
+
   const { mutateDatasetRes } = useDatasetDetailContext()
   const { mutateDatasetRes } = useDatasetDetailContext()
   const scrollRef = useRef<HTMLDivElement>(null)
   const scrollRef = useRef<HTMLDivElement>(null)
   const [scrolled, setScrolled] = useState(false)
   const [scrolled, setScrolled] = useState(false)
@@ -98,6 +100,8 @@ const StepTwo = ({
   const [docForm, setDocForm] = useState<DocForm | string>(
   const [docForm, setDocForm] = useState<DocForm | string>(
     datasetId && documentDetail ? documentDetail.doc_form : DocForm.TEXT,
     datasetId && documentDetail ? documentDetail.doc_form : DocForm.TEXT,
   )
   )
+  const [docLanguage, setDocLanguage] = useState<string>(locale === 'en' ? 'English' : 'Chinese')
+  const [QATipHide, setQATipHide] = useState(false)
   const [previewSwitched, setPreviewSwitched] = useState(false)
   const [previewSwitched, setPreviewSwitched] = useState(false)
   const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
   const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
   const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<IndexingEstimateResponse | null>(null)
   const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<IndexingEstimateResponse | null>(null)
@@ -230,6 +234,8 @@ const StepTwo = ({
         indexing_technique: getIndexing_technique(),
         indexing_technique: getIndexing_technique(),
         process_rule: getProcessRule(),
         process_rule: getProcessRule(),
         doc_form: docForm,
         doc_form: docForm,
+        doc_language: docLanguage,
+        dataset_id: datasetId,
       }
       }
     }
     }
     if (dataSourceType === DataSourceType.NOTION) {
     if (dataSourceType === DataSourceType.NOTION) {
@@ -241,6 +247,8 @@ const StepTwo = ({
         indexing_technique: getIndexing_technique(),
         indexing_technique: getIndexing_technique(),
         process_rule: getProcessRule(),
         process_rule: getProcessRule(),
         doc_form: docForm,
         doc_form: docForm,
+        doc_language: docLanguage,
+        dataset_id: datasetId,
       }
       }
     }
     }
     return params
     return params
@@ -252,6 +260,7 @@ const StepTwo = ({
       params = {
       params = {
         original_document_id: documentDetail?.id,
         original_document_id: documentDetail?.id,
         doc_form: docForm,
         doc_form: docForm,
+        doc_language: docLanguage,
         process_rule: getProcessRule(),
         process_rule: getProcessRule(),
       } as CreateDocumentReq
       } as CreateDocumentReq
     }
     }
@@ -266,6 +275,7 @@ const StepTwo = ({
         indexing_technique: getIndexing_technique(),
         indexing_technique: getIndexing_technique(),
         process_rule: getProcessRule(),
         process_rule: getProcessRule(),
         doc_form: docForm,
         doc_form: docForm,
+        doc_language: docLanguage,
       } as CreateDocumentReq
       } as CreateDocumentReq
       if (dataSourceType === DataSourceType.FILE) {
       if (dataSourceType === DataSourceType.FILE) {
         params.data_source.info_list.file_info_list = {
         params.data_source.info_list.file_info_list = {
@@ -348,6 +358,10 @@ const StepTwo = ({
       setDocForm(DocForm.TEXT)
       setDocForm(DocForm.TEXT)
   }
   }
 
 
+  const handleSelect = (language: string) => {
+    setDocLanguage(language)
+  }
+
   const changeToEconomicalType = () => {
   const changeToEconomicalType = () => {
     if (!hasSetIndexType) {
     if (!hasSetIndexType) {
       setIndexType(IndexingType.ECONOMICAL)
       setIndexType(IndexingType.ECONOMICAL)
@@ -574,21 +588,32 @@ const StepTwo = ({
               </div>
               </div>
             )}
             )}
             {IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
             {IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
-              <div className='flex justify-between items-center mt-3 px-5 py-4 rounded-xl bg-gray-50 border border-gray-100'>
-                <div className='flex justify-center items-center w-8 h-8 rounded-lg bg-indigo-50'>
-                  <MessageChatSquare className='w-4 h-4' />
-                </div>
-                <div className='grow mx-3'>
-                  <div className='mb-[2px] text-md font-medium text-gray-900'>{t('datasetCreation.stepTwo.QATitle')}</div>
-                  <div className='text-[13px] leading-[18px] text-gray-500'>{t('datasetCreation.stepTwo.QATip')}</div>
-                </div>
-                <div className='shrink-0'>
-                  <Switch
-                    defaultValue={docForm === DocForm.QA}
-                    onChange={handleSwitch}
-                    size='md'
-                  />
+              <div className='mt-3 rounded-xl bg-gray-50 border border-gray-100'>
+                <div className='flex justify-between items-center px-5 py-4'>
+                  <div className='flex justify-center items-center w-8 h-8 rounded-lg bg-indigo-50'>
+                    <MessageChatSquare className='w-4 h-4' />
+                  </div>
+                  <div className='grow mx-3'>
+                    <div className='mb-[2px] text-md font-medium text-gray-900'>{t('datasetCreation.stepTwo.QATitle')}</div>
+                    <div className='inline-flex items-center text-[13px] leading-[18px] text-gray-500'>
+                      <span className='pr-1'>{t('datasetCreation.stepTwo.QALanguage')}</span>
+                      <LanguageSelect currentLanguage={docLanguage} onSelect={handleSelect} />
+                    </div>
+                  </div>
+                  <div className='shrink-0'>
+                    <Switch
+                      defaultValue={docForm === DocForm.QA}
+                      onChange={handleSwitch}
+                      size='md'
+                    />
+                  </div>
                 </div>
                 </div>
+                {docForm === DocForm.QA && !QATipHide && (
+                  <div className='flex justify-between items-center px-5 py-2 bg-orange-50 border-t border-amber-100 rounded-b-xl text-[13px] leading-[18px] text-medium text-amber-500'>
+                    {t('datasetCreation.stepTwo.QATip')}
+                    <XClose className='w-4 h-4 text-gray-500 cursor-pointer' onClick={() => setQATipHide(true)} />
+                  </div>
+                )}
               </div>
               </div>
             )}
             )}
             <div className={s.source}>
             <div className={s.source}>
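
Not part of the commit -- an illustrative sketch of what this file's changes add to the create-document request body (values are examples; the real shape is CreateDocumentReq in @/models/datasets):

// Illustrative only: example values, not the component's actual state.
const buildCreateParams = (datasetId: string, docLanguage: string) => ({
  indexing_technique: 'high_quality',
  process_rule: { mode: 'automatic', rules: {} },
  doc_form: 'qa_model',      // DocForm.QA -- now imported from @/models/datasets
  doc_language: docLanguage, // new field: language used for LLM Q&A generation
  dataset_id: datasetId,     // new field: forwarded with the request
})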

+ 38 - 0
web/app/components/datasets/create/step-two/language-select/index.tsx

@@ -0,0 +1,38 @@
+'use client'
+import type { FC } from 'react'
+import React from 'react'
+import cn from 'classnames'
+import { ChevronDown } from '@/app/components/base/icons/src/vender/line/arrows'
+import Popover from '@/app/components/base/popover'
+
+export type ILanguageSelectProps = {
+  currentLanguage: string
+  onSelect: (language: string) => void
+}
+
+const LanguageSelect: FC<ILanguageSelectProps> = ({
+  currentLanguage,
+  onSelect,
+}) => {
+  return (
+    <Popover
+      manualClose
+      trigger='click'
+      htmlContent={
+        <div className='w-full py-1'>
+          <div className='py-2 px-3 mx-1 flex items-center gap-2 hover:bg-gray-100 rounded-lg cursor-pointer text-gray-700 text-sm' onClick={() => onSelect('English')}>English</div>
+          <div className='py-2 px-3 mx-1 flex items-center gap-2 hover:bg-gray-100 rounded-lg cursor-pointer text-gray-700 text-sm' onClick={() => onSelect('Chinese')}>简体中文</div>
+        </div>
+      }
+      btnElement={
+        <div className='inline-flex items-center'>
+          <span className='pr-[2px] text-xs leading-[18px] font-medium'>{currentLanguage === 'English' ? 'English' : '简体中文'}</span>
+          <ChevronDown className='w-3 h-3 opacity-60' />
+        </div>
+      }
+      btnClassName={open => cn('!border-0 !px-0 !py-0 !bg-inherit !hover:bg-inherit', open ? 'text-blue-600' : 'text-gray-500')}
+      className='!w-[120px] h-fit !z-20 !translate-x-0 !left-[-16px]'
+    />
+  )
+}
+export default React.memo(LanguageSelect)
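
Not part of the commit -- a minimal controlled-usage sketch (the demo component is hypothetical; step-two above wires it the same way):

import React, { useState } from 'react'
import LanguageSelect from '@/app/components/datasets/create/step-two/language-select'

// LanguageSelect is controlled: the parent owns the language string
// ('English' | 'Chinese') and receives changes through onSelect.
const LanguageDemo: React.FC = () => {
  const [docLanguage, setDocLanguage] = useState<string>('English')
  return <LanguageSelect currentLanguage={docLanguage} onSelect={setDocLanguage} />
}
export default LanguageDemo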

+ 108 - 0
web/app/components/datasets/documents/detail/batch-modal/csv-downloader.tsx

@@ -0,0 +1,108 @@
+'use client'
+import type { FC } from 'react'
+import React from 'react'
+import {
+  useCSVDownloader,
+} from 'react-papaparse'
+import { useTranslation } from 'react-i18next'
+import { useContext } from 'use-context-selector'
+import { Download02 as DownloadIcon } from '@/app/components/base/icons/src/vender/solid/general'
+import { DocForm } from '@/models/datasets'
+import I18n from '@/context/i18n'
+
+const CSV_TEMPLATE_QA_EN = [
+  ['question', 'answer'],
+  ['question1', 'answer1'],
+  ['question2', 'answer2'],
+]
+const CSV_TEMPLATE_QA_CN = [
+  ['问题', '答案'],
+  ['问题 1', '答案 1'],
+  ['问题 2', '答案 2'],
+]
+const CSV_TEMPLATE_EN = [
+  ['segment content'],
+  ['content1'],
+  ['content2'],
+]
+const CSV_TEMPLATE_CN = [
+  ['分段内容'],
+  ['内容 1'],
+  ['内容 2'],
+]
+
+const CSVDownload: FC<{ docForm: DocForm }> = ({ docForm }) => {
+  const { t } = useTranslation()
+  const { locale } = useContext(I18n)
+  const { CSVDownloader, Type } = useCSVDownloader()
+
+  const getTemplate = () => {
+    if (locale === 'en') {
+      if (docForm === DocForm.QA)
+        return CSV_TEMPLATE_QA_EN
+      return CSV_TEMPLATE_EN
+    }
+    if (docForm === DocForm.QA)
+      return CSV_TEMPLATE_QA_CN
+    return CSV_TEMPLATE_CN
+  }
+
+  return (
+    <div className='mt-6'>
+      <div className='text-sm text-gray-900 font-medium'>{t('share.generation.csvStructureTitle')}</div>
+      <div className='mt-2 max-h-[500px] overflow-auto'>
+        {docForm === DocForm.QA && (
+          <table className='table-fixed w-full border-separate border-spacing-0 border border-gray-200 rounded-lg text-xs'>
+            <thead className='text-gray-500'>
+              <tr>
+                <td className='h-9 pl-3 pr-2 border-b border-gray-200'>{t('datasetDocuments.list.batchModal.question')}</td>
+                <td className='h-9 pl-3 pr-2 border-b border-gray-200'>{t('datasetDocuments.list.batchModal.answer')}</td>
+              </tr>
+            </thead>
+            <tbody className='text-gray-700'>
+              <tr>
+                <td className='h-9 pl-3 pr-2 border-b border-gray-100 text-[13px]'>{t('datasetDocuments.list.batchModal.question')} 1</td>
+                <td className='h-9 pl-3 pr-2 border-b border-gray-100 text-[13px]'>{t('datasetDocuments.list.batchModal.answer')} 1</td>
+              </tr>
+              <tr>
+                <td className='h-9 pl-3 pr-2 text-[13px]'>{t('datasetDocuments.list.batchModal.question')} 2</td>
+                <td className='h-9 pl-3 pr-2 text-[13px]'>{t('datasetDocuments.list.batchModal.answer')} 2</td>
+              </tr>
+            </tbody>
+          </table>
+        )}
+        {docForm === DocForm.TEXT && (
+          <table className='table-fixed w-full border-separate border-spacing-0 border border-gray-200 rounded-lg text-xs'>
+            <thead className='text-gray-500'>
+              <tr>
+                <td className='h-9 pl-3 pr-2 border-b border-gray-200'>{t('datasetDocuments.list.batchModal.contentTitle')}</td>
+              </tr>
+            </thead>
+            <tbody className='text-gray-700'>
+              <tr>
+                <td className='h-9 pl-3 pr-2 border-b border-gray-100 text-[13px]'>{t('datasetDocuments.list.batchModal.content')} 1</td>
+              </tr>
+              <tr>
+                <td className='h-9 pl-3 pr-2 text-[13px]'>{t('datasetDocuments.list.batchModal.content')} 2</td>
+              </tr>
+            </tbody>
+          </table>
+        )}
+      </div>
+      <CSVDownloader
+        className="block mt-2 cursor-pointer"
+        type={Type.Link}
+        filename={'template'}
+        bom={true}
+        data={getTemplate()}
+      >
+        <div className='flex items-center h-[18px] space-x-1 text-[#155EEF] text-xs font-medium'>
+          <DownloadIcon className='w-3 h-3 mr-1' />
+          {t('datasetDocuments.list.batchModal.template')}
+        </div>
+      </CSVDownloader>
+    </div>
+
+  )
+}
+export default React.memo(CSVDownload)
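
Not part of the commit -- a standalone usage sketch. getTemplate picks one of the four template arrays by locale and doc form, and `bom={true}` prepends a UTF-8 byte-order mark so the Chinese headers open correctly in Excel (behavior of react-papaparse's CSVDownloader, as used above):

import React from 'react'
import { DocForm } from '@/models/datasets'
import CSVDownload from '@/app/components/datasets/documents/detail/batch-modal/csv-downloader'

// Hypothetical host: the component only needs the document form;
// locale (en vs zh) is read from the I18n context internally.
const TemplateSection: React.FC = () => <CSVDownload docForm={DocForm.QA} />
export default TemplateSection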

+ 126 - 0
web/app/components/datasets/documents/detail/batch-modal/csv-uploader.tsx

@@ -0,0 +1,126 @@
+'use client'
+import type { FC } from 'react'
+import React, { useEffect, useRef, useState } from 'react'
+import cn from 'classnames'
+import { useTranslation } from 'react-i18next'
+import { useContext } from 'use-context-selector'
+import { Csv as CSVIcon } from '@/app/components/base/icons/src/public/files'
+import { ToastContext } from '@/app/components/base/toast'
+import { Trash03 } from '@/app/components/base/icons/src/vender/line/general'
+import Button from '@/app/components/base/button'
+
+export type Props = {
+  file: File | undefined
+  updateFile: (file?: File) => void
+}
+
+const CSVUploader: FC<Props> = ({
+  file,
+  updateFile,
+}) => {
+  const { t } = useTranslation()
+  const { notify } = useContext(ToastContext)
+  const [dragging, setDragging] = useState(false)
+  const dropRef = useRef<HTMLDivElement>(null)
+  const dragRef = useRef<HTMLDivElement>(null)
+  const fileUploader = useRef<HTMLInputElement>(null)
+
+  const handleDragEnter = (e: DragEvent) => {
+    e.preventDefault()
+    e.stopPropagation()
+    e.target !== dragRef.current && setDragging(true)
+  }
+  const handleDragOver = (e: DragEvent) => {
+    e.preventDefault()
+    e.stopPropagation()
+  }
+  const handleDragLeave = (e: DragEvent) => {
+    e.preventDefault()
+    e.stopPropagation()
+    e.target === dragRef.current && setDragging(false)
+  }
+  const handleDrop = (e: DragEvent) => {
+    e.preventDefault()
+    e.stopPropagation()
+    setDragging(false)
+    if (!e.dataTransfer)
+      return
+    const files = [...e.dataTransfer.files]
+    if (files.length > 1) {
+      notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.count') })
+      return
+    }
+    updateFile(files[0])
+  }
+  const selectHandle = () => {
+    if (fileUploader.current)
+      fileUploader.current.click()
+  }
+  const removeFile = () => {
+    if (fileUploader.current)
+      fileUploader.current.value = ''
+    updateFile()
+  }
+  const fileChangeHandle = (e: React.ChangeEvent<HTMLInputElement>) => {
+    const currentFile = e.target.files?.[0]
+    updateFile(currentFile)
+  }
+
+  useEffect(() => {
+    dropRef.current?.addEventListener('dragenter', handleDragEnter)
+    dropRef.current?.addEventListener('dragover', handleDragOver)
+    dropRef.current?.addEventListener('dragleave', handleDragLeave)
+    dropRef.current?.addEventListener('drop', handleDrop)
+    return () => {
+      dropRef.current?.removeEventListener('dragenter', handleDragEnter)
+      dropRef.current?.removeEventListener('dragover', handleDragOver)
+      dropRef.current?.removeEventListener('dragleave', handleDragLeave)
+      dropRef.current?.removeEventListener('drop', handleDrop)
+    }
+  }, [])
+
+  return (
+    <div className='mt-6'>
+      <input
+        ref={fileUploader}
+        style={{ display: 'none' }}
+        type="file"
+        id="fileUploader"
+        accept='.csv'
+        onChange={fileChangeHandle}
+      />
+      <div ref={dropRef}>
+        {!file && (
+          <div className={cn('flex items-center h-20 rounded-xl bg-gray-50 border border-dashed border-gray-200 text-sm font-normal', dragging && 'bg-[#F5F8FF] border border-[#B2CCFF]')}>
+            <div className='w-full flex items-center justify-center space-x-2'>
+              <CSVIcon className="shrink-0" />
+              <div className='text-gray-500'>
+                {t('datasetDocuments.list.batchModal.csvUploadTitle')}
+                <span className='text-primary-400 cursor-pointer' onClick={selectHandle}>{t('datasetDocuments.list.batchModal.browse')}</span>
+              </div>
+            </div>
+            {dragging && <div ref={dragRef} className='absolute w-full h-full top-0 left-0'/>}
+          </div>
+        )}
+        {file && (
+          <div className={cn('flex items-center h-20 px-6 rounded-xl bg-gray-50 border border-gray-200 text-sm font-normal group', 'hover:bg-[#F5F8FF] hover:border-[#B2CCFF]')}>
+            <CSVIcon className="shrink-0" />
+            <div className='flex ml-2 w-0 grow'>
+              <span className='max-w-[calc(100%_-_30px)] text-ellipsis whitespace-nowrap overflow-hidden text-gray-800'>{file.name.replace(/.csv$/, '')}</span>
+              <span className='shrink-0 text-gray-500'>.csv</span>
+            </div>
+            <div className='hidden group-hover:flex items-center'>
+              <Button className='!h-8 !px-3 !py-[6px] bg-white !text-[13px] !leading-[18px] text-gray-700' onClick={selectHandle}>{t('datasetCreation.stepOne.uploader.change')}</Button>
+              <div className='mx-2 w-px h-4 bg-gray-200' />
+              <div className='p-2 cursor-pointer' onClick={removeFile}>
+                <Trash03 className='w-4 h-4 text-gray-500' />
+              </div>
+            </div>
+          </div>
+        )}
+      </div>
+    </div>
+  )
+}
+
+export default React.memo(CSVUploader)
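
Not part of the commit -- a minimal controlled-usage sketch (hypothetical host component; BatchModal below hosts it the same way):

import React, { useState } from 'react'
import CSVUploader from '@/app/components/datasets/documents/detail/batch-modal/csv-uploader'

// CSVUploader is fully controlled: drag-and-drop and the hidden file input
// both report through updateFile, and calling updateFile() clears the file.
const UploadSection: React.FC = () => {
  const [file, setFile] = useState<File>()
  return <CSVUploader file={file} updateFile={setFile} />
}
export default UploadSection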

+ 65 - 0
web/app/components/datasets/documents/detail/batch-modal/index.tsx

@@ -0,0 +1,65 @@
+'use client'
+import type { FC } from 'react'
+import React, { useEffect, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import CSVUploader from './csv-uploader'
+import CSVDownloader from './csv-downloader'
+import Button from '@/app/components/base/button'
+import Modal from '@/app/components/base/modal'
+import { XClose } from '@/app/components/base/icons/src/vender/line/general'
+import type { DocForm } from '@/models/datasets'
+
+export type IBatchModalProps = {
+  isShow: boolean
+  docForm: DocForm
+  onCancel: () => void
+  onConfirm: (file: File) => void
+}
+
+const BatchModal: FC<IBatchModalProps> = ({
+  isShow,
+  docForm,
+  onCancel,
+  onConfirm,
+}) => {
+  const { t } = useTranslation()
+  const [currentCSV, setCurrentCSV] = useState<File>()
+  const handleFile = (file?: File) => setCurrentCSV(file)
+
+  const handleSend = () => {
+    if (!currentCSV)
+      return
+    onCancel()
+    onConfirm(currentCSV)
+  }
+
+  useEffect(() => {
+    if (!isShow)
+      setCurrentCSV(undefined)
+  }, [isShow])
+
+  return (
+    <Modal isShow={isShow} onClose={() => {}} className='px-8 py-6 !max-w-[520px] !rounded-xl'>
+      <div className='relative pb-1 text-xl font-medium leading-[30px] text-gray-900'>{t('datasetDocuments.list.batchModal.title')}</div>
+      <div className='absolute right-4 top-4 p-2 cursor-pointer' onClick={onCancel}>
+        <XClose className='w-4 h-4 text-gray-500' />
+      </div>
+      <CSVUploader
+        file={currentCSV}
+        updateFile={handleFile}
+      />
+      <CSVDownloader
+        docForm={docForm}
+      />
+      <div className='mt-[28px] pt-6 flex justify-end'>
+        <Button className='mr-2 text-gray-700 text-sm font-medium' onClick={onCancel}>
+          {t('datasetDocuments.list.batchModal.cancel')}
+        </Button>
+        <Button className='text-sm font-medium' type="primary" onClick={handleSend} disabled={!currentCSV}>
+          {t('datasetDocuments.list.batchModal.run')}
+        </Button>
+      </div>
+    </Modal>
+  )
+}
+export default React.memo(BatchModal)
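
Not part of the commit -- a wiring sketch matching how detail/index.tsx consumes the modal below (the host component and its handler body are hypothetical):

import React, { useState } from 'react'
import BatchModal from '@/app/components/datasets/documents/detail/batch-modal'
import { DocForm } from '@/models/datasets'

// Note the ordering in handleSend above: onConfirm receives the chosen CSV
// only after the modal has already asked the parent to close it via onCancel.
const BatchImportHost: React.FC = () => {
  const [visible, setVisible] = useState(false)
  return (
    <BatchModal
      isShow={visible}
      docForm={DocForm.TEXT}
      onCancel={() => setVisible(false)}
      onConfirm={(csv: File) => { /* kick off the batch import with csv */ }}
    />
  )
}
export default BatchImportHost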

+ 7 - 0
web/app/components/datasets/documents/detail/completed/InfiniteVirtualList.tsx

@@ -13,6 +13,9 @@ type IInfiniteVirtualListProps = {
   loadNextPage: () => Promise<any> // Callback function responsible for loading the next page of items.
   onClick: (detail: SegmentDetailModel) => void
   onChangeSwitch: (segId: string, enabled: boolean) => Promise<void>
+  onDelete: (segId: string) => Promise<void>
+  archived?: boolean
+
 }
 
 const InfiniteVirtualList: FC<IInfiniteVirtualListProps> = ({
@@ -22,6 +25,8 @@ const InfiniteVirtualList: FC<IInfiniteVirtualListProps> = ({
   loadNextPage,
   onClick: onClickCard,
   onChangeSwitch,
+  onDelete,
+  archived,
 }) => {
   // If there are more items to be loaded then add an extra row to hold a loading indicator.
   const itemCount = hasNextPage ? items.length + 1 : items.length
@@ -52,7 +57,9 @@ const InfiniteVirtualList: FC<IInfiniteVirtualListProps> = ({
           detail={segItem}
           onClick={() => onClickCard(segItem)}
           onChangeSwitch={onChangeSwitch}
+          onDelete={onDelete}
           loading={false}
+          archived={archived}
         />
       ))
     }

+ 41 - 3
web/app/components/datasets/documents/detail/completed/SegmentCard.tsx

@@ -1,5 +1,5 @@
 import type { FC } from 'react'
-import React from 'react'
+import React, { useState } from 'react'
 import cn from 'classnames'
 import { ArrowUpRightIcon } from '@heroicons/react/24/outline'
 import { useTranslation } from 'react-i18next'
@@ -7,11 +7,15 @@ import { StatusItem } from '../../list'
 import { DocumentTitle } from '../index'
 import s from './style.module.css'
 import { SegmentIndexTag } from './index'
+import Modal from '@/app/components/base/modal'
+import Button from '@/app/components/base/button'
 import Switch from '@/app/components/base/switch'
 import Divider from '@/app/components/base/divider'
 import Indicator from '@/app/components/header/indicator'
 import { formatNumber } from '@/utils/format'
 import type { SegmentDetailModel } from '@/models/datasets'
+import { AlertCircle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback'
+import { Trash03 } from '@/app/components/base/icons/src/vender/line/general'
 
 const ProgressBar: FC<{ percent: number; loading: boolean }> = ({ percent, loading }) => {
   return (
@@ -35,8 +39,10 @@ type ISegmentCardProps = {
   score?: number
   onClick?: () => void
   onChangeSwitch?: (segId: string, enabled: boolean) => Promise<void>
+  onDelete?: (segId: string) => Promise<void>
   scene?: UsageScene
   className?: string
+  archived?: boolean
 }
 
 const SegmentCard: FC<ISegmentCardProps> = ({
@@ -44,9 +50,11 @@ const SegmentCard: FC<ISegmentCardProps> = ({
   score,
   onClick,
   onChangeSwitch,
+  onDelete,
   loading = true,
   scene = 'doc',
   className = '',
+  archived,
 }) => {
   const { t } = useTranslation()
   const {
@@ -60,6 +68,7 @@ const SegmentCard: FC<ISegmentCardProps> = ({
     answer,
   } = detail as any
   const isDocScene = scene === 'doc'
+  const [showModal, setShowModal] = useState(false)
 
   const renderContent = () => {
     if (answer) {
@@ -86,7 +95,7 @@ const SegmentCard: FC<ISegmentCardProps> = ({
         s.segWrapper,
         (isDocScene && !enabled) ? 'bg-gray-25' : '',
         'group',
-        !loading ? 'pb-4' : '',
+        !loading ? 'pb-4 hover:pb-[10px]' : '',
         className,
       )}
       onClick={() => onClick?.()}
@@ -116,6 +125,7 @@ const SegmentCard: FC<ISegmentCardProps> = ({
                       >
                         <Switch
                           size='md'
+                          disabled={archived}
                           defaultValue={enabled}
                           onChange={async (val) => {
                             await onChangeSwitch?.(id, val)
@@ -159,10 +169,18 @@ const SegmentCard: FC<ISegmentCardProps> = ({
                   <div className={cn(s.commonIcon, s.targetIcon)} />
                   <div className={s.segDataText}>{formatNumber(hit_count)}</div>
                 </div>
-                <div className="flex items-center">
+                <div className="grow flex items-center">
                   <div className={cn(s.commonIcon, s.bezierCurveIcon)} />
                   <div className={s.segDataText}>{index_node_hash}</div>
                 </div>
+                {!archived && (
+                  <div className='shrink-0 w-6 h-6 flex items-center justify-center rounded-md hover:bg-red-100 hover:text-red-600 cursor-pointer group/delete' onClick={(e) => {
+                    e.stopPropagation()
+                    setShowModal(true)
+                  }}>
+                    <Trash03 className='w-[14px] h-[14px] text-gray-500 group-hover/delete:text-red-600' />
+                  </div>
+                )}
               </div>
             </>
             : <>
@@ -187,6 +205,26 @@ const SegmentCard: FC<ISegmentCardProps> = ({
               </div>
             </>
         )}
+      {showModal && <Modal isShow={showModal} onClose={() => setShowModal(false)} className={s.delModal} closable>
+        <div>
+          <div className={s.warningWrapper}>
+            <AlertCircle className='w-6 h-6 text-red-600' />
+          </div>
+          <div className='text-xl font-semibold text-gray-900 mb-1'>{t('datasetDocuments.segment.delete')}</div>
+          <div className='flex gap-2 justify-end'>
+            <Button onClick={() => setShowModal(false)}>{t('common.operation.cancel')}</Button>
+            <Button
+              type='warning'
+              onClick={async () => {
+                await onDelete?.(id)
+              }}
+              className='border-red-700 border-[0.5px]'
+            >
+              {t('common.operation.sure')}
+            </Button>
+          </div>
+        </div>
+      </Modal>}
     </div>
   )
 }
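
Not part of the commit -- the card itself only confirms; actual deletion is delegated to the parent through onDelete. A sketch of that contract (IDs assumed in scope; the real parent handler appears in completed/index.tsx below):

import { deleteSegment } from '@/service/datasets'

// The trash icon stops click propagation, opens the modal, and the modal's
// confirm button awaits onDelete(id); the parent does the API call.
const makeOnDelete = (datasetId: string, documentId: string) =>
  async (segId: string): Promise<void> => {
    await deleteSegment({ datasetId, documentId, segmentId: segId })
    // refresh the segment list afterwards
  }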

+ 61 - 32
web/app/components/datasets/documents/detail/completed/index.tsx

@@ -8,6 +8,7 @@ import { debounce, isNil, omitBy } from 'lodash-es'
 import cn from 'classnames'
 import { StatusItem } from '../../list'
 import { DocumentContext } from '../index'
+import { ProcessStatus } from '../segment-add'
 import s from './style.module.css'
 import InfiniteVirtualList from './InfiniteVirtualList'
 import { formatNumber } from '@/utils/format'
@@ -18,7 +19,7 @@ import Input from '@/app/components/base/input'
 import { ToastContext } from '@/app/components/base/toast'
 import type { Item } from '@/app/components/base/select'
 import { SimpleSelect } from '@/app/components/base/select'
-import { disableSegment, enableSegment, fetchSegments, updateSegment } from '@/service/datasets'
+import { deleteSegment, disableSegment, enableSegment, fetchSegments, updateSegment } from '@/service/datasets'
 import type { SegmentDetailModel, SegmentUpdator, SegmentsQuery, SegmentsResponse } from '@/models/datasets'
 import { asyncRunSafe } from '@/utils'
 import type { CommonResponse } from '@/models/common'
@@ -48,12 +49,14 @@ type ISegmentDetailProps = {
   onChangeSwitch?: (segId: string, enabled: boolean) => Promise<void>
   onUpdate: (segmentId: string, q: string, a: string, k: string[]) => void
   onCancel: () => void
+  archived?: boolean
 }
 /**
  * Show all the contents of the segment
  */
 export const SegmentDetail: FC<ISegmentDetailProps> = memo(({
   segInfo,
+  archived,
   onChangeSwitch,
   onUpdate,
   onCancel,
@@ -116,31 +119,30 @@ export const SegmentDetail: FC<ISegmentDetailProps> = memo(({
   return (
     <div className={'flex flex-col relative'}>
       <div className='absolute right-0 top-0 flex items-center h-7'>
-        {
-          isEditing
-            ? (
-              <>
-                <Button
-                  className='mr-2 !h-7 !px-3 !py-[5px] text-xs font-medium text-gray-700 !rounded-md'
-                  onClick={handleCancel}>
-                  {t('common.operation.cancel')}
-                </Button>
-                <Button
-                  type='primary'
-                  className='!h-7 !px-3 !py-[5px] text-xs font-medium !rounded-md'
-                  onClick={handleSave}>
-                  {t('common.operation.save')}
-                </Button>
-              </>
-            )
-            : (
-              <div className='group relative flex justify-center items-center w-6 h-6 hover:bg-gray-100 rounded-md cursor-pointer'>
-                <div className={cn(s.editTip, 'hidden items-center absolute -top-10 px-3 h-[34px] bg-white rounded-lg whitespace-nowrap text-xs font-semibold text-gray-700 group-hover:flex')}>{t('common.operation.edit')}</div>
-                <Edit03 className='w-4 h-4 text-gray-500' onClick={() => setIsEditing(true)} />
-              </div>
-            )
-        }
-        <div className='mx-3 w-[1px] h-3 bg-gray-200' />
+        {isEditing && (
+          <>
+            <Button
+              className='mr-2 !h-7 !px-3 !py-[5px] text-xs font-medium text-gray-700 !rounded-md'
+              onClick={handleCancel}>
+              {t('common.operation.cancel')}
+            </Button>
+            <Button
+              type='primary'
+              className='!h-7 !px-3 !py-[5px] text-xs font-medium !rounded-md'
+              onClick={handleSave}>
+              {t('common.operation.save')}
+            </Button>
+          </>
+        )}
+        {!isEditing && !archived && (
+          <>
+            <div className='group relative flex justify-center items-center w-6 h-6 hover:bg-gray-100 rounded-md cursor-pointer'>
+              <div className={cn(s.editTip, 'hidden items-center absolute -top-10 px-3 h-[34px] bg-white rounded-lg whitespace-nowrap text-xs font-semibold text-gray-700 group-hover:flex')}>{t('common.operation.edit')}</div>
+              <Edit03 className='w-4 h-4 text-gray-500' onClick={() => setIsEditing(true)} />
+            </div>
+            <div className='mx-3 w-[1px] h-3 bg-gray-200' />
+          </>
+        )}
         <div className='flex justify-center items-center w-6 h-6 cursor-pointer' onClick={onCancel}>
           <XClose className='w-4 h-4 text-gray-500' />
         </div>
@@ -176,6 +178,7 @@ export const SegmentDetail: FC<ISegmentDetailProps> = memo(({
             onChange={async (val) => {
               await onChangeSwitch?.(segInfo?.id || '', val)
             }}
+            disabled={archived}
           />
         </div>
       </div>
@@ -195,13 +198,20 @@ export const splitArray = (arr: any[], size = 3) => {
 type ICompletedProps = {
   showNewSegmentModal: boolean
   onNewSegmentModalChange: (state: boolean) => void
+  importStatus: ProcessStatus | string | undefined
+  archived?: boolean
   // data: Array<{}> // all/part segments
 }
 /**
  * Embedding done, show list of all segments
  * Support search and filter
 */
-const Completed: FC<ICompletedProps> = ({ showNewSegmentModal, onNewSegmentModalChange }) => {
+const Completed: FC<ICompletedProps> = ({
+  showNewSegmentModal,
+  onNewSegmentModalChange,
+  importStatus,
+  archived,
+}) => {
   const { t } = useTranslation()
   const { notify } = useContext(ToastContext)
   const { datasetId = '', documentId = '', docForm } = useContext(DocumentContext)
@@ -250,11 +260,6 @@ const Completed: FC<ICompletedProps> = ({ showNewSegmentModal, onNewSegmentModal
     getSegments(false)
   }
 
-  useEffect(() => {
-    if (lastSegmentsRes !== undefined)
-      getSegments(false)
-  }, [selectedStatus, searchValue])
-
   const onClickCard = (detail: SegmentDetailModel) => {
     setCurrSegment({ segInfo: detail, showModal: true })
   }
@@ -281,6 +286,17 @@ const Completed: FC<ICompletedProps> = ({ showNewSegmentModal, onNewSegmentModal
     }
   }
 
+  const onDelete = async (segId: string) => {
+    const [e] = await asyncRunSafe<CommonResponse>(deleteSegment({ datasetId, documentId, segmentId: segId }) as Promise<CommonResponse>)
+    if (!e) {
+      notify({ type: 'success', message: t('common.actionMsg.modifiedSuccessfully') })
+      resetList()
+    }
+    else {
+      notify({ type: 'error', message: t('common.actionMsg.modificationFailed') })
+    }
+  }
+
   const handleUpdateSegment = async (segmentId: string, question: string, answer: string, keywords: string[]) => {
     const params: SegmentUpdator = { content: '' }
     if (docForm === 'qa_model') {
@@ -321,6 +337,16 @@ const Completed: FC<ICompletedProps> = ({ showNewSegmentModal, onNewSegmentModal
     setAllSegments([...allSegments])
   }
 
+  useEffect(() => {
+    if (lastSegmentsRes !== undefined)
+      getSegments(false)
+  }, [selectedStatus, searchValue])
+
+  useEffect(() => {
+    if (importStatus === ProcessStatus.COMPLETED)
+      resetList()
+  }, [importStatus])
+
   return (
     <>
       <div className={s.docSearchWrapper}>
@@ -343,7 +369,9 @@ const Completed: FC<ICompletedProps> = ({ showNewSegmentModal, onNewSegmentModal
         items={allSegments}
         loadNextPage={getSegments}
         onChangeSwitch={onChangeSwitch}
+        onDelete={onDelete}
         onClick={onClickCard}
+        archived={archived}
       />
       <Modal isShow={currSegment.showModal} onClose={() => {}} className='!max-w-[640px] !overflow-visible'>
         <SegmentDetail
@@ -351,6 +379,7 @@ const Completed: FC<ICompletedProps> = ({ showNewSegmentModal, onNewSegmentModal
           onChangeSwitch={onChangeSwitch}
           onUpdate={handleUpdateSegment}
           onCancel={onCloseModal}
+          archived={archived}
         />
       </Modal>
       <NewSegmentModal

+ 21 - 0
web/app/components/datasets/documents/detail/completed/style.module.css

@@ -132,3 +132,24 @@
 .editTip {
   box-shadow: 0px 4px 6px -2px rgba(16, 24, 40, 0.03), 0px 12px 16px -4px rgba(16, 24, 40, 0.08);
 }
+
+.delModal {
+  background: linear-gradient(
+      180deg,
+      rgba(217, 45, 32, 0.05) 0%,
+      rgba(217, 45, 32, 0) 24.02%
+    ),
+    #f9fafb;
+  box-shadow: 0px 20px 24px -4px rgba(16, 24, 40, 0.08),
+    0px 8px 8px -4px rgba(16, 24, 40, 0.03);
+  @apply rounded-2xl p-8;
+}
+.warningWrapper {
+  box-shadow: 0px 20px 24px -4px rgba(16, 24, 40, 0.08),
+    0px 8px 8px -4px rgba(16, 24, 40, 0.03);
+  background: rgba(255, 255, 255, 0.9);
+  @apply h-12 w-12 border-[0.5px] border-gray-100 rounded-xl mb-3 flex items-center justify-center;
+}
+.warningIcon {
+  @apply w-[22px] h-[22px] fill-current text-red-600;
+}

+ 65 - 16
web/app/components/datasets/documents/detail/index.tsx

@@ -3,7 +3,7 @@ import type { FC } from 'react'
 import React, { useState } from 'react'
 import useSWR from 'swr'
 import { ArrowLeftIcon } from '@heroicons/react/24/solid'
-import { createContext } from 'use-context-selector'
+import { createContext, useContext } from 'use-context-selector'
 import { useTranslation } from 'react-i18next'
 import { useRouter } from 'next/navigation'
 import { omit } from 'lodash-es'
@@ -13,19 +13,15 @@ import s from '../style.module.css'
 import Completed from './completed'
 import Embedding from './embedding'
 import Metadata from './metadata'
+import SegmentAdd, { ProcessStatus } from './segment-add'
+import BatchModal from './batch-modal'
 import style from './style.module.css'
 import Divider from '@/app/components/base/divider'
 import Loading from '@/app/components/base/loading'
 import type { MetadataType } from '@/service/datasets'
-import { fetchDocumentDetail } from '@/service/datasets'
-
-export const BackCircleBtn: FC<{ onClick: () => void }> = ({ onClick }) => {
-  return (
-    <div onClick={onClick} className={'rounded-full w-8 h-8 flex justify-center items-center border-gray-100 cursor-pointer border hover:border-gray-300 shadow-lg'}>
-      <ArrowLeftIcon className='text-primary-600 fill-current stroke-current h-4 w-4' />
-    </div>
-  )
-}
+import { checkSegmentBatchImportProgress, fetchDocumentDetail, segmentBatchImport } from '@/service/datasets'
+import { ToastContext } from '@/app/components/base/toast'
+import type { DocForm } from '@/models/datasets'
 
 export const DocumentContext = createContext<{ datasetId?: string; documentId?: string; docForm: string }>({ docForm: '' })
 
@@ -51,10 +47,45 @@ type Props = {
 }
 
 const DocumentDetail: FC<Props> = ({ datasetId, documentId }) => {
-  const { t } = useTranslation()
   const router = useRouter()
+  const { t } = useTranslation()
+  const { notify } = useContext(ToastContext)
   const [showMetadata, setShowMetadata] = useState(true)
-  const [showNewSegmentModal, setShowNewSegmentModal] = useState(false)
+  const [newSegmentModalVisible, setNewSegmentModalVisible] = useState(false)
+  const [batchModalVisible, setBatchModalVisible] = useState(false)
+  const [importStatus, setImportStatus] = useState<ProcessStatus | string>()
+  const showNewSegmentModal = () => setNewSegmentModalVisible(true)
+  const showBatchModal = () => setBatchModalVisible(true)
+  const hideBatchModal = () => setBatchModalVisible(false)
+  const resetProcessStatus = () => setImportStatus('')
+  const checkProcess = async (jobID: string) => {
+    try {
+      const res = await checkSegmentBatchImportProgress({ jobID })
+      setImportStatus(res.job_status)
+      if (res.job_status === ProcessStatus.WAITING || res.job_status === ProcessStatus.PROCESSING)
+        setTimeout(() => checkProcess(res.job_id), 2500)
+      if (res.job_status === ProcessStatus.ERROR)
+        notify({ type: 'error', message: `${t('datasetDocuments.list.batchModal.runError')}` })
+    }
+    catch (e: any) {
+      notify({ type: 'error', message: `${t('datasetDocuments.list.batchModal.runError')}${'message' in e ? `: ${e.message}` : ''}` })
+    }
+  }
+  const runBatch = async (csv: File) => {
+    const formData = new FormData()
+    formData.append('file', csv)
+    try {
+      const res = await segmentBatchImport({
+        url: `/datasets/${datasetId}/documents/${documentId}/segments/batch_import`,
+        body: formData,
+      })
+      setImportStatus(res.job_status)
+      checkProcess(res.job_id)
+    }
+    catch (e: any) {
+      notify({ type: 'error', message: `${t('datasetDocuments.list.batchModal.runError')}${'message' in e ? `: ${e.message}` : ''}` })
+    }
+  }
 
   const { data: documentDetail, error, mutate: detailMutate } = useSWR({
     action: 'fetchDocumentDetail',
@@ -91,22 +122,32 @@ const DocumentDetail: FC<Props> = ({ datasetId, documentId }) => {
     <DocumentContext.Provider value={{ datasetId, documentId, docForm: documentDetail?.doc_form || '' }}>
       <div className='flex flex-col h-full'>
         <div className='flex h-16 border-b-gray-100 border-b items-center p-4'>
-          <BackCircleBtn onClick={backToPrev} />
+          <div onClick={backToPrev} className={'rounded-full w-8 h-8 flex justify-center items-center border-gray-100 cursor-pointer border hover:border-gray-300 shadow-[0px_12px_16px_-4px_rgba(16,24,40,0.08),0px_4px_6px_-2px_rgba(16,24,40,0.03)]'}>
+            <ArrowLeftIcon className='text-primary-600 fill-current stroke-current h-4 w-4' />
+          </div>
           <Divider className='!h-4' type='vertical' />
           <DocumentTitle extension={documentDetail?.data_source_info?.upload_file?.extension} name={documentDetail?.name} />
           <StatusItem status={documentDetail?.display_status || 'available'} scene='detail' />
+          {documentDetail && !documentDetail.archived && (
+            <SegmentAdd
+              importStatus={importStatus}
+              clearProcessStatus={resetProcessStatus}
+              showNewSegmentModal={showNewSegmentModal}
+              showBatchModal={showBatchModal}
+            />
+          )}
           <OperationAction
             scene='detail'
             detail={{
               enabled: documentDetail?.enabled || false,
               archived: documentDetail?.archived || false,
               id: documentId,
+              data_source_type: documentDetail?.data_source_type || '',
               doc_form: documentDetail?.doc_form || '',
             }}
             datasetId={datasetId}
             onUpdate={handleOperate}
             className='!w-[216px]'
-            showNewSegmentModal={() => setShowNewSegmentModal(true)}
           />
           <button
             className={cn(style.layoutRightIcon, showMetadata ? style.iconShow : style.iconClose)}
@@ -120,8 +161,10 @@ const DocumentDetail: FC<Props> = ({ datasetId, documentId }) => {
               {embedding
                 ? <Embedding detail={documentDetail} detailUpdate={detailMutate} />
                 : <Completed
-                  showNewSegmentModal={showNewSegmentModal}
-                  onNewSegmentModalChange={setShowNewSegmentModal}
+                  showNewSegmentModal={newSegmentModalVisible}
+                  onNewSegmentModalChange={setNewSegmentModalVisible}
+                  importStatus={importStatus}
+                  archived={documentDetail?.archived}
                 />
               }
             </div>
@@ -132,6 +175,12 @@ const DocumentDetail: FC<Props> = ({ datasetId, documentId }) => {
             onUpdate={metadataMutate}
           />}
         </div>
+        <BatchModal
+          isShow={batchModalVisible}
+          onCancel={hideBatchModal}
+          onConfirm={runBatch}
+          docForm={documentDetail?.doc_form as DocForm}
+        />
       </div>
    </DocumentContext.Provider>
  )
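
Not part of the commit -- checkProcess above is a recursive setTimeout poll: re-arm every 2.5 s while the job is waiting or processing, stop on completed or error. The same idea in isolation (service function and enum are the ones imported above; the callback wiring is hypothetical):

import { checkSegmentBatchImportProgress } from '@/service/datasets'
import { ProcessStatus } from '@/app/components/datasets/documents/detail/segment-add'

// Recursive setTimeout (not setInterval): a slow response can never
// stack overlapping requests, since the next poll is armed only after
// the previous one resolves.
const pollBatchImport = async (jobID: string, onStatus: (s: string) => void) => {
  const res = await checkSegmentBatchImportProgress({ jobID })
  onStatus(res.job_status)
  if (res.job_status === ProcessStatus.WAITING || res.job_status === ProcessStatus.PROCESSING)
    setTimeout(() => pollBatchImport(res.job_id, onStatus), 2500)
}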

+ 84 - 0
web/app/components/datasets/documents/detail/segment-add/index.tsx

@@ -0,0 +1,84 @@
+'use client'
+import type { FC } from 'react'
+import React from 'react'
+import { useTranslation } from 'react-i18next'
+import cn from 'classnames'
+import { FilePlus02 } from '@/app/components/base/icons/src/vender/line/files'
+import { Loading02 } from '@/app/components/base/icons/src/vender/line/general'
+import { AlertCircle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback'
+import { CheckCircle } from '@/app/components/base/icons/src/vender/solid/general'
+import Popover from '@/app/components/base/popover'
+
+export type ISegmentAddProps = {
+  importStatus: ProcessStatus | string | undefined
+  clearProcessStatus: () => void
+  showNewSegmentModal: () => void
+  showBatchModal: () => void
+}
+
+export enum ProcessStatus {
+  WAITING = 'waiting',
+  PROCESSING = 'processing',
+  COMPLETED = 'completed',
+  ERROR = 'error',
+}
+
+const SegmentAdd: FC<ISegmentAddProps> = ({
+  importStatus,
+  clearProcessStatus,
+  showNewSegmentModal,
+  showBatchModal,
+}) => {
+  const { t } = useTranslation()
+
+  if (importStatus) {
+    return (
+      <>
+        {(importStatus === ProcessStatus.WAITING || importStatus === ProcessStatus.PROCESSING) && (
+          <div className='relative overflow-hidden inline-flex items-center mr-2 px-3 py-[6px] text-blue-700 bg-[#F5F8FF] rounded-lg border border-black/5'>
+            {importStatus === ProcessStatus.WAITING && <div className='absolute left-0 top-0 w-3/12 h-full bg-[#D1E0FF] z-0'/>}
+            {importStatus === ProcessStatus.PROCESSING && <div className='absolute left-0 top-0 w-2/3 h-full bg-[#D1E0FF] z-0'/>}
+            <Loading02 className='animate-spin mr-2 w-4 h-4' />
+            <span className='font-medium text-[13px] leading-[18px] z-10'>{t('datasetDocuments.list.batchModal.processing')}</span>
+          </div>
+        )}
+        {importStatus === ProcessStatus.COMPLETED && (
+          <div className='inline-flex items-center mr-2 px-3 py-[6px] text-gray-700 bg-[#F6FEF9] rounded-lg border border-black/5'>
+            <CheckCircle className='mr-2 w-4 h-4 text-[#039855]' />
+            <span className='font-medium text-[13px] leading-[18px]'>{t('datasetDocuments.list.batchModal.completed')}</span>
+            <span className='pl-2 font-medium text-[13px] leading-[18px] text-[#155EEF] cursor-pointer' onClick={clearProcessStatus}>{t('datasetDocuments.list.batchModal.ok')}</span>
+          </div>
+        )}
+        {importStatus === ProcessStatus.ERROR && (
+          <div className='inline-flex items-center mr-2 px-3 py-[6px] text-red-600 bg-red-100 rounded-lg border border-black/5'>
+            <AlertCircle className='mr-2 w-4 h-4 text-[#D92D20]' />
+            <span className='font-medium text-[13px] leading-[18px]'>{t('datasetDocuments.list.batchModal.error')}</span>
+            <span className='pl-2 font-medium text-[13px] leading-[18px] text-[#155EEF] cursor-pointer' onClick={clearProcessStatus}>{t('datasetDocuments.list.batchModal.ok')}</span>
+          </div>
+        )}
+      </>
+    )
+  }
+
+  return (
+    <Popover
+      manualClose
+      trigger='click'
+      htmlContent={
+        <div className='w-full py-1'>
+          <div className='py-2 px-3 mx-1 flex items-center gap-2 hover:bg-gray-100 rounded-lg cursor-pointer text-gray-700 text-sm' onClick={showNewSegmentModal}>{t('datasetDocuments.list.action.add')}</div>
+          <div className='py-2 px-3 mx-1 flex items-center gap-2 hover:bg-gray-100 rounded-lg cursor-pointer text-gray-700 text-sm' onClick={showBatchModal}>{t('datasetDocuments.list.action.batchAdd')}</div>
+        </div>
+      }
+      btnElement={
+        <div className='inline-flex items-center'>
+          <FilePlus02 className='w-4 h-4 text-gray-700' />
+          <span className='pl-1'>{t('datasetDocuments.list.action.addButton')}</span>
+        </div>
+      }
+      btnClassName={open => cn('mr-2 !py-[6px] !text-[13px] !leading-[18px] hover:bg-gray-50 border border-gray-200 hover:border-gray-300 hover:shadow-[0_1px_2px_rgba(16,24,40,0.05)]', open ? '!bg-gray-100 !shadow-none' : '!bg-transparent')}
+      className='!w-[132px] h-fit !z-20  !translate-x-0 !left-0'
+    />
+  )
+}
+export default React.memo(SegmentAdd)
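
Not part of the commit -- SegmentAdd is purely presentational: with no importStatus it renders the add/batch-add popover, otherwise a progress or result pill. A usage sketch matching detail/index.tsx above (the host component is hypothetical):

import React from 'react'
import SegmentAdd, { ProcessStatus } from '@/app/components/datasets/documents/detail/segment-add'

// importStatus selects the rendered branch; all callbacks just toggle parent state.
const Toolbar: React.FC<{ status?: ProcessStatus | string }> = ({ status }) => (
  <SegmentAdd
    importStatus={status}
    clearProcessStatus={() => { /* reset parent importStatus */ }}
    showNewSegmentModal={() => { /* open the single-segment modal */ }}
    showBatchModal={() => { /* open the CSV batch modal */ }}
  />
)
export default Toolbar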

+ 71 - 7
web/app/components/datasets/documents/list.tsx

@@ -22,12 +22,12 @@ import type { IndicatorProps } from '@/app/components/header/indicator'
 import Indicator from '@/app/components/header/indicator'
 import Indicator from '@/app/components/header/indicator'
 import { asyncRunSafe } from '@/utils'
 import { asyncRunSafe } from '@/utils'
 import { formatNumber } from '@/utils/format'
 import { formatNumber } from '@/utils/format'
-import { archiveDocument, deleteDocument, disableDocument, enableDocument, syncDocument } from '@/service/datasets'
+import { archiveDocument, deleteDocument, disableDocument, enableDocument, syncDocument, unArchiveDocument } from '@/service/datasets'
 import NotionIcon from '@/app/components/base/notion-icon'
 import NotionIcon from '@/app/components/base/notion-icon'
 import ProgressBar from '@/app/components/base/progress-bar'
 import ProgressBar from '@/app/components/base/progress-bar'
 import { DataSourceType, type DocumentDisplayStatus, type SimpleDocumentDetail } from '@/models/datasets'
 import { DataSourceType, type DocumentDisplayStatus, type SimpleDocumentDetail } from '@/models/datasets'
 import type { CommonResponse } from '@/models/common'
 import type { CommonResponse } from '@/models/common'
-import { FilePlus02 } from '@/app/components/base/icons/src/vender/line/files'
+import { DotsHorizontal } from '@/app/components/base/icons/src/vender/line/general'
 
 
 export const SettingsIcon: FC<{ className?: string }> = ({ className }) => {
 export const SettingsIcon: FC<{ className?: string }> = ({ className }) => {
   return <svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg" className={className ?? ''}>
   return <svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg" className={className ?? ''}>
@@ -86,7 +86,7 @@ export const StatusItem: FC<{
   </div>
   </div>
 }
 }
 
 
-type OperationName = 'delete' | 'archive' | 'enable' | 'disable' | 'sync'
+type OperationName = 'delete' | 'archive' | 'enable' | 'disable' | 'sync' | 'un_archive'
 
 
 // operation action for list and detail
 // operation action for list and detail
 export const OperationAction: FC<{
 export const OperationAction: FC<{
@@ -101,8 +101,7 @@ export const OperationAction: FC<{
   onUpdate: (operationName?: string) => void
   scene?: 'list' | 'detail'
   className?: string
-  showNewSegmentModal?: () => void
-}> = ({ datasetId, detail, onUpdate, scene = 'list', className = '', showNewSegmentModal }) => {
+}> = ({ datasetId, detail, onUpdate, scene = 'list', className = '' }) => {
   const { id, enabled = false, archived = false, data_source_type } = detail || {}
   const [showModal, setShowModal] = useState(false)
   const { notify } = useContext(ToastContext)
@@ -117,6 +116,9 @@ export const OperationAction: FC<{
       case 'archive':
         opApi = archiveDocument
         break
+      case 'un_archive':
+        opApi = unArchiveDocument
+        break
       case 'enable':
         opApi = enableDocument
         break
@@ -218,10 +220,72 @@ export const OperationAction: FC<{
       <Divider className='!ml-4 !mr-2 !h-3' type='vertical' />
     </>}
     <Popover
-      htmlContent={<Operations />}
+      htmlContent={
+        <div className='w-full py-1'>
+          {!isListScene && <>
+            <div className='flex justify-between items-center mx-4 pt-2'>
+              <span className={cn(s.actionName, 'font-medium')}>
+                {!archived && enabled ? t('datasetDocuments.list.index.enable') : t('datasetDocuments.list.index.disable')}
+              </span>
+              <Tooltip
+                selector={`detail-switch-${id}`}
+                content={t('datasetDocuments.list.action.enableWarning') as string}
+                className='!font-semibold'
+                disabled={!archived}
+              >
+                <div>
+                  <Switch
+                    defaultValue={archived ? false : enabled}
+                    onChange={v => !archived && onOperate(v ? 'enable' : 'disable')}
+                    disabled={archived}
+                    size='md'
+                  />
+                </div>
+              </Tooltip>
+            </div>
+            <div className='mx-4 pb-1 pt-0.5 text-xs text-gray-500'>
+              {!archived && enabled ? t('datasetDocuments.list.index.enableTip') : t('datasetDocuments.list.index.disableTip')}
+            </div>
+            <Divider />
+          </>}
+          {!archived && (
+            <>
+              <div className={s.actionItem} onClick={() => router.push(`/datasets/${datasetId}/documents/${detail.id}/settings`)}>
+                <SettingsIcon />
+                <span className={s.actionName}>{t('datasetDocuments.list.action.settings')}</span>
+              </div>
+              {data_source_type === 'notion_import' && (
+                <div className={s.actionItem} onClick={() => onOperate('sync')}>
+                  <SyncIcon />
+                  <span className={s.actionName}>{t('datasetDocuments.list.action.sync')}</span>
+                </div>
+              )}
+              <Divider className='my-1' />
+            </>
+          )}
+          {!archived && <div className={s.actionItem} onClick={() => onOperate('archive')}>
+            <ArchiveIcon />
+            <span className={s.actionName}>{t('datasetDocuments.list.action.archive')}</span>
+          </div>}
+          {archived && (
+            <div className={s.actionItem} onClick={() => onOperate('un_archive')}>
+              <ArchiveIcon />
+              <span className={s.actionName}>{t('datasetDocuments.list.action.unarchive')}</span>
+            </div>
+          )}
+          <div className={cn(s.actionItem, s.deleteActionItem, 'group')} onClick={() => setShowModal(true)}>
+            <TrashIcon className={'w-4 h-4 stroke-current text-gray-500 stroke-2 group-hover:text-red-500'} />
+            <span className={cn(s.actionName, 'group-hover:text-red-500')}>{t('datasetDocuments.list.action.delete')}</span>
+          </div>
+        </div>
+      }
       trigger='click'
       position='br'
-      btnElement={<div className={cn(s.actionIcon, s.commonIcon)} />}
+      btnElement={
+        <div className={cn(s.commonIcon)}>
+          <DotsHorizontal className='w-4 h-4 text-gray-700' />
+        </div>
+      }
       btnClassName={open => cn(isListScene ? s.actionIconWrapperList : s.actionIconWrapperDetail, open ? '!bg-gray-100 !shadow-none' : '!bg-transparent')}
       className={`!w-[200px] h-fit !z-20 ${className}`}
     />
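For context, a paraphrased sketch of the onOperate dispatch these hunks extend: the handler picks a fetcher by operation name, runs it through asyncRunSafe, then notifies and asks the parent to refresh. The case structure and imports come from the hunks above; the toast wording and the exact asyncRunSafe tuple shape are assumptions.

    const onOperate = async (operationName: OperationName) => {
      let opApi = deleteDocument
      switch (operationName) {
        case 'archive':
          opApi = archiveDocument
          break
        case 'un_archive': // new in this commit: PATCH .../status/un_archive
          opApi = unArchiveDocument
          break
        case 'enable':
          opApi = enableDocument
          break
        case 'disable':
          opApi = disableDocument
          break
        case 'sync':
          opApi = syncDocument
          break
      }
      // asyncRunSafe is assumed to resolve to [error] on failure and [undefined, result] on success.
      const [e] = await asyncRunSafe<CommonResponse>(opApi({ datasetId, documentId: id }) as Promise<CommonResponse>)
      if (!e)
        onUpdate(operationName) // let the list/detail view re-fetch
      notify({ type: e ? 'error' : 'success', message: e ? 'Operation failed' : 'Operation succeeded' }) // wording assumed
    }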

+ 36 - 1
web/app/components/datasets/settings/form/index.tsx

@@ -10,6 +10,10 @@ import { ToastContext } from '@/app/components/base/toast'
 import Button from '@/app/components/base/button'
 import { fetchDataDetail, updateDatasetSetting } from '@/service/datasets'
 import type { DataSet } from '@/models/datasets'
+import ModelSelector from '@/app/components/header/account-setting/model-page/model-selector'
+import type { ProviderEnum } from '@/app/components/header/account-setting/model-page/declarations'
+import { ModelType } from '@/app/components/header/account-setting/model-page/declarations'
+import AccountSetting from '@/app/components/header/account-setting'

 const rowClass = `
   flex justify-between py-4
@@ -41,7 +45,7 @@ const Form = ({
   const [description, setDescription] = useState(currentDataset?.description ?? '')
   const [permission, setPermission] = useState(currentDataset?.permission)
   const [indexMethod, setIndexMethod] = useState(currentDataset?.indexing_technique)
-
+  const [showSetAPIKeyModal, setShowSetAPIKeyModal] = useState(false)
   const handleSave = async () => {
     if (loading)
       return
@@ -128,6 +132,32 @@ const Form = ({
           />
         </div>
       </div>
+      <div className={rowClass}>
+        <div className={labelClass}>
+          <div>{t('datasetSettings.form.embeddingModel')}</div>
+        </div>
+        <div className='w-[480px]'>
+          {currentDataset && (
+            <>
+              <div className='w-full h-9 rounded-lg bg-gray-100 opacity-60'>
+                <ModelSelector
+                  readonly
+                  value={{
+                    providerName: currentDataset.embedding_model_provider as ProviderEnum,
+                    modelName: currentDataset.embedding_model,
+                  }}
+                  modelType={ModelType.embeddings}
+                  onChange={() => {}}
+                />
+              </div>
+              <div className='mt-2 w-full text-xs leading-6 text-gray-500'>
+                {t('datasetSettings.form.embeddingModelTip')}
+                <span className='text-[#155eef] cursor-pointer' onClick={() => setShowSetAPIKeyModal(true)}>{t('datasetSettings.form.embeddingModelTipLink')}</span>
+              </div>
+            </>
+          )}
+        </div>
+      </div>
       <div className={rowClass}>
         <div className={labelClass} />
         <div className='w-[480px]'>
@@ -140,6 +170,11 @@ const Form = ({
           </Button>
         </div>
       </div>
+      {showSetAPIKeyModal && (
+        <AccountSetting activeTab="provider" onCancel={async () => {
+          setShowSetAPIKeyModal(false)
+        }} />
+      )}
     </div>
   )
 }

+ 1 - 0
web/i18n/lang/dataset-creation.en.ts

@@ -75,6 +75,7 @@ const translation = {
     economicalTip: 'Use offline vector engines, keyword indexes, etc. to reduce accuracy without spending tokens',
     QATitle: 'Segmenting in Question & Answer format',
     QATip: 'Enable this option will consume more tokens',
+    QALanguage: 'Segment using',
     emstimateCost: 'Estimation',
     emstimateSegment: 'Estimated segments',
     segmentCount: 'segments',

+ 1 - 0
web/i18n/lang/dataset-creation.zh.ts

@@ -75,6 +75,7 @@ const translation = {
     economicalTip: '使用离线的向量引擎、关键词索引等方式,降低了准确度但无需花费 Token',
     QATitle: '采用 Q&A 分段模式',
     QATip: '开启后将会消耗额外的 token',
+    QALanguage: '分段使用',
     emstimateCost: '执行嵌入预估消耗',
     emstimateSegment: '预估分段数',
     segmentCount: '段',

+ 23 - 1
web/i18n/lang/dataset-documents.en.ts

@@ -17,8 +17,11 @@ const translation = {
     action: {
       uploadFile: 'Upload new file',
       settings: 'Segment settings',
-      add: 'Add new segment',
+      addButton: 'Add segment',
+      add: 'Add a segment',
+      batchAdd: 'Batch add',
       archive: 'Archive',
+      unarchive: 'Unarchive',
       delete: 'Delete',
       enableWarning: 'Archived file cannot be enabled',
       sync: 'Sync',
@@ -53,6 +56,24 @@ const translation = {
       title: 'Are you sure Delete?',
       content: 'If you need to resume processing later, you will continue from where you left off',
     },
+    batchModal: {
+      title: 'Batch add segments',
+      csvUploadTitle: 'Drag and drop your CSV file here, or ',
+      browse: 'browse',
+      tip: 'The CSV file must conform to the following structure:',
+      question: 'question',
+      answer: 'answer',
+      contentTitle: 'segment content',
+      content: 'content',
+      template: 'Download the template here',
+      cancel: 'Cancel',
+      run: 'Run Batch',
+      runError: 'Run batch failed',
+      processing: 'In batch processing',
+      completed: 'Import completed',
+      error: 'Import Error',
+      ok: 'OK',
+    },
   },
   metadata: {
     title: 'Metadata',
@@ -321,6 +342,7 @@ const translation = {
     contentEmpty: 'Content can not be empty',
     newTextSegment: 'New Text Segment',
     newQaSegment: 'New Q&A Segment',
+    delete: 'Delete this segment ?',
   },
 }


+ 22 - 0
web/i18n/lang/dataset-documents.zh.ts

@@ -17,8 +17,11 @@ const translation = {
     action: {
       uploadFile: '上传新文件',
       settings: '分段设置',
+      addButton: '添加分段',
       add: '添加新分段',
+      batchAdd: '批量添加',
       archive: '归档',
+      unarchive: '撤销归档',
       delete: '删除',
       enableWarning: '归档的文件无法启用',
       sync: '同步',
@@ -53,6 +56,24 @@ const translation = {
       title: '确定删除吗?',
       content: '如果您需要稍后恢复处理,您将从您离开的地方继续',
     },
+    batchModal: {
+      title: '批量添加分段',
+      csvUploadTitle: '将您的 CSV 文件拖放到此处,或',
+      browse: '选择文件',
+      tip: 'CSV 文件必须符合以下结构:',
+      question: '问题',
+      answer: '回答',
+      contentTitle: '分段内容',
+      content: '内容',
+      template: '下载模板',
+      cancel: '取消',
+      run: '导入',
+      runError: '批量导入失败',
+      processing: '批量处理中',
+      completed: '导入完成',
+      error: '导入出错',
+      ok: '确定',
+    },
   },
   metadata: {
     title: '元数据',
@@ -320,6 +341,7 @@ const translation = {
     contentEmpty: '内容不能为空',
     newTextSegment: '新文本分段',
     newQaSegment: '新问答分段',
+    delete: '删除这个分段?',
   },
 }


+ 3 - 0
web/i18n/lang/dataset-settings.en.ts

@@ -15,6 +15,9 @@ const translation = {
     indexMethodHighQualityTip: 'Call OpenAI\'s embedding interface for processing to provide higher accuracy when users query.',
     indexMethodEconomy: 'Economical',
     indexMethodEconomyTip: 'Use offline vector engines, keyword indexes, etc. to reduce accuracy without spending tokens',
+    embeddingModel: 'Embedding Model',
+    embeddingModelTip: 'Change the embedded model, please go to ',
+    embeddingModelTipLink: 'Settings',
     save: 'Save',
   },
 }

+ 3 - 0
web/i18n/lang/dataset-settings.zh.ts

@@ -15,6 +15,9 @@ const translation = {
     indexMethodHighQualityTip: '调用 OpenAI 的嵌入接口进行处理,以在用户查询时提供更高的准确度',
     indexMethodEconomy: '经济',
     indexMethodEconomyTip: '使用离线的向量引擎、关键词索引等方式,降低了准确度但无需花费 Token',
+    embeddingModel: 'Embedding 模型',
+    embeddingModelTip: '修改 Embedding 模型,请去',
+    embeddingModelTipLink: '设置',
     save: '保存',
   },
 }

+ 2 - 0
web/i18n/lang/dataset.en.ts

@@ -16,6 +16,8 @@ const translation = {
   intro4: 'or it ',
   intro5: 'can be created',
   intro6: ' as a standalone ChatGPT index plug-in to publish',
+  unavailable: 'Unavailable',
+  unavailableTip: 'Embedding model is not available, the default embedding model needs to be configured',
 }

 export default translation

+ 2 - 0
web/i18n/lang/dataset.zh.ts

@@ -16,6 +16,8 @@ const translation = {
   intro4: '或可以',
   intro5: '创建',
   intro6: '为独立的 ChatGPT 插件发布使用',
+  unavailable: '不可用',
+  unavailableTip: '由于 embedding 模型不可用,需要配置默认 embedding 模型',
 }

 export default translation

+ 9 - 0
web/models/datasets.ts

@@ -22,6 +22,9 @@ export type DataSet = {
   app_count: number
   document_count: number
   word_count: number
+  embedding_model: string
+  embedding_model_provider: string
+  embedding_available: boolean
 }

 export type CustomFile = File & {
@@ -184,6 +187,7 @@ export type CreateDocumentReq = {
   original_document_id?: string
   indexing_technique?: string
   doc_form: 'text_model' | 'qa_model'
+  doc_language: string
   data_source: DataSource
   process_rule: ProcessRule
 }
@@ -390,3 +394,8 @@ export type SegmentUpdator = {
   answer?: string
   keywords?: string[]
 }
+
+export enum DocForm {
+  TEXT = 'text_model',
+  QA = 'qa_model',
+}
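Taken together with the new DataSet fields above, a small sketch (TypeScript) of how callers might consume them; the helper names are illustrative, not from this commit.

    import type { DataSet } from '@/models/datasets'
    import { DocForm } from '@/models/datasets'

    // Q&A-form documents carry question/answer pairs; text-form documents carry plain content.
    const isQaForm = (docForm: string): boolean => docForm === DocForm.QA

    // A dataset card can short-circuit on the new availability flag before offering
    // indexing actions (mirrors the `unavailable` / `unavailableTip` i18n keys above).
    const canUseDataset = (dataset: DataSet): boolean => dataset.embedding_available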

+ 16 - 4
web/service/datasets.ts

@@ -118,6 +118,10 @@ export const archiveDocument: Fetcher<CommonResponse, CommonDocReq> = ({ dataset
   return patch(`/datasets/${datasetId}/documents/${documentId}/status/archive`) as Promise<CommonResponse>
 }

+export const unArchiveDocument: Fetcher<CommonResponse, CommonDocReq> = ({ datasetId, documentId }) => {
+  return patch(`/datasets/${datasetId}/documents/${documentId}/status/un_archive`) as Promise<CommonResponse>
+}
+
 export const enableDocument: Fetcher<CommonResponse, CommonDocReq> = ({ datasetId, documentId }) => {
   return patch(`/datasets/${datasetId}/documents/${documentId}/status/enable`) as Promise<CommonResponse>
 }
@@ -138,10 +142,6 @@ export const modifyDocMetadata: Fetcher<CommonResponse, CommonDocReq & { body: {
   return put(`/datasets/${datasetId}/documents/${documentId}/metadata`, { body }) as Promise<CommonResponse>
 }

-export const getDatasetIndexingStatus: Fetcher<{ data: IndexingStatusResponse[] }, string> = (datasetId) => {
-  return get(`/datasets/${datasetId}/indexing-status`) as Promise<{ data: IndexingStatusResponse[] }>
-}
-
 // apis for segments in a document

 export const fetchSegments: Fetcher<SegmentsResponse, CommonDocReq & { params: SegmentsQuery }> = ({ datasetId, documentId, params }) => {
@@ -164,6 +164,18 @@ export const addSegment: Fetcher<{ data: SegmentDetailModel; doc_form: string },
   return post(`/datasets/${datasetId}/documents/${documentId}/segment`, { body }) as Promise<{ data: SegmentDetailModel; doc_form: string }>
 }

+export const deleteSegment: Fetcher<CommonResponse, { datasetId: string; documentId: string; segmentId: string }> = ({ datasetId, documentId, segmentId }) => {
+  return del(`/datasets/${datasetId}/documents/${documentId}/segments/${segmentId}`) as Promise<CommonResponse>
+}
+
+export const segmentBatchImport: Fetcher<{ job_id: string; job_status: string }, { url: string; body: FormData }> = ({ url, body }) => {
+  return post(url, { body }, { bodyStringify: false, deleteContentType: true }) as Promise<{ job_id: string; job_status: string }>
+}
+
+export const checkSegmentBatchImportProgress: Fetcher<{ job_id: string; job_status: string }, { jobID: string }> = ({ jobID }) => {
+  return get(`/datasets/batch_import_status/${jobID}`) as Promise<{ job_id: string; job_status: string }>
+}
+
 // hit testing
 export const hitTesting: Fetcher<HitTestingResponse, { datasetId: string; queryText: string }> = ({ datasetId, queryText }) => {
   return post(`/datasets/${datasetId}/hit-testing`, { body: { query: queryText } }) as Promise<HitTestingResponse>
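A hypothetical caller for the two new batch-import fetchers, showing the submit-then-poll flow end to end. Only the fetcher signatures come from this diff; the upload URL, the form field name, and the job_status values are assumptions.

    import { checkSegmentBatchImportProgress, segmentBatchImport } from '@/service/datasets'

    const runBatchImport = async (datasetId: string, documentId: string, csvFile: File) => {
      const formData = new FormData()
      formData.append('file', csvFile) // field name assumed
      // Endpoint path assumed for illustration; segmentBatchImport takes the URL from its caller.
      const url = `/datasets/${datasetId}/documents/${documentId}/segments/batch_import`
      let { job_id, job_status } = await segmentBatchImport({ url, body: formData })
      // Poll until the background task settles ('waiting'/'processing' states assumed).
      while (job_status === 'waiting' || job_status === 'processing') {
        await new Promise(resolve => setTimeout(resolve, 2500))
        ;({ job_id, job_status } = await checkSegmentBatchImportProgress({ jobID: job_id }))
      }
      return job_status
    }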