Browse Source

Feat/dataset notion import (#392)

Co-authored-by: StyleZhang <jasonapring2015@outlook.com>
Co-authored-by: JzoNg <jzongcode@gmail.com>
Jyong 1 year ago
parent
commit
9253f72dea
96 changed files with 4483 additions and 383 deletions
  1. 1 0
      api/.env.example
  2. 1 1
      api/app.py
  3. 3 0
      api/config.py
  4. 2 2
      api/controllers/console/__init__.py
  5. 95 0
      api/controllers/console/auth/data_source_oauth.py
  6. 303 0
      api/controllers/console/datasets/data_source.py
  7. 74 13
      api/controllers/console/datasets/datasets.py
  8. 192 13
      api/controllers/console/datasets/datasets_document.py
  9. 7 3
      api/controllers/service_api/dataset/document.py
  10. 367 0
      api/core/data_source/notion.py
  11. 193 67
      api/core/indexing_runner.py
  12. 7 0
      api/libs/oauth.py
  13. 256 0
      api/libs/oauth_data_source.py
  14. 46 0
      api/migrations/versions/e32f6ccb87c6_e08af0a69ccefbb59fa80c778efee300bb780980.py
  15. 3 1
      api/models/dataset.py
  16. 21 0
      api/models/source.py
  17. 255 62
      api/services/dataset_service.py
  18. 58 0
      api/tasks/clean_notion_document_task.py
  19. 109 0
      api/tasks/document_indexing_sync_task.py
  20. 20 16
      api/tasks/document_indexing_task.py
  21. 1 1
      api/tasks/document_indexing_update_task.py
  22. 1 1
      api/tasks/recover_document_indexing_task.py
  23. 2 1
      web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/layout.tsx
  24. 15 1
      web/app/components/app-sidebar/basic.tsx
  25. 1 2
      web/app/components/app-sidebar/index.tsx
  26. 3 0
      web/app/components/base/checkbox/assets/check.svg
  27. 9 0
      web/app/components/base/checkbox/index.module.css
  28. 19 0
      web/app/components/base/checkbox/index.tsx
  29. 6 0
      web/app/components/base/notion-icon/index.module.css
  30. 58 0
      web/app/components/base/notion-icon/index.tsx
  31. 3 0
      web/app/components/base/notion-page-selector/assets/clear.svg
  32. 3 0
      web/app/components/base/notion-page-selector/assets/down-arrow.svg
  33. 3 0
      web/app/components/base/notion-page-selector/assets/notion-empty-page.svg
  34. 3 0
      web/app/components/base/notion-page-selector/assets/notion-page.svg
  35. 5 0
      web/app/components/base/notion-page-selector/assets/search.svg
  36. 3 0
      web/app/components/base/notion-page-selector/assets/setting.svg
  37. 4 0
      web/app/components/base/notion-page-selector/base.module.css
  38. 141 0
      web/app/components/base/notion-page-selector/base.tsx
  39. 2 0
      web/app/components/base/notion-page-selector/index.tsx
  40. 28 0
      web/app/components/base/notion-page-selector/notion-page-selector-modal/index.module.css
  41. 62 0
      web/app/components/base/notion-page-selector/notion-page-selector-modal/index.tsx
  42. 17 0
      web/app/components/base/notion-page-selector/page-selector/index.module.css
  43. 299 0
      web/app/components/base/notion-page-selector/page-selector/index.tsx
  44. 15 0
      web/app/components/base/notion-page-selector/search-input/index.module.css
  45. 42 0
      web/app/components/base/notion-page-selector/search-input/index.tsx
  46. 9 0
      web/app/components/base/notion-page-selector/workspace-selector/index.module.css
  47. 84 0
      web/app/components/base/notion-page-selector/workspace-selector/index.tsx
  48. 20 0
      web/app/components/base/progress-bar/index.tsx
  49. 3 0
      web/app/components/datasets/create/assets/Icon-3-dots.svg
  50. 2 0
      web/app/components/datasets/create/assets/normal.svg
  51. 11 0
      web/app/components/datasets/create/assets/star.svg
  52. 111 0
      web/app/components/datasets/create/embedding-process/index.module.css
  53. 242 0
      web/app/components/datasets/create/embedding-process/index.tsx
  54. 3 0
      web/app/components/datasets/create/file-preview/index.module.css
  55. 17 10
      web/app/components/datasets/create/file-preview/index.tsx
  56. 38 10
      web/app/components/datasets/create/index.tsx
  57. 54 0
      web/app/components/datasets/create/notion-page-preview/index.module.css
  58. 75 0
      web/app/components/datasets/create/notion-page-preview/index.tsx
  59. 50 0
      web/app/components/datasets/create/step-one/index.module.css
  60. 105 24
      web/app/components/datasets/create/step-one/index.tsx
  61. 11 12
      web/app/components/datasets/create/step-three/index.tsx
  62. 32 5
      web/app/components/datasets/create/step-two/index.module.css
  63. 104 21
      web/app/components/datasets/create/step-two/index.tsx
  64. 26 16
      web/app/components/datasets/documents/detail/index.tsx
  65. 12 1
      web/app/components/datasets/documents/detail/settings/index.tsx
  66. 118 12
      web/app/components/datasets/documents/index.tsx
  67. 45 16
      web/app/components/datasets/documents/list.tsx
  68. 3 0
      web/app/components/datasets/documents/style.module.css
  69. 1 1
      web/app/components/header/account-dropdown/index.tsx
  70. 102 0
      web/app/components/header/account-setting/data-source-page/data-source-notion/index.tsx
  71. 14 0
      web/app/components/header/account-setting/data-source-page/data-source-notion/operate/index.module.css
  72. 107 0
      web/app/components/header/account-setting/data-source-page/data-source-notion/operate/index.tsx
  73. 12 0
      web/app/components/header/account-setting/data-source-page/data-source-notion/style.module.css
  74. 0 0
      web/app/components/header/account-setting/data-source-page/index.module.css
  75. 17 0
      web/app/components/header/account-setting/data-source-page/index.tsx
  76. 10 0
      web/app/components/header/account-setting/index.module.css
  77. 26 5
      web/app/components/header/account-setting/index.tsx
  78. 9 9
      web/app/components/header/account-setting/members-page/index.tsx
  79. 1 0
      web/app/components/header/assets/data-source-blue.svg
  80. 3 0
      web/app/components/header/assets/data-source.svg
  81. 3 0
      web/app/components/header/assets/file.svg
  82. 12 0
      web/app/components/header/assets/notion.svg
  83. 3 0
      web/app/components/header/assets/sync.svg
  84. 3 0
      web/app/components/header/assets/trash.svg
  85. 4 4
      web/app/components/header/nav/nav-selector/index.tsx
  86. 3 1
      web/context/dataset-detail.ts
  87. 24 0
      web/i18n/lang/common.en.ts
  88. 24 0
      web/i18n/lang/common.zh.ts
  89. 9 1
      web/i18n/lang/dataset-creation.en.ts
  90. 9 1
      web/i18n/lang/dataset-creation.zh.ts
  91. 2 0
      web/i18n/lang/dataset-documents.en.ts
  92. 2 0
      web/i18n/lang/dataset-documents.zh.ts
  93. 32 0
      web/models/common.ts
  94. 62 35
      web/models/datasets.ts
  95. 17 4
      web/service/common.ts
  96. 39 11
      web/service/datasets.ts

+ 1 - 0
api/.env.example

@@ -22,6 +22,7 @@ CELERY_BROKER_URL=redis://:difyai123456@localhost:6379/1
 # redis configuration
 REDIS_HOST=localhost
 REDIS_PORT=6379
+REDIS_USERNAME=
 REDIS_PASSWORD=difyai123456
 REDIS_DB=0
 

+ 1 - 1
api/app.py

@@ -20,7 +20,7 @@ from extensions.ext_database import db
 from extensions.ext_login import login_manager
 
 # DO NOT REMOVE BELOW
-from models import model, account, dataset, web, task
+from models import model, account, dataset, web, task, source
 from events import event_handlers
 # DO NOT REMOVE ABOVE
 

+ 3 - 0
api/config.py

@@ -187,6 +187,9 @@ class Config:
         # For temp use only
         # set default LLM provider, default is 'openai', support `azure_openai`
         self.DEFAULT_LLM_PROVIDER = get_env('DEFAULT_LLM_PROVIDER')
+        # notion import setting
+        self.NOTION_CLIENT_ID = get_env('NOTION_CLIENT_ID')
+        self.NOTION_CLIENT_SECRET = get_env('NOTION_CLIENT_SECRET')
 
 class CloudEditionConfig(Config):
 

+ 2 - 2
api/controllers/console/__init__.py

@@ -12,10 +12,10 @@ from . import setup, version, apikey, admin
 from .app import app, site, completion, model_config, statistic, conversation, message, generator
 
 # Import auth controllers
-from .auth import login, oauth
+from .auth import login, oauth, data_source_oauth
 
 # Import datasets controllers
-from .datasets import datasets, datasets_document, datasets_segments, file, hit_testing
+from .datasets import datasets, datasets_document, datasets_segments, file, hit_testing, data_source
 
 # Import workspace controllers
 from .workspace import workspace, members, providers, account

+ 95 - 0
api/controllers/console/auth/data_source_oauth.py

@@ -0,0 +1,95 @@
+import logging
+from datetime import datetime
+from typing import Optional
+
+import flask_login
+import requests
+from flask import request, redirect, current_app, session
+from flask_login import current_user, login_required
+from flask_restful import Resource
+from werkzeug.exceptions import Forbidden
+from libs.oauth_data_source import NotionOAuth
+from controllers.console import api
+from ..setup import setup_required
+from ..wraps import account_initialization_required
+
+
+def get_oauth_providers():
+    with current_app.app_context():
+        notion_oauth = NotionOAuth(client_id=current_app.config.get('NOTION_CLIENT_ID'),
+                                   client_secret=current_app.config.get(
+                                       'NOTION_CLIENT_SECRET'),
+                                   redirect_uri=current_app.config.get(
+                                       'CONSOLE_URL') + '/console/api/oauth/data-source/callback/notion')
+
+        OAUTH_PROVIDERS = {
+            'notion': notion_oauth
+        }
+        return OAUTH_PROVIDERS
+
+
+class OAuthDataSource(Resource):
+    def get(self, provider: str):
+        # The role of the current user in the table must be admin or owner
+        if current_user.current_tenant.current_role not in ['admin', 'owner']:
+            raise Forbidden()
+        OAUTH_DATASOURCE_PROVIDERS = get_oauth_providers()
+        with current_app.app_context():
+            oauth_provider = OAUTH_DATASOURCE_PROVIDERS.get(provider)
+            print(vars(oauth_provider))
+        if not oauth_provider:
+            return {'error': 'Invalid provider'}, 400
+
+        auth_url = oauth_provider.get_authorization_url()
+        return redirect(auth_url)
+
+
+class OAuthDataSourceCallback(Resource):
+    def get(self, provider: str):
+        OAUTH_DATASOURCE_PROVIDERS = get_oauth_providers()
+        with current_app.app_context():
+            oauth_provider = OAUTH_DATASOURCE_PROVIDERS.get(provider)
+        if not oauth_provider:
+            return {'error': 'Invalid provider'}, 400
+        if 'code' in request.args:
+            code = request.args.get('code')
+            try:
+                oauth_provider.get_access_token(code)
+            except requests.exceptions.HTTPError as e:
+                logging.exception(
+                    f"An error occurred during the OAuthCallback process with {provider}: {e.response.text}")
+                return {'error': 'OAuth data source process failed'}, 400
+
+            return redirect(f'{current_app.config.get("CONSOLE_URL")}?oauth_data_source=success')
+        elif 'error' in request.args:
+            error = request.args.get('error')
+            return redirect(f'{current_app.config.get("CONSOLE_URL")}?oauth_data_source={error}')
+        else:
+            return redirect(f'{current_app.config.get("CONSOLE_URL")}?oauth_data_source=access_denied')
+
+
+class OAuthDataSourceSync(Resource):
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, provider, binding_id):
+        provider = str(provider)
+        binding_id = str(binding_id)
+        OAUTH_DATASOURCE_PROVIDERS = get_oauth_providers()
+        with current_app.app_context():
+            oauth_provider = OAUTH_DATASOURCE_PROVIDERS.get(provider)
+        if not oauth_provider:
+            return {'error': 'Invalid provider'}, 400
+        try:
+            oauth_provider.sync_data_source(binding_id)
+        except requests.exceptions.HTTPError as e:
+            logging.exception(
+                f"An error occurred during the OAuthCallback process with {provider}: {e.response.text}")
+            return {'error': 'OAuth data source process failed'}, 400
+
+        return {'result': 'success'}, 200
+
+
+api.add_resource(OAuthDataSource, '/oauth/data-source/<string:provider>')
+api.add_resource(OAuthDataSourceCallback, '/oauth/data-source/callback/<string:provider>')
+api.add_resource(OAuthDataSourceSync, '/oauth/data-source/<string:provider>/<uuid:binding_id>/sync')

+ 303 - 0
api/controllers/console/datasets/data_source.py

@@ -0,0 +1,303 @@
+import datetime
+import json
+
+from cachetools import TTLCache
+from flask import request, current_app
+from flask_login import login_required, current_user
+from flask_restful import Resource, marshal_with, fields, reqparse, marshal
+from werkzeug.exceptions import NotFound
+
+from controllers.console import api
+from controllers.console.setup import setup_required
+from controllers.console.wraps import account_initialization_required
+from core.data_source.notion import NotionPageReader
+from core.indexing_runner import IndexingRunner
+from extensions.ext_database import db
+from libs.helper import TimestampField
+from libs.oauth_data_source import NotionOAuth
+from models.dataset import Document
+from models.source import DataSourceBinding
+from services.dataset_service import DatasetService, DocumentService
+from tasks.document_indexing_sync_task import document_indexing_sync_task
+
+cache = TTLCache(maxsize=None, ttl=30)
+
+FILE_SIZE_LIMIT = 15 * 1024 * 1024  # 15MB
+ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm']
+PREVIEW_WORDS_LIMIT = 3000
+
+
+class DataSourceApi(Resource):
+    integrate_icon_fields = {
+        'type': fields.String,
+        'url': fields.String,
+        'emoji': fields.String
+    }
+    integrate_page_fields = {
+        'page_name': fields.String,
+        'page_id': fields.String,
+        'page_icon': fields.Nested(integrate_icon_fields, allow_null=True),
+        'parent_id': fields.String,
+        'type': fields.String
+    }
+    integrate_workspace_fields = {
+        'workspace_name': fields.String,
+        'workspace_id': fields.String,
+        'workspace_icon': fields.String,
+        'pages': fields.List(fields.Nested(integrate_page_fields)),
+        'total': fields.Integer
+    }
+    integrate_fields = {
+        'id': fields.String,
+        'provider': fields.String,
+        'created_at': TimestampField,
+        'is_bound': fields.Boolean,
+        'disabled': fields.Boolean,
+        'link': fields.String,
+        'source_info': fields.Nested(integrate_workspace_fields)
+    }
+    integrate_list_fields = {
+        'data': fields.List(fields.Nested(integrate_fields)),
+    }
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    @marshal_with(integrate_list_fields)
+    def get(self):
+        # get workspace data source integrates
+        data_source_integrates = db.session.query(DataSourceBinding).filter(
+            DataSourceBinding.tenant_id == current_user.current_tenant_id,
+            DataSourceBinding.disabled == False
+        ).all()
+
+        base_url = request.url_root.rstrip('/')
+        data_source_oauth_base_path = "/console/api/oauth/data-source"
+        providers = ["notion"]
+
+        integrate_data = []
+        for provider in providers:
+            # existing_integrate = next((ai for ai in data_source_integrates if ai.provider == provider), None)
+            existing_integrates = filter(lambda item: item.provider == provider, data_source_integrates)
+            if existing_integrates:
+                for existing_integrate in list(existing_integrates):
+                    integrate_data.append({
+                        'id': existing_integrate.id,
+                        'provider': provider,
+                        'created_at': existing_integrate.created_at,
+                        'is_bound': True,
+                        'disabled': existing_integrate.disabled,
+                        'source_info': existing_integrate.source_info,
+                        'link': f'{base_url}{data_source_oauth_base_path}/{provider}'
+                })
+            else:
+                integrate_data.append({
+                    'id': None,
+                    'provider': provider,
+                    'created_at': None,
+                    'source_info': None,
+                    'is_bound': False,
+                    'disabled': None,
+                    'link': f'{base_url}{data_source_oauth_base_path}/{provider}'
+                })
+        return {'data': integrate_data}, 200
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def patch(self, binding_id, action):
+        binding_id = str(binding_id)
+        action = str(action)
+        data_source_binding = DataSourceBinding.query.filter_by(
+            id=binding_id
+        ).first()
+        if data_source_binding is None:
+            raise NotFound('Data source binding not found.')
+        # enable binding
+        if action == 'enable':
+            if data_source_binding.disabled:
+                data_source_binding.disabled = False
+                data_source_binding.updated_at = datetime.datetime.utcnow()
+                db.session.add(data_source_binding)
+                db.session.commit()
+            else:
+                raise ValueError('Data source is not disabled.')
+        # disable binding
+        if action == 'disable':
+            if not data_source_binding.disabled:
+                data_source_binding.disabled = True
+                data_source_binding.updated_at = datetime.datetime.utcnow()
+                db.session.add(data_source_binding)
+                db.session.commit()
+            else:
+                raise ValueError('Data source is disabled.')
+        return {'result': 'success'}, 200
+
+
+class DataSourceNotionListApi(Resource):
+    integrate_icon_fields = {
+        'type': fields.String,
+        'url': fields.String,
+        'emoji': fields.String
+    }
+    integrate_page_fields = {
+        'page_name': fields.String,
+        'page_id': fields.String,
+        'page_icon': fields.Nested(integrate_icon_fields, allow_null=True),
+        'is_bound': fields.Boolean,
+        'parent_id': fields.String,
+        'type': fields.String
+    }
+    integrate_workspace_fields = {
+        'workspace_name': fields.String,
+        'workspace_id': fields.String,
+        'workspace_icon': fields.String,
+        'pages': fields.List(fields.Nested(integrate_page_fields))
+    }
+    integrate_notion_info_list_fields = {
+        'notion_info': fields.List(fields.Nested(integrate_workspace_fields)),
+    }
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    @marshal_with(integrate_notion_info_list_fields)
+    def get(self):
+        dataset_id = request.args.get('dataset_id', default=None, type=str)
+        exist_page_ids = []
+        # import notion in the exist dataset
+        if dataset_id:
+            dataset = DatasetService.get_dataset(dataset_id)
+            if not dataset:
+                raise NotFound('Dataset not found.')
+            if dataset.data_source_type != 'notion_import':
+                raise ValueError('Dataset is not notion type.')
+            documents = Document.query.filter_by(
+                dataset_id=dataset_id,
+                tenant_id=current_user.current_tenant_id,
+                data_source_type='notion_import',
+                enabled=True
+            ).all()
+            if documents:
+                for document in documents:
+                    data_source_info = json.loads(document.data_source_info)
+                    exist_page_ids.append(data_source_info['notion_page_id'])
+        # get all authorized pages
+        data_source_bindings = DataSourceBinding.query.filter_by(
+            tenant_id=current_user.current_tenant_id,
+            provider='notion',
+            disabled=False
+        ).all()
+        if not data_source_bindings:
+            return {
+                'notion_info': []
+            }, 200
+        pre_import_info_list = []
+        for data_source_binding in data_source_bindings:
+            source_info = data_source_binding.source_info
+            pages = source_info['pages']
+            # Filter out already bound pages
+            for page in pages:
+                if page['page_id'] in exist_page_ids:
+                    page['is_bound'] = True
+                else:
+                    page['is_bound'] = False
+            pre_import_info = {
+                'workspace_name': source_info['workspace_name'],
+                'workspace_icon': source_info['workspace_icon'],
+                'workspace_id': source_info['workspace_id'],
+                'pages': pages,
+            }
+            pre_import_info_list.append(pre_import_info)
+        return {
+            'notion_info': pre_import_info_list
+        }, 200
+
+
+class DataSourceNotionApi(Resource):
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, workspace_id, page_id, page_type):
+        workspace_id = str(workspace_id)
+        page_id = str(page_id)
+        data_source_binding = DataSourceBinding.query.filter(
+            db.and_(
+                DataSourceBinding.tenant_id == current_user.current_tenant_id,
+                DataSourceBinding.provider == 'notion',
+                DataSourceBinding.disabled == False,
+                DataSourceBinding.source_info['workspace_id'] == f'"{workspace_id}"'
+            )
+        ).first()
+        if not data_source_binding:
+            raise NotFound('Data source binding not found.')
+        reader = NotionPageReader(integration_token=data_source_binding.access_token)
+        if page_type == 'page':
+            page_content = reader.read_page(page_id)
+        elif page_type == 'database':
+            page_content = reader.query_database_data(page_id)
+        else:
+            page_content = ""
+        return {
+            'content': page_content
+        }, 200
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def post(self):
+        parser = reqparse.RequestParser()
+        parser.add_argument('notion_info_list', type=list, required=True, nullable=True, location='json')
+        parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
+        args = parser.parse_args()
+        # validate args
+        DocumentService.estimate_args_validate(args)
+        indexing_runner = IndexingRunner()
+        response = indexing_runner.notion_indexing_estimate(args['notion_info_list'], args['process_rule'])
+        return response, 200
+
+
+class DataSourceNotionDatasetSyncApi(Resource):
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, dataset_id):
+        dataset_id_str = str(dataset_id)
+        dataset = DatasetService.get_dataset(dataset_id_str)
+        if dataset is None:
+            raise NotFound("Dataset not found.")
+
+        documents = DocumentService.get_document_by_dataset_id(dataset_id_str)
+        for document in documents:
+            document_indexing_sync_task.delay(dataset_id_str, document.id)
+        return 200
+
+
+class DataSourceNotionDocumentSyncApi(Resource):
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, dataset_id, document_id):
+        dataset_id_str = str(dataset_id)
+        document_id_str = str(document_id)
+        dataset = DatasetService.get_dataset(dataset_id_str)
+        if dataset is None:
+            raise NotFound("Dataset not found.")
+
+        document = DocumentService.get_document(dataset_id_str, document_id_str)
+        if document is None:
+            raise NotFound("Document not found.")
+        document_indexing_sync_task.delay(dataset_id_str, document_id_str)
+        return 200
+
+
+api.add_resource(DataSourceApi, '/data-source/integrates', '/data-source/integrates/<uuid:binding_id>/<string:action>')
+api.add_resource(DataSourceNotionListApi, '/notion/pre-import/pages')
+api.add_resource(DataSourceNotionApi,
+                 '/notion/workspaces/<uuid:workspace_id>/pages/<uuid:page_id>/<string:page_type>/preview',
+                 '/datasets/notion-indexing-estimate')
+api.add_resource(DataSourceNotionDatasetSyncApi, '/datasets/<uuid:dataset_id>/notion/sync')
+api.add_resource(DataSourceNotionDocumentSyncApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/notion/sync')

+ 74 - 13
api/controllers/console/datasets/datasets.py

@@ -12,8 +12,9 @@ from controllers.console.wraps import account_initialization_required
 from core.indexing_runner import IndexingRunner
 from libs.helper import TimestampField
 from extensions.ext_database import db
+from models.dataset import DocumentSegment, Document
 from models.model import UploadFile
-from services.dataset_service import DatasetService
+from services.dataset_service import DatasetService, DocumentService
 
 dataset_detail_fields = {
     'id': fields.String,
@@ -217,17 +218,31 @@ class DatasetIndexingEstimateApi(Resource):
     @login_required
     @account_initialization_required
     def post(self):
-        segment_rule = request.get_json()
-        file_detail = db.session.query(UploadFile).filter(
-            UploadFile.tenant_id == current_user.current_tenant_id,
-            UploadFile.id == segment_rule["file_id"]
-        ).first()
-
-        if file_detail is None:
-            raise NotFound("File not found.")
-
-        indexing_runner = IndexingRunner()
-        response = indexing_runner.indexing_estimate(file_detail, segment_rule['process_rule'])
+        parser = reqparse.RequestParser()
+        parser.add_argument('info_list', type=dict, required=True, nullable=True, location='json')
+        parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
+        args = parser.parse_args()
+        # validate args
+        DocumentService.estimate_args_validate(args)
+        if args['info_list']['data_source_type'] == 'upload_file':
+            file_ids = args['info_list']['file_info_list']['file_ids']
+            file_details = db.session.query(UploadFile).filter(
+                UploadFile.tenant_id == current_user.current_tenant_id,
+                UploadFile.id.in_(file_ids)
+            ).all()
+
+            if file_details is None:
+                raise NotFound("File not found.")
+
+            indexing_runner = IndexingRunner()
+            response = indexing_runner.file_indexing_estimate(file_details, args['process_rule'])
+        elif args['info_list']['data_source_type'] == 'notion_import':
+
+            indexing_runner = IndexingRunner()
+            response = indexing_runner.notion_indexing_estimate(args['info_list']['notion_info_list'],
+                                                                args['process_rule'])
+        else:
+            raise ValueError('Data source type not support')
         return response, 200
 
 
@@ -274,8 +289,54 @@ class DatasetRelatedAppListApi(Resource):
         }, 200
 
 
+class DatasetIndexingStatusApi(Resource):
+    document_status_fields = {
+        'id': fields.String,
+        'indexing_status': fields.String,
+        'processing_started_at': TimestampField,
+        'parsing_completed_at': TimestampField,
+        'cleaning_completed_at': TimestampField,
+        'splitting_completed_at': TimestampField,
+        'completed_at': TimestampField,
+        'paused_at': TimestampField,
+        'error': fields.String,
+        'stopped_at': TimestampField,
+        'completed_segments': fields.Integer,
+        'total_segments': fields.Integer,
+    }
+
+    document_status_fields_list = {
+        'data': fields.List(fields.Nested(document_status_fields))
+    }
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, dataset_id):
+        dataset_id = str(dataset_id)
+        documents = db.session.query(Document).filter(
+            Document.dataset_id == dataset_id,
+            Document.tenant_id == current_user.current_tenant_id
+        ).all()
+        documents_status = []
+        for document in documents:
+            completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
+                                                              DocumentSegment.document_id == str(document.id),
+                                                              DocumentSegment.status != 're_segment').count()
+            total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
+                                                          DocumentSegment.status != 're_segment').count()
+            document.completed_segments = completed_segments
+            document.total_segments = total_segments
+            documents_status.append(marshal(document, self.document_status_fields))
+        data = {
+            'data': documents_status
+        }
+        return data
+
+
 api.add_resource(DatasetListApi, '/datasets')
 api.add_resource(DatasetApi, '/datasets/<uuid:dataset_id>')
 api.add_resource(DatasetQueryApi, '/datasets/<uuid:dataset_id>/queries')
-api.add_resource(DatasetIndexingEstimateApi, '/datasets/file-indexing-estimate')
+api.add_resource(DatasetIndexingEstimateApi, '/datasets/indexing-estimate')
 api.add_resource(DatasetRelatedAppListApi, '/datasets/<uuid:dataset_id>/related-apps')
+api.add_resource(DatasetIndexingStatusApi, '/datasets/<uuid:dataset_id>/indexing-status')

+ 192 - 13
api/controllers/console/datasets/datasets_document.py

@@ -1,6 +1,7 @@
 # -*- coding:utf-8 -*-
 import random
 from datetime import datetime
+from typing import List
 
 from flask import request
 from flask_login import login_required, current_user
@@ -61,6 +62,29 @@ document_fields = {
     'hit_count': fields.Integer,
 }
 
+document_with_segments_fields = {
+    'id': fields.String,
+    'position': fields.Integer,
+    'data_source_type': fields.String,
+    'data_source_info': fields.Raw(attribute='data_source_info_dict'),
+    'dataset_process_rule_id': fields.String,
+    'name': fields.String,
+    'created_from': fields.String,
+    'created_by': fields.String,
+    'created_at': TimestampField,
+    'tokens': fields.Integer,
+    'indexing_status': fields.String,
+    'error': fields.String,
+    'enabled': fields.Boolean,
+    'disabled_at': TimestampField,
+    'disabled_by': fields.String,
+    'archived': fields.Boolean,
+    'display_status': fields.String,
+    'word_count': fields.Integer,
+    'hit_count': fields.Integer,
+    'completed_segments': fields.Integer,
+    'total_segments': fields.Integer
+}
 
 class DocumentResource(Resource):
     def get_document(self, dataset_id: str, document_id: str) -> Document:
@@ -83,6 +107,23 @@ class DocumentResource(Resource):
 
         return document
 
+    def get_batch_documents(self, dataset_id: str, batch: str) -> List[Document]:
+        dataset = DatasetService.get_dataset(dataset_id)
+        if not dataset:
+            raise NotFound('Dataset not found.')
+
+        try:
+            DatasetService.check_dataset_permission(dataset, current_user)
+        except services.errors.account.NoPermissionError as e:
+            raise Forbidden(str(e))
+
+        documents = DocumentService.get_batch_documents(dataset_id, batch)
+
+        if not documents:
+            raise NotFound('Documents not found.')
+
+        return documents
+
 
 class GetProcessRuleApi(Resource):
     @setup_required
@@ -132,9 +173,9 @@ class DatasetDocumentListApi(Resource):
         dataset_id = str(dataset_id)
         page = request.args.get('page', default=1, type=int)
         limit = request.args.get('limit', default=20, type=int)
-        search = request.args.get('search', default=None, type=str)
+        search = request.args.get('keyword', default=None, type=str)
         sort = request.args.get('sort', default='-created_at', type=str)
-
+        fetch = request.args.get('fetch', default=False, type=bool)
         dataset = DatasetService.get_dataset(dataset_id)
         if not dataset:
             raise NotFound('Dataset not found.')
@@ -173,9 +214,20 @@ class DatasetDocumentListApi(Resource):
         paginated_documents = query.paginate(
             page=page, per_page=limit, max_per_page=100, error_out=False)
         documents = paginated_documents.items
-
+        if fetch:
+            for document in documents:
+                completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
+                                                                  DocumentSegment.document_id == str(document.id),
+                                                                  DocumentSegment.status != 're_segment').count()
+                total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
+                                                              DocumentSegment.status != 're_segment').count()
+                document.completed_segments = completed_segments
+                document.total_segments = total_segments
+            data = marshal(documents, document_with_segments_fields)
+        else:
+            data = marshal(documents, document_fields)
         response = {
-            'data': marshal(documents, document_fields),
+            'data': data,
             'has_more': len(documents) == limit,
             'limit': limit,
             'total': paginated_documents.total,
@@ -184,10 +236,15 @@ class DatasetDocumentListApi(Resource):
 
         return response
 
+    documents_and_batch_fields = {
+        'documents': fields.List(fields.Nested(document_fields)),
+        'batch': fields.String
+    }
+
     @setup_required
     @login_required
     @account_initialization_required
-    @marshal_with(document_fields)
+    @marshal_with(documents_and_batch_fields)
     def post(self, dataset_id):
         dataset_id = str(dataset_id)
 
@@ -221,7 +278,7 @@ class DatasetDocumentListApi(Resource):
         DocumentService.document_create_args_validate(args)
 
         try:
-            document = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
+            documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
         except ProviderTokenNotInitError:
             raise ProviderNotInitializeError()
         except QuotaExceededError:
@@ -229,13 +286,17 @@ class DatasetDocumentListApi(Resource):
         except ModelCurrentlyNotSupportError:
             raise ProviderModelCurrentlyNotSupportError()
 
-        return document
+        return {
+            'documents': documents,
+            'batch': batch
+        }
 
 
 class DatasetInitApi(Resource):
     dataset_and_document_fields = {
         'dataset': fields.Nested(dataset_fields),
-        'document': fields.Nested(document_fields)
+        'documents': fields.List(fields.Nested(document_fields)),
+        'batch': fields.String
     }
 
     @setup_required
@@ -258,7 +319,7 @@ class DatasetInitApi(Resource):
         DocumentService.document_create_args_validate(args)
 
         try:
-            dataset, document = DocumentService.save_document_without_dataset_id(
+            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                 tenant_id=current_user.current_tenant_id,
                 document_data=args,
                 account=current_user
@@ -272,7 +333,8 @@ class DatasetInitApi(Resource):
 
         response = {
             'dataset': dataset,
-            'document': document
+            'documents': documents,
+            'batch': batch
         }
 
         return response
@@ -317,11 +379,122 @@ class DocumentIndexingEstimateApi(DocumentResource):
                     raise NotFound('File not found.')
 
                 indexing_runner = IndexingRunner()
-                response = indexing_runner.indexing_estimate(file, data_process_rule_dict)
 
+                response = indexing_runner.file_indexing_estimate([file], data_process_rule_dict)
+
+        return response
+
+
+class DocumentBatchIndexingEstimateApi(DocumentResource):
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, dataset_id, batch):
+        dataset_id = str(dataset_id)
+        batch = str(batch)
+        dataset = DatasetService.get_dataset(dataset_id)
+        if dataset is None:
+            raise NotFound("Dataset not found.")
+        documents = self.get_batch_documents(dataset_id, batch)
+        response = {
+            "tokens": 0,
+            "total_price": 0,
+            "currency": "USD",
+            "total_segments": 0,
+            "preview": []
+        }
+        if not documents:
+            return response
+        data_process_rule = documents[0].dataset_process_rule
+        data_process_rule_dict = data_process_rule.to_dict()
+        info_list = []
+        for document in documents:
+            if document.indexing_status in ['completed', 'error']:
+                raise DocumentAlreadyFinishedError()
+            data_source_info = document.data_source_info_dict
+            # format document files info
+            if data_source_info and 'upload_file_id' in data_source_info:
+                file_id = data_source_info['upload_file_id']
+                info_list.append(file_id)
+            # format document notion info
+            elif data_source_info and 'notion_workspace_id' in data_source_info and 'notion_page_id' in data_source_info:
+                pages = []
+                page = {
+                    'page_id': data_source_info['notion_page_id'],
+                    'type': data_source_info['type']
+                }
+                pages.append(page)
+                notion_info = {
+                    'workspace_id': data_source_info['notion_workspace_id'],
+                    'pages': pages
+                }
+                info_list.append(notion_info)
+
+        if dataset.data_source_type == 'upload_file':
+            file_details = db.session.query(UploadFile).filter(
+                UploadFile.tenant_id == current_user.current_tenant_id,
+                UploadFile.id in info_list
+            ).all()
+
+            if file_details is None:
+                raise NotFound("File not found.")
+
+            indexing_runner = IndexingRunner()
+            response = indexing_runner.file_indexing_estimate(file_details, data_process_rule_dict)
+        elif dataset.data_source_type:
+
+            indexing_runner = IndexingRunner()
+            response = indexing_runner.notion_indexing_estimate(info_list,
+                                                                data_process_rule_dict)
+        else:
+            raise ValueError('Data source type not support')
         return response
 
 
+class DocumentBatchIndexingStatusApi(DocumentResource):
+    document_status_fields = {
+        'id': fields.String,
+        'indexing_status': fields.String,
+        'processing_started_at': TimestampField,
+        'parsing_completed_at': TimestampField,
+        'cleaning_completed_at': TimestampField,
+        'splitting_completed_at': TimestampField,
+        'completed_at': TimestampField,
+        'paused_at': TimestampField,
+        'error': fields.String,
+        'stopped_at': TimestampField,
+        'completed_segments': fields.Integer,
+        'total_segments': fields.Integer,
+    }
+
+    document_status_fields_list = {
+        'data': fields.List(fields.Nested(document_status_fields))
+    }
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, dataset_id, batch):
+        dataset_id = str(dataset_id)
+        batch = str(batch)
+        documents = self.get_batch_documents(dataset_id, batch)
+        documents_status = []
+        for document in documents:
+            completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
+                                                              DocumentSegment.document_id == str(document.id),
+                                                              DocumentSegment.status != 're_segment').count()
+            total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
+                                                          DocumentSegment.status != 're_segment').count()
+            document.completed_segments = completed_segments
+            document.total_segments = total_segments
+            documents_status.append(marshal(document, self.document_status_fields))
+        data = {
+            'data': documents_status
+        }
+        return data
+
+
 class DocumentIndexingStatusApi(DocumentResource):
     document_status_fields = {
         'id': fields.String,
@@ -408,7 +581,7 @@ class DocumentDetailApi(DocumentResource):
                 'disabled_by': document.disabled_by,
                 'archived': document.archived,
                 'segment_count': document.segment_count,
-                'average_segment_length':   document.average_segment_length,
+                'average_segment_length': document.average_segment_length,
                 'hit_count': document.hit_count,
                 'display_status': document.display_status
             }
@@ -428,7 +601,7 @@ class DocumentDetailApi(DocumentResource):
                 'created_at': document.created_at.timestamp(),
                 'tokens': document.tokens,
                 'indexing_status': document.indexing_status,
-                'completed_at': int(document.completed_at.timestamp())if document.completed_at else None,
+                'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
                 'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
                 'indexing_latency': document.indexing_latency,
                 'error': document.error,
@@ -579,6 +752,8 @@ class DocumentStatusApi(DocumentResource):
             return {'result': 'success'}, 200
 
         elif action == "disable":
+            if not document.completed_at or document.indexing_status != 'completed':
+                raise InvalidActionError('Document is not completed.')
             if not document.enabled:
                 raise InvalidActionError('Document already disabled.')
 
@@ -678,6 +853,10 @@ api.add_resource(DatasetInitApi,
                  '/datasets/init')
 api.add_resource(DocumentIndexingEstimateApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate')
+api.add_resource(DocumentBatchIndexingEstimateApi,
+                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate')
+api.add_resource(DocumentBatchIndexingStatusApi,
+                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status')
 api.add_resource(DocumentIndexingStatusApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status')
 api.add_resource(DocumentDetailApi,

+ 7 - 3
api/controllers/service_api/dataset/document.py

@@ -69,12 +69,16 @@ class DocumentListApi(DatasetApiResource):
         document_data = {
             'data_source': {
                 'type': 'upload_file',
-                'info': upload_file.id
+                'info': [
+                    {
+                        'upload_file_id': upload_file.id
+                    }
+                ]
             }
         }
 
         try:
-            document = DocumentService.save_document_with_dataset_id(
+            documents, batch = DocumentService.save_document_with_dataset_id(
                 dataset=dataset,
                 document_data=document_data,
                 account=dataset.created_by_account,
@@ -83,7 +87,7 @@ class DocumentListApi(DatasetApiResource):
             )
         except ProviderTokenNotInitError:
             raise ProviderNotInitializeError()
-
+        document = documents[0]
         if doc_type and doc_metadata:
             metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type]
 

+ 367 - 0
api/core/data_source/notion.py

@@ -0,0 +1,367 @@
+"""Notion reader."""
+import json
+import logging
+import os
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+import requests  # type: ignore
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+INTEGRATION_TOKEN_NAME = "NOTION_INTEGRATION_TOKEN"
+BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
+DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
+SEARCH_URL = "https://api.notion.com/v1/search"
+RETRIEVE_PAGE_URL_TMPL = "https://api.notion.com/v1/pages/{page_id}"
+RETRIEVE_DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}"
+HEADING_TYPE = ['heading_1', 'heading_2', 'heading_3']
+logger = logging.getLogger(__name__)
+
+
+# TODO: Notion DB reader coming soon!
+class NotionPageReader(BaseReader):
+    """Notion Page reader.
+
+    Reads a set of Notion pages.
+
+    Args:
+        integration_token (str): Notion integration token.
+
+    """
+
+    def __init__(self, integration_token: Optional[str] = None) -> None:
+        """Initialize with parameters."""
+        if integration_token is None:
+            integration_token = os.getenv(INTEGRATION_TOKEN_NAME)
+            if integration_token is None:
+                raise ValueError(
+                    "Must specify `integration_token` or set environment "
+                    "variable `NOTION_INTEGRATION_TOKEN`."
+                )
+        self.token = integration_token
+        self.headers = {
+            "Authorization": "Bearer " + self.token,
+            "Content-Type": "application/json",
+            "Notion-Version": "2022-06-28",
+        }
+
+    def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
+        """Read a block."""
+        done = False
+        result_lines_arr = []
+        cur_block_id = block_id
+        while not done:
+            block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
+            query_dict: Dict[str, Any] = {}
+
+            res = requests.request(
+                "GET", block_url, headers=self.headers, json=query_dict
+            )
+            data = res.json()
+            if 'results' not in data or data["results"] is None:
+                done = True
+                break
+            heading = ''
+            for result in data["results"]:
+                result_type = result["type"]
+                result_obj = result[result_type]
+                cur_result_text_arr = []
+                if result_type == 'table':
+                    result_block_id = result["id"]
+                    text = self._read_table_rows(result_block_id)
+                    result_lines_arr.append(text)
+                else:
+                    if "rich_text" in result_obj:
+                        for rich_text in result_obj["rich_text"]:
+                            # skip this rich text entry if it has no text object
+                            if "text" in rich_text:
+                                text = rich_text["text"]["content"]
+                                prefix = "\t" * num_tabs
+                                cur_result_text_arr.append(prefix + text)
+                                if result_type in HEADING_TYPE:
+                                    heading = text
+                    result_block_id = result["id"]
+                    has_children = result["has_children"]
+                    if has_children:
+                        children_text = self._read_block(
+                            result_block_id, num_tabs=num_tabs + 1
+                        )
+                        cur_result_text_arr.append(children_text)
+
+                    cur_result_text = "\n".join(cur_result_text_arr)
+                    if result_type in HEADING_TYPE:
+                        result_lines_arr.append(cur_result_text)
+                    else:
+                        result_lines_arr.append(f'{heading}\n{cur_result_text}')
+
+            if data["next_cursor"] is None:
+                done = True
+                break
+            else:
+                cur_block_id = data["next_cursor"]
+
+        result_lines = "\n".join(result_lines_arr)
+        return result_lines
+
+    def _read_table_rows(self, block_id: str) -> str:
+        """Read table rows."""
+        done = False
+        result_lines_arr = []
+        cur_block_id = block_id
+        while not done:
+            block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
+            query_dict: Dict[str, Any] = {}
+
+            res = requests.request(
+                "GET", block_url, headers=self.headers, json=query_dict
+            )
+            data = res.json()
+            # get the header-row cell texts of the table
+            table_header_cell_texts = []
+            tabel_header_cells = data["results"][0]['table_row']['cells']
+            for tabel_header_cell in tabel_header_cells:
+                if tabel_header_cell:
+                    for table_header_cell_text in tabel_header_cell:
+                        text = table_header_cell_text["text"]["content"]
+                        table_header_cell_texts.append(text)
+            # get each data row's cell texts and format them as "header:value"
+            results = data["results"]
+            for i in range(len(results)-1):
+                column_texts = []
+                tabel_column_cells = data["results"][i+1]['table_row']['cells']
+                for j in range(len(tabel_column_cells)):
+                    if tabel_column_cells[j]:
+                        for table_column_cell_text in tabel_column_cells[j]:
+                            column_text = table_column_cell_text["text"]["content"]
+                            column_texts.append(f'{table_header_cell_texts[j]}:{column_text}')
+
+                cur_result_text = "\n".join(column_texts)
+                result_lines_arr.append(cur_result_text)
+
+            if data["next_cursor"] is None:
+                done = True
+                break
+            else:
+                cur_block_id = data["next_cursor"]
+
+        result_lines = "\n".join(result_lines_arr)
+        return result_lines
+    def _read_parent_blocks(self, block_id: str, num_tabs: int = 0) -> List[str]:
+        """Read a block."""
+        done = False
+        result_lines_arr = []
+        cur_block_id = block_id
+        while not done:
+            block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
+            query_dict: Dict[str, Any] = {}
+
+            res = requests.request(
+                "GET", block_url, headers=self.headers, json=query_dict
+            )
+            data = res.json()
+            # current block's heading
+            heading = ''
+            for result in data["results"]:
+                result_type = result["type"]
+                result_obj = result[result_type]
+                cur_result_text_arr = []
+                if result_type == 'table':
+                    result_block_id = result["id"]
+                    text = self._read_table_rows(result_block_id)
+                    text += "\n\n"
+                    result_lines_arr.append(text)
+                else:
+                    if "rich_text" in result_obj:
+                        for rich_text in result_obj["rich_text"]:
+                            # skip this rich text entry if it has no text object
+                            if "text" in rich_text:
+                                text = rich_text["text"]["content"]
+                                cur_result_text_arr.append(text)
+                                if result_type in HEADING_TYPE:
+                                    heading = text
+
+                    result_block_id = result["id"]
+                    has_children = result["has_children"]
+                    if has_children:
+                        children_text = self._read_block(
+                            result_block_id, num_tabs=num_tabs + 1
+                        )
+                        cur_result_text_arr.append(children_text)
+
+                    cur_result_text = "\n".join(cur_result_text_arr)
+                    cur_result_text += "\n\n"
+                    if result_type in HEADING_TYPE:
+                        result_lines_arr.append(cur_result_text)
+                    else:
+                        result_lines_arr.append(f'{heading}\n{cur_result_text}')
+
+            if data["next_cursor"] is None:
+                done = True
+                break
+            else:
+                cur_block_id = data["next_cursor"]
+        return result_lines_arr
+
+    def read_page(self, page_id: str) -> str:
+        """Read a page."""
+        return self._read_block(page_id)
+
+    def read_page_as_documents(self, page_id: str) -> List[str]:
+        """Read a page as documents."""
+        return self._read_parent_blocks(page_id)
+
+    def query_database_data(
+            self, database_id: str, query_dict: Dict[str, Any] = {}
+    ) -> str:
+        """Get all the pages from a Notion database."""
+        res = requests.post\
+                (
+            DATABASE_URL_TMPL.format(database_id=database_id),
+            headers=self.headers,
+            json=query_dict,
+        )
+        data = res.json()
+        database_content_list = []
+        if 'results' not in data or data["results"] is None:
+            return ""
+        for result in data["results"]:
+            properties = result['properties']
+            data = {}
+            for property_name, property_value in properties.items():
+                type = property_value['type']
+                if type == 'multi_select':
+                    value = []
+                    multi_select_list = property_value[type]
+                    for multi_select in multi_select_list:
+                        value.append(multi_select['name'])
+                elif type == 'rich_text' or type == 'title':
+                    if len(property_value[type]) > 0:
+                        value = property_value[type][0]['plain_text']
+                    else:
+                        value = ''
+                elif type == 'select' or type == 'status':
+                    if property_value[type]:
+                        value = property_value[type]['name']
+                    else:
+                        value = ''
+                else:
+                    value = property_value[type]
+                data[property_name] = value
+            database_content_list.append(json.dumps(data))
+
+        return "\n\n".join(database_content_list)
+
+    def query_database(
+            self, database_id: str, query_dict: Dict[str, Any] = {}
+    ) -> List[str]:
+        """Get all the pages from a Notion database."""
+        res = requests.post\
+                (
+            DATABASE_URL_TMPL.format(database_id=database_id),
+            headers=self.headers,
+            json=query_dict,
+        )
+        data = res.json()
+        page_ids = []
+        for result in data["results"]:
+            page_id = result["id"]
+            page_ids.append(page_id)
+
+        return page_ids
+
+    def search(self, query: str) -> List[str]:
+        """Search Notion page given a text query."""
+        done = False
+        next_cursor: Optional[str] = None
+        page_ids = []
+        while not done:
+            query_dict = {
+                "query": query,
+            }
+            if next_cursor is not None:
+                query_dict["start_cursor"] = next_cursor
+            res = requests.post(SEARCH_URL, headers=self.headers, json=query_dict)
+            data = res.json()
+            for result in data["results"]:
+                page_id = result["id"]
+                page_ids.append(page_id)
+
+            if data["next_cursor"] is None:
+                done = True
+                break
+            else:
+                next_cursor = data["next_cursor"]
+        return page_ids
+
+    def load_data(
+            self, page_ids: List[str] = [], database_id: Optional[str] = None
+    ) -> List[Document]:
+        """Load data from the input directory.
+
+        Args:
+            page_ids (List[str]): List of page ids to load.
+
+        Returns:
+            List[Document]: List of documents.
+
+        """
+        if not page_ids and not database_id:
+            raise ValueError("Must specify either `page_ids` or `database_id`.")
+        docs = []
+        if database_id is not None:
+            # get all the pages in the database
+            page_ids = self.query_database(database_id)
+            for page_id in page_ids:
+                page_text = self.read_page(page_id)
+                docs.append(Document(page_text))
+        else:
+            for page_id in page_ids:
+                page_text = self.read_page(page_id)
+                docs.append(Document(page_text))
+
+        return docs
+
+    def load_data_as_documents(
+            self, page_ids: List[str] = [], database_id: Optional[str] = None
+    ) -> List[Document]:
+        if not page_ids and not database_id:
+            raise ValueError("Must specify either `page_ids` or `database_id`.")
+        docs = []
+        if database_id is not None:
+            # get all the pages in the database
+            page_text = self.query_database_data(database_id)
+            docs.append(Document(page_text))
+        else:
+            for page_id in page_ids:
+                page_text_list = self.read_page_as_documents(page_id)
+                for page_text in page_text_list:
+                    docs.append(Document(page_text))
+
+        return docs
+
+    def get_page_last_edited_time(self, page_id: str) -> str:
+        retrieve_page_url = RETRIEVE_PAGE_URL_TMPL.format(page_id=page_id)
+        query_dict: Dict[str, Any] = {}
+
+        res = requests.request(
+            "GET", retrieve_page_url, headers=self.headers, json=query_dict
+        )
+        data = res.json()
+        return data["last_edited_time"]
+
+    def get_database_last_edited_time(self, database_id: str) -> str:
+        retrieve_page_url = RETRIEVE_DATABASE_URL_TMPL.format(database_id=database_id)
+        query_dict: Dict[str, Any] = {}
+
+        res = requests.request(
+            "GET", retrieve_page_url, headers=self.headers, json=query_dict
+        )
+        data = res.json()
+        return data["last_edited_time"]
+
+
+if __name__ == "__main__":
+    reader = NotionPageReader()
+    logger.info(reader.search("What I"))

+ 193 - 67
api/core/indexing_runner.py

@@ -5,6 +5,8 @@ import tempfile
 import time
 from pathlib import Path
 from typing import Optional, List
+
+from flask_login import current_user
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 from llama_index import SimpleDirectoryReader
@@ -13,6 +15,8 @@ from llama_index.data_structs.node_v2 import DocumentRelationship
 from llama_index.node_parser import SimpleNodeParser, NodeParser
 from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR
 from llama_index.readers.file.markdown_parser import MarkdownParser
+
+from core.data_source.notion import NotionPageReader
 from core.index.readers.xlsx_parser import XLSXParser
 from core.docstore.dataset_docstore import DatesetDocumentStore
 from core.index.keyword_table_index import KeywordTableIndex
@@ -27,6 +31,7 @@ from extensions.ext_redis import redis_client
 from extensions.ext_storage import storage
 from models.dataset import Document, Dataset, DocumentSegment, DatasetProcessRule
 from models.model import UploadFile
+from models.source import DataSourceBinding
 
 
 class IndexingRunner:
@@ -35,42 +40,43 @@ class IndexingRunner:
         self.storage = storage
         self.embedding_model_name = embedding_model_name
 
-    def run(self, document: Document):
+    def run(self, documents: List[Document]):
         """Run the indexing process."""
-        # get dataset
-        dataset = Dataset.query.filter_by(
-            id=document.dataset_id
-        ).first()
-
-        if not dataset:
-            raise ValueError("no dataset found")
-
-        # load file
-        text_docs = self._load_data(document)
-
-        # get the process rule
-        processing_rule = db.session.query(DatasetProcessRule). \
-            filter(DatasetProcessRule.id == document.dataset_process_rule_id). \
-            first()
-
-        # get node parser for splitting
-        node_parser = self._get_node_parser(processing_rule)
-
-        # split to nodes
-        nodes = self._step_split(
-            text_docs=text_docs,
-            node_parser=node_parser,
-            dataset=dataset,
-            document=document,
-            processing_rule=processing_rule
-        )
+        for document in documents:
+            # get dataset
+            dataset = Dataset.query.filter_by(
+                id=document.dataset_id
+            ).first()
+
+            if not dataset:
+                raise ValueError("no dataset found")
+
+            # load file
+            text_docs = self._load_data(document)
+
+            # get the process rule
+            processing_rule = db.session.query(DatasetProcessRule). \
+                filter(DatasetProcessRule.id == document.dataset_process_rule_id). \
+                first()
+
+            # get node parser for splitting
+            node_parser = self._get_node_parser(processing_rule)
+
+            # split to nodes
+            nodes = self._step_split(
+                text_docs=text_docs,
+                node_parser=node_parser,
+                dataset=dataset,
+                document=document,
+                processing_rule=processing_rule
+            )
 
-        # build index
-        self._build_index(
-            dataset=dataset,
-            document=document,
-            nodes=nodes
-        )
+            # build index
+            self._build_index(
+                dataset=dataset,
+                document=document,
+                nodes=nodes
+            )
 
     def run_in_splitting_status(self, document: Document):
         """Run the indexing process when the index_status is splitting."""
@@ -164,38 +170,98 @@ class IndexingRunner:
             nodes=nodes
         )
 
-    def indexing_estimate(self, file_detail: UploadFile, tmp_processing_rule: dict) -> dict:
+    def file_indexing_estimate(self, file_details: List[UploadFile], tmp_processing_rule: dict) -> dict:
         """
         Estimate the indexing for the document.
         """
-        # load data from file
-        text_docs = self._load_data_from_file(file_detail)
+        tokens = 0
+        preview_texts = []
+        total_segments = 0
+        for file_detail in file_details:
+            # load data from file
+            text_docs = self._load_data_from_file(file_detail)
+
+            processing_rule = DatasetProcessRule(
+                mode=tmp_processing_rule["mode"],
+                rules=json.dumps(tmp_processing_rule["rules"])
+            )
 
-        processing_rule = DatasetProcessRule(
-            mode=tmp_processing_rule["mode"],
-            rules=json.dumps(tmp_processing_rule["rules"])
-        )
+            # get node parser for splitting
+            node_parser = self._get_node_parser(processing_rule)
 
-        # get node parser for splitting
-        node_parser = self._get_node_parser(processing_rule)
+            # split to nodes
+            nodes = self._split_to_nodes(
+                text_docs=text_docs,
+                node_parser=node_parser,
+                processing_rule=processing_rule
+            )
+            total_segments += len(nodes)
+            for node in nodes:
+                if len(preview_texts) < 5:
+                    preview_texts.append(node.get_text())
 
-        # split to nodes
-        nodes = self._split_to_nodes(
-            text_docs=text_docs,
-            node_parser=node_parser,
-            processing_rule=processing_rule
-        )
+                tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, node.get_text())
+
+        return {
+            "total_segments": total_segments,
+            "tokens": tokens,
+            "total_price": '{:f}'.format(TokenCalculator.get_token_price(self.embedding_model_name, tokens)),
+            "currency": TokenCalculator.get_currency(self.embedding_model_name),
+            "preview": preview_texts
+        }
 
+    def notion_indexing_estimate(self, notion_info_list: list, tmp_processing_rule: dict) -> dict:
+        """
+        Estimate the indexing for the document.
+        """
+        # load data from notion
         tokens = 0
         preview_texts = []
-        for node in nodes:
-            if len(preview_texts) < 5:
-                preview_texts.append(node.get_text())
-
-            tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, node.get_text())
+        total_segments = 0
+        for notion_info in notion_info_list:
+            workspace_id = notion_info['workspace_id']
+            data_source_binding = DataSourceBinding.query.filter(
+                db.and_(
+                    DataSourceBinding.tenant_id == current_user.current_tenant_id,
+                    DataSourceBinding.provider == 'notion',
+                    DataSourceBinding.disabled == False,
+                    DataSourceBinding.source_info['workspace_id'] == f'"{workspace_id}"'
+                )
+            ).first()
+            if not data_source_binding:
+                raise ValueError('Data source binding not found.')
+            reader = NotionPageReader(integration_token=data_source_binding.access_token)
+            for page in notion_info['pages']:
+                if page['type'] == 'page':
+                    page_ids = [page['page_id']]
+                    documents = reader.load_data_as_documents(page_ids=page_ids)
+                elif page['type'] == 'database':
+                    documents = reader.load_data_as_documents(database_id=page['page_id'])
+                else:
+                    documents = []
+                processing_rule = DatasetProcessRule(
+                    mode=tmp_processing_rule["mode"],
+                    rules=json.dumps(tmp_processing_rule["rules"])
+                )
+
+                # get node parser for splitting
+                node_parser = self._get_node_parser(processing_rule)
+
+                # split to nodes
+                nodes = self._split_to_nodes(
+                    text_docs=documents,
+                    node_parser=node_parser,
+                    processing_rule=processing_rule
+                )
+                total_segments += len(nodes)
+                for node in nodes:
+                    if len(preview_texts) < 5:
+                        preview_texts.append(node.get_text())
+
+                    tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, node.get_text())
 
         return {
-            "total_segments": len(nodes),
+            "total_segments": total_segments,
             "tokens": tokens,
             "total_price": '{:f}'.format(TokenCalculator.get_token_price(self.embedding_model_name, tokens)),
             "currency": TokenCalculator.get_currency(self.embedding_model_name),
@@ -204,25 +270,50 @@ class IndexingRunner:
 
     def _load_data(self, document: Document) -> List[Document]:
         # load file
-        if document.data_source_type != "upload_file":
+        if document.data_source_type not in ["upload_file", "notion_import"]:
             return []
 
         data_source_info = document.data_source_info_dict
-        if not data_source_info or 'upload_file_id' not in data_source_info:
-            raise ValueError("no upload file found")
-
-        file_detail = db.session.query(UploadFile). \
-            filter(UploadFile.id == data_source_info['upload_file_id']). \
-            one_or_none()
-
-        text_docs = self._load_data_from_file(file_detail)
-
+        text_docs = []
+        if document.data_source_type == 'upload_file':
+            if not data_source_info or 'upload_file_id' not in data_source_info:
+                raise ValueError("no upload file found")
+
+            file_detail = db.session.query(UploadFile). \
+                filter(UploadFile.id == data_source_info['upload_file_id']). \
+                one_or_none()
+
+            text_docs = self._load_data_from_file(file_detail)
+        elif document.data_source_type == 'notion_import':
+            if not data_source_info or 'notion_page_id' not in data_source_info \
+                    or 'notion_workspace_id' not in data_source_info:
+                raise ValueError("no notion page found")
+            workspace_id = data_source_info['notion_workspace_id']
+            page_id = data_source_info['notion_page_id']
+            page_type = data_source_info['type']
+            data_source_binding = DataSourceBinding.query.filter(
+                db.and_(
+                    DataSourceBinding.tenant_id == document.tenant_id,
+                    DataSourceBinding.provider == 'notion',
+                    DataSourceBinding.disabled == False,
+                    DataSourceBinding.source_info['workspace_id'] == f'"{workspace_id}"'
+                )
+            ).first()
+            if not data_source_binding:
+                raise ValueError('Data source binding not found.')
+            if page_type == 'page':
+                # add page last_edited_time to data_source_info
+                self._get_notion_page_last_edited_time(page_id, data_source_binding.access_token, document)
+                text_docs = self._load_page_data_from_notion(page_id, data_source_binding.access_token)
+            elif page_type == 'database':
+                # add page last_edited_time to data_source_info
+                self._get_notion_database_last_edited_time(page_id, data_source_binding.access_token, document)
+                text_docs = self._load_database_data_from_notion(page_id, data_source_binding.access_token)
         # update document status to splitting
         self._update_document_index_status(
             document_id=document.id,
             after_indexing_status="splitting",
             extra_update_params={
-                Document.file_id: file_detail.id,
                 Document.word_count: sum([len(text_doc.text) for text_doc in text_docs]),
                 Document.parsing_completed_at: datetime.datetime.utcnow()
             }
@@ -259,6 +350,41 @@ class IndexingRunner:
 
             return text_docs
 
+    def _load_page_data_from_notion(self, page_id: str, access_token: str) -> List[Document]:
+        page_ids = [page_id]
+        reader = NotionPageReader(integration_token=access_token)
+        text_docs = reader.load_data_as_documents(page_ids=page_ids)
+        return text_docs
+
+    def _load_database_data_from_notion(self, database_id: str, access_token: str) -> List[Document]:
+        reader = NotionPageReader(integration_token=access_token)
+        text_docs = reader.load_data_as_documents(database_id=database_id)
+        return text_docs
+
+    def _get_notion_page_last_edited_time(self, page_id: str, access_token: str, document: Document):
+        reader = NotionPageReader(integration_token=access_token)
+        last_edited_time = reader.get_page_last_edited_time(page_id)
+        data_source_info = document.data_source_info_dict
+        data_source_info['last_edited_time'] = last_edited_time
+        update_params = {
+            Document.data_source_info: json.dumps(data_source_info)
+        }
+
+        Document.query.filter_by(id=document.id).update(update_params)
+        db.session.commit()
+
+    def _get_notion_database_last_edited_time(self, page_id: str, access_token: str, document: Document):
+        reader = NotionPageReader(integration_token=access_token)
+        last_edited_time = reader.get_database_last_edited_time(page_id)
+        data_source_info = document.data_source_info_dict
+        data_source_info['last_edited_time'] = last_edited_time
+        update_params = {
+            Document.data_source_info: json.dumps(data_source_info)
+        }
+
+        Document.query.filter_by(id=document.id).update(update_params)
+        db.session.commit()
+
     def _get_node_parser(self, processing_rule: DatasetProcessRule) -> NodeParser:
         """
         Get the NodeParser object according to the processing rule.
@@ -308,7 +434,7 @@ class IndexingRunner:
             embedding_model_name=self.embedding_model_name,
             document_id=document.id
         )
-
+        # add document segments
         doc_store.add_documents(nodes)
 
         # update document status to indexing

+ 7 - 0
api/libs/oauth.py

@@ -1,7 +1,12 @@
+import json
 import urllib.parse
 from dataclasses import dataclass
 
 import requests
+from flask_login import current_user
+
+from extensions.ext_database import db
+from models.source import DataSourceBinding
 
 
 @dataclass
@@ -134,3 +139,5 @@ class GoogleOAuth(OAuth):
             name=None,
             email=raw_info['email']
         )
+
+

+ 256 - 0
api/libs/oauth_data_source.py

@@ -0,0 +1,256 @@
+import json
+import urllib.parse
+
+import requests
+from flask_login import current_user
+
+from extensions.ext_database import db
+from models.source import DataSourceBinding
+
+
+class OAuthDataSource:
+    def __init__(self, client_id: str, client_secret: str, redirect_uri: str):
+        self.client_id = client_id
+        self.client_secret = client_secret
+        self.redirect_uri = redirect_uri
+
+    def get_authorization_url(self):
+        raise NotImplementedError()
+
+    def get_access_token(self, code: str):
+        raise NotImplementedError()
+
+
+class NotionOAuth(OAuthDataSource):
+    _AUTH_URL = 'https://api.notion.com/v1/oauth/authorize'
+    _TOKEN_URL = 'https://api.notion.com/v1/oauth/token'
+    _NOTION_PAGE_SEARCH = "https://api.notion.com/v1/search"
+    _NOTION_BLOCK_SEARCH = "https://api.notion.com/v1/blocks"
+
+    def get_authorization_url(self):
+        params = {
+            'client_id': self.client_id,
+            'response_type': 'code',
+            'redirect_uri': self.redirect_uri,
+            'owner': 'user'
+        }
+        return f"{self._AUTH_URL}?{urllib.parse.urlencode(params)}"
+
+    def get_access_token(self, code: str):
+        data = {
+            'code': code,
+            'grant_type': 'authorization_code',
+            'redirect_uri': self.redirect_uri
+        }
+        headers = {'Accept': 'application/json'}
+        auth = (self.client_id, self.client_secret)
+        response = requests.post(self._TOKEN_URL, data=data, auth=auth, headers=headers)
+
+        response_json = response.json()
+        access_token = response_json.get('access_token')
+        if not access_token:
+            raise ValueError(f"Error in Notion OAuth: {response_json}")
+        workspace_name = response_json.get('workspace_name')
+        workspace_icon = response_json.get('workspace_icon')
+        workspace_id = response_json.get('workspace_id')
+        # get all authorized pages
+        pages = self.get_authorized_pages(access_token)
+        source_info = {
+            'workspace_name': workspace_name,
+            'workspace_icon': workspace_icon,
+            'workspace_id': workspace_id,
+            'pages': pages,
+            'total': len(pages)
+        }
+        # save data source binding
+        data_source_binding = DataSourceBinding.query.filter(
+            db.and_(
+                DataSourceBinding.tenant_id == current_user.current_tenant_id,
+                DataSourceBinding.provider == 'notion',
+                DataSourceBinding.access_token == access_token
+            )
+        ).first()
+        if data_source_binding:
+            data_source_binding.source_info = source_info
+            data_source_binding.disabled = False
+            db.session.commit()
+        else:
+            new_data_source_binding = DataSourceBinding(
+                tenant_id=current_user.current_tenant_id,
+                access_token=access_token,
+                source_info=source_info,
+                provider='notion'
+            )
+            db.session.add(new_data_source_binding)
+            db.session.commit()
+
+    def sync_data_source(self, binding_id: str):
+        # save data source binding
+        data_source_binding = DataSourceBinding.query.filter(
+            db.and_(
+                DataSourceBinding.tenant_id == current_user.current_tenant_id,
+                DataSourceBinding.provider == 'notion',
+                DataSourceBinding.id == binding_id,
+                DataSourceBinding.disabled == False
+            )
+        ).first()
+        if data_source_binding:
+            # get all authorized pages
+            pages = self.get_authorized_pages(data_source_binding.access_token)
+            source_info = data_source_binding.source_info
+            new_source_info = {
+                'workspace_name': source_info['workspace_name'],
+                'workspace_icon': source_info['workspace_icon'],
+                'workspace_id': source_info['workspace_id'],
+                'pages': pages,
+                'total': len(pages)
+            }
+            data_source_binding.source_info = new_source_info
+            data_source_binding.disabled = False
+            db.session.commit()
+        else:
+            raise ValueError('Data source binding not found')
+
+    def get_authorized_pages(self, access_token: str):
+        pages = []
+        page_results = self.notion_page_search(access_token)
+        database_results = self.notion_database_search(access_token)
+        # get page detail
+        for page_result in page_results:
+            page_id = page_result['id']
+            if 'Name' in page_result['properties']:
+                if len(page_result['properties']['Name']['title']) > 0:
+                    page_name = page_result['properties']['Name']['title'][0]['plain_text']
+                else:
+                    page_name = 'Untitled'
+            elif 'title' in page_result['properties']:
+                if len(page_result['properties']['title']['title']) > 0:
+                    page_name = page_result['properties']['title']['title'][0]['plain_text']
+                else:
+                    page_name = 'Untitled'
+            elif 'Title' in page_result['properties']:
+                if len(page_result['properties']['Title']['title']) > 0:
+                    page_name = page_result['properties']['Title']['title'][0]['plain_text']
+                else:
+                    page_name = 'Untitled'
+            else:
+                page_name = 'Untitled'
+            page_icon = page_result['icon']
+            if page_icon:
+                icon_type = page_icon['type']
+                if icon_type == 'external' or icon_type == 'file':
+                    url = page_icon[icon_type]['url']
+                    icon = {
+                        'type': 'url',
+                        'url': url if url.startswith('http') else f'https://www.notion.so{url}'
+                    }
+                else:
+                    icon = {
+                        'type': 'emoji',
+                        'emoji': page_icon[icon_type]
+                    }
+            else:
+                icon = None
+            parent = page_result['parent']
+            parent_type = parent['type']
+            if parent_type == 'block_id':
+                parent_id = self.notion_block_parent_page_id(access_token, parent[parent_type])
+            elif parent_type == 'workspace':
+                parent_id = 'root'
+            else:
+                parent_id = parent[parent_type]
+            page = {
+                'page_id': page_id,
+                'page_name': page_name,
+                'page_icon': icon,
+                'parent_id': parent_id,
+                'type': 'page'
+            }
+            pages.append(page)
+            # get database detail
+        for database_result in database_results:
+            page_id = database_result['id']
+            if len(database_result['title']) > 0:
+                page_name = database_result['title'][0]['plain_text']
+            else:
+                page_name = 'Untitled'
+            page_icon = database_result['icon']
+            if page_icon:
+                icon_type = page_icon['type']
+                if icon_type == 'external' or icon_type == 'file':
+                    url = page_icon[icon_type]['url']
+                    icon = {
+                        'type': 'url',
+                        'url': url if url.startswith('http') else f'https://www.notion.so{url}'
+                    }
+                else:
+                    icon = {
+                        'type': icon_type,
+                        icon_type: page_icon[icon_type]
+                    }
+            else:
+                icon = None
+            parent = database_result['parent']
+            parent_type = parent['type']
+            if parent_type == 'block_id':
+                parent_id = self.notion_block_parent_page_id(access_token, parent[parent_type])
+            elif parent_type == 'workspace':
+                parent_id = 'root'
+            else:
+                parent_id = parent[parent_type]
+            page = {
+                'page_id': page_id,
+                'page_name': page_name,
+                'page_icon': icon,
+                'parent_id': parent_id,
+                'type': 'database'
+            }
+            pages.append(page)
+        return pages
+
+    def notion_page_search(self, access_token: str):
+        data = {
+            'filter': {
+                "value": "page",
+                "property": "object"
+            }
+        }
+        headers = {
+            'Content-Type': 'application/json',
+            'Authorization': f"Bearer {access_token}",
+            'Notion-Version': '2022-06-28',
+        }
+        response = requests.post(url=self._NOTION_PAGE_SEARCH, json=data, headers=headers)
+        response_json = response.json()
+        results = response_json['results']
+        return results
+
+    def notion_block_parent_page_id(self, access_token: str, block_id: str):
+        headers = {
+            'Authorization': f"Bearer {access_token}",
+            'Notion-Version': '2022-06-28',
+        }
+        response = requests.get(url=f'{self._NOTION_BLOCK_SEARCH}/{block_id}', headers=headers)
+        response_json = response.json()
+        parent = response_json['parent']
+        parent_type = parent['type']
+        if parent_type == 'block_id':
+            return self.notion_block_parent_page_id(access_token, parent[parent_type])
+        return parent[parent_type]
+
+    def notion_database_search(self, access_token: str):
+        data = {
+            'filter': {
+                "value": "database",
+                "property": "object"
+            }
+        }
+        headers = {
+            'Content-Type': 'application/json',
+            'Authorization': f"Bearer {access_token}",
+            'Notion-Version': '2022-06-28',
+        }
+        response = requests.post(url=self._NOTION_PAGE_SEARCH, json=data, headers=headers)
+        response_json = response.json()
+        results = response_json['results']
+        return results

+ 46 - 0
api/migrations/versions/e32f6ccb87c6_e08af0a69ccefbb59fa80c778efee300bb780980.py

@@ -0,0 +1,46 @@
+"""e08af0a69ccefbb59fa80c778efee300bb780980
+
+Revision ID: e32f6ccb87c6
+Revises: a45f4dfde53b
+Create Date: 2023-06-06 19:58:33.103819
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = 'e32f6ccb87c6'
+down_revision = '614f77cecc48'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('data_source_bindings',
+    sa.Column('id', postgresql.UUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
+    sa.Column('tenant_id', postgresql.UUID(), nullable=False),
+    sa.Column('access_token', sa.String(length=255), nullable=False),
+    sa.Column('provider', sa.String(length=255), nullable=False),
+    sa.Column('source_info', postgresql.JSONB(astext_type=sa.Text()), nullable=False),
+    sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
+    sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
+    sa.Column('disabled', sa.Boolean(), server_default=sa.text('false'), nullable=True),
+    sa.PrimaryKeyConstraint('id', name='source_binding_pkey')
+    )
+    with op.batch_alter_table('data_source_bindings', schema=None) as batch_op:
+        batch_op.create_index('source_binding_tenant_id_idx', ['tenant_id'], unique=False)
+        batch_op.create_index('source_info_idx', ['source_info'], unique=False, postgresql_using='gin')
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('data_source_bindings', schema=None) as batch_op:
+        batch_op.drop_index('source_info_idx', postgresql_using='gin')
+        batch_op.drop_index('source_binding_tenant_id_idx')
+
+    op.drop_table('data_source_bindings')
+    # ### end Alembic commands ###

+ 3 - 1
api/models/dataset.py

@@ -190,7 +190,7 @@ class Document(db.Model):
     doc_type = db.Column(db.String(40), nullable=True)
     doc_metadata = db.Column(db.JSON, nullable=True)
 
-    DATA_SOURCES = ['upload_file']
+    DATA_SOURCES = ['upload_file', 'notion_import']
 
     @property
     def display_status(self):
@@ -242,6 +242,8 @@ class Document(db.Model):
                             'created_at': file_detail.created_at.timestamp()
                         }
                     }
+            elif self.data_source_type == 'notion_import':
+                return json.loads(self.data_source_info)
         return {}
 
     @property

+ 21 - 0
api/models/source.py

@@ -0,0 +1,21 @@
+from sqlalchemy.dialects.postgresql import UUID
+
+from extensions.ext_database import db
+from sqlalchemy.dialects.postgresql import JSONB
+
+class DataSourceBinding(db.Model):
+    __tablename__ = 'data_source_bindings'
+    __table_args__ = (
+        db.PrimaryKeyConstraint('id', name='source_binding_pkey'),
+        db.Index('source_binding_tenant_id_idx', 'tenant_id'),
+        db.Index('source_info_idx', "source_info", postgresql_using='gin')
+    )
+
+    id = db.Column(UUID, server_default=db.text('uuid_generate_v4()'))
+    tenant_id = db.Column(UUID, nullable=False)
+    access_token = db.Column(db.String(255), nullable=False)
+    provider = db.Column(db.String(255), nullable=False)
+    source_info = db.Column(JSONB, nullable=False)
+    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))
+    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))
+    disabled = db.Column(db.Boolean, nullable=True, server_default=db.text('false'))

+ 255 - 62
api/services/dataset_service.py

@@ -3,7 +3,7 @@ import logging
 import datetime
 import time
 import random
-from typing import Optional
+from typing import Optional, List
 from extensions.ext_redis import redis_client
 from flask_login import current_user
 
@@ -14,10 +14,12 @@ from extensions.ext_database import db
 from models.account import Account
 from models.dataset import Dataset, Document, DatasetQuery, DatasetProcessRule, AppDatasetJoin, DocumentSegment
 from models.model import UploadFile
+from models.source import DataSourceBinding
 from services.errors.account import NoPermissionError
 from services.errors.dataset import DatasetNameDuplicateError
 from services.errors.document import DocumentIndexingError
 from services.errors.file import FileNotExistsError
+from tasks.clean_notion_document_task import clean_notion_document_task
 from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
 from tasks.document_indexing_task import document_indexing_task
 from tasks.document_indexing_update_task import document_indexing_update_task
@@ -286,6 +288,24 @@ class DocumentService:
         return document
 
     @staticmethod
+    def get_document_by_dataset_id(dataset_id: str) -> List[Document]:
+        documents = db.session.query(Document).filter(
+            Document.dataset_id == dataset_id,
+            Document.enabled == True
+        ).all()
+
+        return documents
+
+    @staticmethod
+    def get_batch_documents(dataset_id: str, batch: str) -> List[Document]:
+        documents = db.session.query(Document).filter(
+            Document.batch == batch,
+            Document.dataset_id == dataset_id,
+            Document.tenant_id == current_user.current_tenant_id
+        ).all()
+
+        return documents
+    @staticmethod
     def get_document_file_detail(file_id: str):
         file_detail = db.session.query(UploadFile). \
             filter(UploadFile.id == file_id). \
@@ -344,9 +364,9 @@ class DocumentService:
 
     @staticmethod
     def get_documents_position(dataset_id):
-        documents = Document.query.filter_by(dataset_id=dataset_id).all()
-        if documents:
-            return len(documents) + 1
+        document = Document.query.filter_by(dataset_id=dataset_id).order_by(Document.position.desc()).first()
+        if document:
+            return document.position + 1
         else:
             return 1
 
@@ -363,9 +383,11 @@ class DocumentService:
 
         if dataset.indexing_technique == 'high_quality':
             IndexBuilder.get_default_service_context(dataset.tenant_id)
-
+        documents = []
+        batch = time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999))
         if 'original_document_id' in document_data and document_data["original_document_id"]:
             document = DocumentService.update_document_with_dataset_id(dataset, document_data, account)
+            documents.append(document)
         else:
             # save process rule
             if not dataset_process_rule:
@@ -386,46 +408,114 @@ class DocumentService:
                     )
                 db.session.add(dataset_process_rule)
                 db.session.commit()
-
-            file_name = ''
-            data_source_info = {}
-            if document_data["data_source"]["type"] == "upload_file":
-                file_id = document_data["data_source"]["info"]
-                file = db.session.query(UploadFile).filter(
-                    UploadFile.tenant_id == dataset.tenant_id,
-                    UploadFile.id == file_id
-                ).first()
-
-                # raise error if file not found
-                if not file:
-                    raise FileNotExistsError()
-
-                file_name = file.name
-                data_source_info = {
-                    "upload_file_id": file_id,
-                }
-
-            # save document
             position = DocumentService.get_documents_position(dataset.id)
-            document = Document(
-                tenant_id=dataset.tenant_id,
-                dataset_id=dataset.id,
-                position=position,
-                data_source_type=document_data["data_source"]["type"],
-                data_source_info=json.dumps(data_source_info),
-                dataset_process_rule_id=dataset_process_rule.id,
-                batch=time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999)),
-                name=file_name,
-                created_from=created_from,
-                created_by=account.id,
-                # created_api_request_id = db.Column(UUID, nullable=True)
-            )
-
-            db.session.add(document)
+            document_ids = []
+            if document_data["data_source"]["type"] == "upload_file":
+                upload_file_list = document_data["data_source"]["info_list"]['file_info_list']['file_ids']
+                for file_id in upload_file_list:
+                    file = db.session.query(UploadFile).filter(
+                        UploadFile.tenant_id == dataset.tenant_id,
+                        UploadFile.id == file_id
+                    ).first()
+
+                    # raise error if file not found
+                    if not file:
+                        raise FileNotExistsError()
+
+                    file_name = file.name
+                    data_source_info = {
+                        "upload_file_id": file_id,
+                    }
+                    document = DocumentService.save_document(dataset, dataset_process_rule.id,
+                                                             document_data["data_source"]["type"],
+                                                             data_source_info, created_from, position,
+                                                             account, file_name, batch)
+                    db.session.add(document)
+                    db.session.flush()
+                    document_ids.append(document.id)
+                    documents.append(document)
+                    position += 1
+            elif document_data["data_source"]["type"] == "notion_import":
+                notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
+                exist_page_ids = []
+                exist_document = dict()
+                documents = Document.query.filter_by(
+                    dataset_id=dataset.id,
+                    tenant_id=current_user.current_tenant_id,
+                    data_source_type='notion_import',
+                    enabled=True
+                ).all()
+                if documents:
+                    for document in documents:
+                        data_source_info = json.loads(document.data_source_info)
+                        exist_page_ids.append(data_source_info['notion_page_id'])
+                        exist_document[data_source_info['notion_page_id']] = document.id
+                for notion_info in notion_info_list:
+                    workspace_id = notion_info['workspace_id']
+                    data_source_binding = DataSourceBinding.query.filter(
+                        db.and_(
+                            DataSourceBinding.tenant_id == current_user.current_tenant_id,
+                            DataSourceBinding.provider == 'notion',
+                            DataSourceBinding.disabled == False,
+                            DataSourceBinding.source_info['workspace_id'] == f'"{workspace_id}"'
+                        )
+                    ).first()
+                    if not data_source_binding:
+                        raise ValueError('Data source binding not found.')
+                    for page in notion_info['pages']:
+                        if page['page_id'] not in exist_page_ids:
+                            data_source_info = {
+                                "notion_workspace_id": workspace_id,
+                                "notion_page_id": page['page_id'],
+                                "notion_page_icon": page['page_icon'],
+                                "type": page['type']
+                            }
+                            document = DocumentService.save_document(dataset, dataset_process_rule.id,
+                                                                     document_data["data_source"]["type"],
+                                                                     data_source_info, created_from, position,
+                                                                     account, page['page_name'], batch)
+                            # if page['type'] == 'database':
+                            #     document.splitting_completed_at = datetime.datetime.utcnow()
+                            #     document.cleaning_completed_at = datetime.datetime.utcnow()
+                            #     document.parsing_completed_at = datetime.datetime.utcnow()
+                            #     document.completed_at = datetime.datetime.utcnow()
+                            #     document.indexing_status = 'completed'
+                            #     document.word_count = 0
+                            #     document.tokens = 0
+                            #     document.indexing_latency = 0
+                            db.session.add(document)
+                            db.session.flush()
+                            # if page['type'] != 'database':
+                            document_ids.append(document.id)
+                            documents.append(document)
+                            position += 1
+                        else:
+                            exist_document.pop(page['page_id'])
+                # delete not selected documents
+                if len(exist_document) > 0:
+                    clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
             db.session.commit()
 
             # trigger async task
-            document_indexing_task.delay(document.dataset_id, document.id)
+            document_indexing_task.delay(dataset.id, document_ids)
+
+        return documents, batch
+
+    @staticmethod
+    def save_document(dataset: Dataset, process_rule_id: str, data_source_type: str, data_source_info: dict,
+                      created_from: str, position: int, account: Account, name: str, batch: str):
+        document = Document(
+            tenant_id=dataset.tenant_id,
+            dataset_id=dataset.id,
+            position=position,
+            data_source_type=data_source_type,
+            data_source_info=json.dumps(data_source_info),
+            dataset_process_rule_id=process_rule_id,
+            batch=batch,
+            name=name,
+            created_from=created_from,
+            created_by=account.id,
+        )
         return document
 
     @staticmethod
@@ -460,20 +550,42 @@ class DocumentService:
             file_name = ''
             data_source_info = {}
             if document_data["data_source"]["type"] == "upload_file":
-                file_id = document_data["data_source"]["info"]
-                file = db.session.query(UploadFile).filter(
-                    UploadFile.tenant_id == dataset.tenant_id,
-                    UploadFile.id == file_id
-                ).first()
-
-                # raise error if file not found
-                if not file:
-                    raise FileNotExistsError()
-
-                file_name = file.name
-                data_source_info = {
-                    "upload_file_id": file_id,
-                }
+                upload_file_list = document_data["data_source"]["info_list"]['file_info_list']['file_ids']
+                for file_id in upload_file_list:
+                    file = db.session.query(UploadFile).filter(
+                        UploadFile.tenant_id == dataset.tenant_id,
+                        UploadFile.id == file_id
+                    ).first()
+
+                    # raise error if file not found
+                    if not file:
+                        raise FileNotExistsError()
+
+                    file_name = file.name
+                    data_source_info = {
+                        "upload_file_id": file_id,
+                    }
+            elif document_data["data_source"]["type"] == "notion_import":
+                notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
+                for notion_info in notion_info_list:
+                    workspace_id = notion_info['workspace_id']
+                    data_source_binding = DataSourceBinding.query.filter(
+                        db.and_(
+                            DataSourceBinding.tenant_id == current_user.current_tenant_id,
+                            DataSourceBinding.provider == 'notion',
+                            DataSourceBinding.disabled == False,
+                            DataSourceBinding.source_info['workspace_id'] == f'"{workspace_id}"'
+                        )
+                    ).first()
+                    if not data_source_binding:
+                        raise ValueError('Data source binding not found.')
+                    for page in notion_info['pages']:
+                        data_source_info = {
+                            "notion_workspace_id": workspace_id,
+                            "notion_page_id": page['page_id'],
+                            "notion_page_icon": page['page_icon'],
+                            "type": page['type']
+                        }
             document.data_source_type = document_data["data_source"]["type"]
             document.data_source_info = json.dumps(data_source_info)
             document.name = file_name
@@ -513,15 +625,15 @@ class DocumentService:
         db.session.add(dataset)
         db.session.flush()
 
-        document = DocumentService.save_document_with_dataset_id(dataset, document_data, account)
+        documents, batch = DocumentService.save_document_with_dataset_id(dataset, document_data, account)
 
         cut_length = 18
-        cut_name = document.name[:cut_length]
-        dataset.name = cut_name + '...' if len(document.name) > cut_length else cut_name
-        dataset.description = 'useful for when you want to answer queries about the ' + document.name
+        cut_name = documents[0].name[:cut_length]
+        dataset.name = cut_name + '...'
+        dataset.description = 'useful for when you want to answer queries about the ' + documents[0].name
         db.session.commit()
 
-        return dataset, document
+        return dataset, documents, batch
 
     @classmethod
     def document_create_args_validate(cls, args: dict):
@@ -552,9 +664,15 @@ class DocumentService:
         if args['data_source']['type'] not in Document.DATA_SOURCES:
             raise ValueError("Data source type is invalid")
 
+        if 'info_list' not in args['data_source'] or not args['data_source']['info_list']:
+            raise ValueError("Data source info is required")
+
         if args['data_source']['type'] == 'upload_file':
-            if 'info' not in args['data_source'] or not args['data_source']['info']:
-                raise ValueError("Data source info is required")
+            if 'file_info_list' not in args['data_source']['info_list'] or not args['data_source']['info_list']['file_info_list']:
+                raise ValueError("File source info is required")
+        if args['data_source']['type'] == 'notion_import':
+            if 'notion_info_list' not in args['data_source']['info_list'] or not args['data_source']['info_list']['notion_info_list']:
+                raise ValueError("Notion source info is required")
 
     @classmethod
     def process_rule_args_validate(cls, args: dict):
@@ -624,3 +742,78 @@ class DocumentService:
 
             if not isinstance(args['process_rule']['rules']['segmentation']['max_tokens'], int):
                 raise ValueError("Process rule segmentation max_tokens is invalid")
+
+    @classmethod
+    def estimate_args_validate(cls, args: dict):
+        if 'info_list' not in args or not args['info_list']:
+            raise ValueError("Data source info is required")
+
+        if not isinstance(args['info_list'], dict):
+            raise ValueError("Data info is invalid")
+
+        if 'process_rule' not in args or not args['process_rule']:
+            raise ValueError("Process rule is required")
+
+        if not isinstance(args['process_rule'], dict):
+            raise ValueError("Process rule is invalid")
+
+        if 'mode' not in args['process_rule'] or not args['process_rule']['mode']:
+            raise ValueError("Process rule mode is required")
+
+        if args['process_rule']['mode'] not in DatasetProcessRule.MODES:
+            raise ValueError("Process rule mode is invalid")
+
+        if args['process_rule']['mode'] == 'automatic':
+            args['process_rule']['rules'] = {}
+        else:
+            if 'rules' not in args['process_rule'] or not args['process_rule']['rules']:
+                raise ValueError("Process rule rules is required")
+
+            if not isinstance(args['process_rule']['rules'], dict):
+                raise ValueError("Process rule rules is invalid")
+
+            if 'pre_processing_rules' not in args['process_rule']['rules'] \
+                    or args['process_rule']['rules']['pre_processing_rules'] is None:
+                raise ValueError("Process rule pre_processing_rules is required")
+
+            if not isinstance(args['process_rule']['rules']['pre_processing_rules'], list):
+                raise ValueError("Process rule pre_processing_rules is invalid")
+
+            unique_pre_processing_rule_dicts = {}
+            for pre_processing_rule in args['process_rule']['rules']['pre_processing_rules']:
+                if 'id' not in pre_processing_rule or not pre_processing_rule['id']:
+                    raise ValueError("Process rule pre_processing_rules id is required")
+
+                if pre_processing_rule['id'] not in DatasetProcessRule.PRE_PROCESSING_RULES:
+                    raise ValueError("Process rule pre_processing_rules id is invalid")
+
+                if 'enabled' not in pre_processing_rule or pre_processing_rule['enabled'] is None:
+                    raise ValueError("Process rule pre_processing_rules enabled is required")
+
+                if not isinstance(pre_processing_rule['enabled'], bool):
+                    raise ValueError("Process rule pre_processing_rules enabled is invalid")
+
+                unique_pre_processing_rule_dicts[pre_processing_rule['id']] = pre_processing_rule
+
+            args['process_rule']['rules']['pre_processing_rules'] = list(unique_pre_processing_rule_dicts.values())
+
+            if 'segmentation' not in args['process_rule']['rules'] \
+                    or args['process_rule']['rules']['segmentation'] is None:
+                raise ValueError("Process rule segmentation is required")
+
+            if not isinstance(args['process_rule']['rules']['segmentation'], dict):
+                raise ValueError("Process rule segmentation is invalid")
+
+            if 'separator' not in args['process_rule']['rules']['segmentation'] \
+                    or not args['process_rule']['rules']['segmentation']['separator']:
+                raise ValueError("Process rule segmentation separator is required")
+
+            if not isinstance(args['process_rule']['rules']['segmentation']['separator'], str):
+                raise ValueError("Process rule segmentation separator is invalid")
+
+            if 'max_tokens' not in args['process_rule']['rules']['segmentation'] \
+                    or not args['process_rule']['rules']['segmentation']['max_tokens']:
+                raise ValueError("Process rule segmentation max_tokens is required")
+
+            if not isinstance(args['process_rule']['rules']['segmentation']['max_tokens'], int):
+                raise ValueError("Process rule segmentation max_tokens is invalid")

+ 58 - 0
api/tasks/clean_notion_document_task.py

@@ -0,0 +1,58 @@
+import logging
+import time
+from typing import List
+
+import click
+from celery import shared_task
+
+from core.index.keyword_table_index import KeywordTableIndex
+from core.index.vector_index import VectorIndex
+from extensions.ext_database import db
+from models.dataset import DocumentSegment, Dataset, Document
+
+
@shared_task
def clean_notion_document_task(document_ids: List[str], dataset_id: str):
    """
    Clean documents (rows, segments and vector/keyword index entries) when
    pages imported from Notion are deselected and their documents deleted.
    :param document_ids: document ids
    :param dataset_id: dataset id

    Usage: clean_notion_document_task.delay(document_ids, dataset_id)
    """
    logging.info(click.style('Start clean document when import from notion document deleted: {}'.format(dataset_id), fg='green'))
    start_at = time.perf_counter()

    try:
        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()

        if not dataset:
            raise Exception('Document has no dataset')

        vector_index = VectorIndex(dataset=dataset)
        keyword_table_index = KeywordTableIndex(dataset=dataset)
        for document_id in document_ids:
            document = db.session.query(Document).filter(
                Document.id == document_id
            ).first()
            # Skip ids whose row is already gone; deleting None would error out
            # and abort the whole cleanup batch.
            if not document:
                continue
            db.session.delete(document)

            segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
            index_node_ids = [segment.index_node_id for segment in segments]

            # delete from vector index
            vector_index.del_nodes(index_node_ids)

            # delete from keyword index
            if index_node_ids:
                keyword_table_index.del_nodes(index_node_ids)

            for segment in segments:
                db.session.delete(segment)
        db.session.commit()
        end_at = time.perf_counter()
        logging.info(
            click.style('Clean document when import from notion document deleted end :: {} latency: {}'.format(
                dataset_id, end_at - start_at),
                        fg='green'))
    except Exception:
        logging.exception("Cleaned document when import from notion document deleted failed")

+ 109 - 0
api/tasks/document_indexing_sync_task.py

@@ -0,0 +1,109 @@
+import datetime
+import logging
+import time
+
+import click
+from celery import shared_task
+from werkzeug.exceptions import NotFound
+
+from core.data_source.notion import NotionPageReader
+from core.index.keyword_table_index import KeywordTableIndex
+from core.index.vector_index import VectorIndex
+from core.indexing_runner import IndexingRunner, DocumentIsPausedException
+from core.llm.error import ProviderTokenNotInitError
+from extensions.ext_database import db
+from models.dataset import Document, Dataset, DocumentSegment
+from models.source import DataSourceBinding
+
+
@shared_task
def document_indexing_sync_task(dataset_id: str, document_id: str):
    """
    Async re-sync of a single document against its external source (currently
    Notion): if the source page changed since the last import, the old
    segments/indexes are purged and the document is re-indexed.
    :param dataset_id:
    :param document_id:

    Usage: document_indexing_sync_task.delay(dataset_id, document_id)
    """
    logging.info(click.style('Start sync document: {}'.format(document_id), fg='green'))
    start_at = time.perf_counter()

    document = db.session.query(Document).filter(
        Document.id == document_id,
        Document.dataset_id == dataset_id
    ).first()

    if not document:
        raise NotFound('Document not found')

    data_source_info = document.data_source_info_dict
    if document.data_source_type == 'notion_import':
        # 'last_edited_time' is read below as well, so it must be validated
        # here too (previously unchecked, which could raise a KeyError).
        if not data_source_info or 'notion_page_id' not in data_source_info \
                or 'notion_workspace_id' not in data_source_info \
                or 'last_edited_time' not in data_source_info:
            raise ValueError("no notion page found")
        workspace_id = data_source_info['notion_workspace_id']
        page_id = data_source_info['notion_page_id']
        page_edited_time = data_source_info['last_edited_time']
        data_source_binding = DataSourceBinding.query.filter(
            db.and_(
                DataSourceBinding.tenant_id == document.tenant_id,
                DataSourceBinding.provider == 'notion',
                DataSourceBinding.disabled == False,
                DataSourceBinding.source_info['workspace_id'] == f'"{workspace_id}"'
            )
        ).first()
        if not data_source_binding:
            raise ValueError('Data source binding not found.')
        reader = NotionPageReader(integration_token=data_source_binding.access_token)
        last_edited_time = reader.get_page_last_edited_time(page_id)
        # check the page is updated
        if last_edited_time != page_edited_time:
            document.indexing_status = 'parsing'
            document.processing_started_at = datetime.datetime.utcnow()
            db.session.commit()

            # delete all document segment and index
            try:
                dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
                if not dataset:
                    raise Exception('Dataset not found')

                vector_index = VectorIndex(dataset=dataset)
                keyword_table_index = KeywordTableIndex(dataset=dataset)

                segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
                index_node_ids = [segment.index_node_id for segment in segments]

                # delete from vector index
                vector_index.del_nodes(index_node_ids)

                # delete from keyword index
                if index_node_ids:
                    keyword_table_index.del_nodes(index_node_ids)

                for segment in segments:
                    db.session.delete(segment)

                end_at = time.perf_counter()
                logging.info(
                    click.style('Cleaned document when document update data source or process rule: {} latency: {}'.format(document_id, end_at - start_at), fg='green'))
            except Exception:
                logging.exception("Cleaned document when document update data source or process rule failed")
            try:
                indexing_runner = IndexingRunner()
                indexing_runner.run([document])
                end_at = time.perf_counter()
                logging.info(click.style('update document: {} latency: {}'.format(document.id, end_at - start_at), fg='green'))
            except DocumentIsPausedException:
                logging.info(click.style('Document update paused, document id: {}'.format(document.id), fg='yellow'))
            except ProviderTokenNotInitError as e:
                document.indexing_status = 'error'
                document.error = str(e.description)
                document.stopped_at = datetime.datetime.utcnow()
                db.session.commit()
            except Exception as e:
                logging.exception("consume update document failed")
                document.indexing_status = 'error'
                document.error = str(e)
                document.stopped_at = datetime.datetime.utcnow()
                db.session.commit()

+ 20 - 16
api/tasks/document_indexing_task.py

@@ -13,32 +13,36 @@ from models.dataset import Document
 
 
 @shared_task
-def document_indexing_task(dataset_id: str, document_id: str):
+def document_indexing_task(dataset_id: str, document_ids: list):
     """
     Async process document
     :param dataset_id:
-    :param document_id:
+    :param document_ids:
 
    Usage: document_indexing_task.delay(dataset_id, document_ids)
     """
-    logging.info(click.style('Start process document: {}'.format(document_id), fg='green'))
-    start_at = time.perf_counter()
-
-    document = db.session.query(Document).filter(
-        Document.id == document_id,
-        Document.dataset_id == dataset_id
-    ).first()
-
-    if not document:
-        raise NotFound('Document not found')
-
-    document.indexing_status = 'parsing'
-    document.processing_started_at = datetime.datetime.utcnow()
+    documents = []
+    for document_id in document_ids:
+        logging.info(click.style('Start process document: {}'.format(document_id), fg='green'))
+        start_at = time.perf_counter()
+
+        document = db.session.query(Document).filter(
+            Document.id == document_id,
+            Document.dataset_id == dataset_id
+        ).first()
+
+        if not document:
+            raise NotFound('Document not found')
+
+        document.indexing_status = 'parsing'
+        document.processing_started_at = datetime.datetime.utcnow()
+        documents.append(document)
+        db.session.add(document)
     db.session.commit()
 
     try:
         indexing_runner = IndexingRunner()
-        indexing_runner.run(document)
+        indexing_runner.run(documents)
         end_at = time.perf_counter()
         logging.info(click.style('Processed document: {} latency: {}'.format(document.id, end_at - start_at), fg='green'))
     except DocumentIsPausedException:

+ 1 - 1
api/tasks/document_indexing_update_task.py

@@ -67,7 +67,7 @@ def document_indexing_update_task(dataset_id: str, document_id: str):
         logging.exception("Cleaned document when document update data source or process rule failed")
     try:
         indexing_runner = IndexingRunner()
-        indexing_runner.run(document)
+        indexing_runner.run([document])
         end_at = time.perf_counter()
         logging.info(click.style('update document: {} latency: {}'.format(document.id, end_at - start_at), fg='green'))
     except DocumentIsPausedException:

+ 1 - 1
api/tasks/recover_document_indexing_task.py

@@ -34,7 +34,7 @@ def recover_document_indexing_task(dataset_id: str, document_id: str):
     try:
         indexing_runner = IndexingRunner()
         if document.indexing_status in ["waiting", "parsing", "cleaning"]:
-            indexing_runner.run(document)
+            indexing_runner.run([document])
         elif document.indexing_status == "splitting":
             indexing_runner.run_in_splitting_status(document)
         elif document.indexing_status == "indexing":

+ 2 - 1
web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/layout.tsx

@@ -28,6 +28,7 @@ import Indicator from '@/app/components/header/indicator'
 import AppIcon from '@/app/components/base/app-icon'
 import Loading from '@/app/components/base/loading'
 import DatasetDetailContext from '@/context/dataset-detail'
+import { DataSourceType } from '@/models/datasets'
 
 // import { fetchDatasetDetail } from '@/service/datasets'
 
@@ -162,7 +163,7 @@ const DatasetDetailLayout: FC<IAppDetailLayoutProps> = (props) => {
         desc={datasetRes?.description || '--'}
         navigation={navigation}
         extraInfo={<ExtraInfo />}
-        iconType='dataset'
+        iconType={datasetRes?.data_source_type === DataSourceType.NOTION ? 'notion' : 'dataset'}
       />}
       <DatasetDetailContext.Provider value={{
         indexingTechnique: datasetRes?.indexing_technique,

+ 15 - 1
web/app/components/app-sidebar/basic.tsx

@@ -14,7 +14,7 @@ export function randomString(length: number) {
 }
 
 export type IAppBasicProps = {
-  iconType?: 'app' | 'api' | 'dataset' | 'webapp'
+  iconType?: 'app' | 'api' | 'dataset' | 'webapp' | 'notion'
   icon?: string
   icon_background?: string
   name: string
@@ -40,11 +40,25 @@ const WebappSvg = <svg width="16" height="18" viewBox="0 0 16 18" fill="none" xm
   <path d="M14.375 5.45825L7.99998 8.99992M7.99998 8.99992L1.62498 5.45825M7.99998 8.99992L8 16.1249M14.75 12.0439V5.95603C14.75 5.69904 14.75 5.57055 14.7121 5.45595C14.6786 5.35457 14.6239 5.26151 14.5515 5.18299C14.4697 5.09424 14.3574 5.03184 14.1328 4.90704L8.58277 1.8237C8.37007 1.70553 8.26372 1.64645 8.15109 1.62329C8.05141 1.60278 7.9486 1.60278 7.84891 1.62329C7.73628 1.64645 7.62993 1.70553 7.41723 1.8237L1.86723 4.90704C1.64259 5.03184 1.53026 5.09424 1.44847 5.18299C1.37612 5.26151 1.32136 5.35457 1.28786 5.45595C1.25 5.57055 1.25 5.69904 1.25 5.95603V12.0439C1.25 12.3008 1.25 12.4293 1.28786 12.5439C1.32136 12.6453 1.37612 12.7384 1.44847 12.8169C1.53026 12.9056 1.64259 12.968 1.86723 13.0928L7.41723 16.1762C7.62993 16.2943 7.73628 16.3534 7.84891 16.3766C7.9486 16.3971 8.05141 16.3971 8.15109 16.3766C8.26372 16.3534 8.37007 16.2943 8.58277 16.1762L14.1328 13.0928C14.3574 12.968 14.4697 12.9056 14.5515 12.8169C14.6239 12.7384 14.6786 12.6453 14.7121 12.5439C14.75 12.4293 14.75 12.3008 14.75 12.0439Z" stroke="#155EEF" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round"/>
 </svg>
 
+const NotionSvg = <svg width="24" height="24" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
+  <g clip-path="url(#clip0_6294_13848)">
+    <path fill-rule="evenodd" clip-rule="evenodd" d="M4.287 21.9133L1.70748 18.6999C1.08685 17.9267 0.75 16.976 0.75 15.9974V4.36124C0.75 2.89548 1.92269 1.67923 3.43553 1.57594L15.3991 0.759137C16.2682 0.699797 17.1321 0.930818 17.8461 1.41353L22.0494 4.25543C22.8018 4.76414 23.25 5.59574 23.25 6.48319V19.7124C23.25 21.1468 22.0969 22.3345 20.6157 22.4256L7.3375 23.243C6.1555 23.3158 5.01299 22.8178 4.287 21.9133Z" fill="white"/>
+    <path d="M8.43607 10.1842V10.0318C8.43607 9.64564 8.74535 9.32537 9.14397 9.29876L12.0475 9.10491L16.0628 15.0178V9.82823L15.0293 9.69046V9.6181C15.0293 9.22739 15.3456 8.90501 15.7493 8.88433L18.3912 8.74899V9.12918C18.3912 9.30765 18.2585 9.46031 18.0766 9.49108L17.4408 9.59861V18.0029L16.6429 18.2773C15.9764 18.5065 15.2343 18.2611 14.8527 17.6853L10.9545 11.803V17.4173L12.1544 17.647L12.1377 17.7583C12.0853 18.1069 11.7843 18.3705 11.4202 18.3867L8.43607 18.5195C8.39662 18.1447 8.67758 17.8093 9.06518 17.7686L9.45771 17.7273V10.2416L8.43607 10.1842Z" fill="black"/>
+    <path fill-rule="evenodd" clip-rule="evenodd" d="M15.5062 2.22521L3.5426 3.04201C2.82599 3.09094 2.27051 3.66706 2.27051 4.36136V15.9975C2.27051 16.6499 2.49507 17.2837 2.90883 17.7992L5.48835 21.0126C5.90541 21.5322 6.56174 21.8183 7.24076 21.7765L20.519 20.9591C21.1995 20.9172 21.7293 20.3716 21.7293 19.7125V6.48332C21.7293 6.07557 21.5234 5.69348 21.1777 5.45975L16.9743 2.61784C16.546 2.32822 16.0277 2.1896 15.5062 2.22521ZM4.13585 4.54287C3.96946 4.41968 4.04865 4.16303 4.25768 4.14804L15.5866 3.33545C15.9476 3.30956 16.3063 3.40896 16.5982 3.61578L18.8713 5.22622C18.9576 5.28736 18.9171 5.41935 18.8102 5.42516L6.8129 6.07764C6.44983 6.09739 6.09144 5.99073 5.80276 5.77699L4.13585 4.54287ZM6.25018 8.12315C6.25018 7.7334 6.56506 7.41145 6.9677 7.38952L19.6523 6.69871C20.0447 6.67734 20.375 6.97912 20.375 7.35898V18.8141C20.375 19.2031 20.0613 19.5247 19.6594 19.5476L7.05516 20.2648C6.61845 20.2896 6.25018 19.954 6.25018 19.5312V8.12315Z" fill="black"/>
+  </g>
+  <defs>
+    <clipPath id="clip0_6294_13848">
+      <rect width="24" height="24" fill="white"/>
+    </clipPath>
+  </defs>
+</svg>
+
 const ICON_MAP = {
   app: <AppIcon className='border !border-[rgba(0,0,0,0.05)]' />,
   api: <AppIcon innerIcon={ApiSvg} className='border !bg-purple-50 !border-purple-200' />,
   dataset: <AppIcon innerIcon={DatasetSvg} className='!border-[0.5px] !border-indigo-100 !bg-indigo-25' />,
   webapp: <AppIcon innerIcon={WebappSvg} className='border !bg-primary-100 !border-primary-200' />,
+  notion: <AppIcon innerIcon={NotionSvg} className='!border-[0.5px] !border-indigo-100 !bg-white' />,
 }
 
 export default function AppBasic({ icon, icon_background, name, type, hoverTip, textStyle, iconType = 'app' }: IAppBasicProps) {

+ 1 - 2
web/app/components/app-sidebar/index.tsx

@@ -4,7 +4,7 @@ import NavLink from './navLink'
 import AppBasic from './basic'
 
 export type IAppDetailNavProps = {
-  iconType?: 'app' | 'dataset'
+  iconType?: 'app' | 'dataset' | 'notion'
   title: string
   desc: string
   icon: string
@@ -18,7 +18,6 @@ export type IAppDetailNavProps = {
   extraInfo?: React.ReactNode
 }
 
-
 const AppDetailNav: FC<IAppDetailNavProps> = ({ title, desc, icon, icon_background, navigation, extraInfo, iconType = 'app' }) => {
   return (
     <div className="flex flex-col w-56 overflow-y-auto bg-white border-r border-gray-200 shrink-0">

+ 3 - 0
web/app/components/base/checkbox/assets/check.svg

@@ -0,0 +1,3 @@
+<svg width="12" height="12" viewBox="0 0 12 12" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M10 3L4.5 8.5L2 6" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
+</svg>

+ 9 - 0
web/app/components/base/checkbox/index.module.css

@@ -0,0 +1,9 @@
+.wrapper {
+  border-color: #d0d5dd;
+}
+
+.checked {
+  background: #155eef url(./assets/check.svg) center center no-repeat;
+  background-size: 12px 12px;
+  border-color: #155eef;
+}

+ 19 - 0
web/app/components/base/checkbox/index.tsx

@@ -0,0 +1,19 @@
+import cn from 'classnames'
+import s from './index.module.css'
+
+type CheckboxProps = {
+  checked?: boolean
+  onCheck?: () => void
+  className?: string
+}
+
+const Checkbox = ({ checked, onCheck, className }: CheckboxProps) => {
+  return (
+    <div
+      className={cn(s.wrapper, checked && s.checked, 'w-4 h-4 border rounded border-gray-300', className)}
+      onClick={onCheck}
+    />
+  )
+}
+
+export default Checkbox

+ 6 - 0
web/app/components/base/notion-icon/index.module.css

@@ -0,0 +1,6 @@
+.default-page-icon {
+  width: 20px;
+  height: 20px;
+  background: url(../notion-page-selector/assets/notion-page.svg) center center no-repeat;
+  background-size: cover;
+}

+ 58 - 0
web/app/components/base/notion-icon/index.tsx

@@ -0,0 +1,58 @@
+import cn from 'classnames'
+import s from './index.module.css'
+import type { DataSourceNotionPage } from '@/models/common'
+
type IconTypes = 'workspace' | 'page'
type NotionIconProps = {
  type?: IconTypes
  name?: string | null
  className?: string
  src?: string | null | Pick<DataSourceNotionPage, 'page_icon'>['page_icon']
}
/**
 * Renders a Notion workspace or page icon.
 * - workspace: `src` is a string — an http(s) URL renders an <img>, anything
 *   else (e.g. an emoji) is rendered as text; with no string, falls back to
 *   the first letter of `name`.
 * - page: `src` is a page_icon object — `type === 'url'` renders an <img>,
 *   otherwise its emoji; with no object, a default page glyph is shown.
 */
const NotionIcon = ({
  type = 'workspace',
  src,
  name,
  className,
}: NotionIconProps) => {
  if (type === 'workspace') {
    if (typeof src === 'string') {
      if (src.startsWith('https://') || src.startsWith('http://')) {
        return (
          <img
            alt='workspace icon'
            src={src}
            className={cn('block object-cover w-5 h-5', className)}
          />
        )
      }
      return (
        <div className={cn('flex items-center justify-center w-5 h-5', className)}>{src}</div>
      )
    }
    // `name?.[0]?` — the second optional chain guards the empty-string case,
    // where `name?.[0]` is undefined and calling toLocaleUpperCase() on it
    // would throw at render time.
    return (
      <div className={cn('flex items-center justify-center w-5 h-5 bg-gray-200 text-xs font-medium text-gray-500 rounded', className)}>{name?.[0]?.toLocaleUpperCase()}</div>
    )
  }

  if (typeof src === 'object' && src !== null) {
    if (src?.type === 'url') {
      return (
        <img
          alt='page icon'
          src={src.url || ''}
          className={cn('block object-cover w-5 h-5', className)}
        />
      )
    }
    return (
      <div className={cn('flex items-center justify-center w-5 h-5', className)}>{src?.emoji}</div>
    )
  }

  return (
    <div className={cn(s['default-page-icon'], className)} />
  )
}

export default NotionIcon

+ 3 - 0
web/app/components/base/notion-page-selector/assets/clear.svg

@@ -0,0 +1,3 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path fill-rule="evenodd" clip-rule="evenodd" d="M8 2.5C4.96243 2.5 2.5 4.96243 2.5 8C2.5 11.0376 4.96243 13.5 8 13.5C11.0376 13.5 13.5 11.0376 13.5 8C13.5 4.96243 11.0376 2.5 8 2.5ZM9.85355 6.14645C10.0488 6.34171 10.0488 6.65829 9.85355 6.85355L8.70711 8L9.85355 9.14645C10.0488 9.34171 10.0488 9.65829 9.85355 9.85355C9.65829 10.0488 9.34171 10.0488 9.14645 9.85355L8 8.70711L6.85355 9.85355C6.65829 10.0488 6.34171 10.0488 6.14645 9.85355C5.95118 9.65829 5.95118 9.34171 6.14645 9.14645L7.29289 8L6.14645 6.85355C5.95118 6.65829 5.95118 6.34171 6.14645 6.14645C6.34171 5.95118 6.65829 5.95118 6.85355 6.14645L8 7.29289L9.14645 6.14645C9.34171 5.95118 9.65829 5.95118 9.85355 6.14645Z" fill="#98A2B3"/>
+</svg>

+ 3 - 0
web/app/components/base/notion-page-selector/assets/down-arrow.svg

@@ -0,0 +1,3 @@
+<svg width="12" height="12" viewBox="0 0 12 12" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M3 4.5L6 7.5L9 4.5" stroke="#344054" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+</svg>

+ 3 - 0
web/app/components/base/notion-page-selector/assets/notion-empty-page.svg

@@ -0,0 +1,3 @@
+<svg width="20" height="21" viewBox="0 0 20 21" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M5.49939 19.1498H13.6897C15.3354 19.1498 16.1891 18.2807 16.1891 16.6273V9.6521C16.1891 8.58313 16.0507 8.09095 15.3816 7.41418L11.3441 3.30749C10.6981 2.65381 10.1675 2.5 9.20618 2.5H5.49939C3.85363 2.5 3 3.36902 3 5.02246V16.6273C3 18.2884 3.85363 19.1498 5.49939 19.1498ZM5.62243 17.6424C4.87646 17.6424 4.50732 17.2502 4.50732 16.5351V5.11475C4.50732 4.40722 4.87646 4.00732 5.62243 4.00732H8.89856V8.22168C8.89856 9.32142 9.44457 9.85205 10.5366 9.85205H14.6818V16.5351C14.6818 17.2502 14.3049 17.6424 13.5589 17.6424H5.62243ZM10.675 8.52929C10.3597 8.52929 10.229 8.39087 10.229 8.07556V4.21496L14.4741 8.52929H10.675Z" fill="#37352F" fill-opacity="0.45"/>
+</svg>

+ 3 - 0
web/app/components/base/notion-page-selector/assets/notion-page.svg

@@ -0,0 +1,3 @@
+<svg width="20" height="21" viewBox="0 0 20 21" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M5.49939 19.1498H13.6897C15.3354 19.1498 16.1891 18.2807 16.1891 16.6273V9.6521C16.1891 8.58313 16.0507 8.09095 15.3816 7.41418L11.3441 3.30749C10.6981 2.65381 10.1675 2.5 9.20618 2.5H5.49939C3.85363 2.5 3 3.36902 3 5.02246V16.6273C3 18.2884 3.85363 19.1498 5.49939 19.1498ZM5.62243 17.6424C4.87645 17.6424 4.50732 17.2502 4.50732 16.5351V5.11475C4.50732 4.40722 4.87645 4.00732 5.62243 4.00732H8.89856V8.22168C8.89856 9.32142 9.44457 9.85205 10.5366 9.85205H14.6818V16.5351C14.6818 17.2502 14.3049 17.6424 13.5589 17.6424H5.62243ZM10.675 8.52929C10.3597 8.52929 10.229 8.39087 10.229 8.07556V4.21496L14.4741 8.52929H10.675ZM12.3362 11.8746H6.70678C6.41454 11.8746 6.2069 12.09 6.2069 12.3591C6.2069 12.636 6.41454 12.8513 6.70678 12.8513H12.3362C12.613 12.8513 12.8207 12.636 12.8207 12.3591C12.8207 12.09 12.613 11.8746 12.3362 11.8746ZM12.3362 14.4587H6.70678C6.41454 14.4587 6.2069 14.674 6.2069 14.9509C6.2069 15.22 6.41454 15.4276 6.70678 15.4276H12.3362C12.613 15.4276 12.8207 15.22 12.8207 14.9509C12.8207 14.674 12.613 14.4587 12.3362 14.4587Z" fill="#37352F" fill-opacity="0.45"/>
+</svg>

+ 5 - 0
web/app/components/base/notion-page-selector/assets/search.svg

@@ -0,0 +1,5 @@
+<svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g id="Icon">
+<path id="Icon_2" d="M12.25 12.25L10.2084 10.2083M11.6667 6.70833C11.6667 9.44675 9.44675 11.6667 6.70833 11.6667C3.96992 11.6667 1.75 9.44675 1.75 6.70833C1.75 3.96992 3.96992 1.75 6.70833 1.75C9.44675 1.75 11.6667 3.96992 11.6667 6.70833Z" stroke="#344054" stroke-width="1.25" stroke-linecap="round" stroke-linejoin="round"/>
+</g>
+</svg>

File diff suppressed because it is too large
+ 3 - 0
web/app/components/base/notion-page-selector/assets/setting.svg


+ 4 - 0
web/app/components/base/notion-page-selector/base.module.css

@@ -0,0 +1,4 @@
+.setting-icon {
+  background: url(./assets/setting.svg) center center no-repeat;
+  background-size: 14px 14px;
+}

+ 141 - 0
web/app/components/base/notion-page-selector/base.tsx

@@ -0,0 +1,141 @@
+import { useCallback, useEffect, useMemo, useState } from 'react'
+import useSWR from 'swr'
+import cn from 'classnames'
+import s from './base.module.css'
+import WorkspaceSelector from './workspace-selector'
+import SearchInput from './search-input'
+import PageSelector from './page-selector'
+import { preImportNotionPages } from '@/service/datasets'
+import AccountSetting from '@/app/components/header/account-setting'
+import { NotionConnector } from '@/app/components/datasets/create/step-one'
+import type { DataSourceNotionPage, DataSourceNotionPageMap, DataSourceNotionWorkspace } from '@/models/common'
+
+// A selected page plus the workspace it belongs to.
+export type NotionPageSelectorValue = DataSourceNotionPage & { workspace_id: string }
+
+type NotionPageSelectorProps = {
+  value?: string[]
+  onSelect: (selectedPages: NotionPageSelectorValue[]) => void
+  canPreview?: boolean
+  previewPageId?: string
+  onPreview?: (selectedPage: NotionPageSelectorValue) => void
+  datasetId?: string
+}
+
+// Top-level Notion import picker: loads the bound workspaces/pages via SWR,
+// lets the user switch workspace, search, and (de)select pages, and falls
+// back to a "connect Notion" prompt when no workspace is bound yet.
+const NotionPageSelector = ({
+  value,
+  onSelect,
+  canPreview,
+  previewPageId,
+  onPreview,
+  datasetId = '',
+}: NotionPageSelectorProps) => {
+  const { data, mutate } = useSWR({ url: '/notion/pre-import/pages', datasetId }, preImportNotionPages)
+  const [prevData, setPrevData] = useState(data)
+  const [searchValue, setSearchValue] = useState('')
+  const [showDataSourceSetting, setShowDataSourceSetting] = useState(false)
+  const [currentWorkspaceId, setCurrentWorkspaceId] = useState('')
+
+  const notionWorkspaces = useMemo(() => {
+    return data?.notion_info || []
+  }, [data?.notion_info])
+  const firstWorkspaceId = notionWorkspaces[0]?.workspace_id
+  const currentWorkspace = notionWorkspaces.find(workspace => workspace.workspace_id === currentWorkspaceId)
+
+  // Flatten every workspace's pages into one id->page map (tagged with its
+  // workspace_id) and collect the ids of pages already bound to the dataset.
+  const getPagesMapAndSelectedPagesId: [DataSourceNotionPageMap, Set<string>] = useMemo(() => {
+    const selectedPagesId = new Set<string>()
+    const pagesMap = notionWorkspaces.reduce((prev: DataSourceNotionPageMap, next: DataSourceNotionWorkspace) => {
+      next.pages.forEach((page) => {
+        if (page.is_bound)
+          selectedPagesId.add(page.page_id)
+        prev[page.page_id] = {
+          ...page,
+          workspace_id: next.workspace_id,
+        }
+      })
+
+      return prev
+    }, {})
+    return [pagesMap, selectedPagesId]
+  }, [notionWorkspaces])
+  // Pre-select already-bound pages plus any ids passed in via `value`.
+  const defaultSelectedPagesId = [...Array.from(getPagesMapAndSelectedPagesId[1]), ...(value || [])]
+  const [selectedPagesId, setSelectedPagesId] = useState<Set<string>>(new Set(defaultSelectedPagesId))
+
+  // Render-time state adjustment: when SWR delivers fresh data, re-seed the
+  // selection from the new bound/default ids.
+  if (prevData !== data) {
+    setPrevData(data)
+    setSelectedPagesId(new Set(defaultSelectedPagesId))
+  }
+
+  const handleSearchValueChange = useCallback((value: string) => {
+    setSearchValue(value)
+  }, [])
+  const handleSelectWorkspace = useCallback((workspaceId: string) => {
+    setCurrentWorkspaceId(workspaceId)
+  }, [])
+  // Resolve selected ids to full page objects and surface them to the parent.
+  // NOTE(review): 'handleSelecPages' is a typo for 'handleSelectPages'
+  // (local identifier only, so behavior is unaffected).
+  const handleSelecPages = (selectedPagesId: Set<string>) => {
+    setSelectedPagesId(new Set(Array.from(selectedPagesId)))
+    const selectedPages = Array.from(selectedPagesId).map(pageId => getPagesMapAndSelectedPagesId[0][pageId])
+    onSelect(selectedPages)
+  }
+  const handlePreviewPage = (previewPageId: string) => {
+    if (onPreview)
+      onPreview(getPagesMapAndSelectedPagesId[0][previewPageId])
+  }
+
+  // Default the workspace dropdown to the first bound workspace once loaded.
+  useEffect(() => {
+    setCurrentWorkspaceId(firstWorkspaceId)
+  }, [firstWorkspaceId])
+
+  return (
+    <div className='bg-gray-25 border border-gray-200 rounded-xl'>
+      {
+        data?.notion_info?.length
+          ? (
+            <>
+              <div className='flex items-center pl-[10px] pr-2 h-11 bg-white border-b border-b-gray-200 rounded-t-xl'>
+                <WorkspaceSelector
+                  value={currentWorkspaceId || firstWorkspaceId}
+                  items={notionWorkspaces}
+                  onSelect={handleSelectWorkspace}
+                />
+                <div className='mx-1 w-[1px] h-3 bg-gray-200' />
+                <div
+                  className={cn(s['setting-icon'], 'w-6 h-6 cursor-pointer')}
+                  onClick={() => setShowDataSourceSetting(true)}
+                />
+                <div className='grow' />
+                <SearchInput
+                  value={searchValue}
+                  onChange={handleSearchValueChange}
+                />
+              </div>
+              <div className='rounded-b-xl overflow-hidden'>
+                <PageSelector
+                  value={selectedPagesId}
+                  searchValue={searchValue}
+                  list={currentWorkspace?.pages || []}
+                  pagesMap={getPagesMapAndSelectedPagesId[0]}
+                  onSelect={handleSelecPages}
+                  canPreview={canPreview}
+                  previewPageId={previewPageId}
+                  onPreview={handlePreviewPage}
+                />
+              </div>
+            </>
+          )
+          : (
+            <NotionConnector onSetting={() => setShowDataSourceSetting(true)} />
+          )
+      }
+      {
+        showDataSourceSetting && (
+          // Re-fetch the workspace list after the data-source settings close,
+          // so a newly bound workspace appears immediately.
+          <AccountSetting activeTab='data-source' onCancel={() => {
+            setShowDataSourceSetting(false)
+            mutate()
+          }} />
+        )
+      }
+    </div>
+  )
+}
+
+export default NotionPageSelector

+ 2 - 0
web/app/components/base/notion-page-selector/index.tsx

@@ -0,0 +1,2 @@
+export { default as NotionPageSelectorModal } from './notion-page-selector-modal'
+export { default as NotionPageSelector } from './base'

+ 28 - 0
web/app/components/base/notion-page-selector/notion-page-selector-modal/index.module.css

@@ -0,0 +1,28 @@
+.modal {
+  width: 600px !important;
+  max-width: 600px !important;
+  padding: 24px 32px !important;
+}
+
+.operate {
+  padding: 0 8px;
+  min-width: 96px;
+  height: 36px;
+  line-height: 36px;
+  text-align: center;
+  background-color: #ffffff;
+  box-shadow: 0px 1px 2px rgba(16, 24, 40, 0.05);
+  border-radius: 8px;
+  border: 0.5px solid #eaecf0;
+  font-size: 14px;
+  font-weight: 500;
+  color: #667085;
+  cursor: pointer;
+}
+
+.operate-save {
+  margin-left: 8px;
+  border-color: #155eef;
+  background-color: #155eef;
+  color: #ffffff;
+}

+ 62 - 0
web/app/components/base/notion-page-selector/notion-page-selector-modal/index.tsx

@@ -0,0 +1,62 @@
+import { useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import cn from 'classnames'
+import { XMarkIcon } from '@heroicons/react/24/outline'
+import NotionPageSelector from '../base'
+import type { NotionPageSelectorValue } from '../base'
+import s from './index.module.css'
+import Modal from '@/app/components/base/modal'
+
+type NotionPageSelectorModalProps = {
+  isShow: boolean
+  onClose: () => void
+  onSave: (selectedPages: NotionPageSelectorValue[]) => void
+  datasetId: string
+}
+// Modal wrapper around NotionPageSelector. Buffers the selection locally and
+// only hands it to the parent when the user clicks Save.
+const NotionPageSelectorModal = ({
+  isShow,
+  onClose,
+  onSave,
+  datasetId,
+}: NotionPageSelectorModalProps) => {
+  const { t } = useTranslation()
+  const [selectedPages, setSelectedPages] = useState<NotionPageSelectorValue[]>([])
+
+  const handleClose = () => {
+    onClose()
+  }
+  const handleSelectPage = (newSelectedPages: NotionPageSelectorValue[]) => {
+    setSelectedPages(newSelectedPages)
+  }
+  const handleSave = () => {
+    onSave(selectedPages)
+  }
+
+  return (
+    <Modal
+      className={s.modal}
+      isShow={isShow}
+      // Backdrop/ESC close is deliberately a no-op: the modal is dismissed
+      // only through the X button or the cancel/save actions below.
+      onClose={() => {}}
+    >
+      <div className='flex items-center justify-between mb-6 h-8'>
+        <div className='text-xl font-semibold text-gray-900'>{t('common.dataSource.notion.selector.addPages')}</div>
+        <div
+          className='flex items-center justify-center -mr-2 w-8 h-8 cursor-pointer'
+          onClick={handleClose}>
+          <XMarkIcon className='w-4 h-4' />
+        </div>
+      </div>
+      <NotionPageSelector
+        onSelect={handleSelectPage}
+        canPreview={false}
+        datasetId={datasetId}
+      />
+      <div className='mt-8 flex justify-end'>
+        <div className={s.operate} onClick={handleClose}>{t('common.operation.cancel')}</div>
+        <div className={cn(s.operate, s['operate-save'])} onClick={handleSave}>{t('common.operation.save')}</div>
+      </div>
+    </Modal>
+  )
+}
+
+export default NotionPageSelectorModal

+ 17 - 0
web/app/components/base/notion-page-selector/page-selector/index.module.css

@@ -0,0 +1,17 @@
+.arrow {
+  width: 20px;
+  height: 20px;
+  background: url(../assets/down-arrow.svg) center center no-repeat;
+  background-size: 16px 16px;
+  transform: rotate(-90deg);
+}
+
+.arrow-expand {
+  transform: rotate(0);
+}
+
+.preview-item {
+  background-color: #eff4ff;
+  border: 1px solid #D1E0FF;
+  box-shadow: 0px 1px 2px rgba(16, 24, 40, 0.05);
+}

+ 299 - 0
web/app/components/base/notion-page-selector/page-selector/index.tsx

@@ -0,0 +1,299 @@
+import { memo, useMemo, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import { FixedSizeList as List, areEqual } from 'react-window'
+import type { ListChildComponentProps } from 'react-window'
+import cn from 'classnames'
+import Checkbox from '../../checkbox'
+import NotionIcon from '../../notion-icon'
+import s from './index.module.css'
+import type { DataSourceNotionPage, DataSourceNotionPageMap } from '@/models/common'
+
+// Props for the virtualized Notion page tree selector.
+type PageSelectorProps = {
+  value: Set<string>
+  searchValue: string
+  pagesMap: DataSourceNotionPageMap
+  list: DataSourceNotionPage[]
+  onSelect: (selectedPagesId: Set<string>) => void
+  canPreview?: boolean
+  previewPageId?: string
+  onPreview?: (selectedPageId: string) => void
+}
+// A page enriched with tree bookkeeping: direct children, all transitive
+// descendants, nesting depth, and the chain of ancestor page names.
+// NOTE(review): 'deepth' is a typo for 'depth'; it is used as an identifier
+// here and in sibling files, so renaming would be a cross-file change.
+type NotionPageTreeItem = {
+  children: Set<string>
+  descendants: Set<string>
+  deepth: number
+  ancestors: string[]
+} & DataSourceNotionPage
+type NotionPageTreeMap = Record<string, NotionPageTreeItem>
+// A flat-list row item: expansion state plus depth for indentation.
+type NotionPageItem = {
+  expand: boolean
+  deepth: number
+} & DataSourceNotionPage
+
+// Links `current` into its parent's entry in `listTreeMap`, then walks up the
+// ancestor chain: each step registers `current` as a child/descendant of its
+// parent, records the leaf's page id as a descendant, bumps the leaf's depth,
+// and prepends the parent's name to the leaf's ancestors (breadcrumb order).
+// Mutates `listTreeMap` and `leafItem` in place. Stops when the parent is
+// 'root' or is not present in `pagesMap` (e.g. an unshared parent page).
+const recursivePushInParentDescendants = (
+  pagesMap: DataSourceNotionPageMap,
+  listTreeMap: NotionPageTreeMap,
+  current: NotionPageTreeItem,
+  leafItem: NotionPageTreeItem,
+) => {
+  const parentId = current.parent_id
+  const pageId = current.page_id
+
+  if (!parentId || !pageId)
+    return
+
+  if (parentId !== 'root' && pagesMap[parentId]) {
+    if (!listTreeMap[parentId]) {
+      // First time we see this parent: create its tree entry seeded with
+      // `current` as a child and both `current` and the leaf as descendants.
+      const children = new Set([pageId])
+      const descendants = new Set([pageId, leafItem.page_id])
+      listTreeMap[parentId] = {
+        ...pagesMap[parentId],
+        children,
+        descendants,
+        deepth: 0,
+        ancestors: [],
+      }
+    }
+    else {
+      listTreeMap[parentId].children.add(pageId)
+      listTreeMap[parentId].descendants.add(pageId)
+      listTreeMap[parentId].descendants.add(leafItem.page_id)
+    }
+    // Each level climbed means the leaf sits one level deeper in the tree.
+    leafItem.deepth++
+    leafItem.ancestors.unshift(listTreeMap[parentId].page_name)
+
+    if (listTreeMap[parentId].parent_id !== 'root')
+      recursivePushInParentDescendants(pagesMap, listTreeMap, listTreeMap[parentId], leafItem)
+  }
+}
+
+// One row of the react-window list: checkbox, expand arrow (tree mode only),
+// page icon, page name, and — depending on mode — a hover "preview" action or
+// a breadcrumb trail (shown only while searching). Memoized with react-window's
+// `areEqual` so rows only re-render when their own props change.
+const Item = memo(({ index, style, data }: ListChildComponentProps<{
+  dataList: NotionPageItem[]
+  handleToggle: (index: number) => void
+  checkedIds: Set<string>
+  handleCheck: (index: number) => void
+  canPreview?: boolean
+  handlePreview: (index: number) => void
+  listMapWithChildrenAndDescendants: NotionPageTreeMap
+  searchValue: string
+  previewPageId: string
+  pagesMap: DataSourceNotionPageMap
+}>) => {
+  const { t } = useTranslation()
+  const { dataList, handleToggle, checkedIds, handleCheck, canPreview, handlePreview, listMapWithChildrenAndDescendants, searchValue, previewPageId, pagesMap } = data
+  const current = dataList[index]
+  const currentWithChildrenAndDescendants = listMapWithChildrenAndDescendants[current.page_id]
+  // Any transitive descendant means this page can be expanded.
+  const hasChild = currentWithChildrenAndDescendants.descendants.size > 0
+  const ancestors = currentWithChildrenAndDescendants.ancestors
+  const breadCrumbs = ancestors.length ? [...ancestors, current.page_name] : [current.page_name]
+
+  // Expand/collapse affordance: an arrow for pages with children, an empty
+  // spacer (same width, same indent) for childless pages so names line up,
+  // and nothing at all for top-level pages.
+  const renderArrow = () => {
+    if (hasChild) {
+      return (
+        <div
+          className={cn(s.arrow, current.expand && s['arrow-expand'], 'shrink-0 mr-1 w-5 h-5 hover:bg-gray-200 rounded-md')}
+          style={{ marginLeft: current.deepth * 8 }}
+          onClick={() => handleToggle(index)}
+        />
+      )
+    }
+    if (current.parent_id === 'root' || !pagesMap[current.parent_id]) {
+      return (
+        <div></div>
+      )
+    }
+    return (
+      <div className='shrink-0 mr-1 w-5 h-5' style={{ marginLeft: current.deepth * 8 }} />
+    )
+  }
+
+  return (
+    <div
+      className={cn('group flex items-center pl-2 pr-[2px] rounded-md border border-transparent hover:bg-gray-100 cursor-pointer', previewPageId === current.page_id && s['preview-item'])}
+      // Offset react-window's absolute positioning to create the 8px gutter.
+      style={{ ...style, top: style.top as number + 8, left: 8, right: 8, width: 'calc(100% - 16px)' }}
+    >
+      <Checkbox
+        className='shrink-0 mr-2 group-hover:border-primary-600 group-hover:border-[2px]'
+        checked={checkedIds.has(current.page_id)}
+        onCheck={() => handleCheck(index)}
+      />
+      {/* The tree arrow is hidden in search mode — search results are a flat list. */}
+      {!searchValue && renderArrow()}
+      <NotionIcon
+        className='shrink-0 mr-1'
+        type='page'
+        src={current.page_icon}
+      />
+      <div
+        className='grow text-sm font-medium text-gray-700 truncate'
+        title={current.page_name}
+      >
+        {current.page_name}
+      </div>
+      {
+        canPreview && (
+          <div
+            className='shrink-0 hidden group-hover:flex items-center ml-1 px-2 h-6 rounded-md text-xs font-medium text-gray-500 cursor-pointer hover:bg-gray-50 hover:text-gray-700'
+            onClick={() => handlePreview(index)}>
+            {t('common.dataSource.notion.selector.preview')}
+          </div>
+        )
+      }
+      {
+        searchValue && (
+          <div
+            className='shrink-0 ml-1 max-w-[120px] text-xs text-gray-400 truncate'
+            title={breadCrumbs.join(' / ')}
+          >
+            {breadCrumbs.join(' / ')}
+          </div>
+        )
+      }
+    </div>
+  )
+}, areEqual)
+
+// Virtualized tree/list of Notion pages. Two display modes: a collapsible
+// tree (default) and a flat breadcrumbed list while `searchValue` is set.
+// Selection state lives in the `value` Set owned by the parent.
+const PageSelector = ({
+  value,
+  searchValue,
+  pagesMap,
+  list,
+  onSelect,
+  canPreview = true,
+  previewPageId,
+  onPreview,
+}: PageSelectorProps) => {
+  const { t } = useTranslation()
+  const [prevDataList, setPrevDataList] = useState(list)
+  const [dataList, setDataList] = useState<NotionPageItem[]>([])
+  const [localPreviewPageId, setLocalPreviewPageId] = useState('')
+  // Render-time state adjustment (React's "derive state during render"
+  // pattern): when a new `list` arrives, reset the visible rows to the
+  // top-level pages (root pages, or pages whose parent isn't shared).
+  if (prevDataList !== list) {
+    setPrevDataList(list)
+    setDataList(list.filter(item => item.parent_id === 'root' || !pagesMap[item.parent_id]).map((item) => {
+      return {
+        ...item,
+        expand: false,
+        deepth: 0,
+      }
+    }))
+  }
+  // Search mode: flat substring match over all pages, recomputed each render.
+  const searchDataList = list.filter((item) => {
+    return item.page_name.includes(searchValue)
+  }).map((item) => {
+    return {
+      ...item,
+      expand: false,
+      deepth: 0,
+    }
+  })
+  const currentDataList = searchValue ? searchDataList : dataList
+  // Controlled preview id wins when provided; otherwise fall back to local state.
+  const currentPreviewPageId = previewPageId === undefined ? localPreviewPageId : previewPageId
+
+  // Build the full tree index (children/descendants/depth/ancestors per page).
+  const listMapWithChildrenAndDescendants = useMemo(() => {
+    return list.reduce((prev: NotionPageTreeMap, next: DataSourceNotionPage) => {
+      const pageId = next.page_id
+      if (!prev[pageId])
+        prev[pageId] = { ...next, children: new Set(), descendants: new Set(), deepth: 0, ancestors: [] }
+
+      recursivePushInParentDescendants(pagesMap, prev, prev[pageId], prev[pageId])
+      return prev
+    }, {})
+  }, [list, pagesMap])
+
+  // Expand/collapse a row. Only reachable in tree mode (the arrow is hidden
+  // while searching), where `currentDataList === dataList`, so indexing
+  // `dataList` here is consistent with the rendered rows.
+  const handleToggle = (index: number) => {
+    const current = dataList[index]
+    const pageId = current.page_id
+    const currentWithChildrenAndDescendants = listMapWithChildrenAndDescendants[pageId]
+    const descendantsIds = Array.from(currentWithChildrenAndDescendants.descendants)
+    const childrenIds = Array.from(currentWithChildrenAndDescendants.children)
+    let newDataList = []
+
+    if (current.expand) {
+      current.expand = false
+
+      // Collapse removes ALL descendants, not just direct children.
+      newDataList = [...dataList.filter(item => !descendantsIds.includes(item.page_id))]
+    }
+    else {
+      current.expand = true
+
+      // Expand splices the direct children in right after the toggled row.
+      newDataList = [
+        ...dataList.slice(0, index + 1),
+        ...childrenIds.map(item => ({
+          ...pagesMap[item],
+          expand: false,
+          deepth: listMapWithChildrenAndDescendants[item].deepth,
+        })),
+        ...dataList.slice(index + 1)]
+    }
+    setDataList(newDataList)
+  }
+
+  // Toggle selection. In tree mode, (de)selecting a page cascades to all of
+  // its descendants; in search mode only the single page is toggled.
+  // NOTE(review): this mutates the `value` prop Set in place before emitting
+  // a fresh copy via onSelect — works because the parent replaces the Set on
+  // every onSelect, but mutating props is fragile; confirm intentional.
+  const handleCheck = (index: number) => {
+    const current = currentDataList[index]
+    const pageId = current.page_id
+    const currentWithChildrenAndDescendants = listMapWithChildrenAndDescendants[pageId]
+
+    if (value.has(pageId)) {
+      if (!searchValue) {
+        for (const item of currentWithChildrenAndDescendants.descendants)
+          value.delete(item)
+      }
+
+      value.delete(pageId)
+    }
+    else {
+      if (!searchValue) {
+        for (const item of currentWithChildrenAndDescendants.descendants)
+          value.add(item)
+      }
+
+      value.add(pageId)
+    }
+
+    onSelect(new Set([...value]))
+  }
+
+  // Mark a row as previewed locally and notify the parent if it listens.
+  const handlePreview = (index: number) => {
+    const current = currentDataList[index]
+    const pageId = current.page_id
+
+    setLocalPreviewPageId(pageId)
+
+    if (onPreview)
+      onPreview(pageId)
+  }
+
+  if (!currentDataList.length) {
+    return (
+      <div className='flex items-center justify-center h-[296px] text-[13px] text-gray-500'>
+        {t('common.dataSource.notion.selector.noSearchResult')}
+      </div>
+    )
+  }
+
+  return (
+    <List
+      className='py-2'
+      height={296}
+      itemCount={currentDataList.length}
+      itemSize={28}
+      width='100%'
+      itemKey={(index, data) => data.dataList[index].page_id}
+      itemData={{
+        dataList: currentDataList,
+        handleToggle,
+        checkedIds: value,
+        handleCheck,
+        canPreview,
+        handlePreview,
+        listMapWithChildrenAndDescendants,
+        searchValue,
+        previewPageId: currentPreviewPageId,
+        pagesMap,
+      }}
+    >
+      {Item}
+    </List>
+  )
+}
+
+export default PageSelector

+ 15 - 0
web/app/components/base/notion-page-selector/search-input/index.module.css

@@ -0,0 +1,15 @@
+.search-icon {
+  background: url(../assets/search.svg) center center;
+  background-size: 14px 14px;
+}
+
+.clear-icon {
+  background: url(../assets/clear.svg) center center;
+  background-size: contain;
+}
+
+.input-wrapper {
+  flex-basis: 200px;
+  width: 0;
+  box-shadow: 0px 1px 2px rgba(16, 24, 40, 0.05);
+}

+ 42 - 0
web/app/components/base/notion-page-selector/search-input/index.tsx

@@ -0,0 +1,42 @@
+import { useCallback } from 'react'
+import type { ChangeEvent } from 'react'
+import { useTranslation } from 'react-i18next'
+import cn from 'classnames'
+import s from './index.module.css'
+
+type SearchInputProps = {
+  value: string
+  onChange: (v: string) => void
+}
+// Controlled search field for the page selector: magnifier icon, text input,
+// and a clear button that appears only while there is a value.
+const SearchInput = ({
+  value,
+  onChange,
+}: SearchInputProps) => {
+  const { t } = useTranslation()
+
+  const handleClear = useCallback(() => {
+    onChange('')
+  }, [onChange])
+
+  return (
+    // Background switches from gray to white while the field has content.
+    <div className={cn(s['input-wrapper'], 'flex items-center px-2 h-7 rounded-md', `${value ? 'bg-white' : 'bg-gray-100'}`)}>
+      <div className={cn(s['search-icon'], 'mr-[6px] w-4 h-4')} />
+      <input
+        className='grow text-[13px] bg-inherit border-0 outline-0 appearance-none'
+        value={value}
+        onChange={(e: ChangeEvent<HTMLInputElement>) => onChange(e.target.value)}
+        placeholder={t('common.dataSource.notion.selector.searchPages') || ''}
+      />
+      {
+        value && (
+          <div
+            className={cn(s['clear-icon'], 'ml-1 w-4 h-4 cursor-pointer')}
+            onClick={handleClear}
+          />
+        )
+      }
+    </div>
+  )
+}
+
+export default SearchInput

+ 9 - 0
web/app/components/base/notion-page-selector/workspace-selector/index.module.css

@@ -0,0 +1,9 @@
+.down-arrow {
+  background: url(../assets/down-arrow.svg) center center no-repeat;
+  background-size: cover;
+}
+
+.popup {
+  box-shadow: 0px 12px 16px -4px rgba(16, 24, 40, 0.08), 0px 4px 6px -2px rgba(16, 24, 40, 0.03);
+  z-index: 10;
+}

+ 84 - 0
web/app/components/base/notion-page-selector/workspace-selector/index.tsx

@@ -0,0 +1,84 @@
+'use client'
+import { useTranslation } from 'react-i18next'
+import { Fragment } from 'react'
+import { Menu, Transition } from '@headlessui/react'
+import cn from 'classnames'
+import NotionIcon from '../../notion-icon'
+import s from './index.module.css'
+import type { DataSourceNotionWorkspace } from '@/models/common'
+
+type WorkspaceSelectorProps = {
+  value: string
+  items: Omit<DataSourceNotionWorkspace, 'total'>[]
+  onSelect: (v: string) => void
+}
+// Headless-UI dropdown listing the bound Notion workspaces. The button shows
+// the current workspace (icon, name, page count); each menu item reports its
+// workspace's page count and selects it on click.
+export default function WorkspaceSelector({
+  value,
+  items,
+  onSelect,
+}: WorkspaceSelectorProps) {
+  const { t } = useTranslation()
+  const currentWorkspace = items.find(item => item.workspace_id === value)
+
+  return (
+    <Menu as="div" className="relative inline-block text-left">
+      {
+        ({ open }) => (
+          <>
+            <Menu.Button className={`flex items-center justify-center h-7 rounded-md hover:bg-gray-50 ${open && 'bg-gray-50'} cursor-pointer`}>
+              <NotionIcon
+                className='ml-1 mr-2'
+                src={currentWorkspace?.workspace_icon}
+                name={currentWorkspace?.workspace_name}
+              />
+              <div className='mr-1 w-[90px] text-left text-sm font-medium text-gray-700 truncate' title={currentWorkspace?.workspace_name}>{currentWorkspace?.workspace_name}</div>
+              <div className='mr-1 px-1 h-[18px] bg-primary-50 rounded-lg text-xs font-medium text-primary-600'>{currentWorkspace?.pages.length}</div>
+              <div className={cn(s['down-arrow'], 'mr-2 w-3 h-3')} />
+            </Menu.Button>
+            <Transition
+              as={Fragment}
+              enter="transition ease-out duration-100"
+              enterFrom="transform opacity-0 scale-95"
+              enterTo="transform opacity-100 scale-100"
+              leave="transition ease-in duration-75"
+              leaveFrom="transform opacity-100 scale-100"
+              leaveTo="transform opacity-0 scale-95"
+            >
+              <Menu.Items
+                className={cn(
+                  s.popup,
+                  `absolute left-0 top-8 w-80
+                  origin-top-right rounded-lg bg-white
+                  border-[0.5px] border-gray-200`,
+                )}
+              >
+                <div className="p-1 max-h-50 overflow-auto">
+                  {
+                    items.map(item => (
+                      <Menu.Item key={item.workspace_id}>
+                        <div
+                          className='flex items-center px-3 h-9 hover:bg-gray-50 cursor-pointer'
+                          onClick={() => onSelect(item.workspace_id)}
+                        >
+                          <NotionIcon
+                            className='shrink-0 mr-2'
+                            src={item.workspace_icon}
+                            name={item.workspace_name}
+                          />
+                          <div className='grow mr-2 text-sm text-gray-700 truncate' title={item.workspace_name}>{item.workspace_name}</div>
+                          <div className='shrink-0 text-xs font-medium text-primary-600'>
+                            {item.pages.length} {t('common.dataSource.notion.selector.pageSelected')}
+                          </div>
+                        </div>
+                      </Menu.Item>
+                    ))
+                  }
+                </div>
+              </Menu.Items>
+            </Transition>
+          </>
+        )
+      }
+    </Menu>
+  )
+}

+ 20 - 0
web/app/components/base/progress-bar/index.tsx

@@ -0,0 +1,20 @@
+type ProgressBarProps = {
+  percent: number
+}
+// Minimal horizontal progress bar: a 100px track whose filled width equals
+// `percent`, with the numeric value rendered alongside.
+// NOTE(review): `percent` is not clamped here; values outside 0-100 would
+// overflow the track — confirm callers guarantee the range.
+const ProgressBar = ({
+  percent = 0,
+}: ProgressBarProps) => {
+  return (
+    <div className='flex items-center'>
+      <div className='mr-2 w-[100px] bg-gray-100 rounded-lg'>
+        <div
+          className='h-1 bg-[#2970FF] rounded-lg'
+          style={{ width: `${percent}%` }}
+        />
+      </div>
+      <div className='text-xs font-medium text-gray-500'>{percent}%</div>
+    </div>
+  )
+}
+
+export default ProgressBar

+ 3 - 0
web/app/components/datasets/create/assets/Icon-3-dots.svg

@@ -0,0 +1,3 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M5 6.5V5M8.93934 7.56066L10 6.5M10.0103 11.5H11.5103" stroke="#374151" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
+</svg>

File diff suppressed because it is too large
+ 2 - 0
web/app/components/datasets/create/assets/normal.svg


+ 11 - 0
web/app/components/datasets/create/assets/star.svg

@@ -0,0 +1,11 @@
+<svg width="12" height="12" viewBox="0 0 12 12" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M6 0.5C6.27614 0.5 6.5 0.723858 6.5 1V2C6.5 2.27614 6.27614 2.5 6 2.5C5.72386 2.5 5.5 2.27614 5.5 2V1C5.5 0.723858 5.72386 0.5 6 0.5Z" fill="#FB6514"/>
+<path d="M2.81791 2.11092C2.62265 1.91566 2.30606 1.91566 2.1108 2.11092C1.91554 2.30619 1.91554 2.62277 2.1108 2.81803L2.81791 3.52514C3.01317 3.7204 3.32975 3.7204 3.52502 3.52514C3.72028 3.32988 3.72028 3.01329 3.52502 2.81803L2.81791 2.11092Z" fill="#FB6514"/>
+<path d="M0.5 6C0.5 5.72386 0.723858 5.5 1 5.5H2C2.27614 5.5 2.5 5.72386 2.5 6C2.5 6.27614 2.27614 6.5 2 6.5H1C0.723858 6.5 0.5 6.27614 0.5 6Z" fill="#FB6514"/>
+<path d="M10 5.5C9.72386 5.5 9.5 5.72386 9.5 6C9.5 6.27614 9.72386 6.5 10 6.5H11C11.2761 6.5 11.5 6.27614 11.5 6C11.5 5.72386 11.2761 5.5 11 5.5H10Z" fill="#FB6514"/>
+<path d="M9.18192 8.47482C8.98666 8.27955 8.67008 8.27955 8.47482 8.47482C8.27955 8.67008 8.27955 8.98666 8.47482 9.18192L9.18192 9.88903C9.37718 10.0843 9.69377 10.0843 9.88903 9.88903C10.0843 9.69377 10.0843 9.37718 9.88903 9.18192L9.18192 8.47482Z" fill="#FB6514"/>
+<path d="M9.88903 2.81803C10.0843 2.62277 10.0843 2.30619 9.88903 2.11092C9.69377 1.91566 9.37718 1.91566 9.18192 2.11092L8.47482 2.81803C8.27955 3.01329 8.27955 3.32988 8.47482 3.52514C8.67008 3.7204 8.98666 3.7204 9.18192 3.52514L9.88903 2.81803Z" fill="#FB6514"/>
+<path d="M6 9.5C6.27614 9.5 6.5 9.72386 6.5 10V11C6.5 11.2761 6.27614 11.5 6 11.5C5.72386 11.5 5.5 11.2761 5.5 11V10C5.5 9.72386 5.72386 9.5 6 9.5Z" fill="#FB6514"/>
+<path d="M3.52502 9.18192C3.72028 8.98666 3.72028 8.67008 3.52502 8.47482C3.32975 8.27955 3.01317 8.27955 2.81791 8.47482L2.1108 9.18192C1.91554 9.37718 1.91554 9.69377 2.1108 9.88903C2.30606 10.0843 2.62265 10.0843 2.81791 9.88903L3.52502 9.18192Z" fill="#FB6514"/>
+<path d="M6.44837 3.27869C6.36413 3.10804 6.19032 3 6.00001 3C5.8097 3 5.6359 3.10804 5.55166 3.27869L4.89538 4.60823L3.4277 4.82276C3.23942 4.85028 3.08308 4.98228 3.02439 5.16328C2.9657 5.34429 3.01484 5.54291 3.15115 5.67568L4.21275 6.70968L3.96221 8.17048C3.93004 8.35807 4.00716 8.54766 4.16115 8.65953C4.31514 8.77139 4.51928 8.78613 4.68774 8.69754L6.00001 8.00742L7.31229 8.69754C7.48075 8.78613 7.68489 8.77139 7.83888 8.65953C7.99287 8.54766 8.06999 8.35807 8.03782 8.17048L7.78728 6.70968L8.84888 5.67568C8.98519 5.54291 9.03433 5.34429 8.97564 5.16328C8.91695 4.98228 8.76061 4.85028 8.57233 4.82276L7.10465 4.60823L6.44837 3.27869Z" fill="#FB6514"/>
+</svg>

+ 111 - 0
web/app/components/datasets/create/embedding-process/index.module.css

@@ -0,0 +1,111 @@
+.progressContainer {
+  @apply relative pb-4 w-full;
+  border-bottom: 0.5px solid #EAECF0;
+}
+.sourceItem {
+  position: relative;
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  margin-bottom: 4px;
+  padding: 0 4px;
+  height: 24px;
+  background: #EFF4FF;
+  border-radius: 6px;
+  overflow: hidden;
+}
+.sourceItem.error {
+  background: #FEE4E2;
+}
+.sourceItem.success {
+  background: #D1FADF;
+}
+.progressbar {
+  position: absolute;
+  top: 0;
+  left: 0;
+  height: 100%;
+  background-color: #B2CCFF;
+}
+.sourceItem .info {
+  display: flex;
+  align-items: center;
+}
+.sourceItem .info .name {
+  font-weight: 500;
+  font-size: 12px;
+  line-height: 18px;
+  color: #101828;
+}
+.sourceItem.success .info .name {
+  color: #05603A;
+}
+.sourceItem .percent {
+  font-weight: 500;
+  font-size: 12px;
+  line-height: 18px;
+  color: #344054;
+}
+.sourceItem .error {
+  color: #D92D20;
+}
+.sourceItem .success {
+  color: #05603A;
+}
+
+
+.cost {
+  @apply flex justify-between items-center text-xs text-gray-700;
+}
+.embeddingStatus {
+  @apply flex items-center justify-between text-gray-900 font-medium text-sm mr-2;
+}
+.commonIcon {
+  @apply w-3 h-3 mr-1 inline-block align-middle;
+}
+.highIcon {
+  mask-image: url(../assets/star.svg);
+  @apply bg-orange-500;
+}
+.economyIcon {
+  background-color: #444ce7;
+  mask-image: url(../assets/normal.svg);
+}
+.tokens {
+  @apply text-xs font-medium px-1;
+}
+.price {
+  color: #f79009;
+  @apply text-xs font-medium;
+}
+
+.fileIcon {
+  @apply w-4 h-4 mr-1 bg-center bg-no-repeat;
+  background-image: url(../assets/unknow.svg);
+  background-size: 16px;
+}
+.fileIcon.csv {
+  background-image: url(../assets/csv.svg);
+}
+
+.fileIcon.xlsx,
+.fileIcon.xls {
+  background-image: url(../assets/xlsx.svg);
+}
+.fileIcon.pdf {
+  background-image: url(../assets/pdf.svg);
+}
+.fileIcon.html,
+.fileIcon.htm {
+  background-image: url(../assets/html.svg);
+}
+.fileIcon.md,
+.fileIcon.markdown {
+  background-image: url(../assets/md.svg);
+}
+.fileIcon.txt {
+  background-image: url(../assets/txt.svg);
+}
+.fileIcon.json {
+  background-image: url(../assets/json.svg);
+}

+ 242 - 0
web/app/components/datasets/create/embedding-process/index.tsx

@@ -0,0 +1,242 @@
+import type { FC } from 'react'
+import React, { useCallback, useEffect, useMemo } from 'react'
+import useSWR from 'swr'
+import { useRouter } from 'next/navigation'
+import { useTranslation } from 'react-i18next'
+import { omit } from 'lodash-es'
+import { ArrowRightIcon } from '@heroicons/react/24/solid'
+import { useGetState } from 'ahooks'
+import cn from 'classnames'
+import s from './index.module.css'
+import { FieldInfo } from '@/app/components/datasets/documents/detail/metadata'
+import Button from '@/app/components/base/button'
+import type { FullDocumentDetail, IndexingStatusResponse, ProcessRuleResponse } from '@/models/datasets'
+import { formatNumber } from '@/utils/format'
+import { fetchIndexingStatusBatch as doFetchIndexingStatus, fetchIndexingEstimateBatch, fetchProcessRule } from '@/service/datasets'
+import { DataSourceType } from '@/models/datasets'
+import NotionIcon from '@/app/components/base/notion-icon'
+
+type Props = {
+  datasetId: string
+  batchId: string
+  documents?: FullDocumentDetail[]
+  indexingType?: string
+}
+
+const RuleDetail: FC<{ sourceData?: ProcessRuleResponse }> = ({ sourceData }) => {
+  const { t } = useTranslation()
+
+  const segmentationRuleMap = {
+    mode: t('datasetDocuments.embedding.mode'),
+    segmentLength: t('datasetDocuments.embedding.segmentLength'),
+    textCleaning: t('datasetDocuments.embedding.textCleaning'),
+  }
+
+  const getRuleName = (key: string) => {
+    if (key === 'remove_extra_spaces')
+      return t('datasetCreation.stepTwo.removeExtraSpaces')
+
+    if (key === 'remove_urls_emails')
+      return t('datasetCreation.stepTwo.removeUrlEmails')
+
+    if (key === 'remove_stopwords')
+      return t('datasetCreation.stepTwo.removeStopwords')
+  }
+
+  const getValue = useCallback((field: string) => {
+    let value: string | number | undefined = '-'
+    switch (field) {
+      case 'mode':
+        value = sourceData?.mode === 'automatic' ? (t('datasetDocuments.embedding.automatic') as string) : (t('datasetDocuments.embedding.custom') as string)
+        break
+      case 'segmentLength':
+        value = sourceData?.rules?.segmentation?.max_tokens
+        break
+      default:
+        value = sourceData?.mode === 'automatic'
+          ? (t('datasetDocuments.embedding.automatic') as string)
+          // eslint-disable-next-line array-callback-return
+          : sourceData?.rules?.pre_processing_rules?.map((rule) => {
+            if (rule.enabled)
+              return getRuleName(rule.id)
+          }).filter(Boolean).join(';')
+        break
+    }
+    return value
+  }, [sourceData])
+
+  return <div className='flex flex-col pt-8 pb-10 first:mt-0'>
+    {Object.keys(segmentationRuleMap).map((field) => {
+      return <FieldInfo
+        key={field}
+        label={segmentationRuleMap[field as keyof typeof segmentationRuleMap]}
+        displayedValue={String(getValue(field))}
+      />
+    })}
+  </div>
+}
+
/**
 * Shows live embedding progress for a batch of documents: polls the batch
 * indexing status, displays per-document progress bars, the token/cost
 * estimate, and the process-rule summary.
 */
const EmbeddingProcess: FC<Props> = ({ datasetId, batchId, documents = [], indexingType }) => {
  const { t } = useTranslation()

  // First document of the batch; undefined when `documents` is empty.
  const getFirstDocument = documents[0]

  const [indexingStatusBatchDetail, setIndexingStatusDetail, getIndexingStatusDetail] = useGetState<IndexingStatusResponse[]>([])
  // Refreshes the per-document indexing status for the whole batch.
  const fetchIndexingStatus = async () => {
    const status = await doFetchIndexingStatus({ datasetId, batchId })
    setIndexingStatusDetail(status.data)
  }

  // Polling interval handle; read via getRunId so the interval callback and
  // the unmount cleanup never see a stale value.
  const [runId, setRunId, getRunId] = useGetState<ReturnType<typeof setInterval> | null>(null)

  const stopQueryStatus = () => {
    const id = getRunId()
    if (id)
      clearInterval(id)
  }

  // Polls every 2.5s until every document reaches a terminal state
  // ('completed' or 'error'), then stops itself.
  const startQueryStatus = () => {
    const runId = setInterval(() => {
      const batchDetail = getIndexingStatusDetail()
      const isCompleted = batchDetail.every(detail => ['completed', 'error'].includes(detail.indexing_status))
      if (isCompleted) {
        stopQueryStatus()
        return
      }
      fetchIndexingStatus()
    }, 2500)
    setRunId(runId)
  }

  useEffect(() => {
    fetchIndexingStatus()
    startQueryStatus()
    return () => {
      stopQueryStatus()
    }
  }, [])

  // get rule — conditional fetch (null key) until a document exists;
  // reading `getFirstDocument.id` on an empty batch would throw.
  const { data: ruleDetail, error: ruleError } = useSWR(
    getFirstDocument
      ? { action: 'fetchProcessRule', params: { documentId: getFirstDocument.id } }
      : null,
    apiParams => fetchProcessRule(omit(apiParams, 'action')),
    { revalidateOnFocus: false },
  )
  // get cost
  const { data: indexingEstimateDetail, error: indexingEstimateErr } = useSWR({
    action: 'fetchIndexingEstimateBatch',
    datasetId,
    batchId,
  }, apiParams => fetchIndexingEstimateBatch(omit(apiParams, 'action')), {
    revalidateOnFocus: false,
  })

  const router = useRouter()
  const navToDocumentList = () => {
    router.push(`/datasets/${datasetId}/documents`)
  }

  // True while any document is still being processed.
  const isEmbedding = useMemo(() => {
    return indexingStatusBatchDetail.some((indexingStatusDetail: { indexing_status: any }) => ['indexing', 'splitting', 'parsing', 'cleaning'].includes(indexingStatusDetail?.indexing_status || ''))
  }, [indexingStatusBatchDetail])
  // True once every document is in a terminal state (completed or error).
  const isEmbeddingCompleted = useMemo(() => {
    return indexingStatusBatchDetail.every((indexingStatusDetail: { indexing_status: any }) => ['completed', 'error'].includes(indexingStatusDetail?.indexing_status || ''))
  }, [indexingStatusBatchDetail])

  const getSourceName = (id: string) => {
    const doc = documents.find(document => document.id === id)
    return doc?.name
  }
  const getFileType = (name?: string) => name?.split('.').pop() || 'txt'
  // Completed/total segments as a 0-100 percentage, capped at 100.
  const getSourcePercent = (detail: IndexingStatusResponse) => {
    const completedCount = detail.completed_segments || 0
    const totalCount = detail.total_segments || 0
    if (totalCount === 0)
      return 0
    const percent = Math.round(completedCount * 100 / totalCount)
    return percent > 100 ? 100 : percent
  }
  const getSourceType = (id: string) => {
    const doc = documents.find(document => document.id === id)
    return doc?.data_source_type as DataSourceType
  }
  const getIcon = (id: string) => {
    const doc = documents.find(document => document.id === id) as any // TODO type fix
    // Optional chaining: the document may be absent or not Notion-sourced.
    return doc?.data_source_info?.notion_page_icon
  }
  const isSourceEmbedding = (detail: IndexingStatusResponse) => ['indexing', 'splitting', 'parsing', 'cleaning', 'waiting'].includes(detail.indexing_status || '')

  return (
    <>
      <div className='h-5 flex justify-between items-center mb-5'>
        <div className={s.embeddingStatus}>
          {isEmbedding && t('datasetDocuments.embedding.processing')}
          {isEmbeddingCompleted && t('datasetDocuments.embedding.completed')}
        </div>
        <div className={s.cost}>
          {/* NOTE(review): 'high_quaility' mirrors the backend's spelling of this value — confirm before "fixing" the typo. */}
          {indexingType === 'high_quaility' && (
            <div className='flex items-center'>
              <div className={cn(s.commonIcon, s.highIcon)} />
              {t('datasetDocuments.embedding.highQuality')} · {t('datasetDocuments.embedding.estimate')}
              <span className={s.tokens}>{formatNumber(indexingEstimateDetail?.tokens || 0)}</span>tokens
              (<span className={s.price}>${formatNumber(indexingEstimateDetail?.total_price || 0)}</span>)
            </div>
          )}
          {indexingType === 'economy' && (
            <div className='flex items-center'>
              <div className={cn(s.commonIcon, s.economyIcon)} />
              {t('datasetDocuments.embedding.economy')} · {t('datasetDocuments.embedding.estimate')}
              <span className={s.tokens}>0</span>tokens
            </div>
          )}
        </div>
      </div>
      <div className={s.progressContainer}>
        {indexingStatusBatchDetail.map(indexingStatusDetail => (
          <div
            key={indexingStatusDetail.id}
            className={cn(
              s.sourceItem,
              indexingStatusDetail.indexing_status === 'error' && s.error,
              indexingStatusDetail.indexing_status === 'completed' && s.success,
            )}
          >
            {isSourceEmbedding(indexingStatusDetail) && (
              <div className={s.progressbar} style={{ width: `${getSourcePercent(indexingStatusDetail)}%` }}/>
            )}
            <div className={s.info}>
              {getSourceType(indexingStatusDetail.id) === DataSourceType.FILE && (
                <div className={cn(s.fileIcon, s[getFileType(getSourceName(indexingStatusDetail.id))])}/>
              )}
              {getSourceType(indexingStatusDetail.id) === DataSourceType.NOTION && (
                <NotionIcon
                  className='shrink-0 mr-1'
                  type='page'
                  src={getIcon(indexingStatusDetail.id)}
                />
              )}
              <div className={s.name}>{getSourceName(indexingStatusDetail.id)}</div>
            </div>
            <div className='shrink-0'>
              {isSourceEmbedding(indexingStatusDetail) && (
                <div className={s.percent}>{`${getSourcePercent(indexingStatusDetail)}%`}</div>
              )}
              {indexingStatusDetail.indexing_status === 'error' && (
                <div className={cn(s.percent, s.error)}>Error</div>
              )}
              {indexingStatusDetail.indexing_status === 'completed' && (
                <div className={cn(s.percent, s.success)}>100%</div>
              )}
            </div>
          </div>
        ))}
      </div>
      <RuleDetail sourceData={ruleDetail} />
      <div className='flex items-center gap-2 mt-10'>
        <Button className='w-fit' type='primary' onClick={navToDocumentList}>
          <span>{t('datasetCreation.stepThree.navTo')}</span>
          <ArrowRightIcon className='h-4 w-4 ml-2 stroke-current stroke-1' />
        </Button>
      </div>
    </>
  )
}

export default EmbeddingProcess

+ 3 - 0
web/app/components/datasets/create/file-preview/index.module.css

@@ -11,6 +11,9 @@
   }
   
   .previewHeader .title {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
     color: #101828;
     font-weight: 600;
     font-size: 18px;

+ 17 - 10
web/app/components/datasets/create/file-preview/index.tsx

@@ -1,18 +1,21 @@
 'use client'
-import React, { useState, useEffect } from 'react'
+import React, { useEffect, useState } from 'react'
 import { useTranslation } from 'react-i18next'
-import type { File } from '@/models/datasets'
-import { fetchFilePreview } from '@/service/common'
-
 import cn from 'classnames'
+import { XMarkIcon } from '@heroicons/react/20/solid'
 import s from './index.module.css'
+import type { File } from '@/models/datasets'
+import { fetchFilePreview } from '@/service/common'
 
 type IProps = {
-  file?: File,
+  file?: File
+  notionPage?: any
+  hidePreview: () => void
 }
 
 const FilePreview = ({
   file,
+  hidePreview,
 }: IProps) => {
   const { t } = useTranslation()
   const [previewContent, setPreviewContent] = useState('')
@@ -28,23 +31,27 @@ const FilePreview = ({
   }
 
   const getFileName = (currentFile?: File) => {
-    if (!currentFile) {
+    if (!currentFile)
       return ''
-    }
+
     const arr = currentFile.name.split('.')
     return arr.slice(0, -1).join()
   }
 
   useEffect(() => {
-    if (file) {
+    if (file)
       getPreviewContent(file.id)
-    }
   }, [file])
 
   return (
     <div className={cn(s.filePreview)}>
       <div className={cn(s.previewHeader)}>
-        <div className={cn(s.title)}>{t('datasetCreation.stepOne.filePreview')}</div>
+        <div className={cn(s.title)}>
+          <span>{t('datasetCreation.stepOne.filePreview')}</span>
+          <div className='flex items-center justify-center w-6 h-6 cursor-pointer' onClick={hidePreview}>
+            <XMarkIcon className='h-4 w-4'></XMarkIcon>
+          </div>
+        </div>
         <div className={cn(s.fileName)}>
           <span>{getFileName(file)}</span><span className={cn(s.filetype)}>.{file?.extension}</span>
         </div>

+ 38 - 10
web/app/components/datasets/create/index.tsx

@@ -1,32 +1,44 @@
 'use client'
-import React, { useState, useCallback, useEffect } from 'react'
+import React, { useCallback, useEffect, useState } from 'react'
 import { useTranslation } from 'react-i18next'
 import { useBoolean } from 'ahooks'
-import type { DataSet, File, createDocumentResponse } from '@/models/datasets'
-import { fetchTenantInfo } from '@/service/common'
-import { fetchDataDetail } from '@/service/datasets'
-
+import AppUnavailable from '../../base/app-unavailable'
 import StepsNavBar from './steps-nav-bar'
 import StepOne from './step-one'
 import StepTwo from './step-two'
 import StepThree from './step-three'
+import { DataSourceType } from '@/models/datasets'
+import type { DataSet, File, createDocumentResponse } from '@/models/datasets'
+import { fetchDataSource, fetchTenantInfo } from '@/service/common'
+import { fetchDataDetail } from '@/service/datasets'
+import type { DataSourceNotionPage } from '@/models/common'
+
 import AccountSetting from '@/app/components/header/account-setting'
-import AppUnavailable from '../../base/app-unavailable'
+
+type Page = DataSourceNotionPage & { workspace_id: string }
 
 type DatasetUpdateFormProps = {
-  datasetId?: string;
+  datasetId?: string
 }
 
 const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
   const { t } = useTranslation()
   const [hasSetAPIKEY, setHasSetAPIKEY] = useState(true)
   const [isShowSetAPIKey, { setTrue: showSetAPIKey, setFalse: hideSetAPIkey }] = useBoolean()
+  const [hasConnection, setHasConnection] = useState(true)
+  const [isShowDataSourceSetting, { setTrue: showDataSourceSetting, setFalse: hideDataSourceSetting }] = useBoolean()
+  const [dataSourceType, setDataSourceType] = useState<DataSourceType>(DataSourceType.FILE)
   const [step, setStep] = useState(1)
   const [indexingTypeCache, setIndexTypeCache] = useState('')
   const [file, setFile] = useState<File | undefined>()
   const [result, setResult] = useState<createDocumentResponse | undefined>()
   const [hasError, setHasError] = useState(false)
 
+  const [notionPages, setNotionPages] = useState<Page[]>([])
+  const updateNotionPages = (value: Page[]) => {
+    setNotionPages(value)
+  }
+
   const updateFile = (file?: File) => {
     setFile(file)
   }
@@ -50,9 +62,15 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
     const hasSetKey = data.providers.some(({ is_valid }) => is_valid)
     setHasSetAPIKEY(hasSetKey)
   }
+  const checkNotionConnection = async () => {
+    const { data } = await fetchDataSource({ url: '/data-source/integrates' })
+    const hasConnection = data.filter(item => item.provider === 'notion') || []
+    setHasConnection(hasConnection.length > 0)
+  }
 
   useEffect(() => {
     checkAPIKey()
+    checkNotionConnection()
   }, [])
 
   const [detail, setDetail] = useState<DataSet | null>(null)
@@ -62,16 +80,16 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
         try {
           const detail = await fetchDataDetail(datasetId)
           setDetail(detail)
-        } catch (e) {
+        }
+        catch (e) {
           setHasError(true)
         }
       }
     })()
   }, [datasetId])
 
-  if (hasError) {
+  if (hasError)
     return <AppUnavailable code={500} unknownReason={t('datasetCreation.error.unavailable') as string} />
-  }
 
   return (
     <div className='flex' style={{ height: 'calc(100vh - 56px)' }}>
@@ -80,9 +98,16 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
       </div>
       <div className="grow bg-white">
         {step === 1 && <StepOne
+          hasConnection={hasConnection}
+          onSetting={showDataSourceSetting}
           datasetId={datasetId}
+          dataSourceType={dataSourceType}
+          dataSourceTypeDisable={!!detail?.data_source_type}
+          changeType={setDataSourceType}
           file={file}
           updateFile={updateFile}
+          notionPages={notionPages}
+          updateNotionPages={updateNotionPages}
           onStepChange={nextStep}
         />}
         {(step === 2 && (!datasetId || (datasetId && !!detail))) && <StepTwo
@@ -90,7 +115,9 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
           onSetting={showSetAPIKey}
           indexingType={detail?.indexing_technique || ''}
           datasetId={datasetId}
+          dataSourceType={dataSourceType}
           file={file}
+          notionPages={notionPages}
           onStepChange={changeStep}
           updateIndexingTypeCache={updateIndexingTypeCache}
           updateResultCache={updateResultCache}
@@ -106,6 +133,7 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
         await checkAPIKey()
         hideSetAPIkey()
       }} />}
+      {isShowDataSourceSetting && <AccountSetting activeTab="data-source" onCancel={hideDataSourceSetting}/>}
     </div>
   )
 }

+ 54 - 0
web/app/components/datasets/create/notion-page-preview/index.module.css

@@ -0,0 +1,54 @@
/* Right-hand panel previewing the content of the selected Notion page. */
.filePreview {
  @apply flex flex-col border-l border-gray-200 shrink-0;
  width: 528px;
  background-color: #fcfcfd;
}

/* Header with title row and page-name row. */
.previewHeader {
  @apply border-b border-gray-200 shrink-0;
  margin: 42px 32px 0;
  padding-bottom: 16px;
}

.previewHeader .title {
  display: flex;
  justify-content: space-between;
  align-items: center;
  color: #101828;
  font-weight: 600;
  font-size: 18px;
  line-height: 28px;
}

.previewHeader .fileName {
  display: flex;
  align-items: center;
  font-weight: 400;
  font-size: 12px;
  line-height: 18px;
  color: #1D2939;
}

.previewHeader .filetype {
  color: #667085;
}

/* Scrollable body holding the fetched page text. */
.previewContent {
  @apply overflow-y-auto grow;
  padding: 20px 32px;
  font-weight: 400;
  font-size: 16px;
  line-height: 24px;
  color: #344054;
}

/* Placeholder shown while the page content is being fetched. */
.previewContent .loading {
  width: 100%;
  height: 180px;
  background: #f9fafb center no-repeat url(../assets/Loading.svg);
  background-size: contain;
}

/* Preserve the page's line breaks when rendering plain text. */
.fileContent {
  white-space: pre-line;
}

+ 75 - 0
web/app/components/datasets/create/notion-page-preview/index.tsx

@@ -0,0 +1,75 @@
+'use client'
+import React, { useEffect, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import cn from 'classnames'
+import { XMarkIcon } from '@heroicons/react/20/solid'
+import s from './index.module.css'
+import type { DataSourceNotionPage } from '@/models/common'
+import NotionIcon from '@/app/components/base/notion-icon'
+import { fetchNotionPagePreview } from '@/service/datasets'
+
+type Page = DataSourceNotionPage & { workspace_id: string }
+type IProps = {
+  currentPage?: Page
+  hidePreview: () => void
+}
+
/**
 * Preview panel for a selected Notion page: fetches the page content on
 * selection change and renders it as plain text, with a close button.
 */
const NotionPagePreview = ({
  currentPage,
  hidePreview,
}: IProps) => {
  const { t } = useTranslation()
  const [previewContent, setPreviewContent] = useState('')
  const [loading, setLoading] = useState(true)

  // Loads the page content from the Notion integration for preview.
  const getPreviewContent = async () => {
    if (!currentPage)
      return
    try {
      const res = await fetchNotionPagePreview({
        workspaceID: currentPage.workspace_id,
        pageID: currentPage.page_id,
        pageType: currentPage.type,
      })
      setPreviewContent(res.content)
    }
    catch {
      // Best-effort preview: on failure we fall through and show empty
      // content rather than surfacing an error.
    }
    finally {
      // Always clear the spinner — previously a failed fetch left the
      // loading indicator up forever.
      setLoading(false)
    }
  }

  useEffect(() => {
    if (currentPage) {
      setLoading(true)
      getPreviewContent()
    }
  }, [currentPage])

  return (
    <div className={cn(s.filePreview)}>
      <div className={cn(s.previewHeader)}>
        <div className={cn(s.title)}>
          <span>{t('datasetCreation.stepOne.pagePreview')}</span>
          <div className='flex items-center justify-center w-6 h-6 cursor-pointer' onClick={hidePreview}>
            <XMarkIcon className='h-4 w-4'></XMarkIcon>
          </div>
        </div>
        <div className={cn(s.fileName)}>
          <NotionIcon
            className='shrink-0 mr-1'
            type='page'
            src={currentPage?.page_icon}
          />
          {currentPage?.page_name}
        </div>
      </div>
      <div className={cn(s.previewContent)}>
        {loading && <div className={cn(s.loading)}/>}
        {!loading && (
          <div className={cn(s.fileContent)}>{previewContent}</div>
        )}
      </div>
    </div>
  )
}

export default NotionPagePreview

+ 50 - 0
web/app/components/datasets/create/step-one/index.module.css

@@ -107,3 +107,53 @@
   background: center no-repeat url(../assets/folder-plus.svg);
   background-size: contain;
 }
+
/* Card prompting the user to connect a Notion workspace before importing. */
.notionConnectionTip {
  display: flex;
  flex-direction: column;
  align-items: flex-start;
  padding: 24px;
  max-width: 640px;
  background: #F9FAFB;
  border-radius: 16px;
}

/* Boxed Notion logo at the top of the card. */
.notionIcon {
  display: flex;
  padding: 12px;
  width: 48px;
  height: 48px;
  background: #fff center no-repeat url(../assets/notion.svg);
  background-size: 24px;
  border: 0.5px solid #EAECF5;
  box-shadow: 0px 12px 16px -4px rgba(16, 24, 40, 0.08), 0px 4px 6px -2px rgba(16, 24, 40, 0.03);
  border-radius: 12px;
}

.notionConnectionTip .title {
  position: relative;
  margin: 24px 0 4px;
  font-style: normal;
  font-weight: 600;
  font-size: 16px;
  line-height: 24px;
  color: #374151;
}
/* Decorative three-dots ornament anchored to the title's top-right corner. */
.notionConnectionTip .title::after {
  content: '';
  position: absolute;
  top: -6px;
  right: -12px;
  width: 16px;
  height: 16px;
  background: center no-repeat url(../assets/Icon-3-dots.svg);
  background-size: contain;
}
.notionConnectionTip .tip {
  margin-bottom: 20px;
  font-style: normal;
  font-weight: 400;
  font-size: 13px;
  line-height: 18px;
  color: #6B7280;
}

+ 105 - 24
web/app/components/datasets/create/step-one/index.tsx

@@ -1,36 +1,82 @@
 'use client'
 import React, { useState } from 'react'
 import { useTranslation } from 'react-i18next'
-import type { File } from '@/models/datasets'
+import cn from 'classnames'
 import FilePreview from '../file-preview'
 import FileUploader from '../file-uploader'
+import NotionPagePreview from '../notion-page-preview'
 import EmptyDatasetCreationModal from '../empty-dataset-creation-modal'
-import Button from '@/app/components/base/button'
-
-import cn from 'classnames'
 import s from './index.module.css'
+import type { File } from '@/models/datasets'
+import type { DataSourceNotionPage } from '@/models/common'
+import { DataSourceType } from '@/models/datasets'
+import Button from '@/app/components/base/button'
+import { NotionPageSelector } from '@/app/components/base/notion-page-selector'
 
 type IStepOneProps = {
-  datasetId?: string,
-  file?: File,
-  updateFile: (file?: File) => void,
-  onStepChange: () => void,
+  datasetId?: string
+  dataSourceType: DataSourceType
+  dataSourceTypeDisable: Boolean
+  hasConnection: boolean
+  onSetting: () => void
+  file?: File
+  updateFile: (file?: File) => void
+  notionPages?: any[]
+  updateNotionPages: (value: any[]) => void
+  onStepChange: () => void
+  changeType: (type: DataSourceType) => void
+}
+
+type Page = DataSourceNotionPage & { workspace_id: string }
+
+type NotionConnectorProps = {
+  onSetting: () => void
+}
+export const NotionConnector = ({ onSetting }: NotionConnectorProps) => {
+  const { t } = useTranslation()
+
+  return (
+    <div className={s.notionConnectionTip}>
+      <span className={s.notionIcon}/>
+      <div className={s.title}>{t('datasetCreation.stepOne.notionSyncTitle')}</div>
+      <div className={s.tip}>{t('datasetCreation.stepOne.notionSyncTip')}</div>
+      <Button className='h-8' type='primary' onClick={onSetting}>{t('datasetCreation.stepOne.connect')}</Button>
+    </div>
+  )
 }
 
 const StepOne = ({
   datasetId,
+  dataSourceType,
+  dataSourceTypeDisable,
+  changeType,
+  hasConnection,
+  onSetting,
   onStepChange,
   file,
   updateFile,
+  notionPages = [],
+  updateNotionPages,
 }: IStepOneProps) => {
-  const [dataSourceType, setDataSourceType] = useState('FILE')
   const [showModal, setShowModal] = useState(false)
+  const [showFilePreview, setShowFilePreview] = useState(true)
+  const [currentNotionPage, setCurrentNotionPage] = useState<Page | undefined>()
   const { t } = useTranslation()
 
+  const hidePreview = () => setShowFilePreview(false)
+
   const modalShowHandle = () => setShowModal(true)
 
   const modalCloseHandle = () => setShowModal(false)
 
+  const updateCurrentPage = (page: Page) => {
+    setCurrentNotionPage(page)
+  }
+
+  const hideNotionPagePreview = () => {
+    setCurrentNotionPage(undefined)
+  }
+
   return (
     <div className='flex w-full h-full'>
       <div className='grow overflow-y-auto relative'>
@@ -38,41 +84,76 @@ const StepOne = ({
         <div className={s.form}>
           <div className={s.dataSourceTypeList}>
             <div
-              className={cn(s.dataSourceItem, dataSourceType === 'FILE' && s.active)}
-              onClick={() => setDataSourceType('FILE')}
+              className={cn(
+                s.dataSourceItem,
+                dataSourceType === DataSourceType.FILE && s.active,
+                dataSourceTypeDisable && dataSourceType !== DataSourceType.FILE && s.disabled,
+              )}
+              onClick={() => {
+                if (dataSourceTypeDisable)
+                  return
+                changeType(DataSourceType.FILE)
+                hidePreview()
+              }}
             >
-              <span className={cn(s.datasetIcon)}/>
+              <span className={cn(s.datasetIcon)} />
               {t('datasetCreation.stepOne.dataSourceType.file')}
             </div>
             <div
-              className={cn(s.dataSourceItem, s.disabled, dataSourceType === 'notion' && s.active)}
-              // onClick={() => setDataSourceType('notion')}
+              className={cn(
+                s.dataSourceItem,
+                dataSourceType === DataSourceType.NOTION && s.active,
+                dataSourceTypeDisable && dataSourceType !== DataSourceType.NOTION && s.disabled,
+              )}
+              onClick={() => {
+                if (dataSourceTypeDisable)
+                  return
+                changeType(DataSourceType.NOTION)
+                hidePreview()
+              }}
             >
-              <span className={s.comingTag}>Coming soon</span>
-              <span className={cn(s.datasetIcon, s.notion)}/>
+              <span className={cn(s.datasetIcon, s.notion)} />
               {t('datasetCreation.stepOne.dataSourceType.notion')}
             </div>
             <div
-              className={cn(s.dataSourceItem, s.disabled, dataSourceType === 'web' && s.active)}
-              // onClick={() => setDataSourceType('web')}
+              className={cn(s.dataSourceItem, s.disabled, dataSourceType === DataSourceType.WEB && s.active)}
+            // onClick={() => changeType(DataSourceType.WEB)}
             >
               <span className={s.comingTag}>Coming soon</span>
-              <span className={cn(s.datasetIcon, s.web)}/>  
+              <span className={cn(s.datasetIcon, s.web)} />
               {t('datasetCreation.stepOne.dataSourceType.web')}
             </div>
           </div>
-          <FileUploader onFileUpdate={updateFile} file={file} />
-          <Button disabled={!file} className={s.submitButton} type='primary' onClick={onStepChange}>{t('datasetCreation.stepOne.button')}</Button>
+          {dataSourceType === DataSourceType.FILE && (
+            <>
+              <FileUploader onFileUpdate={updateFile} file={file} />
+              <Button disabled={!file} className={s.submitButton} type='primary' onClick={onStepChange}>{t('datasetCreation.stepOne.button')}</Button>
+            </>
+          )}
+          {dataSourceType === DataSourceType.NOTION && (
+            <>
+              {!hasConnection && <NotionConnector onSetting={onSetting} />}
+              {hasConnection && (
+                <>
+                  <div className='mb-8 w-[640px]'>
+                    <NotionPageSelector value={notionPages.map(page => page.page_id)} onSelect={updateNotionPages} onPreview={updateCurrentPage} />
+                  </div>
+                  <Button disabled={!notionPages.length} className={s.submitButton} type='primary' onClick={onStepChange}>{t('datasetCreation.stepOne.button')}</Button>
+                </>
+              )}
+            </>
+          )}
           {!datasetId && (
             <>
-              <div className={s.dividerLine}/>
+              <div className={s.dividerLine} />
               <div onClick={modalShowHandle} className={s.OtherCreationOption}>{t('datasetCreation.stepOne.emptyDatasetCreation')}</div>
             </>
           )}
         </div>
-        <EmptyDatasetCreationModal show={showModal} onHide={modalCloseHandle}/>
+        <EmptyDatasetCreationModal show={showModal} onHide={modalCloseHandle} />
       </div>
-      {file && <FilePreview file={file} />}
+      {file && showFilePreview && <FilePreview file={file} hidePreview={hidePreview} />}
+      {currentNotionPage && <NotionPagePreview currentPage={currentNotionPage} hidePreview={hideNotionPagePreview} />}
     </div>
   )
 }

+ 11 - 12
web/app/components/datasets/create/step-three/index.tsx

@@ -1,16 +1,16 @@
 'use client'
 import React from 'react'
 import { useTranslation } from 'react-i18next'
-import type { createDocumentResponse } from '@/models/datasets'
-import EmbeddingDetail from '../../documents/detail/embedding'
-
 import cn from 'classnames'
+import EmbeddingProcess from '../embedding-process'
+
 import s from './index.module.css'
+import type { FullDocumentDetail, createDocumentResponse } from '@/models/datasets'
 
 type StepThreeProps = {
-  datasetId?: string,
-  datasetName?: string,
-  indexingType?: string,
+  datasetId?: string
+  datasetName?: string
+  indexingType?: string
   creationCache?: createDocumentResponse
 }
 
@@ -38,12 +38,11 @@ const StepThree = ({ datasetId, datasetName, indexingType, creationCache }: Step
               <div className={s.content}>{`${t('datasetCreation.stepThree.additionP1')} ${datasetName || creationCache?.dataset?.name} ${t('datasetCreation.stepThree.additionP2')}`}</div>
             </div>
           )}
-          <EmbeddingDetail
-            datasetId={datasetId || creationCache?.dataset?.id}
-            documentId={creationCache?.document.id}
+          <EmbeddingProcess
+            datasetId={datasetId || creationCache?.dataset?.id || ''}
+            batchId={creationCache?.batch || ''}
+            documents={creationCache?.documents as FullDocumentDetail[]}
             indexingType={indexingType || creationCache?.dataset?.indexing_technique}
-            stopPosition='bottom'
-            detail={creationCache?.document}
           />
         </div>
       </div>
@@ -58,4 +57,4 @@ const StepThree = ({ datasetId, datasetName, indexingType, creationCache }: Step
   )
 }
 
-export default StepThree;
+export default StepThree

+ 32 - 5
web/app/components/datasets/create/step-two/index.module.css

@@ -14,9 +14,26 @@
 }
 
 .fixed {
+  padding-top: 12px;
+  font-size: 12px;
+  line-height: 18px;
   background: rgba(255, 255, 255, 0.9);
   border-bottom: 0.5px solid #EAECF0;
   backdrop-filter: blur(4px);
+  animation: fix 0.5s;
+}
+
+@keyframes fix {
+  from {
+    padding-top: 42px;
+    font-size: 18px;
+    line-height: 28px;
+  }
+  to {
+    padding-top: 12px;
+    font-size: 12px;
+    line-height: 18px;
+  }
 }
 
 .form {
@@ -273,11 +290,11 @@
   @apply bg-gray-100 caret-primary-600 hover:bg-gray-100 focus:ring-1 focus:ring-inset focus:ring-gray-200 focus-visible:outline-none focus:bg-white placeholder:text-gray-400;
 }
 
-.file {
+.source {
   @apply flex justify-between items-center mt-8 px-6 py-4 rounded-xl bg-gray-50;
 }
 
-.file .divider {
+.source .divider {
   @apply shrink-0 mx-4 w-px bg-gray-200;
   height: 42px;
 }
@@ -318,9 +335,19 @@
 .fileIcon.json {
   background-image: url(../assets/json.svg);
 }
-
-.fileContent {
-  flex: 1 1 50%;
+.sourceContent {
+  flex: 1 1 auto;
+}
+.sourceCount {
+  @apply shrink-0 ml-1;
+  font-weight: 500;
+  font-size: 13px;
+  line-height: 18px;
+  color: #667085;
+}
+.segmentCount {
+  flex: 1 1 30%;
+  max-width: 120px;
 }
 
 .divider {

+ 104 - 21
web/app/components/datasets/create/step-two/index.tsx

@@ -6,9 +6,10 @@ import { useBoolean } from 'ahooks'
 import { XMarkIcon } from '@heroicons/react/20/solid'
 import cn from 'classnames'
 import Link from 'next/link'
+import { groupBy } from 'lodash-es'
 import PreviewItem from './preview-item'
 import s from './index.module.css'
-import type { CreateDocumentReq, File, FullDocumentDetail, FileIndexingEstimateResponse as IndexingEstimateResponse, PreProcessingRule, Rules, createDocumentResponse } from '@/models/datasets'
+import type { CreateDocumentReq, File, FullDocumentDetail, FileIndexingEstimateResponse as IndexingEstimateResponse, NotionInfo, PreProcessingRule, Rules, createDocumentResponse } from '@/models/datasets'
 import {
   createDocument,
   createFirstDocument,
@@ -20,6 +21,11 @@ import Loading from '@/app/components/base/loading'
 
 import Toast from '@/app/components/base/toast'
 import { formatNumber } from '@/utils/format'
+import type { DataSourceNotionPage } from '@/models/common'
+import { DataSourceType } from '@/models/datasets'
+import NotionIcon from '@/app/components/base/notion-icon'
+
+type Page = DataSourceNotionPage & { workspace_id: string }
 
 type StepTwoProps = {
   isSetting?: boolean
@@ -28,7 +34,9 @@ type StepTwoProps = {
   onSetting: () => void
   datasetId?: string
   indexingType?: string
+  dataSourceType: DataSourceType
   file?: File
+  notionPages?: Page[]
   onStepChange?: (delta: number) => void
   updateIndexingTypeCache?: (type: string) => void
   updateResultCache?: (res: createDocumentResponse) => void
@@ -52,7 +60,9 @@ const StepTwo = ({
   onSetting,
   datasetId,
   indexingType,
+  dataSourceType,
   file,
+  notionPages = [],
   onStepChange,
   updateIndexingTypeCache,
   updateResultCache,
@@ -169,12 +179,54 @@ const StepTwo = ({
     return processRule
   }
 
+  const getNotionInfo = () => {
+    const workspacesMap = groupBy(notionPages, 'workspace_id')
+    const workspaces = Object.keys(workspacesMap).map((workspaceId) => {
+      return {
+        workspaceId,
+        pages: workspacesMap[workspaceId],
+      }
+    })
+    return workspaces.map((workspace) => {
+      return {
+        workspace_id: workspace.workspaceId,
+        pages: workspace.pages.map((page) => {
+          const { page_id, page_name, page_icon, type } = page
+          return {
+            page_id,
+            page_name,
+            page_icon,
+            type,
+          }
+        }),
+      }
+    }) as NotionInfo[]
+  }
+
   const getFileIndexingEstimateParams = () => {
-    const params = {
-      file_id: file?.id,
-      dataset_id: datasetId,
-      indexing_technique: getIndexing_technique(),
-      process_rule: getProcessRule(),
+    let params
+    if (dataSourceType === DataSourceType.FILE) {
+      params = {
+        info_list: {
+          data_source_type: dataSourceType,
+          file_info_list: {
+            // TODO multi files
+            file_ids: [file?.id || ''],
+          },
+        },
+        indexing_technique: getIndexing_technique(),
+        process_rule: getProcessRule(),
+      }
+    }
+    if (dataSourceType === DataSourceType.NOTION) {
+      params = {
+        info_list: {
+          data_source_type: dataSourceType,
+          notion_info_list: getNotionInfo(),
+        },
+        indexing_technique: getIndexing_technique(),
+        process_rule: getProcessRule(),
+      }
     }
     return params
   }
@@ -190,13 +242,22 @@ const StepTwo = ({
     else {
       params = {
         data_source: {
-          type: 'upload_file',
-          info: file?.id,
-          name: file?.name,
+          type: dataSourceType,
+          info_list: {
+            data_source_type: dataSourceType,
+          },
         },
         indexing_technique: getIndexing_technique(),
         process_rule: getProcessRule(),
       } as CreateDocumentReq
+      if (dataSourceType === DataSourceType.FILE) {
+        params.data_source.info_list.file_info_list = {
+          // TODO multi files
+          file_ids: [file?.id || ''],
+        }
+      }
+      if (dataSourceType === DataSourceType.NOTION)
+        params.data_source.info_list.notion_info_list = getNotionInfo()
     }
     return params
   }
@@ -249,9 +310,7 @@ const StepTwo = ({
           body: params,
         })
         updateIndexingTypeCache && updateIndexingTypeCache(indexType)
-        updateResultCache && updateResultCache({
-          document: res,
-        })
+        updateResultCache && updateResultCache(res)
       }
       onStepChange && onStepChange(+1)
       isSetting && onSave && onSave()
@@ -319,7 +378,6 @@ const StepTwo = ({
         <div className={cn(s.form)}>
           <div className={s.label}>{t('datasetCreation.stepTwo.segmentation')}</div>
           <div className='max-w-[640px]'>
-
             <div
               className={cn(
                 s.radioItem,
@@ -467,16 +525,41 @@ const StepTwo = ({
                 <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
               </div>
             )}
-            <div className={s.file}>
-              <div className={s.fileContent}>
-                <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.fileName')}</div>
-                <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
-                  <span className={cn(s.fileIcon, file && s[file.extension])} />
-                  {getFileName(file?.name || '')}
-                </div>
+            {/* TODO multi files */}
+            <div className={s.source}>
+              <div className={s.sourceContent}>
+                {dataSourceType === DataSourceType.FILE && (
+                  <>
+                    <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.fileSource')}</div>
+                    <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
+                      <span className={cn(s.fileIcon, file && s[file.extension])} />
+                      {getFileName(file?.name || '')}
+                    </div>
+                  </>
+                )}
+                {dataSourceType === DataSourceType.NOTION && (
+                  <>
+                    <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.notionSource')}</div>
+                    <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
+                      <NotionIcon
+                        className='shrink-0 mr-1'
+                        type='page'
+                        src={notionPages[0]?.page_icon}
+                      />
+                      {notionPages[0]?.page_name}
+                      {notionPages.length > 1 && (
+                        <span className={s.sourceCount}>
+                          <span>{t('datasetCreation.stepTwo.other')}</span>
+                          <span>{notionPages.length - 1}</span>
+                          <span>{t('datasetCreation.stepTwo.notionUnit')}</span>
+                        </span>
+                      )}
+                    </div>
+                  </>
+                )}
               </div>
               <div className={s.divider} />
-              <div className={s.fileContent}>
+              <div className={s.segmentCount}>
                 <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.emstimateSegment')}</div>
                 <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
                   {

+ 26 - 16
web/app/components/datasets/documents/detail/index.tsx

@@ -8,15 +8,16 @@ import { useTranslation } from 'react-i18next'
 import { useRouter } from 'next/navigation'
 import { omit } from 'lodash-es'
 import cn from 'classnames'
-import Divider from '@/app/components/base/divider'
-import Loading from '@/app/components/base/loading'
-import { fetchDocumentDetail, MetadataType } from '@/service/datasets'
 import { OperationAction, StatusItem } from '../list'
+import s from '../style.module.css'
 import Completed from './completed'
 import Embedding from './embedding'
 import Metadata from './metadata'
-import s from '../style.module.css'
 import style from './style.module.css'
+import Divider from '@/app/components/base/divider'
+import Loading from '@/app/components/base/loading'
+import type { MetadataType } from '@/service/datasets'
+import { fetchDocumentDetail } from '@/service/datasets'
 
 export const BackCircleBtn: FC<{ onClick: () => void }> = ({ onClick }) => {
   return (
@@ -29,11 +30,11 @@ export const BackCircleBtn: FC<{ onClick: () => void }> = ({ onClick }) => {
 export const DocumentContext = createContext<{ datasetId?: string; documentId?: string }>({})
 
 type DocumentTitleProps = {
-  extension?: string;
-  name?: string;
-  iconCls?: string;
-  textCls?: string;
-  wrapperCls?: string;
+  extension?: string
+  name?: string
+  iconCls?: string
+  textCls?: string
+  wrapperCls?: string
 }
 
 export const DocumentTitle: FC<DocumentTitleProps> = ({ extension, name, iconCls, textCls, wrapperCls }) => {
@@ -58,15 +59,16 @@ const DocumentDetail: FC<Props> = ({ datasetId, documentId }) => {
     action: 'fetchDocumentDetail',
     datasetId,
     documentId,
-    params: { metadata: 'without' as MetadataType }
+    params: { metadata: 'without' as MetadataType },
   }, apiParams => fetchDocumentDetail(omit(apiParams, 'action')))
 
   const { data: documentMetadata, error: metadataErr, mutate: metadataMutate } = useSWR({
     action: 'fetchDocumentDetail',
     datasetId,
     documentId,
-    params: { metadata: 'only' as MetadataType }
-  }, apiParams => fetchDocumentDetail(omit(apiParams, 'action')))
+    params: { metadata: 'only' as MetadataType },
+  }, apiParams => fetchDocumentDetail(omit(apiParams, 'action')),
+  )
 
   const backToPrev = () => {
     router.push(`/datasets/${datasetId}/documents`)
@@ -77,6 +79,13 @@ const DocumentDetail: FC<Props> = ({ datasetId, documentId }) => {
 
   const embedding = ['queuing', 'indexing', 'paused'].includes((documentDetail?.display_status || '').toLowerCase())
 
+  const handleOperate = (operateName?: string) => {
+    if (operateName === 'delete')
+      backToPrev()
+    else
+      detailMutate()
+  }
+
   return (
     <DocumentContext.Provider value={{ datasetId, documentId }}>
       <div className='flex flex-col h-full'>
@@ -90,10 +99,10 @@ const DocumentDetail: FC<Props> = ({ datasetId, documentId }) => {
             detail={{
               enabled: documentDetail?.enabled || false,
               archived: documentDetail?.archived || false,
-              id: documentId
+              id: documentId,
             }}
             datasetId={datasetId}
-            onUpdate={detailMutate}
+            onUpdate={handleOperate}
             className='!w-[216px]'
           />
           <button
@@ -102,8 +111,9 @@ const DocumentDetail: FC<Props> = ({ datasetId, documentId }) => {
           />
         </div>
         <div className='flex flex-row flex-1' style={{ height: 'calc(100% - 4rem)' }}>
-          {isDetailLoading ? <Loading type='app' /> :
-            <div className={`box-border h-full w-full overflow-y-scroll ${embedding ? 'py-12 px-16' : 'pb-[30px] pt-3 px-6'}`}>
+          {isDetailLoading
+            ? <Loading type='app' />
+            : <div className={`box-border h-full w-full overflow-y-scroll ${embedding ? 'py-12 px-16' : 'pb-[30px] pt-3 px-6'}`}>
               {embedding ? <Embedding detail={documentDetail} /> : <Completed />}
             </div>
           }

+ 12 - 1
web/app/components/datasets/documents/detail/settings/index.tsx

@@ -1,5 +1,5 @@
 'use client'
-import React, { useEffect, useState } from 'react'
+import React, { useEffect, useMemo, useState } from 'react'
 import { useTranslation } from 'react-i18next'
 import { useBoolean } from 'ahooks'
 import { useContext } from 'use-context-selector'
@@ -43,6 +43,15 @@ const DocumentSettings = ({ datasetId, documentId }: DocumentSettingsProps) => {
   }, [])
 
   const [documentDetail, setDocumentDetail] = useState<FullDocumentDetail | null>(null)
+  const currentPage = useMemo(() => {
+    return {
+      workspace_id: documentDetail?.data_source_info.notion_workspace_id,
+      page_id: documentDetail?.data_source_info.notion_page_id,
+      page_name: documentDetail?.name,
+      page_icon: documentDetail?.data_source_info.notion_page_icon,
+      type: documentDetail?.data_source_info.type,
+    }
+  }, [documentDetail])
   useEffect(() => {
     (async () => {
       try {
@@ -71,6 +80,8 @@ const DocumentSettings = ({ datasetId, documentId }: DocumentSettingsProps) => {
             hasSetAPIKEY={hasSetAPIKEY}
             onSetting={showSetAPIKey}
             datasetId={datasetId}
+            dataSourceType={documentDetail.data_source_type}
+            notionPages={[currentPage]}
             indexingType={indexingTechnique || ''}
             isSetting
             documentDetail={documentDetail}

+ 118 - 12
web/app/components/datasets/documents/index.tsx

@@ -4,7 +4,7 @@ import React, { useMemo, useState } from 'react'
 import useSWR from 'swr'
 import { useTranslation } from 'react-i18next'
 import { useRouter } from 'next/navigation'
-import { debounce, omit } from 'lodash-es'
+import { debounce, groupBy, omit } from 'lodash-es'
 // import Link from 'next/link'
 import { PlusIcon } from '@heroicons/react/24/solid'
 import List from './list'
@@ -14,7 +14,12 @@ import Button from '@/app/components/base/button'
 import Input from '@/app/components/base/input'
 import Pagination from '@/app/components/base/pagination'
 import { get } from '@/service/base'
-import { fetchDocuments } from '@/service/datasets'
+import { createDocument, fetchDocuments } from '@/service/datasets'
+import { useDatasetDetailContext } from '@/context/dataset-detail'
+import { NotionPageSelectorModal } from '@/app/components/base/notion-page-selector'
+import type { DataSourceNotionPage } from '@/models/common'
+import type { CreateDocumentReq } from '@/models/datasets'
+import { DataSourceType } from '@/models/datasets'
 
 // Custom page count is not currently supported.
 const limit = 15
@@ -75,20 +80,63 @@ const Documents: FC<IDocumentsProps> = ({ datasetId }) => {
   const [searchValue, setSearchValue] = useState<string>('')
   const [currPage, setCurrPage] = React.useState<number>(0)
   const router = useRouter()
+  const { dataset } = useDatasetDetailContext()
+  const [notionPageSelectorModalVisible, setNotionPageSelectorModalVisible] = useState(false)
+  const [timerCanRun, setTimerCanRun] = useState(true)
+  const isDataSourceNotion = dataset?.data_source_type === DataSourceType.NOTION
 
   const query = useMemo(() => {
-    return { page: currPage + 1, limit, keyword: searchValue }
-  }, [searchValue, currPage])
-
-  const { data: documentsRes, error, mutate } = useSWR({
-    action: 'fetchDocuments',
-    datasetId,
-    params: query,
-  }, apiParams => fetchDocuments(omit(apiParams, 'action')))
+    return { page: currPage + 1, limit, keyword: searchValue, fetch: isDataSourceNotion ? true : '' }
+  }, [searchValue, currPage, isDataSourceNotion])
+
+  const { data: documentsRes, error, mutate } = useSWR(
+    {
+      action: 'fetchDocuments',
+      datasetId,
+      params: query,
+    },
+    apiParams => fetchDocuments(omit(apiParams, 'action')),
+    { refreshInterval: (isDataSourceNotion && timerCanRun) ? 2500 : 0 },
+  )
 
+  const documentsWithProgress = useMemo(() => {
+    let completedNum = 0
+    let percent = 0
+    const documentsData = documentsRes?.data?.map((documentItem) => {
+      const { indexing_status, completed_segments, total_segments } = documentItem
+      const isEmbeddinged = indexing_status === 'completed' || indexing_status === 'paused' || indexing_status === 'error'
+
+      if (isEmbeddinged)
+        completedNum++
+
+      const completedCount = completed_segments || 0
+      const totalCount = total_segments || 0
+      if (totalCount === 0 && completedCount === 0) {
+        percent = isEmbeddinged ? 100 : 0
+      }
+      else {
+        const per = Math.round(completedCount * 100 / totalCount)
+        percent = per > 100 ? 100 : per
+      }
+      return {
+        ...documentItem,
+        percent,
+      }
+    })
+    if (completedNum === documentsRes?.data?.length)
+      setTimerCanRun(false)
+    return {
+      ...documentsRes,
+      data: documentsData,
+    }
+  }, [documentsRes])
   const total = documentsRes?.total || 0
 
   const routeToDocCreate = () => {
+    if (isDataSourceNotion) {
+      setNotionPageSelectorModalVisible(true)
+      return
+    }
     router.push(`/datasets/${datasetId}/documents/create`)
   }
 
@@ -96,6 +144,54 @@ const Documents: FC<IDocumentsProps> = ({ datasetId }) => {
 
   const isLoading = !documentsRes && !error
 
+  const handleSaveNotionPageSelected = async (selectedPages: (DataSourceNotionPage & { workspace_id: string })[]) => {
+    const workspacesMap = groupBy(selectedPages, 'workspace_id')
+    const workspaces = Object.keys(workspacesMap).map((workspaceId) => {
+      return {
+        workspaceId,
+        pages: workspacesMap[workspaceId],
+      }
+    })
+    const params = {
+      data_source: {
+        type: dataset?.data_source_type,
+        info_list: {
+          data_source_type: dataset?.data_source_type,
+          notion_info_list: workspaces.map((workspace) => {
+            return {
+              workspace_id: workspace.workspaceId,
+              pages: workspace.pages.map((page) => {
+                const { page_id, page_name, page_icon, type } = page
+                return {
+                  page_id,
+                  page_name,
+                  page_icon,
+                  type,
+                }
+              }),
+            }
+          }),
+        },
+      },
+      indexing_technique: dataset?.indexing_technique,
+      process_rule: {
+        rules: {},
+        mode: 'automatic',
+      },
+    } as CreateDocumentReq
+
+    await createDocument({
+      datasetId,
+      body: params,
+    })
+    mutate()
+    setTimerCanRun(true)
+    // mutateDatasetIndexingStatus(undefined, { revalidate: true })
+    setNotionPageSelectorModalVisible(false)
+  }
+
+  const documentsList = isDataSourceNotion ? documentsWithProgress?.data : documentsRes?.data
+
   return (
     <div className='flex flex-col h-full overflow-y-auto'>
       <div className='flex flex-col justify-center gap-1 px-6 pt-4'>
@@ -113,19 +209,29 @@ const Documents: FC<IDocumentsProps> = ({ datasetId }) => {
           />
           <Button type='primary' onClick={routeToDocCreate} className='!h-8 !text-[13px]'>
             <PlusIcon className='h-4 w-4 mr-2 stroke-current' />
-            {t('datasetDocuments.list.addFile')}
+            {
+              isDataSourceNotion
+                ? t('datasetDocuments.list.addPages')
+                : t('datasetDocuments.list.addFile')
+            }
           </Button>
         </div>
         {isLoading
           ? <Loading type='app' />
           : total > 0
-            ? <List documents={documentsRes?.data || []} datasetId={datasetId} onUpdate={mutate} />
+            ? <List documents={documentsList || []} datasetId={datasetId} onUpdate={mutate} />
             : <EmptyElement onClick={routeToDocCreate} />
         }
         {/* Show Pagination only if the total is more than the limit */}
         {(total && total > limit)
           ? <Pagination current={currPage} onChange={setCurrPage} total={total} limit={limit} />
           : null}
+        <NotionPageSelectorModal
+          isShow={notionPageSelectorModalVisible}
+          onClose={() => setNotionPageSelectorModalVisible(false)}
+          onSave={handleSaveNotionPageSelected}
+          datasetId={dataset?.id || ''}
+        />
       </div>
     </div>
   )

+ 45 - 16
web/app/components/datasets/documents/list.tsx

@@ -22,8 +22,10 @@ import type { IndicatorProps } from '@/app/components/header/indicator'
 import Indicator from '@/app/components/header/indicator'
 import { asyncRunSafe } from '@/utils'
 import { formatNumber } from '@/utils/format'
-import { archiveDocument, deleteDocument, disableDocument, enableDocument } from '@/service/datasets'
-import type { DocumentDisplayStatus, DocumentListResponse } from '@/models/datasets'
+import { archiveDocument, deleteDocument, disableDocument, enableDocument, syncDocument } from '@/service/datasets'
+import NotionIcon from '@/app/components/base/notion-icon'
+import ProgressBar from '@/app/components/base/progress-bar'
+import { DataSourceType, type DocumentDisplayStatus, type SimpleDocumentDetail } from '@/models/datasets'
 import type { CommonResponse } from '@/models/common'
 
 export const SettingsIcon: FC<{ className?: string }> = ({ className }) => {
@@ -32,6 +34,12 @@ export const SettingsIcon: FC<{ className?: string }> = ({ className }) => {
   </svg>
 }
 
+export const SyncIcon: FC<{ className?: string }> = () => {
+  return <svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+    <path d="M5.69773 13.1783C7.29715 13.8879 9.20212 13.8494 10.8334 12.9075C13.5438 11.3427 14.4724 7.87704 12.9076 5.16672L12.7409 4.87804M3.09233 10.8335C1.52752 8.12314 2.45615 4.65746 5.16647 3.09265C6.7978 2.15081 8.70277 2.11227 10.3022 2.82185M1.66226 10.8892L3.48363 11.3773L3.97166 9.5559M12.0284 6.44393L12.5164 4.62256L14.3378 5.1106" stroke="#667085" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+  </svg>
+}
+
 export const FilePlusIcon: FC<{ className?: string }> = ({ className }) => {
   return <svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg" className={className ?? ''}>
     <path d="M13.3332 6.99992V4.53325C13.3332 3.41315 13.3332 2.85309 13.1152 2.42527C12.9234 2.04895 12.6175 1.74299 12.2412 1.55124C11.8133 1.33325 11.2533 1.33325 10.1332 1.33325H5.8665C4.7464 1.33325 4.18635 1.33325 3.75852 1.55124C3.3822 1.74299 3.07624 2.04895 2.88449 2.42527C2.6665 2.85309 2.6665 3.41315 2.6665 4.53325V11.4666C2.6665 12.5867 2.6665 13.1467 2.88449 13.5746C3.07624 13.9509 3.3822 14.2569 3.75852 14.4486C4.18635 14.6666 4.7464 14.6666 5.8665 14.6666H7.99984M9.33317 7.33325H5.33317M6.6665 9.99992H5.33317M10.6665 4.66659H5.33317M11.9998 13.9999V9.99992M9.99984 11.9999H13.9998" stroke="#667085" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" />
@@ -77,7 +85,7 @@ export const StatusItem: FC<{
   </div>
 }
 
-type OperationName = 'delete' | 'archive' | 'enable' | 'disable'
+type OperationName = 'delete' | 'archive' | 'enable' | 'disable' | 'sync'
 
 // operation action for list and detail
 export const OperationAction: FC<{
@@ -85,13 +93,14 @@ export const OperationAction: FC<{
     enabled: boolean
     archived: boolean
     id: string
+    data_source_type: string
   }
   datasetId: string
-  onUpdate: () => void
+  onUpdate: (operationName?: string) => void
   scene?: 'list' | 'detail'
   className?: string
 }> = ({ datasetId, detail, onUpdate, scene = 'list', className = '' }) => {
-  const { id, enabled = false, archived = false } = detail || {}
+  const { id, enabled = false, archived = false, data_source_type } = detail || {}
   const [showModal, setShowModal] = useState(false)
   const { notify } = useContext(ToastContext)
   const { t } = useTranslation()
@@ -111,6 +120,9 @@ export const OperationAction: FC<{
       case 'disable':
         opApi = disableDocument
         break
+      case 'sync':
+        opApi = syncDocument
+        break
       default:
         opApi = deleteDocument
         break
@@ -120,7 +132,7 @@ export const OperationAction: FC<{
       notify({ type: 'success', message: t('common.actionMsg.modifiedSuccessfully') })
     else
       notify({ type: 'error', message: t('common.actionMsg.modificationFailed') })
-    onUpdate()
+    onUpdate(operationName)
   }
 
   return <div
@@ -173,10 +185,14 @@ export const OperationAction: FC<{
                 <SettingsIcon />
                 <span className={s.actionName}>{t('datasetDocuments.list.action.settings')}</span>
               </div>
-              {/* <div className={s.actionItem} onClick={() => router.push(`/datasets/${datasetId}/documents/create`)}>
-                <FilePlusIcon />
-                <span className={s.actionName}>{t('datasetDocuments.list.action.uploadFile')}</span>
-              </div> */}
+              {
+                data_source_type === 'notion_import' && (
+                  <div className={s.actionItem} onClick={() => onOperate('sync')}>
+                    <SyncIcon />
+                    <span className={s.actionName}>{t('datasetDocuments.list.action.sync')}</span>
+                  </div>
+                )
+              }
               <Divider className='my-1' />
             </>
           )}
@@ -236,8 +252,9 @@ const renderCount = (count: number | undefined) => {
   return `${formatNumber((count / 1000).toFixed(1))}k`
 }
 
+type LocalDoc = SimpleDocumentDetail & { percent?: number }
 type IDocumentListProps = {
-  documents: DocumentListResponse['data']
+  documents: LocalDoc[]
   datasetId: string
   onUpdate: () => void
 }
@@ -248,7 +265,7 @@ type IDocumentListProps = {
 const DocumentList: FC<IDocumentListProps> = ({ documents = [], datasetId, onUpdate }) => {
   const { t } = useTranslation()
   const router = useRouter()
-  const [localDocs, setLocalDocs] = useState<DocumentListResponse['data']>(documents)
+  const [localDocs, setLocalDocs] = useState<LocalDoc[]>(documents)
   const [enableSort, setEnableSort] = useState(false)
 
   useEffect(() => {
@@ -296,8 +313,16 @@ const DocumentList: FC<IDocumentListProps> = ({ documents = [], datasetId, onUpd
               }}>
               <td className='text-left align-middle text-gray-500 text-xs'>{doc.position}</td>
               <td className={s.tdValue}>
-                <div className={cn(s[`${doc?.data_source_info?.upload_file?.extension ?? suffix}Icon`], s.commonIcon, 'mr-1.5')}></div>
-                <span>{doc?.name?.replace(/\.[^/.]+$/, '')}<span className='text-gray-500'>.{suffix}</span></span>
+                {
+                  doc?.data_source_type === DataSourceType.NOTION
+                    ? <NotionIcon className='inline-flex -mt-[3px] mr-1.5 align-middle' type='page' src={doc.data_source_info.notion_page_icon} />
+                    : <div className={cn(s[`${doc?.data_source_info?.upload_file?.extension ?? suffix}Icon`], s.commonIcon, 'mr-1.5')}></div>
+                }
+                {
+                  doc.data_source_type === DataSourceType.NOTION
+                    ? <span>{doc.name}</span>
+                    : <span>{doc?.name?.replace(/\.[^/.]+$/, '')}<span className='text-gray-500'>.{suffix}</span></span>
+                }
               </td>
               <td>{renderCount(doc.word_count)}</td>
               <td>{renderCount(doc.hit_count)}</td>
@@ -305,12 +330,16 @@ const DocumentList: FC<IDocumentListProps> = ({ documents = [], datasetId, onUpd
                 {dayjs.unix(doc.created_at).format(t('datasetHitTesting.dateTimeFormat') as string)}
               </td>
               <td>
-                <StatusItem status={doc.display_status} />
+                {
+                  (['indexing', 'splitting', 'parsing', 'cleaning'].includes(doc.indexing_status) && doc?.data_source_type === DataSourceType.NOTION)
+                    ? <ProgressBar percent={doc.percent || 0} />
+                    : <StatusItem status={doc.display_status} />
+                }
               </td>
               <td>
                 <OperationAction
                   datasetId={datasetId}
-                  detail={pick(doc, ['enabled', 'archived', 'id'])}
+                  detail={pick(doc, ['enabled', 'archived', 'id', 'data_source_type'])}
                   onUpdate={onUpdate}
                 />
               </td>

+ 3 - 0
web/app/components/datasets/documents/style.module.css

@@ -75,6 +75,9 @@
 .markdownIcon {
   background-image: url(./assets/md.svg);
 }
+.mdIcon {
+  background-image: url(./assets/md.svg);
+}
 .xlsIcon {
   background-image: url(./assets/xlsx.svg);
 }

+ 1 - 1
web/app/components/header/account-dropdown/index.tsx

@@ -9,9 +9,9 @@ import { Menu, Transition } from '@headlessui/react'
 import Indicator from '../indicator'
 import AccountSetting from '../account-setting'
 import AccountAbout from '../account-about'
+import WorkplaceSelector from './workplace-selector'
 import type { LangGeniusVersionResponse, UserProfileResponse } from '@/models/common'
 import I18n from '@/context/i18n'
-import WorkplaceSelector from './workplace-selector'
 import Avatar from '@/app/components/base/avatar'
 
 type IAppSelectorProps = {

+ 102 - 0
web/app/components/header/account-setting/data-source-page/data-source-notion/index.tsx

@@ -0,0 +1,102 @@
+import { useTranslation } from 'react-i18next'
+import Link from 'next/link'
+import { PlusIcon } from '@heroicons/react/24/solid'
+import cn from 'classnames'
+import Indicator from '../../../indicator'
+import Operate from './operate'
+import s from './style.module.css'
+import NotionIcon from '@/app/components/base/notion-icon'
+import { apiPrefix } from '@/config'
+import type { DataSourceNotion as TDataSourceNotion } from '@/models/common'
+
+type DataSourceNotionProps = {
+  workspaces: TDataSourceNotion[]
+}
+const DataSourceNotion = ({
+  workspaces,
+}: DataSourceNotionProps) => {
+  const { t } = useTranslation()
+  const connected = !!workspaces.length
+
+  return (
+    <div className='mb-2 border-[0.5px] border-gray-200 bg-gray-50 rounded-xl'>
+      <div className='flex items-center px-3 py-[9px]'>
+        <div className={cn(s['notion-icon'], 'w-8 h-8 mr-3 border border-gray-100 rounded-lg')} />
+        <div className='grow'>
+          <div className='leading-5 text-sm font-medium text-gray-800'>
+            {t('common.dataSource.notion.title')}
+          </div>
+          {
+            !connected && (
+              <div className='leading-5 text-xs text-gray-500'>
+                {t('common.dataSource.notion.description')}
+              </div>
+            )
+          }
+        </div>
+        {
+          !connected
+            ? (
+              <Link
+                className='flex items-center ml-3 px-3 h-7 bg-white border border-gray-200 rounded-md text-xs font-medium text-gray-700 cursor-pointer'
+                href={`${apiPrefix}/oauth/data-source/notion`}>
+                {t('common.dataSource.connect')}
+              </Link>
+            )
+            : (
+              <Link
+                href={`${apiPrefix}/oauth/data-source/notion`}
+                className='flex items-center px-3 h-7 bg-white border-[0.5px] border-gray-200 text-xs font-medium text-primary-600 rounded-md cursor-pointer'>
+                <PlusIcon className='w-[14px] h-[14px] mr-[5px]' />
+                {t('common.dataSource.notion.addWorkspace')}
+              </Link>
+            )
+        }
+      </div>
+      {
+        connected && (
+          <div className='flex items-center px-3 h-[18px]'>
+            <div className='text-xs font-medium text-gray-500'>
+              {t('common.dataSource.notion.connectedWorkspace')}
+            </div>
+            <div className='grow ml-3 border-t border-t-gray-100' />
+          </div>
+        )
+      }
+      {
+        connected && (
+          <div className='px-3 pt-2 pb-3'>
+            {
+              workspaces.map(workspace => (
+                <div className={cn(s['workspace-item'], 'flex items-center mb-1 py-1 pr-1 bg-white rounded-lg')} key={workspace.id}>
+                  <NotionIcon
+                    className='ml-3 mr-[6px]'
+                    src={workspace.source_info.workspace_icon}
+                    name={workspace.source_info.workspace_name}
+                  />
+                  <div className='grow py-[7px] leading-[18px] text-[13px] font-medium text-gray-700 truncate' title={workspace.source_info.workspace_name}>{workspace.source_info.workspace_name}</div>
+                  {
+                    workspace.is_bound
+                      ? <Indicator className='shrink-0 mr-[6px]' />
+                      : <Indicator className='shrink-0 mr-[6px]' color='yellow' />
+                  }
+                  <div className='shrink-0 mr-3 text-xs font-medium'>
+                    {
+                      workspace.is_bound
+                        ? t('common.dataSource.notion.connected')
+                        : t('common.dataSource.notion.disconnected')
+                    }
+                  </div>
+                  <div className='mr-2 w-[1px] h-3 bg-gray-100' />
+                  <Operate workspace={workspace} />
+                </div>
+              ))
+            }
+          </div>
+        )
+      }
+    </div>
+  )
+}
+
+export default DataSourceNotion

+ 14 - 0
web/app/components/header/account-setting/data-source-page/data-source-notion/operate/index.module.css

@@ -0,0 +1,14 @@
+/* Icon classes for the Notion workspace "Operate" dropdown items.
+   Each renders a shared header SVG asset as a contained, centered
+   background; the element's box (w-4 h-4 in the TSX) sets the size. */
+.file-icon {
+  background: url(../../../../assets/file.svg) center center no-repeat;
+  background-size: contain;
+}
+
+/* Sync (re-fetch authorized pages) menu item icon. */
+.sync-icon {
+  background: url(../../../../assets/sync.svg) center center no-repeat;
+  background-size: contain;
+}
+
+/* Remove (disable integration) menu item icon. */
+.trash-icon {
+  background: url(../../../../assets/trash.svg) center center no-repeat;
+  background-size: contain;
+}

+ 107 - 0
web/app/components/header/account-setting/data-source-page/data-source-notion/operate/index.tsx

@@ -0,0 +1,107 @@
+'use client'
+import { useTranslation } from 'react-i18next'
+import { Fragment } from 'react'
+import Link from 'next/link'
+import { useSWRConfig } from 'swr'
+import { EllipsisHorizontalIcon } from '@heroicons/react/24/solid'
+import { Menu, Transition } from '@headlessui/react'
+import cn from 'classnames'
+import s from './index.module.css'
+import { apiPrefix } from '@/config'
+import { syncDataSourceNotion, updateDataSourceNotionAction } from '@/service/common'
+import Toast from '@/app/components/base/toast'
+import type { DataSourceNotion } from '@/models/common'
+
+type OperateProps = {
+  // The bound Notion workspace this dropdown menu operates on.
+  workspace: DataSourceNotion
+}
+// Per-workspace "..." dropdown for the Notion data-source card:
+// change authorized pages (navigates to the OAuth flow), sync pages,
+// or remove (disable) the integration.
+export default function Operate({
+  workspace,
+}: OperateProps) {
+  const itemClassName = `
+    flex px-3 py-2 hover:bg-gray-50 text-sm text-gray-700
+    cursor-pointer
+  `
+  const itemIconClassName = `
+  mr-2 mt-[2px] w-4 h-4
+  `
+  const { t } = useTranslation()
+  const { mutate } = useSWRConfig()
+
+  // Show a success toast, then revalidate the integrations list so the
+  // workspace cards re-render with fresh data.
+  // NOTE(review): this object key must match the useSWR key used by
+  // DataSourcePage ({ url: 'data-source/integrates' }) — confirm if either changes.
+  const updateIntegrates = () => {
+    Toast.notify({
+      type: 'success',
+      message: t('common.api.success'),
+    })
+    mutate({ url: 'data-source/integrates' })
+  }
+  // Trigger a server-side re-sync of this workspace's authorized pages.
+  const handleSync = async () => {
+    await syncDataSourceNotion({ url: `/oauth/data-source/notion/${workspace.id}/sync` })
+    updateIntegrates()
+  }
+  // Disable (unbind) this Notion integration via PATCH, then refresh the list.
+  const handleRemove = async () => {
+    await updateDataSourceNotionAction({ url: `/data-source/integrates/${workspace.id}/disable` })
+    updateIntegrates()
+  }
+
+  // Headless UI render-prop form: `open` styles the trigger while the menu is shown.
+  return (
+    <Menu as="div" className="relative inline-block text-left">
+      {
+        ({ open }) => (
+          <>
+            <Menu.Button className={`flex items-center justify-center w-8 h-8 rounded-lg hover:bg-gray-100 ${open && 'bg-gray-100'}`}>
+              <EllipsisHorizontalIcon className='w-4 h-4' />
+            </Menu.Button>
+            <Transition
+              as={Fragment}
+              enter="transition ease-out duration-100"
+              enterFrom="transform opacity-0 scale-95"
+              enterTo="transform opacity-100 scale-100"
+              leave="transition ease-in duration-75"
+              leaveFrom="transform opacity-100 scale-100"
+              leaveTo="transform opacity-0 scale-95"
+            >
+              <Menu.Items
+                className="
+                  absolute right-0 top-9 w-60 max-w-80
+                  divide-y divide-gray-100 origin-top-right rounded-lg bg-white
+                  shadow-[0_10px_15px_-3px_rgba(0,0,0,0.1),0_4px_6px_rgba(0,0,0,0.05)]
+                "
+              >
+                <div className="px-1 py-1">
+                  {/* Full-page navigation into the Notion OAuth flow to re-pick pages. */}
+                  <Menu.Item>
+                    <Link
+                      className={itemClassName}
+                      href={`${apiPrefix}/oauth/data-source/notion`}>
+                      <div className={cn(s['file-icon'], itemIconClassName)}></div>
+                      <div>
+                        <div className='leading-5'>{t('common.dataSource.notion.changeAuthorizedPages')}</div>
+                        <div className='leading-5 text-xs text-gray-500'>
+                          {workspace.source_info.total} {t('common.dataSource.notion.pagesAuthorized')}
+                        </div>
+                      </div>
+                    </Link>
+                  </Menu.Item>
+                  <Menu.Item>
+                    <div className={itemClassName} onClick={handleSync}>
+                      <div className={cn(s['sync-icon'], itemIconClassName)} />
+                      <div className='leading-5'>{t('common.dataSource.notion.sync')}</div>
+                    </div>
+                  </Menu.Item>
+                </div>
+                {/* Destructive action kept in its own divided section. */}
+                <Menu.Item>
+                  <div className='p-1'>
+                    <div className={itemClassName} onClick={handleRemove}>
+                      <div className={cn(s['trash-icon'], itemIconClassName)} />
+                      <div className='leading-5'>{t('common.dataSource.notion.remove')}</div>
+                    </div>
+                  </div>
+                </Menu.Item>
+              </Menu.Items>
+            </Transition>
+          </>
+        )
+      }
+    </Menu>
+  )
+}

+ 12 - 0
web/app/components/header/account-setting/data-source-page/data-source-notion/style.module.css

@@ -0,0 +1,12 @@
+/* Notion logo badge on the data-source card: white tile with the
+   20x20 Notion SVG centered inside. */
+.notion-icon {
+  background: #ffffff url(../../../assets/notion.svg) center center no-repeat;
+  background-size: 20px 20px;
+}
+
+/* Subtle elevation for each connected-workspace row. */
+.workspace-item {
+  box-shadow: 0px 1px 2px rgba(16, 24, 40, 0.05);
+}
+
+/* Rows carry mb-1 from the TSX; drop the trailing gap on the last row. */
+.workspace-item:last-of-type {
+  margin-bottom: 0;
+}

+ 0 - 0
web/app/components/header/account-setting/data-source-page/index.module.css


+ 17 - 0
web/app/components/header/account-setting/data-source-page/index.tsx

@@ -0,0 +1,17 @@
+import useSWR from 'swr'
+import { useTranslation } from 'react-i18next'
+import DataSourceNotion from './data-source-notion'
+import { fetchDataSource } from '@/service/common'
+
+// Account-settings "Data Source" panel. Fetches every configured
+// data-source integration and renders the Notion section with the
+// workspaces whose provider is 'notion'.
+export default function DataSourcePage() {
+  const { t } = useTranslation()
+  // Object SWR key; Operate revalidates this same key after sync/remove.
+  const { data } = useSWR({ url: 'data-source/integrates' }, fetchDataSource)
+  // Only Notion is surfaced for now; other providers are filtered out.
+  const notionWorkspaces = data?.data.filter(item => item.provider === 'notion') || []
+
+  return (
+    <div className='mb-8'>
+      <div className='mb-2 text-sm font-medium text-gray-900'>{t('common.dataSource.add')}</div>
+      <DataSourceNotion workspaces={notionWorkspaces} />
+    </div>
+  )
+}

+ 10 - 0
web/app/components/header/account-setting/index.module.css

@@ -2,4 +2,14 @@
   max-width: 720px !important;
   padding: 0 !important;
   overflow-y: auto;
+}
+
+.data-source-icon {
+  background: url(../assets/data-source.svg) center center no-repeat;
+  background-size: cover;
+}
+
+.data-source-solid-icon {
+  background: url(../assets/data-source-blue.svg) center center no-repeat;
+  background-size: cover;
 }

+ 26 - 5
web/app/components/header/account-setting/index.tsx

@@ -1,20 +1,32 @@
 'use client'
 import { useTranslation } from 'react-i18next'
 import { useState } from 'react'
-import { AtSymbolIcon, GlobeAltIcon, UserIcon, XMarkIcon, CubeTransparentIcon, UsersIcon } from '@heroicons/react/24/outline'
+import { AtSymbolIcon, CubeTransparentIcon, GlobeAltIcon, UserIcon, UsersIcon, XMarkIcon } from '@heroicons/react/24/outline'
 import { GlobeAltIcon as GlobalAltIconSolid, UserIcon as UserIconSolid, UsersIcon as UsersIconSolid } from '@heroicons/react/24/solid'
+import cn from 'classnames'
 import AccountPage from './account-page'
 import MembersPage from './members-page'
 import IntegrationsPage from './Integrations-page'
 import LanguagePage from './language-page'
 import ProviderPage from './provider-page'
+import DataSourcePage from './data-source-page'
 import s from './index.module.css'
 import Modal from '@/app/components/base/modal'
 
 const iconClassName = `
-  w-[18px] h-[18px] ml-3 mr-2
+  w-4 h-4 ml-3 mr-2
 `
 
+type IconProps = {
+  className?: string
+}
+const DataSourceIcon = ({ className }: IconProps) => (
+  <div className={cn(s['data-source-icon'], className)} />
+)
+const DataSourceSolidIcon = ({ className }: IconProps) => (
+  <div className={cn(s['data-source-solid-icon'], className)} />
+)
+
 type IAccountSettingProps = {
   onCancel: () => void
   activeTab?: string
@@ -48,7 +60,7 @@ export default function AccountSetting({
           icon: <GlobeAltIcon className={iconClassName} />,
           activeIcon: <GlobalAltIconSolid className={iconClassName} />,
         },
-      ]
+      ],
     },
     {
       key: 'workspace-group',
@@ -66,8 +78,14 @@ export default function AccountSetting({
           icon: <CubeTransparentIcon className={iconClassName} />,
           activeIcon: <CubeTransparentIcon className={iconClassName} />,
         },
-      ]
-    }
+        {
+          key: 'data-source',
+          name: t('common.settings.dataSource'),
+          icon: <DataSourceIcon className={iconClassName} />,
+          activeIcon: <DataSourceSolidIcon className={iconClassName} />,
+        },
+      ],
+    },
   ]
 
   return (
@@ -126,6 +144,9 @@ export default function AccountSetting({
           {
             activeMenu === 'provider' && <ProviderPage />
           }
+          {
+            activeMenu === 'data-source' && <DataSourcePage />
+          }
         </div>
       </div>
     </Modal>

+ 9 - 9
web/app/components/header/account-setting/members-page/index.tsx

@@ -1,19 +1,19 @@
 'use client'
 import { useState } from 'react'
-import s from './index.module.css'
 import cn from 'classnames'
 import useSWR from 'swr'
 import dayjs from 'dayjs'
 import 'dayjs/locale/zh-cn'
 import relativeTime from 'dayjs/plugin/relativeTime'
-import I18n from '@/context/i18n'
 import { useContext } from 'use-context-selector'
-import { fetchMembers } from '@/service/common'
 import { UserPlusIcon } from '@heroicons/react/24/outline'
 import { useTranslation } from 'react-i18next'
+import s from './index.module.css'
 import InviteModal from './invite-modal'
 import InvitedModal from './invited-modal'
 import Operation from './operation'
+import { fetchMembers } from '@/service/common'
+import I18n from '@/context/i18n'
 import { useAppContext } from '@/context/app-context'
 import Avatar from '@/app/components/base/avatar'
 import { useWorkspacesContext } from '@/context/workspace-context'
@@ -35,18 +35,18 @@ const MembersPage = () => {
   const owner = accounts.filter(account => account.role === 'owner')?.[0]?.email === userProfile.email
   const { workspaces } = useWorkspacesContext()
   const currentWrokspace = workspaces.filter(item => item.current)?.[0]
-  
+
   return (
     <>
       <div>
         <div className='flex items-center mb-4 p-3 bg-gray-50 rounded-2xl'>
           <div className={cn(s['logo-icon'], 'shrink-0')}></div>
           <div className='grow mx-2'>
-            <div className='text-sm font-medium text-gray-900'>{currentWrokspace.name}</div>
+            <div className='text-sm font-medium text-gray-900'>{currentWrokspace?.name}</div>
             <div className='text-xs text-gray-500'>{t('common.userProfile.workspace')}</div>
           </div>
           <div className='
-            shrink-0 flex items-center py-[7px] px-3 border-[0.5px] border-gray-200 
+            shrink-0 flex items-center py-[7px] px-3 border-[0.5px] border-gray-200
             text-[13px] font-medium text-primary-600 bg-white
             shadow-[0_1px_2px_rgba(16,24,40,0.05)] rounded-lg cursor-pointer
           ' onClick={() => setInviteModalVisible(true)}>
@@ -78,10 +78,10 @@ const MembersPage = () => {
                   <div className='shrink-0 flex items-center w-[104px] py-2 text-[13px] text-gray-700'>{dayjs(Number((account.last_login_at || account.created_at)) * 1000).locale(locale === 'zh-Hans' ? 'zh-cn' : 'en').fromNow()}</div>
                   <div className='shrink-0 w-[96px] flex items-center'>
                     {
-                      owner && account.role !== 'owner'
+                      (owner && account.role !== 'owner')
                         ? <Operation member={account} onOperate={() => mutate()} />
                         : <div className='px-3 text-[13px] text-gray-700'>{RoleMap[account.role] || RoleMap.normal}</div>
-                    } 
+                    }
                   </div>
                 </div>
               ))
@@ -111,4 +111,4 @@ const MembersPage = () => {
   )
 }
 
-export default MembersPage
+export default MembersPage

File diff suppressed because it is too large
+ 1 - 0
web/app/components/header/assets/data-source-blue.svg


+ 3 - 0
web/app/components/header/assets/data-source.svg

@@ -0,0 +1,3 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M9.33333 13.3333C9.33333 14.0696 8.73638 14.6666 8 14.6666C7.26362 14.6666 6.66667 14.0696 6.66667 13.3333M9.33333 13.3333C9.33333 12.5969 8.73638 11.9999 8 11.9999M9.33333 13.3333H14M6.66667 13.3333C6.66667 12.5969 7.26362 11.9999 8 11.9999M6.66667 13.3333H2M8 11.9999V9.33325M14 3.33325C14 4.43782 11.3137 5.33325 8 5.33325C4.68629 5.33325 2 4.43782 2 3.33325M14 3.33325C14 2.22868 11.3137 1.33325 8 1.33325C4.68629 1.33325 2 2.22868 2 3.33325M14 3.33325V7.33325C14 8.43992 11.3333 9.33325 8 9.33325M2 3.33325V7.33325C2 8.43992 4.66667 9.33325 8 9.33325" stroke="#344054" stroke-width="1.25" stroke-linecap="round" stroke-linejoin="round"/>
+</svg>

+ 3 - 0
web/app/components/header/assets/file.svg

@@ -0,0 +1,3 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M13.3333 6.99992V4.53325C13.3333 3.41315 13.3333 2.85309 13.1153 2.42527C12.9236 2.04895 12.6176 1.74299 12.2413 1.55124C11.8135 1.33325 11.2534 1.33325 10.1333 1.33325H5.86666C4.74655 1.33325 4.1865 1.33325 3.75868 1.55124C3.38235 1.74299 3.07639 2.04895 2.88464 2.42527C2.66666 2.85309 2.66666 3.41315 2.66666 4.53325V11.4666C2.66666 12.5867 2.66666 13.1467 2.88464 13.5746C3.07639 13.9509 3.38235 14.2569 3.75868 14.4486C4.1865 14.6666 4.74655 14.6666 5.86666 14.6666H7.99999M9.33332 7.33325H5.33332M6.66666 9.99992H5.33332M10.6667 4.66659H5.33332M12 13.9999V9.99992M9.99999 11.9999H14" stroke="#667085" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+</svg>

+ 12 - 0
web/app/components/header/assets/notion.svg

@@ -0,0 +1,12 @@
+<svg width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g clip-path="url(#clip0_5364_42310)">
+<path fill-rule="evenodd" clip-rule="evenodd" d="M3.5725 18.2611L1.4229 15.5832C0.905706 14.9389 0.625 14.1466 0.625 13.3312V3.63437C0.625 2.4129 1.60224 1.39936 2.86295 1.31328L12.8326 0.632614C13.5569 0.583164 14.2768 0.775682 14.8717 1.17794L18.3745 3.5462C19.0015 3.97012 19.375 4.66312 19.375 5.40266V16.427C19.375 17.6223 18.4141 18.6121 17.1798 18.688L6.11458 19.3692C5.12958 19.4298 4.17749 19.0148 3.5725 18.2611Z" fill="white"/>
+<path d="M7.03006 8.48663V8.35968C7.03006 8.03787 7.28779 7.77098 7.61997 7.7488L10.0396 7.58726L13.3857 12.5146V8.19003L12.5244 8.07522V8.01492C12.5244 7.68933 12.788 7.42068 13.1244 7.40344L15.326 7.29066V7.60749C15.326 7.75622 15.2154 7.88343 15.0638 7.90907L14.534 7.99868V15.0022L13.8691 15.2309C13.3136 15.4219 12.6952 15.2174 12.3772 14.7376L9.12879 9.83568V14.5143L10.1287 14.7056L10.1147 14.7984C10.0711 15.0889 9.82028 15.3086 9.51687 15.3221L7.03006 15.4328C6.99718 15.1204 7.23132 14.8409 7.55431 14.807L7.88143 14.7726V8.53447L7.03006 8.48663Z" fill="black"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M12.9218 1.85418L2.95217 2.53485C2.35499 2.57562 1.89209 3.05572 1.89209 3.63431V13.3311C1.89209 13.8748 2.07923 14.4029 2.42402 14.8325L4.57362 17.5104C4.92117 17.9433 5.46812 18.1817 6.03397 18.1469L17.0991 17.4658C17.6663 17.4309 18.1078 16.9761 18.1078 16.4269V5.4026C18.1078 5.06281 17.9362 4.74441 17.6481 4.54963L14.1453 2.18137C13.7883 1.94002 13.3564 1.82451 12.9218 1.85418ZM3.44654 3.78556C3.30788 3.6829 3.37387 3.46903 3.54806 3.45654L12.9889 2.77938C13.2897 2.75781 13.5886 2.84064 13.8318 3.01299L15.7261 4.35502C15.798 4.40597 15.7642 4.51596 15.6752 4.5208L5.67742 5.06454C5.37485 5.081 5.0762 4.99211 4.83563 4.814L3.44654 3.78556ZM5.20848 6.76913C5.20848 6.44433 5.47088 6.17604 5.80642 6.15777L16.3769 5.5821C16.7039 5.56429 16.9792 5.81577 16.9792 6.13232V15.6782C16.9792 16.0024 16.7177 16.2705 16.3829 16.2895L5.8793 16.8871C5.51537 16.9079 5.20848 16.6282 5.20848 16.2759V6.76913Z" fill="black"/>
+</g>
+<defs>
+<clipPath id="clip0_5364_42310">
+<rect width="20" height="20" fill="white"/>
+</clipPath>
+</defs>
+</svg>

+ 3 - 0
web/app/components/header/assets/sync.svg

@@ -0,0 +1,3 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M5.69773 13.1783C7.29715 13.8879 9.20212 13.8494 10.8334 12.9075C13.5438 11.3427 14.4724 7.87704 12.9076 5.16672L12.7409 4.87804M3.09233 10.8335C1.52752 8.12314 2.45615 4.65746 5.16647 3.09265C6.7978 2.15081 8.70277 2.11227 10.3022 2.82185M1.66226 10.8892L3.48363 11.3773L3.97166 9.5559M12.0284 6.44393L12.5164 4.62256L14.3378 5.1106" stroke="#667085" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+</svg>

+ 3 - 0
web/app/components/header/assets/trash.svg

@@ -0,0 +1,3 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M6 2H10M2 4H14M12.6667 4L12.1991 11.0129C12.129 12.065 12.0939 12.5911 11.8667 12.99C11.6666 13.3412 11.3648 13.6235 11.0011 13.7998C10.588 14 10.0607 14 9.00623 14H6.99377C5.93927 14 5.41202 14 4.99889 13.7998C4.63517 13.6235 4.33339 13.3412 4.13332 12.99C3.90607 12.5911 3.871 12.065 3.80086 11.0129L3.33333 4M6.66667 7V10.3333M9.33333 7V10.3333" stroke="#667085" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+</svg>

+ 4 - 4
web/app/components/header/nav/nav-selector/index.tsx

@@ -24,7 +24,7 @@ export type INavSelectorProps = {
 
 const itemClassName = `
   flex items-center w-full h-10 px-3 text-gray-700 text-[14px]
-  rounded-lg font-normal hover:bg-gray-100 cursor-pointer
+  rounded-lg font-normal hover:bg-gray-100 cursor-pointer truncate
 `
 
 const NavSelector = ({ curNav, navs, createText, onCreate, onLoadmore }: INavSelectorProps) => {
@@ -50,9 +50,9 @@ const NavSelector = ({ curNav, navs, createText, onCreate, onLoadmore }: INavSel
               text-[#1C64F2] hover:bg-[#EBF5FF]
             "
           >
-            {curNav?.name}
+            <div className='max-w-[180px] truncate' title={curNav?.name}>{curNav?.name}</div>
             <ChevronDownIcon
-              className="w-3 h-3 ml-1"
+              className="shrink-0 w-3 h-3 ml-1"
               aria-hidden="true"
             />
           </Menu.Button>
@@ -68,7 +68,7 @@ const NavSelector = ({ curNav, navs, createText, onCreate, onLoadmore }: INavSel
             {
               navs.map(nav => (
                 <Menu.Item key={nav.id}>
-                  <div className={itemClassName} onClick={() => router.push(nav.link)}>
+                  <div className={itemClassName} onClick={() => router.push(nav.link)} title={nav.name}>
                     <div className='relative w-6 h-6 mr-2 bg-[#D5F5F6] rounded-[6px]'>
                       <AppIcon size='tiny' icon={nav.icon} background={nav.icon_background}/>
                       <div className='flex justify-center items-center absolute -right-0.5 -bottom-0.5 w-2.5 h-2.5 bg-white rounded'>

+ 3 - 1
web/context/dataset-detail.ts

@@ -1,6 +1,8 @@
-import { createContext } from 'use-context-selector'
+import { createContext, useContext } from 'use-context-selector'
 import type { DataSet } from '@/models/datasets'
 
 const DatasetDetailContext = createContext<{ indexingTechnique?: string; dataset?: DataSet }>({})
 
+export const useDatasetDetailContext = () => useContext(DatasetDetailContext)
+
 export default DatasetDetailContext

+ 24 - 0
web/i18n/lang/common.en.ts

@@ -87,6 +87,7 @@ const translation = {
     integrations: 'Integrations',
     language: 'Language',
     provider: 'Model Provider',
+    dataSource: 'Data Source',
   },
   account: {
     avatar: 'Avatar',
@@ -172,6 +173,29 @@ const translation = {
       back: ' technology.',
     },
   },
+  dataSource: {
+    add: 'Add a data source',
+    connect: 'Connect',
+    notion: {
+      title: 'Notion',
+      description: 'Using Notion as a data source for the dataset.',
+      connectedWorkspace: 'Connected workspace',
+      addWorkspace: 'Add workspace',
+      connected: 'Connected',
+      disconnected: 'Disconnected',
+      changeAuthorizedPages: 'Change authorized pages',
+      pagesAuthorized: 'Pages authorized',
+      sync: 'Sync',
+      remove: 'Remove',
+      selector: {
+        pageSelected: 'Pages Selected',
+        searchPages: 'Search pages...',
+        noSearchResult: 'No search resluts',
+        addPages: 'Add pages',
+        preview: 'PREVIEW',
+      },
+    },
+  },
   about: {
     changeLog: 'Changlog',
     updateNow: 'Update now',

+ 24 - 0
web/i18n/lang/common.zh.ts

@@ -87,6 +87,7 @@ const translation = {
     integrations: '集成',
     language: '语言',
     provider: '模型供应商',
+    dataSource: '数据来源',
   },
   account: {
     avatar: '头像',
@@ -173,6 +174,29 @@ const translation = {
       back: ' 技术进行加密和存储。',
     },
   },
+  dataSource: {
+    add: '添加数据源',
+    connect: '绑定',
+    notion: {
+      title: 'Notion',
+      description: '使用 Notion 作为数据集的数据源。',
+      connectedWorkspace: '已绑定工作空间',
+      addWorkspace: '添加工作空间',
+      connected: '已绑定',
+      disconnected: '未绑定',
+      changeAuthorizedPages: '更改授权页面',
+      pagesAuthorized: '已授权页面',
+      sync: '同步',
+      remove: '删除',
+      selector: {
+        pageSelected: '已选页面',
+        searchPages: '搜索页面...',
+        noSearchResult: '无搜索结果',
+        addPages: '添加页面',
+        preview: '预览',
+      },
+    },
+  },
   about: {
     changeLog: '更新日志',
     updateNow: '现在更新',

+ 9 - 1
web/i18n/lang/dataset-creation.en.ts

@@ -13,6 +13,7 @@ const translation = {
   },
   stepOne: {
     filePreview: 'File Preview',
+    pagePreview: 'Page Preview',
     dataSourceType: {
       file: 'Import from text file',
       notion: 'Sync from Notion',
@@ -32,6 +33,9 @@ const translation = {
       change: 'Change',
       failed: 'Upload failed',
     },
+    notionSyncTitle: 'Notion is not connected',
+    notionSyncTip: 'To sync with Notion, connection to Notion must be established first.',
+    connect: 'Go to connect',
     button: 'next',
     emptyDatasetCreation: 'I want to create an empty dataset',
     modal: {
@@ -73,7 +77,11 @@ const translation = {
     emstimateSegment: 'Estimated segments',
     segmentCount: 'segments',
     calculating: 'Calculating...',
-    fileName: 'Preprocess document',
+    fileSource: 'Preprocess documents',
+    notionSource: 'Preprocess pages',
+    other: 'and other ',
+    fileUnit: ' files',
+    notionUnit: ' pages',
     lastStep: 'Last step',
     nextStep: 'Save & Process',
     save: 'Save & Process',

+ 9 - 1
web/i18n/lang/dataset-creation.zh.ts

@@ -13,6 +13,7 @@ const translation = {
   },
   stepOne: {
     filePreview: '文件预览',
+    pagePreview: '页面预览',
     dataSourceType: {
       file: '导入已有文本',
       notion: '同步自 Notion 内容',
@@ -32,6 +33,9 @@ const translation = {
       change: '更改文件',
       failed: '上传失败',
     },
+    notionSyncTitle: 'Notion 未绑定',
+    notionSyncTip: '同步 Notion 内容前,须先绑定 Notion 空间',
+    connect: '去绑定',
     button: '下一步',
     emptyDatasetCreation: '创建一个空数据集',
     modal: {
@@ -73,7 +77,11 @@ const translation = {
     emstimateSegment: '预估分段数',
     segmentCount: '段',
     calculating: '计算中...',
-    fileName: '预处理文档',
+    fileSource: '预处理文档',
+    notionSource: '预处理页面',
+    other: '和其他 ',
+    fileUnit: ' 个文件',
+    notionUnit: ' 个页面',
     lastStep: '上一步',
     nextStep: '保存并处理',
     save: '保存并处理',

+ 2 - 0
web/i18n/lang/dataset-documents.en.ts

@@ -3,6 +3,7 @@ const translation = {
     title: 'Documents',
     desc: 'All files of the dataset are shown here, and the entire dataset can be linked to Dify citations or indexed via the Chat plugin.',
     addFile: 'add file',
+    addPages: 'Add Pages',
     table: {
       header: {
         fileName: 'FILE NAME',
@@ -19,6 +20,7 @@ const translation = {
       archive: 'Archive',
       delete: 'Delete',
       enableWarning: 'Archived file cannot be enabled',
+      sync: 'Sync',
     },
     index: {
       enable: 'Enable',

+ 2 - 0
web/i18n/lang/dataset-documents.zh.ts

@@ -3,6 +3,7 @@ const translation = {
     title: '文档',
     desc: '数据集的所有文件都在这里显示,整个数据集都可以链接到 Dify 引用或通过 Chat 插件进行索引。',
     addFile: '添加文件',
+    addPages: '添加页面',
     table: {
       header: {
         fileName: '文件名',
@@ -19,6 +20,7 @@ const translation = {
       archive: '归档',
       delete: '删除',
       enableWarning: '归档的文件无法启用',
+      sync: '同步',
     },
     index: {
       enable: '启用中',

+ 32 - 0
web/models/common.ts

@@ -100,6 +100,38 @@ export type IWorkspace = {
   current: boolean
 }
 
+/** A single Notion page exposed by a connected workspace. */
+export type DataSourceNotionPage = {
+  // Icon may be absent; when present, exactly one of url/emoji is
+  // typically set depending on `type` — TODO confirm against the API.
+  page_icon: null | {
+    type: string | null
+    url: string | null
+    emoji: string | null
+  }
+  page_id: string
+  page_name: string
+  parent_id: string
+  type: string
+  // Whether the page is already imported into a dataset.
+  is_bound: boolean
+}
+
+/** Pages keyed by page_id, each annotated with its owning workspace. */
+export type DataSourceNotionPageMap = Record<string, DataSourceNotionPage & { workspace_id: string }>
+
+/** A connected Notion workspace and the pages it authorizes. */
+export type DataSourceNotionWorkspace = {
+  workspace_name: string
+  workspace_id: string
+  workspace_icon: string | null
+  // Count of authorized pages; optional in some responses.
+  total?: number
+  pages: DataSourceNotionPage[]
+}
+
+/** Workspaces keyed by workspace_id. */
+export type DataSourceNotionWorkspaceMap = Record<string, DataSourceNotionWorkspace>
+
+/** One data-source integration record as returned by the API. */
+export type DataSourceNotion = {
+  id: string
+  provider: string
+  is_bound: boolean
+  source_info: DataSourceNotionWorkspace
+}
+
 export type GithubRepo = {
   stargazers_count: number
 }

+ 62 - 35
web/models/datasets.ts

@@ -1,4 +1,11 @@
-import { AppMode } from './app'
+import type { AppMode } from './app'
+import type { DataSourceNotionPage } from './common'
+
+export enum DataSourceType {
+  FILE = 'upload_file',
+  NOTION = 'notion_import',
+  WEB = 'web_import',
+}
 
 export type DataSet = {
   id: string
@@ -7,7 +14,7 @@ export type DataSet = {
   icon_background: string
   description: string
   permission: 'only_me' | 'all_team_members'
-  data_source_type: 'upload_file'
+  data_source_type: DataSourceType
   indexing_technique: 'high_quality' | 'economy'
   created_by: string
   updated_by: string
@@ -43,9 +50,9 @@ export type IndexingEstimateResponse = {
   preview: string[]
 }
 
-export interface FileIndexingEstimateResponse extends IndexingEstimateResponse {
+export type FileIndexingEstimateResponse = {
   total_nodes: number
-}
+} & IndexingEstimateResponse
 
 export type IndexingStatusResponse = {
   id: string
@@ -61,6 +68,9 @@ export type IndexingStatusResponse = {
   completed_segments: number
   total_segments: number
 }
+export type IndexingStatusBatchResponse = {
+  data: IndexingStatusResponse[]
+}
 
 export type ProcessMode = 'automatic' | 'custom'
 
@@ -98,17 +108,17 @@ export const DocumentIndexingStatusList = [
 export type DocumentIndexingStatus = typeof DocumentIndexingStatusList[number]
 
 export const DisplayStatusList = [
-  "queuing",
-  "indexing",
-  "paused",
-  "error",
-  "available",
-  "enabled",
-  "disabled",
-  "archived",
-] as const;
-
-export type DocumentDisplayStatus = typeof DisplayStatusList[number];
+  'queuing',
+  'indexing',
+  'paused',
+  'error',
+  'available',
+  'enabled',
+  'disabled',
+  'archived',
+] as const
+
+export type DocumentDisplayStatus = typeof DisplayStatusList[number]
 
 export type DataSourceInfo = {
   upload_file: {
@@ -124,9 +134,10 @@ export type DataSourceInfo = {
 
 export type InitialDocumentDetail = {
   id: string
+  batch: string
   position: number
   dataset_id: string
-  data_source_type: 'upload_file'
+  data_source_type: DataSourceType
   data_source_info: DataSourceInfo
   dataset_process_rule_id: string
   name: string
@@ -135,6 +146,8 @@ export type InitialDocumentDetail = {
   created_at: number
   indexing_status: DocumentIndexingStatus
   display_status: DocumentDisplayStatus
+  completed_segments?: number
+  total_segments?: number
 }
 
 export type SimpleDocumentDetail = InitialDocumentDetail & {
@@ -157,16 +170,29 @@ export type DocumentListResponse = {
 
 export type CreateDocumentReq = {
   original_document_id?: string
-  indexing_technique?: string;
-  name: string
+  indexing_technique?: string
   data_source: DataSource
   process_rule: ProcessRule
 }
 
 export type DataSource = {
+  type: DataSourceType
+  info_list: {
+    data_source_type: DataSourceType
+    notion_info_list?: NotionInfo[]
+    file_info_list?: {
+      file_ids: string[]
+    }
+  }
+}
+
+export type NotionInfo = {
+  workspace_id: string
+  pages: DataSourceNotionPage[]
+}
+export type NotionPage = {
+  page_id: string
   type: string
-  info: string // upload_file_id
-  name: string
 }
 
 export type ProcessRule = {
@@ -176,7 +202,8 @@ export type ProcessRule = {
 
 export type createDocumentResponse = {
   dataset?: DataSet
-  document: InitialDocumentDetail
+  batch: string
+  documents: InitialDocumentDetail[]
 }
 
 export type FullDocumentDetail = SimpleDocumentDetail & {
@@ -216,20 +243,20 @@ export type DocMetadata = {
 }
 
 export const CUSTOMIZABLE_DOC_TYPES = [
-  "book",
-  "web_page",
-  "paper",
-  "social_media_post",
-  "personal_document",
-  "business_document",
-  "im_chat_log",
-] as const;
-
-export const FIXED_DOC_TYPES = ["synced_from_github", "synced_from_notion", "wikipedia_entry"] as const;
-
-export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number];
-export type FixedDocType = typeof FIXED_DOC_TYPES[number];
-export type DocType = CustomizableDocType | FixedDocType;
+  'book',
+  'web_page',
+  'paper',
+  'social_media_post',
+  'personal_document',
+  'business_document',
+  'im_chat_log',
+] as const
+
+export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const
+
+export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number]
+export type FixedDocType = typeof FIXED_DOC_TYPES[number]
+export type DocType = CustomizableDocType | FixedDocType
 
 export type DocumentDetailResponse = FullDocumentDetail
 

+ 17 - 4
web/service/common.ts

@@ -1,9 +1,10 @@
 import type { Fetcher } from 'swr'
-import { del, get, post, put } from './base'
+import { del, get, patch, post, put } from './base'
 import type {
-  AccountIntegrate, CommonResponse, IWorkspace,
-  LangGeniusVersionResponse, Member, OauthResponse,
-  Provider, ProviderAzureToken, TenantInfoResponse, UserProfileOriginResponse,
+  AccountIntegrate, CommonResponse, DataSourceNotion,
+  IWorkspace, LangGeniusVersionResponse, Member,
+  OauthResponse, Provider, ProviderAzureToken, TenantInfoResponse,
+  UserProfileOriginResponse,
 } from '@/models/common'
 import type {
   UpdateOpenAIKeyResponse,
@@ -88,3 +89,15 @@ export const fetchWorkspaces: Fetcher<{ workspaces: IWorkspace[] }, { url: strin
 export const switchWorkspace: Fetcher<CommonResponse & { new_tenant: IWorkspace }, { url: string; body: Record<string, any> }> = ({ url, body }) => {
   return post(url, { body }) as Promise<CommonResponse & { new_tenant: IWorkspace }>
 }
+
+// List all data-source integrations (used as the SWR fetcher for
+// the { url: 'data-source/integrates' } key).
+export const fetchDataSource: Fetcher<{ data: DataSourceNotion[] }, { url: string }> = ({ url }) => {
+  return get(url) as Promise<{ data: DataSourceNotion[] }>
+}
+
+// Trigger a Notion workspace page re-sync; endpoint is a GET despite
+// its side effect — NOTE(review): matches the backend route as shipped.
+export const syncDataSourceNotion: Fetcher<CommonResponse, { url: string }> = ({ url }) => {
+  return get(url) as Promise<CommonResponse>
+}
+
+// PATCH an integration action (e.g. .../disable to unbind a workspace).
+export const updateDataSourceNotionAction: Fetcher<CommonResponse, { url: string }> = ({ url }) => {
+  return patch(url) as Promise<CommonResponse>
+}

+ 39 - 11
web/service/datasets.ts

@@ -1,8 +1,8 @@
 import type { Fetcher } from 'swr'
-import { del, get, post, put, patch } from './base'
 import qs from 'qs'
-import type { RelatedAppResponse, DataSet, HitTestingResponse, HitTestingRecordsResponse, DataSetListResponse, CreateDocumentReq, InitialDocumentDetail, DocumentDetailResponse, DocumentListResponse, IndexingEstimateResponse, FileIndexingEstimateResponse, IndexingStatusResponse, ProcessRuleResponse, SegmentsQuery, SegmentsResponse, createDocumentResponse } from '@/models/datasets'
-import type { CommonResponse } from '@/models/common'
+import { del, get, patch, post, put } from './base'
+import type { CreateDocumentReq, DataSet, DataSetListResponse, DocumentDetailResponse, DocumentListResponse, FileIndexingEstimateResponse, HitTestingRecordsResponse, HitTestingResponse, IndexingEstimateResponse, IndexingStatusBatchResponse, IndexingStatusResponse, ProcessRuleResponse, RelatedAppResponse, SegmentsQuery, SegmentsResponse, createDocumentResponse } from '@/models/datasets'
+import type { CommonResponse, DataSourceNotionWorkspace } from '@/models/common'
 
 // apis for documents in a dataset
 
@@ -11,6 +11,11 @@ type CommonDocReq = {
   documentId: string
 }
 
+type BatchReq = {
+  datasetId: string
+  batchId: string
+}
+
 export type SortType = 'created_at' | 'hit_count' | '-created_at' | '-hit_count'
 
 export type MetadataType = 'all' | 'only' | 'without'
@@ -19,17 +24,17 @@ export const fetchDataDetail: Fetcher<DataSet, string> = (datasetId: string) =>
   return get(`/datasets/${datasetId}`) as Promise<DataSet>
 }
 
-export const updateDatasetSetting: Fetcher<DataSet, { datasetId: string, body: Partial<Pick<DataSet, 'name' | 'description' | 'permission' | 'indexing_technique'>>}> = ({ datasetId, body }) => {
-  return patch(`/datasets/${datasetId}`, { body } ) as Promise<DataSet>
+export const updateDatasetSetting: Fetcher<DataSet, { datasetId: string; body: Partial<Pick<DataSet, 'name' | 'description' | 'permission' | 'indexing_technique'>> }> = ({ datasetId, body }) => {
+  return patch(`/datasets/${datasetId}`, { body }) as Promise<DataSet>
 }
 
 export const fetchDatasetRelatedApps: Fetcher<RelatedAppResponse, string> = (datasetId: string) => {
   return get(`/datasets/${datasetId}/related-apps`) as Promise<RelatedAppResponse>
 }
 
-export const fetchDatasets: Fetcher<DataSetListResponse, { url: string, params: { page: number, ids?: string[], limit?: number } }> = ({ url, params }) => {
+export const fetchDatasets: Fetcher<DataSetListResponse, { url: string; params: { page: number; ids?: string[]; limit?: number } }> = ({ url, params }) => {
   const urlParams = qs.stringify(params, { indices: false })
-  return get(`${url}?${urlParams}`,) as Promise<DataSetListResponse>
+  return get(`${url}?${urlParams}`) as Promise<DataSetListResponse>
 }
 
 export const createEmptyDataset: Fetcher<DataSet, { name: string }> = ({ name }) => {
@@ -52,21 +57,28 @@ export const fetchDocuments: Fetcher<DocumentListResponse, { datasetId: string;
 }
 
 export const createFirstDocument: Fetcher<createDocumentResponse, { body: CreateDocumentReq }> = ({ body }) => {
-  return post(`/datasets/init`, { body }) as Promise<createDocumentResponse>
+  return post('/datasets/init', { body }) as Promise<createDocumentResponse>
 }
 
-export const createDocument: Fetcher<InitialDocumentDetail, { datasetId: string; body: CreateDocumentReq }> = ({ datasetId, body }) => {
-  return post(`/datasets/${datasetId}/documents`, { body }) as Promise<InitialDocumentDetail>
+export const createDocument: Fetcher<createDocumentResponse, { datasetId: string; body: CreateDocumentReq }> = ({ datasetId, body }) => {
+  return post(`/datasets/${datasetId}/documents`, { body }) as Promise<createDocumentResponse>
 }
 
 export const fetchIndexingEstimate: Fetcher<IndexingEstimateResponse, CommonDocReq> = ({ datasetId, documentId }) => {
   return get(`/datasets/${datasetId}/documents/${documentId}/indexing-estimate`, {}) as Promise<IndexingEstimateResponse>
 }
+export const fetchIndexingEstimateBatch: Fetcher<IndexingEstimateResponse, BatchReq> = ({ datasetId, batchId }) => {
+  return get(`/datasets/${datasetId}/batch/${batchId}/indexing-estimate`, {}) as Promise<IndexingEstimateResponse>
+}
 
 export const fetchIndexingStatus: Fetcher<IndexingStatusResponse, CommonDocReq> = ({ datasetId, documentId }) => {
   return get(`/datasets/${datasetId}/documents/${documentId}/indexing-status`, {}) as Promise<IndexingStatusResponse>
 }
 
+export const fetchIndexingStatusBatch: Fetcher<IndexingStatusBatchResponse, BatchReq> = ({ datasetId, batchId }) => {
+  return get(`/datasets/${datasetId}/batch/${batchId}/indexing-status`, {}) as Promise<IndexingStatusBatchResponse>
+}
+
 export const fetchDocumentDetail: Fetcher<DocumentDetailResponse, CommonDocReq & { params: { metadata?: MetadataType } }> = ({ datasetId, documentId, params }) => {
   return get(`/datasets/${datasetId}/documents/${documentId}`, { params }) as Promise<DocumentDetailResponse>
 }
@@ -95,10 +107,22 @@ export const disableDocument: Fetcher<CommonResponse, CommonDocReq> = ({ dataset
   return patch(`/datasets/${datasetId}/documents/${documentId}/status/disable`) as Promise<CommonResponse>
 }
 
+export const syncDocument: Fetcher<CommonResponse, CommonDocReq> = ({ datasetId, documentId }) => {
+  return get(`/datasets/${datasetId}/documents/${documentId}/notion/sync`) as Promise<CommonResponse>
+}
+
+export const preImportNotionPages: Fetcher<{ notion_info: DataSourceNotionWorkspace[] }, { url: string; datasetId?: string }> = ({ url, datasetId }) => {
+  return get(url, { params: { dataset_id: datasetId } }) as Promise<{ notion_info: DataSourceNotionWorkspace[] }>
+}
+
 export const modifyDocMetadata: Fetcher<CommonResponse, CommonDocReq & { body: { doc_type: string; doc_metadata: Record<string, any> } }> = ({ datasetId, documentId, body }) => {
   return put(`/datasets/${datasetId}/documents/${documentId}/metadata`, { body }) as Promise<CommonResponse>
 }
 
+export const getDatasetIndexingStatus: Fetcher<{ data: IndexingStatusResponse[] }, string> = (datasetId) => {
+  return get(`/datasets/${datasetId}/indexing-status`) as Promise<{ data: IndexingStatusResponse[] }>
+}
+
 // apis for segments in a document
 
 export const fetchSegments: Fetcher<SegmentsResponse, CommonDocReq & { params: SegmentsQuery }> = ({ datasetId, documentId, params }) => {
@@ -123,5 +147,9 @@ export const fetchTestingRecords: Fetcher<HitTestingRecordsResponse, { datasetId
 }
 
 export const fetchFileIndexingEstimate: Fetcher<FileIndexingEstimateResponse, any> = (body: any) => {
-  return post(`/datasets/file-indexing-estimate`, { body }) as Promise<FileIndexingEstimateResponse>
+  return post('/datasets/indexing-estimate', { body }) as Promise<FileIndexingEstimateResponse>
+}
+
+export const fetchNotionPagePreview: Fetcher<{ content: string }, { workspaceID: string; pageID: string; pageType: string }> = ({ workspaceID, pageID, pageType }) => {
+  return get(`/notion/workspaces/${workspaceID}/pages/${pageID}/${pageType}/preview`) as Promise<{ content: string }>
 }

Some files were not shown because too many files changed in this diff