hai 1 mes · 2c9af712a2
--- a/api/commands.py
+++ b/api/commands.py
@@ -20,7 +20,7 @@ from libs.helper import email as email_validate
 
															 from libs.password import hash_password, password_pattern, valid_password
														
 
															 from libs.rsa import generate_key_pair
														
 
															 from models import Tenant
														
 
															-from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment
														
 
															+from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment
														
 
															 from models.dataset import Document as DatasetDocument
														
 
															 from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
														
 
															 from models.provider import Provider, ProviderModel
														
@@ -483,14 +483,11 @@ def convert_to_agent_apps():
 
															     click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))
														
 
															-@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.")
														
 
															+@click.command("add-qdrant-index", help="Add Qdrant index.")
														
 
															 @click.option("--field", default="metadata.doc_id", prompt=False, help="Index field , default is metadata.doc_id.")
														
 
															-def add_qdrant_doc_id_index(field: str):
														
 
															-    click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green"))
														
 
															-    vector_type = dify_config.VECTOR_STORE
														
 
															-    if vector_type != "qdrant":
														
 
															-        click.echo(click.style("This command only supports Qdrant vector store.", fg="red"))
														
 
															-        return
														
 
															+def add_qdrant_index(field: str):
														
 
															+    click.echo(click.style("Starting Qdrant index creation.", fg="green"))
														
 
															+
														
 
															     create_count = 0
														
 
															     try:
														
@@ -539,6 +536,72 @@ def add_qdrant_doc_id_index(field: str):
 
															     click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
														
 
															+@click.command("old-metadata-migration", help="Old metadata migration.")
															+def old_metadata_migration():
															+    """
															+    Migrate legacy per-document ``doc_metadata`` into the metadata tables.
															+
															+    Pages through the documents whose ``doc_metadata`` column is not NULL
															+    (50 per page, newest first). For every key found in a document's metadata
															+    it makes sure that:
															+
															+    * a ``DatasetMetadata`` row of type "string" exists for the
															+      (dataset_id, key) pair, and
															+    * a ``DatasetMetadataBinding`` row links that metadata to the document.
															+
															+    Existing rows are looked up before anything is inserted, so running the
															+    command again does not add duplicates. The session is committed once per
															+    document that carries metadata.
															+    """
															+    click.echo(click.style("Starting old metadata migration.", fg="green"))
															+
															+    page = 1
															+    while True:
															+        try:
															+            documents = (
															+                # is_not(None) is rendered as "IS NOT NULL" in SQL. The Python
															+                # expression "column is not None" is evaluated eagerly to True,
															+                # so it would not restrict the query at all.
															+                DatasetDocument.query.filter(DatasetDocument.doc_metadata.is_not(None))
															+                .order_by(DatasetDocument.created_at.desc())
															+                .paginate(page=page, per_page=50)
															+            )
															+        except NotFound:
															+            # NOTE(review): relies on paginate() signalling a page past the last
															+            # one with NotFound (its default error_out behaviour) - confirm.
															+            break
															+        # Test the fetched rows, not the pagination wrapper, for emptiness.
															+        if not documents.items:
															+            break
															+        for document in documents.items:
															+            doc_metadata = document.doc_metadata
															+            if not doc_metadata:
															+                # NULL or empty metadata: nothing to migrate for this document.
															+                continue
															+            # Only the keys are migrated; the stored values are not read here.
															+            for key in doc_metadata:
															+                dataset_metadata = (
															+                    db.session.query(DatasetMetadata)
															+                    .filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key)
															+                    .first()
															+                )
															+                if dataset_metadata is None:
															+                    dataset_metadata = DatasetMetadata(
															+                        tenant_id=document.tenant_id,
															+                        dataset_id=document.dataset_id,
															+                        name=key,
															+                        type="string",
															+                        created_by=document.created_by,
															+                    )
															+                    db.session.add(dataset_metadata)
															+                    # Flush before dataset_metadata.id is used for the binding below.
															+                    db.session.flush()
															+                    # A metadata row created just now cannot have a binding yet.
															+                    binding_exists = False
															+                else:
															+                    binding_exists = (
															+                        DatasetMetadataBinding.query.filter(
															+                            DatasetMetadataBinding.dataset_id == document.dataset_id,
															+                            DatasetMetadataBinding.document_id == document.id,
															+                            DatasetMetadataBinding.metadata_id == dataset_metadata.id,
															+                        ).first()
															+                        is not None
															+                    )
															+                if not binding_exists:
															+                    db.session.add(
															+                        DatasetMetadataBinding(
															+                            tenant_id=document.tenant_id,
															+                            dataset_id=document.dataset_id,
															+                            metadata_id=dataset_metadata.id,
															+                            document_id=document.id,
															+                            created_by=document.created_by,
															+                        )
															+                    )
															+            db.session.commit()
															+        page += 1
															+    click.echo(click.style("Old metadata migration completed.", fg="green"))
														
 
															+
														
 
															+
														
 
															 @click.command("create-tenant", help="Create account and tenant.")
														
 
															 @click.option("--email", prompt=True, help="Tenant account email.")
														
 
															 @click.option("--name", prompt=True, help="Workspace name.")
														
--- a/api/controllers/service_api/dataset/document.py
+++ b/api/controllers/service_api/dataset/document.py
@@ -18,7 +18,6 @@ from controllers.service_api.app.error import (
 
															 from controllers.service_api.dataset.error import (
														
 
															     ArchivedDocumentImmutableError,
														
 
															     DocumentIndexingError,
														
 
															-    InvalidMetadataError,
														
 
															 )
														
 
															 from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
														
 
															 from core.errors.error import ProviderTokenNotInitError
														
@@ -51,8 +50,6 @@ class DocumentAddByTextApi(DatasetApiResource):
 
															             "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
														
 
															         )
														
 
															         parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
														
 
															-        parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
														
 
															-        parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
														
 
															         args = parser.parse_args()
														
 
															         dataset_id = str(dataset_id)
														
@@ -65,28 +62,6 @@ class DocumentAddByTextApi(DatasetApiResource):
 
															         if not dataset.indexing_technique and not args["indexing_technique"]:
														
 
															             raise ValueError("indexing_technique is required.")
														
 
															-        # Validate metadata if provided
														
 
															-        if args.get("doc_type") or args.get("doc_metadata"):
														
 
															-            if not args.get("doc_type") or not args.get("doc_metadata"):
														
 
															-                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
														
 
															-
														
 
															-            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
														
 
															-                raise InvalidMetadataError(
														
 
															-                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
														
 
															-                )
														
 
															-
														
 
															-            if not isinstance(args["doc_metadata"], dict):
														
 
															-                raise InvalidMetadataError("doc_metadata must be a dictionary")
														
 
															-
														
 
															-            # Validate metadata schema based on doc_type
														
 
															-            if args["doc_type"] != "others":
														
 
															-                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
														
 
															-                for key, value in args["doc_metadata"].items():
														
 
															-                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
														
 
															-                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
														
 
															-            # set to MetaDataConfig
														
 
															-            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
														
 
															-
														
 
															         text = args.get("text")
														
 
															         name = args.get("name")
														
 
															         if text is None or name is None:
														
@@ -133,8 +108,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
 
															             "doc_language", type=str, default="English", required=False, nullable=False, location="json"
														
 
															         )
														
 
															         parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
														
 
															-        parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
														
 
															-        parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
														
 
															         args = parser.parse_args()
														
 
															         dataset_id = str(dataset_id)
														
 
															         tenant_id = str(tenant_id)
														
@@ -146,29 +119,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
 
															         # indexing_technique is already set in dataset since this is an update
														
 
															         args["indexing_technique"] = dataset.indexing_technique
														
 
															-        # Validate metadata if provided
														
 
															-        if args.get("doc_type") or args.get("doc_metadata"):
														
 
															-            if not args.get("doc_type") or not args.get("doc_metadata"):
														
 
															-                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
														
 
															-
														
 
															-            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
														
 
															-                raise InvalidMetadataError(
														
 
															-                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
														
 
															-                )
														
 
															-
														
 
															-            if not isinstance(args["doc_metadata"], dict):
														
 
															-                raise InvalidMetadataError("doc_metadata must be a dictionary")
														
 
															-
														
 
															-            # Validate metadata schema based on doc_type
														
 
															-            if args["doc_type"] != "others":
														
 
															-                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
														
 
															-                for key, value in args["doc_metadata"].items():
														
 
															-                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
														
 
															-                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
														
 
															-
														
 
															-            # set to MetaDataConfig
														
 
															-            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
														
 
															-
														
 
															         if args["text"]:
														
 
															             text = args.get("text")
														
 
															             name = args.get("name")
														
@@ -216,29 +166,6 @@ class DocumentAddByFileApi(DatasetApiResource):
 
															         if "doc_language" not in args:
														
 
															             args["doc_language"] = "English"
														
 
															-        # Validate metadata if provided
														
 
															-        if args.get("doc_type") or args.get("doc_metadata"):
														
 
															-            if not args.get("doc_type") or not args.get("doc_metadata"):
														
 
															-                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
														
 
															-
														
 
															-            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
														
 
															-                raise InvalidMetadataError(
														
 
															-                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
														
 
															-                )
														
 
															-
														
 
															-            if not isinstance(args["doc_metadata"], dict):
														
 
															-                raise InvalidMetadataError("doc_metadata must be a dictionary")
														
 
															-
														
 
															-            # Validate metadata schema based on doc_type
														
 
															-            if args["doc_type"] != "others":
														
 
															-                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
														
 
															-                for key, value in args["doc_metadata"].items():
														
 
															-                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
														
 
															-                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
														
 
															-
														
 
															-            # set to MetaDataConfig
														
 
															-            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
														
 
															-
														
 
															         # get dataset info
														
 
															         dataset_id = str(dataset_id)
														
 
															         tenant_id = str(tenant_id)
														
@@ -306,29 +233,6 @@ class DocumentUpdateByFileApi(DatasetApiResource):
 
															         if "doc_language" not in args:
														
 
															             args["doc_language"] = "English"
														
 
															-        # Validate metadata if provided
														
 
															-        if args.get("doc_type") or args.get("doc_metadata"):
														
 
															-            if not args.get("doc_type") or not args.get("doc_metadata"):
														
 
															-                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
														
 
															-
														
 
															-            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
														
 
															-                raise InvalidMetadataError(
														
 
															-                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
														
 
															-                )
														
 
															-
														
 
															-            if not isinstance(args["doc_metadata"], dict):
														
 
															-                raise InvalidMetadataError("doc_metadata must be a dictionary")
														
 
															-
														
 
															-            # Validate metadata schema based on doc_type
														
 
															-            if args["doc_type"] != "others":
														
 
															-                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
														
 
															-                for key, value in args["doc_metadata"].items():
														
 
															-                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
														
 
															-                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
														
 
															-
														
 
															-            # set to MetaDataConfig
														
 
															-            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
														
 
															-
														
 
															         # get dataset info
														
 
															         dataset_id = str(dataset_id)
														
 
															         tenant_id = str(tenant_id)
														
--- a/api/extensions/ext_commands.py
+++ b/api/extensions/ext_commands.py
@@ -3,7 +3,7 @@ from dify_app import DifyApp
 
															 def init_app(app: DifyApp):
														
 
															     from commands import (
														
 
															-        add_qdrant_doc_id_index,
														
 
															+        add_qdrant_index,
														
 
															         convert_to_agent_apps,
														
 
															         create_tenant,
														
 
															         extract_plugins,
														
@@ -11,6 +11,7 @@ def init_app(app: DifyApp):
 
															         fix_app_site_missing,
														
 
															         install_plugins,
														
 
															         migrate_data_for_plugin,
														
 
															+        old_metadata_migration,
														
 
															         reset_email,
														
 
															         reset_encrypt_key_pair,
														
 
															         reset_password,
														
@@ -24,7 +25,7 @@ def init_app(app: DifyApp):
 
															         reset_encrypt_key_pair,
														
 
															         vdb_migrate,
														
 
															         convert_to_agent_apps,
														
 
															-        add_qdrant_doc_id_index,
														
 
															+        add_qdrant_index,
														
 
															         create_tenant,
														
 
															         upgrade_db,
														
 
															         fix_app_site_missing,
														
@@ -32,6 +33,7 @@ def init_app(app: DifyApp):
 
															         extract_plugins,
														
 
															         extract_unique_plugins,
														
 
															         install_plugins,
														
 
															+        old_metadata_migration,
														
 
															     ]
														
 
															     for cmd in cmds_to_register:
														
 
															         app.cli.add_command(cmd)
														
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -46,7 +46,6 @@ from models.source import DataSourceOauthBinding
 
															 from services.entities.knowledge_entities.knowledge_entities import (
														
 
															     ChildChunkUpdateArgs,
														
 
															     KnowledgeConfig,
														
 
															-    MetaDataConfig,
														
 
															     RerankingModel,
														
 
															     RetrievalModel,
														
 
															     SegmentUpdateArgs,
														
@@ -999,9 +998,6 @@ class DocumentService:
 
															                                 document.data_source_info = json.dumps(data_source_info)
														
 
															                                 document.batch = batch
														
 
															                                 document.indexing_status = "waiting"
														
 
															-                                if knowledge_config.metadata:
														
 
															-                                    document.doc_type = knowledge_config.metadata.doc_type
														
 
															-                                    document.metadata = knowledge_config.metadata.doc_metadata
														
 
															                                 db.session.add(document)
														
 
															                                 documents.append(document)
														
 
															                                 duplicate_document_ids.append(document.id)
														
@@ -1018,7 +1014,6 @@ class DocumentService:
 
															                             account,
														
 
															                             file_name,
														
 
															                             batch,
														
 
															-                            knowledge_config.metadata,
														
 
															                         )
														
 
															                         db.session.add(document)
														
 
															                         db.session.flush()
														
@@ -1076,7 +1071,6 @@ class DocumentService:
 
															                                     account,
														
 
															                                     truncated_page_name,
														
 
															                                     batch,
														
 
															-                                    knowledge_config.metadata,
														
 
															                                 )
														
 
															                                 db.session.add(document)
														
 
															                                 db.session.flush()
														
@@ -1117,7 +1111,6 @@ class DocumentService:
 
															                             account,
														
 
															                             document_name,
														
 
															                             batch,
														
 
															-                            knowledge_config.metadata,
														
 
															                         )
														
 
															                         db.session.add(document)
														
 
															                         db.session.flush()
														
@@ -1155,7 +1148,6 @@ class DocumentService:
 
															         account: Account,
														
 
															         name: str,
														
 
															         batch: str,
														
 
															-        metadata: Optional[MetaDataConfig] = None,
														
 
															     ):
														
 
															         document = Document(
														
 
															             tenant_id=dataset.tenant_id,
														
@@ -1180,9 +1172,6 @@ class DocumentService:
 
															                 BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
														
 
															                 BuiltInField.source: data_source_type,
														
 
															             }
														
 
															-        if metadata is not None:
														
 
															-            doc_metadata.update(metadata.doc_metadata)
														
 
															-            document.doc_type = metadata.doc_type
														
 
															         if doc_metadata:
														
 
															             document.doc_metadata = doc_metadata
														
 
															         return document
														
@@ -1297,10 +1286,6 @@ class DocumentService:
 
															         # update document name
														
 
															         if document_data.name:
														
 
															             document.name = document_data.name
														
 
															-        # update doc_type and doc_metadata if provided
														
 
															-        if document_data.metadata is not None:
														
 
															-            document.doc_metadata = document_data.metadata.doc_metadata
														
 
															-            document.doc_type = document_data.metadata.doc_type
														
 
															         # update document to be waiting
														
 
															         document.indexing_status = "waiting"
														
 
															         document.completed_at = None
														
--- a/api/services/entities/knowledge_entities/knowledge_entities.py
+++ b/api/services/entities/knowledge_entities/knowledge_entities.py
@@ -128,7 +128,6 @@ class KnowledgeConfig(BaseModel):
 
															     embedding_model: Optional[str] = None
														
 
															     embedding_model_provider: Optional[str] = None
														
 
															     name: Optional[str] = None
														
 
															-    metadata: Optional[MetaDataConfig] = None
														
 
															 class SegmentUpdateArgs(BaseModel):
														
--- a/web/app/(commonLayout)/datasets/template/template.en.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.en.mdx
@@ -47,44 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
															       <Property name='text' type='string' key='text'>
														
 
															         Document content
														
 
															       </Property>
														
 
															-      <Property name='doc_type' type='string' key='doc_type'>
														
 
															-        Type of document (optional):
														
 
															-          - <code>book</code> Book
														
 
															-          - <code>web_page</code> Web page
														
 
															-          - <code>paper</code> Academic paper/article 
														
 
															-          - <code>social_media_post</code> Social media post
														
 
															-          - <code>wikipedia_entry</code> Wikipedia entry
														
 
															-          - <code>personal_document</code> Personal document
														
 
															-          - <code>business_document</code> Business document
														
 
															-          - <code>im_chat_log</code> Chat log
														
 
															-          - <code>synced_from_notion</code> Notion document
														
 
															-          - <code>synced_from_github</code> GitHub document
														
 
															-          - <code>others</code> Other document types
														
 
															-      </Property>
														
 
															-      <Property name='doc_metadata' type='object' key='doc_metadata'>
														
 
															-        Document metadata (required if doc_type is provided). Fields vary by doc_type:
														
 
															-          For <code>book</code>:
														
 
															-          - <code>title</code> Book title 
														
 
															-          - <code>language</code> Book language
														
 
															-          - <code>author</code> Book author
														
 
															-          - <code>publisher</code> Publisher name
														
 
															-          - <code>publication_date</code> Publication date
														
 
															-          - <code>isbn</code> ISBN number
														
 
															-          - <code>category</code> Book category
														
 
															-
														
 
															-          For <code>web_page</code>:
														
 
															-          - <code>title</code> Page title
														
 
															-          - <code>url</code> Page URL
														
 
															-          - <code>language</code> Page language
														
 
															-          - <code>publish_date</code> Publish date
														
 
															-          - <code>author/publisher</code> Author or publisher
														
 
															-          - <code>topic/keywords</code> Topic or keywords
														
 
															-          - <code>description</code> Page description
														
 
															-
														
 
															-          Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
														
 
															-
														
 
															-          For doc_type "others", any valid JSON object is accepted
														
 
															-      </Property>
														
 
															       <Property name='indexing_technique' type='string' key='indexing_technique'>
														
 
															         Index mode
														
 
															           - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
														
@@ -233,68 +195,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
															           - <code>hierarchical_model</code> Parent-child mode
														
 
															           - <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
														
 
															-        - <code>doc_type</code> Type of document (optional)
														
 
															-          - <code>book</code> Book
														
 
															-            Document records a book or publication
														
 
															-          - <code>web_page</code> Web page 
														
 
															-            Document records web page content
														
 
															-          - <code>paper</code> Academic paper/article
														
 
															-            Document records academic paper or research article
														
 
															-          - <code>social_media_post</code> Social media post
														
 
															-            Content from social media posts
														
 
															-          - <code>wikipedia_entry</code> Wikipedia entry
														
 
															-            Content from Wikipedia entries
														
 
															-          - <code>personal_document</code> Personal document
														
 
															-            Documents related to personal content
														
 
															-          - <code>business_document</code> Business document
														
 
															-            Documents related to business content
														
 
															-          - <code>im_chat_log</code> Chat log
														
 
															-            Records of instant messaging chats
														
 
															-          - <code>synced_from_notion</code> Notion document
														
 
															-            Documents synchronized from Notion
														
 
															-          - <code>synced_from_github</code> GitHub document
														
 
															-            Documents synchronized from GitHub
														
 
															-          - <code>others</code> Other document types
														
 
															-            Other document types not listed above
														
 
															-
														
 
															-        - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
														
 
															-          Fields vary by doc_type:
														
 
															-
														
 
															-          For <code>book</code>:
														
 
															-          - <code>title</code> Book title
														
 
															-            Title of the book
														
 
															-          - <code>language</code> Book language
														
 
															-            Language of the book
														
 
															-          - <code>author</code> Book author
														
 
															-            Author of the book
														
 
															-          - <code>publisher</code> Publisher name
														
 
															-            Name of the publishing house
														
 
															-          - <code>publication_date</code> Publication date
														
 
															-            Date when the book was published
														
 
															-          - <code>isbn</code> ISBN number
														
 
															-            International Standard Book Number
														
 
															-          - <code>category</code> Book category
														
 
															-            Category or genre of the book
														
 
															-
														
 
															-          For <code>web_page</code>:
														
 
															-          - <code>title</code> Page title
														
 
															-            Title of the web page
														
 
															-          - <code>url</code> Page URL
														
 
															-            URL address of the web page
														
 
															-          - <code>language</code> Page language
														
 
															-            Language of the web page
														
 
															-          - <code>publish_date</code> Publish date
														
 
															-            Date when the web page was published
														
 
															-          - <code>author/publisher</code> Author or publisher
														
 
															-            Author or publisher of the web page
														
 
															-          - <code>topic/keywords</code> Topic or keywords
														
 
															-            Topics or keywords of the web page
														
 
															-          - <code>description</code> Page description
														
 
															-            Description of the web page content
														
 
															-
														
 
															-          Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
														
 
															-          For doc_type "others", any valid JSON object is accepted
														
 
															-
														
 
															         - <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
														
 
															         - <code>process_rule</code> Processing rules
														
@@ -407,44 +307,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
															       <Property name='description' type='string' key='description'>
														
 
															         Knowledge description (optional)
														
 
															       </Property>
														
 
															-      <Property name='doc_type' type='string' key='doc_type'>
														
 
															-        Type of document (optional):
														
 
															-          - <code>book</code> Book
														
 
															-          - <code>web_page</code> Web page
														
 
															-          - <code>paper</code> Academic paper/article 
														
 
															-          - <code>social_media_post</code> Social media post
														
 
															-          - <code>wikipedia_entry</code> Wikipedia entry
														
 
															-          - <code>personal_document</code> Personal document
														
 
															-          - <code>business_document</code> Business document
														
 
															-          - <code>im_chat_log</code> Chat log
														
 
															-          - <code>synced_from_notion</code> Notion document
														
 
															-          - <code>synced_from_github</code> GitHub document
														
 
															-          - <code>others</code> Other document types
														
 
															-      </Property>
														
 
															-      <Property name='doc_metadata' type='object' key='doc_metadata'>
														
 
															-        Document metadata (required if doc_type is provided). Fields vary by doc_type:
														
 
															-          For <code>book</code>:
														
 
															-          - <code>title</code> Book title 
														
 
															-          - <code>language</code> Book language
														
 
															-          - <code>author</code> Book author
														
 
															-          - <code>publisher</code> Publisher name
														
 
															-          - <code>publication_date</code> Publication date
														
 
															-          - <code>isbn</code> ISBN number
														
 
															-          - <code>category</code> Book category
														
 
															-
														
 
															-          For <code>web_page</code>:
														
 
															-          - <code>title</code> Page title
														
 
															-          - <code>url</code> Page URL
														
 
															-          - <code>language</code> Page language
														
 
															-          - <code>publish_date</code> Publish date
														
 
															-          - <code>author/publisher</code> Author or publisher
														
 
															-          - <code>topic/keywords</code> Topic or keywords
														
 
															-          - <code>description</code> Page description
														
 
															-
														
 
															-          Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
														
 
															-
														
 
															-          For doc_type "others", any valid JSON object is accepted
														
 
															-      </Property>
														
 
															       <Property name='indexing_technique' type='string' key='indexing_technique'>
														
 
															         Index technique (optional)
														
 
															           - <code>high_quality</code> High quality
														
@@ -762,67 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
															               - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
														
 
															               - <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
														
 
															               - <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
														
 
															-            - <code>doc_type</code> Type of document (optional)
														
 
															-              - <code>book</code> Book
														
 
															-                Document records a book or publication
														
 
															-              - <code>web_page</code> Web page 
														
 
															-                Document records web page content
														
 
															-              - <code>paper</code> Academic paper/article
														
 
															-                Document records academic paper or research article
														
 
															-              - <code>social_media_post</code> Social media post
														
 
															-                Content from social media posts
														
 
															-              - <code>wikipedia_entry</code> Wikipedia entry
														
 
															-                Content from Wikipedia entries
														
 
															-              - <code>personal_document</code> Personal document
														
 
															-                Documents related to personal content
														
 
															-              - <code>business_document</code> Business document
														
 
															-                Documents related to business content
														
 
															-              - <code>im_chat_log</code> Chat log
														
 
															-                Records of instant messaging chats
														
 
															-              - <code>synced_from_notion</code> Notion document
														
 
															-                Documents synchronized from Notion
														
 
															-              - <code>synced_from_github</code> GitHub document
														
 
															-                Documents synchronized from GitHub
														
 
															-              - <code>others</code> Other document types
														
 
															-                Other document types not listed above
														
 
															-
														
 
															-            - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
														
 
															-              Fields vary by doc_type:
														
 
															-
														
 
															-              For <code>book</code>:
														
 
															-              - <code>title</code> Book title
														
 
															-                Title of the book
														
 
															-              - <code>language</code> Book language
														
 
															-                Language of the book
														
 
															-              - <code>author</code> Book author
														
 
															-                Author of the book
														
 
															-              - <code>publisher</code> Publisher name
														
 
															-                Name of the publishing house
														
 
															-              - <code>publication_date</code> Publication date
														
 
															-                Date when the book was published
														
 
															-              - <code>isbn</code> ISBN number
														
 
															-                International Standard Book Number
														
 
															-              - <code>category</code> Book category
														
 
															-                Category or genre of the book
														
 
															-
														
 
															-              For <code>web_page</code>:
														
 
															-              - <code>title</code> Page title
														
 
															-                Title of the web page
														
 
															-              - <code>url</code> Page URL
														
 
															-                URL address of the web page
														
 
															-              - <code>language</code> Page language
														
 
															-                Language of the web page
														
 
															-              - <code>publish_date</code> Publish date
														
 
															-                Date when the web page was published
														
 
															-              - <code>author/publisher</code> Author or publisher
														
 
															-                Author or publisher of the web page
														
 
															-              - <code>topic/keywords</code> Topic or keywords
														
 
															-                Topics or keywords of the web page
														
 
															-              - <code>description</code> Page description
														
 
															-                Description of the web page content
														
 
															-
														
 
															-              Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
														
 
															-              For doc_type "others", any valid JSON object is accepted
														
 
															       </Property>
														
 
															     </Properties>
														
 
															   </Col>
														
@@ -1528,7 +1329,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
															               "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
														
 
															               "data_source_type": "upload_file",
														
 
															               "name": "readme.txt",
														
 
															-              "doc_type": null
														
 
															             }
														
 
															           },
														
 
															           "score": 3.730463140527718e-05,
														
--- a/web/app/(commonLayout)/datasets/template/template.zh.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx
@@ -47,46 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
															       <Property name='text' type='string' key='text'>
														
 
															         文档内容
														
 
															       </Property>
														
 
															-      <Property name='doc_type' type='string' key='doc_type'>
														
 
															-        文档类型（选填）
														
 
															-          - <code>book</code> 图书 Book
														
 
															-          - <code>web_page</code> 网页 Web page
														
 
															-          - <code>paper</code> 学术论文/文章 Academic paper/article 
														
 
															-          - <code>social_media_post</code> 社交媒体帖子 Social media post
														
 
															-          - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
														
 
															-          - <code>personal_document</code> 个人文档 Personal document
														
 
															-          - <code>business_document</code> 商业文档 Business document
														
 
															-          - <code>im_chat_log</code> 即时通讯记录 Chat log
														
 
															-          - <code>synced_from_notion</code> Notion同步文档 Notion document
														
 
															-          - <code>synced_from_github</code> GitHub同步文档 GitHub document
														
 
															-          - <code>others</code> 其他文档类型 Other document types
														
 
															-      </Property>
														
 
															-      <Property name='doc_metadata' type='object' key='doc_metadata'>
														
 
															-      
														
 
															-        文档元数据（如提供文档类型则必填）。字段因文档类型而异：
														
 
															-          
														
 
															-          针对图书 For <code>book</code>:
														
 
															-          - <code>title</code> 书名 Book title 
														
 
															-          - <code>language</code> 图书语言 Book language
														
 
															-          - <code>author</code> 作者 Book author
														
 
															-          - <code>publisher</code> 出版社 Publisher name
														
 
															-          - <code>publication_date</code> 出版日期 Publication date
														
 
															-          - <code>isbn</code> ISBN号码 ISBN number
														
 
															-          - <code>category</code> 图书分类 Book category
														
 
															-
														
 
															-          针对网页 For <code>web_page</code>:
														
 
															-          - <code>title</code> 页面标题 Page title
														
 
															-          - <code>url</code> 页面网址 Page URL
														
 
															-          - <code>language</code> 页面语言 Page language
														
 
															-          - <code>publish_date</code> 发布日期 Publish date
														
 
															-          - <code>author/publisher</code> 作者/发布者 Author or publisher
														
 
															-          - <code>topic/keywords</code> 主题/关键词 Topic or keywords
														
 
															-          - <code>description</code> 页面描述 Page description
														
 
															-
														
 
															-          请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
														
 
															-
														
 
															-          针对"其他"类型文档，接受任何有效的JSON对象
														
 
															-      </Property>
														
 
															       <Property name='indexing_technique' type='string' key='indexing_technique'>
														
 
															         索引方式
														
 
															           - <code>high_quality</code> 高质量：使用 embedding 模型进行嵌入，构建为向量数据库索引
														
@@ -234,68 +194,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
															           - <code>text_model</code> text 文档直接 embedding，经济模式默认为该模式
														
 
															           - <code>hierarchical_model</code> parent-child 模式
														
 
															           - <code>qa_model</code> Q&A 模式：为分片文档生成 Q&A 对，然后对问题进行 embedding
														
 
															-        - <code>doc_type</code> 文档类型（选填）Type of document (optional)
														
 
															-          - <code>book</code> 图书
														
 
															-            文档记录一本书籍或出版物
														
 
															-          - <code>web_page</code> 网页
														
 
															-            网页内容的文档记录
														
 
															-          - <code>paper</code> 学术论文/文章
														
 
															-            学术论文或研究文章的记录
														
 
															-          - <code>social_media_post</code> 社交媒体帖子
														
 
															-            社交媒体上的帖子内容
														
 
															-          - <code>wikipedia_entry</code> 维基百科条目
														
 
															-            维基百科的词条内容
														
 
															-          - <code>personal_document</code> 个人文档
														
 
															-            个人相关的文档记录
														
 
															-          - <code>business_document</code> 商业文档
														
 
															-            商业相关的文档记录
														
 
															-          - <code>im_chat_log</code> 即时通讯记录
														
 
															-            即时通讯的聊天记录
														
 
															-          - <code>synced_from_notion</code> Notion同步文档
														
 
															-            从Notion同步的文档内容
														
 
															-          - <code>synced_from_github</code> GitHub同步文档
														
 
															-            从GitHub同步的文档内容
														
 
															-          - <code>others</code> 其他文档类型
														
 
															-            其他未列出的文档类型
														
 
															-
														
 
															-        - <code>doc_metadata</code> 文档元数据（如提供文档类型则必填）
														
 
															-          字段因文档类型而异
														
 
															-
														
 
															-          针对图书类型 For <code>book</code>:
														
 
															-          - <code>title</code> 书名
														
 
															-            书籍的标题
														
 
															-          - <code>language</code> 图书语言
														
 
															-            书籍的语言
														
 
															-          - <code>author</code> 作者
														
 
															-            书籍的作者
														
 
															-          - <code>publisher</code> 出版社
														
 
															-            出版社的名称
														
 
															-          - <code>publication_date</code> 出版日期
														
 
															-            书籍的出版日期
														
 
															-          - <code>isbn</code> ISBN号码
														
 
															-            书籍的ISBN编号
														
 
															-          - <code>category</code> 图书分类
														
 
															-            书籍的分类类别
														
 
															-
														
 
															-          针对网页类型 For <code>web_page</code>:
														
 
															-          - <code>title</code> 页面标题
														
 
															-            网页的标题
														
 
															-          - <code>url</code> 页面网址
														
 
															-            网页的URL地址
														
 
															-          - <code>language</code> 页面语言
														
 
															-            网页的语言
														
 
															-          - <code>publish_date</code> 发布日期
														
 
															-            网页的发布日期
														
 
															-          - <code>author/publisher</code> 作者/发布者
														
 
															-            网页的作者或发布者
														
 
															-          - <code>topic/keywords</code> 主题/关键词
														
 
															-            网页的主题或关键词
														
 
															-          - <code>description</code> 页面描述
														
 
															-            网页的描述信息
														
 
															-
														
 
															-          请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
														
 
															-
														
 
															-          针对"其他"类型文档，接受任何有效的JSON对象
														
 
															         - <code>doc_language</code> 在 Q&A 模式下，指定文档的语言，例如：<code>English</code>、<code>Chinese</code>
														
@@ -606,46 +504,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
															       <Property name='text' type='string' key='text'>
														
 
															         文档内容（选填）
														
 
															       </Property>
														
 
															-      <Property name='doc_type' type='string' key='doc_type'>
														
 
															-        文档类型（选填）
														
 
															-          - <code>book</code> 图书 Book
														
 
															-          - <code>web_page</code> 网页 Web page
														
 
															-          - <code>paper</code> 学术论文/文章 Academic paper/article 
														
 
															-          - <code>social_media_post</code> 社交媒体帖子 Social media post
														
 
															-          - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
														
 
															-          - <code>personal_document</code> 个人文档 Personal document
														
 
															-          - <code>business_document</code> 商业文档 Business document
														
 
															-          - <code>im_chat_log</code> 即时通讯记录 Chat log
														
 
															-          - <code>synced_from_notion</code> Notion同步文档 Notion document
														
 
															-          - <code>synced_from_github</code> GitHub同步文档 GitHub document
														
 
															-          - <code>others</code> 其他文档类型 Other document types
														
 
															-      </Property>
														
 
															-      <Property name='doc_metadata' type='object' key='doc_metadata'>
														
 
															-      
														
 
															-        文档元数据（如提供文档类型则必填）。字段因文档类型而异：
														
 
															-          
														
 
															-          针对图书 For <code>book</code>:
														
 
															-          - <code>title</code> 书名 Book title 
														
 
															-          - <code>language</code> 图书语言 Book language
														
 
															-          - <code>author</code> 作者 Book author
														
 
															-          - <code>publisher</code> 出版社 Publisher name
														
 
															-          - <code>publication_date</code> 出版日期 Publication date
														
 
															-          - <code>isbn</code> ISBN号码 ISBN number
														
 
															-          - <code>category</code> 图书分类 Book category
														
 
															-
														
 
															-          针对网页 For <code>web_page</code>:
														
 
															-          - <code>title</code> 页面标题 Page title
														
 
															-          - <code>url</code> 页面网址 Page URL
														
 
															-          - <code>language</code> 页面语言 Page language
														
 
															-          - <code>publish_date</code> 发布日期 Publish date
														
 
															-          - <code>author/publisher</code> 作者/发布者 Author or publisher
														
 
															-          - <code>topic/keywords</code> 主题/关键词 Topic or keywords
														
 
															-          - <code>description</code> 页面描述 Page description
														
 
															-
														
 
															-          请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
														
 
															-
														
 
															-          针对"其他"类型文档，接受任何有效的JSON对象
														
 
															-      </Property>
														
 
															       <Property name='process_rule' type='object' key='process_rule'>
														
 
															         处理规则（选填）
														
 
															           - <code>mode</code> (string) 清洗、分段模式，automatic 自动 / custom 自定义
														
@@ -766,68 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
															               - <code>separator</code> 分段标识符，目前仅允许设置一个分隔符。默认为 <code>***</code>
														
 
															               - <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
														
 
															               - <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时，段与段之间存在一定的重叠部分（选填）
														
 
															-            - <code>doc_type</code> 文档类型（选填）Type of document (optional)
														
 
															-              - <code>book</code> 图书
														
 
															-                文档记录一本书籍或出版物
														
 
															-              - <code>web_page</code> 网页
														
 
															-                网页内容的文档记录
														
 
															-              - <code>paper</code> 学术论文/文章
														
 
															-                学术论文或研究文章的记录
														
 
															-              - <code>social_media_post</code> 社交媒体帖子
														
 
															-                社交媒体上的帖子内容
														
 
															-              - <code>wikipedia_entry</code> 维基百科条目
														
 
															-                维基百科的词条内容
														
 
															-              - <code>personal_document</code> 个人文档
														
 
															-                个人相关的文档记录
														
 
															-              - <code>business_document</code> 商业文档
														
 
															-                商业相关的文档记录
														
 
															-              - <code>im_chat_log</code> 即时通讯记录
														
 
															-                即时通讯的聊天记录
														
 
															-              - <code>synced_from_notion</code> Notion同步文档
														
 
															-                从Notion同步的文档内容
														
 
															-              - <code>synced_from_github</code> GitHub同步文档
														
 
															-                从GitHub同步的文档内容
														
 
															-              - <code>others</code> 其他文档类型
														
 
															-                其他未列出的文档类型
														
 
															-
														
 
															-            - <code>doc_metadata</code> 文档元数据（如提供文档类型则必填）
														
 
															-              字段因文档类型而异
														
 
															-
														
 
															-              针对图书类型 For <code>book</code>:
														
 
															-              - <code>title</code> 书名
														
 
															-                书籍的标题
														
 
															-              - <code>language</code> 图书语言
														
 
															-                书籍的语言
														
 
															-              - <code>author</code> 作者
														
 
															-                书籍的作者
														
 
															-              - <code>publisher</code> 出版社
														
 
															-                出版社的名称
														
 
															-              - <code>publication_date</code> 出版日期
														
 
															-                书籍的出版日期
														
 
															-              - <code>isbn</code> ISBN号码
														
 
															-                书籍的ISBN编号
														
 
															-              - <code>category</code> 图书分类
														
 
															-                书籍的分类类别
														
 
															-
														
 
															-              针对网页类型 For <code>web_page</code>:
														
 
															-              - <code>title</code> 页面标题
														
 
															-                网页的标题
														
 
															-              - <code>url</code> 页面网址
														
 
															-                网页的URL地址
														
 
															-              - <code>language</code> 页面语言
														
 
															-                网页的语言
														
 
															-              - <code>publish_date</code> 发布日期
														
 
															-                网页的发布日期
														
 
															-              - <code>author/publisher</code> 作者/发布者
														
 
															-                网页的作者或发布者
														
 
															-              - <code>topic/keywords</code> 主题/关键词
														
 
															-                网页的主题或关键词
														
 
															-              - <code>description</code> 页面描述
														
 
															-                网页的描述信息
														
 
															-
														
 
															-              请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
														
 
															-
														
 
															-              针对"其他"类型文档，接受任何有效的JSON对象
														
 
															       </Property>
														
 
															     </Properties>
														
 
															   </Col>
														
@@ -1534,7 +1330,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
															               "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
														
 
															               "data_source_type": "upload_file",
														
 
															               "name": "readme.txt",
														
 
															-              "doc_type": null
														
 
															             }
														
 
															           },
														
 
															           "score": 3.730463140527718e-05,