Explorar o código

Fix/create document by api with metadata (#16307)

Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Jyong hai 1 mes
pai
achega
2c9af712a2

+ 71 - 8
api/commands.py

@@ -20,7 +20,7 @@ from libs.helper import email as email_validate
 from libs.password import hash_password, password_pattern, valid_password
 from libs.password import hash_password, password_pattern, valid_password
 from libs.rsa import generate_key_pair
 from libs.rsa import generate_key_pair
 from models import Tenant
 from models import Tenant
-from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment
+from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment
 from models.dataset import Document as DatasetDocument
 from models.dataset import Document as DatasetDocument
 from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
 from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
 from models.provider import Provider, ProviderModel
 from models.provider import Provider, ProviderModel
@@ -483,14 +483,11 @@ def convert_to_agent_apps():
     click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))
     click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))
 
 
 
 
-@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.")
+@click.command("add-qdrant-index", help="Add Qdrant index.")
 @click.option("--field", default="metadata.doc_id", prompt=False, help="Index field , default is metadata.doc_id.")
 @click.option("--field", default="metadata.doc_id", prompt=False, help="Index field , default is metadata.doc_id.")
-def add_qdrant_doc_id_index(field: str):
-    click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green"))
-    vector_type = dify_config.VECTOR_STORE
-    if vector_type != "qdrant":
-        click.echo(click.style("This command only supports Qdrant vector store.", fg="red"))
-        return
+def add_qdrant_index(field: str):
+    click.echo(click.style("Starting Qdrant index creation.", fg="green"))
+
     create_count = 0
     create_count = 0
 
 
     try:
     try:
@@ -539,6 +536,72 @@ def add_qdrant_doc_id_index(field: str):
     click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
     click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
 
 
 
 
+@click.command("old-metadata-migration", help="Old metadata migration.")
+def old_metadata_migration():
+    """
+    Old metadata migration.
+    """
+    click.echo(click.style("Starting old metadata migration.", fg="green"))
+
+    page = 1
+    while True:
+        try:
+            documents = (
+                DatasetDocument.query.filter(DatasetDocument.doc_metadata is not None)
+                .order_by(DatasetDocument.created_at.desc())
+                .paginate(page=page, per_page=50)
+            )
+        except NotFound:
+            break
+        if not documents:
+            break
+        for document in documents:
+            if document.doc_metadata:
+                doc_metadata = document.doc_metadata
+                for key, value in doc_metadata.items():
+                    dataset_metadata = (
+                        db.session.query(DatasetMetadata)
+                        .filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key)
+                        .first()
+                    )
+                    if not dataset_metadata:
+                        dataset_metadata = DatasetMetadata(
+                            tenant_id=document.tenant_id,
+                            dataset_id=document.dataset_id,
+                            name=key,
+                            type="string",
+                            created_by=document.created_by,
+                        )
+                        db.session.add(dataset_metadata)
+                        db.session.flush()
+                        dataset_metadata_binding = DatasetMetadataBinding(
+                            tenant_id=document.tenant_id,
+                            dataset_id=document.dataset_id,
+                            metadata_id=dataset_metadata.id,
+                            document_id=document.id,
+                            created_by=document.created_by,
+                        )
+                        db.session.add(dataset_metadata_binding)
+                    else:
+                        dataset_metadata_binding = DatasetMetadataBinding.query.filter(
+                            DatasetMetadataBinding.dataset_id == document.dataset_id,
+                            DatasetMetadataBinding.document_id == document.id,
+                            DatasetMetadataBinding.metadata_id == dataset_metadata.id,
+                        ).first()
+                        if not dataset_metadata_binding:
+                            dataset_metadata_binding = DatasetMetadataBinding(
+                                tenant_id=document.tenant_id,
+                                dataset_id=document.dataset_id,
+                                metadata_id=dataset_metadata.id,
+                                document_id=document.id,
+                                created_by=document.created_by,
+                            )
+                            db.session.add(dataset_metadata_binding)
+                db.session.commit()
+        page += 1
+    click.echo(click.style("Old metadata migration completed.", fg="green"))
+
+
 @click.command("create-tenant", help="Create account and tenant.")
 @click.command("create-tenant", help="Create account and tenant.")
 @click.option("--email", prompt=True, help="Tenant account email.")
 @click.option("--email", prompt=True, help="Tenant account email.")
 @click.option("--name", prompt=True, help="Workspace name.")
 @click.option("--name", prompt=True, help="Workspace name.")

+ 0 - 96
api/controllers/service_api/dataset/document.py

@@ -18,7 +18,6 @@ from controllers.service_api.app.error import (
 from controllers.service_api.dataset.error import (
 from controllers.service_api.dataset.error import (
     ArchivedDocumentImmutableError,
     ArchivedDocumentImmutableError,
     DocumentIndexingError,
     DocumentIndexingError,
-    InvalidMetadataError,
 )
 )
 from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
 from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
 from core.errors.error import ProviderTokenNotInitError
 from core.errors.error import ProviderTokenNotInitError
@@ -51,8 +50,6 @@ class DocumentAddByTextApi(DatasetApiResource):
             "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
             "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
         )
         )
         parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
         parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
-        parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
-        parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
 
 
         args = parser.parse_args()
         args = parser.parse_args()
         dataset_id = str(dataset_id)
         dataset_id = str(dataset_id)
@@ -65,28 +62,6 @@ class DocumentAddByTextApi(DatasetApiResource):
         if not dataset.indexing_technique and not args["indexing_technique"]:
         if not dataset.indexing_technique and not args["indexing_technique"]:
             raise ValueError("indexing_technique is required.")
             raise ValueError("indexing_technique is required.")
 
 
-        # Validate metadata if provided
-        if args.get("doc_type") or args.get("doc_metadata"):
-            if not args.get("doc_type") or not args.get("doc_metadata"):
-                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
-
-            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
-                raise InvalidMetadataError(
-                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
-                )
-
-            if not isinstance(args["doc_metadata"], dict):
-                raise InvalidMetadataError("doc_metadata must be a dictionary")
-
-            # Validate metadata schema based on doc_type
-            if args["doc_type"] != "others":
-                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
-                for key, value in args["doc_metadata"].items():
-                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
-                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
-            # set to MetaDataConfig
-            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
-
         text = args.get("text")
         text = args.get("text")
         name = args.get("name")
         name = args.get("name")
         if text is None or name is None:
         if text is None or name is None:
@@ -133,8 +108,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
             "doc_language", type=str, default="English", required=False, nullable=False, location="json"
             "doc_language", type=str, default="English", required=False, nullable=False, location="json"
         )
         )
         parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
         parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
-        parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
-        parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
         args = parser.parse_args()
         args = parser.parse_args()
         dataset_id = str(dataset_id)
         dataset_id = str(dataset_id)
         tenant_id = str(tenant_id)
         tenant_id = str(tenant_id)
@@ -146,29 +119,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
         # indexing_technique is already set in dataset since this is an update
         # indexing_technique is already set in dataset since this is an update
         args["indexing_technique"] = dataset.indexing_technique
         args["indexing_technique"] = dataset.indexing_technique
 
 
-        # Validate metadata if provided
-        if args.get("doc_type") or args.get("doc_metadata"):
-            if not args.get("doc_type") or not args.get("doc_metadata"):
-                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
-
-            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
-                raise InvalidMetadataError(
-                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
-                )
-
-            if not isinstance(args["doc_metadata"], dict):
-                raise InvalidMetadataError("doc_metadata must be a dictionary")
-
-            # Validate metadata schema based on doc_type
-            if args["doc_type"] != "others":
-                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
-                for key, value in args["doc_metadata"].items():
-                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
-                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
-
-            # set to MetaDataConfig
-            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
-
         if args["text"]:
         if args["text"]:
             text = args.get("text")
             text = args.get("text")
             name = args.get("name")
             name = args.get("name")
@@ -216,29 +166,6 @@ class DocumentAddByFileApi(DatasetApiResource):
         if "doc_language" not in args:
         if "doc_language" not in args:
             args["doc_language"] = "English"
             args["doc_language"] = "English"
 
 
-        # Validate metadata if provided
-        if args.get("doc_type") or args.get("doc_metadata"):
-            if not args.get("doc_type") or not args.get("doc_metadata"):
-                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
-
-            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
-                raise InvalidMetadataError(
-                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
-                )
-
-            if not isinstance(args["doc_metadata"], dict):
-                raise InvalidMetadataError("doc_metadata must be a dictionary")
-
-            # Validate metadata schema based on doc_type
-            if args["doc_type"] != "others":
-                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
-                for key, value in args["doc_metadata"].items():
-                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
-                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
-
-            # set to MetaDataConfig
-            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
-
         # get dataset info
         # get dataset info
         dataset_id = str(dataset_id)
         dataset_id = str(dataset_id)
         tenant_id = str(tenant_id)
         tenant_id = str(tenant_id)
@@ -306,29 +233,6 @@ class DocumentUpdateByFileApi(DatasetApiResource):
         if "doc_language" not in args:
         if "doc_language" not in args:
             args["doc_language"] = "English"
             args["doc_language"] = "English"
 
 
-        # Validate metadata if provided
-        if args.get("doc_type") or args.get("doc_metadata"):
-            if not args.get("doc_type") or not args.get("doc_metadata"):
-                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
-
-            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
-                raise InvalidMetadataError(
-                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
-                )
-
-            if not isinstance(args["doc_metadata"], dict):
-                raise InvalidMetadataError("doc_metadata must be a dictionary")
-
-            # Validate metadata schema based on doc_type
-            if args["doc_type"] != "others":
-                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
-                for key, value in args["doc_metadata"].items():
-                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
-                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
-
-            # set to MetaDataConfig
-            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
-
         # get dataset info
         # get dataset info
         dataset_id = str(dataset_id)
         dataset_id = str(dataset_id)
         tenant_id = str(tenant_id)
         tenant_id = str(tenant_id)

+ 4 - 2
api/extensions/ext_commands.py

@@ -3,7 +3,7 @@ from dify_app import DifyApp
 
 
 def init_app(app: DifyApp):
 def init_app(app: DifyApp):
     from commands import (
     from commands import (
-        add_qdrant_doc_id_index,
+        add_qdrant_index,
         convert_to_agent_apps,
         convert_to_agent_apps,
         create_tenant,
         create_tenant,
         extract_plugins,
         extract_plugins,
@@ -11,6 +11,7 @@ def init_app(app: DifyApp):
         fix_app_site_missing,
         fix_app_site_missing,
         install_plugins,
         install_plugins,
         migrate_data_for_plugin,
         migrate_data_for_plugin,
+        old_metadata_migration,
         reset_email,
         reset_email,
         reset_encrypt_key_pair,
         reset_encrypt_key_pair,
         reset_password,
         reset_password,
@@ -24,7 +25,7 @@ def init_app(app: DifyApp):
         reset_encrypt_key_pair,
         reset_encrypt_key_pair,
         vdb_migrate,
         vdb_migrate,
         convert_to_agent_apps,
         convert_to_agent_apps,
-        add_qdrant_doc_id_index,
+        add_qdrant_index,
         create_tenant,
         create_tenant,
         upgrade_db,
         upgrade_db,
         fix_app_site_missing,
         fix_app_site_missing,
@@ -32,6 +33,7 @@ def init_app(app: DifyApp):
         extract_plugins,
         extract_plugins,
         extract_unique_plugins,
         extract_unique_plugins,
         install_plugins,
         install_plugins,
+        old_metadata_migration,
     ]
     ]
     for cmd in cmds_to_register:
     for cmd in cmds_to_register:
         app.cli.add_command(cmd)
         app.cli.add_command(cmd)

+ 0 - 15
api/services/dataset_service.py

@@ -46,7 +46,6 @@ from models.source import DataSourceOauthBinding
 from services.entities.knowledge_entities.knowledge_entities import (
 from services.entities.knowledge_entities.knowledge_entities import (
     ChildChunkUpdateArgs,
     ChildChunkUpdateArgs,
     KnowledgeConfig,
     KnowledgeConfig,
-    MetaDataConfig,
     RerankingModel,
     RerankingModel,
     RetrievalModel,
     RetrievalModel,
     SegmentUpdateArgs,
     SegmentUpdateArgs,
@@ -999,9 +998,6 @@ class DocumentService:
                                 document.data_source_info = json.dumps(data_source_info)
                                 document.data_source_info = json.dumps(data_source_info)
                                 document.batch = batch
                                 document.batch = batch
                                 document.indexing_status = "waiting"
                                 document.indexing_status = "waiting"
-                                if knowledge_config.metadata:
-                                    document.doc_type = knowledge_config.metadata.doc_type
-                                    document.metadata = knowledge_config.metadata.doc_metadata
                                 db.session.add(document)
                                 db.session.add(document)
                                 documents.append(document)
                                 documents.append(document)
                                 duplicate_document_ids.append(document.id)
                                 duplicate_document_ids.append(document.id)
@@ -1018,7 +1014,6 @@ class DocumentService:
                             account,
                             account,
                             file_name,
                             file_name,
                             batch,
                             batch,
-                            knowledge_config.metadata,
                         )
                         )
                         db.session.add(document)
                         db.session.add(document)
                         db.session.flush()
                         db.session.flush()
@@ -1076,7 +1071,6 @@ class DocumentService:
                                     account,
                                     account,
                                     truncated_page_name,
                                     truncated_page_name,
                                     batch,
                                     batch,
-                                    knowledge_config.metadata,
                                 )
                                 )
                                 db.session.add(document)
                                 db.session.add(document)
                                 db.session.flush()
                                 db.session.flush()
@@ -1117,7 +1111,6 @@ class DocumentService:
                             account,
                             account,
                             document_name,
                             document_name,
                             batch,
                             batch,
-                            knowledge_config.metadata,
                         )
                         )
                         db.session.add(document)
                         db.session.add(document)
                         db.session.flush()
                         db.session.flush()
@@ -1155,7 +1148,6 @@ class DocumentService:
         account: Account,
         account: Account,
         name: str,
         name: str,
         batch: str,
         batch: str,
-        metadata: Optional[MetaDataConfig] = None,
     ):
     ):
         document = Document(
         document = Document(
             tenant_id=dataset.tenant_id,
             tenant_id=dataset.tenant_id,
@@ -1180,9 +1172,6 @@ class DocumentService:
                 BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
                 BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
                 BuiltInField.source: data_source_type,
                 BuiltInField.source: data_source_type,
             }
             }
-        if metadata is not None:
-            doc_metadata.update(metadata.doc_metadata)
-            document.doc_type = metadata.doc_type
         if doc_metadata:
         if doc_metadata:
             document.doc_metadata = doc_metadata
             document.doc_metadata = doc_metadata
         return document
         return document
@@ -1297,10 +1286,6 @@ class DocumentService:
         # update document name
         # update document name
         if document_data.name:
         if document_data.name:
             document.name = document_data.name
             document.name = document_data.name
-        # update doc_type and doc_metadata if provided
-        if document_data.metadata is not None:
-            document.doc_metadata = document_data.metadata.doc_metadata
-            document.doc_type = document_data.metadata.doc_type
         # update document to be waiting
         # update document to be waiting
         document.indexing_status = "waiting"
         document.indexing_status = "waiting"
         document.completed_at = None
         document.completed_at = None

+ 0 - 1
api/services/entities/knowledge_entities/knowledge_entities.py

@@ -128,7 +128,6 @@ class KnowledgeConfig(BaseModel):
     embedding_model: Optional[str] = None
     embedding_model: Optional[str] = None
     embedding_model_provider: Optional[str] = None
     embedding_model_provider: Optional[str] = None
     name: Optional[str] = None
     name: Optional[str] = None
-    metadata: Optional[MetaDataConfig] = None
 
 
 
 
 class SegmentUpdateArgs(BaseModel):
 class SegmentUpdateArgs(BaseModel):

+ 0 - 200
web/app/(commonLayout)/datasets/template/template.en.mdx

@@ -47,44 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
       <Property name='text' type='string' key='text'>
       <Property name='text' type='string' key='text'>
         Document content
         Document content
       </Property>
       </Property>
-      <Property name='doc_type' type='string' key='doc_type'>
-        Type of document (optional):
-          - <code>book</code> Book
-          - <code>web_page</code> Web page
-          - <code>paper</code> Academic paper/article 
-          - <code>social_media_post</code> Social media post
-          - <code>wikipedia_entry</code> Wikipedia entry
-          - <code>personal_document</code> Personal document
-          - <code>business_document</code> Business document
-          - <code>im_chat_log</code> Chat log
-          - <code>synced_from_notion</code> Notion document
-          - <code>synced_from_github</code> GitHub document
-          - <code>others</code> Other document types
-      </Property>
-      <Property name='doc_metadata' type='object' key='doc_metadata'>
-        Document metadata (required if doc_type is provided). Fields vary by doc_type:
-          For <code>book</code>:
-          - <code>title</code> Book title 
-          - <code>language</code> Book language
-          - <code>author</code> Book author
-          - <code>publisher</code> Publisher name
-          - <code>publication_date</code> Publication date
-          - <code>isbn</code> ISBN number
-          - <code>category</code> Book category
-
-          For <code>web_page</code>:
-          - <code>title</code> Page title
-          - <code>url</code> Page URL
-          - <code>language</code> Page language
-          - <code>publish_date</code> Publish date
-          - <code>author/publisher</code> Author or publisher
-          - <code>topic/keywords</code> Topic or keywords
-          - <code>description</code> Page description
-
-          Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
-
-          For doc_type "others", any valid JSON object is accepted
-      </Property>
       <Property name='indexing_technique' type='string' key='indexing_technique'>
       <Property name='indexing_technique' type='string' key='indexing_technique'>
         Index mode
         Index mode
           - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
           - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
@@ -233,68 +195,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
           - <code>hierarchical_model</code> Parent-child mode
           - <code>hierarchical_model</code> Parent-child mode
           - <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
           - <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
 
 
-        - <code>doc_type</code> Type of document (optional)
-          - <code>book</code> Book
-            Document records a book or publication
-          - <code>web_page</code> Web page 
-            Document records web page content
-          - <code>paper</code> Academic paper/article
-            Document records academic paper or research article
-          - <code>social_media_post</code> Social media post
-            Content from social media posts
-          - <code>wikipedia_entry</code> Wikipedia entry
-            Content from Wikipedia entries
-          - <code>personal_document</code> Personal document
-            Documents related to personal content
-          - <code>business_document</code> Business document
-            Documents related to business content
-          - <code>im_chat_log</code> Chat log
-            Records of instant messaging chats
-          - <code>synced_from_notion</code> Notion document
-            Documents synchronized from Notion
-          - <code>synced_from_github</code> GitHub document
-            Documents synchronized from GitHub
-          - <code>others</code> Other document types
-            Other document types not listed above
-
-        - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
-          Fields vary by doc_type:
-
-          For <code>book</code>:
-          - <code>title</code> Book title
-            Title of the book
-          - <code>language</code> Book language
-            Language of the book
-          - <code>author</code> Book author
-            Author of the book
-          - <code>publisher</code> Publisher name
-            Name of the publishing house
-          - <code>publication_date</code> Publication date
-            Date when the book was published
-          - <code>isbn</code> ISBN number
-            International Standard Book Number
-          - <code>category</code> Book category
-            Category or genre of the book
-
-          For <code>web_page</code>:
-          - <code>title</code> Page title
-            Title of the web page
-          - <code>url</code> Page URL
-            URL address of the web page
-          - <code>language</code> Page language
-            Language of the web page
-          - <code>publish_date</code> Publish date
-            Date when the web page was published
-          - <code>author/publisher</code> Author or publisher
-            Author or publisher of the web page
-          - <code>topic/keywords</code> Topic or keywords
-            Topics or keywords of the web page
-          - <code>description</code> Page description
-            Description of the web page content
-
-          Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
-          For doc_type "others", any valid JSON object is accepted
-
         - <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
         - <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
 
 
         - <code>process_rule</code> Processing rules
         - <code>process_rule</code> Processing rules
@@ -407,44 +307,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
       <Property name='description' type='string' key='description'>
       <Property name='description' type='string' key='description'>
         Knowledge description (optional)
         Knowledge description (optional)
       </Property>
       </Property>
-      <Property name='doc_type' type='string' key='doc_type'>
-        Type of document (optional):
-          - <code>book</code> Book
-          - <code>web_page</code> Web page
-          - <code>paper</code> Academic paper/article 
-          - <code>social_media_post</code> Social media post
-          - <code>wikipedia_entry</code> Wikipedia entry
-          - <code>personal_document</code> Personal document
-          - <code>business_document</code> Business document
-          - <code>im_chat_log</code> Chat log
-          - <code>synced_from_notion</code> Notion document
-          - <code>synced_from_github</code> GitHub document
-          - <code>others</code> Other document types
-      </Property>
-      <Property name='doc_metadata' type='object' key='doc_metadata'>
-        Document metadata (required if doc_type is provided). Fields vary by doc_type:
-          For <code>book</code>:
-          - <code>title</code> Book title 
-          - <code>language</code> Book language
-          - <code>author</code> Book author
-          - <code>publisher</code> Publisher name
-          - <code>publication_date</code> Publication date
-          - <code>isbn</code> ISBN number
-          - <code>category</code> Book category
-
-          For <code>web_page</code>:
-          - <code>title</code> Page title
-          - <code>url</code> Page URL
-          - <code>language</code> Page language
-          - <code>publish_date</code> Publish date
-          - <code>author/publisher</code> Author or publisher
-          - <code>topic/keywords</code> Topic or keywords
-          - <code>description</code> Page description
-
-          Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
-
-          For doc_type "others", any valid JSON object is accepted
-      </Property>
       <Property name='indexing_technique' type='string' key='indexing_technique'>
       <Property name='indexing_technique' type='string' key='indexing_technique'>
         Index technique (optional)
         Index technique (optional)
           - <code>high_quality</code> High quality
           - <code>high_quality</code> High quality
@@ -762,67 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
               - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
               - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
               - <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
               - <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
               - <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
               - <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
-            - <code>doc_type</code> Type of document (optional)
-              - <code>book</code> Book
-                Document records a book or publication
-              - <code>web_page</code> Web page 
-                Document records web page content
-              - <code>paper</code> Academic paper/article
-                Document records academic paper or research article
-              - <code>social_media_post</code> Social media post
-                Content from social media posts
-              - <code>wikipedia_entry</code> Wikipedia entry
-                Content from Wikipedia entries
-              - <code>personal_document</code> Personal document
-                Documents related to personal content
-              - <code>business_document</code> Business document
-                Documents related to business content
-              - <code>im_chat_log</code> Chat log
-                Records of instant messaging chats
-              - <code>synced_from_notion</code> Notion document
-                Documents synchronized from Notion
-              - <code>synced_from_github</code> GitHub document
-                Documents synchronized from GitHub
-              - <code>others</code> Other document types
-                Other document types not listed above
-
-            - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
-              Fields vary by doc_type:
-
-              For <code>book</code>:
-              - <code>title</code> Book title
-                Title of the book
-              - <code>language</code> Book language
-                Language of the book
-              - <code>author</code> Book author
-                Author of the book
-              - <code>publisher</code> Publisher name
-                Name of the publishing house
-              - <code>publication_date</code> Publication date
-                Date when the book was published
-              - <code>isbn</code> ISBN number
-                International Standard Book Number
-              - <code>category</code> Book category
-                Category or genre of the book
-
-              For <code>web_page</code>:
-              - <code>title</code> Page title
-                Title of the web page
-              - <code>url</code> Page URL
-                URL address of the web page
-              - <code>language</code> Page language
-                Language of the web page
-              - <code>publish_date</code> Publish date
-                Date when the web page was published
-              - <code>author/publisher</code> Author or publisher
-                Author or publisher of the web page
-              - <code>topic/keywords</code> Topic or keywords
-                Topics or keywords of the web page
-              - <code>description</code> Page description
-                Description of the web page content
-
-              Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
-              For doc_type "others", any valid JSON object is accepted
       </Property>
       </Property>
     </Properties>
     </Properties>
   </Col>
   </Col>
@@ -1528,7 +1329,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
               "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
               "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
               "data_source_type": "upload_file",
               "data_source_type": "upload_file",
               "name": "readme.txt",
               "name": "readme.txt",
-              "doc_type": null
             }
             }
           },
           },
           "score": 3.730463140527718e-05,
           "score": 3.730463140527718e-05,

+ 0 - 205
web/app/(commonLayout)/datasets/template/template.zh.mdx

@@ -47,46 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
       <Property name='text' type='string' key='text'>
       <Property name='text' type='string' key='text'>
         文档内容
         文档内容
       </Property>
       </Property>
-      <Property name='doc_type' type='string' key='doc_type'>
-        文档类型(选填)
-          - <code>book</code> 图书 Book
-          - <code>web_page</code> 网页 Web page
-          - <code>paper</code> 学术论文/文章 Academic paper/article 
-          - <code>social_media_post</code> 社交媒体帖子 Social media post
-          - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
-          - <code>personal_document</code> 个人文档 Personal document
-          - <code>business_document</code> 商业文档 Business document
-          - <code>im_chat_log</code> 即时通讯记录 Chat log
-          - <code>synced_from_notion</code> Notion同步文档 Notion document
-          - <code>synced_from_github</code> GitHub同步文档 GitHub document
-          - <code>others</code> 其他文档类型 Other document types
-      </Property>
-      <Property name='doc_metadata' type='object' key='doc_metadata'>
-      
-        文档元数据(如提供文档类型则必填)。字段因文档类型而异:
-          
-          针对图书 For <code>book</code>:
-          - <code>title</code> 书名 Book title 
-          - <code>language</code> 图书语言 Book language
-          - <code>author</code> 作者 Book author
-          - <code>publisher</code> 出版社 Publisher name
-          - <code>publication_date</code> 出版日期 Publication date
-          - <code>isbn</code> ISBN号码 ISBN number
-          - <code>category</code> 图书分类 Book category
-
-          针对网页 For <code>web_page</code>:
-          - <code>title</code> 页面标题 Page title
-          - <code>url</code> 页面网址 Page URL
-          - <code>language</code> 页面语言 Page language
-          - <code>publish_date</code> 发布日期 Publish date
-          - <code>author/publisher</code> 作者/发布者 Author or publisher
-          - <code>topic/keywords</code> 主题/关键词 Topic or keywords
-          - <code>description</code> 页面描述 Page description
-
-          请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
-
-          针对"其他"类型文档,接受任何有效的JSON对象
-      </Property>
       <Property name='indexing_technique' type='string' key='indexing_technique'>
       <Property name='indexing_technique' type='string' key='indexing_technique'>
         索引方式
         索引方式
           - <code>high_quality</code> 高质量:使用  embedding 模型进行嵌入,构建为向量数据库索引
           - <code>high_quality</code> 高质量:使用  embedding 模型进行嵌入,构建为向量数据库索引
@@ -234,68 +194,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
           - <code>text_model</code> text 文档直接 embedding,经济模式默认为该模式
           - <code>text_model</code> text 文档直接 embedding,经济模式默认为该模式
           - <code>hierarchical_model</code> parent-child 模式
           - <code>hierarchical_model</code> parent-child 模式
           - <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding
           - <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding
-        - <code>doc_type</code> 文档类型(选填)Type of document (optional)
-          - <code>book</code> 图书
-            文档记录一本书籍或出版物
-          - <code>web_page</code> 网页
-            网页内容的文档记录
-          - <code>paper</code> 学术论文/文章
-            学术论文或研究文章的记录
-          - <code>social_media_post</code> 社交媒体帖子
-            社交媒体上的帖子内容
-          - <code>wikipedia_entry</code> 维基百科条目
-            维基百科的词条内容
-          - <code>personal_document</code> 个人文档
-            个人相关的文档记录
-          - <code>business_document</code> 商业文档
-            商业相关的文档记录
-          - <code>im_chat_log</code> 即时通讯记录
-            即时通讯的聊天记录
-          - <code>synced_from_notion</code> Notion同步文档
-            从Notion同步的文档内容
-          - <code>synced_from_github</code> GitHub同步文档
-            从GitHub同步的文档内容
-          - <code>others</code> 其他文档类型
-            其他未列出的文档类型
-
-        - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
-          字段因文档类型而异
-
-          针对图书类型 For <code>book</code>:
-          - <code>title</code> 书名
-            书籍的标题
-          - <code>language</code> 图书语言
-            书籍的语言
-          - <code>author</code> 作者
-            书籍的作者
-          - <code>publisher</code> 出版社
-            出版社的名称
-          - <code>publication_date</code> 出版日期
-            书籍的出版日期
-          - <code>isbn</code> ISBN号码
-            书籍的ISBN编号
-          - <code>category</code> 图书分类
-            书籍的分类类别
-
-          针对网页类型 For <code>web_page</code>:
-          - <code>title</code> 页面标题
-            网页的标题
-          - <code>url</code> 页面网址
-            网页的URL地址
-          - <code>language</code> 页面语言
-            网页的语言
-          - <code>publish_date</code> 发布日期
-            网页的发布日期
-          - <code>author/publisher</code> 作者/发布者
-            网页的作者或发布者
-          - <code>topic/keywords</code> 主题/关键词
-            网页的主题或关键词
-          - <code>description</code> 页面描述
-            网页的描述信息
-
-          请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
-
-          针对"其他"类型文档,接受任何有效的JSON对象
 
 
         - <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code>
         - <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code>
 
 
@@ -606,46 +504,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
       <Property name='text' type='string' key='text'>
       <Property name='text' type='string' key='text'>
         文档内容(选填)
         文档内容(选填)
       </Property>
       </Property>
-      <Property name='doc_type' type='string' key='doc_type'>
-        文档类型(选填)
-          - <code>book</code> 图书 Book
-          - <code>web_page</code> 网页 Web page
-          - <code>paper</code> 学术论文/文章 Academic paper/article 
-          - <code>social_media_post</code> 社交媒体帖子 Social media post
-          - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
-          - <code>personal_document</code> 个人文档 Personal document
-          - <code>business_document</code> 商业文档 Business document
-          - <code>im_chat_log</code> 即时通讯记录 Chat log
-          - <code>synced_from_notion</code> Notion同步文档 Notion document
-          - <code>synced_from_github</code> GitHub同步文档 GitHub document
-          - <code>others</code> 其他文档类型 Other document types
-      </Property>
-      <Property name='doc_metadata' type='object' key='doc_metadata'>
-      
-        文档元数据(如提供文档类型则必填)。字段因文档类型而异:
-          
-          针对图书 For <code>book</code>:
-          - <code>title</code> 书名 Book title 
-          - <code>language</code> 图书语言 Book language
-          - <code>author</code> 作者 Book author
-          - <code>publisher</code> 出版社 Publisher name
-          - <code>publication_date</code> 出版日期 Publication date
-          - <code>isbn</code> ISBN号码 ISBN number
-          - <code>category</code> 图书分类 Book category
-
-          针对网页 For <code>web_page</code>:
-          - <code>title</code> 页面标题 Page title
-          - <code>url</code> 页面网址 Page URL
-          - <code>language</code> 页面语言 Page language
-          - <code>publish_date</code> 发布日期 Publish date
-          - <code>author/publisher</code> 作者/发布者 Author or publisher
-          - <code>topic/keywords</code> 主题/关键词 Topic or keywords
-          - <code>description</code> 页面描述 Page description
-
-          请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
-
-          针对"其他"类型文档,接受任何有效的JSON对象
-      </Property>
       <Property name='process_rule' type='object' key='process_rule'>
       <Property name='process_rule' type='object' key='process_rule'>
         处理规则(选填)
         处理规则(选填)
           - <code>mode</code> (string) 清洗、分段模式,automatic 自动 / custom 自定义
           - <code>mode</code> (string) 清洗、分段模式,automatic 自动 / custom 自定义
@@ -766,68 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
               - <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 <code>***</code>
               - <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 <code>***</code>
               - <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
               - <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
               - <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填)
               - <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填)
-            - <code>doc_type</code> 文档类型(选填)Type of document (optional)
-              - <code>book</code> 图书
-                文档记录一本书籍或出版物
-              - <code>web_page</code> 网页
-                网页内容的文档记录
-              - <code>paper</code> 学术论文/文章
-                学术论文或研究文章的记录
-              - <code>social_media_post</code> 社交媒体帖子
-                社交媒体上的帖子内容
-              - <code>wikipedia_entry</code> 维基百科条目
-                维基百科的词条内容
-              - <code>personal_document</code> 个人文档
-                个人相关的文档记录
-              - <code>business_document</code> 商业文档
-                商业相关的文档记录
-              - <code>im_chat_log</code> 即时通讯记录
-                即时通讯的聊天记录
-              - <code>synced_from_notion</code> Notion同步文档
-                从Notion同步的文档内容
-              - <code>synced_from_github</code> GitHub同步文档
-                从GitHub同步的文档内容
-              - <code>others</code> 其他文档类型
-                其他未列出的文档类型
-
-            - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
-              字段因文档类型而异
-
-              针对图书类型 For <code>book</code>:
-              - <code>title</code> 书名
-                书籍的标题
-              - <code>language</code> 图书语言
-                书籍的语言
-              - <code>author</code> 作者
-                书籍的作者
-              - <code>publisher</code> 出版社
-                出版社的名称
-              - <code>publication_date</code> 出版日期
-                书籍的出版日期
-              - <code>isbn</code> ISBN号码
-                书籍的ISBN编号
-              - <code>category</code> 图书分类
-                书籍的分类类别
-
-              针对网页类型 For <code>web_page</code>:
-              - <code>title</code> 页面标题
-                网页的标题
-              - <code>url</code> 页面网址
-                网页的URL地址
-              - <code>language</code> 页面语言
-                网页的语言
-              - <code>publish_date</code> 发布日期
-                网页的发布日期
-              - <code>author/publisher</code> 作者/发布者
-                网页的作者或发布者
-              - <code>topic/keywords</code> 主题/关键词
-                网页的主题或关键词
-              - <code>description</code> 页面描述
-                网页的描述信息
-
-              请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
-
-              针对"其他"类型文档,接受任何有效的JSON对象
       </Property>
       </Property>
     </Properties>
     </Properties>
   </Col>
   </Col>
@@ -1534,7 +1330,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
               "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
               "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
               "data_source_type": "upload_file",
               "data_source_type": "upload_file",
               "name": "readme.txt",
               "name": "readme.txt",
-              "doc_type": null
             }
             }
           },
           },
           "score": 3.730463140527718e-05,
           "score": 3.730463140527718e-05,