Skip to content

Commit 9394e4a

Browse files
JohnJyong and zxhlyh
authored and committed
Fix/create document by api with metadata (langgenius#16307)
Co-authored-by: zxhlyh <[email protected]>
1 parent 87b5765 commit 9394e4a

File tree

7 files changed

+75
-527
lines changed

7 files changed

+75
-527
lines changed

api/commands.py

+71-8
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from libs.password import hash_password, password_pattern, valid_password
2121
from libs.rsa import generate_key_pair
2222
from models import Tenant
23-
from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment
23+
from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment
2424
from models.dataset import Document as DatasetDocument
2525
from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
2626
from models.provider import Provider, ProviderModel
@@ -483,14 +483,11 @@ def convert_to_agent_apps():
483483
click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))
484484

485485

486-
@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.")
486+
@click.command("add-qdrant-index", help="Add Qdrant index.")
487487
@click.option("--field", default="metadata.doc_id", prompt=False, help="Index field , default is metadata.doc_id.")
488-
def add_qdrant_doc_id_index(field: str):
489-
click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green"))
490-
vector_type = dify_config.VECTOR_STORE
491-
if vector_type != "qdrant":
492-
click.echo(click.style("This command only supports Qdrant vector store.", fg="red"))
493-
return
488+
def add_qdrant_index(field: str):
489+
click.echo(click.style("Starting Qdrant index creation.", fg="green"))
490+
494491
create_count = 0
495492

496493
try:
@@ -539,6 +536,72 @@ def add_qdrant_doc_id_index(field: str):
539536
click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
540537

541538

539+
@click.command("old-metadata-migration", help="Old metadata migration.")
540+
def old_metadata_migration():
541+
"""
542+
Old metadata migration.
543+
"""
544+
click.echo(click.style("Starting old metadata migration.", fg="green"))
545+
546+
page = 1
547+
while True:
548+
try:
549+
documents = (
550+
DatasetDocument.query.filter(DatasetDocument.doc_metadata is not None)
551+
.order_by(DatasetDocument.created_at.desc())
552+
.paginate(page=page, per_page=50)
553+
)
554+
except NotFound:
555+
break
556+
if not documents:
557+
break
558+
for document in documents:
559+
if document.doc_metadata:
560+
doc_metadata = document.doc_metadata
561+
for key, value in doc_metadata.items():
562+
dataset_metadata = (
563+
db.session.query(DatasetMetadata)
564+
.filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key)
565+
.first()
566+
)
567+
if not dataset_metadata:
568+
dataset_metadata = DatasetMetadata(
569+
tenant_id=document.tenant_id,
570+
dataset_id=document.dataset_id,
571+
name=key,
572+
type="string",
573+
created_by=document.created_by,
574+
)
575+
db.session.add(dataset_metadata)
576+
db.session.flush()
577+
dataset_metadata_binding = DatasetMetadataBinding(
578+
tenant_id=document.tenant_id,
579+
dataset_id=document.dataset_id,
580+
metadata_id=dataset_metadata.id,
581+
document_id=document.id,
582+
created_by=document.created_by,
583+
)
584+
db.session.add(dataset_metadata_binding)
585+
else:
586+
dataset_metadata_binding = DatasetMetadataBinding.query.filter(
587+
DatasetMetadataBinding.dataset_id == document.dataset_id,
588+
DatasetMetadataBinding.document_id == document.id,
589+
DatasetMetadataBinding.metadata_id == dataset_metadata.id,
590+
).first()
591+
if not dataset_metadata_binding:
592+
dataset_metadata_binding = DatasetMetadataBinding(
593+
tenant_id=document.tenant_id,
594+
dataset_id=document.dataset_id,
595+
metadata_id=dataset_metadata.id,
596+
document_id=document.id,
597+
created_by=document.created_by,
598+
)
599+
db.session.add(dataset_metadata_binding)
600+
db.session.commit()
601+
page += 1
602+
click.echo(click.style("Old metadata migration completed.", fg="green"))
603+
604+
542605
@click.command("create-tenant", help="Create account and tenant.")
543606
@click.option("--email", prompt=True, help="Tenant account email.")
544607
@click.option("--name", prompt=True, help="Workspace name.")

api/controllers/service_api/dataset/document.py

-96
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from controllers.service_api.dataset.error import (
1919
ArchivedDocumentImmutableError,
2020
DocumentIndexingError,
21-
InvalidMetadataError,
2221
)
2322
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
2423
from core.errors.error import ProviderTokenNotInitError
@@ -51,8 +50,6 @@ def post(self, tenant_id, dataset_id):
5150
"indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
5251
)
5352
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
54-
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
55-
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
5653

5754
args = parser.parse_args()
5855
dataset_id = str(dataset_id)
@@ -65,28 +62,6 @@ def post(self, tenant_id, dataset_id):
6562
if not dataset.indexing_technique and not args["indexing_technique"]:
6663
raise ValueError("indexing_technique is required.")
6764

68-
# Validate metadata if provided
69-
if args.get("doc_type") or args.get("doc_metadata"):
70-
if not args.get("doc_type") or not args.get("doc_metadata"):
71-
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
72-
73-
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
74-
raise InvalidMetadataError(
75-
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
76-
)
77-
78-
if not isinstance(args["doc_metadata"], dict):
79-
raise InvalidMetadataError("doc_metadata must be a dictionary")
80-
81-
# Validate metadata schema based on doc_type
82-
if args["doc_type"] != "others":
83-
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
84-
for key, value in args["doc_metadata"].items():
85-
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
86-
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
87-
# set to MetaDataConfig
88-
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
89-
9065
text = args.get("text")
9166
name = args.get("name")
9267
if text is None or name is None:
@@ -133,8 +108,6 @@ def post(self, tenant_id, dataset_id, document_id):
133108
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
134109
)
135110
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
136-
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
137-
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
138111
args = parser.parse_args()
139112
dataset_id = str(dataset_id)
140113
tenant_id = str(tenant_id)
@@ -146,29 +119,6 @@ def post(self, tenant_id, dataset_id, document_id):
146119
# indexing_technique is already set in dataset since this is an update
147120
args["indexing_technique"] = dataset.indexing_technique
148121

149-
# Validate metadata if provided
150-
if args.get("doc_type") or args.get("doc_metadata"):
151-
if not args.get("doc_type") or not args.get("doc_metadata"):
152-
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
153-
154-
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
155-
raise InvalidMetadataError(
156-
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
157-
)
158-
159-
if not isinstance(args["doc_metadata"], dict):
160-
raise InvalidMetadataError("doc_metadata must be a dictionary")
161-
162-
# Validate metadata schema based on doc_type
163-
if args["doc_type"] != "others":
164-
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
165-
for key, value in args["doc_metadata"].items():
166-
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
167-
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
168-
169-
# set to MetaDataConfig
170-
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
171-
172122
if args["text"]:
173123
text = args.get("text")
174124
name = args.get("name")
@@ -216,29 +166,6 @@ def post(self, tenant_id, dataset_id):
216166
if "doc_language" not in args:
217167
args["doc_language"] = "English"
218168

219-
# Validate metadata if provided
220-
if args.get("doc_type") or args.get("doc_metadata"):
221-
if not args.get("doc_type") or not args.get("doc_metadata"):
222-
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
223-
224-
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
225-
raise InvalidMetadataError(
226-
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
227-
)
228-
229-
if not isinstance(args["doc_metadata"], dict):
230-
raise InvalidMetadataError("doc_metadata must be a dictionary")
231-
232-
# Validate metadata schema based on doc_type
233-
if args["doc_type"] != "others":
234-
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
235-
for key, value in args["doc_metadata"].items():
236-
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
237-
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
238-
239-
# set to MetaDataConfig
240-
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
241-
242169
# get dataset info
243170
dataset_id = str(dataset_id)
244171
tenant_id = str(tenant_id)
@@ -306,29 +233,6 @@ def post(self, tenant_id, dataset_id, document_id):
306233
if "doc_language" not in args:
307234
args["doc_language"] = "English"
308235

309-
# Validate metadata if provided
310-
if args.get("doc_type") or args.get("doc_metadata"):
311-
if not args.get("doc_type") or not args.get("doc_metadata"):
312-
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
313-
314-
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
315-
raise InvalidMetadataError(
316-
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
317-
)
318-
319-
if not isinstance(args["doc_metadata"], dict):
320-
raise InvalidMetadataError("doc_metadata must be a dictionary")
321-
322-
# Validate metadata schema based on doc_type
323-
if args["doc_type"] != "others":
324-
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
325-
for key, value in args["doc_metadata"].items():
326-
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
327-
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
328-
329-
# set to MetaDataConfig
330-
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
331-
332236
# get dataset info
333237
dataset_id = str(dataset_id)
334238
tenant_id = str(tenant_id)

api/extensions/ext_commands.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,15 @@
33

44
def init_app(app: DifyApp):
55
from commands import (
6-
add_qdrant_doc_id_index,
6+
add_qdrant_index,
77
convert_to_agent_apps,
88
create_tenant,
99
extract_plugins,
1010
extract_unique_plugins,
1111
fix_app_site_missing,
1212
install_plugins,
1313
migrate_data_for_plugin,
14+
old_metadata_migration,
1415
reset_email,
1516
reset_encrypt_key_pair,
1617
reset_password,
@@ -24,14 +25,15 @@ def init_app(app: DifyApp):
2425
reset_encrypt_key_pair,
2526
vdb_migrate,
2627
convert_to_agent_apps,
27-
add_qdrant_doc_id_index,
28+
add_qdrant_index,
2829
create_tenant,
2930
upgrade_db,
3031
fix_app_site_missing,
3132
migrate_data_for_plugin,
3233
extract_plugins,
3334
extract_unique_plugins,
3435
install_plugins,
36+
old_metadata_migration,
3537
]
3638
for cmd in cmds_to_register:
3739
app.cli.add_command(cmd)

api/services/dataset_service.py

-15
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@
4646
from services.entities.knowledge_entities.knowledge_entities import (
4747
ChildChunkUpdateArgs,
4848
KnowledgeConfig,
49-
MetaDataConfig,
5049
RerankingModel,
5150
RetrievalModel,
5251
SegmentUpdateArgs,
@@ -999,9 +998,6 @@ def save_document_with_dataset_id(
999998
document.data_source_info = json.dumps(data_source_info)
1000999
document.batch = batch
10011000
document.indexing_status = "waiting"
1002-
if knowledge_config.metadata:
1003-
document.doc_type = knowledge_config.metadata.doc_type
1004-
document.metadata = knowledge_config.metadata.doc_metadata
10051001
db.session.add(document)
10061002
documents.append(document)
10071003
duplicate_document_ids.append(document.id)
@@ -1018,7 +1014,6 @@ def save_document_with_dataset_id(
10181014
account,
10191015
file_name,
10201016
batch,
1021-
knowledge_config.metadata,
10221017
)
10231018
db.session.add(document)
10241019
db.session.flush()
@@ -1076,7 +1071,6 @@ def save_document_with_dataset_id(
10761071
account,
10771072
truncated_page_name,
10781073
batch,
1079-
knowledge_config.metadata,
10801074
)
10811075
db.session.add(document)
10821076
db.session.flush()
@@ -1117,7 +1111,6 @@ def save_document_with_dataset_id(
11171111
account,
11181112
document_name,
11191113
batch,
1120-
knowledge_config.metadata,
11211114
)
11221115
db.session.add(document)
11231116
db.session.flush()
@@ -1155,7 +1148,6 @@ def build_document(
11551148
account: Account,
11561149
name: str,
11571150
batch: str,
1158-
metadata: Optional[MetaDataConfig] = None,
11591151
):
11601152
document = Document(
11611153
tenant_id=dataset.tenant_id,
@@ -1180,9 +1172,6 @@ def build_document(
11801172
BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
11811173
BuiltInField.source: data_source_type,
11821174
}
1183-
if metadata is not None:
1184-
doc_metadata.update(metadata.doc_metadata)
1185-
document.doc_type = metadata.doc_type
11861175
if doc_metadata:
11871176
document.doc_metadata = doc_metadata
11881177
return document
@@ -1297,10 +1286,6 @@ def update_document_with_dataset_id(
12971286
# update document name
12981287
if document_data.name:
12991288
document.name = document_data.name
1300-
# update doc_type and doc_metadata if provided
1301-
if document_data.metadata is not None:
1302-
document.doc_metadata = document_data.metadata.doc_metadata
1303-
document.doc_type = document_data.metadata.doc_type
13041289
# update document to be waiting
13051290
document.indexing_status = "waiting"
13061291
document.completed_at = None

api/services/entities/knowledge_entities/knowledge_entities.py

-1
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ class KnowledgeConfig(BaseModel):
128128
embedding_model: Optional[str] = None
129129
embedding_model_provider: Optional[str] = None
130130
name: Optional[str] = None
131-
metadata: Optional[MetaDataConfig] = None
132131

133132

134133
class SegmentUpdateArgs(BaseModel):

0 commit comments

Comments (0)