Skip to content

Commit 9394e4a

Browse files
JohnJyong and zxhlyh
authored and committed
Fix/create document by api with metadata (langgenius#16307)
Co-authored-by: zxhlyh <[email protected]>
1 parent 87b5765 commit 9394e4a

File tree

7 files changed

+75
-527
lines changed

7 files changed

+75
-527
lines changed

api/commands.py

+71-8
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from libs.password import hash_password, password_pattern, valid_password
2121
from libs.rsa import generate_key_pair
2222
from models import Tenant
23-
from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment
23+
from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment
2424
from models.dataset import Document as DatasetDocument
2525
from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
2626
from models.provider import Provider, ProviderModel
@@ -483,14 +483,11 @@ def convert_to_agent_apps():
483483
click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))
484484

485485

486-
@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.")
486+
@click.command("add-qdrant-index", help="Add Qdrant index.")
487487
@click.option("--field", default="metadata.doc_id", prompt=False, help="Index field , default is metadata.doc_id.")
488-
def add_qdrant_doc_id_index(field: str):
489-
click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green"))
490-
vector_type = dify_config.VECTOR_STORE
491-
if vector_type != "qdrant":
492-
click.echo(click.style("This command only supports Qdrant vector store.", fg="red"))
493-
return
488+
def add_qdrant_index(field: str):
489+
click.echo(click.style("Starting Qdrant index creation.", fg="green"))
490+
494491
create_count = 0
495492

496493
try:
@@ -539,6 +536,72 @@ def add_qdrant_doc_id_index(field: str):
539536
click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
540537

541538

539+
@click.command("old-metadata-migration", help="Old metadata migration.")
540+
def old_metadata_migration():
541+
"""
542+
Old metadata migration.
543+
"""
544+
click.echo(click.style("Starting old metadata migration.", fg="green"))
545+
546+
page = 1
547+
while True:
548+
try:
549+
documents = (
550+
DatasetDocument.query.filter(DatasetDocument.doc_metadata is not None)
551+
.order_by(DatasetDocument.created_at.desc())
552+
.paginate(page=page, per_page=50)
553+
)
554+
except NotFound:
555+
break
556+
if not documents:
557+
break
558+
for document in documents:
559+
if document.doc_metadata:
560+
doc_metadata = document.doc_metadata
561+
for key, value in doc_metadata.items():
562+
dataset_metadata = (
563+
db.session.query(DatasetMetadata)
564+
.filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key)
565+
.first()
566+
)
567+
if not dataset_metadata:
568+
dataset_metadata = DatasetMetadata(
569+
tenant_id=document.tenant_id,
570+
dataset_id=document.dataset_id,
571+
name=key,
572+
type="string",
573+
created_by=document.created_by,
574+
)
575+
db.session.add(dataset_metadata)
576+
db.session.flush()
577+
dataset_metadata_binding = DatasetMetadataBinding(
578+
tenant_id=document.tenant_id,
579+
dataset_id=document.dataset_id,
580+
metadata_id=dataset_metadata.id,
581+
document_id=document.id,
582+
created_by=document.created_by,
583+
)
584+
db.session.add(dataset_metadata_binding)
585+
else:
586+
dataset_metadata_binding = DatasetMetadataBinding.query.filter(
587+
DatasetMetadataBinding.dataset_id == document.dataset_id,
588+
DatasetMetadataBinding.document_id == document.id,
589+
DatasetMetadataBinding.metadata_id == dataset_metadata.id,
590+
).first()
591+
if not dataset_metadata_binding:
592+
dataset_metadata_binding = DatasetMetadataBinding(
593+
tenant_id=document.tenant_id,
594+
dataset_id=document.dataset_id,
595+
metadata_id=dataset_metadata.id,
596+
document_id=document.id,
597+
created_by=document.created_by,
598+
)
599+
db.session.add(dataset_metadata_binding)
600+
db.session.commit()
601+
page += 1
602+
click.echo(click.style("Old metadata migration completed.", fg="green"))
603+
604+
542605
@click.command("create-tenant", help="Create account and tenant.")
543606
@click.option("--email", prompt=True, help="Tenant account email.")
544607
@click.option("--name", prompt=True, help="Workspace name.")

api/controllers/service_api/dataset/document.py

-96
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from controllers.service_api.dataset.error import (
1919
ArchivedDocumentImmutableError,
2020
DocumentIndexingError,
21-
InvalidMetadataError,
2221
)
2322
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
2423
from core.errors.error import ProviderTokenNotInitError
@@ -51,8 +50,6 @@ def post(self, tenant_id, dataset_id):
5150
"indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
5251
)
5352
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
54-
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
55-
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
5653

5754
args = parser.parse_args()
5855
dataset_id = str(dataset_id)
@@ -65,28 +62,6 @@ def post(self, tenant_id, dataset_id):
6562
if not dataset.indexing_technique and not args["indexing_technique"]:
6663
raise ValueError("indexing_technique is required.")
6764

68-
# Validate metadata if provided
69-
if args.get("doc_type") or args.get("doc_metadata"):
70-
if not args.get("doc_type") or not args.get("doc_metadata"):
71-
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
72-
73-
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
74-
raise InvalidMetadataError(
75-
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
76-
)
77-
78-
if not isinstance(args["doc_metadata"], dict):
79-
raise InvalidMetadataError("doc_metadata must be a dictionary")
80-
81-
# Validate metadata schema based on doc_type
82-
if args["doc_type"] != "others":
83-
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
84-
for key, value in args["doc_metadata"].items():
85-
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
86-
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
87-
# set to MetaDataConfig
88-
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
89-
9065
text = args.get("text")
9166
name = args.get("name")
9267
if text is None or name is None:
@@ -133,8 +108,6 @@ def post(self, tenant_id, dataset_id, document_id):
133108
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
134109
)
135110
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
136-
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
137-
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
138111
args = parser.parse_args()
139112
dataset_id = str(dataset_id)
140113
tenant_id = str(tenant_id)
@@ -146,29 +119,6 @@ def post(self, tenant_id, dataset_id, document_id):
146119
# indexing_technique is already set in dataset since this is an update
147120
args["indexing_technique"] = dataset.indexing_technique
148121

149-
# Validate metadata if provided
150-
if args.get("doc_type") or args.get("doc_metadata"):
151-
if not args.get("doc_type") or not args.get("doc_metadata"):
152-
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
153-
154-
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
155-
raise InvalidMetadataError(
156-
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
157-
)
158-
159-
if not isinstance(args["doc_metadata"], dict):
160-
raise InvalidMetadataError("doc_metadata must be a dictionary")
161-
162-
# Validate metadata schema based on doc_type
163-
if args["doc_type"] != "others":
164-
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
165-
for key, value in args["doc_metadata"].items():
166-
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
167-
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
168-
169-
# set to MetaDataConfig
170-
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
171-
172122
if args["text"]:
173123
text = args.get("text")
174124
name = args.get("name")
@@ -216,29 +166,6 @@ def post(self, tenant_id, dataset_id):
216166
if "doc_language" not in args:
217167
args["doc_language"] = "English"
218168

219-
# Validate metadata if provided
220-
if args.get("doc_type") or args.get("doc_metadata"):
221-
if not args.get("doc_type") or not args.get("doc_metadata"):
222-
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
223-
224-
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
225-
raise InvalidMetadataError(
226-
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
227-
)
228-
229-
if not isinstance(args["doc_metadata"], dict):
230-
raise InvalidMetadataError("doc_metadata must be a dictionary")
231-
232-
# Validate metadata schema based on doc_type
233-
if args["doc_type"] != "others":
234-
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
235-
for key, value in args["doc_metadata"].items():
236-
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
237-
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
238-
239-
# set to MetaDataConfig
240-
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
241-
242169
# get dataset info
243170
dataset_id = str(dataset_id)
244171
tenant_id = str(tenant_id)
@@ -306,29 +233,6 @@ def post(self, tenant_id, dataset_id, document_id):
306233
if "doc_language" not in args:
307234
args["doc_language"] = "English"
308235

309-
# Validate metadata if provided
310-
if args.get("doc_type") or args.get("doc_metadata"):
311-
if not args.get("doc_type") or not args.get("doc_metadata"):
312-
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
313-
314-
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
315-
raise InvalidMetadataError(
316-
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
317-
)
318-
319-
if not isinstance(args["doc_metadata"], dict):
320-
raise InvalidMetadataError("doc_metadata must be a dictionary")
321-
322-
# Validate metadata schema based on doc_type
323-
if args["doc_type"] != "others":
324-
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
325-
for key, value in args["doc_metadata"].items():
326-
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
327-
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
328-
329-
# set to MetaDataConfig
330-
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
331-
332236
# get dataset info
333237
dataset_id = str(dataset_id)
334238
tenant_id = str(tenant_id)

api/extensions/ext_commands.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,15 @@
33

44
def init_app(app: DifyApp):
55
from commands import (
6-
add_qdrant_doc_id_index,
6+
add_qdrant_index,
77
convert_to_agent_apps,
88
create_tenant,
99
extract_plugins,
1010
extract_unique_plugins,
1111
fix_app_site_missing,
1212
install_plugins,
1313
migrate_data_for_plugin,
14+
old_metadata_migration,
1415
reset_email,
1516
reset_encrypt_key_pair,
1617
reset_password,
@@ -24,14 +25,15 @@ def init_app(app: DifyApp):
2425
reset_encrypt_key_pair,
2526
vdb_migrate,
2627
convert_to_agent_apps,
27-
add_qdrant_doc_id_index,
28+
add_qdrant_index,
2829
create_tenant,
2930
upgrade_db,
3031
fix_app_site_missing,
3132
migrate_data_for_plugin,
3233
extract_plugins,
3334
extract_unique_plugins,
3435
install_plugins,
36+
old_metadata_migration,
3537
]
3638
for cmd in cmds_to_register:
3739
app.cli.add_command(cmd)

api/services/dataset_service.py

-15
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@
4646
from services.entities.knowledge_entities.knowledge_entities import (
4747
ChildChunkUpdateArgs,
4848
KnowledgeConfig,
49-
MetaDataConfig,
5049
RerankingModel,
5150
RetrievalModel,
5251
SegmentUpdateArgs,
@@ -999,9 +998,6 @@ def save_document_with_dataset_id(
999998
document.data_source_info = json.dumps(data_source_info)
1000999
document.batch = batch
10011000
document.indexing_status = "waiting"
1002-
if knowledge_config.metadata:
1003-
document.doc_type = knowledge_config.metadata.doc_type
1004-
document.metadata = knowledge_config.metadata.doc_metadata
10051001
db.session.add(document)
10061002
documents.append(document)
10071003
duplicate_document_ids.append(document.id)
@@ -1018,7 +1014,6 @@ def save_document_with_dataset_id(
10181014
account,
10191015
file_name,
10201016
batch,
1021-
knowledge_config.metadata,
10221017
)
10231018
db.session.add(document)
10241019
db.session.flush()
@@ -1076,7 +1071,6 @@ def save_document_with_dataset_id(
10761071
account,
10771072
truncated_page_name,
10781073
batch,
1079-
knowledge_config.metadata,
10801074
)
10811075
db.session.add(document)
10821076
db.session.flush()
@@ -1117,7 +1111,6 @@ def save_document_with_dataset_id(
11171111
account,
11181112
document_name,
11191113
batch,
1120-
knowledge_config.metadata,
11211114
)
11221115
db.session.add(document)
11231116
db.session.flush()
@@ -1155,7 +1148,6 @@ def build_document(
11551148
account: Account,
11561149
name: str,
11571150
batch: str,
1158-
metadata: Optional[MetaDataConfig] = None,
11591151
):
11601152
document = Document(
11611153
tenant_id=dataset.tenant_id,
@@ -1180,9 +1172,6 @@ def build_document(
11801172
BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
11811173
BuiltInField.source: data_source_type,
11821174
}
1183-
if metadata is not None:
1184-
doc_metadata.update(metadata.doc_metadata)
1185-
document.doc_type = metadata.doc_type
11861175
if doc_metadata:
11871176
document.doc_metadata = doc_metadata
11881177
return document
@@ -1297,10 +1286,6 @@ def update_document_with_dataset_id(
12971286
# update document name
12981287
if document_data.name:
12991288
document.name = document_data.name
1300-
# update doc_type and doc_metadata if provided
1301-
if document_data.metadata is not None:
1302-
document.doc_metadata = document_data.metadata.doc_metadata
1303-
document.doc_type = document_data.metadata.doc_type
13041289
# update document to be waiting
13051290
document.indexing_status = "waiting"
13061291
document.completed_at = None

api/services/entities/knowledge_entities/knowledge_entities.py

-1
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ class KnowledgeConfig(BaseModel):
128128
embedding_model: Optional[str] = None
129129
embedding_model_provider: Optional[str] = None
130130
name: Optional[str] = None
131-
metadata: Optional[MetaDataConfig] = None
132131

133132

134133
class SegmentUpdateArgs(BaseModel):

0 commit comments

Comments (0)