-
Notifications
You must be signed in to change notification settings - Fork 14k
Tool HTTP request, returned application/octet-stream, file type not recognized #15262
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Comments
How do I get the HTTP request tool to recognize the file extension as docx? |
To configure the tool HTTP request to recognize the file extension as import os
import re
import mimetypes
from uuid import uuid4
import magic
def _filename_from_content_disposition(content_disposition):
    """Return the bare filename named by a Content-Disposition value, or "".

    The RFC 5987 extended form (``filename*=charset'lang'value``) is preferred
    over the plain ``filename=`` form when both are present. Directory
    components are dropped because the header comes from the remote server.
    """
    import urllib.parse  # function-scope: not among the file's imports

    if not content_disposition:
        return ""

    name = ""
    extended = re.search(r"filename\*\s*=\s*([^;]+)", content_disposition, re.IGNORECASE)
    if extended:
        parts = extended.group(1).strip().strip('"').split("'", 2)
        if len(parts) == 3:
            charset, _language, encoded = parts
            try:
                name = urllib.parse.unquote(encoded, encoding=charset or "utf-8")
            except LookupError:
                # Unknown charset label: fall back to the UTF-8 default.
                name = urllib.parse.unquote(encoded)

    if not name:
        # Stop at the closing quote or the next ';' so neither the quote nor
        # following parameters leak into the filename.
        plain = re.search(r'filename\s*=\s*"?([^";]+)"?', content_disposition, re.IGNORECASE)
        if plain:
            name = plain.group(1).strip()

    return os.path.basename(name.replace("\\", "/"))


def guess_file_info_from_response(response):
    """Guess filename, extension, MIME type and size for an HTTP response.

    Args:
        response: Object exposing ``url``, ``headers`` (with ``.get``) and
            ``content`` (bytes).

    Returns:
        dict with keys ``filename``, ``extension`` (leading dot included),
        ``mimetype`` and ``size`` (``-1`` when Content-Length is missing or
        not an integer).
    """
    import urllib.parse  # function-scope: not among the file's imports

    # Guarded so the ``magic is not None`` check below is meaningful when
    # python-magic (or libmagic) is not installed.
    try:
        import magic
    except ImportError:
        magic = None

    url = str(response.url)
    # Try to extract filename from URL
    filename = os.path.basename(urllib.parse.urlparse(url).path)

    # A URL segment without an extension (e.g. ".../convert/pdf/word") says
    # nothing about the payload, so let Content-Disposition name the file.
    if not os.path.splitext(filename)[1]:
        header_name = _filename_from_content_disposition(
            response.headers.get("Content-Disposition")
        )
        if header_name:
            filename = header_name

    # If still no filename, generate a unique one
    if not filename:
        filename = str(uuid4())

    # Guess MIME type from filename first, then URL
    mimetype, _ = mimetypes.guess_type(filename)
    if mimetype is None:
        mimetype, _ = mimetypes.guess_type(url)
    if mimetype is None:
        # If guessing fails, use Content-Type from response headers, without
        # parameters such as "; charset=utf-8".
        content_type = response.headers.get("Content-Type") or "application/octet-stream"
        mimetype = content_type.split(";", 1)[0].strip().lower() or "application/octet-stream"

    # Use python-magic to guess MIME type if still unknown or generic
    if mimetype == "application/octet-stream" and magic is not None:
        try:
            mimetype = magic.from_buffer(response.content[:1024], mime=True)
        except magic.MagicException:
            # Best-effort sniffing: keep the generic type on failure.
            pass

    extension = os.path.splitext(filename)[1]
    # Ensure filename has an extension
    if not extension:
        extension = mimetypes.guess_extension(mimetype) or ".bin"
        filename = f"{filename}{extension}"

    # Handle special cases for extensions
    if extension == ".bin" and mimetype == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        extension = ".docx"
        filename = f"{os.path.splitext(filename)[0]}.docx"

    try:
        size = int(response.headers.get("Content-Length", -1))
    except (TypeError, ValueError):
        size = -1

    return {
        "filename": filename,
        "extension": extension,
        "mimetype": mimetype,
        "size": size,
    }
# This code uses the To continue talking to Dosu, mention @dosu. Help Dosu learn! Give it feedback: Great Response | Irrelevant Answer | Incorrect Sources | Too Verbose | Hallucination | Bug Report | Other |
How to configure the detection process? |
To configure the detection process to ensure the tool HTTP request recognizes the file extension as
Here's a Python snippet demonstrating this approach: import os
import re
import mimetypes
from uuid import uuid4
import magic
def _filename_from_content_disposition(content_disposition):
    """Return the bare filename named by a Content-Disposition value, or "".

    The RFC 5987 extended form (``filename*=charset'lang'value``) is preferred
    over the plain ``filename=`` form when both are present. Directory
    components are dropped because the header comes from the remote server.
    """
    import urllib.parse  # function-scope: not among the file's imports

    if not content_disposition:
        return ""

    name = ""
    extended = re.search(r"filename\*\s*=\s*([^;]+)", content_disposition, re.IGNORECASE)
    if extended:
        parts = extended.group(1).strip().strip('"').split("'", 2)
        if len(parts) == 3:
            charset, _language, encoded = parts
            try:
                name = urllib.parse.unquote(encoded, encoding=charset or "utf-8")
            except LookupError:
                # Unknown charset label: fall back to the UTF-8 default.
                name = urllib.parse.unquote(encoded)

    if not name:
        # Stop at the closing quote or the next ';' so neither the quote nor
        # following parameters leak into the filename.
        plain = re.search(r'filename\s*=\s*"?([^";]+)"?', content_disposition, re.IGNORECASE)
        if plain:
            name = plain.group(1).strip()

    return os.path.basename(name.replace("\\", "/"))


def guess_file_info_from_response(response):
    """Guess filename, extension, MIME type and size for an HTTP response.

    Args:
        response: Object exposing ``url``, ``headers`` (with ``.get``) and
            ``content`` (bytes).

    Returns:
        dict with keys ``filename``, ``extension`` (leading dot included),
        ``mimetype`` and ``size`` (``-1`` when Content-Length is missing or
        not an integer).
    """
    import urllib.parse  # function-scope: not among the file's imports

    # Guarded so the ``magic is not None`` check below is meaningful when
    # python-magic (or libmagic) is not installed.
    try:
        import magic
    except ImportError:
        magic = None

    url = str(response.url)
    filename = os.path.basename(urllib.parse.urlparse(url).path)

    # A URL segment without an extension (e.g. ".../convert/pdf/word") says
    # nothing about the payload, so let Content-Disposition name the file.
    if not os.path.splitext(filename)[1]:
        header_name = _filename_from_content_disposition(
            response.headers.get("Content-Disposition")
        )
        if header_name:
            filename = header_name

    # Last resort: a generated unique name.
    if not filename:
        filename = str(uuid4())

    # MIME type: filename first, then the URL, then the Content-Type header.
    mimetype, _ = mimetypes.guess_type(filename)
    if mimetype is None:
        mimetype, _ = mimetypes.guess_type(url)
    if mimetype is None:
        # Drop parameters such as "; charset=utf-8" so the comparison and
        # the extension lookup below see a bare media type.
        content_type = response.headers.get("Content-Type") or "application/octet-stream"
        mimetype = content_type.split(";", 1)[0].strip().lower() or "application/octet-stream"

    # Sniff the first bytes when only the generic type is known.
    if mimetype == "application/octet-stream" and magic is not None:
        try:
            mimetype = magic.from_buffer(response.content[:1024], mime=True)
        except magic.MagicException:
            # Best-effort sniffing: keep the generic type on failure.
            pass

    extension = os.path.splitext(filename)[1]
    if not extension:
        extension = mimetypes.guess_extension(mimetype) or ".bin"
        filename = f"{filename}{extension}"

    # The platform MIME table may not know docx; map it explicitly.
    if extension == ".bin" and mimetype == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        extension = ".docx"
        filename = f"{os.path.splitext(filename)[0]}.docx"

    try:
        size = int(response.headers.get("Content-Length", -1))
    except (TypeError, ValueError):
        size = -1

    return {
        "filename": filename,
        "extension": extension,
        "mimetype": mimetype,
        "size": size,
    }
# This code ensures that if the MIME type is detected as To continue talking to Dosu, mention @dosu. Help Dosu learn! Give it feedback: Great Response | Irrelevant Answer | Incorrect Sources | Too Verbose | Hallucination | Bug Report | Other |
Self Checks
Dify version
1.0.0
Cloud or Self Hosted
Self Hosted (Docker)
Steps to reproduce
curl --location --request POST 'http://10.27.24.233:8081/api/v1/convert/pdf/word' \
--form 'fileInput=@"/Users/soya/Desktop/1219926043.pdf"' \
--form 'outputFormat="docx"'
The above is the curl request; in Postman, the returned information is as follows:
Content-Type | application/octet-stream
Content-Disposition | form-data; name="attachment"; filename="1219926043.docx"
Content-Length | 116257
The workflow is as follows:
The debug trace information is as follows:
{
"status_code": 200,
"body": "",
"headers": {
"date": "Sat, 08 Mar 2025 04:42:33 GMT",
"content-type": "application/octet-stream",
"content-disposition": "form-data; name=\"attachment\"; filename=\"1219926043.docx\"",
"content-length": "116257",
"cache-status": "ac5c6a2e8fde;detail=mismatch",
"via": "1.1 ac5c6a2e8fde (squid/6.10)",
"connection": "keep-alive"
},
"files": [
{
"dify_model_identity": "dify__file",
"id": null,
"tenant_id": "5185fd0a-047c-4abf-b27f-f91ed93f6c0f",
"type": "custom",
"transfer_method": "tool_file",
"remote_url": null,
"related_id": "71aef328-2b8a-4808-b49d-5dd1079f900f",
"filename": "2ff8edd9bb0d4557a2c68d2a02e98271.bin",
"extension": ".bin",
"mime_type": "application/octet-stream",
"size": 116257,
"url": "http://10.27.24.230//files/tools/71aef328-2b8a-4808-b49d-5dd1079f900f.bin?timestamp=1741408957&nonce=d0a6a819f20afe7291b99d35a98c1082&sign=T9Qu-AmUwlsvOJmomm6CrrxJaZCmQEkAl1Vc2Fwp6ng="
}
]
}
✔️ Expected Behavior
{
"status_code": 200,
"body": "",
"headers": {
"date": "Sat, 08 Mar 2025 04:42:33 GMT",
"content-type": "application/octet-stream",
"content-disposition": "form-data; name=\"attachment\"; filename=\"1219926043.docx\"",
"content-length": "116257",
"cache-status": "ac5c6a2e8fde;detail=mismatch",
"via": "1.1 ac5c6a2e8fde (squid/6.10)",
"connection": "keep-alive"
},
"files": [
{
"dify_model_identity": "dify__file",
"id": null,
"tenant_id": "5185fd0a-047c-4abf-b27f-f91ed93f6c0f",
"type": "custom",
"transfer_method": "tool_file",
"remote_url": null,
"related_id": "71aef328-2b8a-4808-b49d-5dd1079f900f",
"filename": "2ff8edd9bb0d4557a2c68d2a02e98271.docx",
"extension": ".docx",
"mime_type": "application/octet-stream",
"size": 116257,
"url": "http://10.27.24.230//files/tools/71aef328-2b8a-4808-b49d-5dd1079f900f.bin?timestamp=1741408957&nonce=d0a6a819f20afe7291b99d35a98c1082&sign=T9Qu-AmUwlsvOJmomm6CrrxJaZCmQEkAl1Vc2Fwp6ng="
}
]
}
❌ Actual Behavior
{
"status_code": 200,
"body": "",
"headers": {
"date": "Sat, 08 Mar 2025 04:42:33 GMT",
"content-type": "application/octet-stream",
"content-disposition": "form-data; name=\"attachment\"; filename=\"1219926043.docx\"",
"content-length": "116257",
"cache-status": "ac5c6a2e8fde;detail=mismatch",
"via": "1.1 ac5c6a2e8fde (squid/6.10)",
"connection": "keep-alive"
},
"files": [
{
"dify_model_identity": "dify__file",
"id": null,
"tenant_id": "5185fd0a-047c-4abf-b27f-f91ed93f6c0f",
"type": "custom",
"transfer_method": "tool_file",
"remote_url": null,
"related_id": "71aef328-2b8a-4808-b49d-5dd1079f900f",
"filename": "2ff8edd9bb0d4557a2c68d2a02e98271.bin",
"extension": ".bin",
"mime_type": "application/octet-stream",
"size": 116257,
"url": "http://10.27.24.230//files/tools/71aef328-2b8a-4808-b49d-5dd1079f900f.bin?timestamp=1741408957&nonce=d0a6a819f20afe7291b99d35a98c1082&sign=T9Qu-AmUwlsvOJmomm6CrrxJaZCmQEkAl1Vc2Fwp6ng="
}
]
}
The text was updated successfully, but these errors were encountered: