Skip to content

Commit 9f04bda

Browse files
committed
bench:
1 parent cb9d218 commit 9f04bda

File tree

265 files changed

+28379
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

265 files changed

+28379
-0
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# ObjectConstraint-embedded-json-desc
2+
## Summary
3+
- Aggregation
4+
- Trial: #10
5+
- Success: #0
6+
- 1st: #0
7+
- 2nd: #0
8+
- 3rd: #0
9+
- Failure: #0
10+
- Nothing: #9
11+
- Error: #1
12+
- Average Time: 226883 ms
13+
- Token Usage:
14+
- Everything
15+
- Input
16+
- Total
17+
- Cached
18+
- Output
19+
- Total
20+
- Reasoning
21+
- Accepted Prediction
22+
- Rejected Prediction
23+
24+
## Scenario
25+
### Conversation (user)
26+
Make a virtual person with random values,
27+
and make it to participate in a campaign.
28+
29+
## Trials
30+
No | Status | Time
31+
---:|:-------|------:
32+
[1. nothing](./trials/1.nothing.json) | ⚠️ | 432,365 ms
33+
[2. nothing](./trials/2.nothing.json) | ⚠️ | 153,074 ms
34+
[3. nothing](./trials/3.nothing.json) | ⚠️ | 453,786 ms
35+
[4. nothing](./trials/4.nothing.json) | ⚠️ | 11,375 ms
36+
[5. nothing](./trials/5.nothing.json) | ⚠️ | 202,347 ms
37+
[6. error](./trials/6.error.json) | 😱 | 7,588 ms
38+
[7. nothing](./trials/7.nothing.json) | ⚠️ | 449 ms
39+
[8. nothing](./trials/8.nothing.json) | ⚠️ | 475,276 ms
40+
[9. nothing](./trials/9.nothing.json) | ⚠️ | 156,456 ms
41+
[10. nothing](./trials/10.nothing.json) | ⚠️ | 376,114 ms

reports/validate/mistralai/magistral-small-2506/ObjectConstraint-embedded-json-desc/trials/1.nothing.json

Lines changed: 32 additions & 0 deletions
Large diffs are not rendered by default.

reports/validate/mistralai/magistral-small-2506/ObjectConstraint-embedded-json-desc/trials/10.nothing.json

Lines changed: 32 additions & 0 deletions
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"type": "nothing",
3+
"completion": {
4+
"id": "gen-1749645849-6CcZSaE9NrWHHupXgqgk",
5+
"provider": "Mistral",
6+
"model": "mistralai/magistral-small-2506",
7+
"object": "chat.completion",
8+
"created": 1749645849,
9+
"choices": [
10+
{
11+
"logprobs": null,
12+
"finish_reason": "stop",
13+
"native_finish_reason": "stop",
14+
"index": 0,
15+
"message": {
16+
"role": "assistant",
17+
"content": "",
18+
"refusal": null,
19+
"reasoning": null
20+
}
21+
}
22+
],
23+
"usage": {
24+
"prompt_tokens": 325,
25+
"completion_tokens": 0,
26+
"total_tokens": 325
27+
}
28+
},
29+
"started_at": "2025-06-11T12:44:09.549Z",
30+
"completed_at": "2025-06-11T12:46:42.623Z",
31+
"previous": []
32+
}

reports/validate/mistralai/magistral-small-2506/ObjectConstraint-embedded-json-desc/trials/3.nothing.json

Lines changed: 32 additions & 0 deletions
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"type": "nothing",
3+
"completion": {
4+
"id": "gen-1749645849-1zjAv8IxE1TLzm2qhYkX",
5+
"provider": "Mistral",
6+
"model": "mistralai/magistral-small-2506",
7+
"object": "chat.completion",
8+
"created": 1749645849,
9+
"choices": [
10+
{
11+
"logprobs": null,
12+
"finish_reason": "stop",
13+
"native_finish_reason": "stop",
14+
"index": 0,
15+
"message": {
16+
"role": "assistant",
17+
"content": "",
18+
"refusal": null,
19+
"reasoning": null
20+
}
21+
}
22+
],
23+
"usage": {
24+
"prompt_tokens": 325,
25+
"completion_tokens": 0,
26+
"total_tokens": 325
27+
}
28+
},
29+
"started_at": "2025-06-11T12:44:09.549Z",
30+
"completed_at": "2025-06-11T12:44:20.924Z",
31+
"previous": []
32+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"type": "nothing",
3+
"completion": {
4+
"id": "gen-1749645849-bJycVvwuUpFjxnW0wMQv",
5+
"provider": "Mistral",
6+
"model": "mistralai/magistral-small-2506",
7+
"object": "chat.completion",
8+
"created": 1749645849,
9+
"choices": [
10+
{
11+
"logprobs": null,
12+
"finish_reason": "stop",
13+
"native_finish_reason": "stop",
14+
"index": 0,
15+
"message": {
16+
"role": "assistant",
17+
"content": "",
18+
"refusal": null,
19+
"reasoning": null
20+
}
21+
}
22+
],
23+
"usage": {
24+
"prompt_tokens": 325,
25+
"completion_tokens": 0,
26+
"total_tokens": 325
27+
}
28+
},
29+
"started_at": "2025-06-11T12:44:09.549Z",
30+
"completed_at": "2025-06-11T12:47:31.896Z",
31+
"previous": []
32+
}
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
{
2+
"type": "error",
3+
"error": {
4+
"status": 400,
5+
"headers": {
6+
"access-control-allow-origin": "*",
7+
"cf-ray": "94e132ed9ff5b364-MAN",
8+
"connection": "keep-alive",
9+
"content-length": "294",
10+
"content-type": "application/json",
11+
"date": "Wed, 11 Jun 2025 12:44:17 GMT",
12+
"server": "cloudflare",
13+
"vary": "Accept-Encoding",
14+
"x-clerk-auth-message": "Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid, token-carrier=header)",
15+
"x-clerk-auth-reason": "token-invalid",
16+
"x-clerk-auth-status": "signed-out"
17+
},
18+
"error": {
19+
"message": "Provider returned error",
20+
"code": 400,
21+
"metadata": {
22+
"raw": "{\"object\":\"error\",\"message\":\"Unexpected role 'tool' after role 'user'\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":null}",
23+
"provider_name": "Mistral"
24+
}
25+
},
26+
"code": 400,
27+
"name": "Error",
28+
"message": "400 Provider returned error",
29+
"stack": "Error: 400 Provider returned error\n at APIError.generate (file:///Users/ryoppippi/ghq/github.com/wrtnlabs/benchmark/node_modules/.pnpm/[email protected]/node_modules/openai/error.mjs:41:20)\n at OpenAI.makeStatusError (file:///Users/ryoppippi/ghq/github.com/wrtnlabs/benchmark/node_modules/.pnpm/[email protected]/node_modules/openai/core.mjs:295:25)\n at OpenAI.makeRequest (file:///Users/ryoppippi/ghq/github.com/wrtnlabs/benchmark/node_modules/.pnpm/[email protected]/node_modules/openai/core.mjs:339:30)\n at process.processTicksAndRejections (node:internal/process/task_queues:105:5)\n at async process (file:///Users/ryoppippi/ghq/github.com/wrtnlabs/benchmark/dist/index.mjs:329:24)\n at async tryValidateExperiment (file:///Users/ryoppippi/ghq/github.com/wrtnlabs/benchmark/dist/index.mjs:313:17)\n at async file:///Users/ryoppippi/ghq/github.com/wrtnlabs/benchmark/dist/index.mjs:1228:23\n at async Promise.all (index 5)\n at async file:///Users/ryoppippi/ghq/github.com/wrtnlabs/benchmark/dist/index.mjs:1226:38\n at async Promise.all (index 0)"
30+
},
31+
"started_at": "2025-06-11T12:44:16.889Z",
32+
"completed_at": "2025-06-11T12:44:17.137Z",
33+
"previous": [
34+
{
35+
"type": "failure",
36+
"id": "iHDUuEmIz",
37+
"arguments": {
38+
"id": "550e8400-2e2b-40d2-a411-411b4a3d",
39+
"name": "John Smith",
40+
"thumbnail": "https://example.com/john.jpg",
41+
"age": 25,
42+
"email": "[email protected]",
43+
"hobbies": [
44+
"reading",
45+
"hiking",
46+
"photography"
47+
],
48+
"isActive": true,
49+
"created_at": "2023-10-07T15:30:09Z"
50+
},
51+
"result": {
52+
"success": false,
53+
"errors": [
54+
{
55+
"path": "$input.id",
56+
"expected": "string & Format<\"uuid\">",
57+
"value": "550e8400-2e2b-40d2-a411-411b4a3d"
58+
}
59+
],
60+
"data": {
61+
"id": "550e8400-2e2b-40d2-a411-411b4a3d",
62+
"name": "John Smith",
63+
"thumbnail": "https://example.com/john.jpg",
64+
"age": 25,
65+
"email": "[email protected]",
66+
"hobbies": [
67+
"reading",
68+
"hiking",
69+
"photography"
70+
],
71+
"isActive": true,
72+
"created_at": "2023-10-07T15:30:09Z"
73+
}
74+
},
75+
"completion": {
76+
"id": "gen-1749645849-osuPwijgijRsCwH19k0o",
77+
"provider": "Mistral",
78+
"model": "mistralai/magistral-small-2506",
79+
"object": "chat.completion",
80+
"created": 1749645849,
81+
"choices": [
82+
{
83+
"logprobs": null,
84+
"finish_reason": "tool_calls",
85+
"native_finish_reason": "tool_calls",
86+
"index": 0,
87+
"message": {
88+
"role": "assistant",
89+
"content": "",
90+
"refusal": null,
91+
"reasoning": null,
92+
"tool_calls": [
93+
{
94+
"id": "XJtYW3R1V",
95+
"function": {
96+
"name": "think",
97+
"arguments": "{\n \"id\": \"550e8400-2e2b-40d2-a411-411b4a3d\", \"name\": \"John Smith\", \"thumbnail\": \"https://example.com/john.jpg\", \"age\": 25, \"email\": \"[email protected]\", \"hobbies\": [\"reading\", \"hiking\", \"photography\"], \"isActive\": true, \"created_at\": \"2023-10-07T15:30:09Z\"\n}"
98+
},
99+
"index": 0,
100+
"type": "function"
101+
},
102+
{
103+
"id": "iHDUuEmIz",
104+
"function": {
105+
"name": "participate",
106+
"arguments": "{\"id\": \"550e8400-2e2b-40d2-a411-411b4a3d\", \"name\": \"John Smith\", \"thumbnail\": \"https://example.com/john.jpg\", \"age\": 25, \"email\": \"[email protected]\", \"hobbies\": [\"reading\", \"hiking\", \"photography\"], \"isActive\": true, \"created_at\": \"2023-10-07T15:30:09Z\"}"
107+
},
108+
"index": 1,
109+
"type": "function"
110+
}
111+
]
112+
}
113+
}
114+
],
115+
"usage": {
116+
"prompt_tokens": 325,
117+
"completion_tokens": 263,
118+
"total_tokens": 588
119+
}
120+
},
121+
"started_at": "2025-06-11T12:44:09.549Z",
122+
"completed_at": "2025-06-11T12:44:16.886Z",
123+
"previous": []
124+
}
125+
]
126+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"type": "nothing",
3+
"completion": {
4+
"id": "gen-1749645849-dYkobbj0N0ywBKNNscqp",
5+
"provider": "Mistral",
6+
"model": "mistralai/magistral-small-2506",
7+
"object": "chat.completion",
8+
"created": 1749645849,
9+
"choices": [
10+
{
11+
"logprobs": null,
12+
"finish_reason": "tool_calls",
13+
"native_finish_reason": "tool_calls",
14+
"index": 0,
15+
"message": {
16+
"role": "assistant",
17+
"content": "think",
18+
"refusal": null,
19+
"reasoning": null
20+
}
21+
}
22+
],
23+
"usage": {
24+
"prompt_tokens": 325,
25+
"completion_tokens": 2,
26+
"total_tokens": 327
27+
}
28+
},
29+
"started_at": "2025-06-11T12:44:09.549Z",
30+
"completed_at": "2025-06-11T12:44:09.998Z",
31+
"previous": []
32+
}

reports/validate/mistralai/magistral-small-2506/ObjectConstraint-embedded-json-desc/trials/8.nothing.json

Lines changed: 32 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)