
Commit 0cc6b38

[Frontend] Support scores endpoint in run_batch (#12430)
Signed-off-by: Pooya Davoodi <[email protected]>
1 parent 28e0750 · commit 0cc6b38

File tree (4 files changed, +99 -7 lines):

* examples/offline_inference/openai/openai_batch.md
* tests/entrypoints/openai/test_run_batch.py
* vllm/entrypoints/openai/protocol.py
* vllm/entrypoints/openai/run_batch.py


examples/offline_inference/openai/openai_batch.md (+32 -1)

@@ -13,7 +13,7 @@ The OpenAI batch file format consists of a series of json objects on new lines.
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
 
 ```{note}
-We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon).
+We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon).
 ```
 
 ## Pre-requisites
@@ -203,3 +203,34 @@ $ cat results.jsonl
 {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
 ...
 ```
+
+## Example 5: Using score endpoint
+
+### Additional prerequisites
+
+* Ensure you are using `vllm >= 0.7.0`.
+
+### Step 1: Create your batch file
+
+Add score requests to your batch file. The following is an example:
+
+```
+{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+```
+
+You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model).
+
+### Step 2: Run the batch
+
+You can run the batch using the same command as in earlier examples.
+
+### Step 3: Check your results
+
+You can check your results by running `cat results.jsonl`
+
+```
+$ cat results.jsonl
+{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
+{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
+```
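
The three steps of this new example can also be scripted. Below is a minimal end-to-end sketch, assuming `vllm >= 0.7.0` is installed and the model can be fetched; the `batch.jsonl` and `results.jsonl` file names are arbitrary choices for this illustration (the `-i`, `-o`, and `--model` flags come from the commit's own test below).

```python
# End-to-end sketch of steps 1-3 above. Assumes `vllm >= 0.7.0` is installed
# and the reranker model can be downloaded; file names are arbitrary.
import json
import subprocess
import sys

score_request = {
    "method": "POST",
    "url": "/v1/score",
    "body": {
        "model": "BAAI/bge-reranker-v2-m3",
        "text_1": "What is the capital of France?",
        "text_2": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ],
    },
}

# Step 1: write one JSON object per line, each with a unique custom_id.
with open("batch.jsonl", "w") as f:
    for i in (1, 2):
        f.write(json.dumps({"custom_id": f"request-{i}", **score_request}) + "\n")

# Step 2: run the batch (same command as in the earlier examples).
subprocess.run(
    [
        sys.executable, "-m", "vllm.entrypoints.openai.run_batch",
        "-i", "batch.jsonl", "-o", "results.jsonl",
        "--model", "BAAI/bge-reranker-v2-m3",
    ],
    check=True,
)

# Step 3: inspect the results.
with open("results.jsonl") as f:
    print(f.read())
```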

tests/entrypoints/openai/test_run_batch.py (+37)

@@ -1,3 +1,4 @@
+import json
 import subprocess
 import sys
 import tempfile
@@ -21,6 +22,9 @@
 {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
 {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
 
+INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
+
 
 def test_empty_file():
     with tempfile.NamedTemporaryFile(
@@ -102,3 +106,36 @@ def test_embeddings():
         # Ensure that the output format conforms to the openai api.
         # Validation should throw if the schema is wrong.
         BatchRequestOutput.model_validate_json(line)
+
+
+def test_score():
+    with tempfile.NamedTemporaryFile(
+            "w") as input_file, tempfile.NamedTemporaryFile(
+                "r") as output_file:
+        input_file.write(INPUT_SCORE_BATCH)
+        input_file.flush()
+        proc = subprocess.Popen([
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.run_batch",
+            "-i",
+            input_file.name,
+            "-o",
+            output_file.name,
+            "--model",
+            "BAAI/bge-reranker-v2-m3",
+        ], )
+        proc.communicate()
+        proc.wait()
+        assert proc.returncode == 0, f"{proc=}"
+
+        contents = output_file.read()
+        for line in contents.strip().split("\n"):
+            # Ensure that the output format conforms to the openai api.
+            # Validation should throw if the schema is wrong.
+            BatchRequestOutput.model_validate_json(line)
+
+            # Ensure that there is no error in the response.
+            line_dict = json.loads(line)
+            assert isinstance(line_dict, dict)
+            assert line_dict["error"] is None
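
Beyond the schema check the test performs, a results line can be unpacked directly. The following helper is illustrative only (not part of the commit); it follows the response shape shown in the docs above, where `response.body.data` holds one `{"index", "object": "score", "score"}` item per `text_2` candidate, in order.

```python
# Illustrative helper (an assumption-labeled sketch, not commit code):
# pull the custom_id and per-candidate scores out of one results.jsonl line.
import json

def extract_scores(line: str) -> tuple[str, list[float]]:
    out = json.loads(line)
    assert out["error"] is None, out["error"]
    body = out["response"]["body"]
    # Scores come back in text_2 order.
    return out["custom_id"], [item["score"] for item in body["data"]]

custom_id, scores = extract_scores(
    '{"id":"x","custom_id":"request-1","response":{"status_code":200,'
    '"request_id":"y","body":{"object":"list","data":'
    '[{"index":0,"object":"score","score":0.001},'
    '{"index":1,"object":"score","score":1.0}]}},"error":null}')
assert custom_id == "request-1" and scores == [0.001, 1.0]
```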

vllm/entrypoints/openai/protocol.py (+3 -2)

@@ -1283,7 +1283,7 @@ class BatchRequestInput(OpenAIBaseModel):
     url: str
 
     # The parameters of the request.
-    body: Union[ChatCompletionRequest, EmbeddingRequest]
+    body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest]
 
 
 class BatchResponseData(OpenAIBaseModel):
@@ -1294,7 +1294,8 @@ class BatchResponseData(OpenAIBaseModel):
     request_id: str
 
     # The body of the response.
-    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse]] = None
+    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
+                         ScoreResponse]] = None
 
 
 class BatchRequestOutput(OpenAIBaseModel):
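
With the widened union, a `/v1/score` input line should now parse end to end. A minimal sketch, assuming pydantic resolves the `Union` by each member's required fields (`messages` for chat, `input` for embeddings, `text_1`/`text_2` for scores), so a score body lands on `ScoreRequest`:

```python
# Sketch: round-trip the docs' score example line through the widened
# BatchRequestInput.body union. Union resolution by required fields is an
# assumption about pydantic's behavior, not something the commit asserts.
from vllm.entrypoints.openai.protocol import BatchRequestInput, ScoreRequest

line = ('{"custom_id": "request-1", "method": "POST", "url": "/v1/score", '
        '"body": {"model": "BAAI/bge-reranker-v2-m3", '
        '"text_1": "What is the capital of France?", '
        '"text_2": ["The capital of Brazil is Brasilia.", '
        '"The capital of France is Paris."]}}')

request = BatchRequestInput.model_validate_json(line)
assert isinstance(request.body, ScoreRequest)
```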

vllm/entrypoints/openai/run_batch.py (+27 -4)

@@ -16,12 +16,14 @@
                                               BatchRequestOutput,
                                               BatchResponseData,
                                               ChatCompletionResponse,
-                                              EmbeddingResponse, ErrorResponse)
+                                              EmbeddingResponse, ErrorResponse,
+                                              ScoreResponse)
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                     OpenAIServingModels)
+from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
@@ -167,7 +169,8 @@ async def run_request(serving_engine_func: Callable,
                       tracker: BatchProgressTracker) -> BatchRequestOutput:
     response = await serving_engine_func(request.body)
 
-    if isinstance(response, (ChatCompletionResponse, EmbeddingResponse)):
+    if isinstance(response,
+                  (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)):
         batch_output = BatchRequestOutput(
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
@@ -239,6 +242,12 @@ async def main(args):
         chat_template=None,
         chat_template_content_format="auto",
     ) if model_config.task == "embed" else None
+    openai_serving_scores = (OpenAIServingScores(
+        engine,
+        model_config,
+        openai_serving_models,
+        request_logger=request_logger,
+    ) if model_config.task == "score" else None)
 
     tracker = BatchProgressTracker()
     logger.info("Reading batch from %s...", args.input_file)
@@ -279,14 +288,28 @@ async def main(args):
                     ))
                 continue
 
+            response_futures.append(run_request(handler_fn, request, tracker))
+            tracker.submitted()
+        elif request.url == "/v1/score":
+            handler_fn = (None if openai_serving_scores is None else
+                          openai_serving_scores.create_score)
+            if handler_fn is None:
+                response_futures.append(
+                    make_async_error_request_output(
+                        request,
+                        error_msg="The model does not support Scores API",
+                    ))
+                continue
+
             response_futures.append(run_request(handler_fn, request, tracker))
             tracker.submitted()
         else:
             response_futures.append(
                 make_async_error_request_output(
                     request,
-                    error_msg="Only /v1/chat/completions and "
-                    "/v1/embeddings are supported in the batch endpoint.",
+                    error_msg=
+                    "Only /v1/chat/completions, /v1/embeddings, and /v1/score "
+                    "are supported in the batch endpoint.",
                 ))
 
     with tracker.pbar():
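
As a design note, the dispatch above grows by one `elif` per endpoint. The same selection can be written table-driven; the following self-contained sketch is illustrative only (all names are invented, and this is not how `run_batch.py` is structured), with `None` standing in for a handler the loaded model's task does not provide:

```python
# Illustrative, self-contained sketch of the URL dispatch pattern above.
# None means the endpoint exists but the loaded model's task does not
# support it (e.g. a "score"-task model has no chat handler).
from typing import Callable, Optional

def pick_handler(
    url: str,
    handlers: dict[str, Optional[Callable]],
) -> tuple[Optional[Callable], str]:
    if url not in handlers:
        # Unknown endpoint: mirror the batch runner's catch-all error text.
        return None, ("Only /v1/chat/completions, /v1/embeddings, and "
                      "/v1/score are supported in the batch endpoint.")
    if handlers[url] is None:
        return None, f"The model does not support {url}"
    return handlers[url], ""

# A "score"-task model: only the score handler is available.
handlers = {
    "/v1/chat/completions": None,
    "/v1/embeddings": None,
    "/v1/score": lambda body: {"object": "list"},  # stand-in for create_score
}

assert pick_handler("/v1/score", handlers)[0] is not None
assert pick_handler("/v1/embeddings", handlers)[0] is None
assert "batch endpoint" in pick_handler("/v1/completions", handlers)[1]
```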
