From f58ec3cd8a65880592d442027aa5b4d16aa10671 Mon Sep 17 00:00:00 2001
From: Ryan McCormick <rmccormick@nvidia.com>
Date: Fri, 8 Dec 2023 18:08:28 -0800
Subject: [PATCH 1/9] Keep content-type header to one value, check headers in
 generate endpoint tests

---
 qa/L0_http/generate_endpoint_test.py | 11 ++++++++++-
 src/http_server.cc                   |  7 +++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py
index 0b0a2eb81d..593154ee8f 100755
--- a/qa/L0_http/generate_endpoint_test.py
+++ b/qa/L0_http/generate_endpoint_test.py
@@ -71,6 +71,9 @@ def generate_expect_failure(self, model_name, inputs, msg):
         r = requests.post(
             url, data=inputs if isinstance(inputs, str) else json.dumps(inputs)
         )
+        # Content-Type header should always be JSON for errors
+        self.assertEqual(r.headers["Content-Type"], "'application/json'")
+
         try:
             r.raise_for_status()
             self.assertTrue(False, f"Expected failure, success for {inputs}")
@@ -79,6 +82,9 @@ def generate_expect_failure(self, model_name, inputs, msg):
 
     def generate_stream_expect_failure(self, model_name, inputs, msg):
         r = self.generate_stream(model_name, inputs)
+        # Content-Type header should always be JSON for errors
+        self.assertEqual(r.headers["Content-Type"], "'application/json'")
+
         try:
             r.raise_for_status()
             self.assertTrue(False, f"Expected failure, success for {inputs}")
@@ -90,6 +96,9 @@ def generate_stream_expect_success(
     ):
         r = self.generate_stream(model_name, inputs)
         r.raise_for_status()
+        self.assertEqual(
+            r.headers["Content-Type"], "'text/event-stream; charset=utf-8'"
+        )
         self.check_sse_responses(r, [{"TEXT": expected_output}] * rep_count)
 
     def check_sse_responses(self, res, expected_res):
@@ -128,7 +137,7 @@ def test_generate(self):
         r.raise_for_status()
 
         self.assertIn("Content-Type", r.headers)
-        self.assertIn("application/json", r.headers["Content-Type"])
+        self.assertEqual(r.headers["Content-Type"], "'application/json'")
 
         data = r.json()
         self.assertIn("TEXT", data)
diff --git a/src/http_server.cc b/src/http_server.cc
index 71ae5ca13c..32cf6956cd 100644
--- a/src/http_server.cc
+++ b/src/http_server.cc
@@ -116,6 +116,13 @@ EVBufferAddErrorJson(evbuffer* buffer, TRITONSERVER_Error* err)
 void
 AddContentTypeHeader(evhtp_request_t* req, const char* type)
 {
+  // Remove existing header if found
+  auto content_header =
+      evhtp_headers_find_header(req->headers_out, kContentTypeHeader);
+  if (content_header) {
+    evhtp_header_rm_and_free(req->headers_out, content_header);
+  }
+
   evhtp_headers_add_header(
       req->headers_out, evhtp_header_new(kContentTypeHeader, type, 1, 1));
 }

From a7d4524849d7bf80b299f2123247e0e5585228d3 Mon Sep 17 00:00:00 2001
From: Ryan McCormick <rmccormick@nvidia.com>
Date: Sat, 9 Dec 2023 01:17:28 -0800
Subject: [PATCH 2/9] Fix checks

---
 qa/L0_http/generate_endpoint_test.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py
index 593154ee8f..46d90c34e7 100755
--- a/qa/L0_http/generate_endpoint_test.py
+++ b/qa/L0_http/generate_endpoint_test.py
@@ -72,7 +72,7 @@ def generate_expect_failure(self, model_name, inputs, msg):
             url, data=inputs if isinstance(inputs, str) else json.dumps(inputs)
         )
         # Content-Type header should always be JSON for errors
-        self.assertEqual(r.headers["Content-Type"], "'application/json'")
+        self.assertEqual(r.headers["Content-Type"], "application/json")
 
         try:
             r.raise_for_status()
@@ -83,7 +83,7 @@ def generate_expect_failure(self, model_name, inputs, msg):
     def generate_stream_expect_failure(self, model_name, inputs, msg):
         r = self.generate_stream(model_name, inputs)
         # Content-Type header should always be JSON for errors
-        self.assertEqual(r.headers["Content-Type"], "'application/json'")
+        self.assertEqual(r.headers["Content-Type"], "application/json")
 
         try:
             r.raise_for_status()
@@ -96,15 +96,12 @@ def generate_stream_expect_success(
     ):
         r = self.generate_stream(model_name, inputs)
         r.raise_for_status()
-        self.assertEqual(
-            r.headers["Content-Type"], "'text/event-stream; charset=utf-8'"
-        )
         self.check_sse_responses(r, [{"TEXT": expected_output}] * rep_count)
 
     def check_sse_responses(self, res, expected_res):
         # Validate SSE format
         self.assertIn("Content-Type", res.headers)
-        self.assertIn("text/event-stream", res.headers["Content-Type"])
+        self.assertEqual("text/event-stream", res.headers["Content-Type"])
 
         # SSE format (data: []) is hard to parse, use helper library for simplicity
         client = sseclient.SSEClient(res)
@@ -137,7 +134,7 @@ def test_generate(self):
         r.raise_for_status()
 
         self.assertIn("Content-Type", r.headers)
-        self.assertEqual(r.headers["Content-Type"], "'application/json'")
+        self.assertEqual(r.headers["Content-Type"], "application/json")
 
         data = r.json()
         self.assertIn("TEXT", data)

From 23f6573743b7443ad2c44ea03108b5ff0dff6e4a Mon Sep 17 00:00:00 2001
From: Ryan McCormick <rmccormick@nvidia.com>
Date: Sat, 9 Dec 2023 19:49:32 -0800
Subject: [PATCH 3/9] Include charset in event-stream header check

---
 qa/L0_http/generate_endpoint_test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py
index 46d90c34e7..29d2e20d96 100755
--- a/qa/L0_http/generate_endpoint_test.py
+++ b/qa/L0_http/generate_endpoint_test.py
@@ -101,7 +101,9 @@ def generate_stream_expect_success(
     def check_sse_responses(self, res, expected_res):
         # Validate SSE format
         self.assertIn("Content-Type", res.headers)
-        self.assertEqual("text/event-stream", res.headers["Content-Type"])
+        self.assertEqual(
+            "text/event-stream; charset=utf-8", res.headers["Content-Type"]
+        )
 
         # SSE format (data: []) is hard to parse, use helper library for simplicity
         client = sseclient.SSEClient(res)

From a35212d241f7980e59c183f3ce30773a2ce5e46a Mon Sep 17 00:00:00 2001
From: Ryan McCormick <rmccormick@nvidia.com>
Date: Sat, 9 Dec 2023 22:08:03 -0800
Subject: [PATCH 4/9] Add notes on event-stream vs json for success vs error in
 generate_stream responses

---
 docs/protocol/extension_generate.md | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/docs/protocol/extension_generate.md b/docs/protocol/extension_generate.md
index da110972ea..af66b29f05 100644
--- a/docs/protocol/extension_generate.md
+++ b/docs/protocol/extension_generate.md
@@ -55,17 +55,22 @@ POST v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream
 
 ### generate v.s. generate_stream
 
-Both URLs expect the same request JSON object, and generate the same response
-JSON object. However, `generate` returns exactly 1 response JSON object, while
-`generate_stream` may return multiple responses based on the inference
-results. `generate_stream` returns the responses as
+Both URLs expect the same request JSON object, and generate the same response.
+However, there are some differences in the format used to return each:
+* `generate` returns exactly 1 response JSON object with a
+`Content-Type` of `application/json`
+* `generate_stream` may return multiple responses based on the inference
+results. Successful responses will be sent as
 [Server-Sent Events](https://html.spec.whatwg.org/multipage/server-sent-events.html#server-sent-events)
-(SSE), where each response will be a "data" chunk in the HTTP response body.
-Also, note that an error may be returned during inference, whereas the HTTP
-response code has been set in the first response of the SSE, which can result in
-receiving an [error object](#generate-response-json-error-object) while status
-code shows success (200). Therefore the user must always check whether an error
-object is received when generating responses through `generate_stream`.
+(SSE), where each response will be a "data" chunk in the HTTP
+response body with a `Content-Type` of `text/event-stream; charset=utf-8`.
+Error responses will have an [error JSON object](#generate-response-json-error-object)
+with `Content-Type` of `application/json`.
+    * Note that the HTTP response code is set in the first response of the SSE,
+    so if an error occurs after the first response for the request,
+    it can result in receiving an error object while the status code shows
+    success (200). Therefore, the user must always check whether an error
+    object is received when generating responses through `generate_stream`.
 
 ### Generate Request JSON Object
 
@@ -175,4 +180,4 @@ A failed generate request must be indicated by an HTTP error status
 {
   "error" : "error message"
 }
-```
\ No newline at end of file
+```

From edc59cee0415230b0e4d1cd6a325c4f8c2086c51 Mon Sep 17 00:00:00 2001
From: Ryan McCormick <rmccormick@nvidia.com>
Date: Sat, 9 Dec 2023 22:09:22 -0800
Subject: [PATCH 5/9] add / prefix to routes

---
 docs/protocol/extension_generate.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/protocol/extension_generate.md b/docs/protocol/extension_generate.md
index af66b29f05..47d1e94d8b 100644
--- a/docs/protocol/extension_generate.md
+++ b/docs/protocol/extension_generate.md
@@ -57,9 +57,9 @@ POST v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream
 
 Both URLs expect the same request JSON object, and generate the same response.
 However, there are some differences in the format used to return each:
-* `generate` returns exactly 1 response JSON object with a
+* `/generate` returns exactly 1 response JSON object with a
 `Content-Type` of `application/json`
-* `generate_stream` may return multiple responses based on the inference
+* `/generate_stream` may return multiple responses based on the inference
 results. Successful responses will be sent as
 [Server-Sent Events](https://html.spec.whatwg.org/multipage/server-sent-events.html#server-sent-events)
 (SSE), where each response will be a "data" chunk in the HTTP
@@ -70,7 +70,7 @@ with `Content-Type` of `application/json`.
     so if an error occurs after the first response for the request,
     it can result in receiving an error object while the status code shows
     success (200). Therefore, the user must always check whether an error
-    object is received when generating responses through `generate_stream`.
+    object is received when generating responses through `/generate_stream`.
 
 ### Generate Request JSON Object
 

From a15629e6b376eb846c8d91aeebade3e83d5beec9 Mon Sep 17 00:00:00 2001
From: Ryan McCormick <rmccormick@nvidia.com>
Date: Sat, 9 Dec 2023 23:15:32 -0800
Subject: [PATCH 6/9] Clarify wording. Errors during inference are still in
 event-stream format

---
 docs/protocol/extension_generate.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/protocol/extension_generate.md b/docs/protocol/extension_generate.md
index 47d1e94d8b..1d66ab06f2 100644
--- a/docs/protocol/extension_generate.md
+++ b/docs/protocol/extension_generate.md
@@ -55,17 +55,18 @@ POST v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream
 
 ### generate v.s. generate_stream
 
-Both URLs expect the same request JSON object, and generate the same response.
-However, there are some differences in the format used to return each:
+Both URLs expect the same request JSON object, and generate the same JSON
+response object. However, there are some differences in the format used to
+return each:
 * `/generate` returns exactly 1 response JSON object with a
 `Content-Type` of `application/json`
 * `/generate_stream` may return multiple responses based on the inference
-results. Successful responses will be sent as
+results, with a `Content-Type` of `text/event-stream; charset=utf-8`.
+These responses will be sent as
 [Server-Sent Events](https://html.spec.whatwg.org/multipage/server-sent-events.html#server-sent-events)
 (SSE), where each response will be a "data" chunk in the HTTP
-response body with a `Content-Type` of `text/event-stream; charset=utf-8`.
-Error responses will have an [error JSON object](#generate-response-json-error-object)
-with `Content-Type` of `application/json`.
+response body. In the case of inference errors, responses will have
+an [error JSON object](#generate-response-json-error-object).
     * Note that the HTTP response code is set in the first response of the SSE,
     so if an error occurs after the first response for the request,
     it can result in receiving an error object while the status code shows

From ef40ae8a30fd93c7d4ec78de8ba33df531c7869e Mon Sep 17 00:00:00 2001
From: Ryan McCormick <rmccormick@nvidia.com>
Date: Mon, 11 Dec 2023 11:25:43 -0800
Subject: [PATCH 7/9] Clarify error before inference

---
 docs/protocol/extension_generate.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/docs/protocol/extension_generate.md b/docs/protocol/extension_generate.md
index 1d66ab06f2..f88909a1dd 100644
--- a/docs/protocol/extension_generate.md
+++ b/docs/protocol/extension_generate.md
@@ -68,10 +68,14 @@ These responses will be sent as
 response body. In the case of inference errors, responses will have
 an [error JSON object](#generate-response-json-error-object).
     * Note that the HTTP response code is set in the first response of the SSE,
-    so if an error occurs after the first response for the request,
-    it can result in receiving an error object while the status code shows
-    success (200). Therefore, the user must always check whether an error
-    object is received when generating responses through `/generate_stream`.
+    so if the first response succeeds but an error occurs in a subsequent
+    response for the request, it can result in receiving an error object
+    while the status code shows success (200). Therefore, the user must
+    always check whether an error object is received when generating
+    responses through `/generate_stream`.
+    * If the request fails before inference begins, then a JSON error will
+    be returned with `Content-Type` of `application/json`, similar to errors
+    from other endpoints and the status code should correctly reflect an error.
 
 ### Generate Request JSON Object
 

From 34d33060043745f55e0cb6566062734aeef18967 Mon Sep 17 00:00:00 2001
From: Ryan McCormick <rmccormick@nvidia.com>
Date: Mon, 11 Dec 2023 13:39:06 -0800
Subject: [PATCH 8/9] Update docs/protocol/extension_generate.md

Co-authored-by: Neelay Shah <neelays@nvidia.com>
---
 docs/protocol/extension_generate.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/protocol/extension_generate.md b/docs/protocol/extension_generate.md
index f88909a1dd..689c280d32 100644
--- a/docs/protocol/extension_generate.md
+++ b/docs/protocol/extension_generate.md
@@ -75,7 +75,7 @@ an [error JSON object](#generate-response-json-error-object).
     responses through `/generate_stream`.
     * If the request fails before inference begins, then a JSON error will
     be returned with `Content-Type` of `application/json`, similar to errors
-    from other endpoints and the status code should correctly reflect an error.
+    from other endpoints with the status code set to an error.
 
 ### Generate Request JSON Object
 

From 0bd93bc80166a1855f8e4478f2aa4f16f7eebc46 Mon Sep 17 00:00:00 2001
From: Ryan McCormick <rmccormick@nvidia.com>
Date: Mon, 11 Dec 2023 13:39:11 -0800
Subject: [PATCH 9/9] Update docs/protocol/extension_generate.md

Co-authored-by: Neelay Shah <neelays@nvidia.com>
---
 docs/protocol/extension_generate.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/protocol/extension_generate.md b/docs/protocol/extension_generate.md
index 689c280d32..92c7e8f52b 100644
--- a/docs/protocol/extension_generate.md
+++ b/docs/protocol/extension_generate.md
@@ -53,7 +53,7 @@ POST v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate
 POST v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream
 ```
 
-### generate v.s. generate_stream
+### generate vs. generate_stream
 
 Both URLs expect the same request JSON object, and generate the same JSON
 response object. However, there are some differences in the format used to