
Commit 3e17629

feat: Add streaming_callback to run methods of OllamaGenerator and OllamaChatGenerator (#1636)
* Add streaming_callback to runtime of ollama
* Add tests
1 parent d514c7b commit 3e17629

4 files changed: +105 -16 lines changed

integrations/ollama/src/haystack_integrations/components/generators/ollama/chat/chat_generator.py

+12 -5
@@ -258,16 +258,18 @@ def _build_chunk(self, chunk_response: Any) -> StreamingChunk:
        chunk_message = StreamingChunk(content, meta)
        return chunk_message

-    def _handle_streaming_response(self, response) -> Dict[str, List[Any]]:
+    def _handle_streaming_response(
+        self, response: Any, streaming_callback: Optional[Callable[[StreamingChunk], None]]
+    ) -> Dict[str, List[Any]]:
        """
        Handles streaming response and converts it to Haystack format
        """
        chunks: List[StreamingChunk] = []
        for chunk in response:
            chunk_delta = self._build_chunk(chunk)
            chunks.append(chunk_delta)
-            if self.streaming_callback is not None:
-                self.streaming_callback(chunk_delta)
+            if streaming_callback is not None:
+                streaming_callback(chunk_delta)

        replies = [ChatMessage.from_assistant("".join([c.content for c in chunks]))]
        meta = {key: value for key, value in chunks[0].meta.items() if key != "message"}
@@ -280,6 +282,8 @@ def run(
        messages: List[ChatMessage],
        generation_kwargs: Optional[Dict[str, Any]] = None,
        tools: Optional[List[Tool]] = None,
+        *,
+        streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
    ):
        """
        Runs an Ollama Model on a given chat history.
@@ -293,12 +297,15 @@ def run(
        :param tools:
            A list of tools for which the model can prepare calls. If set, it will override the `tools` parameter set
            during component initialization.
+        :param streaming_callback:
+            A callback function that is called when a new token is received from the stream.
        :returns: A dictionary with the following keys:
            - `replies`: The responses from the model
        """
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
+        resolved_streaming_callback = streaming_callback or self.streaming_callback

-        stream = self.streaming_callback is not None
+        stream = resolved_streaming_callback is not None
        tools = tools or self.tools
        _check_duplicate_tool_names(tools)

@@ -328,6 +335,6 @@ def run(
        )

        if stream:
-            return self._handle_streaming_response(response)
+            return self._handle_streaming_response(response, resolved_streaming_callback)

        return {"replies": [_convert_ollama_response_to_chatmessage(response)]}
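
With this change, OllamaChatGenerator.run accepts a streaming_callback at call time that takes precedence over the one configured at init. A minimal usage sketch, assuming the integration is installed; the model name and prompt are illustrative and not part of this commit:

from haystack.dataclasses import ChatMessage, StreamingChunk
from haystack_integrations.components.generators.ollama import OllamaChatGenerator

def print_chunk(chunk: StreamingChunk) -> None:
    # Called once for every chunk streamed back from Ollama.
    print(chunk.content, end="", flush=True)

generator = OllamaChatGenerator(model="llama3.2")  # no callback configured at init
result = generator.run(
    messages=[ChatMessage.from_user("What is the capital of the Netherlands?")],
    streaming_callback=print_chunk,  # per-call callback enables streaming for this run only
)
print(result["replies"][0].text)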

integrations/ollama/src/haystack_integrations/components/generators/ollama/generator.py

+17 -6
@@ -137,16 +137,18 @@ def _convert_to_streaming_response(self, chunks: List[StreamingChunk]) -> Dict[s

        return {"replies": replies, "meta": [meta]}

-    def _handle_streaming_response(self, response) -> List[StreamingChunk]:
+    def _handle_streaming_response(
+        self, response: Any, streaming_callback: Optional[Callable[[StreamingChunk], None]]
+    ) -> List[StreamingChunk]:
        """
        Handles Streaming response cases
        """
        chunks: List[StreamingChunk] = []
        for chunk in response:
            chunk_delta: StreamingChunk = self._build_chunk(chunk)
            chunks.append(chunk_delta)
-            if self.streaming_callback is not None:
-                self.streaming_callback(chunk_delta)
+            if streaming_callback is not None:
+                streaming_callback(chunk_delta)
        return chunks

    def _build_chunk(self, chunk_response: Any) -> StreamingChunk:
@@ -165,6 +167,8 @@ def run(
        self,
        prompt: str,
        generation_kwargs: Optional[Dict[str, Any]] = None,
+        *,
+        streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
    ):
        """
        Runs an Ollama Model on the given prompt.
@@ -175,20 +179,27 @@ def run(
            Optional arguments to pass to the Ollama generation endpoint, such as temperature,
            top_p, and others. See the available arguments in
            [Ollama docs](https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values).
+        :param streaming_callback:
+            A callback function that is called when a new token is received from the stream.
        :returns: A dictionary with the following keys:
            - `replies`: The responses from the model
            - `meta`: The metadata collected during the run
        """
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

-        stream = self.streaming_callback is not None
+        resolved_streaming_callback = streaming_callback or self.streaming_callback
+        stream = resolved_streaming_callback is not None

        response = self._client.generate(
-            model=self.model, prompt=prompt, stream=stream, keep_alive=self.keep_alive, options=generation_kwargs
+            model=self.model,
+            prompt=prompt,
+            stream=stream,
+            keep_alive=self.keep_alive,
+            options=generation_kwargs,
        )

        if stream:
-            chunks: List[StreamingChunk] = self._handle_streaming_response(response)
+            chunks: List[StreamingChunk] = self._handle_streaming_response(response, resolved_streaming_callback)
            return self._convert_to_streaming_response(chunks)

        return self._convert_to_response(response)
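
The same runtime override now applies to OllamaGenerator.run. A minimal sketch, mirroring the new integration test further below; the model name and prompt are illustrative:

from haystack.dataclasses import StreamingChunk
from haystack_integrations.components.generators.ollama import OllamaGenerator

def collect(chunk: StreamingChunk) -> None:
    # Receives each streamed chunk; the full reply is still assembled and returned by run().
    print(chunk.content, end="", flush=True)

generator = OllamaGenerator(model="llama3.2:3b", streaming_callback=None)
results = generator.run(prompt="What's the capital of Netherlands?", streaming_callback=collect)
print(results["replies"][0])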

integrations/ollama/tests/test_chat_generator.py

+52 -3
@@ -418,6 +418,51 @@ def streaming_callback(_: StreamingChunk) -> None:
        assert result["replies"][0].text == "first chunk second chunk"
        assert result["replies"][0].role == "assistant"

+    @patch("haystack_integrations.components.generators.ollama.chat.chat_generator.Client")
+    def test_run_streaming_at_runtime(self, mock_client):
+        streaming_callback_called = False
+
+        def streaming_callback(_: StreamingChunk) -> None:
+            nonlocal streaming_callback_called
+            streaming_callback_called = True
+
+        generator = OllamaChatGenerator(streaming_callback=None)
+
+        mock_response = iter(
+            [
+                ChatResponse(
+                    model="llama3.2",
+                    created_at="2023-12-12T14:13:43.416799Z",
+                    message={"role": "assistant", "content": "first chunk "},
+                    done=False,
+                ),
+                ChatResponse(
+                    model="llama3.2",
+                    created_at="2023-12-12T14:13:43.416799Z",
+                    message={"role": "assistant", "content": "second chunk"},
+                    done=True,
+                    total_duration=4883583458,
+                    load_duration=1334875,
+                    prompt_eval_count=26,
+                    prompt_eval_duration=342546000,
+                    eval_count=282,
+                    eval_duration=4535599000,
+                ),
+            ]
+        )
+
+        mock_client_instance = mock_client.return_value
+        mock_client_instance.chat.return_value = mock_response
+
+        result = generator.run(messages=[ChatMessage.from_user("irrelevant")], streaming_callback=streaming_callback)
+
+        assert streaming_callback_called
+
+        assert "replies" in result
+        assert len(result["replies"]) == 1
+        assert result["replies"][0].text == "first chunk second chunk"
+        assert result["replies"][0].role == "assistant"
+
    def test_run_fail_with_tools_and_streaming(self, tools):
        component = OllamaChatGenerator(tools=tools, streaming_callback=print_streaming_chunk)

@@ -459,7 +504,9 @@ def test_run_with_chat_history(self):
        assert isinstance(response, dict)
        assert isinstance(response["replies"], list)

-        assert any(city in response["replies"][-1].text for city in ["Manchester", "Birmingham", "Glasgow"])
+        assert any(
+            city.lower() in response["replies"][-1].text.lower() for city in ["Manchester", "Birmingham", "Glasgow"]
+        )

    @pytest.mark.integration
    def test_run_model_unavailable(self):
@@ -486,7 +533,9 @@ def test_run_with_streaming(self):

        assert isinstance(response, dict)
        assert isinstance(response["replies"], list)
-        assert any(city in response["replies"][-1].text for city in ["Manchester", "Birmingham", "Glasgow"])
+        assert any(
+            city.lower() in response["replies"][-1].text.lower() for city in ["Manchester", "Birmingham", "Glasgow"]
+        )

    @pytest.mark.integration
    def test_run_with_tools(self, tools):
@@ -525,7 +574,7 @@ def test_run_with_response_format(self):
        assert isinstance(response_data["capital"], str)
        assert "population" in response_data
        assert isinstance(response_data["population"], (int, float))
-        assert response_data["capital"] == "Paris"
+        assert response_data["capital"].lower() == "paris"

    def test_run_with_streaming_and_format(self):
        response_format = {

integrations/ollama/tests/test_generator.py

+24 -2
@@ -135,7 +135,7 @@ def test_from_dict(self):
        assert component.keep_alive == "5m"

    @pytest.mark.integration
-    def test_ollama_generator_run_streaming(self):
+    def test_ollama_generator_streaming(self):
        class Callback:
            def __init__(self):
                self.responses = ""
@@ -151,7 +151,29 @@ def __call__(self, chunk):
        results = component.run(prompt="What's the capital of Netherlands?")

        assert len(results["replies"]) == 1
-        assert "Amsterdam" in results["replies"][0]
+        assert "amsterdam" in results["replies"][0].lower()
+        assert len(results["meta"]) == 1
+        assert callback.responses == results["replies"][0]
+        assert callback.count_calls > 1
+
+    @pytest.mark.integration
+    def test_ollama_generator_streaming_in_run(self):
+        class Callback:
+            def __init__(self):
+                self.responses = ""
+                self.count_calls = 0
+
+            def __call__(self, chunk):
+                self.responses += chunk.content
+                self.count_calls += 1
+                return chunk
+
+        callback = Callback()
+        component = OllamaGenerator(model="llama3.2:3b", streaming_callback=None)
+        results = component.run(prompt="What's the capital of Netherlands?", streaming_callback=callback)
+
+        assert len(results["replies"]) == 1
+        assert "amsterdam" in results["replies"][0].lower()
        assert len(results["meta"]) == 1
        assert callback.responses == results["replies"][0]
        assert callback.count_calls > 1
