
Commit 5bb931e

Kaushikdkrikhanu, tobitege, xingyaoww, and enyst authored
Add prompt caching (Sonnet, Haiku only) (#3411)
* Add prompt caching
* remove anthropic-version from extra_headers
* change supports_prompt_caching method to attribute
* change caching strat and log cache statistics
* add reminder as a new message to fix caching
* fix unit test
* append reminder to the end of the last message content
* move token logs to post completion function
* fix unit test failure
* fix reminder and prompt caching
* unit tests for prompt caching
* add test
* clean up tests
* separate reminder, use latest two messages
* fix tests

---------

Co-authored-by: tobitege <[email protected]>
Co-authored-by: Xingyao Wang <[email protected]>
Co-authored-by: Engel Nyst <[email protected]>
1 parent e72dc96 commit 5bb931e

File tree

4 files changed: +300 -29 lines changed

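In short: when the configured model supports Anthropic prompt caching, the agent now sends the anthropic-beta: prompt-caching-2024-07-31 header and marks stable message content with a cache_control: {'type': 'ephemeral'} breakpoint. The following is a minimal standalone sketch of the request shape this produces through litellm; the model name, prompt text, and user message here are placeholders, not taken from the commit:

# Minimal sketch, outside OpenHands: roughly what the cached request looks like via litellm.
# Assumes ANTHROPIC_API_KEY is set in the environment; prompt text is illustrative.
import litellm

response = litellm.completion(
    model='claude-3-5-sonnet-20240620',
    messages=[
        {
            'role': 'system',
            'content': [
                {
                    'type': 'text',
                    'text': 'You are a coding agent ...',  # long, stable system prompt
                    'cache_control': {'type': 'ephemeral'},  # cache breakpoint
                }
            ],
        },
        {'role': 'user', 'content': 'Fix the failing unit test.'},
    ],
    extra_headers={'anthropic-beta': 'prompt-caching-2024-07-31'},
    temperature=0.0,
)
print(response['usage'])  # cache statistics surface here (see the llm.py changes below)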

agenthub/codeact_agent/codeact_agent.py (+36 -25)

@@ -172,26 +172,44 @@ def step(self, state: State) -> Action:
         # prepare what we want to send to the LLM
         messages = self._get_messages(state)

-        response = self.llm.completion(
-            messages=[message.model_dump() for message in messages],
-            stop=[
+        params = {
+            'messages': [message.model_dump() for message in messages],
+            'stop': [
                 '</execute_ipython>',
                 '</execute_bash>',
                 '</execute_browse>',
             ],
-            temperature=0.0,
-        )
+            'temperature': 0.0,
+        }
+
+        if self.llm.supports_prompt_caching:
+            params['extra_headers'] = {
+                'anthropic-beta': 'prompt-caching-2024-07-31',
+            }
+
+        response = self.llm.completion(**params)
+
         return self.action_parser.parse(response)

     def _get_messages(self, state: State) -> list[Message]:
         messages: list[Message] = [
             Message(
                 role='system',
-                content=[TextContent(text=self.prompt_manager.system_message)],
+                content=[
+                    TextContent(
+                        text=self.prompt_manager.system_message,
+                        cache_prompt=self.llm.supports_prompt_caching,  # Cache system prompt
+                    )
+                ],
             ),
             Message(
                 role='user',
-                content=[TextContent(text=self.prompt_manager.initial_user_message)],
+                content=[
+                    TextContent(
+                        text=self.prompt_manager.initial_user_message,
+                        cache_prompt=self.llm.supports_prompt_caching,  # if the user asks the same query,
+                    )
+                ],
             ),
         ]

@@ -214,6 +232,16 @@ def _get_messages(self, state: State) -> list[Message]:
                 else:
                     messages.append(message)

+        # Add caching to the last 2 user messages
+        if self.llm.supports_prompt_caching:
+            user_turns_processed = 0
+            for message in reversed(messages):
+                if message.role == 'user' and user_turns_processed < 2:
+                    message.content[
+                        -1
+                    ].cache_prompt = True  # Last item inside the message content
+                    user_turns_processed += 1
+
         # the latest user message is important:
         # we want to remind the agent of the environment constraints
         latest_user_message = next(
@@ -225,25 +253,8 @@ def _get_messages(self, state: State) -> list[Message]:
             ),
             None,
         )
-
-        # Get the last user text inside content
         if latest_user_message:
-            latest_user_message_text = next(
-                (
-                    t
-                    for t in reversed(latest_user_message.content)
-                    if isinstance(t, TextContent)
-                )
-            )
-            # add a reminder to the prompt
             reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
-
-            if latest_user_message_text:
-                latest_user_message_text.text = (
-                    latest_user_message_text.text + reminder_text
-                )
-            else:
-                latest_user_message_text = TextContent(text=reminder_text)
-            latest_user_message.content.append(latest_user_message_text)
+            latest_user_message.content.append(TextContent(text=reminder_text))

         return messages
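Beyond the system prompt and the initial user message, the loop above flags the two most recent user turns for caching, so the cache breakpoint follows the conversation. That keeps the request within Anthropic's documented limit of four cache breakpoints (system prompt, initial user message, and the last two user messages). A self-contained sketch of just that marking step, using throwaway stand-in classes instead of the project's Message/TextContent models:

# Stand-in types for illustration; the real code uses openhands.core.message models.
from dataclasses import dataclass, field


@dataclass
class _Text:
    text: str
    cache_prompt: bool = False


@dataclass
class _Msg:
    role: str
    content: list[_Text] = field(default_factory=list)


def mark_last_two_user_turns(messages: list[_Msg]) -> None:
    """Flag the last content item of the two most recent user messages for caching."""
    user_turns_processed = 0
    for message in reversed(messages):
        if message.role == 'user' and user_turns_processed < 2:
            message.content[-1].cache_prompt = True
            user_turns_processed += 1


history = [
    _Msg('user', [_Text('initial task')]),
    _Msg('assistant', [_Text('<execute_bash>ls</execute_bash>')]),
    _Msg('user', [_Text('OBSERVATION: ...')]),
    _Msg('user', [_Text('please continue')]),
]
mark_last_two_user_turns(history)
print([(m.role, m.content[-1].cache_prompt) for m in history])
# [('user', False), ('assistant', False), ('user', True), ('user', True)]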

openhands/core/message.py (+10 -1)

@@ -11,6 +11,7 @@ class ContentType(Enum):

 class Content(BaseModel):
     type: ContentType
+    cache_prompt: bool = False

     @model_serializer
     def serialize_model(self):
@@ -23,7 +24,13 @@ class TextContent(Content):

     @model_serializer
     def serialize_model(self):
-        return {'type': self.type.value, 'text': self.text}
+        data: dict[str, str | dict[str, str]] = {
+            'type': self.type.value,
+            'text': self.text,
+        }
+        if self.cache_prompt:
+            data['cache_control'] = {'type': 'ephemeral'}
+        return data


 class ImageContent(Content):
@@ -35,6 +42,8 @@ def serialize_model(self):
         images: list[dict[str, str | dict[str, str]]] = []
         for url in self.image_urls:
            images.append({'type': self.type.value, 'image_url': {'url': url}})
+        if self.cache_prompt and images:
+            images[-1]['cache_control'] = {'type': 'ephemeral'}
         return images
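The cache_prompt flag only changes serialization: when it is set, the Anthropic cache_control marker is attached to the emitted dict (for ImageContent, to the last image). A quick sanity check, assuming the package is installed and TextContent keeps a default for its type field as in the repository:

from openhands.core.message import TextContent

plain = TextContent(text='hello')
cached = TextContent(text='hello', cache_prompt=True)

# Expected per the serializers above (assuming ContentType.TEXT serializes to 'text'):
print(plain.model_dump())   # {'type': 'text', 'text': 'hello'}
print(cached.model_dump())  # {'type': 'text', 'text': 'hello', 'cache_control': {'type': 'ephemeral'}}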

openhands/llm/llm.py (+44 -3)

@@ -35,6 +35,11 @@

 message_separator = '\n\n----------\n\n'

+cache_prompting_supported_models = [
+    'claude-3-5-sonnet-20240620',
+    'claude-3-haiku-20240307',
+]
+

 class LLM:
     """The LLM class represents a Language Model instance.
@@ -58,6 +63,9 @@ def __init__(
         self.config = copy.deepcopy(config)
         self.metrics = metrics if metrics is not None else Metrics()
         self.cost_metric_supported = True
+        self.supports_prompt_caching = (
+            self.config.model in cache_prompting_supported_models
+        )

         # Set up config attributes with default values to prevent AttributeError
         LLMConfig.set_missing_attributes(self.config)
@@ -184,6 +192,7 @@ def wrapper(*args, **kwargs):

             # log the response
             message_back = resp['choices'][0]['message']['content']
+
             llm_response_logger.debug(message_back)

             # post-process to log costs
@@ -421,19 +430,51 @@ def async_streaming_completion(self):
     def supports_vision(self):
         return litellm.supports_vision(self.config.model)

-    def _post_completion(self, response: str) -> None:
+    def _post_completion(self, response) -> None:
         """Post-process the completion response."""
         try:
             cur_cost = self.completion_cost(response)
         except Exception:
             cur_cost = 0
+
+        stats = ''
         if self.cost_metric_supported:
-            logger.info(
-                'Cost: %.2f USD | Accumulated Cost: %.2f USD',
+            stats = 'Cost: %.2f USD | Accumulated Cost: %.2f USD\n' % (
                 cur_cost,
                 self.metrics.accumulated_cost,
             )

+        usage = response.get('usage')
+
+        if usage:
+            input_tokens = usage.get('prompt_tokens')
+            output_tokens = usage.get('completion_tokens')
+
+            if input_tokens:
+                stats += 'Input tokens: ' + str(input_tokens) + '\n'
+
+            if output_tokens:
+                stats += 'Output tokens: ' + str(output_tokens) + '\n'
+
+            model_extra = usage.get('model_extra', {})
+
+            cache_creation_input_tokens = model_extra.get('cache_creation_input_tokens')
+            if cache_creation_input_tokens:
+                stats += (
+                    'Input tokens (cache write): '
+                    + str(cache_creation_input_tokens)
+                    + '\n'
+                )
+
+            cache_read_input_tokens = model_extra.get('cache_read_input_tokens')
+            if cache_read_input_tokens:
+                stats += (
+                    'Input tokens (cache read): ' + str(cache_read_input_tokens) + '\n'
+                )
+
+        if stats:
+            logger.info(stats)
+
     def get_token_count(self, messages):
         """Get the number of tokens in a list of messages.
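_post_completion now also logs the cache counters Anthropic returns, which the diff reads from the usage object's model_extra. A small sketch of the same stats assembly over a plain dict, with made-up numbers for a first call (cache write) and a follow-up call (cache read):

def summarize_usage(usage: dict) -> str:
    """Mirror the stats string built in _post_completion above (sketch only)."""
    stats = ''
    if usage.get('prompt_tokens'):
        stats += 'Input tokens: ' + str(usage['prompt_tokens']) + '\n'
    if usage.get('completion_tokens'):
        stats += 'Output tokens: ' + str(usage['completion_tokens']) + '\n'
    model_extra = usage.get('model_extra') or {}
    if model_extra.get('cache_creation_input_tokens'):
        stats += 'Input tokens (cache write): ' + str(model_extra['cache_creation_input_tokens']) + '\n'
    if model_extra.get('cache_read_input_tokens'):
        stats += 'Input tokens (cache read): ' + str(model_extra['cache_read_input_tokens']) + '\n'
    return stats


# Hypothetical payloads: the first request writes the cache, the next one reads it.
print(summarize_usage({'prompt_tokens': 120, 'completion_tokens': 80,
                       'model_extra': {'cache_creation_input_tokens': 3000}}))
print(summarize_usage({'prompt_tokens': 120, 'completion_tokens': 95,
                       'model_extra': {'cache_read_input_tokens': 3000}}))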
