Feature: Hooks #2057

Draft · wants to merge 14 commits into main
Changes from 12 commits
53 changes: 53 additions & 0 deletions docs/extensions.qmd
@@ -14,6 +14,8 @@ There are several ways to extend Inspect to integrate with systems not directly

4. Storage Systems (for datasets, prompts, and evaluation logs)

5. Hooks (for logging and monitoring frameworks)

For each of these, you can create an extension within a Python package, and then use it without any special registration with Inspect (this is done via [setuptools entry points](https://setuptools.pypa.io/en/latest/userguide/entry_point.html)).

## Model APIs {#sec-model-api-extensions}
@@ -425,3 +427,54 @@ myfs = "evaltools:MyFs"
:::

Once this package is installed, you'll be able to use `myfs://` with Inspect without any further registration.

## Hooks

Hooks allow you to run arbitrary code during Inspect's lifecycle, for example when runs, tasks or samples start and end.

Here is a hypothetical hook for integration with Weights & Biases.

``` python
import wandb

from inspect_ai.hooks import Hooks, RunEnd, RunStart, SampleEnd, hooks

@hooks(name="w&b_hook")
class WBHook(Hooks):
async def on_run_start(self, data: RunStart) -> None:
wandb.init(name=data.run_id)

async def on_run_end(self, data: RunEnd) -> None:
wandb.finish()

async def on_sample_end(self, data: SampleEnd) -> None:
scores = {k: v.value for k, v in data.summary.scores.items()}
wandb.log({
"sample_id": data.sample_id,
"scores": scores
})
```

See the `Hooks` class for more documentation and the full list of available hooks.
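
The same pattern applies to the other lifecycle events (task and sample start/end, sample abort, and model usage). As a rough sketch, a hook that simply logs these events might look like the following. Note that the `on_task_start`, `on_task_end`, and `on_sample_start` method names are assumed here to mirror the `TaskStart`, `TaskEnd`, and `SampleStart` data classes in the reference; consult the `Hooks` class for the exact signatures.

``` python
import logging

from inspect_ai.hooks import Hooks, SampleStart, TaskEnd, TaskStart, hooks

logger = logging.getLogger(__name__)

@hooks(name="lifecycle_logger")
class LifecycleLogger(Hooks):
    async def on_task_start(self, data: TaskStart) -> None:
        # Log the raw event payload rather than assuming specific fields.
        logger.info("task started: %r", data)

    async def on_task_end(self, data: TaskEnd) -> None:
        logger.info("task ended: %r", data)

    async def on_sample_start(self, data: SampleStart) -> None:
        logger.info("sample started: %r", data)
```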

### API Key Override

There is a hook that can optionally override the value of model API key environment variables. This could be used to:

* inject API keys at runtime (e.g. fetched from a secrets manager) so that you don't have to store them in your environment or `.env` file
* use a custom authentication mechanism in conjunction with a custom reverse proxy for the model API, so that Inspect never has access to real API keys

``` python
from inspect_ai.hooks import hooks, Hooks, ApiKeyOverride

@hooks(name="api_key_fetcher")
class ApiKeyFetcher(Hooks):
def override_api_key(self, data: ApiKeyOverride) -> str | None:
original_env_var_value = data.value
if original_env_var_value.startswith("arn:aws:secretsmanager:"):
return fetch_aws_secret(original_env_var_value)
return None

def fetch_aws_secret(aws_arn: str) -> str:
...
```
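
The `fetch_aws_secret` helper is left unimplemented above. As a minimal sketch (assuming the secret is stored as a plain string and that AWS credentials are available in the environment), it could be written with `boto3`:

``` python
import boto3

def fetch_aws_secret(aws_arn: str) -> str:
    # Resolve the Secrets Manager ARN to its secret string value.
    # Assumes ambient AWS credentials and a plain-string secret.
    client = boto3.client("secretsmanager")
    response = client.get_secret_value(SecretId=aws_arn)
    return response["SecretString"]
```
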
25 changes: 25 additions & 0 deletions docs/reference/_sidebar.yml
@@ -617,6 +617,31 @@ website:
href: reference/inspect_ai.util.qmd#jsonschema
- text: json_schema
href: reference/inspect_ai.util.qmd#json_schema
- section: inspect_ai.hooks
href: reference/inspect_ai.hooks.qmd
contents:
- text: Hooks
href: reference/inspect_ai.hooks.qmd#hooks
- text: hooks
href: reference/inspect_ai.hooks.qmd#hooks
- text: ApiKeyOverride
href: reference/inspect_ai.hooks.qmd#apikeyoverride
- text: ModelUsageData
href: reference/inspect_ai.hooks.qmd#modelusagedata
- text: RunEnd
href: reference/inspect_ai.hooks.qmd#runend
- text: RunStart
href: reference/inspect_ai.hooks.qmd#runstart
- text: SampleAbort
href: reference/inspect_ai.hooks.qmd#sampleabort
- text: SampleEnd
href: reference/inspect_ai.hooks.qmd#sampleend
- text: SampleStart
href: reference/inspect_ai.hooks.qmd#samplestart
- text: TaskEnd
href: reference/inspect_ai.hooks.qmd#taskend
- text: TaskStart
href: reference/inspect_ai.hooks.qmd#taskstart
- section: Inspect CLI
href: reference/inspect_eval.qmd
contents:
2 changes: 2 additions & 0 deletions docs/reference/filter/parse.py
@@ -1,3 +1,4 @@
import sys
from dataclasses import dataclass
from itertools import islice
from pathlib import Path
@@ -268,6 +269,7 @@ def read_source(
object: Object, options: DocParseOptions
) -> tuple[str, str, list[DocstringSection]]:
# assert preconditions
sys.stderr.write(object.name + "\n")
assert isinstance(object.filepath, Path)
assert object.lineno is not None
assert object.docstring is not None
1 change: 1 addition & 0 deletions docs/reference/filter/sidebar.py
@@ -22,6 +22,7 @@
"log.qmd",
"analysis.qmd",
"util.qmd",
"hooks.qmd"
]
]

20 changes: 20 additions & 0 deletions docs/reference/inspect_ai.hooks.qmd
@@ -0,0 +1,20 @@
---
title: "inspect_ai.hooks"
---

## Registration

### Hooks
### hooks

## Hook Data

### ApiKeyOverride
### ModelUsageData
### RunEnd
### RunStart
### SampleAbort
### SampleEnd
### SampleStart
### TaskEnd
### TaskStart
2 changes: 1 addition & 1 deletion docs/scripts/post-render.sh
@@ -1,6 +1,6 @@
#!/bin/bash

files=("index" "tutorial" "options" "log-viewer" "vscode" "tasks" "datasets" "solvers" "scorers" "models" "providers" "caching" "multimodal" "reasoning" "structured" "tools" "tools-standard" "tools-mcp" "tools-custom" "sandboxing" "approval" "agents" "react-agent" "agent-custom" "agent-bridge" "human-agent" "eval-logs" "dataframe" "eval-sets" "errors-and-limits" "typing" "tracing" "parallelism" "interactivity" "extensions" "reference/inspect_ai" "reference/inspect_ai.solver" "reference/inspect_ai.tool" "reference/inspect_ai.agent" "reference/inspect_ai.scorer" "reference/inspect_ai.model" "reference/inspect_ai.agent" "reference/inspect_ai.dataset" "reference/inspect_ai.approval" "reference/inspect_ai.log" "reference/inspect_ai.analysis" "reference/inspect_ai.util" "reference/inspect_eval" "reference/inspect_eval-set" "reference/inspect_eval-retry" "reference/inspect_score" "reference/inspect_view" "reference/inspect_log" "reference/inspect_trace" "reference/inspect_sandbox" "reference/inspect_cache" "reference/inspect_list" "reference/inspect_info")
files=("index" "tutorial" "options" "log-viewer" "vscode" "tasks" "datasets" "solvers" "scorers" "models" "providers" "caching" "multimodal" "reasoning" "structured" "tools" "tools-standard" "tools-mcp" "tools-custom" "sandboxing" "approval" "agents" "react-agent" "agent-custom" "agent-bridge" "human-agent" "eval-logs" "dataframe" "eval-sets" "errors-and-limits" "typing" "tracing" "parallelism" "interactivity" "extensions" "reference/inspect_ai" "reference/inspect_ai.solver" "reference/inspect_ai.tool" "reference/inspect_ai.agent" "reference/inspect_ai.scorer" "reference/inspect_ai.model" "reference/inspect_ai.agent" "reference/inspect_ai.dataset" "reference/inspect_ai.approval" "reference/inspect_ai.log" "reference/inspect_ai.analysis" "reference/inspect_ai.util" "reference/inspect_ai.hooks" "reference/inspect_eval" "reference/inspect_eval-set" "reference/inspect_eval-retry" "reference/inspect_score" "reference/inspect_view" "reference/inspect_log" "reference/inspect_trace" "reference/inspect_sandbox" "reference/inspect_cache" "reference/inspect_list" "reference/inspect_info")


if [ "$QUARTO_PROJECT_RENDER_ALL" = "1" ]; then
2 changes: 0 additions & 2 deletions src/inspect_ai/_eval/context.py
@@ -2,7 +2,6 @@

from inspect_ai._util.dotenv import init_dotenv
from inspect_ai._util.eval_task_group import init_eval_task_group
from inspect_ai._util.hooks import init_hooks
from inspect_ai._util.logger import init_logger
from inspect_ai.approval._apply import have_tool_approval, init_tool_approval
from inspect_ai.approval._human.manager import init_human_approval_manager
@@ -28,7 +27,6 @@ def init_eval_context(
init_logger(log_level, log_level_transcript)
init_concurrency()
init_max_subprocesses(max_subprocesses)
init_hooks()
Comment (Collaborator Author): We now init hooks in `platform_init()`.

init_active_samples()
init_human_approval_manager()
init_eval_task_group(task_group)
11 changes: 10 additions & 1 deletion src/inspect_ai/_eval/eval.py
@@ -469,6 +469,8 @@ async def _eval_async_inner(
score_display: bool | None = None,
**kwargs: Unpack[GenerateConfigArgs],
) -> list[EvalLog]:
from inspect_ai.hooks._hooks import emit_run_end, emit_run_start

# only a single call to eval_async can be active at a time, this used
# to be due to running tasks switching to the task's directory, however
# that feature no longer exists so we may be able to revisit this
@@ -488,6 +490,8 @@
model_args = resolve_args(model_args)
task_args = resolve_args(task_args)

run_id = uuid()

try:
# intialise eval
model = eval_init(
@@ -609,10 +613,11 @@
# run tasks - 2 codepaths, one for the traditional task at a time
# (w/ optional multiple models) and the other for true multi-task
# (which requires different scheduling and UI)
run_id = uuid()
task_definitions = len(resolved_tasks) // len(model)
parallel = 1 if (task_definitions == 1 or max_tasks is None) else max_tasks

await emit_run_start(run_id, resolved_tasks)

# single task definition (could be multi-model) or max_tasks capped to 1
if parallel == 1:
results: list[EvalLog] = []
@@ -668,6 +673,10 @@
cleanup_sample_buffers(log_dir)

finally:
try:
await emit_run_end(run_id, logs)
except UnboundLocalError:
await emit_run_end(run_id, EvalLogs([]))
_eval_async_running = False

# return logs