feat: tool calling benchmark unified across types and prompts variety #620

Merged · 32 commits · Jul 1, 2025

Commits
bd7ccc2
feat: add new levels for prompts and system prompts
jmatejcz May 26, 2025
71e6b71
feat: adjust basic tasks to new levels
jmatejcz May 26, 2025
b5f115f
feat: manipulation tasks adjusted to new levels
jmatejcz May 26, 2025
7bb77c8
feat: adjust navigation tasks to new levels
jmatejcz May 26, 2025
295f18e
refactor: grouped args to task into pydantic model
jmatejcz May 27, 2025
e8f1631
feat: adjust custom interfaces tasks to new levels
jmatejcz May 27, 2025
81983e4
feat: adjust spatial tasks to new levels
jmatejcz May 27, 2025
2690ed6
feat: merged topics mocks in basic tasks
jmatejcz May 27, 2025
4198d6d
feat: adjusted examples and result saving
jmatejcz Jun 2, 2025
08be22a
feat: adjusted visualisation to new levels
jmatejcz Jun 2, 2025
99b5c77
feat: separate file for mocks, merged mocks from different types
jmatejcz Jun 3, 2025
74b8ec2
feat: defined more basic tasks
jmatejcz Jun 3, 2025
e3ae1e5
refactor: split predefined tasks into files
jmatejcz Jun 3, 2025
ee81ec0
feat: extra tool calls as list
jmatejcz Jun 4, 2025
eb90175
docs: adjusted docs to new changes
jmatejcz Jun 4, 2025
70cd604
style: format changes
jmatejcz Jun 4, 2025
0c8cb9d
chore: reduce the computation in example benchmarking
jmatejcz Jun 4, 2025
0b1406f
feat: task prompts more like guidance
jmatejcz Jun 5, 2025
ff851f2
feat: added Task's base prompt for result processing
jmatejcz Jun 5, 2025
b4c9e9e
feat: saving base prompt to results
jmatejcz Jun 5, 2025
14d4f9d
fix: labels in task plots
jmatejcz Jun 5, 2025
d532672
fix: passing prompt levels from user
jmatejcz Jun 6, 2025
7d01b2b
style: adjust docs tutorial
jmatejcz Jun 12, 2025
761ba5c
chore: version bump
jmatejcz Jun 12, 2025
d0a2c5f
docs: typos in docs
jmatejcz Jun 24, 2025
67d2268
refactor: removed duplicate check
jmatejcz Jun 24, 2025
308c3ce
style: change config name
jmatejcz Jun 30, 2025
c03f1cf
refactor: removed moderate level of prompt detail
jmatejcz Jul 1, 2025
8dae02c
docs: added docstrings
jmatejcz Jul 1, 2025
80e5ad2
docs: added examples and more descriptions to docs
jmatejcz Jul 1, 2025
e9ea171
docs: linked the main ToolCallingAgentBenchmarkConfig docstring in o…
jmatejcz Jul 1, 2025
3e64387
docs: updated docs and linked
jmatejcz Jul 1, 2025
25 changes: 20 additions & 5 deletions docs/simulation_and_benchmarking/rai_bench.md
@@ -109,7 +109,7 @@ The `Validator` class can combine single or multiple subtasks to create a single

### Task

A Task represents a specific prompt and set of tools available. A list of validators is assigned to validate the performance.
A Task represents specific prompts and a set of available tools. A list of validators is assigned to validate the performance.

??? info "Task class definition"

@@ -123,14 +123,29 @@ The ToolCallingAgentBenchmark class manages the execution of tasks and collects

### Available Tasks

Tasks of this benchmark are grouped by type:
There are predefined Tasks available, grouped into the following categories:

- Basic - basic usage of tools
- Basic - requires retrieving info from certain topics
- Navigation
- Spatial reasoning - questions about surroundings with images attached
- Manipulation
- Custom Interfaces - requires using messages with custom interfaces

If you want to know details about every task, visit `rai_bench/tool_calling_agent/tasks`
Every Task has an assigned `complexity`, which reflects its difficulty.

When creating a Task, you can define a few params:

```python
class TaskArgs(BaseModel):
    """Holds the configurations specified by user"""

    extra_tool_calls: int = 0
    prompt_detail: Literal["brief", "moderate", "descriptive"] = "brief"
    examples_in_system_prompt: Literal[0, 2, 5] = 0
```

- examples_in_system_prompt - How many examples are included in the system prompt.
- prompt_detail - How descriptive the Task prompt should be.
- extra_tool_calls - How many extra tool calls an agent can make and still pass the Task.
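
For example, a minimal sketch of combining these params (assuming `TaskArgs` is imported from `rai_bench.tool_calling_agent.interfaces`, as in the custom-task tutorial):

```python
from rai_bench.tool_calling_agent.interfaces import TaskArgs

# Allow 2 extra tool calls, ask for a descriptive task prompt,
# and include 2 examples in the system prompt.
args = TaskArgs(
    extra_tool_calls=2,
    prompt_detail="descriptive",
    examples_in_system_prompt=2,
)
```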

If you want to know details about every task, visit `rai_bench/tool_calling_agent/tasks`
55 changes: 38 additions & 17 deletions docs/tutorials/benchmarking.md
Original file line number Diff line number Diff line change
@@ -53,12 +53,12 @@ If your goal is creating custom tasks and scenarios, visit [Creating Custom Task
This benchmark does not require any additional setup besides the main one [Basic Setup](../setup/install.md), just run:

```bash
python src/rai_bench/rai_bench/examples/tool_calling_agent.py --model-name <model-name> --vendor <vendor> --extra-tool-calls <5> --task-types <basic> --out-dir <out_dir>
python src/rai_bench/rai_bench/examples/tool_calling_agent.py --model-name <qwen2.5:7b> --vendor <ollama> --extra-tool-calls <0 5> --task-types basic --n-shots <0 2> --prompt-detail <brief descriptive> --complexities <easy medium hard> --out-dir <out_dir>
```

!!! note

This Benchmark is significantly faster, but still if just trying out, we recommend choosing just one task-type.
This Benchmark is significantly faster, but if you are just trying it out, we still recommend choosing just one value per flag, as every combination of parameters creates additional tasks (see the sketch below).
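
As a rough illustration (not the benchmark's actual task-generation code) of how the flag values above multiply into task variants:

```python
from itertools import product

# Example flag values, mirroring the command above.
extra_tool_calls = [0, 5]
n_shots = [0, 2]
prompt_detail = ["brief", "descriptive"]

# Every base task is expanded once per parameter combination.
combinations = list(product(extra_tool_calls, n_shots, prompt_detail))
print(len(combinations))  # 8 variants per base task
```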

## Testing Models

@@ -90,12 +90,18 @@ if __name__ == "__main__":
],
repeats=1, # how many times to repeat
)
tool_conf = ToolCallingAgentBenchmarkConfig(
extra_tool_calls=5, # how many extra tool calls allowed to still pass
tool_conf = ToolCallingAgentBenchmarkConfig(
extra_tool_calls=[0, 5], # how many extra tool calls allowed to still pass
task_types=[ # what types of tasks to include
"basic",
"spatial_reasoning",
"manipulation",
"custom_interfaces",
],
N_shots=[0, 2], # examples in system prompt
prompt_detail=[ # how descriptive should task prompt be
"brief",
"moderate",
"descriptive",
],
repeats=1,
)
@@ -222,6 +228,21 @@ class ThrowObjectsOffTableTask(ManipulationTask):

incorrect: int = len(selected_type_objects) - correct
return correct, incorrect

# configure existing Task with different params
target_coords = (0.1, 0.1)
disp = 0.1
task = PlaceObjectAtCoordTask(
    obj_type="apple",
    target_position=target_coords,
    allowable_displacement=disp,
)

Scenario(
    task=task,
    scene_config=scene_config,
    scene_config_path=path_to_your_config
)
```

As `obj_type` is parameterizable, it enables various variants of this Task. Combined with the many simulation configs available, a single Task can provide dozens of scenarios.

@@ -240,23 +261,14 @@ from rai_bench.tool_calling_agent.subtasks import (
Expand All @@ -240,23 +261,14 @@ from rai_bench.tool_calling_agent.subtasks import (
from rai_bench.tool_calling_agent.validators import (
OrderedCallsValidator,
)
from rai_bench.tool_calling_agent.tasks.basic import BasicTask
from rai_bench.tool_calling_agent.mocked_tools import (
MockGetROS2TopicsNamesAndTypesTool,
)
from rai_bench.tool_calling_agent.interfaces import Task, TaskArgs
from langchain_core.tools import BaseTool
from typing import List

# configure existing Task with different params
target_coords = (0.1, 0.1)
disp = 0.1
task = PlaceObjectAtCoordTask(
obj_type="apple",
target_position=target_coords,
allowable_displacement=disp,
)

Scenario(task=task, scene_config=scene_config, scene_config_path=path_to_your_config)

# define subtask that requires
receive_robot_pos_subtask = CheckArgsToolCallSubTask(
@@ -270,7 +282,7 @@ receive_robot_pos_subtask = CheckArgsToolCallSubTask(
topics_ord_val = OrderedCallsValidator(subtasks=[receive_robot_pos_subtask])


class GetROS2RobotPositionTask(BasicTask):
class GetROS2RobotPositionTask(Task):
complexity = "easy"

@property
@@ -287,9 +299,18 @@ class GetROS2RobotPositionTask(BasicTask):
            ),
        ]

    def get_system_prompt(self) -> str:
        return "You are a ROS 2 expert that wants to solve tasks. You have access to various tools that allow you to query the ROS 2 system."

    def get_prompt(self) -> str:
        return "Get the position of the robot."

    @property
    def optional_tool_calls_number(self) -> int:
        # Listing topics before getting any message
        return 1

# optionally pass number of extra tool calls
task = GetROS2RobotPositionTask(validators=[topics_ord_val], extra_tool_calls=1)
args = TaskArgs(extra_tool_calls=0)
task = GetROS2RobotPositionTask(validators=[topics_ord_val], task_args=args)
```
2 changes: 1 addition & 1 deletion src/rai_bench/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "rai-bench"
version = "0.1.0"
version = "0.2.0"
description = "Package for running and creating benchmarks."
authors = ["Jakub Matejczyk <[email protected]>", "Magdalena Kotynia <[email protected]>"]
readme = "README.md"
18 changes: 13 additions & 5 deletions src/rai_bench/rai_bench/examples/benchmarking_models.py
@@ -20,31 +20,39 @@

if __name__ == "__main__":
# Define models you want to benchmark
model_names = ["qwen2.5:7b", "llama3.2:3b"]
vendors = ["ollama", "ollama"]
model_names = ["qwen2.5:7b"]
vendors = ["ollama"]

# Define benchmarks that will be used
man_conf = ManipulationO3DEBenchmarkConfig(
mani_conf = ManipulationO3DEBenchmarkConfig(
o3de_config_path="src/rai_bench/rai_bench/manipulation_o3de/predefined/configs/o3de_config.yaml", # path to your o3de config
levels=[ # define what difficulty of tasks to include in benchmark
"trivial",
],
repeats=1, # how many times to repeat
)
tool_conf = ToolCallingAgentBenchmarkConfig(
extra_tool_calls=5, # how many extra tool calls allowed to still pass
extra_tool_calls=[0], # how many extra tool calls allowed to still pass
task_types=[ # what types of tasks to include
"basic",
"spatial_reasoning",
# "navigation",
"custom_interfaces",
"manipulation",
],
N_shots=[2], # examples in system prompt
prompt_detail=[ # how descriptive should task prompt be
"brief",
# "moderate",
"descriptive",
],
repeats=1,
)

out_dir = "src/rai_bench/rai_bench/experiments"
test_models(
model_names=model_names,
vendors=vendors,
benchmark_configs=[man_conf, tool_conf],
benchmark_configs=[tool_conf],
out_dir=out_dir,
)
2 changes: 2 additions & 0 deletions src/rai_bench/rai_bench/examples/tool_calling_agent.py
@@ -34,6 +34,8 @@
extra_tool_calls=args.extra_tool_calls,
complexities=args.complexities,
task_types=args.task_types,
n_shots=args.n_shots,
prompt_detail=args.prompt_detail,
)
for task in tasks:
task.set_logger(bench_logger)
28 changes: 12 additions & 16 deletions src/rai_bench/rai_bench/results_processing/data_loading.py
@@ -70,20 +70,19 @@ def convert_row_to_task_result(row: pd.Series) -> TaskResult:
)
validator_results.append(validator_result)

return TaskResult(
task_prompt=row["task_prompt"],
system_prompt=row["system_prompt"],
complexity=row["complexity"],
type=row["type"],
model_name=row["model_name"],
validation_info=validator_results,
extra_tool_calls=int(row["extra_tool_calls"]),
extra_tool_calls_used=int(row["extra_tool_calls_used"]),
score=float(row["score"]),
total_time=float(row["total_time"]),
run_id=uuid.UUID(row["run_id"]),
row.update(
{
"validation_info": validator_results,
"extra_tool_calls": int(row["extra_tool_calls"]),
"extra_tool_calls_used": int(row["extra_tool_calls_used"]),
"score": float(row["score"]),
"total_time": float(row["total_time"]),
"run_id": uuid.UUID(row["run_id"]),
}
)

return TaskResult(**row)


def convert_row_to_scenario_result(row: pd.Series) -> ScenarioResult:
"""
@@ -100,10 +99,7 @@ def convert_row_to_scenario_result(row: pd.Series) -> ScenarioResult:
A ScenarioResult object
"""
return ScenarioResult(
task_prompt=row["task_prompt"],
system_prompt=row["system_prompt"],
model_name=row["model_name"],
scene_config_path=row["scene_config_path"],
**row,
score=float(row["score"]),
total_time=float(row["total_time"]),
number_of_tool_calls=int(row["number_of_tool_calls"]),
34 changes: 29 additions & 5 deletions src/rai_bench/rai_bench/results_processing/data_processing.py
@@ -181,17 +181,25 @@ def create_task_metrics_dataframe(


def create_task_details_dataframe(
model_results: ModelResults, task_type: Optional[str] = None
model_results: ModelResults,
task_type: Optional[str] = None,
complexity: Optional[str] = None,
examples_in_system_prompt: Optional[int] = None,
prompt_detail: Optional[str] = None,
) -> pd.DataFrame:
"""
Create a DataFrame with task details, optionally filtered by task type.
Create a DataFrame with task details, optionally filtered by multiple criteria.

Parameters
----------
model_results : ModelResults
The model results object
task_type : Optional[str]
Task type to filter by
complexity : Optional[str]
Complexity to filter by
examples_in_system_prompt : Optional[int]
Number of examples in the system prompt to filter by
prompt_detail : Optional[str]
Prompt detail level to filter by

Returns
-------
@@ -201,14 +209,30 @@
all_detailed_results = get_all_detailed_results_from_model_results(
model_results=model_results
)

if not all_detailed_results:
return pd.DataFrame()

# filter by task type
# Apply filters
if task_type:
all_detailed_results = [r for r in all_detailed_results if r.type == task_type]

if complexity:
all_detailed_results = [
r for r in all_detailed_results if r.complexity == complexity
]

if examples_in_system_prompt is not None:
all_detailed_results = [
r
for r in all_detailed_results
if r.examples_in_system_prompt == examples_in_system_prompt
]

if prompt_detail:
all_detailed_results = [
r for r in all_detailed_results if r.prompt_detail == prompt_detail
]

rows: List[Dict[str, Any]] = [
{
"task_prompt": result.task_prompt,
@@ -217,10 +241,10 @@
"score": result.score,
"total_time": result.total_time,
"extra_tool_calls_used": result.extra_tool_calls_used,
"examples_in_system_prompt": result.examples_in_system_prompt,
}
for result in all_detailed_results
]

return pd.DataFrame(rows)
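
# Usage sketch (illustrative, not part of this diff): `model_results` is assumed
# to be a ModelResults object already loaded from benchmark output; the new
# filters can be combined to narrow the detailed view.
filtered_df = create_task_details_dataframe(
    model_results=model_results,
    task_type="basic",
    complexity="easy",
    examples_in_system_prompt=2,
    prompt_detail="brief",
)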

