fix: update remaining examples to build and pass an llm; drop the model_name argument and give experiment_id a default in run_benchmark

jmatejcz · jmatejcz · commit 5b47a8c23467 · 2025-06-02T08:51:27.000+02:00
diff --git a/src/rai_bench/rai_bench/examples/manipulation_o3de.py b/src/rai_bench/rai_bench/examples/manipulation_o3de.py
@@ -16,6 +16,7 @@
 
 from rai_bench import define_benchmark_logger, parse_manipulation_o3de_benchmark_args
 from rai_bench.manipulation_o3de import get_scenarios, run_benchmark
+from rai_bench.utils import get_llm_for_benchmark
 
 if __name__ == "__main__":
     args = parse_manipulation_o3de_benchmark_args()
@@ -26,9 +27,13 @@
     # import ready scenarios
     scenarios = get_scenarios(logger=bench_logger, levels=args.levels)
 
-    run_benchmark(
+    llm = get_llm_for_benchmark(
         model_name=args.model_name,
         vendor=args.vendor,
+    )
+
+    run_benchmark(
+        llm=llm,
         out_dir=experiment_dir,
         o3de_config_path=args.o3de_config_path,
         scenarios=scenarios,
diff --git a/src/rai_bench/rai_bench/examples/tool_calling_agent.py b/src/rai_bench/rai_bench/examples/tool_calling_agent.py
@@ -22,6 +22,7 @@
     get_tasks,
     run_benchmark,
 )
+from rai_bench.utils import get_llm_for_benchmark
 
 if __name__ == "__main__":
     args = parse_tool_calling_benchmark_args()
@@ -36,9 +37,14 @@
     )
     for task in tasks:
         task.set_logger(bench_logger)
-    run_benchmark(
+
+    llm = get_llm_for_benchmark(
         model_name=args.model_name,
         vendor=args.vendor,
+    )
+
+    run_benchmark(
+        llm=llm,
         out_dir=args.out_dir,
         tasks=tasks,
         bench_logger=bench_logger,
diff --git a/src/rai_bench/rai_bench/manipulation_o3de/benchmark.py b/src/rai_bench/rai_bench/manipulation_o3de/benchmark.py
@@ -52,6 +52,7 @@
     ScenarioResult,
 )
 from rai_bench.results_processing.langfuse_scores_tracing import ScoreTracingHandler
+from rai_bench.utils import get_llm_model_name
 from rai_sim.o3de.o3de_bridge import (
     O3DEngineArmManipulationBridge,
     O3DExROS2SimulationConfig,
@@ -422,15 +423,14 @@ def _setup_benchmark_environment(
 
 def run_benchmark(
     llm: BaseChatModel,
-    model_name: str,
     out_dir: Path,
     o3de_config_path: str,
     scenarios: List[Scenario],
-    experiment_id: uuid.UUID,
     bench_logger: logging.Logger,
+    experiment_id: uuid.UUID = uuid.uuid4(),
 ):
     connector, o3de, benchmark, tools = _setup_benchmark_environment(
-        o3de_config_path, model_name, scenarios, out_dir, bench_logger
+        o3de_config_path, get_llm_model_name(llm), scenarios, out_dir, bench_logger
     )
     try:
         for scenario in scenarios:
@@ -459,17 +459,20 @@ def run_benchmark(
 def run_benchmark_dual_agent(
     multimodal_llm: BaseChatModel,
     tool_calling_llm: BaseChatModel,
-    model_name: str,
     out_dir: Path,
     scenarios: List[Scenario],
     o3de_config_path: str,
-    experiment_id: uuid.UUID,
     bench_logger: logging.Logger,
+    experiment_id: uuid.UUID = uuid.uuid4(),
     m_system_prompt: Optional[str] = None,
     tool_system_prompt: Optional[str] = None,
 ):
     connector, o3de, benchmark, tools = _setup_benchmark_environment(
-        o3de_config_path, model_name, scenarios, out_dir, bench_logger
+        o3de_config_path,
+        get_llm_model_name(multimodal_llm),
+        scenarios,
+        out_dir,
+        bench_logger,
     )
     basic_tool_system_prompt = (
         "Based on the conversation call the tools with appropriate arguments"
@@ -489,7 +492,6 @@ def run_benchmark_dual_agent(
                     else basic_tool_system_prompt
                 ),
                 logger=bench_logger,
-                debug=True,
             )
 
             benchmark.run_next(agent=agent, experiment_id=experiment_id)
diff --git a/src/rai_bench/rai_bench/test_models.py b/src/rai_bench/rai_bench/test_models.py
@@ -123,7 +123,6 @@ def test_dual_agents(
                             tool_calling_llm=tool_llm,
                             m_system_prompt=m_system_prompt,
                             tool_system_prompt=tool_system_prompt,
-                            model_name=get_llm_model_name(m_llm),
                             out_dir=Path(curr_out_dir),
                             tasks=tool_calling_tasks,
                             experiment_id=experiment_id,
@@ -137,7 +136,6 @@ def test_dual_agents(
                         manipulation_o3de.run_benchmark_dual_agent(
                             multimodal_llm=m_llm,
                             tool_calling_llm=tool_llm,
-                            model_name=m_llm.get_name(),
                             out_dir=Path(curr_out_dir),
                             o3de_config_path=bench_conf.o3de_config_path,
                             scenarios=manipulation_o3de_scenarios,
@@ -195,7 +193,6 @@ def test_models(
                             )
                             tool_calling_agent.run_benchmark(
                                 llm=llm,
-                                model_name=model_name,
                                 out_dir=Path(curr_out_dir),
                                 tasks=tool_calling_tasks,
                                 experiment_id=experiment_id,
@@ -210,7 +207,6 @@ def test_models(
                             )
                             manipulation_o3de.run_benchmark(
                                 llm=llm,
-                                model_name=model_name,
                                 out_dir=Path(curr_out_dir),
                                 o3de_config_path=bench_conf.o3de_config_path,
                                 scenarios=manipulation_o3de_scenarios,
diff --git a/src/rai_bench/rai_bench/tool_calling_agent/benchmark.py b/src/rai_bench/rai_bench/tool_calling_agent/benchmark.py
@@ -41,6 +41,7 @@
 from rai_bench.tool_calling_agent.tasks.spatial import (
     SpatialReasoningAgentTask,
 )
+from rai_bench.utils import get_llm_model_name
 
 
 class ToolCallingAgentBenchmark(BaseBenchmark):
@@ -207,16 +208,15 @@ def compute_and_save_summary(self):
 
 def run_benchmark(
     llm: BaseChatModel,
-    model_name: str,
     out_dir: Path,
     tasks: List[Task],
-    experiment_id: uuid.UUID,
     bench_logger: logging.Logger,
+    experiment_id: uuid.UUID = uuid.uuid4(),
 ):
     benchmark = ToolCallingAgentBenchmark(
         tasks=tasks,
         logger=bench_logger,
-        model_name=model_name,
+        model_name=get_llm_model_name(llm),
         results_dir=out_dir,
     )
 
@@ -237,18 +237,17 @@ def run_benchmark(
 def run_benchmark_dual_agent(
     multimodal_llm: BaseChatModel,
     tool_calling_llm: BaseChatModel,
-    model_name: str,
     out_dir: Path,
     tasks: List[Task],
-    experiment_id: uuid.UUID,
     bench_logger: logging.Logger,
+    experiment_id: uuid.UUID = uuid.uuid4(),
     m_system_prompt: Optional[str] = None,
     tool_system_prompt: Optional[str] = None,
 ):
     benchmark = ToolCallingAgentBenchmark(
         tasks=tasks,
         logger=bench_logger,
-        model_name=model_name,
+        model_name=get_llm_model_name(multimodal_llm),
         results_dir=out_dir,
     )