Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: evaluation leaks api_key in metadata; fix llm_config argument in run infer #2998

Merged
merged 2 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion evaluation/EDA/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
eval_output_dir = metadata.eval_output_dir
if reset_logger:
Expand Down
2 changes: 1 addition & 1 deletion evaluation/agent_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))

inst_id = instance.instance_id
question = instance.description
Expand Down
2 changes: 1 addition & 1 deletion evaluation/biocoder/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
instance = BiocoderData(**instance)
print(instance)
workspace_dir_name = (
Expand Down
2 changes: 1 addition & 1 deletion evaluation/bird/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
workspace_mount_path = os.path.join(
config.workspace_mount_path, 'bird_eval_workspace'
)
Expand Down
2 changes: 1 addition & 1 deletion evaluation/browsing_delegation/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
env_id = instance.instance_id
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
Expand Down
2 changes: 1 addition & 1 deletion evaluation/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
# create process-specific workspace dir
# we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
Expand Down
2 changes: 1 addition & 1 deletion evaluation/gpqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base
try:
Expand Down
2 changes: 1 addition & 1 deletion evaluation/humanevalfix/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base

Expand Down
2 changes: 1 addition & 1 deletion evaluation/logic_reasoning/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base

Expand Down
2 changes: 1 addition & 1 deletion evaluation/miniwob/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
env_id = instance.id
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
Expand Down
2 changes: 1 addition & 1 deletion evaluation/ml_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@


def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base
try:
Expand Down
2 changes: 1 addition & 1 deletion evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))

workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
# create process-specific workspace dir
Expand Down
2 changes: 1 addition & 1 deletion evaluation/toolqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@


def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
# create process-specific workspace dir
# we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
Expand Down
8 changes: 8 additions & 0 deletions evaluation/utils/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ class EvalMetadata(BaseModel):
data_split: str | None = None
details: dict[str, Any] | None = None

def model_dump_json(self, *args, **kwargs):
    """Serialize this metadata to JSON with sensitive LLM fields redacted.

    Overrides pydantic's ``model_dump_json`` so that any serialized copy of
    the metadata (e.g. written to evaluation output files) never contains
    the raw ``api_key`` / AWS credentials from ``llm_config``.

    Returns:
        str: JSON string of the metadata with ``llm_config`` replaced by
        its sanitized form from ``LLMConfig.to_safe_dict()``.
    """
    dumped = super().model_dump_json(*args, **kwargs)
    dumped_dict = json.loads(dumped)
    # Redact BEFORE logging: the original order logged the un-sanitized
    # dict, leaking the api_key into debug logs.
    dumped_dict['llm_config'] = self.llm_config.to_safe_dict()
    logger.debug(f'Dumped metadata: {dumped_dict}')
    return json.dumps(dumped_dict)


def codeact_user_response(
state: State,
Expand Down
2 changes: 1 addition & 1 deletion evaluation/webarena/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def process_instance(
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
env_id = instance.id
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
Expand Down
13 changes: 12 additions & 1 deletion opendevin/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
load_dotenv()


LLM_SENSITIVE_FIELDS = ['api_key', 'aws_access_key_id', 'aws_secret_access_key']


@dataclass
class LLMConfig:
"""Configuration for the LLM model.
Expand Down Expand Up @@ -86,7 +89,7 @@ def __str__(self):
attr_name = f.name
attr_value = getattr(self, f.name)

if attr_name in ['api_key', 'aws_access_key_id', 'aws_secret_access_key']:
if attr_name in LLM_SENSITIVE_FIELDS:
attr_value = '******' if attr_value else None

attr_str.append(f'{attr_name}={repr(attr_value)}')
Expand All @@ -96,6 +99,14 @@ def __str__(self):
def __repr__(self):
    # Delegate to __str__ so the debug representation also gets the
    # sensitive-field masking applied there.
    return self.__str__()

def to_safe_dict(self):
    """Return a shallow copy of the config dict with sensitive fields masked.

    Fields listed in ``LLM_SENSITIVE_FIELDS`` are replaced with ``'******'``
    when set, or ``None`` when empty/unset; all other fields are unchanged.
    """
    safe = dict(self.__dict__)
    for field_name in LLM_SENSITIVE_FIELDS:
        if field_name in safe:
            safe[field_name] = '******' if safe[field_name] else None
    return safe


@dataclass
class AgentConfig:
Expand Down