Gracefully skip BigBench tasks with no data & guard final aggregation #3066

Open

wants to merge 1 commit into base: main
148 changes: 91 additions & 57 deletions lm_eval/api/task.py
@@ -861,6 +861,16 @@ def __init__(
self._higher_is_better[metric_name] = is_higher_better(metric_name)

try:
    self.download(self.config.dataset_kwargs)
except ValueError as e:
    # e.g. "Instruction 'train' corresponds to no data!"
    eval_logger.warning(
        f"Task {self.config.task!r} download failed ({e}), marking as empty."
    )
    # build an empty default split so eval_docs == []
    from datasets import Dataset, DatasetDict

    self.dataset = DatasetDict({"default": Dataset.from_dict({})})
self._training_docs = None
self._fewshot_docs = None
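As an aside, a minimal standalone sketch of what this fallback object looks like and why it leaves the task with nothing to evaluate; only the `datasets` library is assumed, and nothing below is part of the diff itself:

from datasets import Dataset, DatasetDict

# A zero-row, zero-column dataset: the same expression the except-branch installs.
empty = DatasetDict({"default": Dataset.from_dict({})})

print(len(empty["default"]))            # 0 rows
print(list(empty["default"].features))  # no columns
print(list(empty["default"]))           # [] -> eval_docs ends up empty downstream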

@@ -923,76 +933,100 @@ def __init__(
# assign whatever eval_docs (could now be empty)
self.task_docs = self.eval_docs
if len(self.task_docs) == 0:
    # skip the "one-doc" sanity checks entirely
    self.features = []
    self.multiple_input = 0
    self.multiple_target = 0
else:
    # Test One Doc (only if we have at least one example)
    self.features = list(self.task_docs.features.keys())
    self.multiple_input = 0
    self.multiple_target = 0
    test_doc = self.task_docs[0]
    test_text = self.doc_to_text(test_doc)
    test_target = self.doc_to_target(test_doc)

    if self.config.doc_to_choice is not None:
        test_choice = self.doc_to_choice(test_doc)
        if not isinstance(test_choice, list):
            eval_logger.error("doc_to_choice must return list")
        else:
            num_choice = len(test_choice)

        if isinstance(test_text, int):
            eval_logger.debug(
                "doc_to_text returned an int. Assuming multiple inputs."
            )
            self.multiple_input = num_choice
    else:
        test_choice = None

    if isinstance(test_target, list):
        eval_logger.debug(
            "doc_to_target returned a list. Assuming multiple targets."
        )
        self.multiple_target = len(test_target)
    else:
        if (isinstance(test_target, int)) and (test_choice is not None):
            test_target = test_choice[test_target]
        else:
            test_target = str(test_target)

    if test_choice is not None:
        check_choices = test_choice
    else:
        check_choices = [test_target]
    if self.config.doc_to_choice is not None:
        for choice in check_choices:
            choice_has_whitespace = True if choice[0].isspace() else False
            delimiter_has_whitespace = (
                True
                if self.config.target_delimiter.rstrip()
                != self.config.target_delimiter
                else False
            )

            if delimiter_has_whitespace and choice_has_whitespace:
                eval_logger.debug(
                    f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" have whitespace'
                )
            elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
                eval_logger.debug(
                    f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
                )
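For readers unfamiliar with these one-doc checks, a rough self-contained illustration of the int-target resolution they perform; the document, choices, and the two helper functions below are invented stand-ins for the task's configured doc_to_choice and doc_to_target:

test_doc = {"question": "2 + 2 = ?", "choices": ["3", "4"], "label": 1}

def doc_to_choice(doc):
    # stand-in for the task's configured doc_to_choice
    return doc["choices"]

def doc_to_target(doc):
    # stand-in for the task's configured doc_to_target: an int index into choices
    return doc["label"]

test_choice = doc_to_choice(test_doc)   # ["3", "4"]
test_target = doc_to_target(test_doc)   # 1

# Same resolution as above: an int target with known choices is mapped to the
# corresponding choice string; anything else is coerced to str.
if isinstance(test_target, int) and test_choice is not None:
    test_target = test_choice[test_target]
else:
    test_target = str(test_target)

print(test_target)  # "4"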

def download(
self, dataset_kwargs: Optional[Dict[str, Any]] = None, **kwargs
) -> None:
    """Download dataset, but if HF raises ValueError about a missing split, turn it into an empty default split."""
    try:
        if isinstance(self.config.custom_dataset, Callable):
            eval_logger.warning(
                f"{self.config.task}: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager."
                + "\nFor example --metadata='{\"max_seq_lengths\":[4096, 8192]}'. For details see task Readme."
            )
            self.dataset = self.config.custom_dataset(
                **(self.config.metadata or {}),
                **(self.config.dataset_kwargs or {}),
            )
        else:
            # <-- this is where load_dataset often errors
            self.dataset = datasets.load_dataset(
                path=self.DATASET_PATH,
                name=self.DATASET_NAME,
                **(dataset_kwargs or {}),
            )
    except ValueError as e:
        # e.g. "Instruction 'train' corresponds to no data!"
        eval_logger.warning(
            f"Task {self.config.task!r} download error ({e}), using empty default split."
        )
        from datasets import Dataset, DatasetDict

        # create an empty DatasetDict with a zero-row "default" split
        self.dataset = DatasetDict({"default": Dataset.from_dict({})})
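For context, a hedged reproduction sketch of the failure mode this try/except targets; the dataset path and config name are hypothetical, but Hugging Face `datasets` raises a ValueError of this kind when a requested split resolves to zero examples:

import datasets
from datasets import Dataset, DatasetDict

try:
    # Hypothetical BigBench subtask whose configured split contains no data.
    dataset = datasets.load_dataset("some/bigbench-dataset", "empty_subtask")
except ValueError as e:
    # e.g. "Instruction 'train' corresponds to no data!"
    print(f"download error ({e}), using empty default split")
    dataset = DatasetDict({"default": Dataset.from_dict({})})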


def has_training_docs(self) -> bool:
if self.config.training_split is not None:
Expand Down
47 changes: 25 additions & 22 deletions lm_eval/evaluator.py
@@ -691,11 +691,14 @@ def evaluate(
) = consolidate_results(eval_tasks)

### Calculate group metrics ###
# default to no group-table if there are no results
show_group_table = False
if bool(results):
results, versions, show_group_table, *_ = consolidate_group_results(
results, versions, task_dict
)


results_agg, group_agg = prepare_print_tasks(task_dict, results)
subtask_list = get_subtask_list(task_dict)
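To make the guard concrete, a small sketch of the degenerate path when no task produced results; the variable names mirror the code above, but the values are invented:

results = {}                 # nothing to consolidate, e.g. every task was empty
show_group_table = False     # the new default: no group table unless set later

if bool(results):
    # consolidate_group_results(...) only runs when there is something to group
    pass

group_agg = {}
results_dict = {
    "results": results,
    # "groups" is included only when there is a populated group table to show
    **({"groups": group_agg} if bool(group_agg) and show_group_table else {}),
}

assert "groups" not in results_dict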

@@ -724,28 +727,28 @@ def evaluate(
higher_is_better[group] = _higher_is_better

results_dict = {
    "results": dict(results_agg.items()),
    **(
        {"groups": dict(group_agg.items())}
        if bool(group_agg) and show_group_table
        else {}
    ),
    "group_subtasks": dict(reversed(subtask_list.items())),
    "configs": dict(sorted(configs.items())),
    "versions": dict(sorted(versions.items())),
    "n-shot": dict(sorted(num_fewshot.items())),
    "higher_is_better": dict(sorted(higher_is_better.items())),
    "n-samples": {
        task_output.task_name: {
            "original": len(task_output.task.eval_docs),
            "effective": min(
                limit if limit else len(task_output.task.eval_docs),
                len(task_output.task.eval_docs),
            ),
        }
        for task_output, limit in zip(eval_tasks, limits)
    },
}
if log_samples:
results_dict["samples"] = dict(samples)
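On the condition change from `bool(group_agg) & show_group_table` to `bool(group_agg) and show_group_table`, a short plain-Python comparison of the two operators (not harness-specific):

group_agg = {"some_group": {"acc": 0.5}}
show_group_table = False

# Bitwise & always evaluates both operands and performs an integer/bool AND.
print(bool(group_agg) & show_group_table)    # False

# Logical `and` short-circuits and reads as a boolean condition.
print(bool(group_agg) and show_group_table)  # False

# Precedence is the usual pitfall with &: it binds tighter than comparisons,
# so `x == 1 & flag` parses as `x == (1 & flag)`, not `(x == 1) & flag`.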
