diff --git a/examples/configs/polygraph_eval_aeslc.yaml b/examples/configs/polygraph_eval_aeslc.yaml index 8e7b4b06..e3067b6d 100644 --- a/examples/configs/polygraph_eval_aeslc.yaml +++ b/examples/configs/polygraph_eval_aeslc.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_babiqa.yaml b/examples/configs/polygraph_eval_babiqa.yaml index fee1ec64..20777949 100644 --- a/examples/configs/polygraph_eval_babiqa.yaml +++ b/examples/configs/polygraph_eval_babiqa.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_coqa.yaml b/examples/configs/polygraph_eval_coqa.yaml index 7248168c..4aaa4050 100644 --- a/examples/configs/polygraph_eval_coqa.yaml +++ b/examples/configs/polygraph_eval_coqa.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_gsm8k.yaml b/examples/configs/polygraph_eval_gsm8k.yaml index 59536e2d..4a6f5acb 100644 --- a/examples/configs/polygraph_eval_gsm8k.yaml +++ b/examples/configs/polygraph_eval_gsm8k.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_mmlu.yaml b/examples/configs/polygraph_eval_mmlu.yaml index d9fef84f..f993bab9 100644 --- a/examples/configs/polygraph_eval_mmlu.yaml +++ b/examples/configs/polygraph_eval_mmlu.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_person_bio.yaml b/examples/configs/polygraph_eval_person_bio.yaml index 5f4239e2..18186085 100644 --- a/examples/configs/polygraph_eval_person_bio.yaml +++ b/examples/configs/polygraph_eval_person_bio.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_triviaqa.yaml b/examples/configs/polygraph_eval_triviaqa.yaml index 3a816fda..25f9c1c6 100644 --- a/examples/configs/polygraph_eval_triviaqa.yaml +++ b/examples/configs/polygraph_eval_triviaqa.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_wiki_bio.yaml b/examples/configs/polygraph_eval_wiki_bio.yaml index ddf2ba37..90331dd2 100644 --- a/examples/configs/polygraph_eval_wiki_bio.yaml +++ b/examples/configs/polygraph_eval_wiki_bio.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_wmt14_deen.yaml b/examples/configs/polygraph_eval_wmt14_deen.yaml index c97d59ec..293ea6d8 100644 --- a/examples/configs/polygraph_eval_wmt14_deen.yaml +++ b/examples/configs/polygraph_eval_wmt14_deen.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_wmt14_fren.yaml b/examples/configs/polygraph_eval_wmt14_fren.yaml index 8d673888..61419c3a 100644 --- a/examples/configs/polygraph_eval_wmt14_fren.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_wmt19_deen.yaml b/examples/configs/polygraph_eval_wmt19_deen.yaml index 3bbe6f62..6d798f38 100644 --- a/examples/configs/polygraph_eval_wmt19_deen.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/examples/configs/polygraph_eval_xsum.yaml b/examples/configs/polygraph_eval_xsum.yaml index 8211b14d..1e2ea3ca 100644 --- a/examples/configs/polygraph_eval_xsum.yaml +++ b/examples/configs/polygraph_eval_xsum.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 1fd80351..20cd8e92 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -11,7 +11,7 @@ import json import logging -log = logging.getLogger() +log = logging.getLogger('lm_polygraph') from lm_polygraph.utils.manager import UEManager from lm_polygraph.utils.dataset import Dataset @@ -27,7 +27,6 @@ from lm_polygraph.ue_metrics import * hydra_config = Path(os.environ["HYDRA_CONFIG"]) - @hydra.main( version_base=None, config_path=str(hydra_config.parent), @@ -95,13 +94,19 @@ def main(args): load_from_disk=args.load_from_disk, **cache_kwargs ) + log.info("Done with loading eval data.") + log.info("="*100) + log.info("Initializing UE estimators...") estimators = [] estimators += get_ue_methods(args, model) density_based_ue_methods = get_density_based_ue_methods(args, model.model_type) estimators += density_based_ue_methods + log.info("Done loading UE estimators") if any([not getattr(method, "is_fitted", False) for method in density_based_ue_methods]): + log.info("="*100) + log.info(f"Loading train dataset...") if (args.train_dataset is not None) and ( args.train_dataset != args.dataset ): @@ -162,6 +167,7 @@ def main(args): background_train_dataset.subsample( args.subsample_background_train_dataset, seed=seed ) + log.info(f"Done loading train data.") else: train_dataset = None background_train_dataset = None @@ -169,8 +175,6 @@ def main(args): if args.subsample_eval_dataset != -1: dataset.subsample(args.subsample_eval_dataset, seed=seed) - log.info("Done with loading data.") - generation_metrics = get_generation_metrics(args) ue_metrics = get_ue_metrics(args) @@ -339,6 +343,9 @@ def get_ue_methods(args, model): def get_generation_metrics(args): + log.info("="*100) + log.info("Initializing generation metrics...") + generation_metrics = getattr(args, "generation_metrics", None) if not generation_metrics: result = [ @@ -372,6 +379,9 @@ def get_generation_metrics(args): raise ValueError("BartScoreSeqMetric does not support multiref") metric_class = globals()[metric_name] result.append(metric_class(*metric.get("args", []))) + + log.info("Done with initializing generation metrics.") + return result diff --git a/src/lm_polygraph/estimators/estimator.py b/src/lm_polygraph/estimators/estimator.py index fb3db28f..f3ae4839 100644 --- a/src/lm_polygraph/estimators/estimator.py +++ b/src/lm_polygraph/estimators/estimator.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from typing import List, Dict +from lm_polygraph.utils.common import polygraph_module_init class Estimator(ABC): @@ -9,6 +10,7 @@ class Estimator(ABC): Abstract estimator class, which estimates the uncertainty of a language model. """ + @polygraph_module_init def __init__(self, stats_dependencies: List[str], level: str): """ Parameters: diff --git a/src/lm_polygraph/estimators/lexical_similarity.py b/src/lm_polygraph/estimators/lexical_similarity.py index 8fb673f8..bcf6d11b 100644 --- a/src/lm_polygraph/estimators/lexical_similarity.py +++ b/src/lm_polygraph/estimators/lexical_similarity.py @@ -6,6 +6,11 @@ from .estimator import Estimator +from absl import logging as absl_logging + +# This prevents bullshit spam from rouge scorer +absl_logging.set_verbosity(absl_logging.WARNING) + class LexicalSimilarity(Estimator): """ diff --git a/src/lm_polygraph/generation_metrics/generation_metric.py b/src/lm_polygraph/generation_metrics/generation_metric.py index 3f6f0220..ec58820a 100644 --- a/src/lm_polygraph/generation_metrics/generation_metric.py +++ b/src/lm_polygraph/generation_metrics/generation_metric.py @@ -2,6 +2,7 @@ from typing import List, Dict from abc import ABC, abstractmethod +from lm_polygraph.utils.common import polygraph_module_init class GenerationMetric(ABC): @@ -11,6 +12,7 @@ class GenerationMetric(ABC): compared with different estimators' uncertainties in UEManager using ue_metrics. """ + @polygraph_module_init def __init__(self, stats_dependencies: List[str], level: str): """ Parameters: diff --git a/src/lm_polygraph/generation_metrics/rouge.py b/src/lm_polygraph/generation_metrics/rouge.py index afcfc794..abc2f8c8 100644 --- a/src/lm_polygraph/generation_metrics/rouge.py +++ b/src/lm_polygraph/generation_metrics/rouge.py @@ -4,6 +4,11 @@ from typing import List, Dict from .generation_metric import GenerationMetric +from absl import logging as absl_logging + +# This prevents bullshit spam from rouge scorer +absl_logging.set_verbosity(absl_logging.WARNING) + class RougeMetric(GenerationMetric): """ diff --git a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py index 8f0e400a..2ab39c62 100644 --- a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py +++ b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py @@ -42,6 +42,7 @@ def _eval_nli_model(nli_queue: List[Tuple[str, str]], deberta: Deberta) -> List[ class GreedyAlternativesNLICalculator(StatCalculator): + def __init__(self, nli_model): super().__init__( [ diff --git a/src/lm_polygraph/stat_calculators/model_score.py b/src/lm_polygraph/stat_calculators/model_score.py index d107e85f..cc6200d9 100644 --- a/src/lm_polygraph/stat_calculators/model_score.py +++ b/src/lm_polygraph/stat_calculators/model_score.py @@ -19,6 +19,7 @@ def _batch_tokens(tokens_list: List[List[int]], model: WhiteboxModel): class ModelScoreCalculator(StatCalculator): + def __init__(self, prompt: str = 'Paraphrase "{}": ', batch_size: int = 10): super().__init__(["model_rh"], ["greedy_tokens", "input_tokens"]) self.batch_size = batch_size diff --git a/src/lm_polygraph/stat_calculators/stat_calculator.py b/src/lm_polygraph/stat_calculators/stat_calculator.py index 71134d3e..eb56d5e9 100644 --- a/src/lm_polygraph/stat_calculators/stat_calculator.py +++ b/src/lm_polygraph/stat_calculators/stat_calculator.py @@ -3,6 +3,7 @@ from typing import List, Dict from abc import ABC, abstractmethod from lm_polygraph.utils.model import Model +from lm_polygraph.utils.common import polygraph_module_init class StatCalculator(ABC): @@ -20,6 +21,7 @@ class StatCalculator(ABC): Each new StatCalculator needs to be registered at lm_polygraph/stat_calculators/__init__.py to be seen be UEManager. """ + @polygraph_module_init def __init__(self, stats: List[str], stat_dependencies: List[str]): """ Parameters: diff --git a/src/lm_polygraph/utils/common.py b/src/lm_polygraph/utils/common.py new file mode 100644 index 00000000..66a6475c --- /dev/null +++ b/src/lm_polygraph/utils/common.py @@ -0,0 +1,12 @@ +import logging + +log = logging.getLogger("lm_polygraph") + + +def polygraph_module_init(func): + def wrapper(*args, **kwargs): + if func.__name__ == "__init__": + log.info(f"Initializing {args[0].__class__.__name__}") + func(*args, **kwargs) + + return wrapper diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index fd3e3e65..ceb2ea59 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -520,9 +520,7 @@ def tokenize(self, texts: List[str]) -> Dict[str, torch.Tensor]: return_token_type_ids=False, ) else: - tokenized = self.tokenizer( - texts, truncation=True, padding=True, return_tensors="pt" - ) + tokenized = self.tokenizer(texts, padding=True, return_tensors="pt") return tokenized diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py index 01e48ca8..72f9bf38 100644 --- a/src/lm_polygraph/utils/register_stat_calculators.py +++ b/src/lm_polygraph/utils/register_stat_calculators.py @@ -1,4 +1,5 @@ import os +import logging from lm_polygraph.stat_calculators import * from lm_polygraph.utils.deberta import Deberta @@ -6,6 +7,8 @@ from typing import Dict, List, Optional, Tuple +log = logging.getLogger("lm_polygraph") + def register_stat_calculators( deberta_batch_size: int = 10, # TODO: rename to NLI model @@ -20,7 +23,13 @@ def register_stat_calculators( stat_calculators: Dict[str, "StatCalculator"] = {} stat_dependencies: Dict[str, List[str]] = {} + log.info("=" * 100) + log.info("Loading NLI model...") nli_model = Deberta(batch_size=deberta_batch_size, device=deberta_device) + + log.info("=" * 100) + log.info("Initializing stat calculators...") + openai_chat = OpenAIChat(cache_path=cache_path) def _register(calculator_class: StatCalculator): @@ -75,4 +84,6 @@ def _register(calculator_class: StatCalculator): _register(GreedyAlternativesFactPrefNLICalculator(nli_model=nli_model)) _register(ClaimsExtractor(openai_chat=openai_chat)) + log.info("Done intitializing stat calculators...") + return stat_calculators, stat_dependencies diff --git a/test/configs/test_polygraph_eval.yaml b/test/configs/test_polygraph_eval.yaml index d6d47b48..507bea2d 100644 --- a/test/configs/test_polygraph_eval.yaml +++ b/test/configs/test_polygraph_eval.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/test/configs/test_polygraph_eval_ensemble.yaml b/test/configs/test_polygraph_eval_ensemble.yaml index 5693b631..064e71a5 100644 --- a/test/configs/test_polygraph_eval_ensemble.yaml +++ b/test/configs/test_polygraph_eval_ensemble.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: default + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}' diff --git a/test/configs/test_polygraph_eval_seq_ue.yaml b/test/configs/test_polygraph_eval_seq_ue.yaml index 788d710f..a088e123 100644 --- a/test/configs/test_polygraph_eval_seq_ue.yaml +++ b/test/configs/test_polygraph_eval_seq_ue.yaml @@ -4,6 +4,7 @@ hydra: defaults: - model: bloomz-560m + - _self_ cache_path: ./workdir/output save_path: '${hydra:run.dir}'