IINemo · IINemo · Jul 1, 2024 · Jun 12, 2024 · Jun 13, 2024 · Jun 13, 2024
diff --git a/examples/configs/polygraph_eval_aeslc.yaml b/examples/configs/polygraph_eval_aeslc.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_babiqa.yaml b/examples/configs/polygraph_eval_babiqa.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_coqa.yaml b/examples/configs/polygraph_eval_coqa.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_gsm8k.yaml b/examples/configs/polygraph_eval_gsm8k.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_mmlu.yaml b/examples/configs/polygraph_eval_mmlu.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_person_bio.yaml b/examples/configs/polygraph_eval_person_bio.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_triviaqa.yaml b/examples/configs/polygraph_eval_triviaqa.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_wiki_bio.yaml b/examples/configs/polygraph_eval_wiki_bio.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_wmt14_deen.yaml b/examples/configs/polygraph_eval_wmt14_deen.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_wmt14_fren.yaml b/examples/configs/polygraph_eval_wmt14_fren.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_wmt19_deen.yaml b/examples/configs/polygraph_eval_wmt19_deen.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/examples/configs/polygraph_eval_xsum.yaml b/examples/configs/polygraph_eval_xsum.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
@@ -11,7 +11,7 @@ import json
 
 import logging
 
-log = logging.getLogger()
+log = logging.getLogger('lm_polygraph')
 
 from lm_polygraph.utils.manager import UEManager
 from lm_polygraph.utils.dataset import Dataset
@@ -27,7 +27,6 @@ from lm_polygraph.ue_metrics import *
 
 hydra_config = Path(os.environ["HYDRA_CONFIG"])
 
-
 @hydra.main(
     version_base=None,
     config_path=str(hydra_config.parent),
@@ -95,13 +94,19 @@ def main(args):
             load_from_disk=args.load_from_disk,
             **cache_kwargs
         )
+        log.info("Done with loading eval data.")
 
+        log.info("="*100)
+        log.info("Initializing UE estimators...")
         estimators = []
         estimators += get_ue_methods(args, model)
         density_based_ue_methods = get_density_based_ue_methods(args, model.model_type)
         estimators += density_based_ue_methods
+        log.info("Done loading UE estimators")
 
         if any([not getattr(method, "is_fitted", False) for method in density_based_ue_methods]):
+            log.info("="*100)
+            log.info(f"Loading train dataset...")
             if (args.train_dataset is not None) and (
                     args.train_dataset != args.dataset
             ):
@@ -162,15 +167,14 @@ def main(args):
                 background_train_dataset.subsample(
                     args.subsample_background_train_dataset, seed=seed
                 )
+            log.info(f"Done loading train data.")
         else:
             train_dataset = None
             background_train_dataset = None
 
         if args.subsample_eval_dataset != -1:
             dataset.subsample(args.subsample_eval_dataset, seed=seed)
 
-        log.info("Done with loading data.")
-
         generation_metrics = get_generation_metrics(args)
 
         ue_metrics = get_ue_metrics(args)
@@ -339,6 +343,9 @@ def get_ue_methods(args, model):
 
 
 def get_generation_metrics(args):
+    log.info("="*100)
+    log.info("Initializing generation metrics...")
+
     generation_metrics = getattr(args, "generation_metrics", None)
     if not generation_metrics:
         result = [
@@ -372,6 +379,9 @@ def get_generation_metrics(args):
                 raise ValueError("BartScoreSeqMetric does not support multiref")
             metric_class = globals()[metric_name]
             result.append(metric_class(*metric.get("args", [])))
+
+    log.info("Done with initializing generation metrics.")
+
     return result
 
 

diff --git a/src/lm_polygraph/estimators/estimator.py b/src/lm_polygraph/estimators/estimator.py
@@ -2,13 +2,15 @@
 
 from abc import ABC, abstractmethod
 from typing import List, Dict
+from lm_polygraph.utils.common import polygraph_module_init
 
 
 class Estimator(ABC):
     """
     Abstract estimator class, which estimates the uncertainty of a language model.
     """
 
+    @polygraph_module_init
     def __init__(self, stats_dependencies: List[str], level: str):
         """
         Parameters:

diff --git a/src/lm_polygraph/estimators/lexical_similarity.py b/src/lm_polygraph/estimators/lexical_similarity.py
@@ -6,6 +6,11 @@
 
 from .estimator import Estimator
 
+from absl import logging as absl_logging
+
+# Suppress INFO-level absl log spam emitted by the rouge scorer
+absl_logging.set_verbosity(absl_logging.WARNING)
+
 
 class LexicalSimilarity(Estimator):
     """

diff --git a/src/lm_polygraph/generation_metrics/generation_metric.py b/src/lm_polygraph/generation_metrics/generation_metric.py
@@ -2,6 +2,7 @@
 
 from typing import List, Dict
 from abc import ABC, abstractmethod
+from lm_polygraph.utils.common import polygraph_module_init
 
 
 class GenerationMetric(ABC):
@@ -11,6 +12,7 @@ class GenerationMetric(ABC):
     compared with different estimators' uncertainties in UEManager using ue_metrics.
     """
 
+    @polygraph_module_init
     def __init__(self, stats_dependencies: List[str], level: str):
         """
         Parameters:

diff --git a/src/lm_polygraph/generation_metrics/rouge.py b/src/lm_polygraph/generation_metrics/rouge.py
@@ -4,6 +4,11 @@
 from typing import List, Dict
 from .generation_metric import GenerationMetric
 
+from absl import logging as absl_logging
+
+# Suppress INFO-level absl log spam emitted by the rouge scorer
+absl_logging.set_verbosity(absl_logging.WARNING)
+
 
 class RougeMetric(GenerationMetric):
     """

diff --git a/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py b/src/lm_polygraph/stat_calculators/greedy_alternatives_nli.py
@@ -42,6 +42,7 @@ def _eval_nli_model(nli_queue: List[Tuple[str, str]], deberta: Deberta) -> List[
 
 
 class GreedyAlternativesNLICalculator(StatCalculator):
+
     def __init__(self, nli_model):
         super().__init__(
             [

diff --git a/src/lm_polygraph/stat_calculators/model_score.py b/src/lm_polygraph/stat_calculators/model_score.py
@@ -19,6 +19,7 @@ def _batch_tokens(tokens_list: List[List[int]], model: WhiteboxModel):
 
 
 class ModelScoreCalculator(StatCalculator):
+
     def __init__(self, prompt: str = 'Paraphrase "{}": ', batch_size: int = 10):
         super().__init__(["model_rh"], ["greedy_tokens", "input_tokens"])
         self.batch_size = batch_size

diff --git a/src/lm_polygraph/stat_calculators/stat_calculator.py b/src/lm_polygraph/stat_calculators/stat_calculator.py
@@ -3,6 +3,7 @@
 from typing import List, Dict
 from abc import ABC, abstractmethod
 from lm_polygraph.utils.model import Model
+from lm_polygraph.utils.common import polygraph_module_init
 
 
 class StatCalculator(ABC):
@@ -20,6 +21,7 @@ class StatCalculator(ABC):
     Each new StatCalculator needs to be registered at lm_polygraph/stat_calculators/__init__.py to be seen be UEManager.
     """
 
+    @polygraph_module_init
     def __init__(self, stats: List[str], stat_dependencies: List[str]):
         """
         Parameters:

diff --git a/src/lm_polygraph/utils/common.py b/src/lm_polygraph/utils/common.py
@@ -0,0 +1,30 @@
+import functools
+import logging
+
+log = logging.getLogger("lm_polygraph")
+
+
+def polygraph_module_init(func):
+    """Decorate a method so that constructor calls are logged.
+
+    When the wrapped function is named ``__init__``, an INFO record naming
+    the class of the instance being constructed is written to the
+    ``lm_polygraph`` logger before the call. Functions with any other name
+    are called without logging.
+
+    Parameters:
+        func: the function to wrap (typically a class ``__init__``).
+    Returns:
+        A wrapper that keeps ``func``'s metadata and returns its result.
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        # args[0] is the instance under construction; its runtime class is
+        # logged, so subclasses report their own name.
+        if func.__name__ == "__init__" and args:
+            log.info("Initializing %s", args[0].__class__.__name__)
+        # Propagate the result so non-__init__ functions keep their value.
+        return func(*args, **kwargs)
+
+    return wrapper
diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
@@ -520,9 +520,7 @@ def tokenize(self, texts: List[str]) -> Dict[str, torch.Tensor]:
                 return_token_type_ids=False,
             )
         else:
-            tokenized = self.tokenizer(
-                texts, truncation=True, padding=True, return_tensors="pt"
-            )
+            tokenized = self.tokenizer(texts, padding=True, return_tensors="pt")
 
         return tokenized
 

diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py
@@ -1,11 +1,14 @@
 import os
+import logging
 
 from lm_polygraph.stat_calculators import *
 from lm_polygraph.utils.deberta import Deberta
 from lm_polygraph.utils.openai_chat import OpenAIChat
 
 from typing import Dict, List, Optional, Tuple
 
+log = logging.getLogger("lm_polygraph")
+
 
 def register_stat_calculators(
     deberta_batch_size: int = 10,  # TODO: rename to NLI model
@@ -20,7 +23,13 @@ def register_stat_calculators(
     stat_calculators: Dict[str, "StatCalculator"] = {}
     stat_dependencies: Dict[str, List[str]] = {}
 
+    log.info("=" * 100)
+    log.info("Loading NLI model...")
     nli_model = Deberta(batch_size=deberta_batch_size, device=deberta_device)
+
+    log.info("=" * 100)
+    log.info("Initializing stat calculators...")
+
     openai_chat = OpenAIChat(cache_path=cache_path)
 
     def _register(calculator_class: StatCalculator):
@@ -75,4 +84,6 @@ def _register(calculator_class: StatCalculator):
     _register(GreedyAlternativesFactPrefNLICalculator(nli_model=nli_model))
     _register(ClaimsExtractor(openai_chat=openai_chat))
 
+    log.info("Done intitializing stat calculators...")
+
     return stat_calculators, stat_dependencies
diff --git a/test/configs/test_polygraph_eval.yaml b/test/configs/test_polygraph_eval.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/test/configs/test_polygraph_eval_ensemble.yaml b/test/configs/test_polygraph_eval_ensemble.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: default
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'

diff --git a/test/configs/test_polygraph_eval_seq_ue.yaml b/test/configs/test_polygraph_eval_seq_ue.yaml
@@ -4,6 +4,7 @@ hydra:
 
 defaults:
   - model: bloomz-560m
+  - _self_
 
 cache_path: ./workdir/output
 save_path: '${hydra:run.dir}'
-Original file line number
+Diff line change
@@ Expand Up @@
     class GreedyAlternativesNLICalculator(StatCalculator):
         def __init__(self, nli_model):
             super().__init__(
                 [
@@ Expand Down @@