
Commit f7b68eb

Author: Seb
negate the result and prefix the metric name for error/loss metrics (#278)
* negate the result and prefix the metric name for error/loss metrics
* use the occasion to document available metrics and add more of them
* docstring
* fixed reports library + notebook to accommodate the changes
* small cleanup in reports notebook
* fixed wrong search/replace in reporting notebook
1 parent 3ba4030 commit f7b68eb

File tree: 5 files changed, +181 −159 lines

amlb/datautils.py

Lines changed: 3 additions & 1 deletion
@@ -14,7 +14,9 @@
 import numpy as np
 import pandas as pd
 from sklearn.base import TransformerMixin
-from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score  # just aliasing
+from sklearn.metrics import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, \
+    log_loss, mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, \
+    r2_score, roc_auc_score  # just aliasing
 from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder
 
 from .utils import profile, path_from_split, repr_def, split_path, touch

amlb/results.py

Lines changed: 74 additions & 18 deletions
@@ -16,7 +16,9 @@
 import pandas as pd
 
 from .data import Dataset, DatasetType, Feature
-from .datautils import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score, read_csv, write_csv, is_data_frame, to_data_frame
+from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \
+    mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, r2_score, roc_auc_score, \
+    read_csv, write_csv, is_data_frame, to_data_frame
 from .resources import get as rget, config as rconfig, output_dirs
 from .utils import Namespace, backup_file, cached, datetime_iso, json_load, memoize, profile
 
@@ -395,6 +397,10 @@ def do_score(m):
         for metric in metadata.metrics or []:
             scores[metric] = do_score(metric)
         scores.result = scores[scores.metric] if scores.metric in scores else do_score(scores.metric)
+        if not higher_is_better(scores.metric):
+            scores.metric = f"neg_{scores.metric}"
+            scores.result = - scores.result
+
         scores.info = result.info
         if scoring_errors:
             scores.info = "; ".join(filter(lambda it: it, [scores.info, *scoring_errors]))
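The net effect on a reported score, as a minimal sketch; the metric name and value below are hypothetical, and the set-based check merely stands in for the real `higher_is_better` regex added further down in this file:

    # minimal sketch of the negate-and-prefix behaviour introduced above
    scores = {'metric': 'logloss', 'result': 0.35}             # hypothetical raw score

    def higher_is_better(metric):                              # simplified stand-in for the regex check below
        return metric in ('auc', 'acc', 'balacc', 'r2', 'f1')

    if not higher_is_better(scores['metric']):
        scores['metric'] = f"neg_{scores['metric']}"
        scores['result'] = - scores['result']

    print(scores)   # {'metric': 'neg_logloss', 'result': -0.35}: losses are now reported as higher-is-better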
@@ -454,6 +460,8 @@ def __init__(self, error):
 
 class ClassificationResult(Result):
 
+    multi_class_average = 'weighted'  # used by metrics like fbeta or auc
+
     def __init__(self, predictions_df, info=None):
         super().__init__(predictions_df, info)
         self.classes = self.df.columns[:-2].values.astype(str, copy=False)
@@ -465,42 +473,80 @@ def __init__(self, predictions_df, info=None):
         self.labels = self._autoencode(self.classes)
 
     def acc(self):
+        """Accuracy"""
         return float(accuracy_score(self.truth, self.predictions))
 
-    def balacc(self):
-        return float(balanced_accuracy_score(self.truth, self.predictions))
-
     def auc(self):
+        """Area Under (ROC) Curve, computed on probabilities, not on predictions"""
         if self.type != DatasetType.binary:
-            # raise ValueError("AUC metric is only supported for binary classification: {}.".format(self.classes))
-            log.warning("AUC metric is only supported for binary classification: %s.", self.labels)
+            log.warning("For multiclass problems, please use `auc_ovr` or `auc_ovo` metrics instead of `auc`.")
             return nan
-        return float(roc_auc_score(self.truth, self.probabilities[:, 1], labels=self.labels))
+        return float(roc_auc_score(self.truth, self.probabilities[:, 1]))
 
-    def cm(self):
-        return confusion_matrix(self.truth, self.predictions, labels=self.labels)
+    def auc_ovo(self):
+        """AUC One-vs-One"""
+        return self._auc_multi(mc='ovo')
 
-    def _per_class_errors(self):
-        return [(s-d)/s for s, d in ((sum(r), r[i]) for i, r in enumerate(self.cm()))]
+    def auc_ovr(self):
+        """AUC One-vs-Rest"""
+        return self._auc_multi(mc='ovr')
 
-    def mean_pce(self):
-        """mean per class error"""
-        return statistics.mean(self._per_class_errors())
+    def balacc(self):
+        """Balanced accuracy"""
+        return float(balanced_accuracy_score(self.truth, self.predictions))
 
-    def max_pce(self):
-        """max per class error"""
-        return max(self._per_class_errors())
+    def f05(self):
+        """F-beta 0.5"""
+        return self._fbeta(0.5)
 
     def f1(self):
-        return float(f1_score(self.truth, self.predictions, labels=self.labels))
+        """F-beta 1"""
+        return self._fbeta(1)
+
+    def f2(self):
+        """F-beta 2"""
+        return self._fbeta(2)
 
     def logloss(self):
+        """Log Loss"""
         return float(log_loss(self.truth, self.probabilities, labels=self.labels))
 
+    def max_pce(self):
+        """Max per Class Error"""
+        return max(self._per_class_errors())
+
+    def mean_pce(self):
+        """Mean per Class Error"""
+        return statistics.mean(self._per_class_errors())
+
+    def pr_auc(self):
+        """Precision Recall AUC"""
+        if self.type != DatasetType.binary:
+            log.warning("PR AUC metric is only available for binary problems.")
+            return nan
+        # precision, recall, thresholds = precision_recall_curve(self.truth, self.probabilities[:, 1])
+        # return float(auc(recall, precision))
+        return float(average_precision_score(self.truth, self.probabilities[:, 1]))
+
     def _autoencode(self, vec):
         needs_encoding = not _encode_predictions_and_truth_ or (isinstance(vec[0], str) and not vec[0].isdigit())
         return self.target.label_encoder.transform(vec) if needs_encoding else vec
 
+    def _auc_multi(self, mc='raise'):
+        average = ClassificationResult.multi_class_average
+        return float(roc_auc_score(self.truth, self.probabilities, average=average, labels=self.labels, multi_class=mc))
+
+    def _cm(self):
+        return confusion_matrix(self.truth, self.predictions, labels=self.labels)
+
+    def _fbeta(self, beta):
+        average = ClassificationResult.multi_class_average if self.type == DatasetType.multiclass else 'binary'
+        return float(fbeta_score(self.truth, self.predictions, beta=beta, average=average, labels=self.labels))
+
+    def _per_class_errors(self):
+        return [(s-d)/s for s, d in ((sum(r), r[i]) for i, r in enumerate(self._cm()))]
+
+
 
 class RegressionResult(Result):
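A standalone sketch (not part of the commit) of the sklearn calls that the new `_fbeta` and `_auc_multi` helpers wrap, on a toy 3-class problem; the labels and probabilities are made up:

    import numpy as np
    from sklearn.metrics import fbeta_score, roc_auc_score

    truth = np.array([0, 1, 2, 2, 1, 0])
    preds = np.array([0, 2, 2, 2, 1, 0])
    probs = np.array([[.8, .1, .1],
                      [.2, .3, .5],
                      [.1, .2, .7],
                      [.1, .1, .8],
                      [.2, .6, .2],
                      [.7, .2, .1]])

    # _fbeta: 'weighted' average for multiclass problems, 'binary' otherwise
    print(fbeta_score(truth, preds, beta=0.5, average='weighted', labels=[0, 1, 2]))

    # _auc_multi: one-vs-rest / one-vs-one AUC computed on the predicted probabilities
    print(roc_auc_score(truth, probs, average='weighted', multi_class='ovr', labels=[0, 1, 2]))
    print(roc_auc_score(truth, probs, average='weighted', multi_class='ovo', labels=[0, 1, 2]))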

@@ -511,24 +557,34 @@ def __init__(self, predictions_df, info=None):
         self.type = DatasetType.regression
 
     def mae(self):
+        """Mean Absolute Error"""
         return float(mean_absolute_error(self.truth, self.predictions))
 
     def mse(self):
+        """Mean Squared Error"""
         return float(mean_squared_error(self.truth, self.predictions))
 
     def msle(self):
+        """Mean Squared Logarithmic Error"""
         return float(mean_squared_log_error(self.truth, self.predictions))
 
     def rmse(self):
+        """Root Mean Square Error"""
         return math.sqrt(self.mse())
 
     def rmsle(self):
+        """Root Mean Square Logarithmic Error"""
         return math.sqrt(self.msle())
 
     def r2(self):
+        """R^2"""
         return float(r2_score(self.truth, self.predictions))
 
 
+def higher_is_better(metric):
+    return re.fullmatch(r"((pr_)?auc(_\w*)?)|(\w*acc)|(f\d+)|(r2)", metric)
+
+
 _encode_predictions_and_truth_ = False
 
 save_predictions = TaskResult.save_predictions
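For reference, a small standalone check (not part of the commit) of how the `higher_is_better` regex above classifies the metrics defined in this file; everything it rejects is now reported negated and prefixed with `neg_`:

    import re

    def higher_is_better(metric):
        return re.fullmatch(r"((pr_)?auc(_\w*)?)|(\w*acc)|(f\d+)|(r2)", metric)

    for m in ['acc', 'balacc', 'auc', 'auc_ovo', 'auc_ovr', 'pr_auc', 'f05', 'f1', 'f2',
              'logloss', 'max_pce', 'mean_pce', 'mae', 'mse', 'msle', 'rmse', 'rmsle', 'r2']:
        print(m, '->', 'higher is better' if higher_is_better(m) else f'reported as neg_{m}')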

amlb_report/results.py

Lines changed: 24 additions & 41 deletions
@@ -1,11 +1,12 @@
 """
-Loading results, formatting and adding columns
-result is the raw result metric computed from predictions at the end the benchmark. For classification problems, it is usually auc for binomial classification and logloss for multinomial classification.
-score ensures a standard comparison between tasks: higher is always better.
-norm_score is a normalization of score on a [0, 1] scale, with {{zero_one_refs[0]}} score as 0 and {{zero_one_refs[1]}} score as 1.
-imp_result and imp_score for imputed results/scores. Given a task and a framework:
-  if all folds results/scores are missing, then no imputation occurs, and the result is nan for each fold.
-  if only some folds results/scores are missing, then the missing result is imputed by the {{imp_framework}} result for this fold.
+Loading results, formatting and adding columns.
+result is the raw result metric computed from predictions at the end the benchmark: higher is always better!
+ - For classification problems, it is usually auc for binary problems and negative log loss for multiclass problems.
+ - For regression problems, it is usually negative rmse.
+norm_result is a normalization of result on a [0, 1] scale, with {{zero_one_refs[0]}} scoring as 0 and {{zero_one_refs[1]}} scoring as 1.
+imp_result for imputed results. Given a task and a framework:
+ - if all folds results are missing, then no imputation occurs, and the result is nan for each fold.
+ - if only some folds results are missing, then the missing result is imputed by the {{imp_framework}} result for this fold.
 """
 
 import numpy as np
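The fold-wise imputation rule described in the docstring, sketched on a toy frame (this illustrates the rule only, not the library's `impute_result` implementation; framework and task names are made up):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        'framework': ['autosklearn'] * 2 + ['h2oautoml'] * 2 + ['constantpredictor'] * 2,
        'task':      ['adult'] * 6,
        'fold':      [0, 1] * 3,
        'result':    [0.91, np.nan, np.nan, np.nan, 0.62, 0.60],   # auc: higher is better
    })

    imp_framework = 'constantpredictor'                            # hypothetical reference framework
    ref = df[df.framework == imp_framework].set_index('fold').result

    def impute(group):
        if group.result.isna().all():                      # missing on every fold: leave as nan
            return group.result
        return group.result.fillna(group.fold.map(ref))    # fill each missing fold from the reference

    df['imp_result'] = df.groupby('framework', group_keys=False).apply(impute)
    print(df)
    # autosklearn fold 1 is filled with 0.60 (constantpredictor's fold-1 result);
    # h2oautoml stays nan on both folds because it has no result at all.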
@@ -52,35 +53,21 @@ def imputed(row):
     return pd.isna(row.result) and pd.notna(row.imp_result)
 
 
-fit_metrics = ['auc', 'acc', 'r2']
-
-
-def metric_type(row, res_col='result'):
-    return 'fit' if any([row[res_col] == getattr(row, m, None) for m in fit_metrics]) else 'loss'
-
-
-def score(row, res_col='result'):
-    return (row[res_col] if row['metric_type'] == 'fit'
-            else - row[res_col])
-
-
-def norm_score(row, score_col='score',
-               zero_one_refs=None, ref_results=None,
-               aggregation=None):
+def norm_result(row, res_col='result', zero_one_refs=None, ref_results=None, aggregation=None):
     if zero_one_refs is None:
-        return row[score_col]
+        return row[res_col]
 
     def get_val(ref, default):
         try:
             if isinstance(ref, str):
                 return (ref_results.loc[(ref_results.framework == ref)
                                         & (ref_results.task == row.task)]
-                        [score_col]
+                        [res_col]
                         .agg(aggregation) if aggregation
                         else ref_results.loc[(ref_results.framework == ref)
                                              & (ref_results.task == row.task)
                                              & (ref_results.fold == row.fold)]
-                        [score_col]
+                        [res_col]
                         .item())
             else:
                 return ref
@@ -89,9 +76,9 @@ def get_val(ref, default):
             # return default
 
     zero, one = (get_val(ref, i) for i, ref in enumerate(zero_one_refs))
-    rel_score = (row[score_col] - zero) / (one - zero)
-    return (- rel_score if row['metric_type'] == 'loss' and one < 0 <= zero
-            else rel_score)
+    norm_res = (row[res_col] - zero) / (one - zero)
+    return (- norm_res if row['metric'].startswith("neg_") and one < 0 <= zero
+            else norm_res)
 
 
 def sorted_ints(arr):
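A standalone sketch (not part of the commit) exercising `norm_result` as defined above on a hand-built reference frame; the framework and task names and all values are made up:

    import pandas as pd
    from amlb_report.results import norm_result

    ref_results = pd.DataFrame({
        'framework': ['constantpredictor', 'tunedrandomforest'] * 2,
        'task':      ['iris'] * 4,
        'fold':      [0, 0, 1, 1],
        'result':    [-1.10, -0.15, -1.05, -0.20],    # neg_logloss: higher is better
        'metric':    ['neg_logloss'] * 4,
    })
    row = pd.Series({'framework': 'h2oautoml', 'task': 'iris', 'fold': 0,
                     'result': -0.30, 'metric': 'neg_logloss'})

    print(norm_result(row, 'result',
                      zero_one_refs=('constantpredictor', 'tunedrandomforest'),
                      ref_results=ref_results))
    # (-0.30 - -1.10) / (-0.15 - -1.10) ≈ 0.84: most of the way from the zero to the one reference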
@@ -117,7 +104,8 @@ def prepare_results(results,
                     imputation=None,
                     normalization=None,
                     ref_results=None,
-                    duplicates_handling='fail'  # other options are 'keep_first', 'keep_last', 'keep_none'
+                    duplicates_handling='fail',  # other options are 'keep_first', 'keep_last', 'keep_none'
+                    include_metadata=False
                     ):
     if results is None or len(results) == 0:
         return None
@@ -139,7 +127,7 @@ def prepare_results(results,
 
     folds = results.fold.unique()
 
-    metadata = load_dataset_metadata(results)
+    metadata = load_dataset_metadata(results) if include_metadata else {}
 
     done = results.set_index(['task', 'fold', 'framework'])
     done = remove_duplicates(done, handling=duplicates_handling)
@@ -158,9 +146,8 @@
 
     # extending the data frame
     results = results.append(missing.reset_index())
-    results['type'] = [task_prop(row, metadata, 'type') for _, row in results.iterrows()]
-    results['metric_type'] = [metric_type(row) for _, row in results.iterrows()]
-    results['score'] = [score(row) for _, row in results.iterrows()]
+    if 'type' not in results:
+        results['type'] = [task_prop(row, metadata, 'type') for _, row in results.iterrows()]
 
     if ref_results is None:
         ref_results = results
@@ -177,18 +164,14 @@
                                              imp_framework=imp_fr, imp_results=ref_results,
                                              imp_value=imp_val, aggregation=aggr)
                               for _, row in results.iterrows()]
-        results['imp_score'] = [impute_result(row, results, 'score',
-                                              imp_framework=imp_fr, imp_results=ref_results,
-                                              imp_value=imp_val, aggregation=aggr)
-                                for _, row in results.iterrows()]
 
     if normalization is not None:
-        score_col = 'imp_score' if imputation is not None else 'score'
+        res_col = 'imp_result' if imputation is not None else 'result'
         zero_one = normalization[0:2]
         aggr = normalization[2] if len(normalization) > 2 else None
-        results['norm_score'] = [norm_score(row, score_col,
-                                            zero_one_refs=zero_one, ref_results=ref_results, aggregation=aggr)
-                                 for _, row in results.iterrows()]
+        results['norm_result'] = [norm_result(row, res_col,
+                                              zero_one_refs=zero_one, ref_results=ref_results, aggregation=aggr)
+                                  for _, row in results.iterrows()]
 
     return Namespace(
         results=results,
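A hedged usage sketch of the reworked entry point; the column names (framework, task, fold, result, metric) come from the code above, but the file name, reference framework names, and the exact shape of the input frame are assumptions:

    import pandas as pd
    from amlb_report.results import prepare_results

    # assumed: a frame of raw benchmark results with framework/task/fold/result/metric
    # (and, since include_metadata=False here, a precomputed 'type') columns
    results_df = pd.read_csv("results.csv")

    res = prepare_results(
        results_df,
        normalization=('constantpredictor', 'tunedrandomforest', 'mean'),  # hypothetical zero/one refs + aggregation
        duplicates_handling='keep_last',
        include_metadata=False,               # new flag: skip loading dataset metadata
    )

    # result is already higher-is-better (loss metrics come in negated as neg_*),
    # and norm_result replaces the old score/norm_score columns
    print(res.results[['framework', 'task', 'fold', 'metric', 'result', 'norm_result']].head())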
