Skip to content

Commit 3d53cd9

Browse files
authored
Merge pull request #1192 from automl/development
Development
2 parents 904a692 + 96b9ad0 commit 3d53cd9

File tree

15 files changed

+513
-35
lines changed

15 files changed

+513
-35
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
# Documentation
22
docs/build/*
3+
docs/examples
34

45
*.py[cod]
56

7+
# Examples
8+
# examples 40_advanced generate a tmp_folder
9+
examples/40_advanced/tmp_folder
10+
611
# C extensions
712
*.c
813
*.so

autosklearn/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"""Version information."""
22

33
# The following line *must* be the last in the module, exactly as formatted:
4-
__version__ = "0.12.8"
4+
__version__ = "0.13.0"

autosklearn/automl.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,6 @@ def __init__(self,
201201
self.cv_models_ = None
202202
self.ensemble_ = None
203203
self._can_predict = False
204-
205204
self._debug_mode = debug_mode
206205

207206
self.InputValidator = None # type: Optional[InputValidator]

autosklearn/ensembles/ensemble_selection.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -278,14 +278,20 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra
278278
return average
279279

280280
def __str__(self) -> str:
    """Return a multi-line, human-readable summary of the ensemble.

    The summary contains, each on its own tab-indented line:

    * the trajectory, as space separated ``step: performance`` pairs built
      from ``self.trajectory_`` (performance printed with 5 decimals),
    * ``self.indices_`` (the selected members),
    * ``self.weights_``,
    * the identifiers from ``self.identifiers_`` whose weight is
      strictly positive.

    Returns
    -------
    str
        The formatted description, terminated by a newline.
    """
    trajectory_str = ' '.join(
        f'{step}: {perf:.5f}'
        for step, perf in enumerate(self.trajectory_)
    )
    # `identifiers_` and `weights_` are aligned by position, so walk them in
    # lockstep and keep only the identifiers that actually carry weight.
    identifiers_str = ' '.join(
        str(identifier)
        for identifier, weight in zip(self.identifiers_, self.weights_)
        if weight > 0
    )
    return ("Ensemble Selection:\n"
            f"\tTrajectory: {trajectory_str}\n"
            f"\tMembers: {self.indices_}\n"
            f"\tWeights: {self.weights_}\n"
            f"\tIdentifiers: {identifiers_str}\n")
289295

290296
def get_models_with_weights(
291297
self,

autosklearn/estimators.py

Lines changed: 277 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
# -*- encoding: utf-8 -*-
2-
3-
from typing import Optional, Dict, List, Tuple, Union
2+
from typing import Optional, Dict, List, Tuple, Union, Iterable
3+
from typing_extensions import Literal
44

55
from ConfigSpace.configuration_space import Configuration
66
import dask.distributed
77
import joblib
88
import numpy as np
9+
import pandas as pd
910
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
1011
from sklearn.utils.multiclass import type_of_target
1112
from smac.runhistory.runhistory import RunInfo, RunValue
@@ -550,6 +551,280 @@ def sprint_statistics(self):
550551
"""
551552
return self.automl_.sprint_statistics()
552553

554+
def leaderboard(
    self,
    detailed: bool = False,
    ensemble_only: bool = True,
    top_k: Union[int, Literal['all']] = 'all',
    sort_by: str = 'cost',
    sort_order: Literal['auto', 'ascending', 'descending'] = 'auto',
    include: Optional[Union[str, Iterable[str]]] = None
) -> pd.DataFrame:
    """ Returns a pandas table of results for all evaluated models.

    Gives an overview of all models trained during the search process along
    with various statistics about their training.

    The available statistics are:

    **Simple**:

    * ``"model_id"`` - The id given to a model by ``autosklearn``.
    * ``"rank"`` - The rank of the model based on its ``"cost"``.
    * ``"ensemble_weight"`` - The weight given to the model in the ensemble.
    * ``"type"`` - The type of classifier/regressor used.
    * ``"cost"`` - The loss of the model on the validation set.
    * ``"duration"`` - Length of time the model was optimized for.

    **Detailed**:
    The detailed view includes all of the simple statistics along with the
    following.

    * ``"config_id"`` - The id used by SMAC for optimization.
    * ``"budget"`` - How much budget was allocated to this model.
    * ``"status"`` - The return status of training the model with SMAC.
    * ``"train_loss"`` - The loss of the model on the training set.
    * ``"balancing_strategy"`` - The balancing strategy used for data preprocessing.
    * ``"start_time"`` - Time the model began being optimized
    * ``"end_time"`` - Time the model ended being optimized
    * ``"data_preprocessors"`` - The preprocessors used on the data
    * ``"feature_preprocessors"`` - The preprocessors for features types

    Parameters
    ----------
    detailed: bool = False
        Whether to give detailed information or just a simple overview.

    ensemble_only: bool = True
        Whether to view only models included in the ensemble or all models
        trained.

    top_k: int or "all" = "all"
        How many models to display.

    sort_by: str = 'cost'
        What column to sort by. If that column is not present, the
        sorting defaults to the ``"model_id"`` index column.

    sort_order: "auto" or "ascending" or "descending" = "auto"
        Which sort order to apply to the ``sort_by`` column. If left
        as ``"auto"``, it will sort by a sensible default where "better" is
        on top, otherwise defaulting to the pandas default for
        `DataFrame.sort_values`_ if there is no obvious "better".

        .. _DataFrame.sort_values: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html

    include: Optional[str or Iterable[str]]
        Items to include, other items not specified will be excluded.
        The exception is the ``"model_id"`` index column which is always included.

        If left as ``None``, it will resort back to using the ``detailed``
        param to decide the columns to include.

    Returns
    -------
    pd.DataFrame
        A dataframe of statistics for the models, ordered by ``sort_by``.

    Raises
    ------
    ValueError
        If ``top_k``, ``include``, ``sort_by`` or ``sort_order`` is invalid.

    RuntimeError
        If ``self._get_automl_class()`` is neither the classifier nor the
        regressor AutoML class.

    """  # noqa (links are too long)
    # TODO validate that `self` is fitted. This is required for
    #      self.ensemble_ to get the identifiers of models it will generate
    #      weights for.
    column_types = AutoSklearnEstimator._leaderboard_columns()

    # Validation of top_k
    if (
        not isinstance(top_k, (str, int))
        or (isinstance(top_k, str) and top_k != 'all')
        or (isinstance(top_k, int) and top_k <= 0)
    ):
        raise ValueError(f"top_k={top_k} must be a positive integer or pass"
                         " `top_k`='all' to view results for all models")

    # Validate columns to include. Materialise `include` once so that any
    # iterable (tuple, generator, ...) is handled like a list below.
    if isinstance(include, str):
        include = [include]
    elif include is not None:
        include = list(include)

    if include == ['model_id']:
        raise ValueError('Must provide more than just `model_id`')

    if include is not None:
        columns = list(include)

        # 'model_id' should always be present as it is the unique index
        # used for pandas
        if 'model_id' not in columns:
            columns.append('model_id')

        invalid_include_items = set(columns) - set(column_types['all'])
        if len(invalid_include_items) != 0:
            raise ValueError(f"Values {invalid_include_items} are not known"
                             f" columns to include, must be contained in "
                             f"{column_types['all']}")
    elif detailed:
        columns = column_types['all']
    else:
        columns = column_types['simple']

    # Validation of sorting
    if sort_by not in column_types['all']:
        raise ValueError(f"sort_by='{sort_by}' must be one of included "
                         f"columns {set(column_types['all'])}")

    valid_sort_orders = ['auto', 'ascending', 'descending']
    if not (isinstance(sort_order, str) and sort_order in valid_sort_orders):
        raise ValueError(f"`sort_order` = {sort_order} must be a str in "
                         f"{valid_sort_orders}")

    # To get all the models that were optimized, we collect what we can from
    # runhistory first.
    def has_key(rv, key):
        return rv.additional_info and key in rv.additional_info

    model_runs = {
        rval.additional_info['num_run']: {
            'model_id': rval.additional_info['num_run'],
            'seed': rkey.seed,
            'budget': rkey.budget,
            'duration': rval.time,
            'config_id': rkey.config_id,
            'start_time': rval.starttime,
            'end_time': rval.endtime,
            'status': str(rval.status),
            'cost': rval.cost,
            'train_loss': rval.additional_info['train_loss']
            if has_key(rval, 'train_loss') else None,
            'config_origin': rval.additional_info['configuration_origin']
            if has_key(rval, 'configuration_origin') else None
        }
        for rkey, rval in self.automl_.runhistory_.data.items()
        if has_key(rval, 'num_run')
    }

    # Next we get some info about the model itself
    model_class_strings = {
        AutoMLClassifier: 'classifier',
        AutoMLRegressor: 'regressor'
    }
    model_type = model_class_strings.get(self._get_automl_class(), None)
    if model_type is None:
        raise RuntimeError(f"Unknown `automl_class` {self._get_automl_class()}")

    # A dict mapping config ids to their configurations
    configurations = self.automl_.runhistory_.ids_config

    for run_info in model_runs.values():
        config_id = run_info['config_id']
        run_config = configurations[config_id]._values

        run_info.update({
            'balancing_strategy': run_config.get('balancing:strategy', None),
            'type': run_config[f'{model_type}:__choice__'],
            'data_preprocessors': [
                value for key, value in run_config.items()
                if 'data_preprocessing' in key and '__choice__' in key
            ],
            'feature_preprocessors': [
                value for key, value in run_config.items()
                if 'feature_preprocessor' in key and '__choice__' in key
            ]
        })

    # Get the models ensemble weight if it has one
    # TODO both implementing classes of AbstractEnsemble have a property
    #      `identifiers_` and `weights_`, might be good to put it as an
    #      abstract property
    # TODO `ensemble_.identifiers_` and `ensemble_.weights_` are loosely
    #      tied together by ordering, might be better to store as tuple
    ensemble = self.automl_.ensemble_
    for (_, model_id, _), weight in zip(ensemble.identifiers_, ensemble.weights_):
        model_runs[model_id]['ensemble_weight'] = weight

    # Filter out non-ensemble members if needed, else fill in a default
    # value of 0 if it's missing
    if ensemble_only:
        model_runs = {
            model_id: info
            for model_id, info in model_runs.items()
            if ('ensemble_weight' in info and info['ensemble_weight'] > 0)
        }
    else:
        for info in model_runs.values():
            info.setdefault('ensemble_weight', 0)

    # `rank` relies on `cost`, so `cost` is temporarily included when the
    # user asked for `rank` only. Remember the user's choice *before*
    # extending `columns`, otherwise the helper column can never be dropped.
    drop_cost_after_ranking = 'rank' in columns and 'cost' not in columns
    if drop_cost_after_ranking:
        columns = [*columns, 'cost']

    # Finally, convert into a tabular format by converting the dict into
    # column wise orientation.
    dataframe = pd.DataFrame({
        col: [run_info[col] for run_info in model_runs.values()]
        for col in columns if col != 'rank'
    })

    # Give it an index, even if not in the `include`
    dataframe.set_index('model_id', inplace=True)

    # Add the `rank` column if needed, dropping `cost` if it's not
    # requested by the user
    if 'rank' in columns:
        dataframe.sort_values(by='cost', ascending=True, inplace=True)

        # Position of `rank` among the non-index columns. `model_id` may sit
        # anywhere in `columns`, so it is filtered out rather than assuming
        # it comes first; the position is clamped to a valid insert location.
        rank_loc = [col for col in columns if col != 'model_id'].index('rank')
        dataframe.insert(loc=min(rank_loc, len(dataframe.columns)),
                         column='rank',
                         value=range(1, len(dataframe) + 1))

        if drop_cost_after_ranking:
            # `columns=` is required: a bare label would target the row index
            dataframe.drop(columns='cost', inplace=True)

    # Decide on the sort order depending on what it gets sorted by
    descending_columns = ['ensemble_weight', 'duration']
    if sort_order == 'auto':
        ascending_param = sort_by not in descending_columns
    else:
        ascending_param = sort_order != 'descending'

    # Sort by the given column name, defaulting to 'model_id' if not present.
    # 'model_id' itself is the index and therefore never a column, so asking
    # for it explicitly must not trigger the warning.
    if sort_by != 'model_id' and sort_by not in dataframe.columns:
        self.automl_._logger.warning(
            "sort_by = '%s' was not present, defaulting to sort on the index "
            "'model_id'", sort_by
        )
        sort_by = 'model_id'

    # Cost can be the same but leave rank all over the place
    if 'rank' in columns and sort_by == 'cost':
        dataframe.sort_values(by=[sort_by, 'rank'],
                              ascending=[ascending_param, True],
                              inplace=True)
    else:
        dataframe.sort_values(by=sort_by,
                              ascending=ascending_param,
                              inplace=True)

    # Lastly, just grab the top_k
    if top_k == 'all' or top_k >= len(dataframe):
        top_k = len(dataframe)

    return dataframe.head(top_k)
813+
814+
@staticmethod
def _leaderboard_columns() -> Dict[Literal['all', 'simple', 'detailed'], List[str]]:
    """Return the column names known to the leaderboard, grouped by view.

    Returns
    -------
    Dict[str, List[str]]
        A mapping with the keys ``'all'``, ``'detailed'`` and ``'simple'``.
        ``'detailed'`` holds the same names as ``'all'``; ``'simple'`` is
        the reduced overview. Every value is a fresh list, so callers may
        modify one without affecting the others or later calls.
    """
    simple_columns = [
        "model_id", "rank", "ensemble_weight", "type", "cost", "duration"
    ]
    # The full view starts with the simple columns, followed by the extras.
    all_columns = [
        *simple_columns,
        "config_id", "train_loss", "seed", "start_time", "end_time",
        "budget", "status", "data_preprocessors", "feature_preprocessors",
        "balancing_strategy", "config_origin"
    ]
    return {
        'all': all_columns,
        'detailed': list(all_columns),  # copy, so the two views never alias
        'simple': simple_columns,
    }
827+
553828
# Hook without a base implementation: calling it here always raises
# NotImplementedError. `leaderboard` compares the returned class against
# AutoMLClassifier / AutoMLRegressor to pick the 'classifier' or 'regressor'
# configuration key, so an override is presumably expected to return one of
# those classes (NOTE(review): confirm against the subclasses).
def _get_automl_class(self):
554829
raise NotImplementedError()
555830

doc/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
1919
# the i18n builder cannot share the environment and doctrees with the others
2020
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
2121

22-
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
22+
.PHONY: help clean html html-noexamples dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
2323

2424
all: html
2525

@@ -59,6 +59,12 @@ html:
5959
@echo
6060
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
6161

62+
html-noexamples:
63+
$(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(SOURCEDIR) $(BUILDDIR)/html
64+
@echo
65+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
66+
67+
6268
dirhtml:
6369
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
6470
@echo

0 commit comments

Comments
 (0)