|
1 | 1 | # -*- encoding: utf-8 -*- |
2 | | - |
3 | | -from typing import Optional, Dict, List, Tuple, Union |
| 2 | +from typing import Optional, Dict, List, Tuple, Union, Iterable |
| 3 | +from typing_extensions import Literal |
4 | 4 |
|
5 | 5 | from ConfigSpace.configuration_space import Configuration |
6 | 6 | import dask.distributed |
7 | 7 | import joblib |
8 | 8 | import numpy as np |
| 9 | +import pandas as pd |
9 | 10 | from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin |
10 | 11 | from sklearn.utils.multiclass import type_of_target |
11 | 12 | from smac.runhistory.runhistory import RunInfo, RunValue |
@@ -550,6 +551,280 @@ def sprint_statistics(self): |
550 | 551 | """ |
551 | 552 | return self.automl_.sprint_statistics() |
552 | 553 |
|
| 554 | + def leaderboard( |
| 555 | + self, |
| 556 | + detailed: bool = False, |
| 557 | + ensemble_only: bool = True, |
| 558 | + top_k: Union[int, Literal['all']] = 'all', |
| 559 | + sort_by: str = 'cost', |
| 560 | + sort_order: Literal['auto', 'ascending', 'descending'] = 'auto', |
| 561 | + include: Optional[Union[str, Iterable[str]]] = None |
| 562 | + ) -> pd.DataFrame: |
| 563 | + """ Returns a pandas table of results for all evaluated models. |
| 564 | +
|
| 565 | + Gives an overview of all models trained during the search process along |
| 566 | + with various statistics about their training. |
| 567 | +
|
| 568 | + The availble statistics are: |
| 569 | +
|
| 570 | + **Simple**: |
| 571 | +
|
| 572 | + * ``"model_id"`` - The id given to a model by ``autosklearn``. |
| 573 | + * ``"rank"`` - The rank of the model based on it's ``"cost"``. |
| 574 | + * ``"ensemble_weight"`` - The weight given to the model in the ensemble. |
| 575 | + * ``"type"`` - The type of classifier/regressor used. |
| 576 | + * ``"cost"`` - The loss of the model on the validation set. |
| 577 | + * ``"duration"`` - Length of time the model was optimized for. |
| 578 | +
|
| 579 | + **Detailed**: |
| 580 | + The detailed view includes all of the simple statistics along with the |
| 581 | + following. |
| 582 | +
|
| 583 | + * ``"config_id"`` - The id used by SMAC for optimization. |
| 584 | + * ``"budget"`` - How much budget was allocated to this model. |
| 585 | + * ``"status"`` - The return status of training the model with SMAC. |
| 586 | + * ``"train_loss"`` - The loss of the model on the training set. |
| 587 | + * ``"balancing_strategy"`` - The balancing strategy used for data preprocessing. |
| 588 | + * ``"start_time"`` - Time the model began being optimized |
| 589 | + * ``"end_time"`` - Time the model ended being optimized |
| 590 | + * ``"data_preprocessors"`` - The preprocessors used on the data |
| 591 | + * ``"feature_preprocessors"`` - The preprocessors for features types |
| 592 | +
|
| 593 | + Parameters |
| 594 | + ---------- |
| 595 | + detailed: bool = False |
| 596 | + Whether to give detailed information or just a simple overview. |
| 597 | +
|
| 598 | + ensemble_only: bool = True |
| 599 | + Whether to view only models included in the ensemble or all models |
| 600 | + trained. |
| 601 | +
|
| 602 | + top_k: int or "all" = "all" |
| 603 | + How many models to display. |
| 604 | +
|
| 605 | + sort_by: str = 'cost' |
| 606 | + What column to sort by. If that column is not present, the |
| 607 | + sorting defaults to the ``"model_id"`` index column. |
| 608 | +
|
| 609 | + sort_order: "auto" or "ascending" or "descending" = "auto" |
| 610 | + Which sort order to apply to the ``sort_by`` column. If left |
| 611 | + as ``"auto"``, it will sort by a sensible default where "better" is |
| 612 | + on top, otherwise defaulting to the pandas default for |
| 613 | + `DataFrame.sort_values`_ if there is no obvious "better". |
| 614 | +
|
| 615 | + .. _DataFrame.sort_values: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html |
| 616 | +
|
| 617 | + include: Optional[str or Iterable[str]] |
| 618 | + Items to include, other items not specified will be excluded. |
| 619 | + The exception is the ``"model_id"`` index column which is always included. |
| 620 | +
|
| 621 | + If left as ``None``, it will resort back to using the ``detailed`` |
| 622 | + param to decide the columns to include. |
| 623 | +
|
| 624 | + Returns |
| 625 | + ------- |
| 626 | + pd.DataFrame |
| 627 | + A dataframe of statistics for the models, ordered by ``sort_by``. |
| 628 | +
|
| 629 | + """ # noqa (links are too long) |
| 630 | + # TODO validate that `self` is fitted. This is required for |
| 631 | + # self.ensemble_ to get the identifiers of models it will generate |
| 632 | + # weights for. |
| 633 | + column_types = AutoSklearnEstimator._leaderboard_columns() |
| 634 | + |
| 635 | + # Validation of top_k |
| 636 | + if ( |
| 637 | + not (isinstance(top_k, str) or isinstance(top_k, int)) |
| 638 | + or (isinstance(top_k, str) and top_k != 'all') |
| 639 | + or (isinstance(top_k, int) and top_k <= 0) |
| 640 | + ): |
| 641 | + raise ValueError(f"top_k={top_k} must be a positive integer or pass" |
| 642 | + " `top_k`='all' to view results for all models") |
| 643 | + |
| 644 | + # Validate columns to include |
| 645 | + if isinstance(include, str): |
| 646 | + include = [include] |
| 647 | + |
| 648 | + if include == ['model_id']: |
| 649 | + raise ValueError('Must provide more than just `model_id`') |
| 650 | + |
| 651 | + if include is not None: |
| 652 | + columns = [*include] |
| 653 | + |
| 654 | + # 'model_id' should always be present as it is the unique index |
| 655 | + # used for pandas |
| 656 | + if 'model_id' not in columns: |
| 657 | + columns.append('model_id') |
| 658 | + |
| 659 | + invalid_include_items = set(columns) - set(column_types['all']) |
| 660 | + if len(invalid_include_items) != 0: |
| 661 | + raise ValueError(f"Values {invalid_include_items} are not known" |
| 662 | + f" columns to include, must be contained in " |
| 663 | + f"{column_types['all']}") |
| 664 | + elif detailed: |
| 665 | + columns = column_types['all'] |
| 666 | + else: |
| 667 | + columns = column_types['simple'] |
| 668 | + |
| 669 | + # Validation of sorting |
| 670 | + if sort_by not in column_types['all']: |
| 671 | + raise ValueError(f"sort_by='{sort_by}' must be one of included " |
| 672 | + f"columns {set(column_types['all'])}") |
| 673 | + |
| 674 | + valid_sort_orders = ['auto', 'ascending', 'descending'] |
| 675 | + if not (isinstance(sort_order, str) and sort_order in valid_sort_orders): |
| 676 | + raise ValueError(f"`sort_order` = {sort_order} must be a str in " |
| 677 | + f"{valid_sort_orders}") |
| 678 | + |
| 679 | + # To get all the models that were optmized, we collect what we can from |
| 680 | + # runhistory first. |
| 681 | + def has_key(rv, key): |
| 682 | + return rv.additional_info and key in rv.additional_info |
| 683 | + |
| 684 | + model_runs = { |
| 685 | + rval.additional_info['num_run']: { |
| 686 | + 'model_id': rval.additional_info['num_run'], |
| 687 | + 'seed': rkey.seed, |
| 688 | + 'budget': rkey.budget, |
| 689 | + 'duration': rval.time, |
| 690 | + 'config_id': rkey.config_id, |
| 691 | + 'start_time': rval.starttime, |
| 692 | + 'end_time': rval.endtime, |
| 693 | + 'status': str(rval.status), |
| 694 | + 'cost': rval.cost, |
| 695 | + 'train_loss': rval.additional_info['train_loss'] |
| 696 | + if has_key(rval, 'train_loss') else None, |
| 697 | + 'config_origin': rval.additional_info['configuration_origin'] |
| 698 | + if has_key(rval, 'configuration_origin') else None |
| 699 | + } |
| 700 | + for rkey, rval in self.automl_.runhistory_.data.items() |
| 701 | + if has_key(rval, 'num_run') |
| 702 | + } |
| 703 | + |
| 704 | + # Next we get some info about the model itself |
| 705 | + model_class_strings = { |
| 706 | + AutoMLClassifier: 'classifier', |
| 707 | + AutoMLRegressor: 'regressor' |
| 708 | + } |
| 709 | + model_type = model_class_strings.get(self._get_automl_class(), None) |
| 710 | + if model_type is None: |
| 711 | + raise RuntimeError(f"Unknown `automl_class` {self._get_automl_class()}") |
| 712 | + |
| 713 | + # A dict mapping model ids to their configurations |
| 714 | + configurations = self.automl_.runhistory_.ids_config |
| 715 | + |
| 716 | + for model_id, run_info in model_runs.items(): |
| 717 | + config_id = run_info['config_id'] |
| 718 | + run_config = configurations[config_id]._values |
| 719 | + |
| 720 | + run_info.update({ |
| 721 | + 'balancing_strategy': run_config.get('balancing:strategy', None), |
| 722 | + 'type': run_config[f'{model_type}:__choice__'], |
| 723 | + 'data_preprocessors': [ |
| 724 | + value for key, value in run_config.items() |
| 725 | + if 'data_preprocessing' in key and '__choice__' in key |
| 726 | + ], |
| 727 | + 'feature_preprocessors': [ |
| 728 | + value for key, value in run_config.items() |
| 729 | + if 'feature_preprocessor' in key and '__choice__' in key |
| 730 | + ] |
| 731 | + }) |
| 732 | + |
| 733 | + # Get the models ensemble weight if it has one |
| 734 | + # TODO both implementing classes of AbstractEnsemble have a property |
| 735 | + # `identifiers_` and `weights_`, might be good to put it as an |
| 736 | + # abstract property |
| 737 | + # TODO `ensemble_.identifiers_` and `ensemble_.weights_` are loosely |
| 738 | + # tied together by ordering, might be better to store as tuple |
| 739 | + for i, weight in enumerate(self.automl_.ensemble_.weights_): |
| 740 | + (_, model_id, _) = self.automl_.ensemble_.identifiers_[i] |
| 741 | + model_runs[model_id]['ensemble_weight'] = weight |
| 742 | + |
| 743 | + # Filter out non-ensemble members if needed, else fill in a default |
| 744 | + # value of 0 if it's missing |
| 745 | + if ensemble_only: |
| 746 | + model_runs = { |
| 747 | + model_id: info |
| 748 | + for model_id, info in model_runs.items() |
| 749 | + if ('ensemble_weight' in info and info['ensemble_weight'] > 0) |
| 750 | + } |
| 751 | + else: |
| 752 | + for model_id, info in model_runs.items(): |
| 753 | + if 'ensemble_weight' not in info: |
| 754 | + info['ensemble_weight'] = 0 |
| 755 | + |
| 756 | + # `rank` relies on `cost` so we include `cost` |
| 757 | + # We drop it later if it's not requested |
| 758 | + if 'rank' in columns and 'cost' not in columns: |
| 759 | + columns = [*columns, 'cost'] |
| 760 | + |
| 761 | + # Finally, convert into a tabular format by converting the dict into |
| 762 | + # column wise orientation. |
| 763 | + dataframe = pd.DataFrame({ |
| 764 | + col: [run_info[col] for run_info in model_runs.values()] |
| 765 | + for col in columns if col != 'rank' |
| 766 | + }) |
| 767 | + |
| 768 | + # Give it an index, even if not in the `include` |
| 769 | + dataframe.set_index('model_id', inplace=True) |
| 770 | + |
| 771 | + # Add the `rank` column if needed, dropping `cost` if it's not |
| 772 | + # requested by the user |
| 773 | + if 'rank' in columns: |
| 774 | + dataframe.sort_values(by='cost', ascending=True, inplace=True) |
| 775 | + dataframe.insert(column='rank', |
| 776 | + value=range(1, len(dataframe) + 1), |
| 777 | + loc=list(columns).index('rank') - 1) # account for `model_id` |
| 778 | + |
| 779 | + if 'cost' not in columns: |
| 780 | + dataframe.drop('cost', inplace=True) |
| 781 | + |
| 782 | + # Decide on the sort order depending on what it gets sorted by |
| 783 | + descending_columns = ['ensemble_weight', 'duration'] |
| 784 | + if sort_order == 'auto': |
| 785 | + ascending_param = False if sort_by in descending_columns else True |
| 786 | + else: |
| 787 | + ascending_param = False if sort_order == 'descending' else True |
| 788 | + |
| 789 | + # Sort by the given column name, defaulting to 'model_id' if not present |
| 790 | + if sort_by not in dataframe.columns: |
| 791 | + self.automl_._logger.warning(f"sort_by = '{sort_by}' was not present" |
| 792 | + ", defaulting to sort on the index " |
| 793 | + "'model_id'") |
| 794 | + sort_by = 'model_id' |
| 795 | + |
| 796 | + # Cost can be the same but leave rank all over the place |
| 797 | + if 'rank' in columns and sort_by == 'cost': |
| 798 | + dataframe.sort_values(by=[sort_by, 'rank'], |
| 799 | + ascending=[ascending_param, True], |
| 800 | + inplace=True) |
| 801 | + else: |
| 802 | + dataframe.sort_values(by=sort_by, |
| 803 | + ascending=ascending_param, |
| 804 | + inplace=True) |
| 805 | + |
| 806 | + # Lastly, just grab the top_k |
| 807 | + if top_k == 'all' or top_k >= len(dataframe): |
| 808 | + top_k = len(dataframe) |
| 809 | + |
| 810 | + dataframe = dataframe.head(top_k) |
| 811 | + |
| 812 | + return dataframe |
| 813 | + |
| 814 | + @staticmethod |
| 815 | + def _leaderboard_columns() -> Dict[Literal['all', 'simple', 'detailed'], List[str]]: |
| 816 | + all = [ |
| 817 | + "model_id", "rank", "ensemble_weight", "type", "cost", "duration", |
| 818 | + "config_id", "train_loss", "seed", "start_time", "end_time", |
| 819 | + "budget", "status", "data_preprocessors", "feature_preprocessors", |
| 820 | + "balancing_strategy", "config_origin" |
| 821 | + ] |
| 822 | + simple = [ |
| 823 | + "model_id", "rank", "ensemble_weight", "type", "cost", "duration" |
| 824 | + ] |
| 825 | + detailed = all |
| 826 | + return {'all': all, 'detailed': detailed, 'simple': simple} |
| 827 | + |
    def _get_automl_class(self):
        """Return the backing automl implementation class.

        Abstract hook: concrete subclasses must override this to return the
        class used to build ``self.automl_`` (its value is compared against
        ``AutoMLClassifier``/``AutoMLRegressor`` in ``leaderboard``).

        Raises
        ------
        NotImplementedError
            Always, on this base class.
        """
        raise NotImplementedError()
|
|
0 commit comments