# Source code for rain.nodes.tpot.regression

"""
 Copyright (C) 2023 Università degli Studi di Camerino and Sigma S.p.A.
 Authors: Alessandro Antinori, Rosario Capparuccia, Riccardo Coltrinari, Flavio Corradini, Marco Piangerelli, Barbara Re, Marco Scarpetta

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as
 published by the Free Software Foundation, either version 3 of the
 License, or (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """

import pickle

import pandas as pd
from tpot import TPOTRegressor
from rain import Tags, LibTag, TypeTag, ComputationalNode
from rain.core.parameter import Parameters, KeyValueParameter


class TPOTRegressionTrainer(ComputationalNode):
    """Node that returns the regression model trained with the TPOT library.

    Input
    -----
    dataset : pandas.DataFrame
        The dataset for training.

    Output
    ------
    code : str
        The Python code corresponding to the model.
    model : pickle
        The TPOT model in pickle format.

    Parameters
    ----------
    target_feature : str
        Name of the target feature.
    export_script : bool, default=False
        Whether to export the resulting Python script.
        Note: ``execute`` does not read this flag; the ``code`` output is always produced.
    generations : int, default=100
        Number of iterations to run the pipeline optimization process. It must be
        a positive number. If not set, the parameter max_time_mins must be defined
        as the runtime limit. Generally, TPOT will work better when you give it more
        generations (and therefore time) to optimize the pipeline. TPOT will evaluate
        POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
    population_size : int, default=100
        Number of individuals to retain in the GP population every generation.
        Generally, TPOT will work better when you give it more individuals
        (and therefore time) to optimize the pipeline. TPOT will evaluate
        POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
    offspring_size : int
        Number of offspring to produce in each GP generation.
        By default, offspring_size = population_size.
    mutation_rate : float, default=0.9
        Mutation rate for the genetic programming algorithm in the range [0.0, 1.0].
        This parameter tells the GP algorithm how many pipelines to apply random
        changes to every generation. We recommend using the default parameter unless
        you understand how the mutation rate affects GP algorithms.
    crossover_rate : float, default=0.1
        Crossover rate for the genetic programming algorithm in the range [0.0, 1.0].
        This parameter tells the genetic programming algorithm how many pipelines to
        "breed" every generation. We recommend using the default parameter unless you
        understand how the crossover rate affects GP algorithms.
    scoring : str, default='neg_mean_squared_error'
        Function used to evaluate the quality of a given pipeline for the
        problem. By default, mean squared error (MSE) is used for regression problems.
        Offers the same options as sklearn.model_selection.cross_val_score as well as
        a built-in score 'balanced_accuracy'. Regression metrics: ['neg_median_absolute_error',
        'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
    cv : int, default=5
        The number of folds to evaluate each pipeline over in k-fold cross-validation
        during the TPOT optimization process.
    subsample : float, default=1.0
        Subsample ratio of the training instance. Setting it to 0.5 means that TPOT
        randomly collects half of training samples for pipeline optimization process.
    n_jobs : int, default=1
        Number of CPUs for evaluating pipelines in parallel during the TPOT
        optimization process. Assigning this to -1 will use as many cores as available
        on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
        Thus for n_jobs = -2, all CPUs but one are used.
    max_time_mins : int
        How many minutes TPOT has to optimize the pipeline.
        If not None, this setting will allow TPOT to run until max_time_mins minutes
        elapsed and then stop. TPOT will stop earlier if generations is set and all
        generations are already evaluated.
    max_eval_time_mins : float, default=5
        How many minutes TPOT has to optimize a single pipeline.
        Setting this parameter to higher values will allow TPOT to explore more
        complex pipelines, but will also allow TPOT to run longer.
    random_state : int
        Random number generator seed for TPOT. Use this parameter to make sure
        that TPOT will give you the same results each time you run it against the
        same data set with that seed.
    config_dict : {TPOT light, TPOT MDR, TPOT sparse, TPOT NN}, default=None
        String 'TPOT light':
            TPOT uses a light version of operator configuration dictionary instead of
            the default one.
        String 'TPOT MDR':
            TPOT uses a list of TPOT-MDR operator configuration dictionary instead of
            the default one.
        String 'TPOT sparse':
            TPOT uses a configuration dictionary with a one-hot-encoder and the
            operators normally included in TPOT that also support sparse matrices.
        String 'TPOT NN':
            TPOT uses a configuration dictionary for PyTorch neural network classifiers
            included in `tpot.nn`.
    template : str, default=None
        Template of predefined pipeline structure. The option is for specifying a desired structure
        for the machine learning pipeline evaluated in TPOT. So far this option only supports
        linear pipeline structure. Each step in the pipeline should be a main class of operators
        (Selector, Transformer, Classifier or Regressor) or a specific operator
        (e.g. SelectPercentile) defined in TPOT operator configuration. If one step is a main class,
        TPOT will randomly assign all subclass operators (subclasses of SelectorMixin,
        TransformerMixin, ClassifierMixin or RegressorMixin in scikit-learn) to that step.
        Steps in the template are delimited by "-", e.g. "SelectPercentile-Transformer-Classifier".
        By default value of template is None, TPOT generates tree-based pipeline randomly.
    warm_start : bool, default=False
        Flag indicating whether the TPOT instance will reuse the population from
        previous calls to fit().
    memory : str, default=None
        If supplied, pipeline will cache each transformer after calling fit. This feature
        is used to avoid computing the fit transformers within a pipeline if the parameters
        and input data are identical with another fitted pipeline during optimization process.
        String 'auto':
            TPOT uses memory caching with a temporary directory and cleans it up upon shutdown.
        String path of a caching directory
            TPOT uses memory caching with the provided directory and TPOT does NOT clean
            the caching directory up upon shutdown. If the directory does not exist, TPOT will
            create it.
        None:
            TPOT does not use memory caching.
    use_dask : bool, default=False
        Whether to use Dask-ML's pipeline optimizations. This avoids re-fitting
        the same estimator on the same split of data multiple times. It
        will also provide more detailed diagnostics when using Dask's
        distributed scheduler.
    periodic_checkpoint_folder : str, default=None
        If supplied, a folder in which tpot will periodically save pipelines in pareto front so far while optimizing.
        Currently once per generation but not more often than once per 30 seconds.
        Useful in multiple cases:
            Sudden death before tpot could save optimized pipeline
            Track its progress
            Grab pipelines while it's still optimizing
    early_stop : int, default=None
        How many generations TPOT checks whether there is no improvement in optimization process.
        End optimization process if there is no improvement in the set number of generations.
    verbosity : int, default=0
        How much information TPOT communicates while it's running.
        0 = none, 1 = minimal, 2 = high, 3 = all.
        A setting of 2 or higher will add a progress bar during the optimization procedure.
    """

    _input_vars = {"dataset": pd.DataFrame}

    _output_vars = {"code": "str", "model": "pickle"}

    def __init__(self, node_id: str, target_feature: str = None, export_script: bool = False, generations: int = 100,
                 population_size: int = 100, offspring_size: int = None, mutation_rate: float = 0.9,
                 crossover_rate: float = 0.1, scoring: str = 'neg_mean_squared_error', cv: int = 5,
                 subsample: float = 1.0, n_jobs: int = 1, max_time_mins: int = None, max_eval_time_mins: float = 5,
                 random_state: int = None, config_dict: str = None, template: str = None, warm_start: bool = False,
                 memory: str = None, use_dask: bool = False, periodic_checkpoint_folder: str = None,
                 early_stop: int = None, verbosity: int = 0):
        """Create the node and register every constructor argument as a node parameter."""
        super().__init__(node_id)

        # Apart from target_feature and export_script, every entry below is
        # forwarded as a keyword argument to TPOTRegressor in execute().
        self.parameters = Parameters(
            target_feature=KeyValueParameter("target_feature", str, target_feature, True),
            export_script=KeyValueParameter("export_script", bool, export_script),
            generations=KeyValueParameter("generations", int, generations),
            population_size=KeyValueParameter("population_size", int, population_size),
            offspring_size=KeyValueParameter("offspring_size", int, offspring_size),
            mutation_rate=KeyValueParameter("mutation_rate", float, mutation_rate),
            crossover_rate=KeyValueParameter("crossover_rate", float, crossover_rate),
            scoring=KeyValueParameter("scoring", str, scoring),
            cv=KeyValueParameter("cv", int, cv),
            subsample=KeyValueParameter("subsample", float, subsample),
            n_jobs=KeyValueParameter("n_jobs", int, n_jobs),
            max_time_mins=KeyValueParameter("max_time_mins", int, max_time_mins),
            max_eval_time_mins=KeyValueParameter("max_eval_time_mins", float, max_eval_time_mins),
            random_state=KeyValueParameter("random_state", int, random_state),
            config_dict=KeyValueParameter("config_dict", str, config_dict),
            template=KeyValueParameter("template", str, template),
            warm_start=KeyValueParameter("warm_start", bool, warm_start),
            memory=KeyValueParameter("memory", str, memory),
            use_dask=KeyValueParameter("use_dask", bool, use_dask),
            periodic_checkpoint_folder=KeyValueParameter("periodic_checkpoint_folder", str, periodic_checkpoint_folder),
            early_stop=KeyValueParameter("early_stop", int, early_stop),
            verbosity=KeyValueParameter("verbosity", int, verbosity),
        )

    def execute(self):
        """Fit a ``TPOTRegressor`` on ``self.dataset`` and populate the node outputs.

        The target column is split off from the dataset, the remaining node
        parameters are passed to ``TPOTRegressor``, and after fitting
        ``self.code`` receives the result of ``export()`` while ``self.model``
        receives the pickled ``fitted_pipeline_``.

        Raises
        ------
        ValueError
            If ``target_feature`` has not been set.
        KeyError
            If ``target_feature`` is not a column of the dataset.
        """
        target = self.parameters.target_feature.value
        # Fail early with an explicit message; the exception types match what
        # pandas would raise for the same conditions further down.
        if target is None:
            raise ValueError("TPOTRegressionTrainer: the 'target_feature' parameter must be set.")
        if target not in self.dataset.columns:
            raise KeyError(f"TPOTRegressionTrainer: target feature {target!r} is not a column of the dataset.")

        features = self.dataset.drop(target, axis=1)
        labels = self.dataset[target]

        # Build a fresh kwargs dict rather than deleting keys from whatever
        # get_dict() returns, so the node's own parameter state is never mutated.
        node_only = ("target_feature", "export_script")
        tpot_kwargs = {
            name: value
            for name, value in self.parameters.get_dict().items()
            if name not in node_only
        }
        tpot_kwargs["disable_update_check"] = True

        regressor = TPOTRegressor(**tpot_kwargs)
        regressor.fit(features, labels)

        # NOTE(review): export_script is not consulted here; the code output is
        # always generated. Confirm whether the flag is meant to gate this call.
        self.code = regressor.export()
        self.model = pickle.dumps(regressor.fitted_pipeline_)

    @classmethod
    def _get_tags(cls):
        """Return the tags classifying this node as a TPOT trainer."""
        return Tags(LibTag.TPOT, TypeTag.TRAINER)


class TPOTRegressionPredictor(ComputationalNode):
    """Node that returns the predictions performed with a TPOT Regression model on the columns of a dataset
    without the target feature column.

    Input
    -----
    dataset : pandas.DataFrame
        The pandas DataFrame.

    model : pickle
        The TPOT Regression model in pickle format.

    Output
    ------
    predictions : pandas.DataFrame
        The DataFrame containing the predictions.
    """

    _input_vars = {"dataset": pd.DataFrame, "model": "pickle"}

    _output_vars = {"predictions": pd.DataFrame}

    def __init__(self, node_id: str):
        """Create the node with an empty placeholder for the predictions output."""
        super().__init__(node_id)
        # Placeholder value; execute() replaces it with a DataFrame.
        self.predictions = {}

    def execute(self):
        """Unpickle the model, run ``predict`` on the dataset and store the result as a DataFrame."""
        # SECURITY: pickle.loads can execute arbitrary code while deserializing.
        # Only feed this node models that come from a trusted source.
        pipeline = pickle.loads(self.model)
        raw_predictions = pipeline.predict(self.dataset)
        # NOTE(review): if predict() returns a plain array, the resulting frame
        # gets a default integer index rather than the dataset's index - confirm
        # that downstream nodes do not need to align predictions by index.
        self.predictions = pd.DataFrame(raw_predictions)

    @classmethod
    def _get_tags(cls):
        """Return the tags classifying this node as a TPOT predictor."""
        return Tags(LibTag.TPOT, TypeTag.PREDICTOR)