Source code for rain.nodes.sklearn.node_structure

"""
 Copyright (C) 2023 Università degli Studi di Camerino and Sigma S.p.A.
 Authors: Alessandro Antinori, Rosario Capparuccia, Riccardo Coltrinari, Flavio Corradini, Marco Piangerelli, Barbara Re, Marco Scarpetta

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as
 published by the Free Software Foundation, either version 3 of the
 License, or (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """

import pandas
import sklearn.base
from abc import abstractmethod

from rain.core.base import ComputationalNode, Tags, LibTag, TypeTag
from rain.core.exception import (
    EstimatorNotFoundException,
    InputNotFoundException,
)


[docs]class SklearnNode(ComputationalNode): """Base class for all the nodes that use the sklearn library.""" _methods = {} def __init__(self, node_id): super(SklearnNode, self).__init__(node_id) self._estimator_or_function = None
[docs] @abstractmethod def execute(self): raise NotImplementedError( "Method execute for class {} is not implemented yet.".format( self.__class__.__name__ ) )
[docs]class SklearnFunction(SklearnNode): """Base class for all the nodes that use an sklearn function.""" def __init__(self, node_id: str): super(SklearnFunction, self).__init__(node_id)
[docs] @abstractmethod def execute(self): raise NotImplementedError( "Method execute for class {} is not implemented yet.".format( self.__class__.__name__ ) )
@classmethod def _get_tags(cls): return Tags(LibTag.SKLEARN, TypeTag.TRANSFORMER)
[docs]class SklearnEstimator(SklearnNode): """Base class for all the nodes that use an sklearn Estimator. Input ----- fitted_model : sklearn.base.BaseEstimator A previously fitted model. dataset : pandas.DataFrame The dataset that will be used to perform the different methods on. Output ------ fitted_model : sklearn.base.BaseEstimator The model that results from the fit of the estimator. Parameters ---------- node_id : str Id of the node. execute : [fit] List of strings to specify the methods to execute. The allowed strings are those from the _method attribute. """ _input_vars = {"fitted_model": sklearn.base.BaseEstimator, "dataset": pandas.DataFrame} _methods = {"fit": False} _output_vars = {"fitted_model": sklearn.base.BaseEstimator} def __init__(self, node_id: str, execute: list): super(SklearnEstimator, self).__init__(node_id) for method in execute: if method not in self._methods.keys(): raise Exception( "Method {} not found for estimator {}".format( method, self.__class__.__name__ ) ) self._methods[method] = True
[docs] def fit(self): self.fitted_model = self._estimator_or_function.fit(self.dataset)
[docs] def execute(self): if self._estimator_or_function is None: raise EstimatorNotFoundException( "The estimator to use is not set for class {}".format( self.__class__.__name__ ) ) # se la fit deve essere eseguita, allora sarà sempre eseguita per prima. # se la fit deve sovrascrivere il fitted_model, allora rimuovere la # seconda parte dell'if statement ("and self.fitted_model is None") if self._methods.get("fit") and self.fitted_model is None: self.fit() remaining_methods = [ method for method, must_exec in self._methods.items() if must_exec and not method == "fit" ] for method_name in remaining_methods: method = eval("self.{}".format(method_name)) method()
@classmethod def _get_tags(cls): return Tags(LibTag.SKLEARN, TypeTag.ESTIMATOR)
[docs]class PredictorMixin: """Mixin class to add a prediction functionality to an estimator.""" _output_vars = {"predictions": pandas.DataFrame} _methods = {"predict": False}
[docs] def predict(self): if self.dataset is None: raise InputNotFoundException( "The 'dataset' input is not set for node {}".format( self.__class__.__name__ ) ) if self.fitted_model is not None: self.predictions = self.fitted_model.predict(self.dataset) if ( type(self.predictions) is not pandas.DataFrame ): # some estimators returns a numpy ndarray self.predictions = pandas.DataFrame(self.predictions)
[docs]class ScorerMixin: """Mixin class to add a scoring functionality to an estimator.""" _input_vars = {"score_targets": pandas.DataFrame} _output_vars = {"score_value": float} _methods = {"score": False}
[docs] def score(self): if self.dataset is None: raise InputNotFoundException( "The 'dataset' input is not set for node {}".format( self.__class__.__name__ ) ) if self.fitted_model is not None: if self._estimator_type == "classifier": if self.score_targets is None: raise InputNotFoundException( "The 'score_targets' input is not set for node {}".format( self.__class__.__name__ ) ) self.score_value = self.fitted_model.score( self.dataset, self.score_targets ) else: self.score_value = self.fitted_model.score(self.dataset)
[docs]class TransformerMixin: """Mixin class to add a transformer functionality to an estimator.""" _output_vars = {"transformed_dataset": pandas.DataFrame} _methods = {"transform": False}
[docs] def transform(self): if self.dataset is None: raise InputNotFoundException( "The 'dataset' input is not set for node {}".format( self.__class__.__name__ ) ) if self.fitted_model is not None: self.transformed_dataset = self.fitted_model.transform( self.dataset ) if ( type(self.transformed_dataset) is not pandas.DataFrame ): # some estimators returns a numpy ndarray self.transformed_dataset = pandas.DataFrame(self.dataset)
[docs]class SklearnClassifier(SklearnEstimator, PredictorMixin, ScorerMixin): """Base class for all the nodes that use an sklearn classifier. Input ----- fitted_model : sklearn.base.BaseEstimator A previously fitted model. dataset : pandas.DataFrame The dataset to be used by the estimator. fit_targets : pandas.DataFrame The dataset that will be used as targets (labels) to perform the fit of the classifier. score_targets : pandas.DataFrame The dataset that will be used as targets (labels) to perform the scoring. Output ------ fitted_model : sklearn.base.BaseEstimator The model that results from the fit of the estimator. predictions : pandas.DataFrame The predictions that result from the predict. score_value : float The score value that results from the scoring. Parameters ---------- node_id : str Id of the node. execute : [fit, predict, score] List of strings to specify the methods to execute. The allowed strings are those from the _method attribute. """ _estimator_type = "classifier" _input_vars = {"fit_targets": pandas.DataFrame} def __init__(self, node_id: str, execute: list): super(SklearnClassifier, self).__init__(node_id, execute)
[docs] def fit(self): if self.dataset is None: raise InputNotFoundException( "The 'fit_dataset' input is not set for node {}".format( self.__class__.__name__ ) ) elif self.fit_targets is None: raise InputNotFoundException( "The 'fit_targets' input is not set for node {}".format( self.__class__.__name__ ) ) self.fitted_model = self._estimator_or_function.fit( self.dataset, self.fit_targets )
@classmethod def _get_tags(cls): return Tags(LibTag.SKLEARN, TypeTag.CLASSIFIER)
[docs]class SklearnClusterer(SklearnEstimator, PredictorMixin, ScorerMixin, TransformerMixin): """Base class for all the nodes that use an sklearn clusterer. Input ----- fitted_model : sklearn.base.BaseEstimator A previously fitted model. dataset : pandas.DataFrame The dataset to be used by the estimator. score_targets : pandas.DataFrame The dataset that will be used as targets (labels) to perform the scoring. Output ------ fitted_model : sklearn.base.BaseEstimator The model that results from the fit of the estimator. predictions : pandas.DataFrame The predictions that result from the predict. score_value : float The score value that results from the scoring. transformed_dataset : pandas.DataFrame The dataset that results from the transform. Parameters ---------- node_id : str Id of the node. execute : [fit, predict, score, transform] List of strings to specify the methods to execute. The allowed strings are those from the _method attribute. """ _estimator_type = "clusterer" def __init__(self, node_id: str, execute: list): super(SklearnClusterer, self).__init__(node_id, execute) @classmethod def _get_tags(cls): return Tags(LibTag.SKLEARN, TypeTag.CLUSTERER)