Source code for rain.nodes.sklearn.functions

"""
 Copyright (C) 2023 Università degli Studi di Camerino and Sigma S.p.A.
 Authors: Alessandro Antinori, Rosario Capparuccia, Riccardo Coltrinari, Flavio Corradini, Marco Piangerelli, Barbara Re, Marco Scarpetta

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as
 published by the Free Software Foundation, either version 3 of the
 License, or (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """

import pandas
from sklearn.metrics import davies_bouldin_score
from sklearn.model_selection import train_test_split

from rain.core.base import TypeTag, LibTag, Tags
from rain.core.parameter import Parameters, KeyValueParameter
from rain.nodes.sklearn.node_structure import SklearnFunction


class TrainTestDatasetSplit(SklearnFunction):
    """Node that splits a single dataset into a training and a test part.

    The split is delegated to 'sklearn.model_selection.train_test_split'.

    Input
    -----
    dataset : pandas.DataFrame
        The dataset to split.

    Output
    ------
    train_dataset : pandas.DataFrame
        The training dataset.
    test_dataset : pandas.DataFrame
        The test dataset.

    Parameters
    ----------
    node_id : str
        Id of the node.
    test_size : float, default=None
        The size as percentage of the test dataset (e.g. 0.3 is 30%).
    train_size : float, default=None
        The size as percentage of the train dataset (e.g. 0.7 is 70%).
    random_state : int, default=None
        Seed for the random generation.
    shuffle : bool, default=True
        Whether to shuffle the dataset before the splitting.
    """

    _input_vars = {"dataset": pandas.DataFrame}

    _output_vars = {
        "train_dataset": pandas.DataFrame,
        "test_dataset": pandas.DataFrame,
    }

    def __init__(
        self,
        node_id: str,
        test_size: float = None,
        train_size: float = None,
        random_state: int = None,
        shuffle: bool = True,
    ):
        super().__init__(node_id)

        # (name, declared type, value) for every option forwarded to sklearn.
        # The order is kept so the resulting keyword order is unchanged.
        option_specs = (
            ("test_size", float, test_size),
            ("train_size", float, train_size),
            ("random_state", int, random_state),
            ("shuffle", bool, shuffle),
        )
        self.parameters = Parameters(
            **{
                name: KeyValueParameter(name, kind, value)
                for name, kind, value in option_specs
            }
        )

    def execute(self):
        """Split the input dataset and store both parts as node outputs."""
        source = self.dataset
        split_options = self.parameters.get_dict()
        train_part, test_part = train_test_split(source, **split_options)
        self.train_dataset = train_part
        self.test_dataset = test_part
class TrainTestSampleTargetSplit(SklearnFunction):
    """Node that splits a sample dataset and a target dataset in four parts.

    The split is delegated to 'sklearn.model_selection.train_test_split'.
    It is useful for classification, where the samples and their target
    labels have to be split in the same way.

    Input
    -----
    sample_dataset : pandas.DataFrame
        The dataset containing the samples.
    target_dataset : pandas.DataFrame
        The dataset containing the target labels.

    Output
    ------
    sample_train_dataset : pandas.DataFrame
        The training dataset containing the samples.
    sample_test_dataset : pandas.DataFrame
        The test dataset containing the samples.
    target_train_dataset : pandas.DataFrame
        The training dataset containing the target labels.
    target_test_dataset : pandas.DataFrame
        The test dataset containing the target labels.

    Parameters
    ----------
    node_id : str
        Id of the node.
    test_size : float, default=None
        The size as percentage of the test dataset (e.g. 0.3 is 30%).
    train_size : float, default=None
        The size as percentage of the train dataset (e.g. 0.7 is 70%).
    random_state : int, default=None
        Seed for the random generation.
    shuffle : bool, default=True
        Whether to shuffle the dataset before the splitting.
    """

    _input_vars = {
        "sample_dataset": pandas.DataFrame,
        "target_dataset": pandas.DataFrame,
    }

    _output_vars = {
        "sample_train_dataset": pandas.DataFrame,
        "sample_test_dataset": pandas.DataFrame,
        "target_train_dataset": pandas.DataFrame,
        "target_test_dataset": pandas.DataFrame,
    }

    def __init__(
        self,
        node_id: str,
        test_size: float = None,
        train_size: float = None,
        random_state: int = None,
        shuffle: bool = True,
    ):
        super().__init__(node_id)

        # (name, declared type, value) for every option forwarded to sklearn.
        # The order is kept so the resulting keyword order is unchanged.
        option_specs = (
            ("test_size", float, test_size),
            ("train_size", float, train_size),
            ("random_state", int, random_state),
            ("shuffle", bool, shuffle),
        )
        self.parameters = Parameters(
            **{
                name: KeyValueParameter(name, kind, value)
                for name, kind, value in option_specs
            }
        )

    def execute(self):
        """Split samples and targets together and store the four parts."""
        samples = self.sample_dataset
        targets = self.target_dataset
        split_options = self.parameters.get_dict()

        # train_test_split yields (train, test) pairs in the order the
        # datasets were passed: samples first, then targets.
        pieces = train_test_split(samples, targets, **split_options)
        samples_train, samples_test, targets_train, targets_test = pieces

        self.sample_train_dataset = samples_train
        self.sample_test_dataset = samples_test
        self.target_train_dataset = targets_train
        self.target_test_dataset = targets_test
class DaviesBouldinScore(SklearnFunction):
    """Node that computes the Davies-Bouldin score of a clustering.

    The computation is delegated to 'sklearn.metrics.davies_bouldin_score'.

    The score is defined as the average similarity measure of each cluster
    with its most similar cluster, where similarity is the ratio of
    within-cluster distances to between-cluster distances. Thus, clusters
    which are farther apart and less dispersed will result in a better score.

    The minimum score is zero, with lower values indicating better clustering.

    Input
    -----
    samples_dataset : pandas.DataFrame
        The dataset containing the samples.
    labels : pandas.DataFrame
        The dataset containing the target labels.

    Output
    ------
    score : float
        The Davies-Bouldin score value.

    Parameters
    ----------
    node_id : str
        Id of the node.
    """

    _input_vars = {
        "samples_dataset": pandas.DataFrame,
        "labels": pandas.DataFrame,
    }

    _output_vars = {"score": float}

    def __init__(self, node_id: str):
        super().__init__(node_id)

    def execute(self):
        """Compute the score from the input samples and labels."""
        # NOTE(review): labels is declared as a DataFrame and passed through
        # unchanged; presumably it holds one label column - confirm with callers.
        samples = self.samples_dataset
        cluster_labels = self.labels
        self.score = davies_bouldin_score(samples, cluster_labels)

    @classmethod
    def _get_tags(cls):
        """Return the tags marking this node as an sklearn metrics node."""
        node_tags = Tags(LibTag.SKLEARN, TypeTag.METRICS)
        return node_tags