# Source code for rain.nodes.pysad.transformer

"""
 Copyright (C) 2023 Università degli Studi di Camerino and Sigma S.p.A.
 Authors: Alessandro Antinori, Rosario Capparuccia, Riccardo Coltrinari, Flavio Corradini, Marco Piangerelli, Barbara Re, Marco Scarpetta

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as
 published by the Free Software Foundation, either version 3 of the
 License, or (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """

import pandas as pd

from rain import Tags, LibTag, TypeTag
from rain.core.parameter import Parameters, KeyValueParameter
from rain.nodes.pysad.node_structure import PySadTransformer, PySadNode
from pysad.transform.preprocessing import InstanceUnitNormScaler as IUNScaler
from pysad.transform.probability_calibration import ConformalProbabilityCalibrator as Cpc, GaussianTailProbabilityCalibrator as Gtpc


class InstanceUnitNormScaler(PySadTransformer):
    """A scaler that makes the instance feature vector's norm equal to 1, i.e., the unit vector.

    Input
    -----
    dataset : pd.DataFrame
        A Pandas DataFrame.

    Output
    ------
    dataset : pd.DataFrame
        A Pandas DataFrame holding the scaled values under the same column labels as the input.

    Parameters
    ----------
    node_id : str
        Id of the node.
    pow : float, default=2
        The power, for which the norm is calculated. pow=2 is equivalent to the euclidean distance.
    """

    def __init__(self, node_id: str, pow: float = 2):
        super().__init__(node_id)

        # `pow` is a numeric norm order, so it is declared as a float parameter
        # (it was previously declared as `str`, contradicting the documentation).
        self.parameters = Parameters(
            pow=KeyValueParameter("pow", float, pow)
        )
        # The registered parameters are forwarded as keyword arguments to the PySAD scaler.
        self.transformer = IUNScaler(**self.parameters.get_dict())

    def execute(self):
        """Scale the input dataset and store the result back in ``self.dataset``.

        The transformer works on the raw numpy values, so the column labels are
        captured first and re-applied to the resulting DataFrame. Only the columns
        are passed to the new DataFrame; the original row index is not carried over.
        """
        column_labels = self.dataset.columns
        scaled_values = self.transformer.fit_transform(self.dataset.to_numpy())
        self.dataset = pd.DataFrame(scaled_values, columns=column_labels)


class ConformalProbabilityCalibrator(PySadNode):
    """This class provides an interface to convert the scores into probabilities through conformal prediction.

    Input
    -----
    scores : pd.DataFrame
        A Pandas DataFrame containing the scores.

    Output
    ------
    scores : pd.DataFrame
        The result of calibrating the input scores (the value returned by the
        calibrator's ``fit_transform``).

    Parameters
    ----------
    node_id : str
        Id of the node.
    windowed : bool, default=True
        Whether the probability calibrator is windowed, so that it forgets scores that are older
        than `window_size`.
    window_size : int, default=300
        The size of window for running average and std.
    """

    _input_vars = {"scores": pd.DataFrame}
    _output_vars = {"scores": pd.DataFrame}

    def __init__(self, node_id: str, windowed: bool = True, window_size: int = 300):
        super().__init__(node_id)

        self.parameters = Parameters(
            windowed=KeyValueParameter("windowed", bool, windowed),
            window_size=KeyValueParameter("window_size", int, window_size)
        )
        # The registered parameters are forwarded as keyword arguments to the PySAD calibrator.
        self.calibrator = Cpc(**self.parameters.get_dict())

    def execute(self):
        """Fit the calibrator on ``self.scores`` and replace them with the calibrated output."""
        self.scores = self.calibrator.fit_transform(self.scores)

    @classmethod
    def _get_tags(cls):
        """Return the tags identifying this node as a PySAD transformer."""
        return Tags(LibTag.PYSAD, TypeTag.TRANSFORMER)


class GaussianTailProbabilityCalibrator(PySadNode):
    """This class provides an interface to convert the scores into probabilities via Q-function, i.e., the tail
    function of Gaussian distribution.

    Input
    -----
    scores : pd.DataFrame
        A Pandas DataFrame containing the scores.

    Output
    ------
    scores : pd.DataFrame
        The result of calibrating the input scores (the value returned by the
        calibrator's ``fit_transform``).

    Parameters
    ----------
    node_id : str
        Id of the node.
    running_statistics : bool, default=True
        Whether to calculate the mean and variance through running window.
    window_size : int, default=300
        The size of window for running average and std. Ignored if `running_statistics` parameter is False.
    """

    _input_vars = {"scores": pd.DataFrame}
    _output_vars = {"scores": pd.DataFrame}

    def __init__(self, node_id: str, running_statistics: bool = True, window_size: int = 300):
        super().__init__(node_id)

        # The parameter name must match the keyword accepted by the Gaussian tail
        # calibrator ("running_statistics"); it was previously registered as "windowed",
        # which is the keyword of the conformal calibrator.
        self.parameters = Parameters(
            running_statistics=KeyValueParameter("running_statistics", bool, running_statistics),
            window_size=KeyValueParameter("window_size", int, window_size)
        )
        # Build the Gaussian tail calibrator (Gtpc). The conformal calibrator (Cpc)
        # was instantiated here before, so this node silently performed conformal
        # calibration instead of the documented Gaussian tail calibration.
        self.calibrator = Gtpc(**self.parameters.get_dict())

    def execute(self):
        """Fit the calibrator on ``self.scores`` and replace them with the calibrated output."""
        self.scores = self.calibrator.fit_transform(self.scores)

    @classmethod
    def _get_tags(cls):
        """Return the tags identifying this node as a PySAD transformer."""
        return Tags(LibTag.PYSAD, TypeTag.TRANSFORMER)