Source code for bamt.preprocess.discretization

from copy import copy
from typing import Tuple

import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import KBinsDiscretizer


def get_nodes_sign(data: pd.DataFrame) -> dict:
    """Determine the sign of every continuous node.

    A continuous column is labelled ``"neg"`` when it holds at least one
    negative value and ``"pos"`` otherwise (zeros and NaNs do not count as
    negative). Columns that ``get_nodes_type`` does not report as ``"cont"``
    are left out of the result.

    Args:
        data (pd.DataFrame): input dataset

    Returns:
        dict: output dictionary where 'key' - node name and
            'value' - sign of data (``"neg"`` or ``"pos"``)
    """
    nodes_types = get_nodes_type(data)
    columns_sign = {}
    for c in data.columns.to_list():
        # get_nodes_type skips dtypes it does not recognise, so a plain
        # ``nodes_types[c]`` lookup would raise KeyError for such columns.
        if nodes_types.get(c) != "cont":
            continue
        columns_sign[c] = "neg" if (data[c] < 0).any() else "pos"
    return columns_sign
def get_nodes_type(data: pd.DataFrame) -> dict:
    """Classify every column as a discrete or a continuous node.

    ``"cont"`` is assigned to float32/float64 columns; ``"disc"`` is assigned
    to int32/int64, int8, string/object and boolean columns. Columns with any
    other dtype are omitted from the result.

    Args:
        data (pd.DataFrame): input dataset

    Returns:
        dict: output dictionary where 'key' - node name and
            'value' - node type (``"disc"`` or ``"cont"``)
    """
    column_type = {}
    for c in data.columns.to_list():
        dtype = data[c].dtypes
        if dtype == "float64" or dtype == "float32":
            column_type[c] = "cont"
        elif dtype == "int64" or dtype == "int32":
            column_type[c] = "disc"
        elif dtype == "str" or dtype == "O" or dtype == "b":
            # NOTE: as a NumPy type string "b" means int8, not bool; the
            # comparison is kept so int8 columns stay classified as before.
            column_type[c] = "disc"
        elif pd.api.types.is_bool_dtype(dtype):
            # Boolean columns were never matched by the "b" comparison above.
            column_type[c] = "disc"
    return column_type
def discretization(
    data: pd.DataFrame, method: str, columns: list, bins: int = 5
) -> Tuple[pd.DataFrame, KBinsDiscretizer]:
    """Discretization of continuous parameters

    Rows containing missing values (in any column) are dropped and the index
    is reset before fitting; the input frame itself is not modified.

    Args:
        data (pd.DataFrame): input dataset
        method (str): discretization approach
            (equal_intervals, equal_frequency, kmeans)
        columns (list): name of columns for discretization
        bins (int, optional): number of bins. Defaults to 5.

    Returns:
        pd.DataFrame: output dataset with discretized parameters
        KBinsDiscretizer: fitted exemplar of discretization class

    Raises:
        ValueError: if ``method`` is not one of the supported approaches.
    """
    strategy_dict = {
        "equal_intervals": "uniform",
        "equal_frequency": "quantile",
        "kmeans": "kmeans",
    }
    # Validate first so an unsupported method fails before any data is touched.
    if method not in strategy_dict:
        raise ValueError("This discretization method is not supported")

    # dropna() and reset_index() both return new objects, so the caller's
    # frame is left untouched without an extra copy.
    d_data = data.dropna().reset_index(drop=True)

    est = KBinsDiscretizer(
        n_bins=bins, encode="ordinal", strategy=strategy_dict[method]
    )
    data_discrete = est.fit_transform(d_data.loc[:, columns].values)
    # Ordinal encoding yields float bin indices; store them as integers.
    d_data[columns] = data_discrete.astype("int")
    return d_data, est
def label_encoding(data, columns):
    """Label-encode the given columns on a copy of ``data``.

    A separate ``LabelEncoder`` is fitted for each column.

    Args:
        data: input dataset
        columns: names of the columns to encode

    Returns:
        tuple: the encoded copy of ``data`` and a dictionary that maps each
            encoded column name to its ``{class label: integer code}`` mapping
    """
    encoded = copy(data)
    codes_by_column = {}
    for name in columns:
        encoder = preprocessing.LabelEncoder()
        encoded[name] = encoder.fit_transform(encoded[name].values)
        # The position of a label in ``classes_`` is the code assigned to it.
        codes_by_column[name] = {
            label: code for code, label in enumerate(encoder.classes_)
        }
    return encoded, codes_by_column
def onehot_encoding(data, columns):
    """One-hot encode the given columns with ``pd.get_dummies``.

    Args:
        data: input dataset
        columns: names of the columns to encode

    Returns:
        tuple: the encoded dataset and ``None`` (no code dictionary is
            produced for one-hot encoding)
    """
    encoded = pd.get_dummies(data=data, columns=columns)
    return encoded, None
def code_categories(
    data: pd.DataFrame, method: str, columns: list
) -> Tuple[pd.DataFrame, dict]:
    """Encoding categorical parameters

    Rows containing missing values (in any column) are dropped and the index
    is reset before encoding; the input frame itself is not modified.

    Args:
        data (pd.DataFrame): input dataset
        method (str): method of encoding (label or onehot)
        columns (list): name of categorical columns

    Returns:
        pd.DataFrame: output dataset with encoded parameters
        dict: dictionary with values and codes (``None`` for onehot)

    Raises:
        ValueError: if ``method`` is neither ``"label"`` nor ``"onehot"``.
    """
    # Resolve the encoder first so an unsupported method fails before any
    # rows are dropped.
    if method == "label":
        encode = label_encoding
    elif method == "onehot":
        encode = onehot_encoding
    else:
        raise ValueError("This encoding method is not supported")

    cleaned = data.dropna().reset_index(drop=True)
    d_data, encoder_dict = encode(cleaned, columns)
    return d_data, encoder_dict
def inverse_discretization(
    data: pd.DataFrame, columns: list, discretizer: KBinsDiscretizer
) -> pd.DataFrame:
    """Inverse discretization for numeric params

    Works on a copy of ``data``; the selected columns are replaced with the
    output of ``discretizer.inverse_transform``.

    Args:
        data (pd.DataFrame): input dataset with discrete values
        columns (list): columns for inverse_discretization
        discretizer (KBinsDiscretizer): fitted exemplar of discretization class

    Returns:
        pd.DataFrame: output dataset with continuous values
    """
    restored = copy(data)
    bin_codes = restored[columns].values
    restored[columns] = discretizer.inverse_transform(bin_codes)
    return restored
def decode(data: pd.DataFrame, columns: list, encoder_dict: dict) -> pd.DataFrame:
    """Decoding categorical params to initial labels

    The decoding is applied to a copy, so the input frame is left unchanged.

    Args:
        data (pd.DataFrame): input dataset with encoded params
        columns (list): columns for decoding
        encoder_dict (dict): dictionary with values and codes, keyed by
            column name; each value maps an original label to its code

    Returns:
        pd.DataFrame: output dataset with decoded params

    Raises:
        KeyError: if a column is missing from ``encoder_dict`` or a value in
            the column has no matching code.
    """
    decoded = data.copy()
    for column in columns:
        dict_parameter = encoder_dict[column]
        # Invert {label: code} into {code: label} for the reverse lookup.
        inv_map = {v: k for k, v in dict_parameter.items()}
        # Index the dict; calling it (``inv_map(x)``) raises TypeError.
        decoded[column] = decoded[column].apply(lambda code: inv_map[code])
    return decoded