Source code for waveletml.helpers.data_scaler

#!/usr/bin/env python
# Created by "Thieu" at 03:48, 19/05/2025 ----------%
#       Email: nguyenthieu2102@gmail.com            %                                                    
#       Github: https://github.com/thieu1995        %                         
# --------------------------------------------------%

import numpy as np
from scipy.stats import boxcox, yeojohnson
from scipy.special import inv_boxcox
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler


class OneHotEncoder:
    """
    A simple implementation of one-hot encoding for 1D categorical data.

    Attributes:
        categories_ (np.ndarray): Sorted array of unique categories fitted from the input data.
            ``None`` until :meth:`fit` has been called.
    """

    def __init__(self):
        """Initialize the encoder with no categories."""
        self.categories_ = None

    def fit(self, X):
        """
        Fit the encoder to the unique categories in X.

        Args:
            X (array-like): Categorical values; the input is flattened to 1D.

        Returns:
            self: Fitted OneHotEncoder instance.
        """
        X = np.asarray(X).ravel()
        self.categories_ = np.unique(X)
        return self

    def transform(self, X):
        """
        Transform input data into one-hot encoded format.

        Args:
            X (array-like): Categorical values; the input is flattened to 1D.

        Returns:
            np.ndarray: Integer one-hot array of shape (n_samples, n_categories).

        Raises:
            ValueError: If the encoder has not been fitted or unknown category is found.
        """
        if self.categories_ is None:
            raise ValueError("The encoder has not been fitted yet.")
        X = np.asarray(X).ravel()
        # Build the category -> column map once so each sample costs one hash
        # lookup instead of a full scan over ``categories_``.
        column_of = {category: col for col, category in enumerate(self.categories_)}
        one_hot = np.zeros((X.shape[0], len(self.categories_)), dtype=int)
        for row, val in enumerate(X):
            col = column_of.get(val)
            if col is None:
                raise ValueError(f"Unknown category encountered during transform: {val}")
            one_hot[row, col] = 1
        return one_hot

    def fit_transform(self, X):
        """
        Fit the encoder to X and transform X.

        Args:
            X (array-like): Categorical values; the input is flattened to 1D.

        Returns:
            np.ndarray: One-hot encoded array of shape (n_samples, n_categories).
        """
        return self.fit(X).transform(X)

    def inverse_transform(self, one_hot):
        """
        Convert one-hot encoded data back to original categories.

        Each row is decoded to the category at the position of its largest
        value (``argmax``), so rows do not have to be strictly 0/1.

        Args:
            one_hot (array-like): 2D one-hot encoded data of shape
                (n_samples, n_categories). Nested lists are accepted.

        Returns:
            np.ndarray: 1D array of original categorical values.

        Raises:
            ValueError: If the encoder has not been fitted, or if the input is
                not 2D with one column per fitted category.
        """
        if self.categories_ is None:
            raise ValueError("The encoder has not been fitted yet.")
        # Coerce first so plain Python lists (which have no ``.shape``) work too.
        one_hot = np.asarray(one_hot)
        if one_hot.ndim != 2 or one_hot.shape[1] != len(self.categories_):
            raise ValueError("The shape of the input does not match the number of categories.")
        return self.categories_[np.argmax(one_hot, axis=1)]
class LabelEncoder:
    """
    Encode categorical labels as integer indices and decode them back.

    This class maps unique categorical labels to integers from 0 to n_classes - 1,
    following the sorted order returned by ``np.unique``.
    """

    def __init__(self):
        """
        Initialize the label encoder.
        """
        self.unique_labels = None
        self.label_to_index = {}

    def fit(self, y):
        """
        Fit the encoder by finding unique labels in the input data.

        Parameters
        ----------
        y : array-like
            Input labels; the input is flattened to 1D.

        Returns
        -------
        self : LabelEncoder
            Fitted LabelEncoder instance.
        """
        y = np.asarray(y).ravel()
        self.unique_labels = np.unique(y)
        self.label_to_index = {label: i for i, label in enumerate(self.unique_labels)}
        return self

    def transform(self, y):
        """
        Transform labels to integer indices.

        Parameters
        ----------
        y : array-like
            Labels to encode; the input is flattened to 1D.

        Returns
        -------
        encoded_labels : np.ndarray
            Encoded integer labels (integer dtype, also for empty input).

        Raises
        ------
        ValueError
            If the encoder has not been fitted or unknown labels are found.
        """
        if self.unique_labels is None:
            raise ValueError("Label encoder has not been fit yet.")
        y = np.asarray(y).ravel()
        encoded = []
        for label in y:
            if label not in self.label_to_index:
                raise ValueError(f"Unknown label: {label}")
            encoded.append(self.label_to_index[label])
        # Explicit dtype: an empty list would otherwise become a float64 array.
        return np.array(encoded, dtype=int)

    def fit_transform(self, y):
        """
        Fit the encoder and transform labels in one step.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Input labels.

        Returns
        -------
        np.ndarray
            Encoded integer labels.
        """
        return self.fit(y).transform(y)

    def inverse_transform(self, y):
        """
        Transform integer indices back to original labels.

        Indices outside ``[0, n_classes)`` are not rejected; they are replaced
        by the string ``"unknown"`` in the output.

        Parameters
        ----------
        y : array-like of int
            Encoded integer labels; the input is flattened to 1D.

        Returns
        -------
        original_labels : np.ndarray
            Original labels, with ``"unknown"`` at out-of-range positions.

        Raises
        ------
        ValueError
            If the encoder has not been fitted.
        """
        if self.unique_labels is None:
            raise ValueError("Label encoder has not been fit yet.")
        y = np.asarray(y).ravel()
        n_labels = len(self.unique_labels)
        return np.array([self.unique_labels[i] if 0 <= i < n_labels else "unknown" for i in y])
class ObjectiveScaler:
    """
    For label scaler in classification (binary and multiple classification)

    The conversion applied depends on ``obj_name``:

    - ``"sigmoid"`` / ``"self"``: labels are passed through unchanged.
    - ``"hinge"``: label 0 is mapped to -1 (and back).
    - ``"softmax"``: labels are one-hot encoded with ``ohe_scaler`` and decoded
      with a row-wise ``argmax``.
    """

    def __init__(self, obj_name="sigmoid", ohe_scaler=None):
        """
        obj_name: Name of the objective ("sigmoid", "self", "hinge" or "softmax")
        ohe_scaler: Need to be an instance of One-Hot-Encoder for softmax scaler (multiple classification problem)
        """
        self.obj_name = obj_name
        self.ohe_scaler = ohe_scaler

    def transform(self, data):
        """
        Convert labels into the representation used by the objective.

        Parameters
        ----------
        data : array-like
            Class labels.

        Returns
        -------
        The input itself for "sigmoid"/"self"; a squeezed copy with 0 replaced
        by -1 for "hinge"; the result of ``ohe_scaler.fit_transform`` on the
        labels reshaped to a column for "softmax"; ``None`` for any other
        objective name.
        """
        if self.obj_name in ("sigmoid", "self"):
            return data
        if self.obj_name == "hinge":
            # np.array copies, so the caller's data is never modified.
            data = np.squeeze(np.array(data))
            # Boolean-mask assignment also works when squeeze produced a 0-d array.
            data[data == 0] = -1
            return data
        if self.obj_name == "softmax":
            # The encoder is (re)fitted on every call.
            return self.ohe_scaler.fit_transform(np.reshape(data, (-1, 1)))
        # Unrecognised objective names yield None, as before.
        return None

    def inverse_transform(self, data):
        """
        Convert model outputs back into class labels.

        Parameters
        ----------
        data : array-like
            Model outputs. For "softmax" either a single row of scores or a
            2D array with one row per sample.

        Returns
        -------
        For "sigmoid": outputs rounded to the nearest integer.
        For "hinge": outputs rounded up (``ceil``) with -1 mapped to 0.
        For "softmax": 1D array with the index of the largest score per row.
        For any other objective name: ``data`` unchanged.
        """
        if self.obj_name == "sigmoid":
            data = np.squeeze(np.array(data))
            data = np.rint(data).astype(int)
        elif self.obj_name == "hinge":
            data = np.squeeze(np.array(data))
            data = np.ceil(data).astype(int)
            data[data == -1] = 0
        elif self.obj_name == "softmax":
            # squeeze collapses a single sample (1, k) to (k,); restore the
            # sample axis so the row-wise argmax below does not fail.
            data = np.atleast_2d(np.squeeze(np.array(data)))
            data = np.argmax(data, axis=1)
        return data
class Log1pScaler(BaseEstimator, TransformerMixin):
    """
    Apply ``log(1 + x)`` (``np.log1p``) to each element of the input data.

    The inverse is ``exp(x) - 1`` (``np.expm1``). This is useful for transforming
    data that may have a long tail distribution.
    """

    def fit(self, X, y=None):
        # Nothing is learned from the data; the scaler is stateless.
        return self

    def transform(self, X):
        # Element-wise log(1 + x).
        transformed = np.log1p(X)
        return transformed

    def inverse_transform(self, X):
        # Element-wise exp(x) - 1, which undoes log1p.
        restored = np.expm1(X)
        return restored
class LogeScaler(BaseEstimator, TransformerMixin):
    """
    Apply the natural logarithm (``np.log``) to each element of the input data.

    The inverse is the exponential function (``np.exp``). This is useful for
    transforming data that may have a long tail distribution.
    """

    def fit(self, X, y=None):
        # Nothing is learned from the data; the scaler is stateless.
        return self

    def transform(self, X):
        # Element-wise natural logarithm.
        transformed = np.log(X)
        return transformed

    def inverse_transform(self, X):
        # Element-wise exponential, which undoes the logarithm.
        restored = np.exp(X)
        return restored
class SqrtScaler(BaseEstimator, TransformerMixin):
    """
    Apply the square root (``np.sqrt``) to each element of the input data.

    The inverse squares each element. This is useful for transforming data
    that may have a long tail distribution.
    """

    def fit(self, X, y=None):
        # Nothing is learned from the data; the scaler is stateless.
        return self

    def transform(self, X):
        # Element-wise square root.
        rooted = np.sqrt(X)
        return rooted

    def inverse_transform(self, X):
        # Squaring undoes the square root.
        squared = X ** 2
        return squared
class BoxCoxScaler(BaseEstimator, TransformerMixin):
    """
    Apply the Box-Cox transformation to stabilize variance and make the data
    more normally distributed.

    The Box-Cox transformation is only defined for positive data. When ``lmbda``
    is not supplied it is estimated by ``scipy.stats.boxcox`` during ``fit``
    and stored back on ``self.lmbda``.
    """

    def __init__(self, lmbda=None):
        self.lmbda = lmbda

    def fit(self, X, y=None):
        # A lambda that is already set (given or previously fitted) is kept.
        if self.lmbda is not None:
            return self
        # With no lmbda argument, boxcox returns (transformed data, fitted lambda);
        # only the lambda is needed here.
        fitted = boxcox(X.flatten())
        self.lmbda = fitted[1]
        return self

    def transform(self, X):
        # Transform the flattened values, then restore the input's shape.
        flat = X.flatten()
        transformed = boxcox(flat, lmbda=self.lmbda)
        return transformed.reshape(X.shape)

    def inverse_transform(self, X):
        # Undo the transformation with the stored lambda.
        restored = inv_boxcox(X, self.lmbda)
        return restored
class YeoJohnsonScaler(BaseEstimator, TransformerMixin):
    """
    Apply the Yeo-Johnson transformation to stabilize variance and make the
    data more normally distributed.

    The Yeo-Johnson transformation can handle both positive and negative data.
    When ``lmbda`` is not supplied it is estimated by ``scipy.stats.yeojohnson``
    during ``fit`` and stored back on ``self.lmbda``.
    """

    def __init__(self, lmbda=None):
        self.lmbda = lmbda

    def fit(self, X, y=None):
        """Estimate lambda from ``X`` unless one is already set; return ``self``."""
        if self.lmbda is None:
            # With no lmbda argument, yeojohnson returns (transformed data, fitted lambda).
            _, self.lmbda = yeojohnson(np.asarray(X).flatten())
        return self

    def transform(self, X):
        """
        Apply the Yeo-Johnson transformation with the stored lambda.

        Returns an array with the same shape as ``X``.
        Raises ValueError if no lambda is available (``fit`` not called and
        no ``lmbda`` given).
        """
        self._check_lmbda()
        X = np.asarray(X)
        # Must be yeojohnson, not boxcox: Box-Cox is undefined for x <= 0.
        X_new = yeojohnson(X.flatten(), lmbda=self.lmbda)
        return X_new.reshape(X.shape)

    def inverse_transform(self, X):
        """
        Invert the Yeo-Johnson transformation with the stored lambda.

        The forward transform preserves sign, so the branch to invert is
        selected by the sign of each transformed value:

        - y >= 0, lmbda != 0:  x = (lmbda * y + 1) ** (1 / lmbda) - 1
        - y >= 0, lmbda == 0:  x = exp(y) - 1
        - y <  0, lmbda != 2:  x = 1 - (1 - (2 - lmbda) * y) ** (1 / (2 - lmbda))
        - y <  0, lmbda == 2:  x = 1 - exp(-y)

        Returns a float array with the same shape as ``X``.
        Raises ValueError if no lambda is available.
        """
        self._check_lmbda()
        X = np.asarray(X, dtype=float)
        lmbda = self.lmbda
        eps = np.spacing(1.0)  # tolerance for the lmbda == 0 / lmbda == 2 special cases
        restored = np.empty_like(X)
        non_negative = X >= 0
        negative = ~non_negative

        if abs(lmbda) < eps:
            restored[non_negative] = np.expm1(X[non_negative])
        else:
            restored[non_negative] = np.power(lmbda * X[non_negative] + 1.0, 1.0 / lmbda) - 1.0

        if abs(lmbda - 2.0) > eps:
            restored[negative] = 1.0 - np.power(1.0 - (2.0 - lmbda) * X[negative], 1.0 / (2.0 - lmbda))
        else:
            restored[negative] = -np.expm1(-X[negative])
        return restored

    def _check_lmbda(self):
        # Guard shared by transform/inverse_transform: both need a concrete lambda.
        if self.lmbda is None:
            raise ValueError("YeoJohnsonScaler has not been fitted yet and no lmbda was provided.")
class SinhArcSinhScaler(BaseEstimator, TransformerMixin):
    """
    Apply the sinh-arc-sinh transformation to increase kurtosis and skewness
    of normal random variable.

    The forward map is ``sinh(delta * arcsinh(x) - epsilon)``. This transformation
    is useful for data that are normally distributed but need to be transformed
    to have higher kurtosis and skewness.
    """
    # https://stats.stackexchange.com/questions/43482/transformation-to-increase-kurtosis-and-skewness-of-normal-r-v

    def __init__(self, epsilon=0.1, delta=1.0):
        self.epsilon = epsilon
        self.delta = delta

    def fit(self, X, y=None):
        # Nothing is learned from the data; epsilon and delta are fixed at construction.
        return self

    def transform(self, X):
        # Scale and shift in arcsinh space, then map back through sinh.
        warped = self.delta * np.arcsinh(X) - self.epsilon
        return np.sinh(warped)

    def inverse_transform(self, X):
        # Undo the shift and scale in arcsinh space, then map back through sinh.
        unwarped = (np.arcsinh(X) + self.epsilon) / self.delta
        return np.sinh(unwarped)