Source code for finmlkit.feature.base

from abc import ABC, abstractmethod
from typing import Union, Optional, Sequence, Callable
import pandas as pd
from finmlkit.utils.log import get_logger
import numpy as np
from numpy.typing import NDArray

logger = get_logger(__name__)


[docs] class BaseTransform(ABC): requires: list[str] # input column names produces: list[str] # output column name _output_name: Union[str, list[str]] def __init__(self, input_cols: Union[Sequence[str], str], output_cols: Union[Sequence[str], str]): assert isinstance(input_cols, (str, tuple, list)), f"Input columns must be a string or a sequence of strings. Got {type(input_cols)}" assert isinstance(output_cols, (str, tuple, list)), f"Output columns must be a string or a sequence of strings. Got {type(output_cols)}" self.requires = [input_cols] if isinstance(input_cols, str) else list(input_cols) self.produces = [output_cols] if isinstance(output_cols, str) else list(output_cols) # --- public API --------------------------------------------------------- @abstractmethod def __call__(self, x: pd.DataFrame, *, backend="nb") -> Union[pd.Series, tuple[pd.Series, ...]]: """ Apply the transform to the input data. :param x: DataFrame or Series to transform :param backend: Backend to use for the transform. Can be "pd" or "nb". Default is "nb". :return: """ pass @abstractmethod def _validate_input(self, x: pd.DataFrame) -> bool: """ Check if the input columns are present in the input DataFrame. This method is called before applying the transform. :param x: DataFrame to validate :return: True if the input is valid """ pass @property @abstractmethod def output_name(self) -> Union[str, list[str]]: """ Get the output names of the transform. This is used to determine the output column names in the DataFrame. Used by prepare_output_nb to create the output Series. :return: Output name or list of output names """ pass
[docs] class CoreTransform(BaseTransform, ABC): def __init__(self, input_cols: Union[Sequence[str], str], output_cols: Union[Sequence[str], str]): super().__init__(input_cols, output_cols) # --- public API --------------------------------------------------------- def __call__(self, x: pd.DataFrame, *, backend="nb") -> Union[pd.Series, tuple[pd.Series, ...]]: """ Apply the transform to the input data. :param x: DataFrame or Series to transform :param backend: Backend to use for the transform. Can be "pd" or "nb". Default is "nb". :return: """ self._validate_input(x) if backend == "pd": return self._pd(x) elif backend == "nb": return self._nb(x) raise ValueError(f"Unknown backend {backend!r}") @staticmethod def _check_datetime_index(x: pd.DataFrame) -> bool: """ Helper function to check if the input DataFrame has a datetime index. This will be used for time based features. :param x: DataFrame :return: True if the index is a datetime index """ if isinstance(x, pd.DataFrame): if not pd.api.types.is_datetime64_any_dtype(x.index): raise ValueError("Input DataFrame must have a datetime index for time-based features.") else: raise TypeError("Input must be a pandas DataFrame") return True def _get_timestamps(self, x: pd.DataFrame) -> NDArray[np.int64]: """ Helper function the get timestamps nanoseconds timestamp from the input DataFrame. :param x: DataFrame to get timestamps from :return: numpy array of timestamps in nanoseconds """ self._check_datetime_index(x) return x.index.values.astype(np.int64) # --- to be implemented by children -------------------------------------- @abstractmethod def _prepare_input_nb(self, x: pd.DataFrame) -> Union[dict[str, NDArray], NDArray]: """ Prepare array inputs for numba functions. :param x: DataFrame or Series to transform :return: Dict of input data for DataFrame or array for Series """ pass @abstractmethod def _prepare_output_nb(self, idx: pd.Index, y: Union[NDArray, tuple[NDArray]]) -> Union[pd.Series, tuple[pd.Series, ...]]: """ Prepare the output data for numba functions. :param idx: index of the original DataFrame :param y: Output data from the transform :return: Series or tuple of Series with the same index as the input data """ pass @abstractmethod def _pd(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.Series, tuple[pd.Series]]: """ Transform the input data using pandas. For fast prototyping :param x: DataFrame or Series to transform """ pass @abstractmethod def _nb(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.Series, tuple[pd.Series]]: # Fall back to pandas if not overridden pass
[docs] class SISOTransform(CoreTransform, ABC): """ Implement a single input, single output transform on a DataFrame. """ def __init__(self, input_col: str, output_col: str): super().__init__(input_col, output_col) def _validate_input(self, x: pd.DataFrame) -> bool: if not isinstance(x, pd.DataFrame): raise TypeError("Input must be a pandas DataFrame") if self.requires[0] not in x.columns: raise ValueError(f"Input column {self.requires[0]} not found in DataFrame") return True def _prepare_input_nb(self, x: pd.DataFrame) -> NDArray: """ Prepare the input data for numba functions. :param x: DataFrame to transform :return: Numpy array of the input column """ return x[self.requires[0]].values @property def output_name(self) -> str: """ Get the output name of the transform. This is used to determine the output column name in the DataFrame. :return: Output name """ return f"{self.requires[0]}_{self.produces[0]}" def _prepare_output_nb(self, idx: pd.Index, y: NDArray) -> pd.Series: """ Prepare the output data for numba functions. :param idx: index of the original DataFrame :param y: Output data from the transform :return: Series with the same index as the input data """ return pd.Series(y, index=idx, name=self.output_name)
[docs] class MISOTransform(CoreTransform, ABC): """ Implement a multiple input, single output transform on a DataFrame. """ def __init__(self, input_cols: Sequence[str], output_col: str): super().__init__(input_cols, output_col) def _validate_input(self, x: pd.DataFrame) -> bool: if not isinstance(x, pd.DataFrame): raise TypeError("Input must be a pandas DataFrame") missing_cols = [col for col in self.requires if col not in x.columns] if missing_cols: raise ValueError(f"Input columns {missing_cols} not found in DataFrame") return True def _prepare_input_nb(self, x: pd.DataFrame) -> dict[str, NDArray]: """ Prepare the input data for numba functions. :param x: DataFrame to transform :return: Dict of input data for each column """ return {col: x[col].values for col in self.requires} @property def output_name(self) -> str: """ For MISO transforms, the output name is the same as the produces. :return: Output name """ return self.produces[0] def _prepare_output_nb(self, idx: pd.Index, y: NDArray) -> pd.Series: """ Prepare the output data for numba functions. :param idx: index of the original DataFrame :param y: Output data from the transform :return: Series with the same index as the input data """ return pd.Series(y, index=idx, name=self.output_name)
[docs] class SIMOTransform(CoreTransform, ABC): """ Implement a single input, multiple output transform on a DataFrame. """ def __init__(self, input_col: str, output_cols: Sequence[str]): super().__init__(input_col, output_cols) def _validate_input(self, x: pd.DataFrame) -> bool: if not isinstance(x, pd.DataFrame): raise TypeError("Input must be a pandas DataFrame") if self.requires[0] not in x.columns: raise ValueError(f"Input column {self.requires[0]} not found in DataFrame") return True def _prepare_input_nb(self, x: pd.DataFrame) -> NDArray: """ Prepare the input data for numba functions. :param x: DataFrame to transform :return: Numpy array of the input column """ return x[self.requires[0]].values @property def output_name(self) -> list[str]: """ Get the output names of the transform. For SIMO transforms, the output names are derived from the input column name. :return: List of output names """ return [f"{self.requires[0]}_{col}" for col in self.produces] def _prepare_output_nb(self, idx: pd.Index, y: tuple[NDArray, ...]) -> tuple[pd.Series, ...]: """ Prepare the output data for numba functions. :param idx: index of the original DataFrame :param y: Output data from the transform :return: Tuple of Series with the same index as the input data """ if len(y) != len(self.produces): raise ValueError(f"Expected {len(self.produces)} outputs, got {len(y)}") return tuple(pd.Series(y_i, index=idx, name=name) for y_i, name in zip(y, self.output_name))
[docs] class MIMOTransform(CoreTransform, ABC): """ Implement a multiple input, multiple output transform on a DataFrame. """ def __init__(self, input_cols: Sequence[str], output_cols: Sequence[str]): super().__init__(input_cols, output_cols) def _validate_input(self, x: pd.DataFrame) -> bool: if not isinstance(x, pd.DataFrame): raise TypeError("Input must be a pandas DataFrame") missing_cols = [col for col in self.requires if col not in x.columns] if missing_cols: raise ValueError(f"Input columns {missing_cols} not found in DataFrame") return True def _prepare_input_nb(self, x: pd.DataFrame) -> dict[str, NDArray]: """ Prepare the input data for numba functions. :param x: DataFrame to transform :return: Dict of input data for each column """ return {col: x[col].values for col in self.requires} @property def output_name(self) -> list[str]: """ Get the output names of the transform. :return: List of output names """ return self.produces def _prepare_output_nb(self, idx: pd.Index, y: tuple[NDArray]) -> tuple[pd.Series, ...]: """ Prepare the output data for numba functions. :param idx: index of the original DataFrame :param y: Output data from the transform :return: Tuple of Series with the same index as the input data """ if len(y) != len(self.produces): raise ValueError(f"Expected {len(self.produces)} outputs, got {len(y)}") return tuple(pd.Series(y_i, index=idx, name=name) for y_i, name in zip(y, self.output_name))
[docs] class BinaryOpTransform(BaseTransform): """Transform that applies binary operations between two transforms""" def __init__(self, left: BaseTransform, right: BaseTransform, op_name: str, op_func: Callable): # Combine all input requirements from both transforms combined_inputs = list(set(left.requires + right.requires)) output_name = f"{op_name}({left.output_name},{right.output_name})" super().__init__(combined_inputs, output_name) self.left = left self.right = right self.op_func = op_func def _validate_input(self, x): # binary operations are valid for SISO and MISO transforms if not isinstance(self.left, (SISOTransform, MISOTransform)): raise TypeError(f"Left transform must be SISO or MISO for binary OP, got {type(self.left)}") if not isinstance(self.right, (SISOTransform, MISOTransform)): raise TypeError(f"Right transform must be SISO or MISO for binary OP, got {type(self.right)}") return self.left._validate_input(x) and self.right._validate_input(x) @property def output_name(self) -> str|list[str]: if isinstance(self.produces, list) and len(self.produces) == 1: return self.produces[0] return self.produces def __call__(self, x, *, backend="nb"): left_result = self.left(x, backend=backend) right_result = self.right(x, backend=backend) result = self.op_func(left_result, right_result) result.name = self.output_name return result
[docs] class ConstantOpTransform(BaseTransform): """Transform that applies operations between a transform and a constant""" def __init__(self, transform: BaseTransform, constant: float, op_name: str, op_func: Callable): super().__init__(transform.requires, f"{op_name}({transform.output_name},{constant})") self.transform = transform self.constant = constant self.op_func = op_func def _validate_input(self, x): return self.transform._validate_input(x) @property def output_name(self) -> str|list[str]: if isinstance(self.produces, list) and len(self.produces) == 1: return self.produces[0] return self.produces def __call__(self, x, *, backend="nb"): result = self.transform(x, backend=backend) result = self.op_func(result, self.constant) result.name = self.output_name return result
[docs] class UnaryOpTransform(BaseTransform): """Transform that applies unary operations to a transform""" def __init__(self, transform: BaseTransform, op_name: str, op_func: Callable): super().__init__(transform.requires, f"{op_name}({transform.output_name})") self.transform = transform self.op_func = op_func def _validate_input(self, x): return self.transform._validate_input(x) @property def output_name(self) -> str|list[str]: if isinstance(self.produces, list) and len(self.produces) == 1: return self.produces[0] return self.produces def __call__(self, x, *, backend="nb"): result = self.transform(x, backend=backend) result = self.op_func(result) result.name = self.output_name return result
[docs] class MinMaxOpTransform(BaseTransform): """Transform that applies min or max operations between two transforms""" def __init__(self, left: BaseTransform, right: BaseTransform, op_name: str, op_func: Callable): # Combine all input requirements from both transforms combined_inputs = list(set(left.requires + right.requires)) output_name = f"{op_name}({left.output_name},{right.output_name})" super().__init__(combined_inputs, output_name) self.left = left self.right = right self.op_func = op_func def _validate_input(self, x): # min/max operations are valid for SISO and MISO transforms if not isinstance(self.left, (SISOTransform, MISOTransform)): raise TypeError(f"Left transform must be SISO or MISO for {self.produces[0]} OP, got {type(self.left)}") if not isinstance(self.right, (SISOTransform, MISOTransform)): raise TypeError(f"Right transform must be SISO or MISO for {self.produces[0]} OP, got {type(self.right)}") return self.left._validate_input(x) and self.right._validate_input(x) @property def output_name(self) -> str|list[str]: if isinstance(self.produces, list) and len(self.produces) == 1: return self.produces[0] return self.produces def __call__(self, x, *, backend="nb"): left_result = self.left(x, backend=backend) right_result = self.right(x, backend=backend) result = self.op_func(left_result, right_result) result.name = self.output_name return result