Local
Feature engineering
Compute transformations on exogenous regressors
import numpy as np
import pandas as pd
from nbdev import show_doc
from window_ops.expanding import expanding_mean
from mlforecast.utils import generate_daily_series
Setup
# Setup: build a random exogenous price series for every id, extended 14 days
# past the end of the target series so future prices exist at predict time.
rng = np.random.RandomState(0)  # fixed seed for reproducible prices
series = generate_daily_series(100, equal_ends=True)
# First and last timestamp of each serie. Keep 'unique_id' as the index
# (no as_index=False) so that `r.Index` in the loop below is the serie's id;
# with as_index=False it would be the positional row number and integers would
# be written into the 'unique_id' column. Use the string names 'min'/'max':
# passing the builtin functions to .agg is deprecated in recent pandas.
starts_ends = series.groupby('unique_id')['ds'].agg(['min', 'max'])
prices = []
for r in starts_ends.itertuples():
    # daily dates covering the serie plus a 14-day horizon
    dates = pd.date_range(r.min, r.max + 14 * pd.offsets.Day())
    df = pd.DataFrame({'ds': dates, 'price': rng.rand(dates.size)})
    df['unique_id'] = r.Index
    prices.append(df)
prices = pd.concat(prices)
# second regressor derived from the first
prices['price2'] = prices['price'] * rng.rand(prices.shape[0])
prices.head()
ds | price | unique_id | price2 | |
---|---|---|---|---|
0 | 2000-10-05 | 0.548814 | id_00 | 0.345011 |
1 | 2000-10-06 | 0.715189 | id_00 | 0.445598 |
2 | 2000-10-07 | 0.602763 | id_00 | 0.165147 |
3 | 2000-10-08 | 0.544883 | id_00 | 0.041373 |
4 | 2000-10-09 | 0.423655 | id_00 | 0.391577 |
source
transform_exog
transform_exog (df:Union[pandas.core.frame.DataFrame,polars.dataframe.frame.DataFrame], lags:Optional[Iterable[int]]=None, lag_transforms:Optional[Dict[int,List[Union[Callable,Tuple[Callable,Any]]]]]=None, id_col:str='unique_id', time_col:str='ds', num_threads:int=1)
Compute lag features for dynamic exogenous regressors.
Type | Default | Details | |
---|---|---|---|
df | Union | Dataframe with ids, times and values for the exogenous regressors. | |
lags | Optional | None | Lags of the target to use as features. |
lag_transforms | Optional | None | Mapping of target lags to their transformations. |
id_col | str | unique_id | Column that identifies each serie. |
time_col | str | ds | Column that identifies each timestep, its values can be timestamps or integers. |
num_threads | int | 1 | Number of threads to use when computing the features. |
Returns | Union | Original DataFrame with the computed features |
# Compute, for every dynamic column in `prices` (price and price2):
# lags 1 and 2, plus the expanding mean of lag 1.
exog_lags = [1, 2]
exog_lag_transforms = {1: [expanding_mean]}
transformed = transform_exog(prices, lags=exog_lags, lag_transforms=exog_lag_transforms)
transformed.head()
ds | price | unique_id | price2 | price_lag1 | price_lag2 | price_expanding_mean_lag1 | price2_lag1 | price2_lag2 | price2_expanding_mean_lag1 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2000-10-05 | 0.548814 | id_00 | 0.345011 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2000-10-06 | 0.715189 | id_00 | 0.445598 | 0.548814 | NaN | 0.548814 | 0.345011 | NaN | 0.345011 |
2 | 2000-10-07 | 0.602763 | id_00 | 0.165147 | 0.715189 | 0.548814 | 0.632001 | 0.445598 | 0.345011 | 0.395304 |
3 | 2000-10-08 | 0.544883 | id_00 | 0.041373 | 0.602763 | 0.715189 | 0.622255 | 0.165147 | 0.445598 | 0.318585 |
4 | 2000-10-09 | 0.423655 | id_00 | 0.391577 | 0.544883 | 0.602763 | 0.602912 | 0.041373 | 0.165147 | 0.249282 |
import polars as pl

# Same feature computation, but on a polars DataFrame and with two threads.
prices_pl = pl.from_pandas(prices)
feature_kwargs = dict(
    lags=[1, 2],
    lag_transforms={1: [expanding_mean]},
    num_threads=2,
)
transformed_pl = transform_exog(prices_pl, **feature_kwargs)
transformed_pl.head()
ds | price | unique_id | price2 | price_lag1 | price_lag2 | price_expanding_mean_lag1 | price2_lag1 | price2_lag2 | price2_expanding_mean_lag1 |
---|---|---|---|---|---|---|---|---|---|
datetime[ns] | f64 | str | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
2000-10-05 00:00:00 | 0.548814 | "id_00" | 0.345011 | NaN | NaN | NaN | NaN | NaN | NaN |
2000-10-06 00:00:00 | 0.715189 | "id_00" | 0.445598 | 0.548814 | NaN | 0.548814 | 0.345011 | NaN | 0.345011 |
2000-10-07 00:00:00 | 0.602763 | "id_00" | 0.165147 | 0.715189 | 0.548814 | 0.632001 | 0.445598 | 0.345011 | 0.395304 |
2000-10-08 00:00:00 | 0.544883 | "id_00" | 0.041373 | 0.602763 | 0.715189 | 0.622255 | 0.165147 | 0.445598 | 0.318585 |
2000-10-09 00:00:00 | 0.423655 | "id_00" | 0.391577 | 0.544883 | 0.602763 | 0.602912 | 0.041373 | 0.165147 | 0.249282 |