Skip to content

Commit 90b9145

Browse files
authored
Optimized the logic applying univariate model to DataFrame (#67)
* optimized applying univariate model to DF * updated version number and changelogs * minor optimization * Fixed a bug that model trained with Series cannot be applied to DataFrame due to name matching error * modified docstrings * updated version number * updated changelog
1 parent 9e9b86b commit 90b9145

File tree

11 files changed

+164
-167
lines changed

11 files changed

+164
-167
lines changed

docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@
6666
# The short X.Y version.
6767
version = "0.5"
6868
# The full version, including alpha/beta/rc tags.
69-
release = "0.5.3"
69+
release = "0.5.4"
7070

7171
# The language for content autogenerated by Sphinx. Refer to documentation
7272
# for a list of supported languages.

docs/releasehistory.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,13 @@
22
Release History
33
***************
44

5+
Version 0.5.4 (Feb 18, 2020)
6+
===================================
7+
- Optimized the workflow of how a univariate model is applied to pandas DataFrame
8+
- Added more informative error messages
9+
- Fixed some bugs resulting in model-column matching error due to inconsistency between output Series names and DataFrame columns
10+
- Clarified the workflow in the documentation
11+
512
Version 0.5.3 (Feb 12, 2020)
613
===================================
714
- Quick hotfix to avoid errors caused by statsmodels v0.11 by requiring statsmodels dependency <0.11

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = adtk
3-
version = 0.5.3
3+
version = 0.5.4
44
author = Arundo Analytics, Inc.
55
maintainer = Tailai Wen
66
maintainer_email = [email protected]

src/adtk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@
2020
2121
"""
2222

23-
__version__ = "0.5.3"
23+
__version__ = "0.5.4"

src/adtk/_base.py

Lines changed: 65 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ class _Model(ABC):
1111
def __init__(self, **kwargs):
1212
for key, value in kwargs.items():
1313
setattr(self, key, value)
14-
self._fitted = False
14+
self._fitted = (
15+
0
16+
) # 0 for not fitted, 1 for fitted, 2 for univariate model fitted by DF
1517

1618
@abstractmethod
1719
def _fit(self, ts):
@@ -91,37 +93,78 @@ def _fit(self, ts):
9193
s = ts.copy()
9294
self._fit_core(s)
9395
self._models = None
96+
self._fitted = 1
9497
elif isinstance(ts, pd.DataFrame):
9598
df = ts.copy()
99+
if df.columns.duplicated().any():
100+
raise ValueError(
101+
"Input DataFrame must have unique column names."
102+
)
96103
if self._need_fit:
97104
self._update_models(df.columns)
98105
# fit model for each column
99106
for col in df.columns:
100107
self._models[col].fit(df[col])
108+
self._fitted = 2
109+
else:
110+
pass
101111
else:
102112
raise TypeError("Input must be a pandas Series or DataFrame.")
103-
self._fitted = True
104113

105114
def _predict(self, ts):
106-
if self._need_fit and (not self._fitted):
115+
if self._need_fit and (self._fitted == 0):
107116
raise RuntimeError("The model must be trained first.")
108117
if isinstance(ts, pd.Series):
118+
if self._need_fit and (
119+
self._fitted == 2
120+
): # fitted by DF, to be applied to Series
121+
raise RuntimeError(
122+
"The model was trained by a pandas DataFrame object, "
123+
"it can only be applied to a pandas DataFrame object."
124+
)
109125
s = ts.copy()
110126
predicted = self._predict_core(s)
111127
# if a Series-to-Series operation, make sure Series name keeps
112128
if isinstance(predicted, pd.Series):
113129
predicted.name = ts.name
114130
elif isinstance(ts, pd.DataFrame):
115131
df = ts.copy()
116-
# if the model doesn't need fit, initialize or reset a model for
117-
# each column
118-
if not self._need_fit:
119-
self._update_models(df.columns)
120-
# predict for each column
121-
predicted = pd.concat(
122-
[self._models[col]._predict(df[col]) for col in df.columns],
123-
axis=1,
124-
)
132+
if df.columns.duplicated().any():
133+
raise ValueError(
134+
"Input DataFrame must have unique column names."
135+
)
136+
if (not self._need_fit) or (self._fitted == 1):
137+
# apply the model to each column
138+
predicted = []
139+
for col in df.columns:
140+
predicted_this_col = self._predict(df[col])
141+
if isinstance(predicted_this_col, pd.DataFrame):
142+
predicted_this_col = predicted_this_col.rename(
143+
columns={
144+
col1: "{}_{}".format(col, col1)
145+
for col1 in predicted_this_col.columns
146+
}
147+
)
148+
predicted.append(predicted_this_col)
149+
predicted = pd.concat(predicted, axis=1)
150+
else:
151+
# predict for each column
152+
if not (set(self._models.keys()) >= set(df.columns)):
153+
raise ValueError(
154+
"The model was trained by a pandas DataFrame with "
155+
"columns {}, but the input DataFrame contains columns "
156+
"{} which are unknown to the model.".format(
157+
list(set(self._models.keys())),
158+
list(set(df.columns) - set(self._models.keys())),
159+
)
160+
)
161+
predicted = pd.concat(
162+
[
163+
self._models[col]._predict(df[col])
164+
for col in df.columns
165+
],
166+
axis=1,
167+
)
125168
else:
126169
raise TypeError("Input must be a pandas Series or DataFrame.")
127170
# make sure index freq is the same (because pandas has a bug that some
@@ -153,16 +196,24 @@ def fit_predict(self, ts):
153196
class _ModelHD(_Model):
154197
def _fit(self, df):
155198
if isinstance(df, pd.DataFrame):
199+
if df.columns.duplicated().any():
200+
raise ValueError(
201+
"Input DataFrame must have unique column names."
202+
)
156203
df_copy = df.copy()
157204
self._fit_core(df_copy)
158205
else:
159206
raise TypeError("Input must be a pandas DataFrame.")
160-
self._fitted = True
207+
self._fitted = 1
161208

162209
def _predict(self, df):
163-
if self._need_fit and (not self._fitted):
210+
if self._need_fit and (self._fitted == 0):
164211
raise RuntimeError("The model must be trained first.")
165212
if isinstance(df, pd.DataFrame):
213+
if df.columns.duplicated().any():
214+
raise ValueError(
215+
"Input DataFrame must have unique column names."
216+
)
166217
df_copy = df.copy()
167218
predicted = self._predict_core(df_copy)
168219
else:

src/adtk/_detector_base.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,14 @@ def detect(self, ts, return_list=False):
2525
Parameters
2626
----------
2727
ts: pandas.Series or pandas.DataFrame
28-
Time series to detect anomalies from.
29-
If a DataFrame with k columns, k univariate detectors will be
30-
applied to them independently.
28+
Time series to detect anomalies from. If a DataFrame with k
29+
columns, it is treated as k independent univariate time series.
30+
31+
- If the detector was trained with a Series, the detector will be
32+
applied to each univariate series independently;
33+
- If the detector was trained with a DataFrame, i.e. the detector
34+
is essentially k detectors, those detectors will be applied to
35+
each univariate series respectively.
3136
3237
return_list: bool, optional
3338
Whether to return a list of anomalous time stamps, or a binary
@@ -66,8 +71,9 @@ def fit_detect(self, ts, return_list=False):
6671
----------
6772
ts: pandas.Series or pandas.DataFrame
6873
Time series to be used for training and be detected for anomalies.
69-
If a DataFrame with k columns, k univariate detectors will be
70-
trained and applied to them independently.
74+
If a DataFrame with k columns, it is treated as k independent
75+
univariate time series, and k univariate detectors will be trained
76+
and applied to each series independently.
7177
7278
return_list: bool, optional
7379
Whether to return a list of anomalous time stamps, or a binary
@@ -109,8 +115,9 @@ def score(self, ts, anomaly_true, scoring="recall", **kwargs):
109115
----------
110116
ts: pandas.Series or pandas.DataFrame
111117
Time series to detect anomalies from.
112-
If a DataFrame with k columns, k univariate detectors will be
113-
applied to them independently.
118+
If a DataFrame with k columns, it is treated as k independent
119+
univariate time series, and k univariate detectors will be applied
120+
to each series independently.
114121
115122
anomaly_true: pandas.Series, pandas.DataFrame, list, or dict
116123
True anomalies.

src/adtk/_transformer_base.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,14 @@ def transform(self, ts):
2121
Parameters
2222
----------
2323
ts: pandas.Series or pandas.DataFrame
24-
Time series to be transformed.
25-
If a DataFrame with k columns, k univariate transformers will be
26-
applied to them independently.
24+
Time series to be transformed. If a DataFrame with k columns, it is
25+
treated as k independent univariate time series.
26+
27+
- If the transformer was trained with a Series, the transformer
28+
will be applied to each univariate series independently;
29+
- If the transformer was trained with a DataFrame, i.e. the
30+
transformer is essentially k transformers, those transformers
31+
will be applied to each univariate series respectively.
2732
2833
Returns
2934
-------
@@ -41,8 +46,9 @@ def fit_transform(self, ts):
4146
----------
4247
ts: pandas.Series or pandas.DataFrame
4348
Time series to be used for training and be transformed.
44-
If a DataFrame with k columns, k univariate transformers will be
45-
applied to them independently.
49+
If a DataFrame with k columns, it is treated as k independent
50+
univariate time series, and k univariate transformers will be
51+
trained and applied to each series independently.
4652
4753
Returns
4854
-------

src/adtk/detector/detector_1d.py

Lines changed: 1 addition & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,6 @@
3939
class CustomizedDetector1D(_Detector1D):
4040
"""Detector derived from a user-given function and parameters.
4141
42-
This is an univariate detector. When it is applied to a multivariate time
43-
series (i.e. pandas DataFrame), it will be applied to every series
44-
independently. All parameters can be defined as a dict object where key-
45-
value pairs are series names (i.e. column names of DataFrame) and the
46-
model parameter for that series. If not, then the same parameter will be
47-
applied to all series.
48-
4942
Parameters
5043
----------
5144
detect_func: function
@@ -133,13 +126,6 @@ class ThresholdAD(_Detector1D):
133126
This detector compares time series values with user-given thresholds, and
134127
identifies time points as anomalous when values are beyond the thresholds.
135128
136-
This is an univariate detector. When it is applied to a multivariate time
137-
series (i.e. pandas DataFrame), it will be applied to every series
138-
independently. All parameters can be defined as a dict object where key-
139-
value pairs are series names (i.e. column names of DataFrame) and the
140-
model parameter for that series. If not, then the same parameter will be
141-
applied to all series.
142-
143129
Parameters
144130
----------
145131
low: float, optional
@@ -178,13 +164,6 @@ class QuantileAD(_Detector1D):
178164
of historical data, and identifies time points as anomalous when values
179165
are beyond the thresholds.
180166
181-
This is an univariate detector. When it is applied to a multivariate time
182-
series (i.e. pandas DataFrame), it will be applied to every series
183-
independently. All parameters can be defined as a dict object where key-
184-
value pairs are series names (i.e. column names of DataFrame) and the
185-
model parameter for that series. If not, then the same parameter will be
186-
applied to all series.
187-
188167
Parameters
189168
----------
190169
low: float, optional
@@ -239,13 +218,6 @@ class InterQuartileRangeAD(_Detector1D):
239218
historical data, and identifies time points as anomalous when differences
240219
are beyond the inter-quartile range times a user-given factor c.
241220
242-
This is an univariate detector. When it is applied to a multivariate time
243-
series (i.e. pandas DataFrame), it will be applied to every series
244-
independently. All parameters can be defined as a dict object where key-
245-
value pairs are series names (i.e. column names of DataFrame) and the
246-
model parameter for that series. If not, then the same parameter will be
247-
applied to all series.
248-
249221
Parameters
250222
----------
251223
c: float, or 2-tuple (float, float), optional
@@ -317,13 +289,6 @@ class GeneralizedESDTestAD(_Detector1D):
317289
follow an approximately normal distribution. Please only use this detector
318290
when this assumption holds.
319291
320-
This is an univariate detector. When it is applied to a multivariate time
321-
series (i.e. pandas DataFrame), it will be applied to every series
322-
independently. All parameters can be defined as a dict object where key-
323-
value pairs are series names (i.e. column names of DataFrame) and the
324-
model parameter for that series. If not, then the same parameter will be
325-
applied to all series.
326-
327292
[1] Rosner, Bernard (May 1983), Percentage Points for a Generalized ESD
328293
Many-Outlier Procedure,Technometrics, 25(2), pp. 165-172.
329294
@@ -412,13 +377,6 @@ class PersistAD(_Detector1D):
412377
This detector is internally implemented as a `Pipenet` object. Advanced
413378
users may learn more details by checking attribute `pipe_`.
414379
415-
This is an univariate detector. When it is applied to a multivariate time
416-
series (i.e. pandas DataFrame), it will be applied to every series
417-
independently. All parameters can be defined as a dict object where key-
418-
value pairs are series names (i.e. column names of DataFrame) and the
419-
model parameter for that series. If not, then the same parameter will be
420-
applied to all series.
421-
422380
Parameters
423381
----------
424382
window: int, optional
@@ -575,13 +533,6 @@ class LevelShiftAD(_Detector1D):
575533
This detector is internally implemented as a `Pipenet` object. Advanced
576534
users may learn more details by checking attribute `pipe_`.
577535
578-
This is an univariate detector. When it is applied to a multivariate time
579-
series (i.e. pandas DataFrame), it will be applied to every series
580-
independently. All parameters can be defined as a dict object where key-
581-
value pairs are series names (i.e. column names of DataFrame) and the
582-
model parameter for that series. If not, then the same parameter will be
583-
applied to all series.
584-
585536
Parameters
586537
----------
587538
window: int, optional
@@ -723,13 +674,6 @@ class VolatilityShiftAD(_Detector1D):
723674
This detector is internally implemented as a `Pipenet` object. Advanced
724675
users may learn more details by checking attribute `pipe_`.
725676
726-
This is an univariate detector. When it is applied to a multivariate time
727-
series (i.e. pandas DataFrame), it will be applied to every series
728-
independently. All parameters can be defined as a dict object where key-
729-
value pairs are series names (i.e. column names of DataFrame) and the
730-
model parameter for that series. If not, then the same parameter will be
731-
applied to all series.
732-
733677
Parameters
734678
----------
735679
window: int, optional
@@ -886,13 +830,6 @@ class AutoregressionAD(_Detector1D):
886830
This detector is internally implemented as a `Pipenet` object. Advanced
887831
users may learn more details by checking attribute `pipe_`.
888832
889-
This is an univariate detector. When it is applied to a multivariate time
890-
series (i.e. pandas DataFrame), it will be applied to every series
891-
independently. All parameters can be defined as a dict object where key-
892-
value pairs are series names (i.e. column names of DataFrame) and the
893-
model parameter for that series. If not, then the same parameter will be
894-
applied to all series.
895-
896833
Parameters
897834
----------
898835
n_steps: int, optional
@@ -1042,13 +979,6 @@ class SeasonalAD(_Detector1D):
1042979
This detector is internally implemented as a `Pipenet` object. Advanced
1043980
users may learn more details by checking attribute `pipe_`.
1044981
1045-
This is an univariate detector. When it is applied to a multivariate time
1046-
series (i.e. pandas DataFrame), it will be applied to every series
1047-
independently. All parameters can be defined as a dict object where key-
1048-
value pairs are series names (i.e. column names of DataFrame) and the
1049-
model parameter for that series. If not, then the same parameter will be
1050-
applied to all series.
1051-
1052982
Parameters
1053983
----------
1054984
freq: int, optional
@@ -1084,12 +1014,7 @@ class SeasonalAD(_Detector1D):
10841014
10851015
"""
10861016

1087-
_default_params = {
1088-
"freq": None,
1089-
"side": "both",
1090-
"c": 3.0,
1091-
"trend": False,
1092-
}
1017+
_default_params = {"freq": None, "side": "both", "c": 3.0, "trend": False}
10931018

10941019
def __init__(
10951020
self,

0 commit comments

Comments
 (0)