2020-06-10 16:04:54 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
|
|
This module borrows and adapts `Pipeline` from `sklearn.pipeline` and
|
|
|
|
`TransformerMixin` from `sklearn.base` in the scikit-learn framework
|
|
|
|
(commit hash d205638475ca542dc46862652e3bb0be663a8eac, to be precise).
|
|
|
|
Both are BSD licensed and allow for this sort of thing; attribution
|
|
|
|
is given as a comment above each class.
|
|
|
|
"""
|
|
|
|
from collections import defaultdict
|
|
|
|
from itertools import islice
|
2022-01-24 04:07:52 +00:00
|
|
|
from typing import Any, Callable, Optional
|
|
|
|
from typing_extensions import Protocol
|
|
|
|
|
|
|
|
|
|
|
|
class TransformerProtocol(Protocol):
    """Structural type for objects that expose ``fit`` and ``transform``."""

    # ``fit`` accepts arbitrary arguments and is annotated as returning a
    # ``TransformerProtocol``, which is what allows chaining such as
    # ``obj.fit(...).transform(...)``.
    fit: Callable[..., "TransformerProtocol"]
    # ``transform`` is annotated as taking a ``TransformerProtocol`` plus one
    # value and returning an arbitrary value.
    transform: Callable[["TransformerProtocol", Any], Any]
|
2020-06-10 16:04:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
|
|
|
|
# License: BSD 3 clause
|
2022-01-24 04:07:52 +00:00
|
|
|
class TransformerMixin(TransformerProtocol):
    """Mixin that supplies a generic ``fit_transform`` to transformers."""

    def fit_transform(self, X: Any, y: Optional[Any] = None, **fit_params: Any) -> Any:
        """
        Fit to the data, then return the transformed data.

        Calls ``self.fit`` with ``X`` (and ``y`` when it is not ``None``)
        plus ``fit_params``, then calls ``transform(X)`` on whatever ``fit``
        returned.

        Parameters
        ----------
        X : Any
            Training data; passed to both ``fit`` and ``transform``.
        y : Any, default=None
            Target values; forwarded to ``fit`` only when not ``None``.
        **fit_params : dict
            Extra keyword arguments forwarded to ``fit``.

        Returns
        -------
        X_new : Any
            The result of ``transform(X)`` on the object returned by ``fit``.
        """
        # Generic, non-optimized fallback; subclasses may override it when a
        # single-pass implementation is available.
        # ``y`` is only passed positionally when supplied, so transformers
        # whose ``fit`` takes just ``X`` (unsupervised case) keep working.
        positional = (X,) if y is None else (X, y)
        fitted = self.fit(*positional, **fit_params)
        return fitted.transform(X)
|
|
|
|
|
|
|
|
|
|
|
|
# Author: Edouard Duchesnay
|
|
|
|
# Gael Varoquaux
|
|
|
|
# Virgile Fritsch
|
|
|
|
# Alexandre Gramfort
|
|
|
|
# Lars Buitinck
|
|
|
|
# License: BSD
|
2022-01-24 04:07:52 +00:00
|
|
|
class Pipeline:
    """Sequence of named transformers followed by a final estimator.

    Parameters
    ----------
    steps : list of (name, estimator) tuples
        Every step but the last must expose ``transform`` and either ``fit``
        or ``fit_transform``; the last must expose ``fit``. Any step may
        instead be ``None`` or the string ``"passthrough"``, in which case it
        is skipped.
    verbose : bool, default=False
        Stored on the instance and consulted by ``_log_message``; nothing in
        this class prints on its own.

    Raises
    ------
    ValueError
        If ``steps`` is empty.
    TypeError
        If a step does not satisfy the requirements above.
    """

    def __init__(self, steps, verbose=False):
        self.steps = steps
        self.verbose = verbose
        self._validate_steps()

    def _validate_steps(self):
        """Check that every step exposes the methods the pipeline will call."""
        steps = list(self.steps)
        if not steps:
            # Unpacking ``zip(*[])`` would raise an opaque "not enough values
            # to unpack" ValueError; keep the type but say what is wrong.
            raise ValueError("Pipeline requires at least one step")
        _, estimators = zip(*steps)

        # validate estimators
        transformers = estimators[:-1]
        estimator = estimators[-1]

        for t in transformers:
            if t is None or t == "passthrough":
                continue
            if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr(
                t, "transform"
            ):
                raise TypeError(
                    "All intermediate steps should be "
                    "transformers and implement fit and transform "
                    "or be the string 'passthrough' "
                    "'%s' (type %s) doesn't" % (t, type(t))
                )

        # We allow last estimator to be None as an identity transformation
        if (
            estimator is not None
            and estimator != "passthrough"
            and not hasattr(estimator, "fit")
        ):
            raise TypeError(
                "Last step of Pipeline should implement fit "
                "or be the string 'passthrough'. "
                "'%s' (type %s) doesn't" % (estimator, type(estimator))
            )

    def _iter(self, with_final=True, filter_passthrough=True):
        """
        Generate (idx, name, trans) tuples from self.steps

        When ``with_final`` is False the last step is left out. When
        ``filter_passthrough`` is True, 'passthrough' and None transformers
        are filtered out.
        """
        stop = len(self.steps)
        if not with_final:
            stop -= 1

        for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)):
            if not filter_passthrough:
                yield idx, name, trans
            elif trans is not None and trans != "passthrough":
                yield idx, name, trans

    def __len__(self) -> int:
        """
        Returns the length of the Pipeline
        """
        return len(self.steps)

    def __getitem__(self, ind):
        """Returns a sub-pipeline or a single estimator in the pipeline

        Indexing with an integer will return an estimator; using a slice
        returns another Pipeline instance which copies a slice of this
        Pipeline and keeps its ``verbose`` setting. This copy is shallow:
        modifying (or fitting) estimators in the sub-pipeline will affect the
        larger pipeline and vice-versa. However, replacing a value in
        ``steps`` will not affect a copy. Any other index type that
        ``self.steps`` rejects with ``TypeError`` (e.g. a string) is looked
        up by step name.

        Raises
        ------
        ValueError
            If a slice with a step other than 1 is used, or if the slice
            selects no steps.
        """
        if isinstance(ind, slice):
            if ind.step not in (1, None):
                raise ValueError("Pipeline slicing only supports a step of 1")
            # Carry ``verbose`` over; otherwise the sub-pipeline would
            # silently fall back to the constructor default.
            return self.__class__(self.steps[ind], verbose=self.verbose)
        try:
            _, est = self.steps[ind]
        except TypeError:
            # Not an int, try get step by name
            return self.named_steps[ind]
        return est

    @property
    def _estimator_type(self):
        """``_estimator_type`` attribute of the last step."""
        return self.steps[-1][1]._estimator_type

    @property
    def named_steps(self):
        """Steps as a ``{name: estimator}`` dict, rebuilt on each access."""
        return dict(self.steps)

    @property
    def _final_estimator(self):
        """Last step's estimator, with ``None`` normalized to 'passthrough'."""
        estimator = self.steps[-1][1]
        return "passthrough" if estimator is None else estimator

    def _log_message(self, step_idx):
        """Return a progress string for ``step_idx``, or None if not verbose."""
        if not self.verbose:
            return None
        name, _ = self.steps[step_idx]

        return "(step %d of %d) Processing %s" % (step_idx + 1, len(self.steps), name)

    # Estimator interface

    def _fit(self, X, y=None, **fit_params):
        """Fit every step except the last and push ``X`` through them.

        ``fit_params`` keys must have the form ``stepname__parameter``; they
        are split on the first ``__`` and routed to the named step.

        Returns
        -------
        (Xt, final_fit_params) : tuple
            The data after all non-final steps, and the fit parameters
            addressed to the final step (``{}`` if it is a passthrough).

        Raises
        ------
        ValueError
            If a fit parameter name contains no ``__``.
        """
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()

        # ``None`` steps get no entry, so parameters addressed to them (or to
        # an unknown step name) raise KeyError below.
        fit_params_steps = {name: {} for name, step in self.steps if step is not None}
        for pname, pval in fit_params.items():
            if "__" not in pname:
                raise ValueError(
                    "Pipeline.fit does not accept the {} parameter. "
                    "You can pass parameters to specific steps of your "
                    "pipeline using the stepname__parameter format, e.g. "
                    "`Pipeline.fit(X, y, logisticregression__sample_weight"
                    "=sample_weight)`.".format(pname)
                )
            step, param = pname.split("__", 1)
            fit_params_steps[step][param] = pval
        for (step_idx, name, transformer) in self._iter(
            with_final=False, filter_passthrough=False
        ):
            if transformer is None or transformer == "passthrough":
                continue

            # Fit the current transformer; its output feeds the next step.
            X, fitted_transformer = _fit_transform_one(
                transformer, X, y, None, **fit_params_steps[name]
            )
            # Store whatever transformer object the helper handed back.
            self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator == "passthrough":
            return X, {}
        return X, fit_params_steps[self.steps[-1][0]]

    def fit(self, X, y=None, **fit_params):
        """Fit the model

        Fit all the transforms one after the other and transform the
        data, then fit the transformed data using the final estimator.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        self : Pipeline
            This estimator
        """
        Xt, fit_params = self._fit(X, y, **fit_params)
        if self._final_estimator != "passthrough":
            self._final_estimator.fit(Xt, y, **fit_params)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit the model and transform with the final estimator

        Fits all the transforms one after the other and transforms the
        data, then uses fit_transform on transformed data with the final
        estimator (falling back to ``fit(...).transform(...)`` when the final
        estimator has no ``fit_transform``).

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        Xt : array-like of shape (n_samples, n_transformed_features)
            Transformed samples
        """
        last_step = self._final_estimator
        Xt, fit_params = self._fit(X, y, **fit_params)
        if last_step == "passthrough":
            return Xt
        if hasattr(last_step, "fit_transform"):
            return last_step.fit_transform(Xt, y, **fit_params)
        else:
            return last_step.fit(Xt, y, **fit_params).transform(Xt)

    @property
    def transform(self):
        """Apply transforms, and transform with the final estimator

        This also works where final estimator is ``None``: all prior
        transformations are applied.

        This is a property returning a callable, so accessing it raises
        ``AttributeError`` when the final estimator has no ``transform``.

        Parameters
        ----------
        X : iterable
            Data to transform. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        Xt : array-like of shape (n_samples, n_transformed_features)
        """
        # _final_estimator is None or has transform, otherwise attribute error
        if self._final_estimator != "passthrough":
            # Bare attribute access on purpose: it only exists to raise.
            self._final_estimator.transform
        return self._transform

    def _transform(self, X):
        """Run ``X`` through ``transform`` of every non-passthrough step."""
        Xt = X
        for _, _, transform in self._iter():
            Xt = transform.transform(Xt)
        return Xt

    @property
    def classes_(self):
        """``classes_`` attribute of the last step."""
        return self.steps[-1][-1].classes_

    @property
    def _pairwise(self):
        # check if first estimator expects pairwise input
        return getattr(self.steps[0][1], "_pairwise", False)

    @property
    def n_features_in_(self):
        # delegate to the first step's attribute
        return self.steps[0][1].n_features_in_
|
|
|
|
|
|
|
|
|
|
|
|
def _name_estimators(estimators):
|
|
|
|
"""Generate names for estimators."""
|
|
|
|
|
|
|
|
names = [
|
2022-01-24 04:07:52 +00:00
|
|
|
estimator if isinstance(estimator, str) else type(estimator).__name__.lower()
|
2020-06-10 16:04:54 +00:00
|
|
|
for estimator in estimators
|
|
|
|
]
|
|
|
|
namecount = defaultdict(int)
|
|
|
|
for est, name in zip(estimators, names):
|
|
|
|
namecount[name] += 1
|
|
|
|
|
|
|
|
for k, v in list(namecount.items()):
|
|
|
|
if v == 1:
|
|
|
|
del namecount[k]
|
|
|
|
|
|
|
|
for i in reversed(range(len(estimators))):
|
|
|
|
name = names[i]
|
|
|
|
if name in namecount:
|
|
|
|
names[i] += "-%d" % namecount[name]
|
|
|
|
namecount[name] -= 1
|
|
|
|
|
|
|
|
return list(zip(names, estimators))
|
|
|
|
|
|
|
|
|
2022-01-24 04:07:52 +00:00
|
|
|
def make_pipeline(*steps, **kwargs) -> Pipeline:
    """Build a Pipeline from estimators without naming them by hand.

    Shorthand for the Pipeline constructor: step names are generated by
    ``_name_estimators`` instead of being supplied by the caller.

    Parameters
    ----------
    *steps : estimators
        The estimators to chain, in order.

    verbose : bool, default=False
        Keyword-only; forwarded unchanged to the ``Pipeline`` constructor.

    Returns
    -------
    p : Pipeline

    Raises
    ------
    TypeError
        If any keyword argument other than ``verbose`` is given; the message
        names the first such keyword.
    """
    verbose_flag = kwargs.pop("verbose", False)
    if not kwargs:
        return Pipeline(_name_estimators(steps), verbose=verbose_flag)

    # Anything left after removing ``verbose`` is unsupported.
    first_unexpected = next(iter(kwargs))
    raise TypeError('Unknown keyword arguments: "{}"'.format(first_unexpected))
|
|
|
|
|
|
|
|
|
|
|
|
def _transform_one(transformer, X, y, weight, **fit_params):
|
|
|
|
res = transformer.transform(X)
|
|
|
|
# if we have a weight for this transformer, multiply output
|
|
|
|
if weight is None:
|
|
|
|
return res
|
|
|
|
return res * weight
|
|
|
|
|
|
|
|
|
2022-01-24 04:07:52 +00:00
|
|
|
def _fit_transform_one(transformer, X, y, weight, **fit_params):
|
2020-06-10 16:04:54 +00:00
|
|
|
"""
|
|
|
|
Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned
|
|
|
|
with the fitted transformer. If ``weight`` is not ``None``, the result will
|
|
|
|
be multiplied by ``weight``.
|
|
|
|
"""
|
2022-01-24 04:07:52 +00:00
|
|
|
if hasattr(transformer, "fit_transform"):
|
2020-06-10 16:04:54 +00:00
|
|
|
res = transformer.fit_transform(X, y, **fit_params)
|
|
|
|
else:
|
|
|
|
res = transformer.fit(X, y, **fit_params).transform(X)
|
|
|
|
|
|
|
|
if weight is None:
|
|
|
|
return res, transformer
|
|
|
|
return res * weight, transformer
|