Speed up prediction and outlier detection.

Merged Danilo Enoque Ferreira de Lima requested to merge speedup into main
1 file  +41  −16
@@ -14,10 +14,11 @@ from sklearn.pipeline import Pipeline
 from sklearn.kernel_approximation import Nystroem
 #from sklearn.linear_model import ARDRegression
 from sklearn.linear_model import BayesianRidge
-from sklearn.covariance import EllipticEnvelope
+#from sklearn.covariance import EllipticEnvelope
 from scipy.stats import gaussian_kde
 from functools import reduce
 from itertools import product
+from time import time_ns
 from sklearn.base import clone, MetaEstimatorMixin
 from joblib import Parallel, delayed
@@ -637,7 +638,7 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
"""
if n_jobs is None:
n_jobs = self.n_jobs
y = Parallel(n_jobs=n_jobs)(
y = Parallel(n_jobs=n_jobs, backend="threading")(
delayed(e.predict)(X, return_std) for e in self.estimators_
#delayed(e.predict)(X) for e in self.estimators_
)
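A note on the backend switch: joblib's default process-based backend pickles X and each fitted estimator to worker processes on every call, which can dominate runtime when the per-estimator predict is fast. BayesianRidge.predict is mostly NumPy, which releases the GIL, so the "threading" backend runs the estimators concurrently in one process with no serialization. A minimal standalone sketch (data shapes and estimator count are assumptions, not taken from this MR):

import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import BayesianRidge

rng = np.random.default_rng(0)
X = rng.normal(size=(5000, 50))
# One independent regressor per output, as in MultiOutputWithStd.
estimators = [BayesianRidge().fit(X, X @ rng.normal(size=50)) for _ in range(4)]

# Thread-based workers share memory: no pickling of X or the estimators.
y = Parallel(n_jobs=4, backend="threading")(
    delayed(e.predict)(X, return_std=True) for e in estimators
)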
@@ -652,7 +653,7 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
"""
Detect outliers from uncorrelated whitened mean-centered inputs.
"""
def __init__(self, contamination: float=0.0000005):
def __init__(self, contamination: float=0.003):
super().__init__()
self.contamination = contamination
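For context on the new default (my reading, not stated in the MR): contamination is the expected fraction of outliers, and 0.003 is close to the two-sided Gaussian tail mass beyond three standard deviations, so the detector flags roughly the 3-sigma tails:

from scipy.stats import norm

# Two-sided tail mass outside +/- 3 sigma for a Gaussian: ~0.0027.
print(2.0 * norm.sf(3.0))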
@@ -661,20 +662,19 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Does nothing.

         Args:
-          X: Irrelevant.
+          X: Data where outliers are.
           y: Irrelevant.

         Returns: Itself.
         """
-        self.dist_ = self.score_samples(X)
-        self.offset_ = -np.percentile(-self.dist_, 100.0 * self.contamination)
+        self.bounds_ = np.quantile(X, (self.contamination/2.0, 0.5, 1.0 - self.contamination/2.0), axis=0)
         return self

     def decision_function(self, X: np.ndarray) -> np.ndarray:
         """
         Return the decision function.
         """
-        return self.offset_ - self.score_samples(X)
+        return self.score_samples(X) - 1.0

     def score_samples(self, X: np.ndarray) -> np.ndarray:
         """
@@ -685,7 +685,9 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Returns: The Mahalanobis distance.
         """
-        return np.sqrt(np.sum(X**2, axis=1))
+        med = self.bounds_[1, np.newaxis, ...]
+        sigma = (self.bounds_[2, np.newaxis, ...] - self.bounds_[0, np.newaxis, ...])/0.5
+        return np.fabs((X - med)/sigma)

     def predict(self, X):
         """
@@ -700,9 +702,10 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Returns -1 for anomalies/outliers and +1 for inliers.
         """
         values = self.decision_function(X)
+        is_lower = np.any(X < self.bounds_[0, np.newaxis, ...], axis=1)
+        is_upper = np.any(X > self.bounds_[2, np.newaxis, ...], axis=1)
         is_inlier = np.full(values.shape[0], -1, dtype=int)
-        is_inlier[values >= 0] = 1
+        is_inlier[is_lower | is_upper] = 1
         return is_inlier

     def score(self, X, y, sample_weight=None):
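The reworked fit no longer calls score_samples; it only stores per-feature quantiles in bounds_ (low tail, median, high tail), and predict flags samples whose value falls outside the tail quantiles in any feature. A standalone sketch of that logic on synthetic Gaussian data (input shapes are assumptions):

import numpy as np

contamination = 0.003
rng = np.random.default_rng(0)
X = rng.normal(size=(100000, 3))

# Per-feature (low tail, median, high tail) quantiles, as in fit().
bounds = np.quantile(X, (contamination/2.0, 0.5, 1.0 - contamination/2.0), axis=0)

# A sample is flagged if any feature exceeds its tail quantile, as in predict().
is_lower = np.any(X < bounds[0, np.newaxis, ...], axis=1)
is_upper = np.any(X > bounds[2, np.newaxis, ...], axis=1)
print((is_lower | is_upper).mean())  # ~contamination per feature; ~0.9% over 3 independent features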
@@ -781,10 +784,10 @@ class Model(TransformerMixin, BaseEstimator):
         ])
         #self.ood = {ch: IsolationForest(n_jobs=-1)
         #            for ch in channels+['full']}
-        #self.ood = {ch: UncorrelatedDeviation(contamination=0.003)
-        #            for ch in channels+['full']}
-        self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        self.ood = {ch: UncorrelatedDeviation(contamination=0.003)
                     for ch in channels+['full']}
+        #self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        #            for ch in channels+['full']}
         #self.fit_model = MultiOutputWithStd(ARDRegression(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         self.fit_model = MultiOutputWithStd(BayesianRidge(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         #self.fit_model = FitModel()
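Why this swap speeds things up (my understanding, not stated in the MR): EllipticEnvelope fits a robust Minimum Covariance Determinant estimate, which is costly and arguably redundant here since the inputs are already whitened and mean-centered, while UncorrelatedDeviation's fit reduces to a single np.quantile pass. A hypothetical micro-benchmark sketch, not part of this MR:

import numpy as np
from time import time_ns
from sklearn.covariance import EllipticEnvelope

rng = np.random.default_rng(0)
X = rng.normal(size=(10000, 20))

t0 = time_ns()
EllipticEnvelope(contamination=0.003).fit(X)  # robust covariance (MCD) fit
print(f"EllipticEnvelope.fit: {(time_ns() - t0)*1e-9:.3f} s")

t0 = time_ns()
np.quantile(X, (0.0015, 0.5, 0.9985), axis=0)  # all UncorrelatedDeviation.fit does
print(f"quantile-based fit:   {(time_ns() - t0)*1e-9:.3f} s")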
@@ -1080,23 +1083,45 @@ class Model(TransformerMixin, BaseEstimator):
         the expected prediction in key "expected", the stat. uncertainty in key "unc" and
         a (1, energy channel) array for the PCA syst. uncertainty in key "pca".
         """
+        #t = list()
+        #n = list()
+        #t += [time_ns()*1e-9]
+        #n += ["Initial"]
         low_res_pre = self.x_select.transform(low_res_data)
+        #t += [time_ns()*1e-9]
+        #n += ["Select"]
         low_pca = self.x_model.transform(low_res_pre)
-        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True, n_jobs=-1)
-        #high_pca = self.fit_model.predict(low_pca)
-        #high_pca_unc = 0
+        #t += [time_ns()*1e-9]
+        #n += ["PCA x"]
+        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True)
+        #t += [time_ns()*1e-9]
+        #n += ["Fit model"]
         n_trains = high_pca.shape[0]
         # Note: The whiten=True setting in the PCA model leads to an affine transformation
         pca_y = np.concatenate((high_pca,
                                 high_pca + high_pca_unc,
                                 ),
                                axis=0)
+        #t += [time_ns()*1e-9]
+        #n += ["Concat"]
         high_res_predicted = self.y_model["pca"].inverse_transform(pca_y)
         expected = high_res_predicted[:n_trains, :]
         e_plus = high_res_predicted[n_trains:(2*n_trains), :]
         unc = np.fabs(e_plus - expected)
         pca_unc = self.y_model['unc'].uncertainty()
         total_unc = np.sqrt(pca_unc**2 + unc**2)
+        #t += [time_ns()*1e-9]
+        #n += ["Unc"]
+        #t = np.diff(np.array(t))
+        #n = n[1:]
+        #print("Times")
+        #print(dict(zip(n, t)))
         return dict(expected=expected,
                     unc=unc,
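On the concatenation trick above: because inverse_transform of a whitened PCA is affine, a per-component std can be propagated by inverse-transforming mu and mu + sigma in one batched call and taking the absolute difference of the two halves. A standalone sketch of that idea (shapes and the 0.1 uncertainty are placeholders, not values from this MR):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
Y = rng.normal(size=(500, 40))
pca = PCA(n_components=10, whiten=True).fit(Y)

mu = pca.transform(Y[:8])       # stand-in for the predicted PCA components
sigma = np.full_like(mu, 0.1)   # stand-in for their stat. uncertainty

# One batched inverse_transform instead of two separate calls.
stacked = pca.inverse_transform(np.concatenate((mu, mu + sigma), axis=0))
expected = stacked[:8]
unc = np.fabs(stacked[8:] - expected)  # propagated per-output uncertainty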