
Speed up prediction and outlier detection.

Merged Danilo Enoque Ferreira de Lima requested to merge speedup into main
1 file changed
@@ -14,10 +14,11 @@ from sklearn.pipeline import Pipeline
 from sklearn.kernel_approximation import Nystroem
 #from sklearn.linear_model import ARDRegression
 from sklearn.linear_model import BayesianRidge
-from sklearn.covariance import EllipticEnvelope
+#from sklearn.covariance import EllipticEnvelope
 from scipy.stats import gaussian_kde
 from functools import reduce
 from itertools import product
+from time import time_ns
 from sklearn.base import clone, MetaEstimatorMixin
 from joblib import Parallel, delayed
@@ -637,7 +638,7 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
"""
if n_jobs is None:
n_jobs = self.n_jobs
y = Parallel(n_jobs=n_jobs)(
y = Parallel(n_jobs=n_jobs, backend="threading")(
delayed(e.predict)(X, return_std) for e in self.estimators_
#delayed(e.predict)(X) for e in self.estimators_
)
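Why switching `Parallel` to `backend="threading"` speeds this up: the default process-based backend spawns workers and pickles `X` once per estimator, which can cost more than a cheap linear predict itself, while threads share memory and `BayesianRidge.predict` spends its time in GIL-releasing NumPy routines. A minimal sketch of the same fan-out pattern; the data and estimator list are made up for illustration:

```python
import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import BayesianRidge

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20))
Y = rng.normal(size=(1000, 3))   # three independent output dimensions

estimators = [BayesianRidge().fit(X, Y[:, i]) for i in range(Y.shape[1])]

# One thread per estimator; X is shared rather than pickled per worker, and
# predict() spends its time in GIL-releasing NumPy calls, so threads overlap.
results = Parallel(n_jobs=3, backend="threading")(
    delayed(e.predict)(X, return_std=True) for e in estimators
)
mean = np.stack([m for m, _ in results], axis=1)  # shape (1000, 3)
std = np.stack([s for _, s in results], axis=1)   # shape (1000, 3)
```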
@@ -650,31 +651,40 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
 class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
     """
-    Detect outliers from uncorrelated whitened mean-centered inputs.
+    Detect outliers from uncorrelated inputs.
+    It uses a chi^2 sum over the features to flatten them into a single statistic.
+    The standard deviation is estimated using quantiles.

     Args:
+        sigma: Number of standard deviations of the chi^2 distribution.
     """
-    def __init__(self, contamination: float=0.0000005):
+    def __init__(self, sigma: float=5.0):
         super().__init__()
-        self.contamination = contamination
+        self.sigma = sigma

     def fit(self, X, y=None) -> OutlierMixin:
         """
-        Does nothing.
+        Estimate the median and a quantile-based standard deviation per feature.

         Args:
-          X: Irrelevant.
+          X: Data from which the quantile statistics are estimated.
           y: Irrelevant.

         Returns: Itself.
         """
-        self.dist_ = self.score_samples(X)
-        self.offset_ = -np.percentile(-self.dist_, 100.0 * self.contamination)
+        bounds_ = np.quantile(X, (0.003/2.0, 0.5, 1.0 - 0.003/2.0), axis=0)
+        self.ndof_ = float(X.shape[1] - 1.0)
+        self.med_ = bounds_[1, np.newaxis, ...]
+        self.sigma_ = (bounds_[2, np.newaxis, ...] - bounds_[0, np.newaxis, ...])/3.0
         return self

     def decision_function(self, X: np.ndarray) -> np.ndarray:
         """
         Return the decision function.
+        This is chi^2/ndof - 1 - sigma*sqrt(2/ndof), where 2/ndof is the
+        variance of chi^2/ndof.
         """
-        return self.offset_ - self.score_samples(X)
+        return (self.score_samples(X) - 1.0 - self.sigma*np.sqrt(2.0/self.ndof_))

     def score_samples(self, X: np.ndarray) -> np.ndarray:
         """
@@ -683,9 +693,9 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Args:
           X: The new input data.

-        Returns: The Mahalanobis distance.
+        Returns: The chi^2 test statistic.
         """
-        return np.sqrt(np.sum(X**2, axis=1))
+        return np.sum(((X - self.med_)/self.sigma_)**2, axis=1)/float(self.ndof_)

     def predict(self, X):
         """
@@ -700,9 +710,8 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Returns -1 for anomalies/outliers and +1 for inliers.
         """
         values = self.decision_function(X)
-        is_inlier = np.full(values.shape[0], -1, dtype=int)
-        is_inlier[values >= 0] = 1
+        is_inlier = np.full(values.shape[0], 1, dtype=int)
+        is_inlier[values > 0] = -1
         return is_inlier

     def score(self, X, y, sample_weight=None):
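Taken together, the reworked `UncorrelatedDeviation` is a closed-form test: `fit()` stores a per-feature median and a quantile-derived scale, `score_samples()` returns chi^2/ndof of the standardized features, and a sample is flagged when that statistic exceeds its expectation of 1 by more than sigma times its standard deviation sqrt(2/ndof). A self-contained sketch of the same logic, with illustrative names rather than the project's API (one difference from the hunk is flagged in the comments):

```python
import numpy as np

def fit_stats(X):
    # Median and a robust scale from the central 99.7% quantile range.
    # For a Gaussian, q(0.9985) - q(0.0015) spans about 5.9 sigma, so /6
    # recovers sigma; the hunk above divides the same range by 3, i.e. it
    # uses a scale roughly twice as large, which makes the test more tolerant.
    lo, med, hi = np.quantile(X, (0.003/2.0, 0.5, 1.0 - 0.003/2.0), axis=0)
    return med, (hi - lo) / 6.0

def is_outlier(X, med, scale, ndof, sigma=5.0):
    # chi^2/ndof of the standardized features: mean 1 and variance 2/ndof
    # for independent Gaussian features, hence the sigma*sqrt(2/ndof) margin.
    chi2_ndof = np.sum(((X - med) / scale)**2, axis=1) / ndof
    return chi2_ndof > 1.0 + sigma * np.sqrt(2.0 / ndof)

rng = np.random.default_rng(1)
X = rng.normal(size=(500, 50))
X[:5] += 10.0                      # plant five obvious outliers
med, scale = fit_stats(X)
print(np.flatnonzero(is_outlier(X, med, scale, ndof=X.shape[1] - 1.0)))
# -> [0 1 2 3 4]
```

Unlike `EllipticEnvelope`, nothing here estimates a covariance matrix, which is what makes both `fit` and `predict` cheap.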
@@ -781,10 +790,10 @@ class Model(TransformerMixin, BaseEstimator):
             ])
         #self.ood = {ch: IsolationForest(n_jobs=-1)
         #            for ch in channels+['full']}
-        #self.ood = {ch: UncorrelatedDeviation(contamination=0.003)
-        #            for ch in channels+['full']}
-        self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        self.ood = {ch: UncorrelatedDeviation(sigma=5)
                     for ch in channels+['full']}
+        #self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        #            for ch in channels+['full']}
         #self.fit_model = MultiOutputWithStd(ARDRegression(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         self.fit_model = MultiOutputWithStd(BayesianRidge(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         #self.fit_model = FitModel()
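For context, the per-channel detector dictionary above would be trained and queried roughly as follows; the channel names and data are stand-ins, and `UncorrelatedDeviation` is assumed to be in scope from this module:

```python
import numpy as np

channels = ["channel_1", "channel_2", "full"]
rng = np.random.default_rng(2)
data = {ch: rng.normal(size=(200, 10)) for ch in channels}

# One independent detector per channel, mirroring the dict comprehension above.
ood = {ch: UncorrelatedDeviation(sigma=5) for ch in channels}
for ch in channels:
    ood[ch].fit(data[ch])

# predict() keeps the scikit-learn convention: +1 for inliers, -1 for outliers.
flags = {ch: ood[ch].predict(data[ch]) for ch in channels}
```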
@@ -928,13 +937,13 @@ class Model(TransformerMixin, BaseEstimator):
         E = np.fft.fftfreq(len(e), de)
         e_axis = np.linspace(-0.5*(e[-1] - e[0]), 0.5*(e[-1] - e[0]), len(e))
         # generate a gaussian
-        gaussian = np.exp(-0.5*(e_axis)**2/self.high_res_sigma**2)
-        gaussian /= np.sum(gaussian, axis=0, keepdims=True)
-        gaussian = np.clip(gaussian, a_min=1e-6, a_max=None)
-        gaussian_ft = np.fft.fft(gaussian)
+        #gaussian = np.exp(-0.5*(e_axis)**2/self.high_res_sigma**2)
+        #gaussian /= np.sum(gaussian, axis=0, keepdims=True)
+        #gaussian = np.clip(gaussian, a_min=1e-6, a_max=None)
+        #gaussian_ft = np.fft.fft(gaussian)
         H = np.mean(Z/D, axis=0)
-        N = np.absolute(gaussian_ft*V)**2
+        N = np.absolute(V)**2
         S = np.mean(np.absolute(D)**2, axis=0)
         H2 = np.absolute(H)**2
         nonzero = np.absolute(H) > 0.2
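The surrounding code has the shape of a Wiener filter: H is the measured transfer function, S the signal power, N the noise power, and this hunk stops convolving the noise estimate V with the high-resolution Gaussian response before squaring it. For reference, a generic Wiener deconvolution built from the same ingredients; the signal, kernel and noise level are synthetic, and S is only known exactly here because the example is synthetic:

```python
import numpy as np

rng = np.random.default_rng(3)
n = 256
x = np.zeros(n)
x[100:110] = 1.0                                  # "true" signal
h = np.exp(-0.5 * (np.arange(n) - n/2)**2 / 4.0**2)
h /= h.sum()                                      # Gaussian blur kernel
H = np.fft.fft(np.fft.ifftshift(h))               # transfer function
y = np.fft.ifft(np.fft.fft(x) * H).real           # blurred signal
y += rng.normal(scale=0.01, size=n)               # plus white noise

S = np.abs(np.fft.fft(x))**2                      # signal power
N = n * 0.01**2                                   # flat noise power spectrum
G = np.conj(H) * S / (np.abs(H)**2 * S + N)       # Wiener filter
x_hat = np.fft.ifft(G * np.fft.fft(y)).real       # deconvolved estimate
```

In the real code S and H are instead estimated from training pairs, as the hunk's `np.mean(..., axis=0)` calls suggest.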
@@ -1080,23 +1089,45 @@ class Model(TransformerMixin, BaseEstimator):
         the expected prediction in key "expected", the stat. uncertainty in key "unc" and
         a (1, energy channel) array for the PCA syst. uncertainty in key "pca".
         """
+        #t = list()
+        #n = list()
+        #t += [time_ns()*1e-9]
+        #n += ["Initial"]
         low_res_pre = self.x_select.transform(low_res_data)
+        #t += [time_ns()*1e-9]
+        #n += ["Select"]
         low_pca = self.x_model.transform(low_res_pre)
-        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True, n_jobs=-1)
-        #high_pca = self.fit_model.predict(low_pca)
-        #high_pca_unc = 0
+        #t += [time_ns()*1e-9]
+        #n += ["PCA x"]
+        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True)
+        #t += [time_ns()*1e-9]
+        #n += ["Fit model"]
         n_trains = high_pca.shape[0]
         # Note: The whiten=True setting in the PCA model leads to an affine transformation
         pca_y = np.concatenate((high_pca,
                                 high_pca + high_pca_unc,
                                 ),
                                axis=0)
+        #t += [time_ns()*1e-9]
+        #n += ["Concat"]
         high_res_predicted = self.y_model["pca"].inverse_transform(pca_y)
         expected = high_res_predicted[:n_trains, :]
         e_plus = high_res_predicted[n_trains:(2*n_trains), :]
         unc = np.fabs(e_plus - expected)
         pca_unc = self.y_model['unc'].uncertainty()
         total_unc = np.sqrt(pca_unc**2 + unc**2)
+        #t += [time_ns()*1e-9]
+        #n += ["Unc"]
+        #t = np.diff(np.array(t))
+        #n = n[1:]
+        #print("Times")
+        #print(dict(zip(n, t)))
         return dict(expected=expected,
                     unc=unc,
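Finally, on the `predict()` hunk: the commented-out `time_ns` bookkeeping is profiling scaffolding left in place, and the concatenation trick relies on the note above it, that a whitened PCA inverse transform is affine. Pushing the predicted coefficients and the coefficients shifted by one standard deviation through a single batched `inverse_transform` and subtracting therefore propagates the statistical uncertainty at the cost of one matrix product. A sketch under that assumption, with illustrative shapes and data:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(4)
pca = PCA(n_components=5, whiten=True).fit(rng.normal(size=(300, 40)))

mu = rng.normal(size=(10, 5))          # predicted PCA coefficients
std = 0.1 * np.ones_like(mu)           # their statistical uncertainty

# One batched inverse transform for both mu and mu + std; since the inverse
# transform is affine, the difference of the two halves is the propagated
# 1-sigma shift in the original feature space.
both = pca.inverse_transform(np.concatenate((mu, mu + std), axis=0))
expected = both[:10]
unc = np.abs(both[10:] - expected)
```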