Commit 2a3770fe authored by Danilo Enoque Ferreira de Lima

Merge branch 'speedup' into 'main'

Speed up prediction and outlier detection.

See merge request !9
parents 479df0e3 3b94f54a
@@ -2,4 +2,4 @@
 Estimate high-resolution photon spectrometer data from low-resolution non-invasive measurements.
 """
-VERSION = "0.2.0"
+VERSION = "0.2.1"
@@ -14,10 +14,11 @@ from sklearn.pipeline import Pipeline
 from sklearn.kernel_approximation import Nystroem
 #from sklearn.linear_model import ARDRegression
 from sklearn.linear_model import BayesianRidge
-from sklearn.covariance import EllipticEnvelope
+#from sklearn.covariance import EllipticEnvelope
 from scipy.stats import gaussian_kde
 from functools import reduce
 from itertools import product
+from time import time_ns
 from sklearn.base import clone, MetaEstimatorMixin
 from joblib import Parallel, delayed
@@ -637,7 +638,7 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
         """
         if n_jobs is None:
             n_jobs = self.n_jobs
-        y = Parallel(n_jobs=n_jobs)(
+        y = Parallel(n_jobs=n_jobs, backend="threading")(
             delayed(e.predict)(X, return_std) for e in self.estimators_
             #delayed(e.predict)(X) for e in self.estimators_
         )
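The only change in this hunk is the joblib backend. Below is a minimal, self-contained sketch of the same pattern, assuming one scikit-learn estimator per output column as MultiOutputWithStd uses; the data and variable names are illustrative, not from the repository.

# Sketch: per-output prediction fanned out over threads.
import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import BayesianRidge

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
Y = rng.normal(size=(100, 3))

# One BayesianRidge per output column.
estimators_ = [BayesianRidge().fit(X, Y[:, i]) for i in range(Y.shape[1])]

# backend="threading" avoids pickling X once per worker process; NumPy
# releases the GIL in the heavy linear algebra, so threads can scale here.
results = Parallel(n_jobs=-1, backend="threading")(
    delayed(e.predict)(X, return_std=True) for e in estimators_
)
mean = np.stack([m for m, _ in results], axis=1)  # (samples, outputs)
std = np.stack([s for _, s in results], axis=1)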
@@ -650,31 +651,40 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
 class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
     """
-    Detect outliers from uncorrelated whitened mean-centered inputs.
+    Detect outliers from uncorrelated inputs.
+    It uses a chi^2 sum over the features to flatten the features.
+    The standard deviation is estimated using quantiles.
 
     Args:
+        sigma: Number of standard deviations of the chi^2 distribution.
     """
-    def __init__(self, contamination: float=0.0000005):
+    def __init__(self, sigma: float=5.0):
         super().__init__()
-        self.contamination = contamination
+        self.sigma = sigma
 
     def fit(self, X, y=None) -> OutlierMixin:
         """
         Does nothing.
 
         Args:
-            X: Irrelevant.
+            X: Data where outliers are.
             y: Irrelevant.
 
         Returns: Itself.
         """
-        self.dist_ = self.score_samples(X)
-        self.offset_ = -np.percentile(-self.dist_, 100.0 * self.contamination)
+        bounds_ = np.quantile(X, (0.003/2.0, 0.5, 1.0 - 0.003/2.0), axis=0)
+        self.ndof_ = float(X.shape[1] - 1.0)
+        self.med_ = bounds_[1, np.newaxis, ...]
+        self.sigma_ = (bounds_[2, np.newaxis, ...] - bounds_[0, np.newaxis, ...])/3.0
         return self
 
     def decision_function(self, X: np.ndarray) -> np.ndarray:
         """
         Return the decision function.
+        This is chi^2/ndof - 1 - sigma*sqrt(var_chi2)
         """
-        return self.offset_ - self.score_samples(X)
+        return (self.score_samples(X) - 1.0 - self.sigma*np.sqrt(2.0/self.ndof_))
 
     def score_samples(self, X: np.ndarray) -> np.ndarray:
         """
@@ -683,9 +693,9 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Args:
             X: The new input data.
 
-        Returns: The Mahalanobis distance.
+        Returns: The chi^2 test statistic.
         """
-        return np.sqrt(np.sum(X**2, axis=1))
+        return np.sum(((X - self.med_)/self.sigma_)**2, axis=1)/float(self.ndof_)
 
     def predict(self, X):
         """
@@ -700,9 +710,8 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Returns -1 for anomalies/outliers and +1 for inliers.
         """
         values = self.decision_function(X)
-        is_inlier = np.full(values.shape[0], -1, dtype=int)
-        is_inlier[values >= 0] = 1
+        is_inlier = np.full(values.shape[0], 1, dtype=int)
+        is_inlier[values > 0] = -1
         return is_inlier
 
     def score(self, X, y, sample_weight=None):
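Note the inverted sign convention that comes with the new decision function: a positive value now marks an outlier, while predict keeps scikit-learn's +1 inlier / -1 outlier labels, and exactly zero still counts as an inlier. A tiny runnable illustration of the relabelling branch above, with hypothetical decision values:

import numpy as np

# Hypothetical decision values; positive means "outlier" after this commit.
values = np.array([-3.2, -0.1, 0.0, 0.4, 7.5])

is_inlier = np.full(values.shape[0], 1, dtype=int)  # default: inlier
is_inlier[values > 0] = -1                          # strict >: 0.0 stays inlier
print(is_inlier)  # [ 1  1  1 -1 -1]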
@@ -781,10 +790,10 @@ class Model(TransformerMixin, BaseEstimator):
             ])
         #self.ood = {ch: IsolationForest(n_jobs=-1)
         #            for ch in channels+['full']}
-        #self.ood = {ch: UncorrelatedDeviation(contamination=0.003)
-        #            for ch in channels+['full']}
-        self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        self.ood = {ch: UncorrelatedDeviation(sigma=5)
                     for ch in channels+['full']}
+        #self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        #            for ch in channels+['full']}
         #self.fit_model = MultiOutputWithStd(ARDRegression(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         self.fit_model = MultiOutputWithStd(BayesianRidge(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         #self.fit_model = FitModel()
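The detector swap is a one-line change because all three candidates tried here (IsolationForest, EllipticEnvelope, UncorrelatedDeviation) follow the same scikit-learn outlier API. A sketch of the per-channel pattern using a stock scikit-learn detector as a stand-in (channel names are made up):

from sklearn.ensemble import IsolationForest

channels = ["channel_1_A", "channel_2_B"]  # illustrative names

# One independent detector per channel, plus one over all channels,
# mirroring the dict comprehension in the hunk above. Any detector
# exposing fit/predict/decision_function can be dropped in unchanged.
ood = {ch: IsolationForest(n_jobs=-1) for ch in channels + ["full"]}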
@@ -928,13 +937,13 @@ class Model(TransformerMixin, BaseEstimator):
         E = np.fft.fftfreq(len(e), de)
         e_axis = np.linspace(-0.5*(e[-1] - e[0]), 0.5*(e[-1] - e[0]), len(e))
         # generate a gaussian
-        gaussian = np.exp(-0.5*(e_axis)**2/self.high_res_sigma**2)
-        gaussian /= np.sum(gaussian, axis=0, keepdims=True)
-        gaussian = np.clip(gaussian, a_min=1e-6, a_max=None)
-        gaussian_ft = np.fft.fft(gaussian)
+        #gaussian = np.exp(-0.5*(e_axis)**2/self.high_res_sigma**2)
+        #gaussian /= np.sum(gaussian, axis=0, keepdims=True)
+        #gaussian = np.clip(gaussian, a_min=1e-6, a_max=None)
+        #gaussian_ft = np.fft.fft(gaussian)
         H = np.mean(Z/D, axis=0)
-        N = np.absolute(gaussian_ft*V)**2
+        N = np.absolute(V)**2
         S = np.mean(np.absolute(D)**2, axis=0)
         H2 = np.absolute(H)**2
         nonzero = np.absolute(H) > 0.2
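With the Gaussian smearing factor gone, the noise power N is taken directly from V, while H (the empirical transfer function) and S (the mean signal power) are estimated from the train ensemble. These are the standard ingredients of a Wiener deconvolution filter; the filter expression itself lies outside this hunk, so the assembly below is an assumption, using stand-in arrays shaped like the ones in the code:

import numpy as np

rng = np.random.default_rng(0)
n_trains, n_freq = 200, 1024

# Stand-ins: D = target spectra in Fourier space, Z = measured spectra,
# V = an estimate of the noise spectrum (all illustrative).
D = rng.normal(size=(n_trains, n_freq)) + 1j*rng.normal(size=(n_trains, n_freq))
Z = 0.8*D + 0.1*(rng.normal(size=(n_trains, n_freq)) + 1j*rng.normal(size=(n_trains, n_freq)))
V = 0.1*(rng.normal(size=n_freq) + 1j*rng.normal(size=n_freq))

H = np.mean(Z/D, axis=0)                 # empirical transfer function
N = np.absolute(V)**2                    # noise power, no Gaussian factor
S = np.mean(np.absolute(D)**2, axis=0)   # mean signal power
H2 = np.absolute(H)**2
nonzero = np.absolute(H) > 0.2           # avoid a vanishing response

# Standard Wiener deconvolution filter built from these quantities:
G = np.zeros_like(H)
G[nonzero] = (np.conj(H)[nonzero]*S[nonzero]) / (H2[nonzero]*S[nonzero] + N[nonzero])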
@@ -1080,23 +1089,45 @@ class Model(TransformerMixin, BaseEstimator):
         the expected prediction in key "expected", the stat. uncertainty in key "unc" and
         a (1, energy channel) array for the PCA syst. uncertainty in key "pca".
         """
+        #t = list()
+        #n = list()
+        #t += [time_ns()*1e-9]
+        #n += ["Initial"]
         low_res_pre = self.x_select.transform(low_res_data)
+        #t += [time_ns()*1e-9]
+        #n += ["Select"]
         low_pca = self.x_model.transform(low_res_pre)
-        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True, n_jobs=-1)
+        #high_pca = self.fit_model.predict(low_pca)
+        #high_pca_unc = 0
+        #t += [time_ns()*1e-9]
+        #n += ["PCA x"]
+        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True)
+        #t += [time_ns()*1e-9]
+        #n += ["Fit model"]
         n_trains = high_pca.shape[0]
         # Note: The whiten=True setting in the PCA model leads to an affine transformation
         pca_y = np.concatenate((high_pca,
                                 high_pca + high_pca_unc,
                                 ),
                                axis=0)
+        #t += [time_ns()*1e-9]
+        #n += ["Concat"]
         high_res_predicted = self.y_model["pca"].inverse_transform(pca_y)
         expected = high_res_predicted[:n_trains, :]
         e_plus = high_res_predicted[n_trains:(2*n_trains), :]
         unc = np.fabs(e_plus - expected)
         pca_unc = self.y_model['unc'].uncertainty()
         total_unc = np.sqrt(pca_unc**2 + unc**2)
+        #t += [time_ns()*1e-9]
+        #n += ["Unc"]
+        #t = np.diff(np.array(t))
+        #n = n[1:]
+        #print("Times")
+        #print(dict(zip(n, t)))
         return dict(expected=expected,
                     unc=unc,
......
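The prediction path propagates the regression uncertainty through PCA by inverse-transforming the mean and the mean-plus-one-sigma coefficients in a single call and differencing the results, then combines that with the PCA reconstruction uncertainty in quadrature, as the hunk above does. A compact, runnable sketch of the propagation (the PCA model, data, and the pca_unc value are stand-ins):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
Y_train = rng.normal(size=(500, 100))        # stand-in high-resolution spectra
pca = PCA(n_components=10, whiten=True).fit(Y_train)

high_pca = rng.normal(size=(20, 10))         # predicted PCA coefficients
high_pca_unc = 0.1*np.ones_like(high_pca)    # their 1-sigma uncertainties
n_trains = high_pca.shape[0]

# One inverse_transform for both mean and mean+sigma; with whiten=True the
# inverse is an affine map (per the note in the diff), so the difference of
# the two outputs carries the propagated offset.
pca_y = np.concatenate((high_pca, high_pca + high_pca_unc), axis=0)
high_res = pca.inverse_transform(pca_y)

expected = high_res[:n_trains, :]
e_plus = high_res[n_trains:(2*n_trains), :]
unc = np.fabs(e_plus - expected)             # propagated fit uncertainty

pca_unc = 0.05                               # stand-in PCA syst. uncertainty
total_unc = np.sqrt(pca_unc**2 + unc**2)     # combined in quadrature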