Commit 2a3770fe authored by Danilo Enoque Ferreira de Lima

Merge branch 'speedup' into 'main'

Speed up prediction and outlier detection.

See merge request !9
parents 479df0e3 3b94f54a
@@ -2,4 +2,4 @@
 Estimate high-resolution photon spectrometer data from low-resolution non-invasive measurements.
 """
-VERSION = "0.2.0"
+VERSION = "0.2.1"
@@ -14,10 +14,11 @@ from sklearn.pipeline import Pipeline
 from sklearn.kernel_approximation import Nystroem
 #from sklearn.linear_model import ARDRegression
 from sklearn.linear_model import BayesianRidge
-from sklearn.covariance import EllipticEnvelope
+#from sklearn.covariance import EllipticEnvelope
 from scipy.stats import gaussian_kde
 from functools import reduce
 from itertools import product
+from time import time_ns
 from sklearn.base import clone, MetaEstimatorMixin
 from joblib import Parallel, delayed
@@ -637,7 +638,7 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
         """
         if n_jobs is None:
             n_jobs = self.n_jobs
-        y = Parallel(n_jobs=n_jobs)(
+        y = Parallel(n_jobs=n_jobs, backend="threading")(
             delayed(e.predict)(X, return_std) for e in self.estimators_
             #delayed(e.predict)(X) for e in self.estimators_
         )
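The only change in this hunk is the joblib backend. Below is a minimal, self-contained sketch of the same pattern, assuming one scikit-learn estimator per output column as MultiOutputWithStd uses; the data and variable names are illustrative, not from the repository.

# Sketch: per-output prediction fanned out over threads.
import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import BayesianRidge

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
Y = rng.normal(size=(100, 3))

# One BayesianRidge per output column.
estimators_ = [BayesianRidge().fit(X, Y[:, i]) for i in range(Y.shape[1])]

# backend="threading" avoids pickling X once per worker process; NumPy
# releases the GIL in the heavy linear algebra, so threads can scale here.
results = Parallel(n_jobs=-1, backend="threading")(
    delayed(e.predict)(X, return_std=True) for e in estimators_
)
mean = np.stack([m for m, _ in results], axis=1)  # (samples, outputs)
std = np.stack([s for _, s in results], axis=1)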
@@ -650,31 +651,40 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
 class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
     """
-    Detect outliers from uncorrelated whitened mean-centered inputs.
+    Detect outliers from uncorrelated inputs.
+    It uses a chi^2 sum over the features to flatten the features.
+    The standard deviation is estimated using quantiles.
 
     Args:
+        sigma: Number of standard deviations of the chi^2 distribution.
     """
-    def __init__(self, contamination: float=0.0000005):
+    def __init__(self, sigma: float=5.0):
         super().__init__()
-        self.contamination = contamination
+        self.sigma = sigma
 
     def fit(self, X, y=None) -> OutlierMixin:
         """
         Does nothing.
 
         Args:
-            X: Irrelevant.
+            X: Data where outliers are.
             y: Irrelevant.
 
         Returns: Itself.
         """
-        self.dist_ = self.score_samples(X)
-        self.offset_ = -np.percentile(-self.dist_, 100.0 * self.contamination)
+        bounds_ = np.quantile(X, (0.003/2.0, 0.5, 1.0 - 0.003/2.0), axis=0)
+        self.ndof_ = float(X.shape[1] - 1.0)
+        self.med_ = bounds_[1, np.newaxis, ...]
+        self.sigma_ = (bounds_[2, np.newaxis, ...] - bounds_[0, np.newaxis, ...])/3.0
         return self
 
     def decision_function(self, X: np.ndarray) -> np.ndarray:
         """
         Return the decision function.
+        This is chi^2/ndof - 1 - sigma*sqrt(var_chi2)
         """
-        return self.offset_ - self.score_samples(X)
+        return (self.score_samples(X) - 1.0 - self.sigma*np.sqrt(2.0/self.ndof_))
 
     def score_samples(self, X: np.ndarray) -> np.ndarray:
         """
@@ -683,9 +693,9 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Args:
             X: The new input data.
 
-        Returns: The Mahalanobis distance.
+        Returns: The chi^2 test statistic.
         """
-        return np.sqrt(np.sum(X**2, axis=1))
+        return np.sum(((X - self.med_)/self.sigma_)**2, axis=1)/float(self.ndof_)
 
     def predict(self, X):
         """
@@ -700,9 +710,8 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Returns -1 for anomalies/outliers and +1 for inliers.
         """
         values = self.decision_function(X)
-        is_inlier = np.full(values.shape[0], -1, dtype=int)
-        is_inlier[values >= 0] = 1
+        is_inlier = np.full(values.shape[0], 1, dtype=int)
+        is_inlier[values > 0] = -1
         return is_inlier
 
     def score(self, X, y, sample_weight=None):
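Note the inverted sign convention that comes with the new decision function: a positive value now marks an outlier, while predict keeps scikit-learn's +1 inlier / -1 outlier labels, and exactly zero still counts as an inlier. A tiny runnable illustration of the relabelling branch above, with hypothetical decision values:

import numpy as np

# Hypothetical decision values; positive means "outlier" after this commit.
values = np.array([-3.2, -0.1, 0.0, 0.4, 7.5])

is_inlier = np.full(values.shape[0], 1, dtype=int)  # default: inlier
is_inlier[values > 0] = -1                          # strict >: 0.0 stays inlier
print(is_inlier)  # [ 1  1  1 -1 -1]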
@@ -781,10 +790,10 @@ class Model(TransformerMixin, BaseEstimator):
             ])
         #self.ood = {ch: IsolationForest(n_jobs=-1)
         #            for ch in channels+['full']}
-        #self.ood = {ch: UncorrelatedDeviation(contamination=0.003)
-        #            for ch in channels+['full']}
-        self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        self.ood = {ch: UncorrelatedDeviation(sigma=5)
                     for ch in channels+['full']}
+        #self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        #            for ch in channels+['full']}
         #self.fit_model = MultiOutputWithStd(ARDRegression(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         self.fit_model = MultiOutputWithStd(BayesianRidge(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         #self.fit_model = FitModel()
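The detector swap is a one-line change because all three candidates tried here (IsolationForest, EllipticEnvelope, UncorrelatedDeviation) follow the same scikit-learn outlier API. A sketch of the per-channel pattern using a stock scikit-learn detector as a stand-in (channel names are made up):

from sklearn.ensemble import IsolationForest

channels = ["channel_1_A", "channel_2_B"]  # illustrative names

# One independent detector per channel, plus one over all channels,
# mirroring the dict comprehension in the hunk above. Any detector
# exposing fit/predict/decision_function can be dropped in unchanged.
ood = {ch: IsolationForest(n_jobs=-1) for ch in channels + ["full"]}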
@@ -928,13 +937,13 @@ class Model(TransformerMixin, BaseEstimator):
         E = np.fft.fftfreq(len(e), de)
         e_axis = np.linspace(-0.5*(e[-1] - e[0]), 0.5*(e[-1] - e[0]), len(e))
         # generate a gaussian
-        gaussian = np.exp(-0.5*(e_axis)**2/self.high_res_sigma**2)
-        gaussian /= np.sum(gaussian, axis=0, keepdims=True)
-        gaussian = np.clip(gaussian, a_min=1e-6, a_max=None)
-        gaussian_ft = np.fft.fft(gaussian)
+        #gaussian = np.exp(-0.5*(e_axis)**2/self.high_res_sigma**2)
+        #gaussian /= np.sum(gaussian, axis=0, keepdims=True)
+        #gaussian = np.clip(gaussian, a_min=1e-6, a_max=None)
+        #gaussian_ft = np.fft.fft(gaussian)
         H = np.mean(Z/D, axis=0)
-        N = np.absolute(gaussian_ft*V)**2
+        N = np.absolute(V)**2
         S = np.mean(np.absolute(D)**2, axis=0)
         H2 = np.absolute(H)**2
         nonzero = np.absolute(H) > 0.2
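With the Gaussian smearing factor gone, the noise power N is taken directly from V, while H (the empirical transfer function) and S (the mean signal power) are estimated from the train ensemble. These are the standard ingredients of a Wiener deconvolution filter; the filter expression itself lies outside this hunk, so the assembly below is an assumption, using stand-in arrays shaped like the ones in the code:

import numpy as np

rng = np.random.default_rng(0)
n_trains, n_freq = 200, 1024

# Stand-ins: D = target spectra in Fourier space, Z = measured spectra,
# V = an estimate of the noise spectrum (all illustrative).
D = rng.normal(size=(n_trains, n_freq)) + 1j*rng.normal(size=(n_trains, n_freq))
Z = 0.8*D + 0.1*(rng.normal(size=(n_trains, n_freq)) + 1j*rng.normal(size=(n_trains, n_freq)))
V = 0.1*(rng.normal(size=n_freq) + 1j*rng.normal(size=n_freq))

H = np.mean(Z/D, axis=0)                 # empirical transfer function
N = np.absolute(V)**2                    # noise power, no Gaussian factor
S = np.mean(np.absolute(D)**2, axis=0)   # mean signal power
H2 = np.absolute(H)**2
nonzero = np.absolute(H) > 0.2           # avoid a vanishing response

# Standard Wiener deconvolution filter built from these quantities:
G = np.zeros_like(H)
G[nonzero] = (np.conj(H)[nonzero]*S[nonzero]) / (H2[nonzero]*S[nonzero] + N[nonzero])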
@@ -1080,23 +1089,45 @@ class Model(TransformerMixin, BaseEstimator):
         the expected prediction in key "expected", the stat. uncertainty in key "unc" and
         a (1, energy channel) array for the PCA syst. uncertainty in key "pca".
         """
+        #t = list()
+        #n = list()
+        #t += [time_ns()*1e-9]
+        #n += ["Initial"]
         low_res_pre = self.x_select.transform(low_res_data)
+        #t += [time_ns()*1e-9]
+        #n += ["Select"]
         low_pca = self.x_model.transform(low_res_pre)
-        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True, n_jobs=-1)
+        #high_pca = self.fit_model.predict(low_pca)
+        #high_pca_unc = 0
+        #t += [time_ns()*1e-9]
+        #n += ["PCA x"]
+        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True)
+        #t += [time_ns()*1e-9]
+        #n += ["Fit model"]
         n_trains = high_pca.shape[0]
         # Note: The whiten=True setting in the PCA model leads to an affine transformation
         pca_y = np.concatenate((high_pca,
                                 high_pca + high_pca_unc,
                                 ),
                                axis=0)
+        #t += [time_ns()*1e-9]
+        #n += ["Concat"]
         high_res_predicted = self.y_model["pca"].inverse_transform(pca_y)
         expected = high_res_predicted[:n_trains, :]
         e_plus = high_res_predicted[n_trains:(2*n_trains), :]
         unc = np.fabs(e_plus - expected)
         pca_unc = self.y_model['unc'].uncertainty()
         total_unc = np.sqrt(pca_unc**2 + unc**2)
+        #t += [time_ns()*1e-9]
+        #n += ["Unc"]
+        #t = np.diff(np.array(t))
+        #n = n[1:]
+        #print("Times")
+        #print(dict(zip(n, t)))
         return dict(expected=expected,
                     unc=unc,
......
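The prediction path propagates the regression uncertainty through PCA by inverse-transforming the mean and the mean-plus-one-sigma coefficients in a single call and differencing the results, then combines that with the PCA reconstruction uncertainty in quadrature, as the hunk above does. A compact, runnable sketch of the propagation (the PCA model, data, and the pca_unc value are stand-ins):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
Y_train = rng.normal(size=(500, 100))        # stand-in high-resolution spectra
pca = PCA(n_components=10, whiten=True).fit(Y_train)

high_pca = rng.normal(size=(20, 10))         # predicted PCA coefficients
high_pca_unc = 0.1*np.ones_like(high_pca)    # their 1-sigma uncertainties
n_trains = high_pca.shape[0]

# One inverse_transform for both mean and mean+sigma; with whiten=True the
# inverse is an affine map (per the note in the diff), so the difference of
# the two outputs carries the propagated offset.
pca_y = np.concatenate((high_pca, high_pca + high_pca_unc), axis=0)
high_res = pca.inverse_transform(pca_y)

expected = high_res[:n_trains, :]
e_plus = high_res[n_trains:(2*n_trains), :]
unc = np.fabs(e_plus - expected)             # propagated fit uncertainty

pca_unc = 0.05                               # stand-in PCA syst. uncertainty
total_unc = np.sqrt(pca_unc**2 + unc**2)     # combined in quadrature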