
Speed up prediction and outlier detection.

Merged Danilo Enoque Ferreira de Lima requested to merge speedup into main
1 file changed
@@ -14,10 +14,11 @@ from sklearn.pipeline import Pipeline
 from sklearn.kernel_approximation import Nystroem
 #from sklearn.linear_model import ARDRegression
 from sklearn.linear_model import BayesianRidge
-from sklearn.covariance import EllipticEnvelope
+#from sklearn.covariance import EllipticEnvelope
 from scipy.stats import gaussian_kde
 from functools import reduce
 from itertools import product
+from time import time_ns
 from sklearn.base import clone, MetaEstimatorMixin
 from joblib import Parallel, delayed
@@ -637,7 +638,7 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
"""
if n_jobs is None:
n_jobs = self.n_jobs
y = Parallel(n_jobs=n_jobs)(
y = Parallel(n_jobs=n_jobs, backend="threading")(
delayed(e.predict)(X, return_std) for e in self.estimators_
#delayed(e.predict)(X) for e in self.estimators_
)
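Why switching `Parallel` to `backend="threading"` speeds this up: the default process-based backend spawns workers and pickles `X` once per estimator, which can cost more than a cheap linear predict itself, while threads share memory and `BayesianRidge.predict` spends its time in GIL-releasing NumPy routines. A minimal sketch of the same fan-out pattern; the data and estimator list are made up for illustration:

```python
import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import BayesianRidge

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20))
Y = rng.normal(size=(1000, 3))   # three independent output dimensions

estimators = [BayesianRidge().fit(X, Y[:, i]) for i in range(Y.shape[1])]

# One thread per estimator; X is shared rather than pickled per worker, and
# predict() spends its time in GIL-releasing NumPy calls, so threads overlap.
results = Parallel(n_jobs=3, backend="threading")(
    delayed(e.predict)(X, return_std=True) for e in estimators
)
mean = np.stack([m for m, _ in results], axis=1)  # shape (1000, 3)
std = np.stack([s for _, s in results], axis=1)   # shape (1000, 3)
```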
@@ -650,31 +651,40 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
 class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
     """
-    Detect outliers from uncorrelated whitened mean-centered inputs.
+    Detect outliers from uncorrelated inputs.
+    It uses a chi^2 sum over the features to flatten them into a single statistic.
+    The standard deviation is estimated using quantiles.

     Args:
+        sigma: Number of standard deviations of the chi^2 distribution.
     """
-    def __init__(self, contamination: float=0.0000005):
+    def __init__(self, sigma: float=5.0):
         super().__init__()
-        self.contamination = contamination
+        self.sigma = sigma

     def fit(self, X, y=None) -> OutlierMixin:
         """
-        Does nothing.
+        Estimate the median and a quantile-based standard deviation per feature.

         Args:
-          X: Irrelevant.
+          X: Data from which the quantile statistics are estimated.
           y: Irrelevant.

         Returns: Itself.
         """
-        self.dist_ = self.score_samples(X)
-        self.offset_ = -np.percentile(-self.dist_, 100.0 * self.contamination)
+        bounds_ = np.quantile(X, (0.003/2.0, 0.5, 1.0 - 0.003/2.0), axis=0)
+        self.ndof_ = float(X.shape[1] - 1.0)
+        self.med_ = bounds_[1, np.newaxis, ...]
+        self.sigma_ = (bounds_[2, np.newaxis, ...] - bounds_[0, np.newaxis, ...])/3.0
         return self

     def decision_function(self, X: np.ndarray) -> np.ndarray:
         """
         Return the decision function.
+        This is chi^2/ndof - 1 - sigma*sqrt(2/ndof), where 2/ndof is the
+        variance of chi^2/ndof.
         """
-        return self.offset_ - self.score_samples(X)
+        return (self.score_samples(X) - 1.0 - self.sigma*np.sqrt(2.0/self.ndof_))

     def score_samples(self, X: np.ndarray) -> np.ndarray:
         """
@@ -683,9 +693,9 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Args:
           X: The new input data.

-        Returns: The Mahalanobis distance.
+        Returns: The chi^2 test statistic.
         """
-        return np.sqrt(np.sum(X**2, axis=1))
+        return np.sum(((X - self.med_)/self.sigma_)**2, axis=1)/float(self.ndof_)

     def predict(self, X):
         """
@@ -700,9 +710,8 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Returns -1 for anomalies/outliers and +1 for inliers.
         """
         values = self.decision_function(X)
-        is_inlier = np.full(values.shape[0], -1, dtype=int)
-        is_inlier[values >= 0] = 1
+        is_inlier = np.full(values.shape[0], 1, dtype=int)
+        is_inlier[values > 0] = -1
         return is_inlier

     def score(self, X, y, sample_weight=None):
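Taken together, the reworked `UncorrelatedDeviation` is a closed-form test: `fit()` stores a per-feature median and a quantile-derived scale, `score_samples()` returns chi^2/ndof of the standardized features, and a sample is flagged when that statistic exceeds its expectation of 1 by more than sigma times its standard deviation sqrt(2/ndof). A self-contained sketch of the same logic, with illustrative names rather than the project's API (one difference from the hunk is flagged in the comments):

```python
import numpy as np

def fit_stats(X):
    # Median and a robust scale from the central 99.7% quantile range.
    # For a Gaussian, q(0.9985) - q(0.0015) spans about 5.9 sigma, so /6
    # recovers sigma; the hunk above divides the same range by 3, i.e. it
    # uses a scale roughly twice as large, which makes the test more tolerant.
    lo, med, hi = np.quantile(X, (0.003/2.0, 0.5, 1.0 - 0.003/2.0), axis=0)
    return med, (hi - lo) / 6.0

def is_outlier(X, med, scale, ndof, sigma=5.0):
    # chi^2/ndof of the standardized features: mean 1 and variance 2/ndof
    # for independent Gaussian features, hence the sigma*sqrt(2/ndof) margin.
    chi2_ndof = np.sum(((X - med) / scale)**2, axis=1) / ndof
    return chi2_ndof > 1.0 + sigma * np.sqrt(2.0 / ndof)

rng = np.random.default_rng(1)
X = rng.normal(size=(500, 50))
X[:5] += 10.0                      # plant five obvious outliers
med, scale = fit_stats(X)
print(np.flatnonzero(is_outlier(X, med, scale, ndof=X.shape[1] - 1.0)))
# -> [0 1 2 3 4]
```

Unlike `EllipticEnvelope`, nothing here estimates a covariance matrix, which is what makes both `fit` and `predict` cheap.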
@@ -781,10 +790,10 @@ class Model(TransformerMixin, BaseEstimator):
             ])
         #self.ood = {ch: IsolationForest(n_jobs=-1)
         #            for ch in channels+['full']}
-        #self.ood = {ch: UncorrelatedDeviation(contamination=0.003)
-        #            for ch in channels+['full']}
-        self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        self.ood = {ch: UncorrelatedDeviation(sigma=5)
                     for ch in channels+['full']}
+        #self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        #            for ch in channels+['full']}
         #self.fit_model = MultiOutputWithStd(ARDRegression(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         self.fit_model = MultiOutputWithStd(BayesianRidge(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         #self.fit_model = FitModel()
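For context, the per-channel detector dictionary above would be trained and queried roughly as follows; the channel names and data are stand-ins, and `UncorrelatedDeviation` is assumed to be in scope from this module:

```python
import numpy as np

channels = ["channel_1", "channel_2", "full"]
rng = np.random.default_rng(2)
data = {ch: rng.normal(size=(200, 10)) for ch in channels}

# One independent detector per channel, mirroring the dict comprehension above.
ood = {ch: UncorrelatedDeviation(sigma=5) for ch in channels}
for ch in channels:
    ood[ch].fit(data[ch])

# predict() keeps the scikit-learn convention: +1 for inliers, -1 for outliers.
flags = {ch: ood[ch].predict(data[ch]) for ch in channels}
```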
@@ -928,13 +937,13 @@ class Model(TransformerMixin, BaseEstimator):
         E = np.fft.fftfreq(len(e), de)
         e_axis = np.linspace(-0.5*(e[-1] - e[0]), 0.5*(e[-1] - e[0]), len(e))
         # generate a gaussian
-        gaussian = np.exp(-0.5*(e_axis)**2/self.high_res_sigma**2)
-        gaussian /= np.sum(gaussian, axis=0, keepdims=True)
-        gaussian = np.clip(gaussian, a_min=1e-6, a_max=None)
-        gaussian_ft = np.fft.fft(gaussian)
+        #gaussian = np.exp(-0.5*(e_axis)**2/self.high_res_sigma**2)
+        #gaussian /= np.sum(gaussian, axis=0, keepdims=True)
+        #gaussian = np.clip(gaussian, a_min=1e-6, a_max=None)
+        #gaussian_ft = np.fft.fft(gaussian)
         H = np.mean(Z/D, axis=0)
-        N = np.absolute(gaussian_ft*V)**2
+        N = np.absolute(V)**2
         S = np.mean(np.absolute(D)**2, axis=0)
         H2 = np.absolute(H)**2
         nonzero = np.absolute(H) > 0.2
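The surrounding code has the shape of a Wiener filter: H is the measured transfer function, S the signal power, N the noise power, and this hunk stops convolving the noise estimate V with the high-resolution Gaussian response before squaring it. For reference, a generic Wiener deconvolution built from the same ingredients; the signal, kernel and noise level are synthetic, and S is only known exactly here because the example is synthetic:

```python
import numpy as np

rng = np.random.default_rng(3)
n = 256
x = np.zeros(n)
x[100:110] = 1.0                                  # "true" signal
h = np.exp(-0.5 * (np.arange(n) - n/2)**2 / 4.0**2)
h /= h.sum()                                      # Gaussian blur kernel
H = np.fft.fft(np.fft.ifftshift(h))               # transfer function
y = np.fft.ifft(np.fft.fft(x) * H).real           # blurred signal
y += rng.normal(scale=0.01, size=n)               # plus white noise

S = np.abs(np.fft.fft(x))**2                      # signal power
N = n * 0.01**2                                   # flat noise power spectrum
G = np.conj(H) * S / (np.abs(H)**2 * S + N)       # Wiener filter
x_hat = np.fft.ifft(G * np.fft.fft(y)).real       # deconvolved estimate
```

In the real code S and H are instead estimated from training pairs, as the hunk's `np.mean(..., axis=0)` calls suggest.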
@@ -1080,23 +1089,45 @@ class Model(TransformerMixin, BaseEstimator):
         the expected prediction in key "expected", the stat. uncertainty in key "unc" and
         a (1, energy channel) array for the PCA syst. uncertainty in key "pca".
         """
+        #t = list()
+        #n = list()
+        #t += [time_ns()*1e-9]
+        #n += ["Initial"]
         low_res_pre = self.x_select.transform(low_res_data)
+        #t += [time_ns()*1e-9]
+        #n += ["Select"]
         low_pca = self.x_model.transform(low_res_pre)
-        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True, n_jobs=-1)
-        #high_pca = self.fit_model.predict(low_pca)
-        #high_pca_unc = 0
+        #t += [time_ns()*1e-9]
+        #n += ["PCA x"]
+        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True)
+        #t += [time_ns()*1e-9]
+        #n += ["Fit model"]
         n_trains = high_pca.shape[0]
         # Note: The whiten=True setting in the PCA model leads to an affine transformation
         pca_y = np.concatenate((high_pca,
                                 high_pca + high_pca_unc,
                                 ),
                                axis=0)
+        #t += [time_ns()*1e-9]
+        #n += ["Concat"]
         high_res_predicted = self.y_model["pca"].inverse_transform(pca_y)
         expected = high_res_predicted[:n_trains, :]
         e_plus = high_res_predicted[n_trains:(2*n_trains), :]
         unc = np.fabs(e_plus - expected)
         pca_unc = self.y_model['unc'].uncertainty()
         total_unc = np.sqrt(pca_unc**2 + unc**2)
+        #t += [time_ns()*1e-9]
+        #n += ["Unc"]
+        #t = np.diff(np.array(t))
+        #n = n[1:]
+        #print("Times")
+        #print(dict(zip(n, t)))
         return dict(expected=expected,
                     unc=unc,
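Finally, on the `predict()` hunk: the commented-out `time_ns` bookkeeping is profiling scaffolding left in place, and the concatenation trick relies on the note above it, that a whitened PCA inverse transform is affine. Pushing the predicted coefficients and the coefficients shifted by one standard deviation through a single batched `inverse_transform` and subtracting therefore propagates the statistical uncertainty at the cost of one matrix product. A sketch under that assumption, with illustrative shapes and data:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(4)
pca = PCA(n_components=5, whiten=True).fit(rng.normal(size=(300, 40)))

mu = rng.normal(size=(10, 5))          # predicted PCA coefficients
std = 0.1 * np.ones_like(mu)           # their statistical uncertainty

# One batched inverse transform for both mu and mu + std; since the inverse
# transform is affine, the difference of the two halves is the propagated
# 1-sigma shift in the original feature space.
both = pca.inverse_transform(np.concatenate((mu, mu + std), axis=0))
expected = both[:10]
unc = np.abs(both[10:] - expected)
```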