Speed up prediction and outlier detection.

Merged Danilo Enoque Ferreira de Lima requested to merge speedup into main
1 file  +41  −16
@@ -14,10 +14,11 @@ from sklearn.pipeline import Pipeline
 from sklearn.kernel_approximation import Nystroem
 #from sklearn.linear_model import ARDRegression
 from sklearn.linear_model import BayesianRidge
-from sklearn.covariance import EllipticEnvelope
+#from sklearn.covariance import EllipticEnvelope
 from scipy.stats import gaussian_kde
 from functools import reduce
 from itertools import product
+from time import time_ns
 from sklearn.base import clone, MetaEstimatorMixin
 from joblib import Parallel, delayed
@@ -637,7 +638,7 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
"""
if n_jobs is None:
n_jobs = self.n_jobs
y = Parallel(n_jobs=n_jobs)(
y = Parallel(n_jobs=n_jobs, backend="threading")(
delayed(e.predict)(X, return_std) for e in self.estimators_
#delayed(e.predict)(X) for e in self.estimators_
)
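A note on the backend switch: joblib's default process-based backend pickles X and each fitted estimator to worker processes on every call, which can dominate runtime when the per-estimator predict is fast. BayesianRidge.predict is mostly NumPy, which releases the GIL, so the "threading" backend runs the estimators concurrently in one process with no serialization. A minimal standalone sketch (data shapes and estimator count are assumptions, not taken from this MR):

import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import BayesianRidge

rng = np.random.default_rng(0)
X = rng.normal(size=(5000, 50))
# One independent regressor per output, as in MultiOutputWithStd.
estimators = [BayesianRidge().fit(X, X @ rng.normal(size=50)) for _ in range(4)]

# Thread-based workers share memory: no pickling of X or the estimators.
y = Parallel(n_jobs=4, backend="threading")(
    delayed(e.predict)(X, return_std=True) for e in estimators
)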
@@ -652,7 +653,7 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
"""
Detect outliers from uncorrelated whitened mean-centered inputs.
"""
def __init__(self, contamination: float=0.0000005):
def __init__(self, contamination: float=0.003):
super().__init__()
self.contamination = contamination
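For context on the new default (my reading, not stated in the MR): contamination is the expected fraction of outliers, and 0.003 is close to the two-sided Gaussian tail mass beyond three standard deviations, so the detector flags roughly the 3-sigma tails:

from scipy.stats import norm

# Two-sided tail mass outside +/- 3 sigma for a Gaussian: ~0.0027.
print(2.0 * norm.sf(3.0))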
@@ -661,20 +662,19 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Does nothing.

         Args:
-          X: Irrelevant.
+          X: Data where outliers are.
           y: Irrelevant.

         Returns: Itself.
         """
-        self.dist_ = self.score_samples(X)
-        self.offset_ = -np.percentile(-self.dist_, 100.0 * self.contamination)
+        self.bounds_ = np.quantile(X, (self.contamination/2.0, 0.5, 1.0 - self.contamination/2.0), axis=0)
         return self

     def decision_function(self, X: np.ndarray) -> np.ndarray:
         """
         Return the decision function.
         """
-        return self.offset_ - self.score_samples(X)
+        return self.score_samples(X) - 1.0

     def score_samples(self, X: np.ndarray) -> np.ndarray:
         """
@@ -685,7 +685,9 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Returns: The Mahalanobis distance.
         """
-        return np.sqrt(np.sum(X**2, axis=1))
+        med = self.bounds_[1, np.newaxis, ...]
+        sigma = (self.bounds_[2, np.newaxis, ...] - self.bounds_[0, np.newaxis, ...])/0.5
+        return np.fabs((X - med)/sigma)

     def predict(self, X):
         """
@@ -700,9 +702,10 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
         Returns -1 for anomalies/outliers and +1 for inliers.
         """
         values = self.decision_function(X)
+        is_lower = np.any(X < self.bounds_[0, np.newaxis, ...], axis=1)
+        is_upper = np.any(X > self.bounds_[2, np.newaxis, ...], axis=1)
         is_inlier = np.full(values.shape[0], -1, dtype=int)
-        is_inlier[values >= 0] = 1
+        is_inlier[is_lower | is_upper] = 1
         return is_inlier

     def score(self, X, y, sample_weight=None):
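The reworked fit no longer calls score_samples; it only stores per-feature quantiles in bounds_ (low tail, median, high tail), and predict flags samples whose value falls outside the tail quantiles in any feature. A standalone sketch of that logic on synthetic Gaussian data (input shapes are assumptions):

import numpy as np

contamination = 0.003
rng = np.random.default_rng(0)
X = rng.normal(size=(100000, 3))

# Per-feature (low tail, median, high tail) quantiles, as in fit().
bounds = np.quantile(X, (contamination/2.0, 0.5, 1.0 - contamination/2.0), axis=0)

# A sample is flagged if any feature exceeds its tail quantile, as in predict().
is_lower = np.any(X < bounds[0, np.newaxis, ...], axis=1)
is_upper = np.any(X > bounds[2, np.newaxis, ...], axis=1)
print((is_lower | is_upper).mean())  # ~contamination per feature; ~0.9% over 3 independent features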
@@ -781,10 +784,10 @@ class Model(TransformerMixin, BaseEstimator):
         ])
         #self.ood = {ch: IsolationForest(n_jobs=-1)
         #            for ch in channels+['full']}
-        #self.ood = {ch: UncorrelatedDeviation(contamination=0.003)
-        #            for ch in channels+['full']}
-        self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        self.ood = {ch: UncorrelatedDeviation(contamination=0.003)
                     for ch in channels+['full']}
+        #self.ood = {ch: EllipticEnvelope(contamination=0.003)
+        #            for ch in channels+['full']}
         #self.fit_model = MultiOutputWithStd(ARDRegression(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         self.fit_model = MultiOutputWithStd(BayesianRidge(n_iter=300, tol=1e-8, verbose=True), n_jobs=8)
         #self.fit_model = FitModel()
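Why this swap speeds things up (my understanding, not stated in the MR): EllipticEnvelope fits a robust Minimum Covariance Determinant estimate, which is costly and arguably redundant here since the inputs are already whitened and mean-centered, while UncorrelatedDeviation's fit reduces to a single np.quantile pass. A hypothetical micro-benchmark sketch, not part of this MR:

import numpy as np
from time import time_ns
from sklearn.covariance import EllipticEnvelope

rng = np.random.default_rng(0)
X = rng.normal(size=(10000, 20))

t0 = time_ns()
EllipticEnvelope(contamination=0.003).fit(X)  # robust covariance (MCD) fit
print(f"EllipticEnvelope.fit: {(time_ns() - t0)*1e-9:.3f} s")

t0 = time_ns()
np.quantile(X, (0.0015, 0.5, 0.9985), axis=0)  # all UncorrelatedDeviation.fit does
print(f"quantile-based fit:   {(time_ns() - t0)*1e-9:.3f} s")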
@@ -1080,23 +1083,45 @@ class Model(TransformerMixin, BaseEstimator):
         the expected prediction in key "expected", the stat. uncertainty in key "unc" and
         a (1, energy channel) array for the PCA syst. uncertainty in key "pca".
         """
+        #t = list()
+        #n = list()
+        #t += [time_ns()*1e-9]
+        #n += ["Initial"]
         low_res_pre = self.x_select.transform(low_res_data)
+        #t += [time_ns()*1e-9]
+        #n += ["Select"]
         low_pca = self.x_model.transform(low_res_pre)
-        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True, n_jobs=-1)
-        #high_pca = self.fit_model.predict(low_pca)
-        #high_pca_unc = 0
+        #t += [time_ns()*1e-9]
+        #n += ["PCA x"]
+        high_pca, high_pca_unc = self.fit_model.predict(low_pca, return_std=True)
+        #t += [time_ns()*1e-9]
+        #n += ["Fit model"]
         n_trains = high_pca.shape[0]
         # Note: The whiten=True setting in the PCA model leads to an affine transformation
         pca_y = np.concatenate((high_pca,
                                 high_pca + high_pca_unc,
                                 ),
                                axis=0)
+        #t += [time_ns()*1e-9]
+        #n += ["Concat"]
         high_res_predicted = self.y_model["pca"].inverse_transform(pca_y)
         expected = high_res_predicted[:n_trains, :]
         e_plus = high_res_predicted[n_trains:(2*n_trains), :]
         unc = np.fabs(e_plus - expected)
         pca_unc = self.y_model['unc'].uncertainty()
         total_unc = np.sqrt(pca_unc**2 + unc**2)
+        #t += [time_ns()*1e-9]
+        #n += ["Unc"]
+        #t = np.diff(np.array(t))
+        #n = n[1:]
+        #print("Times")
+        #print(dict(zip(n, t)))
         return dict(expected=expected,
                     unc=unc,
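On the concatenation trick above: because inverse_transform of a whitened PCA is affine, a per-component std can be propagated by inverse-transforming mu and mu + sigma in one batched call and taking the absolute difference of the two halves. A standalone sketch of that idea (shapes and the 0.1 uncertainty are placeholders, not values from this MR):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
Y = rng.normal(size=(500, 40))
pca = PCA(n_components=10, whiten=True).fit(Y)

mu = pca.transform(Y[:8])       # stand-in for the predicted PCA components
sigma = np.full_like(mu, 0.1)   # stand-in for their stat. uncertainty

# One batched inverse_transform instead of two separate calls.
stacked = pca.inverse_transform(np.concatenate((mu, mu + sigma), axis=0))
expected = stacked[:8]
unc = np.fabs(stacked[8:] - expected)  # propagated per-output uncertainty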