Skip to content
Snippets Groups Projects
Commit 3db94efa authored by Danilo Ferreira de Lima
Browse files

Corrected chi^2 definition.

parent 649fc407
No related branches found
No related tags found
1 merge request: !9 "Speed up prediction and outlier detection."
This commit is part of merge request !9. Comments created here will be created in the context of that merge request.
...@@ -2,4 +2,4 @@ ...@@ -2,4 +2,4 @@
Estimate high-resolution photon spectrometer data from low-resolution non-invasive measurements. Estimate high-resolution photon spectrometer data from low-resolution non-invasive measurements.
""" """
VERSION = "0.2.0" VERSION = "0.2.1"
...@@ -651,11 +651,17 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator): ...@@ -651,11 +651,17 @@ class MultiOutputWithStd(MetaEstimatorMixin, BaseEstimator):
class UncorrelatedDeviation(OutlierMixin, BaseEstimator): class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
""" """
Detect outliers from uncorrelated whitened mean-centered inputs. Detect outliers from uncorrelated inputs.
It uses a chi^2 sum over the features to flatten the features.
The standard deviation is estimated using quantiles.
Args:
sigma: Number of standard deviations of the chi^2 distribution.
""" """
def __init__(self, contamination: float=0.003): def __init__(self, sigma: float=5.0):
super().__init__() super().__init__()
self.contamination = contamination self.sigma = sigma
def fit(self, X, y=None) -> OutlierMixin: def fit(self, X, y=None) -> OutlierMixin:
""" """
...@@ -667,14 +673,18 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator): ...@@ -667,14 +673,18 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
Returns: Itself. Returns: Itself.
""" """
self.bounds_ = np.quantile(X, (self.contamination/2.0, 0.5, 1.0 - self.contamination/2.0), axis=0) bounds_ = np.quantile(X, (0.003/2.0, 0.5, 1.0 - 0.003/2.0), axis=0)
self.ndof_ = float(X.shape[1] - 1.0)
self.med_ = bounds_[1, np.newaxis, ...]
self.sigma_ = (bounds_[2, np.newaxis, ...] - bounds_[0, np.newaxis, ...])/3.0
return self return self
def decision_function(self, X: np.ndarray) -> np.ndarray: def decision_function(self, X: np.ndarray) -> np.ndarray:
""" """
Return the decision function. Return the decision function.
This is chi^2/ndof - 1 - sigma*sqrt(var_chi2)
""" """
return self.score_samples(X) - 1.0 return (self.score_samples(X) - 1.0 - self.sigma*np.sqrt(2.0/self.ndof_))
def score_samples(self, X: np.ndarray) -> np.ndarray: def score_samples(self, X: np.ndarray) -> np.ndarray:
""" """
...@@ -683,11 +693,9 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator): ...@@ -683,11 +693,9 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
Args: Args:
X: The new input data. X: The new input data.
Returns: The Mahalanobis distance. Returns: The chi^2 test statistic.
""" """
med = self.bounds_[1, np.newaxis, ...] return np.sum(((X - self.med_)/self.sigma_)**2, axis=1)/float(self.ndof_)
sigma = (self.bounds_[2, np.newaxis, ...] - self.bounds_[0, np.newaxis, ...])/0.5
return np.fabs((X - med)/sigma)
def predict(self, X): def predict(self, X):
""" """
...@@ -702,10 +710,8 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator): ...@@ -702,10 +710,8 @@ class UncorrelatedDeviation(OutlierMixin, BaseEstimator):
Returns -1 for anomalies/outliers and +1 for inliers. Returns -1 for anomalies/outliers and +1 for inliers.
""" """
values = self.decision_function(X) values = self.decision_function(X)
is_lower = np.any(X < self.bounds_[0, np.newaxis, ...], axis=1) is_inlier = np.full(values.shape[0], 1, dtype=int)
is_upper = np.any(X > self.bounds_[2, np.newaxis, ...], axis=1) is_inlier[values > 0] = -1
is_inlier = np.full(values.shape[0], -1, dtype=int)
is_inlier[is_lower | is_upper] = 1
return is_inlier return is_inlier
def score(self, X, y, sample_weight=None): def score(self, X, y, sample_weight=None):
...@@ -784,7 +790,7 @@ class Model(TransformerMixin, BaseEstimator): ...@@ -784,7 +790,7 @@ class Model(TransformerMixin, BaseEstimator):
]) ])
#self.ood = {ch: IsolationForest(n_jobs=-1) #self.ood = {ch: IsolationForest(n_jobs=-1)
# for ch in channels+['full']} # for ch in channels+['full']}
self.ood = {ch: UncorrelatedDeviation(contamination=0.003) self.ood = {ch: UncorrelatedDeviation(sigma=5)
for ch in channels+['full']} for ch in channels+['full']}
#self.ood = {ch: EllipticEnvelope(contamination=0.003) #self.ood = {ch: EllipticEnvelope(contamination=0.003)
# for ch in channels+['full']} # for ch in channels+['full']}
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment