diff --git a/CHANGES.txt b/CHANGES.txt index 83eed4930..364b46e17 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -179,4 +179,7 @@ v<1.0.8>, <03/08/2023> -- Optimized ECDF and drop Statsmodels dependency (#467). v<1.0.9>, <03/19/2023> -- Hot fix for errors in ECOD and COPOD due to the issue of scipy. v<1.1.0>, <06/19/2023> -- Further integration of PyThresh. v<1.1.1>, <07/03/2023> -- Bump up sklearn requirement and some hot fixes. -v<1.1.1>, <10/24/2023> -- Add deep isolation forest (#506) \ No newline at end of file +v<1.1.1>, <10/24/2023> -- Add deep isolation forest (#506). +v<1.1.2>, <11/17/2023> -- Massive documentation optimization. +v<1.1.2>, <11/17/2023> -- Fix the issue of contamination. +v<1.1.2>, <11/17/2023> -- KPCA bug fix (#494). \ No newline at end of file diff --git a/README.rst b/README.rst index 1a01529a1..c8b443f9d 100644 --- a/README.rst +++ b/README.rst @@ -58,20 +58,35 @@ Python Outlier Detection (PyOD) ----- -**News**: We have a 45-page, the most comprehensive `anomaly detection benchmark paper `_. -The fully `open-sourced ADBench `_ compares 30 anomaly detection algorithms on 57 benchmark datasets. -**For time-series outlier detection**, please use `TODS `_. -**For graph outlier detection**, please use `PyGOD `_. +Read Me First +^^^^^^^^^^^^^ + +Welcome to PyOD, a versatile Python library for detecting anomalies in multivariate data. Whether you're tackling a small-scale project or large datasets, PyOD offers a range of algorithms to suit your needs. + +* **For time-series outlier detection**, please use `TODS `_. + +* **For graph outlier detection**, please use `PyGOD `_. + +* **Performance Comparison \& Datasets**: We have a 45-page, the most comprehensive `anomaly detection benchmark paper `_. The fully `open-sourced ADBench `_ compares 30 anomaly detection algorithms on 57 benchmark datasets. 
+ +* **Learn more about anomaly detection** \@ `Anomaly Detection Resources `_ + +* **PyOD on Distributed Systems**: you could also run `PyOD on databricks `_. + +---- + +About PyOD +^^^^^^^^^^ -PyOD is the most comprehensive and scalable **Python library** for **detecting outlying objects** in +PyOD, established in 2017, has become a go-to **Python library** for **detecting anomalous/outlying objects** in multivariate data. This exciting yet challenging field is commonly referred as `Outlier Detection `_ or `Anomaly Detection `_. -PyOD includes more than 40 detection algorithms, from classical LOF (SIGMOD 2000) to -the latest ECOD and DIF (TKDE 2022 and 2023). Since 2017, PyOD has been successfully used in numerous academic researches and -commercial products with more than `10 million downloads `_. +PyOD includes more than 50 detection algorithms, from classical LOF (SIGMOD 2000) to +the cutting-edge ECOD and DIF (TKDE 2022 and 2023). Since 2017, PyOD has been successfully used in numerous academic researches and +commercial products with more than `17 million downloads `_. It is also well acknowledged by the machine learning community with various dedicated posts/tutorials, including `Analytics Vidhya `_, `KDnuggets `_, and @@ -80,10 +95,10 @@ It is also well acknowledged by the machine learning community with various dedi **PyOD is featured for**: -* **Unified APIs, detailed documentation, and interactive examples** across various algorithms. -* **Advanced models**\, including **classical distance and density estimation**, **latest deep learning methods**, and **emerging algorithms like ECOD**. -* **Optimized performance with JIT and parallelization** using `numba `_ and `joblib `_. -* **Fast training & prediction with SUOD** [#Zhao2021SUOD]_. +* **Unified, User-Friendly Interface** across various algorithms. +* **Wide Range of Models**\, from classic techniques to the latest deep learning methods. 
+* **High Performance & Efficiency**, leveraging `numba `_ and `joblib `_ for JIT compilation and parallel processing. +* **Fast Training & Prediction**, achieved through the SUOD framework [#Zhao2021SUOD]_. **Outlier Detection with 5 Lines of Code**\ : @@ -92,22 +107,19 @@ It is also well acknowledged by the machine learning community with various dedi .. code-block:: python - # train an ECOD detector + # Example: Training an ECOD detector from pyod.models.ecod import ECOD clf = ECOD() clf.fit(X_train) + y_train_scores = clf.decision_scores_ # Outlier scores for training data + y_test_scores = clf.decision_function(X_test) # Outlier scores for test data - # get outlier scores - y_train_scores = clf.decision_scores_ # raw outlier scores on the train data - y_test_scores = clf.decision_function(X_test) # predict raw outlier scores on test - - -**Personal suggestion on selecting an OD algorithm**. If you do not know which algorithm to try, go with: +**Selecting the Right Algorithm**: Unsure where to start? Consider these robust and interpretable options: - `ECOD `_: Example of using ECOD for outlier detection - `Isolation Forest `_: Example of using Isolation Forest for outlier detection -They are both fast and interpretable. Or, you could try more data-driven approach `MetaOD `_. +Alternatively, explore `MetaOD `_ for a data-driven approach. **Citing PyOD**\ : @@ -131,21 +143,26 @@ or:: Zhao, Y., Nasrullah, Z. and Li, Z., 2019. PyOD: A Python Toolbox for Scalable Outlier Detection. Journal of machine learning research (JMLR), 20(96), pp.1-7. 
-If you want more general insights of anomaly detection and/or algorithm performance comparison, please see our -NeurIPS 2022 paper `ADBench: Anomaly Detection Benchmark Paper `_:: +For a broader perspective on anomaly detection, see our NeurIPS papers +`ADBench: Anomaly Detection Benchmark Paper `_ \& `ADGym: Design Choices for Deep Anomaly Detection `_:: - @inproceedings{han2022adbench, - title={ADBench: Anomaly Detection Benchmark}, - author={Songqiao Han and Xiyang Hu and Hailiang Huang and Mingqi Jiang and Yue Zhao}, - booktitle={Neural Information Processing Systems (NeurIPS)} - year={2022}, + @article{han2022adbench, + title={Adbench: Anomaly detection benchmark}, + author={Han, Songqiao and Hu, Xiyang and Huang, Hailiang and Jiang, Minqi and Zhao, Yue}, + journal={Advances in Neural Information Processing Systems}, + volume={35}, + pages={32142--32159}, + year={2022} } -**Key Links and Resources**\ : - + @article{jiang2023adgym, + title={ADGym: Design Choices for Deep Anomaly Detection}, + author={Jiang, Minqi and Hou, Chaochuan and Zheng, Ao and Han, Songqiao and Huang, Hailiang and Wen, Qingsong and Hu, Xiyang and Zhao, Yue}, + journal={Advances in Neural Information Processing Systems}, + volume={36}, + year={2023} + } -* `View the latest codes on Github `_ -* `Anomaly Detection Resources `_ **Table of Contents**\ : @@ -153,7 +170,7 @@ NeurIPS 2022 paper `ADBench: Anomaly Detection Benchmark Paper `_ * `API Cheatsheet & Reference <#api-cheatsheet--reference>`_ -* `ADBench Benchmark <#adbench-benchmark>`_ +* `ADBench Benchmark and Datasets <#adbench-benchmark-and-datasets>`_ * `Model Save & Load <#model-save--load>`_ * `Fast Train with SUOD <#fast-train-with-suod>`_ * `Thresholding Outlier Scores <#thresholding-outlier-scores>`_ @@ -169,8 +186,8 @@ NeurIPS 2022 paper `ADBench: Anomaly Detection Benchmark Paper =1.19 @@ -207,19 +224,12 @@ Alternatively, you could clone and run setup.py file: * combo (optional, required for models/combination.py and 
FeatureBagging) * keras/tensorflow (optional, required for AutoEncoder, and other deep learning models) -* pandas (optional, required for running benchmark) * suod (optional, required for running SUOD model) * xgboost (optional, required for XGBOD) -* pythresh to use thresholding +* pythresh (optional, required for thresholding) **Warning**\ : -PyOD has multiple neural network based models, e.g., AutoEncoders, which are -implemented in both Tensorflow and PyTorch. However, PyOD does **NOT** install these deep learning libraries for you. -This reduces the risk of interfering with your local copies. -If you want to use neural-net based models, please make sure these deep learning libraries are installed. -Instructions are provided: `neural-net FAQ `_. -Similarly, models depending on **xgboost**, e.g., XGBOD, would **NOT** enforce xgboost installation by default. - +PyOD includes several neural network-based models, such as AutoEncoders, implemented in Tensorflow and PyTorch. These deep learning libraries are not automatically installed by PyOD to avoid conflicts with existing installations. If you plan to use neural-net based models, please ensure these libraries are installed. See the `neural-net FAQ `_ for guidance. Additionally, xgboost is not installed by default but is required for models like XGBOD. ---- @@ -228,29 +238,27 @@ Similarly, models depending on **xgboost**, e.g., XGBOD, would **NOT** enforce x API Cheatsheet & Reference ^^^^^^^^^^^^^^^^^^^^^^^^^^ -Full API Reference: (https://pyod.readthedocs.io/en/latest/pyod.html). API cheatsheet for all detectors: - +The full API Reference is available at `PyOD Documentation `_. Below is a quick cheatsheet for all detectors: -* **fit(X)**\ : Fit detector. y is ignored in unsupervised methods. -* **decision_function(X)**\ : Predict raw anomaly score of X using the fitted detector. -* **predict(X)**\ : Predict if a particular sample is an outlier or not using the fitted detector. 
-* **predict_proba(X)**\ : Predict the probability of a sample being outlier using the fitted detector. -* **predict_confidence(X)**\ : Predict the model's sample-wise confidence (available in predict and predict_proba) [#Perini2020Quantifying]_. +* **fit(X)**\ : Fit the detector. The parameter y is ignored in unsupervised methods. +* **decision_function(X)**\ : Predict raw anomaly scores for X using the fitted detector. +* **predict(X)**\ : Determine whether a sample is an outlier or not as binary labels using the fitted detector. +* **predict_proba(X)**\ : Estimate the probability of a sample being an outlier using the fitted detector. +* **predict_confidence(X)**\ : Assess the model's confidence on a per-sample basis (applicable in predict and predict_proba) [#Perini2020Quantifying]_. -Key Attributes of a fitted model: +**Key Attributes of a fitted model**: -* **decision_scores_**\ : The outlier scores of the training data. The higher, the more abnormal. - Outliers tend to have higher scores. -* **labels_**\ : The binary labels of the training data. 0 stands for inliers and 1 for outliers/anomalies. +* **decision_scores_**\ : Outlier scores of the training data. Higher scores typically indicate more abnormal behavior. Outliers usually have higher scores. +* **labels_**\ : Binary labels of the training data, where 0 indicates inliers and 1 indicates outliers/anomalies. ---- -ADBench Benchmark -^^^^^^^^^^^^^^^^^ +ADBench Benchmark and Datasets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ We just released a 45-page, the most comprehensive `ADBench: Anomaly Detection Benchmark `_ [#Han2022ADBench]_. The fully `open-sourced ADBench `_ compares 30 anomaly detection algorithms on 57 benchmark datasets. @@ -262,16 +270,12 @@ The organization of **ADBench** is provided below: :alt: benchmark-fig -**The comparison of selected models** is made available below -(\ `Figure `_\ , -`compare_all_models.py `_\ , -`Interactive Jupyter Notebooks `_\ ). 
-For Jupyter Notebooks, please navigate to **"/notebooks/Compare All Models.ipynb"**. - +For a simpler visualization, we make **the comparison of selected models** via +`compare_all_models.py `_\. -.. image:: https://raw.githubusercontent.com/yzhao062/pyod/master/examples/ALL.png - :target: https://raw.githubusercontent.com/yzhao062/pyod/master/examples/ALL.png - :alt: Comparision_of_All +.. image:: https://github.com/yzhao062/pyod/blob/development/examples/ALL.png?raw=true + :target: https://github.com/yzhao062/pyod/blob/development/examples/ALL.png?raw=true + :alt: Comparison_of_All diff --git a/Threshold.rst b/Threshold.rst new file mode 100644 index 000000000..e69de29bb diff --git a/docs/api_cc.rst b/docs/api_cc.rst index 456459916..03aaf6b13 100644 --- a/docs/api_cc.rst +++ b/docs/api_cc.rst @@ -1,20 +1,20 @@ API CheatSheet ============== -The following APIs are applicable for all detector models for easy use. +The full API Reference is available at `PyOD Documentation `_. Below is a quick cheatsheet for all detectors: -* :func:`pyod.models.base.BaseDetector.fit`: Fit detector. y is ignored in unsupervised methods. -* :func:`pyod.models.base.BaseDetector.decision_function`: Predict raw anomaly score of X using the fitted detector. -* :func:`pyod.models.base.BaseDetector.predict`: Predict if a particular sample is an outlier or not using the fitted detector. -* :func:`pyod.models.base.BaseDetector.predict_proba`: Predict the probability of a sample being outlier using the fitted detector. -* :func:`pyod.models.base.BaseDetector.predict_confidence`: Predict the model's sample-wise confidence (available in predict and predict_proba). +* :func:`pyod.models.base.BaseDetector.fit`: Fit the detector. The parameter y is ignored in unsupervised methods. +* :func:`pyod.models.base.BaseDetector.decision_function`: Predict raw anomaly scores for X using the fitted detector. 
+* :func:`pyod.models.base.BaseDetector.predict`: Determine whether a sample is an outlier or not as binary labels using the fitted detector. +* :func:`pyod.models.base.BaseDetector.predict_proba`: Estimate the probability of a sample being an outlier using the fitted detector. +* :func:`pyod.models.base.BaseDetector.predict_confidence`: Assess the model's confidence on a per-sample basis (applicable in predict and predict_proba) [#Perini2020Quantifying]_. -Key Attributes of a fitted model: +**Key Attributes of a fitted model**: -* :attr:`pyod.models.base.BaseDetector.decision_scores_`: The outlier scores of the training data. The higher, the more abnormal. +* :attr:`pyod.models.base.BaseDetector.decision_scores_`: Outlier scores of the training data. Higher scores typically indicate more abnormal behavior. Outliers tend to have higher scores. -* :attr:`pyod.models.base.BaseDetector.labels_`: The binary labels of the training data. 0 stands for inliers and 1 for outliers/anomalies. +* :attr:`pyod.models.base.BaseDetector.labels_`: Binary labels of the training data, where 0 indicates inliers and 1 indicates outliers/anomalies. See base class definition below: diff --git a/docs/benchmark.rst b/docs/benchmark.rst index fbdec994a..b0c97ebfb 100644 --- a/docs/benchmark.rst +++ b/docs/benchmark.rst @@ -4,7 +4,7 @@ Benchmarks Latest ADBench (2022) --------------------- -We just released a 36-page, the most comprehensive `anomaly detection benchmark paper `_ :cite:`a-han2022adbench`. +We just released a 36-page, the most comprehensive `anomaly detection benchmark paper `_ :cite:`a-han2022adbench`. The fully `open-sourced ADBench `_ compares 30 anomaly detection algorithms on 55 benchmark datasets. 
The organization of **ADBench** is provided below: @@ -14,6 +14,13 @@ The organization of **ADBench** is provided below: :alt: benchmark +For a simpler visualization, we make **the comparison of selected models** via +`compare_all_models.py `_\. + +.. image:: https://github.com/yzhao062/pyod/blob/development/examples/ALL.png?raw=true + :target: https://github.com/yzhao062/pyod/blob/development/examples/ALL.png?raw=true + :alt: Comparison_of_All + Old Results (2019) ------------------ diff --git a/docs/index.rst b/docs/index.rst index 9519607e6..08e1d0be5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -64,20 +64,34 @@ Welcome to PyOD documentation! ---- -**News**: We just released a 45-page, the most comprehensive `anomaly detection benchmark paper `_. -The fully `open-sourced ADBench `_ compares 30 anomaly detection algorithms on 57 benchmark datasets. +Read Me First +^^^^^^^^^^^^^ + +Welcome to PyOD, a versatile Python library for detecting anomalies in multivariate data. Whether you're tackling a small-scale project or large datasets, PyOD offers a range of algorithms to suit your needs. + +* **For time-series outlier detection**, please use `TODS `_. + +* **For graph outlier detection**, please use `PyGOD `_. + +* **Performance Comparison \& Datasets**: We have a 45-page, the most comprehensive `anomaly detection benchmark paper `_. The fully `open-sourced ADBench `_ compares 30 anomaly detection algorithms on 57 benchmark datasets. + +* **Learn more about anomaly detection** \@ `Anomaly Detection Resources `_ + +* **PyOD on Distributed Systems**: you could also run `PyOD on databricks `_. + +---- -**For time-series outlier detection**, please use `TODS `_. -**For graph outlier detection**, please use `PyGOD `_. 
+About PyOD +^^^^^^^^^^ -PyOD is the most comprehensive and scalable **Python library** for **detecting outlying objects** in +PyOD, established in 2017, has become a go-to **Python library** for **detecting anomalous/outlying objects** in multivariate data. This exciting yet challenging field is commonly referred as `Outlier Detection `_ or `Anomaly Detection `_. -PyOD includes more than 40 detection algorithms, from classical LOF (SIGMOD 2000) to -the latest ECOD and DIF (TKDE 2022 and 2023). Since 2017, PyOD :cite:`a-zhao2019pyod` has been successfully used in numerous -academic researches and commercial products with more than `10 million downloads `_. +PyOD includes more than 50 detection algorithms, from classical LOF (SIGMOD 2000) to +the cutting-edge ECOD and DIF (TKDE 2022 and 2023). Since 2017, PyOD has been successfully used in numerous academic researches and +commercial products with more than `17 million downloads `_. It is also well acknowledged by the machine learning community with various dedicated posts/tutorials, including `Analytics Vidhya `_, `KDnuggets `_, and @@ -86,10 +100,10 @@ It is also well acknowledged by the machine learning community with various dedi **PyOD is featured for**: -* **Unified APIs, detailed documentation, and interactive examples** across various algorithms. -* **Advanced models**\, including **classical distance and density estimation**, **latest deep learning methods**, and **emerging algorithms like ECOD**. -* **Optimized performance with JIT and parallelization** using `numba `_ and `joblib `_. -* **Fast training & prediction with SUOD** :cite:`a-zhao2021suod`. +* **Unified, User-Friendly Interface** across various algorithms. +* **Wide Range of Models**\, from classic techniques to the latest deep learning methods. +* **High Performance & Efficiency**, leveraging `numba `_ and `joblib `_ for JIT compilation and parallel processing. 
+* **Fast Training & Prediction**, achieved through the SUOD framework :cite:`a-zhao2021suod`. **Outlier Detection with 5 Lines of Code**\ : @@ -98,23 +112,19 @@ It is also well acknowledged by the machine learning community with various dedi .. code-block:: python - # train an ECOD detector + # Example: Training an ECOD detector from pyod.models.ecod import ECOD clf = ECOD() clf.fit(X_train) + y_train_scores = clf.decision_scores_ # Outlier scores for training data + y_test_scores = clf.decision_function(X_test) # Outlier scores for test data - # get outlier scores - y_train_scores = clf.decision_scores_ # raw outlier scores on the train data - y_test_scores = clf.decision_function(X_test) # predict raw outlier scores on test - - -**Personal suggestion on selecting an OD algorithm**. If you do not know which algorithm to try, go with: +**Selecting the Right Algorithm**: Unsure where to start? Consider these robust and interpretable options: - `ECOD `_: Example of using ECOD for outlier detection - `Isolation Forest `_: Example of using Isolation Forest for outlier detection -They are both fast and interpretable. Or, you could try more data-driven approach `MetaOD `_. - +Alternatively, explore `MetaOD `_ for a data-driven approach. **Citing PyOD**\ : @@ -138,36 +148,49 @@ or:: Zhao, Y., Nasrullah, Z. and Li, Z., 2019. PyOD: A Python Toolbox for Scalable Outlier Detection. Journal of machine learning research (JMLR), 20(96), pp.1-7. 
-If you want more general insights of anomaly detection and/or algorithm performance comparison, please see our -NeurIPS 2022 paper `ADBench: Anomaly Detection Benchmark `_:: +For a broader perspective on anomaly detection, see our NeurIPS papers +`ADBench: Anomaly Detection Benchmark `_ \& `ADGym: Design Choices for Deep Anomaly Detection `_:: - @inproceedings{han2022adbench, - title={ADBench: Anomaly Detection Benchmark}, - author={Songqiao Han and Xiyang Hu and Hailiang Huang and Mingqi Jiang and Yue Zhao}, - booktitle={Neural Information Processing Systems (NeurIPS)} - year={2022}, + @article{han2022adbench, + title={Adbench: Anomaly detection benchmark}, + author={Han, Songqiao and Hu, Xiyang and Huang, Hailiang and Jiang, Minqi and Zhao, Yue}, + journal={Advances in Neural Information Processing Systems}, + volume={35}, + pages={32142--32159}, + year={2022} } -**Key Links and Resources**\ : + @article{jiang2023adgym, + title={ADGym: Design Choices for Deep Anomaly Detection}, + author={Jiang, Minqi and Hou, Chaochuan and Zheng, Ao and Han, Songqiao and Huang, Hailiang and Wen, Qingsong and Hu, Xiyang and Zhao, Yue}, + journal={Advances in Neural Information Processing Systems}, + volume={36}, + year={2023} + } -* `View the latest codes on Github `_ -* `Execute Interactive Jupyter Notebooks `_ -* `Anomaly Detection Resources `_ ---- -Benchmark -========= +ADBench Benchmark and Datasets +============================== -We just released a 45-page, the most comprehensive `ADBench: Anomaly Detection Benchmark `_. +We just released a 45-page, the most comprehensive `ADBench: Anomaly Detection Benchmark `_ :cite:`a-han2022adbench`. The fully `open-sourced ADBench `_ compares 30 anomaly detection algorithms on 57 benchmark datasets. The organization of **ADBench** is provided below: .. 
image:: https://github.com/Minqi824/ADBench/blob/main/figs/ADBench.png?raw=true :target: https://github.com/Minqi824/ADBench/blob/main/figs/ADBench.png?raw=true - :alt: benchmark + :alt: benchmark-fig + + +For a simpler visualization, we make **the comparison of selected models** via +`compare_all_models.py `_\. + +.. image:: https://github.com/yzhao062/pyod/blob/development/examples/ALL.png?raw=true + :target: https://github.com/yzhao062/pyod/blob/development/examples/ALL.png?raw=true + :alt: Comparison_of_All Implemented Algorithms diff --git a/docs/install.rst b/docs/install.rst index f840e98b3..58c0c4aba 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -1,8 +1,8 @@ Installation ============ -It is recommended to use **pip** or **conda** for installation. Please make sure -**the latest version** is installed, as PyOD is updated frequently: +PyOD is designed for easy installation using either **pip** or **conda**. +We recommend using the latest version of PyOD due to frequent updates and enhancements: .. code-block:: bash @@ -25,13 +25,13 @@ Alternatively, you could clone and run setup.py file: **Required Dependencies**\ : -* Python 3.6+ +* Python 3.6 or higher * joblib * matplotlib * numpy>=1.19 * numba>=0.51 * scipy>=1.5.1 -* scikit_learn>=0.20.0 +* scikit_learn>=0.22.0 * six @@ -39,16 +39,10 @@ Alternatively, you could clone and run setup.py file: * combo (optional, required for models/combination.py and FeatureBagging) * keras/tensorflow (optional, required for AutoEncoder, and other deep learning models) -* pandas (optional, required for running benchmark) * suod (optional, required for running SUOD model) * xgboost (optional, required for XGBOD) -* pythresh to use thresholding +* pythresh (optional, required for thresholding) .. warning:: - PyOD has multiple neural network based models, e.g., AutoEncoders, which are - implemented in both Tensorflow and PyTorch. However, PyOD does **NOT** install these deep learning libraries for you. 
- This reduces the risk of interfering with your local copies. - If you want to use neural-net based models, please make sure these deep learning libraries are installed. - Instructions are provided: `neural-net FAQ `_. - Similarly, models depending on **xgboost**, e.g., XGBOD, would **NOT** enforce xgboost installation by default. + PyOD includes several neural network-based models, such as AutoEncoders, implemented in Tensorflow and PyTorch. These deep learning libraries are not automatically installed by PyOD to avoid conflicts with existing installations. If you plan to use neural-net based models, please ensure these libraries are installed. See the `neural-net FAQ `_ for guidance. Additionally, xgboost is not installed by default but is required for models like XGBOD. diff --git a/docs/pyod.models.rst b/docs/pyod.models.rst index 76792a8a1..56dbe5b9a 100644 --- a/docs/pyod.models.rst +++ b/docs/pyod.models.rst @@ -43,7 +43,7 @@ pyod.models.auto\_encoder\_torch module .. automodule:: pyod.models.auto_encoder_torch :members: - :exclude-members: inner_autoencoder + :exclude-members: :show-inheritance: :inherited-members: @@ -110,7 +110,7 @@ pyod.models.dif module .. automodule:: pyod.models.dif :members: - :exclude-members: + :exclude-members: LinearBlock, MLPnet :undoc-members: :show-inheritance: :inherited-members: @@ -159,7 +159,7 @@ pyod.models.iforest module .. 
automodule:: pyod.models.iforest :members: - :exclude-members: estimators_, max_samples_, estimators_samples_ + :exclude-members: :undoc-members: :show-inheritance: :inherited-members: diff --git a/examples/ALL.png b/examples/ALL.png index 705898baa..c35bfa3e6 100644 Binary files a/examples/ALL.png and b/examples/ALL.png differ diff --git a/examples/compare_all_models.py b/examples/compare_all_models.py index 2bf2388ae..75409fe3f 100644 --- a/examples/compare_all_models.py +++ b/examples/compare_all_models.py @@ -14,7 +14,7 @@ # temporary solution for relative imports in case pyod is not installed # if pyod is installed, no need to use the following line sys.path.append( - os.path.abspath(os.path.join(os.path.dirname("__file__"), '..'))) + os.path.abspath(os.path.join(os.path.dirname("__file__"), '..'))) # supress warnings for clean output import warnings @@ -42,6 +42,15 @@ from pyod.models.kde import KDE from pyod.models.lmdd import LMDD +from pyod.models.dif import DIF +from pyod.models.copod import COPOD +from pyod.models.ecod import ECOD +from pyod.models.suod import SUOD +from pyod.models.qmcd import QMCD +from pyod.models.sampling import Sampling +from pyod.models.kpca import KPCA +from pyod.models.lunar import LUNAR + # TODO: add neural networks, LOCI, SOS, COF, SOD # Define the number of inliers and outliers @@ -59,114 +68,145 @@ # initialize a set of detectors for LSCP detector_list = [LOF(n_neighbors=5), LOF(n_neighbors=10), LOF(n_neighbors=15), - LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=30), - LOF(n_neighbors=35), LOF(n_neighbors=40), LOF(n_neighbors=45), - LOF(n_neighbors=50)] + LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=30), + LOF(n_neighbors=35), LOF(n_neighbors=40), LOF(n_neighbors=45), + LOF(n_neighbors=50)] # Show the statics of the data print('Number of inliers: %i' % n_inliers) print('Number of outliers: %i' % n_outliers) print( - 'Ground truth shape is {shape}. 
Outlier are 1 and inliers are 0.\n'.format( - shape=ground_truth.shape)) + 'Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format( + shape=ground_truth.shape)) print(ground_truth, '\n') random_state = 42 # Define nine outlier detection tools to be compared classifiers = { - 'Angle-based Outlier Detector (ABOD)': - ABOD(contamination=outliers_fraction), - 'Cluster-based Local Outlier Factor (CBLOF)': - CBLOF(contamination=outliers_fraction, - check_estimator=False, random_state=random_state), - 'Feature Bagging': - FeatureBagging(LOF(n_neighbors=35), - contamination=outliers_fraction, - random_state=random_state), - 'Histogram-base Outlier Detection (HBOS)': HBOS( - contamination=outliers_fraction), - 'Isolation Forest': IForest(contamination=outliers_fraction, - random_state=random_state), - 'K Nearest Neighbors (KNN)': KNN( - contamination=outliers_fraction), - 'Average KNN': KNN(method='mean', - contamination=outliers_fraction), - 'Local Outlier Factor (LOF)': - LOF(n_neighbors=35, contamination=outliers_fraction), - 'Minimum Covariance Determinant (MCD)': MCD( - contamination=outliers_fraction, random_state=random_state), - 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), - 'Principal Component Analysis (PCA)': PCA( - contamination=outliers_fraction, random_state=random_state), - 'Locally Selective Combination (LSCP)': LSCP( - detector_list, contamination=outliers_fraction, - random_state=random_state), - 'INNE': INNE( - max_samples=2, contamination=outliers_fraction, - random_state=random_state, - ), - 'GMM': GMM(contamination=outliers_fraction, - random_state=random_state), - 'KDE': KDE(contamination=outliers_fraction), - 'LMDD': LMDD(contamination=outliers_fraction, - random_state=random_state), + 'Angle-based Outlier Detector (ABOD)': + ABOD(contamination=outliers_fraction), + 'K Nearest Neighbors (KNN)': KNN( + contamination=outliers_fraction), + 'Average KNN': KNN(method='mean', + contamination=outliers_fraction), + 
'Median KNN': KNN(method='median', + contamination=outliers_fraction), + 'Local Outlier Factor (LOF)': + LOF(n_neighbors=35, contamination=outliers_fraction), + + 'Isolation Forest': IForest(contamination=outliers_fraction, + random_state=random_state), + 'Deep Isolation Forest (DIF)': DIF(contamination=outliers_fraction, + random_state=random_state), + 'INNE': INNE( + max_samples=2, contamination=outliers_fraction, + random_state=random_state, + ), + + 'Locally Selective Combination (LSCP)': LSCP( + detector_list, contamination=outliers_fraction, + random_state=random_state), + 'Feature Bagging': + FeatureBagging(LOF(n_neighbors=35), + contamination=outliers_fraction, + random_state=random_state), + 'SUOD': SUOD(contamination=outliers_fraction), + + 'Minimum Covariance Determinant (MCD)': MCD( + contamination=outliers_fraction, random_state=random_state), + + 'Principal Component Analysis (PCA)': PCA( + contamination=outliers_fraction, random_state=random_state), + 'KPCA': KPCA( + contamination=outliers_fraction), + + 'Probabilistic Mixture Modeling (GMM)': GMM(contamination=outliers_fraction, + random_state=random_state), + + 'LMDD': LMDD(contamination=outliers_fraction, + random_state=random_state), + + 'Histogram-based Outlier Detection (HBOS)': HBOS( + contamination=outliers_fraction), + + 'Copula-base Outlier Detection (COPOD)': COPOD( + contamination=outliers_fraction), + + 'ECDF-baseD Outlier Detection (ECOD)': ECOD( + contamination=outliers_fraction), + 'Kernel Density Functions (KDE)': KDE(contamination=outliers_fraction), + + 'QMCD': QMCD( + contamination=outliers_fraction), + + 'Sampling': Sampling( + contamination=outliers_fraction), + + 'LUNAR': LUNAR(), + + 'Cluster-based Local Outlier Factor (CBLOF)': + CBLOF(contamination=outliers_fraction, + check_estimator=False, random_state=random_state), + + 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), } # Show all detectors for i, clf in enumerate(classifiers.keys()): - print('Model', i + 
1, clf) + print('Model', i + 1, clf) # Fit the models with the generated data and # compare model performances for i, offset in enumerate(clusters_separation): - np.random.seed(42) - # Data generation - X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset - X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset - X = np.r_[X1, X2] - # Add outliers - X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))] - - # Fit the model - plt.figure(figsize=(15, 16)) - for i, (clf_name, clf) in enumerate(classifiers.items()): - print() - print(i + 1, 'fitting', clf_name) - # fit the data and tag outliers - clf.fit(X) - scores_pred = clf.decision_function(X) * -1 - y_pred = clf.predict(X) - threshold = percentile(scores_pred, 100 * outliers_fraction) - n_errors = (y_pred != ground_truth).sum() - # plot the levels lines and the points - - Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1 - Z = Z.reshape(xx.shape) - subplot = plt.subplot(4, 4, i + 1) - subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), - cmap=plt.cm.Blues_r) - # a = subplot.contour(xx, yy, Z, levels=[threshold], - # linewidths=2, colors='red') - subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], - colors='orange') - b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white', - s=20, edgecolor='k') - c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black', - s=20, edgecolor='k') - subplot.axis('tight') - subplot.legend( - [ - # a.collections[0], - b, c], - [ - # 'learned decision function', - 'true inliers', 'true outliers'], - prop=matplotlib.font_manager.FontProperties(size=10), - loc='lower right') - subplot.set_xlabel("%d. 
%s (errors: %d)" % (i + 1, clf_name, n_errors)) - subplot.set_xlim((-7, 7)) - subplot.set_ylim((-7, 7)) - plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26) - plt.suptitle("Outlier detection") -plt.savefig('ALL.png', dpi=300) + np.random.seed(42) + # Data generation + X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset + X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset + X = np.r_[X1, X2] + # Add outliers + X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))] + + # Fit the model + plt.figure(figsize=(20, 22)) + for i, (clf_name, clf) in enumerate(classifiers.items()): + print() + print(i + 1, 'fitting', clf_name) + # fit the data and tag outliers + clf.fit(X) + scores_pred = clf.decision_function(X) * -1 + y_pred = clf.predict(X) + threshold = percentile(scores_pred, 100 * outliers_fraction) + n_errors = (y_pred != ground_truth).sum() + # plot the levels lines and the points + + Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1 + Z = Z.reshape(xx.shape) + subplot = plt.subplot(5, 5, i + 1) + subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), + cmap=plt.cm.Blues_r) + # a = subplot.contour(xx, yy, Z, levels=[threshold], + # linewidths=2, colors='red') + subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], + colors='orange') + b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white', + s=20, edgecolor='k') + c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black', + s=20, edgecolor='k') + subplot.axis('tight') + subplot.legend( + [ + # a.collections[0], + b, c], + [ + # 'learned decision function', + 'true inliers', 'true outliers'], + prop=matplotlib.font_manager.FontProperties(size=10), + loc='lower right') + subplot.set_xlabel("%d. 
%s (errors: %d)" % (i + 1, clf_name, n_errors)) + subplot.set_xlim((-7, 7)) + subplot.set_ylim((-7, 7)) + plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26) + plt.suptitle("25 outlier detection algorithms on synthetic data", + fontsize=35) +plt.savefig('ALL.png', dpi=300, bbox_inches='tight') plt.show() diff --git a/pyod/models/dif.py b/pyod/models/dif.py index 4e6025941..3f8eb5975 100644 --- a/pyod/models/dif.py +++ b/pyod/models/dif.py @@ -34,8 +34,7 @@ class DIF(BaseDetector): hidden_neurons, list, optional (default=[64, 32]) The number of neurons per hidden layers. So the network has the - structure as [n_features, hidden_neurons[0], hidden_neurons[1], - ..., representation_dim] + structure as [n_features, hidden_neurons[0], hidden_neurons[1], ..., representation_dim] hidden_activation, str, optional (default='tanh') Activation function to use for hidden layers. diff --git a/pyod/models/iforest.py b/pyod/models/iforest.py index 02259992d..c4c20e278 100644 --- a/pyod/models/iforest.py +++ b/pyod/models/iforest.py @@ -49,255 +49,276 @@ class IForest(BaseDetector): - """Wrapper of scikit-learn Isolation Forest with more functionalities. - - The IsolationForest 'isolates' observations by randomly selecting a - feature and then randomly selecting a split value between the maximum and - minimum values of the selected feature. - See :cite:`liu2008isolation,liu2012isolation` for details. - - Since recursive partitioning can be represented by a tree structure, the - number of splittings required to isolate a sample is equivalent to the path - length from the root node to the terminating node. - - This path length, averaged over a forest of such random trees, is a - measure of normality and our decision function. - - Random partitioning produces noticeably shorter paths for anomalies. - Hence, when a forest of random trees collectively produce shorter path - lengths for particular samples, they are highly likely to be anomalies. 
- - Parameters - ---------- - n_estimators : int, optional (default=100) - The number of base estimators in the ensemble. - - max_samples : int or float, optional (default="auto") - The number of samples to draw from X to train each base estimator. - - - If int, then draw `max_samples` samples. - - If float, then draw `max_samples * X.shape[0]` samples. - - If "auto", then `max_samples=min(256, n_samples)`. - - If max_samples is larger than the number of samples provided, - all samples will be used for all trees (no sampling). - - contamination : float in (0., 0.5), optional (default=0.1) - The amount of contamination of the data set, i.e. the proportion - of outliers in the data set. Used when fitting to define the threshold - on the decision function. - - max_features : int or float, optional (default=1.0) - The number of features to draw from X to train each base estimator. - - - If int, then draw `max_features` features. - - If float, then draw `max_features * X.shape[1]` features. - - bootstrap : bool, optional (default=False) - If True, individual trees are fit on random subsets of the training - data sampled with replacement. If False, sampling without replacement - is performed. - - n_jobs : integer, optional (default=1) - The number of jobs to run in parallel for both `fit` and `predict`. - If -1, then the number of jobs is set to the number of cores. - - behaviour : str, default='old' - Behaviour of the ``decision_function`` which can be either 'old' or - 'new'. Passing ``behaviour='new'`` makes the ``decision_function`` - change to match other anomaly detection algorithm API which will be - the default behaviour in the future. As explained in details in the - ``offset_`` attribute documentation, the ``decision_function`` becomes - dependent on the contamination parameter, in such a way that 0 becomes - its natural threshold to detect outliers. - - .. versionadded:: 0.7.0 - ``behaviour`` is added in 0.7.0 for back-compatibility purpose. - - .. 
deprecated:: 0.20 - ``behaviour='old'`` is deprecated in sklearn 0.20 and will not be - possible in 0.22. - - .. deprecated:: 0.22 - ``behaviour`` parameter will be deprecated in sklearn 0.22 and - removed in 0.24. - - .. warning:: - Only applicable for sklearn 0.20 above. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - verbose : int, optional (default=0) - Controls the verbosity of the tree building process. - - Attributes - ---------- - estimators_ : list of DecisionTreeClassifier - The collection of fitted sub-estimators. - - estimators_samples_ : list of arrays - The subset of drawn samples (i.e., the in-bag samples) for each base - estimator. - - max_samples_ : integer - The actual number of samples - - decision_scores_ : numpy array of shape (n_samples,) - The outlier scores of the training data. - The higher, the more abnormal. Outliers tend to have higher - scores. This value is available once the detector is - fitted. - - threshold_ : float - The threshold is based on ``contamination``. It is the - ``n_samples * contamination`` most abnormal samples in - ``decision_scores_``. The threshold is calculated for generating - binary outlier labels. - - labels_ : int, either 0 or 1 - The binary labels of the training data. 0 stands for inliers - and 1 for outliers/anomalies. It is generated by applying - ``threshold_`` on ``decision_scores_``. 
- """ - - def __init__(self, n_estimators=100, - max_samples="auto", - contamination=0.1, - max_features=1., - bootstrap=False, - n_jobs=1, - behaviour='old', - random_state=None, - verbose=0): - super(IForest, self).__init__(contamination=contamination) - self.n_estimators = n_estimators - self.max_samples = max_samples - self.max_features = max_features - self.bootstrap = bootstrap - self.n_jobs = n_jobs - self.behaviour = behaviour - self.random_state = random_state - self.verbose = verbose - - def fit(self, X, y=None): - """Fit detector. y is ignored in unsupervised methods. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - y : Ignored - Not used, present for API consistency by convention. - - Returns - ------- - self : object - Fitted estimator. - """ - # validate inputs X and y (optional) - X = check_array(X) - self._set_n_classes(y) - - # In sklearn 0.20+ new behaviour is added (arg behaviour={'new','old'}) - # to IsolationForest that shifts the location of the anomaly scores - # noinspection PyProtectedMember - - self.detector_ = IsolationForest(n_estimators=self.n_estimators, - max_samples=self.max_samples, - contamination=self.contamination, - max_features=self.max_features, - bootstrap=self.bootstrap, - n_jobs=self.n_jobs, - random_state=self.random_state, - verbose=self.verbose) - - self.detector_.fit(X=X, y=None, sample_weight=None) - - # invert decision_scores_. Outliers comes with higher outlier scores. - self.decision_scores_ = invert_order( - self.detector_.decision_function(X)) - self._process_decision_scores() - return self - - def decision_function(self, X): - """Predict raw anomaly score of X using the fitted detector. - - The anomaly score of an input sample is computed based on different - detector algorithms. For consistency, outliers are assigned with - larger anomaly scores. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The training input samples. 
Sparse matrices are accepted only - if they are supported by the base estimator. - - Returns - ------- - anomaly_scores : numpy array of shape (n_samples,) - The anomaly score of the input samples. - """ - check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) - # invert outlier scores. Outliers comes with higher outlier scores - return invert_order(self.detector_.decision_function(X)) - - @property - def estimators_(self): - """The collection of fitted sub-estimators. - Decorator for scikit-learn Isolation Forest attributes. - """ - return self.detector_.estimators_ - - @property - def estimators_samples_(self): - """The subset of drawn samples (i.e., the in-bag samples) for - each base estimator. - Decorator for scikit-learn Isolation Forest attributes. - """ - return self.detector_.estimators_samples_ - - @property - def max_samples_(self): - """The actual number of samples. - Decorator for scikit-learn Isolation Forest attributes. - """ - return self.detector_.max_samples_ - - @property - def feature_importances_(self): - """The impurity-based feature importance. The higher, the more - important the feature. The importance of a feature is computed as the - (normalized) total reduction of the criterion brought by that feature. - It is also known as the Gini importance. - - .. warning:: - impurity-based feature importance can be misleading for - high cardinality features (many unique values). See - https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html - as an alternative. - - Returns - ------- - feature_importances_ : ndarray of shape (n_features,) - The values of this array sum to 1, unless all trees are single node - trees consisting of only the root node, in which case it will be an - array of zeros. 
- """ - check_is_fitted(self) - all_importances = Parallel( - n_jobs=self.n_jobs)( - delayed(getattr)(tree, "feature_importances_") - for tree in self.detector_.estimators_ - if tree.tree_.node_count > 1 - ) - - if not all_importances: - return np.zeros(self.n_features_in_, dtype=np.float64) - - all_importances = np.mean(all_importances, axis=0, dtype=np.float64) - return all_importances / np.sum(all_importances) + """Wrapper of scikit-learn Isolation Forest with more functionalities. + + The IsolationForest 'isolates' observations by randomly selecting a + feature and then randomly selecting a split value between the maximum and + minimum values of the selected feature. + See :cite:`liu2008isolation,liu2012isolation` for details. + + Since recursive partitioning can be represented by a tree structure, the + number of splittings required to isolate a sample is equivalent to the path + length from the root node to the terminating node. + + This path length, averaged over a forest of such random trees, is a + measure of normality and our decision function. + + Random partitioning produces noticeably shorter paths for anomalies. + Hence, when a forest of random trees collectively produce shorter path + lengths for particular samples, they are highly likely to be anomalies. + + Parameters + ---------- + n_estimators : int, optional (default=100) + The number of base estimators in the ensemble. + + max_samples : int or float, optional (default="auto") + The number of samples to draw from X to train each base estimator. + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + - If "auto", then `max_samples=min(256, n_samples)`. + + If max_samples is larger than the number of samples provided, + all samples will be used for all trees (no sampling). + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. 
Used when fitting to define the threshold + on the decision function. + + max_features : int or float, optional (default=1.0) + The number of features to draw from X to train each base estimator. + + - If int, then draw `max_features` features. + - If float, then draw `max_features * X.shape[1]` features. + + bootstrap : bool, optional (default=False) + If True, individual trees are fit on random subsets of the training + data sampled with replacement. If False, sampling without replacement + is performed. + + n_jobs : integer, optional (default=1) + The number of jobs to run in parallel for both `fit` and `predict`. + If -1, then the number of jobs is set to the number of cores. + + behaviour : str, default='old' + Behaviour of the ``decision_function`` which can be either 'old' or + 'new'. Passing ``behaviour='new'`` makes the ``decision_function`` + change to match other anomaly detection algorithm API which will be + the default behaviour in the future. As explained in details in the + ``offset_`` attribute documentation, the ``decision_function`` becomes + dependent on the contamination parameter, in such a way that 0 becomes + its natural threshold to detect outliers. + + .. versionadded:: 0.7.0 + ``behaviour`` is added in 0.7.0 for back-compatibility purpose. + + .. deprecated:: 0.20 + ``behaviour='old'`` is deprecated in sklearn 0.20 and will not be + possible in 0.22. + + .. deprecated:: 0.22 + ``behaviour`` parameter will be deprecated in sklearn 0.22 and + removed in 0.24. + + .. warning:: + Only applicable for sklearn 0.20 above. + + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + verbose : int, optional (default=0) + Controls the verbosity of the tree building process. 
+ + Attributes + ---------- + estimators_ : list of DecisionTreeClassifier + The collection of fitted sub-estimators. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. + + max_samples_ : integer + The actual number of samples + + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + def __init__(self, n_estimators=100, + max_samples="auto", + contamination=0.1, + max_features=1., + bootstrap=False, + n_jobs=1, + behaviour='old', + random_state=None, + verbose=0): + super(IForest, self).__init__(contamination=contamination) + self.n_estimators = n_estimators + self.max_samples = max_samples + self.max_features = max_features + self.bootstrap = bootstrap + self.n_jobs = n_jobs + self.behaviour = behaviour + self.random_state = random_state + self.verbose = verbose + + def fit(self, X, y=None): + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. 
+ """ + # validate inputs X and y (optional) + X = check_array(X) + self._set_n_classes(y) + + # In sklearn 0.20+ new behaviour is added (arg behaviour={'new','old'}) + # to IsolationForest that shifts the location of the anomaly scores + # noinspection PyProtectedMember + + self.detector_ = IsolationForest(n_estimators=self.n_estimators, + max_samples=self.max_samples, + contamination=self.contamination, + max_features=self.max_features, + bootstrap=self.bootstrap, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose) + + self.detector_.fit(X=X, y=None, sample_weight=None) + + # invert decision_scores_. Outliers comes with higher outlier scores. + self.decision_scores_ = invert_order( + self.detector_.decision_function(X)) + self._process_decision_scores() + return self + + def decision_function(self, X): + """Predict raw anomaly score of X using the fitted detector. + + The anomaly score of an input sample is computed based on different + detector algorithms. For consistency, outliers are assigned with + larger anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. + """ + check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) + # invert outlier scores. Outliers comes with higher outlier scores + return invert_order(self.detector_.decision_function(X)) + + @property + def estimators_(self): + """The collection of fitted sub-estimators. + Decorator for scikit-learn Isolation Forest attributes. + """ + return self.detector_.estimators_ + + @property + def estimators_samples_(self): + """The subset of drawn samples (i.e., the in-bag samples) for + each base estimator. + Decorator for scikit-learn Isolation Forest attributes. 
+ """ + return self.detector_.estimators_samples_ + + @property + def max_samples_(self): + """The actual number of samples. + Decorator for scikit-learn Isolation Forest attributes. + """ + return self.detector_.max_samples_ + + @property + def estimators_features_(self): + """The indices of the subset of features used to train the estimators. + Decorator for scikit-learn Isolation Forest attributes. + """ + return self.detector_.estimators_features_ + + @property + def n_features_in_(self): + """The number of features seen during the fit. + Decorator for scikit-learn Isolation Forest attributes. + """ + return self.detector_.n_features_in_ + + @property + def offset_(self): + """Offset used to define the decision function from the raw scores. + Decorator for scikit-learn Isolation Forest attributes. + """ + return self.detector_.offset_ + + @property + def feature_importances_(self): + """The impurity-based feature importance. The higher, the more + important the feature. The importance of a feature is computed as the + (normalized) total reduction of the criterion brought by that feature. + It is also known as the Gini importance. + + .. warning:: + impurity-based feature importance can be misleading for + high cardinality features (many unique values). See + https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html + as an alternative. + + Returns + ------- + feature_importances_ : ndarray of shape (n_features,) + The values of this array sum to 1, unless all trees are single node + trees consisting of only the root node, in which case it will be an + array of zeros. 
+ """ + check_is_fitted(self) + all_importances = Parallel( + n_jobs=self.n_jobs)( + delayed(getattr)(tree, "feature_importances_") + for tree in self.detector_.estimators_ + if tree.tree_.node_count > 1 + ) + + if not all_importances: + return np.zeros(self.n_features_in_, dtype=np.float64) + + all_importances = np.mean(all_importances, axis=0, dtype=np.float64) + return all_importances / np.sum(all_importances) diff --git a/pyod/models/kpca.py b/pyod/models/kpca.py index 02d081ff7..5b3c57df1 100644 --- a/pyod/models/kpca.py +++ b/pyod/models/kpca.py @@ -5,7 +5,6 @@ # License: BSD 2 clause import numpy as np -import sklearn from sklearn.decomposition import KernelPCA from sklearn.utils import check_array, check_random_state from sklearn.utils.validation import check_is_fitted @@ -18,22 +17,22 @@ class PyODKernelPCA(KernelPCA): """A wrapper class for KernelPCA class of scikit-learn.""" def __init__( - self, - n_components=None, - kernel="rbf", - gamma=None, - degree=3, - coef0=1, - kernel_params=None, - alpha=1.0, - fit_inverse_transform=False, - eigen_solver="auto", - tol=0, - max_iter=None, - remove_zero_eig=False, - copy_X=True, - n_jobs=None, - random_state=None, + self, + n_components=None, + kernel="rbf", + gamma=None, + degree=3, + coef0=1, + kernel_params=None, + alpha=1.0, + fit_inverse_transform=False, + eigen_solver="auto", + tol=0, + max_iter=None, + remove_zero_eig=False, + copy_X=True, + n_jobs=None, + random_state=None, ): super().__init__( kernel=kernel, @@ -198,53 +197,47 @@ class KPCA(BaseDetector): """ def __init__( - self, - contamination=0.1, - n_components=None, - n_selected_components=None, - kernel="rbf", - gamma=None, - degree=3, - coef0=1, - kernel_params=None, - alpha=1.0, - eigen_solver="auto", - tol=0, - max_iter=None, - remove_zero_eig=False, - copy_X=True, - n_jobs=None, - sampling=False, - subset_size=20, - random_state=None, + self, + contamination=0.1, + n_components=None, + n_selected_components=None, + kernel="rbf", + gamma=None, 
+ degree=3, + coef0=1, + kernel_params=None, + alpha=1.0, + eigen_solver="auto", + tol=0, + max_iter=None, + remove_zero_eig=False, + copy_X=True, + n_jobs=None, + sampling=False, + subset_size=20, + random_state=None, ): super().__init__(contamination=contamination) self.n_components = n_components self.n_selected_components = n_selected_components - self.copy_x = copy_X + self.kernel = kernel + self.gamma = gamma + self.degree = degree + self.coef0 = coef0 + self.kernel_params = kernel_params + self.alpha = alpha + self.eigen_solver = eigen_solver + self.tol = tol + self.max_iter = max_iter + self.remove_zero_eig = remove_zero_eig + self.copy_X = copy_X + self.n_jobs = n_jobs self.sampling = sampling self.subset_size = subset_size self.random_state = check_random_state(random_state) self.decision_scores_ = None self.n_selected_components_ = None - self.kpca = PyODKernelPCA( - n_components=n_components, - kernel=kernel, - gamma=gamma, - degree=degree, - coef0=coef0, - kernel_params=kernel_params, - alpha=alpha, - fit_inverse_transform=False, - eigen_solver=eigen_solver, - tol=tol, - max_iter=max_iter, - remove_zero_eig=remove_zero_eig, - copy_X=copy_X, - n_jobs=n_jobs, - ) - def _check_subset_size(self, array): """Check subset size.""" n_samples, _ = array.shape @@ -283,7 +276,7 @@ def fit(self, X, y=None): """ # validate inputs X and y (optional) - X = check_array(X, copy=self.copy_x) + X = check_array(X, copy=self.copy_X) self._set_n_classes(y) # perform subsampling to reduce time complexity @@ -298,7 +291,7 @@ def fit(self, X, y=None): # copy the attributes from the sklearn Kernel PCA object if self.n_components is None: - n_components = X.shape[1] # use all dimensions + n_components = X.shape[0] # use all dimensions else: if self.n_components < 1: raise ValueError( @@ -320,20 +313,29 @@ def fit(self, X, y=None): param_name="n_selected_components", ) - self.kpca.fit(X) + self.kpca = PyODKernelPCA( + n_components=self.n_components, + kernel=self.kernel, + 
gamma=self.gamma, + degree=self.degree, + coef0=self.coef0, + kernel_params=self.kernel_params, + alpha=self.alpha, + fit_inverse_transform=False, + eigen_solver=self.eigen_solver, + tol=self.tol, + max_iter=self.max_iter, + remove_zero_eig=self.remove_zero_eig, + copy_X=self.copy_X, + n_jobs=self.n_jobs, + random_state=self.random_state, + ) + x_transformed = self.kpca.fit_transform(X) + x_transformed = x_transformed[:, : self.n_selected_components_] + centerer = self.kpca.get_centerer kernel = self.kpca.get_kernel - if int(sklearn.__version__[0]) < 1: - eigenvalues_ = self.kpca.lambdas_ - eigenvectors_ = self.kpca.alphas_ - else: - eigenvalues_ = self.kpca.eigenvalues_ - eigenvectors_ = self.kpca.eigenvectors_ - - x_transformed = eigenvectors_ * np.sqrt(eigenvalues_) - x_transformed = x_transformed[:, : self.n_selected_components_] - potential = [] for i in range(X.shape[0]): sample = X[i, :].reshape(1, -1) @@ -372,24 +374,8 @@ def decision_function(self, X): centerer = self.kpca.get_centerer kernel = self.kpca.get_kernel gram_matrix = kernel(X, self.kpca.X_fit_) - centered_g = centerer.transform(gram_matrix) - - if int(sklearn.__version__[0]) < 1: - eigenvalues_ = self.kpca.lambdas_ - eigenvectors_ = self.kpca.alphas_ - else: - eigenvalues_ = self.kpca.eigenvalues_ - eigenvectors_ = self.kpca.eigenvectors_ - - # scale eigenvectors (properly account for null-space for dot product) - non_zeros = np.flatnonzero(eigenvalues_) - scaled_alphas = np.zeros_like(eigenvectors_) - scaled_alphas[:, non_zeros] = eigenvectors_[:, non_zeros] / np.sqrt( - eigenvalues_[non_zeros] - ) - # Project with a scalar product between K and the scaled eigenvectors - x_transformed = np.dot(centered_g, scaled_alphas) + x_transformed = self.kpca.transform(X) x_transformed = x_transformed[:, : self.n_selected_components_] potential = [] diff --git a/pyod/models/lunar.py b/pyod/models/lunar.py index ee486cbc8..00af32ec2 100644 --- a/pyod/models/lunar.py +++ b/pyod/models/lunar.py @@ -154,8 
+154,8 @@ class LUNAR(BaseDetector): def __init__(self, model_type="WEIGHT", n_neighbours=5, negative_sampling="MIXED", val_size=0.1, scaler=MinMaxScaler(), epsilon=0.1, proportion=1.0, - n_epochs=200, lr=0.001, wd=0.1, verbose=0): - super(LUNAR, self).__init__() + n_epochs=200, lr=0.001, wd=0.1, verbose=0, contamination=0.1): + super(LUNAR, self).__init__(contamination=contamination) self.model_type = model_type self.n_neighbours = n_neighbours diff --git a/pyod/models/mad.py b/pyod/models/mad.py index 27b3288b6..428051057 100644 --- a/pyod/models/mad.py +++ b/pyod/models/mad.py @@ -55,10 +55,8 @@ class MAD(BaseDetector): ``threshold_`` on ``decision_scores_``. """ - def __init__(self, threshold=3.5): - # contamination is unneeded since threshold must be - # decided manually by the user - super(MAD, self).__init__() + def __init__(self, threshold=3.5, contamination=0.1): + super(MAD, self).__init__(contamination=contamination) if not isinstance(threshold, (float, int)): raise TypeError( 'threshold must be a number. 
Got {}'.format(type(threshold))) diff --git a/pyod/test/test_iforest.py b/pyod/test/test_iforest.py index 042e937a9..4b80cc532 100644 --- a/pyod/test/test_iforest.py +++ b/pyod/test/test_iforest.py @@ -54,6 +54,13 @@ def test_parameters(self): self.clf.estimators_samples_ is not None) assert (hasattr(self.clf, 'max_samples_') and self.clf.max_samples_ is not None) + assert (hasattr(self.clf, 'estimators_features_') and + self.clf.estimators_features_ is not None) + assert (hasattr(self.clf, 'n_features_in_') and + self.clf.n_features_in_ is not None) + assert (hasattr(self.clf, 'offset_') and + self.clf.offset_ is not None) + def test_train_scores(self): assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0]) diff --git a/pyod/test/test_kpca.py b/pyod/test/test_kpca.py index 621efe5ec..9a928b7c6 100644 --- a/pyod/test/test_kpca.py +++ b/pyod/test/test_kpca.py @@ -38,8 +38,8 @@ def setUp(self): def test_parameters(self): assert ( - hasattr(self.clf, "decision_scores_") - and self.clf.decision_scores_ is not None + hasattr(self.clf, "decision_scores_") + and self.clf.decision_scores_ is not None ) assert hasattr(self.clf, "labels_") and self.clf.labels_ is not None assert hasattr(self.clf, "threshold_") and self.clf.threshold_ is not None @@ -108,24 +108,6 @@ def test_fit_predict_score(self): with assert_raises(NotImplementedError): self.clf.fit_predict_score(self.X_test, self.y_test, scoring="something") - def test_predict_rank(self): - pred_socres = self.clf.decision_function(self.X_test) - pred_ranks = self.clf._predict_rank(self.X_test) - - # assert the order is reserved - assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=4) - assert_array_less(pred_ranks, self.X_train.shape[0] + 1) - assert_array_less(-0.1, pred_ranks) - - def test_predict_rank_normalized(self): - pred_socres = self.clf.decision_function(self.X_test) - pred_ranks = self.clf._predict_rank(self.X_test, normalized=True) - - # assert the order is reserved - 
assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=4) - assert_array_less(pred_ranks, 1.01) - assert_array_less(-0.1, pred_ranks) - def test_model_clone(self): clone_clf = clone(self.clf) diff --git a/pyod/version.py b/pyod/version.py index 0cdbd9ac2..f9adc2347 100644 --- a/pyod/version.py +++ b/pyod/version.py @@ -20,4 +20,4 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '1.1.1' # pragma: no cover +__version__ = '1.1.2' # pragma: no cover