FIX raise error in OneHotEncoder.inverse_transform (scikit-learn#14982)

kwinata · web-flow · commit 5d5329c47379 · 2020-10-19T21:21:47.000+02:00
diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
@@ -393,7 +393,7 @@ Changelog
   curve classification metric.
   :pr:`10591` by :user:`Jeremy Karnowski <jkarnows>` and
   :user:`Daniel Mohns <dmohns>`.
-  
+
 - |Feature| Added :func:`metrics.plot_det_curve` and
   :class:`metrics.DetCurveDisplay` to ease the plot of DET curves.
   :pr:`18176` by :user:`Guillaume Lemaitre <glemaitre>`.
@@ -645,6 +645,12 @@ Changelog
   :class:`preprocessing.KBinsDiscretizer`.
   :pr:`16335` by :user:`Arthur Imbert <Henley13>`.
 
+- |Fix| Raise error on
+  :meth:`sklearn.preprocessing.OneHotEncoder.inverse_transform`
+  when `handle_unknown='error'` and `drop=None` for samples
+  encoded as all zeros. :pr:`14982` by
+  :user:`Kevin Winata <kwinata>`.
+
 :mod:`sklearn.svm`
 ..................
 
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
@@ -571,13 +571,20 @@ def inverse_transform(self, X):
                 # ignored unknown categories: we have a row of all zero
                 if unknown.any():
                     found_unknown[i] = unknown
-            # drop will either be None or handle_unknown will be error. If
-            # self.drop_idx_ is not None, then we can safely assume that all of
-            # the nulls in each column are the dropped value
-            elif self.drop_idx_ is not None:
+            else:
                 dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                 if dropped.any():
-                    X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]
+                    if self.drop_idx_ is None:
+                        all_zero_samples = np.flatnonzero(dropped)
+                        raise ValueError(
+                            f"Samples {all_zero_samples} can not be inverted "
+                            "when drop=None and handle_unknown='error' "
+                            "because they contain all zeros")
+                    # we can safely assume that all of the nulls in each column
+                    # are the dropped value
+                    X_tr[dropped, i] = self.categories_[i][
+                        self.drop_idx_[i]
+                    ]
 
             j += n_categories
 
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
@@ -9,6 +9,7 @@
 from sklearn.exceptions import NotFittedError
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import assert_allclose
+from sklearn.utils._testing import _convert_container
 from sklearn.utils import is_scalar_nan
 
 from sklearn.preprocessing import OneHotEncoder
@@ -266,6 +267,36 @@ def test_one_hot_encoder_inverse(sparse_, drop):
         enc.inverse_transform(X_tr)
 
 
+@pytest.mark.parametrize('sparse_', [False, True])
+@pytest.mark.parametrize(
+    "X, X_trans",
+    [
+        ([[2, 55], [1, 55], [2, 55]], [[0, 1, 1], [0, 0, 0], [0, 1, 1]]),
+        ([['one', 'a'], ['two', 'a'], ['three', 'b'], ['two', 'a']],
+         [[0, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0]]),
+    ]
+)
+def test_one_hot_encoder_inverse_transform_raise_error_with_unknown(
+    X, X_trans, sparse_
+):
+    """Check that `inverse_transform` raises an error with unknown samples, no
+    dropped feature, and `handle_unknown="error"`.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/14934
+    """
+    enc = OneHotEncoder(sparse=sparse_).fit(X)
+    msg = (
+        r"Samples \[(\d )*\d\] can not be inverted when drop=None and "
+        r"handle_unknown='error' because they contain all zeros"
+    )
+
+    if sparse_:
+        # emulate the sparse output of a one-hot encoder with sparse=True.
+        X_trans = _convert_container(X_trans, "sparse")
+    with pytest.raises(ValueError, match=msg):
+        enc.inverse_transform(X_trans)
+
+
 def test_one_hot_encoder_inverse_if_binary():
     X = np.array([['Male', 1],
                   ['Female', 3],