BUG: Raise MergeError when suffixes result in duplicate column names (GH#61402)

Farsidetfs · Farsidetfs · commit 9a40cd0d9c9b · 2025-05-12T20:38:39.000Z
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -845,6 +845,7 @@ Reshaping
 - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)
 - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`)
+- Bug in :meth:`DataFrame.merge` where either user-provided or default suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -3058,16 +3058,23 @@ def renamer(x, suffix: str | None):
     llabels = left._transform_index(lrenamer)
     rlabels = right._transform_index(rrenamer)
 
-    dups = []
+    dups = set()
+
     if not llabels.is_unique:
         # Only warn when duplicates are caused because of suffixes, already duplicated
         # columns in origin should not warn
-        dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
+        dups.update(llabels[(llabels.duplicated()) & (~left.duplicated())])
     if not rlabels.is_unique:
-        dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
+        dups.update(rlabels[(rlabels.duplicated()) & (~right.duplicated())])
+
+    # A suffixed label can also collide with an un-renamed column of the other
+    # frame, e.g. left "A" + suffix "_x" equals an existing right column "A_x"
+    dups.update(llabels.intersection(right.difference(to_rename)))
+    dups.update(rlabels.intersection(left.difference(to_rename)))
+
     if dups:
         raise MergeError(
-            f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
+            f"Passing 'suffixes' which cause duplicate columns {dups} is "
             f"not allowed.",
         )
 
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -3060,3 +3060,53 @@ def test_merge_on_all_nan_column():
         {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]}
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_merge_for_suffix_collisions():
+    # GH#61402
+    # Case 1: left suffix "_dup" renames col2 onto df2's existing col2_dup
+    df1 = pd.DataFrame({"col1": [1], "col2": [2]})
+    df2 = pd.DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]})
+    with pytest.raises(MergeError, match="duplicate columns"):
+        pd.merge(df1, df2, on="col1", suffixes=("_dup", ""))
+
+    # Case 2: right suffix "_dup" renames df2's col2 onto its own col2_dup
+    df1 = pd.DataFrame({"col1": [1], "col2": [2]})
+    df2 = pd.DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]})
+    with pytest.raises(MergeError, match="duplicate columns"):
+        pd.merge(df1, df2, on="col1", suffixes=("", "_dup"))
+
+    # Case 3: a similar-looking name ("col2__dup") is not a collision;
+    # no non-key column overlaps, so no suffix is applied at all
+    df3 = pd.DataFrame({"col1": [1], "col2__dup": [4]})
+    df4 = pd.DataFrame({"col1": [1], "col2": [2]})
+    merged = df3.merge(df4, on="col1")
+    expected = pd.DataFrame([{"col1": 1, "col2__dup": 4, "col2": 2}])
+    tm.assert_frame_equal(merged, expected)
+
+    # Case 4: default suffixes ("_x", "_y") create a collision because
+    # col2_x already exists in the right frame
+    df5 = pd.DataFrame({"col1": [1], "col2": [2]})
+    df6 = pd.DataFrame({"col1": [1], "col2": [2], "col2_x": [3]})
+    with pytest.raises(MergeError, match="duplicate columns"):
+        pd.merge(df5, df6, on="col1")
+
+    # Case 5: suffix made of special characters still collides
+    df7 = pd.DataFrame({"col1": [1], "metric": [10]})
+    df8 = pd.DataFrame({"col1": [1], "metric": [10], "metric#$%^": [20]})
+    with pytest.raises(MergeError, match="duplicate columns"):
+        pd.merge(df7, df8, on="col1", suffixes=("#$%^", ""))
+
+    # Case 6: a literal backslash in the suffix/column name still collides
+    df7 = pd.DataFrame({"col1": [1], "metric": [10]})
+    df8 = pd.DataFrame({"col1": [1], "metric": [10], "metric\\#$%^": [20]})
+    with pytest.raises(MergeError, match="duplicate columns"):
+        pd.merge(df7, df8, on="col1", suffixes=("\\#$%^", ""))
+
+    # Case 7: a plain overlap with no pre-existing suffixed names does not
+    # raise; the default suffixes are applied to both frames
+    df9 = pd.DataFrame({"col1": [1], "col2": [3]})
+    df10 = pd.DataFrame({"col1": [1], "col2": [2]})
+    merged = df9.merge(df10, on="col1")
+    expected = pd.DataFrame([{"col1": 1, "col2_x": 3, "col2_y": 2}])
+    tm.assert_frame_equal(merged, expected)