Skip to content

Commit 9a40cd0

Browse files
committed
BUG: Raise MergeError when suffixes result in duplicate column names (GH#61402)
1 parent 2e141aa commit 9a40cd0

File tree

3 files changed

+62
-4
lines changed

3 files changed

+62
-4
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,7 @@ Reshaping
845845
- Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`)
846846
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
847847
- Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`)
848+
- Bug in :meth:`DataFrame.merge` where either user-provided or default suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`)
848849

849850
Sparse
850851
^^^^^^

pandas/core/reshape/merge.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -3058,16 +3058,23 @@ def renamer(x, suffix: str | None):
30583058
llabels = left._transform_index(lrenamer)
30593059
rlabels = right._transform_index(rrenamer)
30603060

3061-
dups = []
3061+
dups = set()
3062+
30623063
if not llabels.is_unique:
30633064
# Only warn when duplicates are caused because of suffixes, already duplicated
30643065
# columns in origin should not warn
3065-
dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
3066+
dups.update(llabels[(llabels.duplicated()) & (~left.duplicated())])
30663067
if not rlabels.is_unique:
3067-
dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
3068+
dups.update(rlabels[(rlabels.duplicated()) & (~right.duplicated())])
3069+
3070+
# addition of suffix matches the original column name in the other frame
3071+
# x["A"]+suffix("_x") == y["A_x"]
3072+
dups.update(llabels.intersection(right.difference(to_rename)))
3073+
dups.update(rlabels.intersection(left.difference(to_rename)))
3074+
30683075
if dups:
30693076
raise MergeError(
3070-
f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
3077+
f"Passing 'suffixes' which cause duplicate columns {dups} is "
30713078
f"not allowed.",
30723079
)
30733080

pandas/tests/reshape/merge/test_merge.py

+50
Original file line numberDiff line numberDiff line change
@@ -3060,3 +3060,53 @@ def test_merge_on_all_nan_column():
30603060
{"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]}
30613061
)
30623062
tm.assert_frame_equal(result, expected)
3063+
3064+
3065+
def test_merge_for_suffix_collisions():
    """Check that merge raises when a suffixed name collides with a column.

    GH#61402: appending a suffix to an overlapping column must not produce a
    label that already exists in the other frame. Each collision case below
    expects ``MergeError``; the non-collision cases pin the regular output so
    the new check does not over-trigger.
    """
    # Case 1: the left suffix turns "col2" into "col2_dup", which already
    # exists in the right frame.
    df1 = pd.DataFrame({"col1": [1], "col2": [2]})
    df2 = pd.DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]})
    with pytest.raises(MergeError, match="duplicate columns"):
        pd.merge(df1, df2, on="col1", suffixes=("_dup", ""))

    # Case 2: same frames with the suffixes swapped, so the right "col2" is
    # the one renamed to "col2_dup".
    df1 = pd.DataFrame({"col1": [1], "col2": [2]})
    df2 = pd.DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]})
    with pytest.raises(MergeError, match="duplicate columns"):
        pd.merge(df1, df2, on="col1", suffixes=("", "_dup"))

    # Case 3: similar-looking names ("col2__dup" vs "col2") that do not
    # overlap; no suffix is applied and the merge must succeed.
    df3 = pd.DataFrame({"col1": [1], "col2__dup": [4]})
    df4 = pd.DataFrame({"col1": [1], "col2": [2]})
    merged = df3.merge(df4, on="col1")
    expected = pd.DataFrame([{"col1": 1, "col2__dup": 4, "col2": 2}])
    tm.assert_frame_equal(merged, expected)

    # Case 4: the default suffixes ("_x", "_y") create a collision because
    # "col2_x" already exists in the right frame.
    df5 = pd.DataFrame({"col1": [1], "col2": [2]})
    df6 = pd.DataFrame({"col1": [1], "col2": [2], "col2_x": [3]})
    with pytest.raises(MergeError, match="duplicate columns"):
        pd.merge(df5, df6, on="col1")

    # Case 5: suffix made of special characters.
    df7 = pd.DataFrame({"col1": [1], "metric": [10]})
    df8 = pd.DataFrame({"col1": [1], "metric": [10], "metric#$%^": [20]})
    with pytest.raises(MergeError, match="duplicate columns"):
        pd.merge(df7, df8, on="col1", suffixes=("#$%^", ""))

    # Case 6: suffix containing a literal backslash. The backslash is written
    # as "\\" because "\#" is an invalid escape sequence.
    df7 = pd.DataFrame({"col1": [1], "metric": [10]})
    df8 = pd.DataFrame({"col1": [1], "metric": [10], "metric\\#$%^": [20]})
    with pytest.raises(MergeError, match="duplicate columns"):
        pd.merge(df7, df8, on="col1", suffixes=("\\#$%^", ""))

    # Case 7: ordinary overlap with the default suffixes and no pre-existing
    # "col2_x"/"col2_y" columns; the merge must succeed.
    df9 = pd.DataFrame({"col1": [1], "col2": [3]})
    df10 = pd.DataFrame({"col1": [1], "col2": [2]})
    merged = df9.merge(df10, on="col1")
    expected = pd.DataFrame([{"col1": 1, "col2_x": 3, "col2_y": 2}])
    tm.assert_frame_equal(merged, expected)

0 commit comments

Comments
 (0)