Skip to content

Commit c8fcf7e

Browse files
authored
BUG: handle overflow during exponent parsing in read_csv (#62741)
1 parent e9e1b32 commit c8fcf7e

File tree

3 files changed

+51
-64
lines changed

3 files changed

+51
-64
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,7 @@ I/O
10871087
- Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`)
10881088
- Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`)
10891089
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
1090+
- Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`)
10901091
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
10911092
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
10921093
- Bug in :meth:`read_csv` where the order of the ``na_values`` caused inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)

pandas/_libs/src/parser/tokenizer.c

Lines changed: 15 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1620,9 +1620,9 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
16201620
}
16211621

16221622
double number = 0.;
1623-
int exponent = 0;
1624-
int num_digits = 0;
1625-
int num_decimals = 0;
1623+
long int exponent = 0;
1624+
long int num_digits = 0;
1625+
long int num_decimals = 0;
16261626

16271627
// Process string of digits.
16281628
while (isdigit_ascii(*p)) {
@@ -1671,39 +1671,26 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
16711671
if (maybe_int != NULL)
16721672
*maybe_int = 0;
16731673

1674-
// Handle optional sign
1675-
negative = 0;
1676-
switch (*++p) {
1677-
case '-':
1678-
negative = 1;
1679-
PD_FALLTHROUGH; // Fall through to increment position.
1680-
case '+':
1681-
p++;
1682-
break;
1683-
}
1674+
// Move past the scientific-notation marker (the `sci` character, e.g. 'e'/'E').
1675+
p++;
16841676

1685-
// Process string of digits.
1686-
num_digits = 0;
1687-
int n = 0;
1688-
while (num_digits < max_digits && isdigit_ascii(*p)) {
1689-
n = n * 10 + (*p - '0');
1690-
num_digits++;
1691-
p++;
1692-
}
1677+
char *tmp_ptr;
1678+
long int n = strtol(p, &tmp_ptr, 10);
16931679

1694-
if (negative)
1695-
exponent -= n;
1696-
else
1697-
exponent += n;
1680+
if (errno == ERANGE || checked_add(exponent, n, &exponent)) {
1681+
errno = 0;
1682+
exponent = n;
1683+
}
16981684

16991685
// If no digits after the 'e'/'E', un-consume it.
1700-
if (num_digits == 0)
1686+
if (tmp_ptr == p)
17011687
p--;
1688+
else
1689+
p = tmp_ptr;
17021690
}
17031691

17041692
if (exponent > 308) {
1705-
*error = ERANGE;
1706-
return HUGE_VAL;
1693+
number = number == 0 ? 0 : number < 0 ? -HUGE_VAL : HUGE_VAL;
17071694
} else if (exponent > 0) {
17081695
number *= e[exponent];
17091696
} else if (exponent < -308) { // Subnormal
@@ -1718,9 +1705,6 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
17181705
number /= e[-exponent];
17191706
}
17201707

1721-
if (number == HUGE_VAL || number == -HUGE_VAL)
1722-
*error = ERANGE;
1723-
17241708
if (skip_trailing) {
17251709
// Skip trailing whitespace.
17261710
while (isspace_ascii(*p))
@@ -1812,8 +1796,6 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci),
18121796
*maybe_int = 0;
18131797
if (PyErr_Occurred() != NULL)
18141798
*error = -1;
1815-
else if (r == Py_HUGE_VAL)
1816-
*error = (int)Py_HUGE_VAL;
18171799
PyErr_Clear();
18181800

18191801
PyGILState_Release(gstate);

pandas/tests/io/parser/common/test_float.py

Lines changed: 35 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,12 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas.compat import is_platform_linux
12-
1311
from pandas import DataFrame
1412
import pandas._testing as tm
1513

1614
pytestmark = pytest.mark.filterwarnings(
1715
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
1816
)
19-
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
2017
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
2118

2219

@@ -42,40 +39,34 @@ def test_scientific_no_exponent(all_parsers_all_precisions):
4239

4340

4441
@pytest.mark.parametrize(
45-
"neg_exp",
42+
"value, expected_value",
4643
[
47-
-617,
48-
-100000,
49-
-99999999999999999,
44+
("0E-617", 0.0),
45+
("0E99999999", 0.0),
46+
("-0E99999999", 0.0),
47+
("-0E-99999999", 0.0),
48+
("10E-617", 0.0),
49+
("10E-100000", 0.0),
50+
("-10E-100000", 0.0),
51+
("10e-99999999999", 0.0),
52+
("10e-999999999999", 0.0),
53+
("10e-9999999999999", 0.0),
54+
("10E999", np.inf),
55+
("-10e99999999999", -np.inf),
56+
("10e99999999999", np.inf),
57+
("10e999999999999", np.inf),
58+
("10e9999999999999", np.inf),
59+
("50060e8007123400", np.inf),
60+
("-50060e8007123400", -np.inf),
5061
],
5162
)
52-
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
53-
# GH#38753
63+
def test_large_exponent(all_parsers_all_precisions, value, expected_value):
64+
# GH#38753; GH#38794; GH#62740
5465
parser, precision = all_parsers_all_precisions
5566

56-
data = f"data\n10E{neg_exp}"
67+
data = f"data\n{value}"
5768
result = parser.read_csv(StringIO(data), float_precision=precision)
58-
expected = DataFrame({"data": [0.0]})
59-
tm.assert_frame_equal(result, expected)
60-
61-
62-
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
63-
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
64-
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
65-
# GH#38753
66-
parser, precision = all_parsers_all_precisions
67-
data = f"data\n10E{exp}"
68-
result = parser.read_csv(StringIO(data), float_precision=precision)
69-
if precision == "round_trip":
70-
if exp == 999999999999999999 and is_platform_linux():
71-
mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
72-
request.applymarker(mark)
73-
74-
value = np.inf if exp > 0 else 0.0
75-
expected = DataFrame({"data": [value]})
76-
else:
77-
expected = DataFrame({"data": [f"10E{exp}"]})
78-
69+
expected = DataFrame({"data": [expected_value]})
7970
tm.assert_frame_equal(result, expected)
8071

8172

@@ -104,3 +95,16 @@ def test_small_int_followed_by_float(
10495
expected = DataFrame({"data": [42.0, expected_value]})
10596

10697
tm.assert_frame_equal(result, expected)
98+
99+
100+
@pytest.mark.parametrize(
101+
"value", ["81e31d04049863b72", "d81e31d04049863b72", "81e3104049863b72"]
102+
)
103+
def test_invalid_float_number(all_parsers_all_precisions, value):
104+
# GH#62617
105+
parser, precision = all_parsers_all_precisions
106+
data = f"h1,h2,h3\ndata1,{value},data3"
107+
108+
result = parser.read_csv(StringIO(data), float_precision=precision)
109+
expected = DataFrame({"h1": ["data1"], "h2": [value], "h3": "data3"})
110+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)