BUG: handle overflow during exponent parsing in read_csv (#62741)

Alvaro-Kothe · web-flow · commit c8fcf7e92087 · 2025-10-21T15:51:42.000-04:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1087,6 +1087,7 @@ I/O
 - Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`)
 - Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`)
 - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
+- Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa (or as ``0.0`` when the mantissa is zero), while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -1620,9 +1620,9 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
   }
 
   double number = 0.;
-  int exponent = 0;
-  int num_digits = 0;
-  int num_decimals = 0;
+  long int exponent = 0;
+  long int num_digits = 0;
+  long int num_decimals = 0;
 
   // Process string of digits.
   while (isdigit_ascii(*p)) {
@@ -1671,39 +1671,26 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
     if (maybe_int != NULL)
       *maybe_int = 0;
 
-    // Handle optional sign
-    negative = 0;
-    switch (*++p) {
-    case '-':
-      negative = 1;
-      PD_FALLTHROUGH; // Fall through to increment position.
-    case '+':
-      p++;
-      break;
-    }
+    // Move past the exponent marker character.
+    p++;
 
-    // Process string of digits.
-    num_digits = 0;
-    int n = 0;
-    while (num_digits < max_digits && isdigit_ascii(*p)) {
-      n = n * 10 + (*p - '0');
-      num_digits++;
-      p++;
-    }
+    char *tmp_ptr;
+    long int n = strtol(p, &tmp_ptr, 10);
 
-    if (negative)
-      exponent -= n;
-    else
-      exponent += n;
+    if (errno == ERANGE || checked_add(exponent, n, &exponent)) {
+      errno = 0;
+      exponent = n;
+    }
 
     // If no digits after the 'e'/'E', un-consume it.
-    if (num_digits == 0)
+    if (tmp_ptr == p)
       p--;
+    else
+      p = tmp_ptr;
   }
 
   if (exponent > 308) {
-    *error = ERANGE;
-    return HUGE_VAL;
+    number = number == 0 ? 0 : number < 0 ? -HUGE_VAL : HUGE_VAL;
   } else if (exponent > 0) {
     number *= e[exponent];
   } else if (exponent < -308) { // Subnormal
@@ -1718,9 +1705,6 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
     number /= e[-exponent];
   }
 
-  if (number == HUGE_VAL || number == -HUGE_VAL)
-    *error = ERANGE;
-
   if (skip_trailing) {
     // Skip trailing whitespace.
     while (isspace_ascii(*p))
@@ -1812,8 +1796,6 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci),
     *maybe_int = 0;
   if (PyErr_Occurred() != NULL)
     *error = -1;
-  else if (r == Py_HUGE_VAL)
-    *error = (int)Py_HUGE_VAL;
   PyErr_Clear();
 
   PyGILState_Release(gstate);
diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py
@@ -8,15 +8,12 @@
 import numpy as np
 import pytest
 
-from pandas.compat import is_platform_linux
-
 from pandas import DataFrame
 import pandas._testing as tm
 
 pytestmark = pytest.mark.filterwarnings(
     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
 )
-xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
 
 
@@ -42,40 +39,34 @@ def test_scientific_no_exponent(all_parsers_all_precisions):
 
 
 @pytest.mark.parametrize(
-    "neg_exp",
+    "value, expected_value",
     [
-        -617,
-        -100000,
-        -99999999999999999,
+        ("0E-617", 0.0),
+        ("0E99999999", 0.0),
+        ("-0E99999999", 0.0),
+        ("-0E-99999999", 0.0),
+        ("10E-617", 0.0),
+        ("10E-100000", 0.0),
+        ("-10E-100000", 0.0),
+        ("10e-99999999999", 0.0),
+        ("10e-999999999999", 0.0),
+        ("10e-9999999999999", 0.0),
+        ("10E999", np.inf),
+        ("-10e99999999999", -np.inf),
+        ("10e99999999999", np.inf),
+        ("10e999999999999", np.inf),
+        ("10e9999999999999", np.inf),
+        ("50060e8007123400", np.inf),
+        ("-50060e8007123400", -np.inf),
     ],
 )
-def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
-    # GH#38753
+def test_large_exponent(all_parsers_all_precisions, value, expected_value):
+    # GH#38753; GH#38794; GH#62740
     parser, precision = all_parsers_all_precisions
 
-    data = f"data\n10E{neg_exp}"
+    data = f"data\n{value}"
     result = parser.read_csv(StringIO(data), float_precision=precision)
-    expected = DataFrame({"data": [0.0]})
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
-@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
-def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
-    # GH#38753
-    parser, precision = all_parsers_all_precisions
-    data = f"data\n10E{exp}"
-    result = parser.read_csv(StringIO(data), float_precision=precision)
-    if precision == "round_trip":
-        if exp == 999999999999999999 and is_platform_linux():
-            mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
-            request.applymarker(mark)
-
-        value = np.inf if exp > 0 else 0.0
-        expected = DataFrame({"data": [value]})
-    else:
-        expected = DataFrame({"data": [f"10E{exp}"]})
-
+    expected = DataFrame({"data": [expected_value]})
     tm.assert_frame_equal(result, expected)
 
 
@@ -104,3 +95,16 @@ def test_small_int_followed_by_float(
     expected = DataFrame({"data": [42.0, expected_value]})
 
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "value", ["81e31d04049863b72", "d81e31d04049863b72", "81e3104049863b72"]
+)
+def test_invalid_float_number(all_parsers_all_precisions, value):
+    # GH#62617: strings that merely start like a float must be kept as strings
+    parser, precision = all_parsers_all_precisions
+    data = f"h1,h2,h3\ndata1,{value},data3"
+
+    result = parser.read_csv(StringIO(data), float_precision=precision)
+    expected = DataFrame({"h1": ["data1"], "h2": [value], "h3": "data3"})
+    tm.assert_frame_equal(result, expected)