Skip to content

Commit 9234ed5

Browse files
authored
PERF: fix performance regression from #62542 (#62623)
1 parent c8fcf7e commit 9234ed5

File tree

2 files changed

+31
-75
lines changed

2 files changed

+31
-75
lines changed

pandas/_libs/parsers.pyx

Lines changed: 29 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ cdef extern from "pandas/parser/tokenizer.h":
144144
SKIP_LINE
145145
FINISHED
146146

147-
enum: ERROR_OVERFLOW
147+
enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS
148148

149149
ctypedef enum BadLineHandleMethod:
150150
ERROR,
@@ -1051,7 +1051,7 @@ cdef class TextReader:
10511051
if col_dtype is not None:
10521052
col_res, na_count = self._convert_with_dtype(
10531053
col_dtype, i, start, end, na_filter,
1054-
1, na_hashset, na_fset)
1054+
1, na_hashset, na_fset, False)
10551055

10561056
# Fallback on the parse (e.g. we requested int dtype,
10571057
# but it's actually a float).
@@ -1062,30 +1062,34 @@ cdef class TextReader:
10621062
return self._string_convert(i, start, end, na_filter, na_hashset)
10631063
else:
10641064
col_res = None
1065+
maybe_int = True
10651066
for dt in self.dtype_cast_order:
1066-
if (dt.kind in "iu" and
1067-
self._column_has_float(i, start, end, na_filter, na_hashset)):
1067+
if not maybe_int and dt.kind in "iu":
10681068
continue
10691069

10701070
try:
10711071
col_res, na_count = self._convert_with_dtype(
1072-
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
1073-
except ValueError:
1074-
# This error is raised from trying to convert to uint64,
1075-
# and we discover that we cannot convert to any numerical
1076-
# dtype successfully. As a result, we leave the data
1077-
# column AS IS with object dtype.
1078-
col_res, na_count = self._convert_with_dtype(
1079-
np.dtype("object"), i, start, end, 0,
1080-
0, na_hashset, na_fset)
1072+
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
1073+
except ValueError as e:
1074+
if str(e) == "Number is not int":
1075+
maybe_int = False
1076+
continue
1077+
else:
1078+
# This error is raised from trying to convert to uint64,
1079+
# and we discover that we cannot convert to any numerical
1080+
# dtype successfully. As a result, we leave the data
1081+
# column AS IS with object dtype.
1082+
col_res, na_count = self._convert_with_dtype(
1083+
np.dtype("object"), i, start, end, 0,
1084+
0, na_hashset, na_fset, False)
10811085
except OverflowError:
10821086
try:
10831087
col_res, na_count = _try_pylong(self.parser, i, start,
10841088
end, na_filter, na_hashset)
10851089
except ValueError:
10861090
col_res, na_count = self._convert_with_dtype(
10871091
np.dtype("object"), i, start, end, 0,
1088-
0, na_hashset, na_fset)
1092+
0, na_hashset, na_fset, False)
10891093

10901094
if col_res is not None:
10911095
break
@@ -1133,7 +1137,7 @@ cdef class TextReader:
11331137
bint na_filter,
11341138
bint user_dtype,
11351139
kh_str_starts_t *na_hashset,
1136-
set na_fset):
1140+
set na_fset, bint raise_on_invalid):
11371141
if isinstance(dtype, CategoricalDtype):
11381142
# TODO: I suspect that _categorical_convert could be
11391143
# optimized when dtype is an instance of CategoricalDtype
@@ -1174,14 +1178,14 @@ cdef class TextReader:
11741178

11751179
elif dtype.kind in "iu":
11761180
try:
1177-
result, na_count = _try_int64(self.parser, i, start,
1178-
end, na_filter, na_hashset)
1181+
result, na_count = _try_int64(self.parser, i, start, end,
1182+
na_filter, na_hashset, raise_on_invalid)
11791183
if user_dtype and na_count is not None:
11801184
if na_count > 0:
11811185
raise ValueError(f"Integer column has NA values in column {i}")
11821186
except OverflowError:
11831187
result = _try_uint64(self.parser, i, start, end,
1184-
na_filter, na_hashset)
1188+
na_filter, na_hashset, raise_on_invalid)
11851189
na_count = 0
11861190

11871191
if result is not None and dtype != "int64":
@@ -1344,59 +1348,6 @@ cdef class TextReader:
13441348
else:
13451349
return None
13461350

1347-
cdef bint _column_has_float(self, Py_ssize_t col,
1348-
int64_t start, int64_t end,
1349-
bint na_filter, kh_str_starts_t *na_hashset):
1350-
"""Check if the column contains any float number."""
1351-
cdef:
1352-
Py_ssize_t i, j, lines = end - start
1353-
coliter_t it
1354-
const char *word = NULL
1355-
const char *ignored_chars = " +-"
1356-
const char *digits = "0123456789"
1357-
const char *float_indicating_chars = "eE"
1358-
char null_byte = 0
1359-
1360-
coliter_setup(&it, self.parser, col, start)
1361-
1362-
for i in range(lines):
1363-
COLITER_NEXT(it, word)
1364-
1365-
if na_filter and kh_get_str_starts_item(na_hashset, word):
1366-
continue
1367-
1368-
found_first_digit = False
1369-
j = 0
1370-
while word[j] != null_byte:
1371-
if word[j] == self.parser.decimal:
1372-
return True
1373-
elif not found_first_digit and word[j] in ignored_chars:
1374-
# no-op
1375-
pass
1376-
elif not found_first_digit and word[j] not in digits:
1377-
# word isn't numeric
1378-
return False
1379-
elif not found_first_digit and word[j] in digits:
1380-
found_first_digit = True
1381-
elif word[j] in float_indicating_chars:
1382-
# preceding chars indicates numeric and
1383-
# current char indicates float
1384-
return True
1385-
elif word[j] not in digits:
1386-
# previous characters indicates numeric
1387-
# current character shows otherwise
1388-
return False
1389-
elif word[j] in digits:
1390-
# no-op
1391-
pass
1392-
else:
1393-
raise AssertionError(
1394-
f"Unhandled case {word[j]=} {found_first_digit=}"
1395-
)
1396-
j += 1
1397-
1398-
return False
1399-
14001351
# Factor out code common to TextReader.__dealloc__ and TextReader.close
14011352
# It cannot be a class method, since calling self.close() in __dealloc__
14021353
# causes a class attribute lookup and violates best practices
@@ -1793,7 +1744,8 @@ cdef int _try_double_nogil(parser_t *parser,
17931744

17941745
cdef _try_uint64(parser_t *parser, int64_t col,
17951746
int64_t line_start, int64_t line_end,
1796-
bint na_filter, kh_str_starts_t *na_hashset):
1747+
bint na_filter, kh_str_starts_t *na_hashset,
1748+
bint raise_on_invalid):
17971749
cdef:
17981750
int error
17991751
Py_ssize_t lines
@@ -1815,6 +1767,8 @@ cdef _try_uint64(parser_t *parser, int64_t col,
18151767
if error == ERROR_OVERFLOW:
18161768
# Can't get the word variable
18171769
raise OverflowError("Overflow")
1770+
elif raise_on_invalid and error == ERROR_INVALID_CHARS:
1771+
raise ValueError("Number is not int")
18181772
return None
18191773

18201774
if uint64_conflict(&state):
@@ -1863,7 +1817,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
18631817

18641818
cdef _try_int64(parser_t *parser, int64_t col,
18651819
int64_t line_start, int64_t line_end,
1866-
bint na_filter, kh_str_starts_t *na_hashset):
1820+
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_invalid):
18671821
cdef:
18681822
int error, na_count = 0
18691823
Py_ssize_t lines
@@ -1883,6 +1837,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
18831837
if error == ERROR_OVERFLOW:
18841838
# Can't get the word variable
18851839
raise OverflowError("Overflow")
1840+
elif raise_on_invalid and error == ERROR_INVALID_CHARS:
1841+
raise ValueError("Number is not int")
18861842
return None, None
18871843

18881844
return result, na_count

pandas/_libs/src/parser/tokenizer.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1889,7 +1889,7 @@ int64_t str_to_int64(const char *p_item, int *error, char tsep) {
18891889
int64_t number = strtoll(p, &endptr, 10);
18901890

18911891
if (errno == ERANGE) {
1892-
*error = ERROR_OVERFLOW;
1892+
*error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW;
18931893
errno = 0;
18941894
return 0;
18951895
}
@@ -1949,7 +1949,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error,
19491949
uint64_t number = strtoull(p, &endptr, 10);
19501950

19511951
if (errno == ERANGE) {
1952-
*error = ERROR_OVERFLOW;
1952+
*error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW;
19531953
errno = 0;
19541954
return 0;
19551955
}

0 commit comments

Comments
 (0)