@@ -144,7 +144,7 @@ cdef extern from "pandas/parser/tokenizer.h":
144144 SKIP_LINE
145145 FINISHED
146146
147- enum : ERROR_OVERFLOW
147+ enum : ERROR_OVERFLOW, ERROR_INVALID_CHARS
148148
149149 ctypedef enum BadLineHandleMethod:
150150 ERROR,
@@ -1051,7 +1051,7 @@ cdef class TextReader:
10511051 if col_dtype is not None :
10521052 col_res, na_count = self ._convert_with_dtype(
10531053 col_dtype, i, start, end, na_filter,
1054- 1 , na_hashset, na_fset)
1054+ 1 , na_hashset, na_fset, False )
10551055
10561056 # Fallback on the parse (e.g. we requested int dtype,
10571057 # but its actually a float).
@@ -1062,30 +1062,34 @@ cdef class TextReader:
10621062 return self ._string_convert(i, start, end, na_filter, na_hashset)
10631063 else :
10641064 col_res = None
1065+ maybe_int = True
10651066 for dt in self .dtype_cast_order:
1066- if (dt.kind in " iu" and
1067- self ._column_has_float(i, start, end, na_filter, na_hashset)):
1067+ if not maybe_int and dt.kind in " iu" :
10681068 continue
10691069
10701070 try :
10711071 col_res, na_count = self ._convert_with_dtype(
1072- dt, i, start, end, na_filter, 0 , na_hashset, na_fset)
1073- except ValueError :
1074- # This error is raised from trying to convert to uint64,
1075- # and we discover that we cannot convert to any numerical
1076- # dtype successfully. As a result, we leave the data
1077- # column AS IS with object dtype.
1078- col_res, na_count = self ._convert_with_dtype(
1079- np.dtype(" object" ), i, start, end, 0 ,
1080- 0 , na_hashset, na_fset)
1072+ dt, i, start, end, na_filter, 0 , na_hashset, na_fset, True )
1073+ except ValueError as e:
1074+ if str (e) == " Number is not int" :
1075+ maybe_int = False
1076+ continue
1077+ else :
1078+ # This error is raised from trying to convert to uint64,
1079+ # and we discover that we cannot convert to any numerical
1080+ # dtype successfully. As a result, we leave the data
1081+ # column AS IS with object dtype.
1082+ col_res, na_count = self ._convert_with_dtype(
1083+ np.dtype(" object" ), i, start, end, 0 ,
1084+ 0 , na_hashset, na_fset, False )
10811085 except OverflowError :
10821086 try :
10831087 col_res, na_count = _try_pylong(self .parser, i, start,
10841088 end, na_filter, na_hashset)
10851089 except ValueError :
10861090 col_res, na_count = self ._convert_with_dtype(
10871091 np.dtype(" object" ), i, start, end, 0 ,
1088- 0 , na_hashset, na_fset)
1092+ 0 , na_hashset, na_fset, False )
10891093
10901094 if col_res is not None :
10911095 break
@@ -1133,7 +1137,7 @@ cdef class TextReader:
11331137 bint na_filter,
11341138 bint user_dtype,
11351139 kh_str_starts_t * na_hashset,
1136- set na_fset):
1140+ set na_fset, bint raise_on_invalid ):
11371141 if isinstance (dtype, CategoricalDtype):
11381142 # TODO: I suspect that _categorical_convert could be
11391143 # optimized when dtype is an instance of CategoricalDtype
@@ -1174,14 +1178,14 @@ cdef class TextReader:
11741178
11751179 elif dtype.kind in " iu" :
11761180 try :
1177- result, na_count = _try_int64(self .parser, i, start,
1178- end, na_filter, na_hashset)
1181+ result, na_count = _try_int64(self .parser, i, start, end,
1182+ na_filter, na_hashset, raise_on_invalid )
11791183 if user_dtype and na_count is not None :
11801184 if na_count > 0 :
11811185 raise ValueError (f" Integer column has NA values in column {i}" )
11821186 except OverflowError :
11831187 result = _try_uint64(self .parser, i, start, end,
1184- na_filter, na_hashset)
1188+ na_filter, na_hashset, raise_on_invalid )
11851189 na_count = 0
11861190
11871191 if result is not None and dtype != " int64" :
@@ -1344,59 +1348,6 @@ cdef class TextReader:
13441348 else :
13451349 return None
13461350
1347- cdef bint _column_has_float(self , Py_ssize_t col,
1348- int64_t start, int64_t end,
1349- bint na_filter, kh_str_starts_t * na_hashset):
1350- """ Check if the column contains any float number."""
1351- cdef:
1352- Py_ssize_t i, j, lines = end - start
1353- coliter_t it
1354- const char * word = NULL
1355- const char * ignored_chars = " +-"
1356- const char * digits = " 0123456789"
1357- const char * float_indicating_chars = " eE"
1358- char null_byte = 0
1359-
1360- coliter_setup(& it, self .parser, col, start)
1361-
1362- for i in range (lines):
1363- COLITER_NEXT(it, word)
1364-
1365- if na_filter and kh_get_str_starts_item(na_hashset, word):
1366- continue
1367-
1368- found_first_digit = False
1369- j = 0
1370- while word[j] != null_byte:
1371- if word[j] == self .parser.decimal:
1372- return True
1373- elif not found_first_digit and word[j] in ignored_chars:
1374- # no-op
1375- pass
1376- elif not found_first_digit and word[j] not in digits:
1377- # word isn't numeric
1378- return False
1379- elif not found_first_digit and word[j] in digits:
1380- found_first_digit = True
1381- elif word[j] in float_indicating_chars:
1382- # preceding chars indicates numeric and
1383- # current char indicates float
1384- return True
1385- elif word[j] not in digits:
1386- # previous characters indicates numeric
1387- # current character shows otherwise
1388- return False
1389- elif word[j] in digits:
1390- # no-op
1391- pass
1392- else :
1393- raise AssertionError (
1394- f" Unhandled case {word[j]=} {found_first_digit=}"
1395- )
1396- j += 1
1397-
1398- return False
1399-
14001351# Factor out code common to TextReader.__dealloc__ and TextReader.close
14011352# It cannot be a class method, since calling self.close() in __dealloc__
14021353# which causes a class attribute lookup and violates best practices
@@ -1793,7 +1744,8 @@ cdef int _try_double_nogil(parser_t *parser,
17931744
17941745cdef _try_uint64(parser_t * parser, int64_t col,
17951746 int64_t line_start, int64_t line_end,
1796- bint na_filter, kh_str_starts_t * na_hashset):
1747+ bint na_filter, kh_str_starts_t * na_hashset,
1748+ bint raise_on_invalid):
17971749 cdef:
17981750 int error
17991751 Py_ssize_t lines
@@ -1815,6 +1767,8 @@ cdef _try_uint64(parser_t *parser, int64_t col,
18151767 if error == ERROR_OVERFLOW:
18161768 # Can't get the word variable
18171769 raise OverflowError (" Overflow" )
1770+ elif raise_on_invalid and error == ERROR_INVALID_CHARS:
1771+ raise ValueError (" Number is not int" )
18181772 return None
18191773
18201774 if uint64_conflict(& state):
@@ -1863,7 +1817,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
18631817
18641818cdef _try_int64(parser_t * parser, int64_t col,
18651819 int64_t line_start, int64_t line_end,
1866- bint na_filter, kh_str_starts_t * na_hashset):
1820+ bint na_filter, kh_str_starts_t * na_hashset, bint raise_on_invalid ):
18671821 cdef:
18681822 int error, na_count = 0
18691823 Py_ssize_t lines
@@ -1883,6 +1837,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
18831837 if error == ERROR_OVERFLOW:
18841838 # Can't get the word variable
18851839 raise OverflowError (" Overflow" )
1840+ elif raise_on_invalid and error == ERROR_INVALID_CHARS:
1841+ raise ValueError (" Number is not int" )
18861842 return None , None
18871843
18881844 return result, na_count
0 commit comments