Patch summary (free text before the first "diff --git" header is ignored by
git apply / patch):
  * .gitignore: ignore .virtual_documents.
  * pysus/data/__init__.py: on struct.error while converting a DBF, close the
    progress bar, remove `path` and the partially written parquet dataset,
    then re-raise; str_to_int / str_to_date return their input unchanged when
    it cannot be converted.
  * pysus/ftp/__init__.py: drop a listed .DBF entry when the matching .DBC
    entry is listed as well.
Note: indentation and blank context lines were reconstructed from the hunk
counts; the "index" hashes are carried over from the original patch.

diff --git a/.gitignore b/.gitignore
index 6914fce..db4575b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ pyvenv.cfg
 # *.DBF
 *.pickle
 *.parquet
+.virtual_documents
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/pysus/data/__init__.py b/pysus/data/__init__.py
index e10e523..48b2354 100644
--- a/pysus/data/__init__.py
+++ b/pysus/data/__init__.py
@@ -1,4 +1,7 @@
 import os
+import shutil
+import struct
+import logging
 from datetime import datetime
 from pathlib import Path
 
@@ -109,9 +112,15 @@ def dbf_to_parquet(dbf: str, _pbar=None) -> str:
             chunk_df = pd.DataFrame(chunk)
             table = pa.Table.from_pandas(chunk_df.applymap(decode_column))
             pq.write_to_dataset(table, root_path=str(parquet))
-    except Exception as exc:
-        parquet.absolute().unlink()
-        raise exc
+    except struct.error:
+        if _pbar:
+            _pbar.close()
+        # Remove `path` and the partially written dataset directory.
+        # Chunks may already have been written into it, so Path.rmdir()
+        # (empty directories only) would fail and mask the original error.
+        Path(path).unlink()
+        shutil.rmtree(parquet, ignore_errors=True)
+        raise
 
     if _pbar:
         _pbar.update(approx_final_size - _pbar.n)
@@ -138,14 +147,16 @@ def str_to_int(string: str):
         # spaces as well
         if str(string).replace(" ", "").isnumeric():
             return int(string.replace(" ", ""))
+        return string
 
     def str_to_date(string: str):
         if isinstance(string, str):
             try:
                 return datetime.strptime(string, "%Y%m%d").date()
-            except Exception:
+            except ValueError:
                 # Ignore errors, bad value
-                pass
+                return string
+        return string
 
     map_column_func(["DT_NOTIFIC", "DT_SIN_PRI"], str_to_date)
     map_column_func(["CODMUNRES", "SEXO"], str_to_int)
diff --git a/pysus/ftp/__init__.py b/pysus/ftp/__init__.py
index 7c6f381..a6245b3 100644
--- a/pysus/ftp/__init__.py
+++ b/pysus/ftp/__init__.py
@@ -389,4 +389,15 @@ def line_file_parser(file_line):
     finally:
         ftp.close()
 
+    # Drop every entry whose upper-cased name contains ".DBF" when the
+    # corresponding ".DBC" name is listed too (case-insensitive match).
+    upper_names = {n.upper() for n in content}
+    to_remove = []
+    for name in content:
+        if ".DBF" in name.upper():
+            if name.upper().replace(".DBF", ".DBC") in upper_names:
+                to_remove.append(name)
+
+    for name in to_remove:
+        del content[name]
+
     return content