From 82da723ce35d081899bbd3c90a353beedae6f2a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=A3=20Bida=20Vacaro?=
Date: Thu, 19 Oct 2023 16:05:33 -0300
Subject: [PATCH] fix(FTP): remove DBF from content if DBC is present (#168)

---
 .gitignore             |  1 +
 pysus/data/__init__.py | 17 ++++++++++++-----
 pysus/ftp/__init__.py  | 10 ++++++++++
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6914fce0..db4575bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ pyvenv.cfg
 # *.DBF
 *.pickle
 *.parquet
+.virtual_documents

 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/pysus/data/__init__.py b/pysus/data/__init__.py
index e10e523d..48b2354e 100644
--- a/pysus/data/__init__.py
+++ b/pysus/data/__init__.py
@@ -1,4 +1,6 @@
 import os
+import struct
+import logging
 from datetime import datetime
 from pathlib import Path

@@ -109,9 +111,12 @@ def dbf_to_parquet(dbf: str, _pbar=None) -> str:
             chunk_df = pd.DataFrame(chunk)
             table = pa.Table.from_pandas(chunk_df.applymap(decode_column))
             pq.write_to_dataset(table, root_path=str(parquet))
-    except Exception as exc:
-        parquet.absolute().unlink()
-        raise exc
+    except struct.error as err:
+        if _pbar:
+            _pbar.close()
+        Path(path).unlink()
+        parquet.rmdir()
+        raise err

     if _pbar:
         _pbar.update(approx_final_size - _pbar.n)
@@ -138,14 +143,16 @@ def str_to_int(string: str):
         # spaces as well
         if str(string).replace(" ", "").isnumeric():
             return int(string.replace(" ", ""))
+        return string

     def str_to_date(string: str):
         if isinstance(string, str):
             try:
                 return datetime.strptime(string, "%Y%m%d").date()
-            except Exception:
+            except ValueError:
                 # Ignore errors, bad value
-                pass
+                return string
+        return string

     map_column_func(["DT_NOTIFIC", "DT_SIN_PRI"], str_to_date)
     map_column_func(["CODMUNRES", "SEXO"], str_to_int)
diff --git a/pysus/ftp/__init__.py b/pysus/ftp/__init__.py
index 7c6f3811..a6245b3a 100644
--- a/pysus/ftp/__init__.py
+++ b/pysus/ftp/__init__.py
@@ -389,6 +389,16 @@ def line_file_parser(file_line):
     finally:
         ftp.close()

+    upper_names = [n.upper() for n in content]
+    to_remove = []
+    for name in content:
+        if ".DBF" in name.upper():
+            if name.upper().replace(".DBF", ".DBC") in upper_names:
+                to_remove.append(name)
+
+    for name in to_remove:
+        del content[name]
+
     return content
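
Illustration (not part of the patch): a minimal, standalone sketch of the filtering rule the pysus/ftp/__init__.py hunk introduces — a .DBF entry is dropped from the listing whenever a .DBC with the same name is also present. The helper name `drop_dbf_when_dbc_present` and the dict-shaped `content` argument are assumptions made for this example only.

    # Hypothetical helper mirroring the patch's filtering logic; `content`
    # is assumed to be a dict keyed by file name, as in the FTP listing.
    def drop_dbf_when_dbc_present(content: dict) -> dict:
        upper_names = [n.upper() for n in content]
        to_remove = [
            name
            for name in content
            if ".DBF" in name.upper()
            and name.upper().replace(".DBF", ".DBC") in upper_names
        ]
        return {name: info for name, info in content.items() if name not in to_remove}

    # Example: SIM.DBF is dropped because SIM.DBC is also listed.
    listing = {"SIM.DBC": 1024, "SIM.DBF": 4096, "README.TXT": 12}
    assert drop_dbf_when_dbc_present(listing) == {"SIM.DBC": 1024, "README.TXT": 12}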