diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..1e182035 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,2 @@ +[run] +omit =cumulus_library/schema/* \ No newline at end of file diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0e3fe5e9..fe872d65 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -29,9 +29,23 @@ jobs: run: | python -m pip install --upgrade pip pip install ".[test]" + - name: Create mock AWS credentials + run: | + mkdir ~/.aws && touch ~/.aws/credentials + echo -e "[test]\naws_access_key_id = test\naws_secret_access_key = test" > ~/.aws/credentials - name: Test with pytest run: | - python -m pytest + python -m pytest --cov-report xml --cov=cumulus_library tests + - name: Generate coverage report + uses: orgoro/coverage@v3.1 + with: + coverageFile: coverage.xml + token: ${{ secrets.GITHUB_TOKEN }} + thresholdAll: .9 + thresholdNew: 1 + thresholdModified: .95 + + lint: runs-on: ubuntu-22.04 steps: diff --git a/.gitignore b/.gitignore index 705a1721..8e47ed3d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ output.sql *generated.md MRCONSO.RRF *.zip +coverage.xml # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/cumulus_library/actions/exporter.py b/cumulus_library/actions/exporter.py index f28fa55f..4384afee 100644 --- a/cumulus_library/actions/exporter.py +++ b/cumulus_library/actions/exporter.py @@ -1,6 +1,7 @@ -import csv import pathlib +import pyarrow +from pyarrow import csv, parquet from rich.progress import track from cumulus_library import base_utils, databases, study_parser @@ -25,6 +26,18 @@ def reset_counts_exports( file.unlink() +def _write_chunk(writer, chunk, schema): + writer.write( + pyarrow.Table.from_pandas( + chunk.sort_values( + by=list(chunk.columns), ascending=False, na_position="first" + ), + preserve_index=False, + schema=schema, + ) + ) + + def export_study( manifest_parser: study_parser.StudyManifestParser, db: databases.DatabaseBackend, @@ -56,13 +69,31 @@ def export_study( description=f"Exporting {manifest_parser.get_study_prefix()} data...", ): query = f"SELECT * FROM {table}" - dataframe = db.execute_as_pandas(query) + # Note: we assume that, for duckdb, you are unlikely to be dealing with large + # exports, so it will ignore the chunksize parameter, as it does not provide + # a pandas enabled cursor. + dataframe_chunks, db_schema = db.execute_as_pandas(query, chunksize=1000000) + first_chunk = next(dataframe_chunks) path.mkdir(parents=True, exist_ok=True) - dataframe = dataframe.sort_values( - by=list(dataframe.columns), ascending=False, na_position="first" - ) - dataframe.to_csv(f"{path}/{table}.csv", index=False, quoting=csv.QUOTE_MINIMAL) - dataframe.to_parquet(f"{path}/{table}.parquet", index=False) + # print(pyarrow.Schema.from_pandas(first_chunk)) + # print(db_schema) + schema = pyarrow.schema(db.col_pyarrow_types_from_sql(db_schema)) + # print(schema) + with parquet.ParquetWriter(f"{path}/{table}.parquet", schema) as p_writer: + with csv.CSVWriter( + f"{path}/{table}.csv", + schema, + write_options=csv.WriteOptions( + # Note that this quoting style is not exactly csv.QUOTE_MINIMAL + # https://github.com/apache/arrow/issues/42032 + quoting_style="needed" + ), + ) as c_writer: + _write_chunk(p_writer, first_chunk, schema) + _write_chunk(c_writer, first_chunk, schema) + for chunk in dataframe_chunks: + _write_chunk(p_writer, chunk, schema) + _write_chunk(c_writer, chunk, schema) queries.append(queries) if archive: base_utils.zip_dir(path, data_path, manifest_parser.get_study_prefix()) diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py index 5e951008..159316ae 100755 --- a/cumulus_library/cli.py +++ b/cumulus_library/cli.py @@ -429,7 +429,7 @@ def run_cli(args: dict): "set[/italic], primarily dates, on a per patient level.\n\n" "[bold]By doing this, you are assuming the responsibility for " "meeting your organization's security requirements for " - "storing this data in a secure manager.[/bold]\n\n" + "storing this data in a secure manner.[/bold]\n\n" "Type Y to proceed, or any other value to quit.\n" ) console.print(warning_text) @@ -493,7 +493,7 @@ def main(cli_args=None): ("umls_key", "UMLS_API_KEY"), ("url", "CUMULUS_AGGREGATOR_URL"), ("user", "CUMULUS_AGGREGATOR_USER"), - ("workgroup", "CUMULUS_LIBRARY_WORKGROUP"), + ("work_group", "CUMULUS_LIBRARY_WORKGROUP"), ) read_env_vars = [] for pair in arg_env_pairs: diff --git a/cumulus_library/cli_parser.py b/cumulus_library/cli_parser.py index f4f42aac..fa79aeb8 100644 --- a/cumulus_library/cli_parser.py +++ b/cumulus_library/cli_parser.py @@ -12,6 +12,7 @@ def add_aws_config(parser: argparse.ArgumentParser) -> None: aws.add_argument( "--workgroup", default="cumulus", + dest="work_group", help="Cumulus Athena workgroup (default: cumulus)", ) aws.add_argument( diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py index 9d0520c5..ffa41631 100644 --- a/cumulus_library/databases.py +++ b/cumulus_library/databases.py @@ -9,6 +9,7 @@ """ import abc +import collections import datetime import json import os @@ -35,16 +36,16 @@ class DatabaseCursor(Protocol): """Protocol for a PEP-249 compatible cursor""" def execute(self, sql: str) -> None: - pass + pass # pragma: no cover def fetchone(self) -> list | None: - pass + pass # pragma: no cover def fetchmany(self, size: int | None) -> list[list] | None: - pass + pass # pragma: no cover def fetchall(self) -> list[list] | None: - pass + pass # pragma: no cover class DatabaseParser(abc.ABC): @@ -151,7 +152,9 @@ def pandas_cursor(self) -> DatabaseCursor: """ @abc.abstractmethod - def execute_as_pandas(self, sql: str) -> pandas.DataFrame: + def execute_as_pandas( + self, sql: str, chunksize: int | None = None + ) -> (pandas.DataFrame | collections.abc.Iterator[pandas.DataFrame], list[tuple]): """Returns a pandas.DataFrame version of the results from the provided SQL""" @abc.abstractmethod @@ -172,9 +175,9 @@ def operational_errors(self) -> tuple[Exception]: def col_parquet_types_from_pandas(self, field_types: list) -> list: """Returns appropriate types for creating tables based from parquet. - By default, returns the input (which assumes that the DB infers directly + By default, returns an empty list (which assumes that the DB infers directly from parquet data types). Only override if your DB uses an explicit SerDe - format, or otherwise needs a modified typing to inject directly into a query.""" + format, or otherwise needs a modfied typing to inject directly into a query.""" # The following example shows the types we're expecting to catch with this # approach and the rough type to cast them to. @@ -196,9 +199,12 @@ def col_parquet_types_from_pandas(self, field_types: list) -> list: # raise errors.CumulusLibraryError( # f"Unsupported type {type(field)} found." # ) - # return output + return [] + + return None - return field_types + def col_pyarrow_types_from_sql(self, columns: list[tuple]) -> list: + return columns def upload_file( self, @@ -257,8 +263,11 @@ def cursor(self) -> AthenaCursor: def pandas_cursor(self) -> AthenaPandasCursor: return self.connection.cursor(cursor=AthenaPandasCursor) - def execute_as_pandas(self, sql: str) -> pandas.DataFrame: - return self.pandas_cursor().execute(sql).as_pandas() + def execute_as_pandas( + self, sql: str, chunksize: int | None = None + ) -> (pandas.DataFrame | collections.abc.Iterator[pandas.DataFrame], list[tuple]): + query = self.pandas_cursor().execute(sql, chunksize=chunksize) + return query.as_pandas(), query.description def parser(self) -> DatabaseParser: return AthenaParser() @@ -272,7 +281,10 @@ def col_parquet_types_from_pandas(self, field_types: list) -> list: match field: case numpy.dtypes.ObjectDType(): output.append("STRING") - case pandas.core.arrays.integer.Int64Dtype(): + case ( + pandas.core.arrays.integer.Int64Dtype() + | numpy.dtypes.Int64DType() + ): output.append("INT") case numpy.dtypes.Float64DType(): output.append("DOUBLE") @@ -282,7 +294,31 @@ def col_parquet_types_from_pandas(self, field_types: list) -> list: output.append("TIMESTAMP") case _: raise errors.CumulusLibraryError( - f"Unsupported type {type(field)} found." + f"Unsupported pandas type {type(field)} found." + ) + return output + + def col_pyarrow_types_from_sql(self, columns: list[tuple]) -> list: + output = [] + for column in columns: + match column[1]: + case "varchar": + output.append((column[0], pyarrow.string())) + case "bigint": + output.append((column[0], pyarrow.int64())) + case "integer": + output.append((column[0], pyarrow.int64())) + case "double": + output.append((column[0], pyarrow.float64())) + case "boolean": + output.append((column[0], pyarrow.bool_())) + case "date": + output.append((column[0], pyarrow.date64())) + case "timestamp": + output.append((column[0], pyarrow.timestamp("s"))) + case _: + raise errors.CumulusLibraryError( + output.append(f"Unsupported SQL type '{column}' found.") ) return output @@ -296,9 +332,8 @@ def upload_file( force_upload=False, ) -> str | None: # We'll investigate the connection to get the relevant S3 upload path. - wg_conf = self.connection._client.get_work_group(WorkGroup=self.work_group)[ - "WorkGroup" - ]["Configuration"]["ResultConfiguration"] + workgroup = self.connection._client.get_work_group(WorkGroup=self.work_group) + wg_conf = workgroup["WorkGroup"]["Configuration"]["ResultConfiguration"] s3_path = wg_conf["OutputLocation"] bucket = "/".join(s3_path.split("/")[2:3]) key_prefix = "/".join(s3_path.split("/")[3:]) @@ -315,7 +350,7 @@ def upload_file( f"{key_prefix}cumulus_user_uploads/{self.schema_name}/" f"{study}/{topic}" ) if not remote_filename: - remote_filename = file + remote_filename = file.name session = boto3.Session(profile_name=self.connection.profile_name) s3_client = session.client("s3") @@ -525,12 +560,41 @@ def pandas_cursor(self) -> duckdb.DuckDBPyConnection: # Since this is not provided, return the vanilla cursor return self.connection - def execute_as_pandas(self, sql: str) -> pandas.DataFrame: + def execute_as_pandas( + self, sql: str, chunksize: int | None = None + ) -> (pandas.DataFrame | collections.abc.Iterator[pandas.DataFrame], list[tuple]): # We call convert_dtypes here in case there are integer columns. # Pandas will normally cast nullable-int as a float type unless # we call this to convert to its nullable int column type. # PyAthena seems to do this correctly for us, but not DuckDB. - return self.connection.execute(sql).df().convert_dtypes() + result = self.connection.execute(sql) + if chunksize: + return iter([result.df().convert_dtypes()]), result.description + return result.df().convert_dtypes(), result.description + + def col_pyarrow_types_from_sql(self, columns: list[tuple]) -> list: + output = [] + for column in columns: + match column[1]: + case "STRING": + output.append((column[0], pyarrow.string())) + case "INTEGER": + output.append((column[0], pyarrow.int64())) + case "NUMBER": + output.append((column[0], pyarrow.float64())) + case "DOUBLE": + output.append((column[0], pyarrow.float64())) + case "boolean" | "bool": + output.append((column[0], pyarrow.bool_())) + case "Date": + output.append((column[0], pyarrow.date64())) + case "TIMESTAMP" | "DATETIME": + output.append((column[0], pyarrow.timestamp("s"))) + case _: + raise errors.CumulusLibraryError( + f"{column[0],column[1]} does not have a conversion type" + ) + return output def parser(self) -> DatabaseParser: return DuckDbParser() @@ -652,23 +716,25 @@ def read_ndjson_dir(path: str) -> dict[str, pyarrow.Table]: def create_db_backend(args: dict[str, str]) -> DatabaseBackend: db_config.db_type = args["db_type"] - database = args["schema_name"] + schema = args["schema_name"] load_ndjson_dir = args.get("load_ndjson_dir") if db_config.db_type == "duckdb": - backend = DuckDatabaseBackend(database) # `database` is path name in this case + backend = DuckDatabaseBackend(schema) # `database` is path name in this case if load_ndjson_dir: backend.insert_tables(read_ndjson_dir(load_ndjson_dir)) elif db_config.db_type == "athena": backend = AthenaDatabaseBackend( args["region"], - args["workgroup"], + args["work_group"], args["profile"], - database, + schema, ) if load_ndjson_dir: sys.exit("Loading an ndjson dir is not supported with --db-type=athena.") else: - raise ValueError(f"Unexpected --db-type value '{db_config.db_type}'") + raise errors.CumulusLibraryError( + f"'{db_config.db_type}' is not a supported database." + ) return backend diff --git a/main b/main new file mode 100644 index 00000000..ee7d60ff Binary files /dev/null and b/main differ diff --git a/pyproject.toml b/pyproject.toml index 46c12dce..154c97dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dev = [ test = [ "freezegun", "pytest", + "pytest-cov", "responses" ] diff --git a/test b/test new file mode 100644 index 00000000..ee7d60ff Binary files /dev/null and b/test differ diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv index 0128853f..4e278870 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv @@ -1,3 +1,3 @@ -cnt,category_code,recordedDate_month,code_display +"cnt","category_code","recordedDate_month","code_display" 15,,, -15,encounter-diagnosis,, +15,"encounter-diagnosis",, diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv index 42a04ba1..2257390e 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv @@ -1,19 +1,19 @@ -cnt,type_display,author_month,class_display +"cnt","type_display","author_month","class_display" 50,,, -50,Evaluation + Plan note,, -50,Emergency department note,, -46,,,ambulatory -46,Evaluation + Plan note,,ambulatory -46,Emergency department note,,ambulatory -26,,2018-07-01, -26,Evaluation + Plan note,2018-07-01, -26,Emergency department note,2018-07-01, -24,,2018-07-01,ambulatory -24,,2018-06-01, -24,Evaluation + Plan note,2018-07-01,ambulatory -24,Evaluation + Plan note,2018-06-01, -24,Emergency department note,2018-07-01,ambulatory -24,Emergency department note,2018-06-01, -22,,2018-06-01,ambulatory -22,Evaluation + Plan note,2018-06-01,ambulatory -22,Emergency department note,2018-06-01,ambulatory +50,"Evaluation + Plan note",, +50,"Emergency department note",, +46,,,"ambulatory" +46,"Evaluation + Plan note",,"ambulatory" +46,"Emergency department note",,"ambulatory" +26,,"2018-07-01", +26,"Evaluation + Plan note","2018-07-01", +26,"Emergency department note","2018-07-01", +24,,"2018-07-01","ambulatory" +24,,"2018-06-01", +24,"Evaluation + Plan note","2018-07-01","ambulatory" +24,"Evaluation + Plan note","2018-06-01", +24,"Emergency department note","2018-07-01","ambulatory" +24,"Emergency department note","2018-06-01", +22,,"2018-06-01","ambulatory" +22,"Evaluation + Plan note","2018-06-01","ambulatory" +22,"Emergency department note","2018-06-01","ambulatory" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv index d20a99fb..2ff56f2c 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv @@ -1,9 +1,9 @@ -cnt,class_display,type_display,serviceType_display,priority_display +"cnt","class_display","type_display","serviceType_display","priority_display" 50,,,, -50,,,,cumulus__none -50,,,cumulus__none, -50,,,cumulus__none,cumulus__none -46,ambulatory,,, -46,ambulatory,,,cumulus__none -46,ambulatory,,cumulus__none, -46,ambulatory,,cumulus__none,cumulus__none \ No newline at end of file +50,,,,"cumulus__none" +50,,,"cumulus__none", +50,,,"cumulus__none","cumulus__none" +46,"ambulatory",,, +46,"ambulatory",,,"cumulus__none" +46,"ambulatory",,"cumulus__none", +46,"ambulatory",,"cumulus__none","cumulus__none" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv index 35640a26..324ac11c 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv @@ -1,25 +1,25 @@ -cnt,class_display,type_display,serviceType_display,priority_display,period_start_month +"cnt","class_display","type_display","serviceType_display","priority_display","period_start_month" 50,,,,, -50,,,,cumulus__none, -50,,,cumulus__none,, -50,,,cumulus__none,cumulus__none, -46,ambulatory,,,, -46,ambulatory,,,cumulus__none, -46,ambulatory,,cumulus__none,, -46,ambulatory,,cumulus__none,cumulus__none, -26,,,,,2018-07-01 -26,,,,cumulus__none,2018-07-01 -26,,,cumulus__none,,2018-07-01 -26,,,cumulus__none,cumulus__none,2018-07-01 -24,,,,,2018-06-01 -24,,,,cumulus__none,2018-06-01 -24,,,cumulus__none,,2018-06-01 -24,,,cumulus__none,cumulus__none,2018-06-01 -24,ambulatory,,,,2018-07-01 -24,ambulatory,,,cumulus__none,2018-07-01 -24,ambulatory,,cumulus__none,,2018-07-01 -24,ambulatory,,cumulus__none,cumulus__none,2018-07-01 -22,ambulatory,,,,2018-06-01 -22,ambulatory,,,cumulus__none,2018-06-01 -22,ambulatory,,cumulus__none,,2018-06-01 -22,ambulatory,,cumulus__none,cumulus__none,2018-06-01 \ No newline at end of file +50,,,,"cumulus__none", +50,,,"cumulus__none",, +50,,,"cumulus__none","cumulus__none", +46,"ambulatory",,,, +46,"ambulatory",,,"cumulus__none", +46,"ambulatory",,"cumulus__none",, +46,"ambulatory",,"cumulus__none","cumulus__none", +26,,,,,"2018-07-01" +26,,,,"cumulus__none","2018-07-01" +26,,,"cumulus__none",,"2018-07-01" +26,,,"cumulus__none","cumulus__none","2018-07-01" +24,,,,,"2018-06-01" +24,,,,"cumulus__none","2018-06-01" +24,,,"cumulus__none",,"2018-06-01" +24,,,"cumulus__none","cumulus__none","2018-06-01" +24,"ambulatory",,,,"2018-07-01" +24,"ambulatory",,,"cumulus__none","2018-07-01" +24,"ambulatory",,"cumulus__none",,"2018-07-01" +24,"ambulatory",,"cumulus__none","cumulus__none","2018-07-01" +22,"ambulatory",,,,"2018-06-01" +22,"ambulatory",,,"cumulus__none","2018-06-01" +22,"ambulatory",,"cumulus__none",,"2018-06-01" +22,"ambulatory",,"cumulus__none","cumulus__none","2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv index 267d1133..5c8242e6 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv @@ -1,65 +1,65 @@ -cnt,period_start_month,class_display,age_at_visit,gender,race_display,ethnicity_display +"cnt","period_start_month","class_display","age_at_visit","gender","race_display","ethnicity_display" 50,,,,,, -47,,,,,white, -46,,ambulatory,,,, -45,,,,,,not hispanic or latino -43,,,,,white,not hispanic or latino -43,,ambulatory,,,white, -42,,ambulatory,,,,not hispanic or latino -40,,ambulatory,,,white,not hispanic or latino -29,,,,female,, -28,,,,female,white, -27,,,,female,,not hispanic or latino -27,,ambulatory,,female,, -26,,,,female,white,not hispanic or latino -26,,ambulatory,,female,white, -26,2018-07-01,,,,, -26,2018-07-01,,,,white, -25,,ambulatory,,female,,not hispanic or latino -24,,ambulatory,,female,white,not hispanic or latino -24,2018-07-01,,,,,not hispanic or latino -24,2018-07-01,,,,white,not hispanic or latino -24,2018-07-01,ambulatory,,,, -24,2018-07-01,ambulatory,,,white, -24,2018-06-01,,,,, -23,2018-07-01,ambulatory,,,,not hispanic or latino -23,2018-07-01,ambulatory,,,white,not hispanic or latino -22,2018-06-01,ambulatory,,,, -21,,,,male,, -21,2018-06-01,,,,,not hispanic or latino -21,2018-06-01,,,,white, -19,,,,male,white, -19,,ambulatory,,male,, -19,2018-06-01,,,,white,not hispanic or latino -19,2018-06-01,ambulatory,,,,not hispanic or latino -19,2018-06-01,ambulatory,,,white, -18,,,,male,,not hispanic or latino -17,,,,male,white,not hispanic or latino -17,,ambulatory,,male,,not hispanic or latino -17,,ambulatory,,male,white, -17,2018-06-01,ambulatory,,,white,not hispanic or latino -16,,ambulatory,,male,white,not hispanic or latino -15,2018-06-01,,,female,, -14,2018-07-01,,,female,, -14,2018-07-01,,,female,,not hispanic or latino -14,2018-07-01,,,female,white, -14,2018-07-01,,,female,white,not hispanic or latino -14,2018-06-01,,,female,white, -14,2018-06-01,ambulatory,,female,, -13,2018-07-01,ambulatory,,female,, -13,2018-07-01,ambulatory,,female,,not hispanic or latino -13,2018-07-01,ambulatory,,female,white, -13,2018-07-01,ambulatory,,female,white,not hispanic or latino -13,2018-06-01,,,female,,not hispanic or latino -13,2018-06-01,ambulatory,,female,white, -12,2018-07-01,,,male,, -12,2018-07-01,,,male,white, -12,2018-06-01,,,female,white,not hispanic or latino -12,2018-06-01,ambulatory,,female,,not hispanic or latino -11,2018-07-01,ambulatory,,male,, -11,2018-07-01,ambulatory,,male,white, -11,2018-06-01,ambulatory,,female,white,not hispanic or latino -10,2018-07-01,,,male,,not hispanic or latino -10,2018-07-01,,,male,white,not hispanic or latino -10,2018-07-01,ambulatory,,male,,not hispanic or latino -10,2018-07-01,ambulatory,,male,white,not hispanic or latino +47,,,,,"white", +46,,"ambulatory",,,, +45,,,,,,"not hispanic or latino" +43,,,,,"white","not hispanic or latino" +43,,"ambulatory",,,"white", +42,,"ambulatory",,,,"not hispanic or latino" +40,,"ambulatory",,,"white","not hispanic or latino" +29,,,,"female",, +28,,,,"female","white", +27,,,,"female",,"not hispanic or latino" +27,,"ambulatory",,"female",, +26,,,,"female","white","not hispanic or latino" +26,,"ambulatory",,"female","white", +26,"2018-07-01",,,,, +26,"2018-07-01",,,,"white", +25,,"ambulatory",,"female",,"not hispanic or latino" +24,,"ambulatory",,"female","white","not hispanic or latino" +24,"2018-07-01",,,,,"not hispanic or latino" +24,"2018-07-01",,,,"white","not hispanic or latino" +24,"2018-07-01","ambulatory",,,, +24,"2018-07-01","ambulatory",,,"white", +24,"2018-06-01",,,,, +23,"2018-07-01","ambulatory",,,,"not hispanic or latino" +23,"2018-07-01","ambulatory",,,"white","not hispanic or latino" +22,"2018-06-01","ambulatory",,,, +21,,,,"male",, +21,"2018-06-01",,,,,"not hispanic or latino" +21,"2018-06-01",,,,"white", +19,,,,"male","white", +19,,"ambulatory",,"male",, +19,"2018-06-01",,,,"white","not hispanic or latino" +19,"2018-06-01","ambulatory",,,,"not hispanic or latino" +19,"2018-06-01","ambulatory",,,"white", +18,,,,"male",,"not hispanic or latino" +17,,,,"male","white","not hispanic or latino" +17,,"ambulatory",,"male",,"not hispanic or latino" +17,,"ambulatory",,"male","white", +17,"2018-06-01","ambulatory",,,"white","not hispanic or latino" +16,,"ambulatory",,"male","white","not hispanic or latino" +15,"2018-06-01",,,"female",, +14,"2018-07-01",,,"female",, +14,"2018-07-01",,,"female",,"not hispanic or latino" +14,"2018-07-01",,,"female","white", +14,"2018-07-01",,,"female","white","not hispanic or latino" +14,"2018-06-01",,,"female","white", +14,"2018-06-01","ambulatory",,"female",, +13,"2018-07-01","ambulatory",,"female",, +13,"2018-07-01","ambulatory",,"female",,"not hispanic or latino" +13,"2018-07-01","ambulatory",,"female","white", +13,"2018-07-01","ambulatory",,"female","white","not hispanic or latino" +13,"2018-06-01",,,"female",,"not hispanic or latino" +13,"2018-06-01","ambulatory",,"female","white", +12,"2018-07-01",,,"male",, +12,"2018-07-01",,,"male","white", +12,"2018-06-01",,,"female","white","not hispanic or latino" +12,"2018-06-01","ambulatory",,"female",,"not hispanic or latino" +11,"2018-07-01","ambulatory",,"male",, +11,"2018-07-01","ambulatory",,"male","white", +11,"2018-06-01","ambulatory",,"female","white","not hispanic or latino" +10,"2018-07-01",,,"male",,"not hispanic or latino" +10,"2018-07-01",,,"male","white","not hispanic or latino" +10,"2018-07-01","ambulatory",,"male",,"not hispanic or latino" +10,"2018-07-01","ambulatory",,"male","white","not hispanic or latino" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv index bc2cc2d6..2b16c7e2 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv @@ -1,13 +1,13 @@ -cnt,class_display,priority_display,period_start_month +"cnt","class_display","priority_display","period_start_month" 50,,, -50,,cumulus__none, -46,ambulatory,, -46,ambulatory,cumulus__none, -26,,,2018-07-01 -26,,cumulus__none,2018-07-01 -24,,,2018-06-01 -24,,cumulus__none,2018-06-01 -24,ambulatory,,2018-07-01 -24,ambulatory,cumulus__none,2018-07-01 -22,ambulatory,,2018-06-01 -22,ambulatory,cumulus__none,2018-06-01 +50,,"cumulus__none", +46,"ambulatory",, +46,"ambulatory","cumulus__none", +26,,,"2018-07-01" +26,,"cumulus__none","2018-07-01" +24,,,"2018-06-01" +24,,"cumulus__none","2018-06-01" +24,"ambulatory",,"2018-07-01" +24,"ambulatory","cumulus__none","2018-07-01" +22,"ambulatory",,"2018-06-01" +22,"ambulatory","cumulus__none","2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv index 1ef0dae9..2dc35115 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv @@ -1,13 +1,13 @@ -cnt,class_display,serviceType_display,period_start_month +"cnt","class_display","serviceType_display","period_start_month" 50,,, -50,,cumulus__none, -46,ambulatory,, -46,ambulatory,cumulus__none, -26,,,2018-07-01 -26,,cumulus__none,2018-07-01 -24,,,2018-06-01 -24,,cumulus__none,2018-06-01 -24,ambulatory,,2018-07-01 -24,ambulatory,cumulus__none,2018-07-01 -22,ambulatory,,2018-06-01 -22,ambulatory,cumulus__none,2018-06-01 +50,,"cumulus__none", +46,"ambulatory",, +46,"ambulatory","cumulus__none", +26,,,"2018-07-01" +26,,"cumulus__none","2018-07-01" +24,,,"2018-06-01" +24,,"cumulus__none","2018-06-01" +24,"ambulatory",,"2018-07-01" +24,"ambulatory","cumulus__none","2018-07-01" +22,"ambulatory",,"2018-06-01" +22,"ambulatory","cumulus__none","2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv index bb5afe09..02621fa4 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv @@ -1,7 +1,7 @@ -cnt,class_display,type_display,period_start_month +"cnt","class_display","type_display","period_start_month" 50,,, -46,ambulatory,, -26,,,2018-07-01 -24,,,2018-06-01 -24,ambulatory,,2018-07-01 -22,ambulatory,,2018-06-01 +46,"ambulatory",, +26,,,"2018-07-01" +24,,,"2018-06-01" +24,"ambulatory",,"2018-07-01" +22,"ambulatory",,"2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv index 0c01d794..b0299974 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv @@ -1,13 +1,13 @@ -cnt,status,intent,authoredon_month,medication_display +"cnt","status","intent","authoredon_month","medication_display" 27,,,, -27,,order,, -26,stopped,,, -26,stopped,order,, -15,,,2018-07-01, -15,,order,2018-07-01, -15,stopped,,2018-07-01, -15,stopped,order,2018-07-01, -12,,,2018-06-01, -12,,order,2018-06-01, -11,stopped,,2018-06-01, -11,stopped,order,2018-06-01, +27,,"order",, +26,"stopped",,, +26,"stopped","order",, +15,,,"2018-07-01", +15,,"order","2018-07-01", +15,"stopped",,"2018-07-01", +15,"stopped","order","2018-07-01", +12,,,"2018-06-01", +12,,"order","2018-06-01", +11,"stopped",,"2018-06-01", +11,"stopped","order","2018-06-01", diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv index 51b9e49e..5555c1ec 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv @@ -1,15 +1,15 @@ -cnt,effectiveDateTime_month,observation_code,valueCodeableConcept_display,class_display +"cnt","effectiveDateTime_month","observation_code","valueCodeableConcept_display","class_display" 20,,,, -20,,,,ambulatory -10,,,Urine smell ammoniacal (finding), -10,,,Urine smell ammoniacal (finding),ambulatory -10,,,Brown color (qualifier value), -10,,,Brown color (qualifier value),ambulatory -10,,5778-6,, -10,,5778-6,,ambulatory -10,,5778-6,Brown color (qualifier value), -10,,5778-6,Brown color (qualifier value),ambulatory -10,,34533-0,, -10,,34533-0,,ambulatory -10,,34533-0,Urine smell ammoniacal (finding), -10,,34533-0,Urine smell ammoniacal (finding),ambulatory +20,,,,"ambulatory" +10,,,"Urine smell ammoniacal (finding)", +10,,,"Urine smell ammoniacal (finding)","ambulatory" +10,,,"Brown color (qualifier value)", +10,,,"Brown color (qualifier value)","ambulatory" +10,,"5778-6",, +10,,"5778-6",,"ambulatory" +10,,"5778-6","Brown color (qualifier value)", +10,,"5778-6","Brown color (qualifier value)","ambulatory" +10,,"34533-0",, +10,,"34533-0",,"ambulatory" +10,,"34533-0","Urine smell ammoniacal (finding)", +10,,"34533-0","Urine smell ammoniacal (finding)","ambulatory" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv index d9a72050..c19cc543 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv @@ -1,13 +1,13 @@ -cnt,gender,race_display,ethnicity_display +"cnt","gender","race_display","ethnicity_display" 50,,, -47,,white, -45,,,not hispanic or latino -43,,white,not hispanic or latino -29,female,, -28,female,white, -27,female,,not hispanic or latino -26,female,white,not hispanic or latino -21,male,, -19,male,white, -18,male,,not hispanic or latino -17,male,white,not hispanic or latino +47,,"white", +45,,,"not hispanic or latino" +43,,"white","not hispanic or latino" +29,"female",, +28,"female","white", +27,"female",,"not hispanic or latino" +26,"female","white","not hispanic or latino" +21,"male",, +19,"male","white", +18,"male",,"not hispanic or latino" +17,"male","white","not hispanic or latino" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv b/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv index 6d2ad6b7..2e2c4f61 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv @@ -1,2 +1,2 @@ -min_date,max_date +"min_date","max_date" 2018-06-01,2018-07-31 diff --git a/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv b/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv index 4d65fc4e..cbd90d88 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv @@ -1,2 +1,2 @@ -data_package_version +"data_package_version" 3 diff --git a/tests/test_databases.py b/tests/test_databases.py new file mode 100644 index 00000000..836c12d7 --- /dev/null +++ b/tests/test_databases.py @@ -0,0 +1,367 @@ +"""Low level database tests + +This is intended to exercise edge cases not covered via more integrated testing""" + +import datetime +import os +import pathlib +from contextlib import nullcontext as does_not_raise +from unittest import mock + +import pandas +import pyarrow +import pytest + +from cumulus_library import databases, errors + +ATHENA_KWARGS = { + "region": "test", + "work_group": "test", + "profile": "test", + "schema_name": "test", +} +DUCKDB_KWARGS = { + "db_file": ":memory:", +} + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "db,data,expected,raises", + [ + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + pandas.DataFrame( + { + "str": ["str"], + "int": [123], + "float": [1.23], + "bool": [True], + "datetime": [datetime.datetime.now()], + } + ), + ["STRING", "INT", "DOUBLE", "BOOLEAN", "TIMESTAMP"], + does_not_raise(), + ), + ( + databases.DuckDatabaseBackend(**DUCKDB_KWARGS), + pandas.DataFrame( + { + "str": ["str"], + "int": [123], + "float": [1.23], + "bool": [True], + "datetime": [datetime.datetime.now()], + } + ), + [], + does_not_raise(), + ), + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + pandas.DataFrame({"cat": pandas.Series(["a"], dtype="category")}), + ["STRING", "INT", "DOUBLE", "BOOLEAN", "TIMESTAMP"], + pytest.raises(errors.CumulusLibraryError), + ), + ], +) +def test_col_types_from_pandas(db, data, expected, raises): + with raises: + vals = db.col_parquet_types_from_pandas(data.dtypes) + assert set(expected) == set(vals) + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "db,data,expected,raises", + [ + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + [ + ( + "a", + "varchar", + ), + ( + "b", + "bigint", + ), + ( + "c", + "integer", + ), + ( + "d", + "double", + ), + ( + "e", + "boolean", + ), + ( + "f", + "date", + ), + ("g", "timestamp"), + ], + [ + ( + "a", + pyarrow.string(), + ), + ( + "b", + pyarrow.int64(), + ), + ( + "c", + pyarrow.int64(), + ), + ( + "d", + pyarrow.float64(), + ), + ( + "e", + pyarrow.bool_(), + ), + ( + "f", + pyarrow.date64(), + ), + ("g", pyarrow.timestamp("s")), + ], + does_not_raise(), + ), + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + [("a", "other_type")], + [], + pytest.raises(errors.CumulusLibraryError), + ), + ( + databases.DuckDatabaseBackend(**DUCKDB_KWARGS), + [ + ( + "a", + "STRING", + ), + ( + "b", + "INTEGER", + ), + ( + "c", + "NUMBER", + ), + ( + "d", + "DOUBLE", + ), + ( + "e", + "boolean", + ), + ( + "f", + "Date", + ), + ("g", "TIMESTAMP"), + ], + [ + ( + "a", + pyarrow.string(), + ), + ( + "b", + pyarrow.int64(), + ), + ( + "c", + pyarrow.float64(), + ), + ( + "d", + pyarrow.float64(), + ), + ( + "e", + pyarrow.bool_(), + ), + ( + "f", + pyarrow.date64(), + ), + ("g", pyarrow.timestamp("s")), + ], + does_not_raise(), + ), + ( + databases.DuckDatabaseBackend(**DUCKDB_KWARGS), + [("a", "other_type")], + [], + pytest.raises(errors.CumulusLibraryError), + ), + ], +) +def test_pyarrow_types_from_sql(db, data, expected, raises): + with raises: + vals = db.col_pyarrow_types_from_sql(data) + assert len(expected) == len(vals) + for index in range(0, len(vals)): + assert vals[index][-1] == expected[index][-1] + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "args,expected_type, raises", + [ + ( + {**{"db_type": "duckdb", "schema_name": "test"}, **DUCKDB_KWARGS}, + databases.DuckDatabaseBackend, + does_not_raise(), + ), + ( + {**{"db_type": "athena"}, **ATHENA_KWARGS}, + databases.AthenaDatabaseBackend, + does_not_raise(), + ), + ( + {**{"db_type": "athena", "load_ndjson_dir": "file.json"}, **ATHENA_KWARGS}, + databases.AthenaDatabaseBackend, + pytest.raises(SystemExit), + ), + ( + # https://en.wikipedia.org/wiki/Cornerstone_(software) + {**{"db_type": "cornerstone", "schema_name": "test"}}, + None, + pytest.raises(errors.CumulusLibraryError), + ), + ], +) +def test_create_db_backend(args, expected_type, raises): + with raises: + db = databases.create_db_backend(args) + assert isinstance(db, expected_type) + + +def test_upload_file_default(): + db = databases.DuckDatabaseBackend(**DUCKDB_KWARGS) + location = db.upload_file( + file=pathlib.Path(__file__).resolve(), + study="test", + topic="table", + ) + assert location is None + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "args,sse,keycount,expected,raises", + [ + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": False, + }, + "SSE_KMS", + 1, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": False, + }, + "SSE_KMS", + 0, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": False, + }, + "SSE-S3", + 0, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + pytest.raises(errors.AWSError), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": True, + }, + "SSE_KMS", + 1, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": "custom.name", + "force_upload": False, + }, + "SSE_KMS", + 0, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ], +) +@mock.patch("botocore.client") +def test_upload_file_athena(mock_botocore, args, sse, keycount, expected, raises): + mock_data = { + "WorkGroup": { + "Configuration": { + "ResultConfiguration": { + "OutputLocation": "s3://test_bucket/test_location/", + "EncryptionConfiguration": {"EncryptionOption": sse}, + } + } + } + } + mock_clientobj = mock_botocore.ClientCreator.return_value.create_client.return_value + mock_clientobj.get_work_group.return_value = mock_data + mock_clientobj.list_objects_v2.return_value = {"KeyCount": keycount} + db = databases.AthenaDatabaseBackend(**ATHENA_KWARGS) + with raises: + location = db.upload_file(**args) + assert location == expected + if keycount == 0 or args["force_upload"]: + assert mock_clientobj.put_object.called + kwargs = mock_clientobj.put_object.call_args_list[0][1] + if args["remote_filename"]: + assert kwargs["Key"].endswith(args["remote_filename"]) + else: + assert kwargs["Key"].endswith(args["file"].name)