From 65d6aca73534c6f65ef7a22ad68d26cb097ebe3c Mon Sep 17 00:00:00 2001 From: Zakariyya Mughal Date: Wed, 10 Jan 2024 17:05:29 -0500 Subject: [PATCH 1/2] Set input file encoding to latin1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, strings in the output Parquet files are not valid UTF-8. This causes errors when the output is read by tools such as DuckDB: ```shell $ duckdb -c "$(cat <<'EOF' > SELECT DISTINCT chnm FROM 'brick/invitrodb.parquet/part-*.parquet'; > EOF > )" Error: Invalid Input Error: Invalid string encoding found in Parquet file: value "(\xB1)-cis-3-Methyl fentanyl hydrochloride" is not valid UTF8! ``` or with `pyarrow` when converting to `pandas`: ``` pyarrow.lib.ArrowException: Unknown error: Wrapping (�)-Bornyl acetate failed ``` --- stages/download.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stages/download.R b/stages/download.R index ecdcd5e..fca14d7 100644 --- a/stages/download.R +++ b/stages/download.R @@ -8,7 +8,9 @@ invitrodb = "https://clowder.edap-cluster.com/files/63642290e4b04f6bb140a10d/blo options(timeout = 600) download.file(invitrodb, destfile = stage, mode = "wb") -df = readr::read_csv(stage) +# Need encoding due to the ± character: +# $ perl -F, -nE 'next unless /[^\x00-\x7F]/; say $F[3]' staging/invitrodb.csv | sort | uniq -c +df = readr::read_csv( file = stage, locale = readr::locale(encoding = "latin1") ) out = fs::dir_create("brick/invitrodb.parquet") # See From 79147a935db3d14abb5aa590ef73676f07b5a776 Mon Sep 17 00:00:00 2001 From: Zakariyya Mughal Date: Wed, 10 Jan 2024 19:51:16 -0500 Subject: [PATCH 2/2] Update dvc.lock --- dvc.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dvc.lock b/dvc.lock index 0b029e0..2acdb68 100644 --- a/dvc.lock +++ b/dvc.lock @@ -19,11 +19,11 @@ stages: size: 33866 - path: stages/download.R hash: md5 - md5: fde14a12dfe43458e11081702c2d3625 - size: 831 + md5: 
ec800973d471bab40b8abda8e60f39cc + size: 1025 outs: - path: brick/invitrodb.parquet hash: md5 - md5: 79a0001b906d40f06a0efa2fea6a4bc1.dir - size: 718228047 + md5: 110a6e966af64a8ad0c686b7bf8c908e.dir + size: 718228060 nfiles: 4