diff --git a/.nojekyll b/.nojekyll
index 4ff88a3..3889da9 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-80e1cb7c
\ No newline at end of file
+3fa9d1f5
\ No newline at end of file
diff --git a/aemo_data.html b/aemo_data.html
index 129c3d6..70fcd81 100644
--- a/aemo_data.html
+++ b/aemo_data.html
@@ -20,6 +20,40 @@
 margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
 vertical-align: middle;
 }
+/* CSS for syntax highlighting */
+pre > code.sourceCode { white-space: pre; position: relative; }
+pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
+pre > code.sourceCode > span:empty { height: 1.2em; }
+.sourceCode { overflow: visible; }
+code.sourceCode > span { color: inherit; text-decoration: inherit; }
+div.sourceCode { margin: 1em 0; }
+pre.sourceCode { margin: 0; }
+@media screen {
+div.sourceCode { overflow: auto; }
+}
+@media print {
+pre > code.sourceCode { white-space: pre-wrap; }
+pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
+}
+pre.numberSource code
+  { counter-reset: source-line 0; }
+pre.numberSource code > span
+  { position: relative; left: -4em; counter-increment: source-line; }
+pre.numberSource code > span > a:first-child::before
+  { content: counter(source-line);
+    position: relative; left: -1em; text-align: right; vertical-align: baseline;
+    border: none; display: inline-block;
+    -webkit-touch-callout: none; -webkit-user-select: none;
+    -khtml-user-select: none; -moz-user-select: none;
+    -ms-user-select: none; user-select: none;
+    padding: 0 4px; width: 4em;
+  }
+pre.numberSource { margin-left: 3em; padding-left: 4px; }
+div.sourceCode
+  { }
+@media screen {
+pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
+}
@@ -150,7 +184,16 @@
@@ -172,8 +215,125 @@

AEMO Data Snippets

+
+

Dividing large AEMO Data CSVs into parquet partitions

+

This script can be run from the command line to divide a large AEMO data CSV (e.g. a table from the Monthly Data Archive, such as rebids in BIDPEROFFER) into Parquet partitions. This makes the data far easier to analyse with packages such as Dask.

+
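Once partitioned, the chunks can be read back lazily as a single DataFrame. As a minimal sketch (assuming Dask is installed and that the script below has written its partitions to a hypothetical parquet_chunks directory):

import dask.dataframe as dd

# Lazily reads every Parquet partition in the directory as one logical DataFrame;
# individual partitions are only loaded when a computation needs them
df = dd.read_parquet("parquet_chunks/*.parquet")
print(df.head())  # materialises only the first partition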

It assumes that the second row of the file is the header (i.e. the column names) for a single data table, with the data itself starting on the third row.

+
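As an illustration of this layout (the file name here is hypothetical), the header row can be pulled out on its own with pandas, mirroring what the script's get_columns function does below:

import pandas as pd

# header=1 treats the file's second row as the column header;
# nrows=0 reads no data rows, so only the column index is returned
cols = pd.read_csv("PUBLIC_DVD_BIDPEROFFER_202301.csv", header=1, nrows=0).columns
print(list(cols))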
+

Requirements

+

Written using Python 3.11. Uses pathlib, type annotations and f-strings, so at least Python 3.6 is probably required, along with pandas (1.2+ for the read_csv context manager), a Parquet engine such as pyarrow or fastparquet, and tqdm.

+
+# Python script (executable via CLI) to create parquet partitions
+# for large AEMO data CSVs. Assumes the table header is on the second line and
+# that only one table type is in the file
+#
+# Copyright (C) 2023 Abhijith Prakash
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+import pandas as pd
+from tqdm import tqdm
+
+
+def arg_parser():
+    description = (
+        "Chunk large monthly AEMO data table CSVs into parquet partitions. "
+        + "Assumes that the table header is in the 2nd row"
+    )
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument(
+        "-file", type=str, required=True, help=("File to process. Must be CSV")
+    )
+    parser.add_argument(
+        "-output_dir",
+        type=str,
+        required=True,
+        help=(
+            "Directory to write parquet chunks to. Will be created if it does not exist"
+        ),
+    )
+    parser.add_argument(
+        "-chunksize",
+        type=int,
+        default=10**6,
+        help=("Size of each DataFrame chunk (# of lines). Default 10^6"),
+    )
+    args = parser.parse_args()
+    return args
+
+
+def get_columns(file_path: Path) -> pd.Index:
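+    # header=1 with nrows=0 reads only the column header (the file's second
+    # row) and no data rows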
+    col_df = pd.read_csv(file_path, header=1, nrows=0)
+    return col_df.columns
+
+
+def estimate_size_of_lines(file_path: Path, columns: pd.Index) -> float:
+    # Sample the first 1000 data rows to estimate the in-memory size of one line
+    sample_size = 1000
+    sample = pd.read_csv(file_path, skiprows=2, nrows=sample_size, header=None)
+    sample.columns = columns
+    total_size = sample.memory_usage().sum()
+    size_per_line = total_size / len(sample)
+    return size_per_line
+
+
+def chunk_file(file_path: Path, output_dir: Path, chunksize: int) -> None:
+    if file_path.suffix.lower() != ".csv":
+        logging.error("File is not a CSV")
+        sys.exit(1)
+    cols = get_columns(file_path)
+    size_per_line = estimate_size_of_lines(file_path, cols)
+    file_size = file_path.stat().st_size
+    file_stem = file_path.stem
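+    # chunksize makes read_csv return an iterator of DataFrame chunks,
+    # so the full CSV is never held in memory at once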
+    with pd.read_csv(file_path, chunksize=chunksize, skiprows=2, header=None) as reader:
+        with tqdm(total=file_size, desc="Progress estimate based on file size") as pbar:
+            for i, chunk in enumerate(reader):
+                chunk.columns = cols
+                out_file = Path(file_stem + f"_chunk{i}.parquet")
+                chunk.to_parquet(output_dir / out_file)
+                # See here for comparison of pandas DataFrame size vs CSV size:
+                # https://stackoverflow.com/questions/18089667/how-to-estimate-how-much-memory-a-pandas-dataframe-will-need#32970117
+                pbar.update((size_per_line * chunksize) / 2)
+
+
+def main():
+    logging.basicConfig(format="\n%(levelname)s:%(message)s", level=logging.INFO)
+    args = arg_parser()
+    f = Path(args.file)
+    output_dir = Path(args.output_dir)
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True)
+    elif any(output_dir.glob(f.stem + "*.parquet")):
+        logging.error("Pre-existing chunks of this file in output directory. Exiting.")
+        sys.exit(1)
+    if not f.exists():
+        logging.error("Path does not exist")
+        sys.exit(1)
+    if not f.is_file():
+        logging.error("Path provided does not point to a file")
+        sys.exit(1)
+    chunk_file(f, output_dir, args.chunksize)
+
+
+if __name__ == "__main__":
+    main()
+
+
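Assuming the script is saved as chunk_aemo_csv.py (name hypothetical), an invocation might look like:

python chunk_aemo_csv.py -file PUBLIC_DVD_BIDPEROFFER_202301.csv -output_dir parquet_chunks -chunksize 500000

The flags map to the argparse arguments defined above; -chunksize is optional and defaults to 10^6 lines per chunk.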