diff --git a/notebooks/api/01_utils.ipynb b/notebooks/api/01_utils.ipynb
index 5152153..242206f 100644
--- a/notebooks/api/01_utils.ipynb
+++ b/notebooks/api/01_utils.ipynb
@@ -762,9 +762,21 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "plpy",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/api/02a_pds.indexes.ipynb b/notebooks/api/02a_pds.indexes.ipynb
index ef6c98f..400e1ff 100644
--- a/notebooks/api/02a_pds.indexes.ipynb
+++ b/notebooks/api/02a_pds.indexes.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "latest-oakland",
    "metadata": {},
    "outputs": [],
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "nervous-hunter",
    "metadata": {},
    "outputs": [],
@@ -32,7 +32,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "commercial-pension",
    "metadata": {},
    "outputs": [],
@@ -42,7 +42,7 @@
     "from datetime import datetime\n",
     "from urllib.parse import urlsplit, urlunsplit\n",
     "from urllib.request import URLError\n",
-    "\n",
+    "from dask import dataframe as dd\n",
     "import tomlkit as toml\n",
     "from dateutil import parser\n",
     "from dateutil.parser import ParserError\n",
@@ -55,7 +55,7 @@
     "from planetarypy.config import config\n",
     "from planetarypy.pds.ctx_index import CTXIndex\n",
     "from planetarypy.pds.lroc_index import LROCIndex\n",
-    "from planetarypy.pds.utils import IndexLabel, fix_hirise_edrcumindex\n",
+    "from planetarypy.pds.utils import IndexLabel, fix_hirise_edrcumindex, convert_times\n",
     "\n",
     "logger = logging.getLogger(__name__)\n",
     "\n",
@@ -64,7 +64,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "3bfb2fdb-fa60-4ab1-80a0-e847d0d95a74",
    "metadata": {},
    "outputs": [],
@@ -75,17 +75,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
+   "id": "0e87c9bf-dee7-422b-94a1-88fc5a56ac4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "pd.set_option('display.max_columns', 100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
    "id": "welcome-humanitarian",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "Path('/home/ayek72/mnt/slowdata/planetarypy')"
+       "Path('/media/ayek72/data/planetarypy')"
       ]
      },
-     "execution_count": null,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -97,7 +108,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "id": "developed-entertainment",
    "metadata": {},
    "outputs": [],
@@ -233,14 +244,17 @@
     "    def label(self):\n",
     "        return IndexLabel(self.local_label_path)\n",
     "\n",
-    "    def read_index_data(self, convert_times=True):\n",
-    "        df = self.label.read_index_data(convert_times=convert_times)\n",
+    "    def read_index_data(self, do_convert_times=True):\n",
+    "        df = self.label.read_index_data(do_convert_times=do_convert_times)\n",
     "        return df\n",
     "\n",
     "    def convert_to_parquet(self):\n",
+    "        print(\"Reading index to memory for conversion to parquet. Will take up lots of memory for a bit.\")\n",
     "        df = self.read_index_data()\n",
     "        df = df.convert_dtypes()\n",
+    "        print(\"Storing into parquet.\")\n",
     "        df.to_parquet(self.local_parq_path)\n",
+    "        print(\"Finished. Enjoy your freshly baked PDS Index. :\")\n",
     "\n",
     "    def __str__(self):\n",
     "        s = f\"Key: {self.key}\\n\"\n",
@@ -254,7 +268,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "id": "865aff5d-e3eb-4e6a-8952-8992f7b70800",
    "metadata": {},
    "outputs": [],
@@ -279,14 +293,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "id": "74b468af-9fb1-46e6-ae49-7769af8ed57b",
    "metadata": {},
    "outputs": [],
    "source": [
     "#| export\n",
     "@patch\n",
-    "def download(self:Index):\n",
+    "def download(\n",
+    "    self:Index, # the Index object defined in this module\n",
+    "    convert_to_parquet:bool=True, # set to False if you just want download the files\n",
+    "):\n",
     "    \"\"\"Wrapping URLs for downloading PDS indices and their label files.\"\"\"\n",
     "    # check timestamp\n",
     "    label_url = self.url\n",
@@ -295,22 +312,23 @@
     "    logger.info(\"Downloading %s.\", self.table_url)\n",
     "    utils.url_retrieve(self.table_url, self.local_table_path)\n",
     "    print(f\"Downloaded {self.local_label_path} and {self.local_table_path}\")\n",
-    "    if (\n",
-    "        self.key == \"missions.mro.hirise.indexes.edr\"\n",
-    "    ): # HiRISE EDR index is broken on the PDS. Team knows.\n",
-    "        print(\"Fixing broken EDR index...\")\n",
-    "        fix_hirise_edrcumindex(\n",
-    "            self.local_table_path, self.local_table_path.with_name(\"temp.tab\")\n",
-    "        )\n",
-    "        self.local_table_path.with_name(\"temp.tab\").rename(self.local_table_path)\n",
+    "    # if (\n",
+    "    #     self.key == \"missions.mro.hirise.indexes.edr\"\n",
+    "    # ): # HiRISE EDR index is broken on the PDS. 
Team knows.\n", + " # print(\"Fixing broken EDR index...\")\n", + " # fix_hirise_edrcumindex(\n", + " # self.local_table_path, self.local_table_path.with_name(\"temp.tab\")\n", + " # )\n", + " # self.local_table_path.with_name(\"temp.tab\").rename(self.local_table_path)\n", " self.timestamp = self.remote_timestamp\n", " self.update_timestamp()\n", - " self.convert_to_parquet()" + " if convert_to_parquet:\n", + " self.convert_to_parquet()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "329db7ae-b55a-4b0c-b59a-f7755726b5a1", "metadata": {}, "outputs": [], @@ -327,30 +345,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "4de51298-40f3-475d-9bdf-24d6a110c82e", "metadata": {}, "outputs": [], "source": [ "key = \"missions.cassini.iss.indexes.moon_summary\"\n", - "key = \"missions.lro.lroc.edr\"" + "key = \"missions.mro.hirise.edr\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "238e9b89-c0f9-4548-a641-e1a070454f1f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Key: missions.lro.lroc.indexes.edr\n", - "URL: http://pds.lroc.asu.edu/data/LRO-L-LROC-2-EDR-V1.0/LROLRC_0048C/INDEX/CUMINDEX.LBL\n", - "Timestamp: 2023-10-10 15:30:56.195462" + "Key: missions.mro.hirise.indexes.edr\n", + "URL: https://hirise-pds.lpl.arizona.edu/PDS/INDEX/EDRCUMINDEX.LBL\n", + "Timestamp: 2024-04-30 21:31:58" ] }, - "execution_count": null, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -362,17 +380,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "e32c42b6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'missions.lro.lroc.indexes.edr'" + "'missions.mro.hirise.indexes.edr'" ] }, - "execution_count": null, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -383,17 +401,68 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, + "id": "70970ef8-ad63-41fc-a3f6-8df070fafad4", + "metadata": {}, + "outputs": [], + "source": [ + "# this will ALWAYS download!!\n", + "#index.download(convert_to_parquet=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "1bc7c9df-ceaa-4eab-84f8-7c739b05d3f8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading index to memory for conversion to parquet. Will take up lots of memory for a bit.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "df9a70d9536549869b371ed0274d7c80", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading index in chunks: 0%| | 0/459 [00:00 bool: # Boolean indicating if there's a new index diff --git a/planetarypy/pds/utils.py b/planetarypy/pds/utils.py index 3374c18..df76418 100644 --- a/planetarypy/pds/utils.py +++ b/planetarypy/pds/utils.py @@ -1,17 +1,18 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../notebooks/api/02f_pds.utils.ipynb. # %% auto 0 -__all__ = ['IndexLabel', 'index_to_df', 'PVLColumn', 'decode_line', 'find_mixed_type_cols', 'fix_hirise_edrcumindex'] +__all__ = ['IndexLabel', 'convert_times', 'index_to_df', 'PVLColumn', 'decode_line', 'find_mixed_type_cols', + 'fix_hirise_edrcumindex'] # %% ../../notebooks/api/02f_pds.utils.ipynb 3 import warnings from typing import Union -from tqdm.auto import tqdm - import pandas as pd import pvl from fastcore.utils import Path +from tqdm.auto import tqdm + from .. 
import utils
 
 # %% ../../notebooks/api/02f_pds.utils.ipynb 4
@@ -81,10 +82,24 @@ def colspecs(self):
                 colspecs.extend(pvlcol.colspecs)
         return colspecs
 
-    def read_index_data(self, convert_times=True):
-        return index_to_df(self.index_path, self, convert_times=convert_times)
+    def read_index_data(self, do_convert_times=True):
+        return index_to_df(self.index_path, self, do_convert_times=do_convert_times)
 
 # %% ../../notebooks/api/02f_pds.utils.ipynb 5
+def convert_times(df):
+    for column in [col for col in df.columns if "TIME" in col]:
+        if column in ["LOCAL_TIME", "DWELL_TIME"]:
+            continue
+        try:
+            df[column] = pd.to_datetime(df[column])
+        except ValueError:
+            df[column] = pd.to_datetime(
+                df[column], format=utils.nasa_dt_format_with_ms, errors="coerce"
+            )
+    print("Convert time strings to datetime objects.")
+    return df
+
+# %% ../../notebooks/api/02f_pds.utils.ipynb 6
 def index_to_df(
     # Path to the index TAB file
     indexpath: Union[str, Path],
@@ -92,7 +107,7 @@
     # 'colnames' and 'colspecs'
     label: IndexLabel,
     # Switch to control if to convert columns with "TIME" in name (unless COUNT is as well in name) to datetime
-    convert_times=True,
+    do_convert_times=True,
 ):
     """The main reader function for PDS Indexfiles.
 
@@ -100,21 +115,27 @@
     this reader should work for all PDS TAB files.
     """
     indexpath = Path(indexpath)
-    df = pd.read_csv(indexpath,header=None,names=label.colnames,)
-    if convert_times:
-        for column in [col for col in df.columns if "TIME" in col]:
-            if column in ["LOCAL_TIME", "DWELL_TIME"]:
-                continue
-            try:
-                df[column] = pd.to_datetime(df[column])
-            except ValueError:
-                df[column] = pd.to_datetime(df[column], format=utils.nasa_dt_format_with_ms, errors="coerce")
-            except KeyError:
-                raise KeyError(f"{column} not in {df.columns}")
-        print("Done.")
+    # get n_lines fast for progress bar
+    with open(indexpath, "rb") as f:  # courtesy of https://stackoverflow.com/a/1019572
+        num_lines = sum(1 for _ in f)
+    chunksize = 5000
+    df = pd.concat(
+        [
+            chunk
+            for chunk in tqdm(
+                pd.read_csv(
+                    indexpath, header=None, names=label.colnames, chunksize=chunksize
+                ),
+                total=int(num_lines / chunksize),
+                desc="Loading index in chunks",
+            )
+        ]
+    )
+    if do_convert_times:
+        df = convert_times(df)
     return df
 
-# %% ../../notebooks/api/02f_pds.utils.ipynb 6
+# %% ../../notebooks/api/02f_pds.utils.ipynb 7
 class PVLColumn:
     "Manages just one of the columns in a table that is described via PVL."
 
@@ -173,17 +194,19 @@ def decode(self, linedata):
             return linedata[start:stop]
         else:
             bucket = []
-            for (start, stop) in self.colspecs:
+            for start, stop in self.colspecs:
                 bucket.append(linedata[start:stop])
             return bucket
 
     def __repr__(self):
         return self.pvlobj.__repr__()
 
-# %% ../../notebooks/api/02f_pds.utils.ipynb 7
+# %% ../../notebooks/api/02f_pds.utils.ipynb 8
 def decode_line(
-    linedata: str, # One line of a .tab data file
-    labelpath: Union[str, Path], # Path to the appropriate label that describes the data.
+    linedata: str,  # One line of a .tab data file
+    labelpath: Union[
+        str, Path
+    ],  # Path to the appropriate label that describes the data.
 ):
     "Decode one line of tabbed data with the appropriate label file."
     label = IndexLabel(labelpath)
@@ -191,7 +214,7 @@ def decode_line(
         pvlcol = PVLColumn(column)
         print(pvlcol.name, pvlcol.decode(linedata))
 
-# %% ../../notebooks/api/02f_pds.utils.ipynb 8
+# %% ../../notebooks/api/02f_pds.utils.ipynb 9
 def find_mixed_type_cols(
     # Dataframe to be searched for mixed data-types
     df: pd.DataFrame,
@@ -214,10 +237,10 @@ def find_mixed_type_cols(
         df[col].fillna("UNKNOWN", inplace=True)
     return result
 
-# %% ../../notebooks/api/02f_pds.utils.ipynb 9
+# %% ../../notebooks/api/02f_pds.utils.ipynb 10
 def fix_hirise_edrcumindex(
-    infname: Union[str, Path], # Path to broken EDRCUMINDEX.TAB
-    outfname: Union[str, Path], # Path where to store the fixed TAB file
+    infname: Union[str, Path],  # Path to broken EDRCUMINDEX.TAB
+    outfname: Union[str, Path],  # Path where to store the fixed TAB file
 ):
     """Fix HiRISE EDRCUMINDEX.