75 handle pre built json objects (#92)
* handling pre-built jsons for grid and point data

* refactoring loading tests

* updating point loading example

* loading examples updates

* grid example typo

* example notebook updates, get_dataset credentials

* docstring updates

* version, readme and changelog updates

* version bump, kerchunk method name, remove filecmp

* version bump commit

* fix changelog

---------

Co-authored-by: Sam Lamont <[email protected]>
samlamont and Sam Lamont authored Dec 8, 2023
1 parent efe3c9d commit 1a11e98
Showing 39 changed files with 1,396 additions and 2,334 deletions.
16 changes: 15 additions & 1 deletion CHANGELOG.md
@@ -1,10 +1,24 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.2.9] - 2023-12-08

### Added
* Three options for handling kerchunk JSONs (see the sketch below):
  * `local` - (default) preserves the previous behavior: manually builds the JSONs from GCS netCDF files using Kerchunk's `SingleHdf5ToZarr`. Any locally existing JSONs are used before new ones are created from the remote store.
  * `remote` - uses pre-built JSONs, skipping any that do not exist within the specified time frame. JSONs are read directly from S3 using `fsspec`.
  * `auto` - uses pre-built JSONs, creating any that do not exist within the specified time frame.
* Adds `nwm_version` (`nwm22` or `nwm30`) and `data_source` (`GCS`, `NOMADS`, `DSTOR`; currently only `GCS` is implemented) as loading arguments.
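
A minimal sketch of how these new arguments might be passed when loading NWM point data. The module path, function name, and the `kerchunk_method` keyword are assumptions inferred from this changelog and the commit message ("kerchunk method name"), not a confirmed API:

```python
# Hypothetical usage sketch; names marked below are assumptions.
import teehr.loading.nwm.nwm_points as tlp  # module path assumed

tlp.nwm_to_parquet(                  # function name assumed
    configuration="analysis_assim",
    output_type="channel_rt",
    variable_name="streamflow",
    start_date="2023-12-08",
    ingest_days=1,
    location_ids=[7086109],
    json_dir="/tmp/jsons",
    output_parquet_dir="/tmp/parquet",
    nwm_version="nwm30",             # new in 0.2.9: "nwm22" or "nwm30"
    data_source="GCS",               # new in 0.2.9: only GCS implemented
    kerchunk_method="auto",          # "local" (default) | "remote" | "auto"
)
```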

### Changed
* Combines the loading modules into a single directory, `loading/nwm`.
* Updates the loading example notebooks.
* Updates the loading tests.

## [0.2.8] - 2023-11-14

### Added
6 changes: 3 additions & 3 deletions README.md
@@ -25,7 +25,7 @@ $ python3 -m pip install --upgrade pip
# Build and install from source
$ python3 -m pip install --upgrade build
$ python -m build
$ python -m pip install dist/teehr-0.2.8.tar.gz
$ python -m pip install dist/teehr-0.2.9.tar.gz
```

Install from GitHub
@@ -35,8 +35,8 @@ $ pip install 'teehr @ git+https://github.com/RTIInternational/teehr@[BRANCH_TAG

Use Docker
```bash
$ docker build -t teehr:v0.2.8 .
$ docker run -it --rm --volume $HOME:$HOME -p 8888:8888 teehr:v0.2.8 jupyter lab --ip 0.0.0.0 $HOME
$ docker build -t teehr:v0.2.9 .
$ docker run -it --rm --volume $HOME:$HOME -p 8888:8888 teehr:v0.2.9 jupyter lab --ip 0.0.0.0 $HOME
```

## Examples
644 changes: 89 additions & 555 deletions examples/loading/grid_loading_example.ipynb

Large diffs are not rendered by default.

595 changes: 64 additions & 531 deletions examples/loading/point_loading_example.ipynb

Large diffs are not rendered by default.

File renamed without changes.
236 changes: 236 additions & 0 deletions src/teehr/loading/nwm/const.py
@@ -0,0 +1,236 @@
from datetime import datetime

import numpy as np

NWM_BUCKET = "national-water-model"
NWM22_UNIT_LOOKUP = {"m3 s-1": "m3/s"}
NWM30_START_DATE = datetime(2023, 9, 19, 0)
NWM_S3_JSON_PATH = "s3://ciroh-nwm-zarr-copy"


NWM22_ANALYSIS_CONFIG = {
"analysis_assim": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim",
},
"analysis_assim_no_da": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_no_da",
},
"analysis_assim_extend": {
"num_lookback_hrs": 28,
"cycle_z_hours": [16],
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_extend",
},
"analysis_assim_extend_no_da": {
"num_lookback_hrs": 28,
"cycle_z_hours": [16],
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_extend_no_da",
},
"analysis_assim_long": {
"num_lookback_hrs": 12,
"cycle_z_hours": np.arange(0, 24, 6),
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_long",
},
"analysis_assim_long_no_da": {
"num_lookback_hrs": 12,
"cycle_z_hours": np.arange(0, 24, 6),
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_long_no_da",
},
"analysis_assim_hawaii": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "hawaii",
"configuration_name_in_filepath": "analysis_assim",
},
"analysis_assim_hawaii_no_da": {
"num_lookback_hrs": 3,
"cycle_cycle_z_hourstimes": np.arange(0, 24, 1),
"domain": "hawaii",
"configuration_name_in_filepath": "analysis_assim_no_da",
},
"analysis_assim_puertorico": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "puertorico",
"configuration_name_in_filepath": "analysis_assim",
},
"analysis_assim_puertorico_no_da": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "puertorico",
"configuration_name_in_filepath": "analysis_assim_no_da",
},
"forcing_analysis_assim": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim",
},
"forcing_analysis_assim_extend": {
"num_lookback_hrs": 28,
"cycle_z_hours": [16],
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_extend",
},
"forcing_analysis_assim_hawaii": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "hawaii",
"configuration_name_in_filepath": "analysis_assim",
},
"forcing_analysis_assim_puertorico": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "puertorico",
"configuration_name_in_filepath": "analysis_assim",
},
}

NWM30_ANALYSIS_CONFIG = {
"analysis_assim": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim",
},
"analysis_assim_no_da": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_no_da",
},
"analysis_assim_extend": {
"num_lookback_hrs": 28,
"cycle_z_hours": [16],
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_extend",
},
"analysis_assim_extend_no_da": {
"num_lookback_hrs": 28,
"cycle_z_hours": [16],
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_extend_no_da",
},
"analysis_assim_long": {
"num_lookback_hrs": 12,
"cycle_z_hours": np.arange(0, 24, 6),
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_long",
},
"analysis_assim_long_no_da": {
"num_lookback_hrs": 12,
"cycle_z_hours": np.arange(0, 24, 6),
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_long_no_da",
},
"analysis_assim_alaska": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "alaska",
"configuration_name_in_filepath": "analysis_assim",
},
"analysis_assim_alaska_no_da": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "alaska",
"configuration_name_in_filepath": "analysis_assim_no_da",
},
"analysis_assim_extend_alaska": {
"num_lookback_hrs": 32,
"cycle_z_hours": [20],
"domain": "alaska",
"configuration_name_in_filepath": "analysis_assim_extend",
},
"analysis_assim_extend_alaska_no_da": {
"num_lookback_hrs": 32,
"cycle_z_hours": [20],
"domain": "alaska",
"configuration_name_in_filepath": "analysis_assim_extend_no_da",
},
"analysis_assim_hawaii": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "hawaii",
"configuration_name_in_filepath": "analysis_assim",
},
"analysis_assim_hawaii_no_da": {
"num_lookback_hrs": 3,
"cycle_cycle_z_hourstimes": np.arange(0, 24, 1),
"domain": "hawaii",
"configuration_name_in_filepath": "analysis_assim_no_da",
},
"analysis_assim_puertorico": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "puertorico",
"configuration_name_in_filepath": "analysis_assim",
},
"analysis_assim_puertorico_no_da": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "puertorico",
"configuration_name_in_filepath": "analysis_assim_no_da",
},
"forcing_analysis_assim": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim",
},
"forcing_analysis_assim_extend": {
"num_lookback_hrs": 28,
"cycle_z_hours": [16],
"domain": "conus",
"configuration_name_in_filepath": "analysis_assim_extend",
},
"forcing_analysis_assim_alaska": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "alaska",
"configuration_name_in_filepath": "analysis_assim",
},
"forcing_analysis_assim_hawaii": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "hawaii",
"configuration_name_in_filepath": "analysis_assim",
},
"forcing_analysis_assim_puertorico": {
"num_lookback_hrs": 3,
"cycle_z_hours": np.arange(0, 24, 1),
"domain": "puertorico",
"configuration_name_in_filepath": "analysis_assim",
},
}


# WKT strings extracted from NWM grids
CONUS_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]], \
PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\
PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-97.0],PARAMETER["standard_parallel_1",30.0],\
PARAMETER["standard_parallel_2",60.0],PARAMETER["latitude_of_origin",40.0],UNIT["Meter",1.0]]' # noqa

HI_NWM_WKT = 'PROJCS["Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\
PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\
PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-157.42],PARAMETER["standard_parallel_1",10.0],\
PARAMETER["standard_parallel_2",30.0],PARAMETER["latitude_of_origin",20.6],UNIT["Meter",1.0]]' # noqa

PR_NWM_WKT = 'PROJCS["Sphere_Lambert_Conformal_Conic",GEOGCS["GCS_Sphere",DATUM["D_Sphere",SPHEROID["Sphere",6370000.0,0.0]],\
PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["false_easting",0.0],\
PARAMETER["false_northing",0.0],PARAMETER["central_meridian",-65.91],PARAMETER["standard_parallel_1",18.1],\
PARAMETER["standard_parallel_2",18.1],PARAMETER["latitude_of_origin",18.1],UNIT["Meter",1.0]]' # noqa

AL_NWM_WKT = 'PROJCS["Sphere_Stereographic",GEOGCS["Sphere",DATUM["Sphere",SPHEROID["unnamed",6370000,0]], \
PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]]], \
PROJECTION["Polar_Stereographic"],PARAMETER["latitude_of_origin",60],PARAMETER["central_meridian",-135], \
PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1],AXIS["Easting",SOUTH], \
AXIS["Northing",SOUTH]]' # noqa
@@ -1,11 +1,13 @@
import dask
from pathlib import Path
from typing import Dict, List
from typing import Dict, List, Tuple
import re

import dask
import numpy as np
import pandas as pd
import xarray as xr

from teehr.loading.nwm_common.utils_nwm import get_dataset, write_parquet_file
from teehr.loading.nwm.utils import get_dataset, write_parquet_file


def compute_zonal_mean(
@@ -34,7 +36,7 @@ def compute_zonal_mean(

@dask.delayed
def process_single_file(
singlefile: str,
row: Tuple,
configuration: str,
variable_name: str,
weights_filepath: str,
@@ -43,17 +45,20 @@
):
"""Compute zonal mean for a single json reference file and format
to a dataframe using the TEEHR data model"""
ds = get_dataset(singlefile, ignore_missing_file)
ds = get_dataset(
row.filepath,
ignore_missing_file,
target_options={'anon': True}
)
if not ds:
return None
filename = Path(singlefile).name
yrmoday = filename.split(".")[1]
z_hour = filename.split(".")[3][1:3]
yrmoday = row.day
z_hour = row.z_hour[1:3]
ref_time = pd.to_datetime(yrmoday) \
+ pd.to_timedelta(int(z_hour), unit="H")

nwm22_units = ds[variable_name].attrs["units"]
teehr_units = units_format_dict.get(nwm22_units, nwm22_units)
nwm_units = ds[variable_name].attrs["units"]
teehr_units = units_format_dict.get(nwm_units, nwm_units)
value_time = ds.time.values[0]
da = ds[variable_name]

@@ -88,13 +93,19 @@ def fetch_and_format_nwm_grids(
output_parquet_dir.mkdir(parents=True)

# Format file list into a dataframe and group by reference time
pattern = re.compile(r'[0-9]+')
days = []
z_hours = []

for path in json_paths:
filename = Path(path).name
days.append(filename.split(".")[1])
z_hours.append(filename.split(".")[3])
if path.split(":")[0] == "s3":
# If it's a remote json, the day and z-hour are in the path
res = re.findall(pattern, path)
days.append(res[1])
z_hours.append(f"t{res[2]}z")
else:
days.append(filename.split(".")[1])
z_hours.append(filename.split(".")[3])
df_refs = pd.DataFrame(
{"day": days, "z_hour": z_hours, "filepath": json_paths}
)
@@ -104,10 +115,10 @@
_, df = gp

results = []
for singlefile in df.filepath.tolist():
for row in df.itertuples():
results.append(
process_single_file(
singlefile,
row,
configuration,
variable_name,
zonal_weights_filepath,
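The `s3` branch above infers the reference day and z-hour from digit runs in the JSON path rather than from the filename. A minimal sketch of that parsing, on a hypothetical path laid out so that the second and third digit runs are the date and cycle hour (the real layout under `s3://ciroh-nwm-zarr-copy` may differ):

```python
# Hypothetical path; parsing mirrors fetch_and_format_nwm_grids above.
import re

import pandas as pd

path = (
    "s3://ciroh-nwm-zarr-copy/national-water-model/nwm30/"
    "nwm.20231208/forcing_analysis_assim/"
    "nwm.t12z.analysis_assim.forcing.tm00.conus.nc.json"
)

pattern = re.compile(r"[0-9]+")
res = re.findall(pattern, path)  # ['30', '20231208', '12', '00']
day = res[1]                     # '20231208'
z_hour = f"t{res[2]}z"           # 't12z'

# Grouped into a dataframe, rows expose .day, .z_hour, and .filepath,
# which is how the reworked process_single_file reads them.
df_refs = pd.DataFrame({"day": [day], "z_hour": [z_hour], "filepath": [path]})
for row in df_refs.itertuples():
    print(row.day, row.z_hour[1:3], row.filepath)
```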