Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1072 modelbuilder improve era5 experience #1073

Merged
merged 5 commits into from
Jan 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions dfm_tools/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ def download_ERA5(varkey,
raise KeyError(f'"{varkey}" not available, choose from: {", ".join(variables_dict.keys())}')

period_range = pd.period_range(date_min,date_max,freq='M')
if len(period_range) == 0:
raise ValueError(f"requested time extents ({date_min} to {date_max}) "
"resulted in empty period_range")
print(f'retrieving data from {period_range[0]} to {period_range[-1]} (freq={period_range.freq})')

#make sure the data fully covers the desired spatial extent. Download 1 additional grid cell (resolution is 1/4 degrees) in both directions
Expand Down
48 changes: 36 additions & 12 deletions dfm_tools/xarray_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,25 +96,49 @@ def preprocess_hisnc(ds):

def preprocess_ERA5(ds):
"""
Reduces the expver dimension in some of the ERA5 data (mtpr and other variables), which occurs in files with very recent data. The dimension contains the unvalidated data from the latest month in the second index in the expver dimension. The reduction is done with mean, but this is arbitrary, since there is only one valid value per timestep and the other one is nan.
Aligning ERA5 datasets before merging them. These operations are currently
(2025) only required when (also) using previously retrieved ERA5 data.

In recent datasets retrieved from ERA5 the time dimension and variable are
now called valid_time. This is inconvenient since it causes issues when
merging with previously retrieved datasets. However, it is not necessary
    for successfully running a Delft3D FM simulation.

Reducing the expver dimension: In the past, the expver dimension was
present if you downloaded ERA5 data that consisted of a mix of ERA5 and
ERA5T data. This dimension was also present in the data variables, so it
broke code. Therefore this dimension is reduced with a mean operation.
Any reduction operation would do the trick since there is only one valid
value per timestep and the other one is nan. In datasets downloaded
currently (2025) the expver dimension is not present anymore,
    but an expver variable is present defining whether the data comes
from ERA5 (1) or ERA5T (5).

Removing scale_factor and add_offset: In the past, the ERA5 data was
supplied as integers with a scaling and offset that was different for
each downloaded file. This caused serious issues with merging files,
since the scaling/offset from the first file was assumed to be valid
for the others also, leading to invalid values. Only relevant for old
files. More info at https://github.com/Deltares/dfm_tools/issues/239.
"""
if 'expver' in ds.dims:
# TODO: this drops int encoding which leads to unzipped float32 netcdf files: https://github.com/Deltares/dfm_tools/issues/781
ds = ds.mean(dim='expver')

# datasets retrieved with new cds-beta have valid_time instead of time dimn/varn
# https://forum.ecmwf.int/t/new-time-format-in-era5-netcdf-files/3796/5?u=jelmer_veenstra
# TODO: can be removed after https://github.com/Unidata/netcdf4-python/issues/1357 or https://forum.ecmwf.int/t/3796 is fixed
# datasets retrieved with new CDS have valid_time instead of time dim/var
# https://forum.ecmwf.int/t/new-time-format-in-era5-netcdf-files/3796/5
if 'valid_time' in ds.coords:
ds = ds.rename({'valid_time':'time'})

# Prevent writing to (incorrectly scaled) int, since it might mess up mfdataset (https://github.com/Deltares/dfm_tools/issues/239)
# By dropping scaling/offset encoding and converting to float32 (will result in a larger dataset)
# ERA5 datasets retrieved with the new CDS-beta are zipped float32 instead of scaled int, so this is only needed for backwards compatibility with old files.
# reduce the expver dimension (not present in newly retrieved files)
if 'expver' in ds.dims:
ds = ds.mean(dim='expver')

# drop scaling/offset encoding if present and converting to float32. Not
# present in newly retrieved files, variables are zipped float32 instead
for var in ds.data_vars.keys():
if not set(['dtype','scale_factor','add_offset']).issubset(ds.variables[var].encoding.keys()):
list_attrs = ['dtype','scale_factor','add_offset']
if not set(list_attrs).issubset(ds.variables[var].encoding.keys()):
continue
# the _FillValue will still be -32767 (int default), but this is no issue for float32
# the _FillValue will still be -32767 (int default)
# this is no issue for float32
ds[var].encoding.pop('scale_factor')
ds[var].encoding.pop('add_offset')
ds[var].encoding["dtype"] = "float32"
Expand Down
4 changes: 2 additions & 2 deletions docs/notebooks/modelbuilder_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
"dir_output = os.path.abspath(f'./{model_name}_model')\n",
"# path_style = 'windows' # windows / unix\n",
"overwrite = False # overwrite the downloaded forcing data or not. Always set to True when changing the domain\n",
"crs = 'EPSG:4326' # coordinate reference system\n",
"crs = 'EPSG:4326' # coordinate reference system, only EPSG 4326 (WGS84) is currently supported by the ERA5 and CMEMS download scripts.\n",
"\n",
"# domain and resolution\n",
"# the actual maximum extents can slightly vary: see dfmt.meshkernel_get_bbox() below\n",
Expand Down Expand Up @@ -144,7 +144,7 @@
],
"source": [
"# generate spherical regular grid\n",
"mk_object = dfmt.make_basegrid(lon_min, lon_max, lat_min, lat_max, dx=dxy, dy=dxy, crs=crs)\n",
"mk_object = dfmt.make_basegrid(lon_min=lon_min, lon_max=lon_max, lat_min=lat_min, lat_max=lat_max, dx=dxy, dy=dxy, crs=crs)\n",
"\n",
"# retrieve actual lat/lon bounds from grid, the lon_max and lat_max are likely larger than requested\n",
"lon_min, lat_min, lon_max, lat_max = dfmt.meshkernel_get_bbox(mk_object)\n",
Expand Down
45 changes: 31 additions & 14 deletions tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,16 +351,35 @@ def test_download_hycom(tmp_path):
def test_download_era5_unsupported_varkey():
date_min = '2010-01-31'
date_max = '2010-02-01'
longitude_min, longitude_max, latitude_min, latitude_max = 2, 3, 51, 52 #test domain
longitude_min, longitude_max, latitude_min, latitude_max = 2, 3, 51, 52
varkey = 'unexisting'
with pytest.raises(KeyError) as e:
dfmt.download_ERA5(varkey,
longitude_min=longitude_min, longitude_max=longitude_max, latitude_min=latitude_min, latitude_max=latitude_max,
date_min=date_min, date_max=date_max,
dir_output='.', overwrite=True)
dfmt.download_ERA5(
varkey,
longitude_min=longitude_min, longitude_max=longitude_max,
latitude_min=latitude_min, latitude_max=latitude_max,
date_min=date_min, date_max=date_max,
dir_output='.', overwrite=True)
assert '"unexisting" not available' in str(e.value)


@pytest.mark.requiressecrets
@pytest.mark.unittest
def test_download_era5_incorrect_times():
    """
    Requesting ERA5 data with date_max earlier than date_min should raise a
    ValueError from dfmt.download_ERA5 (raised when pd.period_range on the
    requested dates comes out empty) instead of failing later or silently.
    """
    # deliberately reversed time extents: date_max is one year before date_min
    date_min = '2010-01-01'
    date_max = '2009-01-01'
    longitude_min, longitude_max, latitude_min, latitude_max = 2, 3, 51, 52
    varkey = 'msl'
    with pytest.raises(ValueError) as e:
        dfmt.download_ERA5(
            varkey,
            longitude_min=longitude_min, longitude_max=longitude_max,
            latitude_min=latitude_min, latitude_max=latitude_max,
            date_min=date_min, date_max=date_max,
            dir_output='.', overwrite=True)
    # message comes from the empty-period_range guard in download_ERA5
    assert 'resulted in empty period_range' in str(e.value)


@pytest.mark.requiressecrets
@pytest.mark.unittest
@pytest.mark.era5slow # temporarily skip these on github
Expand All @@ -372,21 +391,19 @@ def test_download_era5(file_nc_era5_pattern):

ds = xr.open_mfdataset(file_nc_era5_pattern)

assert 'valid_time' in ds.dims # TODO: if this fails, remove the exception below and in preprocess_ERA5
# datasets retrieved with intermediate CDS had expver dimension causing issues
assert 'expver' not in ds.dims # TODO: if this fails, update the docstring of preprocess_ERA5

timedim = 'time'
# datasets retrieved with new cds-beta have valid_time instead of time dimn/varn
# https://forum.ecmwf.int/t/new-time-format-in-era5-netcdf-files/3796/5?u=jelmer_veenstra
# TODO: can be removed after https://github.com/Unidata/netcdf4-python/issues/1357 or https://forum.ecmwf.int/t/3796 is fixed
if 'valid_time' in ds.dims:
timedim = 'valid_time'
# datasets retrieved with new CDS have valid_time instead of time dim/var
assert 'valid_time' in ds.dims # TODO: if this fails, remove the exception below and in preprocess_ERA5
timedim = 'valid_time'

assert ds.sizes[timedim] == 1416
assert ds[timedim].to_pandas().iloc[0] == pd.Timestamp('2010-01-01')
assert ds[timedim].to_pandas().iloc[-1] == pd.Timestamp('2010-02-28 23:00')

# check if there are no integers in the dataset anymore
# this was the case before CDS-beta in https://github.com/Deltares/dfm_tools/issues/239
# datasets retrieved with new CDS are float32 instead of scaled ints
# ints raised problem in https://github.com/Deltares/dfm_tools/issues/239
msl_encoding = ds['msl'].encoding
assert str(msl_encoding['dtype']) == 'float32'
assert 'scale_factor' not in msl_encoding.keys()
Loading