Description
I'm trying to write a NetCDF file directly from xarray to S3 object storage. I'm wondering:
- Why writing NetCDF files requires a "seek"
- Why the `scipy` engine is getting used instead of the specified `netcdf4` engine.
- If there are nice workarounds (besides writing the NetCDF file locally, then using the AWS CLI to transfer to S3)
Code sample:
import fsspec
import xarray as xr
ds = xr.open_dataset('http://geoport.usgs.esipfed.org/thredds/dodsC'
'/silt/usgs/Projects/stellwagen/CF-1.6/BUZZ_BAY/2651-A.cdf')
outfile = fsspec.open('s3://chs-pangeo-data-bucket/rsignell/test.nc',
mode='wb', profile='default')
with outfile as f:
ds.to_netcdf(f, engine='netcdf4')
which produces:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-3-024939f31fe4> in <module>
2 mode='wb', profile='default')
3 with outfile as f:
----> 4 ds.to_netcdf(f, engine='netcdf4')
/srv/conda/envs/pangeo/lib/python3.7/site-packages/xarray/core/dataset.py in to_netcdf(self, path, mode, format, group, engine, encoding, unlimited_dims, compute, invalid_netcdf)
1552 unlimited_dims=unlimited_dims,
1553 compute=compute,
-> 1554 invalid_netcdf=invalid_netcdf,
1555 )
1556
/srv/conda/envs/pangeo/lib/python3.7/site-packages/xarray/backends/api.py in to_netcdf(dataset, path_or_file, mode, format, group, engine, encoding, unlimited_dims, compute, multifile, invalid_netcdf)
1102 finally:
1103 if not multifile and compute:
-> 1104 store.close()
1105
1106 if not compute:
/srv/conda/envs/pangeo/lib/python3.7/site-packages/xarray/backends/scipy_.py in close(self)
221
222 def close(self):
--> 223 self._manager.close()
/srv/conda/envs/pangeo/lib/python3.7/site-packages/xarray/backends/file_manager.py in close(***failed resolving arguments***)
331 def close(self, needs_lock=True):
332 del needs_lock # ignored
--> 333 self._value.close()
/srv/conda/envs/pangeo/lib/python3.7/site-packages/scipy/io/netcdf.py in close(self)
297 if hasattr(self, 'fp') and not self.fp.closed:
298 try:
--> 299 self.flush()
300 finally:
301 self.variables = OrderedDict()
/srv/conda/envs/pangeo/lib/python3.7/site-packages/scipy/io/netcdf.py in flush(self)
407 """
408 if hasattr(self, 'mode') and self.mode in 'wa':
--> 409 self._write()
410 sync = flush
411
/srv/conda/envs/pangeo/lib/python3.7/site-packages/scipy/io/netcdf.py in _write(self)
411
412 def _write(self):
--> 413 self.fp.seek(0)
414 self.fp.write(b'CDF')
415 self.fp.write(array(self.version_byte, '>b').tostring())
/srv/conda/envs/pangeo/lib/python3.7/site-packages/fsspec/spec.py in seek(self, loc, whence)
1122 loc = int(loc)
1123 if not self.mode == "rb":
-> 1124 raise OSError("Seek only available in read mode")
1125 if whence == 0:
1126 nloc = loc
OSError: Seek only available in read mode
Output of xr.show_versions()
INSTALLED VERSIONS
commit: None
python: 3.7.6 | packaged by conda-forge | (default, Mar 23 2020, 23:03:20)
[GCC 7.3.0]
python-bits: 64
OS: Linux
OS-release: 4.14.138-114.102.amzn2.x86_64
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: C.UTF-8
LANG: C.UTF-8
LOCALE: en_US.UTF-8
libhdf5: 1.10.5
libnetcdf: 4.7.4
xarray: 0.15.1
pandas: 1.0.3
numpy: 1.18.1
scipy: 1.4.1
netCDF4: 1.5.3
pydap: installed
h5netcdf: 0.8.0
h5py: 2.10.0
Nio: None
zarr: 2.4.0
cftime: 1.1.1.2
nc_time_axis: 1.2.0
PseudoNetCDF: None
rasterio: 1.1.3
cfgrib: None
iris: 2.4.0
bottleneck: None
dask: 2.14.0
distributed: 2.14.0
matplotlib: 3.2.1
cartopy: 0.17.0
seaborn: None
numbagg: None
setuptools: 46.1.3.post20200325
pip: 20.1
conda: None
pytest: 5.4.1
IPython: 7.13.0
sphinx: None