Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vendor kerchunk fits reader #399

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions virtualizarr/vendor/kerchunk/codecs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import ast

Check warning on line 1 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L1

Added line #L1 was not covered by tests

import numcodecs
import numpy as np

Check warning on line 4 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L3-L4

Added lines #L3 - L4 were not covered by tests


class AsciiTableCodec(numcodecs.abc.Codec):

Check warning on line 7 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L7

Added line #L7 was not covered by tests
"""Decodes ASCII-TABLE extensions in FITS files"""

codec_id = "FITSAscii"

Check warning on line 10 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L10

Added line #L10 was not covered by tests

def __init__(self, indtypes, outdtypes):

Check warning on line 12 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L12

Added line #L12 was not covered by tests
"""

Parameters
----------
indtypes: list[str]
dtypes of the fields as in the table
outdtypes: list[str]
requested final dtypes
"""
self.indtypes = indtypes
self.outdtypes = outdtypes

Check warning on line 23 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L22-L23

Added lines #L22 - L23 were not covered by tests

def decode(self, buf, out=None):
indtypes = np.dtype([tuple(d) for d in self.indtypes])
outdtypes = np.dtype([tuple(d) for d in self.outdtypes])
arr = np.frombuffer(buf, dtype=indtypes)
return arr.astype(outdtypes)

Check warning on line 29 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L25-L29

Added lines #L25 - L29 were not covered by tests

def encode(self, _):
pass

Check warning on line 32 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L31-L32

Added lines #L31 - L32 were not covered by tests


class VarArrCodec(numcodecs.abc.Codec):

Check warning on line 35 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L35

Added line #L35 was not covered by tests
"""Variable length arrays in a FITS BINTABLE extension"""

codec_id = "FITSVarBintable"

Check warning on line 38 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L38

Added line #L38 was not covered by tests
# https://heasarc.gsfc.nasa.gov/docs/software/fitsio/quick/node10.html
ftypes = {"B": "uint8", "I": ">i2", "J": ">i4"}

Check warning on line 40 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L40

Added line #L40 was not covered by tests

def __init__(self, dt_in, dt_out, nrow, types):
self.dt_in = dt_in
self.dt_out = dt_out
self.nrow = nrow
self.types = types

Check warning on line 46 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L42-L46

Added lines #L42 - L46 were not covered by tests

def encode(self, _):
raise NotImplementedError

Check warning on line 49 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L48-L49

Added lines #L48 - L49 were not covered by tests

def decode(self, buf, out=None):
dt_in = np.dtype(ast.literal_eval(self.dt_in))
dt_out = np.dtype(ast.literal_eval(self.dt_out))
arr = np.frombuffer(buf, dtype=dt_in, count=self.nrow)
arr2 = np.empty((self.nrow,), dtype=dt_out)
heap = buf[arr.nbytes :]
for name in dt_out.names:
if dt_out[name] == "O":
dt = np.dtype(self.ftypes[self.types[name]])
counts = arr[name][:, 0]
offs = arr[name][:, 1]
for i, (off, count) in enumerate(zip(offs, counts)):
arr2[name][i] = np.frombuffer(

Check warning on line 63 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L51-L63

Added lines #L51 - L63 were not covered by tests
heap[off : off + count * dt.itemsize], dtype=dt
)
else:
arr2[name][:] = arr[name][:]
return arr2

Check warning on line 68 in virtualizarr/vendor/kerchunk/codecs.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/codecs.py#L67-L68

Added lines #L67 - L68 were not covered by tests
191 changes: 191 additions & 0 deletions virtualizarr/vendor/kerchunk/fits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
import logging

Check warning on line 1 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L1

Added line #L1 was not covered by tests

import fsspec
import numcodecs
import numcodecs.abc
import numpy as np
import zarr
from fsspec.implementations.reference import LazyReferenceMapper

Check warning on line 8 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L3-L8

Added lines #L3 - L8 were not covered by tests

from virtualizarr.vendor.kerchunk.codecs import AsciiTableCodec, VarArrCodec

Check warning on line 10 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L10

Added line #L10 was not covered by tests

try:
from astropy.io import fits
from astropy.wcs import WCS

Check warning on line 14 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L12-L14

Added lines #L12 - L14 were not covered by tests
except ModuleNotFoundError: # pragma: no cover
raise ImportError(
"astropy is required for kerchunking FITS files. Please install with "
"`pip/conda install astropy`."
)

logger = logging.getLogger("fits-to-zarr")

Check warning on line 21 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L21

Added line #L21 was not covered by tests


BITPIX2DTYPE = {

Check warning on line 24 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L24

Added line #L24 was not covered by tests
8: "uint8",
16: ">i2",
32: ">i4",
64: ">i8",
-32: "float32",
-64: "float64",
} # always bigendian


def process_file(

Check warning on line 34 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L34

Added line #L34 was not covered by tests
url,
storage_options=None,
extension=None,
inline_threshold=100,
primary_attr_to_group=False,
out=None,
):
"""
Create JSON references for a single FITS file as a zarr group

Parameters
----------
url: str
Where the file is
storage_options: dict
How to load that file (passed to fsspec)
extension: list(int | str) | int | str or None
Which extensions to include. Can be ordinal integer(s), the extension name (str) or if None,
uses the first data extension
inline_threshold: int
(not yet implemented)
primary_attr_to_group: bool
Whether the output top-level group contains the attributes of the primary extension
(which often contains no data, just a general description)
out: dict-like or None
This allows you to supply an fsspec.implementations.reference.LazyReferenceMapper
to write out parquet as the references get filled, or some other dictionary-like class
to customise how references get stored


Returns
-------
dict of the references
"""
from astropy.io import fits

Check warning on line 69 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L69

Added line #L69 was not covered by tests

storage_options = storage_options or {}
out = out or {}
g = zarr.open(out)

Check warning on line 73 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L71-L73

Added lines #L71 - L73 were not covered by tests

with fsspec.open(url, mode="rb", **storage_options) as f:
infile = fits.open(f, do_not_scale_image_data=True)
if extension is None:
found = False
for i, hdu in enumerate(infile):
if hdu.header.get("NAXIS", 0) > 0:
extension = [i]
found = True
break
if not found:
raise ValueError("No data extensions")
elif isinstance(extension, (int, str)):
extension = [extension]

Check warning on line 87 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L75-L87

Added lines #L75 - L87 were not covered by tests

for ext in extension:
hdu = infile[ext]
hdu.header.__str__() # causes fixing of invalid cards

Check warning on line 91 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L89-L91

Added lines #L89 - L91 were not covered by tests

attrs = dict(hdu.header)
kwargs = {}
if hdu.is_image:

Check warning on line 95 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L93-L95

Added lines #L93 - L95 were not covered by tests
# for images/cubes (i.e., ndarrays with simple type)
nax = hdu.header["NAXIS"]
shape = tuple(int(hdu.header[f"NAXIS{i}"]) for i in range(nax, 0, -1))
dtype = BITPIX2DTYPE[hdu.header["BITPIX"]]
length = np.dtype(dtype).itemsize
for s in shape:
length *= s

Check warning on line 102 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L97-L102

Added lines #L97 - L102 were not covered by tests

if "BSCALE" in hdu.header or "BZERO" in hdu.header:
kwargs["filters"] = [

Check warning on line 105 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L104-L105

Added lines #L104 - L105 were not covered by tests
numcodecs.FixedScaleOffset(
offset=float(hdu.header.get("BZERO", 0)),
scale=float(hdu.header.get("BSCALE", 1)),
astype=dtype,
dtype=float,
)
]
else:
kwargs["filters"] = None
elif isinstance(hdu, fits.hdu.table.TableHDU):

Check warning on line 115 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L114-L115

Added lines #L114 - L115 were not covered by tests
# ascii table
spans = hdu.columns._spans
outdtype = [

Check warning on line 118 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L117-L118

Added lines #L117 - L118 were not covered by tests
[name, hdu.columns[name].format.recformat]
for name in hdu.columns.names
]
indtypes = [

Check warning on line 122 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L122

Added line #L122 was not covered by tests
[name, f"S{i + 1}"] for name, i in zip(hdu.columns.names, spans)
]
nrows = int(hdu.header["NAXIS2"])
shape = (nrows,)
kwargs["filters"] = [AsciiTableCodec(indtypes, outdtype)]
dtype = [tuple(d) for d in outdtype]
length = (sum(spans) + len(spans)) * nrows
elif isinstance(hdu, fits.hdu.table.BinTableHDU):

Check warning on line 130 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L125-L130

Added lines #L125 - L130 were not covered by tests
# binary table
dtype = hdu.columns.dtype.newbyteorder(">") # always big endian
nrows = int(hdu.header["NAXIS2"])
shape = (nrows,)

Check warning on line 134 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L132-L134

Added lines #L132 - L134 were not covered by tests
# if hdu.fileinfo()["datSpan"] > length
if any(_.format.startswith(("P", "Q")) for _ in hdu.columns):

Check warning on line 136 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L136

Added line #L136 was not covered by tests
# contains var fields
length = hdu.fileinfo()["datSpan"]
dt2 = [

Check warning on line 139 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L138-L139

Added lines #L138 - L139 were not covered by tests
(name, "O")
if hdu.columns[name].format.startswith(("P", "Q"))
else (name, str(dtype[name].base))
+ ((dtype[name].shape,) if dtype[name].shape else ())
for name in dtype.names
]
types = {

Check warning on line 146 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L146

Added line #L146 was not covered by tests
name: hdu.columns[name].format[1]
for name in dtype.names
if hdu.columns[name].format.startswith(("P", "Q"))
}
kwargs["object_codec"] = VarArrCodec(

Check warning on line 151 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L151

Added line #L151 was not covered by tests
str(dtype), str(dt2), nrows, types
)
dtype = dt2

Check warning on line 154 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L154

Added line #L154 was not covered by tests
else:
length = dtype.itemsize * nrows
kwargs["filters"] = None

Check warning on line 157 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L156-L157

Added lines #L156 - L157 were not covered by tests
else:
logger.warning(f"Skipping unknown extension type: {hdu}")
continue

Check warning on line 160 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L159-L160

Added lines #L159 - L160 were not covered by tests
# one chunk for whole thing.
# TODO: we could sub-chunk on biggest dimension
name = hdu.name or str(ext)
arr = g.empty(

Check warning on line 164 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L163-L164

Added lines #L163 - L164 were not covered by tests
name, dtype=dtype, shape=shape, chunks=shape, compression=None, **kwargs
)
arr.attrs.update(

Check warning on line 167 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L167

Added line #L167 was not covered by tests
{
k: str(v) if not isinstance(v, (int, float, str)) else v
for k, v in attrs.items()
if k != "COMMENT"
}
)
arr.attrs["_ARRAY_DIMENSIONS"] = ["z", "y", "x"][-len(shape) :]
loc = hdu.fileinfo()["datLoc"]
parts = ".".join(["0"] * len(shape))
out[f"{name}/{parts}"] = [url, loc, length]
if primary_attr_to_group:

Check warning on line 178 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L174-L178

Added lines #L174 - L178 were not covered by tests
# copy attributes of primary extension to top-level group
hdu = infile[0]
hdu.header.__str__()
g.attrs.update(

Check warning on line 182 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L180-L182

Added lines #L180 - L182 were not covered by tests
{
k: str(v) if not isinstance(v, (int, float, str)) else v
for k, v in dict(hdu.header).items()
if k != "COMMENT"
}
)
if isinstance(out, LazyReferenceMapper):
out.flush()
return out

Check warning on line 191 in virtualizarr/vendor/kerchunk/fits.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/vendor/kerchunk/fits.py#L189-L191

Added lines #L189 - L191 were not covered by tests
Loading