-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscan_zarr.py
executable file
·132 lines (118 loc) · 4.99 KB
/
scan_zarr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
import argparse
import itertools
import math
import os
import pprint
import numpy as np
import zarr
def show_attrs(src, indent):
    """
    Print the attributes of a node, one ``:key: value`` entry per attribute.

    Dict and list values are rendered with ``pprint``; when that rendering
    spans several lines, the key is printed on its own line and the rendered
    lines follow, indented two columns further.  All other values are printed
    with ``repr``.

    :param src: an object whose ``attrs`` supports iteration over keys and
        lookup by key.
    :param indent: the number of leading spaces for each entry.
    """
    pad = ' ' * indent
    nested_pad = ' ' * (indent + 2)
    # Narrow the wrap width as nesting deepens so wrapped output stays
    # within roughly 80 columns.
    wrap_width = 80 - (indent + 2) * 2
    for key in src.attrs:
        if not isinstance(src.attrs[key], (dict, list)):
            print('%s:%s: %r' % (pad, key, src.attrs[key]))
            continue
        rendered = pprint.pformat(
            src.attrs[key], compact=True, width=wrap_width)
        rows = rendered.strip().split('\n')
        if len(rows) > 1:
            print('%s:%s:' % (pad, key))
            for row in rows:
                print('%s%s' % (nested_pad, row))
        else:
            print('%s:%s: %s' % (pad, key, rows[0]))
def scan_dataset(v, analyze, showattrs, sample, empty, indent):
    """
    Print a one-line summary of an array and optionally analyze or fill it.

    :param v: the array to report on.  Its ``name``, ``dtype``, ``shape``,
        ``chunks``, ``compressor`` and ``size`` are read, and it is indexed
        with slices.
    :param analyze: if true and the dtype is float or integer, read the array
        one chunk at a time and print ``[min,max] average``.  Zero-size
        arrays are skipped because they have no such values.
    :param showattrs: if true, print the array's attributes.
    :param sample: if true and the array is 1-d, print a table of value
        counts when the array has few distinct values.
    :param empty: ``False`` leaves the data untouched.  ``True`` means no
        array has been filled yet; an integer is the fill value used for the
        previous array.  Float and integer arrays are overwritten with the
        updated value.
    :param indent: the nesting depth of the parent; output is indented one
        column further.
    :returns: a tuple of (minimum, maximum, updated ``empty``).  The minimum
        and maximum are ``None`` unless the analysis ran.
    """
    minv = maxv = None
    if empty is not False:
        # Identity checks are deliberate: 0 is a legitimate counter value and
        # must not be mistaken for False.
        empty = 0 if empty is True else empty + 1
    pad = ' ' * (indent + 1)
    codec = ''
    if v.compressor:
        # Not every codec has a ``cname`` attribute; fall back to the codec
        # id rather than failing with AttributeError.
        codec = (getattr(v.compressor, 'cname', None) or
                 getattr(v.compressor, 'codec_id', ''))
    print('%s - %s %s %r %r %s' % (
        pad, v.name, v.dtype, v.shape, v.chunks, codec))
    if showattrs:
        show_attrs(v, indent + 1)
    numeric = v.dtype.kind in {'f', 'i', 'u'}
    if numeric and empty is not False:
        v[...] = empty
    # A zero-size array yields no chunks, so there is nothing to reduce and
    # the average would be a division by zero.
    if numeric and analyze and v.size:
        sumv = 0
        origins = itertools.product(*(
            range(0, extent, step)
            for extent, step in zip(v.shape, v.chunks)))
        for origin in origins:
            field = tuple(
                slice(start, min(start + step, extent))
                for start, step, extent in zip(origin, v.chunks, v.shape))
            part = v[field]
            if minv is None:
                minv = np.amin(part)
                maxv = np.amax(part)
            else:
                minv = min(minv, np.amin(part))
                maxv = max(maxv, np.amax(part))
            if part.dtype == np.float16:
                # Accumulate half-precision data in a wider type.
                part = part.astype(np.float32)
            sumv += part.sum()
        avgv = sumv / v.size
        print('%s [%g,%g] %g' % (pad, minv, maxv, avgv))
    if sample and len(v.shape) == 1:
        checksize = int(math.ceil(v.shape[0] ** 0.5))
        # Look at a short prefix first; only count the whole array when the
        # prefix suggests there are few distinct values.
        sampleset = np.unique(v[:min(v.shape[0], checksize * 2)])
        if len(sampleset) < checksize:
            counts = dict(zip(*np.unique(v, return_counts=True)))
            # Order by descending count, ties broken by descending value.
            ordered = {key: count for count, key in sorted(
                ((count, key) for key, count in counts.items()),
                reverse=True)}
            if len(ordered) < max(10, checksize):
                print('%s [%d kinds] %r' % (
                    pad, len(ordered),
                    dict(itertools.islice(ordered.items(), 100))))
    return minv, maxv, empty
def scan_node(src, analyze=False, showattrs=False, sample=False, empty=False, indent=0):
    """
    Print a group's name and recursively report on everything inside it.

    Arrays are passed to ``scan_dataset``; sub-groups are scanned recursively
    one indent level deeper.  Children that are neither are ignored.

    :param src: the group to walk.  Its ``name`` is printed and its
        ``items()`` are iterated.
    :param analyze: passed through to ``scan_dataset``.
    :param showattrs: if true, print the attributes of this group and of
        everything below it.
    :param sample: passed through to ``scan_dataset``.
    :param empty: the fill state threaded through every array in traversal
        order; see ``scan_dataset``.
    :param indent: the number of leading spaces for this group's name.
    :returns: the fill state after the last array visited.
    """
    print('%s%s' % (' ' * indent, src.name))
    if showattrs:
        show_attrs(src, indent)
    for _key, child in src.items():
        # Use the top-level class names rather than the internal
        # zarr.core / zarr.hierarchy module paths.
        if isinstance(child, zarr.Array):
            # Only the updated fill state is needed here.
            empty = scan_dataset(
                child, analyze, showattrs, sample, empty, indent)[2]
        elif isinstance(child, zarr.Group):
            empty = scan_node(
                child, analyze, showattrs, sample, empty, indent=indent + 1)
    return empty
def scan_zarr(path, analyze=False, showattrs=False, sample=False, empty=False):
    """
    Open a zarr file or directory and print a report of its contents.

    The path is first tried as an SQLite-backed store and then as whatever
    ``zarr.open`` accepts directly.  If neither works, ``Cannot parse <path>``
    is printed and the function returns.

    :param path: the file or directory to scan.
    :param analyze: if true, report min/max/average of numeric arrays.
    :param showattrs: if true, print group and array attributes.
    :param sample: if true, print value counts for 1-d arrays with few
        distinct values.
    :param empty: if true, open the data for writing and overwrite numeric
        arrays with constant values.  Otherwise the data is opened
        read-only.
    """
    if not os.path.exists(path):
        # Connecting to a missing path as an SQLite database would create a
        # new file, so refuse before any store is opened.
        print(f'Cannot parse {path}')
        return
    if os.path.isdir(path):
        markers = ('.zgroup', '.zattrs', '.zarray')
        if not any(os.path.exists(os.path.join(path, marker))
                   for marker in markers):
            print(f'Cannot parse {path}')
            return
    # Scanning must not modify anything unless emptying was requested.
    mode = 'r+' if empty else 'r'
    try:
        fptr = zarr.open(zarr.SQLiteStore(str(path)), mode=mode)
    except Exception:
        # Best effort: any failure of the SQLite attempt falls through to a
        # plain open of the path.
        try:
            fptr = zarr.open(path, mode=mode)
        except Exception:
            print(f'Cannot parse {path}')
            return
    if isinstance(fptr, zarr.Array):
        scan_dataset(fptr, analyze, showattrs, sample, empty, 0)
    else:
        scan_node(fptr, analyze, showattrs, sample, empty)
def command(argv=None):
    """
    Parse command-line options and scan the requested zarr source.

    The source path is printed first, followed by the scan report.

    :param argv: the list of argument strings to parse.  ``None`` (the
        default) makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description='Scan a zarr file or directory and report on its groups, '
        'datasets, and attributes.  Optionally report minimum, maximum, and '
        'average values for datasets with integer or float datatypes.')
    parser.add_argument(
        'source', type=str, help='Source file to read and analyze.')
    parser.add_argument(
        '--analyze', '-s', action='store_true',
        help='Analyze the min/max/average of datasets.')
    parser.add_argument(
        '--sample', action='store_true',
        help='Show a sample of 1-d data sets if they have fewer unique values '
        'than the square root of their size.')
    parser.add_argument(
        '--attrs', '-k', action='store_true',
        help='Show attributes on groups and datasets.')
    parser.add_argument(
        '--empty', action='store_true',
        help='Modify the file, making all arrays a constant value.')
    opts = parser.parse_args(argv)
    print(opts.source)
    scan_zarr(opts.source, opts.analyze, opts.attrs, opts.sample, opts.empty)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    command()