\documentclass[10pt,a4paper%% ,twoside,openright,titlepage,
%% headinclude,footinclude,numbers=noenddot,
%% captions=tableheading
]{article}
%% \usepackage[utf8]{inputenc}
\usepackage{url}
\usepackage{parskip}
\usepackage{authblk}
\usepackage{tabularx}
\usepackage{adjustbox}
\usepackage[dvipsnames,table]{xcolor} % needs to come before forest for some reason
\usepackage{forest}
%% \usepackage{amsmath,amssymb}
%% \usepackage{newpxtext,newpxmath}
\usepackage{biblatex}
\addbibresource{sample.bib}
\usepackage{listings}
%\usepackage{inconsolata}
\usepackage[T1]{fontenc}
\usepackage[scaled]{beramono}
\definecolor{keyword}{rgb}{0.6,0,0}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.97,0.97,0.97}
% https://tex.stackexchange.com/questions/297345/why-is-the-start-row-of-rowcolors-ignored-in-tabularx
\newcounter{tblerows}
\expandafter\let\csname c@tblerows\endcsname\rownum
\lstdefinestyle{mystyle}{
backgroundcolor=\color{backcolour},
commentstyle=\color{codegreen},
keywordstyle=\color{keyword},
numberstyle=\tiny\color{codegray},
stringstyle=\color{codepurple},
basicstyle=\ttfamily\footnotesize,
breakatwhitespace=false,
breaklines=true,
captionpos=b,
keepspaces=true,
numbers=left,
numbersep=10pt,
showspaces=false,
showstringspaces=false,
showtabs=false,
tabsize=2,
framexleftmargin=5pt,
framexrightmargin=5pt,
framextopmargin=3pt,
framexbottommargin=3pt,
frame=tb,
framerule=0pt,
}
\lstset{style=mystyle}
\usepackage{subfig}
\usepackage[nochapters%% eulermath=false,
%% beramono=true,
%% palatino=true,
%% pdfspacing,
%% style=classicthesis
]{classicthesis}
%% \newcounter{mparcnt}
%% \renewcommand\themparcnt{\raise0.5ex\hbox{\arabic{mparcnt}}}
%% \newcommand\mpar[1]{\refstepcounter{mparcnt}\themparcnt\marginpar{\footnotesize\themparcnt #1}}
\begin{document}
\pagestyle{plain}
\title{\rmfamily\normalfont\spacedallcaps{The Hail System: Computing for Data-Intensive Science}}
\author[4]{John Compitello}
\author[1,2,3]{Jacqueline I. Goldstein}
\author[1,2,3]{Daniel Goldstein}
\author[1,2,3]{Dan King}
\author[1,2,3]{Timothy Poterba}
\author[1,2,3]{Iris Rademacher}
\author[1,2,3]{Patrick Schultz}
\author[1,2,3]{Christopher Vittal}
\author[4]{Arcturus Wang}
\author[1,2,3]{Konrad Karczewski}
\author[1,2,3]{Cotton Seed}
\author[1,2,3]{Benjamin M Neale}
\affil[1]{Program in Medical and Population Genetics\\ Broad Institute of MIT and Harvard\\ Cambridge, MA, USA.}
\affil[2]{Analytic and Translational Genetics Unit\\ Massachusetts General Hospital\\ Boston, MA, USA.}
\affil[3]{Stanley Center for Psychiatric Research\\ Broad Institute of MIT and Harvard\\ Cambridge, MA, USA.}
\affil[4]{TBD}
\renewcommand\Authfont{\scshape\small}
\renewcommand\Affilfont{\itshape\scriptsize}
%% \author{
%% \spacedlowsmallcaps{Daniel King} \and
%% \spacedlowsmallcaps{John Compitello} \and
%% \spacedlowsmallcaps{Jacqueline I. Goldstein} \and
%% \spacedlowsmallcaps{Daniel Goldstein} \and
%% \spacedlowsmallcaps{Konrad Karczewski} \and
%% \spacedlowsmallcaps{Timothy Poterba} \and
%% \spacedlowsmallcaps{Iris Rademacher} \and
%% \spacedlowsmallcaps{Patrick Schultz} \and
%% \spacedlowsmallcaps{Christopher Vittal} \and
%% \spacedlowsmallcaps{Arcturus Wang} \and
%% \spacedlowsmallcaps{Cotton Seed} \and
%% \spacedlowsmallcaps{Benjamin M Neale}
%% }
\date{\spacedlowsmallcaps{December 2022}}
\newcommand{\tableheadline}[1]{\spacedlowsmallcaps{#1}}
\maketitle
\begin{abstract}
Abstract.
\end{abstract}
\tableofcontents
\section{Introduction}
The anticipated release, in 2023, of the Genome Aggregation Database's 955,000 exome callset marks two decades of yearly doubling in the sample size of human genetic datasets.
We have no evidence of deceleration; in fact, cheaper sequencing and the rise of biobanks suggest an acceleration.
We present here a philosophy, and a reification thereof, for extracting knowledge from genetic datasets at and beyond one million sequences.
\section{The Three Phases}
On January 11, 2007, during a talk to the NRC-CSTB, Jim Gray exhorted us to ``do better at producing tools to support the whole research cycle from data capture and data curation to data analysis and data visualization'' \cite{fourth-paradigm}. %
Our experience in the genetics and genomics communities suggests an additional and orthogonal division of the research cycle into three phases:
\begin{itemize}
\item Primary. The physical-digital interface: sequencers, telescopes, etc.
\item Secondary. Technical measurements become model-relevant quantities: e.g. alignment.
\item Tertiary. Interactive \& iterative knowledge making: regression, machine learning, etc.
\end{itemize}
These three phases were originally conceived within the context of genomics~\cite{illumina-three-phases}, but we believe they apply broadly to data-intensive science.
We may further characterize each phase in terms of its inherent concurrency of execution and inherent mutability by scientists.
Primary analysis achieves concurrency both through multiplexing within one machine and through the use of multiple machines.
Primary analysis is the least mutable because it involves physical laboratory processes, physical machines, and logic encoded in circuitry.
Secondary analysis is historically characterized by concurrent processing of many samples without interdependence.
Unlike primary analysis, secondary analysis is more easily mutated.
The analyses are largely defined in high-level programming languages and they are executed on commodity hardware.
Tertiary analysis is highly mutable because it is largely implemented in terms of very high-level concepts in very widely understood programming languages such as R and Python.
However, concurrency in tertiary analysis ranges from naively sample- or experiment-parallel to the complex dependency graphs of distributed linear algebra.
Every tertiary analysis produces knowledge within the context of some model. Scientific knowledge is built on a series of increasingly abstract models.
It is critical that our analytical systems facilitate the elevation of novel tertiary analyses into productionized secondary analyses because scientific knowledge making begins with the outputs of secondary analysis.
Moreover, a unified analytical system empowers tertiary analysts to innovate across the three phases.
For example, the development of a new blended genome-exome product at the Broad Institute required a change in every phase to realize the cost savings.
\section{The Four Interfaces}
We believe four interfaces are key to the success of data-intensive science:
\begin{itemize}
\item Relational Algebra
\item Linear Algebra
\item ``Workflows'': directed acyclic graphs of containers
\item Differentiable programming
\end{itemize}
Our conception of relational algebra is focused not on one-dimensional tables of data but on multi-dimensional ``tensor tables''.
A zero-dimensional ``field'' holds only one value, e.g. a release date, a name, an author list. A one-dimensional ``field'' is equivalent to an RDBMS column.
It takes on zero or more values indexed by a ``key''. A two-dimensional ``field'' takes on zero or more values and is indexed by two orthogonal keys: the ``row key'' and ``column key''.
A two-dimensional field is subtly different from a one-dimensional field with a ``compound'' key: a two-dimensional field ought to permit access to a subset of data in time linear in the size of the subset.
This is not true, for example, of accessing all records with $j \in \{3, 4, 5\}$ in a traditional RDBMS table with the compound key $(i, j)$.
Relational algebra is commonly used for dataset ``quality control''. For example, aggregations, such as the mean, by row (respectively: by column) are used to determine the quality of variants (respectively: samples) in large human sequencing studies.
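For concreteness, the following sketch uses the Hail Query interface (described later in this paper) to compute row-wise and column-wise aggregations on a simulated callset; the $0.95$ thresholds and the use of \texttt{balding\_nichols\_model} as a stand-in dataset are illustrative.
\begin{lstlisting}[language=Python]
import hail as hl

# A simulated callset: one population, 1000 samples, 10,000 variants.
mt = hl.balding_nichols_model(1, n_samples=1000, n_variants=10000)

# Aggregate by row: the call rate of each variant.
mt = mt.annotate_rows(
    variant_call_rate=hl.agg.fraction(hl.is_defined(mt.GT)))

# Aggregate by column: the call rate of each sample.
mt = mt.annotate_cols(
    sample_call_rate=hl.agg.fraction(hl.is_defined(mt.GT)))

# Keep only high-quality variants and samples.
mt = mt.filter_rows(mt.variant_call_rate > 0.95)
mt = mt.filter_cols(mt.sample_call_rate > 0.95)
\end{lstlisting}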
Linear algebra appears in both quality control and model fitting.
During quality control, $XX^T$ and $X^TX$ naturally express row correlation and column correlation.
In genome-wide association studies, analysts remove highly correlated variants and samples.
Model fitting typically leans heavily on linear algebraic operations whether directly solving linear models with ordinary least squares or using matrix operations within an iterative solver.
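As a sketch of how these operations look in practice, assuming the Block Matrix interface described later in this paper: the Gram matrices $XX^T$ and $X^TX$ (which, after centering and scaling, become the row and column correlation matrices) are one matrix multiply each.
\begin{lstlisting}[language=Python]
import hail as hl

mt = hl.balding_nichols_model(1, n_samples=100, n_variants=1000)

# X is the (variants x samples) matrix of alternate-allele counts.
X = hl.linalg.BlockMatrix.from_entry_expr(mt.GT.n_alt_alleles())

# Up to centering and scaling, these express row (variant) and
# column (sample) correlation, respectively.
row_gram = X @ X.T
col_gram = X.T @ X
\end{lstlisting}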
User-defined functions are widely recognized as a useful but costly feature.
We argue for the most extreme version of user-defined functions: arbitrary containers.
We also require that these containers have a shell.
The shell is the natural language for describing the relationship between input and output files of an arbitrary binary.
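A minimal sketch of this idea, using the Hail Batch interface described later in this paper; the container image and file paths are hypothetical, but note how a single shell command fully describes the relationship between the binary's input and output files.
\begin{lstlisting}[language=Python]
import hailtop.batch as hb

b = hb.Batch(name='container-udf')

j = b.new_job(name='index-bam')
j.image('quay.io/biocontainers/samtools')        # hypothetical image
bam = b.read_input('gs://my-bucket/sample.bam')  # hypothetical path

# The shell command relates the input and output files of an
# arbitrary binary inside the container.
j.command(f'samtools index {bam} {j.ofile}')

b.write_output(j.ofile, 'gs://my-bucket/sample.bam.bai')
b.run()
\end{lstlisting}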
Differentiable programming is key linguistic infrastructure for modern machine learning and deep learning systems like PyTorch and TensorFlow.
The relative difficulty of training large models retards their use in genomics.
The system we present below does not support differentiable programming but we believe it is a critical missing piece.
Science is the process of proposing, refuting, and fitting parameters of models.
Data-intensive science relies on automated refutation and parameter-fitting of quantitative models.
In papers and at blackboards, scientists describe these models in both relational and linear algebraic terms.
For example, ``For genomic variants with an allele frequency of at least 1\%, fit the model
$\mathrm{phenotype} \sim \mathrm{GT} + \mathrm{PC}_0 + \cdots + \mathrm{PC}_n$.''
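Using the Hail Query interface described later in this paper, that blackboard description translates nearly verbatim into code; the phenotype and principal-component field names below are hypothetical.
\begin{lstlisting}[language=Python]
import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/callset.mt')  # hypothetical path

# "For genomic variants with an allele frequency of at least 1%..."
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] >= 0.01)

# "...fit the model: phenotype ~ GT + PC_0 + ... + PC_n."
results = hl.linear_regression_rows(
    y=mt.phenotype,                  # hypothetical column field
    x=mt.GT.n_alt_alleles(),
    covariates=[1.0] + [mt.scores[i] for i in range(10)])  # hypothetical
\end{lstlisting}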
\section{The Five Trusses}
We believe the four interfaces are best implemented in terms of these five infrastructural
components.
\begin{enumerate}
\item Elastic, multi-tenant, scalable, container-based cloud cluster scheduler.
\item Cloud-native file system abstraction supporting local files and cloud blob storage.
\item In-memory data format for arbitrary datatypes.
\item On-disk data format for arbitrary datatypes.
\item On-disk data format for partitioned data sets.
\end{enumerate}
\section{The Laptop \& The Cloud}
Modern clouds provide numerous services.
Let us focus on the essence of the cloud: blob storage and virtual machines.
Cloud blob storage buckets should appear to the user as effectively infinite-capacity mounted disks.
Virtual machines should appear to the user as an effectively infinite collection of cores.
Of course, the user will notice the slightly increased latency of network-attached cores and disks.
However, we strongly advocate against additional barriers between the laptop and the cloud.
For example, cloud-hosted Notebooks are common (e.g. Google Colab, Dataproc Notebooks, Terra Notebooks).
We believe this is a critical mistake.
The user is severed from the customized and comfortable environment of their laptop.
SSH, Notebooks, and blob storage REST APIs are not facile and familiar interfaces for scientists.
In contrast, the four interfaces of science (relational \& linear algebra, container DAGs, and differentiable programming) are facile and familiar to scientists.
We argue for the implementation of these interfaces atop a laptop-cloud hybrid environment.
Relational algebra, linear algebra, and differentiable programming enable scientists to pose questions in a familiar high-level language.
Directed acyclic graphs of containers provide the necessary escape hatch from these high-level languages when an analysis requires an arbitrary tool or binary.
The user should not think about where their code executes.
Instead, the user should describe the desired outcome or algorithm and the necessary resources.
The system should then decide where, and on what hardware, that code executes.
\section{Fast \& Facile Computing}
The ideal system reads our mind and responds fast enough to saturate our mental bandwidth.
Such a system is many PhD theses away from existence.
Instead of mind-reading, we suggest the high-level languages of relational algebra, linear algebra, and DAGs of containers.
We endeavor to build a system that produces results faster than the human visual reaction time ($\sim250$ ms).
We believe a user should not concern themselves with where their code runs.
The user should specify the hardware resources their code requires, not the machines that supply them.
\section{Approximate Methods}
\section{The Hail System}
The Hail System is a cloud-native, elastic, multi-tenant, scientific analysis system.
Cloud-native means the Hail System was designed for spot instances and cloud blob storage.
We built for spot instances because their cost-per-core-hour is consistently a third that of reserved instances.
We built a file system API that can be efficiently implemented on cloud blob storage.
Replicating traditional file system API semantics on cloud blob storage is an unnecessary cost.
Elastic means the Hail System adjusts its resource use to meet demand.
Elasticity avoids wasted resources while allowing extensive scale.
Multi-tenant means the Hail System allows multiple users to commingle on one cluster.
Commingling is critical for two reasons.
First, commingling on an elastic cluster amortizes virtual machine startup costs across all jobs.
Second, commingling overlays ``spiky'' workloads to yield consistent use of many cores.
Consistent use allows every user to cost-effectively use many cores for short periods of time.
Data-intensive science includes both long-running and interactive jobs.
Interactive jobs are characterized by usage spikes: short periods using many cores.
Anticipating every possible query with an index is neither possible nor cheap.
In contrast to the cost of creating and storing a litany of indices, the price of a ``spot'' core-hour is $\sim$0.01 USD.
Compare that to the cost of a first-year NIH post-doc: 27 USD per hour.
Using 10,000 cores for ten seconds ($\approx 28$ core-hours) to search through hundreds of thousands of files costs a bit more than a quarter.
In contrast, waiting for a ten-core laptop to run for 1,000 seconds wastes a quarter-hour of the post-doc's time: almost 7 USD.
\section{The Hail System Specifics}
The Hail System has two primary interfaces: the Query interface and the Batch interface.
The Query interface presents relational and linear algebra with a ``data frame'' interface.
The Batch interface concerns ``pipelines'' and ``workflows''.
A distinguishing feature of the Hail System is the ability to \emph{compose} Query pipelines with Batch pipelines.
Consider, for example, a single Python file that uses Hail Query to count variants per gene per sample and export one file per gene, uses Hail Batch to execute SAIGE%
\footnote{SAIGE is a very fast C++ program for fitting linear mixed models.}
on each file, and finally uses Hail Query to load all the individual result files and produce a plot of p-values.
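A compressed sketch of such a file follows; the file paths, container image, SAIGE invocation, and field names are hypothetical.
\begin{lstlisting}[language=Python]
import hail as hl
import hailtop.batch as hb

# Hail Query: count non-reference variants per gene per sample;
# export one file per gene ('gene' is a hypothetical row field).
mt = hl.read_matrix_table('gs://my-bucket/callset.mt')
counts = (mt.group_rows_by(mt.gene)
            .aggregate(n=hl.agg.count_where(mt.GT.is_non_ref())))
genes = counts.aggregate_rows(hl.agg.collect_as_set(counts.gene))
for gene in genes:
    counts.filter_rows(counts.gene == gene).entries().export(
        f'gs://my-bucket/genes/{gene}.tsv')

# Hail Batch: execute SAIGE on each per-gene file.
b = hb.Batch(name='saige')
for gene in genes:
    j = b.new_job(name=f'saige-{gene}')
    j.image('saige')  # hypothetical image
    infile = b.read_input(f'gs://my-bucket/genes/{gene}.tsv')
    j.command(f'saige --input {infile} --output {j.ofile}')  # hypothetical flags
    b.write_output(j.ofile, f'gs://my-bucket/results/{gene}.tsv')
b.run()

# Hail Query: load the individual result files; plot the p-values.
results = hl.import_table('gs://my-bucket/results/*.tsv', impute=True)
plot = hl.plot.histogram(results.p_value)  # hypothetical field
\end{lstlisting}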
\subsection{Hail Query}
The Hail Query interface is best thought of as a handful of languages.
We manifest these languages in Python in the style of ``data frames''.
Listing \ref{fig:data-frame-style} demonstrates this style with a snippet of Python that uses Hail Query.
In particular, queries are built through library calls and operator applications in a style evocative of the host language.
\subsubsection{Datatypes}
Hail Query distinguishes between ``small'' datatypes and ``big'' datatypes.
Small datatypes must fit in memory.
Big datatypes are partitioned and evaluated in parallel.
Hail Query has many small datatypes including integers, floating-point numbers, arrays, sets, dictionaries, and structures.
An extensive listing is available in Figure \ref{fig:small-datatypes}.
Every small datatype \texttt{T} includes a unique missing value \texttt{hl.missing(T)}.
Every small datatype is totally ordered and has decidable equality.
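Both properties are visible from Python; in the following sketch, missingness propagates through arithmetic and structures sort by their total order.
\begin{lstlisting}[language=Python]
import hail as hl

# Every small datatype T has a unique missing value: hl.missing(T).
x = hl.missing(hl.tint32)
hl.eval(hl.is_defined(x))  # False
hl.eval(x + 5)             # None: missingness propagates through arithmetic

# Every small datatype is totally ordered, so any values can be sorted;
# structs compare field by field.
hl.eval(hl.sorted([hl.struct(a=2), hl.struct(a=1)]))
\end{lstlisting}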
\begin{figure}[h]
\begin{adjustbox}{minipage=19cm, center}
\rowcolors{2}{gray!20}{white}
\begin{tabularx}{\textwidth}{>{\raggedright\arraybackslash}llX}
\toprule
\tableheadline{Name} & \tableheadline{Description} & \tableheadline{Examples} \\
\midrule
\texttt{bool} & Boolean values & \texttt{\footnotesize True} \\
\texttt{int}, \texttt{int32} & 32-bit integers & \texttt{\footnotesize 123} \\
\texttt{int64} & 64-bit integers & \texttt{\footnotesize 123} \\
\texttt{float32} & 32-bit floating-point number & \texttt{\footnotesize 1.0} \\
\texttt{float}, \texttt{float64} & 64-bit floating-point number & \texttt{\footnotesize 1.0} \\
\texttt{array(T)} & an array of data of type \texttt{\footnotesize T} & \texttt{\footnotesize []} \newline \texttt{\footnotesize [1.0, 1.0]} \\
\texttt{set(T)} & set of data of type \texttt{\footnotesize T} & \texttt{\footnotesize \{\}} \newline \texttt{\footnotesize \{5, 10, -4\}} \\
\texttt{dict(K,V)} & a finite mapping & \texttt{\footnotesize \{\}} \newline \texttt{\footnotesize \{5:"a", 10:"applepie"\}} \\
\texttt{struct(...,fN=TN)} & a structure with $N$ fields & \texttt{\footnotesize struct()} \newline \texttt{\footnotesize struct(x=5, y="a", z=struct(x=42.0))} \\
\texttt{tuple(..., TN)} & a structure with $N$ nameless fields & \texttt{\footnotesize ()} \newline \texttt{\footnotesize (1.0, 1, "abc")} \\
\texttt{str} & a UTF-8 string & \texttt{\footnotesize "abc 123"} \\
\texttt{call} & a genotype call & \texttt{\footnotesize call(0, 1)} \newline \texttt{\footnotesize call(2)} \\
\texttt{locus(ref)} & a genomic locus in the reference \texttt{\footnotesize ref} & \texttt{\footnotesize locus("1", 100000, "GRCh37")} \newline \texttt{\footnotesize locus("chrM", 1234, "GRCh38")} \\
\texttt{interval(T)} & an interval on the type \texttt{T} & \texttt{\footnotesize interval(4.5, 1e8)} \newline \texttt{\footnotesize interval("abc", "xyzdef")} \newline \texttt{\footnotesize interval(locus("1",1234), locus("1",5000))} \\
\texttt{ndarray(T)} & a vector, matrix, or tensor containing \texttt{\footnotesize T} & \texttt{\footnotesize nd.array([1, 2])} \newline \texttt{\footnotesize nd.array([[1.0, 0.0], [0.0, 1.0]])} \\
\bottomrule
\end{tabularx}
\end{adjustbox}
\caption{The Small Datatypes}
\label{fig:small-datatypes}
\end{figure}
Hail Query has three big datatypes: the Table, the Matrix Table, and the Block Matrix.
A Table is Hail's analogue of the pandas DataFrame, the R dataframe, and the SQL table.
A Table contains zero or more rows.
Every row has the same \texttt{struct} type with zero or more fields.
Every Table has a ``key'': an ordered list of zero or more fields.
The rows of a table are stored and processed in key-order.
A Table is partitioned: each row belongs to exactly one partition and partitions contain zero or more rows.
For every partition, the interval of keys contained within is statically known.
If a Table has a non-empty key, the key-intervals of the partitions are disjoint.
Every Table also has zero or more ``global'' fields used to store metadata about the table such as the name of the dataset.
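The following sketch exercises each of these pieces: partitioning, keys, and global fields.
\begin{lstlisting}[language=Python]
import hail as hl

# A Table with 100 rows split across 8 partitions; the key is 'idx'.
t = hl.utils.range_table(100, n_partitions=8)
t = t.annotate(squared=t.idx * t.idx)

# Re-keying changes the order in which rows are stored and processed.
t = t.key_by('squared')

# Global fields hold table-level metadata, such as a dataset name.
t = t.annotate_globals(dataset_name='example')
\end{lstlisting}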
A Matrix Table is used to represent matrical datasets with complex entry schemas.
For example, genome sequencing callsets are represented as Matrix Tables.
The rows of such a Matrix Table are genomic loci.
The columns of such a Matrix Table are individuals (samples).
The entries of such a Matrix Table are structured data containing genotype calls, finite probability distributions, and other experimental metadata.
A Matrix Table has two orthogonal and disjoint keys: the row key and the column key.
A Matrix Table has four classes of fields: ``global'', ``row'', ``column'', and ``entry''.
Global fields are not indexed by either key.
Row fields are indexed by the row key.
Column fields are indexed by the column key.
Entry fields are indexed by both keys.
A Matrix Table only partitions its data along the row-axis, not the column-axis.
In particular, operations that use entry fields will load an entire row of the matrix into memory.
A Block Matrix is used to represent numerical matrices for linear algebraic operations.
A Block Matrix is partitioned into square blocks of a statically known size.
A Block Matrix has one nameless field.
The type of that field may only be integral or floating-point.
Block Matrices support matrix multiplication, the singular value decomposition, and a few other simple matrical operations.
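A small sketch of the Block Matrix interface; the $2 \times 2$ matrix is a toy stand-in for, say, a genotype matrix.
\begin{lstlisting}[language=Python]
import numpy as np
import hail as hl

bm = hl.linalg.BlockMatrix.from_numpy(
    np.array([[1.0, 2.0], [3.0, 4.0]]))

gram = bm @ bm.T         # distributed matrix multiply
local = gram.to_numpy()  # collect the (small) result locally
\end{lstlisting}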
\begin{figure}[h]
\begin{adjustbox}{minipage=19cm, center}
\rowcolors{2}{gray!20}{white}
\begin{tabularx}{\textwidth}{>{\raggedright\arraybackslash}llX}
\toprule
\tableheadline{Name} & \tableheadline{Description} & \tableheadline{Examples} \\
\midrule
\texttt{Table} & a partitioned tabular dataset & \texttt{\footnotesize hl.utils.range\_table(10)} \\
\texttt{MatrixTable} & a partitioned matrical dataset & \texttt{\footnotesize hl.utils.range\_matrix\_table(10, 10)} \newline \texttt{\footnotesize hl.balding\_nichols\_model(1, 10000, 100)} \\
\texttt{BlockMatrix} & a block-partitioned linear algebraic, numeric matrix & \texttt{\footnotesize hl.linalg.BlockMatrix.from\_numpy(\newline np.array([[1, 2], [3, 4]]))} \newline \texttt{\footnotesize hl.linalg.BlockMatrix.from\_entry\_expr(\newline \quad mt.GT.n\_alt\_alleles())} \\
\bottomrule
\end{tabularx}
\end{adjustbox}
\caption{The Big Datatypes}
\label{fig:big-datatypes}
\end{figure}
\begin{figure}[h]
\begin{lstlisting}[language=Python, caption=An example of Hail's ``data frame'' style,label=fig:data-frame-style]
import hail as hl
t = hl.utils.range_table(10)
t = t.annotate(foo=t.idx * t.idx)
t = t.group_by(
    group_id=t.idx % 3
).aggregate(
    group_mean=hl.agg.mean(t.foo)
)
\end{lstlisting}
\end{figure}
\subsubsection{On-Disk, In-Flight, and In-Memory Formats}
Hail has custom on-disk, in-flight, and in-memory data representations for small and large datatypes.
We recognize that this infrastructure largely duplicates similar efforts such as Apache Arrow and Zarr.
When Hail began in 2015, the importance of zero-copy serialization to data formats was just becoming clear.
Also around this time, the Apache Spark project introduced Tungsten: an efficient, off-heap, columnar data format.
The Hail project, in the face of exponentially growing human sequencing datasets, could not wait for these community-supported projects to arrive and mature.
Instead, we built a parallel ecosystem of data formats.
Hail's big datatypes' ``on-disk''%
\footnote{On-disk means either in a traditional file system or in cloud blob storage.}
formats are built using Hail's RVD%
\footnote{
This once referred to ``Region Value Dataset'' which began as a Spark RDD containing off-heap values.
Today, Hail is not tied to Spark and this term has lost its original meaning.
It now only refers to a partitioned \& on-disk data representation.
}
format.
The RVD format describes how to store a partitioned collection of keyed records.
Each partition is stored in a separate file.
The list of partition files and their key intervals are stored in a metadata file.
Every partition uses the same small data on-disk format to store the component records.
Figure \ref{fig:rvd-format} visualizes the RVD format.
The RVD format may optionally have an index located at an arbitrary relative location.
If an index is present, every data partition has one index partition with the same name suffixed by ``.idx''.
Every index partition is itself a B-tree of order 4096.
Each B-tree leaf node is an array of index records.
Each index record must contain the key for, and the offset to, records in the corresponding data partition.
Each index record may also contain arbitrary additional data stored in a field named ``annotation''.
Every B-tree leaf node uses the same small data on-disk format and small datatype which is listed in Listing \ref{fig:index-type}.
\begin{figure}[h]
\begin{lstlisting}[language=Python, caption=The type of an RVD index file., label=fig:index-type]
struct(
first_idx=int64,
keys=array(
struct(
key=KeyType,
offset=int64,
annotation=ExtraDataType)))
\end{lstlisting}
\end{figure}
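To make the index concrete, the following sketch shows how a reader holding a decoded leaf node of the type in Listing \ref{fig:index-type} might locate the partition-file offset for a query key. This is illustrative pseudologic, not Hail's implementation.
\begin{lstlisting}[language=Python]
import bisect

def leaf_lookup(leaf, query_key):
    # leaf['keys'] is sorted by 'key', per the B-tree invariant.
    keys = [record['key'] for record in leaf['keys']]
    i = bisect.bisect_left(keys, query_key)
    if i < len(keys) and keys[i] == query_key:
        return leaf['keys'][i]['offset']
    return None  # the key is absent from this leaf
\end{lstlisting}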
Multiple RVDs may share the same index.
When two RVDs share an index, one of the offsets is stored in the annotation field.
Hail only uses this feature in Matrix Tables.
A Matrix Table's row fields and entry fields are identically partitioned along their shared row key.
The row record offsets are stored in the ``offset'' field.
The entry record offsets are stored under a nested structure in the ``annotation'' field.
\begin{figure}[h]
\begin{forest}
for tree={
font=\ttfamily,
grow'=0,
child anchor=west,
parent anchor=south,
anchor=west,
calign=first,
edge path={
\noexpand\path [draw, \forestoption{edge}]
(!u.south west) +(7.5pt,0) |- node[fill,inner sep=1.25pt] {} (.child anchor)\forestoption{edge label};
},
before typesetting nodes={
if n=1
{insert before={[,phantom]}}
{}
},
fit=band,
before computing xy={l=15pt},
}
[directory
[...]
[name
[metadata.json.gz]
[parts
[part-00-0c960d5b-f57a-4e85-a94c-32cd6989904a]
[part-01-2213c977-f3ba-4f51-bcfb-898f6bb65f2f]
[...]
[part-10-8869e10a-0c88-495c-9ce2-38c4b5aafa11]
]
]
[index,
tikz={
\node [name=foo, draw,gray,fit=()(!l)(!ll), pin={[align=center,pin edge={gray,thick}] right:{Indices are an \\ optional component.}}] {};
}
[part-00-0c960d5b-f57a-4e85-a94c-32cd6989904a.idx
[index]
[metadata.json.gz]]
[part-01-2213c977-f3ba-4f51-bcfb-898f6bb65f2f.idx
[index]
[metadata.json.gz]]
[...]
[part-50-8869e10a-0c88-495c-9ce2-38c4b5aafa11.idx
[index]
[metadata.json.gz]]
]
]
\end{forest}
\caption{The RVD format}
\label{fig:rvd-format}
\end{figure}
The Table on-disk format is, at minimum, a directory containing three files: README.txt, \_SUCCESS, and metadata.json.gz (Figure \ref{fig:table-format}).
The directory may also contain a globals RVD, a rows RVD, an index folder, and a references folder.
The README.txt file is intended for human beings who stumble across the Table.
Hail Query ensures that writing the $0$-byte \_SUCCESS file has a happens-after relationship with the writing of every other file in the Table.
The metadata.json.gz file contains version information, where to find the globals and rows RVDs, the number of records in each partition, and the Table's type.
The globals and rows RVDs may be contained within the Table directory or may reside at another, usually relative, path.
The index folder contains a separate index file for every partition of the dataset.
The references folder contains any non-default Genome References referenced in the Table's type.
The Matrix Table on-disk format contains the same three files, two RVDs, index folder, and references folder as the Table format, but also has cols and entries components (Figure \ref{fig:matrix-table-format}).
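Because the format is ordinary files and JSON, it can be inspected with ordinary tools. A minimal sketch, assuming a Table written to \texttt{name.ht} and making no assumptions about the metadata's field names:
\begin{lstlisting}[language=Python]
import gzip
import json

# Inspect a Table's metadata file; the path is hypothetical.
with gzip.open('name.ht/metadata.json.gz', 'rt') as f:
    metadata = json.load(f)

# Per the description above: version information, component locations,
# per-partition record counts, and the Table's type.
print(sorted(metadata.keys()))
\end{lstlisting}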
\begin{figure}[h]
\begin{forest}
for tree={
font=\ttfamily,
grow'=0,
child anchor=west,
parent anchor=south,
anchor=west,
calign=first,
edge path={
\noexpand\path [draw, \forestoption{edge}]
(!u.south west) +(7.5pt,0) |- node[fill,inner sep=1.25pt] {} (.child anchor)\forestoption{edge label};
},
before typesetting nodes={
if n=1
{insert before={[,phantom]}}
{}
},
fit=band,
before computing xy={l=15pt},
}
[name.ht
[README.txt]
[\_SUCCESS]
[metadata.json.gz]
[globals,
tikz={
\node [name=foo, draw,gray,fit=()(!2)(!ll), pin={[align=center,pin edge={gray,thick}] right:{This is an RVD.}}] {};
}
[metadata.json.gz]
[parts
[part-0]]
]
[rows,
tikz={
\node [name=foo, draw,gray,fit=()(!ll), pin={[align=center,pin edge={gray,thick}] right:{This is also \\ an RVD.}}] {};
}
[metadata.json.gz]
[parts
[part-00-0c960d5b-f57a-4e85-a94c-32cd6989904a] % FIXME: we should not publish a paper with the randomness after a sequential index
[part-01-2213c977-f3ba-4f51-bcfb-898f6bb65f2f]
[...]
[part-50-8869e10a-0c88-495c-9ce2-38c4b5aafa11]
]
]
[index
[part-00-0c960d5b-f57a-4e85-a94c-32cd6989904a.idx
[index]
[metadata.json.gz]]
[part-01-2213c977-f3ba-4f51-bcfb-898f6bb65f2f.idx
[index]
[metadata.json.gz]]
[...]
[part-50-8869e10a-0c88-495c-9ce2-38c4b5aafa11.idx
[index]
[metadata.json.gz]]
]
[references]
]
\end{forest}
\caption{The Table on-disk format}
\label{fig:table-format}
\end{figure}
\begin{figure}[h]
\begin{forest}
for tree={
font=\ttfamily,
grow'=0,
child anchor=west,
parent anchor=south,
anchor=west,
calign=first,
edge path={
\noexpand\path [draw, \forestoption{edge}]
(!u.south west) +(7.5pt,0) |- node[fill,inner sep=1.25pt] {} (.child anchor)\forestoption{edge label};
},
before typesetting nodes={
if n=1
{insert before={[,phantom]}}
{}
},
fit=band,
before computing xy={l=15pt},
}
[name.mt
[README.txt]
[\_SUCCESS]
[metadata.json.gz]
[globals
[README.txt]
[\_SUCCESS]
[metadata.json.gz]
[parts
[part-0]]
]
[cols,
tikz={
\node [name=foo, draw,gray,fit=()(!4)(!ll), pin={[align=center,pin edge={gray,thick}] right:{This is a Table. \\ The globals, rows, and \\ entries are all also Tables}}] {};
}
[README.txt]
[\_SUCCESS]
[metadata.json.gz]
[parts
[part-0]
]
]
[rows
[metadata.json.gz]
[parts
[part-00-0c960d5b-f57a-4e85-a94c-32cd6989904a] % FIXME: we should not publish a paper with the randomness after a sequential index
[part-01-2213c977-f3ba-4f51-bcfb-898f6bb65f2f]
[...]
[part-50-8869e10a-0c88-495c-9ce2-38c4b5aafa11]
]
]
[entries
[metadata.json.gz]
[parts
[part-00-0c960d5b-f57a-4e85-a94c-32cd6989904a] % FIXME: we should not publish a paper with the randomness after a sequential index
[part-01-2213c977-f3ba-4f51-bcfb-898f6bb65f2f]
[...]
[part-50-8869e10a-0c88-495c-9ce2-38c4b5aafa11]
]
]
[index
[...]]
]
\end{forest}
\caption{The Matrix Table on-disk format}
\label{fig:matrix-table-format}
\end{figure}
\begin{figure}[h]
\begin{forest}
for tree={
font=\ttfamily,
grow'=0,
child anchor=west,
parent anchor=south,
anchor=west,
calign=first,
edge path={
\noexpand\path [draw, \forestoption{edge}]
(!u.south west) +(7.5pt,0) |- node[fill,inner sep=1.25pt] {} (.child anchor)\forestoption{edge label};
},
before typesetting nodes={
if n=1
{insert before={[,phantom]}}
{}
},
fit=band,
before computing xy={l=15pt},
}
[name.bm
%[README.txt] % FIXME: block matrices should write README.txt files
[\_SUCCESS]
[metadata.json] % FIXME: block matrices should compress their metadata files
[parts
[part-00-0c960d5b-f57a-4e85-a94c-32cd6989904a] % FIXME: we should not publish a paper with the randomness after a sequential index
[part-01-2213c977-f3ba-4f51-bcfb-898f6bb65f2f]
[...]
[part-50-8869e10a-0c88-495c-9ce2-38c4b5aafa11]
]
]
\end{forest}
\caption{The Block Matrix on-disk format}
\label{fig:block-matrix-format}
\end{figure}
\subsection{Hail Batch}
\section{Future Work}
\subsection{Tensor Table}
\subsection{Differentiable Programming}
\cite{dremel}
\cite{numpywren}
\cite{rdd}
\cite{towards-scalable-dataframe-systems}
\nocite{*}
\addtocontents{toc}{\protect\vspace{\beforebibskip}}
\addcontentsline{toc}{section}{\refname}
\printbibliography
\end{document}