diff --git a/grobid_client/format/TEI2LossyJSON.py b/grobid_client/format/TEI2LossyJSON.py index ba4b448..76154ee 100644 --- a/grobid_client/format/TEI2LossyJSON.py +++ b/grobid_client/format/TEI2LossyJSON.py @@ -877,7 +877,7 @@ def traverse_and_collect(node, current_pos=0): # The reference text was also cleaned, so we need to find it in the final cleaned text # We can search around the original position to find the correct occurrence search_start = max(0, ref['offset_start'] - 10) # Look a bit before the original position - search_end = min(len(final_text), ref['offset_start'] + 10) # Look a bit after + search_end = min(len(final_text), ref['offset_end'] + 10) # Look a bit after search_area = final_text[search_start:search_end] # Find the reference in the search area diff --git a/tests/resources/refs_offsets/2021.naacl-main.224.grobid.tei.xml b/tests/resources/refs_offsets/2021.naacl-main.224.grobid.tei.xml new file mode 100644 index 0000000..49329a9 --- /dev/null +++ b/tests/resources/refs_offsets/2021.naacl-main.224.grobid.tei.xml @@ -0,0 +1,999 @@ + + + + + + Incorporating External Knowledge to Enhance Tabular Reasoning + + Verisk Inc. + + + National Science Foundation + NSF + + + + + + + + + + + JNeeraja + jneeraja@iitg.ac.in + + IIT Guwahati + University of Utah + University of Utah + + + + VivekGupta + vgupta@cs.utah.edu + + IIT Guwahati + University of Utah + University of Utah + + + + VivekSrikumar + + IIT Guwahati + University of Utah + University of Utah + + + Incorporating External Knowledge to Enhance Tabular Reasoning + + + + + + + A865E57304B72949D7A3BC3FC4FB3F75 + + + + + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + +

Reasoning about tabular information presents unique challenges to modern NLP approaches which largely rely on pre-trained contextualized embeddings of text. In this paper, we study these challenges through the problem of tabular natural language inference. We propose easy and effective modifications to how information is presented to a model for this task. We show via systematic experiments that these strategies substantially improve tabular inference performance.

+
+
+
+ + +
Introduction

Natural Language Inference (NLI) is the task of determining if a hypothesis sentence can be inferred as true, false, or undetermined given a premise sentence (Dagan et al., 2013). Contextual sentence embeddings such as BERT (Devlin et al., 2019) and RoBERTa (Liu et al., 2019), applied to large datasets such as SNLI (Bowman et al., 2015) and MultiNLI (Williams et al., 2018), have led to near-human performance of NLI systems.

In this paper, we study the harder problem of reasoning about tabular premises, as instantiated in datasets such as TabFact (Chen et al., 2019) and InfoTabS (Gupta et al., 2020). This problem is similar to standard NLI, but the premises are Wikipedia tables rather than sentences. Models similar to the best ones for the standard NLI datasets struggle with tabular inference. Using the InfoTabS dataset as an example, we present a focused study that investigates (a) the poor performance of existing models, (b) connections to information deficiency in the tabular premises, and, (c) simple yet effective mitigations for these problems.

We use the table and hypotheses in Figure 1 as a running example through this paper, and refer to the left column as its keys.

foot_0 Tabular inference is challenging for several reasons: (a) Poor table representation: The table does not explicitly state the relationship between the keys and values. (b) Missing implicit lexical knowledge due to limited training data: This affects interpreting words like 'fewer', and 'over' in H1 and H2 respectively. (c) Presence of distracting information: All keys except No. of listings are unrelated to the hypotheses H1 and H2. (d) Missing domain knowledge about keys: We need to interpret the key Volume in the financial context for this table.

In the absence of large labeled corpora, any modeling strategy needs to explicitly address these problems. In this paper, we propose effective approaches for addressing them, and show that they lead to substantial improvements in prediction quality, especially on adversarial test sets. This focused study makes the following contributions:

1. We analyse why the existing state-of-the-art BERT class models struggle on the challenging task of NLI over tabular data. 2. We propose solutions to overcome these challenges via simple modifications to inputs using existing language resources.

3. Through extensive experiments, we show significant improvements to model performance, especially on challenging adversarial test sets. The updated dataset, along with associated scripts, are available at https://github.com/utahnlp/knowledge_infotabs.

+
Challenges and Proposed Solutions

We examine the issues highlighted in §1 and propose simple solutions to mitigate them below.

Better Paragraph Representation (BPR): One way to represent the premise table is to use a universal template to convert each row of the table into sentence which serves as input to a BERT-style model. Gupta et al. (2020) suggest that in a table titled t, a row with key k and value v should be converted to a sentence using the template: "The k of t are v." Despite the advantage of simplicity, the approach produces ungrammatical sentences.

In our example, the template converts the Founded row to the sentence "The Founded of New York Stock Exchange are May 17, 1792; 226 years ago.".

We note that keys are associated with values of specific entity types such as MONEY, DATE, CAR-DINAL, and BOOL, and the entire table itself has a category. Therefore, we propose type-specific templates, instead of using the universal one. 2 In our example, the table category is Organization and the key Founded has the type DATE. A better template for this key is "t was k on v", which produces the more grammatical sentence "New York Stock Exchange was Founded on May 17, 1792; 226 years ago.". Furthermore, we observe that including the table category information i.e. "New York Stock Exchange is an Organization." helps in better premise context understanding. 3 Appendix A provides more such templates.

+
Implicit Knowledge Addition (KG implicit):

Tables represent information implicitly; they do not employ connectives to link their cells. As a result, a model trained only on tables struggles to make lexical inferences about the hypothesis, such as the difference between the meanings of 'before' and 'after', and the function of negations. This is surprising, because the models have the benefit of being pre-trained on large textual corpora. Recently, Andreas (2020) and Pruksachatkun et al. (2020) showed that we can pre-train models on specific tasks to incorporate such implicit knowledge. Eisenschlos et al. (2020) use pre-training on synthetic data to improve the performance on the TabFact dataset. Inspired by these, we first train our model on the large, diverse and human-written MultiNLI dataset. Then, we fine tune it to the InfoTabS task. Pre-training with MultiNLI data exposes the model to diverse lexical constructions. Furthermore, it increases the training data size by 433K (MultiNLI) example pairs. This makes the representation better tuned to the NLI task, thereby leading to better generalization.

Distracting Rows Removal (DRR) Not all premise table rows are necessary to reason about a given hypothesis. In our example, for the hypotheses H1 and H2, the row corresponding to the key No. of listings is sufficient to decide the label for the hypothesis. The other rows are an irrelevant distraction. Further, as a practical concern, when longer tables are encoded into sentences as described above, the resulting number of tokens is more than the input size restrictions of existing models, leading to useful rows potentially being cropped. Appendix F shows one such example on the InfoTabS. Therefore, it becomes important to prune irrelevant rows.

To identify relevant rows, we employ a simplified version of the alignment algorithm used by Yadav et al. (2019, 2020) for retrieval in reading comprehension.

First, every word in the hypothesis sentence is aligned with the most similar word in the table sentences using cosine similarity. We use fastText (Joulin et al., 2016; Mikolov et al., 2018) embeddings for this purpose, which preliminary experiments revealed to be better than other embeddings. Then, we rank rows by their similarity to the hypothesis, by aggregating similarity over content words in the hypothesis. Yadav et al. (2019) used inverse document frequency for weighting words, but we found that simple stop word pruning was sufficient. We took the top k rows by similarity as the pruned representative of the table for this hypothesis. The hyper-parameter k is selected by tuning on a development set. Appendix B gives more details about these design choices.

+
Explicit Knowledge Addition (KG explicit):

We found that adding explicit information to enrich keys improves a model's ability to disambiguate and understand them. We expand the pruned table premises with contextually relevant key information from existing resources such as WordNet (definitions) or Wikipedia (first sentence, usually a definition). 4To find the best expansion of a key, we use the sentential form of a row to obtain the BERT embedding (on-the-fly) for its key. We also obtain the BERT embeddings of the same key from WordNet examples (or Wikipedia sentences). 5 Finally, we concatenate the WordNet definition (or the Wikipedia sentence) corresponding to the highest key embedding similarity to the table. As we want the contextually relevant definition of the key, we use the BERT embeddings rather than noncontextual ones (e.g., fastText). For example, the key volume can have different meanings in various contexts. For our example, the contextually best definition is "In capital markets, volume, is the total number of a security that was traded during a given period of time." rather than the other definition "In thermodynamics, the volume of a system is an extensive parameter for describing its thermodynamic state.".

+
Experiment and Analysis

Our experiments are designed to study the research question: Can today's large pre-trained models exploit the information sources described in §2 to better reason about tabular information?

+
Experimental setup

Datasets Our experiments use InfoTabS, a tabular inference dataset from Gupta et al. (2020). The dataset is heterogeneous in the types of tables and keys, and relies on background knowledge and common sense. Unlike the TabFact dataset (Chen et al., 2019), it has all three inference labels, namely entailment, contradiction and neutral. Importantly, for the purpose of our evaluation, it has three test sets. In addition to the usual development set and the test set (called α 1 ), the dataset has two adversarial test sets: a contrast set α 2 that is lexically similar to α 1 , but with minimal changes in the hypotheses and flip entail-contradict label, and a zero-shot set α 3 which has long tables from different domains with little key overlap with the training set.

Models For a fair comparison with earlier baselines, we use RoBERTa-large (RoBERTa L ) for all our experiments. We represent the premise table by converting each table row into a sentence, and then appending them into a paragraph, i.e. the Para representation of Gupta et al. (2020).

Hyperparameters Settingsfoot_5 For the distracting row removal (+DRR) step, we have a hyperparameter k.

We experimented with k ∈ {2, 3, 4, 5, 6}, by predicting on +DRR development premise on model trained on original training set (i.e. BPR), as shown in Table 1. The development accuracy increases significantly as k increases from 2 to 4 and then from 4 to 6, increases marginally (1.5% improvement). Since our goal is to remove distracting rows, we use the lowest hyperparameter with good performance i.e. k = 4.foot_6 .

Train Dev k = 2 k = 3 k = 4 k = 5 k = 6 BPR DRR 71.72 74.83 77.50 78.50 79.00 Table 1: Dev accuracy on increasing hyperparameter k.

+
Results and Analysis

Table 2 shows the results of our experiments. Premise Dev α1 α2 α3 Human 79.78 84.04 83.88 79.33 Para 75.55 74.88 65.55 64.94 BPR 76.42 75.29 66.50 64.26 +KG implicit 79.57 78.27 71.87 66.77 +DRR 78.77 78.13 70.90 68.98 +KG explicit 79.44 78.42 71.97 70.03

Table 2: Accuracy with the proposed modifications on the Dev and test sets. Here, + represents the change with respect to the previous row. Reported numbers are the average over three random seed runs with standard deviation of 0.33 (+KG explicit), 0.46 (+DRR), 0.61 (+KG implicit), 0.86 (BPR), over all sets. All improvements are statistically significant with p < 0.05, except α 1 for BPR representation w.r.t to Para (Original). Here the Human and Para results are taken from Gupta et al. (2020).

BPR As shown in Table 2, with BPR, we observe that the RoBERTa L model improves performance on all dev and test sets except α 3 . There are two main reasons behind this poor performance on α 3 .

First, the zero-shot α 3 data includes unseen keys. The number of keys common to α 3 and the training set is 94, whereas for, dev, α 1 and α 2 it is 334, 312, and 273 respectively (i.e., 3-5 times more). Second, despite being represented by better sentences, due to the input size restriction of RoBERTa L some relevant rows are still ignored.

+
KG implicit

We observe that implicit knowledge addition via MNLI pre-training helps the model reason and generalize better. From Table 2, we can see significant performance improvement in the dev and all three test sets.

DRR This leads to significant improvement in the α 3 set. We attribute this to two primary reasons: First, α 3 tables are longer (13.1 keys per table on average, vs. 8.8 keys on average in the others), and DRR is important to avoid automatically removing keys from the bottom of a table due to the limitations in RoBERTa L model's input size. Without these relevant rows, the model incorrectly predicts the neutral label. Second, α 3 is a zero-shot dataset and has significant proportion of unseen keys which could end up being noise for the model. The slight decrease in performance on the dev, α 1 and α 2 sets can be attributed to model utilising spurious patterns over irrelevant keys for prediction. 8 We validated this experimentally by testing the original premise trained model on the DRR test tables. Table 5 in the Appendix C shows that without pruning, the model focuses on irrelevant rows for prediction.

+
KG explicit

With explicit contextualized knowledge about the table keys, we observe a marginal improvement in dev, α 1 test sets and a significant performance gain on the α 2 and α 3 test sets. Improvement in the α 3 set shows that adding external knowledge helps in the zero-shot setting. With α 2 , the model can not utilize spurious lexical correlations 9 due to its adversarial nature, and is forced to use the relevant keys in the premise tables, thus 8 Performance drop of dev and α2 is also marginal i.e. (dev: 79.57 to 78.77, α1: 78.27 to 78.13, α2: 71.87 to 70.90), as compared to InfoTabS WMD-top3 i.e (dev: 75.5 to 72.55,α1: 74.88 to 70.38, α2: 65.44 to 62.55), here WMD-top3 performance numbers are taken from Gupta et al. (2020). 9 The hypothesis-only baseline for α2 is 48.5% vs. α1: 60.5 % and dev: 60.5 % (Gupta et al., 2020) adding explicit information about the key improves performance more for α 2 than α 1 or dev. Appendix F shows some qualitative examples.

+
Ablation Study

We perform an ablation study as shown in table 3, where instead of doing all modification sequentially one after another (+), we do only one modification at a time to analyze its effects.

Through our ablation study we observe that: (a) DRR improves performance on the dev, α 1 , and α 2 sets, but slightly degrades it on the α 3 set. The drop in performance on α 3 is due to spurious artifact deletion as explained in details in Appendix E. (b) KG explicit gives performance improvement in all sets. Furthermore, there is significant boost in performance of the adversarial α 2 and α 3 sets.foot_7 (c) Similarly, KG implicit shows significant improvement in all test sets. The large improvements on the adversarial sets α 2 and α 3 sets, suggest that the model can now reason better. Although, implicit knowledge provides most performance gain, all modifications are needed to obtain the best performance for all sets (especially on the α 3 set).

+
Comparison with Related Work

Recently, there have been many papers which study several NLP tasks on semi-structured tabular data. These include tabular NLI and fact verification tasks such as TabFact (Chen et al., 2019), and InfoTabS (Gupta et al., 2020), various question answering and semantic parsing tasks (Pasupat and Liang, 2015; Krishnamurthy et al., 2017; Abbas et al., 2016; Sun et al., 2016; Chen et al., 2020; Lin et al., 2020, inter alia), and table-to-text generation and its evaluation (e.g., Parikh et al., 2020; Radev et al., 2020). Several models for better representation of tables such as TAPAS (Herzig et al., 2020), TaBERT (Yin et al., 2020), and TabStruc (Zhang et al., 2020) were recently proposed. Yu et al. (2018, 2021) and Eisenschlos et al. (2020) study pre-training for improving tabular inference, similar to our MultiNLI pre-training.

The proposed modifications in this work are simple and intuitive. Yet, existing table reasoning papers have not studied the impact of such input modifications. Furthermore, much of the recent work focuses on building sophisticated neural models, without explicit focus on how these models (designed for raw text) adapt to the tabular data. In this work, we argue that instead of relying on the neural network to "magically" work for tabular structures, we should carefully think about the representation of semi-structured data, and the incorporation of both implicit and explicit knowledge into neural models. Our work highlights that simple pre-processing steps are important, especially for better generalization, as evident from the significant improvement in performance on adversarial test sets with the same RoBERTa models. We recommend that these pre-processing steps should be standardized across table reasoning tasks.

+
Conclusion & Future Work

We introduced simple and effective modifications that rely on introducing additional knowledge to improve tabular NLI. These modifications govern what information is provided to a tabular NLI model and how the given information is presented to the model. We presented a case study with the recently published InfoTabS dataset and showed that our proposed changes lead to significant improvements. Furthermore, we also carefully studied the effect of these modifications on the multiple test-sets, and why a certain modification seems to help a particular adversarial set.

We believe that our study and proposed solutions will be valuable to researchers working on question answering and generation problems involving both tabular and textual inputs, such as tabular/hybrid question answering and table-to-text generation, especially with difficult or adversarial evaluation. Looking ahead, our work can be extended to include explicit knowledge for hypothesis tokens as well. To increase robustness, we can also integrate structural constraints via data augmentation through NLI training. Moreover, we expect that structural information such as position encoding could also help better represent tables.

+
A BPR Templates

Here, we are listing down some of the diverse example templates we have framed.

• For the

table category Bus/Train Lines and key Disabled access with BOOL value YES, follow template: "t has k." Orignal Premise Sentence "The Disabled access of Tukwila International Boulevard Station are Yes." BPR Sentence "Tukwila International Boulevard Station has Disabled access." • For the table category Movie and key Box office with MONEY type, follow template: "In the k, t made v." Orignal Premise Sentence "The Box office of Brokeback Mountain are $178.1 million." BPR Sentence "In the Box office, Brokeback Mountain made $178.1 million." • For the table category City and key Total with CARDINAL type, follow template: "The k area of t is v." Orignal Premise Sentence "The Total of Cusco are 435,114." BPR Sentence "The Total area of Cusco is 435,114." • For the table category Painting and key Also known as, follow template: "The k of t is v." Orignal Premise Sentence "The Also known as of Et in Arcadia ego are Les Bergers d'Arcadie." BPR Sentence "Et in Arcadia ego is Also known as Les Bergers d'Arcadie." • For the table category Person and key Died with DATE type , follow template: "t k on v." Orignal Premise Sentence "The Died of Jesse Ramsden are November 1800 (1800-11-05) (aged 65) Brighton, Sussex."

+
BPR Sentence "Jesse Ramsden Died on 5

November 1800 (1800-11-05) (aged 65) Brighton, Sussex." B DRR: fastText and Binary weighting fastText: For word representation, (Yadav et al., 2019) have used BERT and Glove embeddings. In our case, we prefer to use fastText word embeddings over Glove because fastText embedding uses sub-word information which helps in capturing different variations of the context words. Furthermore, fastText embeddings are also a better choice than BERT for our task because 1. Firstly, we are embedding single sentential form of diverse rows instead of longer context similar paragraphs, 2. Secondly, all words (especially keys) of the rows across all the tables are used only in one context, whereas BERT is useful when the same word is used with different contexts across paragraphs, 3. Thirdly, in all tables, the number of sentences to select from is bounded by maximum rows in the table, which is a small number (8.8 in train, dev, α 1 , α 2 and 13.1 in α 3 ), and 4. Lastly, using fastText is much faster to compute than BERT for obtaining embeddings.

Binary weighting: Since we are embedding single sentential form of diverse rows instead of longer context related paragraphs, we found that using binary weighting 0 for stop words and 1 for others is more effective than the idf weighting, which is useful only for longer paragraph context with several lexical terms.

+
C Hyperparameters k vs test-sets accuracy

We also trained and tested a model on the DRR table premise for increasing values of the hyper parameter k, as shown in Table 1. We also test the model trained on the entire para on pruned para with increasing value of hyperparameters k ∈ {2, 3, 4, 5, 6} for the test sets α 1 , α 2 , and α 3 . In all cases, except α 3 , the performance with larger k is better. The increase in performance, even with k > 4, shows that the model is using more than the required keys for prediction. Thus, the model is utilising the spurious pattern in irrelevant rows for the prediction. Table 5: Accuracy of model trained with original table but tested with DRR table with increasing hyper parameter k on all test sets.

+
D TabFact Representation Experiment

Table 6 shows the effect of implicit knowledge addition on the non-para Struc representation, i.e. a linearized key-value representation of the form "key k : value v", with rows separated by a semicolon ";" (Gupta et al., 2020; Chen et al., 2019). Here too, the implicit knowledge addition leads to improvement in performance on all the sets.

+
E Artifacts and Model Predictions

In Table 7 we show the percentage of examples which were corrected after modification and vice versa. Surprisingly, there is a small percentage of examples which were predicted correctly earlier with the original premise (Para) but are predicted wrongly after all the modifications (Mod), although such examples are far fewer than the opposite case. We suspect that the earlier model was also relying on spurious patterns (artifacts) for correct prediction on these examples, which are now corrupted after the proposed modifications. Hence, the new model struggles to predict correctly on such examples.

Para Mod Dev α1 α2 α3 × 6.77 7.83 9.27 10.01 × 10.94 12.55 14.33 16.05

Table 7: Correct vs Incorrect Predictions for Para model (Gupta et al., 2020) and the model after the modifications (Mod).

In the next section F, we also show qualitative examples, where modification helps the model predict correctly. We also provide some examples via distracting row removal modification, where the model fails after modification.

+
F Qualitative Examples

In this section, we provide examples where model is able to predict well after the proposed modifications. We also provide some examples, where model struggles to make the correct prediction after distracting row removal (DRR) modification. Table 11: Prediction after DRR. Here, + represents the change with respect to the previous row.

+
Result and Explanation

In this example from the α 3 set, removing distracting rows (sentences except the ones in green and blue) definitely helps, as they are irrelevant distracting noise and also make the premise paragraph longer than BERT's maximum tokenization limit. Before DRR is applied, the model predicts neutral due to a) distracting rows and b) required information, i.e. the relevant key rows highlighted in green, being removed due to the maximum tokenization limit (it is the second-last sentence). However, after DRR, the pruned information retained is only the relevant keys highlighted in green, and thus the model is able to predict the correct label.

+
Negative Example

In some examples, distracting row removal (DRR) removes relevant rows, and hence the model failed to predict correctly on the DRR premise, as shown below: Table 13: Prediction after DRR. Here, + represents the change with respect to the previous row.

+
Original Premise
1 as a running example through this paper, and re- * *The first two authors contributed equally to the work. The first author was a remote intern at University of Utah during the work. has fewer than 3,000 stocks listed. H2: Over 2,500 stocks are listed in the NYSE. H3: S&P 500 stock trading volume is over $10 trillion.
+
Figure 1: A tabular premise example. The hypothesis H1 is entailed by it, H2 is a contradiction and H3 is neutral i.e. neither entailed nor contradictory.
+
F. 1BPR Original Premise The Birth name of Eva Mendes are Eva de la Caridad Méndez. Eva Mendes was Born on March 5, 1974 (1974-03-05) (age 44) Miami, Florida, U.S.. The Occupation of Eva Mendes are Actress, model, businesswoman. The Years active of Eva Mendes are 1998 -present. The Partner(s) of Eva Mendes are Ryan Gosling (2011 -present). The Children of Eva Mendes are 2. Better Paragraph Premise Eva Mendes is a person. The birth name of Eva Mendes is Eva de la Caridad Méndez. Eva Mendes was born on March 5, 1974 (1974-03-05) (age 44) Miami, Florida, U.S.. The occupation of Eva Mendes is Actress, model, businesswoman. The years active of Eva Mendes was on 1998 -present. The partner(s) of Eva Mendes is Ryan Gosling (2011 -present). The number of children of Eva Mendes are 2.Hypothesis Eva Mendes has two children.
+
Table 3 :Ablation results with individual modifications.11
+
Table 4 :Dev accuracy with increasing hyper parameter k trained with both BPR and +DRR table.Train Devk=2 k=3 k=4 k=5 k=6+DRR +DRR 77.61 77.94 78.16 78.38 79.00BPR +DRR 71.72 74.83 77.50 78.50 79.00
+
Table 6 :Accuracy on InfoTabS data for Struc representation of Tables. Here, + represents the change with respect to the previous row.PremiseDevα1α2α3Struc77.61 75.06 69.02 64.61+ KG implicit 79.55 78.66 72.33 70.44
+
Table 8 :Prediction after BPR. Here, + represents the change with respect to the previous row.Result and Explanation In this example fromα 2 , the model predicts Neutral for this hypothe-sis with orignal premise. However, forming bettersentences by adding the "number of children are2" (highlighted as green) in case of CARDINALtype for the category PERSON helps the modelunderstand the relation and reasoning behind thechildren and the number two and arrive at the cor-rect prediction of entailment.
+
Table 12 :Et in Arcadia ego is a painting. Et in Arcadia ego is also known as Les Bergers d'Arcadie. Et in Arcadia ego is a painting. The artist of Et in Arcadia ego is Nicolas Poussin. The medium of Et in Arcadia ego is oil on canvas. The dimensions of Et in Arcadia ego is 87 cm 120 cm (34.25 in 47.24 in).Prediction after DRR. Here, + represents the change with respect to the previous row.Result and ExplanationIn this example from the Dev set, the DRR technique used removes the required key "Location" (highlighted in red) from the para representation. Hence, the model here predicts neutral as the information regarding where the painting is stored i.e. "Location" is removed in the DRR, which the model require for making the correct inference. While in original para, this information is still present and the model is able to arrive at the correct label. Another interesting observation is RoBERTa L knows Musee du Louvre is a museum in the United Kingdom, showing sign of world-knowledge.Negative Example In another negative examples distracting row removal for DRR got the relevant rows correct but still the model failed to predict correct label due to spurious correlation, as shown below:OriginalPremise Idiocracy is a movie. Idiocracy was directed by Mike Judge. Idiocracy was produced by Mike Judge, Elysa Koplovitz, Michael Nelson. Idiocracy was written by Etan Cohen, Mike Judge. Idiocracy was starring Luke Wilson, Maya Rudolph, Dax Shepard. Idiocracy was music by Theodore Shapiro. The cinematography of Idiocracy was by Tim Suhrstedt. Idiocracy was edited by David Rennie. The production company of Idiocracy is Ternion. Idiocracy was distributed by 20th Century Fox. The release date of Idiocracy is September 1, 2006. The running time of Idiocracy is 84 minutes. The country of Idiocracy is United States. The language of Idiocracy is English. The budget of Idiocracy is $2-4 million. In the box office, Idiocracy made $495,303 (worldwide). 
Idiocracy was directed by Mike Judge. Idiocracy was produced by Mike Judge, Elysa Koplovitz, Michael Nelson. Idiocracy was written by Etan Cohen, Mike Judge. Idiocracy was edited by David Rennie.The artist of Et in Arcadia ego is Nicolas Poussin. Theyear of Et in Arcadia ego is 1637 -1638. The medium ofEt in Arcadia ego is oil on canvas. The dimensions of Etin Arcadia ego is 87 cm 120 cm (34.25 in 47.24 in). Thelocation of Et in Arcadia ego is Musee du Louvre.Hypothesis The art piece Et in Arcadia ego is stored in the United Kingdom.Distracting Row Removal (DRR) Hypothesis Idiocracy was directed and written by the same person.PremiseLabelPremiseLabelHuman Label (Gold) ContradictionHuman Label (Gold) EntailedOrignal PremiseContradictionOrignal PremiseEntailed+DRRNeutral+DRRNeutral

Distracting Row Removal (DRR)

+

Keys in the InfoTabS tables are similar to column headers in the TabFact database-style tables.

+

The construction of the template sentences based on entity type is a one-time manual step.

+

This category information is provided in the InfoTabS and TabFact datasets. For other datasets, it can be inferred easily by clustering over the keys of the training tables.

+

Usually multi-word keys are absent in WordNet, in this case we use Wikipedia. The WordNet definition of each word in the key is used if the multi-word key is absent in Wikipedia.

+

We prefer using WordNet examples over definition for BERT embedding because (a) an example captures the context in which key is used, and (b) the definition may not always contain the key tokens.

+

Appendix C has more details about hyperparameters.

+

Indeed, the original InfoTabs work points out that no more than four rows in a table are needed for any hypothesis.

+

The KG explicit step is performed only for relevant keys (after DRR).

+

We show in Appendix D, Table6, that implicit knowledge addition to a non-sentential table representation i.e. Struc(Chen et al., 2019;Gupta et al., 2020) leads to performance improvement as well.

+ + + +
+
Acknowledgements

We thank members of the Utah NLP group for their valuable insights and suggestions at various stages of the project, and the reviewers for their helpful comments. We also acknowledge the support of NSF grants #1801446 (SATC) and #1822877 (Cyberlearning) and a generous gift from Verisk Inc.

+
+ + + 1801446 + + + 1822877 + + +
+

Table 10: Prediction on Hypothesis B (from α 2 ). Here, + represents the change with respect to the previous row

+
Result and Explanation

In this example from α 2 , the model without implicit knowledge and the model with implicit knowledge addition both predict the correct label on Hypothesis A. However, for Hypothesis B, which is also an example from α 2 and was originally generated by replacing the word "over" with the word "under" in Hypothesis A and flipping the gold label from entail to contradiction, the earlier model, which relies on artifacts over lexical patterns, predicts the original (now wrong) label entail instead of contradiction. On adding implicit knowledge while training, the model is now able to reason rather than relying on artifacts and correctly predicts contradiction. Note that both Hypothesis A and Hypothesis B require exactly the same reasoning for inference, i.e., they are equally hard. The discovery of Fluorine is André-Marie Ampère (1810).

The first isolation of Fluorine is Henri Moissan (June 26, 1886). The named by of Fluorine is Humphry Davy.

+
Distracting Row Removal (DRR)

The first isolation of Fluorine is Henri Moissan (June 26, 1886). The group of Fluorine is group 17 (halogens). The discovery of Fluorine is André-Marie Ampère (1810). Fluorine was ionization energies on 1st: 1681 kJ/mol, 2nd: 3374 kJ/mol, 3rd: 6147 kJ/mol, (more).

Hypothesis Flourine was discovered in the 18th century.

+
Result and Explanation

In this example from the Dev set, the model before DRR predicts the correct label; however, after DRR it predicts the incorrect label of neutral, despite the fact that both of the relevant rows required for inference (highlighted in green) are present after DRR. This shows that the model is looking at more keys than required in the initial case; these are eliminated by DRR, which forces the model to change its prediction. Thus, the model is utilising spurious correlations from irrelevant rows to predict the label.

+
Original Premise

Julius Caesar was born on 12 or 13 July 100 BC Rome. Julius Caesar died on 15 March 44 BC (aged 55) Rome. The resting place of Julius Caesar is Temple of Caesar, Rome. The spouse(s) of Julius Caesar are Cornelia (84-69 BC; her death), Pompeia (67-61 BC; divorced), Calpurnia (59-44 BC; his death). Orignal Premise + KG explicit Julius Caesar died on 15 March 44 BC (aged 55) Rome. The resting place of Julius Caesar is Temple of Caesar, Rome. Julius Caesar was born on 12 or 13 July 100 BC Rome. The spouse(s) of Julius Caesar Cornelia (84-69 BC; her death), Pompeia (67-61 BC; divorced), Calpurnia (59-44 BC; his death). KEY: Died is defined as pass from physical life and lose all bodily attributes and functions necessary to sustain life . KEY: Resting place is defined as a cemetery or graveyard is a place where the remains of dead people are buried or otherwise interred . KEY: Born is defined as british nuclear physicist (born in germany) honored for his contributions to quantum mechanics (1882-1970) . KEY:

Spouse is defined as a spouse is a significant other in a marriage, civil union, or common-law marriage .

Hypothesis Julius Caesar was buried in Rome.

+
Model Label

Human Label (Gold) Entailed Original Premise Neutral + KG explicit Entailed Table 14: Prediction after KG explicit addition. Here, + represents the change with respect to the previous row.

+
Result and Explanation

In this example from α 2 , the model without explicit knowledge predicts neutral for the hypothesis because it is not able to infer that a resting place is where people are buried, i.e., it implicitly lacks an understanding of the buried key. On explicit KG addition (highlighted as blue + green), we add the definition of resting place as the place where the remains of the dead are buried (highlighted as green). Now the model uses this extra information (highlighted as green) plus the original key related to death (highlighted in bold) to correctly infer that the statement Caesar is buried in Rome is entailed.

Feen, -in, -yn) and(FLOR-een, -in, -yn).The allotropes of Fluorine is alpha, beta. The appearance of Fluorine is gas: very pale yellow , liquid: bright yellow , solid: alpha is opaque, beta is transparent. The standard atomic weight are, std(f) of Fluorine is 18.998403163(6). The atomic number (z) of Fluorine is 9. The group of Fluorine is group 17 (halogens). The period of Fluorine is period 2. The block of Fluorine is p-block. The element category of Fluorine is Reactive nonmetal. The electron configuration of Fluorine is [He] 2s 2 2p 5. The electrons per shell of Fluorine is 2, 7. The phase at stp of Fluorine is gas. The melting point of Fluorine is (F-2) 53.48 K (-219.67 °C, -363.41 °F). The boiling point of Fluorine is (F 2 ) 85.03 K (-188.11 °C, -306.60 °F). The density (at stp) of Fluorine is 1.696 g/L. The when liquid (at b.p.) of Fluorine is 1.505 g/cm 3. The triple point of Fluorine is 53.48 K, 90 kPa. The critical point of Fluorine is 144.41 K, 5.1724 MPa. The heat of vaporization of Fluorine is 6.51 kJ/mol. The molar heat capacity of Fluorine is C p : 31 J/(mol•K) (at 21.1 °C) , C v : 23 J/(mol•K) (at 21.1 °C). The oxidation states of Fluorine is -1 (oxidizes oxygen). The electronegativity of Fluorine is Pauling scale: 3.98. Fluorine was ionization energies on 1st: 1681 kJ/mol, 2nd: 3374 kJ/mol, 3rd: 6147 kJ/mol, (more). The covalent radius of Fluorine is 64 pm. The van der waals radius of Fluorine is 135 pm. The natural occurrence of Fluorine is primordial. The thermal conductivity of Fluorine is 0.02591 W/(m•K). The magnetic ordering of Fluorine is diamagnetic (-1.2×10 -4 ). The cas number of Fluorine is 7782-41-4. The naming of Fluorine is after the mineral fluorite, itself named after Latin fluo (to flow, in smelting).
+
Table 9 :Hypothesis A Janet Leigh's career spanned over 55 years long.Prediction on Hypothesis A. Here, + represents the change with respect to the previous rowHypothesis B Janet Leigh's career spanned under 55years long.PremiseLabelHuman Label (Gold) EntailedOrignal PremiseEntailed+ KG implicitEntailedPremiseLabelHuman Label (Gold) ContradictionOrignal PremiseEntailed+ KG implicitContradiction
+
+
+ + + + + + + MKFaheem Abbas + + + MMalik + + + RizwanRashid + + + Zafar + + Wikiqa -a question answering system on wikipedia using freebase, dbpedia and infobox + + INTECH + 2016. 2016 + + + + Sixth International Conference on Innovative Computing Technology + + + + + Good-enough compositional data augmentation + + JacobAndreas + + 10.18653/v1/2020.acl-main.676 + + + Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics + the 58th Annual Meeting of the Association for Computational Linguistics + + Online. Association for Computational Linguistics + 2020 + + + + + + + + A Large Annotated Corpus for Learning Natural Language Inference + + RSamuel + + + GaborBowman + + + ChristopherAngeli + + + ChristopherDPotts + + + Manning + + 10.18653/v1/D15-1075 + + + Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing + the 2015 Conference on Empirical Methods in Natural Language Processing + + 2015 + + + + + + + Tabfact: A large-scale dataset for table-based fact verification + + WenhuChen + + + HongminWang + + + JianshuChen + + + YunkaiZhang + + + HongWang + + + ShiyangLi + + + XiyouZhou + + + WilliamYang + + + Wang + + + + International Conference on Learning Representations + + 2019 + + + + + + + + WenhuChen + + + HanwenZha + + + ZhiyuChen + + + WenhanXiong + + + HongWang + + + WilliamWang + + Hybridqa: A dataset of multi-hop question answering over tabular and textual data. Findings of EMNLP + + 2020. 
2020 + + + + + + + Recognizing textual entailment: Models and applications + + IdoDagan + + + DanRoth + + + MarkSammons + + + FabioMassimoZanzotto + + + + Synthesis Lectures on Human Language Technologies + + 6 + 4 + + 2013 + + + + + + + BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding + + JacobDevlin + + + Ming-WeiChang + + + KentonLee + + + KristinaToutanova + + 10.18653/v1/N19-1423 + + + Proceedings of the 2019 Conference of the North American Chapter + the 2019 Conference of the North American Chapter + + Human Language Technologies + 2019 + + + + + + + Understanding tables with intermediate pre-training + + JulianEisenschlos + + + SyrineKrichene + + + ThomasMueller + + + + Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings + the 2020 Conference on Empirical Methods in Natural Language Processing: Findings + + 2020 + + + + + + + + INFOTABS: Inference on tables as semi-structured data + + VivekGupta + + + MaitreyMehta + + + PegahNokhiz + + + VivekSrikumar + + + + Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics + the 58th Annual Meeting of the Association for Computational Linguistics + + Online. Association for Computational Linguistics + 2020 + + + + + + + + TaPas: Weakly supervised table parsing via pre-training + + JonathanHerzig + + + KrzysztofPawel + + + ThomasNowak + + + FrancescoMüller + + + JulianPiccinno + + + Eisenschlos + + 10.18653/v1/2020.acl-main.398 + + + Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics + the 58th Annual Meeting of the Association for Computational Linguistics + + Online. 
Association for Computational Linguistics + 2020 + + + + + + + + + ArmandJoulin + + + EdouardGrave + + + PiotrBojanowski + + + MatthijsDouze + + + HérveJégou + + + TomasMikolov + + arXiv:1612.03651 + Fasttext.zip: Compressing text classification models + + 2016 + + + arXiv preprint + + + + + Neural semantic parsing with type constraints for semi-structured tables + + JayantKrishnamurthy + + + PradeepDasigi + + + MattGardner + + + + Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing + the 2017 Conference on Empirical Methods in Natural Language Processing + + 2017 + + + + + + + + Bridging textual and tabular data for crossdomain text-to-sql semantic parsing + + VictoriaXi + + + RichardLin + + + CaimingSocher + + + Xiong + + + + Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings + the 2020 Conference on Empirical Methods in Natural Language Processing: Findings + + 2020 + + + + + + + + + YinhanLiu + + + MyleOtt + + + NamanGoyal + + + JingfeiDu + + + MandarJoshi + + + DanqiChen + + + OmerLevy + + + MikeLewis + + + LukeZettlemoyer + + + VeselinStoyanov + + arXiv:1907.11692 + Roberta: A Robustly Optimized BERT Pretraining Approach + + 2019 + + + arXiv preprint + + + + + Advances in pre-training distributed word representations + + TomasMikolov + + + EdouardGrave + + + PiotrBojanowski + + + ChristianPuhrsch + + + ArmandJoulin + + + + Proceedings of the International Conference on Language Resources and Evaluation + the International Conference on Language Resources and Evaluation + + LREC + 2018. 
2018 + + + + + + + ToTTo: A controlled table-totext generation dataset + + XuezhiAnkur P Parikh + + + SebastianWang + + + ManaalGehrmann + + + BhuwanFaruqui + + + DiyiDhingra + + + DipanjanYang + + + Das + + + + Proceedings of EMNLP + EMNLP + + 2020 + + + + + + + Compositional semantic parsing on semi-structured tables + + PanupongPasupat + + + PercyLiang + + + + Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing + the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing + + Long Papers + 2015 + 1 + + + + + + + + Intermediate-task transfer learning with pretrained language models: When and why does it work? + + YadaPruksachatkun + + + JasonPhang + + + HaokunLiu + + + MonPhu + + + XiaoyiHtut + + + RichardYuanzheZhang + + + ClaraPang + + + KatharinaVania + + + SamuelKann + + + Bowman + + + + Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics + the 58th Annual Meeting of the Association for Computational Linguistics + + 2020 + + + + + + + + Dart: Open-domain structured data record to text generation + + DragomirRadev + + + RuiZhang + + + AmritRau + + + AbhinandSivaprasad + + + ChiachunHsieh + + + NazneenFatema Rajani + + + XiangruTang + + + AaditVyas + + + NehaVerma + + + PranavKrishna + + arXiv:2007.02871 + + 2020 + + + arXiv preprint + + + + + Table cell search for question answering + + HuanSun + + + HaoMa + + + XiaodongHe + + + Wen-TauYih + + + YuSu + + + XifengYan + + + + Proceedings of the 25th International Conference on World Wide Web + the 25th International Conference on World Wide Web + + 2016 + + + + + + + + A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference + + AdinaWilliams + + + NikitaNangia + + + SamuelBowman + + 10.18653/v1/N18-1101 + + + Proceedings of the 2018 Conference of the North 
American Chapter of the Association for Computational Linguistics: Human Language Technologies + the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies + + 2018 + + + + + + + Alignment over heterogeneous embeddings for question answering + + VikasYadav + + + StevenBethard + + + MihaiSurdeanu + + + + Proceedings of the 2019 Conference of the North American Chapter + Long and Short Papers + the 2019 Conference of the North American Chapter + + the Association for Computational Linguistics + 2019 + 1 + + + + + + + + Unsupervised alignment-based iterative evidence retrieval for multi-hop question answering + + VikasYadav + + + StevenBethard + + + MihaiSurdeanu + + 10.18653/v1/2020.acl-main.414 + + + Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics + the 58th Annual Meeting of the Association for Computational Linguistics + + Online. Association for Computational Linguistics + 2020 + + + + + + + + TaBERT: Pretraining for joint understanding of textual and tabular data + + PengchengYin + + + GrahamNeubig + + + Wen-TauYih + + + SebastianRiedel + + 10.18653/v1/2020.acl-main.745 + + + Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics + the 58th Annual Meeting of the Association for Computational Linguistics + + Online. 
Association for Computational Linguistics + 2020 + + + + + + + + Grappa: Grammar-augmented pre-training for table semantic parsing + + TaoYu + + + Chien-ShengWu + + + XiVictoria Lin + + + BailinWang + + + YiChern Tan + + + XinyiYang + + + DragomirRadev + + + RichardSocher + + + CaimingXiong + + + + International Conference of Learning Representation + + 2021 + + + + + + + Spider: A large-scale human-labeled dataset for complex and cross-domain semantic parsing and text-to-sql task + + TaoYu + + + RuiZhang + + + KaiYang + + + MichihiroYasunaga + + + DongxuWang + + + ZifanLi + + + JamesMa + + + IreneLi + + + QingningYao + + + ShanelleRoman + + + + Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing + the 2018 Conference on Empirical Methods in Natural Language Processing + + 2018 + + + + + + + + Table fact verification with structure-aware transformer + + HongzhiZhang + + + YingyaoWang + + + SiruiWang + + + XuezhiCao + + + FuzhengZhang + + + ZhongyuanWang + + + + Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) + the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) + + Online. Association for Computational Linguistics + 2020 + + + + + + + + KG implicit Original Premise Janet Leigh is a person + + F + + + + Janet Leigh was born as Jeanette Helen Morrison (1927-07-06 + + + + + The resting place of Janet Leigh is Westwood Village Memorial Park Cemetery. The alma mater of Janet Leigh is University of the Pacific. The occupation of Janet Leigh are Actress, singer, dancer, author. The years active of Janet Leigh was on 1947-2004. The political party of Janet Leigh is Democratic. The spouse(s) of + + CaliforniaMerced + + + US + + + JanetLeigh + + + + Stanley Reames (m. 1945; div. 1949) +
Los Angeles, California, U.S.; Tony Curtis
+ + 2004. 2004-10-03 + 3 + +
+ aged 77 Janet Leigh are John Carlisle (m. 1942; annulled 1942 m. 1951; div. 1962 Robert Brandt (m. 1962). The children of Janet Leigh are Kelly Curtis, Jamie Lee Curtis +
+ +
+
+
+
+
diff --git a/tests/resources/refs_offsets/bao.json b/tests/resources/refs_offsets/bao.json deleted file mode 100644 index dafecf2..0000000 --- a/tests/resources/refs_offsets/bao.json +++ /dev/null @@ -1,2364 +0,0 @@ -{ - "level": "paragraph", - "biblio": { - "title": "Increased mutation and gene conversion within human segmental duplications", - "authors": [ - "Mitchell Vollger", - "Philip Dishuck", - "William Harvey", - "William Dewitt", - "Xavi Guitart", - "Michael Goldberg", - "Allison Rozanski", - "Julian Lucas", - "Mobin Asri", - "Human Pangenome", - "Reference Consortium", - "Katherine Munson", - "Alexandra Lewis", - "Kendra Hoekzema", - "Glennis Logsdon", - "David Porubsky", - "Benedict Paten", - "Kelley Harris", - "Pinghsun Hsieh", - "Evan Eichler" - ], - "doi": "10.1038/s41586-023-05895-y", - "hash": "594D0C4697A7042FA377CE4EA49AF1B5", - "publication_date": "2023-05-10", - "publication_year": 2023, - "publisher": "", - "abstract": [ - { - "id": 0, - "text": "Single-nucleotide variants (SNVs) in segmental duplications (SDs) have not been systematically assessed because of the limitations of mapping short-read sequencing data 1,2 . Here we constructed 1:1 unambiguous alignments spanning high-identity SDs across 102 human haplotypes and compared the pattern of SNVs between unique and duplicated regions 3,4 . We find that human SNVs are elevated 60% in SDs compared to unique regions and estimate that at least 23% of this increase is due to interlocus gene conversion (IGC) with up to 4.3 megabase pairs of SD sequence converted on average per human haplotype. We develop a genome-wide map of IGC donors and acceptors, including 498 acceptor and 454 donor hotspots affecting the exons of about 800 protein-coding genes. These include 171 genes that have 'relocated' on average 1.61 megabase pairs in a subset of human haplotypes. 
Using a coalescent framework, we show that SD regions are slightly evolutionarily older when compared to unique sequences, probably owing to IGC. SNVs in SDs, however, show a distinct mutational spectrum: a 27.1% increase in transversions that convert cytosine to guanine or the reverse across all triplet contexts and a 7.6% reduction in the frequency of CpGassociated mutations when compared to unique DNA. We reason that these distinct mutational properties help to maintain an overall higher GC content of SD DNA compared to that of unique DNA, probably driven by GC-biased conversion between paralogous sequences 5,6 .", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b2", - "text": "3,", - "offset_start": 348, - "offset_end": 350 - }, - { - "type": "bibr", - "target": "#b3", - "text": "4", - "offset_start": 350, - "offset_end": 351 - }, - { - "type": "bibr", - "target": "#b4", - "text": "5,", - "offset_start": 1494, - "offset_end": 1496 - }, - { - "type": "bibr", - "target": "#b5", - "text": "6", - "offset_start": 1496, - "offset_end": 1497 - } - ] - }, - { - "id": 1, - "text": "The landscape of human SNVs has been well characterized for more than a decade in large part owing to wide-reaching efforts such as the International HapMap Project and the 1000 Genomes Project 7,8 . Although these consortia helped to establish the genome-wide pattern of SNVs (as low as 0.1% allele frequency) and linkage disequilibrium on the basis of sequencing and genotyping thousands of human genomes, not all parts of the human genome could be equally ascertained. Approximately 10-15% of the human genome 8 has remained inaccessible to these types of analysis either because of gaps in the human genome sequence or, more frequently, the low mapping quality associated with aligning short-read whole-genome sequencing data. 
This is because short-read sequence data are of insufficient length (<200 base pairs (bp)) to unambiguously assign reads and, therefore, variants to specific loci 9 . Although certain classes of large, highly identical repeats (for example, α-satellites in centromeres) were readily recognized, others, especially SDs 1 and their 859 associated genes 10 , in euchromatin were much more problematic to recognize.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b6", - "text": "7,", - "offset_start": 194, - "offset_end": 196 - }, - { - "type": "bibr", - "target": "#b7", - "text": "8", - "offset_start": 196, - "offset_end": 197 - } - ] - }, - { - "id": 2, - "text": "Operationally, SDs are defined as interchromosomal or intrachromosomal homologous regions in any genome that are >1 kbp in length and >90% identical in sequence 1,11 . As such regions arise by duplication as opposed to retrotransposition, they were initially difficult to identify and early versions of the human genome sequence had either missed or misassembled these regions owing to their high sequence identity 12,13 . Large-insert BAC clones ultimately led to many of these regions being resolved. Subsequent analyses showed that SDs contribute disproportionately to copy number polymorphisms and disease structural variation 9,14 , are hotspots for gene conversion 15 , are substantially enriched in GC-rich DNA and Alu repeats 16,17 , and are transcriptionally diverse leading to the emergence, in some cases, of human-specific genes thought to be important for human adaptation [18][19][20][21] . Despite their importance, the pattern of SNVs among humans has remained poorly characterized. Early on, paralogous sequence variants were misclassified as SNVs 2 and, as a result, later high-identity SDs became blacklisted from SNV analyses because short-read sequence data could not be uniquely placed 22,23 . 
This exclusion has translated into a fundamental lack of understanding in mutational processes precisely in regions predicted to be more mutable owing to the action of IGC [24][25][26][27][28] . Previously, we noted an increase in SNV density in duplicated regions when compared to unique regions of the genome on the basis of our comparison of GRCh38 and the complete telomere-to-telomere", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b0", - "text": "1,", - "offset_start": 161, - "offset_end": 163 - }, - { - "type": "bibr", - "target": "#b10", - "text": "11", - "offset_start": 163, - "offset_end": 165 - }, - { - "type": "bibr", - "target": "#b11", - "text": "12,", - "offset_start": 415, - "offset_end": 418 - }, - { - "type": "bibr", - "target": "#b12", - "text": "13", - "offset_start": 418, - "offset_end": 420 - }, - { - "type": "bibr", - "target": "#b15", - "text": "16,", - "offset_start": 734, - "offset_end": 737 - }, - { - "type": "bibr", - "target": "#b16", - "text": "17", - "offset_start": 737, - "offset_end": 739 - }, - { - "type": "bibr", - "target": "#b17", - "text": "[18]", - "offset_start": 886, - "offset_end": 890 - }, - { - "type": "bibr", - "target": "#b18", - "text": "[19]", - "offset_start": 890, - "offset_end": 894 - }, - { - "type": "bibr", - "target": "#b19", - "text": "[20]", - "offset_start": 894, - "offset_end": 898 - }, - { - "type": "bibr", - "target": "#b20", - "text": "[21]", - "offset_start": 898, - "offset_end": 902 - }, - { - "type": "bibr", - "target": "#b21", - "text": "22,", - "offset_start": 1208, - "offset_end": 1211 - }, - { - "type": "bibr", - "target": "#b23", - "text": "[24]", - "offset_start": 1388, - "offset_end": 1392 - }, - { - "type": "bibr", - "target": "#b24", - "text": "[25]", - "offset_start": 1392, - "offset_end": 1396 - }, - { - "type": "bibr", - "target": "#b25", - "text": "[26]", - "offset_start": 1396, - "offset_end": 1400 - }, - { - "type": "bibr", - "target": "#b26", - "text": "[27]", - 
"offset_start": 1400, - "offset_end": 1404 - }, - { - "type": "bibr", - "target": "#b27", - "text": "[28]", - "offset_start": 1404, - "offset_end": 1408 - } - ] - } - ] - }, - "body_text": [ - { - "id": "p_604cd3d1", - "text": "(T2T) human reference genome 10 . Leveraging high-quality phased genome assemblies from 47 humans generated as part of the Human Pangenome Reference Consortium (HPRC) 3 , we sought to investigate this difference more systematically and compare the SNV landscape of duplicated and unique DNA in the human genome revealing distinct mutational properties.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b9", - "text": "10", - "offset_start": 29, - "offset_end": 31 - }, - { - "type": "bibr", - "target": "#b2", - "text": "3", - "offset_start": 167, - "offset_end": 168 - } - ], - "head_section": "Article" - }, - { - "id": "p_1ea3ff46", - "text": "Unlike previous SNV discovery efforts, which catalogued SNVs on the basis of the alignment of sequence reads, our strategy was assembly driven (Extended Data Fig. 1). We focused on the comparison of 102 haplotype-resolved genomes (Supplementary Table 1) generated as part of the HPRC (n = 94) or other efforts (n = 8) 3,4,12,29 in which phased genome assemblies had been assembled using high-fidelity (HiFi) long-read sequencing 30 . 
The extraordinary assembly contiguity of these haplotypes (contig N50, defined as the sequence length of the shortest contig at 50% of the total assembly length, > 40 Mbp) provided an unprecedented opportunity to align large swathes (>1 Mbp) of the genome, including high-identity SD repeats anchored by megabases of synteny.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b2", - "text": "3,", - "offset_start": 318, - "offset_end": 320 - }, - { - "type": "bibr", - "target": "#b3", - "text": "4,", - "offset_start": 320, - "offset_end": 322 - }, - { - "type": "bibr", - "target": "#b11", - "text": "12,", - "offset_start": 322, - "offset_end": 325 - }, - { - "type": "bibr", - "target": "#b28", - "text": "29", - "offset_start": 325, - "offset_end": 327 - }, - { - "type": "bibr", - "target": "#b29", - "text": "30", - "offset_start": 429, - "offset_end": 431 - } - ], - "head_section": "Strategy and quality control" - }, - { - "id": "p_9f47bae9", - "text": "As SD regions are often enriched in assembly errors even among long-read assemblies 3,4,31 , we carried out a series of analyses to assess the integrity and quality of these regions in each assembled haplotype. First, we searched for regions of collapse 11 by identifying unusual increases or decreases in sequence read depth 3 . We determine that, on average, only 1.64 Mbp (1.37%) of the analysed SD sequence was suspect owing to unusually high or low sequence read depth on the basis of mapping of underlying read data-as such patterns are often indicative of a misassembly 3 (Methods). Next, for all SD regions used in our analysis we compared the predicted copy number by Illumina sequence read depth with the sum based on the total copy number from the two assembled haplotypes. These orthogonal copy number estimates were highly correlated (Pearson's R = 0.99, P < 2.2 × 10 -16 ; Supplementary Fig. 1) implying that most SD sequences in the assemblies have the correct copy number. 
To confirm these results in even the most difficult to assemble duplications, we selected 19 of the largest and most identical SDs across 47 haplotypes for a total of 893 tests. These estimates were also highly correlated (Pearson's R = 0.99, P < 2.2 × 10 -16 ; Supplementary Figs. 2 and 3), and of the 893 tests conducted, 756 were identical. For the 137 tests for which estimates differed, most (n = 125) differed by only one copy. Finally, most of these discrepancies came from just three large (>140 kbp) and highly identical (>99.3%) SDs (Supplementary Fig. 3).", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b2", - "text": "3,", - "offset_start": 84, - "offset_end": 86 - }, - { - "type": "bibr", - "target": "#b3", - "text": "4,", - "offset_start": 86, - "offset_end": 88 - }, - { - "type": "bibr", - "target": "#b30", - "text": "31", - "offset_start": 88, - "offset_end": 90 - }, - { - "type": "bibr", - "target": "#b10", - "text": "11", - "offset_start": 254, - "offset_end": 256 - }, - { - "type": "bibr", - "target": "#b2", - "text": "3", - "offset_start": 326, - "offset_end": 327 - } - ], - "head_section": "Strategy and quality control" - }, - { - "id": "p_e45a52f4", - "text": "To validate the base-level accuracy, we next compared the quality value for both SD and unique sequences using Illumina sequencing data for 45 of the HPRC samples (Methods). Both unique (average quality value = 59 s.d. 1.9) and SD (average quality value = 53 s.d. 1.9) regions are remarkably high quality, which in the case of SDs translates into less than 1 SNV error every 200 kbp (Supplementary Fig. 4). We further show that these high-quality assembles result in accurate variant calls (Supplementary Notes and Supplementary Figs. 56789). 
We also assessed the contiguity of the underlying assemblies using a recently developed tool, GAVISUNK, which compares unique k-mer distributions between HiFi-based assemblies and orthogonal Oxford Nanopore Technologies sequencing data from the same samples. We found that, on average, only 0.11% of assayable SD sequence was in error compared to 0.14% of unique regions assayed (Supplementary Table 2), implying high and comparable assembly contiguity. As a final control for potential haplotype-phasing errors introduced by trio HiFi assembly of diploid samples, we generated deep Oxford Nanopore Technologies and HiFi data from a second complete hydatidiform mole (CHM1) for which a single paternal haplotype was present and applied a different assembly algorithm 32 (Verkko 1.0; Extended Data Fig. 2). We show across our many analyses that the results from the CHM1 Verkko assembly are consistent with individual haplotypes obtained from diploid HPRC samples produced by trio hifiasm 3,32 (Supplementary Fig. 10). We therefore conclude that phasing errors have, at most, a negligible effect on our results and that most (>98%) SDs analysed were accurately assembled from multiple human genomes allowing the pattern of SNV diversity in SDs to be systematically interrogated.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b31", - "text": "32", - "offset_start": 1310, - "offset_end": 1312 - }, - { - "type": "bibr", - "target": "#b2", - "text": "3,", - "offset_start": 1531, - "offset_end": 1533 - }, - { - "type": "bibr", - "target": "#b31", - "text": "32", - "offset_start": 1533, - "offset_end": 1535 - } - ], - "head_section": "Strategy and quality control" - }, - { - "id": "p_eb12e3ae", - "text": "To assess SNVs, we limited our analysis to portions of the genome where a 1:1 orthologous relationship could be unambiguously assigned (as opposed to regions with extensive copy number variation). 
Using the T2T-CHM13 reference genome, we aligned the HPRC haplotypes requiring alignments to be a minimum of 1 Mbp in length and carry no structural variation events greater than 10 kbp (Methods and Extended Data Fig. 1). Although the proportion of haplotypes compared for any locus varied (Fig. 1a), the procedure allowed us to establish, on average, 120.2 Mbp 1:1 fully aligned sequence per genome for SD regions out of a total of 217 Mbp from the finished human genome (T2T-CHM13 v1.1). We repeated the analysis for 'unique' (or single-copy) regions of the genome and recovered by comparison 2,508 Mbp as 1:1 alignments (Fig. 1a). All downstream analyses were then carried out using this orthologous alignment set. We first compared the SNV diversity between unique and duplicated regions excluding suboptimal alignments mapping to tandem repeats or homopolymer stretches. Overall, we observe a significant 60% increase in SNVs in SD regions (Methods; Pearson's chi-squared test with Yates's continuity correction P < 2.2 × 10 -16 ; Fig. 1b). Specifically, we observe an average of 15.3 SNVs per 10 kbp versus 9.57 SNVs per 10 kbp for unique sequences (Fig. 1d). An empirical cumulative distribution comparing the number of SNVs in 10-kbp windows between SD and unique sequence confirms that this is a general property and not driven simply by outliers. The empirical cumulative distribution shows that more than half of the SD sequences have more SNVs than their unique counterparts (Fig. 1b). Moreover, for all haplotypes we divided the unique portions of the genome into 125-Mbp bins and found that all SD bins of equivalent size have more SNVs than any of the bins of unique sequence (empirical P value < 0.0005; Extended Data Fig. 3). This elevation in SNVs is only modestly affected by the sequence identity of the underlying SDs (Pearson's correlation of only 0.008; Supplementary Fig. 11). 
The increase in SNVs (60%) in SDs is greater than that in all other assayable classes of repeats: Alu (23%), L1 (-9.4%), human endogenous retroviruses (-9.4%) and ancient SDs for which the divergence is greater than 10% (12%) (Extended Data Fig. 4 and Supplementary Table 3). We find, however, that SNV density correlates with increasing GC content (Supplementary Fig. 12) consistent with Alu repeats representing the only other class of common repeat to show an elevation.", - "coords": [], - "refs": [], - "head_section": "Increased SNV density in SD regions" - }, - { - "id": "p_0cc621a7", - "text": "Previous publications have shown that African haplotypes are genetically more diverse, having on average about 20% more variant sites compared to non-African haplotypes 8 . To confirm this observation in our data, we examined the number of SNVs per 10 kbp of unique sequence in African versus non-African haplotypes (Fig. 1c,d) and observed a 27% (10.8 versus 8.5) excess in African haplotypes. As a result, among African haplotypes, we see that the average distance between SNVs (979 bp) is 19.4% closer than in non-African haplotypes (1,215 bp), as expected 8,12 . African genomes also show increased variation in SDs, but it is less pronounced with an average distance of 784 bases between consecutive SNVs as compared to 909 bases in non-African haplotypes (13.8%). Although elevated in African haplotypes, SNV density is higher in SD sequence across populations and these properties are not driven by a few sites but, once again, are a genome-wide feature. 
We put forward three possible hypotheses to account for this increase although note these are not mutually exclusive: SDs have unique mutational mechanisms that increase SNVs; SDs have a deeper average coalescence than unique parts of the genome; and differences in sequence composition (for example, GC richness) make SDs more prone to particular classes of mutation.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b7", - "text": "8", - "offset_start": 169, - "offset_end": 170 - }, - { - "type": "bibr", - "target": "#b7", - "text": "8,", - "offset_start": 560, - "offset_end": 562 - }, - { - "type": "bibr", - "target": "#b11", - "text": "12", - "offset_start": 562, - "offset_end": 564 - } - ], - "head_section": "Increased SNV density in SD regions" - }, - { - "id": "p_6b43a0ba", - "text": "One possible explanation for increased diversity in SDs is IGC in which sequence that is orthologous by position no longer shares an evolutionary history because a paralogue from a different location has 'donated' its sequence through ectopic template-driven conversion 33 , also known as nonallelic gene conversion 27 . To identify regions of IGC, we developed a method that compares two independent alignment strategies to pinpoint regions where the orthologous alignment of an SD sequence is inferior to an independent alignment of the sequence without flanking information (Fig. 2a and Methods). We note several limitations of our approach (Supplementary Notes); however, we show that our high-confidence IGC calls (20+ supporting SNVs) have strong overlap with other methods for identifying IGC (Supplementary Notes and Supplementary Fig. 13). Using this approach, we created a genome-wide map of putative large IGC events for all of the HPRC haplotypes for which 1:1 orthologous relationships could be established (Fig. 
2).", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b32", - "text": "33", - "offset_start": 270, - "offset_end": 272 - }, - { - "type": "bibr", - "target": "#b26", - "text": "27", - "offset_start": 316, - "offset_end": 318 - } - ], - "head_section": "Putative IGC" - }, - { - "id": "p_267fbe43", - "text": "Across all 102 haplotypes, we observe 121,631 putative IGC events for an average of 1,193 events per human haplotype (Fig. 2b,c and Supplementary Table 4). Of these events, 17,949 are rare and restricted to a single haplotype (singletons) whereas the remaining events are observed in several human haplotypes grouping into 14,663 distinct events (50% reciprocal overlap at both the donor and acceptor site). In total, we estimate that there is evidence for 32,612 different putative IGC events (Supplementary Table 5) among the SD regions that are assessed at present. Considering the redundant IGC callset (n = 121,631), the average IGC length observed in our data is 6.26 kbp with the largest event observed being 504 kbp (Extended Data Fig. 5). 
On average, each IGC event has 13.3 SNVs that support the conversion event and 2.03 supporting SNVs per kilobase pair, and as expected, there is strong", - "coords": [], - "refs": [], - "head_section": "Putative IGC" - }, - { - "id": "p_5f65e8d3", - "text": "Mean = 784 Mean = 979 Non-African African 1.0 10.0 100.0 1,000.0 10,000.0 0 0.25 0.50 0.75 1.00 1.25 0 0.25 0.50 0.75 1.00 1.25 Distance to next closest SNV Density Density chr1 chr6 chr8 chrX a b e d c HLA CHM1 CHM1 African haplotypes Non-African haplotypes 105.0 110.0 115.0 120.0 125.0 130.0 2,400 2,450 2,500 2,550 Amount of sequence within synteny blocks >1 Mbp (Mbp)", - "coords": [], - "refs": [], - "head_section": "Putative IGC" - }, - { - "id": "p_a89b0223", - "text": "17.4 10.8 13.3 8.4 13.7 8.6 13.7 8.1 12.7 8.4 13.4 8.4 African American East Asian European South Asian Non-African SD Unique SD Unique SD Unique SD Unique SD Unique SD Unique 10 15 Genomic region No. SNVs per 10 kbp 0 0.25 0.50 0.75 1.00 0 1 10 100 1,000 Number of SNVs in 10-kbp windows Cumulative fraction of windows SD Unique chrX SD Unique Mean = 909 Mean = 1,215 SD Unique Fig. 1 | Increased single-nucleotide variation in SDs. a, The portion of the human genome analysed for SD (red) and unique (blue) regions among African and non-African genomes. Shown are the number of megabase pairs aligned in 1:1 syntenic blocks to T2T-CHM13 v1.1 for each assembled haplotype. Data are shown as both a single point per haplotype originating from a single individual and a smoothed violin plot to represent the population distribution. b, Empirical cumulative distribution showing the number of SNVs in 10-kbp windows in the syntenic regions stratified by unique (grey), SD (red) and the X chromosome (chrX; green). Dashed lines represent individual haplotypes and thick lines represent the average trend of all the data. 
c, Distribution of the average distance to the next closest SNV in SD (red) and unique (grey) space separating African (top) and non-African (bottom) samples. Dashed vertical lines are drawn at the mean of each distribution. d, Average number of SNVs per 10-kbp window in SD (red) versus unique (grey) space by superpopulation and with mean value shown underneath each violin. The non-African column represents an aggregation of the data from all non-African populations in this study. e, Density of SNVs in 10 bp of each other for SD (top, red) and unique (bottom, grey) regions for chromosomes 1, 6, 8 and X comparing the relative density of known (for example, HLA) and new hotspots of single-nucleotide variation.", - "coords": [], - "refs": [], - "head_section": "Putative IGC" - }, - { - "id": "p_3449bca4", - "text": "correlation (Pearson's R = 0.63, P < 2.2 × 10 -16 ; Fig. 2d) between the length of the events and supporting SNVs. Furthermore, we validated these supporting SNVs against Illumina sequencing data and find that on average only 1% (12/1,192) of IGC events contain even one erroneous SNV (Supplementary Fig. 4). The putative IGC events detected with our method are largely restricted to higher identity duplications with only 325 events detected in 66.1 Mbp of SDs with >10% sequence divergence (Supplementary Figs. 14 and 15). We further stratify these results by callset, minimum number of supporting SNVs and haplotype (Supplementary Table 6). Finally, we use the number of supporting informative SNVs to estimate the statistical confidence of every putative IGC call (Fig. 2c, Supplementary Table 7 and Methods). Using these P values, we identify a subset of the high-confidence (P value < 0.05) IGC calls with 31,910 IGC events and 10,102 nonredundant events. On average, we identify 7.5 Mbp of sequence per haplotype affected by putative IGC and 4.3 Mbp in our high-confidence callset (Fig. 2b). 
Overall, 33.8% (60.77/180.0 Mbp) of the analysed SD sequence is affected by putative IGC in at least one human haplotype. Furthermore, among all SDs covered by at least 20 assembled haplotypes, we identify 498 acceptor and 454 donor IGC hotspots with at least 20 distinct IGC events (Fig. 3 and Supplementary Table 8). IGC hotspots are more likely to associate with higher copy number SDs compared to a random sample of SD windows of equal size (median of 9 overlaps compared to 3, one-sided Wilcoxon rank sum test P < 2.2 × 10 -16 ) and regions with more IGC events are moderately correlated with the copy number of the SD (Pearson's R = 0.23, P < 2.2 × 10 -16 ; Supplementary Fig. 16). IGC hotspots also preferentially overlap higher identity duplications (median 99.4%) compared to randomly sampled windows (median 98.0%, one-sided Wilcoxon rank sum test P < 2.2 × 10 -16 ).", - "coords": [], - "refs": [], - "head_section": "Putative IGC" - }, - { - "id": "p_025a4a1c", - "text": "These events intersect 1,179 protein-coding genes, and of these genes, 799 have at least one coding exon affected by IGC (Supplementary Tables 9 and 10). As a measure of functional constraint, we used the probability of being loss-of-function intolerant (pLI) for each of the 799 genes 34 (Fig. 4a). Among these, 314 (39.3%) have never been assessed Fig. 2 | Candidate IGC events. a, Method to detect IGC. The assembled human haplotype query sequence from 1:1 syntenic alignments was fragmented into 1-kbp windows in 100-bp increments and realigned back to T2T-CHM13 v1.1 independent of the flanking sequence information using minimap2 v2.24 to identify each window's single best alignment position. These alignments were compared to their original syntenic alignment positions, and if they were not overlapping, we considered them to be candidate IGC windows. Candidate IGC windows were then merged into larger intervals and realigned when windows were overlapping in both the donor and the acceptor sequence. 
We then used the CIGAR string to identify the number of matching and mismatching bases at the 'donor' site and compared that to the number of matching and mismatching bases at the acceptor site determined by the syntenic alignment to calculate the number of supporting SNVs. b, The amount of SDs (in megabase pairs) predicted to be affected by IGC per haplotype, as a function of the minimum number of SNVs that support the IGC call. Dashed lines represent individual haplotypes and the solid line represents the average. c, Empirical cumulative distribution of the megabase pairs of candidate IGC observed in HPRC haplotypes, as a function of the minimum underlying P-value threshold used to define the IGC callset (see Methods for IGC P-value calculation). Dashed lines represent individual haplotypes and the solid line represents the average. d, Correlation between IGC length and the number of supporting SNVs. e, Distribution of the distance between predicted IGC acceptor and donor sites for intrachromosomal events by chromosome.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b33", - "text": "34", - "offset_start": 286, - "offset_end": 288 - } - ], - "head_section": "Putative IGC" - }, - { - "id": "p_bc4df1f2", - "text": "for mutation intolerance (that is, no pLI) owing to the limitations of mapping short-read data from population samples 34 . Of the remaining genes, we identify 38 with a pLI greater than 0.5, including genes associated with disease (F8, HBG1 and C4B) and human evolution (NOTCH2 and TCAF). Of the genes with high pLI scores, 12 are the acceptor site for at least 50 IGC events, including CB4, NOTCH2 and OPNL1W-a locus for red-green colour blindness (Fig. 4b-e). We identify a subset of 418 nonredundant IGC events that are predicted to copy the entirety of a gene body to a 'new location' in the genome (Fig. 4f,g). 
As a result, 171 different protein-coding genes with at least 2 exons and 200 coding base pairs are converted in their entirety by putative IGC events in a subset of human haplotypes (Supplementary Table 11), and we refer to this phenomenon as gene repositioning. These gene-repositioning events are large (average 26 kbp; median 16.7 kbp) and supported by a high number of SNVs (average 64.7; median 15.3 SNVs), suggesting that they are unlikely to be mapping artefacts. Markedly, these putative IGC events copy the reference gene model on average a distance of 1.66 Mbp (median 216 kbp) from its original location. These include several disease-associated genes (for example, TAOK2, C4A, C4B, PDPK1 and IL27) as well as genes that have eluded complete characterization owing to their duplicative nature [35][36][37] .", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b33", - "text": "34", - "offset_start": 119, - "offset_end": 121 - }, - { - "type": "bibr", - "target": "#b34", - "text": "[35]", - "offset_start": 1422, - "offset_end": 1426 - }, - { - "type": "bibr", - "target": "#b35", - "text": "[36]", - "offset_start": 1426, - "offset_end": 1430 - }, - { - "type": "bibr", - "target": "#b36", - "text": "[37]", - "offset_start": 1430, - "offset_end": 1434 - } - ], - "head_section": "Putative IGC" - }, - { - "id": "p_60ea8d58", - "text": "Our analysis suggests that putative IGC contributes modestly to the significant increase of human SNV diversity in SDs. For example, if we apply the least conservative definition of IGC (1 supporting SNV) and exclude all putative IGC events from the human haplotypes, we estimate that it accounts for only 23% of the increase (Extended Data Fig. 6). If we restrict to higher confidence IGC events (P < 0.05), only 19.6% of the increase could be accounted for. An alternative explanation may be that the SDs are evolutionarily older, perhaps owing to reduced selective constraint on duplicated copies 38,39 . 
To test whether SD sequences seem to have a deeper average coalescence than unique regions, we constructed a high-quality, locally phased assembly (hifiasm v0.15.2) of a chimpanzee (Pan troglodytes) genome to calibrate age since the time of divergence and to distinguish ancestral versus derived alleles in human SD regions (Methods). Constraining our analysis to syntenic regions between human and chimpanzee genomes (Methods), we characterized 4,316 SD regions (10 kbp in size) where we had variant calls from at least 50 human and one chimpanzee haplotype. We selected at random 9,247 analogous windows from unique regions for comparison. We constructed a multiple sequence alignment", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b37", - "text": "38,", - "offset_start": 600, - "offset_end": 603 - }, - { - "type": "bibr", - "target": "#b38", - "text": "39", - "offset_start": 603, - "offset_end": 605 - } - ], - "head_section": "Evolutionary age of SDs" - }, - { - "id": "p_0669b35b", - "text": "Acceptor site density Donor site density Chromosome: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X a b c HG03492 0-0.6 0.7-5.8 13.9-15.3 15.6-16 17.6-21.5 21.8-22.4 22.9-23.1 26-26.7 41.3-41.6 42.1-42.4 42.5-42.9 27.4-28.7 29.4-30.5 32.1-32.5 76.4-76.7 77.5-77.9 80-80.5 80.9-81.3 81.9-82.4 83.2-83.6 97.4-97.7 43-43.4 71.7-72.1 73.6-73.9 NA12878 HG002 GRCh38 CHM1 HG02080 HG00673 HG00621 HG00514 HG005 HG00438 HG02148 HG01978 HG01952 HG01358 HG01258 HG01175 HG01123 HG01109 HG01106 HG02572 HG02559 HG02055 HG01891 Prader-Willi syndrome 15q11-q13 Centromere 34 1.9 14.4 13.6 10.9 35.7 7.1 3.1 1.5 4.3 7.8 5.2 19.4 52.7 36.5 37.6 6.6 4.1 4.2 4.9 1.2 5.9 20.9 6.9 70.3 30.9 27 64 2.9 5 33.8 13.0 13.0 22.0 17.0 27.6 15.0 9.0 8.8 14.0 7.7 13.7 23.1 33.0 35.0 37.5 12.0 8.5 7.4 7.0 6.0 12.7 14.0 13.7 46.8 18.7 30.7 30.9 8.0 10.0 20,000,000 24,000,000 28,000,000 Genomic position Acceptor Donor 0 10 20 30 No. 
of haplotypes with IGC event ABCB10P1 for each window and estimated the time to the most recent common ancestor (TMRCA) for each 10-kbp window independently. We infer that SDs are significantly older than the corresponding unique regions of similar size (Supplementary Figs. 17 and 18; one-sided Wilcoxon rank sum test P value = 4.3 × 10 -14 ), assuming that mutation rates have remained constant over time within these regions since the humanchimpanzee divergence. The TMRCAs inferred from SD regions are, on average, 22% more ancient when compared to unique regions (650 versus 530 thousand years ago (ka)), but only a 5% difference is noted when comparing the median (520 versus 490 ka). However, this effect all but disappears (only a 0.2% increase) after excluding windows classified as IGC (Supplementary Fig. 19; one-sided Wilcoxon rank sum test P = 0.05; mean TMRCA unique = 528 ka, mean TMRCA SD = 581 ka, median TMRCA unique = 495 ka, median TMRCA SD = 496 ka).", - "coords": [], - "refs": [], - "head_section": "Evolutionary age of SDs" - }, - { - "id": "p_d185289a", - "text": "As a third possibility, we considered potential differences in the sequence context of unique and duplicated DNA. It has been recognized for almost two decades that human SDs are particularly biased towards Alu repeats and GC-rich DNA of the human genome 16,40 . Notably, among the SNVs in SDs, we observed a significant excess of transversions (transition/transversion ratio (Ti/Tv) = 1.78) when compared to unique sequence (Ti/Tv = 2.06; P < 2.2 × 10 -16 , Pearson's chi-squared test with Yates's continuity correction). Increased mutability of GC-rich DNA is expected and may explain, in part, the increased variation in SDs and transversion bias 6,27,41 . Using a more complete genome, we compared the GC composition of unique and duplicated DNA specifically for the regions considered in this analysis. 
We find that, on average, 42.4% of the analysed SD regions are guanine or cytosine (43.0% across all SDs) when compared to 40.8% of the unique DNA (P value < 2.2 × 10 -16 , one-sided t-test). Notably, this enrichment drops slightly (41.8%) if we exclude IGC regions. Consequently, we observe an increase of all GC-containing triplets in SD sequences compared to unique regions of the genome (Fig. 5a). Furthermore, the enrichment levels of particular triplet contexts in SD sequence correlate with the mutability of the same triplet sequence in unique regions of the genome (Pearson's R = 0.77, P = 2.4 × 10 -7 ; Fig. 5b). This effect is primarily driven by CpG-containing triplets, which are enriched between 14 and 30% in SD sequences. Note, we observe a weaker and insignificant correlation for the non-CpG-containing triplets (Pearson's R = 0.22, P = 0.27). Extrapolating from the mutational frequencies seen in unique sequences, we estimate that there is 3.21% more variation with SDs due to their sequence composition alone.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b15", - "text": "16,", - "offset_start": 255, - "offset_end": 258 - }, - { - "type": "bibr", - "target": "#b39", - "text": "40", - "offset_start": 258, - "offset_end": 260 - }, - { - "type": "bibr", - "target": "#b5", - "text": "6,", - "offset_start": 650, - "offset_end": 652 - }, - { - "type": "bibr", - "target": "#b26", - "text": "27,", - "offset_start": 652, - "offset_end": 655 - }, - { - "type": "bibr", - "target": "#b40", - "text": "41", - "offset_start": 655, - "offset_end": 657 - } - ], - "head_section": "SNV mutational spectra in SDs" - }, - { - "id": "p_fae2f7d3", - "text": "To further investigate the changes in GC content and their effect on variation in SDs, we compared the triplet mutational spectra of SNVs from unique and duplicated regions of the genome to determine whether the predominant modes of SNV mutation differed (Methods). 
We considered all possible triplet changes, first quantifying the number of ancestral GC bases and triplets in SDs (Fig. 5a). A principal component analysis (PCA) of these normalized mutational spectra shows clear discrimination (Fig. 5c) between unique and SD regions (PC1) beyond that of African and non-African diversity, with the first principal component capturing 80.2% of the variation separating the mutational spectrum of SDs and unique DNA. We observe several differences when comparing the triplet-normalized mutation frequency AC244197.3 ACTR3B", - "coords": [], - "refs": [], - "head_section": "SNV mutational spectra in SDs" - }, - { - "id": "p_68e7b291", - "text": "TCAF1 0 100 200 300 No pLI data available 0 0.25 0.50 0.75 1.00 pLI pLI pLI Count of genes with IGC over exons C4B 0 0.25 0.50 0.75 1.00 Number of IGC donor events ANAPC1 C4B HERC2 HIC2 PDPK1 NOTCH2 PPIE T0126759 T0126762 T0126763 T0126764 T0126765 T0204050 T0204051 TCAF1 0 0.25 0.50 0.75 1.00 2.8 11 8.3 1.1 1.4 3 1.4 4.8 23.6 2.3 12.2 3.6 7.2 3.5 3.8 2.3 3.4 2.0 1.0 69.0 3.0 1.0 2.0 1.0 3.0 3.7 10.7 1.7 8.5 1.0 1.0 2.6 1.0 1.8 2.2 31.82 31.84 31.86 31.88 31.90 Genomic position (Mbp) Genomic position (Mbp) 0 5 10 15 20 C4A C4B CYP21A2 STK19 T NXB 82 88 1:1 alignment coverage FCGR2B FCGR3B FCGR3B FCGR3A 48.4 39.1 64.6 64 38.3 32.9 15.8 225.0 201.0 637.0 265.5 120.0 115.5 48.8 160.80 160.85 160.90 160.95 161.00 chr1 position (Mbp) 0 1 2 3 4 5 TRIM49 TRIM64B TRIM49C 15.6 57.3 23.9 45.4 15.5 66.5 11.0 1.5 85.0 23.0 35.6 221.7 89.7 89.8 89.9 90.0 chr11 position (Mbp) 0 2.5 5.0 7.5 1.4 7.7 1 1.7 14.4 10.3 1.3 1.5 7.5 1.7 1.9 3.8 1.4 11.8 21.1 1.2 1.6 7.9 20.7 1 7.3 1.6 2.0 7.0 1.0 1.0 3.0 5.0 1.0 1.0 1.0 1.5 1.0 2.0 2.0 12.7 9.3 1.0 1.0 21.5 3.8 1.2 2.5 1.0 152.40 152.45 152.50 0 2 4 6 Number of haplotypes with IGC event OPN1LW OPN1MW OPN1MW2 TEX28 35 45 55 0 500 1,000 1,500 2,000 Number of IGC acceptor events 0 500 1,000 1,500 2,000 e d b c g f a 1:1 alignment coverage OPN1LW CORO1A NOTCH2 ISY1-RAB43 
PDPK1 DHX40 T0218473 Number of haplotypes with IGC event Acceptor Donor Number of haplotypes with IGC event Acceptor Donor Number of haplotypes with IGC event of particular mutational events in SD and unique sequences (Fig. 5d). Most notable is a 7.6% reduction in CpG transition mutations-the most predominant mode of mutation in unique regions of the genome due to spontaneous deamination of methylated CpGs 6 (Supplementary Tables 12 and 13).", - "coords": [], - "refs": [], - "head_section": "SNV mutational spectra in SDs" - }, - { - "id": "p_44058dbf", - "text": "The most notable changes in mutational spectra in SD sequences are a 27.1% increase in C>G mutations, a 15.3% increase in C>A mutations and a 10.5% increase in A>C mutations. C>G mutations are associated with double-strand breaks in humans and some other apes 42,43 . This effect becomes more pronounced (+40.4%) in our candidate IGC regions consistent with previous observations showing increases in C>G mutations in regions of non-crossover gene conversion and double-strand breaks [43][44][45] . 
However, the increase remains in SD regions without IGC (+20.0%) perhaps owing to extensive nonallelic homologous recombination associated with SDs or undetected IGC events 4,9 .", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b41", - "text": "42,", - "offset_start": 260, - "offset_end": 263 - }, - { - "type": "bibr", - "target": "#b42", - "text": "43", - "offset_start": 263, - "offset_end": 265 - }, - { - "type": "bibr", - "target": "#b42", - "text": "[43]", - "offset_start": 484, - "offset_end": 488 - }, - { - "type": "bibr", - "target": "#b43", - "text": "[44]", - "offset_start": 488, - "offset_end": 492 - }, - { - "type": "bibr", - "target": "#b44", - "text": "[45]", - "offset_start": 492, - "offset_end": 496 - }, - { - "type": "bibr", - "target": "#b3", - "text": "4,", - "offset_start": 672, - "offset_end": 674 - }, - { - "type": "bibr", - "target": "#b8", - "text": "9", - "offset_start": 674, - "offset_end": 675 - } - ], - "head_section": "SNV mutational spectra in SDs" - }, - { - "id": "p_9579b828", - "text": "To further investigate the potential effect of GC-biased gene conversion (gBGC) on the mutational spectra in SDs, we measured the frequency of (A,T)>(G,C) mutations in SD regions with evidence of IGC to determine whether cytosine and guanine bases are being preferentially maintained as might be expected in regions undergoing gBGC. If we measure the frequency of (A,T)>(C,G) in windows with at least one haplotype showing evidence of IGC, then we observe that the frequency is 4.7% higher than in unique regions of the genome; notably, in SDs", - "coords": [], - "refs": [], - "head_section": "SNV mutational spectra in SDs" - }, - { - "id": "p_52b9fb72", - "text": "0.9 1.0 1.1 1.2 1.3 TAA AAA AAG ACT ACA TCA GAC CAG TCC TCG GCG TAT TAG AAC TCT GAT GCT CCT GAG CAC GCC CCG AAT TAC CAA CAT GAA GCA ACC CCA ACG CCC SD composition Unique composition No. 
of GC bases 0 1 2 3 a ACG 1.14 GCG 1.27 CCG 1.3 TCG 1.22 CAT 0.99 CAC 1.08 ACC 1.04 CCC 1.11 ACA 0.99 GCC 1.1 TAT 0.91 CAG 1.05 ACT 0.97 GCA 1.02 CCT 1.04 TCC 1.07 GCT 1.02 TCT 0.98 CCA 1.07 CAA 0.97 GAT 1 AAT 0.94 TAC 0.95 GAC 1.04 AAC 0.97 TCA 1 TAA 0.9 TAG 0.93 GAG 1.05 AAG 0.95 AAA 0.95 GAA 1 R = 0.77, P = 2.4 × 10 -7 0.9 1.0 1.1 1.2 1.3 0.1 0.3 1.0 Frequency of mutation in unique sequence SD composition Unique composition b -0.4 -0.2 0 0.2 -0.10 -0.05 0 0.05 0.10 PC1 (80.19%) PC2 (2.14%) AFR AMR EAS EUR SAS SD Unique c A>C A >G A>T C >A C>G C >T A C G T A C G T A C G T A C G T A C G T A C G T A C G T 3′ base 5′ base -0.6 -0.5 -0.4 -0.3 -0.2 -0.1 0 0.1 0.2 0.3 0.4 0.5 0.6 log 2 [FC] d Triplet -0.6 without IGC, this rate is reduced compared to that of unique sequence (-3.5%). Additionally, there is a 5.8% reduction in (G,C)>(A,T) bases consistent with IGC preferentially restoring CG bases that have mutated to AT bases through gBGC. These results indicate that gBGC between paralogous sequences may be a strong factor in shaping the mutational landscape of SDs. Although, the (A,T)>(C,G) frequency is comparable in SD regions not affected by IGC, the mutational landscape at large is still very distinct between SDs and unique parts of the genome. In PCA of the mutational spectra in SDs without IGC, the first principal component distinguishing the mutational spectrum of SDs and unique DNA captures a larger fraction of the variation (94.6%) than in the PCA including IGC sites (80.2%; Supplementary Fig. 20).", - "coords": [], - "refs": [], - "head_section": "SNV mutational spectra in SDs" - }, - { - "id": "p_54d7a22a", - "text": "To model the combined effect of unique mutational properties, evolutionary age and sequence content on the frequency of SNVs, we developed a multivariable linear regression using copy number, SD identity, number of unique IGC events, GC content and TMRCA to predict the number of SNVs seen in a 10-kbp window. 
A linear model containing all pairwise interactions of these predictors was able to explain 10.5% of the variation in SNVs per 10 kbp (adjusted R 2 ), whereas a model containing only the number of IGC events explained only 1.8% of the variation. We note that this measure of variance is related but not directly comparable to the finding that the elevation in the number of SNVs is reduced by 23% when excluding IGC regions. All of the random variables, including their pairwise interactions, were significant (P value < 0.05) predictors of SNVs per 10 kbp except the interaction of number of IGC events with GC content, copy number and TMRCA. The strongest single predictors were the number of unique IGC events and the divergence of the overlapping SD (Supplementary Table 14).", - "coords": [], - "refs": [], - "head_section": "Modelling of elevated SNV frequency" - }, - { - "id": "p_46d41d28", - "text": "Since the first publications of the human genome 12,13 , the pattern of single-nucleotide variation in recently duplicated sequence has been difficult to ascertain, leading to errors 2,11 . Later, indirect approaches were used to infer true SNVs in SDs, but these were far from complete 40 . More often than not, large-scale sequencing efforts simply excluded such regions in an effort to prevent paralogous sequence variants from contaminating single-nucleotide polymorphism databases and leading to false genetic associations 8,23 . The use of phased genome assemblies as opposed to aligned sequence reads had the advantage of allowing us to establish 1:1 orthologous relationships as well as the ability to discern the effect of IGC while comparing the pattern of single-nucleotide variation for both duplicated and unique DNA within the same haplotypes. As a result, we identify over 1.99 million nonredundant SNVs in a gene-rich portion of the genome previously considered largely inaccessible. 
SNV density is significantly elevated (60%) in duplicated DNA when compared to unique DNA consistent with suggestions from primate genome comparisons and more recent de novo mutation studies from long-read sequencing data [46][47][48] . Furthermore, an increased de novo mutation rate in SDs could support our observation of an elevated SNV density without the need for an increase in TMRCA. We estimate that at least 23% of this increase is due to the action of IGC between paralogous sequences that essentially diversify allelic copies through concerted evolution. IGC in SDs seems to be more pervasive in the human genome compared to earlier estimates 15,27 , which owing to mapping uncertainties or gaps could assay only a smaller subset of regions 15,27 . We estimate more than 32,000 candidate regions (including 799 protein-coding genes) with the average human haplotype showing 1,192 events when compared to the reference. The putative IGC events are also much larger (mean 6.26 kbp) than those of most previous reports 28,49 , with the top 10% of the size distribution >14.4 kbp in length. This has the net effect that entire genes are copied hundreds of kilobase pairs into a new genomic context when compared to the reference. 
The effect of such 'repositioning events' on gene regulation will be an interesting avenue of future research.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b11", - "text": "12,", - "offset_start": 49, - "offset_end": 52 - }, - { - "type": "bibr", - "target": "#b12", - "text": "13", - "offset_start": 52, - "offset_end": 54 - }, - { - "type": "bibr", - "target": "#b1", - "text": "2,", - "offset_start": 183, - "offset_end": 185 - }, - { - "type": "bibr", - "target": "#b10", - "text": "11", - "offset_start": 185, - "offset_end": 187 - }, - { - "type": "bibr", - "target": "#b39", - "text": "40", - "offset_start": 287, - "offset_end": 289 - }, - { - "type": "bibr", - "target": "#b7", - "text": "8,", - "offset_start": 528, - "offset_end": 530 - }, - { - "type": "bibr", - "target": "", - "text": "23", - "offset_start": 530, - "offset_end": 532 - }, - { - "type": "bibr", - "target": "", - "text": "[46]", - "offset_start": 1222, - "offset_end": 1226 - }, - { - "type": "bibr", - "target": "", - "text": "[47]", - "offset_start": 1226, - "offset_end": 1230 - }, - { - "type": "bibr", - "target": "#b47", - "text": "[48]", - "offset_start": 1230, - "offset_end": 1234 - }, - { - "type": "bibr", - "target": "#b14", - "text": "15,", - "offset_start": 1655, - "offset_end": 1658 - }, - { - "type": "bibr", - "target": "#b26", - "text": "27", - "offset_start": 1658, - "offset_end": 1660 - }, - { - "type": "bibr", - "target": "#b14", - "text": "15,", - "offset_start": 1753, - "offset_end": 1756 - }, - { - "type": "bibr", - "target": "#b26", - "text": "27", - "offset_start": 1756, - "offset_end": 1758 - }, - { - "type": "bibr", - "target": "#b27", - "text": "28,", - "offset_start": 2028, - "offset_end": 2031 - }, - { - "type": "bibr", - "target": "#b48", - "text": "49", - "offset_start": 2031, - "offset_end": 2033 - } - ], - "head_section": "Discussion" - }, - { - "id": "p_a61f39ae", - "text": "As for allelic gene conversion, our predicted nonallelic gene conversion 
events are abundant, cluster into larger regional hotspots and favour G and C mutations, although this last property is not restricted to IGC regions 45,50 . Although we classify these regions as putative IGC events, other mutational processes such as deletion followed by duplicative transposition could, in principle, generate the same signal creating large tracts of 'repositioned' DNA. It should also be stressed that our method simply relies on the discovery of a closer match within the reference; by definition, this limits the detection of IGC events to regions where the donor sequence is already present in the reference as opposed to an alternative. Moreover, we interrogated only regions where 1:1 synteny could be unambiguously established. As more of the genome is assessed in the context of a pangenome reference framework, we anticipate that the proportion of IGC will increase, especially as large-copy-number polymorphic SDs, centromeres and acrocentric DNA become fully sequence resolved 3 . Although we estimate 4.3 Mbp of IGC in SDs on average per human haplotype, we caution that this almost certainly represents a lower bound and should not yet be regarded as a rate until more of the genome is surveyed and studies are carried out in the context of parent-child trios to observe germline events.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b44", - "text": "45,", - "offset_start": 223, - "offset_end": 226 - }, - { - "type": "bibr", - "target": "#b49", - "text": "50", - "offset_start": 226, - "offset_end": 228 - }, - { - "type": "bibr", - "target": "#b2", - "text": "3", - "offset_start": 1080, - "offset_end": 1081 - } - ], - "head_section": "Discussion" - }, - { - "id": "p_a0cef58e", - "text": "One of the most notable features of duplicated DNA is its higher GC content. In this study, we show that there is a clear skew in the mutational spectrum of SNVs to maintain this property of SDs beyond expectations from unique DNA. 
This property and the unexpected Ti/Tv ratio cannot be explained by lower accuracy of the assembly of SD regions. We find a 27.1% increase in transversions that convert cytosine to guanine or the reverse across all triplet contexts. GC-rich DNA has long been regarded as hypermutable. For example, C>G mutations preferentially associate with double-strand breaks in humans and apes 42,43 and GC-rich regions in yeast show about 2-5 times more mutations depending on sequence context compared to AT-rich DNA 41 . Notably, in human SD regions, we observe a paucity of CpG transition mutations, characteristically associated with spontaneous deamination of CpG dinucleotides and concomitant transitions 6 . The basis for this is unclear, but it may be partially explained by the recent observation that duplicated genes show a greater degree of hypomethylation when compared to their unique counterparts 10 . We propose that excess of guanosine and cytosine transversions is a direct consequence of GC-biased gene conversion 5 driven by an excess of double-strand breaks that result from a high rate of nonallelic homologous recombination events and other break-induced replication mechanisms among paralogous sequences.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b41", - "text": "42,", - "offset_start": 614, - "offset_end": 617 - }, - { - "type": "bibr", - "target": "#b42", - "text": "43", - "offset_start": 617, - "offset_end": 619 - }, - { - "type": "bibr", - "target": "#b40", - "text": "41", - "offset_start": 739, - "offset_end": 741 - }, - { - "type": "bibr", - "target": "#b5", - "text": "6", - "offset_start": 932, - "offset_end": 933 - }, - { - "type": "bibr", - "target": "#b9", - "text": "10", - "offset_start": 1133, - "offset_end": 1135 - }, - { - "type": "bibr", - "target": "#b4", - "text": "5", - "offset_start": 1254, - "offset_end": 1255 - } - ], - "head_section": "Discussion" - }, - { - "id": "p_924408b8", - "text": "Any methods, additional references, 
Nature Portfolio reporting summaries, source data, extended data, supplementary information, acknowledgements, peer review information; details of author contributions and competing interests; and statements of data and code availability are available at https://doi.org/10.1038/s41586-023-05895-y.", - "coords": [], - "refs": [], - "head_section": "Online content" - }, - { - "id": "p_47f4100e", - "text": "To define regions of SD, we used the annotations available for T2T-CHM13 v1.1 (ref. 10), which include all nonallelic intrachromosomal and interchromosomal pairwise alignments >1 kbp and with >90% sequence identity that do not consist entirely of common repeats or satellite sequences 11 . To define unique regions, we found the coordinates in T2T-CHM13 that were not SDs, ancient SDs (<90% sequence identity), centromeres or satellite arrays 51 and defined these areas to be the non-duplicated (unique) parts of the genome. For both SDs and unique regions, variants in tandem repeat elements as identified by Tandem Repeats Finder 52 were excluded because many SNVs called in these regions are ultimately alignment artefacts. RepeatMasker v4.1.2 was used to annotate SNVs with additional repeat classes beyond SDs 53 .", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b10", - "text": "11", - "offset_start": 285, - "offset_end": 287 - }, - { - "type": "bibr", - "target": "#b58", - "text": "51", - "offset_start": 443, - "offset_end": 445 - }, - { - "type": "bibr", - "target": "#b59", - "text": "52", - "offset_start": 632, - "offset_end": 634 - }, - { - "type": "bibr", - "target": "#b60", - "text": "53", - "offset_start": 815, - "offset_end": 817 - } - ], - "head_section": "Defining unique and SD regions" - }, - { - "id": "p_c5cad720", - "text": "The goal of this analysis was to validate copy number from the assembled HPRC haplotypes compared to estimates from read-depth analysis of the same samples sequenced using Illumina whole-genome sequencing (WGS). 
Large, recently duplicated segments are prone to copy number variation and are also susceptible to collapse and misassembly owing to their repetitive nature. HPRC haplotypes were assembled using PacBio HiFi with hifiasm 3,54 creating contiguous long-read assemblies. We selected 19 SD loci corresponding to genes that were known to be duplicated and copy number variable in the human species. We k-merized the 2 haplotype assemblies corresponding to each locus for each individual into k-mers of 31 base pairs in length. We then computed copy number estimates over each locus for the sum haplotype assemblies and calculated the difference based on Illumina WGS from the same sample. For both datasets, we derived these estimates using FastCN, an algorithm implementing whole-genome shotgun sequence detection 55 . When averaging across each region and comparing differences in assembly copy versus Illumina WGS copy estimate, we observe that 756 out of 893 tests were perfectly matched (δ = 0), suggesting that most of these assemblies correctly represent the underlying genomic sequence of the samples.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b62", - "text": "55", - "offset_start": 1021, - "offset_end": 1023 - } - ], - "head_section": "Copy number estimate validation" - }, - { - "id": "p_b017d4e7", - "text": "Estimates of the quality value of SD and unique regions were made using Merqury v1.1 and parental Illumina sequencing data 56 . We first used Meryl to create k-mer databases (with a k-mer length of 21) using the parental sequencing data following the instructions in the Merqury documentation. Then Merqury was run with default parameters (merqury. 
sh {k-mer meryl database} {paternal sequence} {maternal sequence}) to generate quality value estimates for the hifiasm assemblies.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b63", - "text": "56", - "offset_start": 123, - "offset_end": 125 - } - ], - "head_section": "Quality value estimations with Merqury" - }, - { - "id": "p_2585952d", - "text": "For the 35 HPRC assemblies with matched ultralong Oxford Nanopore Technologies (ONT) data, we applied GAVISUNK v1.0.0 as an orthogonal validation of HiFi assembly integrity 57 . In brief, candidate haplotype-specific singly unique nucleotide k-mers (SUNKs) of length 20 are determined from the HiFi assembly and compared to ONT reads phased with parental Illumina data. Inter-SUNK distances are required to be consistent between the assembly and ONT reads, and regions that can be spanned and tiled with consistent ONT reads are considered validated. ONT read dropouts do not necessarily correspond to misassembly-they are also caused by large regions devoid of haplotype-specific SUNKs from recent duplications, homozygosity or over-assembly of the region, as well as Poisson dropout of read coverage.", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b64", - "text": "57", - "offset_start": 173, - "offset_end": 175 - } - ], - "head_section": "Haplotype integrity analysis using inter-SUNK approach" - }, - { - "id": "p_08efedc4", - "text": "For the 94 assembled HPRC haplotypes, we downloaded the regions identified to have abnormal coverage form S3 (s3://human-pangenomics/ submissions/e9ad8022-1b30-11ec-ab04-0a13c5208311-COVERAGE_ ANALYSIS_Y1_GENBANK/FLAGGER/JAN_09_2022/FINAL_HIFI_BASED/ FLAGGER_HIFI_ASM_SIMPLIFIED_BEDS/ALL/). We then intersected these regions with the callable SD regions in each assembly to determine the number of collapsed, falsely duplicated and low-coverage base pairs in each assembly. 
The unreliable regions were determined by the HPRC using Flagger v0.1 (https://github.com/mobinasri/flagger/) 3 .", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b2", - "text": "3", - "offset_start": 584, - "offset_end": 585 - } - ], - "head_section": "Read-depth analysis using the HPRC unreliable callset" - }, - { - "id": "p_e445a978", - "text": "Whole-genome alignments were calculated against T2T-CHM13 v1.1 with a copy of GRCh38 chrY using minimap2 v2. 24 (ref. 58) with the parameters -a -x asm20-secondary=no -s 25000 -K 8G. The alignments were further processed with rustybam v0. 1.29 (ref. 59) using the subcommands trim-paf to remove redundant alignments in the query sequence and break-paf to split alignments on structural variants over 10 kbp. After these steps, the remaining alignments over 1 Mbp of continuously aligned sequence were defined to be syntenic. The software pipeline is available on GitHub at https://github.com/ mrvollger/asm-to-reference-alignment/ (refs. 58-67).", - "coords": [], - "refs": [], - "head_section": "Whole-genome alignments and synteny definition" - }, - { - "id": "p_a5c2c811", - "text": "When enumerating the number of SNVs, we count all pairwise differences between the haplotypes and the reference, counting events observed in multiple haplotypes multiple times. Therefore, except when otherwise indicated, we are referring to the total number of pairwise differences rather than the total number of nonredundant SNVs (number of segregation sites). The software pipeline is available on GitHub at https://github.com/mrvollger/sd-divergence (refs. 60-63,65,66,68).", - "coords": [], - "refs": [], - "head_section": "Estimating the diversity of SNVs in SDs and unique sequences" - }, - { - "id": "p_517fcf65", - "text": "Each query haplotype genome sequence was aligned to the reference genome (T2T-CHM13 v1.1) using minimap2 v2. 24 (ref. 
58) considering only those regions that align in a 1:1 fashion for >1 Mbp without any evidence of gaps or discontinuities greater than 10 kbp in size. This eliminates large forms of structural variation, including copy number variants or regions of large-scale inversion restricting the analysis to largely copy number invariant SD regions (about 120 Mbp) and flanking unique sequence. Once these syntenic alignments were defined, we carried out a second alignment fragmenting the 1:1 synteny blocks into 1-kbp windows (100-bp increments) and remapped back to T2T-CHM13 to identify each window's single best alignment position. These second alignments were then compared to original syntenic ones and if they no longer overlapped, we considered them to be candidate IGC regions. Adjacent IGC windows were subsequently merged into larger intervals when windows continued to be mapped non-syntenically with respect to the original alignment. We then used the CIGAR string to identify the number of matching and mismatching bases at the 'donor' site and compared that to the number of matching and mismatching bases at the acceptor site determined by the syntenic alignment. A donor sequence is, thus, defined as a segment in T2T-CHM13 that now maps with higher sequence identity to a new location in the human haplotype (alignment method 2) and the acceptor sequence is the segment in T2T-CHM13 that has an orthologous mapping to the same region in the human haplotype (alignment method 1). As such, there is dependence on both the reference genome and the haplotype being compared. The software pipeline is available on GitHub at https://github.com/mrvollger/asm-to-reference-alignment/ (refs. 58-67).", - "coords": [], - "refs": [], - "head_section": "Defining IGC events" - }, - { - "id": "p_b423d281", - "text": "To assign confidence measures to our IGC events, we adapted a previously described method 69 to calculate a P value for every one of our candidate IGC calls. 
Our method uses a cumulative binomial distribution constructed from the number of SNVs supporting the IGC event and the total number of informative sites between two paralogues to assign a one-sided P value to each event. Specifically:", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b76", - "text": "69", - "offset_start": 90, - "offset_end": 92 - } - ], - "head_section": "Assigning confidence to IGC events" - }, - { - "id": "p_4ec9ee0b", - "text": "in which B is the binomial cumulative distribution, n is the number of informative sites between paralogues, k is the number of informative sites that agree with the non-converted sequence (acceptor site), and p is the probability that at an informative site the base matches the acceptor sequence. We assume p to be 0.5 reflecting that a supporting base change can come from one of two sources: the donor or acceptor paralogue. With these assumptions, our binomial model reports the probability that we observe k or fewer sites that support the acceptor site (that is, no IGC) at random given the data, giving us a one-sided P value for each IGC event. No adjustments were made for multiple comparisons.", - "coords": [], - "refs": [], - "head_section": "Assigning confidence to IGC events" - }, - { - "id": "p_66028545", - "text": "To test the specificity of our method, we applied it to an equivalent total of unique sequence (125 Mbp) on each haplotype, which we expected to show no or low levels of IGC. On average, we identify only 33.5 IGC events affecting 38.2 kbp of sequence per haplotype. If we restrict this to high-confidence IGC events, we see only 5.93 events on average affecting 7.29 kbp. 
This implies that our method is detecting IGC above background in SDs and that the frequency of IGC in SDs is more than 50 times higher in the high-confidence callsets (31,910 versus 605).", - "coords": [], - "refs": [], - "head_section": "Testing for IGC in unique regions" - }, - { - "id": "p_e73ae673", - "text": "We assembled HG00514, NA12878 and HG03125 using HiFi long-read data and hifiasm v0.", - "coords": [], - "refs": [], - "head_section": "Additional genome assemblies" - }, - { - "id": "p_2d7f5702", - "text": "15.2 with parental Illumina data 54 . Using HiFi long-read data and hifiasm v0.15.2 we also assembled the genome of the now-deceased chimpanzee Clint (sample S006007). The assembly is locally phased as trio-binning and HiC data were unavailable. Data are available on the National Center for Biotechnology Information (NCBI) Sequence Read Archive (SRA) under the BioProjects PRJNA551670 (ref. 4), PRJNA540705 (ref. 70), PRJEB36100 (ref. 4) and PRJNA659034 (ref. 47). These assemblies are made available on Zenodo (https://doi. org/10.5281/zenodo.6792653) 71 .", - "coords": [], - "refs": [], - "head_section": "Additional genome assemblies" - }, - { - "id": "p_90ea3542", - "text": "The mutational spectra for unique and SD regions from each individual were computed using mutyper on the basis of derived SNVs polarized against the chimpanzee genome assembly described above [72][73][74] . These spectra were normalized to the triplet content of the respective unique or SD regions by dividing the count of each triplet mutation type by the total count of each triplet context in the ancestral region and normalizing the number of counts in SD and unique sequences to be the same. For PCA, the data were further normalized using the centred log-ratio transformation, which is commonly used for compositional measurements 75 . The code is available on GitHub at https://github.com/ mrvollger/mutyper_workflow/ (refs. 
61-63,65,72,76).", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b79", - "text": "[72]", - "offset_start": 192, - "offset_end": 196 - }, - { - "type": "bibr", - "target": "#b80", - "text": "[73]", - "offset_start": 196, - "offset_end": 200 - }, - { - "type": "bibr", - "target": "#b81", - "text": "[74]", - "offset_start": 200, - "offset_end": 204 - }, - { - "type": "bibr", - "target": "#b82", - "text": "75", - "offset_start": 638, - "offset_end": 640 - } - ], - "head_section": "Determining the composition of triplet mutations in SD and unique sequences" - }, - { - "id": "p_a48d733f", - "text": "To estimate TMRCA for a locus of interest, we focus on orthologous sequences (10-kbp windows) identified in synteny among human and chimpanzee haplotypes. Under an assumption of infinite sites, the number of mutations x i between a human sequence and its most recent common ancestor is Poisson distributed with a mean of µ T × , in which µ is the mutation rate scaled with respect to the substitutions between human and chimpanzee lineages, and T is the TMRCA. That is,", - "coords": [], - "refs": [], - "head_section": "Estimation of TMRCA" - }, - { - "id": "p_2c35888b", - "text": ", in which n is the number of human haplotypes. To convert TMRCA to time in years, we assume six million years of divergence between human and chimpanzee lineages. We note that the TMRCA estimates reported in the present study account for mutation variation across loci (that is, if the mutation rate is elevated for a locus, the effect would be accounted for). Thus, for each individual locus, an independent mutation (not uniform) rate is applied depending on the observed pattern of mutations compared to the chimpanzee outgroup.", - "coords": [], - "refs": [], - "head_section": "Estimation of TMRCA" - }, - { - "id": "p_c8940adf", - "text": "Whole-genome alignments were calculated for the HPRC assemblies against T2T-CHM13 v1.1 with a copy of GRCh38 chrY using minimap2 v2.24. 
The alignments were further processed to remove alignments that were redundant in query sequence or that had structural variants over 10 kbp in length. After these steps, the remaining alignments over 1 Mbp were defined to be syntenic and used in downstream analyses. We then counted all pairwise singlenucleotide differences between the haplotypes and the reference and stratified these results into unique regions versus SD regions based on the SD annotations from T2T-CHM13 v1.1. All variants intersecting tandem repeats were filtered to avoid spurious SNV calls. To detect candidate regions of IGC, the query sequence with syntenic alignments was fragmented into 1 kbp windows with a 100 bp slide and realigned back to T2T-CHM13 v1.1 independent of the flanking sequence using minimap2 v2.24 to identify each window's single best alignment position. These alignments were compared to their original syntenic alignment positions, and if they were not overlapping, we considered them to be candidate IGC windows. Candidate IGC windows were then merged into larger intervals and realigned when windows were overlapping in both the donor and the acceptor sequence. We then used the CIGAR string to identify the number of matching and mismatching bases at the \"donor\" site and compared that to the number of matching and mismatching bases at the acceptor site determined by the syntenic alignment to calculate the number of supporting SNVs. S3.", - "coords": [], - "refs": [], - "head_section": "Extended Data Fig. 1 | Analysis schema for variant and IGC calling." - }, - { - "id": "p_b3947b3a", - "text": "Extended Data Fig. 5 | Largest IGC events in the human genome. The ideogram depicts as red arcs the positions of the largest IGC events between and within human chromosomes (top 10% of the length distribution).", - "coords": [], - "refs": [], - "head_section": "Extended" - }, - { - "id": "p_d754ebc9", - "text": "Extended Data Fig. 
6 | Percent of increased single-nucleotide variation explained by IGC. Shown is the fraction of the increased SNV diversity in SDs that can be attributed to IGC for each of the HPRC haplotypes stratified by global superpopulation. In text is the average across all haplotypes (23%).", - "coords": [], - "refs": [], - "head_section": "Extended" - }, - { - "id": "p_69ac39e6", - "text": "Acknowledgements We thank T. Brown for help in editing this manuscript, P. Green for valuable suggestions, and R. Seroussi and his staff for their generous donation of time and resources. This work was supported in part by grants from the US National Institutes of Health (NIH 5R01HG002385, 5U01HG010971 and 1U01HG010973 to E.E.E.; K99HG011041 to P.H.; and F31AI150163 to W.S.D.). W.S.D. was supported in part by a Fellowship in Understanding Dynamic and Multi-scale Systems from the James S. McDonnell Foundation. E.E.E. is an investigator of the Howard Hughes Medical Institute (HHMI). This article is subject to HHMI's Open Access to Publications policy. HHMI laboratory heads have previously granted a nonexclusive CC BY 4.0 licence to the public and a sublicensable licence to HHMI in their research articles. Pursuant to those licences, the author-accepted manuscript of this article can be made freely available under a CC BY 4.0 licence immediately on publication.", - "coords": [], - "refs": [] - }, - { - "id": "p_05e26b0b", - "text": "PacBio HiFi and ONT data have been deposited into NCBI SRA under the following BioProject IDs: PRJNA850430, PRJNA731524, PRJNA551670, PRJNA540705 and PRJEB36100. PacBio HiFi data for CHM1 are available under the following SRA accessions: SRX10759865 and SRX10759866. Sequencing data for Clint PTR are available on NCBI SRA under the Bio-Project PRJNA659034. The T2T-CHM13 v1.1 assembly can be found on NCBI (GCA_009914755.3). 
Cell lines obtained from the NIGMS Human Genetic Cell Repository at the Coriell Institute for Medical Research are listed in Supplementary Table 1. Assemblies of HPRC samples are available on NCBI under the BioProject PRJNA730822. All additional assemblies used in this work (Clint PTR, CHM1, HG00514, NA12878 and HG03125), variant calls, assembly alignments, and other annotation data used in analysis are available on Zenodo (https://doi.org/10.5281/ zenodo.6792653) 71 .", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b78", - "text": "71", - "offset_start": 895, - "offset_end": 897 - } - ], - "head_section": "Data availability" - }, - { - "id": "p_c5ec8ced", - "text": "The software pipeline for aligning assemblies and calling IGC is available on GitHub (https://github.com/mrvollger/asm-to-reference- alignmentv0.1) and Zenodo (https://zenodo.org/record/7653446) 67 . Code for analysing variants called against T2T-CHM13 v1.1 is available on GitHub (https://github.com/mrvollger/sd-divergencev0.1 and Zenodo (https://zenodo.org/record/7653464) 68 . The software pipeline for analysing the triple context of SNVs is available on GitHub (https://github.com/mrvollger/mutyper_workflowv0.1) and Zenodo (https://zenodo.org/record/7653472) 76 . Scripts for figure and table generation are available on GitHub (https://github.com/mrvollger/ sd-divergence-and-igc-figuresv0.1) and Zenodo (https://zenodo. org/record/7653486) 77 . 
GAVISUNK is available on GitHub (https:// github.com/pdishuck/GAVISUNK) and Zenodo (https://zenodo.org/ record/7655335) 57 .", - "coords": [], - "refs": [ - { - "type": "bibr", - "target": "#b74", - "text": "67", - "offset_start": 195, - "offset_end": 197 - }, - { - "type": "bibr", - "target": "#b75", - "text": "68", - "offset_start": 376, - "offset_end": 378 - }, - { - "type": "bibr", - "target": "#b83", - "text": "76", - "offset_start": 557, - "offset_end": 559 - }, - { - "type": "bibr", - "target": "#b84", - "text": "77", - "offset_start": 749, - "offset_end": 751 - }, - { - "type": "bibr", - "target": "#b64", - "text": "57", - "offset_start": 874, - "offset_end": 876 - } - ], - "head_section": "Code availability" - }, - { - "id": "p_1ecfda9c", - "text": "Competing interests E.E.E. is a scientific advisory board member of Variant Bio, Inc. All other authors declare no competing interests.", - "coords": [], - "refs": [] - }, - { - "id": "p_0b9ff802", - "text": "Author contributions Conceptualization and design: M.R.V., K. Harris, W.S.D., P.H. and E.E.E. Identification and analysis of SNVs from phased assemblies: M.R.V. Mutational spectrum analysis: M.R.V., W.S.D., M.E.G. and K. Harris. Evolutionary age analysis: M.R.V. and P.H. Assembly generation: M.A., J.L., B.P. and HPRC. PacBio genome sequence generation: K.M.M., A.P.L., K. Hoekzema and G.A.L. Copy number analysis and validation: P.C.D., X.G., W.T.H., A.N.R., D. Porubsky and M.R.V. Table organization: M.R.V. Supplementary material organization: M.R.V. Display items: M.R.V., X.G., P.H. and P.C.D. Resources: HPRC, K. Harris, B.P. and E.E.E. Manuscript writing: M.R.V. and E.E.E. 
with input from all authors.", - "coords": [], - "refs": [] - }, - { - "id": "p_d44e4a02", - "text": "Further information on research design is available in the Nature Portfolio Reporting Summary linked to this article.", - "coords": [], - "refs": [], - "head_section": "Reporting summary" - }, - { - "id": "p_4ed7ad2e", - "text": "The online version contains supplementary material available at https://doi.org/10.1038/s41586-023-05895-y. Correspondence and requests for materials should be addressed to Evan E. Eichler. Peer review information Nature thanks Anna Lindstrand and the other, anonymous, reviewer(s) for their contribution to the peer review of this work. Reprints and permissions information is available at http://www.nature.com/reprints.", - "coords": [], - "refs": [], - "head_section": "Additional information Supplementary information" - } - ], - "figures_and_tables": [ - { - "id": "fig_0", - "label": "", - "head": "", - "type": "figure", - "desc": "of SNV events that must map better at a new location Average amount of gene conversion per haplotype (Mbp)", - "note": "", - "coords": [] - }, - { - "id": "fig_1", - "label": "3", - "head": "Fig. 3 |", - "type": "figure", - "desc": "Fig. 3 | IGC hotspots. a, Density of IGC acceptor (top, blue) and donor (bottom, orange) sites across the 'SD genome'. The SD genome consists of all main SD regions (>50 kbp) minus the intervening unique sequences. b, All intrachromosomal IGC events on 24 human haplotypes analysed for chromosome 15. Arcs drawn in blue (top) have the acceptor site on the left-hand side and the donor site on the right. Arcs drawn in orange (bottom) are arranged oppositely. Protein-coding genes are drawn as vertical black lines above the ideogram, and large duplication (blue) and deletion (red) events associated with human diseases are drawn as horizontal lines just above the ideogram. c, Zoom of the 30 highest confidence (lowest P value) IGC events on chromosome 15 between 17 and 31 Mbp. 
The number to the left of each event shows its length (kbp) and that to the right shows its number of SNVs. Genes with IGC events are highlighted in red and associate with the breakpoint regions of Prader-Willi syndrome. An expanded graphic with all haplotypes is included in Extended Data Fig. 7.", - "note": "", - "coords": [] - }, - { - "id": "fig_2", - "label": "4", - "head": "Fig. 4 |", - "type": "figure", - "desc": "Fig. 4 | Protein-coding genes affected by IGC. a, Number of putative IGC events intersecting exons of protein-coding genes as a function of a gene's pLI. Of the 799 genes, 314 (39.3%) did not have a pLI score and are shown in the column labelled No pLI data available. b,c, Number of times a gene exon acts as an acceptor (b) or a donor (c) of an IGC event. d,e, IGC events at the complement factor locus, C4A and C4B (d), and the opsin middle-and long-wavelength-sensitive genes associated with colour blindness (OPN1MW and OPN1LW locus; e). Predicted donor (orange) and acceptor (blue) segments by length (number to left of event) and average number of supporting SNVs (number to right of event) are shown. The number of human haplotypes supporting each configuration is depicted by the histograms to the right. f,g, IGC events that reposition entire gene models for the FCGR (f) and TRIM (g) loci.", - "note": "", - "coords": [] - }, - { - "id": "fig_3", - "label": "5", - "head": "Fig. 5 |", - "type": "figure", - "desc": "Fig. 5 | Sequence composition and mutational spectra of SD SNVs. a, Compositional increase in GC-containing triplets in SD versus unique regions of the genome (coloured by GC content). b, Correlation between the enrichment of certain triplets in SDs compared to the mutability of that triplet in unique regions of the genome. Mutability is defined as the sum of all SNVs that change a triplet divided by the total count of that triplet in the genome. 
The enrichment ratio of SD over unique regions is indicated in text next to each triplet sequence. The text (upper left) indicates the value of the Pearson's correlation coefficient and the P value from a two-sided t-test without adjustment for multiple comparisons. c, PCA of the mutational spectra of triplets in SD (circles) versus unique (triangles) regions polarized against a chimpanzee genome assembly and coloured by the continental superpopulation of the sample. AFR, African; AMR, American; EAS, East Asian; EUR, European; SAS, South Asian. d, The log[fold change] in triplet mutation frequency between SD and unique sequences. The y axis represents the 5′ base of the triplet context; the first level of the x axis shows which central base has changed and the second level of the x axis shows the 3′ base: heatmap depicts the log[fold change]. As an example, the top left corner shows the log[fold change] in frequency of TAA>TCA mutations in SD versus unique sequences.", - "note": "", - "coords": [] - }, - { - "id": "fig_4", - "label": "2", - "head": "Data Fig. 2 |", - "type": "figure", - "desc": "Ideogram of an assembly of CHM1 aligned to T2T-CHM13. The ideogram depicts the contiguity (alternating blue and orange contigs) of a CHM1 assembly generated by Verkko as compared to T2T-CHM13. The overall contig N50 is 105.2 Mbp providing near chromosome arm contiguity with the exception of breaks at the centromere (red) and other large satellite arrays. Because the sequence is derived from a monoploid complete hydatidiform mole, there is no opportunity for assembly errors due to inadvertent haplotype switching.ExtendedData Fig. 3 | Increased variation in SD sequences and African haplotypes. Histograms of the average number of SNVs per 10 kbp over all 125 Mbp bins of unique (blue) and SD (red) sequence for all haplotypes. African haplotypes (bottom) are compared separately to non-African (top) haplotypes. 
All SD bins (125 Mbp each) have more SNVs than any unique bin irrespective of human superpopulation. Extended Data Fig. 4 | Average number of SNVs across different repeat classes. Shown are the average number of SNVs per 10 kbp within SDs (red), unique (blue), and additional sequence classes (gray) across the HPRC haplotypes. These classes include exonic regions, ancient SDs (SD with <90% sequence identity) and all elements identified by RepeatMasker (RM) with Alu, L1 LINE, and HERV elements broken out separately. Below each sequence class we show the average number of SNVs per 10 kbp for the median haplotype. Standard deviations and measurements for additional repeat classes are provided in Table", - "note": "", - "coords": [] - }, - { - "id": "fig_5", - "label": "7", - "head": "Data Fig. 7 |", - "type": "figure", - "desc": "IGC hotspots. a) Density of IGC acceptor (top, blue) and donor (bottom, orange) sites across the \"SD genome\". The SD genome consists of all main SD regions (>50 kbp) minus the intervening unique sequences. b) All intrachromosomal IGC events from 102 human haplotypes analyzed for chromosome 15. Arcs drawn in blue (top) have the acceptor site on the left-hand side and the donor site on the right. Arcs drawn in orange (bottom) are arranged oppositely. Protein-coding genes are drawn as vertical black lines above the ideogram, and large duplication (blue) and deletion (red) events associated with human diseases are drawn as horizontal lines just above the ideogram. c) Zoom of the 100 highest confidence (lowest p-value) IGC events identified on chromosome 15 between 17 and 31 Mbp. 
Genes that are intersected by IGC events are highlighted in red.", - "note": "", - "coords": [] - }, - { - "id": "fig_6", - "label": "", - "head": "", - "type": "figure", - "desc": "", - "note": "", - "coords": [ - { - "x": 16.0, - "y": 45.47, - "width": 48.96, - "height": 510.0 - } - ] - }, - { - "id": "fig_7", - "label": "", - "head": "", - "type": "figure", - "desc": "", - "note": "", - "coords": [ - { - "x": 17.0, - "y": 45.47, - "width": 48.96, - "height": 510.0 - } - ] - } - ], - "references": [ - { - "id": "b1", - "target": "b0", - "title": "Segmental duplications: organization and impact within the current human genome project assembly", - "authors": [ - "J Bailey", - "A Yavor", - "H Massa", - "B Trask", - "E Eichler" - ], - "journal": "Genome Res", - "publication_date": "2001", - "year": 2001, - "volume": "11", - "page_start": "1005", - "page_end": "1017" - }, - { - "id": "b2", - "target": "b1", - "title": "Complex SNP-related sequence variation in segmental genome duplications", - "authors": "D Fredman", - "journal": "Nat. Genet", - "publication_date": "2004", - "year": 2004, - "volume": "36", - "page_start": "861", - "page_end": "866" - }, - { - "id": "b3", - "target": "b2", - "title": "A draft human pangenome reference", - "authors": "W.-W Liao", - "journal": "Nature", - "publication_date": "2023", - "year": 2023, - "doi": "10.1038/s41586-023-05896-x", - "urls": [ - "https://doi.org/10.1038/s41586-023-05896-x", - "https://doi.org/10.1038/s41586-023-05896-x" - ] - }, - { - "id": "b4", - "target": "b3", - "title": "Haplotype-resolved diverse human genomes and integrated analysis of structural variation", - "authors": "P Ebert", - "journal": "Science", - "publication_date": "2021", - "year": 2021, - "volume": "372", - "pages": "7117" - }, - { - "id": "b5", - "target": "b4", - "title": "Biased gene conversion and the evolution of mammalian genomic landscapes", - "authors": [ - "L Duret", - "N Galtier" - ], - "journal": "Annu. Rev. Genomics Hum. 
Genet", - "publication_date": "2009-05", - "year": 2009, - "volume": "10", - "issue": "11", - "pages": "333", - "notes": "Nature |" - }, - { - "id": "b6", - "target": "b5", - "title": "Mutagenic deamination of cytosine residues in DNA", - "authors": [ - "B Duncan", - "J Miller" - ], - "journal": "Nature", - "publication_date": "1980", - "year": 1980, - "volume": "287", - "page_start": "560", - "page_end": "561" - }, - { - "id": "b7", - "target": "b6", - "title": "The International HapMap Project", - "authors": [ - "International Hapmap", - "Consortium" - ], - "journal": "Nature", - "publication_date": "2003", - "year": 2003, - "volume": "426", - "page_start": "789", - "page_end": "796" - }, - { - "id": "b8", - "target": "b7", - "title": "1000 Genomes Project Consortium et al. An integrated map of genetic variation from 1,092 human genomes", - "journal": "Nature", - "publication_date": "2012", - "year": 2012, - "volume": "491", - "page_start": "56", - "page_end": "65" - }, - { - "id": "b9", - "target": "b8", - "title": "Diversity of human copy number", - "authors": "P Sudmant", - "journal": "Science", - "publication_date": "2010", - "year": 2010, - "volume": "11184", - "page_start": "2", - "page_end": "7" - }, - { - "id": "b10", - "target": "b9", - "title": "Segmental duplications and their variation in a complete human genome", - "authors": "M Vollger", - "journal": "Science", - "publication_date": "2022", - "year": 2022, - "volume": "376", - "pages": "6965" - }, - { - "id": "b11", - "target": "b10", - "title": "Recent segmental duplications in the human genome", - "authors": "J Bailey", - "journal": "Science", - "publication_date": "2002", - "year": 2002, - "volume": "297", - "page_start": "1003", - "page_end": "1007" - }, - { - "id": "b12", - "target": "b11", - "title": "Initial sequencing and analysis of the human genome", - "authors": "Ihgsc", - "journal": "Nature", - "publication_date": "2001", - "year": 2001, - "volume": "409", - "page_start": "860", - 
"page_end": "921" - }, - { - "id": "b13", - "target": "b12", - "title": "The sequence of the human genome", - "authors": "J Venter", - "journal": "Science", - "publication_date": "2001", - "year": 2001, - "volume": "291", - "page_start": "1304", - "page_end": "1351" - }, - { - "id": "b14", - "target": "b13", - "title": "Segmental duplications and copy-number variation in the human genome", - "authors": "A Sharp", - "journal": "Am. J. Hum. Genet", - "publication_date": "2005", - "year": 2005, - "volume": "77", - "page_start": "78", - "page_end": "88" - }, - { - "id": "b15", - "target": "b14", - "title": "Interlocus gene conversion explains at least 2.7% of single nucleotide variants in human segmental duplications", - "authors": "B Dumont", - "journal": "BMC Genomics", - "publication_date": "2015", - "year": 2015, - "volume": "16", - "pages": "456" - }, - { - "id": "b16", - "target": "b15", - "title": "Alu transposition model for the origin and expansion of human segmental duplications", - "authors": [ - "J Bailey", - "G Liu", - "E Eichler", - "An" - ], - "journal": "Am. J. Hum. Genet", - "publication_date": "2003", - "year": 2003, - "volume": "73", - "page_start": "823", - "page_end": "834" - }, - { - "id": "b17", - "target": "b16", - "title": "Ancestral reconstruction of segmental duplications reveals punctuated cores of human genome evolution", - "authors": "Z Jiang", - "journal": "Nat. Genet", - "publication_date": "2007", - "year": 2007, - "volume": "39", - "page_start": "1361", - "page_end": "1368" - }, - { - "id": "b18", - "target": "b17", - "title": "Emergence of a Homo sapiens-specific gene family and chromosome 16p11. 
2 CNV susceptibility", - "authors": "X Nuttle", - "journal": "Nature", - "publication_date": "2016", - "year": 2016, - "volume": "536", - "page_start": "205", - "page_end": "209" - }, - { - "id": "b19", - "target": "b18", - "title": "Transcriptional fates of human-specific segmental duplications in brain", - "authors": "M Dougherty", - "journal": "Genome Res", - "publication_date": "2018", - "year": 2018, - "volume": "28", - "page_start": "1566", - "page_end": "1576" - }, - { - "id": "b20", - "target": "b19", - "title": "Human-specific NOTCH2NL genes affect notch signaling and cortical neurogenesis", - "authors": "I Fiddes", - "journal": "Cell", - "publication_date": "2018", - "year": 2018, - "volume": "173", - "page_start": "1356", - "page_end": "1369" - }, - { - "id": "b21", - "target": "b20", - "title": "The hominoid-specific gene TBC1D3 promotes generation of basal neural progenitors and induces cortical folding in mice", - "authors": "X.-C Ju", - "publication_date": "2016", - "year": 2016, - "volume": "5", - "pages": "18197" - }, - { - "id": "b22", - "target": "b21", - "title": "The ENCODE blacklist: identification of problematic regions of the genome", - "authors": [ - "H Amemiya", - "A Kundaje", - "A Boyle" - ], - "journal": "Sci. Rep", - "publication_date": "2019", - "year": 2019, - "volume": "9", - "pages": "9354" - }, - { - "id": "b23", - "target": "b22", - "title": "An open resource for accurately benchmarking small variant and reference calls", - "authors": "J Zook", - "journal": "Nat. 
Biotechnol", - "publication_date": "2019", - "year": 2019, - "volume": "37", - "page_start": "561", - "page_end": "566" - }, - { - "id": "b24", - "target": "b23", - "title": "The coalescent with selection on copy number variants", - "authors": [ - "K Teshima", - "H Innan" - ], - "journal": "Genetics", - "publication_date": "2012", - "year": 2012, - "volume": "190", - "page_start": "1077", - "page_end": "1086" - }, - { - "id": "b25", - "target": "b24", - "title": "The coalescent and infinite-site model of a small multigene family", - "authors": "H Innan", - "journal": "Genetics", - "publication_date": "2003", - "year": 2003, - "volume": "163", - "page_start": "803", - "page_end": "810" - }, - { - "id": "b26", - "target": "b25", - "title": "Interplay of interlocus gene conversion and crossover in segmental duplications under a neutral scenario", - "authors": [ - "D Hartasánchez", - "O Vallès-Codina", - "M Brasó-Vives", - "A Navarro" - ], - "journal": "G3 Genes Genomes Genet", - "publication_date": "2014", - "year": 2014, - "volume": "4", - "page_start": "1479", - "page_end": "1489" - }, - { - "id": "b27", - "target": "b26", - "title": "Frequent nonallelic gene conversion on the human lineage and its effect on the divergence of gene duplicates", - "authors": [ - "A Harpak", - "X Lan", - "Z Gao", - "J Pritchard" - ], - "journal": "Proc. Natl Acad. Sci. 
USA", - "publication_date": "2017", - "year": 2017, - "volume": "114", - "pages": "201708151" - }, - { - "id": "b28", - "target": "b27", - "title": "The rate and tract length of gene conversion between duplicated genes", - "authors": [ - "S Mansai", - "T Kado", - "H Innan" - ], - "journal": "Genes", - "publication_date": "2011", - "year": 2011, - "volume": "2", - "page_start": "313", - "page_end": "331" - }, - { - "id": "b29", - "target": "b28", - "title": "The complete sequence of a human genome", - "authors": "S Nurk", - "journal": "Science", - "publication_date": "2022", - "year": 2022, - "volume": "376", - "page_start": "44", - "page_end": "53" - }, - { - "id": "b30", - "target": "b29", - "title": "Semi-automated assembly of high-quality diploid human reference genomes", - "authors": "E Jarvis", - "journal": "Nature", - "publication_date": "2022", - "year": 2022, - "volume": "611", - "page_start": "519", - "page_end": "531" - }, - { - "id": "b31", - "target": "b30", - "title": "Gaps and complex structurally variant loci in phased genome assemblies", - "authors": "D Porubsky", - "journal": "Genom. Res", - "publication_date": "2023", - "year": 2023, - "doi": "10.1101/gr.277334.122", - "urls": [ - "https://doi.org/10.1101/gr.277334.122", - "https://doi.org/10.1101/gr.277334.122" - ] - }, - { - "id": "b32", - "target": "b31", - "title": "Telomere-to-telomere assembly of diploid chromosomes with Verkko", - "authors": "M Rautiainen", - "journal": "Nat. 
Biotechnol", - "publication_date": "2023", - "year": 2023, - "doi": "10.1038/s41587-023-01662-6", - "urls": [ - "https://doi.org/10.1038/s41587-023-01662-6", - "https://doi.org/10.1038/s41587-023-01662-6" - ] - }, - { - "id": "b33", - "target": "b32", - "title": "Dynamics of a human interparalog gene conversion hotspot", - "authors": [ - "E Bosch", - "M Hurles", - "A Navarro", - "M Jobling" - ], - "journal": "Genome Res", - "publication_date": "2004", - "year": 2004, - "volume": "14", - "page_start": "835", - "page_end": "844" - }, - { - "id": "b34", - "target": "b33", - "title": "Analysis of protein-coding genetic variation in 60,706 humans", - "authors": "M Lek", - "journal": "Nature", - "publication_date": "2016", - "year": 2016, - "volume": "536", - "page_start": "285", - "page_end": "291" - }, - { - "id": "b35", - "target": "b34", - "title": "Altered TAOK2 activity causes autism-related neurodevelopmental and cognitive abnormalities through RhoA signaling", - "authors": "M Richter", - "journal": "Mol. Psychiatry", - "publication_date": "2019", - "year": 2019, - "volume": "24", - "page_start": "1329", - "page_end": "1350" - }, - { - "id": "b36", - "target": "b35", - "title": "Schizophrenia risk from complex variation of complement component 4", - "authors": "A Sekar", - "journal": "Nature", - "publication_date": "2016", - "year": 2016, - "volume": "530", - "page_start": "177", - "page_end": "183" - }, - { - "id": "b37", - "target": "b36", - "title": "PDK1 decreases TACE-mediated α-secretase activity and promotes disease progression in prion and Alzheimer's diseases", - "authors": "M Pietri", - "journal": "Nat. 
Med", - "publication_date": "2013", - "year": 2013, - "volume": "19", - "page_start": "1124", - "page_end": "1131" - }, - { - "id": "b38", - "target": "b37", - "title": "Preservation of duplicate genes by complementary, degenerative mutations", - "authors": "A Force", - "journal": "Genetics", - "publication_date": "1999", - "year": 1999, - "volume": "151", - "page_start": "1531", - "page_end": "1545" - }, - { - "id": "b39", - "target": "b38", - "title": "Asymmetric sequence divergence of duplicate genes", - "authors": [ - "G Conant", - "A Wagner" - ], - "journal": "Genome Res", - "publication_date": "2003", - "year": 2003, - "volume": "13", - "page_start": "2052", - "page_end": "2058" - }, - { - "id": "b40", - "target": "b39", - "title": "Large-scale inference of the point mutational spectrum in human segmental duplications", - "authors": [ - "S Nakken", - "E Rødland", - "T Rognes", - "E Hovig" - ], - "journal": "BMC Genomics", - "publication_date": "2009", - "year": 2009, - "volume": "10", - "pages": "43" - }, - { - "id": "b41", - "target": "b40", - "title": "GC content elevates mutation and recombination rates in the yeast Saccharomyces cerevisiae", - "authors": [ - "D Kiktev", - "Z Sheng", - "K Lobachev", - "T Petes" - ], - "journal": "Proc. Natl Acad. Sci. USA", - "publication_date": "2018", - "year": 2018, - "volume": "115", - "notes": "E7109-E7118" - }, - { - "id": "b42", - "target": "b41", - "title": "Germline de novo mutation clusters arise during oocyte aging in genomic regions with high double-strand-break incidence", - "authors": "J Goldmann", - "journal": "Nat. 
Genet", - "publication_date": "2018", - "year": 2018, - "volume": "50", - "page_start": "487", - "page_end": "492" - }, - { - "id": "b43", - "target": "b42", - "title": "Overlooked roles of DNA damage and maternal age in generating human germline mutations", - "authors": "Z Gao", - "publication_date": "2019", - "year": 2019, - "volume": "116", - "page_start": "9491", - "page_end": "9500" - }, - { - "id": "b44", - "target": "b43", - "title": "Gene conversion tracts from double-strand break repair in mammalian cells", - "authors": [ - "B Elliott", - "C Richardson", - "J Winderbaum", - "J Nickoloff", - "M Jasin" - ], - "journal": "Mol. Cell. Biol", - "publication_date": "1998", - "year": 1998, - "volume": "18", - "page_start": "93", - "page_end": "101" - }, - { - "id": "b45", - "target": "b44", - "title": "Non-crossover gene conversions show strong GC bias and unexpected clustering in humans", - "authors": "A Williams", - "publication_date": "2015", - "year": 2015, - "volume": "4", - "pages": "4637" - }, - { - "id": "b46", - "target": "b45", - "title": "Analysis of primate genomic variation reveals a repeat-driven expansion of the human genome", - "authors": "G Liu", - "journal": "Genome Res", - "publication_date": "2003", - "year": 2003, - "volume": "13", - "page_start": "358", - "page_end": "368" - }, - { - "id": "b47", - "target": "b46", - "title": "The structure, function and evolution of a complete human chromosome 8", - "authors": "G Logsdon", - "journal": "Nature", - "publication_date": "2021", - "year": 2021, - "volume": "593", - "page_start": "101", - "page_end": "107" - }, - { - "id": "b48", - "target": "b47", - "title": "Familial long-read sequencing increases yield of de novo mutations", - "authors": "M Noyes", - "journal": "Am. J. Hum. 
Genet", - "publication_date": "2022", - "year": 2022, - "volume": "109", - "page_start": "631", - "page_end": "646" - }, - { - "id": "b49", - "target": "b48", - "title": "A phylogenetic approach disentangles interlocus gene conversion tract length and initiation rate", - "note_report_type": "Preprint at", - "authors": [ - "X Ji", - "J Thorne" - ], - "publication_date": "2019", - "year": 2019, - "urls": [ - "https://arxiv.org/abs/1908.08608", - "https://arxiv.org/abs/1908.08608" - ] - }, - { - "id": "b50", - "target": "b49", - "title": "Estimating the human mutation rate from autozygous segments reveals population differences in human mutational processes", - "authors": "V Narasimhan", - "journal": "Nat. Commun", - "publication_date": "2017", - "year": 2017, - "volume": "8", - "pages": "303" - }, - { - "id": "b51", - "target": "b50", - "title": "Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made. The images or other third party material in this article are included in the article's Creative Commons licence, unless indicated otherwise in a credit line to the material. 
If material is not included in the article's Creative Commons licence and your intended use is not permitted by statutory regulation or exceeds the permitted use", - "notes": "Publisher's note Springer Nature remains neutral with regard to jurisdictional claims in published maps and institutional affiliations", - "urls": [ - "http://creativecommons.org/licenses/by/4.0/", - "http://creativecommons.org/licenses/by/4.0/" - ] - }, - { - "id": "b52", - "target": "b51", - "authors": "Author The", - "pages": "2023" - }, - { - "id": "b53", - "target": "b52", - "title": "Human Pangenome Reference Consortium" - }, - { - "id": "b54", - "target": "b53", - "title": "Yan Gao 27 , Shilpa Garg 28", - "authors": "Julian Lucas", - "editors": [ - "Jennifer Mcdaniel 51", - "Karen Miga", - "Matthew Mitchell", - "Jean Monlong 5", - "Jacquelyn Mountcastle 24", - "Katherine Munson", - "Moses Njagi Mwaniki 53", - "Maria Nattestad 9", - "Adam Novak", - "Sergey Nurk 47", - "Hugh Olsen", - "Nathan Olson 51", - "Trevor Benedict Paten 5", - "Adam Pesout 5", - "Phillippy" - ], - "journal": "Nanibaa' A. Garrison", - "publication_date_text": "Jan. Hugo Magalhães 21. Tobias Marschall 21", - "volume": "25", - "pages": "61", - "notes": "Paul Flicek Xiaowen Feng Adam Frankish Giulio Formenti Cristian Groza Andrea Guarracino Miten Jain Erich D. Jarvis 6,24,42 , Hanlee P. Ji 43 , Eimear E. Kenny 44 Alexey Kolesnikov Jennifer Kordosky Sergey Koren HoJoon Lee 43 Alexandra P. Lewis Heng Li Shuangjia Lu Tsung-Yu Lu Pierre Marijon Charles Markello Fergal J. Martin Ann McCartney Pjotr Prins Daniela Puiu Mikko Rautiainen Baergen I. Schultz Kishwar Shafin Jonas A. Sibbesen Jouni Sirén Michael W. Smith Heidi J. Sofia Chad Tomlinson 8 , Francesca Floriana Tricomi 10 , Flavia Villani 18 , Mitchell R. Vollger 1,2 , Justin Wagner 51 , Brian Walenz 47 , Ting Wang 8,26 , Jonathan M. D. Wood 40 , Aleksey V. Zimin 55,62 & Justin M. 
Zook 51" - }, - { - "id": "b55", - "target": "b54", - "title": "16 Department of Data Sciences, Dana-Farber Cancer Institute", - "authors": "Llc Google", - "volume": "18", - "notes": "13 Institute for the Advanced Study of Human Biology 22 Center for Digital Medicine" - }, - { - "id": "b56", - "target": "b55", - "title": "27 Center for Computational and Genomic Medicine, The Children's Hospital of Philadelphia", - "journal": "Quantitative Biology Center (QBiC)", - "volume": "32", - "notes": "28 Novo Nordisk Foundation USA. 30 Institute for Precision Health 31 Division of General Internal Medicine and Health Services Research Dovetail Genomics 39 Biomedical Data Science 43 Division of Oncology" - }, - { - "id": "b57", - "target": "b56", - "journal": "European Molecular Biology Laboratory", - "notes": "Genome Biology Unit" - }, - { - "id": "b58", - "target": "b57", - "title": "50 Departament d'Arquitectura de Computadors i Sistemes Operatius", - "publisher": "United Arab Emirates. 61 Center for Genomic Discovery", - "volume": "52", - "notes": "National Library of Medicine 60 Al Jalila Genomics Center of Excellence National Institutes of Health United Arab Emirates. 
62 Center for Computational Biology" - }, - { - "id": "b59", - "target": "b58", - "title": "Complete genomic and epigenetic maps of human centromeres", - "authors": "N Altemose", - "journal": "Science", - "publication_date": "2022", - "year": 2022, - "volume": "376", - "pages": "4178" - }, - { - "id": "b60", - "target": "b59", - "title": "Tandem repeats finder: a program to analyze DNA sequences", - "authors": "G Benson", - "journal": "Nucleic Acids Res", - "publication_date": "1999", - "year": 1999, - "volume": "27", - "page_start": "573", - "page_end": "580" - }, - { - "id": "b61", - "target": "b60", - "authors": [ - "A Smit", - "R Hubley", - "P Green", - "Repeatmasker" - ], - "page_start": "2013", - "page_end": "2015", - "identifiers": { - "monograph_identifiers_unknown": "Open-4.0", - "biblstruct_identifiers_unknown": "Open-4.0" - }, - "urls": [ - "http://www.repeatmasker.org", - "http://www.repeatmasker.org" - ] - }, - { - "id": "b62", - "target": "b61", - "title": "Haplotype-resolved de novo assembly using phased assembly graphs with hifiasm", - "authors": [ - "H Cheng", - "G Concepcion", - "X Feng", - "H Zhang", - "H Li" - ], - "journal": "Nat. 
Methods", - "publication_date": "2021", - "year": 2021, - "volume": "18", - "page_start": "170", - "page_end": "175" - }, - { - "id": "b63", - "target": "b62", - "title": "Comparison of village dog and wolf genomes highlights the role of the neural crest in dog domestication", - "authors": "A Pendleton", - "journal": "BMC Biol", - "publication_date": "2018", - "year": 2018, - "volume": "16", - "pages": "64" - }, - { - "id": "b64", - "target": "b63", - "title": "Merqury: reference-free quality, completeness, and phasing assessment for genome assemblies", - "authors": [ - "A Rhie", - "B Walenz", - "S Koren", - "A Phillippy" - ], - "journal": "Genome Biol", - "publication_date": "2020", - "year": 2020, - "volume": "21", - "pages": "245" - }, - { - "id": "b65", - "target": "b64", - "title": "GAVISUNK: genome assembly validation via inter-SUNK distances in Oxford Nanopore reads", - "authors": [ - "P Dishuck", - "A Rozanski", - "G Logsdon", - "D Porubsky", - "E Eichler" - ], - "journal": "Bioinformatics", - "publication_date": "2022", - "year": 2022, - "volume": "39", - "pages": "714" - }, - { - "id": "b66", - "target": "b65", - "title": "Minimap2: pairwise alignment for nucleotide sequences", - "authors": "H Li", - "journal": "Bioinformatics", - "publication_date": "2018", - "year": 2018, - "volume": "34", - "page_start": "3094", - "page_end": "3100" - }, - { - "id": "b67", - "target": "b66", - "title": "mrvollger/rustybam: v0.1.29. 
Zenodo", - "authors": "M Vollger", - "publication_date": "2022", - "year": 2022, - "doi": "10.5281/ZENODO.6342176", - "urls": [ - "https://doi.org/10.5281/ZENODO.6342176", - "https://doi.org/10.5281/ZENODO.6342176" - ] - }, - { - "id": "b68", - "target": "b67", - "title": "The Sequence Alignment/Map format and SAMtools", - "authors": "H Li", - "journal": "Bioinformatics", - "publication_date": "2009", - "year": 2009, - "volume": "25", - "page_start": "2078", - "page_end": "2079" - }, - { - "id": "b69", - "target": "b68", - "title": "Twelve years of SAMtools and BCFtools", - "authors": "P Danecek", - "journal": "Gigascience", - "publication_date": "2021", - "year": 2021, - "volume": "10", - "pages": "8" - }, - { - "id": "b70", - "target": "b69", - "title": "HTSlib: C library for reading/writing high-throughput sequencing data", - "authors": "J Bonfield", - "journal": "Gigascience", - "publication_date": "2021", - "year": 2021, - "volume": "10", - "pages": "7" - }, - { - "id": "b71", - "target": "b70", - "title": "Sustainable data analysis with Snakemake. F1000Res", - "authors": "F Mölder", - "publication_date": "2021", - "year": 2021, - "volume": "10", - "pages": "33" - }, - { - "id": "b72", - "target": "b71", - "title": "Python module for reading and manipulating SAM/BAM/VCF/BCF files. GitHub", - "publication_date": "2021", - "year": 2021, - "urls": [ - "https://github.com/pysam-developers/pysam", - "https://github.com/pysam-developers/pysam" - ] - }, - { - "id": "b73", - "target": "b72", - "title": "BEDTools: the Swiss-army tool for genome feature analysis", - "authors": "A Quinlan", - "journal": "Curr. Protoc. Bioinformatics", - "publication_date": "2014", - "year": 2014, - "volume": "47", - "page_start": "11", - "page_end": "12" - }, - { - "id": "b74", - "target": "b73", - "title": "A synthetic-diploid benchmark for accurate variant-calling evaluation", - "authors": "H Li", - "journal": "Nat. 
Methods", - "publication_date": "2018", - "year": 2018, - "volume": "15", - "page_start": "595", - "page_end": "597" - }, - { - "id": "b75", - "target": "b74", - "title": "mrvollger/asm-to-reference-alignment: v0.1. Zenodo", - "authors": "M Vollger", - "publication_date": "2023", - "year": 2023, - "doi": "10.5281/ZENODO.7653446", - "urls": [ - "https://doi.org/10.5281/ZENODO.7653446", - "https://doi.org/10.5281/ZENODO.7653446" - ] - }, - { - "id": "b76", - "target": "b75", - "title": "mrvollger/sd-divergence: v0.1. Zenodo", - "authors": "M Vollger", - "publication_date": "2023", - "year": 2023, - "doi": "10.5281/ZENODO.7653464", - "urls": [ - "https://doi.org/10.5281/ZENODO.7653464", - "https://doi.org/10.5281/ZENODO.7653464" - ] - }, - { - "id": "b77", - "target": "b76", - "title": "Transposable element subfamily annotation has a reproducibility problem", - "authors": [ - "K Carey", - "G Patterson", - "T Wheeler" - ], - "journal": "Mob. DNA", - "publication_date": "2021", - "year": 2021, - "volume": "12", - "pages": "4" - }, - { - "id": "b78", - "target": "b77", - "title": "Fully phased human genome assembly without parental data using single-cell strand sequencing and long reads", - "authors": "D Porubsky", - "journal": "Nat. 
Biotechnol", - "publication_date": "2021", - "year": 2021, - "volume": "39", - "page_start": "302", - "page_end": "308" - }, - { - "id": "b79", - "target": "b78", - "title": "Supplementary data for: Increased mutation and gene conversion within human segmental duplications", - "authors": "M Vollger", - "journal": "Zenodo", - "publication_date": "2023", - "year": 2023, - "doi": "10.5281/zenodo.7651064", - "urls": [ - "https://doi.org/10.5281/zenodo.7651064", - "https://doi.org/10.5281/zenodo.7651064" - ] - }, - { - "id": "b80", - "target": "b79", - "title": "mutyper: assigning and summarizing mutation types for analyzing germline mutation spectra", - "note_report_type": "Preprint at", - "authors": "W Dewitt", - "publication_date": "2020", - "year": 2020, - "doi": "10.1101/2020.07.01.183392", - "urls": [ - "https://doi.org/10.1101/2020.07.01.183392", - "https://doi.org/10.1101/2020.07.01.183392" - ] - }, - { - "id": "b81", - "target": "b80", - "title": "Inferring evolutionary dynamics of mutation rates through the lens of mutation spectrum variation", - "authors": [ - "J Carlson", - "W Dewitt", - "K Harris" - ], - "journal": "Curr. Opin. Genet. Dev", - "publication_date": "2020", - "year": 2020, - "volume": "62", - "page_start": "50", - "page_end": "57" - }, - { - "id": "b82", - "target": "b81", - "title": "Evidence for recent, population-specific evolution of the human mutation rate", - "authors": "K Harris", - "journal": "Proc. Natl Acad. Sci. USA", - "publication_date": "2015", - "year": 2015, - "volume": "112", - "page_start": "3439", - "page_end": "3444" - }, - { - "id": "b83", - "target": "b82", - "title": "The statistical analysis of compositional data", - "authors": "J Aitchison", - "journal": "J. R. Stat. Soc", - "publication_date": "1982", - "year": 1982, - "volume": "44", - "page_start": "139", - "page_end": "160" - }, - { - "id": "b84", - "target": "b83", - "title": "mrvollger/mutyper_workflow: v0.1. 
Zenodo", - "authors": "M Vollger", - "publication_date": "2023", - "year": 2023, - "doi": "10.5281/ZENODO.7653472", - "urls": [ - "https://doi.org/10.5281/ZENODO.7653472", - "https://doi.org/10.5281/ZENODO.7653472" - ] - }, - { - "id": "b85", - "target": "b84", - "title": "mrvollger/sd-divergence-and-igc-figures: v0.1. Zenodo", - "authors": "M Vollger", - "publication_date": "2023", - "year": 2023, - "doi": "10.5281/ZENODO.7653486", - "urls": [ - "https://doi.org/10.5281/ZENODO.7653486", - "https://doi.org/10.5281/ZENODO.7653486" - ] - } - ] -} \ No newline at end of file diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 0dbe48a..62666ac 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -620,4 +620,31 @@ def test_offset_validation_for_specific_references(self): if total_offset_differences > 0: print(f"*** DETECTED {total_offset_differences} OFFSET ISSUES ***") else: - print("No offset differences detected between conversion and expected output") \ No newline at end of file + print("No offset differences detected between conversion and expected output") + + + def test_conversion_JSON(self): + from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter + + converter = TEI2LossyJSONConverter() + refs_offsets_dir = os.path.join(TEST_DATA_PATH, 'refs_offsets') + + xml_path = os.path.join(refs_offsets_dir, "2021.naacl-main.224.grobid.tei.xml") + + converted_json = converter.convert_tei_file(xml_path, stream=False) + + body = converted_json['body_text'] + + for paragraph in body: + if 'refs' in paragraph and paragraph['refs']: + for ref in paragraph['refs']: + offset_start = ref['offset_start'] + offset_end = ref['offset_end'] + ref_text = ref['text'] + paragraph_text = paragraph['text'] + + # Validate the offset actually points to the correct text + if 0 <= offset_start < offset_end <= len(paragraph_text): + actual_text = paragraph_text[offset_start:offset_end] + assert actual_text == ref_text, f"Reference text at 
offsets ({offset_start}-{offset_end}) should match '{ref_text}' but got '{actual_text}'" +