mmkim1210 · danielduyvo · Jul 15, 2023 · Jul 17, 2023 · Jul 17, 2023 · Jul 17, 2023
diff --git a/Project.toml b/Project.toml
@@ -5,9 +5,12 @@ version = "0.2.0"
 
 [deps]
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12"
 Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
+Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
 SnpArrays = "4e780e97-f5bf-4111-9dc4-b70aaf691b06"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
@@ -20,8 +23,8 @@ SnpArrays = "0.3"
 julia = "1.6, 1.7, 1.8"
 
 [extras]
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 test = ["Test", "CSV"]
diff --git a/docs/src/examples/gtf.md b/docs/src/examples/gtf.md
@@ -26,8 +26,9 @@ gencode = CSV.read("data/gencode/$(file)", DataFrame; delim = "\t", comment = "#
 !!! warning "Human genome build"
     The latest human genome assembly is [GRCh38](https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh38.p14), but we use an annotation with coordinates 
     from the older version (GRCh37), because a lot of the GWAS results are shared in 
-    GRCh37 genomic coordinates. Make sure to use the matching human genome build when
-    visualizing your results. 
+    GRCh37 genomic coordinates. Make sure to either use the matching human genome 
+    build or perform liftover on the GWAS summary statistics (as described in 
+    [Munging summary statistics](@ref)) when visualizing your results. 
 
 The ninth column of a [GTF file](https://uswest.ensembl.org/info/website/upload/gff.html) 
 contains rich information about features, so we can parse this column.
@@ -62,4 +63,4 @@ GeneticsMakie.findgene("ENSG00000078328", gencode)
 
 !!! tip "Gene names"
     Make sure to use the correct gene name in case the gene cannot be found.
-    The latest gene names can be looked up in databases such as [GeneCards](https://www.genecards.org/).
+    The latest gene names can be looked up in databases such as [GeneCards](https://www.genecards.org/).
diff --git a/docs/src/examples/summary.md b/docs/src/examples/summary.md
@@ -61,4 +61,122 @@ To reduce memory intake, we can store and load GWAS summary statistics as Arrow
 for (i, key) in enumerate(keys(gwas))
     Arrow.write("data/gwas/$(key).arrow", dfs[i])
 end
-```
+```
+
+Sometimes, it will be useful or necessary to harmonize the variant names as well. 
+One set of summary statistics may have the variants named by their chromosome and 
+position while another set may use the rsIDs from dbSNP to label the SNPs. With the 
+following code, we can harmonize variant names for our summary statistics to a specific 
+dbSNP version, falling back to `"$CHR:$BP:$A1:$A2"` if the variant is absent from 
+the dbSNP database.  
+
+First, we will need to normalize our summary statistics to the reference genome. Sometimes 
+summary statistics may have the alleles swapped by certain tools, so we will normalize 
+the summary statistics to match the GRCh37 reference build, which the dbSNP database 
+is also matched to.  
+
+The reference build is stored in the FASTA format. We will need version GRCh37 as 
+that is what our summary statistics are based on. UCSC shares these reference genomes 
+freely on their site; we will download the files from there and to uncompress and 
+index these files to use them with the `FASTX` julia package. We also download GRCh38 
+here too, as it will be relevant for our later examples (but not for this particular 
+example of variant name harmonization).
+
+```julia
+isdir("data/fasta") || mkdir("data/fasta")
+urls = [
+    "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz",
+    "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
+]
+for url in urls
+    isfile("data/fasta/$(basename(url))") || Downloads.download(url, "data/fasta/$(basename(url))")
+end
+```
+
+```bash
+gunzip data/fasta/hg19.fa.gz
+gunzip data/fasta/hg38.fa.gz
+samtools faidx data/fasta/hg19.fa
+samtools faidx data/fasta/hg38.fa
+```
+
+```julia
+import FASTX: FASTA
+hg19 = FASTA.Reader(open("data/fasta/hg19.fa"), index = "data/fasta/hg19.fa.fai")
+GeneticsMakie.normalizesumstats!(dfs, hg19)
+close(hg19)
+```
+
+Now, we will harmonize the variant names to dbSNP 151. Several versions of this reference 
+can be downloaded from the NIH site; we will download the version containing common 
+variants.
+```julia
+isdir("data/vcf") || mkdir("data/vcf")
+urls = [
+    "https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/00-common_all.vcf.gz",
+]
+for url in urls
+    isfile("data/vcf/$(basename(url))") || Downloads.download(url, "data/vcf/$(basename(url))")
+end
+```
+
+There are two different ways to use the `harmonizevariantnames!` function; either 
+you load in the VCF file into memory as a DataFrame or you use the CSV.Rows iterator 
+to read in one row at a time. Loading the VCF file into memory will speed up repeated 
+calls of `harmonizevariantnames!` at the cost of high memory usage while iterating 
+row by row will keep memory usage to a minimum but necessitate iterating through the 
+whole VCF file every function call. Additionally, when iterating, the file is expected 
+to be sorted by chromosome and position (i.e. `bcftools sort`); the dbSNP VCF files 
+are already sorted.
+
+```julia
+dbsnp_dataframe = CSV.read("data/vcf/00-common_all.vcf.gz", DataFrame; delim = "\t", comment = "##")
+GeneticsMakie.harmonizevariantnames!(dfs, dbsnp_dataframe)
+
+dbsnp_iterator = CSV.Rows("data/vcf/00-common_all.vcf.gz", delim = "\t", comment = "##")
+GeneticsMakie.harmonizevariantnames!(dfs, dbsnp_iterator)
+```
+
+The summary statistics we are using are on the genome assembly GRCh37. However, newer 
+GWAS and annotations may be shared on the latest genome assembly GRCh38. The genomic 
+coordinates between the two builds are incompatible; in order to visualize data across 
+builds, we need to "liftover" the coordinates all to one genome assembly. GeneticsMakie 
+provides functions for perfoming liftover on summary statistics.  
+
+To perform liftover, we need to read a chain file describing how genomic coordinates 
+map between one build to the other. Similar to `GeneticsMakie.mungesumstats!`, chromosome 
+names are of type `String` and are stripped of the prefix "chr".
+```julia
+isdir("data/chain") || mkdir("data/chain")
+url = "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz"
+isfile("data/chain/$(basename(url))") || Downloads.download(url, "data/chain/$(basename(url))")
+
+chain = GeneticsMakie.readchain("data/chain/$(basename(url))")
+```
+
+Additionally, we will need to read in FASTA files describing both the reference and 
+the query genome builds (GRCh37 and GRCh38 respectively). These are used to properly 
+liftover indels that were on the opposite strand. You will need to uncompress and 
+index these files to use them with the `FASTX` julia package.
+
+```julia
+hg19 = FASTA.Reader(open("data/fasta/hg19.fa"), index = "data/fasta/hg19.fa.fai")
+hg38 = FASTA.Reader(open("data/fasta/hg38.fa"), index = "data/fasta/hg38.fa.fai")
+```
+
+With the chain file and the FASTA files loaded, we can now perform liftover on our 
+GWAS. `GeneticsMakie.liftoversumstats!` will liftover the sumstats in place and return 
+a DataFrame of `unmapped`) unmapped variants still on the original build and `multiple`) 
+variants on the target build that has mapped to multiple positions. Liftover will 
+take a while to complete, especially summary statistics with many variants.
+```julia
+dfs_hg38 = deepcopy(dfs)
+unmapped, multiple = GeneticsMakie.liftoversumstats!(dfs_hg38, hg19fa, hg38fa, chain; 
+                                                     multiplematches = :warning,
+                                                     whichreference = :first_silent,
+                                                     indelreference = :start,
+                                                     extendambiguous = true)
+close(hg19)
+close(hg38)
+```
+
diff --git a/src/GeneticsMakie.jl b/src/GeneticsMakie.jl
@@ -1,11 +1,14 @@
 module GeneticsMakie
 
+using CodecZlib
 using CairoMakie
 using Makie.GeometryBasics
 using DataFrames
+using Parsers
 using SnpArrays
 using Statistics
 using Distributions
+import FASTX: FASTA
 
 include("parsegtf.jl")
 include("plotgenes.jl")