Merge pull request #8 from Truongphi20/main

Correct tutorial content
Cloufield · Oct 25, 2024 · d88595c · d88595c
2 parents 16744dd + 1f4626d
commit d88595c
Show file tree

Hide file tree

Showing 11 changed files with 94 additions and 73 deletions.
diff --git a/01_Dataset/t2d/1kgeas_t2d.log b/01_Dataset/t2d/1kgeas_t2d.log
@@ -8,7 +8,7 @@ Analysis started at 15:16:23 JST on Mon Jan 23 2023.
 Hostname: HEs-MacBook-Air.local
 
 Accepted options:
---bfile ../1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020
+--bfile ../1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing
 --simu-cc 200 304
 --simu-causal-loci t2d_casual.txt
 --simu-hsq 0.2
@@ -17,12 +17,12 @@ Accepted options:
 --out 1kgeas_t2d
 
 
-Reading PLINK FAM file from [../1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.fam].
-504 individuals to be included from [../1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.fam].
-Reading PLINK BIM file from [../1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.bim].
-1122299 SNPs to be included from [../1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.bim].
-Reading PLINK BED file from [../1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.bed] in SNP-major format ...
-Genotype data for 504 individuals and 1122299 SNPs to be included from [../1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.bed].
+Reading PLINK FAM file from [../1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.fam].
+504 individuals to be included from [../1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.fam].
+Reading PLINK BIM file from [../1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.bim].
+1122299 SNPs to be included from [../1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.bim].
+Reading PLINK BED file from [../1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.bed] in SNP-major format ...
+Genotype data for 504 individuals and 1122299 SNPs to be included from [../1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.bed].
 Simulation parameters:
 Number of simulation replicate(s) = 1 (Default = 1)
 Heritability of liability = 0.2 (Default = 0.1)

diff --git a/01_Dataset/t2d/simulate.sh b/01_Dataset/t2d/simulate.sh
@@ -9,6 +9,6 @@
 export PATH=~/tools/bin:$PATH
 export OMP_NUM_THREADS=1
 
-gcta64  --bfile ../1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020 --simu-cc 200 304  --simu-causal-loci t2d_causal.txt  --simu-hsq 0.2  --simu-k 0.4  --simu-rep 1  --out 1kgeas_t2d
+gcta64  --bfile ../1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing --simu-cc 200 304  --simu-causal-loci t2d_causal.txt  --simu-hsq 0.2  --simu-k 0.4  --simu-rep 1  --out 1kgeas_t2d
 echo "FID IID T2D" >1kgeas_t2d.txt
 cat 1kgeas_t2d.phen >>1kgeas_t2d.txt
diff --git a/03_Data_formats/README.md b/03_Data_formats/README.md
@@ -180,7 +180,7 @@ Original standard text format for sample pedigree information and genotype calls
 !!! example "`.ped`"
     ```
     # check the first 16 rows and 16 columns of the ped file
-    cut -d " " -f 1-16 1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.ped | head
+    cut -d " " -f 1-16 1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.ped | head
     0 HG00403 0 0 0 -9 G G T T A A G A C C
     0 HG00404 0 0 0 -9 G G T T A A G A T C
     0 HG00406 0 0 0 -9 G G T T A A G A T C
@@ -210,7 +210,7 @@ Variant information file accompanying a .ped text pedigree + genotype table. A t
 
 !!! example "`.map`"
     ```
-    head 1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.map
+    head 1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.map
     1       1:13273:G:C     0       13273
     1       1:14599:T:A     0       14599
     1       1:14604:A:G     0       14604
@@ -230,16 +230,16 @@ bed/fam/bim formats are the binary implementation of ped/map formats.
 bed/bim/fam files contain the same information as ped/map but are much smaller in size.
 
 ```
--rw-r----- 1 yunye yunye 135M Dec 23 11:45 1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.bed
--rw-r----- 1 yunye yunye  36M Dec 23 11:46 1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.bim
--rw-r----- 1 yunye yunye 9.4K Dec 23 11:46 1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.fam
--rw-r--r-- 1 yunye yunye  32M Dec 27 17:51 1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.map
--rw-r--r-- 1 yunye yunye 2.2G Dec 27 17:51 1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.ped
+-rw-r----- 1 yunye yunye 135M Dec 23 11:45 1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.bed
+-rw-r----- 1 yunye yunye  36M Dec 23 11:46 1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.bim
+-rw-r----- 1 yunye yunye 9.4K Dec 23 11:46 1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.fam
+-rw-r--r-- 1 yunye yunye  32M Dec 27 17:51 1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.map
+-rw-r--r-- 1 yunye yunye 2.2G Dec 27 17:51 1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.ped
 ```
 
 !!! example "`.fam`"
     ```
-    head 1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.fam
+    head 1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.fam
     0 HG00403 0 0 0 -9
     0 HG00404 0 0 0 -9
     0 HG00406 0 0 0 -9
@@ -254,7 +254,7 @@ bed/bim/fam files contain the same information as ped/map but are much smaller i
 
 !!! example "`.bim`"
     ```
-    head 1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.bim
+    head 1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.bim
     1       1:13273:G:C     0       13273   C       G
     1       1:14599:T:A     0       14599   A       T
     1       1:14604:A:G     0       14604   G       A
@@ -272,7 +272,7 @@ bed/bim/fam files contain the same information as ped/map but are much smaller i
     The first three bytes should be 0x6c, 0x1b, and 0x01 in that order.
     The rest of the file is a sequence of V blocks of N/4 (rounded up) bytes each, where V is the number of variants and N is the number of samples. The first block corresponds to the first marker in the .bim file, etc."
     ```
-    hexdump -C 1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020.bed | head
+    hexdump -C 1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing.bed | head
     00000000  6c 1b 01 ff ff bf bf ff  ff ff ef fb ff ff ff fe  |l...............|
     00000010  ff ff ff ff fb ff bb ff  ff fb af ff ff fe fb ff  |................|
     00000020  ff ff ff fe ff ff ff ff  ff bf ff ff ef ff ff ef  |................|

diff --git a/04_Data_QC/README.md b/04_Data_QC/README.md
@@ -447,47 +447,68 @@ The following command can calculate the Hardy-Weinberg equilibrium exact test st
 
     This code is converted from [here](https://github.com/jeremymcrae/snphwe/blob/master/src/snp_hwe.cpp) (Jeremy McRae) to python. Orginal citation: Wigginton, JE, Cutler, DJ, and Abecasis, GR (2005) A Note on Exact Tests of Hardy-Weinberg Equilibrium. AJHG 76: 887-893
     ```
-    def snphwe(obs_hets, obs_hom1, obs_hom2):
-        obs_homr = min(obs_hom1, obs_hom2)
-        obs_homc = max(obs_hom1, obs_hom2)
-        
-        rare = 2 * obs_homr + obs_hets
-        genotypes = obs_hets + obs_homc + obs_homr
-        
-        probs = [0.0 for i in range(rare +1)]
-        
-        mid = rare * (2 * genotypes - rare) // (2 * genotypes)
-        if mid % 2 != rare%2:
-            mid += 1
-            
-        probs[mid] = 1.0
-        sum_p = 1 #probs[mid]
-
-        curr_homr = (rare - mid) // 2
-        curr_homc = genotypes - mid - curr_homr
-        
-        for curr_hets in range(mid, 1, -2):
-            probs[curr_hets - 2] = probs[curr_hets] * curr_hets * (curr_hets - 1.0)/ (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
-            sum_p+= probs[curr_hets - 2]
-            curr_homr += 1
-            curr_homc += 1
-        
-        curr_homr = (rare - mid) // 2
-        curr_homc = genotypes - mid - curr_homr
-        
-        for curr_hets in range(mid, rare-1, 2):
-            probs[curr_hets + 2] = probs[curr_hets] * 4.0 * curr_homr * curr_homc/ ((curr_hets + 2.0) * (curr_hets + 1.0))
-            sum_p += probs[curr_hets + 2]
-            curr_homr -= 1
-            curr_homc -= 1
-        
-        target = probs[obs_hets]
-        p_hwe = 0.0
-        for p in probs:
-            if p <= target :
-                p_hwe += p / sum_p  
+    def snphwe(obs_hets: int, obs_hom1: int, obs_hom2: int) -> float:
+    """Calculate Hardy-Weinberg equilibrium exact test for a variant
+
+    Args:
+        obs_hets (int): observed heterozygous number
+        obs_hom1 (int): first observed homozygous number
+        obs_hom2 (int): second observed homozygous number
+
+    Returns:
+        float: Hardy-Weinberg equilibrium exact test p-value.
         
-        return min(p_hwe,1)
+        - P > 0.05: No significant deviation from Hardy-Weinberg equilibrium (HWE).
+        - P ≤ 0.05: Significant deviation from HWE, which could be due to factors such as population stratification, inbreeding, genotyping errors, or natural selection.
+    """
+    obs_minor_homs = min(obs_hom1, obs_hom2)
+    obs_mijor_homs = max(obs_hom1, obs_hom2)
+
+    rare = 2 * obs_minor_homs + obs_hets
+    genotypes = obs_hets + obs_mijor_homs + obs_minor_homs
+
+    probs = [0.0 for i in range(rare +1)]
+
+    mid = rare * (2 * genotypes - rare) // (2 * genotypes)
+    if mid % 2 != rare%2:
+        mid += 1
+
+    probs[mid] = 1.0
+    sum_p = 1 #probs[mid]
+
+    curr_homr = (rare - mid) // 2
+    curr_homc = genotypes - mid - curr_homr
+
+    for curr_hets in range(mid, 1, -2):
+        probs[curr_hets - 2] = probs[curr_hets] * curr_hets * (curr_hets - 1.0)/ (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
+        sum_p+= probs[curr_hets - 2]
+        curr_homr += 1
+        curr_homc += 1
+
+    curr_homr = (rare - mid) // 2
+    curr_homc = genotypes - mid - curr_homr
+
+    for curr_hets in range(mid, rare-1, 2):
+        probs[curr_hets + 2] = probs[curr_hets] * 4.0 * curr_homr * curr_homc/ ((curr_hets + 2.0) * (curr_hets + 1.0))
+        sum_p += probs[curr_hets + 2]
+        curr_homr -= 1
+        curr_homc -= 1
+
+    target = probs[obs_hets]
+    p_hwe = 0.0
+    for p in probs:
+        if p <= target :
+            p_hwe += p / sum_p  
+
+    return min(p_hwe,1)
+
+    if __name__ == '__main__':
+    # For an example, we had 502 samples. 
+    # At position 1:14930:A:G, we count:
+    #     + 407 heterozygous genotypes (signed 0|1 or 1|0).
+    #     + 4 homozygous genotypes AA (signed 0|0)
+    #     + 91 homozygous genotypes GG (signed 1|1)
+    print(snphwe(407,4,91))
     ```
 
 
@@ -828,4 +849,4 @@ plink \
 ## Reference
 - Purcell, S., Neale, B., Todd-Brown, K., Thomas, L., Ferreira, M. A., Bender, D., ... & Sham, P. C. (2007). PLINK: a tool set for whole-genome association and population-based linkage analyses. The American journal of human genetics, 81(3), 559-575.
 - Chang, C. C., Chow, C. C., Tellier, L. C., Vattikuti, S., Purcell, S. M., & Lee, J. J. (2015). Second-generation PLINK: rising to the challenge of larger and richer datasets. Gigascience, 4(1), s13742-015.
-- Manichaikul, A., Mychaleckyj, J. C., Rich, S. S., Daly, K., Sale, M., & Chen, W. M. (2010). Robust relationship inference in genome-wide association studies. Bioinformatics, 26(22), 2867-2873.
+- Manichaikul, A., Mychaleckyj, J. C., Rich, S. S., Daly, K., Sale, M., & Chen, W. M. (2010). Robust relationship inference in genome-wide association studies. Bioinformatics, 26(22), 2867-2873.
diff --git a/05_PCA/README.md b/05_PCA/README.md
@@ -192,8 +192,8 @@ For downstream analysis, we can exclude these SNPs using `--exclude hild.set`.
             --bfile ${plinkFile} \
     	    --threads ${threadnum} \
             --read-freq ${outPrefix}.acount \
-    	    --score ${outPrefix}.eigenvec.allele 2 5 header-read no-mean-imputation variance-standardize \
-            --score-col-nums 6-15 \
+    	    --score ${outPrefix}.eigenvec.allele 2 6 header-read no-mean-imputation variance-standardize \
+            --score-col-nums 7-16 \
             --out ${outPrefix}_projected
     ```
 

diff --git a/07_Annotation/README.md b/07_Annotation/README.md
@@ -32,7 +32,7 @@ We will only use the first 100000 variants as an example.
 
 !!! example "annovar_input"
     ```bash
-    awk 'NR>1 && NR<100000 {print $1,$2,$2,$4,$5}' ../06_Association_tests/1kgeas.B1.glm.logistic.    hybrid > annovar_input.txt
+    awk 'NR>1 && NR<100000 {print $1,$2,$2,$4,$5}' ../06_Association_tests/1kgeas.B1.glm.firth > annovar_input.txt
     ```
 
     ```

diff --git a/12_fine_mapping/README.md b/12_fine_mapping/README.md
@@ -151,7 +151,7 @@ For fine-mapping with summary statistics using Susie (SuSiE-RSS), IBSS was modif
     ```
     #!/bin/bash
 
-    plinkFile="../01_Dataset/1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020"
+    plinkFile="../01_Dataset/1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing"
 
     # LD r matrix
     plink \

diff --git a/14_gcta_greml/README.md b/14_gcta_greml/README.md
@@ -61,7 +61,7 @@ Download the version of GCTA for your system from : https://yanglab.westlake.edu
 
 ```bash
 #!/bin/bash
-plinkFile="../01_Dataset/1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020"
+plinkFile="../01_Dataset/1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing"
 gcta \
   --bfile ${plinkFile} \
   --autosome \
@@ -134,7 +134,7 @@ awk '{print $1,$2,$5,$6,$7,$8,$9}' ../05_PCA/plink_results_projected.sscore > 5P
 
 gcta \
   --grm ${GRM} \
-  --pheno ${phenotypeFIile} \
+  --pheno ${phenotypeFile} \
   --prevalence ${prevalence} \
   --qcovar  5PCs.txt \
   --reml \

diff --git a/19_ld/README.md b/19_ld/README.md
@@ -33,11 +33,11 @@ The allele frequencies are:
 |Allele|Frequency|
 |-|-|
 |A|$p_A=p_{AB}+p_{Ab}$|
-|a|$p_A=p_{aB}+p_{ab}$|
-|B|$p_A=p_{AB}+p_{aB}$|
-|b|$p_A=p_{Ab}+p_{ab}$|
+|a|$p_a=p_{aB}+p_{ab}$|
+|B|$p_B=p_{AB}+p_{aB}$|
+|b|$p_b=p_{Ab}+p_{ab}$|
 
-D : the level of LD between A and B can be estimated using **coefficient of linkage disequilibrium (D)**, which is defined as:
+D: the level of LD between A and B can be estimated using **coefficient of linkage disequilibrium (D)**, which is defined as:
 
 $$D_{AB} = p_{AB} - p_Ap_B$$
 
@@ -56,7 +56,7 @@ So we can simply denote $D = D_{AB}$, and the relationship between haplotype fre
 |Allele|A|a|Total|
 |-|-|-|-|
 |B|$p_{AB}=p_Ap_B+D$|$p_{aB}=p_ap_B-D$|$p_B$|
-|b|$p_{AB}=p_Ap_b-D$|$p_{AB}=p_ap_b+D$|$p_b$|
+|b|$p_{Ab}=p_Ap_b-D$|$p_{ab}=p_ap_b+D$|$p_b$|
 |Total|$p_A$|$p_a$|1|
 
 !!! warning "The range of possible values of D depends on the allele frequencies, which is not suitable for comparison between different pairs of alleles."

diff --git a/32_whole_genome_regression/README.md b/32_whole_genome_regression/README.md
@@ -25,7 +25,7 @@ Please check [here](https://rgcgithub.github.io/regenie/install/)
 
 !!! example "Sample codes for running step 1"
     ```
-    plinkFile=../01_Dataset/1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020
+    plinkFile=../01_Dataset/1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing
     phenoFile=../01_Dataset/1kgeas_binary_regenie.txt
     covarFile=../05_PCA/plink_results_projected.sscore
     covarList="PC1_AVG,PC2_AVG,PC3_AVG,PC4_AVG,PC5_AVG,PC6_AVG,PC7_AVG,PC8_AVG,PC9_AVG,PC10_AVG"
@@ -54,7 +54,7 @@ Please check [here](https://rgcgithub.github.io/regenie/install/)
 
 !!! example "Sample codes for running step 2"
     ```
-    plinkFile=../01_Dataset/1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020
+    plinkFile=../01_Dataset/1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing
     phenoFile=../01_Dataset/1kgeas_binary_regenie.txt
     covarFile=../05_PCA/plink_results_projected.sscore
     covarList="PC1_AVG,PC2_AVG,PC3_AVG,PC4_AVG,PC5_AVG,PC6_AVG,PC7_AVG,PC8_AVG,PC9_AVG,PC10_AVG"

diff --git a/32_whole_genome_regression/run_step2.sh b/32_whole_genome_regression/run_step2.sh
@@ -1,6 +1,6 @@
 #/bin/bash
 
-plinkFile=../01_Dataset/1KG.EAS.auto.snp.norm.nodup.split.maf005.thinp020
+plinkFile=../01_Dataset/1KG.EAS.auto.snp.norm.nodup.split.rare002.common015.missing
 phenoFile=../01_Dataset/1kgeas_binary_regenie.txt
 covarFile=../05_PCA/plink_results_projected.sscore
 covarList="PC1_AVG,PC2_AVG,PC3_AVG,PC4_AVG,PC5_AVG,PC6_AVG,PC7_AVG,PC8_AVG,PC9_AVG,PC10_AVG"
-Original file line number
+Diff line change
@@ Expand Up / @@ -32,7 +32,7 @@ We will only use the first 100000 variants as an example. @@
     !!! example "annovar_input"
         ```bash
-        awk 'NR>1 && NR<100000 {print $1,$2,$2,$4,$5}' ../06_Association_tests/1kgeas.B1.glm.logistic.    hybrid > annovar_input.txt
+        awk 'NR>1 && NR<100000 {print $1,$2,$2,$4,$5}' ../06_Association_tests/1kgeas.B1.glm.firth > annovar_input.txt
         ```
         ```
@@ Expand Down @@