-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGenomewide_het_cal.sh
47 lines (34 loc) · 1.39 KB
/
Genomewide_het_cal.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash
#SBATCH --job-name="mummer"
#SBATCH --partition=priority-mem --qos=gbru
#SBATCH -n 40
#SBATCH -t 1-10:00:00
#SBATCH --output="%x_%j.o" # job standard output file (%j replaced by job id)
#SBATCH --error="%x_%j.e" # job standard error file (%j replaced by job id)
module load mummer
# Query
q_dir='STEP_5/YakHifi_HDA149_B5/'
query='ragtag.scaffold.fasta'
# Reference
r_dir='STEP_5/YakHifi_HDA330_B5/'
ref='ragtag.scaffold.fasta'
out='YakHifi_HDA149_B5_onto_YakHifi_HDA149_B5'
nucmer --prefix=${out} ${r_dir}/${ref} ${q_dir}/${query}
# Non repeat snps
show-snps -Clr ${out}.delta > ${out}_nonrepeat.snps
module load samtools
# index the assemblies and get the column that has the contig lengths
samtools faidx ${query}
samtools faidx ${ref}
querysz=$(cut -f2 ${query}.fai | paste -sd+ | bc)
refsz=$(cut -f2 ${ref}.fai | paste -sd+ | bc)
# calculate the average genome sizes
avgsz=$(echo "("${genomesize1}"+"${genomesize2}")/2" | bc -l)
# count the number of snps, and then subtract out the five header lines
numsnp=$(wc -l ${out}_nonrepeat.snps | cut -f1 --delimiter=" ")
numsnp=$(echo $numsnp"-5" | bc)
# calculate the percent heterozygosity
hetperc=$(echo ${numsnp}"/"${avgsz}"*100" | bc -l)
# pretty format the calculation
# take snps and divide by genome size then multiply by 100 to get percent heterozygosity
printf "(%'.0f SNPs / %'.0f bp) x 100 = %0.4f%% heterozygosity\n" $numsnp $avgsz $hetperc