Variant calling
# First rename some files
mv SRRXXXXXXX_sort_q30_rmdup.bam SRRXXXXXXX_q30_sort_rmdup.bam
# To label our samples
java -jar /services/tools/picard-tools/2.20.2/picard.jar AddOrReplaceReadGroups I= SRRXXXXXXX_q30_sort_rmdup.bam O= SRRXXXXXXX_q30_sort_rmdup_group.bam SORT_ORDER=coordinate RGID=foo RGLB=bar RGPL=illumina RGPU= NONE RGSM= SRRXXXXXXX CREATE_INDEX=True
# To perform variant calling, first we need to index our new bam file
samtools index SRRXXXXXXX_q30_sort_rmdup_group.bam
#We then call variants for each sample.
#We will have as output a compress the vcf file, to save space. In this case we “concatenate” two different actions
samtools mpileup -uf one_chromosome.fasta SRRXXXXXXX_q30_sort_rmdup_group.bam | bcftools call -m -v -O z - > SRRXXXXXXX_var.raw.vcf.gz
# Look at the first lines of the vcf and follow the explanations of the slides
# and of https://samtools.github.io/hts-specs/VCFv4.1.pdf.
zcat SRRXXXXXXX_var.raw.vcf.gz | more
# Move files to Variants folder to be merged once they are separated by origin
mv SRRXXXXXXX_var.raw.vcf.gz /home/gstud01/Variants/SRRXXXXXXX_var.raw.vcf.gz
#If we have to analyze several samples, we need to combine everything in an unique vcf file, and unique quality score for each of the variant detected, using vcf-merge, from vcf tools.
#To do so, we first need to index the file in a way that
tabix -p vcf SRRXXXXXXX_var.raw.vcf.gz
#Then we merge with e.g. another file, that belong to another salmon sample that has been process in the same way In the VARIANT_CALLING directory and that has been already indexed (so, it has its tbi file in the same directory). It is not necessary to copy it (you can do it of course) but we can “call” the file in the command line using its absolute or relative path We will use vcf-merge and vcftools (http://vcftools.sourceforge.net/man_latest.html)
# USA acc
vcf-merge SRRXXXXXX1_var.raw.vcf.gz SRRXXXXXX2_var.raw.vcf.gz SRRXXXXXX3_var.raw.vcf.gz SRRXXXXXX4_var.raw.vcf.gz SRRXXXXXX5_var.raw.vcf.gz SRRXXXXXX6_var.raw.vcf.gz | bgzip -c > USA_samtools.vcf.gz
# Norway acc
vcf-merge SRRXXXXXX1_var.raw.vcf.gz SRRXXXXXX2_var.raw.vcf.gz SRRXXXXXX3_var.raw.vcf.gz SRRXXXXXX4_var.raw.vcf.gz SRRXXXXXX5_var.raw.vcf.gz SRRXXXXXX6_var.raw.vcf.gz | bgzip -c > Norway_samtools.vcf.gz
# How many Variants have been detected?
zcat samtools.vcf.gz | grep -v "#" | wc –l
# We can filter our big file, removing INDEL, as we are not interested on that at the moment and removing SNPs with a low quality (in this case 30).
vcftools --gzvcf samtools.vcf.gz --recode --out samtools_filtered --minQ 30 --remove-indels
# Output file comparing the sites in two vcf files
vcftools --gzvcf USA_samtools_filtered.recode.vcf --gzdiff Norway_samtools_filtered.recode.vcf --diff-site --out in1_v_in2
# Found NW_018522775.1 in file 1 and NW_018522776.1 in file 2.
# Could you tell me the a) SNV quality b) allele of the reference genome c) alternative allele d) genotypes of the two animals in these three SNVs?
# You can use the commads e.g.
awk '$2=="1129" {print $0}' USA_samtools_filtered.recode.vcf
# We run the chr.py to chane the chromosmes names to numbes
Identifiying SNPs
# Convert the vcf file in something that plink can read. We can do so still using vcftools
vcftools --vcf Norway_new.vcf --plink --out Norway_new.recode
vcftools --vcf USA_new.vcf --plink --out USA_new.recode
# We now convert the dataset that is in ped+map format, into tped+tmap We do this through
plink --allow-extra-chr --file Norway_new.recode --transpose --recode --out Norway_new.recode --chr-set 29
# Visualize the first lines
cut -d " " -f1-31 Norway_new.recode.ped | head
# How many SNPs have been genotyped? You can do this through
wc -l Norway_new.recode.map
# How many animals have been genotyped? You can do this through
wc -l Norway_new.recode.ped