-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path05processVCF.py
36 lines (23 loc) · 954 Bytes
/
05processVCF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# conda activate allel
import allel, zarr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import copy
import sys
tDir = sys.argv[1]
zarr_path = tDir+"mitos.zarr"
allel.vcf_to_zarr(tDir+"sim.vcf", zarr_path, rename_fields={"NUMALT":"numalt2"}, fields="*",overwrite=True, )
callset = zarr.open_group(zarr_path, mode='r')
#print(callset.tree())
gtDf = pd.DataFrame(callset["calldata/GT"][:][:,:,0])
gtDf.columns = callset["samples"]
gtDf["POS"] = callset["variants/POS"][:]
gtDf.to_csv(tDir+"genotypes.csv", "\t")
alleleCounts = np.dstack((callset["calldata/RO"][:], callset["calldata/AO"][:]))
alleleCounts[alleleCounts==-1] = 0
sumPerSampleAndSite = np.sum(alleleCounts, axis=2)
acDf = pd.DataFrame(np.vstack([alleleCounts[:,:,i] for i in range(4)]))
acDf["position"] = np.tile(callset["variants/POS"][:], 4)
acDf["allele"] =np.repeat([1,2,3,4], sumPerSampleAndSite.shape[0])
acDf.to_csv(tDir+"alleleCounts.csv", "\t")