-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathftpg.py
83 lines (70 loc) · 2.1 KB
/
ftpg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from ftplib import FTP
import progressbar
import os
import sys
import urllib.request as urllib
import requests
def download_data(x, dir_name):
    """Fetch the resource at URL ``x`` and store it inside ``dir_name``.

    The local file is named after whatever follows the last ``/`` in the
    URL.  ``dir_name`` must already exist; nothing is returned.
    """
    # rpartition yields (head, sep, tail); only the tail names the file.
    _, _, basename = x.rpartition("/")
    destination = os.path.join(dir_name, basename)
    urllib.urlretrieve(x, destination)
def get_data_file(file, dir):
    """Download every URL listed in ``file`` into the directory ``dir``.

    ``file`` is a text file holding one URL per line.  Surrounding
    whitespace is stripped and blank lines are ignored.  ``dir`` is created
    (together with any missing parent directories) when it does not exist.
    Each URL is handed to ``download_data`` while a progress bar is shown.

    If ``file`` is not an existing regular file, a message is printed and
    the process exits with status 1 (``SystemExit``).

    The parameter names shadow builtins; they are kept unchanged so that
    existing keyword callers continue to work.
    """
    if not os.path.isfile(file):
        print("The specified file does not exist!!!")
        sys.exit(1)

    with open(file, "r") as handle:
        # Drop empty entries: an empty string is not a valid URL and would
        # make the download step raise.
        links = [line.strip() for line in handle.read().splitlines() if line.strip()]

    # exist_ok avoids the check-then-create race and also allows nested paths.
    os.makedirs(dir, exist_ok=True)

    if not links:
        # Nothing to fetch, so there is no progress to display.
        return

    for link in progressbar.progressbar(links):
        download_data(link, dir)
# This will download all the FASTA files for the coding sequences. To change the download directory, change the second argument passed to get_data_file.
# Script section: crawl one release on the Ensembl FTP server, record the links
# to every CDS FASTA, protein FASTA and GTF file in three text files, then
# download everything those files list.
from ftplib import error_perm  # used only by this script section

host = "ftp.ensembl.org"
user = "anonymous"
password = ""
release = "96"  # release number used in the directory paths and GTF file names
base_link = "ftp://{}".format(host)


def _collect_links(ftp, parent_dirs, suffix, subdir=""):
    """Return the remote paths under each ``parent + subdir`` ending in ``suffix``.

    ``ftp`` is a logged-in ``ftplib.FTP`` connection.  A directory whose
    listing is rejected with a permanent (5xx) error is reported on stderr
    and skipped, so one missing sub-directory does not abort the crawl.
    """
    matches = []
    for parent in progressbar.progressbar(parent_dirs):
        remote_dir = parent + subdir
        try:
            entries = ftp.nlst(remote_dir)
        except error_perm as exc:
            print("Skipping {}: {}".format(remote_dir, exc), file=sys.stderr)
            continue
        matches.extend(entry for entry in entries if entry.endswith(suffix))
    return matches


def _write_links(list_path, remote_paths):
    """Write ``base_link + path`` for each remote path to ``list_path``, one per line."""
    with open(list_path, "w") as out:
        out.writelines(base_link + path + "\n" for path in remote_paths)


print("Connecting to {}".format(host))
# The context manager sends QUIT and closes the socket even when a listing
# fails, and releases the connection before the downloads begin.
with FTP(host) as ftp:
    ftp.login(user, password)
    print("Connected to {}".format(host))

    # Entries directly under the release's fasta directory; the cds and pep
    # scans both walk this same listing, so it is fetched only once.
    fasta_dirs = ftp.nlst("/pub/release-{}/fasta".format(release))

    # find sequences of all the cds files
    _write_links(
        "seq_link.txt",
        _collect_links(ftp, fasta_dirs, ".cds.all.fa.gz", "/cds"),
    )

    # find all the files with protein sequences
    _write_links(
        "protein_seq.txt",
        _collect_links(ftp, fasta_dirs, ".pep.all.fa.gz", "/pep"),
    )

    # get link of all the gtf files
    gtf_dirs = ftp.nlst("/pub/release-{}/gtf".format(release))
    _write_links(
        "gtf_link.txt",
        _collect_links(ftp, gtf_dirs, ".{}.gtf.gz".format(release)),
    )

print("Downloading Data")
get_data_file("gtf_link.txt", "data")
get_data_file("seq_link.txt", "geneseq")
get_data_file("protein_seq.txt", "pro_seq")
print("Download Complete.................")