Skip to content

Commit 98e8b79

Browse files
committed
v0.8.9
1 parent 9e9a4da commit 98e8b79

File tree

7 files changed

+58
-83
lines changed

7 files changed

+58
-83
lines changed

.DS_Store

0 Bytes
Binary file not shown.

runHiC/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33
# Author: XiaoTao Wang
44

55
__author__ = 'XiaoTao Wang'
6-
__version__ = '0.8.8'
6+
__version__ = '0.8.9'

runHiC/binning.py

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7,39 +7,22 @@
77

88
log = logging.getLogger(__name__)
99

10-
def mcool_from_pairs(pairpath, outcool, outmcool, ignore_diags=2, nproc=1, mad_max=5, min_nnz=10,
11-
min_count=0, max_split=2, high_res=False):
12-
13-
chromsizes_file, assembly = chromsizes_from_pairs(pairpath)
14-
15-
if high_res:
16-
log.log(21, '{0}: Building contact matrix at 1kb ...'.format(pairpath))
17-
bin_label = ':'.join([chromsizes_file, str(1000)])
18-
bin_command = ['cooler', 'cload', 'pairix', '--assembly', assembly, '--nproc', str(nproc),
19-
'--max-split', str(max_split), bin_label, pairpath, outcool]
20-
subprocess.check_call(' '.join(bin_command), shell=True)
21-
22-
log.log(21, '{0}: Generating a multi-resolution cooler file by coarsening the 1kb contact matrix ...'.format(pairpath))
23-
command = ['cooler', 'zoomify', '-p', str(nproc), '-r 1000,2000,5000,10000,25000,50000,100000,250000,500000,1000000,2500000,5000000',
24-
'--balance', '--balance-args "--mad-max {0} --min-nnz {1} --min-count {2} --ignore-diags {3}"'.format(mad_max, min_nnz, min_count, ignore_diags),
25-
'-o', outmcool, outcool]
26-
subprocess.check_call(' '.join(command), shell=True)
27-
log.log(21, '{0}: Done'.format(pairpath))
10+
def mcool_from_pairs(pairpath, outcool, outmcool, resolutions, ignore_diags=2, nproc=1, mad_max=5,
11+
min_nnz=10, min_count=0, max_split=2):
2812

29-
os.remove(outcool)
30-
else:
31-
log.log(21, '{0}: Building contact matrix at 5kb ...'.format(pairpath))
32-
bin_label = ':'.join([chromsizes_file, str(5000)])
33-
bin_command = ['cooler', 'cload', 'pairix', '--assembly', assembly, '--nproc', str(nproc),
34-
'--max-split', str(max_split), bin_label, pairpath, outcool]
35-
subprocess.check_call(' '.join(bin_command), shell=True)
36-
37-
log.log(21, '{0}: Generating a multi-resolution cooler file by coarsening the 5kb contact matrix ...'.format(pairpath))
38-
command = ['cooler', 'zoomify', '-p', str(nproc), '-r 5000,10000,25000,50000,100000,250000,500000,1000000,2500000,5000000',
39-
'--balance', '--balance-args "--mad-max {0} --min-nnz {1} --min-count {2} --ignore-diags {3}"'.format(mad_max, min_nnz, min_count, ignore_diags),
40-
'-o', outmcool, outcool]
41-
subprocess.check_call(' '.join(command), shell=True)
42-
log.log(21, '{0}: Done'.format(pairpath))
13+
chromsizes_file, assembly = chromsizes_from_pairs(pairpath)
4314

44-
os.remove(outcool)
45-
15+
log.log(21, '{0}: Building contact matrix at a {1} base-pair resolution ...'.format(pairpath, resolutions[0]))
16+
bin_label = ':'.join([chromsizes_file, str(resolutions[0])])
17+
bin_command = ['cooler', 'cload', 'pairix', '--assembly', assembly, '--nproc', str(nproc),
18+
'--max-split', str(max_split), bin_label, pairpath, outcool]
19+
subprocess.check_call(' '.join(bin_command), shell=True)
20+
21+
log.log(21, '{0}: Generating a multi-resolution cooler file by coarsening the {1}bp matrix ...'.format(pairpath, resolutions[0]))
22+
command = ['cooler', 'zoomify', '-p', str(nproc), '-r {0}'.format(','.join(list(map(str, resolutions)))), '--balance',
23+
'--balance-args "--mad-max {0} --min-nnz {1} --min-count {2} --ignore-diags {3}"'.format(mad_max, min_nnz, min_count, ignore_diags),
24+
'-o', outmcool, outcool]
25+
subprocess.check_call(' '.join(command), shell=True)
26+
log.log(21, '{0}: Done'.format(pairpath))
27+
28+
os.remove(outcool)

runHiC/filtering.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,8 @@ def collect_stats(pair_paths):
8181

8282
def stats_pairs(inpath, refkey, matchpre=[], nproc_in=3, nproc_out=8):
8383

84-
stat_command = ['pairtools', 'stats', '--nproc-in', str(nproc_in), '--nproc-out', str(nproc_out), inpath]
84+
stat_command = ['pairtools', 'stats', '--n-dist-bins-decade 8', '--nproc-in', str(nproc_in),
85+
'--nproc-out', str(nproc_out), inpath]
8586
pipe = subprocess.Popen(stat_command, stdout=subprocess.PIPE)
8687
inStream = pipe.stdout
8788
stats = defaultdict(int)

runHiC/mapping.py

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def splitSingleFastq(filename, pre, pair_index, folder, splitBy=4000000):
178178
##### functions that invoke different read aligners
179179
def buildMapIndex(aligner, genomeFolder, genomeName):
180180
"""
181-
Build bwa/chromap/minimap2 index files.
181+
Build bwa-mem/bwa-mem2/chromap index files.
182182
"""
183183
lockFile = os.path.join(genomeFolder, '.'.join([genomeName, aligner, 'lock']))
184184
# Aquire lock
@@ -187,14 +187,13 @@ def buildMapIndex(aligner, genomeFolder, genomeName):
187187

188188
wholeGenome = os.path.join(genomeFolder, '.'.join([genomeName, 'fa']))
189189

190-
if aligner=='minimap2':
191-
indexOut = os.path.join(genomeFolder, '.'.join([genomeName, 'minimap2', 'mmi']))
192-
build_command = ['minimap2', '-x', 'sr', '-d', indexOut, wholeGenome]
193-
elif aligner=='chromap':
190+
if aligner=='chromap':
194191
indexOut = os.path.join(genomeFolder, '.'.join([genomeName, 'chromap-runhic', 'mmi']))
195192
build_command = ['chromap', '-i', '-r', wholeGenome, '-o', indexOut]
196-
else:
193+
elif aligner=='bwa-mem':
197194
build_command = ['bwa', 'index', wholeGenome]
195+
else:
196+
build_command = ['bwa-mem2', 'index', wholeGenome]
198197

199198
subprocess.check_call(' '.join(build_command), shell=True)
200199

@@ -207,6 +206,7 @@ def map_core(fastq_1, fastq_2, ref_fa, ref_index, outdir, tmpdir, aligner='chrom
207206
outformat = '.pairs'
208207
else:
209208
outformat = '.bam'
209+
210210
# output file name
211211
if fastq_1.endswith('_1.fastq.gz'):
212212
outpath = os.path.join(outdir,
@@ -216,22 +216,15 @@ def map_core(fastq_1, fastq_2, ref_fa, ref_index, outdir, tmpdir, aligner='chrom
216216
os.path.split(fastq_1)[1].replace('_1.fastq', outformat))
217217

218218
if aligner=='chromap':
219-
'''
220-
map_command = ['chromap', '--preset', 'hic', '-x', ref_index,
221-
'-r', ref_fa, '-t', str(nthread), '-1', fastq_1, '-2', fastq_2,
222-
'-o', outpath, '-q', str(min_mapq), '--chr-order', sort_order_fil,
223-
'--pairs-natural-chr-order', flip_order_fil]
224-
'''
225219
map_command = ['chromap', '--preset', 'hic', '--low-mem', '-x', ref_index,
226220
'-r', ref_fa, '-t', str(nthread), '-1', fastq_1, '-2', fastq_2,
227221
'-o', outpath]
228222
bam_command = []
223+
elif aligner=='bwa-mem':
224+
map_command = ['bwa', 'mem', '-SP', '-t', str(nthread), ref_index, fastq_1, fastq_2]
225+
bam_command = ['samtools', 'view', '-bS', '-']
229226
else:
230-
if aligner=='minimap2':
231-
map_command = ['minimap2', '-ax', 'sr', '-t', str(nthread), ref_index, fastq_1, fastq_2]
232-
else:
233-
map_command = ['bwa', 'mem', '-SP5M', '-t', str(nthread), ref_index, fastq_1, fastq_2]
234-
227+
map_command = ['bwa-mem2', 'mem', '-SP', '-t', str(nthread), ref_index, fastq_1, fastq_2]
235228
bam_command = ['samtools', 'view', '-bS', '-']
236229

237230
pipeline = []

runHiC/quality.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def typePlot(stats, outfile, dpi = 300):
125125
ot.append(np.nan)
126126
lt = np.r_[lt]; rt = np.r_[rt]; it = np.r_[it]; ot = np.r_[ot]
127127

128-
x = np.arange(0, 8, 0.25)
128+
x = np.arange(0.25, 8, 0.125)
129129
xticks = list(range(1, 8))
130130
xticklabels = ['10bp', '100bp', '1K', '10K', '100K', '1M', '10M']
131131

scripts/runHiC

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -69,23 +69,23 @@ def getargs():
6969
iterM.add_argument('-f', '--fastqDir', help = 'Name of the folder containing sequencing reads.')
7070
iterM.add_argument('-F', '--Format', default = 'SRA', choices = ['SRA', 'FASTQ'],
7171
help = 'Format of the sequencing reads.')
72-
iterM.add_argument('-A', '--aligner', default = 'chromap', choices = ['bwa-mem', 'chromap', 'minimap2'],
72+
iterM.add_argument('-A', '--aligner', default = 'chromap', choices = ['bwa-mem', 'bwa-mem2', 'chromap'],
7373
help = '''Name of the sequence alignment software to invoke.''')
7474
iterM.add_argument('-i', '--Index',
75-
help = '''Path to the bwa/chromap/minimap2 genome index. For example, if your reference genome
76-
is hg38.fa, and it is located within ~/data/hg38, then you need to specify --Index as "~/data/hg38/hg38.fa",
77-
"~/data/hg38/hg38.chromap-runhic.mmi", and "~/data/hg38/hg38.minimap2.mmi" for bwa-mem, chromap, and
78-
minimap2, respectively. When not specified, the index will be built automatically according to
79-
your aligner choice.''')
75+
help = '''Path to the bwa-mem/bwa-mem2/chromap index. For example, if the reference genome
76+
is hg38.fa, and it is located within the folder ~/data/hg38, then you need to specify
77+
this parameter as "~/data/hg38/hg38.fa", "~/data/hg38/hg38.fa", and "~/data/hg38/hg38.chromap-runhic.mmi"
78+
for bwa-mem, bwa-mem2, and chromap, respectively. If not specified, the index will be built
79+
automatically according to the aligner choice.''')
8080
iterM.add_argument('-t', '--threads', type = int, default = 8, help = 'Number of threads.')
8181
iterM.add_argument('--min-mapq', type = int, default = 1,
8282
help = '''The minimal MAPQ score to consider a read as uniquely mapped.''')
83-
iterM.add_argument('--max-molecule-size', type = int, default = 2000,
83+
iterM.add_argument('--max-molecule-size', type = int, default = 750,
8484
help = '''The maximal size of a Hi-C molecule, used to rescue single ligations from
8585
molecules with three alignments.''')
8686
iterM.add_argument('--max-inter-align-gap', type = int, default = 20,
8787
help = '''A key parameter used by pairtools to rescue single ligations from walks.''')
88-
iterM.add_argument('--walks-policy', default = 'all', choices = ['mask', '5any', '5unique', '3any', '3unique', 'all'],
88+
iterM.add_argument('--walks-policy', default = '5unique', choices = ['mask', '5any', '5unique', '3any', '3unique', 'all'],
8989
help = '''The policy used by pairtools to report unrescuable walks.''')
9090
iterM.add_argument('--include-readid', action = 'store_true',
9191
help = '''If specified, add read IDs to the outputed .pairsam files.''')
@@ -151,9 +151,8 @@ def getargs():
151151
binReads.add_argument('--mad-max', type = int, default = 5,
152152
help = '''Before ICE, drop bins whose log marginal sum is less than ``mad_max``
153153
median absolute deviations below the median log marginal sum.''')
154-
binReads.add_argument('--high-res', action = 'store_true', help='''If specified, bin pairs at 11 base-pair-delimited resolutions:
155-
2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000. The default setting is binning pairs at
156-
9 resolutions: 2500000,1000000,500000,250000,100000,50000,25000,10000,5000.''')
154+
binReads.add_argument('--resolutions', default = '5000,10000,25000,50000,100000,250000,500000,1000000,2500000',
155+
help='''List of resolutions at which the contact matrices will be generated.''')
157156
binReads.add_argument('--nproc', type = int, default = 8, help = '''Number of allocated proccesses.''')
158157
binReads.add_argument('--max-split', type = int, default = 2, help = '''Divide the pairs from each chromosome
159158
into at most this many chunks.''')
@@ -186,9 +185,8 @@ def getargs():
186185
add_help = False)
187186
streamline.add_argument('--max-split', type = int, default = 2, help = '''Divide the pairs from each chromosome
188187
into at most this many chunks.''')
189-
streamline.add_argument('--high-res', action = 'store_true', help='''If specified, bin pairs at 11 base-pair-delimited resolutions:
190-
2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000. The default setting is binning pairs at
191-
9 resolutions: 2500000,1000000,500000,250000,100000,50000,25000,10000,5000.''')
188+
streamline.add_argument('--resolutions', default = '5000,10000,25000,50000,100000,250000,500000,1000000,2500000',
189+
help='''List of resolutions at which the contact matrices will be generated.''')
192190
streamline.set_defaults(func = pileup)
193191

194192
## Parse the command-line arguments
@@ -255,7 +253,7 @@ def run(args, commands):
255253
'# Temporary Dir = {0}'.format(args.tmpdir)
256254
])
257255
if (commands[0] == 'pileup'):
258-
arglist.extend(['# Generate contact maps at 11 resolutions = {0}'.format(args.high_res)])
256+
arglist.extend(['# Resolutions = {0}'.format(args.resolutions)])
259257

260258
if commands[0] == 'filtering':
261259
arglist.extend(['# Original Pairs = {0}'.format(args.pairFolder),
@@ -270,7 +268,7 @@ def run(args, commands):
270268
'# Minimum Marginal Nonzeros = {0}'.format(args.min_nnz),
271269
'# Minimum Marginal Counts = {0}'.format(args.min_count),
272270
'# MAD Max = {0}'.format(args.mad_max),
273-
'# Generate contact maps at 11 resolutions = {0}'.format(args.high_res),
271+
'# Resolutions = {0}'.format(args.resolutions),
274272
'# Number of processes = {0}'.format(args.nproc)])
275273

276274
if commands[0] == 'quality':
@@ -303,9 +301,7 @@ def mapping(args, commands):
303301
if not args.Index is None:
304302
indexpath = os.path.abspath(os.path.expanduser(args.Index))
305303
else:
306-
if aligner=='minimap2':
307-
indexpath = os.path.join(genomeFolder, '.'.join([genomeName, 'minimap2', 'mmi']))
308-
elif aligner=='chromap':
304+
if aligner=='chromap':
309305
indexpath = os.path.join(genomeFolder, '.'.join([genomeName, 'chromap-runhic', 'mmi']))
310306
else:
311307
indexpath = os.path.join(genomeFolder, '.'.join([genomeName, 'fa']))
@@ -325,10 +321,14 @@ def mapping(args, commands):
325321
indexlock = os.path.join(genomeFolder, '.'.join([genomeName, aligner, 'lock']))
326322
if os.path.exists(indexlock):
327323
raise Exception('''Another index building process is on. Leaving''')
328-
if aligner in ['minimap2', 'chromap']:
324+
325+
if aligner in ['chromap']:
329326
icheck = glob.glob(indexpath)
327+
elif aligner in ['bwa-mem2']:
328+
icheck = glob.glob(indexpath+'.bwt.2bit.64')
330329
else:
331-
icheck = glob.glob(indexpath+'.sa')
330+
icheck = glob.glob(indexpath+'.bwt')
331+
332332
if len(icheck):
333333
logging.log(21, 'Set --Index to {0}'.format(indexpath))
334334
else:
@@ -737,14 +737,12 @@ def binning(args, commands):
737737

738738
logging.log(21, 'Contact Matrices will be saved in .mcool format under {0}'.format(hFolder))
739739

740-
if args.high_res:
741-
intermediate = os.path.join(hFolder, os.path.basename(f).replace('.pairs.gz', '.1kb.cool'))
742-
else:
743-
intermediate = os.path.join(hFolder, os.path.basename(f).replace('.pairs.gz', '.5kb.cool'))
740+
resolutions = sorted([int(r) for r in args.resolutions.split(',')])
741+
intermediate = os.path.join(hFolder, os.path.basename(f).replace('.pairs.gz', '.{0}.cool'.format(resolutions[0])))
744742
hFile = os.path.join(hFolder, os.path.basename(f).replace('.pairs.gz', '.mcool'))
745-
mcool_from_pairs(f, intermediate, hFile, ignore_diags=args.ignore_diags, nproc=args.nproc,
746-
mad_max=args.mad_max, min_count=args.min_count, min_nnz=args.min_nnz,
747-
max_split=args.max_split, high_res=args.high_res)
743+
mcool_from_pairs(f, intermediate, hFile, resolutions, ignore_diags=args.ignore_diags,
744+
nproc=args.nproc, mad_max=args.mad_max, min_count=args.min_count,
745+
min_nnz=args.min_nnz, max_split=args.max_split)
748746

749747
completed = open(Indicator, 'wb')
750748
completed.close()
@@ -818,7 +816,7 @@ def pileup(args, commands):
818816
"""
819817
mapping(args, commands)
820818
args.stats_cache = 'allinone.cache'
821-
args.nproc = args.threads
819+
args.nproc = 8
822820
filtering(args, commands)
823821
args.ignore_diags = 2
824822
args.mad_max = 5

0 commit comments

Comments
 (0)