Skip to content

Commit

Permalink
fix bug in non-resolved meta-clusters
Browse files Browse the repository at this point in the history
  • Loading branch information
lpantano committed Sep 20, 2019
1 parent c48b3bd commit 38297c9
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ before_install:
- wget -O miniconda.sh http://repo.continuum.io/miniconda/${miniconda}
- bash miniconda.sh -b -p ~/install
- export PATH=~/install/bin/:$PATH
- conda install --yes -c conda-forge -c bioconda memory_profiler openjdk pysam pybedtools pandas numpy biopython progressbar2 pyyaml bedtools samtools mirtop viennarna -q
- conda install --yes -c conda-forge -c bioconda memory_profiler openjdk pysam pybedtools pandas numpy scipy biopython progressbar2 pyyaml bedtools samtools mirtop viennarna -q
- pip install -r requirements.txt
# scipy pandas pybedtools progressbar pip biopython nose scipy setuptools pyyaml -q
- python setup.py install
Expand Down
7 changes: 7 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
- 1.2.7

* Fix bug when writing debug files for large meta-clusters:
https://github.com/lpantano/seqcluster/issues/47
https://github.com/bcbio/bcbio-nextgen/issues/2948
* Add version option

- 1.2.5

* Fix error when the precursor is too long, by skipping the RNAfold calculation. Thanks to @ZhuZhuoHSPH and @kthlnktng
Expand Down
2 changes: 1 addition & 1 deletion seqcluster/detect/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def clean_bam_file(bam_in, mask=None):
if mask:
mask_file = op.splitext(bam_in)[0] + "_mask.bam"
if not file_exists(mask_file):
pybedtools.BedTool(bam_file).intersect(b=mask, v=True).saveas(mask_file)
pybedtools.BedTool(bam_in).intersect(b=mask, v=True).saveas(mask_file)
bam_in = mask_file
out_file = op.splitext(bam_in)[0] + "_rmlw.bam"
# bam.index(bam_in, {'algorithm':{}})
Expand Down
36 changes: 15 additions & 21 deletions seqcluster/detect/metacluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,7 @@
import copy
from progressbar import ProgressBar

# import time
import math
# import numpy as np
# import pybedtools


import seqcluster.libs.logger as mylog
from seqcluster.libs import utils
Expand Down Expand Up @@ -39,15 +35,10 @@ def _get_seqs_from_cluster(seqs, seen):
already_in = set()
not_in = []

already_in = map(seen.get, seqs)
already_in = [e for e in map(seen.get, seqs)]
# if isinstance(already_in, list):
already_in = filter(None, already_in)
not_in = set(seqs) - set(seen.keys())
# for s in seqs:
# if s in seen:
# already_in.add(seen[s])
# else:
# not_in.append(s)
return list(set(already_in)), list(not_in)


Expand All @@ -61,15 +52,21 @@ def reduceloci(clus_obj, path):
large = 0
current = clus_obj.clusid
logger.info("Number of loci: %s" % len(clus_obj.loci.keys()))
avg_loci_cluster = len(clus_obj.loci.keys())/len(clus_obj.seq)
if avg_loci_cluster > 0.5:
logger.warning("The avg number of loci by sequences is close to 0.5: %s",
avg_loci_cluster)
logger.warning("This could mean you have sequences over the genome, rare in "
"a typical small RNA data. This can cause errors during the execution "
"or long computing time.")
bar = ProgressBar(maxval=len(current)).start()
bar.update(0)
for itern, idmc in enumerate(current):
bar.update(itern)
logger.debug("_reduceloci: cluster %s" % idmc)
c = copy.deepcopy(list(current[idmc]))

n_loci = len(c)
if n_loci < 1000:
if n_loci < 100:
filtered, n_cluster = _iter_loci(c, clus_obj.clus, (clus_obj.loci, clus_obj.seq), filtered, n_cluster)
else:
large += 1
Expand Down Expand Up @@ -98,7 +95,7 @@ def _write_cluster(metacluster, cluster, loci, idx, path):
with open(out_file, 'w') as out_handle:
for idc in metacluster:
for idl in cluster[idc].loci2seq:
pos = loci[idl].list()
pos = [e for e in loci[idl].list()]
print("\t".join(pos[:4] + [str(len(cluster[idc].loci2seq[idl]))] + [pos[-1]]), file=out_handle, end="")


Expand Down Expand Up @@ -136,7 +133,6 @@ def _iter_loci(meta, clusters, s2p, filtered, n_cluster):
n_loci = len(meta)
n_loci_prev = n_loci + 1
cicle = 0
# [logger.note("BEFORE %s %s %s" % (c.id, idl, len(c.loci2seq[idl]))) for idl in c.loci2seq]
internal_cluster = {}
if n_loci == 1:
n_cluster += 1
Expand Down Expand Up @@ -176,11 +172,11 @@ def _iter_loci(meta, clusters, s2p, filtered, n_cluster):
return filtered, n_cluster


def _remove_loci(ci, idl):
for (idl, lenl) in locilen_sorted:
logger.debug("_remove_loci:remove locus %s with len %s:" % (idl, lenl))
c.loci2seq.pop(idl, "None")
c.locilen.pop(idl, "None")
# def _remove_loci(ci, idl):
# for (idl, lenl) in locilen_sorted:
# logger.debug("_remove_loci:remove locus %s with len %s:" % (idl, lenl))
# c.loci2seq.pop(idl, "None")
# c.locilen.pop(idl, "None")


def _convert_to_clusters(c):
Expand Down Expand Up @@ -512,7 +508,6 @@ def _merge_loci_in_cluster(c, new_c, idl, current_seqs):
def _merge_with_first_loci(c, new_c, first_idl, idl, current_seqs):
logger.debug("_merge_with_first_loci:join first")
locus_seqs = c.loci2seq[idl]
common = len(set(locus_seqs).intersection(current_seqs))
seen = list(set(locus_seqs).union(current_seqs))
new_c.add_id_member(list(locus_seqs), first_idl)
c.loci2seq.pop(idl, "None")
Expand All @@ -522,7 +517,6 @@ def _merge_with_first_loci(c, new_c, first_idl, idl, current_seqs):

def _remove_seqs_from_loci(c, idl, current_seqs):
current = c.loci2seq[idl]
common = len(set(current).intersection(current_seqs))
seen = list(set(current).intersection(current_seqs))
unseen = list(set(sorted(current)).difference(sorted(seen)))
logger.debug("_remove_seqs_from_loci:seen %s unseen %s" % (len(seen), len(unseen)))
Expand Down
2 changes: 1 addition & 1 deletion seqcluster/libs/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def __init__(self, idl, chr, start, end, strand):
self.db_ann = {}

def list(self):
return map(str, [self.chr, self. start, self.end, self.idl, self.strand])
return [e for e in map(str, [self.chr, self. start, self.end, self.idl, self.strand])]

def add_db(self, db, ndb):
self.db_ann[db] = ndb
Expand Down
2 changes: 1 addition & 1 deletion seqcluster/make_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def _check_args(args):
global similar
if not os.path.isdir(args.out):
logger.warning("the output folder doens't exists")
os.mkdirs(args.out)
os.mkdir(args.out)
if args.bed and args.gtf:
logger.error("cannot provide -b and -g at the same time")
raise SyntaxError
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def readme():


setup(name='seqcluster',
version='1.2.7a',
version='1.2.7',
description='Small RNA-seq pipeline',
long_description=readme(),
long_description_content_type="text/markdown",
Expand Down

0 comments on commit 38297c9

Please sign in to comment.