fix bug in non-resolved meta-clusters

#47 bcbio/bcbio-nextgen#2948 Thanks @lurebgi @keenhl @chapmanb
lpantano · Sep 20, 2019 · 38297c9 · 38297c9
1 parent c48b3bd
commit 38297c9
Show file tree

Hide file tree

Showing 7 changed files with 27 additions and 26 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -16,7 +16,7 @@ before_install:
 - wget -O miniconda.sh http://repo.continuum.io/miniconda/${miniconda}
 - bash miniconda.sh -b -p ~/install
 - export PATH=~/install/bin/:$PATH
-- conda install --yes -c conda-forge -c bioconda memory_profiler openjdk pysam pybedtools pandas numpy biopython progressbar2 pyyaml bedtools samtools mirtop viennarna -q
+- conda install --yes -c conda-forge -c bioconda memory_profiler openjdk pysam pybedtools pandas numpy scipy biopython progressbar2 pyyaml bedtools samtools mirtop viennarna -q
 - pip install -r requirements.txt
 # scipy pandas pybedtools progressbar pip biopython nose scipy setuptools pyyaml -q
 - python setup.py install

diff --git a/HISTORY.md b/HISTORY.md
@@ -1,3 +1,10 @@
+- 1.2.7
+
+  * Fix bug when writing debug files for big meta-clusters:
+    https://github.com/lpantano/seqcluster/issues/47
+    https://github.com/bcbio/bcbio-nextgen/issues/2948
+  * Add version option
+
 - 1.2.5
 
   * Fix error when the precursor is too long to ignore RNAfold calculation. Thanks to @ZhuZhuoHSPH and @kthlnktng

diff --git a/seqcluster/detect/cluster.py b/seqcluster/detect/cluster.py
@@ -43,7 +43,7 @@ def clean_bam_file(bam_in, mask=None):
     if mask:
         mask_file = op.splitext(bam_in)[0] + "_mask.bam"
         if not file_exists(mask_file):
-            pybedtools.BedTool(bam_file).intersect(b=mask, v=True).saveas(mask_file)
+            pybedtools.BedTool(bam_in).intersect(b=mask, v=True).saveas(mask_file)
         bam_in = mask_file
     out_file = op.splitext(bam_in)[0] + "_rmlw.bam"
     # bam.index(bam_in, {'algorithm':{}})

diff --git a/seqcluster/detect/metacluster.py b/seqcluster/detect/metacluster.py
@@ -6,11 +6,7 @@
 import copy
 from progressbar import ProgressBar
 
-# import time
 import math
-# import numpy as np
-# import pybedtools
-
 
 import seqcluster.libs.logger as mylog
 from seqcluster.libs import utils
@@ -39,15 +35,10 @@ def _get_seqs_from_cluster(seqs, seen):
     already_in = set()
     not_in = []
 
-    already_in = map(seen.get, seqs)
+    already_in = [e for e in map(seen.get, seqs)]
     # if isinstance(already_in, list):
     already_in = filter(None, already_in)
     not_in = set(seqs) - set(seen.keys())
-    # for s in seqs:
-    #    if s in seen:
-    #        already_in.add(seen[s])
-    #    else:
-    #        not_in.append(s)
     return list(set(already_in)), list(not_in)
 
 
@@ -61,15 +52,21 @@ def reduceloci(clus_obj,  path):
     large = 0
     current = clus_obj.clusid
     logger.info("Number of loci: %s" % len(clus_obj.loci.keys()))
+    avg_loci_cluster = len(clus_obj.loci.keys())/len(clus_obj.seq)
+    if avg_loci_cluster > 0.5:
+        logger.warning("The avg number of loci by sequences is close to 0.5: %s",
+                       avg_loci_cluster)
+        logger.warning("This could mean you have sequences over the genome, rare in "
+                       "a typical small RNA data. This can cause errors during the execution "
+                       "or long computing time.")
     bar = ProgressBar(maxval=len(current)).start()
     bar.update(0)
     for itern, idmc in enumerate(current):
         bar.update(itern)
         logger.debug("_reduceloci: cluster %s" % idmc)
         c = copy.deepcopy(list(current[idmc]))
-
         n_loci = len(c)
-        if n_loci < 1000:
+        if n_loci < 100:
             filtered, n_cluster = _iter_loci(c, clus_obj.clus, (clus_obj.loci, clus_obj.seq), filtered, n_cluster)
         else:
             large += 1
@@ -98,7 +95,7 @@ def _write_cluster(metacluster, cluster, loci, idx, path):
         with open(out_file, 'w') as out_handle:
             for idc in metacluster:
                 for idl in cluster[idc].loci2seq:
-                    pos = loci[idl].list()
+                    pos = [e for e in loci[idl].list()]
                     print("\t".join(pos[:4] + [str(len(cluster[idc].loci2seq[idl]))] + [pos[-1]]), file=out_handle, end="")
 
 
@@ -136,7 +133,6 @@ def _iter_loci(meta, clusters, s2p, filtered, n_cluster):
     n_loci = len(meta)
     n_loci_prev = n_loci + 1
     cicle = 0
-    # [logger.note("BEFORE %s %s %s" % (c.id, idl, len(c.loci2seq[idl]))) for idl in c.loci2seq]
     internal_cluster = {}
     if n_loci == 1:
         n_cluster += 1
@@ -176,11 +172,11 @@ def _iter_loci(meta, clusters, s2p, filtered, n_cluster):
     return filtered, n_cluster
 
 
-def _remove_loci(ci, idl):
-    for (idl, lenl) in locilen_sorted:
-        logger.debug("_remove_loci:remove locus %s with len %s:" % (idl, lenl))
-        c.loci2seq.pop(idl, "None")
-        c.locilen.pop(idl, "None")
+# def _remove_loci(ci, idl):
+#     for (idl, lenl) in locilen_sorted:
+#         logger.debug("_remove_loci:remove locus %s with len %s:" % (idl, lenl))
+#         c.loci2seq.pop(idl, "None")
+#         c.locilen.pop(idl, "None")
 
 
 def _convert_to_clusters(c):
@@ -512,7 +508,6 @@ def _merge_loci_in_cluster(c, new_c, idl, current_seqs):
 def _merge_with_first_loci(c, new_c, first_idl, idl, current_seqs):
     logger.debug("_merge_with_first_loci:join first")
     locus_seqs = c.loci2seq[idl]
-    common = len(set(locus_seqs).intersection(current_seqs))
     seen = list(set(locus_seqs).union(current_seqs))
     new_c.add_id_member(list(locus_seqs), first_idl)
     c.loci2seq.pop(idl, "None")
@@ -522,7 +517,6 @@ def _merge_with_first_loci(c, new_c, first_idl, idl, current_seqs):
 
 def _remove_seqs_from_loci(c, idl, current_seqs):
     current = c.loci2seq[idl]
-    common = len(set(current).intersection(current_seqs))
     seen = list(set(current).intersection(current_seqs))
     unseen = list(set(sorted(current)).difference(sorted(seen)))
     logger.debug("_remove_seqs_from_loci:seen %s unseen %s" % (len(seen), len(unseen)))

diff --git a/seqcluster/libs/classes.py b/seqcluster/libs/classes.py
@@ -129,7 +129,7 @@ def __init__(self, idl, chr, start, end, strand):
         self.db_ann = {}
 
     def list(self):
-        return map(str, [self.chr, self. start, self.end, self.idl, self.strand])
+        return [e for e in map(str, [self.chr, self. start, self.end, self.idl, self.strand])]
 
     def add_db(self, db, ndb):
         self.db_ann[db] = ndb

diff --git a/seqcluster/make_clusters.py b/seqcluster/make_clusters.py
@@ -115,7 +115,7 @@ def _check_args(args):
     global similar
     if not os.path.isdir(args.out):
         logger.warning("the output folder doens't exists")
-        os.mkdirs(args.out)
+        os.mkdir(args.out)
     if args.bed and args.gtf:
         logger.error("cannot provide -b and -g at the same time")
         raise SyntaxError

diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@ def readme():
 
 
 setup(name='seqcluster',
-      version='1.2.7a',
+      version='1.2.7',
       description='Small RNA-seq pipeline',
       long_description=readme(),
       long_description_content_type="text/markdown",