From d2c5b5bb6174fc8528acec868709d905dca6ed54 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 20 Dec 2023 12:06:14 -0500 Subject: [PATCH 1/2] fix stats file numbers --- HISTORY.md | 4 ++++ seqcluster/detect/cluster.py | 4 ++-- seqcluster/detect/metacluster.py | 1 + seqcluster/make_clusters.py | 15 +++++++++++---- setup.py | 2 +- 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 045f138..ea2abcb 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,7 @@ +- 1.2.10 + + * Fix bug counting sequences expression in stats file. + - 1.2.9 * Fix bug where UMI is mistakenly detected in read names containing "ILLUMINA" diff --git a/seqcluster/detect/cluster.py b/seqcluster/detect/cluster.py index a7931f1..65bc789 100644 --- a/seqcluster/detect/cluster.py +++ b/seqcluster/detect/cluster.py @@ -130,7 +130,7 @@ def detect_clusters(c, current_seq, MIN_SEQ, non_un_gl=False): logger.info("%s Clusters read" % eindex) # merge cluster with shared sequences metacluster_obj, cluster_id = _find_metaclusters(current_clus, sequence2clusters, current_seq, MIN_SEQ) - + # import pdb; pdb.set_trace() return cluster_info_obj(current_clus, metacluster_obj, current_loci, current_seq) def _common(items, seen): @@ -158,7 +158,7 @@ def _find_metaclusters(clus_obj, sequence2clusters, current_seq, min_seqs): for itern, name in enumerate(sequence2clusters): clusters = sequence2clusters[name] - if len(clusters) == 0: + if len(clusters) == 0: # when sequence doesn't belong to a cluster c_index -= 1 continue current_seq[name].align = 1 diff --git a/seqcluster/detect/metacluster.py b/seqcluster/detect/metacluster.py index f3bc7c2..b7122d0 100644 --- a/seqcluster/detect/metacluster.py +++ b/seqcluster/detect/metacluster.py @@ -83,6 +83,7 @@ def reduceloci(clus_obj, path): logger.info("Clusters too long to be analyzed: %s" % large) logger.info("Number of clusters removed because low number of reads: %s" % REMOVED) logger.info("Number of clusters with conflicts: %s" % CONFLICT) + #import pdb;pdb.set_trace() return clus_obj diff --git a/seqcluster/make_clusters.py b/seqcluster/make_clusters.py index 33fe6a0..4df5471 100644 --- a/seqcluster/make_clusters.py +++ b/seqcluster/make_clusters.py @@ -62,10 +62,10 @@ def cluster(args): dt = pd.DataFrame({'sample': y.keys(), 'counts': y.values()}) dt['step'] = 'cleaned' dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a') - clusL = _create_clusters(seqL, bam_file, args) - - y, l = _total_counts(list(clusL.seq.keys()), clusL.seq, aligned=True) + # y, l = _total_counts(list(clusL.seq.keys()), clusL.seq, aligned=True) + y, l = _total_counts(clusL.clus, seqL) + # import pdb;pdb.set_trace() logger.info("counts after: %s" % sum(y.values())) logger.info("# sequences after: %s" % l) dt = pd.DataFrame({'sample': y.keys(), 'counts': y.values()}) @@ -75,6 +75,8 @@ def cluster(args): logger.info("Solving multi-mapping events in the network of clusters") clusLred = _cleaning(clusL, args.dir_out) y, l = _total_counts(clusLred.clus, seqL) + # import pdb;pdb.set_trace() + # y, l = _total_counts(list(clusLred.seq.keys()), clusLred.seq, aligned=True) logger.info("counts after: %s" % sum(y.values())) logger.info("# sequences after: %s" % l) dt = pd.DataFrame({'sample': y.keys(), 'counts': y.values()}) @@ -157,7 +159,12 @@ def _total_counts(seqs, seqL, aligned=False): else: nseqs = len([total.update(seqL[s].freq) for s in seqs if seqL[s].align > 0]) elif isinstance(seqs, dict): - [total.update(seqs[s].get_freq(seqL)) for s in seqs] + #[total.update(seqs[s].get_freq(seqL)) for s in seqs] + # import pdb;pdb.set_trace() + # !import code; code.interact(local=vars()) + seqs_in=[] + void=[seqs_in.extend(list(seqs[s].idmembers.keys())) for s in seqs] + len([total.update(seqL[s].freq) for s in set(seqs_in)]) nseqs = sum(len(seqs[s].idmembers) for s in seqs) return total, nseqs diff --git a/setup.py b/setup.py index 00540ae..ec3fd4f 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def readme(): setup(name='seqcluster', - version='1.2.9', + version='1.2.10', description='Small RNA-seq pipeline', long_description=readme(), long_description_content_type="text/markdown", From 3518ea5ec4d80fbe4fe8eaf3bacc4bdc60965e0d Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 20 Dec 2023 12:06:44 -0500 Subject: [PATCH 2/2] ignore data --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3b9a5ed..18ed188 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # test */test_* +data/LorenaP/* data/test_*/report/* data/test_*/res/* data/res/*