fine tuning release 0.2.2

CGATOxford · Nov 7, 2014 · 5004a6a · 5004a6a
1 parent bf1bbbd
commit 5004a6a
Show file tree

Hide file tree

Showing 21 changed files with 235 additions and 114 deletions.
diff --git a/CGATPipelines/PipelineGeneset.py b/CGATPipelines/PipelineGeneset.py
@@ -340,7 +340,8 @@ def buildProteinCodingGenes(infile, outfile):
     | python %(scriptsdir)s/gtf2gtf.py
     --method=sort --sort-order=contig+gene
     | python %(scriptsdir)s/gff2gff.py
-    --method=sanitize=genome
+    --method=sanitize
+    --sanitize-method=genome
     --skip-missing
     --genome-file=%(genome_dir)s/%(genome)s
     | python %(scriptsdir)s/gtf2gtf.py
@@ -770,7 +771,8 @@ def buildPromotorRegions(infile, outfile):
     '''annotate promotor regions from reference gene set.'''
     statement = """
     gunzip < %(infile)s |\
-    python %(scriptsdir)s/gff2gff.py --method=sanitize=genome
+    python %(scriptsdir)s/gff2gff.py --method=sanitize
+    --sanitize-method=genome
     --skip-missing --genome-file=%(genome_dir)s/%(genome)s
     --log=%(outfile)s.log
     | python %(scriptsdir)s/gtf2gff.py --method=promotors
@@ -797,7 +799,8 @@ def buildTSSRegions(infile, outfile):
     '''
     statement = """
     gunzip < %(infile)s
-    | python %(scriptsdir)s/gff2gff.py --method=sanitize=genome
+    | python %(scriptsdir)s/gff2gff.py --method=sanitize
+    --sanitize-method=genome
     --skip-missing
     --genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log
     | python %(scriptsdir)s/gtf2gff.py --method=promotors

diff --git a/CGATPipelines/PipelineUCSC.py b/CGATPipelines/PipelineUCSC.py
@@ -111,7 +111,8 @@ def getRepeatsFromUCSC(dbhandle, repclasses, outfile):
     statement = ['''cat %(tmpfilename)s
     | %(scriptsdir)s/gff_sort pos
     | python %(scriptsdir)s/gff2gff.py
-    --method=sanitize=genome
+    --method=sanitize
+    --sanitize-method=genome
     --skip-missing
     --genome-file=%(genome_dir)s/%(genome)s
     --log=%(outfile)s.log ''']

diff --git a/CGATPipelines/pipeline_ancestral_repeats.py b/CGATPipelines/pipeline_ancestral_repeats.py
@@ -341,7 +341,8 @@ def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes, genome):
     statement = '''cat %(tmpfilename)s
     | %(scriptsdir)s/gff_sort pos
     | python %(scriptsdir)s/gff2gff.py
-    --method=sanitize=genome
+    --method=sanitize
+    --sanitize-method=genome
     --skip-missing
     --genome-file=%(genome)s
     --log=%(outfile)s.log
@@ -371,7 +372,8 @@ def importRepeatsFromEnsembl(infile, outfile,
     --repeattypes %(repeattypes)s
     | %(scriptsdir)s/gff_sort pos
     | python %(scriptsdir)s/gff2gff.py
-    --method=sanitize=genome
+    --method=sanitize
+    --sanitize-method=genome
     --skip-missing
     --genome-file=%(genome)s
     --log=%(outfile)s.log

diff --git a/CGATPipelines/pipeline_annotations.py b/CGATPipelines/pipeline_annotations.py
@@ -833,7 +833,8 @@ def buildGeneSet(infile, outfile):
     statement = ['''zcat %(infile)s
     | grep 'transcript_id'
     | python %(scriptsdir)s/gff2gff.py
-    --method=sanitize=genome
+    --method=sanitize
+    --sanitize-method=genome
     --skip-missing
     --genome-file=%(genome_dir)s/%(genome)s
     --log=%(outfile)s.log ''']
@@ -971,8 +972,7 @@ def downloadTranscriptInformation(infile, outfile):
         "source": "source",
         "status": "gene_status",
         "transcript_status": "transcript_status",
-        "external_gene_id": "gene_name",
-        "external_transcript_id": "transcript_name",
+        "external_gene_name": "gene_name",
         "uniprot_sptrembl": "uniprot_id",
         "uniprot_genename": "uniprot_name",
     }
@@ -1084,7 +1084,7 @@ def downloadTranscriptSynonyms(infile, outfile):
 
     columns = {
         "ensembl_transcript_id": "transcript_id",
-        "external_transcript_id": "transcript_name",
+        "external_transcript_name": "transcript_name",
         "refseq_mrna": "refseq_id",
     }
 

diff --git a/CGATPipelines/pipeline_liftover.py b/CGATPipelines/pipeline_liftover.py
@@ -151,17 +151,18 @@ def convertGtf2Psl(infile, outfile):
         raise IOError("genome %s does not exist" % genomefile)
 
     statement = """gunzip 
-    < %(infile)s 
-    | awk '$3 == "exon"' 
-    | python %(scriptsdir)s/gff2gff.py 
-           --method=sanitize=genome 
-           --skip-missing
-           --genome=%(genomefile)s
-           --log=%(outfile)s.log
-    | python %(scriptsdir)s/gff2psl.py 
-           --allow-duplicates
-           --is-gtf 
-           --log=%(outfile)s.log 
+    < %(infile)s
+    | awk '$3 == "exon"'
+    | python %(scriptsdir)s/gff2gff.py
+    --method=sanitize
+    --sanitize-method=genome
+    --skip-missing
+    --genome=%(genomefile)s
+    --log=%(outfile)s.log
+    | python %(scriptsdir)s/gff2psl.py
+    --allow-duplicates
+    --is-gtf
+    --log=%(outfile)s.log
     | gzip > %(outfile)s
     """
     P.run()

diff --git a/CGATPipelines/pipeline_mapping.py b/CGATPipelines/pipeline_mapping.py
@@ -210,7 +210,8 @@
      "../pipeline.ini",
      "pipeline.ini"],
     defaults={
-        'paired_end': False})
+        'paired_end': False},
+    only_import=__name__ != "__main__")
 
 PARAMS = P.PARAMS
 

diff --git a/CGATPipelines/pipeline_variants.py b/CGATPipelines/pipeline_variants.py
@@ -853,18 +853,19 @@ def buildBaseAnnotations(infile, outfile):
 def buildExonAnnotations(infile, outfile):
     """build exon annotations"""
 
-    to_cluster = True
-
     statement = """
-        gunzip < %(infile)s 
-        | awk '$3 == "CDS"' 
-        | python %(scriptsdir)s/gff2gff.py --method=sanitize=genome --skip-missing 
-               --genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log 
-        | python %(scriptsdir)s/gtf2gff.py 
-                --method=exons 
-                --restrict-source=protein_coding 
-                --log=%(outfile)s.log 
-        > %(outfile)s
+    gunzip < %(infile)s
+    | awk '$3 == "CDS"'
+    | python %(scriptsdir)s/gff2gff.py
+    --method=sanitize
+    --sanitize-method=genome
+    --skip-missing
+    --genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log
+    | python %(scriptsdir)s/gtf2gff.py
+    --method=exons
+    --restrict-source=protein_coding
+    --log=%(outfile)s.log
+    > %(outfile)s
     """
 
     P.run()
@@ -884,14 +885,18 @@ def buildGeneAnnotations(infile, outfile):
     output includes the UTR and non-coding genes.
     """
     statement = """
-        gunzip < %(infile)s |\
-        python %(scriptsdir)s/gtf2gtf.py --method=merge-exons --with-utr --log=%(outfile)s.log |\
-        python %(scriptsdir)s/gtf2gtf.py --method=set-transcript-to-gene --log=%(outfile)s.log |\
-        python %(scriptsdir)s/gff2gff.py --skip-missing --method=sanitize=genome --genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log |\
-        %(scriptsdir)s/gff_sort gene-pos \
-        > %(outfile)s
+    gunzip < %(infile)s
+    | python %(scriptsdir)s/gtf2gtf.py --method=merge-exons
+    --with-utr --log=%(outfile)s.log
+    | python %(scriptsdir)s/gtf2gtf.py --method=set-transcript-to-gene
+    --log=%(outfile)s.log
+    | python %(scriptsdir)s/gff2gff.py --skip-missing --method=sanitize
+    --sanitize-method=genome
+    --genome-file=%(genome_dir)s/%(genome)s
+    --log=%(outfile)s.log
+    | %(scriptsdir)s/gff_sort gene-pos
+    > %(outfile)s
     """
-    queue = "server"
     P.run()
 
 ###################################################################

diff --git a/CGATPipelines/pipeline_windows.py b/CGATPipelines/pipeline_windows.py
@@ -879,10 +879,6 @@ def summarizeWindowsReadCounts(infiles, outfile):
               > %(outfile)s'''
     P.run()
 
-#########################################################################
-#########################################################################
-#########################################################################
-
 
 @follows(mkdir("dump.dir"))
 @transform("design*.tsv",
@@ -1804,7 +1800,8 @@ def buildTranscriptProfiles(infiles, outfile):
     #                        input_files[0] + '.bed.gz') )
     statement = '''zcat %(gtffile)s
                    | python %(scriptsdir)s/gtf2gtf.py
-                     --filter=representative-transcript
+                     --method=filter
+                     --filter-method=representative-transcript
                      --log=%(outfile)s.log
                    | python %(scriptsdir)s/bam2geneprofile.py
                       --output-filename-pattern="%(outfile)s.%%s"

diff --git a/doc/CGATRecipes.rst b/doc/CGATRecipes.rst
@@ -2,16 +2,21 @@
 Using CGAT tools - Recipes
 ===========================
 
-In this section you will find representative examples for using tools developed in 
-CGAT. The recipes presented aim to provide intuitive real-life examples of CGAT script use for
-the analysis of genomic datasets. If there is a tool in the CGAT collection for which you
-would like a use case then please post a request on the `CGAT users group`_ website. 
+In this section you will find representative examples for using tools
+developed in CGAT. The recipes presented aim to provide intuitive
+real-life examples of CGAT script use for the analysis of genomic
+datasets. If there is a tool in the CGAT collection for which you
+would like a use case then please post a request on the `CGAT users
+group`_ website.
 
-.. toctree::
-   :maxdepth: 2
+The recipes are implemented as ipython_ notebooks.
 
-   recipes/gat
-   recipes/metagenome_contigs_kmers
+..
+   .. toctree::
+      :maxdepth: 2
+
+      recipes/gat
+      recipes/metagenome_contigs_kmers
 
 :download:`Recipe02 Plotting read-density in Intervals <recipes/Recipe02-BAMCoverageHistograms.html>`
 	  Illustrate how to plot read density in a large number of

diff --git a/doc/Pipelines.rst b/doc/Pipelines.rst
@@ -1,4 +1,4 @@
-.. _Pipelines:
+.. _pipelines:
 
 ==============
 CGAT Pipelines

diff --git a/doc/cgat.rst b/doc/cgat.rst
@@ -1,21 +1,21 @@
 .. _cgat:
 
-============================================
-CGAT - Computational Genomics Analysis Tools
-============================================
+======================================================
+CGAT |version| - Computational Genomics Analysis Tools 
+======================================================
 
 CGAT is a collection of tools for the computational genomicist written
-in the python language. The tools have been developed and accumulated in various
-genome projects (`Heger & Ponting, 2007`_, `Warren et al., 2008`_) and NGS projects
-(`Ramagopalan et al., 2010`_). The tools are continuously being developed
-as part of the `CGAT Training programme`_.
-
-The tools work from the command line, but can readily be installed
-within frameworks such as `Galaxy`_.
-
-Please note that the tools are part of a larger code base also
-including genomics and NGS pipelines. More information about those
-is :ref:`here <contents>`.
+in the python language. The tools have been developed and accumulated
+in various genome projects (`Heger & Ponting, 2007`_, `Warren et al.,
+2008`_) and NGS projects (`Ramagopalan et al., 2010`_). The tools are
+continuously being developed as part of the `CGAT Training
+programme`_. The tools work from the command line, but can readily be
+installed within frameworks such as `Galaxy`_.
+
+The documentation below covers the scripts published in
+`Bioinformatics <http://www.ncbi.nlm.nih.gov/pubmed/24395753>`_.
+For the complete documentation that also includes the NGS pipelines
+please go to :ref:`contents`.
 
 Detailed instructions on installation, on usage and a tool reference
 are below, followed by a :ref:`quickstart` guide.
@@ -47,23 +47,23 @@ dependencies and troubleshooting.
 
 CGAT tools are run from the unix command line. Let's assume we have
 the results of the binding locations of a ChIP-Seq experiment
-(chipseq.hg19.bed) in bed format and we want to know, how many
+(:file:`chipseq.hg19.bed`) in bed format and we want to know, how many
 binding locations are intronic, intergenic and within exons.
 
 Thus, we need to create a set of genomic annotations denoting
-intronic, intergenic regions, etc. with respect to a reference gene set.
-Here, we download the GENCODE geneset (Harrow et al., 2012) in GTF
-format from ENSEMBL (Flicek et al., 2013). 
+intronic, intergenic regions, etc. with respect to a reference gene
+set.  Here, we download the GENCODE geneset (Harrow et al., 2012) in
+GTF format from ENSEMBL (Flicek et al., 2013).
 
 The following unix statement downloads the ENSEMBL gene set containing
 over-lapping transcripts, and outputs a set of non-overlapping genomic
 annotations in gff format (:file:`annotations.gff`) by piping the data
-through various GAT tools::
+through various CGAT tools::
  
    wget -qO- ftp://ftp.ensembl.org/pub/release-72/gtf/homo_sapiens/Homo_sapiens.GRCh37.72.gtf.gz
    | gunzip
    | awk '$2 == "protein_coding"' 
-   | cgat gff2ff --genome-file=hg19 --method=sanitize=ucsc --skip-missing
+   | cgat gff2gff --genome-file=hg19 --method=sanitize --sanitize-method=ucsc --skip-missing
    | cgat gtf2gtf --method=sort --sort-order=gene
    | cgat gtf2gtf --method=merge-exons --with-utr
    | cgat gtf2gtf --method=filter --filter-method=longest-gene
@@ -73,11 +73,12 @@ through various GAT tools::
    > annotations.gff.gz
 
 .. note::
+
    The statements above need an indexed genome. To create such an
-   indexed genome for hg19, type the following:
+   indexed genome for hg19, type the following::
   
-   wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz
-   | index_fasta.py hg19 - > hg19.log
+      wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz
+      | index_fasta.py hg19 - > hg19.log
    
 CGAT tools can be chained into a single work flow using unix
 pipes. The above sequence of commands in turn (1) reconciles UCSC and

diff --git a/doc/conf.py b/doc/conf.py
@@ -66,7 +66,7 @@
 
 # General information about the project.
 project = u'CGAT'
-copyright = u'2011, Andreas Heger'
+copyright = u'2011, 2012, 2013, 2014 Andreas Heger'
 
 
 # Included at the end of each rst file
@@ -106,7 +106,7 @@
 # built documents.
 #
 # The short X.Y version.
-version = version
+version = version.__version__
 # The full version, including alpha/beta/rc tags.
 release = version
 

diff --git a/doc/developing.rst b/doc/developing.rst
@@ -143,6 +143,45 @@ as ENCODE.
 Attempt to add a plot to the end of a recipe, using
 R commands to create the plot within the notebook.
 
+Writing pipelines
+=================
+
+=========================
+CGAT Pipelines
+=========================
+
+Best practice for CGAT pipelines:
+
+1. All non-trivial code should be extracted to modules or scripts.
+
+2. Modules should not access PARAMS dictionary directly, but
+   parameters should be passed to the function.
+
+3. For important processing steps where different external tools could
+   potentially be employed, the design of the module classes should be
+   carefully considered to ensure consistent input and output file
+   formats for different tools. PipelineMapping provides a good
+   example for this.
+
+4. All production pipelines should include tests for consistency which
+   can be run automatically.
+
+5. Where appropriate pipelines should include a small test dataset
+   with published results for comparison. This dataset can be run on
+   each pipeline run and included in the pipeline report where it can
+   be used as a pipeline control.
+
+6. Periodic code review meetings where interested parties can agree on
+   major changes to production pipelines and associated modules – to
+   be arranged as required.
+
+7. The best way to manage pipeline improvements is by individuals
+   using pipelines taking responsibility for incremental
+   improvement. As best practice fellows should announce plans to
+   modify particular pipelines and modules on the CGAT members list to
+   avoid duplication of effort. Fellows should log the changes that
+   they make in a change log and document both modules and pipelines
+   in detail.