Skip to content

Commit

Permalink
fine tuning release 0.2.2
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreasHeger committed Nov 7, 2014
1 parent bf1bbbd commit 5004a6a
Show file tree
Hide file tree
Showing 21 changed files with 235 additions and 114 deletions.
9 changes: 6 additions & 3 deletions CGATPipelines/PipelineGeneset.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,8 @@ def buildProteinCodingGenes(infile, outfile):
| python %(scriptsdir)s/gtf2gtf.py
--method=sort --sort-order=contig+gene
| python %(scriptsdir)s/gff2gff.py
--method=sanitize=genome
--method=sanitize
--sanitize-method=genome
--skip-missing
--genome-file=%(genome_dir)s/%(genome)s
| python %(scriptsdir)s/gtf2gtf.py
Expand Down Expand Up @@ -770,7 +771,8 @@ def buildPromotorRegions(infile, outfile):
'''annotate promotor regions from reference gene set.'''
statement = """
gunzip < %(infile)s |\
python %(scriptsdir)s/gff2gff.py --method=sanitize=genome
python %(scriptsdir)s/gff2gff.py --method=sanitize
--sanitize-method=genome
--skip-missing --genome-file=%(genome_dir)s/%(genome)s
--log=%(outfile)s.log
| python %(scriptsdir)s/gtf2gff.py --method=promotors
Expand All @@ -797,7 +799,8 @@ def buildTSSRegions(infile, outfile):
'''
statement = """
gunzip < %(infile)s
| python %(scriptsdir)s/gff2gff.py --method=sanitize=genome
| python %(scriptsdir)s/gff2gff.py --method=sanitize
--sanitize-method=genome
--skip-missing
--genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log
| python %(scriptsdir)s/gtf2gff.py --method=promotors
Expand Down
3 changes: 2 additions & 1 deletion CGATPipelines/PipelineUCSC.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ def getRepeatsFromUCSC(dbhandle, repclasses, outfile):
statement = ['''cat %(tmpfilename)s
| %(scriptsdir)s/gff_sort pos
| python %(scriptsdir)s/gff2gff.py
--method=sanitize=genome
--method=sanitize
--sanitize-method=genome
--skip-missing
--genome-file=%(genome_dir)s/%(genome)s
--log=%(outfile)s.log ''']
Expand Down
6 changes: 4 additions & 2 deletions CGATPipelines/pipeline_ancestral_repeats.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,8 @@ def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes, genome):
statement = '''cat %(tmpfilename)s
| %(scriptsdir)s/gff_sort pos
| python %(scriptsdir)s/gff2gff.py
--method=sanitize=genome
--method=sanitize
--sanitize-method=genome
--skip-missing
--genome-file=%(genome)s
--log=%(outfile)s.log
Expand Down Expand Up @@ -371,7 +372,8 @@ def importRepeatsFromEnsembl(infile, outfile,
--repeattypes %(repeattypes)s
| %(scriptsdir)s/gff_sort pos
| python %(scriptsdir)s/gff2gff.py
--method=sanitize=genome
--method=sanitize
--sanitize-method=genome
--skip-missing
--genome-file=%(genome)s
--log=%(outfile)s.log
Expand Down
8 changes: 4 additions & 4 deletions CGATPipelines/pipeline_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,8 @@ def buildGeneSet(infile, outfile):
statement = ['''zcat %(infile)s
| grep 'transcript_id'
| python %(scriptsdir)s/gff2gff.py
--method=sanitize=genome
--method=sanitize
--sanitize-method=genome
--skip-missing
--genome-file=%(genome_dir)s/%(genome)s
--log=%(outfile)s.log ''']
Expand Down Expand Up @@ -971,8 +972,7 @@ def downloadTranscriptInformation(infile, outfile):
"source": "source",
"status": "gene_status",
"transcript_status": "transcript_status",
"external_gene_id": "gene_name",
"external_transcript_id": "transcript_name",
"external_gene_name": "gene_name",
"uniprot_sptrembl": "uniprot_id",
"uniprot_genename": "uniprot_name",
}
Expand Down Expand Up @@ -1084,7 +1084,7 @@ def downloadTranscriptSynonyms(infile, outfile):

columns = {
"ensembl_transcript_id": "transcript_id",
"external_transcript_id": "transcript_name",
"external_transcript_name": "transcript_name",
"refseq_mrna": "refseq_id",
}

Expand Down
23 changes: 12 additions & 11 deletions CGATPipelines/pipeline_liftover.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,17 +151,18 @@ def convertGtf2Psl(infile, outfile):
raise IOError("genome %s does not exist" % genomefile)

statement = """gunzip
< %(infile)s
| awk '$3 == "exon"'
| python %(scriptsdir)s/gff2gff.py
--method=sanitize=genome
--skip-missing
--genome=%(genomefile)s
--log=%(outfile)s.log
| python %(scriptsdir)s/gff2psl.py
--allow-duplicates
--is-gtf
--log=%(outfile)s.log
< %(infile)s
| awk '$3 == "exon"'
| python %(scriptsdir)s/gff2gff.py
--method=sanitize
--sanitize-method=genome
--skip-missing
--genome=%(genomefile)s
--log=%(outfile)s.log
| python %(scriptsdir)s/gff2psl.py
--allow-duplicates
--is-gtf
--log=%(outfile)s.log
| gzip > %(outfile)s
"""
P.run()
Expand Down
3 changes: 2 additions & 1 deletion CGATPipelines/pipeline_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@
"../pipeline.ini",
"pipeline.ini"],
defaults={
'paired_end': False})
'paired_end': False},
only_import=__name__ != "__main__")

PARAMS = P.PARAMS

Expand Down
41 changes: 23 additions & 18 deletions CGATPipelines/pipeline_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,18 +853,19 @@ def buildBaseAnnotations(infile, outfile):
def buildExonAnnotations(infile, outfile):
"""build exon annotations"""

to_cluster = True

statement = """
gunzip < %(infile)s
| awk '$3 == "CDS"'
| python %(scriptsdir)s/gff2gff.py --method=sanitize=genome --skip-missing
--genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log
| python %(scriptsdir)s/gtf2gff.py
--method=exons
--restrict-source=protein_coding
--log=%(outfile)s.log
> %(outfile)s
gunzip < %(infile)s
| awk '$3 == "CDS"'
| python %(scriptsdir)s/gff2gff.py
--method=sanitize
--sanitize-method=genome
--skip-missing
--genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log
| python %(scriptsdir)s/gtf2gff.py
--method=exons
--restrict-source=protein_coding
--log=%(outfile)s.log
> %(outfile)s
"""

P.run()
Expand All @@ -884,14 +885,18 @@ def buildGeneAnnotations(infile, outfile):
output includes the UTR and non-coding genes.
"""
statement = """
gunzip < %(infile)s |\
python %(scriptsdir)s/gtf2gtf.py --method=merge-exons --with-utr --log=%(outfile)s.log |\
python %(scriptsdir)s/gtf2gtf.py --method=set-transcript-to-gene --log=%(outfile)s.log |\
python %(scriptsdir)s/gff2gff.py --skip-missing --method=sanitize=genome --genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log |\
%(scriptsdir)s/gff_sort gene-pos \
> %(outfile)s
gunzip < %(infile)s
| python %(scriptsdir)s/gtf2gtf.py --method=merge-exons
--with-utr --log=%(outfile)s.log
| python %(scriptsdir)s/gtf2gtf.py --method=set-transcript-to-gene
--log=%(outfile)s.log
| python %(scriptsdir)s/gff2gff.py --skip-missing --method=sanitize
--sanitize-method=genome
--genome-file=%(genome_dir)s/%(genome)s
--log=%(outfile)s.log
| %(scriptsdir)s/gff_sort gene-pos
> %(outfile)s
"""
queue = "server"
P.run()

###################################################################
Expand Down
7 changes: 2 additions & 5 deletions CGATPipelines/pipeline_windows.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,10 +879,6 @@ def summarizeWindowsReadCounts(infiles, outfile):
> %(outfile)s'''
P.run()

#########################################################################
#########################################################################
#########################################################################


@follows(mkdir("dump.dir"))
@transform("design*.tsv",
Expand Down Expand Up @@ -1804,7 +1800,8 @@ def buildTranscriptProfiles(infiles, outfile):
# input_files[0] + '.bed.gz') )
statement = '''zcat %(gtffile)s
| python %(scriptsdir)s/gtf2gtf.py
--filter=representative-transcript
--method=filter
--filter-method=representative-transcript
--log=%(outfile)s.log
| python %(scriptsdir)s/bam2geneprofile.py
--output-filename-pattern="%(outfile)s.%%s"
Expand Down
21 changes: 13 additions & 8 deletions doc/CGATRecipes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,21 @@
Using CGAT tools - Recipes
===========================

In this section you will find representative examples for using tools developed in
CGAT. The recipes presented aim to provide intuitive real-life examples of CGAT script use for
the analysis of genomic datasets. If there is a tool in the CGAT collection for which you
would like a use case then please post a request on the `CGAT users group`_ website.
In this section you will find representative examples for using tools
developed in CGAT. The recipes presented aim to provide intuitive
real-life examples of CGAT script use for the analysis of genomic
datasets. If there is a tool in the CGAT collection for which you
would like a use case then please post a request on the `CGAT users
group`_ website.

.. toctree::
:maxdepth: 2
The recipes are implemented as ipython_ notebooks.

recipes/gat
recipes/metagenome_contigs_kmers
..
.. toctree::
:maxdepth: 2
recipes/gat
recipes/metagenome_contigs_kmers

:download:`Recipe02 Plotting read-density in Intervals <recipes/Recipe02-BAMCoverageHistograms.html>`
Illustrate how to plot read density in a large number of
Expand Down
2 changes: 1 addition & 1 deletion doc/Pipelines.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.. _Pipelines:
.. _pipelines:

==============
CGAT Pipelines
Expand Down
47 changes: 24 additions & 23 deletions doc/cgat.rst
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
.. _cgat:

============================================
CGAT - Computational Genomics Analysis Tools
============================================
======================================================
CGAT |version| - Computational Genomics Analysis Tools
======================================================

CGAT is a collection of tools for the computational genomicist written
in the python language. The tools have been developed and accumulated in various
genome projects (`Heger & Ponting, 2007`_, `Warren et al., 2008`_) and NGS projects
(`Ramagopalan et al., 2010`_). The tools are continuously being developed
as part of the `CGAT Training programme`_.

The tools work from the command line, but can readily be installed
within frameworks such as `Galaxy`_.

Please note that the tools are part of a larger code base also
including genomics and NGS pipelines. More information about those
is :ref:`here <contents>`.
in the python language. The tools have been developed and accumulated
in various genome projects (`Heger & Ponting, 2007`_, `Warren et al.,
2008`_) and NGS projects (`Ramagopalan et al., 2010`_). The tools are
continuously being developed as part of the `CGAT Training
programme`_. The tools work from the command line, but can readily be
installed within frameworks such as `Galaxy`_.

The documentation below covers the scripts published in
`Bioinformatics <http://www.ncbi.nlm.nih.gov/pubmed/24395753>`_.
For the complete documentation that also includes the NGS pipelines
please go to :ref:`contents`.

Detailed instructions on installation, on usage and a tool reference
are below, followed by a :ref:`quickstart` guide.
Expand Down Expand Up @@ -47,23 +47,23 @@ dependencies and troubleshooting.

CGAT tools are run from the unix command line. Let's assume we have
the results of the binding locations of a ChIP-Seq experiment
(chipseq.hg19.bed) in bed format and we want to know, how many
(:file:`chipseq.hg19.bed`) in bed format and we want to know how many
binding locations are intronic, intergenic and within exons.

Thus, we need to create a set of genomic annotations denoting
intronic, intergenic regions, etc. with respect to a reference gene set.
Here, we download the GENCODE geneset (Harrow et al., 2012) in GTF
format from ENSEMBL (Flicek et al., 2013).
intronic, intergenic regions, etc. with respect to a reference gene
set. Here, we download the GENCODE geneset (Harrow et al., 2012) in
GTF format from ENSEMBL (Flicek et al., 2013).

The following unix statement downloads the ENSEMBL gene set containing
over-lapping transcripts, and outputs a set of non-overlapping genomic
annotations in gff format (:file:`annotations.gff`) by piping the data
through various GAT tools::
through various CGAT tools::
wget -qO- ftp://ftp.ensembl.org/pub/release-72/gtf/homo_sapiens/Homo_sapiens.GRCh37.72.gtf.gz
| gunzip
| awk '$2 == "protein_coding"'
| cgat gff2ff --genome-file=hg19 --method=sanitize=ucsc --skip-missing
| cgat gff2ff --genome-file=hg19 --method=sanitize --skip-missing
| cgat gtf2gtf --method=sort --sort-order=gene
| cgat gtf2gtf --method=merge-exons --with-utr
| cgat gtf2gtf --method=filter --filter-method=longest-gene
Expand All @@ -73,11 +73,12 @@ through various GAT tools::
> annotations.gff.gz

.. note::

The statements above need an indexed genome. To create such an
indexed genome for hg19, type the following:
indexed genome for hg19, type the following::
wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz
| index_fasta.py hg19 - > hg19.log
wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz
| index_fasta.py hg19 - > hg19.log
CGAT tools can be chained into a single work flow using unix
pipes. The above sequence of commands in turn (1) reconciles UCSC and
Expand Down
4 changes: 2 additions & 2 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@

# General information about the project.
project = u'CGAT'
copyright = u'2011, Andreas Heger'
copyright = u'2011, 2012, 2013, 2014 Andreas Heger'


# Included at the end of each rst file
Expand Down Expand Up @@ -106,7 +106,7 @@
# built documents.
#
# The short X.Y version.
version = version
version = version.__version__
# The full version, including alpha/beta/rc tags.
release = version

Expand Down
39 changes: 39 additions & 0 deletions doc/developing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,45 @@ as ENCODE.
Attempt to add a plot to the end of a recipe, using
R commands to create the plot within the notebook.

Writing pipelines
=================

=========================
CGAT Pipelines
=========================

Best practice for CGAT pipelines:

1. All non-trivial code should be extracted to modules or scripts.

2. Modules should not access PARAMS dictionary directly, but
parameters should be passed to the function.

3. For important processing steps where different external tools could
potentially be employed, the design of the module classes should be
carefully considered to ensure consistent input and output file
formats for different tools. PipelineMapping provides a good
example for this.

4. All production pipelines should include tests for consistency which
can be run automatically.

5. Where appropriate pipelines should include a small test dataset
with published results for comparison. This dataset can be run on
each pipeline run and included in the pipeline report where it can
be used as a pipeline control.

6. Periodic code review meetings where interested parties can agree on
major changes to production pipelines and associated modules – to
be arranged as required.

7. The best way to manage pipeline improvements is by individuals
using pipelines taking responsibility for incremental
improvement. As best practice fellows should announce plans to
modify particular pipelines and modules on the CGAT members list to
avoid duplication of effort. Fellows should log the changes that
they make in a change log and document both modules and pipelines
in detail.



Expand Down
Loading

0 comments on commit 5004a6a

Please sign in to comment.