Merge pull request #342 from CGATOxford/Py3-migration

Merge Python 3 code into master
CGATOxford · Jul 7, 2017 · 754f0fa · 754f0fa
2 parents 19292f6 + 56ea408
commit 754f0fa
Show file tree

Hide file tree

Showing 12 changed files with 42 additions and 54 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -20,7 +20,7 @@ env:
     - TEST_ALL=1
 
 python:
-  - "2.7"
+#  - "2.7"
   - "3.5"
 
 # Using xvfb to Run Tests That Require a GUI

diff --git a/CGAT/Expression.py b/CGAT/Expression.py
@@ -2211,7 +2211,9 @@ def runEdgeR(outfile,
                                                             ref_regex)
 
     # output heatmap plot
-    R.png('%(outfile_prefix)sheatmap.png' % locals())
+    fn = '%(outfile_prefix)sheatmap.png' % locals()
+    E.info("outputing heatmap to {}".format(fn))
+    R.png(fn)
     plotCorrelationHeatmap()
     r['dev.off']()
 

diff --git a/CGAT/FastaIterator.py b/CGAT/FastaIterator.py
@@ -92,7 +92,6 @@ def iterate(infile, comment="#", fold=False):
     ------
     FastaRecord
     '''
-
     h = infile.readline()[:-1]
 
     if not h:

diff --git a/CGAT/IOTools.py b/CGAT/IOTools.py
@@ -68,7 +68,7 @@ def getFirstLine(filename, nlines=1):
     return line
 
 
-def getLastLine(filename, nlines=1, read_size=1024):
+def getLastLine(filename, nlines=1, read_size=1024, encoding="utf-8"):
     """return the last line of a file.
 
     This method works by working back in blocks of `read_size` until
@@ -90,8 +90,8 @@ def getLastLine(filename, nlines=1, read_size=1024):
 
     """
 
-    # U is to open it with Universal newline support
-    f = open(filename, 'rU')
+    # py3 requires binary mode for negative seeks
+    f = open(filename, 'rb')
     offset = read_size
     f.seek(0, 2)
     file_size = f.tell()
@@ -102,10 +102,8 @@ def getLastLine(filename, nlines=1, read_size=1024):
             offset = file_size
         f.seek(-1 * offset, 2)
         read_str = f.read(offset)
-        # Remove newline at the end
-        if read_str[offset - 1] == '\n':
-            read_str = read_str[:-1]
-        lines = read_str.split('\n')
+        read_str = read_str.decode(encoding)
+        lines = read_str.strip().splitlines()
         if len(lines) >= nlines + 1:
             return "\n".join(lines[-nlines:])
         if offset == file_size:   # reached the beginning
@@ -189,19 +187,21 @@ def touchFile(filename, times=None):
     as empty 'gzip' files, i.e., with a header.
     '''
     existed = os.path.exists(filename)
-    fhandle = open(filename, 'a')
 
     if filename.endswith(".gz") and not existed:
         # this will automatically add a gzip header
+        fhandle = open(filename, 'a+b')
         fhandle = gzip.GzipFile(filename, fileobj=fhandle)
+    else:
+        fhandle = open(filename, 'a')
 
     try:
         os.utime(filename, times)
     finally:
         fhandle.close()
 
 
-def openFile(filename, mode="r", create_dir=False):
+def openFile(filename, mode="r", create_dir=False, encoding="utf-8"):
     '''open file called *filename* with mode *mode*.
 
     gzip - compressed files are recognized by the
@@ -235,16 +235,15 @@ def openFile(filename, mode="r", create_dir=False):
     if ext.lower() in (".gz", ".z"):
         if sys.version_info.major >= 3:
             if mode == "r":
-                return gzip.open(filename, 'rt', encoding="ascii")
+                return gzip.open(filename, 'rt', encoding=encoding)
             elif mode == "w":
-                return gzip.open(filename, 'wt', encoding="ascii")
-            else:
-                raise NotImplementedError(
-                    "mode '{}' not implemented".format(mode))
+                return gzip.open(filename, 'wt', encoding=encoding)
+            elif mode == "a":
+                return gzip.open(filename, 'wt', encoding=encoding)
         else:
             return gzip.open(filename, mode)
     else:
-        return open(filename, mode)
+        return open(filename, mode, encoding=encoding)
 
 
 def force_str(iterator, encoding="ascii"):
@@ -812,14 +811,6 @@ def __init__(self,
 
         self.mFiles = {}
         self.mOutputPattern = output_pattern
-
-        self.open = open
-
-        if output_pattern:
-            _, ext = os.path.splitext(output_pattern)
-            if ext.lower() in (".gz", ".z"):
-                self.open = gzip.open
-
         self.mCounts = collections.defaultdict(int)
         self.mHeader = header
         if force and output_pattern:
@@ -881,7 +872,7 @@ def openFile(self, filename, mode="w"):
             if dirname and not os.path.exists(dirname):
                 os.makedirs(dirname)
 
-        return self.open(filename, mode)
+        return openFile(filename, mode)
 
     def write(self, identifier, line):
         """write `line` to file specified by `identifier`"""
@@ -894,7 +885,7 @@ def write(self, identifier, line):
                     f.close()
                 self.mFiles = {}
 
-            self.mFiles[filename] = self.openFile(filename, "a")
+            self.mFiles[filename] = openFile(filename, "a")
             if self.mHeader:
                 self.mFiles[filename].write(self.mHeader)
 
@@ -947,7 +938,7 @@ def close(self):
             raise IOError("write on closed FilePool in close()")
 
         for filename, data in self.data.items():
-            f = self.openFile(filename, "a")
+            f = openFile(filename, "a")
             if self.mHeader:
                 f.write(self.mHeader)
             f.write("".join(data))

diff --git a/CGAT/Masker.py b/CGAT/Masker.py
@@ -141,13 +141,11 @@ def maskSequence(self, peptide_sequence):
     def maskSequences(self, sequences):
         '''mask a collection of sequences.'''
 
-        outfile, infile = tempfile.mkstemp()
-
-        for x, s in enumerate(sequences):
-            os.write(outfile, ">%i\n%s\n" % (x, s))
-
-        os.close(outfile)
+        with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as outf:
+            for x, s in enumerate(sequences):
+                outf.write(">%i\n%s\n" % (x, s))
 
+        infile = outf.name
         statement = self.mCommand % locals()
 
         E.debug("statement: %s" % statement)
@@ -164,9 +162,9 @@ def maskSequences(self, sequences):
             raise RuntimeError(
                 "Error in running %s \n%s\nTemporary directory" %
                 (statement, err))
-
+        
         result = [
-            x.sequence for x in FastaIterator.iterate(StringIO(out))]
+            x.sequence for x in FastaIterator.iterate(StringIO(out.decode()))]
 
         os.remove(infile)
 

diff --git a/CGAT/scripts/_bam2stats.pyx b/CGAT/scripts/_bam2stats.pyx
@@ -145,6 +145,9 @@ def count(AlignmentFile samfile,
 
         if count_fastq:
             read_name = pysam_bam_get_qname(read._delegate)
+            if read_name == NULL:
+                raise ValueError("file does not contain read names, can't count using fastq")
+
             # terminate string at first space to
             # truncate read names containing more than
             # just the id

diff --git a/CGAT/scripts/bed2table.py b/CGAT/scripts/bed2table.py
@@ -239,7 +239,7 @@ def _count(self, bed, bamfiles, offsets):
             # if offsets are given, shift tags.
             for samfile, offset in zip(bamfiles, offsets):
 
-                shift = offset / 2
+                shift = offset // 2
                 # for peak counting I follow the MACS protocoll,
                 # see the function def __tags_call_peak in PeakDetect.py
                 # In words

diff --git a/CGAT/scripts/cgat_fasta2cDNA.py b/CGAT/scripts/cgat_fasta2cDNA.py
@@ -1,18 +1,12 @@
 '''
-cgat_fasta2cDNA.py - template for CGAT scripts
-====================================================
+cgat_fasta2cDNA.py - converting multi-fasta of exon features into a multi-fasta of spliced cDNAs/RNAs
+======================================================================================================
 
-:Author:
-:Release: $Id$
-:Date: |today|
 :Tags: Python
 
 Purpose
 -------
 
-.. Mike transcript processing - converting multi-fasta of exon
-features into a multi-fasta of spliced cDNAs/RNAs
-
 Usage
 -----
 
@@ -45,15 +39,15 @@ def makeSplicedFasta(infile):
     '''
 
     fasta_dict = {}
-    with IOTools.openFile(infile, "rb") as fafile:
+    with IOTools.openFile(infile) as fafile:
         for line in fafile.readlines():
             if line[0] == '>':
                 header = line.rstrip("\n")
                 fasta_dict[header] = ''
             else:
                 fasta_dict[header] += line.rstrip("\n")
 
-    for key, value in fasta_dict.items():
+    for key, value in sorted(fasta_dict.items()):
         yield "%s\n%s\n" % (key, value)
 
 

diff --git a/CGAT/scripts/csv_select.py b/CGAT/scripts/csv_select.py
@@ -80,8 +80,7 @@ def main(argv=None):
         reader = csv.DictReader(CSV.CommentStripper(sys.stdin),
                                 dialect=options.csv_dialect)
 
-    exec("f = lambda r: %s" % statement, locals())
-
+    exec("f = lambda r: %s" % statement, globals())
     counter = E.Counter()
     writer = csv.DictWriter(options.stdout,
                             reader.fieldnames,

diff --git a/CGAT/scripts/extract_stats.py b/CGAT/scripts/extract_stats.py
@@ -95,9 +95,11 @@ def cleanStatsTable(stats_file):
     Take in a table containing aggregated stats
     and clean by removing duplicate columns
     '''
-
+    # AH: the mangle_dupe_cols=False argument was removed from the read_table call below,
+    # because pandas raises "ValueError: Setting mangle_dupe_cols=False is not supported yet"
     _df = pd.read_table(stats_file, sep="\t", header=0,
-                        index_col=None, mangle_dupe_cols=False)
+                        index_col=None)
+
     # drop duplicates is case sensitive, convert all to
     # same case - SQL is not case sensitive so will throw
     # a hissy fit for same column names in different cases

diff --git a/CGAT/scripts/psl2psl.py b/CGAT/scripts/psl2psl.py
@@ -194,7 +194,7 @@ def iterator_psl_intervals(options):
             except KeyError:
                 tx = []
 
-        if options.stdlog >= 2:
+        if options.loglevel >= 2:
             options.stdlog.write(
                 "###################################################\n")
             options.stdlog.write("# testing %s\n" % (str(match)))

diff --git a/CGAT/scripts/runExpression.py b/CGAT/scripts/runExpression.py
@@ -284,7 +284,7 @@ def main(argv=None):
     (options, args) = E.Start(parser, argv=argv, add_output_options=True)
 
     if options.input_filename_tags == "-":
-        fh = tempfile.NamedTemporaryFile(delete=False)
+        fh = tempfile.NamedTemporaryFile(delete=False, mode="w+t")
         fh.write("".join([x for x in options.stdin]))
         fh.close()
         options.input_filename_tags = fh.name