Skip to content

Commit bb5effa

Browse files
authored
Merge pull request #5 from mmreich/cls-file-support
Added CLS object to Python library
2 parents b200512 + 90ce06e commit bb5effa

File tree

1 file changed

+45
-0
lines changed

1 file changed

+45
-0
lines changed

gp/data.py

Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,51 @@ def GCT(gct_obj):
3737
return df
3838

3939

40+
class CLS:
    """
    Parsed contents of a CLS (class label) file.

    After construction the instance exposes:

    num_samples        -- sample count declared on the header line
    num_classes        -- class count declared on the header line
    class_names        -- tokens of the second line, minus its first token
    class_assignments  -- integers parsed from the third line
    """

    def __init__(self, cls_obj):
        """
        Create a CLS object with the contents of a CLS file

        For more information on the CLS format see:
        http://software.broadinstitute.org/cancer/software/genepattern/file-formats-guide

        :cls_obj: The CLS file. Accepts a file-like object, a file path, a URL to the file
                  or a string containing the raw data.

        Raises ValueError when the file has fewer than three lines, when the
        header or class assignment line is malformed, or when the declared
        counts disagree with the class names / assignments actually provided.
        """

        # Raw strings keep the regex escapes (\s, \d) from being treated as
        # invalid string escapes, which newer Python versions warn about.
        # Header: "<num_samples> <num_classes> 1"
        hdr_line_re = re.compile(r"^(?P<samples>[0-9]+)\s+(?P<classes>[0-9]+)\s+1\s*$")
        # Assignments: one or more whitespace-separated ASCII integers
        assign_line_re = re.compile(r"^\s*(?:\d+\s+)*\d+\s*$", re.ASCII)

        # Handle all the various initialization types and get an IO object
        cls_io = _obtain_io(cls_obj)

        # Read the file as an array of lines
        raw_lines = cls_io.readlines()

        # Convert byte strings to unicode strings
        raw_lines = _bytes_to_str(raw_lines)

        # An empty input would otherwise surface as a bare IndexError below;
        # report it as a format problem like every other validation failure.
        if len(raw_lines) < 1:
            raise ValueError("Bad format in {0}: file is empty".format(cls_obj))

        # Validate cls file format and contents
        hdr_line_match = hdr_line_re.match(raw_lines[0])
        if not hdr_line_match:
            raise ValueError("Bad format in {0} for header line: {1}".format(cls_obj, raw_lines[0]))

        self.num_samples = int(hdr_line_match.group("samples"))
        self.num_classes = int(hdr_line_match.group("classes"))

        if len(raw_lines) < 2:
            raise ValueError("Bad format in {0}: missing class names line".format(cls_obj))

        # The first token of the names line is dropped
        # (presumably the leading '#' marker of the CLS format -- confirm against the format guide).
        self.class_names = raw_lines[1].split()[1:]
        if len(self.class_names) != self.num_classes:
            raise ValueError("Mismatch in {0} between number of class names declared ({1}) and number provided ({2})".format(cls_obj, self.num_classes, len(self.class_names)))

        if len(raw_lines) < 3:
            raise ValueError("Bad format in {0}: missing class assignment line".format(cls_obj))

        assign_line_match = assign_line_re.match(raw_lines[2])
        if not assign_line_match:
            raise ValueError("Bad format in {0} for class assignment line: {1}".format(cls_obj, raw_lines[2]))

        self.class_assignments = [int(i) for i in raw_lines[2].split()]
        if self.num_samples != len(self.class_assignments):
            raise ValueError("Mismatch in {0} between number of samples declared ({1}) and number of class assignments provided ({2})".format(cls_obj, self.num_samples, len(self.class_assignments)))
83+
84+
4085
def ODF(odf_obj):
4186
"""
4287
Create a Dataframe with the contents of the ODF file

0 commit comments

Comments
 (0)