Skip to content

Commit

Permalink
Add support for backup cluster file
Browse files Browse the repository at this point in the history
  • Loading branch information
yuanx749 committed Jun 26, 2024
1 parent 0f939af commit f8df5ae
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 2 deletions.
162 changes: 162 additions & 0 deletions docs/examples/examples.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@
" c=0.7,\n",
" d=0,\n",
" sc=1,\n",
" bak=1,\n",
")"
]
},
Expand Down Expand Up @@ -580,6 +581,167 @@
"df_clstr"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read the bak.clstr file. Note that the cluster IDs are different."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>identifier</th>\n",
" <th>cluster</th>\n",
" <th>size</th>\n",
" <th>is_representative</th>\n",
" <th>identity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>00001</td>\n",
" <td>14</td>\n",
" <td>33</td>\n",
" <td>True</td>\n",
" <td>100.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>00002</td>\n",
" <td>12</td>\n",
" <td>34</td>\n",
" <td>True</td>\n",
" <td>100.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>00003</td>\n",
" <td>2</td>\n",
" <td>54</td>\n",
" <td>True</td>\n",
" <td>100.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>00004</td>\n",
" <td>3</td>\n",
" <td>49</td>\n",
" <td>True</td>\n",
" <td>100.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>00005</td>\n",
" <td>13</td>\n",
" <td>34</td>\n",
" <td>True</td>\n",
" <td>100.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>00046</td>\n",
" <td>8</td>\n",
" <td>38</td>\n",
" <td>False</td>\n",
" <td>94.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>00047</td>\n",
" <td>8</td>\n",
" <td>38</td>\n",
" <td>False</td>\n",
" <td>100.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>00048</td>\n",
" <td>8</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>100.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>00049</td>\n",
" <td>15</td>\n",
" <td>24</td>\n",
" <td>False</td>\n",
" <td>95.83</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>00050</td>\n",
" <td>15</td>\n",
" <td>27</td>\n",
" <td>True</td>\n",
" <td>100.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>49 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" identifier cluster size is_representative identity\n",
"0 00001 14 33 True 100.00\n",
"1 00002 12 34 True 100.00\n",
"2 00003 2 54 True 100.00\n",
"3 00004 3 49 True 100.00\n",
"4 00005 13 34 True 100.00\n",
".. ... ... ... ... ...\n",
"44 00046 8 38 False 94.74\n",
"45 00047 8 38 False 100.00\n",
"46 00048 8 42 True 100.00\n",
"47 00049 15 24 False 95.83\n",
"48 00050 15 27 True 100.00\n",
"\n",
"[49 rows x 5 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_bak_clstr = read_clstr(\"out.bak.clstr\")\n",
"df_bak_clstr"
]
},
{
"attachments": {},
"cell_type": "markdown",
Expand Down
2 changes: 1 addition & 1 deletion pycdhit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from ._commands import * # noqa: F403
from ._io import * # noqa: F403

VERSION = "0.13.0"
VERSION = "1.0.0"

__all__ = [ # noqa: F405
"CommandBase",
Expand Down
6 changes: 5 additions & 1 deletion pycdhit/_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,17 @@ def read_clstr(file: FilePath) -> pd.DataFrame:
# refer to PrintInfo
identifier, cluster, size, is_representative, identity = [], [], [], [], []
coverage, strand = [], [] # distance is not used
with open(file) as f:
bak = f.read(1) != ">"
with open(file) as f:
for line in f:
if line[0] == ">":
idx = int(re.search(r">Cluster (\d+)", line).group(1))
continue
cluster.append(idx)
line = line.split()
if bak:
idx = int(line[0])
cluster.append(idx)
size.append(int(re.search(r"(\d+)(aa|nt),", line[1]).group(1)))
identifier.append(re.search(r">(.+)\.{3}", line[2]).group(1))
if line[3] == "*":
Expand Down

0 comments on commit f8df5ae

Please sign in to comment.