Add support for backup cluster file

yuanx749 · Jun 26, 2024 · f8df5ae · f8df5ae
1 parent 0f939af
commit f8df5ae
Show file tree

Hide file tree

Showing 3 changed files with 168 additions and 2 deletions.
diff --git a/docs/examples/examples.ipynb b/docs/examples/examples.ipynb
@@ -415,6 +415,7 @@
     "    c=0.7,\n",
     "    d=0,\n",
     "    sc=1,\n",
+    "    bak=1,\n",
     ")"
    ]
   },
@@ -580,6 +581,167 @@
     "df_clstr"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Read the backup cluster file (out.bak.clstr). Note that its cluster IDs differ from those in df_clstr above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>identifier</th>\n",
+       "      <th>cluster</th>\n",
+       "      <th>size</th>\n",
+       "      <th>is_representative</th>\n",
+       "      <th>identity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>00001</td>\n",
+       "      <td>14</td>\n",
+       "      <td>33</td>\n",
+       "      <td>True</td>\n",
+       "      <td>100.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>00002</td>\n",
+       "      <td>12</td>\n",
+       "      <td>34</td>\n",
+       "      <td>True</td>\n",
+       "      <td>100.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>00003</td>\n",
+       "      <td>2</td>\n",
+       "      <td>54</td>\n",
+       "      <td>True</td>\n",
+       "      <td>100.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>00004</td>\n",
+       "      <td>3</td>\n",
+       "      <td>49</td>\n",
+       "      <td>True</td>\n",
+       "      <td>100.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>00005</td>\n",
+       "      <td>13</td>\n",
+       "      <td>34</td>\n",
+       "      <td>True</td>\n",
+       "      <td>100.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>44</th>\n",
+       "      <td>00046</td>\n",
+       "      <td>8</td>\n",
+       "      <td>38</td>\n",
+       "      <td>False</td>\n",
+       "      <td>94.74</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45</th>\n",
+       "      <td>00047</td>\n",
+       "      <td>8</td>\n",
+       "      <td>38</td>\n",
+       "      <td>False</td>\n",
+       "      <td>100.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>46</th>\n",
+       "      <td>00048</td>\n",
+       "      <td>8</td>\n",
+       "      <td>42</td>\n",
+       "      <td>True</td>\n",
+       "      <td>100.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>47</th>\n",
+       "      <td>00049</td>\n",
+       "      <td>15</td>\n",
+       "      <td>24</td>\n",
+       "      <td>False</td>\n",
+       "      <td>95.83</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>48</th>\n",
+       "      <td>00050</td>\n",
+       "      <td>15</td>\n",
+       "      <td>27</td>\n",
+       "      <td>True</td>\n",
+       "      <td>100.00</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>49 rows × 5 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   identifier  cluster  size  is_representative  identity\n",
+       "0       00001       14    33               True    100.00\n",
+       "1       00002       12    34               True    100.00\n",
+       "2       00003        2    54               True    100.00\n",
+       "3       00004        3    49               True    100.00\n",
+       "4       00005       13    34               True    100.00\n",
+       "..        ...      ...   ...                ...       ...\n",
+       "44      00046        8    38              False     94.74\n",
+       "45      00047        8    38              False    100.00\n",
+       "46      00048        8    42               True    100.00\n",
+       "47      00049       15    24              False     95.83\n",
+       "48      00050       15    27               True    100.00\n",
+       "\n",
+       "[49 rows x 5 columns]"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_bak_clstr = read_clstr(\"out.bak.clstr\")\n",
+    "df_bak_clstr"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",

diff --git a/pycdhit/__init__.py b/pycdhit/__init__.py
@@ -4,7 +4,7 @@
 from ._commands import *  # noqa: F403
 from ._io import *  # noqa: F403
 
-VERSION = "0.13.0"
+VERSION = "1.0.0"
 
 __all__ = [  # noqa: F405
     "CommandBase",

diff --git a/pycdhit/_io.py b/pycdhit/_io.py
@@ -70,13 +70,17 @@ def read_clstr(file: FilePath) -> pd.DataFrame:
     # refer to PrintInfo
     identifier, cluster, size, is_representative, identity = [], [], [], [], []
     coverage, strand = [], []  # distance is not used
+    with open(file) as f:
+        bak = f.read(1) != ">"
     with open(file) as f:
         for line in f:
             if line[0] == ">":
                 idx = int(re.search(r">Cluster (\d+)", line).group(1))
                 continue
-            cluster.append(idx)
             line = line.split()
+            if bak:
+                idx = int(line[0])
+            cluster.append(idx)
             size.append(int(re.search(r"(\d+)(aa|nt),", line[1]).group(1)))
             identifier.append(re.search(r">(.+)\.{3}", line[2]).group(1))
             if line[3] == "*":