Fix formatting

PNNL-CompBio · Feb 2, 2024 · aa1c11d
1 parent 9c47a7a
commit aa1c11d
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 40 deletions.
diff --git a/snekmer/rules/motif.smk b/snekmer/rules/motif.smk
@@ -169,7 +169,7 @@ rule preselect:
         data=temp(join(out_dir, "motif", "preselection", "{nb}.csv.gz")),
         kmers=join(out_dir, "motif", "kmers", "{nb}.csv.gz"),
         vecs=join(out_dir, "motif", "sequences", "{nb}.csv.gz"),
-        model=join(out_dir, "motif", "preselection", "{nb}.model")
+        model=join(out_dir, "motif", "preselection", "{nb}.model"),
     script:
         resource_filename("snekmer", join("scripts/motif_preselect.py"))
 

diff --git a/snekmer/scripts/motif_motif.py b/snekmer/scripts/motif_motif.py
@@ -8,13 +8,15 @@
 # Imports
 # ---------------------------------------------------------
 import pickle
+
 # from datetime import datetime
 
 import snekmer as skm
 import pandas as pd
 import numpy as np
 import gzip
 import gc
+
 # import glob
 # from typing import Any, Dict, List, Optional
 # from sklearn.base import BaseEstimator, ClassifierMixin
@@ -40,29 +42,29 @@
 
 # with open(snakemake.log[0], "a") as f:
 #     f.write(f"start time:\t{{start_time}}\n")
-    
+
 # load input data
 with open(snakemake.input.matrix, "rb") as f:
     data = pickle.load(f)
-    
+
 # with open(snakemake.input.kmers, "rb") as f:
 #     kmers = f.readlines()
-    
+
 with gzip.open(snakemake.input.weights, "rb") as f:
     weights = pd.read_csv(f)
-    
+
 with gzip.open(snakemake.input.scores, "rb") as f:
     scores = pd.read_csv(f)
-    scores = scores.astype('float64')
+    scores = scores.astype("float64")
     scores = scores.to_numpy()
-    
+
 
 with gzip.open(snakemake.input.kmers, "rb") as f:
     kmers = pd.read_csv(f)
-    
+
 # set category label name (e.g. "family")
 label = config["score"]["lname"] if str(config["score"]["lname"]) != "None" else "label"
-    
+
 # with open(snakemake.input.model, "rb") as f:
 #     model = pickle.load(f)
 
@@ -73,13 +75,13 @@
 # svm = LinearSVC(class_weight="balanced", random_state=None, max_iter=1000000)
 # vecs=np.array(data["sequence_vector"].astype(str).str.strip('[]').str.split(",").tolist(), dtype='float')
 # svm.fit(vecs, data[label])
-    
+
 # prevent kmer NA being read as np.nan
 if config["k"] == 2:
     kmers = kmers.fillna("NA")
-    scores=scores.fillna("NA")
+    scores = scores.fillna("NA")
 
-# kmers = weights['kmer'].values    
+# kmers = weights['kmer'].values
 # scores = weights['sample'].values
 family = skm.utils.get_family(
     skm.utils.split_file_ext(snakemake.input.weights)[0],
@@ -97,9 +99,7 @@
 #     scores.iloc[i] = scores.iloc[i]/unit_score
 
 # set number of permutations to test
-n_iter = (
-    config["motif"]["n"]  
-    )
+n_iter = config["motif"]["n"]
 
 
 # get kmers for this particular set of sequences
@@ -117,7 +117,7 @@
 else:
     alphabet_name = str(config["alphabet"]).capitalize()
 
-  
+
 # run permutations and score each
 
 score_matrix = kmers.rename(columns={"0": "kmer"})
@@ -128,22 +128,29 @@
     with gzip.open(file) as f:
         perm_scores = pd.DataFrame.to_numpy(pd.read_csv(f))
     score_array = np.hstack((score_array, perm_scores))
-    
+
 else:
     score_array = np.delete(score_array, 0, 1)
-    score_matrix=score_matrix.merge(
+    score_matrix = score_matrix.merge(
         pd.DataFrame(score_array), left_index=True, right_index=True
     )
-    
-    
+
+
 scores = np.ravel(scores)
 output_matrix = motif.p_values(score_matrix, scores, n_iter)
-output_matrix = output_matrix.astype({'kmer': 'str', 'real score': 'float32', 'false positives': 'int32', 'n': 'int32', 'p': 'float32'})
-output_matrix.sort_values(by=['p', 'real score'], ascending=[True, False], inplace=True)
-
+output_matrix = output_matrix.astype(
+    {
+        "kmer": "str",
+        "real score": "float32",
+        "false positives": "int32",
+        "n": "int32",
+        "p": "float32",
+    }
+)
+output_matrix.sort_values(by=["p", "real score"], ascending=[True, False], inplace=True)
+
 # save output
 score_matrix.to_csv(snakemake.output.data, index=False, compression="gzip")
 output_matrix.to_csv(snakemake.output.p_values, index=False, compression="gzip")
-
 # record script endtime
-#skm.utils.log_runtime(snakemake.log[0], start_time)
+# skm.utils.log_runtime(snakemake.log[0], start_time)
diff --git a/snekmer/scripts/motif_preselect.py b/snekmer/scripts/motif_preselect.py
@@ -26,27 +26,27 @@
 with open(snakemake.input.matrix, "rb") as f:
     data = pickle.load(f)
 
-data.astype({'label': 'category'})
-data.astype({'background': 'boolean'})
+data.astype({"label": "category"})
+data.astype({"background": "boolean"})
 
 with gzip.open(snakemake.input.weights, "rb") as f:
     weights = pd.read_csv(f)
-    
+
 # prevent kmer NA being read as np.nan
 if config["k"] == 2:
     weights["kmer"] = weights["kmer"].fillna("NA")
 
 
-kmers = weights['kmer'].values    
-scores = weights['sample'].values
+kmers = weights["kmer"].values
+scores = weights["sample"].values
 family = skm.utils.get_family(
     skm.utils.split_file_ext(snakemake.input.weights)[0],
     regex=config["input_file_regex"],
 )
 
 del weights
 gc.collect()
-    
+
 # set category label name (e.g. "family")
 label = config["score"]["lname"] if str(config["score"]["lname"]) != "None" else "label"
 
@@ -60,7 +60,10 @@
 score_index = np.searchsorted(unique_labels, family)
 
 svm = LinearSVC(class_weight="balanced", random_state=None, max_iter=1000000)
-vecs=np.array(data["sequence_vector"].astype(str).str.strip('[]').str.split(",").tolist(), dtype='float')
+vecs = np.array(
+    data["sequence_vector"].astype(str).str.strip("[]").str.split(",").tolist(),
+    dtype="float",
+)
 svm.fit(vecs, data[label])
 
 sequences = pd.DataFrame(vecs)
@@ -70,36 +73,35 @@
 scores = pd.DataFrame(svm.coef_)
 unit_score = max(scores.iloc[score_index].values)
 for i in range(len(scores.iloc[score_index].values)):
-    scores.iloc[score_index, i] = scores.iloc[score_index, i]/unit_score
+    scores.iloc[score_index, i] = scores.iloc[score_index, i] / unit_score
 
 kmers = pd.Series(kmers)
-while scores.iloc[score_index].lt(-0.2).sum()>0:
+while scores.iloc[score_index].lt(-0.2).sum() > 0:
     # temp_scores = scores
     features = list()
     for i in range(len(scores.iloc[score_index].values)):
-        if scores.iloc[score_index, i]<-0.1:
+        if scores.iloc[score_index, i] < -0.1:
             features.append(i)
-            
+
     scores.drop(scores.columns[features], axis=1, inplace=True)
     kmers.drop(features, inplace=True)
     kmers.index = np.arange(len(kmers.index))
     sequences.drop(sequences.columns[features], axis=1, inplace=True)
     sequences.columns = np.arange(len(sequences.columns))
     del features
     gc.collect()
-            
+
     vecs = sequences.to_numpy()
     svm.fit(vecs, data[label])
     scores = pd.DataFrame(svm.coef_)
     unit_score = max(scores.iloc[score_index].values)
     for i in range(len(scores.iloc[score_index].values)):
-        scores.iloc[score_index, i] = scores.iloc[score_index, i]/unit_score
-
+        scores.iloc[score_index, i] = scores.iloc[score_index, i] / unit_score
 
 
 del data, vecs
 gc.collect()
-    
+
 # save output
 scores.iloc[score_index].to_csv(snakemake.output.data, index=False, compression="gzip")
 kmers.to_csv(snakemake.output.kmers, index=False, compression="gzip")