Skip to content

Commit

Permalink
Fix formatting
Browse files (browse the repository at this point in the history)
  • Loading branch information
tnitka committed Feb 2, 2024
1 parent 9c47a7a commit aa1c11d
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 40 deletions.
2 changes: 1 addition & 1 deletion snekmer/rules/motif.smk
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ rule preselect:
data=temp(join(out_dir, "motif", "preselection", "{nb}.csv.gz")),
kmers=join(out_dir, "motif", "kmers", "{nb}.csv.gz"),
vecs=join(out_dir, "motif", "sequences", "{nb}.csv.gz"),
model=join(out_dir, "motif", "preselection", "{nb}.model")
model=join(out_dir, "motif", "preselection", "{nb}.model"),
script:
resource_filename("snekmer", join("scripts/motif_preselect.py"))

Expand Down
55 changes: 31 additions & 24 deletions snekmer/scripts/motif_motif.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
# Imports
# ---------------------------------------------------------
import pickle

# from datetime import datetime

import snekmer as skm
import pandas as pd
import numpy as np
import gzip
import gc

# import glob
# from typing import Any, Dict, List, Optional
# from sklearn.base import BaseEstimator, ClassifierMixin
Expand All @@ -40,29 +42,29 @@

# with open(snakemake.log[0], "a") as f:
# f.write(f"start time:\t{{start_time}}\n")

# load input data
with open(snakemake.input.matrix, "rb") as f:
data = pickle.load(f)

# with open(snakemake.input.kmers, "rb") as f:
# kmers = f.readlines()

with gzip.open(snakemake.input.weights, "rb") as f:
weights = pd.read_csv(f)

with gzip.open(snakemake.input.scores, "rb") as f:
scores = pd.read_csv(f)
scores = scores.astype('float64')
scores = scores.astype("float64")
scores = scores.to_numpy()


with gzip.open(snakemake.input.kmers, "rb") as f:
kmers = pd.read_csv(f)

# set category label name (e.g. "family")
label = config["score"]["lname"] if str(config["score"]["lname"]) != "None" else "label"

# with open(snakemake.input.model, "rb") as f:
# model = pickle.load(f)

Expand All @@ -73,13 +75,13 @@
# svm = LinearSVC(class_weight="balanced", random_state=None, max_iter=1000000)
# vecs=np.array(data["sequence_vector"].astype(str).str.strip('[]').str.split(",").tolist(), dtype='float')
# svm.fit(vecs, data[label])

# prevent the kmer "NA" from being read as np.nan
if config["k"] == 2:
kmers = kmers.fillna("NA")
scores=scores.fillna("NA")
scores = scores.fillna("NA")

# kmers = weights['kmer'].values
# kmers = weights['kmer'].values
# scores = weights['sample'].values
family = skm.utils.get_family(
skm.utils.split_file_ext(snakemake.input.weights)[0],
Expand All @@ -97,9 +99,7 @@
# scores.iloc[i] = scores.iloc[i]/unit_score

# set number of permutations to test
n_iter = (
config["motif"]["n"]
)
n_iter = config["motif"]["n"]


# get kmers for this particular set of sequences
Expand All @@ -117,7 +117,7 @@
else:
alphabet_name = str(config["alphabet"]).capitalize()


# run permutations and score each

score_matrix = kmers.rename(columns={"0": "kmer"})
Expand All @@ -128,22 +128,29 @@
with gzip.open(file) as f:
perm_scores = pd.DataFrame.to_numpy(pd.read_csv(f))
score_array = np.hstack((score_array, perm_scores))

else:
score_array = np.delete(score_array, 0, 1)
score_matrix=score_matrix.merge(
score_matrix = score_matrix.merge(
pd.DataFrame(score_array), left_index=True, right_index=True
)


scores = np.ravel(scores)
output_matrix = motif.p_values(score_matrix, scores, n_iter)
output_matrix = output_matrix.astype({'kmer': 'str', 'real score': 'float32', 'false positives': 'int32', 'n': 'int32', 'p': 'float32'})
output_matrix.sort_values(by=['p', 'real score'], ascending=[True, False], inplace=True)

output_matrix = output_matrix.astype(
{
"kmer": "str",
"real score": "float32",
"false positives": "int32",
"n": "int32",
"p": "float32",
}
)
output_matrix.sort_values(by=["p", "real score"], ascending=[True, False], inplace=True)

# save output
score_matrix.to_csv(snakemake.output.data, index=False, compression="gzip")
output_matrix.to_csv(snakemake.output.p_values, index=False, compression="gzip")

# record script endtime
#skm.utils.log_runtime(snakemake.log[0], start_time)
# skm.utils.log_runtime(snakemake.log[0], start_time)
32 changes: 17 additions & 15 deletions snekmer/scripts/motif_preselect.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,27 +26,27 @@
with open(snakemake.input.matrix, "rb") as f:
data = pickle.load(f)

data.astype({'label': 'category'})
data.astype({'background': 'boolean'})
data.astype({"label": "category"})
data.astype({"background": "boolean"})

with gzip.open(snakemake.input.weights, "rb") as f:
weights = pd.read_csv(f)

# prevent the kmer "NA" from being read as np.nan
if config["k"] == 2:
weights["kmer"] = weights["kmer"].fillna("NA")


kmers = weights['kmer'].values
scores = weights['sample'].values
kmers = weights["kmer"].values
scores = weights["sample"].values
family = skm.utils.get_family(
skm.utils.split_file_ext(snakemake.input.weights)[0],
regex=config["input_file_regex"],
)

del weights
gc.collect()

# set category label name (e.g. "family")
label = config["score"]["lname"] if str(config["score"]["lname"]) != "None" else "label"

Expand All @@ -60,7 +60,10 @@
score_index = np.searchsorted(unique_labels, family)

svm = LinearSVC(class_weight="balanced", random_state=None, max_iter=1000000)
vecs=np.array(data["sequence_vector"].astype(str).str.strip('[]').str.split(",").tolist(), dtype='float')
vecs = np.array(
data["sequence_vector"].astype(str).str.strip("[]").str.split(",").tolist(),
dtype="float",
)
svm.fit(vecs, data[label])

sequences = pd.DataFrame(vecs)
Expand All @@ -70,36 +73,35 @@
scores = pd.DataFrame(svm.coef_)
unit_score = max(scores.iloc[score_index].values)
for i in range(len(scores.iloc[score_index].values)):
scores.iloc[score_index, i] = scores.iloc[score_index, i]/unit_score
scores.iloc[score_index, i] = scores.iloc[score_index, i] / unit_score

kmers = pd.Series(kmers)
while scores.iloc[score_index].lt(-0.2).sum()>0:
while scores.iloc[score_index].lt(-0.2).sum() > 0:
# temp_scores = scores
features = list()
for i in range(len(scores.iloc[score_index].values)):
if scores.iloc[score_index, i]<-0.1:
if scores.iloc[score_index, i] < -0.1:
features.append(i)

scores.drop(scores.columns[features], axis=1, inplace=True)
kmers.drop(features, inplace=True)
kmers.index = np.arange(len(kmers.index))
sequences.drop(sequences.columns[features], axis=1, inplace=True)
sequences.columns = np.arange(len(sequences.columns))
del features
gc.collect()

vecs = sequences.to_numpy()
svm.fit(vecs, data[label])
scores = pd.DataFrame(svm.coef_)
unit_score = max(scores.iloc[score_index].values)
for i in range(len(scores.iloc[score_index].values)):
scores.iloc[score_index, i] = scores.iloc[score_index, i]/unit_score

scores.iloc[score_index, i] = scores.iloc[score_index, i] / unit_score


del data, vecs
gc.collect()

# save output
scores.iloc[score_index].to_csv(snakemake.output.data, index=False, compression="gzip")
kmers.to_csv(snakemake.output.kmers, index=False, compression="gzip")
Expand Down

0 comments on commit aa1c11d

Please sign in to comment.