Skip to content
This repository was archived by the owner on Dec 8, 2022. It is now read-only.

Jordan bio #99

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,219 changes: 1,219 additions & 0 deletions stdlib/bio/seqfeature.seq

Large diffs are not rendered by default.

98 changes: 98 additions & 0 deletions stdlib/biopy/SeqUtils/checksum.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@

# TODO/ CAVEATS:
# @jordan - crc32() needs to be implemented
# def crc32(seq):
# """
# crc32(seq)

# Return the crc32 checksum for a sequence (string or Seq object).
# """
# return _crc32(seq)

def _init_table_h() -> list[int]:
_table_h = list[int]()
for i in range(256):
part_l = i
part_h = 0
for j in range(8):
rflag = part_l & 1
part_l >>= 1
if part_h & 1:
part_l |= 1 << 31
part_h >>= 1
if rflag:
part_h ^= 0xD8000000
_table_h.append(part_h)
return _table_h

#Initialisation
_table_h = _init_table_h()

def crc64(s) -> str:
"""
crc64(s)

Return the crc64 checksum for a sequence (string or Seq object).

TODO/CAVEATS:
@jordan - return formatting crch and crcl as hex
"""
crcl = 0
crch = 0
for c in s:
shr = (crch & 0xFF) << 24
temp1h = crch >> 8
temp1l = (crcl >> 8) | shr
idx = (crcl ^ ord(c)) & 0xFF
crch = temp1h ^ _table_h[idx]
crcl = temp1l

# need it in 08X return f"CRC-{crch}{crcl}"
return f"CRC-{crch}{crcl}"
#"CRC-%08X%08X" % (crch, crcl)

def gcg(s) -> int:
"""
gcg(s)

Given a nucleotide or amino-acid secuence (or any string),
returns the GCG checksum (int). Checksum used by GCG program.
seq type = str.
"""

index = checksum = 0

for char in s:
index += 1
checksum += index * ord(char.upper())
if index == 57:
index = 0
return checksum % 10000

# TODO/CAVEATS:
# @jordan - This still needs to be implemented
#hashlib and base64??
# def seguid(s) -> str:
# """
# seguid(seq: seq)

# Return the SEGUID (string) for a sequence (string or Seq object).

# Given a nucleotide or amino-acid secuence (or any string),
# returns the SEGUID string (A SEquence Globally Unique IDentifier).
# seq type = str.
# """
# import hashlib
# import base64

# m = hashlib.sha1()
# m.update(_as_bytes(s.upper()))
# try:
# # For Python 3+
# tmp = base64.encodebytes(m.digest())
# return tmp.decode().replace("\n", "").rstrip("=")
# except AttributeError:
# pass
# # For all other Pythons
# return base64.b64encode(m.digest()).rstrip("=")

194 changes: 194 additions & 0 deletions stdlib/biopy/SeqUtils/codonusage.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@

"""Methods for codon usage calculations."""

import math
from biopy.sequtils.codonusageindices import SharpEcoliIndex
import bio
#DONT HAVE YET from Bio import SeqIO

# Turn black code style off
# fmt: off

CodonsDict = {
"TTT": 0, "TTC": 0, "TTA": 0, "TTG": 0,
"CTT": 0, "CTC": 0, "CTA": 0, "CTG": 0,
"ATT": 0, "ATC": 0, "ATA": 0, "ATG": 0,
"GTT": 0, "GTC": 0, "GTA": 0, "GTG": 0,
"TAT": 0, "TAC": 0, "TAA": 0, "TAG": 0,
"CAT": 0, "CAC": 0, "CAA": 0, "CAG": 0,
"AAT": 0, "AAC": 0, "AAA": 0, "AAG": 0,
"GAT": 0, "GAC": 0, "GAA": 0, "GAG": 0,
"TCT": 0, "TCC": 0, "TCA": 0, "TCG": 0,
"CCT": 0, "CCC": 0, "CCA": 0, "CCG": 0,
"ACT": 0, "ACC": 0, "ACA": 0, "ACG": 0,
"GCT": 0, "GCC": 0, "GCA": 0, "GCG": 0,
"TGT": 0, "TGC": 0, "TGA": 0, "TGG": 0,
"CGT": 0, "CGC": 0, "CGA": 0, "CGG": 0,
"AGT": 0, "AGC": 0, "AGA": 0, "AGG": 0,
"GGT": 0, "GGC": 0, "GGA": 0, "GGG": 0}

# Turn black code style on
# fmt: on

# this dictionary shows which codons encode the same AA
SynonymousCodons = {
"CYS": ["TGT", "TGC"],
"ASP": ["GAT", "GAC"],
"SER": ["TCT", "TCG", "TCA", "TCC", "AGC", "AGT"],
"GLN": ["CAA", "CAG"],
"MET": ["ATG"],
"ASN": ["AAC", "AAT"],
"PRO": ["CCT", "CCG", "CCA", "CCC"],
"LYS": ["AAG", "AAA"],
"STOP": ["TAG", "TGA", "TAA"],
"THR": ["ACC", "ACA", "ACG", "ACT"],
"PHE": ["TTT", "TTC"],
"ALA": ["GCA", "GCC", "GCG", "GCT"],
"GLY": ["GGT", "GGG", "GGA", "GGC"],
"ILE": ["ATC", "ATA", "ATT"],
"LEU": ["TTA", "TTG", "CTC", "CTT", "CTG", "CTA"],
"HIS": ["CAT", "CAC"],
"ARG": ["CGA", "CGC", "CGG", "CGT", "AGG", "AGA"],
"TRP": ["TGG"],
"VAL": ["GTA", "GTC", "GTG", "GTT"],
"GLU": ["GAG", "GAA"],
"TYR": ["TAT", "TAC"],
}

class CodonAdaptationIndex:
"""
A codon adaptation index (CAI) implementation.

Implements the codon adaptation index (CAI) described by Sharp and
Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).

NOTE - This implementation does not currently cope with alternative genetic
codes: only the synonymous codons in the standard table are considered.
"""
index: dict[str, float]
codon_count: dict[str, int]

def __init__(self: CodonAdaptationIndex):
self.index = dict[str, float]()
self.codon_count = dict[str, int]()

# use this method with predefined CAI index
def set_cai_index(self, index):
"""
set_cai_index(self, index)

Set up an index to be used when calculating CAI for a gene.

Just pass a dictionary similar to the SharpEcoliIndex in the
CodonUsageIndices module.
"""
self.index = index

def generate_index(self, fasta_file):
"""
generate_index(self, fasta_file)

Generate a codon usage index from a FASTA file of CDS sequences.

Takes a location of a Fasta file containing CDS sequences
(which must all have a whole number of codons) and generates a codon
usage index.

RCSU values
"""
# first make sure we're not overwriting an existing index:
if self.index != dict[str, float]() or self.codon_count != dict[str, int]():
raise ValueError(
"an index has already been set or a codon count "
"has been done. Cannot overwrite either."
)

# count codon occurrences in the file.
self._count_codons(fasta_file)

# now to calculate the index we first need to sum the number of times
# synonymous codons were used all together.
for aa in SynonymousCodons:
total = 0.0
# RCSU values are CodonCount/((1/num of synonymous codons) * sum of
# all synonymous codons)
rcsu = list[float]()
codons = SynonymousCodons[aa]

for codon in codons:
total += self.codon_count[codon]

# calculate the RSCU value for each of the codons
for codon in codons:
denominator = float(total) / len(codons)
rcsu.append(self.codon_count[codon] / denominator)

# now generate the index W=RCSUi/RCSUmax:
rcsu_max = max(rcsu)
for codon_index, codon in enumerate(codons):
self.index[codon] = rcsu[codon_index] / rcsu_max

def cai_for_gene(self, dna_sequence: str) -> float:
"""
cai_for_gene(self, dna_sequence)

Calculate the CAI (float) for the provided DNA sequence (string).

This method uses the Index (either the one you set or the one you
generated) and returns the CAI for the DNA sequence.
"""
cai_value, cai_length = 0.0, 0

# if no index is set or generated, the default SharpEcoliIndex will
# be used.
if self.index == dict[str, float]():
self.set_cai_index(SharpEcoliIndex)

if dna_sequence.islower():
dna_sequence = dna_sequence.upper()

for i in range(0, len(dna_sequence), 3):
codon = dna_sequence[i : i + 3]
if codon in self.index:
# these two codons are always one, exclude them:
if codon not in ["ATG", "TGG"]:
cai_value += math.log(self.index[codon])
cai_length += 1
# some indices may not include stop codons:
elif codon not in ["TGA", "TAA", "TAG"]:
raise TypeError(
f"illegal codon in sequence: {codon}.\n{self.index}"
)

return math.exp(cai_value / (cai_length - 1.0))

def _count_codons(self, fasta_file):

# make the codon dictionary local
self.codon_count = CodonsDict.copy()

dna_sequence = ''

# iterate over sequence and count all the codons in the FastaFile.
for cur_record in bio.FASTA(fasta_file):
# make sure the sequence is lower case
if str(cur_record.seq).islower():
dna_sequence = str(cur_record.seq).upper()
else:
dna_sequence = str(cur_record.seq)
for i in range(0, len(dna_sequence), 3):
codon = dna_sequence[i : i + 3]
if codon in self.codon_count:
self.codon_count[codon] += 1
else:
raise TypeError(
f"illegal codon {codon} in gene: {cur_record}"
)

def print_index(self):
"""
Print out the index used.
This just gives the index when the objects is printed.
"""
for i in sorted(self.index):
print(f"{i}\t{self.index[i]}")
25 changes: 25 additions & 0 deletions stdlib/biopy/SeqUtils/codonusageindices.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Codon adaption indxes, including Sharp and Li (1987) E. coli index.

Currently this module only defines a single codon adaption index from
Sharp & Li, Nucleic Acids Res. 1987.
"""
# Turn black code style off
# fmt: off

SharpEcoliIndex = {
"GCA": 0.586, "GCC": 0.122, "GCG": 0.424, "GCT": 1.0, "AGA": 0.004,
"AGG": 0.002, "CGA": 0.004, "CGC": 0.356, "CGG": 0.004, "CGT": 1.0, "AAC": 1.0,
"AAT": 0.051, "GAC": 1.0, "GAT": 0.434, "TGC": 1.0, "TGT": 0.5, "CAA": 0.124,
"CAG": 1.0, "GAA": 1.0, "GAG": 0.259, "GGA": 0.01, "GGC": 0.724, "GGG": 0.019,
"GGT": 1.0, "CAC": 1.0, "CAT": 0.291, "ATA": 0.003, "ATC": 1.0, "ATT": 0.185,
"CTA": 0.007, "CTC": 0.037, "CTG": 1.0, "CTT": 0.042, "TTA": 0.02,
"TTG": 0.02, "AAA": 1.0, "AAG": 0.253, "ATG": 1.0, "TTC": 1.0, "TTT": 0.296,
"CCA": 0.135, "CCC": 0.012, "CCG": 1.0, "CCT": 0.07, "AGC": 0.41,
"AGT": 0.085, "TCA": 0.077, "TCC": 0.744, "TCG": 0.017, "TCT": 1.0,
"ACA": 0.076, "ACC": 1.0, "ACG": 0.099, "ACT": 0.965, "TGG": 1.0, "TAC": 1.0,
"TAT": 0.239, "GTA": 0.495, "GTC": 0.066, "GTG": 0.221, "GTT": 1.0
}

# Turn black code style on
# fmt: on
145 changes: 145 additions & 0 deletions stdlib/biopy/SeqUtils/lcc.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@

"""Local Composition Complexity."""

import math

def lcc_mult(s, wsize: int):
"""
lcc_mult(s, wsize)

Calculate Local Composition Complexity (LCC) values over sliding window.

Returns a list of floats, the LCC values for a sliding window over
the sequence.
"""
l2 = math.log(2.0)
tamseq = len(s)

upper = s.upper()

compone = [0.0]
lccsal = [0.0]
for i in range(wsize):
compone.append(
((i + 1) / float(wsize)) * ((math.log((i + 1) / float(wsize))) / l2)
)
window = s[0:wsize]
cant_a = window.count("A")
cant_c = window.count("C")
cant_t = window.count("T")
cant_g = window.count("G")
term_a = compone[cant_a]
term_c = compone[cant_c]
term_t = compone[cant_t]
term_g = compone[cant_g]
lccsal.append(-(term_a + term_c + term_t + term_g))
tail = s[0]
for x in range(tamseq - wsize):
window = upper[x + 1 : wsize + x + 1]
if tail == window[-1]:
lccsal.append(lccsal[-1])
elif tail == "A":
cant_a -= 1
if window.endswith("C"):
cant_c += 1
term_a = compone[cant_a]
term_c = compone[cant_c]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif window.endswith("T"):
cant_t += 1
term_a = compone[cant_a]
term_t = compone[cant_t]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif window.endswith("G"):
cant_g += 1
term_a = compone[cant_a]
term_g = compone[cant_g]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif tail == "C":
cant_c -= 1
if window.endswith("A"):
cant_a += 1
term_a = compone[cant_a]
term_c = compone[cant_c]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif window.endswith("T"):
cant_t += 1
term_c = compone[cant_c]
term_t = compone[cant_t]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif window.endswith("G"):
cant_g += 1
term_c = compone[cant_c]
term_g = compone[cant_g]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif tail == "T":
cant_t -= 1
if window.endswith("A"):
cant_a += 1
term_a = compone[cant_a]
term_t = compone[cant_t]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif window.endswith("C"):
cant_c += 1
term_c = compone[cant_c]
term_t = compone[cant_t]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif window.endswith("G"):
cant_g += 1
term_t = compone[cant_t]
term_g = compone[cant_g]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif tail == "G":
cant_g -= 1
if window.endswith("A"):
cant_a += 1
term_a = compone[cant_a]
term_g = compone[cant_g]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif window.endswith("C"):
cant_c += 1
term_c = compone[cant_c]
term_g = compone[cant_g]
lccsal.append(-(term_a + term_c + term_t + term_g))
elif window.endswith("T"):
cant_t += 1
term_t = compone[cant_t]
term_g = compone[cant_g]
lccsal.append(-(term_a + term_c + term_t + term_g))
tail = window[0]
return lccsal


def lcc_simp(s):
"""
lcc_simp(s)

Calculate Local Composition Complexity (LCC) for a sequence.
"""
wsize = len(s)
term_a = 0.0
term_c = 0.0
term_t = 0.0
term_g = 0.0


upper = s.upper()

l2 = math.log(2.0)
if "A" in s:
term_a = ((upper.count("A")) / float(wsize)) * (
(math.log((upper.count("A")) / float(wsize))) / l2
)
if "C" in s:
term_c = ((upper.count("C")) / float(wsize)) * (
(math.log((upper.count("C")) / float(wsize))) / l2
)
if "T" in s:
term_t = ((upper.count("T")) / float(wsize)) * (
(math.log((upper.count("T")) / float(wsize))) / l2
)
if "G" in s:
term_g = ((upper.count("G")) / float(wsize)) * (
(math.log((upper.count("G")) / float(wsize))) / l2
)
return -(term_a + term_c + term_t + term_g)
154 changes: 154 additions & 0 deletions stdlib/biopy/SeqUtils/proparamdata.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@

"""Indices to be used with ProtParam."""

# Turn black code style off
# fmt: off

# Kyte & Doolittle index of hydrophobicity
# J. Mol. Biol. 157:105-132(1982).
kd = {"A": 1.8, "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5,
"Q": -3.5, "E": -3.5, "G": -0.4, "H": -3.2, "I": 4.5,
"L": 3.8, "K": -3.9, "M": 1.9, "F": 2.8, "P": -1.6,
"S": -0.8, "T": -0.7, "W": -0.9, "Y": -1.3, "V": 4.2}

# Flexibility
# Normalized flexibility parameters (B-values), average
# Vihinen M., Torkkila E., Riikonen P. Proteins. 19(2):141-9(1994).
Flex = {"A": 0.984, "C": 0.906, "E": 1.094, "D": 1.068,
"G": 1.031, "F": 0.915, "I": 0.927, "H": 0.950,
"K": 1.102, "M": 0.952, "L": 0.935, "N": 1.048,
"Q": 1.037, "P": 1.049, "S": 1.046, "R": 1.008,
"T": 0.997, "W": 0.904, "V": 0.931, "Y": 0.929}

# Hydrophilicity
# 1 Hopp & Wood
# Proc. Natl. Acad. Sci. U.S.A. 78:3824-3828(1981).
hw = {"A": -0.5, "R": 3.0, "N": 0.2, "D": 3.0, "C": -1.0,
"Q": 0.2, "E": 3.0, "G": 0.0, "H": -0.5, "I": -1.8,
"L": -1.8, "K": 3.0, "M": -1.3, "F": -2.5, "P": 0.0,
"S": 0.3, "T": -0.4, "W": -3.4, "Y": -2.3, "V": -1.5}

# Surface accessibility
# Vergoten G & Theophanides T, Biomolecular Structure and Dynamics,
# pg.138 (1997).
# 1 Emini Surface fractional probability
em = {"A": 0.815, "R": 1.475, "N": 1.296, "D": 1.283, "C": 0.394,
"Q": 1.348, "E": 1.445, "G": 0.714, "H": 1.180, "I": 0.603,
"L": 0.603, "K": 1.545, "M": 0.714, "F": 0.695, "P": 1.236,
"S": 1.115, "T": 1.184, "W": 0.808, "Y": 1.089, "V": 0.606}

# 2 Janin Interior to surface transfer energy scale
ja = {"A": 0.28, "R": -1.14, "N": -0.55, "D": -0.52, "C": 0.97,
"Q": -0.69, "E": -1.01, "G": 0.43, "H": -0.31, "I": 0.60,
"L": 0.60, "K": -1.62, "M": 0.43, "F": 0.46, "P": -0.42,
"S": -0.19, "T": -0.32, "W": 0.29, "Y": -0.15, "V": 0.60}


# A two dimensional dictionary for calculating the instability index.
# Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
# It is based on dipeptide values; therefore, the value for the dipeptide DG
# is DIWV['D']['G'].
DIWV = {"A": {"A": 1.0, "C": 44.94, "E": 1.0, "D": -7.49,
"G": 1.0, "F": 1.0, "I": 1.0, "H": -7.49,
"K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0,
"Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0,
"T": 1.0, "W": 1.0, "V": 1.0, "Y": 1.0},
"C": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 20.26,
"G": 1.0, "F": 1.0, "I": 1.0, "H": 33.60,
"K": 1.0, "M": 33.60, "L": 20.26, "N": 1.0,
"Q": -6.54, "P": 20.26, "S": 1.0, "R": 1.0,
"T": 33.60, "W": 24.68, "V": -6.54, "Y": 1.0},
"E": {"A": 1.0, "C": 44.94, "E": 33.60, "D": 20.26,
"G": 1.0, "F": 1.0, "I": 20.26, "H": -6.54,
"K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0,
"Q": 20.26, "P": 20.26, "S": 20.26, "R": 1.0,
"T": 1.0, "W": -14.03, "V": 1.0, "Y": 1.0},
"D": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0,
"G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0,
"K": -7.49, "M": 1.0, "L": 1.0, "N": 1.0,
"Q": 1.0, "P": 1.0, "S": 20.26, "R": -6.54,
"T": -14.03, "W": 1.0, "V": 1.0, "Y": 1.0},
"G": {"A": -7.49, "C": 1.0, "E": -6.54, "D": 1.0,
"G": 13.34, "F": 1.0, "I": -7.49, "H": 1.0,
"K": -7.49, "M": 1.0, "L": 1.0, "N": -7.49,
"Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0,
"T": -7.49, "W": 13.34, "V": 1.0, "Y": -7.49},
"F": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 13.34,
"G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0,
"K": -14.03, "M": 1.0, "L": 1.0, "N": 1.0,
"Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0,
"T": 1.0, "W": 1.0, "V": 1.0, "Y": 33.601},
"I": {"A": 1.0, "C": 1.0, "E": 44.94, "D": 1.0,
"G": 1.0, "F": 1.0, "I": 1.0, "H": 13.34,
"K": -7.49, "M": 1.0, "L": 20.26, "N": 1.0,
"Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0,
"T": 1.0, "W": 1.0, "V": -7.49, "Y": 1.0},
"H": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0,
"G": -9.37, "F": -9.37, "I": 44.94, "H": 1.0,
"K": 24.68, "M": 1.0, "L": 1.0, "N": 24.68,
"Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0,
"T": -6.54, "W": -1.88, "V": 1.0, "Y": 44.94},
"K": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0,
"G": -7.49, "F": 1.0, "I": -7.49, "H": 1.0,
"K": 1.0, "M": 33.60, "L": -7.49, "N": 1.0,
"Q": 24.64, "P": -6.54, "S": 1.0, "R": 33.60,
"T": 1.0, "W": 1.0, "V": -7.49, "Y": 1.0},
"M": {"A": 13.34, "C": 1.0, "E": 1.0, "D": 1.0,
"G": 1.0, "F": 1.0, "I": 1.0, "H": 58.28,
"K": 1.0, "M": -1.88, "L": 1.0, "N": 1.0,
"Q": -6.54, "P": 44.94, "S": 44.94, "R": -6.54,
"T": -1.88, "W": 1.0, "V": 1.0, "Y": 24.68},
"L": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0,
"G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0,
"K": -7.49, "M": 1.0, "L": 1.0, "N": 1.0,
"Q": 33.60, "P": 20.26, "S": 1.0, "R": 20.26,
"T": 1.0, "W": 24.68, "V": 1.0, "Y": 1.0},
"N": {"A": 1.0, "C": -1.88, "E": 1.0, "D": 1.0,
"G": -14.03, "F": -14.03, "I": 44.94, "H": 1.0,
"K": 24.68, "M": 1.0, "L": 1.0, "N": 1.0,
"Q": -6.54, "P": -1.88, "S": 1.0, "R": 1.0,
"T": -7.49, "W": -9.37, "V": 1.0, "Y": 1.0},
"Q": {"A": 1.0, "C": -6.54, "E": 20.26, "D": 20.26,
"G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0,
"K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0,
"Q": 20.26, "P": 20.26, "S": 44.94, "R": 1.0,
"T": 1.0, "W": 1.0, "V": -6.54, "Y": -6.54},
"P": {"A": 20.26, "C": -6.54, "E": 18.38, "D": -6.54,
"G": 1.0, "F": 20.26, "I": 1.0, "H": 1.0,
"K": 1.0, "M": -6.54, "L": 1.0, "N": 1.0,
"Q": 20.26, "P": 20.26, "S": 20.26, "R": -6.54,
"T": 1.0, "W": -1.88, "V": 20.26, "Y": 1.0},
"S": {"A": 1.0, "C": 33.60, "E": 20.26, "D": 1.0,
"G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0,
"K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0,
"Q": 20.26, "P": 44.94, "S": 20.26, "R": 20.26,
"T": 1.0, "W": 1.0, "V": 1.0, "Y": 1.0},
"R": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0,
"G": -7.49, "F": 1.0, "I": 1.0, "H": 20.26,
"K": 1.0, "M": 1.0, "L": 1.0, "N": 13.34,
"Q": 20.26, "P": 20.26, "S": 44.94, "R": 58.28,
"T": 1.0, "W": 58.28, "V": 1.0, "Y": -6.54},
"T": {"A": 1.0, "C": 1.0, "E": 20.26, "D": 1.0,
"G": -7.49, "F": 13.34, "I": 1.0, "H": 1.0,
"K": 1.0, "M": 1.0, "L": 1.0, "N": -14.03,
"Q": -6.54, "P": 1.0, "S": 1.0, "R": 1.0,
"T": 1.0, "W": -14.03, "V": 1.0, "Y": 1.0},
"W": {"A": -14.03, "C": 1.0, "E": 1.0, "D": 1.0,
"G": -9.37, "F": 1.0, "I": 1.0, "H": 24.68,
"K": 1.0, "M": 24.68, "L": 13.34, "N": 13.34,
"Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0,
"T": -14.03, "W": 1.0, "V": -7.49, "Y": 1.0},
"V": {"A": 1.0, "C": 1.0, "E": 1.0, "D": -14.03,
"G": -7.49, "F": 1.0, "I": 1.0, "H": 1.0,
"K": -1.88, "M": 1.0, "L": 1.0, "N": 1.0,
"Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0,
"T": -7.49, "W": 1.0, "V": 1.0, "Y": -6.54},
"Y": {"A": 24.68, "C": 1.0, "E": -6.54, "D": 24.68,
"G": -7.49, "F": 1.0, "I": 1.0, "H": 13.34,
"K": 1.0, "M": 44.94, "L": 1.0, "N": 1.0,
"Q": 1.0, "P": 13.34, "S": 1.0, "R": -15.91,
"T": -7.49, "W": -9.37, "V": 1.0, "Y": 13.34},
}

# Turn black code style on
# fmt: on
536 changes: 536 additions & 0 deletions stdlib/biopy/seqrecord.seq

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions stdlib/core/err.seq
Original file line number Diff line number Diff line change
@@ -149,6 +149,32 @@ class StopIteration:
def message(self: StopIteration):
return self._hdr.msg

class AttributeError:
_hdr: ExcHeader

def __init__(self: AttributeError):
self._hdr = ('AttributeError', '', '', '', 0, 0)

def __init__(self: AttributeError, message: str):
self._hdr = ('AttributeError', message, '', '', 0, 0)

@property
def message(self: AttributeError):
return self._hdr.msg

class RuntimeError:
_hdr: ExcHeader

def __init__(self: RuntimeError):
self._hdr = ('RuntimeError', '', '', '', 0, 0)

def __init__(self: RuntimeError, message: str):
self._hdr = ('RuntimeError', message, '', '', 0, 0)

@property
def message(self: RuntimeError):
return self._hdr.msg

def check_errno(prefix: str):
msg = _C.seq_check_errno()
if msg:
122 changes: 122 additions & 0 deletions test/stdlib/biopy_test/seqfeature_test.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from Bio.SeqFeature import FeatureLocation, Position
from Bio.SeqFeature import CompoundLocation, UnknownPosition, SeqFeature
from Bio.SeqFeature import WithinPosition, BetweenPosition


####### Test FeatureLocation #######
@test
def test_eq_identical():
"""
Test two identical locations are equal.
"""
loc1 = FeatureLocation(23, 42, 1, '', '')
loc2 = FeatureLocation(23, 42, 1, '', '')
assert loc1 == loc2

loc1 = FeatureLocation(23, 42, -1, '', '')
loc2 = FeatureLocation(23, 42, -1, '', '')
assert loc1 == loc2

loc1 = FeatureLocation(23, 42, -1, '', '')
loc2 = FeatureLocation(23, 42, -1, '', '')
assert loc1 == loc2

loc1 = FeatureLocation(Position(23, 1, 0), Position(42, 2, 0), 1, '', '')
loc2 = FeatureLocation(23, 42, 1, '', '')
assert loc1 == loc2

loc1 = FeatureLocation(23, 42, 1, "foo", "bar")
loc2 = FeatureLocation(23, 42, 1, "foo", "bar")
assert loc1 == loc2
test_eq_identical()

@test
def test_eq_not_identical():
"""
Test two different locations are not equal.
"""
loc1 = FeatureLocation(22, 42, 1, '', '')
loc2 = FeatureLocation(23, 42, 1, '', '')
assert loc1 != loc2

loc1 = FeatureLocation(23, 42, 1, '', '')
loc2 = FeatureLocation(23, 43, 1, '', '')
assert loc1 != loc2

loc1 = FeatureLocation(23, 42, 1, '', '')
loc2 = FeatureLocation(23, 42, -1, '', '')
assert loc1 != loc2

loc1 = FeatureLocation(23, 42, 1, 'foo', '')
loc2 = FeatureLocation(23, 42, 1, 'bar', '')
assert loc1 != loc2

loc1 = FeatureLocation(23, 42, 1, 'foo', 'bar')
loc2 = FeatureLocation(23, 42, 1, 'foo', 'baz')
assert loc1 != loc2
test_eq_not_identical()

@test
# not finished need to have raise error with start > end
def test_start_before_end():
expected = "must be greater than or equal to start location"
print FeatureLocation(42, 23, 1, '', '')

test_start_before_end()

####### Test CompoundLocation #######

@test
def test_eq_identical1():
loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '')
loc2 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '')
assert loc1 == loc2

loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '')
loc2 = CompoundLocation([FeatureLocation(12, 17, 1, '', ''), FeatureLocation(23, 42, 1, '', '')], 'join')
assert loc1 == loc2
test_eq_identical1()

@test
def test_eq_not_identical1():
loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '')
loc2 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + FeatureLocation(50, 60, 1, '', '')
assert loc1 != loc2

loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '')
loc2 = FeatureLocation(12, 17, -1, '', '') + FeatureLocation(23, 42, -1, '', '')
assert loc1 != loc2

loc1 = CompoundLocation([FeatureLocation(12, 17, 1, '', ''), FeatureLocation(23, 42, 1, '', '')], "join")
loc2 = CompoundLocation([FeatureLocation(12, 17, 1, '', ''), FeatureLocation(23, 42, 1, '', '')], 'order')
assert loc1 != loc2
test_eq_not_identical1()

####### Test Locations #######

@test
# Not finished
def test_fuzzy():
exact_pos = Position(5, 0, 0)
within_pos_s = WithinPosition(10, left=10, right=13)
within_pos_e = WithinPosition(13, left=10, right=13)
between_pos_e = BetweenPosition(24, 20, 24)
before_pos = Position(15, 1, 0)
after_pos = Position(40, 2, 0)

assert int(within_pos_s) == 10
assert str(within_pos_s) == "(10.13)"
assert int(within_pos_e) == 13
assert str(within_pos_e) == "(10.13)"
assert int(between_pos_e) == 24
assert str(between_pos_e) == "(20^24)"
assert str(before_pos) == "<15"
assert str(after_pos) == ">40"

# TODO/CAVEATS:
# FeatureLocation does not support these types for now.
# location1 = FeatureLocation(exact_pos, within_pos_e, 1, '', '')
# location2 = FeatureLocation(before_pos, between_pos_e)
# location3 = FeatureLocation(within_pos_s, after_pos)
test_fuzzy()

23 changes: 23 additions & 0 deletions test/stdlib/biopy_test/sequtils_test/checksum_test.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from biopy.sequtils.checksum import crc64, gcg

str_light_chain_one = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ"
"HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL"
"QAEDEADYYCSSYAGSSTLVFGGGTKLTVL")

str_light_chain_two = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ"
"HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL"
"QAEDEADYYCCSYAGSSTWVFGGGTKLTVL")

@test
def test_checksum():
num1 = crc64(str_light_chain_one)
num2 = crc64(str_light_chain_two)
assert num1 == num2
test_checksum()

@test
def test_gcg():
num1 = gcg(str_light_chain_one)
num2 = gcg(str_light_chain_two)
assert num1 != num2
test_gcg()
21 changes: 21 additions & 0 deletions test/stdlib/biopy_test/sequtils_test/codonusage_test.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from biopy.sequtils.codonusage import CodonAdaptationIndex
import math

X = CodonAdaptationIndex()
path = "/Users/jordanwatson1/seq/test/stdlib/biopy_test/sequtils_test/scratch.fasta"

X.generate_index(path)

@test
def cai_for_gene():
PRECISION = 1e-6

# Test Codon Adaptation Index (CAI) using default E. coli data.
CAI = CodonAdaptationIndex()
value = CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")
assert math.fabs(value - 0.09978) < PRECISION
cai_for_gene()

def print_index():
X.print_index()
# print_index()
25 changes: 25 additions & 0 deletions test/stdlib/biopy_test/sequtils_test/lcc_test.seq
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from biopy.sequtils.lcc import lcc_simp, lcc_mult
import math

str_light_chain_one = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ"
"HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL"
"QAEDEADYYCSSYAGSSTLVFGGGTKLTVL")

exp_simple_LCC = 1.03
exp_window_LCC = (0.00, 1.00, 0.96, 0.96, 0.96, 0.65, 0.43, 0.35, 0.35, 0.35, 0.35, 0.53, 0.59, 0.26)

@test
def lcc_simp_test():
PRECISION = 1e2
assert math.fabs(lcc_simp(str_light_chain_one) - exp_simple_LCC) < PRECISION
lcc_simp_test()

@test
def lcc_mult_test():
PRECISION = 1e2
values = lcc_mult(str_light_chain_one, 20)
assert len(values) == len(exp_window_LCC)

for val1, val2 in zip(values, exp_window_LCC):
assert math.fabs(val1 - val2) < PRECISION
lcc_mult_test()