Skip to content

Commit 9e1eb2b

Browse files
author
RJain12
committed
Linting
1 parent 9c7eabd commit 9e1eb2b

7 files changed

Lines changed: 85 additions & 59 deletions

File tree

.gitpod.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# This configuration file was automatically generated by Gitpod.
2+
# Please adjust to your needs (see https://www.gitpod.io/docs/config-gitpod-file)
3+
# and commit this file to your remote git repository to share the goodness with others.
4+
5+
tasks:
6+
- init: pip install -r requirements.txt
7+
8+

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"python.linting.enabled": true
3+
}

tool/optimizers/brute_force_optimizer.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@
1313
import re
1414

1515
# Set input AA sequence directory and output for writing brute sequences
16-
aa_dir = os.path.join(os.getcwd(),'benchmark_sequences','aa')
17-
out_dir = os.path.join(os.getcwd(),'benchmark_sequences','brute')
16+
aa_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'aa')
17+
out_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'brute')
1818

1919
# Define weights for each codon
20-
weights = [0,1,0.647058823500000,0.500000000000000,0.794117647100000,0.0789473684200000,0.131578947400000,0.263157894700000,0.184210526300000,0.973684210500000,1,0.851851851900000,1,1,0.587301587300000,0.818181818200000,1,0.483870967700000,0.129032258100000,1,1,0.515151515200000,0.470588235300000,1,0.384615384600000,0.307692307700000,0.871794871800000,1,1,0.754385964900000,0.180000000000000,1,0.820000000000000,0.265306122400000,0.265306122400000,1,0.0816326530600000,0.224489795900000,0.204081632700000,0.333333333300000,1,1,1,0.754385964900000,1,0.392156862700000,0.333333333300000,0.235294117600000,0.576923076900000,1,0.576923076900000,0.500000000000000,0.615384615400000,0.576923076900000,0.619047619000000,0.357142857100000,0.428571428600000,1,1,1,0.724137931000000,1,0.444444444400000,0.750000000000000,0.583333333300000]
20+
weights = [0, 1, 0.647058823500000, 0.500000000000000, 0.794117647100000, 0.0789473684200000, 0.131578947400000, 0.263157894700000, 0.184210526300000, 0.973684210500000, 1, 0.851851851900000, 1, 1, 0.587301587300000, 0.818181818200000, 1, 0.483870967700000, 0.129032258100000, 1, 1, 0.515151515200000, 0.470588235300000, 1, 0.384615384600000, 0.307692307700000, 0.871794871800000, 1, 1, 0.754385964900000, 0.180000000000000, 1, 0.820000000000000,
21+
0.265306122400000, 0.265306122400000, 1, 0.0816326530600000, 0.224489795900000, 0.204081632700000, 0.333333333300000, 1, 1, 1, 0.754385964900000, 1, 0.392156862700000, 0.333333333300000, 0.235294117600000, 0.576923076900000, 1, 0.576923076900000, 0.500000000000000, 0.615384615400000, 0.576923076900000, 0.619047619000000, 0.357142857100000, 0.428571428600000, 1, 1, 1, 0.724137931000000, 1, 0.444444444400000, 0.750000000000000, 0.583333333300000]
2122

2223
# Create a list of all the codons and match their corresponding weights
24+
25+
2326
def seq2cai(codonarray):
2427
output = []
2528
switcher = {
@@ -89,11 +92,12 @@ def seq2cai(codonarray):
8992
'GTC': 64,
9093
}
9194
for codon in codonarray:
92-
output.append(weights[switcher.get(codon,0)])
95+
output.append(weights[switcher.get(codon, 0)])
9396
length = 1 / len(codonarray)
9497
return pow(math.prod(output), length)
9598

96-
def aa2codons(seq : str) -> list:
99+
100+
def aa2codons(seq: str) -> list:
97101
_aas = {
98102
"A": ["GCT GCC GCA GCG"],
99103
"R": ["CGT CGC CGA CGG AGA AGG"],
@@ -121,12 +125,13 @@ def aa2codons(seq : str) -> list:
121125
}
122126
return [_aas[i] for i in seq]
123127

128+
124129
# Converts an amino acid to a random corresponding codon:
125130
for entry in os.scandir(aa_dir):
126131
# Read in the amino acid sequence:
127132
name = entry.replace("_aa.fasta", "_dna")
128-
record = SeqIO.read(entry,'fasta')
129-
133+
record = SeqIO.read(entry, 'fasta')
134+
130135
masterlist = []
131136
bestcai = 0
132137
curcai = 0
@@ -136,7 +141,7 @@ def aa2codons(seq : str) -> list:
136141
codonarr = []
137142
# Convert amino acid to codons:
138143
for i in record.seq:
139-
#Randomly choose a codon from the list of codons for the amino acid:
144+
# Randomly choose a codon from the list of codons for the amino acid:
140145
codonarr.append(random.choice(aa2codons(i)[0][0].split()))
141146
masterlist.append(codonarr)
142147
# With our new codon array, calculate the CAI:
@@ -149,6 +154,7 @@ def aa2codons(seq : str) -> list:
149154
print(curr_iteration)
150155

151156
# Write the codon array to a file:
152-
record.seq = Seq(re.sub('[^GATC]',"",str("".join(masterlist[bestcai])).upper()))
157+
record.seq = Seq(re.sub('[^GATC]', "", str(
158+
"".join(masterlist[bestcai])).upper()))
153159
complete_name = os.path.join(out_dir, name)
154-
SeqIO.write(record, complete_name + ".fasta", "fasta")
160+
SeqIO.write(record, complete_name + ".fasta", "fasta")

tool/optimizers/naive_optimizer.py

Lines changed: 37 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,63 +1,66 @@
1-
#import standard modules; re is used for regex on ln 59
1+
# import standard modules; re is used for the regex cleanup (re.sub) of the final sequence
22
import os
33
from Bio import SeqIO
44
from Bio.Seq import Seq
55
import random
66
import numpy as np
77
import re
88

9-
#frequencies are from ecoli_codon_frequencies.xlsx file in summaries dir
10-
#create dict with value being a tuple with the codons and their probabilities/frequncies
9+
# frequencies are from ecoli_codon_frequencies.xlsx file in summaries dir
10+
# create dict with value being a tuple with the codons and their probabilities/frequencies
1111
frequency = {
12-
"A": (["GCG", "GCA", "GCT", "GCC"],[0.34, 0.22, 0.17, 0.27]),
13-
"R": (["AGG", "AGA", "CGG","CGA","CGT","CGC"],[0.03, 0.05, 0.1, 0.07, 0.37, 0.38]),
14-
"N": (["AAT", "AAC"],[0.46, 0.54]),
15-
"D": (["GAT","GAC"],[0.63,0.37]),
16-
"C": (["TGT","TGC"],[0.45,0.55]),
17-
"*": (["TGA","TAG","TAA"],[0.3,0.08,0.62]),
18-
"Q": (["CAG","CAA"],[0.66,0.34]),
19-
"E": (["GAG","GAA"],[0.32,0.68]),
20-
"G": (["GGG","GGA","GGT","GGC"],[0.15,0.12,0.34,0.39]),
21-
"H": (["CAT","CAC"],[0.57,0.43]),
22-
"I": (["ATA","ATT","ATC"],[0.09,0.5,0.41]),
23-
"L": (["TTG","TTA","CTG","CTA","CTT","CTC"],[0.13,0.13,0.49,0.04,0.11,0.1]),
24-
"K": (["AAG","AAG"],[0.25,0.75]),
25-
"M": (["ATG"],[1.0]),
26-
"F": (["TTT","TTC"],[0.57,0.43]),
27-
"P": (["CCG","CCA","CCT","CCC"],[0.51,0.2,0.17,0.12]),
28-
"S": (["AGT","AGC","TCG","TCA","TCT","TCC"],[0.15,0.26,0.15,0.13,0.16,0.15]),
29-
"T": (["ACG","ACA","ACT","ACC"],[0.26,0.15,0.18,0.42]),
30-
"W": (["TGG"],[1.0]),
31-
"Y": (["TAT","TAC"],[0.58,0.42]),
32-
"V": (["GTG","GTA","GTT","GTC"],[0.36,0.16,0.27,0.21])
12+
"A": (["GCG", "GCA", "GCT", "GCC"], [0.34, 0.22, 0.17, 0.27]),
13+
"R": (["AGG", "AGA", "CGG", "CGA", "CGT", "CGC"], [0.03, 0.05, 0.1, 0.07, 0.37, 0.38]),
14+
"N": (["AAT", "AAC"], [0.46, 0.54]),
15+
"D": (["GAT", "GAC"], [0.63, 0.37]),
16+
"C": (["TGT", "TGC"], [0.45, 0.55]),
17+
"*": (["TGA", "TAG", "TAA"], [0.3, 0.08, 0.62]),
18+
"Q": (["CAG", "CAA"], [0.66, 0.34]),
19+
"E": (["GAG", "GAA"], [0.32, 0.68]),
20+
"G": (["GGG", "GGA", "GGT", "GGC"], [0.15, 0.12, 0.34, 0.39]),
21+
"H": (["CAT", "CAC"], [0.57, 0.43]),
22+
"I": (["ATA", "ATT", "ATC"], [0.09, 0.5, 0.41]),
23+
"L": (["TTG", "TTA", "CTG", "CTA", "CTT", "CTC"], [0.13, 0.13, 0.49, 0.04, 0.11, 0.1]),
24+
"K": (["AAG", "AAG"], [0.25, 0.75]),
25+
"M": (["ATG"], [1.0]),
26+
"F": (["TTT", "TTC"], [0.57, 0.43]),
27+
"P": (["CCG", "CCA", "CCT", "CCC"], [0.51, 0.2, 0.17, 0.12]),
28+
"S": (["AGT", "AGC", "TCG", "TCA", "TCT", "TCC"], [0.15, 0.26, 0.15, 0.13, 0.16, 0.15]),
29+
"T": (["ACG", "ACA", "ACT", "ACC"], [0.26, 0.15, 0.18, 0.42]),
30+
"W": (["TGG"], [1.0]),
31+
"Y": (["TAT", "TAC"], [0.58, 0.42]),
32+
"V": (["GTG", "GTA", "GTT", "GTC"], [0.36, 0.16, 0.27, 0.21])
3333
}
3434

3535
# Amino acid sequence dir to optimize:
3636
# hardcoded path
37-
aa_dir = os.path.join(os.getcwd(),'benchmark_sequences','aa')
37+
aa_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'aa')
3838

3939
# Output dir to store optimized seqs:
4040
# hardcoded path
41-
out_dir = os.path.join(os.getcwd(),'benchmark_sequences','naive')
41+
out_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'naive')
4242

4343

4444
# Normalize probabilities for frequency if sum is not exactly 1.
45-
def fix_p( p):
45+
def fix_p(p):
4646
if p.sum() != 1.0:
4747
p = p*(1./p.sum())
4848
return p
4949

50+
5051
for entry in os.scandir(aa_dir):
5152
name = entry.replace("_aa.fasta", "_dna")
5253

5354
# Replace ambiguities with amino acids from IUPAC guidelines: https://www.bioinformatics.org/sms/iupac.html
54-
record = SeqIO.read(entry,"fasta")
55-
seq = record.seq.replace("B", random.choice(["D","N"])).replace("Z", random.choice(["E", "Q"]))
55+
record = SeqIO.read(entry, "fasta")
56+
seq = record.seq.replace("B", random.choice(["D", "N"])).replace(
57+
"Z", random.choice(["E", "Q"]))
5658
seq_arr = []
5759
for aa in seq:
58-
#append to the array a random choice of codon using the probabilities given (p)
59-
seq_arr.append(np.random.choice(frequency[aa][0],p=fix_p(np.asarray(frequency[aa][1]))))
60-
61-
record.seq = Seq(re.sub('[^GATC]',"",str("".join(seq_arr)).upper()))
60+
# append to the array a random choice of codon using the probabilities given (p)
61+
seq_arr.append(np.random.choice(
62+
frequency[aa][0], p=fix_p(np.asarray(frequency[aa][1]))))
63+
64+
record.seq = Seq(re.sub('[^GATC]', "", str("".join(seq_arr)).upper()))
6265
complete_name = os.path.join(out_dir, name)
63-
SeqIO.write(record, complete_name + ".fasta", "fasta")
66+
SeqIO.write(record, complete_name + ".fasta", "fasta")

tool/optimizers/super_naive_optimizer.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,16 @@
88
from Bio.Seq import Seq
99

1010
# Amino acid sequence dir to optimize:
11-
aa_dir = os.path.join(os.getcwd(),'benchmark_sequences','aa')
11+
aa_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'aa')
1212

1313

1414
# Output dir to store optimized seqs:
15-
out_dir = os.path.join(os.getcwd(),'benchmark_sequences','super_naive')
15+
out_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'super_naive')
1616

1717
# Amino acid to codon table, outputs arr of codons:
18-
def aa2codons(seq : str) -> list:
18+
19+
20+
def aa2codons(seq: str) -> list:
1921
_aas = {
2022
"A": ["GCT GCC GCA GCG"],
2123
"R": ["CGT CGC CGA CGG AGA AGG"],
@@ -43,19 +45,20 @@ def aa2codons(seq : str) -> list:
4345
}
4446
return [_aas[i] for i in seq]
4547

48+
4649
# Converts an amino acid to a random corresponding codon:
4750
for entry in os.scandir(aa_dir):
4851
# Read in the amino acid sequence:
4952
name = entry.name[0:-9] + "_dna"
50-
record = SeqIO.read(entry,'fasta')
53+
record = SeqIO.read(entry, 'fasta')
5154
arr = []
5255
# Convert amino acid to codons:
5356
for i in record.seq:
54-
#Randomly choose a codon from the list of codons for the amino acid:
57+
# Randomly choose a codon from the list of codons for the amino acid:
5558
arr.append(random.choice(aa2codons(i)[0][0].split()))
5659

5760
# Convert the array of codons to a string:
58-
record.seq = Seq(re.sub('[^GATC]',"",str("".join(arr)).upper()))
61+
record.seq = Seq(re.sub('[^GATC]', "", str("".join(arr)).upper()))
5962
complete_name = os.path.join(out_dir, name)
6063

6164
# Save the super naively optimized DNA sequence:

tool/scripts/csv_to_seqs.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616
from Bio.SeqRecord import SeqRecord
1717
from Bio.Seq import Seq
1818

19-
#output directory to write sequences
19+
# output directory to write sequences
2020
out_dir = r"..\..\benchmark_sequences\genscript"
2121

22-
#iterate through the csv file and write sequences to the output directory
22+
# iterate through the csv file and write sequences to the output directory
2323
with open('optimum_seqs.csv', newline='') as csvfile:
2424
data = list(csv.reader(csvfile))
2525
for i in data:
@@ -31,4 +31,4 @@
3131
description="blank",
3232
)
3333
print(record)
34-
SeqIO.write(record,os.path.join(out_dir, file_name),'fasta')
34+
SeqIO.write(record, os.path.join(out_dir, file_name), 'fasta')

tool/scripts/reformat_seqs.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,28 @@
1313
dna_directory = r"..\..\benchmark_sequences\dna"
1414

1515
# Print the number of files in each directory.
16-
print("There are %d files in the amino acid directory." % len(os.listdir(aa_directory)))
17-
print("There are %d files in the DNA directory." % len(os.listdir(dna_directory)))
16+
print("There are %d files in the amino acid directory." %
17+
len(os.listdir(aa_directory)))
18+
print("There are %d files in the DNA directory." %
19+
len(os.listdir(dna_directory)))
1820

1921
# Iterate over each file in the directory.
2022
for entry in os.scandir(aa_directory):
2123
record = SeqIO.read(entry, "fasta")
2224
SeqIO.write(record, entry, "fasta")
23-
#Although this does not change the actual sequence, it will reformat it with a fixed spacing (makes seqs more legible).
25+
# Although this does not change the actual sequence, it will reformat it with a fixed spacing (makes seqs more legible).
2426

2527
# Iterate over each file in the directory.
2628
for entry in os.scandir(dna_directory):
2729
record = SeqIO.read(entry, "fasta")
2830

29-
#Just in case, replace ambigious codons with the corresponding IUPAC ones:
30-
record.seq = record.seq.replace('K',random.choice(['G','T'])).replace('M',random.choice(['A','C'])).replace('N',random.choice(['A','C','G','T'])).replace('R',random.choice(['A','G'])).replace('W',random.choice(['A','T'])).replace('Y',random.choice(['C','T']))
31+
# Just in case, replace ambiguous nucleotide codes with a randomly chosen base they stand for under IUPAC:
32+
record.seq = record.seq.replace('K', random.choice(['G', 'T'])).replace('M', random.choice(['A', 'C'])).replace('N', random.choice(
33+
['A', 'C', 'G', 'T'])).replace('R', random.choice(['A', 'G'])).replace('W', random.choice(['A', 'T'])).replace('Y', random.choice(['C', 'T']))
3134
SeqIO.write(record, entry, "fasta")
3235

3336
if len(record.seq) % 3:
34-
#all CDS should be divisible by three because they are all in frame.
37+
# all CDS should be divisible by three because they are all in frame.
3538
print(f"Warning {entry.name} (sequence #{num}) was not divisible by three. All CDS should be divisible by three. The ICOR optimizer will perform best on sequences of that length."
3639
else:
3740
continue

0 commit comments

Comments
 (0)