Linting

RJain12 · RJain12 · commit 9e1eb2b15969 · 2022-10-11T00:23:24.000Z
diff --git a/.gitpod.yml b/.gitpod.yml
@@ -0,0 +1,8 @@
+# This configuration file was automatically generated by Gitpod.
+# Please adjust to your needs (see https://www.gitpod.io/docs/config-gitpod-file)
+# and commit this file to your remote git repository to share the goodness with others.
+
+tasks:
+  - init: pip install -r requirements.txt
+
+
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.linting.enabled": true
+}
diff --git a/tool/optimizers/brute_force_optimizer.py b/tool/optimizers/brute_force_optimizer.py
@@ -13,13 +13,16 @@
 import re
 
 # Set input AA sequence directory and output for writing brute sequences
-aa_dir = os.path.join(os.getcwd(),'benchmark_sequences','aa')
-out_dir = os.path.join(os.getcwd(),'benchmark_sequences','brute')
+aa_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'aa')
+out_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'brute')
 
 # Define weights for each codon
-weights = [0,1,0.647058823500000,0.500000000000000,0.794117647100000,0.0789473684200000,0.131578947400000,0.263157894700000,0.184210526300000,0.973684210500000,1,0.851851851900000,1,1,0.587301587300000,0.818181818200000,1,0.483870967700000,0.129032258100000,1,1,0.515151515200000,0.470588235300000,1,0.384615384600000,0.307692307700000,0.871794871800000,1,1,0.754385964900000,0.180000000000000,1,0.820000000000000,0.265306122400000,0.265306122400000,1,0.0816326530600000,0.224489795900000,0.204081632700000,0.333333333300000,1,1,1,0.754385964900000,1,0.392156862700000,0.333333333300000,0.235294117600000,0.576923076900000,1,0.576923076900000,0.500000000000000,0.615384615400000,0.576923076900000,0.619047619000000,0.357142857100000,0.428571428600000,1,1,1,0.724137931000000,1,0.444444444400000,0.750000000000000,0.583333333300000]
+weights = [0, 1, 0.647058823500000, 0.500000000000000, 0.794117647100000, 0.0789473684200000, 0.131578947400000, 0.263157894700000, 0.184210526300000, 0.973684210500000, 1, 0.851851851900000, 1, 1, 0.587301587300000, 0.818181818200000, 1, 0.483870967700000, 0.129032258100000, 1, 1, 0.515151515200000, 0.470588235300000, 1, 0.384615384600000, 0.307692307700000, 0.871794871800000, 1, 1, 0.754385964900000, 0.180000000000000, 1, 0.820000000000000,
+           0.265306122400000, 0.265306122400000, 1, 0.0816326530600000, 0.224489795900000, 0.204081632700000, 0.333333333300000, 1, 1, 1, 0.754385964900000, 1, 0.392156862700000, 0.333333333300000, 0.235294117600000, 0.576923076900000, 1, 0.576923076900000, 0.500000000000000, 0.615384615400000, 0.576923076900000, 0.619047619000000, 0.357142857100000, 0.428571428600000, 1, 1, 1, 0.724137931000000, 1, 0.444444444400000, 0.750000000000000, 0.583333333300000]
 
 # Create a list of all the codons and match their corresponding weights
+
+
 def seq2cai(codonarray):
     output = []
     switcher = {
@@ -89,11 +92,12 @@ def seq2cai(codonarray):
         'GTC': 64,
     }
     for codon in codonarray:
-        output.append(weights[switcher.get(codon,0)])
+        output.append(weights[switcher.get(codon, 0)])
     length = 1 / len(codonarray)
     return pow(math.prod(output), length)
 
-def aa2codons(seq : str) -> list:
+
+def aa2codons(seq: str) -> list:
     _aas = {
         "A": ["GCT GCC GCA GCG"],
         "R": ["CGT CGC CGA CGG AGA AGG"],
@@ -121,12 +125,13 @@ def aa2codons(seq : str) -> list:
     }
     return [_aas[i] for i in seq]
 
+
 # Converts an amino acid to a random corresponding codon:
 for entry in os.scandir(aa_dir):
     # Read in the amino acid sequence:
     name = entry.replace("_aa.fasta", "_dna")
-    record = SeqIO.read(entry,'fasta')
-    
+    record = SeqIO.read(entry, 'fasta')
+
     masterlist = []
     bestcai = 0
     curcai = 0
@@ -136,7 +141,7 @@ def aa2codons(seq : str) -> list:
         codonarr = []
         # Convert amino acid to codons:
         for i in record.seq:
-            #Randomly choose a codon from the list of codons for the amino acid:
+            # Randomly choose a codon from the list of codons for the amino acid:
             codonarr.append(random.choice(aa2codons(i)[0][0].split()))
         masterlist.append(codonarr)
         # With our new codon array, calculate the CAI:
@@ -149,6 +154,7 @@ def aa2codons(seq : str) -> list:
         print(curr_iteration)
 
     # Write the codon array to a file:
-    record.seq = Seq(re.sub('[^GATC]',"",str("".join(masterlist[bestcai])).upper()))
+    record.seq = Seq(re.sub('[^GATC]', "", str(
+        "".join(masterlist[bestcai])).upper()))
     complete_name = os.path.join(out_dir, name)
-    SeqIO.write(record, complete_name + ".fasta", "fasta")
+    SeqIO.write(record, complete_name + ".fasta", "fasta")
diff --git a/tool/optimizers/naive_optimizer.py b/tool/optimizers/naive_optimizer.py
@@ -1,63 +1,66 @@
-#import standard modules; re is used for regex on ln 59
+# import standard modules; re is used near the end of the script to strip non-GATC characters from the output sequence
 import os
 from Bio import SeqIO
 from Bio.Seq import Seq
 import random
 import numpy as np
 import re
 
-#frequencies are from ecoli_codon_frequencies.xlsx file in summaries dir
-#create dict with value being a tuple with the codons and their probabilities/frequncies
+# frequencies are from ecoli_codon_frequencies.xlsx file in summaries dir
+# create dict with value being a tuple with the codons and their probabilities/frequencies
 frequency = {
-    "A": (["GCG", "GCA", "GCT", "GCC"],[0.34, 0.22, 0.17, 0.27]),
-    "R": (["AGG", "AGA", "CGG","CGA","CGT","CGC"],[0.03, 0.05, 0.1, 0.07, 0.37, 0.38]),
-    "N": (["AAT", "AAC"],[0.46, 0.54]),
-    "D": (["GAT","GAC"],[0.63,0.37]),
-    "C": (["TGT","TGC"],[0.45,0.55]),
-    "*": (["TGA","TAG","TAA"],[0.3,0.08,0.62]),
-    "Q": (["CAG","CAA"],[0.66,0.34]),
-    "E": (["GAG","GAA"],[0.32,0.68]),
-    "G": (["GGG","GGA","GGT","GGC"],[0.15,0.12,0.34,0.39]),
-    "H": (["CAT","CAC"],[0.57,0.43]),
-    "I": (["ATA","ATT","ATC"],[0.09,0.5,0.41]),
-    "L": (["TTG","TTA","CTG","CTA","CTT","CTC"],[0.13,0.13,0.49,0.04,0.11,0.1]),
-    "K": (["AAG","AAG"],[0.25,0.75]),
-    "M": (["ATG"],[1.0]),
-    "F": (["TTT","TTC"],[0.57,0.43]),
-    "P": (["CCG","CCA","CCT","CCC"],[0.51,0.2,0.17,0.12]),
-    "S": (["AGT","AGC","TCG","TCA","TCT","TCC"],[0.15,0.26,0.15,0.13,0.16,0.15]),
-    "T": (["ACG","ACA","ACT","ACC"],[0.26,0.15,0.18,0.42]),
-    "W": (["TGG"],[1.0]),
-    "Y": (["TAT","TAC"],[0.58,0.42]),
-    "V": (["GTG","GTA","GTT","GTC"],[0.36,0.16,0.27,0.21])
+    "A": (["GCG", "GCA", "GCT", "GCC"], [0.34, 0.22, 0.17, 0.27]),
+    "R": (["AGG", "AGA", "CGG", "CGA", "CGT", "CGC"], [0.03, 0.05, 0.1, 0.07, 0.37, 0.38]),
+    "N": (["AAT", "AAC"], [0.46, 0.54]),
+    "D": (["GAT", "GAC"], [0.63, 0.37]),
+    "C": (["TGT", "TGC"], [0.45, 0.55]),
+    "*": (["TGA", "TAG", "TAA"], [0.3, 0.08, 0.62]),
+    "Q": (["CAG", "CAA"], [0.66, 0.34]),
+    "E": (["GAG", "GAA"], [0.32, 0.68]),
+    "G": (["GGG", "GGA", "GGT", "GGC"], [0.15, 0.12, 0.34, 0.39]),
+    "H": (["CAT", "CAC"], [0.57, 0.43]),
+    "I": (["ATA", "ATT", "ATC"], [0.09, 0.5, 0.41]),
+    "L": (["TTG", "TTA", "CTG", "CTA", "CTT", "CTC"], [0.13, 0.13, 0.49, 0.04, 0.11, 0.1]),
+    "K": (["AAG", "AAG"], [0.25, 0.75]),
+    "M": (["ATG"], [1.0]),
+    "F": (["TTT", "TTC"], [0.57, 0.43]),
+    "P": (["CCG", "CCA", "CCT", "CCC"], [0.51, 0.2, 0.17, 0.12]),
+    "S": (["AGT", "AGC", "TCG", "TCA", "TCT", "TCC"], [0.15, 0.26, 0.15, 0.13, 0.16, 0.15]),
+    "T": (["ACG", "ACA", "ACT", "ACC"], [0.26, 0.15, 0.18, 0.42]),
+    "W": (["TGG"], [1.0]),
+    "Y": (["TAT", "TAC"], [0.58, 0.42]),
+    "V": (["GTG", "GTA", "GTT", "GTC"], [0.36, 0.16, 0.27, 0.21])
 }
 
 # Amino acid sequence dir to optimize:
 # hardcoded path
-aa_dir = os.path.join(os.getcwd(),'benchmark_sequences','aa')
+aa_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'aa')
 
 # Output dir to store optimized seqs:
 # hardcoded path
-out_dir = os.path.join(os.getcwd(),'benchmark_sequences','naive')
+out_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'naive')
 
 
 # Normalize probabilities for frequency if sum is not exactly 1.
-def fix_p( p):
+def fix_p(p):
     if p.sum() != 1.0:
         p = p*(1./p.sum())
     return p
 
+
 for entry in os.scandir(aa_dir):
     name = entry.replace("_aa.fasta", "_dna")
 
     # Replace ambiguities with amino acids from IUPAC guidelines: https://www.bioinformatics.org/sms/iupac.html
-    record = SeqIO.read(entry,"fasta")
-    seq = record.seq.replace("B", random.choice(["D","N"])).replace("Z", random.choice(["E", "Q"]))
+    record = SeqIO.read(entry, "fasta")
+    seq = record.seq.replace("B", random.choice(["D", "N"])).replace(
+        "Z", random.choice(["E", "Q"]))
     seq_arr = []
     for aa in seq:
-        #append to the array a random choice of codon using the probabilities given (p)
-        seq_arr.append(np.random.choice(frequency[aa][0],p=fix_p(np.asarray(frequency[aa][1]))))
-    
-    record.seq = Seq(re.sub('[^GATC]',"",str("".join(seq_arr)).upper()))
+        # append to the array a random choice of codon using the probabilities given (p)
+        seq_arr.append(np.random.choice(
+            frequency[aa][0], p=fix_p(np.asarray(frequency[aa][1]))))
+
+    record.seq = Seq(re.sub('[^GATC]', "", str("".join(seq_arr)).upper()))
     complete_name = os.path.join(out_dir, name)
-    SeqIO.write(record, complete_name + ".fasta", "fasta")
+    SeqIO.write(record, complete_name + ".fasta", "fasta")
diff --git a/tool/optimizers/super_naive_optimizer.py b/tool/optimizers/super_naive_optimizer.py
@@ -8,14 +8,16 @@
 from Bio.Seq import Seq
 
 # Amino acid sequence dir to optimize:
-aa_dir = os.path.join(os.getcwd(),'benchmark_sequences','aa')
+aa_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'aa')
 
 
 # Output dir to store optimized seqs:
-out_dir = os.path.join(os.getcwd(),'benchmark_sequences','super_naive')
+out_dir = os.path.join(os.getcwd(), 'benchmark_sequences', 'super_naive')
 
 # Amino acid to codon table, outputs arr of codons:
-def aa2codons(seq : str) -> list:
+
+
+def aa2codons(seq: str) -> list:
     _aas = {
         "A": ["GCT GCC GCA GCG"],
         "R": ["CGT CGC CGA CGG AGA AGG"],
@@ -43,19 +45,20 @@ def aa2codons(seq : str) -> list:
     }
     return [_aas[i] for i in seq]
 
+
 # Converts an amino acid to a random corresponding codon:
 for entry in os.scandir(aa_dir):
     # Read in the amino acid sequence:
     name = entry.name[0:-9] + "_dna"
-    record = SeqIO.read(entry,'fasta')
+    record = SeqIO.read(entry, 'fasta')
     arr = []
     # Convert amino acid to codons:
     for i in record.seq:
-        #Randomly choose a codon from the list of codons for the amino acid:
+        # Randomly choose a codon from the list of codons for the amino acid:
         arr.append(random.choice(aa2codons(i)[0][0].split()))
 
     # Convert the array of codons to a string:
-    record.seq = Seq(re.sub('[^GATC]',"",str("".join(arr)).upper()))
+    record.seq = Seq(re.sub('[^GATC]', "", str("".join(arr)).upper()))
     complete_name = os.path.join(out_dir, name)
 
     # Save the super naively optimized DNA sequence:
diff --git a/tool/scripts/csv_to_seqs.py b/tool/scripts/csv_to_seqs.py
@@ -16,10 +16,10 @@
 from Bio.SeqRecord import SeqRecord
 from Bio.Seq import Seq
 
-#output directory to write sequences
+# output directory to write sequences
 out_dir = r"..\..\benchmark_sequences\genscript"
 
-#iterate through the csv file and write sequences to the output directory
+# iterate through the csv file and write sequences to the output directory
 with open('optimum_seqs.csv', newline='') as csvfile:
     data = list(csv.reader(csvfile))
     for i in data:
@@ -31,4 +31,4 @@
             description="blank",
         )
         print(record)
-        SeqIO.write(record,os.path.join(out_dir, file_name),'fasta')
+        SeqIO.write(record, os.path.join(out_dir, file_name), 'fasta')
diff --git a/tool/scripts/reformat_seqs.py b/tool/scripts/reformat_seqs.py
@@ -13,25 +13,28 @@
 dna_directory = r"..\..\benchmark_sequences\dna"
 
 # Print the number of files in each directory.
-print("There are %d files in the amino acid directory." % len(os.listdir(aa_directory)))
-print("There are %d files in the DNA directory." % len(os.listdir(dna_directory)))
+print("There are %d files in the amino acid directory." %
+      len(os.listdir(aa_directory)))
+print("There are %d files in the DNA directory." %
+      len(os.listdir(dna_directory)))
 
 # Iterate over each file in the directory.
 for entry in os.scandir(aa_directory):
     record = SeqIO.read(entry, "fasta")
     SeqIO.write(record, entry, "fasta")
-    #Although this does not change the actual sequence, it will reformat it with a fixed spacing (makes seqs more legible).
+    # Although this does not change the actual sequence, it will reformat it with a fixed spacing (makes seqs more legible).
 
 # Iterate over each file in the directory.
 for entry in os.scandir(dna_directory):
     record = SeqIO.read(entry, "fasta")
 
-    #Just in case, replace ambigious codons with the corresponding IUPAC ones:
-    record.seq = record.seq.replace('K',random.choice(['G','T'])).replace('M',random.choice(['A','C'])).replace('N',random.choice(['A','C','G','T'])).replace('R',random.choice(['A','G'])).replace('W',random.choice(['A','T'])).replace('Y',random.choice(['C','T']))
+    # Just in case, replace ambiguous codons with the corresponding IUPAC ones:
+    record.seq = record.seq.replace('K', random.choice(['G', 'T'])).replace('M', random.choice(['A', 'C'])).replace('N', random.choice(
+        ['A', 'C', 'G', 'T'])).replace('R', random.choice(['A', 'G'])).replace('W', random.choice(['A', 'T'])).replace('Y', random.choice(['C', 'T']))
     SeqIO.write(record, entry, "fasta")
 
     if len(record.seq) % 3:
-        #all CDS should be divisible by three because they are all in frame.
+        # all CDS should be divisible by three because they are all in frame.
         print(f"Warning {entry.name} (sequence #{num}) was not divisible by three. All CDS should be divisible by three. The ICOR optimizer will perform best on sequences of that length."
     else:
         continue

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+{`
	`2`	`+ "python.linting.enabled": true`
	`3`	`+}`