Code Review

ninjha01 · ninjha01 · commit 2db1ba5d40f3 · 2021-08-31T10:22:55.000-04:00
diff --git a/README.md b/README.md
@@ -25,6 +25,20 @@
 - [Dependencies](#Dependencies)
 ---
 
+### Quickstart
+I really like having a quickstart section that gives me a single command to install prereqs, a single command to run all tests (if any), and a single command to run the application. Something like:
+
+```bash
+# Install prereqs
+pip install -r requirements.txt # or an install_prereqs.sh script if you have more diverse dependencies
+
+# run tests (if you decided to add tests in the future)
+pytest
+
+# run models
+python ./tool/optimizers/brute_force_optimizer.py
+```
+
 ### About
 In protein sequences—as there are 61 sense codons but only 20 standard amino acids—most amino acids are encoded by more than one codon. Although such synonymous codons do not alter the encoded amino acid sequence, their selection can dramatically affect the production of the resulting protein. Codon optimization of synthetic DNA sequences for maximum expression is an important segment of heterologous expression. However, existing solutions are primarily based on choosing high-frequency codons only, neglecting the important effects of rare codons. In this paper, we propose a novel recurrent-neural-network (RNN) based codon optimization tool, ICOR, that aims to learn codon usage bias on a genomic dataset of Escherichia coli. We compile a dataset of over 42,000 non-redundant, robust genes that are used for deep learning. The model uses a bidirectional long short-term memory-based architecture, allowing for the sequential information of genes to be learnt. Our tool can predict synonymous codons for synthetic genes towards optimal expression in E. coli. We demonstrate that sequential context achieved via RNN may yield codon selection that is more similar to the host genome, therefore improving protein expression more than frequency-based approaches. On a benchmark set of over 40 select DNA sequences, ICOR tool improved the codon adaptation index by 41.69% compared to the original sequence. Our resulting algorithm is provided as an open-source software package along with the benchmark set of sequences.
 
diff --git a/tool/optimizers/brute_force_optimizer.py b/tool/optimizers/brute_force_optimizer.py
@@ -3,6 +3,7 @@
 Goal of this is to find a combination of codons to maximize CAI (achieve 1.0 CAI).
 '''
 
+# Shouldn't hardcode profiling code, pass a flag to turn on profiling
 import timeit
 
 start = timeit.default_timer()
@@ -18,6 +19,9 @@
 import re
 
 # Set input AA sequence directory and output for writing brute sequences
+# Shouldn't hard code these - should be relative paths like "../../benchmark_sequences/aa"
+# Also, I think most scientists will be using UNIX-like OSes.
+# I'm not super familiar with it, but have you tried running this in WSL to check for compatibility?
 aa_dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\aa"
 out_dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\brute"
 
@@ -129,13 +133,18 @@ def aa2codons(seq : str) -> list:
 # Converts an amino acid to a random corresponding codon:
 for entry in os.scandir(aa_dir):
     # Read in the amino acid sequence:
+
+    # I'm guessing this is to strip the _aa.fasta, perhaps replace it with something like
+    # name = entry.name.replace("_aa.fasta", "_dna")
+    # to be more explicit
     name = entry.name[0:-9] + "_dna"
     record = SeqIO.read(entry,'fasta')
     masterlist = []
     bestcai = 0
     curcai = 0
     z = 0
-    while (z < 100000):
+    # What's the significance of 100000? Could we give it a descriptive name?
+    while z < 100000:
         codonarr = []
         # Convert amino acid to codons:
         for i in record.seq:
@@ -150,6 +159,13 @@ def aa2codons(seq : str) -> list:
             print('new best cai ' + str(cai))
         z += 1
         print(z)
+
+    # ⬆
+    # Style nit, but it would be more pythonic to write
+    # TOTAL_ITERATIONS = 100000
+    # for curr_iteration in range(0, TOTAL_ITERATIONS):
+    #    ...
+
     # Write the codon array to a file:
     record.seq = Seq(re.sub('[^GATC]',"",str("".join(masterlist[bestcai])).upper()))
     complete_name = os.path.join(out_dir, name)
diff --git a/tool/optimizers/icor_optimizer.py b/tool/optimizers/icor_optimizer.py
@@ -1,4 +1,5 @@
 # Define variables (must change!)
+# Good idea! - you can use relative paths as described in ./brute_force_optimizer.py
 model_path = r"C:\Users\risha\Desktop\icor-codon-optimization\tool\models\icor.onnx"
 
 # Import packages
@@ -7,10 +8,12 @@
 import sys
 import onnxruntime as rt
 import numpy as np
+from typing import List
 
 type = input("Welcome to ICOR! Are you optimizing an amino acid sequence (enter in 'aa' below) or a dna/codon sequence (enter in 'dna' below)?\n\n").strip().upper()
 input_seq = input(
     "Enter the coding sequence only.\nEnter in 'demo' to use demo sequence.\n\n").strip().upper()
+# 'type' is a builtin function in python - I'd recommend renaming the var to sequence_type to avoid reassigning it
 
 # Load demo sequence (AKT1 amino acid seq)
 if type == 'AA':
@@ -31,13 +34,18 @@
     # ICOR accepts the amino acid sequence, so we translate the DNA sequence to amino acid sequence:
     input_seq = Seq(input_seq)
     input_seq = input_seq.translate()
+# It's good to handle all cases of your if/elif. Something like
+# else:
+#     sys.exit(f"Invalid sequence type {sequence_type}. Expected 'aa' or 'dna'")
+
 
 print(input_seq)
 # Define categorical labels from when model was trained.
 labels = ['AAA', 'AAC','AAG','AAT','ACA','ACG','ACT','AGC','ATA','ATC','ATG','ATT','CAA','CAC','CAG','CCG','CCT','CTA','CTC','CTG','CTT','GAA','GAT','GCA','GCC','GCG','GCT','GGA','GGC','GTC','GTG','GTT','TAA','TAT','TCA','TCG','TCT','TGG','TGT','TTA','TTC','TTG','TTT','ACC','CAT','CCA','CGG','CGT','GAC','GAG','GGT','AGT','GGG','GTA','TGC','CCC','CGA','CGC','TAC','TAG','TCC','AGA','AGG','TGA']
 
 # Define aa to integer table
-def aa2int(seq: str) -> list:
+# Your 'seq: str' type definition is broken by your reassignment of 'str' below
+def aa2int(seq: str) -> List[int]:
     _aa2int = {
         'A': 1,
         'R': 2,
@@ -77,6 +85,7 @@ def aa2int(seq: str) -> list:
 
 # One-hot encode the amino acid sequence:
 i = 0
+# style nit: more pythonic to write: for i, aa_index in enumerate(aa_placement):
 while i < len(aa_placement):
     oh_array[aa_placement[i]-1, i] = 1
     i += 1
@@ -100,7 +109,9 @@ def aa2int(seq: str) -> list:
 for pred in pred_onx[0]:
     pred_indices.append(np.argmax(pred))
 
-str = ""
+# Likewise, 'str' is a builtin type in python
+# I'd rename to 'out_str' or the like (the remaining uses of 'str' below need the same rename)
+out_str = ""
 for index in pred_indices:
     str += labels[index]
 print('==== OUTPUT ====\n' + str)
@@ -114,3 +125,16 @@ def aa2int(seq: str) -> list:
     print('\nOutput written to output.txt')
 else:
     print('\nNo output written. Done!')
+# should catch cases explicitly
+# like:
+# while True:
+#     if output == "Y":
+#         with open("output.txt", "w") as f:
+#             f.write(out_str)
+#         print("\nOutput written to output.txt")
+#         break
+#     elif output == "N":
+#         print("\nNo output written. Done!")
+#         break
+#     else:
+#         output = input("Error! Expected Y/N\n").strip().upper()  # re-prompt so the loop can terminate
diff --git a/tool/optimizers/naive_optimizer.py b/tool/optimizers/naive_optimizer.py
@@ -33,9 +33,11 @@
 }
 
 # Amino acid sequence dir to optimize:
+# hardcoded path
 aa_dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\aa"
 
 # Output dir to store optimized seqs:
+# hardcoded path
 out_dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\naive"
 
 
@@ -49,6 +51,7 @@ def fix_p( p):
     name = entry.name[0:-9] + "_dna"
 
     # Replace ambiguities with amino acids from IUPAC guidelines.
+    # Might be nice to have a link to the guidelines?
     record = SeqIO.read(entry,"fasta")
     seq = record.seq.replace("B", random.choice(["D","N"])).replace("Z", random.choice(["E", "Q"]))
     seq_arr = []
diff --git a/tool/optimizers/super_naive_optimizer.py b/tool/optimizers/super_naive_optimizer.py
@@ -8,9 +8,11 @@
 from Bio.Seq import Seq
 
 # Amino acid sequence dir to optimize:
+# hardcoded path
 aa_dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\aa"
 
 # Output dir to store optimized seqs:
+# hardcoded path
 out_dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\super_naive"
 
 # Amino acid to codon table, outputs arr of codons:
@@ -58,4 +60,4 @@ def aa2codons(seq : str) -> list:
     complete_name = os.path.join(out_dir, name)
 
     # Save the super naively optimized DNA sequence:
-    SeqIO.write(record, complete_name + ".fasta", "fasta")
+    SeqIO.write(record, complete_name + ".fasta", "fasta")
diff --git a/tool/scripts/convert_to_cds.ipynb b/tool/scripts/convert_to_cds.ipynb
@@ -73,6 +73,8 @@
     "for entry in os.scandir(dir):\r\n",
     "    record = SeqIO.read(entry,'fasta')\r\n",
     "    if record.name.startswith(\"NM\"):\r\n",
+    "        # this is fine, but recent python versions support f-strings\n",
+    "        # https://realpython.com/python-f-strings/#f-strings-a-new-and-improved-way-to-format-strings-in-python\n",
     "        browser.get(\"https://www.ncbi.nlm.nih.gov/nuccore/%s\" % record.name)\r\n",
     "        time.sleep(1)\r\n",
     "        cds = browser.find_elements_by_class_name('feature')\r\n",
@@ -141,6 +143,7 @@
     "from Bio.Seq import Seq\r\n",
     "import os\r\n",
     "\r\n",
+    "# hardcoded paths\n",
     "dna_dir = r\"C:\\Users\\risha\\Desktop\\icor-codon-optimization\\benchmark_sequences\\dna\"\r\n",
     "aa_dir = r\"C:\\Users\\risha\\Desktop\\icor-codon-optimization\\benchmark_sequences\\aa\"\r\n",
     "\r\n",
diff --git a/tool/scripts/csv_to_seqs.py b/tool/scripts/csv_to_seqs.py
@@ -17,6 +17,7 @@
 from Bio.Seq import Seq
 
 #output directory to write sequences
+# hardcoded path
 out_dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\genscript"
 
 #iterate through the csv file and write sequences to the output directory
@@ -31,4 +32,4 @@
             description="blank",
         )
         print(record)
-        SeqIO.write(record,os.path.join(out_dir, file_name),'fasta')
+        SeqIO.write(record,os.path.join(out_dir, file_name),'fasta')
diff --git a/tool/scripts/reformat_seqs.py b/tool/scripts/reformat_seqs.py
@@ -9,6 +9,7 @@
 import random
 
 # Change this to the directory where your files are stored.
+# hardcoded path
 aa_directory = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\aa"
 dna_directory = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\dna"
 
@@ -32,5 +33,6 @@
     #if there are sequences that are not divisible by three, then truncate them:
     num = len(record.seq) % 3
     print("Warning: truncated" + entry.name + num)
+    # I'd suggest checking for this explicitly in your code and showing the user this warning/error
     #warning: if sequences are being truncated, they are likely not formatted correctly.
-    #all CDS should be divisible by three because they are all in frame.
+    #all CDS should be divisible by three because they are all in frame.
diff --git a/tool/scripts/run_benchmark.ipynb b/tool/scripts/run_benchmark.ipynb
@@ -113,6 +113,7 @@
     "#benchmark_sequences\\naive contains the naive sequences so it was used for those.\r\n",
     "#benchmark_sequences\\ICOR contains the ICOR sequences so it was used for those.\r\n",
     "\r\n",
+    "# hardcoded paths\n",
     "dir = r\"C:\\Users\\risha\\Desktop\\icor-codon-optimization\\benchmark_sequences\\brute\"\r\n",
     "summary_name = \"brute\"\r\n",
     "\r\n",
diff --git a/tool/scripts/run_icor_from_mat.ipynb b/tool/scripts/run_icor_from_mat.ipynb
@@ -33,6 +33,7 @@
     "import numpy\r\n",
     "import onnxruntime as rt\r\n",
     "import mat73\r\n",
+    "# hardcoded paths\n",
     "train_data = mat73.loadmat(r'C:\\Users\\risha\\Desktop\\ICOR MATLAB\\allseqsworkspace71921.mat')\r\n",
     "\r\n",
     "# Define categorical labels from when model was trained.\r\n",