Skip to content

Commit cb95d50

Browse files
author
RJain12
committed
comment + clean-up
1 parent a054768 commit cb95d50

7 files changed

Lines changed: 325 additions & 275 deletions

File tree

scripts/convert2cds.ipynb

Lines changed: 56 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,20 @@
2727
{
2828
"cell_type": "code",
2929
"execution_count": 57,
30-
"metadata": {},
30+
"source": [
31+
"# import modules\r\n",
32+
"from selenium import webdriver\r\n",
33+
"from webdriver_manager.chrome import ChromeDriverManager\r\n",
34+
"from selenium.webdriver.support.select import Select\r\n",
35+
"import os\r\n",
36+
"from Bio import SeqIO\r\n",
37+
"import time\r\n",
38+
"import numpy as np\r\n",
39+
"\r\n",
40+
"#init webdriver + selenium opts (must have chrome installed on machine)\r\n",
41+
"chromeOptions = webdriver.ChromeOptions()\r\n",
42+
"browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chromeOptions)"
43+
],
3144
"outputs": [
3245
{
3346
"output_type": "stream",
@@ -44,25 +57,34 @@
4457
]
4558
}
4659
],
47-
"source": [
48-
"# import modules\n",
49-
"from selenium import webdriver\n",
50-
"from webdriver_manager.chrome import ChromeDriverManager\n",
51-
"from selenium.webdriver.support.select import Select\n",
52-
"import os\n",
53-
"from Bio import SeqIO\n",
54-
"import time\n",
55-
"import numpy as np\n",
56-
"\n",
57-
"#init webdriver + selenium opts\n",
58-
"chromeOptions = webdriver.ChromeOptions()\n",
59-
"browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chromeOptions)"
60-
]
60+
"metadata": {}
6161
},
6262
{
6363
"cell_type": "code",
6464
"execution_count": 71,
65-
"metadata": {},
65+
"source": [
66+
"'''\r\n",
67+
"Convert DNA seqs to include CDS only\r\n",
68+
"Heads to ncbi nuccore webpage for each sequence.\r\n",
69+
"Grabs the \"CDS\" portion and rewrites the sequences for that portion only.\r\n",
70+
"'''\r\n",
71+
"dir = r\"C:\\Users\\risha\\Desktop\\icor-codon-optimization\\benchmark_sequences\\dna\"\r\n",
72+
"x = 1\r\n",
73+
"for entry in os.scandir(dir):\r\n",
74+
" record = SeqIO.read(entry,'fasta')\r\n",
75+
" if record.name.startswith(\"NM\"):\r\n",
76+
" browser.get(\"https://www.ncbi.nlm.nih.gov/nuccore/%s\" % record.name)\r\n",
77+
" time.sleep(1)\r\n",
78+
" cds = browser.find_elements_by_class_name('feature')\r\n",
79+
" for i in cds:\r\n",
80+
" list = i.text.split()\r\n",
81+
" if list[0] == 'CDS':\r\n",
82+
" arr=list[1].split('..')\r\n",
83+
" x = x+1\r\n",
84+
" record.seq = record.seq[int(arr[0])-1:int(arr[1])]\r\n",
85+
" SeqIO.write(record,entry,\"fasta\")\r\n",
86+
" print(\"Finished writing #\" + x)"
87+
],
6688
"outputs": [
6789
{
6890
"output_type": "stream",
@@ -105,96 +127,31 @@
105127
]
106128
}
107129
],
108-
"source": [
109-
"'''\n",
110-
"Convert DNA seqs to include CDS only\n",
111-
"'''\n",
112-
"dir = r\"C:\\Users\\risha\\Desktop\\icor-codon-optimization\\benchmark_sequences\\dna\"\n",
113-
"x = 1\n",
114-
"for entry in os.scandir(dir):\n",
115-
" record = SeqIO.read(entry,'fasta')\n",
116-
" if record.name.startswith(\"NM\"):\n",
117-
" browser.get(\"https://www.ncbi.nlm.nih.gov/nuccore/%s\" % record.name)\n",
118-
" time.sleep(1)\n",
119-
" cds = browser.find_elements_by_class_name('feature')\n",
120-
" for i in cds:\n",
121-
" list = i.text.split()\n",
122-
" if list[0] == 'CDS':\n",
123-
" arr=list[1].split('..')\n",
124-
" print(x, \"====\")\n",
125-
" x = x+1\n",
126-
" record.seq = record.seq[int(arr[0])-1:int(arr[1])]\n",
127-
" SeqIO.write(record,entry,\"fasta\")"
128-
]
130+
"metadata": {}
129131
},
130132
{
131133
"cell_type": "code",
132134
"execution_count": 80,
133-
"metadata": {},
134-
"outputs": [],
135135
"source": [
136-
"'''\n",
137-
"Convert DNA seqs to AA\n",
138-
"'''\n",
139-
"\n",
140-
"from Bio.Seq import Seq\n",
141-
"import os\n",
142-
"\n",
143-
"dna_dir = r\"C:\\Users\\risha\\Desktop\\icor-codon-optimization\\benchmark_sequences\\dna\"\n",
144-
"aa_dir = r\"C:\\Users\\risha\\Desktop\\icor-codon-optimization\\benchmark_sequences\\aa\"\n",
145-
"\n",
146-
"for entry in os.scandir(dna_dir):\n",
147-
" record = SeqIO.read(entry,'fasta')\n",
148-
" record.seq = record.translate().seq\n",
149-
"\n",
136+
"'''\r\n",
137+
"Convert DNA seqs to AA, given newly CDS-only seqs.\r\n",
138+
"Write these AA seqs into record files.\r\n",
139+
"'''\r\n",
140+
"\r\n",
141+
"from Bio.Seq import Seq\r\n",
142+
"import os\r\n",
143+
"\r\n",
144+
"dna_dir = r\"C:\\Users\\risha\\Desktop\\icor-codon-optimization\\benchmark_sequences\\dna\"\r\n",
145+
"aa_dir = r\"C:\\Users\\risha\\Desktop\\icor-codon-optimization\\benchmark_sequences\\aa\"\r\n",
146+
"\r\n",
147+
"for entry in os.scandir(dna_dir):\r\n",
148+
" record = SeqIO.read(entry,'fasta')\r\n",
149+
" record.seq = record.translate().seq\r\n",
150+
"\r\n",
150151
" SeqIO.write(record,os.path.join(aa_dir, entry.name[0:-9] + \"aa.fasta\"),\"fasta\")"
151-
]
152-
},
153-
{
154-
"cell_type": "code",
155-
"execution_count": 81,
156-
"metadata": {},
157-
"outputs": [
158-
{
159-
"output_type": "execute_result",
160-
"data": {
161-
"text/plain": [
162-
"1467"
163-
]
164-
},
165-
"metadata": {},
166-
"execution_count": 81
167-
}
168152
],
169-
"source": [
170-
"str = \"\"\"ATGTCCGATGTTGCCATTGTAAAGGAAGGGTGGTTACACAAGAGGGGCGAATACATTAAG\n",
171-
"ACTTGGCGCCCGCGATACTTTCTGCTGAAGAACGATGGGACCTTTATCGGTTACAAGGAA\n",
172-
"CGCCCGCAGGATGTTGACCAGCGTGAAGCGCCGCTGAACAACTTCTCTGTGGCGCAGTGC\n",
173-
"CAGTTGATGAAGACCGAACGCCCGCGTCCAAACACCTTCATAATCCGTTGTCTTCAATGG\n",
174-
"ACGACGGTGATTGAGAGAACGTTTCATGTGGAGACCCCGGAAGAGCGCGAGGAGTGGACC\n",
175-
"ACGGCCATTCAGACCGTCGCTGACGGCTTAAAGAAGCAGGAAGAAGAAGAAATGGACTTT\n",
176-
"CGCTCCGGCAGCCCCTCCGATAATTCTGGTGCGGAAGAGATGGAAGTAAGTCTTGCAAAG\n",
177-
"CCGAAGCACAGAGTTACGATGAACGAGTTCGAGTATCTGAAGCTCCTGGGCAAGGGAACG\n",
178-
"TTCGGCAAGGTCATCCTGGTCAAGGAGAAGGCCACCGGGCGCTACTACGCCATGAAGATC\n",
179-
"CTGAAGAAGGAGGTGATAGTGGCCAAGGACGAGGTTGCCCACACCCTGACTGAGAACCGG\n",
180-
"GTTCTGCAGAACTCTAGACACCCATTCCTTACCGCCCTCAAGTACAGCTTCCAAACCCAC\n",
181-
"GATCGTCTGTGTTTCGTTATGGAGTATGCGAATGGCGGCGAGCTGTTCTTTCATCTCTCC\n",
182-
"CGCGAACGCGTTTTTAGCGAAGACAGAGCAAGGTTTTACGGGGCGGAGATTGTCAGCGCG\n",
183-
"CTAGATTATTTACACTCTGAAAAGAACGTTGTGTACCGTGATCTGAAGCTGGAAAACTTA\n",
184-
"ATGTTGGATAAGGACGGTCATATTAAGATTACCGATTTTGGCCTGTGTAAGGAAGGTATT\n",
185-
"AAGGATGGTGCGACGATGAAGACCTTTTGCGGCACCCCGGAATATTTGGCTCCGGAAGTA\n",
186-
"TTGGAGGATAATGATTACGGTCGAGCCGTTGATTGGTGGGGCCTGGGTGTAGTCATGTAT\n",
187-
"GAAATGATGTGCGGACGCTTACCTTTCTATAATCAGGACCATGAAAAGCTGTTTGAGCTG\n",
188-
"ATTCTGATGGAGGAAATTCGTTTTCCGCGTACCCTGGGACCAGAAGCGAAGAGCCTGCTT\n",
189-
"TCAGGGCTGTTGAAGAAGGACCCCAAGCAACGCTTAGGCGGTGGCTCGGAAGATGCCAAG\n",
190-
"GAAATTATGCAGCACCGTTTCTTTGCGGGGATCGTCTGGCAGCACGTGTATGAAAAGAAG\n",
191-
"TTAAGTCCACCCTTCAAGCCTCAGGTAACGTCGGAAACTGACACCAGATATTTTGATGAA\n",
192-
"GAATTCACGGCACAGATGATCACAATTACGCCGCCGGATCAGGATGACTCTATGGAATGT\n",
193-
"GTTGATAGCGAACGCAGACCCCATTTTCCACAATTTAGTTACTCAGCGTCTGGCACCGCG\n",
194-
"TGA\"\"\"\n",
195-
"\n",
196-
"len(str)"
197-
]
153+
"outputs": [],
154+
"metadata": {}
198155
}
199156
]
200157
}

scripts/csv_2_seqs.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,20 @@
66
file_name | seq
77
...
88
9+
The script will convert the CSV to sequences that will be written into an output directory which can be specified below.
910
'''
1011

12+
#import modules
1113
import csv
1214
from Bio import SeqIO
1315
import os
1416
from Bio.SeqRecord import SeqRecord
1517
from Bio.Seq import Seq
1618

19+
#output directory to write sequences
1720
out_dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\genscript"
1821

22+
#iterate through the csv file and write sequences to the output directory
1923
with open('optimum_seqs.csv', newline='') as csvfile:
2024
data = list(csv.reader(csvfile))
2125
for i in data:

scripts/import_onnx.ipynb

Lines changed: 128 additions & 59 deletions
Large diffs are not rendered by default.

scripts/naive_optimizer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#import standard modules; re is used for regex on ln 59
12
import os
23
from Bio import SeqIO
34
from Bio.Seq import Seq

scripts/reformat_seqs.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,34 @@
33
# This script simply opens each of the files in a directory, and re-writes them using the SeqIO function.
44
# This script does not change the sequence itself, but helps reformat them by writing each to lines with equal lengths.
55

6+
# Import all necessary modules here
67
import os
78
from Bio import SeqIO
89
import random
910

11+
# Change this to the directory where your files are stored.
1012
aa_directory = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\aa"
1113
dna_directory = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\dna"
1214

15+
# Print the number of files in each directory.
1316
print("There are %d files in the amino acid directory." % len(os.listdir(aa_directory)))
1417
print("There are %d files in the DNA directory." % len(os.listdir(dna_directory)))
1518

19+
# Iterate over each file in the directory.
1620
for entry in os.scandir(aa_directory):
1721
record = SeqIO.read(entry, "fasta")
1822
SeqIO.write(record, entry, "fasta")
23+
#Although this does not change the actual sequence, it will reformat it with a fixed spacing (makes seqs more legible).
1924

25+
# Iterate over each file in the directory.
2026
for entry in os.scandir(dna_directory):
2127
record = SeqIO.read(entry, "fasta")
2228

2329
#Just in case, replace ambiguous IUPAC nucleotide codes (K, M, N, R, W, Y) with a randomly chosen base that each code can represent:
2430
record.seq = record.seq.replace('K',random.choice(['G','T'])).replace('M',random.choice(['A','C'])).replace('N',random.choice(['A','C','G','T'])).replace('R',random.choice(['A','G'])).replace('W',random.choice(['A','T'])).replace('Y',random.choice(['C','T']))
2531

32+
#if there are sequences that are not divisible by three, then truncate them:
2633
num = len(record.seq) % 3
27-
print(num)
34+
print("Warning: truncated" + entry.name + num)
35+
#warning: if sequences are being truncated, they are likely not formatted correctly.
36+
#all CDS should be divisible by three because they are all in frame.

0 commit comments

Comments
 (0)