2727 {
2828 "cell_type" : " code" ,
2929 "execution_count" : 57 ,
30- "metadata" : {},
30+ "source" : [
31+ " # import modules\r\n " ,
32+ " from selenium import webdriver\r\n " ,
33+ " from webdriver_manager.chrome import ChromeDriverManager\r\n " ,
34+ " from selenium.webdriver.support.select import Select\r\n " ,
35+ " import os\r\n " ,
36+ " from Bio import SeqIO\r\n " ,
37+ " import time\r\n " ,
38+ " import numpy as np\r\n " ,
39+ " \r\n " ,
40+ " #init webdriver + selenium opts (must have chrome installed on machine); options= replaces the deprecated chrome_options= keyword\r\n " ,
41+ " chromeOptions = webdriver.ChromeOptions()\r\n " ,
42+ " browser = webdriver.Chrome(ChromeDriverManager().install(), options=chromeOptions)"
43+ ],
3144 "outputs" : [
3245 {
3346 "output_type" : " stream" ,
4457 ]
4558 }
4659 ],
47- "source" : [
48- " # import modules\n " ,
49- " from selenium import webdriver\n " ,
50- " from webdriver_manager.chrome import ChromeDriverManager\n " ,
51- " from selenium.webdriver.support.select import Select\n " ,
52- " import os\n " ,
53- " from Bio import SeqIO\n " ,
54- " import time\n " ,
55- " import numpy as np\n " ,
56- " \n " ,
57- " #init webdriver + selenium opts\n " ,
58- " chromeOptions = webdriver.ChromeOptions()\n " ,
59- " browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chromeOptions)"
60- ]
60+ "metadata" : {}
6161 },
6262 {
6363 "cell_type" : " code" ,
6464 "execution_count" : 71 ,
65- "metadata" : {},
65+ "source" : [
66+ " '''\r\n " ,
67+ " Convert DNA seqs to include CDS only\r\n " ,
68+ " Opens the ncbi nuccore webpage for each record whose name starts with NM.\r\n " ,
69+ " Grabs the \" CDS\" feature range and rewrites the record file with that portion only.\r\n " ,
70+ " '''\r\n " ,
71+ " dir = r\" C:\\ Users\\ risha\\ Desktop\\ icor-codon-optimization\\ benchmark_sequences\\ dna\"\r\n " ,
72+ " x = 1\r\n " ,
73+ " for entry in os.scandir(dir):\r\n " ,
74+ " record = SeqIO.read(entry,'fasta')\r\n " ,
75+ " if record.name.startswith(\" NM\" ):\r\n " ,
76+ " browser.get(\" https://www.ncbi.nlm.nih.gov/nuccore/%s\" % record.name)\r\n " ,
77+ " time.sleep(1)\r\n " ,
78+ " cds = browser.find_elements_by_class_name('feature')\r\n " ,
79+ " for i in cds:\r\n " ,
80+ " list = i.text.split()\r\n " ,
81+ " if list[0] == 'CDS':\r\n " ,
82+ " arr=list[1].split('..')\r\n " ,
83+ " x = x+1\r\n " ,
84+ " record.seq = record.seq[int(arr[0])-1:int(arr[1])]\r\n " ,
85+ " SeqIO.write(record,entry,\" fasta\" )\r\n " ,
86+ " print(\" Finished writing #\" + str(x)) # x is an int counter; str() avoids a TypeError on concatenation"
87+ ],
6688 "outputs" : [
6789 {
6890 "output_type" : " stream" ,
105127 ]
106128 }
107129 ],
108- "source" : [
109- " '''\n " ,
110- " Convert DNA seqs to include CDS only\n " ,
111- " '''\n " ,
112- " dir = r\" C:\\ Users\\ risha\\ Desktop\\ icor-codon-optimization\\ benchmark_sequences\\ dna\"\n " ,
113- " x = 1\n " ,
114- " for entry in os.scandir(dir):\n " ,
115- " record = SeqIO.read(entry,'fasta')\n " ,
116- " if record.name.startswith(\" NM\" ):\n " ,
117- " browser.get(\" https://www.ncbi.nlm.nih.gov/nuccore/%s\" % record.name)\n " ,
118- " time.sleep(1)\n " ,
119- " cds = browser.find_elements_by_class_name('feature')\n " ,
120- " for i in cds:\n " ,
121- " list = i.text.split()\n " ,
122- " if list[0] == 'CDS':\n " ,
123- " arr=list[1].split('..')\n " ,
124- " print(x, \" ====\" )\n " ,
125- " x = x+1\n " ,
126- " record.seq = record.seq[int(arr[0])-1:int(arr[1])]\n " ,
127- " SeqIO.write(record,entry,\" fasta\" )"
128- ]
130+ "metadata" : {}
129131 },
130132 {
131133 "cell_type" : " code" ,
132134 "execution_count" : 80 ,
133- "metadata" : {},
134- "outputs" : [],
135135 "source" : [
136- " '''\n " ,
137- " Convert DNA seqs to AA\n " ,
138- " '''\n " ,
139- " \n " ,
140- " from Bio.Seq import Seq\n " ,
141- " import os\n " ,
142- " \n " ,
143- " dna_dir = r\" C:\\ Users\\ risha\\ Desktop\\ icor-codon-optimization\\ benchmark_sequences\\ dna\"\n " ,
144- " aa_dir = r\" C:\\ Users\\ risha\\ Desktop\\ icor-codon-optimization\\ benchmark_sequences\\ aa\"\n " ,
145- " \n " ,
146- " for entry in os.scandir(dna_dir):\n " ,
147- " record = SeqIO.read(entry,'fasta')\n " ,
148- " record.seq = record.translate().seq\n " ,
149- " \n " ,
136+ " '''\r\n " ,
137+ " Convert DNA seqs to AA, given newly CDS-only seqs.\r\n " ,
138+ " Write these AA seqs into record files.\r\n " ,
139+ " '''\r\n " ,
140+ " \r\n " ,
141+ " from Bio.Seq import Seq\r\n " ,
142+ " import os\r\n " ,
143+ " \r\n " ,
144+ " dna_dir = r\" C:\\ Users\\ risha\\ Desktop\\ icor-codon-optimization\\ benchmark_sequences\\ dna\"\r\n " ,
145+ " aa_dir = r\" C:\\ Users\\ risha\\ Desktop\\ icor-codon-optimization\\ benchmark_sequences\\ aa\"\r\n " ,
146+ " \r\n " ,
147+ " for entry in os.scandir(dna_dir):\r\n " ,
148+ " record = SeqIO.read(entry,'fasta')\r\n " ,
149+ " record.seq = record.translate().seq\r\n " ,
150+ " \r\n " ,
150151 " SeqIO.write(record,os.path.join(aa_dir, entry.name[0:-9] + \" aa.fasta\" ),\" fasta\" )"
151- ]
152- },
153- {
154- "cell_type" : " code" ,
155- "execution_count" : 81 ,
156- "metadata" : {},
157- "outputs" : [
158- {
159- "output_type" : " execute_result" ,
160- "data" : {
161- "text/plain" : [
162- " 1467"
163- ]
164- },
165- "metadata" : {},
166- "execution_count" : 81
167- }
168152 ],
169- "source" : [
170- " str = \"\"\" ATGTCCGATGTTGCCATTGTAAAGGAAGGGTGGTTACACAAGAGGGGCGAATACATTAAG\n " ,
171- " ACTTGGCGCCCGCGATACTTTCTGCTGAAGAACGATGGGACCTTTATCGGTTACAAGGAA\n " ,
172- " CGCCCGCAGGATGTTGACCAGCGTGAAGCGCCGCTGAACAACTTCTCTGTGGCGCAGTGC\n " ,
173- " CAGTTGATGAAGACCGAACGCCCGCGTCCAAACACCTTCATAATCCGTTGTCTTCAATGG\n " ,
174- " ACGACGGTGATTGAGAGAACGTTTCATGTGGAGACCCCGGAAGAGCGCGAGGAGTGGACC\n " ,
175- " ACGGCCATTCAGACCGTCGCTGACGGCTTAAAGAAGCAGGAAGAAGAAGAAATGGACTTT\n " ,
176- " CGCTCCGGCAGCCCCTCCGATAATTCTGGTGCGGAAGAGATGGAAGTAAGTCTTGCAAAG\n " ,
177- " CCGAAGCACAGAGTTACGATGAACGAGTTCGAGTATCTGAAGCTCCTGGGCAAGGGAACG\n " ,
178- " TTCGGCAAGGTCATCCTGGTCAAGGAGAAGGCCACCGGGCGCTACTACGCCATGAAGATC\n " ,
179- " CTGAAGAAGGAGGTGATAGTGGCCAAGGACGAGGTTGCCCACACCCTGACTGAGAACCGG\n " ,
180- " GTTCTGCAGAACTCTAGACACCCATTCCTTACCGCCCTCAAGTACAGCTTCCAAACCCAC\n " ,
181- " GATCGTCTGTGTTTCGTTATGGAGTATGCGAATGGCGGCGAGCTGTTCTTTCATCTCTCC\n " ,
182- " CGCGAACGCGTTTTTAGCGAAGACAGAGCAAGGTTTTACGGGGCGGAGATTGTCAGCGCG\n " ,
183- " CTAGATTATTTACACTCTGAAAAGAACGTTGTGTACCGTGATCTGAAGCTGGAAAACTTA\n " ,
184- " ATGTTGGATAAGGACGGTCATATTAAGATTACCGATTTTGGCCTGTGTAAGGAAGGTATT\n " ,
185- " AAGGATGGTGCGACGATGAAGACCTTTTGCGGCACCCCGGAATATTTGGCTCCGGAAGTA\n " ,
186- " TTGGAGGATAATGATTACGGTCGAGCCGTTGATTGGTGGGGCCTGGGTGTAGTCATGTAT\n " ,
187- " GAAATGATGTGCGGACGCTTACCTTTCTATAATCAGGACCATGAAAAGCTGTTTGAGCTG\n " ,
188- " ATTCTGATGGAGGAAATTCGTTTTCCGCGTACCCTGGGACCAGAAGCGAAGAGCCTGCTT\n " ,
189- " TCAGGGCTGTTGAAGAAGGACCCCAAGCAACGCTTAGGCGGTGGCTCGGAAGATGCCAAG\n " ,
190- " GAAATTATGCAGCACCGTTTCTTTGCGGGGATCGTCTGGCAGCACGTGTATGAAAAGAAG\n " ,
191- " TTAAGTCCACCCTTCAAGCCTCAGGTAACGTCGGAAACTGACACCAGATATTTTGATGAA\n " ,
192- " GAATTCACGGCACAGATGATCACAATTACGCCGCCGGATCAGGATGACTCTATGGAATGT\n " ,
193- " GTTGATAGCGAACGCAGACCCCATTTTCCACAATTTAGTTACTCAGCGTCTGGCACCGCG\n " ,
194- " TGA\"\"\"\n " ,
195- " \n " ,
196- " len(str)"
197- ]
153+ "outputs" : [],
154+ "metadata" : {}
198155 }
199156 ]
200157}
0 commit comments