1- #import standard modules; re is used for regex on ln 59
1+ # import standard modules; re is used for regex on ln 59
22import os
33from Bio import SeqIO
44from Bio .Seq import Seq
55import random
66import numpy as np
77import re
88
9- #frequencies are from ecoli_codon_frequencies.xlsx file in summaries dir
10- #create dict with value being a tuple with the codons and their probabilities/frequncies
9+ # frequencies are from ecoli_codon_frequencies.xlsx file in summaries dir
10+ # create dict with value being a tuple with the codons and their probabilities/frequncies
1111frequency = {
12- "A" : (["GCG" , "GCA" , "GCT" , "GCC" ],[0.34 , 0.22 , 0.17 , 0.27 ]),
13- "R" : (["AGG" , "AGA" , "CGG" ,"CGA" ,"CGT" ,"CGC" ],[0.03 , 0.05 , 0.1 , 0.07 , 0.37 , 0.38 ]),
14- "N" : (["AAT" , "AAC" ],[0.46 , 0.54 ]),
15- "D" : (["GAT" ,"GAC" ],[0.63 ,0.37 ]),
16- "C" : (["TGT" ,"TGC" ],[0.45 ,0.55 ]),
17- "*" : (["TGA" ,"TAG" ,"TAA" ],[0.3 ,0.08 ,0.62 ]),
18- "Q" : (["CAG" ,"CAA" ],[0.66 ,0.34 ]),
19- "E" : (["GAG" ,"GAA" ],[0.32 ,0.68 ]),
20- "G" : (["GGG" ,"GGA" ,"GGT" ,"GGC" ],[0.15 ,0.12 ,0.34 ,0.39 ]),
21- "H" : (["CAT" ,"CAC" ],[0.57 ,0.43 ]),
22- "I" : (["ATA" ,"ATT" ,"ATC" ],[0.09 ,0.5 ,0.41 ]),
23- "L" : (["TTG" ,"TTA" ,"CTG" ,"CTA" ,"CTT" ,"CTC" ],[0.13 ,0.13 ,0.49 ,0.04 ,0.11 ,0.1 ]),
24- "K" : (["AAG" ,"AAG" ],[0.25 ,0.75 ]),
25- "M" : (["ATG" ],[1.0 ]),
26- "F" : (["TTT" ,"TTC" ],[0.57 ,0.43 ]),
27- "P" : (["CCG" ,"CCA" ,"CCT" ,"CCC" ],[0.51 ,0.2 ,0.17 ,0.12 ]),
28- "S" : (["AGT" ,"AGC" ,"TCG" ,"TCA" ,"TCT" ,"TCC" ],[0.15 ,0.26 ,0.15 ,0.13 ,0.16 ,0.15 ]),
29- "T" : (["ACG" ,"ACA" ,"ACT" ,"ACC" ],[0.26 ,0.15 ,0.18 ,0.42 ]),
30- "W" : (["TGG" ],[1.0 ]),
31- "Y" : (["TAT" ,"TAC" ],[0.58 ,0.42 ]),
32- "V" : (["GTG" ,"GTA" ,"GTT" ,"GTC" ],[0.36 ,0.16 ,0.27 ,0.21 ])
12+ "A" : (["GCG" , "GCA" , "GCT" , "GCC" ], [0.34 , 0.22 , 0.17 , 0.27 ]),
13+ "R" : (["AGG" , "AGA" , "CGG" , "CGA" , "CGT" , "CGC" ], [0.03 , 0.05 , 0.1 , 0.07 , 0.37 , 0.38 ]),
14+ "N" : (["AAT" , "AAC" ], [0.46 , 0.54 ]),
15+ "D" : (["GAT" , "GAC" ], [0.63 , 0.37 ]),
16+ "C" : (["TGT" , "TGC" ], [0.45 , 0.55 ]),
17+ "*" : (["TGA" , "TAG" , "TAA" ], [0.3 , 0.08 , 0.62 ]),
18+ "Q" : (["CAG" , "CAA" ], [0.66 , 0.34 ]),
19+ "E" : (["GAG" , "GAA" ], [0.32 , 0.68 ]),
20+ "G" : (["GGG" , "GGA" , "GGT" , "GGC" ], [0.15 , 0.12 , 0.34 , 0.39 ]),
21+ "H" : (["CAT" , "CAC" ], [0.57 , 0.43 ]),
22+ "I" : (["ATA" , "ATT" , "ATC" ], [0.09 , 0.5 , 0.41 ]),
23+ "L" : (["TTG" , "TTA" , "CTG" , "CTA" , "CTT" , "CTC" ], [0.13 , 0.13 , 0.49 , 0.04 , 0.11 , 0.1 ]),
24+ "K" : (["AAG" , "AAG" ], [0.25 , 0.75 ]),
25+ "M" : (["ATG" ], [1.0 ]),
26+ "F" : (["TTT" , "TTC" ], [0.57 , 0.43 ]),
27+ "P" : (["CCG" , "CCA" , "CCT" , "CCC" ], [0.51 , 0.2 , 0.17 , 0.12 ]),
28+ "S" : (["AGT" , "AGC" , "TCG" , "TCA" , "TCT" , "TCC" ], [0.15 , 0.26 , 0.15 , 0.13 , 0.16 , 0.15 ]),
29+ "T" : (["ACG" , "ACA" , "ACT" , "ACC" ], [0.26 , 0.15 , 0.18 , 0.42 ]),
30+ "W" : (["TGG" ], [1.0 ]),
31+ "Y" : (["TAT" , "TAC" ], [0.58 , 0.42 ]),
32+ "V" : (["GTG" , "GTA" , "GTT" , "GTC" ], [0.36 , 0.16 , 0.27 , 0.21 ])
3333}
3434
3535# Amino acid sequence dir to optimize:
3636# hardcoded path
37- aa_dir = os .path .join (os .getcwd (),'benchmark_sequences' ,'aa' )
37+ aa_dir = os .path .join (os .getcwd (), 'benchmark_sequences' , 'aa' )
3838
3939# Output dir to store optimized seqs:
4040# hardcoded path
41- out_dir = os .path .join (os .getcwd (),'benchmark_sequences' ,'naive' )
41+ out_dir = os .path .join (os .getcwd (), 'benchmark_sequences' , 'naive' )
4242
4343
4444# Normalize probabilities for frequency if sum is not exactly 1.
45- def fix_p ( p ):
45+ def fix_p (p ):
4646 if p .sum () != 1.0 :
4747 p = p * (1. / p .sum ())
4848 return p
4949
50+
5051for entry in os .scandir (aa_dir ):
5152 name = entry .replace ("_aa.fasta" , "_dna" )
5253
5354 # Replace ambiguities with amino acids from IUPAC guidelines: https://www.bioinformatics.org/sms/iupac.html
54- record = SeqIO .read (entry ,"fasta" )
55- seq = record .seq .replace ("B" , random .choice (["D" ,"N" ])).replace ("Z" , random .choice (["E" , "Q" ]))
55+ record = SeqIO .read (entry , "fasta" )
56+ seq = record .seq .replace ("B" , random .choice (["D" , "N" ])).replace (
57+ "Z" , random .choice (["E" , "Q" ]))
5658 seq_arr = []
5759 for aa in seq :
58- #append to the array a random choice of codon using the probabilities given (p)
59- seq_arr .append (np .random .choice (frequency [aa ][0 ],p = fix_p (np .asarray (frequency [aa ][1 ]))))
60-
61- record .seq = Seq (re .sub ('[^GATC]' ,"" ,str ("" .join (seq_arr )).upper ()))
60+ # append to the array a random choice of codon using the probabilities given (p)
61+ seq_arr .append (np .random .choice (
62+ frequency [aa ][0 ], p = fix_p (np .asarray (frequency [aa ][1 ]))))
63+
64+ record .seq = Seq (re .sub ('[^GATC]' , "" , str ("" .join (seq_arr )).upper ()))
6265 complete_name = os .path .join (out_dir , name )
63- SeqIO .write (record , complete_name + ".fasta" , "fasta" )
66+ SeqIO .write (record , complete_name + ".fasta" , "fasta" )
0 commit comments