1
2
3
4 """
5 FindOrfs, Created 2012
6
Script to detect the start and stop codons of open reading frames (ORFs) in a DNA sequence. Note that the algorithm is only rudimentary and does not respect intron-exon structures.
8
9 """
10
11
# NOTE(review): the ``def`` line of this function was missing from the reviewed
# listing; the name ``build_ORF`` is a reconstruction -- confirm against callers.
def build_ORF(sequence, file_ORF, pdic):
    """
    Write forward/backward ORF candidates to a file and mark them in pdic.

    Every start position is combined with every stop position of the same
    strand. A pair is accepted when it is in frame (length divisible by 3)
    and its length, including the terminating codon, is greater than 149 and
    smaller than 103050. Each accepted pair is appended to file_ORF as a
    header line (">forw N" or ">rev N") followed by the sequence slice, and
    its two positions are marked in pdic ("S" = start, "E" = end).

    The total number of written ORFs is printed at the end.

    @type sequence: string
    @param sequence: nucleotide sequence
    @type file_ORF: string
    @param file_ORF: path of the output file (opened in append mode)
    @type pdic: dictionary
    @param pdic: Used to store start / end positions of ORFs (modified in place).
    @rtype: dictionary
    @return: Stored start / end positions of ORFs (the same object as pdic)
    """
    # NOTE(review): find_orfs as defined in this file returns pdic, not four
    # position lists, so this unpacking only works against an implementation
    # returning (starts, stops, reverse starts, reverse stops) -- confirm.
    START, STOP, STARTrev, STOPrev = find_orfs(sequence, pdic)

    min_len = 149      # exclusive lower bound on the ORF length
    max_len = 103050   # exclusive upper bound on the ORF length

    # The context manager closes the file even if writing fails half-way.
    with open(file_ORF, 'a') as t:
        print("\t search forward orfs...")
        forward_count = 0
        for i in START:
            for j in STOP:
                length = j + 3 - i
                if i < j and min_len < length < max_len and length % 3 == 0:
                    forward_count += 1
                    t.write(">forw " + str(forward_count) + "\n")
                    t.write(sequence[i:j + 3] + "\n")
                    pdic[i] = "S"
                    pdic[j] = "E"

        print("\t search backward orfs...")
        reverse_count = 0
        for i in STARTrev:
            for j in STOPrev:
                # On the reverse strand the stop lies left of the start, so
                # the length has to be measured as i + 3 - j (the original
                # bound check reused the forward expression j + 3 - i, which
                # is never positive here and therefore never limited anything).
                length = i + 3 - j
                if j < i and min_len < length < max_len and length % 3 == 0:
                    reverse_count += 1
                    t.write(">rev " + str(reverse_count) + "\n")
                    # NOTE(review): the slice is reversed but not complemented;
                    # confirm whether a reverse complement is intended.
                    t.write(sequence[j:i + 3][::-1] + "\n")
                    pdic[i] = "E"
                    pdic[j] = "S"

        print(str(forward_count + reverse_count))

    return pdic
73
74
def findstop_help(posLastStop, sequence, codon):
    """
    Return the index of the first occurrence of codon at or after posLastStop.

    @type posLastStop: int
    @param posLastStop: Index at which the search starts.
    @type sequence: string
    @param sequence: Nucleotide sequence.
    @type codon: string
    @param codon: 3-letter DNA code.
    @rtype: int
    @return: The position of the codon in the nucleotide sequence, or -1 if
             it does not occur at or after posLastStop.
    """
    try:
        return sequence.index(codon, posLastStop)
    except ValueError:
        # Only "not found" is mapped to -1; wrong argument types and other
        # programming errors are no longer silently swallowed.
        return -1
93
def _codon_positions(sequence, codons, first):
    """Return, in ascending order, every index >= first where one of codons begins."""
    positions = []
    for codon in codons:
        hit = sequence.find(codon, first)
        while hit != -1:
            positions.append(hit)
            # Continue one character further so overlapping hits are kept.
            hit = sequence.find(codon, hit + 1)
    positions.sort()
    return positions


def find_orfs(genomeSequence, pdic):
    """
    Identify open reading frames in a DNA sequence and mark them in pdic.

    Careful: intron-exon structures are not respected!

    Forward strand: start codons are all "ATG" positions, stop codons are all
    "TAA"/"TGA"/"TAG" positions from index 3 on. Stops are handled in
    ascending order; every remaining start in front of a stop that is in
    frame with it is used up by that stop, and the most upstream of them is
    recorded as pdic[start] = "S", pdic[stop + 3] = "E" if the span including
    the stop codon is longer than 149.

    Reverse strand: start codons are all "CAT" positions from index 3 on,
    stop codons are all "TTA"/"TCA"/"CTA" positions. Stops are handled in
    descending order against the starts behind them; the most distant
    in-frame start is recorded as pdic[stop] = "S", pdic[start + 3] = "E"
    if the span is longer than 149.

    Progress messages and codon counts are printed to stdout. Later entries
    overwrite earlier ones when two ORFs share a key.

    @type genomeSequence: string
    @param genomeSequence: Nucleotide sequence.
    @type pdic: dictionary
    @param pdic: Used to store start / end positions of ORFs (modified in place).
    @rtype: dictionary
    @return: Found start / end positions in the sequence considering only the
             ORFs (the same object as pdic).
    """
    print("\t..find forward orfs")
    start = _codon_positions(genomeSequence, ["ATG"], 0)
    # A stop codon in front of index 3 cannot close an in-frame ORF.
    stop = _codon_positions(genomeSequence, ["TAA", "TGA", "TAG"], 3)

    print("\t..find reverse orfs")
    startRev = _codon_positions(genomeSequence, ["CAT"], 3)
    stopRev = _codon_positions(genomeSequence, ["TTA", "TCA", "CTA"], 0)

    print("START codons : " + str(len(start)))
    print("STOP codons : " + str(len(stop)))
    print("revSTART codons : " + str(len(startRev)))
    print("revSTOP codons : " + str(len(stopRev)))

    print("\t..creating forward orfs")
    for stopPos in stop:
        foundPartner = False
        consumed = set()
        # Every start in front of the stop is examined, including the last
        # one in the list; an empty start list simply yields no ORF.
        for startPos in start:
            if startPos >= stopPos:
                break
            span = stopPos + 3 - startPos
            if span % 3 != 0:
                continue
            # In-frame starts end at this stop and must not be paired with a
            # later one, whether or not they are recorded.
            consumed.add(startPos)
            if not foundPartner and span > 149:
                foundPartner = True
                pdic[stopPos + 3] = "E"
                pdic[startPos] = "S"
        if consumed:
            start = [pos for pos in start if pos not in consumed]

    print("\t..creating reverse orfs")
    for stopPos in reversed(stopRev):
        foundPartner = False
        consumed = set()
        for startPos in reversed(startRev):
            if startPos <= stopPos:
                break
            span = startPos + 3 - stopPos
            if span % 3 != 0:
                continue
            consumed.add(startPos)
            if not foundPartner and span > 149:
                foundPartner = True
                pdic[stopPos] = "S"
                pdic[startPos + 3] = "E"
        if consumed:
            startRev = [pos for pos in startRev if pos not in consumed]

    return pdic
284