Augustus 3.4.0
Loading...
Searching...
No Matches
geneticcode.hh
1/*
2 * geneticcode.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 */
7
8#ifndef _GENETIC_CODE_HH
9#define _GENETIC_CODE_HH
10
11// project includes
12#include "types.hh"
13#include "projectio.hh"
14
15// standard C/C++ includes
16#include <string>
17#include <cstring>
18#include <iostream>
19#include <fstream>
20
21using namespace std;
22
23
24#define NUM_AA 20
25#define NUM_TRANSTABS 24
26
27/*
28 * === am I at a possible splice site or translation start site ===
29 */
30#define DECLARE_ON(NAME, PATTERN, COUNT) \
31 inline bool NAME(const char* dna) { \
32 return strncmp(dna, PATTERN, COUNT) == 0; \
33 }
34// 'gt'
35#define DSS_SEQUENCE "gt"
36#define RDSS_SEQUENCE "ac"
37DECLARE_ON(onDSS, DSS_SEQUENCE, 2)
38DECLARE_ON(onRDSS, RDSS_SEQUENCE, 2)
39
40// 'gc'
41#define ALT_DSS_SEQUENCE "gc"
42#define ALT_RDSS_SEQUENCE "gc"
43DECLARE_ON(onAltDSS, ALT_DSS_SEQUENCE, 2)
44DECLARE_ON(onAltRDSS, ALT_RDSS_SEQUENCE, 2)
45
46// 'gt' or 'gc'
47inline bool onGenDSS(const char* dna) {
48 return
49 onDSS(dna) || (Constant::dss_gc_allowed && onAltDSS(dna));
50}
51inline bool onGenRDSS(const char* dna) {
52 return
53 onRDSS(dna) || (Constant::dss_gc_allowed && onAltRDSS(dna));
54}
55
56// 'ag'
57#define ASS_SEQUENCE "ag"
58#define RASS_SEQUENCE "ct"
59DECLARE_ON(onASS, ASS_SEQUENCE, 2)
60DECLARE_ON(onRASS, RASS_SEQUENCE, 2)
61
62// 'atg'
63#define STARTCODON "atg"
64#define RCSTARTCODON "cat"
65DECLARE_ON(onStart, STARTCODON, 3)
66DECLARE_ON(onRStart, RCSTARTCODON, 3)
67
68// stop codons
69DECLARE_ON(onOchre, OCHRECODON, 3)
70DECLARE_ON(onAmber, AMBERCODON, 3)
71DECLARE_ON(onOpal, OPALCODON, 3)
72DECLARE_ON(onROchre, RCOCHRECODON, 3)
73DECLARE_ON(onRAmber, RCAMBERCODON, 3)
74DECLARE_ON(onROpal, RCOPALCODON, 3)
75
76/*
77 * wcComplement
78 *
79 * Watson/Crick complement of character. Preserves case.
80 * Doesn't do anything if character is non-acgt
81 *
82 */
83inline char wcComplement(char c) {
84 switch (c) {
85 case 'a': return 't';
86 case 'c': return 'g';
87 case 'g': return 'c';
88 case 't': return 'a';
89 case 'A': return 'T';
90 case 'C': return 'G';
91 case 'G': return 'C';
92 case 'T': return 'A';
93 default:
94 return c;
95 }
96}
97
98
99/*
100 * putReverseComplement
101 *
102 * copies the reverse complement of [dna, dna+len) to result
103 * result may be a char* or a string iterator
104 *
105 * no validation is performed and no null character is appended
106 */
107template <class Iterator>
108inline void putReverseComplement(Iterator result, const char* dna, int len) {
109 const char* s = dna + len;
110 Iterator t = result;
111 while (--s >= dna)
112 *t++ = wcComplement(*s);
113}
114
115/*
116 * reverseComplement
117 *
118 * allocates a new character array and fills it with the reverse complement of 'dna'
119 * eg agcngt -> acngct
120 * doesn't change uppercase or lowercase property
121 */
122inline char* reverseComplement(const char* dna) {
123 if (dna == NULL)
124 return NULL;
125 int len = strlen(dna);
126 char* result = new char[len+1];
127 putReverseComplement(result, dna, len);
128 result[len] = '\0';
129 return result;
130}
131
132inline void reverseComplementString(string &text) {
133 int n = text.length();
134 for (int i=0; i < n/2; i++) {
135 char c;
136 c = text[i];
137 text[i] = wcComplement(text[n-i-1]);
138 text[n-i-1] = wcComplement(c);
139 }
140 if (n%2) // n odd, must complement the middle character
141 text[n/2] = wcComplement(text[n/2]);
142}
143
144
145inline void reverseString(string &text) {
146 int i = 0;
147 int n = text.length();
148 while (i < (n/2)) {
149 char c;
150 c = (text[i]);
151 text[i] = text[n-i-1];
152 text[n-i-1] = c;
153 i++;
154 }
155}
156
157
163class Seq2Int {
164public:
165 Seq2Int(int s) : size(s) {}
166 int operator() (const char* s) const {
167 int erg=0;
168 for( int i = 0; i < size; i++ ){
169 erg <<= 2;
170 erg |= base2int(s[i]);
171 }
172 return erg;
173 }
174 int rc(const char* s) const {
175 int erg=0;
176 for ( int i=0; i<size; i++)
177 erg |= base2int(wcComplement(s[i])) << (2*i);
178 return erg;
179 }
180 int rev(const char* s) const {
181 int erg=0;
182 for ( int i=0; i<size; i++)
183 erg |= base2int(s[i]) << (2*i);
184 return erg;
185 }
186 string inv(int pn) const {
187 string result(size, 'n');
188 for(int i = size-1; i>=0; i--) {
189 result[i] = int2base(pn%4);
190 pn >>= 2; // == k /= 4;
191 }
192 return result;
193 }
194 string INV(int pn) const {
195 string result(size, 'N');
196 for(int i = size-1; i>=0; i--) {
197 result[i] = int2BASE(pn%4);
198 pn >>= 2; // == k /= 4;
199 }
200 return result;
201 }
202
203 int read(istream& strm) const {
204 char* buf = new char[size];
205 // Read the character from the stream until the internal
206 // representation size is reached, or a character
207 // cannot be recognised.
208 //------------------------------------------------------------
209 for( int i = 0; i < size; i++ ){
210 char c;
211 strm.get(c);
212 c = toupper(c);
213 if( c == 'A' || c == 'C' || c == 'G' || c == 'T' )
214 buf[i] = c;
215 }
216 int result = (*this)(buf);
217 delete[] buf;
218 return result;
219 }
220
221private:
222 static int base2int(char c);
223 static char int2base(int i);
224 static char int2BASE(int i);
225 int size;
226};
227
228inline int Seq2Int::base2int(char c) {
229 switch (c) {
230 case 'a': case 'A':
231 return 0;
232 case 'c': case 'C':
233 return 1;
234 case 'g': case 'G':
235 return 2;
236 case 't': case 'T':
237 return 3;
238 default:
239 throw InvalidNucleotideError(c);
240 }
241}
242
243inline char Seq2Int::int2base(int i) {
244 switch (i) {
245 case 0:
246 return 'a';
247 case 1:
248 return 'c';
249 case 2:
250 return 'g';
251 case 3:
252 return 't';
253 default:
254 throw ProjectError("Seq2Int::int2base: internal error: i=" + itoa(i));
255 }
256}
257
258inline char Seq2Int::int2BASE(int i) {
259 switch (i) {
260 case 0:
261 return 'A';
262 case 1:
263 return 'C';
264 case 2:
265 return 'G';
266 case 3:
267 return 'T';
268 default:
269 throw ProjectError("Seq2Int::int2base: internal error: i=" + itoa(i));
270 }
271}
272
276class ORF {
277public:
278 ORF() { start = end = -1; complete5prime = complete3prime = 0; strand = plusstrand; }
279 int len() {return abs(end - start) + 1;}
280 int start;
281 int end;
282 Strand strand;
283 bool complete5prime;
284 bool complete3prime;
285};
286
293public:
294 static void init() {
295 chooseTranslationTable(1);
296 }
297 static void printReverseGeneticMap();
298
299private:
300 GeneticCode() {}
301 static void reverseMap();
302 static const char aa_symbols_with_stop[];
303 static bool start_codons[];
304 static Double start_codon_probs[];
305 static const char * const TranslationTables[];
306 static const char * const StartCodons[];
307 static Seq2Int codon;
308 static int translationtable;
309 static int numStartCodons;
310public:
311 static void chooseTranslationTable( int );
312 static const char* const aa_symbols;
313 static const char* const aa_names[];
314 static int map[];
315 static int **syncodons;
316 static int *codonsOfAA;
317 static int get_aa_from_symbol(char c) {
318 // this one returns -1 on '*' and string::npos-1 on invalid symbols
319 // if (int)string::npos is -1, we get -2 here for invalid
320 return string(GeneticCode::aa_symbols_with_stop).find(c)-1;
321 }
322 static char translate(int n) {
323 return aa_symbols_with_stop[map[n]+1];
324 }
325 static char translate(const char* t);
326 static char revtranslate(const char* t);
327 static bool isStopcodon(const char* t) {
328 return translate(t)=='*';
329 }
330 static bool isStartcodon(const char* t, bool rc=false){
331 try {
332 if (rc)
333 return start_codons[codon.rc(t)];
334 else
335 return start_codons[codon(t)];
336 } catch (InvalidNucleotideError &e){}
337 return false;
338 }
339 static bool isStartcodon(int pn){
340 return start_codons[pn];
341 }
342 static Double startCodonProb(const char* t, bool rc=false){
343 try {
344 int pn = rc? codon.rc(t) : codon(t);
345 if (start_codons[pn])
346 return start_codon_probs[pn];
347 } catch (InvalidNucleotideError &e){}
348 return 0;
349 }
350 static Double startCodonProb(int pn){
351 return start_codon_probs[pn];
352 }
353 static bool isRCStopcodon(const char* t) {
354 return revtranslate(t)=='*';
355 }
356 static bool containsInFrameStopcodon(const char*, int, int, bool, int);
357 static void printStartCodons();
358 static void trainStartCodonProbs(int startcounts[]);
359 static void writeStart(ofstream &out); // write start codon probs to file
360 static void readStart(ifstream &in); // read start codon probs from file
361 static bool is_purine(int b){
362 return (b==0 || b==2); // 0=a, 1=c, 2=g, 3=t, a and g are purines
363 }
364 static ORF longestORF(const char* dna);
365};
366
367// getSampledCDS: sample a coding region from emission probabilities
368char *getSampledCDS(vector<Double> *emiprobs, int k, int numCodons);
369
370#endif // _GENETIC_CODE_HH
371
The genetic code maps codons to amino acids.
Definition geneticcode.hh:292
Definition types.hh:480
This class implements a double object with a very large range.
Definition lldouble.hh:31
Definition geneticcode.hh:276
Definition types.hh:449
a class for converting sequence into integer replacing Base4Int
Definition geneticcode.hh:163