Augustus 3.4.0
Loading...
Searching...
No Matches
gene.hh
1/*
2 * gene.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 */
7
8#ifndef _GENE_HH
9#define _GENE_HH
10
11// project includes
12#include "motif.hh"
13#include "pp_scoring.hh"
14#include "hints.hh"
15
16
17// Forward declarations
18
22class State;
23
28class Transcript;
29
35class Gene;
36
42class AltGene;
43
47class AnnoSequence;
48
49template<class T>
51{
52 bool operator()(T* a, T* b) { return *a < *b; }
53};
54
55
60public:
61 SrcEvidence(string srcname){
62 this->srcname = srcname;
63 freq = 0;
64 }
65 string srcname;
66 int freq;
67 list<string> groupnames;
68};
69
70bool operator<(const SrcEvidence& e1, const SrcEvidence& e2);
71
82class Evidence {
83public:
84 Evidence(bool withNames) {
85 numEvidence = 0;
86 this->withNames = withNames;
87 }
88 ~Evidence() {
89 }
90 void add(string source, string name);
91 void add(string source);
92 void print();
93 list<SrcEvidence> sourceEvidence;
94 int numEvidence;
95 bool withNames;
96};
97
98Evidence *stateEvidenceSummary(State *state1, State *state2 = NULL);
99
100
101class State {
102public:
106 long begin;
110 long end;
115 StateType type;
116 Double prob;
117 bool hasScore;
118 float apostprob; // score = posterior probability
119 int sampleCount;
120 Evidence *evidence;
121 char truncated; // truncated left, truncated right?
122 int framemod; // reading frame modifier, frame = frame given by type + framemod (for truncation)
123
124 State(long first, long last, StateType t) :
125 begin(first), end(last),
126 next(NULL),
127 type(t),
128 prob(0),
129 hasScore(false),
130 apostprob(0),
131 sampleCount(1),
132 evidence(NULL),
133 truncated(0),
134 framemod(0)
135 {}
136
137 State() :
138 begin(0),
139 end(0),
140 next(NULL),
141 type(TYPE_UNKNOWN),
142 prob(0),
143 hasScore(false),
144 apostprob(0),
145 sampleCount(1),
146 evidence(NULL),
147 truncated(0),
148 framemod(0) {}
149 ~State(){
150 if (evidence)
151 delete evidence;
152 }
153 State *cloneStateSequence(){
154 State *erg = new State(*this);
155 if (next)
156 erg->next = next->cloneStateSequence();
157 else
158 erg->next = NULL;
159 return erg;
160 }
161 // copy constructor
162 State(const State& other) :
163 begin(other.begin),
164 end(other.end),
165 next(other.next),
166 type(other.type),
167 prob(other.prob),
168 hasScore(other.hasScore),
169 apostprob(other.apostprob),
170 sampleCount(other.sampleCount),
171 evidence(NULL),
172 truncated(other.truncated),
173 framemod(other.framemod)
174 {
175 if (other.evidence)
176 evidence = new Evidence(*other.evidence);
177 }
178 int frame(); // reading frame, in case it is a (coding) exon, usually depends only on the type
179 bool frame_compatible(const Feature *hint);
180 void addEvidence(string srcname) {if (!evidence) evidence = new Evidence(false); evidence->add(srcname);}
181 bool operator< (const State &other) const;
182 bool operator== (const State &other) const;
183 int length() {
184 return (int) (end - begin + 1);
185 }
186 Strand strand() {return isOnFStrand(type)? plusstrand : minusstrand;}
187 State *getBiologicalState();
188 void setTruncFlag(long end, long predEnd, long dnalen);
189 void includeFrameModIntoType();
190};
191
192bool frame_compatible(State *ex1, State* ex2);
193
200public:
201 StatePath(){
202 first = NULL;
203 pathemiProb = 0.0;
204 intron_d = 0;
205 seqname = "";
206 }
207 ~StatePath(){
208 State *tmp;
209 while( first ){
210 tmp = first->next;
211 delete first;
212 first = tmp;
213 }
214 }
215
216 void push(State *st){
217 st->next = first;
218 first = st;
219 }
220
221 int size() {
222 int n = 0;
223 for (State* st = first; st != NULL; st = st->next) {
224 n++;
225 }
226 return n;
227 }
228
229 void print();
230 static StatePath* condenseStatePath(StatePath *oldpath);
231 Transcript* projectOntoGeneSequence(const char *genenames);
232 static StatePath* getInducedStatePath(Transcript *genelist, long dnalen, bool printErrors=true);
233 void reverse();
234 bool operator== (const StatePath &other) const;
235 bool operator< (const StatePath &other) const;
236private:
237 void pushIntron(long begin, long end, int frame, bool onFStrand);
238
239public:
240 string seqname;
241 State* first;
242 Double pathemiProb;
243 int intron_d;
244 list<PP::Match> proteinMatches;
245};
246
247
248int lenStateList(State *head);
249
251public:
252 Transcript() {
253 exons = introns = (State*) NULL;
254 next = NULL;
255 transstart = transend = -1;
256 seqname = id = source = "";
257 strand = plusstrand;
258 complete = true;
259 apostprob = 0.0;
260 hasProbs = false;
261 throwaway = false;
262 viterbi = true;
263 }
264 Transcript (const Transcript& other);
265 Transcript& operator= (const Transcript& other);
266 virtual ~Transcript(){
267 State *st, *tmp;
268 list<State*> sl = getExInInHeads();
269 for (list<State*>::iterator it = sl.begin(); it != sl.end(); ++it){
270 st = *it;
271 while( st ){
272 tmp = st->next;
273 delete st;
274 st = tmp;
275 }
276 }
277 }
278 virtual Transcript* clone() {return new Transcript(*this);}
279 Transcript *cloneGeneSequence(){
280 Transcript *res = clone(); // returns a Transcript or a Gene as appropriate
281 if (next)
282 res->next = next->cloneGeneSequence();
283 else
284 res->next = NULL;
285 return res;
286 }
287 static void destroyGeneSequence(Transcript *head) {
288 Transcript* nextHead = head;
289 while (nextHead) {
290 head = nextHead;
291 nextHead = nextHead->next;
292 delete head;
293 }
294 }
295 void addStatePostProbs(float p);
296 void setStatePostProbs(float p);
297 void addSampleCount(int k);
298 void setSampleCount(int k);
299 virtual long geneBegin() const { return transstart;}
300 virtual long geneEnd() const { return transend;}
301 virtual bool isCoding() const { return false; }
302 bool operator< (const Transcript &other) const;
303 bool operator== (const Transcript &other) const;
304 void normPostProb(float n);
305 void updatePostProb(Transcript* other);
306 virtual list<State*> getExInHeads() const {
307 list<State*> L;
308 L.push_back(exons);
309 L.push_back(introns);
310 return L;
311 }
312 virtual list<State*> getExInInHeads() const { return getExInHeads();}
313 double meanStateProb();
314 char* getExonicSequence(AnnoSequence *annoseq = NULL,
315 bool noOffset = false) const; // CDS or whole RNA, respectively
316 virtual void shiftCoordinates(long d);
317 virtual bool almostIdenticalTo(Transcript *other);
318 virtual void printCodingSeq(AnnoSequence *annoseq) const {}; // print nothing
319 virtual void printProteinSeq(AnnoSequence *annoseq) const {};// for noncoding
320 virtual void printBlockSequences(AnnoSequence *annoseq) const {}; // genes
321 virtual void printGFF() const;
322 virtual void printEvidence() const {}; // implemented only for coding genes
323 void setStateHasScore(bool has);
324 static Transcript* getGenesOnStrand(Transcript* genes, Strand strand);
325 static void filterTranscriptsByMaxTracks(list<Transcript*> &gl, int maxTracks);
326 virtual double supportingFraction(HintGroup *group) {return 0.0;}
327public:
328 State* exons; // the exons (not UTR)
329 State* introns; // the introns between 'exons'
330 long transstart, transend; // transcription boundaries, -1 if not known
331 Transcript* next;
332 string id; // transcript id
333 string seqname;
334 string source;
335 Strand strand;
336 Boolean complete; //coding region is complete
337 string geneid; // id of the AltGene
338 float apostprob;
339 bool hasProbs;
340 bool throwaway;
341 bool viterbi;
342
343 static bool gff3;
344 static bool print_tss;
345 static bool print_tts;
346};
347
348
349void filterGenePrediction(list<Transcript*> &gl, list<Transcript*> &filteredTranscripts, const char *seq, Strand strand, bool noInFrameStop, bool &hasInFrameStop, double minmeanexonintronprob=0.0, double minexonintronprob=0.0);
350
351class Gene : public Transcript {
352public:
353 Gene() {
354 utr5exons = utr3exons = utr5introns = utr3introns = (State*) 0;
355 length = clength = 0;
356 weight = 1;
357 strand = plusstrand;
358 frame = 0;
359 complete5utr = complete3utr = true;
360 codingstart = codingend = -1;
361 supportingEvidence = incompatibleEvidence = CDSexonEvidence = CDSintronEvidence = UTR5stateEvidence = UTR3stateEvidence = NULL;
362 }
363 Gene(const Gene& other);
364 virtual Gene* clone() { return new Gene(*this); }
365 ~Gene(){ // this calls ~Transcript implicitly, which deletes exons and introns
366 if (supportingEvidence)
367 delete supportingEvidence;
368 if (incompatibleEvidence)
369 delete incompatibleEvidence;
370 if (CDSintronEvidence)
371 delete CDSintronEvidence;
372 if (CDSexonEvidence)
373 delete CDSexonEvidence;
374 if (UTR5stateEvidence)
375 delete UTR5stateEvidence;
376 if (UTR3stateEvidence)
377 delete UTR3stateEvidence;
378 }
379 list<State*> getExInHeads() const { list<State*> L; L.push_back(exons); L.push_back(introns); L.push_back(utr5exons); L.push_back(utr3exons); return L;}
380 list<State*> getExInInHeads() const { list<State*> L = getExInHeads(); L.push_back(utr5introns); L.push_back(utr3introns); return L;}
381 bool hasInFrameStop(AnnoSequence *annoseq) const;
382 // void computeBC(char *seq);
383 int numExons() const;
384 State *lastExon() const;
385 bool identicalCDS(Gene *other);
386 using Transcript::almostIdenticalTo; // not really, but to prevent the compiler warning on MAC
387 virtual bool almostIdenticalTo(Gene *other);
388 void shiftCoordinates(long d);
389 long geneBegin() const { return (transstart>=0)? transstart : codingstart;}
390 long geneEnd() const { return (transend>=0)? transend : codingend;}
391 virtual bool isCoding() const { return true; }
392 void addUTR(State *mrnaRanges, bool complete_l=true, bool complete_r=true);
393 void compileExtrinsicEvidence(list<HintGroup> *groupList);
394 double supportingFraction(HintGroup *group);
395 void addSupportedStates(HintGroup *group);
396 double getPercentSupported() const;
397 long getCDSCoord(long loc, bool comp) const;
398 bool completeCDS() const;
399 void print();
400 void printGFF() const;
401 void printCodingSeq(AnnoSequence *annoseq) const;
402 void printProteinSeq(AnnoSequence *annoseq) const;
403 void printBlockSequences(AnnoSequence *annoseq) const;
404 virtual void printEvidence() const;
405 void truncateMaskedUTR(AnnoSequence *annoseq);
406
407 static void init();
408
411 State* utr3exons;
412 State* utr5introns;
413 State* utr3introns;
414 long codingstart, codingend; // transstart <= codingstart <= codingend <= transend, if not -1
415 bool complete5utr;
416 bool complete3utr;
417
423 int frame;
424 BaseCount bc;
425 int weight;
426 Evidence *supportingEvidence;
427 Evidence *incompatibleEvidence;
428 Evidence *CDSexonEvidence;
429 Evidence *CDSintronEvidence;
430 Evidence *UTR5stateEvidence;
431 Evidence *UTR3stateEvidence;
432
433 list<PP::Match> proteinMatches; // true if gene matches protein profile
434
436 static bool print_start;
437 static bool print_stop;
438 static bool print_introns;
439 static bool print_cds;
440 static bool print_exonnames;
441 static bool stopCodonExcludedFromCDS;
442 static bool print_utr;
443 static bool print_blocks;
444};
445
446
447class AltGene {
448public:
449 list<Transcript*> transcripts;
450 long mincodstart; // leftmost position of any start codon (if coding)
451 long maxcodend; //
452 Strand strand;
453 string id;
454 string seqname;
455 float apostprob;
456 bool hasProbs;
457
458 AltGene() {
459 mincodstart = maxcodend = -1;
460 strand = plusstrand;
461 apostprob = 0.0;
462 hasProbs = 0;
463 }
464 bool operator< (const AltGene &other) const;
465 void addGene(Transcript* tx);
466 bool overlaps(Transcript *tx);
467 void shiftCoordinates(long d);
468 void sortTranscripts(int numkeep=-1);
469 void deleteSuboptimalTranscripts(bool uniqueCDS);
470 long minTransBegin();
471 long maxTransEnd();
472 bool isCoding(){ return transcripts.empty() || dynamic_cast<Gene*> (transcripts.front());}
473};
474
475Transcript* getPtr(list<AltGene> *gl);
476
477void printGeneList(list<AltGene> *genelist, AnnoSequence *annoseq, bool withCS, bool withAA, bool withEvidence);
478void printGeneList(Transcript* seq, AnnoSequence *annoseq, bool withCS, bool withAA);
479void printGeneSequence(Transcript* seq, AnnoSequence *annoseq = NULL, bool withCS=false, bool withAA=true);
480list<Gene*>* sortGenePtrList(list<Gene*>);
481list<AltGene> *reverseGeneList(list<AltGene> *altGeneList, long endpos);
482list<AltGene>* groupTranscriptsToGenes(list<Transcript*> &transcripts);
483
484void reverseGeneSequence(Transcript* &seq, long endpos);
485void postProcessGenes(list<AltGene> *genes, AnnoSequence *annoseq);
486Gene* promoteToCoding(Transcript* tx, AnnoSequence *as); // make a coding gene from a non-coding if it contains a good ORF
487
492public:
493 Annotation() {
494 genes = lastGene = (Gene*) 0;
495 forwardGenes = backwardGenes = lastForwardGene = lastBackwardGene = (Gene*) 0;
496 forwardPath = condensedForwardPath = backwardPath = condensedBackwardPath = (StatePath*) 0;
497 path = condensedPath = (StatePath*) 0;
498 emiProb = 1.0;
499 forwardEmiProb = 1.0;
500 backwardEmiProb = 1.0;
501 }
502 ~Annotation() {
503 Transcript::destroyGeneSequence(genes);
504 if (path)
505 delete path;
506 if (condensedPath)
507 delete condensedPath;
508
509 Transcript::destroyGeneSequence(forwardGenes);
510 Transcript::destroyGeneSequence(backwardGenes);
511 if (forwardPath)
512 delete forwardPath;
513 if (backwardPath)
514 delete backwardPath;
515 if (condensedForwardPath)
516 delete condensedForwardPath;
517 if (condensedBackwardPath)
518 delete condensedBackwardPath;
519 }
520
521 void appendGene(Transcript* gene);
522 void appendForwardGene(Transcript* gene);
523 void appendBackwardGene(Transcript* gene);
524 const void printGFF() const;
525
526 StatePath *path;
527 StatePath *condensedPath;
528
529 StatePath *forwardPath;
530 StatePath *backwardPath;
531 StatePath *condensedForwardPath;
532 StatePath *condensedBackwardPath;
533
534 Transcript *genes;
535 Transcript *forwardGenes;
536 Transcript *backwardGenes;
537
538 Double emiProb;
539 Double forwardEmiProb;
540 Double backwardEmiProb;
541
542private:
543 Transcript *lastGene;
544 Transcript *lastForwardGene, *lastBackwardGene;
545};
546
547
549public:
550 AnnoSequence(){
551 length = 0;
552 sequence = 0;
553 seqname = 0;
554 next = (AnnoSequence*) 0;
555 anno = (Annotation*) 0;
556 weight = 1;
557 offset = 0;
558 }
560 if (sequence)
561 delete[] sequence;
562 if (seqname)
563 delete [] seqname;
564 if (anno)
565 delete anno;
566 }
567 static void deleteSequence(AnnoSequence *head){
568 AnnoSequence* nextHead = head;
569 while(nextHead) {
570 head = nextHead;
571 nextHead = nextHead->next;
572 delete head;
573 }
574 }
575 void setWeight(int w) {
576 weight = w;
577 if (!anno)
578 return;
579
580 for (Transcript* t = anno->genes; t != NULL; t = t->next){
581 Gene *g = dynamic_cast<Gene*> (t);
582 if (g)
583 g->weight = w;
584 }
585 }
586 /*
587 * reverse complement the sequence and all genes
588 */
589 AnnoSequence* getReverseComplement();
590 void printGFF();
591
592 char* seqname;
593 long length;
594 long offset; // if >0 this is the number of bases that have been cut out in the front of the original input sequence
595 char* sequence;
596 BaseCount bc;
597 AnnoSequence *next;
598 Annotation* anno;
599 int weight;
600};
601
602bool isAllLC(AnnoSequence *seqs);
603
604
605/*
606 * Print a warning if --softmasking=1 (default) and all sequences are
607 * completely in lowercase characters. This can happen in particular
608 * if the input is from GenBank.
609 */
610void warnAllLowerCase(AnnoSequence *annoseq);
611
612
613
619public:
620 AnnoSeqGeneIterator(const AnnoSequence* annoseqHead){
621 annoseq = annoseqHead;
622 while (annoseq && !(annoseq->anno && annoseq->anno->genes))
623 annoseq = annoseq->next;
624 if (annoseq)
625 gene = annoseq->anno->genes;
626 else
627 gene = NULL;
628 };
629 friend const AnnoSeqGeneIterator& operator++(AnnoSeqGeneIterator& gi);
630
631 bool hasMoreElements(){
632 return (annoseq != NULL && gene != NULL);
633 }
634 const AnnoSequence *annoseq;
635 const Transcript *gene;
636};
637
642public:
645 bool operator()(StatePath *first, StatePath *second){
646 return (first->pathemiProb > second->pathemiProb);
647 }
648};
649
654public:
656 sorted = false;
657 };
659 StatePath *p;
660 while (!pathlist.empty()){
661 p = pathlist.front();
662 pathlist.pop_front();
663 delete p;
664 }
665 }
666 void addPath(StatePath *p){
667 if (!containsPath(p))
668 pathlist.push_back(p);
669 sorted = false;
670 }
671 int size(){
672 return pathlist.size();
673 }
674 void sort(){
675 pathlist.sort(CompareStatePathPtr());
676 sorted = true;
677 }
678 void printAPosterioriProbs(ostream& out) {
679 for(list<StatePath*>::iterator it=pathlist.begin(); it!=pathlist.end(); it++)
680 out << (*it)->pathemiProb << " , ";
681 out << endl;
682 }
683 bool containsPath(StatePath *p);
684 int positionInCollection(StatePath *p);
685
686private:
687 list<StatePath*> pathlist;
688 bool sorted;
689};
690
691void examineBaseCountOfGeneSeq(AnnoSequence *as);
692
693
698public:
699 FreqSegment(long s, int f){
700 start = s;
701 freq = f;
702 }
703 long start;
704 int freq;
705};
706
707#endif // _GENE_HH
Definition gene.hh:447
Definition gene.hh:618
Definition gene.hh:548
Definition gene.hh:491
Definition motif.hh:33
Definition gene.hh:641
A summary of extrinsic evidence by source of evidence.
Definition gene.hh:82
Hints on the gene structure.
Definition hints.hh:60
Definition gene.hh:697
Definition gene.hh:351
int length
The length of the span of the coding part (with introns)
Definition gene.hh:419
State * utr5exons
members for UnTranslated Region
Definition gene.hh:410
int clength
The coding length of the gene.
Definition gene.hh:421
int frame
The reading frame position of the first base (usually 0)
Definition gene.hh:423
static bool print_start
output options
Definition gene.hh:436
HintGroup.
Definition hints.hh:149
This class implements a double object with a very large range.
Definition lldouble.hh:31
Definition gene.hh:59
Definition gene.hh:653
A path through the Hidden-Markov-Model states.
Definition gene.hh:199
void print()
Definition gene.cc:344
Definition gene.hh:101
long begin
Definition gene.hh:106
State * next
Definition gene.hh:114
long end
Definition gene.hh:110
Definition gene.hh:250
Definition gene.hh:51