Augustus 3.4.0
Loading...
Searching...
No Matches
exonmodel.hh
1/*
2 * exonmodel.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 */
7
8#ifndef _EXONMODEL_HH
9#define _EXONMODEL_HH
10
11#include "statemodel.hh"
12
13
15public:
16 ExonModelError(string msg) : ProjectError(msg) {}
17};
18
33public:
34 OpenReadingFrame(const char* dna, int _max_exon_length, int _n);
37 int leftmostExonBegin(int frame, int base, bool forward);
38
39private:
40 // used internally, these call GeneticCode
41 static bool isStopcodon(const char* dna);
42 static bool isRCStopcodon(const char* dna);
43
44 vector<Integer> nearestStopForward;
45 vector<Integer> nearestStopReverse;
46 int n;
47 int max_exon_length;
48 // static bool amber, ochre, opal; // now taken care of by GeneticCode
49};
50
51/*
52 * @brief Training the exon model and implementing the algorithms.
53 * @details coding exons only, UTR exons are modelled in utrmodel.cc
54 *
55 * @author Stafilarakis
56 * @author Mario Stanke
57 */
58class ExonModel : public StateModel{
59public:
60 ExonModel();
61 ~ExonModel();
62
63 StateType getStateType() const {
64 return etype;
65 }
66 void buildModel ( const AnnoSequence* annoseq, int parIndex );
67 void registerPars ( Parameters* parameters);
68 void printProbabilities ( int parIndex, BaseCount *bc, const char* suffix = NULL );
69 void viterbiForwardAndSampling(ViterbiMatrixType&, ViterbiMatrixType&, int, int,
70 AlgorithmVariant, OptionListItem&);
71 void processOvlpOption(ViterbiMatrixType& , ViterbiMatrixType&, AlgorithmVariant&,
72 int state, int endOfPred, int beginOfBioExon, Double &maxProb,
73 Double emiProb, Double &fwdsum, OptionsList *, OptionListItem &oli, int base) const;
74 Double emiProbUnderModel(int begin, int end) const;
75 Double endPartEmiProb(int end) const;
76 Double notEndPartEmiProb(int beginOfStart, int right, int frameOfRight, Feature *exonparts) const;
77 void initAlgorithms(Matrix<Double>&, int);
78 static void storeGCPars(int idx);
79
80 // class functions
81 static void init();
82 static void resetPars() {
83 initAlgorithmsCalled = false;
84 }
85 static void updateToLocalGC(int from = -1, int to = -1);
86 static void readProbabilities(int parIndex);
87 static void readAllParameters();
88 static double *getCodonUsage();
89 static void resetModelCount(){exoncount = 0;};
90 static int getMaxStateLen() { return Constant::max_exon_len + trans_init_window; }
91 static void setORF() {
92 if (orf)
93 delete orf;
94 orf = new OpenReadingFrame(sequence, Constant::max_exon_len, dnalen);
95 initAlgorithmsCalled = false;
96 }
97 static vector<Double> lenDistSingle; // Length distribution of Single exons (length of biol. exon)
98 static vector<Double> lenDistInitial; // Length distribution of Initial exons (length of biol. exon)
99 static vector<Double> lenDistInternal; // Length distribution of Internal exons (length of biol. exon)
100 static vector<Double> lenDistTerminal; // Length distribution of Terminal exons (length of biol. exon)
101
102private:
103 void processExons(const Gene* gene);
104 void processSingleExon(const State* exon);
105 void processInitialExon(const State* exon);
106 void processInternalExon(const State* exon);
107 void processTerminalExon(const State* exon);
108 void processInnerSequence(const char* begin, const char* end, int modeltype = 0);
109 void buildProbabilities ( );
110 Double seqProb(int endOfStart, int right, int frameOfRight) const;
111 Double eTermSeqProb(int left, int right, int frameOfRight) const;
112 Double initialSeqProb(int left, int right, int frameOfRight) const;
113 void computeLowerOrderPats();
114 void computeCMFromBW ();
115
116 // internal class functions
117 static void computeLengthDistributions( );
118 static void fillTailsOfLengthDistributions( );
119 static int getBaseOffset(StateType type);
120 static int getInnerPartEndOffset(StateType type);
121
122 StateType etype;
123 Integer win, // reading frame of this state (fixed)
124 curwin; // current reading frame during training
125 int beginPartLen; // constant for every etype: endOfPred = beginOfStart - beginPartLen -1
126 int innerPartOffset; // beginOfStart = beginOfBioExon + innerPartOffset
127 int innerPartEndOffset; // right = endOfBioExon - innerPartEndOffset
128 int baseOffset; // base = endOfBioExon - baseOffset
129 Integer gweight;
130 // int prewin;
131
132 // class variables
133 static vector<Integer> patterncount[3]; // {0,1,2}x{acgt}^(k+1), the reading frame is
134 // the position of the emitted (last) nucleotide
135 static vector<Integer> initpatterncount[3]; // like above, just the first nucleotides of a gene
136 static vector<Integer> etpatterncount[3] ; // internal exon ternimal part
137 static Integer k; // order of the content MM
138 static Integer etorder; // order of the exon terminating motif
139 static Integer etpseudocount;// pseudocount for the exon terminating motif
140 static Integer min_exon_length;
141 static Integer trans_init_window;
142
143 static FramedPatMMGroup emiprobs;
144 static FramedPatMMGroup *GCemiprobs;
145 static vector<Double> initemiprobs[3];
146 static vector<Double> **GCinitemiprobs;
147 static vector<Double> etemiprobs[3];
148 static vector<Double> **GCetemiprobs;
149 static vector<Integer> numExonsOfType;
150 static vector<Integer> numHugeExonsOfType; // number of exons exceeding the maximal length
151 // modelled by the length distribution
152 static Integer numSingle, numInitial, numInternal, numTerminal;
153 static Integer numHugeSingle, numHugeInitial, numHugeInternal, numHugeTerminal;
154 static Matrix<vector<Double> > Pls;
155 static Matrix<vector<Double> >* GCPls; // array with one matrix per GC content class
156 static Integer exoncount;
157 static Boolean hasLenDist;
158 static Integer gesbasen[3];
159 static Double patpseudo; // pseudocount for patterns in sequence
160 static Integer exonLenD; // number of exons of length <= d
161 static Integer minPatSum; // for the decision to shorten the emission pattern
162 static vector<Integer> lenCountSingle; // Length count of Single exons (length of biol. exon)
163 static vector<Integer> lenCountInitial; // Length count of Initial exons (length of biol. exon)
164 static vector<Integer> lenCountInternal; // Length count of Internal exons (length of biol. exon)
165 static vector<Integer> lenCountTerminal; // Length count of Terminal exons (length of biol. exon)
166 static double slope_of_bandwidth;// for smoothing
167 static Integer minwindowcount; // see class Smooth in commontrain.hh
168 static Motif *transInitMotif; // weight matrix before the translation initiation
169 static Motif *GCtransInitMotif; // array for each GC content class
170 static BinnedMMGroup transInitBinProbs; // CRF-features based on transInitMotif
171 static BinnedMMGroup *GCtransInitBinProbs;// for all GC content classes
172 static Integer tis_motif_memory; // order of the trans init motif
173 static Integer tis_motif_radius; // radius for the smoothing of the trans init motif
174 static Motif **etMotif; // weight matrices before the donor splice site (3 frames)
175 static Motif ***GCetMotif; // array with motifs for each GC content class
176 static int numModels;
177 static Double *modelStartProbs;
178 static int ilend;
179 static OpenReadingFrame *orf;
180 static int ochrecount, ambercount, opalcount; // frequencies of the 3 stop codons
181 static bool initAlgorithmsCalled, haveORF;
182 static int lastParIndex; // GC-index of current parameter set
183 static int verbosity;
184 static list<string> *tiswins; // holds translation initiation windows (for CRF training)
185 static int startcounts[]; // how often did the different start codons appear during training?
186 static int lenboostL; // parameters for boosting length distribution of single and initial exons
187 static double lenboostE; // length above L are improved, the more the larger E is
188};
189
190
191#endif // _EXONMODEL_HH
Definition gene.hh:548
Definition motif.hh:33
Contains features for bins of a probability (or a score).
Definition merkmal.hh:120
Definition exonmodel.hh:14
Definition exonmodel.hh:58
Hints on the gene structure.
Definition hints.hh:60
Contains a vector of parameters for each frame. Is used in particular for exon emiprobs.
Definition merkmal.hh:98
Definition gene.hh:351
This class implements a double object with a very large range.
Definition lldouble.hh:31
A simple matrix class. Base class for all mathematical matrix objects.
Definition matrix.hh:27
Definition motif.hh:92
Open Reading Frame (ORF), part of the exon model.
Definition exonmodel.hh:32
Options lists are used for sampling; items also in backtracking.
Definition vitmatrix.hh:748
Definition vitmatrix.hh:779
Definition merkmal.hh:148
Definition types.hh:449
ProjectError()
Definition types.hh:460
This is the base interface class common to all state model classes (ExonModel, IntronModel,...
Definition statemodel.hh:65
Definition gene.hh:101
An array of Viterbi columns.
Definition vitmatrix.hh:687