merkmal.hh
/*
 * merkmal.hh
 *
 * License: Artistic License, see file LICENSE.TXT or
 * https://opensource.org/licenses/artistic-license-1.0
 *
 * Description: Features (German: Merkmale) for the Conditional Random Field
 */

#ifndef _MERKMAL_HH
#define _MERKMAL_HH

// project includes
#include "types.hh"
#include "extrinsicinfo.hh"
#include "gene.hh"

// standard C/C++ includes
#include <iostream>
#include <fstream>   // for ifstream (BinnedMMGroup::read)
#include <vector>
#include <iomanip>   // for setw
#include <algorithm> // for sort (CRF::compareWeights)

using namespace std;


// Forward declarations
class Parameters;
class StateModel;

/**
 * @brief MMGroup (MM = Merkmal = Feature)
 *
 * Abstract base class for a group of related CRF features whose
 * parameters are registered as one block with a Parameters object.
 */
class MMGroup {
public:
    MMGroup(string name){parname = name; parameters = NULL;};
    MMGroup(){parameters = NULL;};
    virtual ~MMGroup() {};
    // derived classes implement:
    // Double getFactor(...); e.g. getFactor(int frame, int pattern)
    // int getIndex(...);     e.g. getIndex(int frame, int pattern)
    void registerPars(Parameters* parameters, double alphavalue = 1.0); // adds this group of features to the Parameters
    virtual string verbalDescription(int index) = 0;
    virtual Double* getFactor(int index) = 0;
    virtual void smooth(){} // make parameters of this feature group smooth, consistent, monotonic, etc.
    void addCount(int index, int count=1);
    int getOffset() {return offset;}
    void setOffset(int offset) { this->offset = offset; }
    void setName(string name) {parname = name;}
    virtual int getNumPars() {return numPars;}
protected:
    string parname; // name of the parameter group, e.g. "exon emiprobs" or "ass motif"
    int numPars;    // number of parameters
private:
    Parameters* parameters; // Parameters object that this group belongs to
    int offset;             // offset in the Parameters vector where the features of this group start
};

/**
 * @brief Contains a vector of parameters. Is used in particular for intron emiprobs.
 */
class PatMMGroup : public MMGroup {
public:
    int getNumPars() {numPars = probs.size(); return numPars;}
    PatMMGroup(string name) : MMGroup(name){};
    PatMMGroup() : MMGroup(){};
    Double* getFactor(int index);
    string verbalDescription(int index);
    Double getMinProb(float qthresh);
    // data members
    int order;
    vector<Double> probs;
};

/**
 * @brief Contains a vector of parameters for each frame. Is used in particular for exon emiprobs.
 */
class FramedPatMMGroup : public MMGroup {
public:
    int getNumPars() {numPars = probs[0].size() + probs[1].size() + probs[2].size(); return numPars;}
    FramedPatMMGroup(string name) : MMGroup(name){};
    void getFramePat(int index, int &frame, int &pattern);
    inline int getIndex(int frame, int pattern){ return frame * probs[0].size() + pattern;}
    Double* getFactor(int index);
    string verbalDescription(int index);
    // data members
    int order;
    vector<Double> probs[3];
};

/**
 * @brief Contains features for bins of a probability (or a score).
 */
class BinnedMMGroup : public MMGroup {
public:
    BinnedMMGroup(string name, int monotonic = 0) : MMGroup(name){this->monotonic = monotonic;};
    BinnedMMGroup() : MMGroup(){reset();};
    void reset(){nbins=0; origprobs.clear(); bb.clear(); avprobs.clear();}
    void removeOrigs(){origprobs.clear();}
    void addProb(Double p){origprobs.push_back(p);};
    void trainBins(int maxNumBins);
    void monotonify(); // change weights so the function is monotonic (again)
    void printBoundaries();
    int getIndex(Double p);
    Double factor(Double p){if (nbins==0) return p; return avprobs[getIndex(p)];}
    void write(ostream &out);
    void read(ifstream &in);
    Double getMinProb(float qthresh);
    // inherited
    Double* getFactor(int index);
    string verbalDescription(int index);
    void smooth(){ monotonify(); }
    int getNumPars() {numPars = avprobs.size(); return numPars;}
    // data members
    int monotonic;            // +1: monotonically increasing, -1: monotonically decreasing, 0: not monotonic
    int nbins;                // number of bins
    vector<Double> origprobs; // original probabilities used for "training" of bb and avprobs
    vector<Double> bb;        // bin boundaries
    vector<Double> avprobs;   // probs for bins [-infty, bb[0]), [bb[0],bb[1]), ..., [bb[nbins-1], infty]
};

/**
 * @brief Holds the parameters (weights) of the Conditional Random Field,
 *        together with their usage counts and regularization weights.
 */
class Parameters {
public:
    Parameters(){
        weights.reserve(5000); // adding parameters up to this number won't force a resize of the vector
        counts.reserve(5000);
        alpha.reserve(5000);
        parptrs = NULL;
        mmgroups = NULL;
    }
    ~Parameters(){}
    int addMMGroup(MMGroup* mmgroup, double alphavalue = 1.0); // returns the offset
    void addMerkmal(Double* parptr, double alphavalue = 1.0);
    void addCount(int index, int count=1) { counts[index] += count; }
    void resetCounts(){ for (int i=0; i<counts.size(); i++) counts[i]=0; };
    void addWeights(vector<double> &h);
    void setWeights(vector<double> &v);
    void smoothFeatures();
    void updatePars();    // set pars to the exp of weights
    void updateWeights(); // set weights to the log of pars
    double getWeight(int i){ return weights[i]; }
    double getAlpha(int i){ return alpha[i]; }
    string verbalDescription(int index);
    void print(int numprint, bool countsOnly) { print<int>(counts, numprint, countsOnly); }
    template <class T>
    void print(vector<T> &counts, int numprint, bool countsOnly); // numprint < 0 => print all

//    void print(vector<int> &counts, bool verbose, bool countsOnly);
    vector<int> getCounts(){ return counts; }
    int size() { return weights.size(); }
    vector<double> getWeights(){ return weights; }
private:
    vector<double> weights;     // the weights (parameters) of the CRF (weight vector), logarithms of the actual parameters
    vector<int> counts;         // holds counts of how often each feature was used in a parse
    vector<double> alpha;       // regularization weight: alpha[j] large => weight[j] is allowed to vary more
    vector<Double*> *parptrs;   // pointer to a vector of pointers to the actual Double variables used in the prediction
    vector<MMGroup*> *mmgroups; // ordered list of all contained MMGroups
};

template <class T>
void Parameters::print(vector<T> &counts, int numprint, bool countsOnly){
    T countsum = 0;
    double weightsum = 0.0;

    cout << "[CRF parameters]: ";
    if (!mmgroups){
        cout << " empty " << endl;
        return;
    }

    cout << weights.size() << " parameters" << endl;
    int grpidx=0;
    MMGroup* mmgroup = (*mmgroups)[grpidx];
    for (int index=0; index < weights.size(); index++){
        if (numprint > 0 && index == numprint/2 && weights.size() > numprint)
            cout << "..." << endl;
        if (numprint < 0 || index < numprint/2 || weights.size()-index <= numprint/2) {
            cout << index << "\t";
            while (mmgroup->getOffset() + mmgroup->getNumPars() <= index && grpidx < mmgroups->size()-1){
                grpidx++;
                mmgroup = (*mmgroups)[grpidx];
            }
            if (grpidx >= mmgroups->size() || mmgroup->getOffset() > index){
                throw ProjectError("Parameters::print: inconsistency");
            }
            cout << left << setw(30) << mmgroup->verbalDescription(index - mmgroup->getOffset()) << "\t";
            cout << "alpha=" << alpha[index];
            cout << "\tc=" << counts[index];
            if (!countsOnly)
                cout << "\tw=" << setprecision(6) << weights[index] << "\tp=" << *(*parptrs)[index];
            cout << endl;
        }
        countsum += counts[index];
        weightsum += weights[index];
    }
    cout << "***\t" << setw(30) << "average over all features" << "\t";
    cout << "c=" << countsum;
    if (!countsOnly)
        cout << "\tw=" << weightsum/weights.size() << "\tp=" << LLDouble::exp(weightsum/weights.size());
    cout << endl;
}


/**
 * @brief Implements functions for training Conditional Random Fields.
 */
class CRF {
public:
    static void train(Parameters* startPars, vector<AnnoSequence*> trainGenes, FeatureCollection &extrinsicFeatures);
    static void onlineLargeMarginTraining(Parameters* startPars, vector<AnnoSequence*> trainGenes, FeatureCollection &extrinsicFeatures);
    static void improvedIterativeScaling(Parameters* startPars, vector<AnnoSequence*> trainGenes, FeatureCollection &extrinsicFeatures);
    template <class U, class V>
    static void compareWeights(Parameters* parameters, vector<U> &startWeights, vector<V> &endWeights, int numPrint=100);
    static double lossFct(Transcript* correct, Transcript* predicted);
    static void setEvalAnnoSeqs(vector<AnnoSequence*> seqs){evalAnnoSeqs = seqs;}
    // this function is only temporary, for saving intermediate parameter files
    static void setPrintPars(int ostatecount, StateModel **ostates){
        statecount = ostatecount;
        states = ostates;
    }
private:
    static vector<double> capOutliers(vector<double> bs);
    static vector<AnnoSequence*> evalAnnoSeqs;
    static FeatureCollection *evalExtrinsicFeatures;
    static int statecount;
    static StateModel **states;
};

/**
 * @brief Comparison functor that orders pointers by the values they point to.
 */
struct LessDereference {
    template <class T>
    bool operator()(const T * lhs, const T * rhs) const {
        return *lhs < *rhs;
    }
};

template <class U, class V>
void CRF::compareWeights(Parameters* parameters, vector<U> &startWeights, vector<V> &endWeights, int numPrint){
    int n = numPrint;
    if (n > startWeights.size())
        n = startWeights.size();
    if (n > 0)
        cout << "sorted list of the " << n << " most significant changes." << endl;
    vector<double> absDiffs;
    absDiffs.reserve(startWeights.size());
    for (int i=0; i < startWeights.size(); i++)
        absDiffs.push_back(abs(startWeights[i] - endWeights[i]));

    std::vector<const double *> pointer;
    pointer.reserve(absDiffs.size());
    const double *const start = &absDiffs[0];
    const double *const end = start + absDiffs.size();

    for (const double * iter = start; iter != end; ++iter)
        pointer.push_back(iter);

    std::sort(pointer.begin(), pointer.end(), LessDereference());
    for (int i = pointer.size()-1; i >= pointer.size()-n && i >= 0; i--) {
        const double * p = pointer[i];
        int idx = p - start;
        if (endWeights[idx] != startWeights[idx]){
            cout << parameters->verbalDescription(idx) << "\t" << startWeights[idx] << " --> " << endWeights[idx]
                 << " diff= " << endWeights[idx] - startWeights[idx] << "\t" << setprecision(4) << LLDouble::exp(endWeights[idx] - startWeights[idx]) << endl;
        }
    }
}

#endif // _MERKMAL_HH
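
Usage sketch (not part of merkmal.hh): the header only declares the interfaces; the definitions live in merkmal.cc and in the model classes. As a rough, hedged illustration of how the pieces are intended to fit together, the fragment below creates a PatMMGroup, registers it with a Parameters object and records a feature count. The function name exampleUsage, the group name "intron emiprobs" and the probability values are made up for illustration; it is assumed that Double (LLDouble) can be constructed from a plain double, and linking requires merkmal.cc.

#include "merkmal.hh"

void exampleUsage() {
    // A feature group with one parameter per pattern; name and values
    // are illustrative only.
    PatMMGroup emiprobs("intron emiprobs");
    emiprobs.order = 0;
    emiprobs.probs.push_back(Double(0.3)); // assumes Double is constructible from double
    emiprobs.probs.push_back(Double(0.7));

    // The CRF parameter vector; registerPars "adds this group of features
    // to the Parameters" (see MMGroup::registerPars above).
    Parameters pars;
    emiprobs.registerPars(&pars, 1.0 /* alphavalue */);

    // Record that the feature at group-relative index 1 was used once;
    // addCount is declared in MMGroup above, its definition is in merkmal.cc.
    emiprobs.addCount(1);

    // Print the counts of the first few parameters.
    pars.print(10 /* numprint */, true /* countsOnly */);
}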