Augustus 3.4.0
Loading...
Searching...
No Matches
extrinsicinfo.hh
1/*
2 * extrinsicinfo.hh
3 *
4 * License: Artistic License, see file LICENSE.TXT or
5 * https://opensource.org/licenses/artistic-license-1.0
6 *
7 * Description: EST information or homology information or information from inter species
8 * comparisons. Hints in general.
9 */
10
11#ifndef __EXTRINSICINFO_HH
12#define __EXTRINSICINFO_HH
13
14// project includes
15#include "hints.hh"
16#include "properties.hh"
17#include "gene.hh"
18
19// standard C/C++ includes
20#include <set>
21#include <fstream>
22#include <vector>
23#include <iostream>
24
25
26#define forwDSS 0
27#define revDSS 1
28#define forwASS 2
29#define revASS 3
30#define maxStoreMalus 3000
31
38public:
40 begin = end = -1;
41 omittedGroups = NULL;
42 allHints = false;
43 }
44 PredictionRun(int begin, int end, list<HintGroup*> *omittedGroups, bool allHints = false){
45 this->begin = begin;
46 this->end = end;
47 this->omittedGroups = omittedGroups;
48 this->allHints = allHints;
49 }
50 void print(int beginPos = -1);
51 int begin;
52 int end;
53 bool allHints; // this is true for the (first) prediction run that includes all hints, this is not quite the same as omittedGroups=NULL
54 list<HintGroup*> *omittedGroups;
55};
56
63public:
64 PredictionScheme(int n){
65 this->n = n;
66 }
67 void addRun(PredictionRun run);
68 void print(int beginPos = -1);
69 int n;
70 list<PredictionRun> predictionRuns;
71};
72
73
80
87public:
89 this->collection = collection;
90 featureLists = new list<Feature>[NUM_FEATURE_TYPES];
91 groupList = NULL;
92 groupGaps = NULL;
93 predictionScheme = NULL;
94 sorted = true;
95 groupsSorted = false;
96 hintedSites = NULL;
97 hasLocalSSmalus = NULL;
98 seqlen = 0;
99 K = BLOCKSIZE;
100 firstEnd = NULL;
101 lastStart = NULL;
102 }
103 SequenceFeatureCollection(SequenceFeatureCollection& other, int from, int to, bool rc = false);
105 if (featureLists)
106 delete[] featureLists;
107 if (predictionScheme)
108 delete predictionScheme;
109 if (groupList)
110 delete groupList;
111 if (groupGaps)
112 delete groupGaps;
113 if (hintedSites)
114 delete [] hintedSites;
115 if (hasLocalSSmalus)
116 delete [] hasLocalSSmalus;
117 if (firstEnd) {
118 for (int type=0; type < NUM_FEATURE_TYPES; type++)
119 delete [] firstEnd[type];
120 delete [] firstEnd;
121 }
122 if (lastStart) {
123 for (int type=0; type < NUM_FEATURE_TYPES; type++)
124 delete [] lastStart[type];
125 delete [] lastStart;
126 }
127
128 }
129
130 void addFeature(Feature f);
131 void printFeatures(ostream& out);
132 void sortFeatureLists();
133 void checkGroupConsistency(AnnoSequence *seq);
134 void warnInconsistentHints(AnnoSequence *seq);
135 static void deleteEqualElements(list<Feature> &list);
136 list<Feature> getFeatureList(FeatureType type) {
137 return featureLists[(int) type];
138 }
139 Feature *getFeatureAt(FeatureType type, int endPosition, Strand strand);
140 Feature *getFeatureListAt(FeatureType type, int endPosition, Strand strand);
141 Feature *getAllActiveFeatures(FeatureType type);
142 Feature *getFeatureListInRange(FeatureType type, int startPosition, int endPosition,
143 Strand strand, int seqRelFrame=-1);
144 Feature *getFeatureListBeginningInRange(FeatureType type, int startPosition, int endPosition,
145 Strand strand, int seqRelFrame=-1);
146 Feature *getFeatureListOvlpingRange(FeatureType type, int startPosition, int endPosition, Strand strand);
147 Feature *getFeatureListOvlpingRange(Bitmask featuretypes, int startPosition, int endPosition, Strand strand);
148 Feature *getFeatureListContaining(Bitmask featuretypes, int position, Strand strand);
149 Feature *getExonListInRange(int startPosition, int endPosition, Strand strand, int seqRelFrame=-1);
150 Feature *getExonListOvlpingRange(int startPosition, int endPosition, Strand strand, int seqRelFrame=-1);
151 double localSSMalus(FeatureType type, int pos, Strand strand);
152 void shift(int offset);
153 void computeHintedSites(const char* dna);
154 bool validDSSPattern(const char* dna) const {
155 try {
156 return validDSS[Seq2Int(2)(dna)];
157 } catch (...) {
158 return false;
159 }
160 }
161 bool validRDSSPattern(const char* dna) const {
162 try {
163 return validDSS[Seq2Int(2).rc(dna)];
164 } catch (...) {
165 return false;
166 }
167 }
168 bool validASSPattern(const char* dna) const {
169 try {
170 return validASS[Seq2Int(2)(dna)];
171 } catch (...) {
172 return false;
173 }
174 }
175 bool validRASSPattern(const char* dna) const {
176 try {
177 return validASS[Seq2Int(2).rc(dna)];
178 } catch (...) {
179 return false;
180 }
181 }
182 bool validSplicePattern(string s) const {
183 return validHintedSites.count(s) > 0;
184 }
185 bool validRSplicePattern(string s) const {
186 string reverseComplement(4,0);
187 putReverseComplement(reverseComplement.begin(), s.c_str(), 4);
188 return validSplicePattern(reverseComplement);
189 }
190 bool isHintedDSS(int pos, Strand strand) const {
191 return hintedSites &&
192 hintedSites[pos][strand == plusstrand ? forwDSS : revDSS];
193 }
194 bool isHintedASS(int pos, Strand strand) const {
195 return hintedSites &&
196 hintedSites[pos][strand == plusstrand ? forwASS : revASS];
197 }
198 void deleteFeatureAt(FeatureType type, int endPosition, Strand strand);
199 void cleanRedundantFeatures();
200 void setSeqLen(int len){seqlen = len;}
201 void makeGroups();
202 void printGroups();
203 void findGenicGaps(); // find intra-group gaps: nonirpart hints
204 void findGroupGaps(); // find inter-group gaps: good positions to break up sequence
205 void determineInterGroupRelations();
206 void resetConformance();
207 void emptyTrash();
208 void computeIndices();
209 list<Feature>::iterator getPosFirstEndAtOrAfter(int type, int e);
210 list<Feature>::iterator getPosStartAfter(int type, int s);
211 void rescaleBoniByConformance();
212 void createPredictionScheme(list<AltGene> *genes);
213 void setActiveFlag(list<HintGroup*> *groups, bool flag);
214 list<AltGene> *joinGenesFromPredRuns(list<list<AltGene> *> *genesOfRuns, int maxtracks, bool uniqueCDS);
215 void sortIncompGroupsOfGroups(); // sort by pointers, for '==' operator of lists
216 set<HintGroup*> *getCausingGroups(PredictionRun &pr);
217 void prepare(AnnoSequence *annoseq, bool print, bool withEvidence=true);
218 void prepareLocalMalus(const char* dna); // precompute tables for computing a local exonpart/UTRpart/CDSpart malus later
219 int numZeroCov(int start, int end, int type, Strand strand);
220 static void initHintedSplicesites(string ssList);
221 FeatureCollection *collection;
222
223 static Bitmask validDSS;
224 static Bitmask validASS;
225 static set<string> validHintedSites;
226
227 list<Feature> *featureLists;
228 list<HintGroup> *groupList;
229 list<Feature> *groupGaps; // sorted list of gaps between groups (irpart)
230 bool groupsSorted;
231 PredictionScheme *predictionScheme;
232 bool sorted;
233 Bitmask *hintedSites;
234 Bitmask *hasLocalSSmalus;
235 int seqlen;
236 int K; // block size for firstEnd and lastStart
237 // firstEnd[t][k] holds an iterator to featureList[t] to the first element f in the list such that
238 // f->end >= k*K, where k>=0 and k is such that k*K <= seqlen
239 list<Feature>::iterator **firstEnd;
240 // lastStart[t][k] holds an iterator to featureList[t] to the list such that all following list elements f have
241 // f->start > k*K, where k>=0 and k is such that k*K <= seqlen
242 list<Feature>::iterator **lastStart;
243 vector<int> cumCovUTRpartPlus; // cumulative number of positions not covered by UTRpart hints on the plus strand
244 vector<int> cumCovUTRpartMinus; // cumulative number of positions not covered by UTRpart hints on the minus strand
245 vector<int> cumCovCDSpartPlus; // cumulative number of positions not covered by CDSpart hints on the plus strand
246 vector<int> cumCovCDSpartMinus; // cumulative number of positions not covered by CDSpart hints on the minus strand
247 vector<int> cumCovExonpartPlus; // cumulative number of positions not covered by exonpart hints on the plus strand
248 vector<int> cumCovExonpartMinus; // cumulative number of positions not covered by exonpart hints on the minus strand
249
250private:
251 void addCumCov(vector<bool> &cov, const list<Feature>& flist, Strand strand);
252};
253
260 FeatureTypeInfo(int numSources=1, double b=-1.0, double m=1.0, double lm=1.0) :
261 bonus(b), // -1 means not initialized
262 malus(m), // don't change anything unless we have at least searched
263 localMalus(lm),
264 gradeclassbounds(numSources, vector<double>()),
265 gradequots(numSources, vector<double>(1, 1.0))
266 {}
267
268 /*
269 * gradeclass returns the grade grade associated with the score (e-value).
270 */
271 int gradeclass(int source, double score) {
272 int klasse = 0;
273 while (klasse < gradeclassnums(source)-1
274 && score >= gradeclassbounds[source][klasse])
275 klasse++;
276 return klasse;
277 }
278
279 int gradeclassnums(int source) { // for each source the number of classes
280 return gradequots[source].size();
281 }
282 void read(istream& datei, int source) {
283 int numclasses;
284 datei >> numclasses;
285 if (numclasses < 0 || numclasses > 10){
286 cerr << "Error: number of classes=" << numclasses << endl;
287 throw ProjectError("Error: number of classes out of range.");
288 }
289 gradeclassbounds[source].resize(numclasses-1);
290 gradequots[source].resize(numclasses);
291 for (int i=0; i<numclasses-1; i++) {
292 datei >> gradeclassbounds[source][i];
293 if (i>0 && gradeclassbounds[source][i] < gradeclassbounds[source][i-1]) {
294 cerr << "Error: class bounds not increasing!" << endl;
295 throw ProjectError("Error reading class bounds");
296 }
297 }
298 for (int i=0; i < numclasses; i++)
299 datei >> gradequots[source][i];
300 }
301 double bonus;
302 double malus;
303 double localMalus; // (stronger) additional malus given the gene has evidence at other regions
304
305 vector<vector<double> > gradeclassbounds; // for source and class the boundary
306
307 /*
308 * gradequot equals p+(g)/p-(g) for a given type
309 * where g is the grade associated with the score (e-value).
310 */
311 vector<vector<double> > gradequots; // for each source and class the grade quotient
312};
313
315public:
317 hasHintsFile(false),
318 numSeqsWithInfo(0),
319 numSources(1),
320 malustable(NULL),
321 localmalustable(NULL)
322 {
323 try {
324 offset = Properties::getIntProperty( "predictionStart" ) - 1;
325 } catch (...) {
326 offset = 0;
327 }
328 string ssList = "gtag,gcag,";
329 try {
330 ssList += Properties::getProperty("allow_hinted_splicesites");
331 ssList += ",";
332 } catch (KeyNotFoundError &) {}
333 SequenceFeatureCollection::initHintedSplicesites(ssList);
334
335 if (offset < 0)
336 offset = 0;
337 Feature::offset = offset;
338 }
340 // TODO deep assignment operator
341 //FeatureCollection & operator = (const FeatureCollection & other){
342 // return *this;}
343
345 for (map<string, SequenceFeatureCollection*>::iterator it = collections.begin();
346 it != collections.end();
347 it++) {
348 delete it->second;
349 it->second = NULL;
350 }
351 if (malustable){
352 for (int type=0; type < NUM_FEATURE_TYPES; type++)
353 if (malustable[type])
354 delete [] malustable[type];
355 delete [] malustable;
356 }
357 if (localmalustable){
358 for (int type=0; type < NUM_FEATURE_TYPES; type++)
359 if (localmalustable[type])
360 delete [] localmalustable[type];
361 delete [] localmalustable;
362 }
363 /*if(sourceKey)
364 delete [] sourceKey;
365 if(individual_liability)
366 delete [] individual_liability;
367 if(oneGroupOneGene)
368 delete [] oneGroupOneGene;
369 */
370 }
371
372 SequenceFeatureCollection& getSequenceFeatureCollection(const char *seqname){
373 SequenceFeatureCollection*& psfc = collections[seqname];
374 if (psfc == NULL)
375 psfc = new SequenceFeatureCollection(this);
376 return *psfc;
377 }
378 SequenceFeatureCollection* getSequenceFeatureCollection(string seqname){
379 return collections[seqname];
380 }
381 bool isInCollections(string seqname){return collections.count(seqname)>0;}
382 void readGFFFile(const char *filename);
383 void setBonusMalus(Feature& f);
384 void readExtrinsicCFGFile();
385 // reading the extrinsic config file is split into two parts:
386 void readSourceRelatedCFG(istream& datei); // reading in all source related parameters
387 void readTypeInfo(istream& datei); // reading in the bonus/malus table
388 int getNumSeqsWithInfo() { return numSeqsWithInfo;}
389 int getNumCommonSeqs(AnnoSequence *annoseq);
390 void printAccuracyForSequenceSet(const AnnoSequence* annoseqs, bool cleanRedundancies=true);
391 void printAccuracyForSequenceSetOld(const AnnoSequence* annoseqs, bool cleanRedundancies=true);
392 bool skeyExists(string skey);
393 int esource(string skey);
394 bool getIndividualLiability(string skey){return individual_liability[esource(skey)];}
395 bool get1group1gene(string skey){return oneGroupOneGene[esource(skey)];}
396 double malus(FeatureType type){
397 return typeInfo[type].malus;
398 }
399 double localMalus(FeatureType type){
400 return typeInfo[type].localMalus;
401 }
402
403 double partMalus(FeatureType type, int len);
404 double localPartMalus(FeatureType type, int len, Double bonus, int nindep);
405 bool hasHintsFile;
406 int offset;
407private:
408 int numSeqsWithInfo;
409 ifstream datei;
410
411 int numSources; // number of different sources of hints
412 string *sourceKey; // for each source the key string
413 bool *individual_liability; // for each source the flag individual_liability
414 bool *oneGroupOneGene; // for each source the flag 1group1gene
415 FeatureTypeInfo typeInfo[NUM_FEATURE_TYPES]; // the type-specific information
416 map<string, SequenceFeatureCollection*> collections;
417 double** malustable;
418 double** localmalustable;
419};
420
421#endif //__EXTRINSICINFO_HH
Definition gene.hh:548
Definition extrinsicinfo.hh:314
Hints on the gene structure.
Definition hints.hh:60
This class implements a double object with a very large range.
Definition lldouble.hh:31
Range of prediction and a set of HintGroups turned off.
Definition extrinsicinfo.hh:37
Plan of individual prediction steps with each a range and a set of hint groups turned on/off.
Definition extrinsicinfo.hh:62
Definition types.hh:449
a class for converting sequence into integer replacing Base4Int
Definition geneticcode.hh:163
holds all extrinsic feature information for one sequence
Definition extrinsicinfo.hh:86
Definition types.hh:425
holds all type-specific extrinsic feature information
Definition extrinsicinfo.hh:259
Definition properties.hh:68