16#include <boost/archive/text_oarchive.hpp>
17#include <boost/archive/text_iarchive.hpp>
18#include <boost/archive/impl/basic_text_oarchive.ipp>
19#include <boost/archive/impl/text_oarchive_impl.ipp>
20#include <boost/serialization/list.hpp>
21#include <boost/serialization/vector.hpp>
27void expandDir(
string& filename);
41#include <unordered_map>
50enum Strand {STRAND_UNKNOWN=-1, plusstrand, minusstrand, bothstrands};
51char strandChar (Strand s);
52ostream& operator<< (ostream& strm,
const Strand s);
61#define STARTCODON_LEN 3
62#define STOPCODON_LEN 3
63#define OCHRECODON "taa"
64#define OPALCODON "tga"
65#define AMBERCODON "tag"
66#define RCOCHRECODON "tta"
67#define RCOPALCODON "tca"
68#define RCAMBERCODON "cta"
72#define SPECIES_SUBDIR "species/"
73#define MODEL_SUBDIR "model/"
74#define EXTRINSIC_SUBDIR "extrinsic/"
76#define VERSION "3.5.0"
78#define PREAMBLE "# This output was generated with AUGUSTUS (version " VERSION ").\n\
79# AUGUSTUS is a gene prediction tool written by M. Stanke (mario.stanke@uni-greifswald.de),\n\
80# O. Keller, S. König, L. Gerischer, L. Romoth, Katharina Hoff, Henry Mehlan and Daniel Honsel.\n\
81# Please cite: Mario Stanke, Mark Diekhans, Robert Baertsch, David Haussler (2008),\n\
82# Using native and syntenically mapped cDNA alignments to improve de novo gene finding\n\
83# Bioinformatics 24: 637-644, doi 10.1093/bioinformatics/btn013"
85#define GREETING "AUGUSTUS (" VERSION ") is a gene prediction tool.\n\
86Sources and documentation at https://github.com/Gaius-Augustus/Augustus"
88#define SPECIES_LIST "usage:\n\
89augustus [parameters] --species=SPECIES queryfilename\n\
91where SPECIES is one of the following identifiers\n\
93identifier | species\n\
94-----------------------------------------|----------------------\n\
95pea_aphid | Acyrthosiphon pisum\n\
96aedes | Aedes aegypti\n\
97amphimedon | Amphimedon queenslandica\n\
98ancylostoma_ceylanicum | Ancylostoma ceylanicum\n\
99adorsata | Apis dorsata\n\
100honeybee1 | Apis mellifera\n\
101arabidopsis | Arabidopsis thaliana\n\
102aspergillus_fumigatus | Aspergillus fumigatus\n\
103aspergillus_nidulans | Aspergillus nidulans\n\
104(anidulans) | Aspergillus nidulans\n\
105aspergillus_oryzae | Aspergillus oryzae\n\
106aspergillus_terreus | Aspergillus terreus\n\
107bombus_impatiens1 | Bombus impatiens\n\
108bombus_terrestris2 | Bombus terrestris\n\
109botrytis_cinerea | Botrytis cinerea\n\
110brugia | Brugia malayi\n\
111b_pseudomallei | Burkholderia pseudomallei\n\
112caenorhabditis | Caenorhabditis elegans\n\
113(c_elegans_trsk) | Caenorhabditis elegans\n\
114(elegans) | Caenorhabditis elegans\n\
115elephant_shark | Callorhinchus milii\n\
116camponotus_floridanus | Camponotus floridanus\n\
117candida_albicans | Candida albicans\n\
118candida_guilliermondii | Candida guilliermondii\n\
119candida_tropicalis | Candida tropicalis\n\
120chaetomium_globosum | Chaetomium globosum\n\
121chiloscyllium | Chiloscyllium punctatum\n\
122chlamy2011 | Chlamydomonas reinhardtii\n\
123(chlamydomonas) | Chlamydomonas reinhardtii\n\
124chlorella | Chlorella sp.\n\
125ciona | Ciona intestinalis\n\
126coccidioides_immitis | Coccidioides immitis\n\
127coccidioides_immitis | Coccidioides immitis\n\
128Conidiobolus_coronatus | Conidiobolus coronatus\n\
129coprinus_cinereus | Coprinus cinereus\n\
130coprinus_cinereus | Coprinus cinereus\n\
131(coprinus) | Coprinus cinereus\n\
132coprinus | Coprinus cinereus\n\
133cryptococcus_neoformans_gattii | Cryptococcus neoformans gattii\n\
134cryptococcus_neoformans_neoformans_B | Cryptococcus neoformans neoformans\n\
135(cryptococcus) | Cryptococcus neoformans\n\
136culex | Culex pipiens\n\
137zebrafish | Danio rerio\n\
138debaryomyces_hansenii | Debaryomyces hansenii\n\
139fly | Drosophila melanogaster\n\
140(fly_exp) | Drosophila melanogaster\n\
141encephalitozoon_cuniculi_GB | Encephalitozoon cuniculi\n\
142eremothecium_gossypii | Eremothecium gossypii\n\
143E_coli_K12 | Escherichia coli K12\n\
144fusarium_graminearum | Fusarium graminearum\n\
145(fusarium) | Fusarium graminearum\n\
146galdieria | Galdieria sulphuraria\n\
147chicken | Gallus gallus domesticus\n\
148heliconius_melpomene1 | Heliconius melpomene\n\
149histoplasma_capsulatum | Histoplasma capsulatum\n\
150(histoplasma) | Histoplasma capsulatum\n\
151human | Homo sapiens\n\
152kluyveromyces_lactis | Kluyveromyces lactis\n\
153laccaria_bicolor | Laccaria bicolor\n\
154leishmania_tarentolae | Leishmania tarentolae\n\
155japaneselamprey | Lethenteron camtschaticum\n\
156lodderomyces_elongisporus | Lodderomyces elongisporus\n\
157magnaporthe_grisea | Magnaporthe grisea\n\
158mnemiopsis_leidyi | Mnemiopsis leidyi\n\
159nasonia | Nasonia vitripennis\n\
160nematostella_vectensis | Nematostella vectensis\n\
161neurospora_crassa | Neurospora crassa\n\
162(neurospora) | Neurospora crassa\n\
163coyote_tobacco | Nicotiana attenuata\n\
164rice | Oryza sativa\n\
165parasteatoda | Parasteatoda sp.\n\
166sealamprey | Petromyzon marinus\n\
167phanerochaete_chrysosporium | Phanerochaete chrysosporium\n\
168(pchrysosporium) | Phanerochaete chrysosporium\n\
169pichia_stipitis | Pichia stipitis\n\
170pisaster | Pisaster ochraceus\n\
171pfalciparum | Plasmodium falciparum\n\
172pneumocystis | Pneumocystis jirovecii\n\
173rhincodon | Rhincodon typus\n\
174rhizopus_oryzae | Rhizopus oryzae\n\
175rhodnius | Rhodnius prolixus\n\
176saccharomyces_cerevisiae_rm11-1a_1 | Saccharomyces cerevisiae\n\
177saccharomyces_cerevisiae_S288C | Saccharomyces cerevisiae\n\
178(saccharomyces) | Saccharomyces cerevisiae\n\
179schistosoma | Schistosoma mansoni\n\
180(schistosoma2) | Schistosoma mansoni\n\
181schizosaccharomyces_pombe | Schizosaccharomyces pombe\n\
182scyliorhinus | Scyliorhinus torazame\n\
183tomato | Solanum lycopersicum\n\
184s_aureus | Staphylococcus aureus\n\
185s_pneumoniae | Streptococcus pneumoniae\n\
186strongylocentrotus_purpuratus | Strongylocentrotus purpuratus\n\
187sulfolobus_solfataricus | Sulfolobus solfataricus\n\
188tetrahymena | Tetrahymena thermophila\n\
189cacao | Theobroma cacao\n\
190thermoanaerobacter_tengcongensis | Thermoanaerobacter tengcongensis\n\
191toxoplasma | Toxoplasma gondii\n\
192tribolium2012 | Tribolium castaneum\n\
193trichinella | Trichinella sp.\n\
194wheat | Triticum sp.\n\
195ustilago_maydis | Ustilago maydis\n\
196(ustilago) | Ustilago maydis\n\
197verticillium_albo_atrum1 | Verticillium albo atrum\n\
198verticillium_longisporum1 | Verticillium longisporum\n\
199volvox | Volvox sp.\n\
200Xiphophorus_maculatus | Xiphophorus maculatus\n\
201yarrowia_lipolytica | Yarrowia lipolytica\n\
203(maize5) | Zea mays\n\
206#define HELP_USAGE "usage:\n\
207augustus [parameters] --species=SPECIES queryfilename\n\
209'queryfilename' is the filename (including relative path) to the file containing the query sequence(s)\n\
212SPECIES is an identifier for the species. Use --species=help to see a list.\n\
215--strand=both, --strand=forward or --strand=backward\n\
216--genemodel=partial, --genemodel=intronless, --genemodel=complete, --genemodel=atleastone or --genemodel=exactlyone\n\
217 partial : allow prediction of incomplete genes at the sequence boundaries (default)\n\
218 intronless : only predict single-exon genes like in prokaryotes and some eukaryotes\n\
219 complete : only predict complete genes\n\
220 atleastone : predict at least one complete gene\n\
221 exactlyone : predict exactly one complete gene\n\
222--singlestrand=true\n\
223 predict genes independently on each strand, allow overlapping genes on opposite strands\n\
224 This option is turned off by default.\n\
225--hintsfile=hintsfilename\n\
226 When this option is used the prediction considering hints (extrinsic information) is turned on.\n\
227 hintsfilename contains the hints in gff format.\n\
228--AUGUSTUS_CONFIG_PATH=path\n\
229 path to config directory (if not specified as environment variable)\n\
230--alternatives-from-evidence=true/false\n\
231 report alternative transcripts when they are suggested by hints\n\
232--alternatives-from-sampling=true/false\n\
233 report alternative transcripts generated through probabilistic sampling\n\
235--minexonintronprob=p\n\
236--minmeanexonintronprob=p\n\
238 For a description of these parameters see section 2 of RUNNING-AUGUSTUS.md.\n\
239--proteinprofile=filename\n\
240 When this option is used the prediction will consider the protein profile provided as parameter.\n\
241 The protein profile extension is described in section 5 of RUNNING-AUGUSTUS.md.\n\
243 show a progressmeter\n\
245 output in gff3 format\n\
246--predictionStart=A, --predictionEnd=B\n\
247 A and B define the range of the sequence for which predictions should be found.\n\
249 predict the untranslated regions in addition to the coding sequence. This currently works only for a subset of species.\n\
250--noInFrameStop=true/false\n\
251 Do not report transcripts with in-frame stop codons. Otherwise, intron-spanning stop codons could occur. Default: false\n\
252--noprediction=true/false\n\
253 If true and input is in genbank format, no prediction is made. Useful for getting the annotated protein sequences.\n\
254--uniqueGeneId=true/false\n\
255 If true, output gene identifyers like this: seqname.gN\n\
257For a complete list of parameters, type \"augustus --paramlist\". A description of the important ones can be found in the file RUNNING-AUGUSTUS.md.\n"
259#define HELP_USAGE_ETRAINING "usage:\n\
260etraining --species=SPECIES trainfilename\n\
262'trainfilename' is the filename (including relative path) to the file in genbank format containing the training sequences. These can be multi-gene sequences.\n\
263SPECIES is a name for your species.\n\
266--/genbank/verbosity=n\n\
267 Choose one of 0,1,2 or 3. The larger the verbosity, the more (error) messages you get.\n"
270#define POWER4TOTHE(x) ((int) (pow((double) 4,(double) (x)) + 0.1))
277#define BONUSSUREHINT 1e100
282typedef long int Integer;
307 static string fullSpeciesPath() {
return configPath + speciesDir;}
308 static string modelPath() {
return configPath + MODEL_SUBDIR;}
309 static string extrinsicPath() {
return configPath + EXTRINSIC_SUBDIR;}
312 static int ass_size() {
return ass_start + ass_end; }
313 static int ass_whole_size() {
return ass_size() + ASS_MIDDLE; }
314 static int dss_size() {
return dss_start + dss_end; }
315 static int dss_whole_size() {
return dss_size() + DSS_MIDDLE; }
317 static string configPath;
318 static string speciesDir;
319 static int decomp_num_at;
320 static int decomp_num_gc;
321 static int decomp_num_steps;
322 static int trans_init_window;
323 static int tis_maxbinsize;
324 static int ass_upwindow_size;
325 static int init_coding_len;
326 static int et_coding_len;
327 static int ass_start;
329 static int dss_start;
331 static int dss_maxbinsize;
332 static int ass_maxbinsize;
333 static int tss_upwindow_size;
334 static int tss_start;
335 static int min_coding_len;
336 static int max_exon_len;
337 static Integer d_polyasig_cleavage;
338 static bool keep_viterbi;
339 static double gc_range_min;
340 static double gc_range_max;
341 static double probNinCoding;
342 static double opalprob;
343 static double amberprob;
344 static double ochreprob;
345 static bool utr_option_on;
346 static bool nc_option_on;
347 static Integer augustus_verbosity;
348 static bool alternatives_from_evidence;
349 static double subopt_transcript_threshold;
350 static Integer almost_identical_maxdiff;
351 static bool uniqueGeneId;
352 static double max_contra_supp_ratio;
353 static bool reportUtrOnlyGenes;
354 static bool useCRFtraining;
355 static bool CRFtrainTIS;
356 static bool CRFtrainSS;
357 static bool CRFtrainIntron;
358 static bool CRFtrainIgenic;
359 static bool CRFtrainCDS;
360 static bool CRFtrainUTR;
361 static bool dss_gc_allowed;
362 static Boolean tieIgenicIntron;
363 static Boolean exoncands;
364 static Boolean proteinOutput;
365 static Boolean codSeqOutput;
366 static Boolean contentmodels;
367 static Integer min_intron_len;
368 static bool MultSpeciesMode;
369 static string treefile;
370 static string speciesfilenames;
371 static string dbaccess;
372 static string alnfile;
373 static string codonalnfile;
374 static bool overlapmode;
375 static Boolean printEvidence;
376 static Boolean printExonCandsMSA;
377 static Boolean printOEs;
378 static Boolean printHints;
379 static Boolean printMEA;
380 static Boolean printSampled;
381 static Boolean printGeneRangesBED;
382 static Boolean printGeneRangesGFF;
383 static Integer maxOvlp;
384 static vector<Double> head2tail_ovlp;
385 static vector<Double> head2head_ovlp;
386 static vector<Double> tail2tail_ovlp;
387 static unsigned temperature;
388 static bool softmasking;
390 static bool softmasking_explicitly_requested;
395 static string referenceFile;
396 static string refSpecies;
397 static double GD_stepsize;
399 static double label_flip_prob;
400 static double lambda;
402 static unordered_map<string, pair<int, vector<double> > > logReg_feature;
404 static vector<double> ex_sc;
405 static vector<double> in_sc;
406 static int oeExtensionWidth;
407 static vector<double> lg_es;
408 static bool computeNumSubs;
409 static bool useAArates;
410 static bool useNonCodingModel;
411 static bool rescaleBoni;
415extern bool inCRFTraining;
417extern const int power2id[31];
419#define A_SET_FLAG(x) power2id[x]
426 Bitmask (
int n=0) : value(n) {}
428 bool operator[] (
int n)
const {
429 return value & A_SET_FLAG(n);
432 value |= A_SET_FLAG(n);
435 value &= ~A_SET_FLAG(n);
492enum StateType{TYPE_UNKNOWN = -1, igenic,
494 singleG, initial0, initial1, initial2, internal0, internal1, internal2, terminal,
495 lessD0, longdss0, equalD0, geometric0, longass0,
496 lessD1, longdss1, equalD1, geometric1, longass1,
497 lessD2, longdss2, equalD2, geometric2, longass2,
498 utr5single, utr5init, utr5intron, utr5intronvar, utr5internal, utr5term,
499 utr3single, utr3init, utr3intron, utr3intronvar, utr3internal, utr3term,
501 rsingleG, rinitial, rinternal0, rinternal1, rinternal2, rterminal0, rterminal1, rterminal2,
502 rlessD0, rlongdss0, requalD0, rgeometric0, rlongass0,
503 rlessD1, rlongdss1, requalD1, rgeometric1, rlongass1,
504 rlessD2, rlongdss2, requalD2, rgeometric2, rlongass2,
505 rutr5single, rutr5init, rutr5intron, rutr5intronvar, rutr5internal, rutr5term,
506 rutr3single, rutr3init, rutr3intron, rutr3intronvar, rutr3internal, rutr3term,
508 intron_type, rintron_type, exon_type,
510 ncsingle, ncinit, ncintron, ncintronvar, ncinternal, ncterm,
511 rncsingle, rncinit, rncintron, rncintronvar, rncinternal, rncterm
514extern const char* stateTypeNames[NUM_TYPES];
515extern const char* stateTypeIdentifiers[NUM_TYPES];
516extern const int stateReadingFrames[NUM_TYPES];
523inline int mod3 (
int k) {
524 return (k>=0)? k%3 : (k%3+3)%3;
527inline int modm(
int k,
int m) {
528 return (k>=0)? k%m : (k%m+m)%m;
536inline StateType toStateType(
const char* str ){
538 for (i=0; i<NUM_TYPES; i++)
539 if (strcmp(str, stateTypeIdentifiers[i]) == 0)
540 return (StateType) i;
544inline Boolean isInitialExon(StateType type){
545 return (type==initial0 || type==initial1 || type==initial2);
548inline Boolean isInternalExon(StateType type){
549 return (type==internal0 || type==internal1 || type==internal2);
552inline Boolean isRTerminalExon(StateType type){
553 return (type==rterminal0 || type==rterminal1 || type==rterminal2);
556inline Boolean isRInternalExon(StateType type){
557 return (type==rinternal0 || type==rinternal1 || type==rinternal2);
560inline Boolean isFirstExon(StateType type){
561 return (isInitialExon(type) || isRTerminalExon(type) || type==singleG || type==rsingleG);
564inline Boolean isLastExon(StateType type){
565 return (type==terminal || type==rinitial || type==singleG || type == rsingleG);
568inline Boolean is5UTR(StateType type){
569 return (utr5single <= type && type <= utr5term)
570 || (rutr5single <= type && type <= rutr5term);
573inline Boolean is5UTRIntron(StateType type){
574 return (type == utr5intron || type == utr5intronvar
575 || type == rutr5intron || type == rutr5intronvar);
578inline Boolean is5UTRExon(StateType type){
579 return is5UTR(type) && !is5UTRIntron(type);
582inline Boolean is3UTR(StateType type){
583 return (utr3single <= type && type <= utr3term)
584 || (rutr3single <= type && type <= rutr3term);
587inline Boolean is3UTRIntron(StateType type){
588 return (type == utr3intron || type == utr3intronvar
589 || type == rutr3intron || type == rutr3intronvar);
592inline Boolean is3UTRExon(StateType type){
593 return is3UTR(type) && !is3UTRIntron(type);
596inline Boolean isFirstUTRExon(StateType type){
597 return (type == utr5single || type == utr5init || type == rutr3single || type == rutr3term);
600inline Boolean isLastUTRExon(StateType type){
601 return (type == utr3single || type == utr3term || type == rutr5single || type == rutr5init);
604inline Boolean isLongUTRIntron(StateType type){
605 return (type == utr5intronvar || type == utr3intronvar || type == rutr5intronvar || type == rutr3intronvar);
608inline Boolean isCodingExon(StateType type) {
609 return (singleG <= type && type <= terminal)
610 || (rsingleG <= type && type <= rterminal2);
613inline Boolean isNcIntron(StateType type) {
614 return (type == ncintron || type == ncintronvar || type == rncintron || type == rncintronvar);
617inline Boolean isNcExon(StateType type) {
618 return (ncsingle <= type && !isNcIntron(type));
621inline Boolean isNc(StateType type) {
622 return (ncsingle <= type);
625inline Boolean isExon(StateType type) {
626 return (isCodingExon(type) || is5UTRExon(type) || is3UTRExon(type) || isNcExon(type));
629inline Boolean isCodingIntron(StateType type) {
630 return (lessD0 <= type && type <= longass2)
631 || (rlessD0 <= type && type <= rlongass2);
634inline Boolean isIntron(StateType type) {
635 return (isCodingIntron(type) || is5UTRIntron(type) || is3UTRIntron(type)
636 || type == intron_type || type == rintron_type || isNcIntron(type));
639inline Boolean isGeometricIntron(StateType type){
640 return (type==geometric0 || type==geometric1 || type==geometric2);
643inline Boolean isRGeometricIntron(StateType type){
644 return (type==rgeometric0 || type==rgeometric1 || type==rgeometric2);
647inline Boolean isLongAssIntron(StateType type){
648 return (type==longass0 || type== longass1 || type==longass2);
651inline Boolean isRLongDssIntron(StateType type){
652 return (type==rlongdss0 || type== rlongdss1 || type==rlongdss2);
655inline Boolean isLongDssIntron(StateType type){
656 return (type==longdss0 || type== longdss1 || type==longdss2);
659inline Boolean isOnFStrand(StateType type){
660 return ((type >= singleG && type < rsingleG) || (type >= ncsingle && type < rncsingle));
663inline StateType initialExon(
int frame){
674inline StateType internalExon(
int frame){
685inline StateType lessDIntron(
int frame){
697inline StateType geometricIntron(
int frame){
708inline StateType longdssIntron(
int frame){
719inline StateType longassIntron(
int frame){
730inline StateType equalDIntron(
int frame){
741inline StateType rterminalExon(
int frame){
752inline StateType rinternalExon(
int frame){
762inline StateType rlongdssIntron(
int frame){
773inline StateType rlongassIntron(
int frame){
784inline StateType rlessDIntron(
int frame){
795inline StateType rgeometricIntron(
int frame){
806inline StateType requalDIntron(
int frame){
817int howOftenOccursIt(
const char* haystack,
const char* needle,
const char* endhaystack=NULL);
818bool containsJustNonNucs(
const char *dna,
int dnalen);
819bool isNuc(
const char *dna);
824string expandHome(
string filename);
829inline string itoa(
int n) {
838inline string ftoa(
double n) {
847inline char* newstrcpy(
const char* s,
int len) {
848 char* result =
new char[len + 1];
850 return strncpy(result, s, len);
852inline char* newstrcpy(
const char* s) {
854 char* result =
new char[len + 1];
855 return strcpy(result, s);
858inline char* newstrcpy(
const string& s,
int len) {
859 char* result =
new char[len + 1];
864inline char* newstrcpy(
const string& s) {
865 return newstrcpy(s, s.length());
868inline void strip_newlines(
char* p) {
870 while (p != 0 && *p !=
'\0') {
882char *getRandomDNA(
int len);
887Double quantile(
const vector<Double> &v,
float q);
888int quantile(
const vector<int> &v,
float q);
896map<string, size_t> *getMap (vector<string> names);
904 vector<double> logliks;
911 numSubs(s), id(i), numSites(0) {};
913 void addLogliks(vector<double>* ll){
914 if (logliks.size() == 0){
915 logliks.resize(ll->size(), 0.0);
917 for (
unsigned u = 0; u < ll->size(); u++){
918 logliks[u] += (*ll)[u];
922 void addNumSubs(
int subs){
925 void addRows(vector<string> &cs) {
926 if (rows.size() == 0)
927 rows.assign(cs.size(),
"");
928 for (
unsigned i=0; i < cs.size(); i++){
929 rows[i].append(cs[i]);
This class implements a double object with a very large range.
Definition lldouble.hh:31
ProjectError(string msg)
Definition types.hh:455
string getMessage()
Definition types.hh:471
~ProjectError()
Definition types.hh:465
ProjectError()
Definition types.hh:460