@@ -16,7 +16,7 @@ const uint gfo_flag_HAS_ERRORS = 0x00000001;
1616const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002 ;
1717const uint gfo_flag_IS_GENE = 0x00000004 ;
1818const uint gfo_flag_IS_TRANSCRIPT = 0x00000008 ;
19- const uint gfo_flag_HAS_GFF_ID = 0x00000010 ; // found GFF3 feature line with its own ID
19+ const uint gfo_flag_HAS_GFF_ID = 0x00000010 ; // found transcript feature line (GFF3 or GTF)
2020const uint gfo_flag_BY_EXON = 0x00000020 ; // created by subfeature (exon) directly
2121const uint gfo_flag_DISCARDED = 0x00000100 ;
2222const uint gfo_flag_LST_KEEP = 0x00000200 ;
@@ -59,7 +59,7 @@ int gfo_cmpByLoc(const pointer p1, const pointer p2) {
5959 return strcmp (g1.getGSeqName (), g2.getGSeqName ()); // lexicographic !
6060}
6161
62- char * GffLine::extractAttr (const char * attr, bool caseStrict, bool enforce_GTF2) {
62+ char * GffLine::extractAttr (const char * attr, bool caseStrict, bool enforce_GTF2, int * rlen ) {
6363 // parse a key attribute and remove it from the info string
6464 // (only works for attributes that have values following them after ' ' or '=')
6565 static const char GTF2_ERR[]=" Error parsing attribute %s ('\" ' required) at GTF line:\n %s\n " ;
@@ -120,6 +120,7 @@ char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2)
120120 if (enforce_GTF2 && *vend!=' "' )
121121 GError (GTF2_ERR, attr, dupline);
122122 char *r=Gstrdup (vp, vend-1 );
123+ if (rlen) *rlen = vend-vp;
123124 // -- now remove this attribute from the info string
124125 while (*vend!=0 && (*vend==' "' || *vend==' ;' || *vend==' ' )) vend++;
125126 if (*vend==0 ) vend--;
@@ -239,7 +240,6 @@ GffLine::GffLine(GffReader* reader, const char* l):_parents(NULL), _parents_len(
239240
240241 ID=extractAttr (" ID=" ,true );
241242 if (reader->transcriptsOnly && !is_t_data) {
242- // can_discard=1;
243243 if (ID!=NULL ) {
244244 // ban GFF3 parent if not recognized as transcript
245245 reader->discarded_ids .Add (ID, new int (1 ));
@@ -312,19 +312,25 @@ GffLine::GffLine(GffReader* reader, const char* l):_parents(NULL), _parents_len(
312312 }
313313 } // has Parent field
314314 } // GFF3
315- else { // GTF-like expected
315+ else { // GTF expected
316316 if (reader->transcriptsOnly && !is_t_data) {
317317 return ; // skipping unrecognized non-transcript feature
318318 }
319- Parent=extractAttr (" transcript_id" ,true );
319+ Parent=extractAttr (" transcript_id" , true , true );
320320 if (Parent!=NULL ) { // GTF2 format detected
321321 gene_id=extractAttr (" gene_id" ); // for GTF this is the only attribute accepted as geneID
322322 if (gene_id==NULL )
323- gene_id=extractAttr (" geneid" );
324- if (is_gene || is_transcript) {
323+ gene_id=extractAttr (" geneid" ); // should not allow this fallback..
324+ if (is_gene || is_transcript) { // special GTF with "transcript" or "gene" line
325325 ID=Parent;
326- if (is_gene) Parent=NULL ;
327- else Parent=Gstrdup (gene_id);
326+ if (is_gene) {
327+ // GENCODE GTF dummy
328+ Parent=NULL ; // don't care about gene parents..
329+ }
330+ else { // transcript feature line in GTF
331+ Parent=Gstrdup (gene_id);
332+ is_gtf_transcript=1 ;
333+ }
328334 }
329335
330336 gene_name=extractAttr (" gene_name" );
@@ -336,8 +342,8 @@ GffLine::GffLine(GffReader* reader, const char* l):_parents(NULL), _parents_len(
336342 gene_name=extractAttr (" genesymbol" );
337343 }
338344 }
339- // prepare for parseAttr by adding '=' character instead of spaces for all attributes
340- // after the attribute name
345+ // prepare GTF for parseAttr by adding '=' character after the attribute name
346+ // for all attributes
341347 p=info;
342348 bool noed=true ; // not edited after the last delim
343349 bool nsp=false ; // non-space found after last delim
@@ -375,13 +381,14 @@ GffLine::GffLine(GffReader* reader, const char* l):_parents(NULL), _parents_len(
375381 GMessage (" Warning: invalid GTF record, transcript_id not found:\n %s\n " ,
376382 l);
377383 else return ; // skip unrecognized GTF line (from GTF we only care about transcripts for now)
378-
384+ // NOTE: Ensembl GTF seems to have "gene" feature lines with only gene_id, lacking transcript_id
379385 }
380386 } // Parent was NULL, attempted to find it
381387 if (Parent!=NULL ) { // GTF transcript_id found
382388 _parents=Parent;
383- GMALLOC (parents,sizeof (char *));
384389 num_parents=1 ;
390+ _parents_len=strlen (Parent)+1 ;
391+ GMALLOC (parents, sizeof (char *));
385392 parents[0 ]=_parents;
386393 }
387394 } // GTF-like
@@ -1022,32 +1029,31 @@ GffObj* GffReader::updateGffRec(GffObj* prevgfo, GffLine* gffline,
10221029
10231030
10241031bool GffReader::addExonFeature (GffObj* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
1025- bool r=true ;
1026- if (gffline->strand !=prevgfo->strand ) {
1027- if (prevgfo->strand ==' .' ) {
1028- prevgfo->strand =gffline->strand ;
1029- }
1030- else {
1031- GMessage (" GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n " ,
1032- prevgfo->gffID , prevgfo->strand ,
1033- gffline->fstart , gffline->fend , gffline->strand , prevgfo->getGSeqName ());
1034- // r=false;
1035- return true ;
1036- }
1037- }
1038- int gdist=(gffline->fstart >prevgfo->end ) ? gffline->fstart -prevgfo->end :
1039- ((gffline->fend <prevgfo->start )? prevgfo->start -gffline->fend :
1040- 0 );
1041- if (gdist>(int )GFF_MAX_LOCUS) { // too far apart, most likely this is a duplicate ID
1042- GMessage (" Error: duplicate GFF ID '%s' (or exons too far apart)!\n " ,prevgfo->gffID );
1043- // validation_errors = true;
1044- r=false ;
1045- if (!gff_warns) exit (1 );
1046- }
1047- int eidx=prevgfo->addExon (this , gffline, !noExonAttr, noExonAttr);
1048- if (eidx>=0 && gffline->ID !=NULL && gffline->exontype ==0 )
1049- subfPoolAdd (pex, prevgfo);
1050- return r;
1032+ bool r=true ;
1033+ if (gffline->strand !=prevgfo->strand ) {
1034+ if (prevgfo->strand ==' .' ) {
1035+ prevgfo->strand =gffline->strand ;
1036+ }
1037+ else {
1038+ GMessage (" GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n " ,
1039+ prevgfo->gffID , prevgfo->strand , gffline->fstart , gffline->fend , gffline->strand ,
1040+ prevgfo->getGSeqName ());
1041+ return true ;
1042+ }
1043+ }
1044+ int gdist=(gffline->fstart >prevgfo->end ) ? gffline->fstart -prevgfo->end :
1045+ ((gffline->fend <prevgfo->start )? prevgfo->start -gffline->fend :
1046+ 0 );
1047+ if (gdist>(int )GFF_MAX_LOCUS) { // too far apart, most likely this is a duplicate ID
1048+ GMessage (" Error: duplicate GFF ID '%s' (or exons too far apart)!\n " ,prevgfo->gffID );
1049+ // validation_errors = true;
1050+ r=false ;
1051+ if (!gff_warns) exit (1 );
1052+ }
1053+ int eidx=prevgfo->addExon (this , gffline, !noExonAttr, noExonAttr);
1054+ if (eidx>=0 && gffline->ID !=NULL && gffline->exontype ==0 )
1055+ subfPoolAdd (pex, prevgfo);
1056+ return r;
10511057}
10521058
10531059CNonExon* GffReader::subfPoolCheck (GffLine* gffline, GHash<CNonExon>& pex, char *& subp_name) {
@@ -1069,7 +1075,7 @@ void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GffObj* newgfo) {
10691075// this might become a parent feature later
10701076if (newgfo->exons .Count ()>0 ) {
10711077 char * xbuf=gfoBuildId (gffline->ID , gffline->gseqname );
1072- pex.Add (xbuf, new CNonExon (newgfo, newgfo->exons [0 ], gffline));
1078+ pex.Add (xbuf, new CNonExon (newgfo, newgfo->exons [0 ], * gffline));
10731079 GFREE (xbuf);
10741080 }
10751081}
@@ -1127,38 +1133,19 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
11271133 }
11281134 else exit (1 );
11291135 }
1130- // create a new entry with the same ID
1131- /*
1132- int distance=INT_MAX;
1133- if (prevseen->isTranscript() && prevseen->strand==gffline->strand) {
1134- if (prevseen->start>=gffline->fstart)
1135- distance=prevseen->start-gffline->fend;
1136- else
1137- distance=gffline->fstart-prevseen->end;
1138- }
1139- if (distance<1000) {//FIXME: arbitrary proximity threshold (yuck)
1140- //exception: make this an exon of previous ID
1141- //addExonFeature(prevseen, gffline, pex, noExonAttr);
1142- prevseen->addExon(this, gffline, false, true);
1143- if (gff_warns) {
1144- GMessage("GFF Warning: duplicate feature ID %s (%d-%d) added as exon of previous %d-%d!\n",
1145- gffline->ID, gffline->fstart, gffline->fend, prevseen->start, prevseen->end);
1146- }
1147- }
1148- else {
1149- */
11501136 // create a separate entry (true discontinuous feature)
1151- prevseen=newGffRec (gffline, keepAttr, noExonAttr,
1152- prevseen->parent , NULL , prevgflst);
1153- if (gff_warns) {
1154- GMessage (" GFF Warning: duplicate feature ID %s (%d-%d) (discontinuous feature?)\n " ,
1155- gffline->ID , gffline->fstart , gffline->fend );
1156- }
1157- // }
1137+ prevseen=newGffRec (gffline, keepAttr, noExonAttr,
1138+ prevseen->parent , NULL , prevgflst);
1139+ if (gff_warns) {
1140+ GMessage (" GFF Warning: duplicate feature ID %s (%d-%d) (discontinuous feature?)\n " ,
1141+ gffline->ID , gffline->fstart , gffline->fend );
1142+ }
11581143 } // duplicate ID in the same locus
11591144 } // ID seen previously in the same locus
11601145 } // parent-like ID feature (non-exon)
1161- if (gffline->parents ==NULL ) {// start GFF3-like record with no parent (mRNA, gene)
1146+
1147+ if (gffline->parents ==NULL || gffline->is_gtf_transcript ) {
1148+ // top level feature (transcript, gene), no parents (or parents can be ignored)
11621149 if (!prevseen) newGffRec (gffline, keepAttr, noExonAttr, NULL , NULL , prevgflst);
11631150 }
11641151 else { // --- it's a child feature (exon/CDS or even a mRNA with a gene as parent)
@@ -1186,9 +1173,10 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
11861173 int i=kparents[k];
11871174 newgflst=kgflst[k];
11881175 GffObj* parentgfo=NULL ;
1189- if (gffline->is_transcript || gffline->exontype ==0 ) {// possibly a transcript
1176+ if (gffline->is_transcript || gffline->exontype ==0 ) {// likely a transcript
11901177 parentgfo=gfoFind (gffline->parents [i], newgflst, gffline->gseqname ,
11911178 gffline->strand , gffline->fstart , gffline->fend );
1179+
11921180 }
11931181 else {
11941182 // for exon-like entities we only need a parent to be in locus distance,
@@ -1209,8 +1197,8 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
12091197 }
12101198 }
12111199 else { // potential exon subfeature?
1212- // always discard dummy "intron" features
1213- if (!(gffline->exontype ==exgffIntron && (parentgfo->isTranscript () || parentgfo->exons .Count ()>0 ))) {
1200+ // always discard silly "intron" features
1201+ if (! (gffline->exontype ==exgffIntron && (parentgfo->isTranscript () || parentgfo->exons .Count ()>0 ))) {
12141202 if (!addExonFeature (parentgfo, gffline, pex, noExonAttr))
12151203 validation_errors=true ;
12161204 }
@@ -1221,7 +1209,8 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
12211209 // or it could be some chado GFF3 barf with exons coming BEFORE their parent :(
12221210 // check if this feature isn't parented by a previously stored "exon" subfeature
12231211 char * subp_name=NULL ;
1224- CNonExon* subp=subfPoolCheck (gffline, pex, subp_name);
1212+ CNonExon* subp=NULL ;
1213+ if (pex.Count ()>0 ) subp=subfPoolCheck (gffline, pex, subp_name);
12251214 if (subp!=NULL ) { // found a subfeature that is the parent of this gffline
12261215 // promote that subfeature to a full GffObj
12271216 GffObj* gfoh=promoteFeature (subp, subp_name, pex, keepAttr, noExonAttr);
@@ -1289,15 +1278,17 @@ void GfList::finalize(GffReader* gfr, bool mergeCloseExons,
12891278}
12901279
12911280GffObj* GffObj::finalize (GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1292- if (isGene ()) {
1281+ /* if (isGene()) {
12931282 if (children.Count()==0) {
12941283 isTranscript(true);
1295- // bacterial annotation: childless gene is in fact a transcript
1284+ //some bacterial annotation, childless genes may be in fact transcripts
12961285 }
1297- else if (gfr->transcriptsOnly ) {
1286+ else
1287+ if (gfr->transcriptsOnly) {
12981288 isDiscarded(true);
1299- }
1289+ }
13001290 }
1291+ */
13011292 // always merge adjacent or overlapping segments
13021293 // but if mergeCloseExons then merge even when distance is up to 5 bases
13031294 if (gfr->transcriptsOnly && !(isTranscript () || (isGene () && children.Count ()==0 ))) {
0 commit comments