Skip to content

Commit ae6d0dd

Browse files
committed
v1.3.0 with improved abundance estimation algorithm; UCSC GTF parsing fix
1 parent d310ec3 commit ae6d0dd

6 files changed

Lines changed: 4434 additions & 1745 deletions

File tree

gclib/gff.cpp

Lines changed: 67 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ const uint gfo_flag_HAS_ERRORS = 0x00000001;
1616
const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
1717
const uint gfo_flag_IS_GENE = 0x00000004;
1818
const uint gfo_flag_IS_TRANSCRIPT = 0x00000008;
19-
const uint gfo_flag_HAS_GFF_ID = 0x00000010; //found GFF3 feature line with its own ID
19+
const uint gfo_flag_HAS_GFF_ID = 0x00000010; //found transcript feature line (GFF3 or GTF)
2020
const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly
2121
const uint gfo_flag_DISCARDED = 0x00000100;
2222
const uint gfo_flag_LST_KEEP = 0x00000200;
@@ -59,7 +59,7 @@ int gfo_cmpByLoc(const pointer p1, const pointer p2) {
5959
return strcmp(g1.getGSeqName(), g2.getGSeqName()); //lexicographic !
6060
}
6161

62-
char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2) {
62+
char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2, int* rlen) {
6363
//parse a key attribute and remove it from the info string
6464
//(only works for attributes that have values following them after ' ' or '=')
6565
static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
@@ -120,6 +120,7 @@ char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2)
120120
if (enforce_GTF2 && *vend!='"')
121121
GError(GTF2_ERR, attr, dupline);
122122
char *r=Gstrdup(vp, vend-1);
123+
if (rlen) *rlen = vend-vp;
123124
//-- now remove this attribute from the info string
124125
while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
125126
if (*vend==0) vend--;
@@ -239,7 +240,6 @@ GffLine::GffLine(GffReader* reader, const char* l):_parents(NULL), _parents_len(
239240

240241
ID=extractAttr("ID=",true);
241242
if (reader->transcriptsOnly && !is_t_data) {
242-
//can_discard=1;
243243
if (ID!=NULL) {
244244
//ban GFF3 parent if not recognized as transcript
245245
reader->discarded_ids.Add(ID, new int(1));
@@ -312,19 +312,25 @@ GffLine::GffLine(GffReader* reader, const char* l):_parents(NULL), _parents_len(
312312
}
313313
} //has Parent field
314314
} //GFF3
315-
else { // GTF-like expected
315+
else { // GTF expected
316316
if (reader->transcriptsOnly && !is_t_data) {
317317
return; //skipping unrecognized non-transcript feature
318318
}
319-
Parent=extractAttr("transcript_id",true);
319+
Parent=extractAttr("transcript_id", true, true);
320320
if (Parent!=NULL) { //GTF2 format detected
321321
gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
322322
if (gene_id==NULL)
323-
gene_id=extractAttr("geneid");
324-
if (is_gene || is_transcript) {
323+
gene_id=extractAttr("geneid"); //should not allow this fallback..
324+
if (is_gene || is_transcript) { //special GTF with "transcript" or "gene" line
325325
ID=Parent;
326-
if (is_gene) Parent=NULL;
327-
else Parent=Gstrdup(gene_id);
326+
if (is_gene) {
327+
//GENCODE GTF dummy
328+
Parent=NULL; //don't care about gene parents..
329+
}
330+
else { //transcript feature line in GTF
331+
Parent=Gstrdup(gene_id);
332+
is_gtf_transcript=1;
333+
}
328334
}
329335

330336
gene_name=extractAttr("gene_name");
@@ -336,8 +342,8 @@ GffLine::GffLine(GffReader* reader, const char* l):_parents(NULL), _parents_len(
336342
gene_name=extractAttr("genesymbol");
337343
}
338344
}
339-
//prepare for parseAttr by adding '=' character instead of spaces for all attributes
340-
//after the attribute name
345+
//prepare GTF for parseAttr by adding '=' character after the attribute name
346+
//for all attributes
341347
p=info;
342348
bool noed=true; //not edited after the last delim
343349
bool nsp=false; //non-space found after last delim
@@ -375,13 +381,14 @@ GffLine::GffLine(GffReader* reader, const char* l):_parents(NULL), _parents_len(
375381
GMessage("Warning: invalid GTF record, transcript_id not found:\n%s\n",
376382
l);
377383
else return; //skip unrecognized GTF line (from GTF we only care about transcripts for now)
378-
384+
//NOTE: Ensembl GTF seems to have "gene" feature lines with only gene_id, lacking transcript_id
379385
}
380386
} //Parent was NULL, attempted to find it
381387
if (Parent!=NULL) { //GTF transcript_id found
382388
_parents=Parent;
383-
GMALLOC(parents,sizeof(char*));
384389
num_parents=1;
390+
_parents_len=strlen(Parent)+1;
391+
GMALLOC(parents, sizeof(char*));
385392
parents[0]=_parents;
386393
}
387394
} //GTF-like
@@ -1022,32 +1029,31 @@ GffObj* GffReader::updateGffRec(GffObj* prevgfo, GffLine* gffline,
10221029

10231030

10241031
bool GffReader::addExonFeature(GffObj* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
1025-
bool r=true;
1026-
if (gffline->strand!=prevgfo->strand) {
1027-
if (prevgfo->strand=='.') {
1028-
prevgfo->strand=gffline->strand;
1029-
}
1030-
else {
1031-
GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
1032-
prevgfo->gffID, prevgfo->strand,
1033-
gffline->fstart, gffline->fend, gffline->strand, prevgfo->getGSeqName());
1034-
//r=false;
1035-
return true;
1036-
}
1037-
}
1038-
int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end :
1039-
((gffline->fend<prevgfo->start)? prevgfo->start-gffline->fend :
1040-
0 );
1041-
if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
1042-
GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID);
1043-
//validation_errors = true;
1044-
r=false;
1045-
if (!gff_warns) exit(1);
1046-
}
1047-
int eidx=prevgfo->addExon(this, gffline, !noExonAttr, noExonAttr);
1048-
if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
1049-
subfPoolAdd(pex, prevgfo);
1050-
return r;
1032+
bool r=true;
1033+
if (gffline->strand!=prevgfo->strand) {
1034+
if (prevgfo->strand=='.') {
1035+
prevgfo->strand=gffline->strand;
1036+
}
1037+
else {
1038+
GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
1039+
prevgfo->gffID, prevgfo->strand, gffline->fstart, gffline->fend, gffline->strand,
1040+
prevgfo->getGSeqName());
1041+
return true;
1042+
}
1043+
}
1044+
int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end :
1045+
((gffline->fend<prevgfo->start)? prevgfo->start-gffline->fend :
1046+
0 );
1047+
if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
1048+
GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID);
1049+
//validation_errors = true;
1050+
r=false;
1051+
if (!gff_warns) exit(1);
1052+
}
1053+
int eidx=prevgfo->addExon(this, gffline, !noExonAttr, noExonAttr);
1054+
if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
1055+
subfPoolAdd(pex, prevgfo);
1056+
return r;
10511057
}
10521058

10531059
CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
@@ -1069,7 +1075,7 @@ void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GffObj* newgfo) {
10691075
//this might become a parent feature later
10701076
if (newgfo->exons.Count()>0) {
10711077
char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
1072-
pex.Add(xbuf, new CNonExon(newgfo, newgfo->exons[0], gffline));
1078+
pex.Add(xbuf, new CNonExon(newgfo, newgfo->exons[0], *gffline));
10731079
GFREE(xbuf);
10741080
}
10751081
}
@@ -1127,38 +1133,19 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
11271133
}
11281134
else exit(1);
11291135
}
1130-
//create a new entry with the same ID
1131-
/*
1132-
int distance=INT_MAX;
1133-
if (prevseen->isTranscript() && prevseen->strand==gffline->strand) {
1134-
if (prevseen->start>=gffline->fstart)
1135-
distance=prevseen->start-gffline->fend;
1136-
else
1137-
distance=gffline->fstart-prevseen->end;
1138-
}
1139-
if (distance<1000) {//FIXME: arbitrary proximity threshold (yuck)
1140-
//exception: make this an exon of previous ID
1141-
//addExonFeature(prevseen, gffline, pex, noExonAttr);
1142-
prevseen->addExon(this, gffline, false, true);
1143-
if (gff_warns) {
1144-
GMessage("GFF Warning: duplicate feature ID %s (%d-%d) added as exon of previous %d-%d!\n",
1145-
gffline->ID, gffline->fstart, gffline->fend, prevseen->start, prevseen->end);
1146-
}
1147-
}
1148-
else {
1149-
*/
11501136
//create a separate entry (true discontinuous feature)
1151-
prevseen=newGffRec(gffline, keepAttr, noExonAttr,
1152-
prevseen->parent, NULL, prevgflst);
1153-
if (gff_warns) {
1154-
GMessage("GFF Warning: duplicate feature ID %s (%d-%d) (discontinuous feature?)\n",
1155-
gffline->ID, gffline->fstart, gffline->fend);
1156-
}
1157-
//}
1137+
prevseen=newGffRec(gffline, keepAttr, noExonAttr,
1138+
prevseen->parent, NULL, prevgflst);
1139+
if (gff_warns) {
1140+
GMessage("GFF Warning: duplicate feature ID %s (%d-%d) (discontinuous feature?)\n",
1141+
gffline->ID, gffline->fstart, gffline->fend);
1142+
}
11581143
} //duplicate ID in the same locus
11591144
} //ID seen previously in the same locus
11601145
} //parent-like ID feature (non-exon)
1161-
if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1146+
1147+
if (gffline->parents==NULL || gffline->is_gtf_transcript) {
1148+
//top level feature (transcript, gene), no parents (or parents can be ignored)
11621149
if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, prevgflst);
11631150
}
11641151
else { //--- it's a child feature (exon/CDS or even a mRNA with a gene as parent)
@@ -1186,9 +1173,10 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
11861173
int i=kparents[k];
11871174
newgflst=kgflst[k];
11881175
GffObj* parentgfo=NULL;
1189-
if (gffline->is_transcript || gffline->exontype==0) {//possibly a transcript
1176+
if (gffline->is_transcript || gffline->exontype==0) {//likely a transcript
11901177
parentgfo=gfoFind(gffline->parents[i], newgflst, gffline->gseqname,
11911178
gffline->strand, gffline->fstart, gffline->fend);
1179+
11921180
}
11931181
else {
11941182
//for exon-like entities we only need a parent to be in locus distance,
@@ -1209,8 +1197,8 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
12091197
}
12101198
}
12111199
else { //potential exon subfeature?
1212-
//always discard dummy "intron" features
1213-
if (!(gffline->exontype==exgffIntron && (parentgfo->isTranscript() || parentgfo->exons.Count()>0))) {
1200+
//always discard silly "intron" features
1201+
if (! (gffline->exontype==exgffIntron && (parentgfo->isTranscript() || parentgfo->exons.Count()>0))) {
12141202
if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
12151203
validation_errors=true;
12161204
}
@@ -1221,7 +1209,8 @@ void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
12211209
//or it could be some chado GFF3 barf with exons coming BEFORE their parent :(
12221210
//check if this feature isn't parented by a previously stored "exon" subfeature
12231211
char* subp_name=NULL;
1224-
CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
1212+
CNonExon* subp=NULL;
1213+
if (pex.Count()>0) subp=subfPoolCheck(gffline, pex, subp_name);
12251214
if (subp!=NULL) { //found a subfeature that is the parent of this gffline
12261215
//promote that subfeature to a full GffObj
12271216
GffObj* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
@@ -1289,15 +1278,17 @@ void GfList::finalize(GffReader* gfr, bool mergeCloseExons,
12891278
}
12901279

12911280
GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1292-
if (isGene()) {
1281+
/* if (isGene()) {
12931282
if (children.Count()==0) {
12941283
isTranscript(true);
1295-
//bacterial annotation: childless gene is in fact a transcript
1284+
//some bacterial annotation, childless genes may be in fact transcripts
12961285
}
1297-
else if (gfr->transcriptsOnly) {
1286+
else
1287+
if (gfr->transcriptsOnly) {
12981288
isDiscarded(true);
1299-
}
1289+
}
13001290
}
1291+
*/
13011292
//always merge adjacent or overlapping segments
13021293
//but if mergeCloseExons then merge even when distance is up to 5 bases
13031294
if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) {

gclib/gff.h

Lines changed: 27 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,9 @@ class GffLine {
9595
bool is_cds:1; //"cds" or "start/stop_codon" features
9696
bool is_exon:1; //"exon" and "utr" features
9797
bool is_transcript:1; //if current feature is *RNA or *transcript
98-
bool is_gene:1; //if current feature is *gene
99-
bool is_gff3:1; //if the line appears to be in GFF3 format
100-
bool can_discard:1; //flag unwanted/unrecognized parent features
98+
bool is_gene:1; //current feature is *gene
99+
bool is_gff3:1; //line appears to be in GFF3 format (0=GTF)
100+
bool is_gtf_transcript:1; //GTF transcript line with Parents parsed from gene_id
101101
bool skipLine:1;
102102
};
103103
};
@@ -115,44 +115,43 @@ class GffLine {
115115
GFREE(_parents);
116116
_parents_len=0;
117117
num_parents=0;
118+
GFREE(parents);
118119
parents=NULL;
119120
}
120-
char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false);
121-
GffLine(GffLine* l):_parents(NULL), _parents_len(0),
122-
dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL),
123-
ftype(NULL), ftype_id(-1), info(NULL), fstart(0), fend(0), qstart(0), qend(0), qlen(0),
124-
score(0), strand(0), flags(0), exontype(0), phase(0),
125-
gene_name(NULL), gene_id(NULL),
126-
parents(NULL), num_parents(0), ID(NULL) { //a copy constructor
127-
if (l==NULL || l->line==NULL)
128-
GError("Error: invalid GffLine(l)\n");
129-
memcpy((void*)this, (void*)l, sizeof(GffLine));
121+
char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false, int* rlen=NULL);
122+
GffLine(GffLine& l):_parents(NULL), _parents_len(l._parents_len),
123+
dupline(NULL), line(NULL), llen(l.llen), gseqname(NULL), track(NULL),
124+
ftype(NULL), ftype_id(l.ftype_id), info(NULL), fstart(l.fstart), fend(l.fend), qstart(l.fstart), qend(l.fend),
125+
qlen(l.qlen), score(l.score), strand(l.strand), flags(l.flags), exontype(l.exontype), phase(l.phase),
126+
gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(l.num_parents), ID(NULL) {
127+
//if (l==NULL || l->line==NULL)
128+
// GError("Error: invalid GffLine(l)\n");
129+
//memcpy((void*)this, (void*)l, sizeof(GffLine));
130130
GMALLOC(line, llen+1);
131-
memcpy(line, l->line, llen+1);
131+
memcpy(line, l.line, llen+1);
132132
GMALLOC(dupline, llen+1);
133-
memcpy(dupline, l->dupline, llen+1);
133+
memcpy(dupline, l.dupline, llen+1);
134134
//--offsets within line[]
135-
gseqname=line+(l->gseqname-l->line);
136-
track=line+(l->track-l->line);
137-
ftype=line+(l->ftype-l->line);
138-
info=line+(l->info-l->line);
135+
gseqname=line+(l.gseqname-l.line);
136+
track=line+(l.track-l.line);
137+
ftype=line+(l.ftype-l.line);
138+
info=line+(l.info-l.line);
139139
if (num_parents>0 && parents) {
140-
parents=NULL; //re-init, just copied earlier
141140
GMALLOC(parents, num_parents*sizeof(char*));
142141
//_parents_len=l->_parents_len; copied above
143142
_parents=NULL; //re-init, forget pointer copy
144143
GMALLOC(_parents, _parents_len);
145-
memcpy(_parents, l->_parents, _parents_len);
144+
memcpy(_parents, l._parents, _parents_len);
146145
for (int i=0;i<num_parents;i++) {
147-
parents[i]=_parents+(l->parents[i] - l->_parents);
146+
parents[i]=_parents+(l.parents[i] - l._parents);
148147
}
149148
}
150149
//-- allocated string copies:
151-
ID=Gstrdup(l->ID);
152-
if (l->gene_name!=NULL)
153-
gene_name=Gstrdup(l->gene_name);
154-
if (l->gene_id!=NULL)
155-
gene_id=Gstrdup(l->gene_id);
150+
ID=Gstrdup(l.ID);
151+
if (l.gene_name!=NULL)
152+
gene_name=Gstrdup(l.gene_name);
153+
if (l.gene_id!=NULL)
154+
gene_id=Gstrdup(l.gene_id);
156155
}
157156
GffLine():_parents(NULL), _parents_len(0),
158157
dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL),
@@ -989,7 +988,7 @@ class CNonExon { //utility class used in subfeature promotion
989988
GffExon* exon;
990989
GffLine* gffline;
991990
//CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) {
992-
CNonExon(GffObj* p, GffExon* e, GffLine* gl) {
991+
CNonExon(GffObj* p, GffExon* e, GffLine& gl) {
993992
parent=p;
994993
exon=e;
995994
//idx=i;

0 commit comments

Comments
 (0)