Skip to content

Commit 50dd7fc

Browse files
committed
wip: redoing Ballgown counting even without -B
1 parent 0c0e061 commit 50dd7fc

4 files changed

Lines changed: 108 additions & 120 deletions

File tree

rlink.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ int processRead(int currentstart, int currentend, BundleData& bdata,
150150
if (leftsupport>maxleftsupport) {
151151
maxleftsupport=leftsupport;
152152
}
153-
leftsup.Add(maxleftsupport);//on the left, always add current max support (?)
153+
leftsup.Add(maxleftsupport);//on the left, always add current max support
154154
support=brec.exons[i].len();
155155
rightsup.Add(support); //..but on the right, real support value is added
156156

stringtie.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -531,7 +531,7 @@ if (ballgown)
531531
else if (xstrand=='-') strand=-1;
532532
countFragment(*bundle, *brec, hi);
533533
//if (!ballgown || ref_overlap)
534-
processRead(currentstart, currentend, *bundle, hashread, *brec, strand, nh, hi);
534+
processRead(currentstart, currentend, *bundle, hashread, *brec, strand, nh, hi);
535535

536536
//update current end to be at least as big as the start of the read pair in the fragment?? -> maybe not because then I could introduce some false positives with paired reads mapped badly
537537

tablemaker.cpp

Lines changed: 35 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -46,120 +46,53 @@ void rc_updateExonCounts(const RC_Feature* exon, int nh) {
4646
if (nh<=1) exon->ucount++;
4747
}
4848

49-
bool BundleData::evalReadAln(GBamRecord& brec, char& strand, int nh) { //, int hi) {
49+
bool BundleData::evalReadAln(GBamRecord& brec, char& strand, int nh) {
5050
if (rc_data==NULL) {
5151
return false; //no ref transcripts available for this reads' region
5252
}
5353
if ((int)brec.end<rc_data->lmin || (int)brec.start>rc_data->rmax) {
5454
return false; //hit outside coverage area
5555
}
5656
if (rc_data->exons.Count()==0) return false;
57-
58-
if (ballgown) {
59-
if (rc_data->tdata.Count()==0) return false; //nothing to do without transcripts
60-
//check this read alignment against ref exons and introns
61-
/*
62-
int gstart=brec.start; //alignment start position on the genome
63-
64-
We NEVER update read-counting boundaries of the bundle based on reads - they should only be based on reference transcripts
65-
if (rc_data->f_cov.size()==0 && gstart<rc_data->lmin) {
66-
fprintf(stderr, "Warning: adjusting lmin coverage bundle from %d to %d !\n", int(rc_data->lmin), (int)gstart);
67-
rc_data->lmin=gstart;
68-
}
69-
*/
70-
/*
71-
int gpos=brec.start; //current genomic position
72-
int rlen=0; //read length, obtained here from the cigar string
73-
int segstart=gstart;
74-
*/
75-
vector<RC_Seg> rsegs;
76-
vector<RC_Seg> rintrons;
77-
for (int i=0;i<brec.exons.Count();i++) {
57+
if (ballgown && rc_data->tdata.Count()==0) return false;
58+
//nothing to do without transcripts
59+
//check this read alignment against ref exons and introns
60+
char strandbits=0;
61+
for (int i=0;i<brec.exons.Count();i++) {
62+
if (ballgown)
7863
rc_data->updateCov(strand, nh, brec.exons[i].start, brec.exons[i].len());
79-
rsegs.push_back(RC_Seg(brec.exons[i].start, brec.exons[i].end) );
80-
if (i>0) {
81-
//add intron
82-
rintrons.push_back(RC_Seg(brec.exons[i-1].end+1, brec.exons[i].start-1));
64+
GArray<RC_FeatOvl> exonOverlaps(true, true); //overlaps sorted by decreasing length
65+
if (rc_data->findOvlExons(exonOverlaps, brec.exons[i].start,
66+
brec.exons[i].end, strand)) {
67+
int max_ovl=exonOverlaps[0].ovlen;
68+
if (max_ovl>=5) { //only count exon overlaps of at least 5bp
69+
for (int k=0;k<exonOverlaps.Count();++k) {
70+
if (exonOverlaps[k].feature->strand=='+') strandbits |= 0x01;
71+
else if (exonOverlaps[k].feature->strand=='-') strandbits |= 0x02;
72+
if (max_ovl - exonOverlaps[k].ovlen > 5 )
73+
break; //ignore overlaps shorter than max_overlap - 5bp
74+
//TODO: perhaps we could use a better approach for non-overlapping exons
75+
// spanned by this same read alignment
76+
//counting this overlap for multiple exons if it's similarly large
77+
//(in the shared region of overlapping exons)
78+
rc_updateExonCounts(exonOverlaps[k].feature, nh);
79+
}
80+
} //exon overlap >= 5
81+
} //exon overlap processing
82+
if (i>0) { //intron processing
83+
RC_Feature* ri=rc_data->findIntron(brec.exons[i-1].end+1, brec.exons[i].start-1, strand);
84+
if (ri) {
85+
ri->rcount++;
86+
ri->mrcount += (nh > 1) ? (1.0/nh) : 1;
87+
if (nh==1) ri->ucount++;
8388
}
84-
}
85-
//now check rexons and rintrons with findExons() and findIntron()
86-
for (size_t i=0;i<rintrons.size();++i) {
87-
/*RC_FeatIt ri=rc_data->findIntron(rintrons[i].l, rintrons[i].r, strand);
88-
if (ri!=rc_data->introns.end()) {
89-
(*ri).rcount++;
90-
(*ri).mrcount += (nh > 1) ? (1.0/nh) : 1;
91-
if (nh==1) (*ri).ucount++;
92-
}
93-
*/
94-
RC_Feature* ri=rc_data->findIntron(rintrons[i].l, rintrons[i].r, strand);
95-
if (ri) {
96-
ri->rcount++;
97-
ri->mrcount += (nh > 1) ? (1.0/nh) : 1;
98-
if (nh==1) ri->ucount++;
99-
}
100-
} //for each intron
10189

102-
char strandbits=0;
103-
for (size_t i=0;i<rsegs.size();++i) {
104-
RC_FeatPtrSet ovlex=rc_data->findExons(rsegs[i].l, rsegs[i].r, strand);
105-
if (ovlex.size()==0) continue;
106-
if (ovlex.size()>1) {
107-
vector< pair<int, const RC_Feature*> > xovl; //overlapped exons sorted by decreasing overlap length
108-
for (RC_FeatPtrSet::iterator ox=ovlex.begin();ox != ovlex.end(); ++ox) {
109-
int ovlen=(*ox)->ovlen(rsegs[i].l, rsegs[i].r);
110-
if (ovlen>=5)
111-
xovl.push_back(pair<int, const RC_Feature*>(ovlen, *ox));
112-
}
113-
if (xovl.size()>1) {
114-
sort(xovl.begin(), xovl.end(), OvlSorter); //larger overlaps first
115-
//update the counts only for ref exons with max overlap to this segment
116-
int max_ovl=xovl.begin()->first;
117-
for (vector<pair<int, const RC_Feature*> >::iterator xo=xovl.begin();xo!=xovl.end();++xo) {
118-
if (xo->second->strand=='+') strandbits |= 0x01;
119-
else if (xo->second->strand=='-') strandbits |= 0x02;
120-
if (max_ovl - xo->first > 5 ) break; //more than +5 bases coverage for the other exons
121-
rc_updateExonCounts(xo->second, nh);
122-
}
123-
} else if (xovl.size() == 1) {
124-
rc_updateExonCounts(xovl.begin()->second, nh);
125-
}
126-
} else {
127-
// 1 exon overlap only
128-
int ovlen=(*ovlex.begin())->ovlen(rsegs[i].l, rsegs[i].r);
129-
if (ovlen>=5) {
130-
if ((*ovlex.begin())->strand=='+') strandbits |= 0x01;
131-
else if ((*ovlex.begin())->strand=='-') strandbits |= 0x02;
132-
rc_updateExonCounts(*ovlex.begin(), nh);
133-
}
134-
}
135-
} //for each read alignment "exon"
136-
if (strand=='.' && strandbits && strandbits<3) {
137-
strand = (strandbits==1) ? '+' : '-';
138-
}
90+
} //intron processing
13991
}
140-
else {
141-
//don't keep track of ballgown coverage data, just check for exon overlaps
142-
//TODO check and assign strand here if needed
143-
if (strand=='.') {
144-
vector< pair<int, const RC_Feature*> > xovl; //overlapped exons sorted by ovl len
145-
for (int i=0;i<brec.exons.Count();i++) {
146-
RC_FeatPtrSet ovlex=rc_data->findExons(brec.exons[i].start, brec.exons[i].end, strand);
147-
for (RC_FeatPtrSet::iterator ox=ovlex.begin();ox != ovlex.end(); ++ox) {
148-
int ovlen=(*ox)->ovlen(brec.exons[i].start, brec.exons[i].end);
149-
if (ovlen>=5)
150-
xovl.push_back(pair<int, const RC_Feature*>(ovlen, *ox));
151-
}
152-
} // for each read alignment "exon"
153-
char strandbits=0;
154-
for (vector<pair<int, const RC_Feature*> >::iterator xo=xovl.begin();xo!=xovl.end();++xo) {
155-
if (xo->second->strand=='+') strandbits |= 0x01;
156-
else if (xo->second->strand=='-') strandbits |= 0x02;
157-
}
158-
if (strandbits && strandbits<3) {
159-
strand = (strandbits==1) ? '+' : '-';
160-
}
161-
}
92+
if (strand=='.' && strandbits && strandbits<3) {
93+
strand = (strandbits==1) ? '+' : '-';
16294
}
95+
16396
return true;
16497
}
16598

tablemaker.h

Lines changed: 71 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,21 @@ struct RC_Feature { //exon or intron of a reference transcript
118118
}
119119
};
120120

121+
struct RC_FeatOvl {
122+
RC_Feature* feature;
123+
int ovlen;
124+
bool operator<(const RC_FeatOvl& o) const {
125+
if (ovlen==o.ovlen) {
126+
return (feature<o.feature) ;
127+
}
128+
else return (ovlen>o.ovlen);
129+
} //operator <
130+
bool operator==(const RC_FeatOvl& o) const {
131+
return (ovlen==o.ovlen && feature==o.feature);
132+
}
133+
RC_FeatOvl(RC_Feature* f=NULL, int olen=0):feature(f), ovlen(olen) {
134+
}
135+
};
121136

122137
typedef set<const RC_Feature*, RC_Feature::PCompare> RC_FeatPtrSet;
123138
typedef set<RC_Feature>::iterator RC_FeatIt;
@@ -219,9 +234,9 @@ struct RC_BundleData {
219234
GPVec<RC_ScaffData> tdata; //all transcripts in this bundle
220235
GList<RC_Feature> exons; //all unique exons in this bundle, by their start coordinate
221236
GList<RC_Feature> introns; //all unique introns in this bundle, by their start coordinate
222-
//RC_FeatIt xcache; //cache the first exon overlapping xcache_pos to speed up exon-overlap queries (findExons())
237+
//RC_FeatIt xcache; //cache the first exon overlapping xcache_pos to speed up exon-overlap queries (findOvlExons())
223238
int xcache; //exons index of the first exon overlapping xcache_pos
224-
int xcache_pos; // left coordinate of last cached exon overlap query (findExons())
239+
int xcache_pos; // left coordinate of last cached exon overlap query (findOvlExons())
225240
vector<float> f_mcov; //coverage data, multi-map aware, per strand
226241
vector<int> f_cov;
227242
vector<float> r_mcov; //coverage data on the reverse strand
@@ -249,15 +264,12 @@ struct RC_BundleData {
249264

250265
//for non-ballgown tracking of guide features
251266
void addTranscriptFeature(uint fstart, uint fend, char fstrand, GList<RC_Feature>& flist, uint tidx) {
252-
RC_Feature* feat=new RC_Feature(fstart, fend, fstrand, 0, tidx);
253-
int fidx=-1;
254-
RC_Feature* p=flist.AddIfNew(feat, true, &fidx);
255-
if (p==feat) {//exon not seen before
256-
p->id=fidx;
257-
}
258-
else {//exon seen before, update t_id list
259-
p->t_ids.Add(tidx);
260-
}
267+
RC_Feature* feat=new RC_Feature(fstart, fend, fstrand, 0, tidx);
268+
int fidx=-1;
269+
RC_Feature* p=flist.AddIfNew(feat, true, &fidx);
270+
if (p!=feat) {//exon seen before, update t_id list
271+
p->t_ids.Add(tidx);
272+
}
261273
}
262274

263275
void addTranscript(GffObj& t) {
@@ -334,9 +346,56 @@ struct RC_BundleData {
334346
transform(r_mcov.begin()+goffs, r_mcov.begin()+goffs+glen,
335347
r_mcov.begin()+goffs, RC_MultiCovInc(numhits));
336348
}
337-
338349
}
339350

351+
bool findOvlExons(GArray<RC_FeatOvl>& exovls, int hl, int hr, char strand='.', bool update_cache=true) {
352+
//exovls should be clear, unless the caller knows what she's doing
353+
bool foundovls=false;
354+
if (exons.Count()==0) return false;
355+
RC_Feature q(hl, hr);
356+
int xstart=0;
357+
bool no_cache=(xcache_pos==0 || xcache_pos>hl);
358+
if (no_cache) {
359+
if (update_cache) {
360+
//xcache=exons.end();
361+
xcache=exons.Count()-1;
362+
xcache_pos=0;
363+
}
364+
}
365+
else xstart=xcache; //must have a valid value
366+
bool upd_cache(update_cache);
367+
int last_checked_exon=exons.Count()-1;
368+
for (int p=xstart;p < exons.Count();++p) {
369+
last_checked_exon=p;
370+
int l=exons[p]->l;
371+
int r=exons[p]->r;
372+
if (l > hr) break;
373+
if (hl > r) continue;
374+
//exon overlap here
375+
int ovlen=0;
376+
if (hl<l) {
377+
ovlen = ( hr<r ? hr-l+1 : r-l+1 );
378+
}
379+
else { // l<=hl
380+
ovlen= ( hr<r ? hr-hl+1 : r-hl+1 );
381+
}
382+
if (upd_cache) {
383+
//cache first overlap
384+
xcache=p;
385+
upd_cache=false;
386+
}
387+
if (strand!='.' && strand!=exons[p]->strand) continue; //non-matching strand
388+
foundovls=true;
389+
RC_FeatOvl fovl(exons[p], ovlen);
390+
exovls.Add(fovl);
391+
}
392+
if (update_cache) {
393+
if (upd_cache) xcache=last_checked_exon; //there was no overlap found
394+
xcache_pos=hl;
395+
}
396+
return foundovls;
397+
}
398+
/*
340399
RC_FeatPtrSet findExons(int hl, int hr, char strand='.', bool update_cache=true) {
341400
//returns exons overlapping given interval hl-hr
342401
RC_FeatPtrSet ovlex; //return set
@@ -377,10 +436,6 @@ struct RC_BundleData {
377436
return ovlex;
378437
}
379438
380-
/*
381-
RC_FeatIt findIntron(int hl, int hr, char strand) {
382-
RC_FeatIt ri=introns.find(RC_Feature(hl, hr, strand));
383-
return ri;
384439
*/
385440
RC_Feature* findIntron(int hl, int hr, char strand) {
386441
int fidx=0;

0 commit comments

Comments
 (0)