Skip to content

Commit 063011d

Browse files
committed
keep transcript data uptr around for all guides
1 parent f146970 commit 063011d

4 files changed

Lines changed: 78 additions & 167 deletions

File tree

rlink.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ extern bool debugMode;
3636
//collect all refguide transcripts for a single genomic sequence
3737
struct GRefData {
3838
GList<GffObj> rnas; //all transcripts on this genomic seq
39+
GList<GRefLocus> loci;
3940
int gseq_id;
4041
const char* gseq_name;
4142
//GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here)
@@ -60,6 +61,7 @@ struct GRefData {
6061
}
6162
rnas.Add(t);
6263
t->isUsed(true);
64+
//setLocus(t); //use the GRefLocus::mexons to quickly find an overlap with existing loci, or create a new one
6365
}
6466

6567
bool operator==(GRefData& d){

stringtie.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ int main(int argc, char * const argv[]) {
255255
GVec<GRefData> refguides; // plain vector with transcripts for each chromosome
256256

257257
//table indexes for Ballgown Raw Counts data (-B/-b option)
258-
GPVec<RC_TData> guides_RC_tdata(true); //raw count data for all guide transcripts
258+
GPVec<RC_TData> guides_RC_tdata(true); //raw count data or other info for all guide transcripts
259259
GPVec<RC_Feature> guides_RC_exons(true); //raw count data for all guide exons
260260
GPVec<RC_Feature> guides_RC_introns(true);//raw count data for all guide introns
261261

@@ -321,10 +321,11 @@ const char* ERR_BAM_SORT="\nError: the input alignment file is not sorted!\n";
321321
m->getID(),m->start, m->end);
322322
m->exons.Add(new GffExon(m->start, m->end));
323323
}
324-
if (ballgown) {
325-
RC_TData* tdata=new RC_TData(*m, ++c_tid);
326-
m->uptr=tdata;
327-
guides_RC_tdata.Add(tdata);
324+
//TODO: always keep a RC_TData pointer around, with additional info about guides
325+
RC_TData* tdata=new RC_TData(*m, ++c_tid);
326+
m->uptr=tdata;
327+
guides_RC_tdata.Add(tdata);
328+
if (ballgown) { //already gather exon & intron info for all ref transcripts
328329
tdata->rc_addFeatures(c_exon_id, uexons, guides_RC_exons,
329330
c_intron_id, uintrons, guides_RC_introns);
330331
}

tablemaker.cpp

Lines changed: 0 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -288,91 +288,8 @@ void rc_writeRC(GPVec<RC_TData>& RC_data,
288288
//File: i_data.ctab
289289
//i_id chr gstart gend rcount ucount mrcount
290290
rc_write_RCfeature(RC_data, RC_introns, f_idata, f_i2t);
291-
292-
// feature-to-transcript link files
293-
//rc_write_f2t(rc.fe2t, rc.e2t);
294-
//rc_write_f2t(rc.fi2t, rc.i2t);
295-
}
296-
/*
297-
void RC_TData::rc_addFeatures(uint& c_e_id, set<RC_Feature>& exonSet, GPVec<RC_Feature>& exonTable,
298-
uint& c_i_id, set<RC_Feature>& intronSet, GPVec<RC_Feature>& intronTable) {
299-
GASSERT(ref_t);
300-
GffObj& m = *(ref_t);
301-
for (int i = 0; i < m.exons.Count(); ++i) {
302-
set<RC_Feature>::iterator eit=exonSet.end();
303-
set<RC_Feature>::iterator iit=intronSet.end();
304-
addFeature(m.exons[i]->start, m.exons[i]->end, t_exons, c_e_id, exonSet, eit, exonTable);
305-
if (i>0) { //store intron
306-
addFeature(m.exons[i-1]->end+1, m.exons[i]->start-1, t_introns, c_i_id, intronSet, iit, intronTable);
307-
}
308-
} //for each exon
309-
}
310-
311-
void RC_TData::addFeature(int fl, int fr, GPVec<RC_Feature>& fvec,
312-
uint& f_id, set<RC_Feature>& fset, set<RC_Feature>::iterator& fit,
313-
GPVec<RC_Feature>& fdata) {
314-
//f_id is the largest f_id inserted so far in fset
315-
bool add_new = true;
316-
RC_Feature newseg(fl,fr,this->strand);
317-
//RC_Feature* newfeature=NULL;
318-
if (fset.size()>0) {
319-
if (fit == fset.end()) --fit;
320-
if (newseg < (*fit)) {
321-
bool eq=false;
322-
while (newseg < (*fit) || (eq = (newseg==(*fit)))) {
323-
if (eq) {
324-
add_new = false;
325-
newseg.id = fit->id;
326-
break;
327-
}
328-
if (fit==fset.begin()) {
329-
break;
330-
}
331-
--fit;
332-
}
333-
}
334-
else { //newseg >= *fit
335-
bool eq=false;
336-
while ((*fit) < newseg || (eq = (newseg==(*fit)))) {
337-
if (eq) {
338-
add_new = false;
339-
newseg.id = fit->id;
340-
break;
341-
}
342-
++fit;
343-
if (fit==fset.end()) {
344-
--fit;
345-
break;
346-
}
347-
}
348-
}
349-
}
350-
if (add_new) { //never seen this feature before
351-
newseg.id = ++f_id;
352-
pair< set<RC_Feature>::iterator, bool> ret = fset.insert(newseg);
353-
//ret.second = was_inserted (indeed new)
354-
if (!ret.second) {
355-
GError("Error: feature %d-%d (%c) already in segment set!\n",
356-
newseg.l, newseg.r, newseg.strand);
357-
//newseg.id = ret.first->id;
358-
}
359-
fdata.Add(new RC_Feature(newseg, this->t_id));
360-
#ifdef DEBUG
361-
if (fdata.Count()!=(int)f_id) {
362-
GMessage("Error: fdata.Count=%d, f_id=%d\n", fdata.Count(), f_id);
363-
}
364-
#endif
365-
GASSERT((uint)fdata.Count()==f_id);
366-
}
367-
else { //feature seen before, update its parent list
368-
fdata[newseg.id-1]->t_ids.Add(this->t_id);
369-
}
370-
//fvec.push_back(newseg);
371-
GASSERT(fdata[newseg.id-1]->id==newseg.id);
372-
fvec.Add(fdata[newseg.id-1]);
373291
}
374292

375-
*/
376293
void RC_TData::rc_addFeatures(uint& c_e_id, GList<RC_Feature>& exonSet, GPVec<RC_Feature>& exonTable,
377294
uint& c_i_id, GList<RC_Feature>& intronSet, GPVec<RC_Feature>& intronTable) {
378295
GASSERT(ref_t);

tablemaker.h

Lines changed: 70 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ struct RC_TSeg {
4949
*/
5050

5151
struct RC_Feature { //exon or intron of a reference transcript
52-
uint id; //feature id (>0) only used with -B (full Ballgown data)
53-
//it's the index in the global refguides_RC_exons/introns array + 1
52+
uint id; //feature id (>0), +1 to the index either in global guides_RC_exons/introns if ballgown,
53+
// or in bundle_RC_exons/introns if not ballgown
5454
GVec<uint> t_ids; //transcripts owning this feature
5555
//if -B, this is the index in the global refguides_RC_Data array + 1
5656
// otherwise it is the index in the BundleData::keepguides array + 1
@@ -128,6 +128,25 @@ struct RC_Feature { //exon or intron of a reference transcript
128128
}
129129
};
130130

131+
//for locus tracking and coverage, keep merged exons/introns in the locus
132+
struct GSegInfo {
133+
int t_id; //index of RC_TData in the guides_RC_data table + 1
134+
int exonum; //exon number
135+
GSegInfo(int tid=-1, int en=-1):t_id(tid), exonum(en) { }
136+
};
137+
class GMSeg:public GSeg {
138+
public:
139+
GVec<GSegInfo> msegs; //keep track of exons contributing to this merged exon
140+
GMSeg(int l=0, int r=0, int tid=-1, int eno=-1):GSeg(l,r), msegs(tid, eno) {
141+
}
142+
};
143+
144+
//reference locus - formed by exon-overlapping transcripts
145+
class GRefLocus:public GSeg {
146+
GArray<GMSeg> mexons; //merged exons in this locus (from any transcript)
147+
GPVec<GffObj> rnas; //transcripts in this locus
148+
};
149+
131150
struct RC_ExonOvl {
132151
RC_Feature* feature; //pointer to an item of RC_BundleData::g_exons
133152
int mate_ovl; // = 1 if the mate of this read overlaps the same exon
@@ -167,27 +186,23 @@ struct RC_TData { //storing RC data for a transcript
167186
//only used with -B (full Ballgown data)
168187
GffObj* ref_t;
169188
uint t_id;
170-
//GStr t_name; //original GFF ID for the transcript
171189
int l;
172190
int r;
173-
//char strand;
174-
//int num_exons;
191+
char in_bundle; // 1 if used by read bundles, 2 if it has any direct read overlap
192+
GRefLocus* locus; //pointer to a locus info
175193
int eff_len;
176194
double cov;
177195
double fpkm;
178196
//char strand;
179197
GPVec<RC_Feature> t_exons;
180198
GPVec<RC_Feature> t_introns;
181-
//int e_idx_cache;
182-
//int i_idx_cache;
183199
void rc_addFeatures(uint& c_e_id, GList<RC_Feature>& fexons, GPVec<RC_Feature>& edata,
184200
uint& c_i_id, GList<RC_Feature>& fintrons, GPVec<RC_Feature>& idata);
185201
void addFeature(int fl, int fr, GPVec<RC_Feature>& fvec, uint& f_id,
186202
GList<RC_Feature>& fset, GPVec<RC_Feature>& fdata,
187203
int& cache_idx);
188-
RC_TData(GffObj& s, uint id=0):ref_t(&s), t_id(id), l(s.start), r(s.end), //t_name(s.getID()),
189-
//num_exons(s.exons.Count()),
190-
eff_len(s.covlen), cov(0), fpkm(0), //strand(s.strand),
204+
RC_TData(GffObj& s, uint id=0):ref_t(&s), t_id(id), l(s.start), r(s.end),
205+
in_bundle(0), locus(NULL), eff_len(s.covlen), cov(0), fpkm(0), //strand(s.strand),
191206
t_exons(false), t_introns(false) { //, e_idx_cache(-1), i_idx_cache(-1) {
192207
}
193208

@@ -196,7 +211,6 @@ struct RC_TData { //storing RC data for a transcript
196211
if (r != o.r) return (r < o.r);
197212
if (char c=(ref_t->strand - o.ref_t->strand)) return (c<0);
198213
return (strcmp(ref_t->getID(), o.ref_t->getID())<0);
199-
//return false;
200214
}
201215
bool operator==(const RC_TData& o) const {
202216
if (t_id!=0 && o.t_id!=0) return (t_id==o.t_id);
@@ -236,8 +250,8 @@ struct RC_BundleData {
236250
int rmax;
237251
GPVec<RC_TData> g_tdata; //raw counting data for all transcripts in this bundle
238252
// RC_TData::t_id-1 = the element index in this array
239-
GList<RC_Feature> g_exons; //set of guide exons in this bundle, sorted by their start coordinate
240-
GList<RC_Feature> g_introns; //set of guide introns in this bundle, sorted by their start coordinate
253+
GList<RC_Feature> g_exons; //set of guide exons in this bundle, sorted by start coordinate
254+
GList<RC_Feature> g_introns; //set of guide introns in this bundle, sorted by start coordinate
241255
//RC_FeatIt xcache; //cache the first exon overlapping xcache_pos to speed up exon-overlap queries (findOvlExons())
242256
int xcache; //exons index of the first exon overlapping xcache_pos
243257
int xcache_pos; // left coordinate of last cached exon overlap query (findOvlExons())
@@ -250,10 +264,11 @@ struct RC_BundleData {
250264

251265
//-- when no global Ballgown data is to be generated, these are
252266
// local (bundle) stable order tables of guide features
253-
GPVec<RC_TData>* guides_RC_tdata; //just a pointer to the current RC tdata table
267+
GPVec<RC_TData>* bundle_RC_tdata; //pointer to the global RC tdata table
254268
// RC_Feature::id-1 = the index in these arrays:
255-
GPVec<RC_Feature>* guides_RC_exons;
256-
GPVec<RC_Feature>* guides_RC_introns;
269+
GPVec<RC_Feature>* bundle_RC_exons; //pointers to global (if ballgown)
270+
GPVec<RC_Feature>* bundle_RC_introns;// OR local exon/intron RC data
271+
//local exon/intron ids within the bundle
257272
uint c_exon_id;
258273
uint c_intron_id;
259274
//--
@@ -263,16 +278,18 @@ struct RC_BundleData {
263278
// features:(sorted, free, unique)
264279
g_exons(true, false, true), g_introns(true, false, true),
265280
xcache(0), xcache_pos(0),
266-
guides_RC_tdata(rc_tdata), guides_RC_exons(rc_exons), guides_RC_introns(rc_introns),
281+
bundle_RC_tdata(rc_tdata), bundle_RC_exons(rc_exons), bundle_RC_introns(rc_introns),
267282
c_exon_id(0), c_intron_id(0)
268283
{
269284
if (ballgown) {
270285
if (rmax>lmin) updateCovSpan();
271286
}else {
272-
g_tdata.setFreeItem(true);
273-
guides_RC_tdata = &g_tdata;
274-
guides_RC_exons = new GPVec<RC_Feature>(true);
275-
guides_RC_introns= new GPVec<RC_Feature>(true);
287+
//g_tdata.setFreeItem(true);
288+
//guides_RC_tdata = &g_tdata;
289+
//-- override the passed rc_exons/rc_introns if not ballgown
290+
//-- because these are now locally maintained so they'll be deallocated with the bundle
291+
bundle_RC_exons = new GPVec<RC_Feature>(true);
292+
bundle_RC_introns= new GPVec<RC_Feature>(true);
276293
}
277294
}
278295

@@ -282,72 +299,46 @@ struct RC_BundleData {
282299
r_cov.clear();
283300
r_mcov.clear();
284301
if (!ballgown) {
285-
delete guides_RC_exons;
286-
delete guides_RC_introns;
302+
delete bundle_RC_exons;
303+
delete bundle_RC_introns;
287304
}
288305
}
289306

290-
//for non-ballgown tracking of guide features
291-
/*
292-
void addTranscriptFeature(uint fstart, uint fend, char fstrand, GList<RC_Feature>& flist, GffObj& t) {
293-
uint tidx=(uint)t.udata; //in non-Ballgown case, this is the index in the BundleData::keepguides array + 1
294-
RC_Feature* feat=new RC_Feature(fstart, fend, fstrand, 0, tidx);
295-
int fidx=-1;
296-
RC_Feature* p=flist.AddIfNew(feat, true, &fidx);
297-
if (p==feat) {//exon never seen before
298-
//TODO: use some stable e_id index here and assign it to t.exons[eidx]->uptr->e_id (?) for quick retrieval
299-
} else {//exon seen before, just update t_id list
300-
p->t_ids.Add(tidx);
301-
}
302-
}
303-
*/
304-
uint addTranscript(GffObj& t) { //shouuld return the guide index in *guides_RC_tdata
305-
//if (!ps.rc_id_data()) return;
306-
//RC_ScaffIds& sdata = *(ps.rc_id_data());
307-
bool boundary_changed=false;
308-
if (lmin==0 || lmin>(int)t.start) { lmin=t.start; boundary_changed=true; }
309-
if (rmax==0 || rmax<(int)t.end) { rmax=t.end; boundary_changed=true; }
310-
RC_TData* tdata=NULL;
307+
uint addTranscript(GffObj& t) { //should return the guide index in *guides_RC_tdata
308+
bool boundary_changed=false;
309+
if (lmin==0 || lmin>(int)t.start) { lmin=t.start; boundary_changed=true; }
310+
if (rmax==0 || rmax<(int)t.end) { rmax=t.end; boundary_changed=true; }
311+
GASSERT(t.uptr); //we should always have a RC_TData for each guide
312+
RC_TData* tdata=(RC_TData*)(t.uptr);
313+
/*RC_TData* tdata=NULL;
311314
if (ballgown) {
312-
tdata=(RC_TData*)(t.uptr);
315+
tdata=(RC_TData*)(t.uptr);
313316
}
314317
else {
315-
//int c_tid=g_tdata.Count();
316-
//add RC transcript data locally for the bundle
317-
tdata=new RC_TData(t, g_tdata.Count()+1);
318-
t.uptr=tdata;
319-
//guides_RC_Data.Add(tdata);
320-
tdata->rc_addFeatures(c_exon_id, g_exons, *guides_RC_exons,
321-
c_intron_id, g_introns, *guides_RC_introns);
322-
}
323-
//if (ballgown) { //ballgown case
324-
//RC_TData& sdata=*(RC_TData*)(t.uptr);
325-
g_tdata.Add(tdata);
326-
if (ballgown) {
327-
if (boundary_changed) updateCovSpan();
328-
//need to add exons and introns because rc_addFeatures()
329-
// was NOT using g_exons and g_introns as unique sets if ballgown
330-
for (int i=0;i<tdata->t_exons.Count();i++) {
331-
g_exons.Add(tdata->t_exons[i]);
332-
}
333-
for (int i=0;i<tdata->t_introns.Count();i++) {
334-
g_introns.Add(tdata->t_introns[i]);
335-
}
318+
//add RC transcript data locally for the bundle
319+
tdata=new RC_TData(t, g_tdata.Count()+1);
320+
t.uptr=tdata;
321+
//guides_RC_Data.Add(tdata);
322+
tdata->rc_addFeatures(c_exon_id, g_exons, *guides_RC_exons,
323+
c_intron_id, g_introns, *guides_RC_introns);
324+
}*/
325+
g_tdata.Add(tdata);
326+
if (ballgown) {
327+
if (boundary_changed) updateCovSpan();
328+
//rc_addFeatures() called before, but we still need to add exons
329+
// and introns to the local sets: g_exons, g_introns
330+
for (int i=0;i<tdata->t_exons.Count();i++) {
331+
g_exons.Add(tdata->t_exons[i]);
336332
}
337-
//} //if ballgown
338-
/*
339-
else { //non-ballgown, regular storage of bundle guides data
340-
//use GffObj::udata as tid, it's the index in bundledata::keepguides
341-
for (int i=0;i<t.exons.Count();++i) {
342-
addTranscriptFeature(t.exons[i]->start, t.exons[i]->end, t.strand, g_exons, t);
343-
if (i>0) {
344-
//add intron
345-
addTranscriptFeature(t.exons[i-1]->end+1, t.exons[i]->start-1, t.strand, g_introns, t);
346-
}
333+
for (int i=0;i<tdata->t_introns.Count();i++) {
334+
g_introns.Add(tdata->t_introns[i]);
347335
}
348-
} //no ballgown coverage
349-
*/
350-
return tdata->t_id;
336+
}
337+
else {
338+
tdata->rc_addFeatures(c_exon_id, g_exons, *bundle_RC_exons,
339+
c_intron_id, g_introns, *bundle_RC_introns);
340+
}
341+
return tdata->t_id;
351342
}
352343

353344
void updateCovSpan() {

0 commit comments

Comments
 (0)