Skip to content

Commit f3453ed

Browse files
committed
wip: refactoring raw counts code
1 parent 111743d commit f3453ed

5 files changed

Lines changed: 100 additions & 57 deletions

File tree

gclib/GArgs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
GArgs is a quick'n'dirty object oriented replacement for the standard
3-
getopts library call available on many unix platforms;
3+
getopts library available on many unix platforms;
44
it accepts the regular single letter, single-dash style options
55
-<letter>[ ][<value>]
66
but also attr=value style options:

rlink.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,11 @@ struct BundleData {
344344
//refseq=ref;
345345
status=BUNDLE_STATUS_READY;
346346
}
347-
void rc_init(GffObj* t) {
347+
void rc_init(GffObj* t, GPVec<RC_TData>* rc_tdata,
348+
GPVec<RC_Feature>* rc_edata, GPVec<RC_Feature>* rc_idata) {
348349
if (rc_data==NULL) {
349-
rc_data = new RC_BundleData(t->start, t->end);
350+
rc_data = new RC_BundleData(t->start, t->end,
351+
rc_tdata, rc_edata, rc_idata);
350352
}
351353
}
352354
/* after reference annotation was loaded
@@ -356,7 +358,8 @@ struct BundleData {
356358
}
357359
Not needed here, we update the coverage span as each transcript is added
358360
*/
359-
void keepGuide(GffObj* scaff);
361+
void keepGuide(GffObj* scaff, GPVec<RC_TData>* rc_tdata=NULL,
362+
GPVec<RC_Feature>* rc_edata=NULL, GPVec<RC_Feature>* rc_idata=NULL);
360363

361364
//bool evalReadAln(GBamRecord& brec, char& strand, int nh); //, int hi);
362365
bool evalReadAln(GReadAlnData& alndata, char& strand);

stringtie.cpp

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -194,16 +194,18 @@ int main(int argc, char * const argv[]) {
194194
// == Process arguments.
195195
GArgs args(argc, argv,
196196
//"debug;help;fast;xhvntj:D:G:C:l:m:o:a:j:c:f:p:g:");
197-
"debug;help;xyzwShvtien:j:s:D:G:C:l:m:o:a:j:c:f:p:g:P:M:Bb:");
197+
"debug;help;exclude=xyzwShvtien:j:s:D:G:C:l:m:o:a:j:c:f:p:g:P:M:Bb:");
198198
args.printError(USAGE, true);
199199

200200
GStr bamfname=Process_Options(&args);
201201
// == Done argument processing.
202202

203203
GVec<GRefData> refguides; // plain vector with transcripts for each chromosome
204-
GPVec<RC_TData> refguides_RC_Data(true); //raw count data for all guide transcripts
205-
GPVec<RC_Feature> refguides_RC_exons(true); //raw count data for all guide exons
206-
GPVec<RC_Feature> refguides_RC_introns(true);//raw count data for all guide introns
204+
//table indexes for Ballgown Raw Counts data
205+
GPVec<RC_TData> guides_RC_tdata(true); //raw count data for all guide transcripts
206+
GPVec<RC_Feature> guides_RC_exons(true); //raw count data for all guide exons
207+
GPVec<RC_Feature> guides_RC_introns(true);//raw count data for all guide introns
208+
207209
GVec<int> alncounts(30,0); //keep track of the number of read alignments per chromosome [gseq_id]
208210

209211
#ifdef DEBUGPRINT
@@ -234,27 +236,27 @@ const char* ERR_BAM_SORT="\nError: the input alignment file is not sorted!\n";
234236
guidegff.chars());
235237
}
236238
refguides.setCount(refseqCount); //maximum gseqid
237-
uint cur_tid=0;
238-
uint cur_exon_id=0;
239-
uint cur_intron_id=0;
239+
uint c_tid=0;
240+
uint c_exon_id=0;
241+
uint c_intron_id=0;
240242
GList<RC_Feature> uexons(true, false, true); //sorted, free items, unique
241243
GList<RC_Feature> uintrons(true, false, true);
242244
//assign unique transcript IDs based on the sorted order
243245
int last_refid=0;
244246
for (int i=0;i<gffr.gflst.Count();i++) {
245247
GffObj* m=gffr.gflst[i];
246-
if (ballgown) {
247-
RC_TData* tdata=new RC_TData(*m, ++cur_tid);
248+
if (ballgown) { //prepare memory storage/tables for all guides
249+
RC_TData* tdata=new RC_TData(*m, ++c_tid);
248250
m->uptr=tdata;
249251
if (last_refid!=m->gseq_id) {
250252
//chromosome switch
251253
uexons.Clear();
252254
uintrons.Clear();
253255
last_refid=m->gseq_id;
254256
}
255-
refguides_RC_Data.Add(tdata);
256-
tdata->rc_addFeatures(cur_exon_id, uexons, refguides_RC_exons,
257-
cur_intron_id, uintrons, refguides_RC_introns);
257+
guides_RC_tdata.Add(tdata);
258+
tdata->rc_addFeatures(c_exon_id, uexons, guides_RC_exons,
259+
c_intron_id, uintrons, guides_RC_introns);
258260
}
259261

260262
GRefData& grefdata = refguides[m->gseq_id];
@@ -492,7 +494,8 @@ if (ballgown)
492494
if (currentend<(int)(*guides)[ng_ovlstart]->end)
493495
currentend=(*guides)[ng_ovlstart]->end;
494496
//if (ballgown)
495-
bundle->keepGuide((*guides)[ng_ovlstart]);
497+
bundle->keepGuide((*guides)[ng_ovlstart],
498+
&guides_RC_tdata, &guides_RC_exons, &guides_RC_introns);
496499
ng_ovlstart++;
497500
}
498501
if (ng_ovlstart>ng_start) ng_end=ng_ovlstart-1;
@@ -514,7 +517,8 @@ if (ballgown)
514517
ng_end++;
515518
//more transcripts overlapping this bundle
516519
//if (ballgown)
517-
bundle->keepGuide((*guides)[ng_end]);
520+
bundle->keepGuide((*guides)[ng_end],
521+
&guides_RC_tdata, &guides_RC_exons, &guides_RC_introns);
518522
if(currentend<(int)(*guides)[ng_end]->end) {
519523
currentend=(*guides)[ng_end]->end;
520524
cend_changed=true;
@@ -605,8 +609,8 @@ if (ballgown)
605609
sscanf(linebuf,"%d %d %d %g %g", &nl, &tlen, &t_id, &fpkm, &tcov);
606610
calc_fpkm=tcov*1000000000/Frag_Len;
607611
if (ballgown && t_id>0) {
608-
refguides_RC_Data[t_id-1]->fpkm=calc_fpkm;
609-
refguides_RC_Data[t_id-1]->cov=tcov;
612+
guides_RC_tdata[t_id-1]->fpkm=calc_fpkm;
613+
guides_RC_tdata[t_id-1]->cov=tcov;
610614
}
611615
for(int i=0;i<nl;i++) {
612616
fgetline(linebuf,linebuflen,t_out);
@@ -633,7 +637,7 @@ if (ballgown)
633637

634638
//lastly, for ballgown, rewrite the tdata file with updated cov and fpkm
635639
if (ballgown) {
636-
rc_writeRC(refguides_RC_Data, refguides_RC_exons, refguides_RC_introns,
640+
rc_writeRC(guides_RC_tdata, guides_RC_exons, guides_RC_introns,
637641
f_tdata, f_edata, f_idata, f_e2t, f_i2t);
638642
}
639643

tablemaker.cpp

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,13 @@ void rc_update_tdata(BundleData& bundle, GffObj& scaff,
2525
(*tdata).fpkm=fpkm;
2626
}
2727
*/
28-
void BundleData::keepGuide(GffObj* t) {
28+
void BundleData::keepGuide(GffObj* t, GPVec<RC_TData>* rc_tdata,
29+
GPVec<RC_Feature>* rc_edata, GPVec<RC_Feature>* rc_idata) {
2930
if (rc_data==NULL) {
30-
rc_init(t);
31+
rc_init(t, rc_tdata, rc_edata, rc_idata);
3132
}
32-
t->udata=keepguides.Add(t)+1;
33-
rc_data->addTranscript(*t);
33+
keepguides.Add(t);
34+
t->udata=(int)rc_data->addTranscript(*t);
3435
}
3536

3637
struct COvlSorter {
@@ -56,12 +57,10 @@ bool BundleData::evalReadAln(GReadAlnData& alndata, char& xstrand) {
5657
if ((int)brec.end<rc_data->lmin || (int)brec.start>rc_data->rmax) {
5758
return false; //hit outside coverage area
5859
}
59-
if (rc_data->g_exons.Count()==0) return false;
60-
if (ballgown && rc_data->tdata.Count()==0) return false;
61-
//nothing to do without transcripts
60+
if (rc_data->g_exons.Count()==0 || rc_data->g_tdata.Count()==0)
61+
return false; //nothing to do without transcripts
6262
//check this read alignment against ref exons and introns
6363
char strandbits=0;
64-
6564
for (int i=0;i<brec.exons.Count();i++) {
6665
if (ballgown)
6766
rc_data->updateCov(xstrand, nh, brec.exons[i].start, brec.exons[i].len());
@@ -94,7 +93,7 @@ bool BundleData::evalReadAln(GReadAlnData& alndata, char& xstrand) {
9493
int j_r=brec.exons[i].start-1;
9594
RC_Feature* ri=rc_data->findIntron(j_l, j_r, xstrand);
9695
alndata.juncs.Add(new CJunction(j_l, j_r)); //don't set strand, etc. for now
97-
if (ri) {
96+
if (ri) { //update guide intron counts
9897
ri->rcount++;
9998
ri->mrcount += (nh > 1) ? (1.0/nh) : 1;
10099
if (nh==1) ri->ucount++;

tablemaker.h

Lines changed: 64 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -224,28 +224,45 @@ struct RC_BundleData {
224224
int init_lmin;
225225
int lmin;
226226
int rmax;
227-
GPVec<RC_TData> tdata; //all transcripts in this bundle (not used unless -B)
228-
GList<RC_Feature> g_exons; //all unique guide exons in this bundle, sorted by their start coordinate
229-
GList<RC_Feature> g_introns; //all unique guide introns in this bundle, sorted by their start coordinate
227+
GPVec<RC_TData> g_tdata; //raw counting data for all transcripts in this bundle
228+
// RC_TData::t_id-1 = the element index in this array
229+
GList<RC_Feature> g_exons; //set of guide exons in this bundle, sorted by their start coordinate
230+
GList<RC_Feature> g_introns; //set of guide introns in this bundle, sorted by their start coordinate
230231
//RC_FeatIt xcache; //cache the first exon overlapping xcache_pos to speed up exon-overlap queries (findOvlExons())
231232
int xcache; //exons index of the first exon overlapping xcache_pos
232233
int xcache_pos; // left coordinate of last cached exon overlap query (findOvlExons())
234+
233235
// the following coverage arrays will only used with Ballgown data (-B)
234236
vector<float> f_mcov; //coverage data, multi-map aware, per strand
235237
vector<int> f_cov;
236238
vector<float> r_mcov; //coverage data on the reverse strand
237239
vector<int> r_cov;
238-
//
239-
RC_BundleData(int t_l=0, int t_r=0):init_lmin(0), lmin(t_l), rmax(t_r),
240-
tdata(false), // features:(sorted, free, unique)
240+
241+
//-- when no global Ballgown data is to be generated, these are
242+
// local (bundle) stable order tables of guide features
243+
GPVec<RC_TData>* guides_RC_tdata; //just a pointer to the current RC tdata table
244+
// RC_Feature::id-1 = the index in these arrays:
245+
GPVec<RC_Feature>* guides_RC_exons;
246+
GPVec<RC_Feature>* guides_RC_introns;
247+
uint c_exon_id;
248+
uint c_intron_id;
249+
//--
250+
RC_BundleData(int t_l=0, int t_r=0, GPVec<RC_TData>* rc_tdata=NULL,
251+
GPVec<RC_Feature>* rc_exons=NULL,GPVec<RC_Feature>* rc_introns=NULL):
252+
init_lmin(0), lmin(t_l), rmax(t_r), g_tdata(false),
253+
// features:(sorted, free, unique)
241254
g_exons(true, false, true), g_introns(true, false, true),
242-
xcache(0), xcache_pos(0)
243-
{
255+
xcache(0), xcache_pos(0),
256+
guides_RC_tdata(rc_tdata), guides_RC_exons(rc_exons), guides_RC_introns(rc_introns),
257+
c_exon_id(0), c_intron_id(0)
258+
{
244259
if (ballgown) {
245260
if (rmax>lmin) updateCovSpan();
246-
}else { //this will be deallocated when the bundle is cleared
247-
g_exons.setFreeItem(true);
248-
g_introns.setFreeItem(true);
261+
}else {
262+
g_tdata.setFreeItem(true);
263+
guides_RC_tdata = &g_tdata;
264+
guides_RC_exons = new GPVec<RC_Feature>(true);
265+
guides_RC_introns= new GPVec<RC_Feature>(true);
249266
}
250267
}
251268

@@ -254,6 +271,10 @@ struct RC_BundleData {
254271
f_mcov.clear();
255272
r_cov.clear();
256273
r_mcov.clear();
274+
if (!ballgown) {
275+
delete guides_RC_exons;
276+
delete guides_RC_introns;
277+
}
257278
}
258279

259280
//for non-ballgown tracking of guide features
@@ -269,28 +290,42 @@ struct RC_BundleData {
269290
}
270291
}
271292

272-
void addTranscript(GffObj& t) {
293+
uint addTranscript(GffObj& t) { //shouuld return the guide index in *guides_RC_tdata
273294
//if (!ps.rc_id_data()) return;
274295
//RC_ScaffIds& sdata = *(ps.rc_id_data());
275296
bool boundary_changed=false;
276297
if (lmin==0 || lmin>(int)t.start) { lmin=t.start; boundary_changed=true; }
277298
if (rmax==0 || rmax<(int)t.end) { rmax=t.end; boundary_changed=true; }
278-
if (t.uptr) { //ballgown case
279-
RC_TData& sdata=*(RC_TData*)(t.uptr);
280-
//tdata.insert(sdata);
281-
tdata.Add(&sdata);
282-
if (boundary_changed) updateCovSpan();
283-
//for (vector<RC_ScaffSeg>::iterator it=sdata.exons.begin();it!=sdata.exons.end();++it) {
284-
for (int i=0;i<sdata.t_exons.Count();i++) {
285-
g_exons.Add(sdata.t_exons[i]);
286-
}
287-
//store introns:
288-
for (int i=0;i<sdata.t_introns.Count();i++) {
289-
g_introns.Add(sdata.t_introns[i]);
299+
RC_TData* tdata=NULL;
300+
if (ballgown) {
301+
tdata=(RC_TData*)(t.uptr);
302+
}
303+
else {
304+
//int c_tid=g_tdata.Count();
305+
//add RC transcript data locally for the bundle
306+
tdata=new RC_TData(t, g_tdata.Count()+1);
307+
t.uptr=tdata;
308+
//guides_RC_Data.Add(tdata);
309+
tdata->rc_addFeatures(c_exon_id, g_exons, *guides_RC_exons,
310+
c_intron_id, g_introns, *guides_RC_introns);
311+
}
312+
//if (ballgown) { //ballgown case
313+
//RC_TData& sdata=*(RC_TData*)(t.uptr);
314+
g_tdata.Add(tdata);
315+
if (ballgown) {
316+
if (boundary_changed) updateCovSpan();
317+
//need to add exons and introns because rc_addFeatures()
318+
// was NOT using g_exons and g_introns as unique sets if ballgown
319+
for (int i=0;i<tdata->t_exons.Count();i++) {
320+
g_exons.Add(tdata->t_exons[i]);
321+
}
322+
for (int i=0;i<tdata->t_introns.Count();i++) {
323+
g_introns.Add(tdata->t_introns[i]);
324+
}
290325
}
291-
}
292-
else {
293-
//non-ballgown, regular storage of bundle guides data
326+
//} //if ballgown
327+
/*
328+
else { //non-ballgown, regular storage of bundle guides data
294329
//use GffObj::udata as tid, it's the index in bundledata::keepguides
295330
for (int i=0;i<t.exons.Count();++i) {
296331
addTranscriptFeature(t.exons[i]->start, t.exons[i]->end, t.strand, g_exons, t);
@@ -300,6 +335,8 @@ struct RC_BundleData {
300335
}
301336
}
302337
} //no ballgown coverage
338+
*/
339+
return tdata->t_id;
303340
}
304341

305342
void updateCovSpan() {

0 commit comments

Comments
 (0)