Skip to content

Commit 8c12614

Browse files
committed
bringing up to date with svn 1.0.1 version
1 parent 06c8a1b commit 8c12614

2 files changed

Lines changed: 801 additions & 0 deletions

File tree

tablemaker.cpp

Lines changed: 399 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,399 @@
1+
//#include "tablemaker.h"
2+
#include "rlink.h"
3+
#include <numeric>
4+
5+
extern GStr ballgown_dir;
6+
7+
int rc_cov_inc(int i) {
8+
return ++i;
9+
}
10+
11+
/*
12+
void rc_update_tdata(BundleData& bundle, GffObj& scaff,
13+
double cov, double fpkm) {
14+
if (bundle.rc_data==NULL) return;
15+
RC_BundleData& rc = *(bundle.rc_data);
16+
if (rc.exons.size()==0) return;
17+
RC_ScaffData& q = (RC_ScaffData*)scaff.uptr;
18+
set<RC_ScaffData>::iterator tdata = rc.tdata.find(q);
19+
if (tdata==rc.tdata.end()) {
20+
fprintf(stderr, "Error: cannot locate bundle ref. transcript %s (%s:%d-%d)!\n",
21+
scaff.getID(), scaff.getGSeqName(), scaff.start, scaff.end);
22+
return;
23+
}
24+
(*tdata).cov=cov;
25+
(*tdata).fpkm=fpkm;
26+
}
27+
*/
28+
void BundleData::rc_store_t(GffObj* t) {
29+
//if (!rc_stage) return;
30+
if (rc_data==NULL) {
31+
rc_init(t);
32+
}
33+
rc_data->addTranscript(*t);
34+
//check this read alignment against ref exons and introns
35+
}
36+
37+
struct COvlSorter {
38+
bool operator() (pair<int, const RC_Feature*> i,
39+
pair<int, const RC_Feature*> j) {
40+
return (i.first>j.first); //sort in decreasing order of overlap length
41+
}
42+
} OvlSorter;
43+
44+
45+
void rc_updateExonCounts(const RC_Feature* exon, int nh) {
46+
exon->rcount++;
47+
exon->mrcount += (nh > 1) ? (1.0/nh) : 1;
48+
if (nh<=1) exon->ucount++;
49+
}
50+
51+
bool BundleData::rc_count_hit(GBamRecord& brec, char strand, int nh) { //, int hi) {
52+
if (rc_data==NULL) return false; //no ref transcripts available for this reads' region
53+
if (rc_data->tdata.Count()==0) return false; //nothing to do without transcripts
54+
55+
//check this read alignment against ref exons and introns
56+
/*
57+
int gstart=brec.start; //alignment start position on the genome
58+
59+
We NEVER update read-counting boundaries of the bundle based on reads - they should only be based on reference transcripts
60+
if (rc_data->f_cov.size()==0 && gstart<rc_data->lmin) {
61+
fprintf(stderr, "Warning: adjusting lmin coverage bundle from %d to %d !\n", int(rc_data->lmin), (int)gstart);
62+
rc_data->lmin=gstart;
63+
}
64+
*/
65+
if ((int)brec.end<rc_data->lmin || (int)brec.start>rc_data->rmax) {
66+
return false; //hit outside coverage area
67+
}
68+
/*
69+
int gpos=brec.start; //current genomic position
70+
int rlen=0; //read length, obtained here from the cigar string
71+
int segstart=gstart;
72+
*/
73+
vector<RC_Seg> rsegs;
74+
vector<RC_Seg> rintrons;
75+
for (int i=0;i<brec.exons.Count();i++) {
76+
rc_data->updateCov(strand, nh, brec.exons[i].start, brec.exons[i].len());
77+
rsegs.push_back(RC_Seg(brec.exons[i].start, brec.exons[i].end) );
78+
if (i>0) {
79+
//add intron
80+
rintrons.push_back(RC_Seg(brec.exons[i-1].end+1, brec.exons[i].start-1));
81+
}
82+
}
83+
//now check rexons and rintrons with findExons() and findIntron()
84+
for (size_t i=0;i<rintrons.size();++i) {
85+
/*RC_FeatIt ri=rc_data->findIntron(rintrons[i].l, rintrons[i].r, strand);
86+
if (ri!=rc_data->introns.end()) {
87+
(*ri).rcount++;
88+
(*ri).mrcount += (nh > 1) ? (1.0/nh) : 1;
89+
if (nh==1) (*ri).ucount++;
90+
}
91+
*/
92+
RC_Feature* ri=rc_data->findIntron(rintrons[i].l, rintrons[i].r, strand);
93+
if (ri) {
94+
ri->rcount++;
95+
ri->mrcount += (nh > 1) ? (1.0/nh) : 1;
96+
if (nh==1) ri->ucount++;
97+
}
98+
} //for each intron
99+
100+
for (size_t i=0;i<rsegs.size();++i) {
101+
RC_FeatPtrSet ovlex=rc_data->findExons(rsegs[i].l, rsegs[i].r, strand);
102+
if (ovlex.size()==0) continue;
103+
if (ovlex.size()>1) {
104+
vector< pair<int, const RC_Feature*> > xovl; //overlapped exons sorted by decreasing overlap length
105+
for (RC_FeatPtrSet::iterator ox=ovlex.begin();ox != ovlex.end(); ++ox) {
106+
int ovlen=(*ox)->ovlen(rsegs[i].l, rsegs[i].r);
107+
if (ovlen>=5)
108+
xovl.push_back(pair<int, const RC_Feature*>(ovlen, *ox));
109+
}
110+
if (xovl.size()>1) {
111+
sort(xovl.begin(), xovl.end(), OvlSorter); //larger overlaps first
112+
//update the counts only for ref exons with max overlap to this segment
113+
int max_ovl=xovl.begin()->first;
114+
for (vector<pair<int, const RC_Feature*> >::iterator xo=xovl.begin();xo!=xovl.end();++xo) {
115+
if (max_ovl - xo->first > 5 ) break; //more than +5 bases coverage for the other exons
116+
rc_updateExonCounts(xo->second, nh);
117+
}
118+
} else if (xovl.size() == 1) {
119+
rc_updateExonCounts(xovl.begin()->second, nh);
120+
}
121+
} else {
122+
// 1 exon overlap only
123+
int ovlen=(*ovlex.begin())->ovlen(rsegs[i].l, rsegs[i].r);
124+
if (ovlen>=5) rc_updateExonCounts(*ovlex.begin(), nh);
125+
}
126+
} //for each read "exon"
127+
return true;
128+
}
129+
130+
FILE* rc_fwopen(const char* fname) {
131+
if (strcmp(fname,"-")==0) return stdout;
132+
GStr fpath(ballgown_dir); //always ends with '/'
133+
//fpath += '.';
134+
fpath += fname;
135+
//fpath += "/";
136+
//fpath += fname;
137+
fpath += ".ctab";
138+
FILE* fh=fopen(fpath.chars(), "w");
139+
if (fh==NULL) {
140+
fprintf(stderr, "Error: cannot create file %s\n",
141+
fpath.chars());
142+
exit(1);
143+
}
144+
return fh;
145+
}
146+
147+
FILE* rc_frenopen(const char* fname) {
148+
//if (strcmp(fname,"-")==0) return stdout;
149+
GStr fpath(ballgown_dir);
150+
//fpath += '.';
151+
fpath += fname;
152+
//fpath += "/";
153+
//fpath += fname;
154+
fpath += ".ctab";
155+
GStr fren(fpath);
156+
fren += ".tmp";
157+
//rename fpath to fren and open fren for reading
158+
if (rename(fpath.chars(), fren.chars())!=0) {
159+
GError("Error: cannot rename %s to %s!\n", fpath.chars(), fren.chars());
160+
}
161+
FILE* fh=fopen(fren.chars(), "r");
162+
if (fh==NULL) {
163+
GError("Error: cannot open file %s\n", fren.chars());
164+
}
165+
return fh;
166+
}
167+
168+
void rc_frendel(const char* fname) {
169+
GStr fpath(ballgown_dir);
170+
//fpath += '.';
171+
fpath += fname;
172+
fpath += ".ctab";
173+
fpath += ".tmp";
174+
if (remove(fpath.chars())!=0) {
175+
GMessage("Warning: could not remove file %s!\n",fpath.chars());
176+
}
177+
}
178+
179+
void rc_write_f2t(FILE* fh, map<uint, set<uint> >& f2t) {
180+
for (map<uint, set<uint> >::iterator m=f2t.begin(); m!=f2t.end(); ++m) {
181+
uint f_id=(*m).first;
182+
set<uint>& tset = (*m).second;
183+
for (set<uint>::iterator it=tset.begin();it!=tset.end();++it) {
184+
uint t_id = *it;
185+
fprintf(fh, "%u\t%u\n", f_id, t_id);
186+
}
187+
}
188+
fflush(fh);
189+
}
190+
191+
192+
void rc_update_exons(RC_BundleData& rc) {
193+
//update stdev etc. for all exons in bundle
194+
for (int f=0;f<rc.exons.Count(); ++f) {
195+
RC_Feature& exon = *(rc.exons[f]);
196+
//assert( exon.l >= rc.lmin );
197+
int L=exon.l-rc.lmin;
198+
int xlen=exon.r-exon.l+1;
199+
if (exon.l < rc.lmin) {
200+
//shouldn't be here, bundle read-counting boundaries should be based on exons!
201+
if (exon.r<rc.lmin) continue;
202+
xlen-=(rc.lmin-exon.l+1);
203+
L=0;
204+
}
205+
if (rc.rmax<exon.r) {
206+
if (exon.l>rc.rmax) continue; //should never happen
207+
xlen-=(exon.r-rc.rmax+1);
208+
}
209+
int R=L+xlen;
210+
vector<int>::iterator xcov_begin;
211+
vector<int>::iterator xcov_end;
212+
vector<float>::iterator xmcov_begin;
213+
vector<float>::iterator xmcov_end;
214+
if (exon.strand=='+' || exon.strand=='.') {
215+
xcov_begin = rc.f_cov.begin()+L;
216+
xcov_end = rc.f_cov.begin()+R;
217+
xmcov_begin = rc.f_mcov.begin()+L;
218+
xmcov_end = rc.f_mcov.begin()+R;
219+
} else {
220+
xcov_begin = rc.r_cov.begin()+L;
221+
xcov_end = rc.r_cov.begin()+R;
222+
xmcov_begin = rc.r_mcov.begin()+L;
223+
xmcov_end = rc.r_mcov.begin()+R;
224+
}
225+
226+
double avg = (double)accumulate(xcov_begin, xcov_end, 0) / xlen;
227+
vector<double> diff(xlen);
228+
transform(xcov_begin, xcov_end, diff.begin(),
229+
bind2nd( minus<double>(), avg));
230+
double sq_sum = inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
231+
double stdev = sqrt(sq_sum / xlen);
232+
233+
double mavg = (double)accumulate(xmcov_begin, xmcov_end, 0) / xlen;
234+
vector<double> mdiff(xlen);
235+
transform(xmcov_begin, xmcov_end, mdiff.begin(),
236+
bind2nd( minus<double>(), mavg));
237+
sq_sum = inner_product(mdiff.begin(), mdiff.end(), mdiff.begin(), 0.0);
238+
double mstdev = sqrt(sq_sum / xlen);
239+
exon.avg=avg;
240+
exon.stdev=stdev;
241+
exon.mavg=mavg;
242+
exon.mstdev=mstdev;
243+
} //for each exon in bundle
244+
245+
246+
}
247+
248+
249+
void rc_write_RCfeature( GPVec<RC_ScaffData>& rcdata, GPVec<RC_Feature>& features, FILE*& fdata, FILE*& f2t,
250+
bool is_exon=false) {
251+
for (int i=0;i<features.Count();++i) {
252+
RC_Feature& f=*(features[i]);
253+
const char* ref_name=rcdata[f.t_id-1]->scaff->getGSeqName();
254+
if (is_exon) {
255+
fprintf(fdata, "%u\t%s\t%c\t%d\t%d\t%d\t%d\t%.2f\t%.4f\t%.4f\t%.4f\t%.4f\n",
256+
f.id, ref_name, f.strand, f.l, f.r, f.rcount,
257+
f.ucount, f.mrcount, f.avg, f.stdev, f.mavg, f.mstdev);
258+
}
259+
else { //introns
260+
fprintf(fdata,"%u\t%s\t%c\t%d\t%d\t%d\t%d\t%.2f\n",f.id, ref_name,
261+
f.strand, f.l, f.r, f.rcount, f.ucount, f.mrcount);
262+
}
263+
// f2t -------
264+
fprintf(f2t, "%u\t%u\n", f.id, f.t_id);
265+
} //for each feature
266+
fclose(fdata);
267+
fclose(f2t);
268+
}
269+
270+
271+
/*void rc_write_counts(const char* refname, BundleData& bundle) {
272+
RC_BundleData& rc = *bundle.rc_data;
273+
//if (rc.exons.size()==0) return;
274+
*
275+
*/
276+
void rc_writeRC(GPVec<RC_ScaffData>& RC_data,
277+
GPVec<RC_Feature>& RC_exons,
278+
GPVec<RC_Feature>& RC_introns,
279+
FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata,
280+
FILE* &f_e2t, FILE* &f_i2t) {
281+
282+
for (int t=0;t<RC_data.Count();++t) {
283+
//File: t_data.ctab
284+
//t_id tname chr strand start end num_exons gene_id gene_name cufflinks_cov cufflinks_fpkm
285+
const RC_ScaffData& sd=*RC_data[t];
286+
const char* refname = sd.scaff->getGSeqName();
287+
const char* genename= sd.scaff->getGeneName();
288+
if (genename==NULL) genename=".";
289+
fprintf(f_tdata, "%u\t%s\t%c\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%f\t%f\n",
290+
sd.t_id, refname, sd.strand, sd.l, sd.r, sd.scaff->getID(),
291+
sd.num_exons, sd.eff_len, sd.scaff->getGeneID(),
292+
genename, sd.cov, sd.fpkm);
293+
}//for each transcript
294+
//fflush(f_tdata);
295+
fclose(f_tdata);
296+
297+
//File: e_data.ctab
298+
//e_id chr gstart gend rcount ucount mrcount
299+
rc_write_RCfeature(RC_data, RC_exons, f_edata, f_e2t, true);
300+
//File: i_data.ctab
301+
//i_id chr gstart gend rcount ucount mrcount
302+
rc_write_RCfeature(RC_data, RC_introns, f_idata, f_i2t);
303+
304+
// feature-to-transcript link files
305+
//rc_write_f2t(rc.fe2t, rc.e2t);
306+
//rc_write_f2t(rc.fi2t, rc.i2t);
307+
}
308+
309+
void RC_ScaffData::addFeature(int fl, int fr, GPVec<RC_Feature>& fvec,
310+
uint& f_id, set<RC_ScaffSeg>& fset, set<RC_ScaffSeg>::iterator& fit,
311+
GPVec<RC_Feature>& fdata) {
312+
//f_id is the largest f_id inserted so far in fset
313+
bool add_new = true;
314+
RC_ScaffSeg newseg(fl,fr,this->strand);
315+
//RC_Feature* newfeature=NULL;
316+
if (fset.size()>0) {
317+
if (fit == fset.end()) --fit;
318+
if (newseg < (*fit)) {
319+
bool eq=false;
320+
while (newseg < (*fit) || (eq = (newseg==(*fit)))) {
321+
if (eq) {
322+
add_new = false;
323+
newseg.id = fit->id;
324+
break;
325+
}
326+
if (fit==fset.begin()) {
327+
break;
328+
}
329+
--fit;
330+
}
331+
}
332+
else { //newseg >= *fit
333+
bool eq=false;
334+
while ((*fit) < newseg || (eq = (newseg==(*fit)))) {
335+
if (eq) {
336+
add_new = false;
337+
newseg.id = fit->id;
338+
break;
339+
}
340+
++fit;
341+
if (fit==fset.end()) {
342+
--fit;
343+
break;
344+
}
345+
}
346+
}
347+
}
348+
if (add_new) {
349+
newseg.id = ++f_id;
350+
pair< set<RC_ScaffSeg>::iterator, bool> ret = fset.insert(newseg);
351+
//ret.second = was_inserted (indeed new)
352+
if (!ret.second) {
353+
GError("Error: feature %d-%d (%c) already in segment set!\n",
354+
newseg.l, newseg.r, newseg.strand);
355+
//newseg.id = ret.first->id;
356+
}
357+
fdata.Add(new RC_Feature(newseg, this->t_id));
358+
#ifdef DEBUG
359+
if (fdata.Count()!=(int)f_id) {
360+
GMessage("Error: fdata.Count=%d, f_id=%d\n", fdata.Count(), f_id);
361+
}
362+
#endif
363+
GASSERT((uint)fdata.Count()==f_id);
364+
}
365+
//fvec.push_back(newseg);
366+
GASSERT(fdata[newseg.id-1]->id==newseg.id);
367+
fvec.Add(fdata[newseg.id-1]);
368+
}
369+
370+
void Ballgown_setupFiles(FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata,
371+
FILE* &f_e2t, FILE* &f_i2t) {
372+
if (f_tdata == NULL) {
373+
//first call, create the files
374+
f_tdata = rc_fwopen("t_data");
375+
fprintf(f_tdata, "t_id\tchr\tstrand\tstart\tend\tt_name\tnum_exons\tlength\tgene_id\tgene_name\tcov\tFPKM\n");
376+
f_edata = rc_fwopen("e_data");
377+
fprintf(f_edata, "e_id\tchr\tstrand\tstart\tend\trcount\tucount\tmrcount\tcov\tcov_sd\tmcov\tmcov_sd\n");
378+
f_idata = rc_fwopen("i_data");
379+
fprintf(f_idata, "i_id\tchr\tstrand\tstart\tend\trcount\tucount\tmrcount\n");
380+
f_e2t = rc_fwopen("e2t");
381+
fprintf(f_e2t, "e_id\tt_id\n");
382+
f_i2t = rc_fwopen("i2t");
383+
fprintf(f_i2t, "i_id\tt_id\n");
384+
}
385+
}
386+
387+
void RC_ScaffData::rc_addFeatures(uint& c_e_id, set<RC_ScaffSeg>& fexons, GPVec<RC_Feature>& edata,
388+
uint& c_i_id, set<RC_ScaffSeg>& fintrons, GPVec<RC_Feature>& idata) {
389+
GASSERT(scaff);
390+
GffObj& m = *(scaff);
391+
for (int i = 0; i < m.exons.Count(); ++i) {
392+
set<RC_ScaffSeg>::iterator eit=fexons.end();
393+
set<RC_ScaffSeg>::iterator iit=fintrons.end();
394+
addFeature(m.exons[i]->start, m.exons[i]->end, t_exons, c_e_id, fexons, eit, edata);
395+
if (i>0) { //store intron
396+
addFeature(m.exons[i-1]->end+1, m.exons[i]->start-1, t_introns, c_i_id, fintrons, iit, idata);
397+
}
398+
} //for each exon
399+
}

0 commit comments

Comments
 (0)