|
| 1 | +//#include "tablemaker.h" |
| 2 | +#include "rlink.h" |
| 3 | +#include <numeric> |
| 4 | + |
| 5 | +extern GStr ballgown_dir; |
| 6 | + |
| 7 | +int rc_cov_inc(int i) { |
| 8 | + return ++i; |
| 9 | +} |
| 10 | + |
| 11 | +/* |
| 12 | +void rc_update_tdata(BundleData& bundle, GffObj& scaff, |
| 13 | + double cov, double fpkm) { |
| 14 | + if (bundle.rc_data==NULL) return; |
| 15 | + RC_BundleData& rc = *(bundle.rc_data); |
| 16 | + if (rc.exons.size()==0) return; |
| 17 | + RC_ScaffData& q = (RC_ScaffData*)scaff.uptr; |
| 18 | + set<RC_ScaffData>::iterator tdata = rc.tdata.find(q); |
| 19 | + if (tdata==rc.tdata.end()) { |
| 20 | + fprintf(stderr, "Error: cannot locate bundle ref. transcript %s (%s:%d-%d)!\n", |
| 21 | + scaff.getID(), scaff.getGSeqName(), scaff.start, scaff.end); |
| 22 | + return; |
| 23 | + } |
| 24 | + (*tdata).cov=cov; |
| 25 | + (*tdata).fpkm=fpkm; |
| 26 | +} |
| 27 | +*/ |
| 28 | +void BundleData::rc_store_t(GffObj* t) { |
| 29 | + //if (!rc_stage) return; |
| 30 | + if (rc_data==NULL) { |
| 31 | + rc_init(t); |
| 32 | + } |
| 33 | + rc_data->addTranscript(*t); |
| 34 | + //check this read alignment against ref exons and introns |
| 35 | +} |
| 36 | + |
| 37 | +struct COvlSorter { |
| 38 | + bool operator() (pair<int, const RC_Feature*> i, |
| 39 | + pair<int, const RC_Feature*> j) { |
| 40 | + return (i.first>j.first); //sort in decreasing order of overlap length |
| 41 | + } |
| 42 | +} OvlSorter; |
| 43 | + |
| 44 | + |
| 45 | +void rc_updateExonCounts(const RC_Feature* exon, int nh) { |
| 46 | + exon->rcount++; |
| 47 | + exon->mrcount += (nh > 1) ? (1.0/nh) : 1; |
| 48 | + if (nh<=1) exon->ucount++; |
| 49 | +} |
| 50 | + |
| 51 | +bool BundleData::rc_count_hit(GBamRecord& brec, char strand, int nh) { //, int hi) { |
| 52 | + if (rc_data==NULL) return false; //no ref transcripts available for this reads' region |
| 53 | + if (rc_data->tdata.Count()==0) return false; //nothing to do without transcripts |
| 54 | + |
| 55 | + //check this read alignment against ref exons and introns |
| 56 | + /* |
| 57 | + int gstart=brec.start; //alignment start position on the genome |
| 58 | +
|
| 59 | + We NEVER update read-counting boundaries of the bundle based on reads - they should only be based on reference transcripts |
| 60 | + if (rc_data->f_cov.size()==0 && gstart<rc_data->lmin) { |
| 61 | + fprintf(stderr, "Warning: adjusting lmin coverage bundle from %d to %d !\n", int(rc_data->lmin), (int)gstart); |
| 62 | + rc_data->lmin=gstart; |
| 63 | + } |
| 64 | + */ |
| 65 | + if ((int)brec.end<rc_data->lmin || (int)brec.start>rc_data->rmax) { |
| 66 | + return false; //hit outside coverage area |
| 67 | + } |
| 68 | + /* |
| 69 | + int gpos=brec.start; //current genomic position |
| 70 | + int rlen=0; //read length, obtained here from the cigar string |
| 71 | + int segstart=gstart; |
| 72 | + */ |
| 73 | + vector<RC_Seg> rsegs; |
| 74 | + vector<RC_Seg> rintrons; |
| 75 | + for (int i=0;i<brec.exons.Count();i++) { |
| 76 | + rc_data->updateCov(strand, nh, brec.exons[i].start, brec.exons[i].len()); |
| 77 | + rsegs.push_back(RC_Seg(brec.exons[i].start, brec.exons[i].end) ); |
| 78 | + if (i>0) { |
| 79 | + //add intron |
| 80 | + rintrons.push_back(RC_Seg(brec.exons[i-1].end+1, brec.exons[i].start-1)); |
| 81 | + } |
| 82 | + } |
| 83 | + //now check rexons and rintrons with findExons() and findIntron() |
| 84 | + for (size_t i=0;i<rintrons.size();++i) { |
| 85 | + /*RC_FeatIt ri=rc_data->findIntron(rintrons[i].l, rintrons[i].r, strand); |
| 86 | + if (ri!=rc_data->introns.end()) { |
| 87 | + (*ri).rcount++; |
| 88 | + (*ri).mrcount += (nh > 1) ? (1.0/nh) : 1; |
| 89 | + if (nh==1) (*ri).ucount++; |
| 90 | + } |
| 91 | + */ |
| 92 | + RC_Feature* ri=rc_data->findIntron(rintrons[i].l, rintrons[i].r, strand); |
| 93 | + if (ri) { |
| 94 | + ri->rcount++; |
| 95 | + ri->mrcount += (nh > 1) ? (1.0/nh) : 1; |
| 96 | + if (nh==1) ri->ucount++; |
| 97 | + } |
| 98 | + } //for each intron |
| 99 | + |
| 100 | + for (size_t i=0;i<rsegs.size();++i) { |
| 101 | + RC_FeatPtrSet ovlex=rc_data->findExons(rsegs[i].l, rsegs[i].r, strand); |
| 102 | + if (ovlex.size()==0) continue; |
| 103 | + if (ovlex.size()>1) { |
| 104 | + vector< pair<int, const RC_Feature*> > xovl; //overlapped exons sorted by decreasing overlap length |
| 105 | + for (RC_FeatPtrSet::iterator ox=ovlex.begin();ox != ovlex.end(); ++ox) { |
| 106 | + int ovlen=(*ox)->ovlen(rsegs[i].l, rsegs[i].r); |
| 107 | + if (ovlen>=5) |
| 108 | + xovl.push_back(pair<int, const RC_Feature*>(ovlen, *ox)); |
| 109 | + } |
| 110 | + if (xovl.size()>1) { |
| 111 | + sort(xovl.begin(), xovl.end(), OvlSorter); //larger overlaps first |
| 112 | + //update the counts only for ref exons with max overlap to this segment |
| 113 | + int max_ovl=xovl.begin()->first; |
| 114 | + for (vector<pair<int, const RC_Feature*> >::iterator xo=xovl.begin();xo!=xovl.end();++xo) { |
| 115 | + if (max_ovl - xo->first > 5 ) break; //more than +5 bases coverage for the other exons |
| 116 | + rc_updateExonCounts(xo->second, nh); |
| 117 | + } |
| 118 | + } else if (xovl.size() == 1) { |
| 119 | + rc_updateExonCounts(xovl.begin()->second, nh); |
| 120 | + } |
| 121 | + } else { |
| 122 | + // 1 exon overlap only |
| 123 | + int ovlen=(*ovlex.begin())->ovlen(rsegs[i].l, rsegs[i].r); |
| 124 | + if (ovlen>=5) rc_updateExonCounts(*ovlex.begin(), nh); |
| 125 | + } |
| 126 | + } //for each read "exon" |
| 127 | + return true; |
| 128 | +} |
| 129 | + |
| 130 | +FILE* rc_fwopen(const char* fname) { |
| 131 | + if (strcmp(fname,"-")==0) return stdout; |
| 132 | + GStr fpath(ballgown_dir); //always ends with '/' |
| 133 | + //fpath += '.'; |
| 134 | + fpath += fname; |
| 135 | + //fpath += "/"; |
| 136 | + //fpath += fname; |
| 137 | + fpath += ".ctab"; |
| 138 | + FILE* fh=fopen(fpath.chars(), "w"); |
| 139 | + if (fh==NULL) { |
| 140 | + fprintf(stderr, "Error: cannot create file %s\n", |
| 141 | + fpath.chars()); |
| 142 | + exit(1); |
| 143 | + } |
| 144 | + return fh; |
| 145 | +} |
| 146 | + |
| 147 | +FILE* rc_frenopen(const char* fname) { |
| 148 | + //if (strcmp(fname,"-")==0) return stdout; |
| 149 | + GStr fpath(ballgown_dir); |
| 150 | + //fpath += '.'; |
| 151 | + fpath += fname; |
| 152 | + //fpath += "/"; |
| 153 | + //fpath += fname; |
| 154 | + fpath += ".ctab"; |
| 155 | + GStr fren(fpath); |
| 156 | + fren += ".tmp"; |
| 157 | + //rename fpath to fren and open fren for reading |
| 158 | + if (rename(fpath.chars(), fren.chars())!=0) { |
| 159 | + GError("Error: cannot rename %s to %s!\n", fpath.chars(), fren.chars()); |
| 160 | + } |
| 161 | + FILE* fh=fopen(fren.chars(), "r"); |
| 162 | + if (fh==NULL) { |
| 163 | + GError("Error: cannot open file %s\n", fren.chars()); |
| 164 | + } |
| 165 | + return fh; |
| 166 | +} |
| 167 | + |
| 168 | +void rc_frendel(const char* fname) { |
| 169 | + GStr fpath(ballgown_dir); |
| 170 | + //fpath += '.'; |
| 171 | + fpath += fname; |
| 172 | + fpath += ".ctab"; |
| 173 | + fpath += ".tmp"; |
| 174 | + if (remove(fpath.chars())!=0) { |
| 175 | + GMessage("Warning: could not remove file %s!\n",fpath.chars()); |
| 176 | + } |
| 177 | +} |
| 178 | + |
| 179 | +void rc_write_f2t(FILE* fh, map<uint, set<uint> >& f2t) { |
| 180 | + for (map<uint, set<uint> >::iterator m=f2t.begin(); m!=f2t.end(); ++m) { |
| 181 | + uint f_id=(*m).first; |
| 182 | + set<uint>& tset = (*m).second; |
| 183 | + for (set<uint>::iterator it=tset.begin();it!=tset.end();++it) { |
| 184 | + uint t_id = *it; |
| 185 | + fprintf(fh, "%u\t%u\n", f_id, t_id); |
| 186 | + } |
| 187 | + } |
| 188 | + fflush(fh); |
| 189 | +} |
| 190 | + |
| 191 | + |
| 192 | +void rc_update_exons(RC_BundleData& rc) { |
| 193 | + //update stdev etc. for all exons in bundle |
| 194 | + for (int f=0;f<rc.exons.Count(); ++f) { |
| 195 | + RC_Feature& exon = *(rc.exons[f]); |
| 196 | + //assert( exon.l >= rc.lmin ); |
| 197 | + int L=exon.l-rc.lmin; |
| 198 | + int xlen=exon.r-exon.l+1; |
| 199 | + if (exon.l < rc.lmin) { |
| 200 | + //shouldn't be here, bundle read-counting boundaries should be based on exons! |
| 201 | + if (exon.r<rc.lmin) continue; |
| 202 | + xlen-=(rc.lmin-exon.l+1); |
| 203 | + L=0; |
| 204 | + } |
| 205 | + if (rc.rmax<exon.r) { |
| 206 | + if (exon.l>rc.rmax) continue; //should never happen |
| 207 | + xlen-=(exon.r-rc.rmax+1); |
| 208 | + } |
| 209 | + int R=L+xlen; |
| 210 | + vector<int>::iterator xcov_begin; |
| 211 | + vector<int>::iterator xcov_end; |
| 212 | + vector<float>::iterator xmcov_begin; |
| 213 | + vector<float>::iterator xmcov_end; |
| 214 | + if (exon.strand=='+' || exon.strand=='.') { |
| 215 | + xcov_begin = rc.f_cov.begin()+L; |
| 216 | + xcov_end = rc.f_cov.begin()+R; |
| 217 | + xmcov_begin = rc.f_mcov.begin()+L; |
| 218 | + xmcov_end = rc.f_mcov.begin()+R; |
| 219 | + } else { |
| 220 | + xcov_begin = rc.r_cov.begin()+L; |
| 221 | + xcov_end = rc.r_cov.begin()+R; |
| 222 | + xmcov_begin = rc.r_mcov.begin()+L; |
| 223 | + xmcov_end = rc.r_mcov.begin()+R; |
| 224 | + } |
| 225 | + |
| 226 | + double avg = (double)accumulate(xcov_begin, xcov_end, 0) / xlen; |
| 227 | + vector<double> diff(xlen); |
| 228 | + transform(xcov_begin, xcov_end, diff.begin(), |
| 229 | + bind2nd( minus<double>(), avg)); |
| 230 | + double sq_sum = inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); |
| 231 | + double stdev = sqrt(sq_sum / xlen); |
| 232 | + |
| 233 | + double mavg = (double)accumulate(xmcov_begin, xmcov_end, 0) / xlen; |
| 234 | + vector<double> mdiff(xlen); |
| 235 | + transform(xmcov_begin, xmcov_end, mdiff.begin(), |
| 236 | + bind2nd( minus<double>(), mavg)); |
| 237 | + sq_sum = inner_product(mdiff.begin(), mdiff.end(), mdiff.begin(), 0.0); |
| 238 | + double mstdev = sqrt(sq_sum / xlen); |
| 239 | + exon.avg=avg; |
| 240 | + exon.stdev=stdev; |
| 241 | + exon.mavg=mavg; |
| 242 | + exon.mstdev=mstdev; |
| 243 | + } //for each exon in bundle |
| 244 | + |
| 245 | + |
| 246 | +} |
| 247 | + |
| 248 | + |
| 249 | +void rc_write_RCfeature( GPVec<RC_ScaffData>& rcdata, GPVec<RC_Feature>& features, FILE*& fdata, FILE*& f2t, |
| 250 | + bool is_exon=false) { |
| 251 | + for (int i=0;i<features.Count();++i) { |
| 252 | + RC_Feature& f=*(features[i]); |
| 253 | + const char* ref_name=rcdata[f.t_id-1]->scaff->getGSeqName(); |
| 254 | + if (is_exon) { |
| 255 | + fprintf(fdata, "%u\t%s\t%c\t%d\t%d\t%d\t%d\t%.2f\t%.4f\t%.4f\t%.4f\t%.4f\n", |
| 256 | + f.id, ref_name, f.strand, f.l, f.r, f.rcount, |
| 257 | + f.ucount, f.mrcount, f.avg, f.stdev, f.mavg, f.mstdev); |
| 258 | + } |
| 259 | + else { //introns |
| 260 | + fprintf(fdata,"%u\t%s\t%c\t%d\t%d\t%d\t%d\t%.2f\n",f.id, ref_name, |
| 261 | + f.strand, f.l, f.r, f.rcount, f.ucount, f.mrcount); |
| 262 | + } |
| 263 | + // f2t ------- |
| 264 | + fprintf(f2t, "%u\t%u\n", f.id, f.t_id); |
| 265 | + } //for each feature |
| 266 | + fclose(fdata); |
| 267 | + fclose(f2t); |
| 268 | +} |
| 269 | + |
| 270 | + |
| 271 | +/*void rc_write_counts(const char* refname, BundleData& bundle) { |
| 272 | + RC_BundleData& rc = *bundle.rc_data; |
| 273 | + //if (rc.exons.size()==0) return; |
| 274 | + * |
| 275 | +*/ |
| 276 | +void rc_writeRC(GPVec<RC_ScaffData>& RC_data, |
| 277 | + GPVec<RC_Feature>& RC_exons, |
| 278 | + GPVec<RC_Feature>& RC_introns, |
| 279 | + FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata, |
| 280 | + FILE* &f_e2t, FILE* &f_i2t) { |
| 281 | + |
| 282 | + for (int t=0;t<RC_data.Count();++t) { |
| 283 | + //File: t_data.ctab |
| 284 | + //t_id tname chr strand start end num_exons gene_id gene_name cufflinks_cov cufflinks_fpkm |
| 285 | + const RC_ScaffData& sd=*RC_data[t]; |
| 286 | + const char* refname = sd.scaff->getGSeqName(); |
| 287 | + const char* genename= sd.scaff->getGeneName(); |
| 288 | + if (genename==NULL) genename="."; |
| 289 | + fprintf(f_tdata, "%u\t%s\t%c\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%f\t%f\n", |
| 290 | + sd.t_id, refname, sd.strand, sd.l, sd.r, sd.scaff->getID(), |
| 291 | + sd.num_exons, sd.eff_len, sd.scaff->getGeneID(), |
| 292 | + genename, sd.cov, sd.fpkm); |
| 293 | + }//for each transcript |
| 294 | + //fflush(f_tdata); |
| 295 | + fclose(f_tdata); |
| 296 | + |
| 297 | + //File: e_data.ctab |
| 298 | + //e_id chr gstart gend rcount ucount mrcount |
| 299 | + rc_write_RCfeature(RC_data, RC_exons, f_edata, f_e2t, true); |
| 300 | + //File: i_data.ctab |
| 301 | + //i_id chr gstart gend rcount ucount mrcount |
| 302 | + rc_write_RCfeature(RC_data, RC_introns, f_idata, f_i2t); |
| 303 | + |
| 304 | + // feature-to-transcript link files |
| 305 | + //rc_write_f2t(rc.fe2t, rc.e2t); |
| 306 | + //rc_write_f2t(rc.fi2t, rc.i2t); |
| 307 | +} |
| 308 | + |
| 309 | +void RC_ScaffData::addFeature(int fl, int fr, GPVec<RC_Feature>& fvec, |
| 310 | + uint& f_id, set<RC_ScaffSeg>& fset, set<RC_ScaffSeg>::iterator& fit, |
| 311 | + GPVec<RC_Feature>& fdata) { |
| 312 | + //f_id is the largest f_id inserted so far in fset |
| 313 | + bool add_new = true; |
| 314 | + RC_ScaffSeg newseg(fl,fr,this->strand); |
| 315 | + //RC_Feature* newfeature=NULL; |
| 316 | + if (fset.size()>0) { |
| 317 | + if (fit == fset.end()) --fit; |
| 318 | + if (newseg < (*fit)) { |
| 319 | + bool eq=false; |
| 320 | + while (newseg < (*fit) || (eq = (newseg==(*fit)))) { |
| 321 | + if (eq) { |
| 322 | + add_new = false; |
| 323 | + newseg.id = fit->id; |
| 324 | + break; |
| 325 | + } |
| 326 | + if (fit==fset.begin()) { |
| 327 | + break; |
| 328 | + } |
| 329 | + --fit; |
| 330 | + } |
| 331 | + } |
| 332 | + else { //newseg >= *fit |
| 333 | + bool eq=false; |
| 334 | + while ((*fit) < newseg || (eq = (newseg==(*fit)))) { |
| 335 | + if (eq) { |
| 336 | + add_new = false; |
| 337 | + newseg.id = fit->id; |
| 338 | + break; |
| 339 | + } |
| 340 | + ++fit; |
| 341 | + if (fit==fset.end()) { |
| 342 | + --fit; |
| 343 | + break; |
| 344 | + } |
| 345 | + } |
| 346 | + } |
| 347 | + } |
| 348 | + if (add_new) { |
| 349 | + newseg.id = ++f_id; |
| 350 | + pair< set<RC_ScaffSeg>::iterator, bool> ret = fset.insert(newseg); |
| 351 | + //ret.second = was_inserted (indeed new) |
| 352 | + if (!ret.second) { |
| 353 | + GError("Error: feature %d-%d (%c) already in segment set!\n", |
| 354 | + newseg.l, newseg.r, newseg.strand); |
| 355 | + //newseg.id = ret.first->id; |
| 356 | + } |
| 357 | + fdata.Add(new RC_Feature(newseg, this->t_id)); |
| 358 | +#ifdef DEBUG |
| 359 | + if (fdata.Count()!=(int)f_id) { |
| 360 | + GMessage("Error: fdata.Count=%d, f_id=%d\n", fdata.Count(), f_id); |
| 361 | + } |
| 362 | +#endif |
| 363 | + GASSERT((uint)fdata.Count()==f_id); |
| 364 | + } |
| 365 | + //fvec.push_back(newseg); |
| 366 | + GASSERT(fdata[newseg.id-1]->id==newseg.id); |
| 367 | + fvec.Add(fdata[newseg.id-1]); |
| 368 | +} |
| 369 | + |
| 370 | +void Ballgown_setupFiles(FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata, |
| 371 | + FILE* &f_e2t, FILE* &f_i2t) { |
| 372 | + if (f_tdata == NULL) { |
| 373 | + //first call, create the files |
| 374 | + f_tdata = rc_fwopen("t_data"); |
| 375 | + fprintf(f_tdata, "t_id\tchr\tstrand\tstart\tend\tt_name\tnum_exons\tlength\tgene_id\tgene_name\tcov\tFPKM\n"); |
| 376 | + f_edata = rc_fwopen("e_data"); |
| 377 | + fprintf(f_edata, "e_id\tchr\tstrand\tstart\tend\trcount\tucount\tmrcount\tcov\tcov_sd\tmcov\tmcov_sd\n"); |
| 378 | + f_idata = rc_fwopen("i_data"); |
| 379 | + fprintf(f_idata, "i_id\tchr\tstrand\tstart\tend\trcount\tucount\tmrcount\n"); |
| 380 | + f_e2t = rc_fwopen("e2t"); |
| 381 | + fprintf(f_e2t, "e_id\tt_id\n"); |
| 382 | + f_i2t = rc_fwopen("i2t"); |
| 383 | + fprintf(f_i2t, "i_id\tt_id\n"); |
| 384 | + } |
| 385 | +} |
| 386 | + |
| 387 | +void RC_ScaffData::rc_addFeatures(uint& c_e_id, set<RC_ScaffSeg>& fexons, GPVec<RC_Feature>& edata, |
| 388 | + uint& c_i_id, set<RC_ScaffSeg>& fintrons, GPVec<RC_Feature>& idata) { |
| 389 | + GASSERT(scaff); |
| 390 | + GffObj& m = *(scaff); |
| 391 | + for (int i = 0; i < m.exons.Count(); ++i) { |
| 392 | + set<RC_ScaffSeg>::iterator eit=fexons.end(); |
| 393 | + set<RC_ScaffSeg>::iterator iit=fintrons.end(); |
| 394 | + addFeature(m.exons[i]->start, m.exons[i]->end, t_exons, c_e_id, fexons, eit, edata); |
| 395 | + if (i>0) { //store intron |
| 396 | + addFeature(m.exons[i-1]->end+1, m.exons[i]->start-1, t_introns, c_i_id, fintrons, iit, idata); |
| 397 | + } |
| 398 | + } //for each exon |
| 399 | +} |
0 commit comments