@@ -259,8 +259,11 @@ void processRead(int currentstart, int currentend, BundleData& bdata,
259259 bdata.end=currentend;
260260 }
261261
262- float rdcount=float(1)/nh;
263- if(nomulti) rdcount=1;
262+ float rdcount=1;
263+ float unitig_cov=brec.tag_float("Yk");
264+ if(unitig_cov) rdcount=unitig_cov;
265+
266+ if(!nomulti) rdcount/=nh;
264267 readlist[n]->read_count+=rdcount; // increase single count just in case I don't see the pair
265268
266269 // store the mismatch count per junction so that I can eliminate it later
@@ -1021,11 +1024,12 @@ float compute_cost(GVec<float>& wincov,float sumleft,float sumright,int leftstar
10211024
10221025 cost=basecost/len-leftcost/leftlen-rightcost/rightlen;
10231026
1027+ /*
10241028 fprintf(stderr,"leftlen=%d leftsum=%f leftcost=%f muleft=%f rightlen=%d righttsum=%f rightcost=%f muright=%f baselen=%d basesum=%f basecost=%f mubase=%f\n",
10251029 leftlen,sumleft,leftcost/leftlen,muleft,
10261030 rightlen,sumright,rightcost/rightlen,muright,
10271031 len,sumleft+sumright,basecost/len,mubase);
1028-
1032+ */
10291033 return(cost);
10301034}
10311035
@@ -1640,6 +1644,8 @@ CGraphnode *source2guide(int s, int g, int refstart,uint newstart,uint newend, C
16401644 GVec<float>* bpcov,GVec<float>& futuretr, int& graphno,CBundlenode *bundlenode,GVec<CGraphinfo> **bundle2graph,
16411645 GPVec<CGraphnode> **no2gnode, int &edgeno) {
16421646
1647+ //fprintf(stderr,"source2guide for newstart=%d and newend=%d\n",newstart,newend);
1648+
16431649 // compute maxabund
16441650 float leftcov=0;
16451651 float rightcov=0;
@@ -1681,6 +1687,8 @@ CGraphnode *guide2sink(int s, int g, int refstart,uint newstart,uint newend, CGr
16811687 GVec<float>* bpcov,GVec<float>& futuretr, int& graphno,CBundlenode *bundlenode,GVec<CGraphinfo> **bundle2graph,
16821688 GPVec<CGraphnode> **no2gnode, int &edgeno) {
16831689
1690+ //fprintf(stderr,"guide2sink for newstart=%d and newend=%d\n",newstart,newend);
1691+
16841692 // compute maxabund
16851693 float leftcov=0;
16861694 float rightcov=0;
@@ -1939,7 +1947,6 @@ GBitVec traverse_dfs(int s,int g,CGraphnode *node,CGraphnode *sink,GBitVec paren
19391947 }
19401948 else {
19411949 gpos[s][g].Add(key,lastgpos);
1942- //fprintf(stderr,"s=%d g=%d key=%d lastgpos=%d add edge between %d and %d child\n",s,g,key,lastgpos,min, max);
19431950 childparents[lastgpos]=1;
19441951 node->childpat[lastgpos]=1;
19451952 lastgpos++;
@@ -2514,9 +2521,11 @@ CTreePat *construct_treepat(int gno, GIntHash<int>& gpos,GPVec<CTransfrag>& tran
25142521void print_pattern(CTreePat *tree,GStr& pattern,int gno) {
25152522 if(!tree) return;
25162523 pattern+=tree->nodeno;
2524+
25172525 if(tree->tr) {
25182526 fprintf(stderr,"pat: %s %f\n",pattern.chars(),tree->tr->abundance);
25192527 }
2528+
25202529 for(int i=0;i<tree->childno;i++)
25212530 if(tree->nextpat[i]) {
25222531 GStr tmppat(pattern.chars());
@@ -3442,7 +3451,7 @@ CPrediction* store_merge_prediction(GVec<int>& alltr,GPVec<CMTransfrag>& mgt,GVe
34423451 for(int i=0;i<alltr.Count();i++) {
34433452 int a=alltr[i];
34443453 cov+=mgt[alltr[i]]->transfrag->abundance;
3445- fprintf(stderr,"Add transfrag %d(%d %d) with cov=%f\n",alltr[i],mgt[alltr[i]]->nf,mgt[alltr[i]]->nl-mgt[alltr[i]]->transfrag->nodes.Count()+1,mgt[alltr[i]]->transfrag->abundance);
3454+ // fprintf(stderr,"Add transfrag %d(%d %d) with cov=%f\n",alltr[i],mgt[alltr[i]]->nf,mgt[alltr[i]]->nl-mgt[alltr[i]]->transfrag->nodes.Count()+1,mgt[alltr[i]]->transfrag->abundance);
34463455 mgt[alltr[i]]->transfrag->abundance=0;
34473456 if(enableNames || (alltr.Count()==1 && mgt[alltr[0]]->transfrag->real)) for(int j=0;j<mgt[a]->read.Count();j++) {
34483457 int r=mgt[a]->read[j];
@@ -7050,9 +7059,11 @@ float max_flow(int gno,GVec<int>& path,GBitVec& istranscript,GPVec<CTransfrag>&
70507059 }
70517060 }
70527061
7062+ /*
70537063 fprintf(stderr,"Used transcripts:");
70547064 for(int i=0;i<transfrag.Count();i++) if(istranscript[i]) fprintf(stderr," %d(%f)",i,transfrag[i]->abundance);
70557065 fprintf(stderr,"\n");
7066+ */
70567067
70577068 for(int i=0;i<n;i++) link[i].Sort();
70587069
@@ -7112,7 +7123,7 @@ float max_flow(int gno,GVec<int>& path,GBitVec& istranscript,GPVec<CTransfrag>&
71127123 if(flow[n1][n2]>0) {
71137124 if(flow[n1][n2]<transfrag[t]->abundance) {
71147125 if(!i) sumout+=flow[n1][n2];
7115- fprintf(stderr,"Update capacity of transfrag[%d] with value %f to %f\n",t,transfrag[t]->abundance, transfrag[t]->abundance-flow[n1][n2]);
7126+ // fprintf(stderr,"Update capacity of transfrag[%d] with value %f to %f\n",t,transfrag[t]->abundance, transfrag[t]->abundance-flow[n1][n2]);
71167127 update_capacity(0,transfrag[t],flow[n1][n2],nodecapacity,node2path);
71177128 //if(path[i] && transfrag[t]->nodes.Last()!=gno-1) fragno+=flow[n1][n2];
71187129 flow[n1][n2]=0;
@@ -7121,7 +7132,7 @@ float max_flow(int gno,GVec<int>& path,GBitVec& istranscript,GPVec<CTransfrag>&
71217132 if(!i) sumout+=transfrag[t]->abundance;
71227133 flow[n1][n2]-=transfrag[t]->abundance;
71237134 //if(path[i] && transfrag[t]->nodes.Last()!=gno-1) fragno+=transfrag[t]->abundance;
7124- fprintf(stderr,"Update capacity of transfrag[%d] with value=%f to 0\n",t,transfrag[t]->abundance);
7135+ // fprintf(stderr,"Update capacity of transfrag[%d] with value=%f to 0\n",t,transfrag[t]->abundance);
71257136 update_capacity(0,transfrag[t],transfrag[t]->abundance,nodecapacity,node2path);
71267137 }
71277138 }
@@ -9233,6 +9244,7 @@ float store_transcript(GList<CPrediction>& pred,GVec<int>& path,GVec<float>& nod
92339244 if (t && t->uptr) {
92349245 RC_TData &td = *(RC_TData*) (t->uptr);
92359246 td.in_bundle=3;
9247+ //fprintf(stderr,"st guide %s is stored\n",t->getID());
92369248 }
92379249
92389250 /*
@@ -9327,6 +9339,7 @@ int store_guide_transcript(GList<CPrediction>& pred,GVec<int>& path,GVec<float>&
93279339 if (t && t->uptr) {
93289340 RC_TData &td = *(RC_TData*) (t->uptr);
93299341 td.in_bundle=3;
9342+ //fprintf(stderr,"sg guide %s is stored\n",t->getID());
93309343 }
93319344
93329345 update_guide_pred(pred,np,path,nodeflux,nodecov,no2gnode,gno,update);
@@ -9635,7 +9648,7 @@ void process_refguides(int gno,int edgeno,GIntHash<int>& gpos,int& lastgpos,GPVe
96359648
96369649 for(int g=0;g<guides.Count();g++) {
96379650 //fprintf(stderr,"Consider guide[%d out of %d] %s\n",g,guides.Count(),guides[g]->getID());
9638- if((guides[g]->strand==strand) && ((RC_TData*)(guides[g]->uptr))->in_bundle= =2 && (guides[g]->overlap(no2gnode[1]->start,no2gnode[gno-2]->end))) {
9651+ if((guides[g]->strand==strand || guides[g]->strand=='.' ) && ((RC_TData*)(guides[g]->uptr))->in_bundle> =2 && (guides[g]->overlap(no2gnode[1]->start,no2gnode[gno-2]->end))) {
96399652 CTransfrag *trguide=find_guide_pat(guides[g],no2gnode,gno,edgeno,gpos);
96409653 if(trguide) { // the guide can be found among the graph nodes
96419654 //CGuide newguide(trguide,guides[g]);
@@ -9647,9 +9660,9 @@ void process_refguides(int gno,int edgeno,GIntHash<int>& gpos,int& lastgpos,GPVe
96479660 fprintf(stderr,"Added guidetrf %d with ID=%s overlapping transcript interval %d - %d with nodes:",g,guides[g]->getID(),no2gnode[1]->start,no2gnode[gno-2]->end);
96489661 for(int i=0;i<trguide->nodes.Count();i++) fprintf(stderr," %d",trguide->nodes[i]);
96499662 fprintf(stderr,"\n");
9650- fprintf(stderr,"s=%d strand = %c trguide[%d]=",s,strand,g);
9651- printBitVec(trguide->pattern);
9652- fprintf(stderr,"\n");
9663+ // fprintf(stderr,"s=%d strand = %c trguide[%d]=",s,strand,g);
9664+ // printBitVec(trguide->pattern);
9665+ // fprintf(stderr,"\n");
96539666 }
96549667 */
96559668 }
@@ -9700,6 +9713,7 @@ void process_refguides(int gno,int edgeno,GIntHash<int>& gpos,int& lastgpos,GPVe
97009713
97019714 int *pos=gpos[edge(0,nodei,gno)];
97029715 if(!pos) { // here is where I need lastgpos
9716+ //fprintf(stderr,"Add source link to position %d for guide=%d\n",no2gnode[nodei]->start,guidetrf[g].g);
97039717 int key=edge(0,nodei,gno);
97049718 gpos.Add(key,lastgpos);
97059719 lastgpos++;
@@ -9774,6 +9788,7 @@ void process_refguides(int gno,int edgeno,GIntHash<int>& gpos,int& lastgpos,GPVe
97749788 guidetrf[g].trf->pattern[sink]=1;
97759789 int *pos=gpos[edge(nodei,sink,gno)];
97769790 if(!pos) {
9791+ //fprintf(stderr,"Add sink link from position %d for guide=%d\n",no2gnode[nodei]->end,guidetrf[g].g);
97779792 int key=edge(nodei,sink,gno);
97789793 gpos.Add(key,lastgpos);
97799794 lastgpos++;
@@ -10785,11 +10800,6 @@ int guides_pushmaxflow(int gno,int edgeno,GIntHash<int>& gpos,GPVec<CGraphnode>&
1078510800 bool include=true;
1078610801 if(guidepred[guidetrf[0].g]==-1) {
1078710802
10788- if(!strcmp(guides[guidetrf[0].g]->getID(),"rna51274")) {
10789- fprintf(stderr,"nodecov[1]=%f\n",nodecov[guidetrf[0].trf->nodes[1]]);
10790- exit(0);
10791- }
10792-
1079310803 store_transcript(pred,guidetrf[0].trf->nodes,nodeflux,nodecov,no2gnode,geneno,first,s,gno,gpos,include,pathpat,bdata,guides[guidetrf[0].g]);
1079410804 //if(eonly) { // this is not correct because it might have been assigned before
1079510805 guidepred[guidetrf[0].g]=pred.Count()-1; // NEED TO TEST: if this doesn't work for single genes I might want to recombine with the previous prediction in store_transcript
@@ -10856,10 +10866,6 @@ int guides_pushmaxflow(int gno,int edgeno,GIntHash<int>& gpos,GPVec<CGraphnode>&
1085610866 bool include=true;
1085710867 if(flux>epsilon) {
1085810868 if(guidepred[guidetrf[g].g]==-1) {
10859- if(!strcmp(guides[guidetrf[g].g]->getID(),"rna51274")) {
10860- fprintf(stderr,"nodecov[1]=%f\n",nodecov[guidetrf[g].trf->nodes[1]]);
10861- exit(0);
10862- }
1086310869 store_transcript(pred,guidetrf[g].trf->nodes,nodeflux,nodecov,no2gnode,geneno,first,s,gno,gpos,include,pathpat,bdata,guides[guidetrf[g].g]);
1086410870 //if(eonly) {
1086510871 guidepred[guidetrf[g].g]=pred.Count()-1; // NEED TO TEST: if this doesn't work for single genes I might want to recombine with the previous prediction in store_transcript
@@ -12228,16 +12234,36 @@ int build_graphs(BundleData* bdata) {
1222812234 }
1222912235 if(covered) {
1223012236 tdata->in_bundle=2;
12231- int s=0;
12237+ int s=-1; // unknown strand
1223212238 if(guides[g]->strand=='+') s=1; // guide on positive strand
12233- GEdge ge1(guides[g]->start,guides[g]->exons[0]->end,s);
12234- int idx=guideedge.IndexOf(ge1);
12235- if(idx<0) guideedge.Add(ge1);
12239+ else if(guides[g]->strand=='-') s=0; // guide on negative strand
12240+
12241+ //fprintf(stderr,"Look to add guide g=%d start %d-%d and end %d-%d on strand %d\n",g,guides[g]->start,guides[g]->exons[0]->end,guides[g]->end,guides[g]->exons.Last()->start,s);
12242+
12243+ int uses=s;
12244+ if(s<0) uses=0;
12245+ GEdge ge(guides[g]->start,guides[g]->exons[0]->end,uses);
12246+ int idx=guideedge.IndexOf(ge);
12247+ if(idx<0) guideedge.Add(ge);
1223612248 else if(guideedge[idx].endval>guides[g]->exons[0]->end) guideedge[idx].endval=guides[g]->exons[0]->end;
12237- GEdge ge2(guides[g]->end,guides[g]->exons.Last()->start,s);
12238- idx=guideedge.IndexOf(ge2);
12239- if(idx<0) guideedge.Add(ge2);
12249+ if(s<0) {
12250+ ge.strand=1;
12251+ idx=guideedge.IndexOf(ge);
12252+ if(idx<0) guideedge.Add(ge);
12253+ else if(guideedge[idx].endval>guides[g]->exons[0]->end) guideedge[idx].endval=guides[g]->exons[0]->end;
12254+ }
12255+
12256+ ge.val=guides[g]->end;
12257+ ge.endval=guides[g]->exons.Last()->start;
12258+ idx=guideedge.IndexOf(ge);
12259+ if(idx<0) guideedge.Add(ge);
1224012260 else if(guideedge[idx].endval<guides[g]->exons.Last()->start) guideedge[idx].endval=guides[g]->exons.Last()->start;
12261+ if(s<0) {
12262+ ge.strand=0; // ge.strand was set as 1 before
12263+ idx=guideedge.IndexOf(ge);
12264+ if(idx<0) guideedge.Add(ge);
12265+ else if(guideedge[idx].endval<guides[g]->exons.Last()->start) guideedge[idx].endval=guides[g]->exons.Last()->start;
12266+ }
1224112267 }
1224212268 }
1224312269 }
@@ -12873,6 +12899,7 @@ int build_graphs(BundleData* bdata) {
1287312899 p->exoncov.Add(gcov);
1287412900 pred.Add(p);
1287512901 printguides=true;
12902+ guidepred[g]=pred.Count()-1;
1287612903 }
1287712904 }
1287812905
@@ -12999,11 +13026,17 @@ int build_graphs(BundleData* bdata) {
1299913026 lastgpos[s].cAdd(0);
1300013027 // I am overestmating the edgeno below, hopefully not by too much
1300113028
13029+ //fprintf(stderr,"Bundle is: %d - %d start at g=%d\n",bnode[sno][bundle[sno][b]->startnode]->start,bnode[sno][bundle[sno][b]->lastnodeid]->end,g);
13030+
1300213031 while(g<ng && guides[g]->end<bnode[sno][bundle[sno][b]->startnode]->start) g++;
13032+
1300313033 int cg=g;
1300413034 int nolap=0;
1300513035 while(cg<ng && guides[cg]->start<=bnode[sno][bundle[sno][b]->lastnodeid]->end) { // this are potential guides that might overlap the current bundle, and they might introduce extra edges
13006- if(guides[cg]->strand==strnd && ((RC_TData*)(guides[cg]->uptr))->in_bundle==2) {
13036+
13037+ //fprintf(stderr,"...consider guide cg=%d with strand=%c and in_bundle=%d\n",cg,guides[cg]->strand,((RC_TData*)(guides[cg]->uptr))->in_bundle);
13038+ if((guides[cg]->strand==strnd || guides[cg]->strand=='.') && ((RC_TData*)(guides[cg]->uptr))->in_bundle>=2) {
13039+ //fprintf(stderr,"Add guide g=%d with start=%d end=%d\n",cg,guides[cg]->start,guides[cg]->end);
1300713040 edgeno[s][b]+=2; // this is an overestimate: possibly I have both an extra source and an extra sink link
1300813041 nolap++;
1300913042 }
@@ -13012,13 +13045,14 @@ int build_graphs(BundleData* bdata) {
1301213045
1301313046 /*
1301413047 { // DEBUG ONLY
13048+ fprintf(stderr,"edgeno[%d][%d]=%d\n",s,b,edgeno[s][b]);
1301513049 if(bundle[sno][b]->nread) {
1301613050 fprintf(stderr,"proc bundle[%d][%d] %f/%f is %f len=%d and %d guides\n",sno,b,
1301713051 bundle[sno][b]->multi,bundle[sno][b]->nread,(float)bundle[sno][b]->multi/bundle[sno][b]->nread,
1301813052 bundle[sno][b]->len,nolap);
1301913053 }
1302013054 }
13021- */
13055+ */
1302213056
1302313057 // here I can add something in stringtie to lower the mintranscript len if there are guides?
1302413058
@@ -13138,6 +13172,21 @@ int build_graphs(BundleData* bdata) {
1313813172 // include source to guide starts links
1313913173 GVec<CGuide> guidetrf;
1314013174
13175+ /*
13176+ { // DEBUG ONLY
13177+ fprintf(stderr,"process refguides for s=%d b=%d edgeno=%d gno=%d\n",s,b,edgeno[s][b],graphno[s][b]);
13178+ fprintf(stderr,"There are %d nodes for graph[%d][%d]:\n",graphno[s][b],s,b);
13179+ for(int i=0;i<graphno[s][b];i++) {
13180+ fprintf(stderr,"%d (%d-%d): %f len=%d cov=%f",i,no2gnode[s][b][i]->start,no2gnode[s][b][i]->end,no2gnode[s][b][i]->cov,no2gnode[s][b][i]->len(),no2gnode[s][b][i]->cov/no2gnode[s][b][i]->len());
13181+ fprintf(stderr," parents:");
13182+ for(int j=0;j<no2gnode[s][b][i]->parent.Count();j++) fprintf(stderr," %d",no2gnode[s][b][i]->parent[j]);
13183+ fprintf(stderr," trf=");
13184+ for(int j=0;j<no2gnode[s][b][i]->trf.Count();j++) fprintf(stderr," %d",no2gnode[s][b][i]->trf[j]);
13185+ fprintf(stderr,"\n");
13186+ }
13187+ }
13188+ */
13189+
1314113190 if(guides.Count()) process_refguides(graphno[s][b],edgeno[s][b],gpos[s][b],lastgpos[s][b],no2gnode[s][b],transfrag[s][b],s,guides,guidetrf,bdata);
1314213191
1314313192 //process transfrags to eliminate noise, and set compatibilities, and node memberships
0 commit comments