#include "Alignment.h" #include "blastinfo.h" #include "Network.h" #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; //todo: note these assume net2 is larger. Ensure that when loading nets. //this constructor just creates an arbitrary non-random alignment. //Make it a random one by calling shuf() Alignment::Alignment(const Network* n1, const Network* n2, const BLASTDict* bit, const GOCDict* goc){ unsigned int size = n2->nodeToNodeName.size(); aln = vector(size,-1); alnInv = vector(size,-1); alnMask = vector(size,true); fitnessValid = false; for(int i = 0; i < size; i++){ aln[i] = i; alnInv[i] = i; } net1 = n1; net2 = n2; domRank = -1; crowdDist = -1.0; bitscores = bit; gocs = goc; actualSize = net1->nodeToNodeName.size(); //initialize conserved counts for fast ICS computation double denom = fastICSDenominator(); currConservedCount = ics()*denom; currInducedCount = (int)denom; } Alignment::Alignment(const Network* n1, const Network* n2, string filename, const BLASTDict* bit, const GOCDict* goc){ int size = n2->nodeToNodeName.size(); //todo: nodeNameToNode is sometimes a different size than //nodeToNodeName and I can't figure out why. aln = vector(size,-1); alnInv = vector(size,-1); alnMask = vector(size,true); fitnessValid = false; domRank = -1; crowdDist = -1.0; bitscores = bit; gocs = goc; net1 = n1; net2 = n2; actualSize = net1->nodeToNodeName.size(); ifstream infile(filename); if(!infile){ throw LineReadException(string("Alignment file ")+filename+ string(" failed to open!")); } string line; //keep track of which nodes in V2 aren't aligned so we can //add them to the permutation somewhere after loading the file. unordered_set v1Unaligned; unordered_set v2Unaligned; for(int i = 0; i < net2->nodeToNodeName.size(); i++){ v2Unaligned.insert(i); } assert(v2Unaligned.size() == aln.size()); int count = 0; //process each line while(getline(infile,line)){ //cout<<"parsing line: "<> a >> b)){ throw LineReadException(string("Parse error in network: ") + filename + string("on line: ") + line + "\n"); } if(!net1->nodeNameToNode.count(a)){ continue; } u = net1->nodeNameToNode.at(a); if(!net2->nodeNameToNode.count(b)){ continue; } v = net2->nodeNameToNode.at(b); count++; aln[u] = v; alnInv[v] = u; v2Unaligned.erase(v); //cout<<"node "< coinFlip(0,1); uniform_real_distribution fltgen(0.0,1.0); if(coinFlip(prng)){ par1 = &p1; par2 = &p2; } else{ par1 = &p2; par2 = &p1; } vector par1Indices(size); //node par1Indices[size]; //todo: benchmark this (and also test it!) for(int i = 0; i < size; i++){ par1Indices[par1->aln[i]] = i; } //do standard initialization aln = par1->aln; alnInv = par1->alnInv; alnMask = par1->alnMask; fitnessValid = false; domRank = -1; numThatDominate = -1; crowdDist = -1.0; bitscores = par1->bitscores; gocs = par1->gocs; actualSize = par1->actualSize; currBitscore = par1->currBitscore; currGOC = par1->currGOC; currConservedCount = par1->currConservedCount; currInducedCount = par1->currInducedCount; net1 = par1->net1; net2 = par1->net2; //do crossover for(int i = 0; ialn[i]; int oldConservedI = conservedCount(i,aln[i],alnMask[i],-1); int oldConservedJ = conservedCount(par1Indices[temp2], aln[par1Indices[temp2]], alnMask[par1Indices[temp2]], i); int oldInducedI = inducedCount(aln[i],-1); int oldInducedJ = inducedCount(aln[par1Indices[temp2]],aln[i]); aln[i] = temp2; alnInv[temp2] = i; aln[par1Indices[temp2]] = temp1; alnInv[temp1] = par1Indices[temp2]; //cout<<"selected to swap:"<nodeToNodeName.at(i)<<" " //<nodeToNodeName.at(par1Indices[temp2])<alnMask[i]; //swap mask bools if(!total){ //if partial alns allowed, either | or & the bit here. alnMask[par1Indices[temp2]] = temp1Mask; if(coinFlip(prng)){ alnMask[i] = par1bool && par2bool; } else{ alnMask[i] = par1bool || par2bool; } } //compute new conservedCounts int newConservedI = conservedCount(i, aln[i], alnMask[i],-1); int newConservedJ = conservedCount(par1Indices[temp2], aln[par1Indices[temp2]], alnMask[par1Indices[temp2]],i); int newInducedI = inducedCount(aln[par1Indices[temp2]],-1); int newInducedJ = inducedCount(aln[i],aln[par1Indices[temp2]]); //update currBitscore and conservedCounts updateBitscore(i,temp1,temp2,temp1Mask,alnMask[i]); updateGOC(i,temp1,temp2,temp1Mask,alnMask[i]); updateBitscore(par1Indices[temp2],temp2,temp1, par1bool, alnMask[par1Indices[temp2]]); updateGOC(par1Indices[temp2],temp2,temp1,par1bool, alnMask[par1Indices[temp2]]); currConservedCount += ((newConservedI - oldConservedI) + (newConservedJ - oldConservedJ)); currInducedCount += ((newInducedI - oldInducedI) + (newInducedJ - oldInducedJ)); //swap index records int itemp = par1Indices[temp1]; par1Indices[temp1] = par1Indices[temp2]; par1Indices[temp2] = itemp; } } fitnessValid = false; } //performs a fast, greedy matching based on bitscore or GOC //does bitscores when arg is true, and GOC otherwise. void Alignment::greedyMatch(bool bit){ if(bit && bitscores == nullptr){ assert(1==2); } if(!bit && gocs == nullptr){ assert(1==2); } auto dict = bit ? bitscores : gocs; aln = vector(net2->nodeToNodeName.size(), -1); alnInv = aln; unordered_set v1Unaligned; for(int i = 0; i < net1->nodeToNodeName.size(); i++){ v1Unaligned.insert(i); } unordered_set v2Unaligned; for(int i = 0; i < net2->nodeToNodeName.size(); i++){ v2Unaligned.insert(i); } unordered_set v1Aligned, v2Aligned; for(auto n1 : v1Unaligned){ node bestN2 = -1; double bestScore = 0.0; for(auto n2 : v2Unaligned){ if((*dict)[n1][n2] > bestScore ){ bestN2 = n2; bestScore = (*dict)[n1][n2]; } else if(bestN2 == -1){ bestN2 = n2; } } aln[n1] = bestN2; alnInv[bestN2] = n1; v1Aligned.insert(n1); v2Aligned.insert(bestN2); v2Unaligned.erase(bestN2); //NOTE: can't change v1Unaligned here because we're iterating over it. //instead changed in the for loop updating the mask below. } //update mask for(int i = 0; i < alnMask.size(); i++){ if(v1Aligned.count(i)){ alnMask[i] = true; v1Unaligned.erase(i); } else{ alnMask[i] = false; } } //look for unaligned nodes in V1 and align them to unaligned nodes //in V2. for(int i = 0; i < aln.size(); i++){ if(aln[i] == -1){ node arbNode = *(v2Unaligned.begin()); aln[i] = arbNode; alnInv[arbNode] = i; alnMask[i] = false; v1Unaligned.insert(i); v2Unaligned.erase(arbNode); } } //initialize conserved counts for fast ICS computation double denom = fastICSDenominator(); currConservedCount = ics()*denom; currInducedCount = (int)denom; if(bitscores) currBitscore = sumBLAST(); if(gocs) currGOC = sumGOC(); } void Alignment::seedExtend(bool bit, bool degdiff){ //crash if called without data available if(bit && bitscores == nullptr){ assert(1==2); } if(!bit && gocs == nullptr){ assert(1==2); } auto dict = bit ? bitscores : gocs; //for priority queue, want to compare degree difference and //bit/goc. do degree diff as small degree/large degree so that //the objective needs to be maximized. //maybe make degree difference objective optional for ease of //comparison to previous seed-extend methods auto comp = [&](pair x, pair y){ double sim1 = (*dict)[x.first][x.second]; double sim2 = (*dict)[y.first][y.second]; //todo: add some sort of optional degree difference comparison return sim1 > sim2; }; priority_queue, vector>, decltype(comp) > q(comp); //throw everything into the queue for(int i = 0; i < net1->nodeToNodeName.size(); i++){ for(int j = 0; j < net2->nodeToNodeName.size(); j++){ q.push(pair(i,j)); } } } void Alignment::shuf(RandGenT& prng, bool uniformsize, bool smallstart, bool total){ shuffle(aln.begin(),aln.end(), prng); for(int i = 0; i < aln.size(); i++){ alnInv[aln[i]] = i; } if(!total){ if(!uniformsize && !smallstart){ uniform_int_distribution coinFlip(0,1); for(int i =0; i < alnMask.size(); i++){ if(coinFlip(prng)){ alnMask[i] = false; } } } else if(uniformsize){ //actually randomly generating number of mask cells to //turn OFF because they are already all on. uniform_int_distribution sizeDist(1,actualSize-1); int numToDeactivate = sizeDist(prng); //generate indices to flip on uniform_int_distribution indexDist(0,actualSize-1); for(int i = 0; i < numToDeactivate; i++){ int index = indexDist(prng); //if index already deactivated, find another while(!alnMask[index]){ index = indexDist(prng); } alnMask[index] = false; } //deactivate all unaligned nodes aligned to dummy nodes for(int i = actualSize; i < aln.size(); i++){ alnMask[i] = false; } } else{ //smallsize int maxsize = net1->nodeToNodeName.size() / 100; if(maxsize == 0){ maxsize = 1; } uniform_int_distribution sizeDist(1,maxsize); int numToDeactivate = actualSize - sizeDist(prng); uniform_int_distribution indexDist(0,actualSize-1); for(int i = 0; i < numToDeactivate; i++){ int index = indexDist(prng); while(!alnMask[index]){ index = indexDist(prng); } alnMask[index] = false; } //deactivate all unaligned nodes aligned to dummy nodes for(int i = actualSize; i < aln.size(); i++){ alnMask[i] = false; } } } if(bitscores) currBitscore = sumBLAST(); if(gocs) currGOC = sumGOC(); //initialize currConservedCount for fast ICS computation double denom = fastICSDenominator(); currConservedCount = ics()*denom; currInducedCount = (int)denom; } //todo: add secondary mutate op for changing mask void Alignment::mutate(RandGenT& prng, float mutswappb, bool total){ int size = aln.size(); uniform_real_distribution fltgen(0.0,1.0); //size-2 because of increment below uniform_int_distribution intgen(0,size-2); for(int i = 0; i < aln.size(); i++){ //swap probabilistically if(fltgen(prng) < mutswappb){ int swapIndex = intgen(prng); if(swapIndex >= i){ swapIndex++; } doSwap(i,swapIndex); } //switch mask bit on/off probabilistically if(!total && fltgen(prng) < mutswappb){ int oldConserved = conservedCount(i, aln[i], alnMask[i],-1); int oldInduced = inducedCount(aln[i],-1); alnMask[i] = !alnMask[i]; int newConserved = conservedCount(i, aln[i], alnMask[i],-1); int newInduced = inducedCount(aln[i],-1); updateBitscore(i, aln[i], aln[i], !alnMask[i], alnMask[i]); updateGOC(i,aln[i],aln[i],!alnMask[i],alnMask[i]); currConservedCount += (newConserved - oldConserved); currInducedCount += (newInduced - oldInduced); } } fitnessValid = false; } //takes 2 nodes from V1 and swaps the nodes they are aligned to in V2. //updates currBitscore accordingly. //updates conservedCounts accordingly as well. void Alignment::doSwap(node x, node y){ int oldConservedX = conservedCount(x, aln[x], alnMask[x],-1); int oldConservedY = conservedCount(y, aln[y], alnMask[y],x); int oldInducedX = inducedCount(aln[x],-1); int oldInducedY = inducedCount(aln[y],aln[x]); node temp = aln[x]; aln[x] = aln[y]; alnInv[aln[x]] = x; aln[y] = temp; alnInv[aln[y]] = y; bool tempb = alnMask[x]; alnMask[x] = alnMask[y]; alnMask[y] = tempb; int newConservedX = conservedCount(x,aln[x],alnMask[x],-1); int newConservedY = conservedCount(y,aln[y],alnMask[y],x); int newInducedX = inducedCount(aln[y],-1); int newInducedY = inducedCount(aln[x],aln[y]); updateBitscore(x, aln[y], aln[x], alnMask[y], alnMask[x]); updateGOC(x, aln[y], aln[x], alnMask[y], alnMask[x]); updateBitscore(y, aln[x], aln[y], alnMask[x], alnMask[y]); updateGOC(y, aln[x], aln[y], alnMask[x], alnMask[y]); currConservedCount += ((newConservedX - oldConservedX) + (newConservedY - oldConservedY)); currInducedCount += ((newInducedX - oldInducedX) + (newInducedY - oldInducedY)); } void Alignment::onBit(node x){ int oldConserved = conservedCount(x, aln[x], alnMask[x],-1); int oldInduced = inducedCount(aln[x],-1); bool old = alnMask[x]; alnMask[x] = true; updateBitscore(x,aln[x],aln[x],old, alnMask[x]); updateGOC(x, aln[x], aln[x], old, alnMask[x]); int newConserved = conservedCount(x, aln[x], alnMask[x],-1); int newInduced = inducedCount(aln[x],-1); currConservedCount += (newConserved - oldConserved); currInducedCount += (newInduced - oldInduced); } double Alignment::fastEC() const{ return double(currConservedCount)/double(net1->edges.size()); } double Alignment::fastS3() const{ double num = (double)currConservedCount; double denom = double(net1->edges.size() + currInducedCount) - num; return num/denom; } double Alignment::icsTimesEC() const{ return fastICS()*fastEC(); } double Alignment::s3Variant() const{ double a = (double)currConservedCount; double b = (double)(net1->edges.size()); double c = (double)currInducedCount; double denom = max(b,c) -a; return a/denom; } //todo: maybe something more principled than fitnessNames (so ad hoc!) void Alignment::computeFitness(const vector& fitnessNames){ if(fitness.size() == 0){ fitness = vector(fitnessNames.size(),0.0); } for(int i = 0; i < fitnessNames.size(); i++){ switch(fitnessNames[i]){ case ICSFit: fitness[i] = fastICS(); break; case ECFit: fitness[i] =fastEC(); break; case BitscoreSumFit: fitness[i] = currBitscore; break; case EvalsSumFit: fitness[i] = -1.0*currBitscore; break; case SizeFit: fitness[i] = alnSize(); break; case GOCFit: fitness[i] = currGOC; break; case S3Fit: fitness[i] = fastS3(); break; case S3DenomFit: fitness[i] = -(double(net1->edges.size() + currInducedCount) -currConservedCount); break; case ICSTimesEC: fitness[i] = icsTimesEC(); break; case S3Variant: fitness[i] = s3Variant(); break; } } fitnessValid = true; } double Alignment::ics() const{ unordered_set v1Unaligned; unordered_set v2Unaligned; //set of unaligned nodes in V2 for(int i = 0; i < aln.size(); i++){ if(!alnMask[i]){ v1Unaligned.insert(i); v2Unaligned.insert(aln[i]); } //second way for a node in V2 to be unaligned: by //being aligned to a dummy node. if(i >= net1->nodeToNodeName.size()){ v2Unaligned.insert(aln[i]); } } //induced subgraph of net2 that has been mapped to: unordered_set inducedES; for(Edge e : net2->edges){ if(!(v2Unaligned.count(e.u()) || v2Unaligned.count(e.v()))){ inducedES.insert(e); } } unordered_set mappedNet1ES; for(Edge e : net1->edges){ if(!(v1Unaligned.count(e.u()) || v1Unaligned.count(e.v()))){ Edge mapped(aln[e.u()],aln[e.v()]); mappedNet1ES.insert(mapped); } } double denominator = double(inducedES.size()); if(denominator == 0.0){ return 0.0; } else{ unordered_set intersect; if(mappedNet1ES.size() < inducedES.size()){ for(auto it = mappedNet1ES.begin(); it != mappedNet1ES.end(); it++){ if(inducedES.count(*it)){ intersect.insert(*it); } } } else{ for(auto it = inducedES.begin(); it != inducedES.end(); it++){ if(mappedNet1ES.count(*it)){ intersect.insert(*it); } } } double numerator = double(intersect.size()); return numerator/denominator; } } //note: this function is probably as fast as it is going to get. //my attempts at optimizing it further have only slowed it down. //todo: if the speed of this function is still unbearable, //consider tracking it as we go like numerator. double Alignment::fastICSDenominator() const{ //construct set of nodes actually mapped to vector mapped(aln.size(),false); for(int i = 0; i < actualSize; i++){ mapped[aln[i]] = alnMask[i]; } int count = 0; for(int i = 0; i < mapped.size(); i++){ if(mapped[i]){ for(auto y : net2->adjList.at(i)){ if(mapped[y]){ if(i==y) count += 2; //self loops must count twice like others else count++; } } } } return double(count / 2); } double Alignment::fastICS() const{ double denom = (double)currInducedCount; if(denom == 0.0){ return 0.0; } return (double(currConservedCount)) / denom; } double Alignment::sumBLAST() const{ double toReturn = 0.0; for(node i = 0; i < actualSize; i++){ if(alnMask[i]){ toReturn += (*bitscores)[i][aln[i]]; } } return toReturn; } double Alignment::sumGOC() const{ double toReturn = 0.0; for(node i = 0; i < actualSize; i++){ if(alnMask[i]){ toReturn += (*gocs)[i][aln.at(i)]; } } return toReturn; } double Alignment::alnSize() const{ double toReturn = 0.0; for(int i = 0; i < actualSize; i++){ if(alnMask[i]){ toReturn += 1.0; } } return toReturn; } void Alignment::save(string filename) const{ ofstream ofile(filename); for(int i = 0; i < net1->nodeToNodeName.size(); i++){ if(alnMask[i]){ string u = net1->nodeToNodeName.at(i); string v = net2->nodeToNodeName.at(aln[i]); ofile<= actualSize) return; double oldScore = 0.0; //set oldScore if(oldMask) oldScore = (*bitscores)[n1][n2old]; double newScore = 0.0; //set newScore if(newMask) newScore = (*bitscores)[n1][n2new]; double delta = newScore - oldScore; currBitscore += delta; } inline void Alignment::updateGOC(node n1, node n2old, node n2new, bool oldMask, bool newMask){ if(!gocs || n1 >= actualSize) return; double oldScore = 0.0; if(oldMask) oldScore = (*gocs)[n1][n2old]; double newScore = 0.0; if(newMask) newScore = (*gocs)[n1][n2new]; currGOC += newScore - oldScore; } //takes: n1 in V1, n2 in V2 that n1 is aligned to, mask that indicates //whether they are actually aligned or not, and a node in V1 to ignore, //in case conservedCount has been previously called where n1 == ignore //so we want to avoid double-counting. int Alignment::conservedCount(node n1, node n2, bool mask, node ignore) const{ int toReturn = 0; if(!mask || n1 >= actualSize || n1 == ignore){ return 0; } else{ for(int ix = 0; ix < net1->adjList[n1].size(); ix++){ int i = net1->adjList[n1][ix]; if(i != ignore && alnMask[i] && net2->adjMatrix[n2][aln[i]]){ toReturn++; } } } return toReturn; } //returns the number of edges in the subgraph induced by our alignment that //are incident to n2 but not incident to ignore. int Alignment::inducedCount(node n2, node ignore) const{ if(alnInv[n2] >= actualSize || alnMask[alnInv[n2]] == false){ return 0; } else{ int toReturn = 0; for(int ix = 0; ix < net2->adjList[n2].size(); ix++){ node x = net2->adjList[n2][ix]; if(alnInv[x] < actualSize && alnMask[alnInv[x]] && x != ignore){ toReturn++; } } return toReturn; } }