11#!/usr/bin/env python
22#Lexi Morrissey, Mahony Lab @ Pennsylvania State University
3- #Last updated 04.22.2024
43#Contains methods for read allocation procedure of Allo.
54
65from Allo import predictPeak
@@ -79,7 +78,7 @@ def getArray(read, winSize, genLand, spliceD):
7978 #print("cigar: " + read[5], flush=True)
8079 for i in range (0 ,len (chars )):
8180 if chars [i ]== "M" or chars [i ]== "D" or chars [i ]== "=" or chars [i ]== "X" :
82- r_end = r_end + int (num [i ])
81+ r_end = r_end + int (num [i ]) - 1
8382 elif chars [i ]== "N" :
8483 gap_loc [r_end + 1 ]= int (num [i ])
8584 r_end = r_end + int (num [i ])
@@ -90,10 +89,10 @@ def getArray(read, winSize, genLand, spliceD):
9089 while l <= math .floor (winSize / 2 ):
9190 #print("up: " + str(k), flush=True)
9291 key = chr + ";" + str (k )
93- if key in spliceD and spliceD [key ] < 0 :
92+ ''' if key in spliceD and spliceD[key] < 0:
9493 #print("splice: " + str(splice[key]), flush=True)
9594 k = k + spliceD[key]
96- key = chr + ";" + str (k )
95+ key = chr + ";" + str(k)'''
9796 if key in genLand :
9897 array .insert (0 ,genLand [key ])
9998 else :
@@ -104,10 +103,9 @@ def getArray(read, winSize, genLand, spliceD):
104103 #Downstream counts
105104 k = start
106105 l = 0
107- #print(gap_loc)
108106 while l <= math .floor (winSize / 2 ):
107+ #Taking splice information from read if present
109108 while k < r_end :
110- #print("down: " + str(k), flush=True)
111109 key = chr + ";" + str (k )
112110 if k in gap_loc :
113111 k = k + gap_loc [k ]
@@ -118,11 +116,11 @@ def getArray(read, winSize, genLand, spliceD):
118116 array .append (0 )
119117 k += 1
120118 l += 1
121- #print("down2: " + str(k), flush=True)
119+ #Else use previously found splice info
122120 key = chr + ";" + str (k )
123- if key in spliceD and spliceD [key ] > 0 :
121+ ''' if key in spliceD and spliceD[key] > 0:
124122 k = k + spliceD[key]
125- key = chr + ";" + str (k )
123+ key = chr + ";" + str(k)'''
126124 if key in genLand :
127125 array .append (genLand [key ])
128126 else :
@@ -144,7 +142,7 @@ def getArray(read, winSize, genLand, spliceD):
144142
145143
146144#Assign reads (straight to dictionary for uniq and actual assign for multi-mapped)
147- def readAssign (rBlock , samOut , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD ):
145+ def readAssign (rBlock , samOut , winSize , genLand , model , cnn_scores , rc , rmz , spliceD ):
148146 random .seed (7 ) #To make results reproducible
149147
150148 ##Multi-mapped reads##
@@ -154,28 +152,35 @@ def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, mod
154152 allZ = True #seeing if all zero regions
155153 for i in rBlock :
156154 #Find closest 100 window, use that score instead if it's already been assigned, saves time
157- pos = i [2 ]+ str (i [3 ])
158- countArray = getArray (i , winSize , genLand , spliceD )
159- s = sum (countArray )
160- if s > 0 :
155+ if spliceD :
156+ pos = i [2 ]+ ";" + str (i [3 ])
157+ else :
158+ pos = i [2 ]+ ";" + str (round (int (i [3 ])/ 100 )* 100 )
159+ if pos in cnn_scores :
160+ scores_nn .append (cnn_scores [pos ])
161161 allZ = False
162- #Allocation options
163- if rc == 1 :
164- if s == 0 :
162+ else :
163+ countArray = getArray (i , winSize , genLand , spliceD )
164+ s = sum (countArray )
165+ if s > 0 :
166+ allZ = False
167+ #Allocation options
168+ if rc == 1 :
169+ if s == 0 :
170+ scores_rc .append (1 )
171+ else :
172+ scores_rc .append (s + 1 )
173+ continue
174+ if rc == 2 :
165175 scores_rc .append (1 )
176+ continue
177+ #Use no read score if zero region
178+ if s == 0 :
179+ scores_nn .append (0.0012 * (s + 1 ))
166180 else :
167- scores_rc .append (s + 1 )
168- continue
169- if rc == 2 :
170- scores_rc .append (1 )
171- continue
172- #Use no read score if zero region
173- if s == 0 :
174- scores_nn .append (0.0012 * (s + 1 ))
175- else :
176- nn = predictPeak .predictNN (countArray , winSize , model )
177- scores_nn .append (nn * (s + 1 ))
178- cnn_scores [pos ] = (nn * (s + 1 ))
181+ nn = predictPeak .predictNN (countArray , winSize , model )
182+ scores_nn .append (nn * (s + 1 ))
183+ cnn_scores [pos ] = (nn * (s + 1 ))
179184
180185 #Removing reads that mapped to all zero regions
181186 if allZ and rmz == 1 :
@@ -214,7 +219,6 @@ def parseUniq(tempFile, winSize, cnn_scores, AS, rc, keep):
214219 cu = 0 #UMRs
215220 cf = 0 #Filtered
216221
217- modelName = None
218222 model = None
219223 rmz = None
220224 rBlock = []
@@ -388,7 +392,6 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
388392 sys .exit (0 )
389393 else :
390394 model = None
391- modelName = None
392395 #Exception that causes errors
393396 if os .stat (tempFile + "MM" ).st_size == 0 :
394397 return numLoc
@@ -432,7 +435,7 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
432435 rBlock = []
433436 rBlock .append (r )
434437 continue
435- readAssign (rBlock , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD )
438+ readAssign (rBlock , AL , winSize , genLand , model , cnn_scores , rc , rmz , spliceD )
436439 #Getting average number of locations mapped to
437440 numLoc [0 ] = (numLoc [0 ]* numLoc [1 ] + len (rBlock )) / (numLoc [1 ]+ 1 )
438441 numLoc [1 ] = numLoc [1 ] + 1
@@ -442,7 +445,7 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
442445
443446 #For last read
444447 if maxa is None or len (rBlock ) <= maxa :
445- readAssign (rBlock , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD )
448+ readAssign (rBlock , AL , winSize , genLand , model , cnn_scores , rc , rmz , spliceD )
446449 numLoc [0 ] = (numLoc [0 ]* numLoc [1 ] + len (rBlock )) / (numLoc [1 ]+ 1 )
447450 numLoc [1 ] = numLoc [1 ] + 1
448451
@@ -457,7 +460,6 @@ def parseUniqPE(tempFile, winSize, cnn_scores, AS, rc, keep, r2):
457460 cu = 0
458461 cf = 0
459462
460- modelName = None
461463 model = None
462464 rmz = None
463465 if "border" in tempFile :
@@ -698,7 +700,7 @@ def parseUniqPE(tempFile, winSize, cnn_scores, AS, rc, keep, r2):
698700
699701
700702#Assign reads (straight to dictionary for uniq and actual assign for multi-mapped)
701- def readAssignPE (rBlock , rBlock2 , samOut , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD ):
703+ def readAssignPE (rBlock , rBlock2 , samOut , winSize , genLand , model , cnn_scores , rc , rmz , spliceD ):
702704 random .seed (7 ) #To make results reproducible
703705 ##Multi-mapped reads##
704706 ###CNN###
@@ -707,9 +709,12 @@ def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, r
707709 allZ = True #seeing if all zero regions
708710 for i in rBlock :
709711 #Find closest 100 window, use that score instead if it's already been assigned, saves time
710- pos = i [2 ]+ str (round (int (i [3 ])/ 100 )* 100 )
712+ if spliceD :
713+ pos = i [2 ]+ ";" + str (i [3 ])
714+ else :
715+ pos = i [2 ]+ ";" + str (round (int (i [3 ])/ 100 )* 100 )
711716 if pos in cnn_scores :
712- # scores_nn.append(cnn_scores[pos])
717+ scores_nn .append (cnn_scores [pos ])
713718 allZ = False
714719 else :
715720 countArray = getArray (i , winSize , genLand , spliceD )
@@ -780,16 +785,11 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
780785 model = tf .keras .models .model_from_json (loaded_model_json )
781786 model .load_weights (modelName + '.h5' )
782787 model = LiteModel .from_keras_model (model )
783- if "mixed" in modelName :
784- modelName = 1
785- else :
786- modelName = 0
787788 except :
788789 print ("Could not load Tensorflow model :( Allo was written with Tensorflow version 2.11" )
789790 sys .exit (0 )
790791 else :
791792 model = None
792- modelName = None
793793
794794 #Exception that causes errors
795795 if os .stat (tempFile + "MM" ).st_size == 0 :
@@ -852,7 +852,7 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
852852 else :
853853 if maxa is not None and len (rBlock ) > maxa :
854854 continue
855- readAssignPE (rBlock , rBlock2 , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD )
855+ readAssignPE (rBlock , rBlock2 , AL , winSize , genLand , model , cnn_scores , rc , rmz , spliceD )
856856 numLoc [0 ] = (numLoc [0 ]* numLoc [1 ] + len (rBlock )) / (numLoc [1 ]+ 1 )
857857 numLoc [1 ] = numLoc [1 ] + 1
858858 rBlock = []
@@ -864,7 +864,7 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
864864
865865 #For last read
866866 if maxa is None or len (rBlock ) <= maxa :
867- readAssignPE (rBlock , rBlock2 , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD )
867+ readAssignPE (rBlock , rBlock2 , AL , winSize , genLand , model , cnn_scores , rc , rmz , spliceD )
868868 numLoc [0 ] = (numLoc [0 ]* numLoc [1 ] + len (rBlock )) / (numLoc [1 ]+ 1 )
869869 numLoc [1 ] = numLoc [1 ] + 1
870870
@@ -915,3 +915,5 @@ def predict_single(self, inp):
915915 self .interpreter .invoke ()
916916 out = self .interpreter .get_tensor (self .output_index )
917917 return out [0 ]
918+
919+
0 commit comments