1616import numpy as np
1717import sys
1818import multiprocessing
19-
19+ import re
2020
2121#Add reads to UMR dictionary
22- def addToDict (tempFile , genLand , seq ):
22+ def addToDict (tempFile , genLand , spliceD , seq ):
2323 count = 0
2424 with open (tempFile ) as f :
2525 for line in f :
@@ -39,26 +39,115 @@ def addToDict(tempFile, genLand, seq):
3939 genLand [key ] = genLand [key ] + 1
4040 else :
4141 genLand [key ] = 1
42+ if not spliceD == None :
43+ chars = re .findall ('[A-Za-z]+' , l [5 ])
44+ gaps = [i for i , x in enumerate (chars ) if x == "N" ]
45+ if len (gaps ) > 0 :
46+ num = re .findall ('\d+\.\d+|\d+' , l [5 ])
47+ curLoc = int (l [3 ])
48+ for i in range (0 ,len (chars )):
49+ if chars [i ]== "M" or chars [i ]== "D" or chars [i ]== "=" or chars [i ]== "X" :
50+ curLoc = curLoc + int (num [i ])
51+ elif chars [i ]== "N" :
52+ if l [2 ]+ ";" + str (curLoc ) in spliceD :
53+ if spliceD [l [2 ]+ ";" + str (curLoc )] > curLoc + int (num [i ]):
54+ pos = int (round (int (curLoc )/ 5 )* 5 )
55+ size = int (round (int (num [i ])/ 5 )* 5 )
56+ spliceD [l [2 ]+ ";" + str (pos )] = size
57+ spliceD [l [2 ]+ ";" + str (pos + size )] = size + 5
58+ else :
59+ pos = int (round (int (curLoc )/ 5 )* 5 )
60+ size = int (round (int (num [i ])/ 5 )* 5 )
61+ spliceD [l [2 ]+ ";" + str (pos )] = size
62+ #Adding splice in reverse direction
63+ spliceD [l [2 ]+ ";" + str (pos + size )] = - 1 * (size + 5 )
64+ curLoc = curLoc + int (num [i ])
4265
4366#Used to get counts in regions around multimapped reads
44- def getArray (read , winSize , genLand ):
45- array = []
46- pos = round (int (read [3 ])/ 100 )* 100
47- k = pos - math .floor (winSize / 2 )
48- while k < int (pos )+ math .floor (winSize / 2 ):
49- k = round (k / 5 )* 5
50- key = read [2 ] + ";" + str (k )
51- #Seeing if current pos in genetic landscape
52- if key in genLand :
53- array .append (genLand [key ])
54- else :
55- array .append (0 )
56- k = k + 5
67+ def getArray (read , winSize , genLand , spliceD ):
68+ bin = int (winSize / 100 )
69+ #Spliced array using junction info
70+ if spliceD :
71+ #Parsing cigar string
72+ chars = re .findall ('[A-Za-z]+' , read [5 ])
73+ num = re .findall ('\d+\.\d+|\d+' , read [5 ])
74+ gap_loc = {}
75+ start = int (round (int (read [3 ])/ bin )* bin )
76+ chr = read [2 ]
77+ r_end = start #Get information about end of read to use for splice variants
78+ #print("cigar: " + read[5], flush=True)
79+ for i in range (0 ,len (chars )):
80+ if chars [i ]== "M" or chars [i ]== "D" or chars [i ]== "=" or chars [i ]== "X" :
81+ r_end = r_end + int (num [i ])
82+ elif chars [i ]== "N" :
83+ gap_loc [int (round ((int (r_end )+ 4 )/ bin )* bin )]= int (round ((int (num [i ])- 4 )/ bin )* bin )
84+ r_end = r_end + int (num [i ])
85+ r_end = int (round (int (r_end )/ bin )* bin )
86+ array = []
87+ k = start - bin
88+ l = 0
89+ #Upstream counts
90+ while l <= math .floor (winSize / 2 ):
91+ #print("up: " + str(k), flush=True)
92+ key = chr + ";" + str (k )
93+ if key in spliceD and spliceD [key ] < 0 :
94+ #print("splice: " + str(splice[key]), flush=True)
95+ k = k + spliceD [key ]
96+ key = chr + ";" + str (k )
97+ if key in genLand :
98+ array .insert (0 ,genLand [key ])
99+ else :
100+ array .insert (0 ,0 )
101+ k -= bin
102+ l += bin
103+ start_pos = k
104+ #Downstream counts
105+ k = start
106+ l = 0
107+ #print(gap_loc)
108+ while l <= math .floor (winSize / 2 ):
109+ while k < r_end :
110+ #print("down: " + str(k), flush=True)
111+ key = chr + ";" + str (k )
112+ if k in gap_loc :
113+ k = k + gap_loc [k ]
114+ continue
115+ elif key in genLand :
116+ array .append (genLand [key ])
117+ else :
118+ array .append (0 )
119+ k += bin
120+ l += bin
121+ #print("down2: " + str(k), flush=True)
122+ key = chr + ";" + str (k )
123+ if key in spliceD and spliceD [key ] > 0 :
124+ k = k + spliceD [key ]
125+ key = chr + ";" + str (k )
126+ if key in genLand :
127+ array .append (genLand [key ])
128+ else :
129+ array .append (0 )
130+ k += bin
131+ l += bin
132+ #Non-spliced array
133+ else :
134+ array = []
135+ pos = int (round (int (read [3 ])/ 5 )* 5 )
136+ k = pos - math .floor (winSize / 2 )
137+ while k < int (pos )+ math .floor (winSize / 2 ):
138+ k = round (k / 5 )* 5
139+ key = read [2 ] + ";" + str (k )
140+ #Seeing if current pos in genetic landscape
141+ if key in genLand :
142+ array .append (genLand [key ])
143+ else :
144+ array .append (0 )
145+ k = k + 5
57146 return array
58147
59148
60149#Assign reads (straight to dictionary for uniq and actual assign for multi-mapped)
61- def readAssign (rBlock , samOut , winSize , genLand , model , cnn_scores , rc , rmz , modelName ):
150+ def readAssign (rBlock , samOut , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD ):
62151 random .seed (7 ) #To make results reproducible
63152
64153 ##Multi-mapped reads##
@@ -73,7 +162,7 @@ def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, mod
73162 scores_nn .append (cnn_scores [pos ])
74163 allZ = False
75164 else :
76- countArray = getArray (i , winSize , genLand )
165+ countArray = getArray (i , winSize , genLand , spliceD )
77166 s = sum (countArray )
78167 if s > 0 :
79168 allZ = False
@@ -292,7 +381,7 @@ def parseUniq(tempFile, winSize, cnn_scores, AS, rc, keep):
292381 return [cu , cf ]
293382
294383
295- def parseMulti (tempFile , winSize , genLand , modelName , cnn_scores , rc , keep , rmz , maxa ):
384+ def parseMulti (tempFile , winSize , genLand , modelName , cnn_scores , rc , keep , rmz , maxa , spliceD ):
296385 numLoc = [0 ,0 ] #Keep info on average number of places read maps to
297386 #Getting trained CNN
298387 try :
@@ -353,7 +442,7 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
353442 rBlock = []
354443 rBlock .append (r )
355444 continue
356- readAssign (rBlock , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName )
445+ readAssign (rBlock , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD )
357446 #Getting average number of locations mapped to
358447 numLoc [0 ] = (numLoc [0 ]* numLoc [1 ] + len (rBlock )) / (numLoc [1 ]+ 1 )
359448 numLoc [1 ] = numLoc [1 ] + 1
@@ -363,7 +452,7 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
363452
364453 #For last read
365454 if maxa is None or len (rBlock ) <= maxa :
366- readAssign (rBlock , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName )
455+ readAssign (rBlock , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD )
367456 numLoc [0 ] = (numLoc [0 ]* numLoc [1 ] + len (rBlock )) / (numLoc [1 ]+ 1 )
368457 numLoc [1 ] = numLoc [1 ] + 1
369458
@@ -619,7 +708,7 @@ def parseUniqPE(tempFile, winSize, cnn_scores, AS, rc, keep, r2):
619708
620709
621710#Assign reads (straight to dictionary for uniq and actual assign for multi-mapped)
622- def readAssignPE (rBlock , rBlock2 , samOut , winSize , genLand , model , cnn_scores , rc , rmz , modelName ):
711+ def readAssignPE (rBlock , rBlock2 , samOut , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD ):
623712 random .seed (7 ) #To make results reproducible
624713 ##Multi-mapped reads##
625714 ###CNN###
@@ -633,7 +722,7 @@ def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, r
633722 scores_nn .append (cnn_scores [pos ])
634723 allZ = False
635724 else :
636- countArray = getArray (i , winSize , genLand )
725+ countArray = getArray (i , winSize , genLand , spliceD )
637726 s = sum (countArray )
638727 if s > 0 :
639728 allZ = False
@@ -692,7 +781,7 @@ def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, r
692781 return
693782
694783
695- def parseMultiPE (tempFile , winSize , genLand , modelName , cnn_scores , rc , keep , rmz , maxa ):
784+ def parseMultiPE (tempFile , winSize , genLand , modelName , cnn_scores , rc , keep , rmz , maxa , spliceD ):
696785 numLoc = [0 ,0 ] #Retain info on number of mapping sites
697786 #Getting trained CNN and making sure there is a compatible tensorflow installed
698787 try :
@@ -771,7 +860,7 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
771860 else :
772861 if maxa is not None and len (rBlock ) > maxa :
773862 continue
774- readAssignPE (rBlock , rBlock2 , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName )
863+ readAssignPE (rBlock , rBlock2 , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD )
775864 numLoc [0 ] = (numLoc [0 ]* numLoc [1 ] + len (rBlock )) / (numLoc [1 ]+ 1 )
776865 numLoc [1 ] = numLoc [1 ] + 1
777866 rBlock = []
@@ -783,7 +872,7 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
783872
784873 #For last read
785874 if maxa is None or len (rBlock ) <= maxa :
786- readAssignPE (rBlock , rBlock2 , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName )
875+ readAssignPE (rBlock , rBlock2 , AL , winSize , genLand , model , cnn_scores , rc , rmz , modelName , spliceD )
787876 numLoc [0 ] = (numLoc [0 ]* numLoc [1 ] + len (rBlock )) / (numLoc [1 ]+ 1 )
788877 numLoc [1 ] = numLoc [1 ] + 1
789878
0 commit comments