Skip to content

Commit 515b62d

Browse files
author
Alexis Morrissey
committed
splice test
1 parent bc45b50 commit 515b62d

File tree

2 files changed

+133
-32
lines changed

2 files changed

+133
-32
lines changed

Allo/allo

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ parser.add_argument('-seq', type=str, nargs=1, help='Single-end or paired-end se
1414
choices=['pe','se'], required=True, dest='seq')
1515
parser.add_argument('-o', type=str, nargs=1, help='Output file name', dest='outfile', default=None)
1616
parser.add_argument('--mixed', help='Use CNN trained on a dataset with mixed peaks, narrow by default', action='store_true', default=None)
17-
parser.add_argument('--rna', help='Use CNN trained on a dataset with mixed peaks, narrow by default', action='store_true', default=None)
17+
parser.add_argument('--rna', help='Use CNN trained on a RNAseq dataset', action='store_true', default=None)
18+
parser.add_argument('--splice', help='Remove splice sites based on cigar string when constructing image', action='store_true', default=None)
1819
parser.add_argument('-p', type=int, nargs=1, help='Number of processes, 1 by default', dest='processes', default=None)
1920
parser.add_argument('-max', type=int, nargs=1, help='Maximum value for number of locations a read can map', dest='maxlocations', default=None)
2021
parser.add_argument('--keep-unmap', help='Keep unmapped reads and reads that include N in their sequence', action='store_true', default=None)
@@ -140,7 +141,11 @@ if __name__ == '__main__':
140141
parse = 1
141142
else:
142143
parse = 0
143-
144+
#Cut out introns based on identified splice sites using cigar string
145+
if args.splice is not None:
146+
splice = 1
147+
else:
148+
splice = 0
144149

145150
#Getting header information and checking to see if file was sorted
146151
collate = 0
@@ -237,11 +242,14 @@ if __name__ == '__main__':
237242

238243
#Add all values to UMR dictionary
239244
genLand = {}
245+
if splice == 1:
246+
spliceD = {}
247+
else:
248+
spliceD = None
240249
if parse == 0:
241250
for i in tempList:
242251
if os.path.exists(str(i)+"UM"):
243-
allocation.addToDict(str(i)+"UM", genLand, seq)
244-
252+
allocation.addToDict(str(i)+"UM", genLand, spliceD, seq)
245253
else: #Stopping if parsing only
246254
l = [allo_dir+"/header"]
247255
for i in tempList:
@@ -259,7 +267,7 @@ if __name__ == '__main__':
259267
print("Parsing finished!\n")
260268

261269
#PHASE II: Parsing multi-mapped reads
262-
info2 = Parallel(n_jobs=thr)(delayed(allocation.parseMulti)(i, winSize, genLand, m, cnn_scores, rc, keep, rmz, maxa) for i in tempList)
270+
info2 = Parallel(n_jobs=thr)(delayed(allocation.parseMulti)(i, winSize, genLand, m, cnn_scores, rc, keep, rmz, maxa, spliceD) for i in tempList)
263271

264272

265273
#Final parsing and file clean up
@@ -319,10 +327,14 @@ if __name__ == '__main__':
319327

320328
#Add all values to UMR dictionary unless only parsing
321329
genLand = {}
330+
if splice == 1:
331+
spliceD = {}
332+
else:
333+
spliceD = None
322334
if parse == 0:
323335
for i in tempList:
324336
if os.path.exists(str(i)+"UM"):
325-
allocation.addToDict(str(i)+"UM", genLand, seq)
337+
allocation.addToDict(str(i)+"UM", genLand, spliceD, seq)
326338
else:
327339
l = [allo_dir+"/header"]
328340
for i in tempList:
@@ -340,7 +352,7 @@ if __name__ == '__main__':
340352
print("Parsing finished!\n")
341353

342354
#PHASE II: Parsing multi-mapped reads
343-
info2 = Parallel(n_jobs=thr)(delayed(allocation.parseMultiPE)(i, winSize, genLand, m, cnn_scores, rc, keep, rmz, maxa) for i in tempList)
355+
info2 = Parallel(n_jobs=thr)(delayed(allocation.parseMultiPE)(i, winSize, genLand, m, cnn_scores, rc, keep, rmz, maxa, spliceD) for i in tempList)
344356

345357
#Final parsing and file clean up
346358
#Deleting temporary files and join allocated files

Allo/allocation.py

Lines changed: 114 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616
import numpy as np
1717
import sys
1818
import multiprocessing
19-
19+
import re
2020

2121
#Add reads to UMR dictionary
22-
def addToDict(tempFile, genLand, seq):
22+
def addToDict(tempFile, genLand, spliceD, seq):
2323
count = 0
2424
with open(tempFile) as f:
2525
for line in f:
@@ -39,26 +39,115 @@ def addToDict(tempFile, genLand, seq):
3939
genLand[key] = genLand[key] + 1
4040
else:
4141
genLand[key] = 1
42+
if not spliceD == None:
43+
chars = re.findall('[A-Za-z]+', l[5])
44+
gaps = [i for i, x in enumerate(chars) if x == "N"]
45+
if len(gaps) > 0:
46+
num = re.findall('\d+\.\d+|\d+', l[5])
47+
curLoc = int(l[3])
48+
for i in range(0,len(chars)):
49+
if chars[i]=="M" or chars[i]=="D" or chars[i]=="=" or chars[i]=="X":
50+
curLoc = curLoc + int(num[i])
51+
elif chars[i]=="N":
52+
if l[2]+";"+str(curLoc) in spliceD:
53+
if spliceD[l[2]+";"+str(curLoc)] > curLoc + int(num[i]):
54+
pos = int(round(int(curLoc)/5)*5)
55+
size = int(round(int(num[i])/5)*5)
56+
spliceD[l[2]+";"+str(pos)] = size
57+
spliceD[l[2]+";"+str(pos+size)] = size+5
58+
else:
59+
pos = int(round(int(curLoc)/5)*5)
60+
size = int(round(int(num[i])/5)*5)
61+
spliceD[l[2]+";"+str(pos)] = size
62+
#Adding splice in reverse direction
63+
spliceD[l[2]+";"+str(pos+size)] = -1*(size+5)
64+
curLoc = curLoc + int(num[i])
4265

4366
#Used to get counts in regions around multimapped reads
44-
def getArray(read, winSize, genLand):
45-
array = []
46-
pos = round(int(read[3])/100)*100
47-
k = pos-math.floor(winSize/2)
48-
while k < int(pos)+math.floor(winSize/2):
49-
k = round(k/5)*5
50-
key = read[2] + ";" + str(k)
51-
#Seeing if current pos in genetic landscape
52-
if key in genLand:
53-
array.append(genLand[key])
54-
else:
55-
array.append(0)
56-
k = k + 5
67+
def getArray(read, winSize, genLand, spliceD):
68+
bin = int(winSize/100)
69+
#Spliced array using junction info
70+
if spliceD:
71+
#Parsing cigar string
72+
chars = re.findall('[A-Za-z]+', read[5])
73+
num = re.findall('\d+\.\d+|\d+', read[5])
74+
gap_loc = {}
75+
start = int(round(int(read[3])/bin)*bin)
76+
chr = read[2]
77+
r_end = start #Get information about end of read to use for splice variants
78+
#print("cigar: " + read[5], flush=True)
79+
for i in range(0,len(chars)):
80+
if chars[i]=="M" or chars[i]=="D" or chars[i]=="=" or chars[i]=="X":
81+
r_end = r_end + int(num[i])
82+
elif chars[i]=="N":
83+
gap_loc[int(round((int(r_end)+4)/bin)*bin)]=int(round((int(num[i])-4)/bin)*bin)
84+
r_end = r_end + int(num[i])
85+
r_end = int(round(int(r_end)/bin)*bin)
86+
array = []
87+
k = start-bin
88+
l = 0
89+
#Upstream counts
90+
while l <= math.floor(winSize/2):
91+
#print("up: " + str(k), flush=True)
92+
key = chr + ";" + str(k)
93+
if key in spliceD and spliceD[key] < 0:
94+
#print("splice: " + str(splice[key]), flush=True)
95+
k = k + spliceD[key]
96+
key = chr + ";" + str(k)
97+
if key in genLand:
98+
array.insert(0,genLand[key])
99+
else:
100+
array.insert(0,0)
101+
k -= bin
102+
l += bin
103+
start_pos = k
104+
#Downstream counts
105+
k = start
106+
l = 0
107+
#print(gap_loc)
108+
while l <= math.floor(winSize/2):
109+
while k < r_end:
110+
#print("down: " + str(k), flush=True)
111+
key = chr + ";" + str(k)
112+
if k in gap_loc:
113+
k = k + gap_loc[k]
114+
continue
115+
elif key in genLand:
116+
array.append(genLand[key])
117+
else:
118+
array.append(0)
119+
k += bin
120+
l += bin
121+
#print("down2: " + str(k), flush=True)
122+
key = chr + ";" + str(k)
123+
if key in spliceD and spliceD[key] > 0:
124+
k = k + spliceD[key]
125+
key = chr + ";" + str(k)
126+
if key in genLand:
127+
array.append(genLand[key])
128+
else:
129+
array.append(0)
130+
k += bin
131+
l += bin
132+
#Non-spliced array
133+
else:
134+
array = []
135+
pos = int(round(int(read[3])/5)*5)
136+
k = pos-math.floor(winSize/2)
137+
while k < int(pos)+math.floor(winSize/2):
138+
k = round(k/5)*5
139+
key = read[2] + ";" + str(k)
140+
#Seeing if current pos in genetic landscape
141+
if key in genLand:
142+
array.append(genLand[key])
143+
else:
144+
array.append(0)
145+
k = k + 5
57146
return array
58147

59148

60149
#Assign reads (straight to dictionary for uniq and actual assign for multi-mapped)
61-
def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, modelName):
150+
def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD):
62151
random.seed(7) #To make results reproducible
63152

64153
##Multi-mapped reads##
@@ -73,7 +162,7 @@ def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, mod
73162
scores_nn.append(cnn_scores[pos])
74163
allZ = False
75164
else:
76-
countArray = getArray(i, winSize, genLand)
165+
countArray = getArray(i, winSize, genLand, spliceD)
77166
s = sum(countArray)
78167
if s > 0:
79168
allZ = False
@@ -292,7 +381,7 @@ def parseUniq(tempFile, winSize, cnn_scores, AS, rc, keep):
292381
return [cu, cf]
293382

294383

295-
def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz, maxa):
384+
def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz, maxa, spliceD):
296385
numLoc = [0,0] #Keep info on average number of places read maps to
297386
#Getting trained CNN
298387
try:
@@ -353,7 +442,7 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
353442
rBlock = []
354443
rBlock.append(r)
355444
continue
356-
readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName)
445+
readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
357446
#Getting average number of locations mapped to
358447
numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
359448
numLoc[1] = numLoc[1] + 1
@@ -363,7 +452,7 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
363452

364453
#For last read
365454
if maxa is None or len(rBlock) <= maxa:
366-
readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName)
455+
readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
367456
numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
368457
numLoc[1] = numLoc[1] + 1
369458

@@ -619,7 +708,7 @@ def parseUniqPE(tempFile, winSize, cnn_scores, AS, rc, keep, r2):
619708

620709

621710
#Assign reads (straight to dictionary for uniq and actual assign for multi-mapped)
622-
def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, rc, rmz, modelName):
711+
def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD):
623712
random.seed(7) #To make results reproducible
624713
##Multi-mapped reads##
625714
###CNN###
@@ -633,7 +722,7 @@ def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, r
633722
scores_nn.append(cnn_scores[pos])
634723
allZ = False
635724
else:
636-
countArray = getArray(i, winSize, genLand)
725+
countArray = getArray(i, winSize, genLand, spliceD)
637726
s = sum(countArray)
638727
if s > 0:
639728
allZ = False
@@ -692,7 +781,7 @@ def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, r
692781
return
693782

694783

695-
def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz, maxa):
784+
def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz, maxa, spliceD):
696785
numLoc = [0,0] #Retain info on number of mapping sites
697786
#Getting trained CNN and making sure there is a compatible tensorflow installed
698787
try:
@@ -771,7 +860,7 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
771860
else:
772861
if maxa is not None and len(rBlock) > maxa:
773862
continue
774-
readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName)
863+
readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
775864
numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
776865
numLoc[1] = numLoc[1] + 1
777866
rBlock = []
@@ -783,7 +872,7 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
783872

784873
#For last read
785874
if maxa is None or len(rBlock) <= maxa:
786-
readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName)
875+
readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
787876
numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
788877
numLoc[1] = numLoc[1] + 1
789878

0 commit comments

Comments
 (0)