Skip to content

Commit 62c266e

Browse files
author
Alexis Morrissey
committed
Adding additional neural networks
1 parent 89423d2 commit 62c266e

File tree

9 files changed

+70
-61
lines changed

9 files changed

+70
-61
lines changed

Allo/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from . import predictPeak
22
from . import allocation
33

4-
__version__ = '1.0.5'
4+
__version__ = '1.1.0'

Allo/allo

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#!/usr/bin/env python
22
#Lexi Morrissey, Mahony Lab @ Pennsylvania State University
3-
#Last updated 03.06.2023
43
#Main method for Allo. Splits the sam files up and sends them to allocation procedure via multiprocessing package.
54

65
#Arguments
@@ -13,8 +12,9 @@ parser.add_argument('input')
1312
parser.add_argument('-seq', type=str, nargs=1, help='Single-end or paired-end sequencing mode', \
1413
choices=['pe','se'], required=True, dest='seq')
1514
parser.add_argument('-o', type=str, nargs=1, help='Output file name', dest='outfile', default=None)
16-
parser.add_argument('--mixed', help='Use CNN trained on a dataset with mixed peaks, narrow by default', action='store_true', default=None)
17-
parser.add_argument('--rna', help='Use CNN trained on a RNAseq dataset', action='store_true', default=None)
15+
parser.add_argument('--mixed', help='Use CNN trained on a dataset with mixed ChIP-seq peaks, narrow by default', action='store_true', default=None)
16+
parser.add_argument('--dnase', help='Use CNN trained on a DNa-seq datasets', action='store_true', default=None)
17+
parser.add_argument('--atac', help='Use CNN trained on a ATAC-seq datasets', action='store_true', default=None)
1818
parser.add_argument('--splice', help='Remove splice sites based on cigar string when constructing image', action='store_true', default=None)
1919
parser.add_argument('-p', type=int, nargs=1, help='Number of processes, 1 by default', dest='processes', default=None)
2020
parser.add_argument('-max', type=int, nargs=1, help='Maximum value for number of locations a read can map', dest='maxlocations', default=None)
@@ -28,20 +28,21 @@ parser.add_argument('--parser', help='Ignore warnings about read sorting', actio
2828
args = parser.parse_args()
2929

3030
#Imports
31+
import Allo
32+
from Allo import allocation
3133
import sys
3234
from joblib import Parallel, delayed
3335
import multiprocessing
3436
import subprocess
3537
import os
36-
from Allo import allocation
3738
import math
3839
from random import randint
39-
import shutil
4040
import glob
4141
import pysam
4242
import shutil
4343
import pkgutil
44-
import Allo
44+
45+
4546

4647

4748
#Function for concatenating files
@@ -50,11 +51,12 @@ def cat(files, outname):
5051
for f in files:
5152
with open(f,'rb') as fd:
5253
shutil.copyfileobj(fd, wfd)
54+
5355

5456
##Main Method##
5557
if __name__ == '__main__':
5658

57-
print("\nRunning Allo version 1.0\n\n")
59+
print("\nRunning Allo version 1.1\n\n")
5860

5961
#Make a folder to store all temp files in allo
6062
ids = str(randint(0, 10000))
@@ -86,9 +88,13 @@ if __name__ == '__main__':
8688
d = os.path.dirname(sys.modules["Allo"].__file__)
8789
m = os.path.join(d, "mixed")
8890
winSize = 500
89-
elif args.rna is not None:
91+
elif args.dnase is not None:
92+
d = os.path.dirname(sys.modules["Allo"].__file__)
93+
m = os.path.join(d, "dnase")
94+
winSize = 500
95+
elif args.atac is not None:
9096
d = os.path.dirname(sys.modules["Allo"].__file__)
91-
m = os.path.join(d, "rna")
97+
m = os.path.join(d, "atac")
9298
winSize = 500
9399
else:
94100
d = os.path.dirname(sys.modules["Allo"].__file__)
@@ -111,7 +117,7 @@ if __name__ == '__main__':
111117
rc = 2
112118
else:
113119
rc = 0
114-
print("Using neural network...", flush=True)
120+
print("Neural network mode on...", flush=True)
115121
#Keep unmapped reads
116122
if args.keep_unmap is not None:
117123
keep = 1

Allo/allocation.py

Lines changed: 46 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#!/usr/bin/env python
22
#Lexi Morrissey, Mahony Lab @ Pennsylvania State University
3-
#Last updated 04.22.2024
43
#Contains methods for read allocation procedure of Allo.
54

65
from Allo import predictPeak
@@ -79,7 +78,7 @@ def getArray(read, winSize, genLand, spliceD):
7978
#print("cigar: " + read[5], flush=True)
8079
for i in range(0,len(chars)):
8180
if chars[i]=="M" or chars[i]=="D" or chars[i]=="=" or chars[i]=="X":
82-
r_end = r_end + int(num[i])
81+
r_end = r_end + int(num[i]) - 1
8382
elif chars[i]=="N":
8483
gap_loc[r_end+1]=int(num[i])
8584
r_end = r_end + int(num[i])
@@ -90,10 +89,10 @@ def getArray(read, winSize, genLand, spliceD):
9089
while l <= math.floor(winSize/2):
9190
#print("up: " + str(k), flush=True)
9291
key = chr + ";" + str(k)
93-
if key in spliceD and spliceD[key] < 0:
92+
'''if key in spliceD and spliceD[key] < 0:
9493
#print("splice: " + str(splice[key]), flush=True)
9594
k = k + spliceD[key]
96-
key = chr + ";" + str(k)
95+
key = chr + ";" + str(k)'''
9796
if key in genLand:
9897
array.insert(0,genLand[key])
9998
else:
@@ -104,10 +103,9 @@ def getArray(read, winSize, genLand, spliceD):
104103
#Downstream counts
105104
k = start
106105
l = 0
107-
#print(gap_loc)
108106
while l <= math.floor(winSize/2):
107+
#Taking splice information from read if present
109108
while k < r_end:
110-
#print("down: " + str(k), flush=True)
111109
key = chr + ";" + str(k)
112110
if k in gap_loc:
113111
k = k + gap_loc[k]
@@ -118,11 +116,11 @@ def getArray(read, winSize, genLand, spliceD):
118116
array.append(0)
119117
k += 1
120118
l += 1
121-
#print("down2: " + str(k), flush=True)
119+
#Else use previously found splice info
122120
key = chr + ";" + str(k)
123-
if key in spliceD and spliceD[key] > 0:
121+
'''if key in spliceD and spliceD[key] > 0:
124122
k = k + spliceD[key]
125-
key = chr + ";" + str(k)
123+
key = chr + ";" + str(k)'''
126124
if key in genLand:
127125
array.append(genLand[key])
128126
else:
@@ -144,7 +142,7 @@ def getArray(read, winSize, genLand, spliceD):
144142

145143

146144
#Assign reads (straight to dictionary for uniq and actual assign for multi-mapped)
147-
def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD):
145+
def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, spliceD):
148146
random.seed(7) #To make results reproducible
149147

150148
##Multi-mapped reads##
@@ -154,28 +152,35 @@ def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, mod
154152
allZ = True #seeing if all zero regions
155153
for i in rBlock:
156154
#Find closest 100 window, use that score instead if it's already been assigned, saves time
157-
pos = i[2]+str(i[3])
158-
countArray = getArray(i, winSize, genLand, spliceD)
159-
s = sum(countArray)
160-
if s > 0:
155+
if spliceD:
156+
pos = i[2]+";"+str(i[3])
157+
else:
158+
pos = i[2]+";"+str(round(int(i[3])/100)*100)
159+
if pos in cnn_scores:
160+
scores_nn.append(cnn_scores[pos])
161161
allZ = False
162-
#Allocation options
163-
if rc == 1:
164-
if s == 0:
162+
else:
163+
countArray = getArray(i, winSize, genLand, spliceD)
164+
s = sum(countArray)
165+
if s > 0:
166+
allZ = False
167+
#Allocation options
168+
if rc == 1:
169+
if s == 0:
170+
scores_rc.append(1)
171+
else:
172+
scores_rc.append(s+1)
173+
continue
174+
if rc == 2:
165175
scores_rc.append(1)
176+
continue
177+
#Use no read score if zero region
178+
if s == 0:
179+
scores_nn.append(0.0012*(s+1))
166180
else:
167-
scores_rc.append(s+1)
168-
continue
169-
if rc == 2:
170-
scores_rc.append(1)
171-
continue
172-
#Use no read score if zero region
173-
if s == 0:
174-
scores_nn.append(0.0012*(s+1))
175-
else:
176-
nn = predictPeak.predictNN(countArray, winSize, model)
177-
scores_nn.append(nn*(s+1))
178-
cnn_scores[pos] = (nn*(s+1))
181+
nn = predictPeak.predictNN(countArray, winSize, model)
182+
scores_nn.append(nn*(s+1))
183+
cnn_scores[pos] = (nn*(s+1))
179184

180185
#Removing reads that mapped to all zero regions
181186
if allZ and rmz == 1:
@@ -214,7 +219,6 @@ def parseUniq(tempFile, winSize, cnn_scores, AS, rc, keep):
214219
cu = 0 #UMRs
215220
cf = 0 #Filtered
216221

217-
modelName = None
218222
model = None
219223
rmz = None
220224
rBlock = []
@@ -388,7 +392,6 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
388392
sys.exit(0)
389393
else:
390394
model = None
391-
modelName = None
392395
#Exception that causes errors
393396
if os.stat(tempFile+"MM").st_size == 0:
394397
return numLoc
@@ -432,7 +435,7 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
432435
rBlock = []
433436
rBlock.append(r)
434437
continue
435-
readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
438+
readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, spliceD)
436439
#Getting average number of locations mapped to
437440
numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
438441
numLoc[1] = numLoc[1] + 1
@@ -442,7 +445,7 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
442445

443446
#For last read
444447
if maxa is None or len(rBlock) <= maxa:
445-
readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
448+
readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, spliceD)
446449
numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
447450
numLoc[1] = numLoc[1] + 1
448451

@@ -457,7 +460,6 @@ def parseUniqPE(tempFile, winSize, cnn_scores, AS, rc, keep, r2):
457460
cu = 0
458461
cf = 0
459462

460-
modelName = None
461463
model = None
462464
rmz = None
463465
if "border" in tempFile:
@@ -698,7 +700,7 @@ def parseUniqPE(tempFile, winSize, cnn_scores, AS, rc, keep, r2):
698700

699701

700702
#Assign reads (straight to dictionary for uniq and actual assign for multi-mapped)
701-
def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD):
703+
def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, rc, rmz, spliceD):
702704
random.seed(7) #To make results reproducible
703705
##Multi-mapped reads##
704706
###CNN###
@@ -707,9 +709,12 @@ def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, r
707709
allZ = True #seeing if all zero regions
708710
for i in rBlock:
709711
#Find closest 100 window, use that score instead if it's already been assigned, saves time
710-
pos = i[2]+str(round(int(i[3])/100)*100)
712+
if spliceD:
713+
pos = i[2]+";"+str(i[3])
714+
else:
715+
pos = i[2]+";"+str(round(int(i[3])/100)*100)
711716
if pos in cnn_scores:
712-
#scores_nn.append(cnn_scores[pos])
717+
scores_nn.append(cnn_scores[pos])
713718
allZ = False
714719
else:
715720
countArray = getArray(i, winSize, genLand, spliceD)
@@ -780,16 +785,11 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
780785
model = tf.keras.models.model_from_json(loaded_model_json)
781786
model.load_weights(modelName+'.h5')
782787
model = LiteModel.from_keras_model(model)
783-
if "mixed" in modelName:
784-
modelName = 1
785-
else:
786-
modelName = 0
787788
except:
788789
print("Could not load Tensorflow model :( Allo was written with Tensorflow version 2.11")
789790
sys.exit(0)
790791
else:
791792
model = None
792-
modelName = None
793793

794794
#Exception that causes errors
795795
if os.stat(tempFile+"MM").st_size == 0:
@@ -852,7 +852,7 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
852852
else:
853853
if maxa is not None and len(rBlock) > maxa:
854854
continue
855-
readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
855+
readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, spliceD)
856856
numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
857857
numLoc[1] = numLoc[1] + 1
858858
rBlock = []
@@ -864,7 +864,7 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
864864

865865
#For last read
866866
if maxa is None or len(rBlock) <= maxa:
867-
readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
867+
readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, spliceD)
868868
numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
869869
numLoc[1] = numLoc[1] + 1
870870

@@ -915,3 +915,5 @@ def predict_single(self, inp):
915915
self.interpreter.invoke()
916916
out = self.interpreter.get_tensor(self.output_index)
917917
return out[0]
918+
919+

Allo/atac.h5

285 KB
Binary file not shown.

Allo/atac.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"class_name": "Sequential", "config": {"name": "sequential_2", "layers": [{"module": "keras.layers", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 100, 100], "dtype": "float32", "sparse": false, "ragged": false, "name": "conv1d_6_input"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_6", "trainable": true, "dtype": "float32", "batch_input_shape": [null, 100, 100], "filters": 2, "kernel_size": [64], "strides": [1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 100, 100]}}, {"module": "keras.layers", "class_name": "AveragePooling1D", "config": {"name": "average_pooling1d_2", "trainable": true, "dtype": "float32", "strides": [2], "pool_size": [2], "padding": "valid", "data_format": "channels_last"}, "registered_name": null, "build_config": {"input_shape": [null, 100, 2]}}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_7", "trainable": true, "dtype": "float32", "filters": 2, "kernel_size": [32], "strides": [1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_8", "trainable": true, "dtype": "float32", "filters": 2, "kernel_size": [32], "strides": [1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "dtype": "float32", "rate": 0.1, "noise_shape": null, "seed": null}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Flatten", "config": {"name": "flatten_2", "trainable": true, "dtype": "float32", "data_format": "channels_last"}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense_4", "trainable": true, "dtype": "float32", "units": 528, "activation": "relu", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 100]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense_5", "trainable": true, "dtype": "float32", "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 528]}}]}, "keras_version": "2.15.0", "backend": "tensorflow"}

Allo/dnase.h5

285 KB
Binary file not shown.

0 commit comments

Comments
 (0)