seqcode
diff --git a/‎Allo/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎Allo/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Allo/allo‎
Lines changed: 16 additions & 10 deletions b/‎Allo/allo‎
Lines changed: 16 additions & 10 deletions
diff --git a/‎Allo/allocation.py‎
Lines changed: 46 additions & 44 deletions b/‎Allo/allocation.py‎
Lines changed: 46 additions & 44 deletions
diff --git a/‎Allo/atac.h5‎
285 KB b/‎Allo/atac.h5‎
285 KB
diff --git a/‎Allo/atac.json‎
Lines changed: 1 addition & 0 deletions b/‎Allo/atac.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Allo/dnase.h5‎
285 KB b/‎Allo/dnase.h5‎
285 KB
@@ -1,4 +1,4 @@
 from . import predictPeak
 from . import allocation
 
-__version__ = '1.0.5'
+__version__ = '1.1.0'
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 #Lexi Morrissey, Mahony Lab @ Pennsylvania State University
-#Last updated 03.06.2023
 #Main method for Allo. Splits the sam files up and sends them to allocation procedure via multiprocessing package.
 
 #Arguments
@@ -13,8 +12,9 @@ parser.add_argument('input')
 parser.add_argument('-seq', type=str, nargs=1, help='Single-end or paired-end sequencing mode', \
                    choices=['pe','se'], required=True, dest='seq')
 parser.add_argument('-o', type=str, nargs=1, help='Output file name', dest='outfile', default=None)
-parser.add_argument('--mixed', help='Use CNN trained on a dataset with mixed peaks, narrow by default', action='store_true', default=None)
-parser.add_argument('--rna', help='Use CNN trained on a RNAseq dataset', action='store_true', default=None)
+parser.add_argument('--mixed', help='Use CNN trained on a dataset with mixed ChIP-seq peaks, narrow by default', action='store_true', default=None)
+parser.add_argument('--dnase', help='Use CNN trained on DNase-seq datasets', action='store_true', default=None)
+parser.add_argument('--atac', help='Use CNN trained on ATAC-seq datasets', action='store_true', default=None)
 parser.add_argument('--splice', help='Remove splice sites based on cigar string when constructing image', action='store_true', default=None)
 parser.add_argument('-p', type=int, nargs=1, help='Number of processes, 1 by default', dest='processes', default=None)
 parser.add_argument('-max', type=int, nargs=1, help='Maximum value for number of locations a read can map', dest='maxlocations', default=None)
@@ -28,20 +28,21 @@ parser.add_argument('--parser', help='Ignore warnings about read sorting', actio
 args = parser.parse_args()
 
 #Imports
+import Allo
+from Allo import allocation
 import sys
 from joblib import Parallel, delayed
 import multiprocessing
 import subprocess
 import os
-from Allo import allocation
 import math
 from random import randint
-import shutil
 import glob
 import pysam
 import shutil
 import pkgutil
-import Allo
+
+
 
 
 #Function for concatenating files
@@ -50,11 +51,12 @@ def cat(files, outname):
         for f in files:
             with open(f,'rb') as fd:
                 shutil.copyfileobj(fd, wfd)
+            
 
 ##Main Method##          
 if __name__ == '__main__':
 
-    print("\nRunning Allo version 1.0\n\n")
+    print("\nRunning Allo version 1.1\n\n")
 
     #Make a folder to store all temp files in allo
     ids = str(randint(0, 10000))
@@ -86,9 +88,13 @@ if __name__ == '__main__':
         d = os.path.dirname(sys.modules["Allo"].__file__)
         m = os.path.join(d, "mixed")
         winSize = 500
-    elif args.rna is not None:
+    elif args.dnase is not None:
+        d = os.path.dirname(sys.modules["Allo"].__file__)
+        m = os.path.join(d, "dnase")
+        winSize = 500
+    elif args.atac is not None:
         d = os.path.dirname(sys.modules["Allo"].__file__)
-        m = os.path.join(d, "rna")
+        m = os.path.join(d, "atac")
         winSize = 500
     else:
         d = os.path.dirname(sys.modules["Allo"].__file__)
@@ -111,7 +117,7 @@ if __name__ == '__main__':
         rc = 2
     else:
         rc = 0
-        print("Using neural network...", flush=True)
+        print("Neural network mode on...", flush=True)
     #Keep unmapped reads
     if args.keep_unmap is not None:
         keep = 1
 
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 #Lexi Morrissey, Mahony Lab @ Pennsylvania State University
-#Last updated 04.22.2024
 #Contains methods for read allocation procedure of Allo.
 
 from Allo import predictPeak
@@ -79,7 +78,7 @@ def getArray(read, winSize, genLand, spliceD):
         #print("cigar: " + read[5], flush=True)
         for i in range(0,len(chars)):
             if chars[i]=="M" or chars[i]=="D" or chars[i]=="=" or chars[i]=="X":
-                r_end = r_end + int(num[i])
+                r_end = r_end + int(num[i]) - 1
             elif chars[i]=="N":
                 gap_loc[r_end+1]=int(num[i])
                 r_end = r_end + int(num[i])
@@ -90,10 +89,10 @@ def getArray(read, winSize, genLand, spliceD):
         while l <= math.floor(winSize/2):
             #print("up: " + str(k), flush=True)
             key = chr + ";" + str(k)
-            if key in spliceD and spliceD[key] < 0:
+            '''if key in spliceD and spliceD[key] < 0:
                 #print("splice: " + str(splice[key]), flush=True)
                 k = k + spliceD[key]
-                key = chr + ";" + str(k)
+                key = chr + ";" + str(k)'''
             if key in genLand:
                 array.insert(0,genLand[key])
             else:
@@ -104,10 +103,9 @@ def getArray(read, winSize, genLand, spliceD):
         #Downstream counts 
         k = start
         l = 0
-        #print(gap_loc)
         while l <= math.floor(winSize/2):
+            #Taking splice information from read if present
             while k < r_end:
-                #print("down: " + str(k), flush=True)
                 key = chr + ";" + str(k)
                 if k in gap_loc:
                     k = k + gap_loc[k]
@@ -118,11 +116,11 @@ def getArray(read, winSize, genLand, spliceD):
                     array.append(0)
                 k += 1
                 l += 1
-            #print("down2: " + str(k), flush=True)
+            #Else use previously found splice info
             key = chr + ";" + str(k)
-            if key in spliceD and spliceD[key] > 0:
+            '''if key in spliceD and spliceD[key] > 0:
                 k = k + spliceD[key]
-                key = chr + ";" + str(k)
+                key = chr + ";" + str(k)'''
             if key in genLand:
                 array.append(genLand[key])
             else:
@@ -144,7 +142,7 @@ def getArray(read, winSize, genLand, spliceD):
 
 
 #Assign reads (straight to dictionary for uniq and actual assign for multi-mapped)
-def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD):
+def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, spliceD):
     random.seed(7)      #To make results reproducible
 
     ##Multi-mapped reads##
@@ -154,28 +152,35 @@ def readAssign(rBlock, samOut, winSize, genLand, model, cnn_scores, rc, rmz, mod
     allZ = True #seeing if all zero regions
     for i in rBlock:
         #Find closest 100 window, use that score instead if it's already been assigned, saves time
-        pos = i[2]+str(i[3])
-        countArray = getArray(i, winSize, genLand, spliceD)
-        s = sum(countArray)
-        if s > 0:
+        if spliceD:
+            pos = i[2]+";"+str(i[3])
+        else:
+            pos = i[2]+";"+str(round(int(i[3])/100)*100)
+        if pos in cnn_scores:
+            scores_nn.append(cnn_scores[pos])
             allZ = False
-        #Allocation options
-        if rc == 1:
-            if s == 0:
+        else:
+            countArray = getArray(i, winSize, genLand, spliceD)
+            s = sum(countArray)
+            if s > 0:
+                allZ = False
+            #Allocation options
+            if rc == 1:
+                if s == 0:
+                    scores_rc.append(1)
+                else:
+                    scores_rc.append(s+1)
+                continue
+            if rc == 2:
                 scores_rc.append(1)
+                continue
+            #Use no read score if zero region
+            if s == 0:
+                scores_nn.append(0.0012*(s+1))
             else:
-                scores_rc.append(s+1)
-            continue
-        if rc == 2:
-            scores_rc.append(1)
-            continue
-        #Use no read score if zero region
-        if s == 0:
-            scores_nn.append(0.0012*(s+1))
-        else:
-            nn = predictPeak.predictNN(countArray, winSize, model)
-            scores_nn.append(nn*(s+1))
-            cnn_scores[pos] = (nn*(s+1))
+                nn = predictPeak.predictNN(countArray, winSize, model)
+                scores_nn.append(nn*(s+1))
+                cnn_scores[pos] = (nn*(s+1))
 
     #Removing reads that mapped to all zero regions
     if allZ and rmz == 1:
@@ -214,7 +219,6 @@ def parseUniq(tempFile, winSize, cnn_scores, AS, rc, keep):
     cu = 0  #UMRs
     cf = 0  #Filtered
 
-    modelName = None
     model = None
     rmz = None
     rBlock = []
@@ -388,7 +392,6 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
             sys.exit(0)
     else:
         model = None
-        modelName = None
     #Exception that causes errors
     if os.stat(tempFile+"MM").st_size == 0:
         return numLoc
@@ -432,7 +435,7 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
                         rBlock = []
                         rBlock.append(r)
                         continue
-                    readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
+                    readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, spliceD)
                     #Getting average number of locations mapped to
                     numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
                     numLoc[1] = numLoc[1] + 1
@@ -442,7 +445,7 @@ def parseMulti(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rmz,
 
     #For last read
     if maxa is None or len(rBlock) <= maxa:
-        readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
+        readAssign(rBlock, AL, winSize, genLand, model, cnn_scores, rc, rmz, spliceD)
         numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
         numLoc[1] = numLoc[1] + 1
 
@@ -457,7 +460,6 @@ def parseUniqPE(tempFile, winSize, cnn_scores, AS, rc, keep, r2):
     cu = 0
     cf = 0
 
-    modelName = None
     model = None
     rmz = None
     if "border" in tempFile:
@@ -698,7 +700,7 @@ def parseUniqPE(tempFile, winSize, cnn_scores, AS, rc, keep, r2):
 
 
 #Assign reads (straight to dictionary for uniq and actual assign for multi-mapped)
-def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD):
+def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, rc, rmz, spliceD):
     random.seed(7)      #To make results reproducible
     ##Multi-mapped reads##
     ###CNN###
@@ -707,9 +709,12 @@ def readAssignPE(rBlock, rBlock2, samOut, winSize, genLand, model, cnn_scores, r
     allZ = True #seeing if all zero regions
     for i in rBlock:
         #Find closest 100 window, use that score instead if it's already been assigned, saves time
-        pos = i[2]+str(round(int(i[3])/100)*100)
+        if spliceD:
+            pos = i[2]+";"+str(i[3])
+        else:
+            pos = i[2]+";"+str(round(int(i[3])/100)*100)
         if pos in cnn_scores:
-            #scores_nn.append(cnn_scores[pos])
+            scores_nn.append(cnn_scores[pos])
             allZ = False
         else:
             countArray = getArray(i, winSize, genLand, spliceD)
@@ -780,16 +785,11 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
             model = tf.keras.models.model_from_json(loaded_model_json)
             model.load_weights(modelName+'.h5')
             model = LiteModel.from_keras_model(model)
-            if "mixed" in modelName:
-                modelName = 1
-            else:
-                modelName = 0
         except:
             print("Could not load Tensorflow model :( Allo was written with Tensorflow version 2.11")
             sys.exit(0)
     else:
         model = None
-        modelName = None
 
     #Exception that causes errors
     if os.stat(tempFile+"MM").st_size == 0:
@@ -852,7 +852,7 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
             else:
                 if maxa is not None and len(rBlock) > maxa:
                     continue
-                readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
+                readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, spliceD)
                 numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
                 numLoc[1] = numLoc[1] + 1
                 rBlock = []
@@ -864,7 +864,7 @@ def parseMultiPE(tempFile, winSize, genLand, modelName, cnn_scores, rc, keep, rm
 
     #For last read
     if maxa is None or len(rBlock) <= maxa:
-        readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, modelName, spliceD)
+        readAssignPE(rBlock, rBlock2, AL, winSize, genLand, model, cnn_scores, rc, rmz, spliceD)
         numLoc[0] = (numLoc[0]*numLoc[1] + len(rBlock)) / (numLoc[1]+1)
         numLoc[1] = numLoc[1] + 1
 
@@ -915,3 +915,5 @@ def predict_single(self, inp):
         self.interpreter.invoke()
         out = self.interpreter.get_tensor(self.output_index)
         return out[0]
+
+
@@ -0,0 +1 @@
+{"class_name": "Sequential", "config": {"name": "sequential_2", "layers": [{"module": "keras.layers", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 100, 100], "dtype": "float32", "sparse": false, "ragged": false, "name": "conv1d_6_input"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_6", "trainable": true, "dtype": "float32", "batch_input_shape": [null, 100, 100], "filters": 2, "kernel_size": [64], "strides": [1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 100, 100]}}, {"module": "keras.layers", "class_name": "AveragePooling1D", "config": {"name": "average_pooling1d_2", "trainable": true, "dtype": "float32", "strides": [2], "pool_size": [2], "padding": "valid", "data_format": "channels_last"}, "registered_name": null, "build_config": {"input_shape": [null, 100, 2]}}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_7", "trainable": true, "dtype": "float32", "filters": 2, "kernel_size": [32], "strides": [1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, 
"bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_8", "trainable": true, "dtype": "float32", "filters": 2, "kernel_size": [32], "strides": [1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "dtype": "float32", "rate": 0.1, "noise_shape": null, "seed": null}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Flatten", "config": {"name": "flatten_2", "trainable": true, "dtype": "float32", "data_format": "channels_last"}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense_4", "trainable": true, "dtype": "float32", "units": 528, "activation": "relu", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, 
"registered_name": null, "build_config": {"input_shape": [null, 100]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense_5", "trainable": true, "dtype": "float32", "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 528]}}]}, "keras_version": "2.15.0", "backend": "tensorflow"}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"class_name": "Sequential", "config": {"name": "sequential_2", "layers": [{"module": "keras.layers", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 100, 100], "dtype": "float32", "sparse": false, "ragged": false, "name": "conv1d_6_input"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_6", "trainable": true, "dtype": "float32", "batch_input_shape": [null, 100, 100], "filters": 2, "kernel_size": [64], "strides": [1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 100, 100]}}, {"module": "keras.layers", "class_name": "AveragePooling1D", "config": {"name": "average_pooling1d_2", "trainable": true, "dtype": "float32", "strides": [2], "pool_size": [2], "padding": "valid", "data_format": "channels_last"}, "registered_name": null, "build_config": {"input_shape": [null, 100, 2]}}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_7", "trainable": true, "dtype": "float32", "filters": 2, "kernel_size": [32], "strides": [1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, 
"bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_8", "trainable": true, "dtype": "float32", "filters": 2, "kernel_size": [32], "strides": [1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "dtype": "float32", "rate": 0.1, "noise_shape": null, "seed": null}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Flatten", "config": {"name": "flatten_2", "trainable": true, "dtype": "float32", "data_format": "channels_last"}, "registered_name": null, "build_config": {"input_shape": [null, 50, 2]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense_4", "trainable": true, "dtype": "float32", "units": 528, "activation": "relu", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, 
"registered_name": null, "build_config": {"input_shape": [null, 100]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense_5", "trainable": true, "dtype": "float32", "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 528]}}]}, "keras_version": "2.15.0", "backend": "tensorflow"}