-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathdeepmod2
More file actions
174 lines (117 loc) · 12.4 KB
/
deepmod2
File metadata and controls
174 lines (117 loc) · 12.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env python
import time, itertools, torch
import datetime, os, shutil, argparse, sys, pysam
from src import utils
if __name__ == '__main__':
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--print_models", help='Print details of models available', default=False, action='store_true')
main_subparsers = parser.add_subparsers(title="Options", dest="option")
parent_parser = argparse.ArgumentParser(add_help=False,)
parent_parser.add_argument("--prefix", help='Prefix for the output files',type=str, default='output')
parent_parser.add_argument("--output", help= 'Path to folder where intermediate and final files will be stored, default is current working directory', type=str)
parent_parser.add_argument("--qscore_cutoff", help='Minimum cutoff for mean quality score of a read',type=float, default=0)
parent_parser.add_argument("--length_cutoff", help='Minimum cutoff for read length',type=int, default=0)
parent_parser.add_argument("--mod_t", help= 'Probability threshold for a per-read prediction to be considered modified. Only predictiond with probability >= mod_t will be considered as modified for calculation of per-site modification levels.', default=0.5, type=float)
parent_parser.add_argument("--unmod_t", help= 'Probability threshold for a per-read prediction to be considered unmodified. Only predictiond with probability < unmod_t will be considered as unmodified for calculation of per-site modification levels.', default=0.5, type=float)
parent_parser.add_argument("--include_non_cpg_ref", help='Include non-CpG reference loci in per-site output where reads have CpG motif.',default=False, action='store_true')
detect_parser = main_subparsers.add_parser("detect", parents=[parent_parser],
add_help=True,
help="Call methylation from Guppy or Dorado basecalled POD5/FAST5 files using move tables for signal alignment.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
detect_required=detect_parser.add_argument_group("Required Arguments")
detect_parser.add_argument("--motif", help='Motif for detecting modifications followed by zero-based indices of nucleotides within the motif to call modification on. Default is CpG methylation "CG 0". Modification will be called for motif matches on the read and reference unless --reference_motif_only is used to restrict to reference motif matches only. Multiple indices can be specified but they should refer to the same nucleotide letter. The motif and each index listed should be separated by whitespace, e.g. "--motif CGCG 0 2"', nargs='*', default=['CG','0'])
detect_parser.add_argument("--mod_symbol", help='Symbol to use for modified base in BAM tag MM. Default is to use "m" for 5mC in CG motif, and for other motifs the default is the canonical nucleotide symbol.' , type=str)
detect_parser.add_argument("--mod_positions", help='A tab separated list of reference coordinates to call modification on. Modifications will only be called on reference positions specified that overlap with the motif, and no modification detection will be performed on other loci. The file shoule have the following format: "contig position strand" on each line. Position should be zero-based and strand should be "+" or "-".', default=None)
detect_parser.add_argument("--reference_motif_only", help='Restrict modification calling to reference motif matches only', default=False, action='store_true')
detect_parser.add_argument("--seq_type", help='Specify DNA or direct RNA sequencing.',choices=['dna','rna'], type=str,required=True)
detect_parser.add_argument("--threads", help='Number of threads to use for processing signal and running model inference. If a GPU is used for inference, then --threads number of threads will be running on GPU concurrently. The total number of threads used by DeepMod2 is equal to --threads plus --bam_threads. It is recommended to run DeepMod2 with mutliple cores, and use at least 4 bam_threads for compressing BAM file.',type=int, default=4)
detect_parser.add_argument("--ref", help='Path to reference FASTA file to anchor methylation calls to reference loci. If no reference is provided, only the motif loci on reads will be used.', type=str)
detect_required.add_argument("--model", help='Name of the model. Recommended model for R9.4.1 flowcells is "bilstm_r9.4.1", for R10.4.1 flowcell (5kHz sampling) it is "bilstm_r10.4.1_5khz_v4.3", and for R10.4.1 flowcell (4kHz sampling) it is "bilstm_r10.4.1_4khz_v4.1". Use --print_models to display all models available along with compatible basecaller models. For custom models, provide the model config file and model checkpoint path separated by comma, e.g. "model.cfg,modelfile"',type=str, required=True)
detect_required.add_argument("--bam", help='Path to aligned or unaligned BAM file. It is ideal to have move table in BAM file but move table from FAST5 fies can also be used. Aligned BAM file is required for reference anchored methylation calls, otherwise only the motif loci on reads will be called.', type=str, required=True)
detect_required.add_argument("--file_type", help='Specify whether the signal is in FAST5 or POD5 file format. If POD5 file is used, then move table must be in BAM file.',choices=['fast5','pod5'], type=str,required=True)
detect_required.add_argument("--input", help='Path to POD5/FAST5 file or folder containing POD5/FAST5 files. If folder provided, then POD5/FAST5 files will be recusrviely searched', type=str, required=True)
detect_parser.add_argument("--guppy_group", help='Name of the guppy basecall group if move table is in FAST5 file.',type=str, default='Basecall_1D_000')
detect_parser.add_argument("--chrom", nargs='*', help='A space/whitespace separated list of contigs, e.g. chr3 chr6 chr22. If not list is provided then all chromosomes in the reference are used.')
detect_parser.add_argument("--fast5_move", help='Use move table from FAST5 file instead of BAM file. If this flag is set, specify a basecall group for FAST5 file using --guppy_group parameter and ensure that the FAST5 files contains move table.', default=False, action='store_true')
detect_parser.add_argument("--skip_per_site", help='Skip per site output', default=False, action='store_true')
detect_parser.add_argument("--device", help='Device to use for running pytorch models. you can set --device=cpu for cpu, or --device=cuda for GPU. You can also specify a particular GPU device such as --device=cuda:0 or --device=cuda:1 . If --device paramater is not set by user, then GPU will be used if available otherwise CPU will be used.', type=str)
detect_parser.add_argument("--disable_pruning", help='Disable model pruning (not recommended for CPU inference). By default models are pruned to remove some weights with low L1 norm in linear layers. Pruning has little effect on model accuracy, it can signifcantly improve CPU inference time but not GPU inference time.', default=False, action='store_true')
detect_parser.add_argument("--exclude_ref_features", help='Exclude reference sequence from feature matrix. By default, if a reference FASTA file is provided via --ref parameter, then the reference sequence is added as a feature for aligned reads, but not if a read is unmapped or if no reference is provided.', default=False, action='store_true')
detect_parser.add_argument("--batch_size", help='Batch size to use for GPU inference. For CPU inference, batch size is fixed at 512.',type=int, default=1024)
detect_parser.add_argument("--bam_threads", help='Number of threads to use for compressed BAM output. Setting it lower than 4 can significantly lower the runtime.',type=int, default=4)
detect_parser.add_argument("--skip_unmapped", help='Skip unmapped reads from methylation calling. If --chrom is used then unmapped are automatically skipped.', default=False, action='store_true')
merge_parser = main_subparsers.add_parser("merge", parents=[parent_parser],
add_help=True,
help="Merge per-read calls into per-site calls", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
merge_parser.add_argument("--input", nargs='*', help= 'List of paths of per-read methylation calls to merge. File paths should be separated by space/whitespace. Use either --input or --list argument, but not both.')
merge_parser.add_argument("--list", help= 'A file containing paths to per-read methylation calls to merge (one per line). Use either --inputs or --list argument, but not both.', type=str)
merge_parser.add_argument("--cpg_output", help= 'Create an additional per-site output file with forward and negative strand counts for CpG sites combined.', default=False, action='store_true')
if len(sys.argv)==1:
parser.print_help()
parser.exit()
elif len(sys.argv)==2:
if sys.argv[1]=='merge':
merge_parser.print_help()
merge_parser.exit()
elif sys.argv[1]=='detect':
detect_parser.print_help()
detect_parser.exit()
args = parser.parse_args()
if args.print_models:
utils.get_model_help()
parser.exit()
t=time.time()
print('%s: Starting DeepMod2.' %str(datetime.datetime.now()), flush=True)
if not args.output:
args.output=os.getcwd()
os.makedirs(args.output, exist_ok=True)
if args.option=='merge':
if args.input:
input_list= args.input
elif args.list:
with open(args.list,'r') as file_list:
input_list=[x.rstrip('\n') for x in file_list.readlines()]
params={'output':args.output, 'prefix':args.prefix, 'qscore_cutoff':args.qscore_cutoff,
'length_cutoff':args.length_cutoff, 'mod_t':args.mod_t,
'unmod_t':args.unmod_t,'include_non_cpg_ref':args.include_non_cpg_ref, 'cpg_output':args.cpg_output}
site_pred_file=utils.get_per_site(params, input_list)
else:
if args.chrom:
args.skip_unmapped=True
chrom_list=args.chrom
else:
chrom_list=pysam.AlignmentFile(args.bam,'rb',check_sq=False).references
if args.device:
dev=args.device
else:
if torch.cuda.is_available():
dev = "cuda"
else:
dev = "cpu"
motif_seq, exp_motif_seq, motif_ind, valid_motif=utils.motif_check(args.motif)
if not valid_motif:
sys.exit(3)
params={'input':args.input, 'output':args.output, 'threads':args.threads,
'prefix':args.prefix, 'model':args.model,
'qscore_cutoff':args.qscore_cutoff, 'ref':args.ref,
'length_cutoff':args.length_cutoff, 'bam':args.bam,
'file_type':args.file_type, 'fast5_move':args.fast5_move,
'guppy_group':args.guppy_group,
'mod_t':args.mod_t, 'unmod_t':args.unmod_t, 'include_non_cpg_ref': args.include_non_cpg_ref,
'skip_per_site':args.skip_per_site, 'chrom_list':chrom_list, "dev":dev,
'disable_pruning':args.disable_pruning, 'batch_size':args.batch_size,
'exclude_ref_features':args.exclude_ref_features,'bam_threads':args.bam_threads,
'skip_unmapped':args.skip_unmapped, 'mod_positions':args.mod_positions,
'motif_seq':motif_seq, 'motif_ind':motif_ind,'exp_motif_seq':exp_motif_seq,
'reference_motif_only':args.reference_motif_only,
'seq_type':args.seq_type,
'mod_symbol':args.mod_symbol
}
print('\n%s: \nCommand: python %s\n' %(str(datetime.datetime.now()), ' '.join(sys.argv)), flush=True)
with open(os.path.join(args.output,'args'),'w') as file:
file.write('Command: python %s\n\n\n' %(' '.join(sys.argv)))
file.write('------Parameters Used For Running DeepMod2------\n')
for k in vars(args):
file.write('{}: {}\n'.format(k,vars(args)[k]) )
from src import detect
detect.call_manager(params)
print('\n%s: Time elapsed=%.4fs' %(str(datetime.datetime.now()),time.time()-t), flush=True)