#!/usr/bin/env python
import argparse
import os
import sys
import subprocess
# author: Daniel Vodak (danielvo<at>ifi.uio.no)
# Department of Tumor Biology
# The Norwegian Radium Hospital
# Oslo University Hospital
program_name = "IBPC value calculation"
program_version = "1.0 (2017/10/02)"
program_description = r"""A script for calculating variant support within a given sample as well as its "individual-based pool complement" ("IBPC" for short - a set of co-multiplexed samples that do not originate from the same individual). Information necessary for each processed sample (e.g., the sample pool of origin, the associated BAM file, the variants to be investigated) needs to be provided via the mandatory configuration input file. GATK tools are required for generating raw variant support information in the relevant samples. The generated information is subsequently used for calculating variant allelic fractions and IBPC support fractions."""
help_epilog = "Please consult the project GitHub page (https://github.com/danielvo/IBPC) in case of problems and/or questions."
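# The two fractions reported per variant in the final output are computed as:
#   SAMPLE_VARIANT_AF = SAMPLE_VARIANT_READS / SAMPLE_ALL_READS  (support in the sample of origin)
#   IBPC_VARIANT_AF   = IBPC_VARIANT_READS / IBPC_ALL_READS      (support summed over co-multiplexed
#                                                                  samples from other individuals)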
parser = argparse.ArgumentParser(prog=program_name, description=program_description, epilog=help_epilog)
parser.add_argument("-c", "--configuration_file", type=str, required=True, help="Path to the configuration file (please consult the project GitHub page for configuration file format details: https://github.com/danielvo/IBPC)")
parser.add_argument("-g", "--GATK_jar", type=str, required=True, help="Path to GATK jar file (recommended GATK version: 3.7)")
parser.add_argument("-r", "--reference_fasta", type=str, required=True, help="Path to the reference genome fasta file (related fasta index files required by the GATK tools should also be present in the same directory)")
parser.add_argument("-j", "--java_executable", type=str, required=False, default="java", help="Path to Java 8 executable (only necessary to be specified if Java 8 is not the system's default Java version)")
parser.add_argument("-o", "--output_directory", type=str, required=True, help="Path to the intended output directory (the output directory must not exist prior to execution of this script)")
parser.add_argument("-b", "--min_base_quality", type=int, required=False, default=20, choices=range(0, 41), help="Minimum required base quality (bases with quality under the specified threshold will be disregarded during variant support calculation)")
parser.add_argument("-m", "--min_mapping_quality", type=int, required=False, default=20, choices=range(0, 41), help="Minimum required read mapping quality (bases from alignments with mapping quality under the specified threshold will be disregarded during variant support calculation)")
parser.add_argument("-p", "--parallelization_level", type=int, required=False, default=8, choices=range(1, 65), help="Number of GATK Pileup commands run in parallel")
parser.add_argument("-v", "--version", action="version", version=program_version)
args = parser.parse_args()
configuration_file = os.path.abspath(args.configuration_file)
if not (os.path.isfile(configuration_file)):
sys.stderr.write("Specified configuration file not found. Exiting (1).\n")
exit(1)
GATK_jar = os.path.abspath(args.GATK_jar)
if not (os.path.isfile(GATK_jar)):
sys.stderr.write("Specified GATK jar file not found. Exiting (1).\n")
exit(1)
reference_fasta = os.path.abspath(args.reference_fasta)
if not (os.path.isfile(reference_fasta)):
sys.stderr.write("Specified reference fasta file not found. Exiting (1).\n")
exit(1)
if (args.java_executable != "java"):
java_executable = os.path.abspath(args.java_executable)
if not (os.path.isfile(java_executable)):
sys.stderr.write("Specified Java executable file not found. Exiting (1).\n")
exit(1)
else:
java_executable = args.java_executable
output_directory = os.path.abspath(args.output_directory)
if (os.path.isdir(output_directory)):
sys.stderr.write("The specified output directory already exists. Exiting (1).\n")
exit(1)
(output_directory_parent, output_directory_name) = os.path.split(output_directory)
if not (os.path.isdir(output_directory_parent)):
sys.stderr.write("Parent directory of the specified output directory not found. Exiting (1).\n")
exit(1)
else:
    os.mkdir(output_directory)
pileup_commands_file_path = os.path.join(output_directory, "pileup_commands.sh")
pileup_commands_stdout_file_path = os.path.join(output_directory, "pileup_commands_stdout.log")
pileup_commands_stderr_file_path = os.path.join(output_directory, "pileup_commands_stderr.log")
IBPC_results_file_path = os.path.join(output_directory, "IBPC_values.tsv")
parallel_process_limit = args.parallelization_level
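# Files created inside the (newly created) output directory:
#   pileup_commands.sh               - generated bash script with all GATK Pileup commands
#   pileup_commands_std{out,err}.log - logs of the pileup script run
#   <pool_id>_variants.intervals     - per-pool variant interval lists (Step 1)
#   <pool_id>_pileups/               - per-pool pileup files and per-sample logs (Step 2)
#   IBPC_values.tsv                  - final per-variant result table (Step 3)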
# A function for extracting variant read support information
# (alt allele occurrences and total allele occurrences
# are counted at given variant locations).
# The read support is extracted from the given pileup file ("pileup_file_path")
# at specified variant locations (listed in "variant_dictionary").
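# Pileup line layout assumed by the parsing below (verbose GATK Pileup output):
#   column 0: chromosome, column 1: position, column 3: read bases,
#   column 4: base qualities (Phred+33), column 6: comma-separated per-read info
#   strings whose "@"-separated field 3 is the read's mapping quality.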
def update_read_support_values(variant_dictionary, pileup_file_path,
all_reads_key, alt_reads_key):
SAMPLE_PILEUP = open(pileup_file_path, "r")
for line in SAMPLE_PILEUP:
line_s = line.strip().split()
# skip lines without variant support information
if (len(line_s) < 7):
continue
# identify the variant position and parse the variant pileup information
var_pos = line_s[0] + ":" + line_s[1]
bases = line_s[3]
quals = line_s[4]
read_infos = line_s[6].split(",")
# skip the variant if it's not relevant for the given sample
if not (var_pos in variant_dictionary):
continue
# look up the alt allele base
alt = variant_dictionary[var_pos]["alt"]
# initialize the allele occurrence counters
alt_depth = 0
all_depth = 0
# track support of the alt allele as well as the total support
# only consider alignments/bases satisfying the specified quality thresholds
for i in range(len(bases)):
base = bases[i]
qual = ord(quals[i]) - 33
read_info = read_infos[i]
# read_offset = read_info.split("@")[1]
MAPQ = int(read_info.split("@")[3])
if ((MAPQ >= args.min_mapping_quality) and (qual >= args.min_base_quality)):
if (base == alt):
alt_depth += 1
all_depth += 1
# update the allele occurrence counters
variant_dictionary[var_pos][all_reads_key] += all_depth
variant_dictionary[var_pos][alt_reads_key] += alt_depth
SAMPLE_PILEUP.close()
return
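# The dictionary is updated in place, so the function can be called repeatedly:
# first with the sample's own pileup (keys "sample_all_reads"/"sample_alt_reads"),
# then with each complementary sample's pileup (keys "IBPC_all_reads"/"IBPC_alt_reads").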
sample_info = {}
pool_info = {}
variant_pools = {}
# read in sample pool information
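# Expected configuration file layout: one sample per line, 8 tab-separated columns
# (lines starting with "#" are treated as comments):
#   pool_id  pool_size  sample_id  individual  BAM_path  VCF_path  contamination_value  contaminant_only
# "contaminant_only" set to "Y" marks samples whose own variants are not evaluated,
# but whose reads still contribute to the IBPC values of co-pooled samples.
# Illustrative example line (placeholder values, tab-separated in the real file):
#   PoolA  8  S01  Patient01  /data/S01.bam  /data/S01.vcf  0.01  N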
print "Processing information from the configuration file."
pool_config_file = open(configuration_file, "r")
line_number = 0
for line in pool_config_file:
line_s = line.strip().split("\t")
line_number += 1
# ignore any header/comment lines
if (line[0] == "#"):
continue
# exit immediately in case of an unexpected number of columns
elif (len(line_s) != 8):
sys.stderr.write("Line " + str(line_number) + " of the configuration file doesn't contain 8 tab-separated value columns. Exiting (2).\n")
exit(2)
pool_id = line_s[0]
pool_size = line_s[1]
sample_id = line_s[2]
individual = line_s[3]
BAM_path = line_s[4]
VCF_path = line_s[5]
contamination_value = line_s[6]
contaminant_only = line_s[7]
if (contaminant_only != "Y"):
BAM_path = os.path.abspath(BAM_path)
if not (os.path.isfile(BAM_path)):
sys.stderr.write("BAM file specified on line " + str(line_number) + " of the configuration file (" + BAM_path + ") was not found. Exiting (2).\n")
exit(2)
VCF_path = os.path.abspath(VCF_path)
if not (os.path.isfile(VCF_path)):
sys.stderr.write("VCF file specified on line " + str(line_number) + " of the configuration file (" + VCF_path + ") was not found. Exiting (2).\n")
exit(2)
sample_info[sample_id] = {"pool_id": pool_id, "individual": individual,
"BAM_path": BAM_path, "VCF_path": VCF_path,
"contamination_value": contamination_value,
"contaminant_only": contaminant_only}
if not (pool_id in pool_info):
pool_info[pool_id] = {"pool_size": pool_size,
"pileup_dir": os.path.join(output_directory, pool_id + "_pileups"),
"variant_interval_file": os.path.join(output_directory, pool_id + "_variants.intervals"),
"samples": [sample_id]}
variant_pools[pool_id] = {}
# create a pool-specific pileup file directory
p = subprocess.Popen(["mkdir", pool_info[pool_id]["pileup_dir"]])
p.wait()
else:
pool_info[pool_id]["samples"].append(sample_id)
pool_config_file.close()
print "Step 1: merging sample variant lists into pool-specific variant interval files"
# create pool-specific variant interval files for pileup creation purposes
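# Each variant position is stored as a GATK-style interval string "CHROM:POS-POS";
# keying a per-pool dictionary by these strings de-duplicates positions shared
# by several samples of the same pool.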
for sample_id in sample_info:
if (sample_info[sample_id]["contaminant_only"] == "Y"):
print " Skipping contaminant-only sample: " + sample_id
else:
pool_id = sample_info[sample_id]["pool_id"]
print " Extracting variants for sample \"" + sample_id + "\" (pool \"" + pool_id + "\")"
VCF_file = open(sample_info[sample_id]["VCF_path"], "r")
for line in VCF_file:
line_s = line.strip().split("\t")
if (line[0] == "#"):
continue
chrom = line_s[0]
start = line_s[1]
variant_pools[pool_id][chrom + ":" + start + "-" + start] = ""
VCF_file.close()
# log per-pool unique variant counts and write the pool-specific interval files
for pool_id in variant_pools:
pool_variant_count = len(variant_pools[pool_id])
pool_variant_interval_file_name = pool_info[pool_id]["variant_interval_file"]
print " " + str(pool_variant_count) + " unique variants in pool \"" + pool_id + "\", creating variant file \"" + pool_variant_interval_file_name + "\""
pool_variant_interval_file = open(pool_variant_interval_file_name, "w")
for variant in variant_pools[pool_id]:
pool_variant_interval_file.write(variant + "\n")
pool_variant_interval_file.close()
print "Step 2: creating pileup files"
# open the pileup-creation bash script file
pileup_commands_file = open(pileup_commands_file_path, "w")
# environment variables for the pileup-creation bash script
pileup_commands_file.write(r"""export _JAVA_OPTIONS="-XX:ParallelGCThreads=1"
GATK_JAR=""" + GATK_jar + r"""
REFERENCE_FASTA=""" + reference_fasta + r"""
JAVA_8=""" + java_executable + "\n\n")
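# _JAVA_OPTIONS pins each GATK JVM to a single parallel-GC thread, presumably to
# keep the per-command thread footprint small while several Pileup commands run
# concurrently.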
# for each sample, create a pileup file
# based on all the variants called in the associated sample pool
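# Each command is written with a trailing "&" so it runs in the background; a
# "wait" line is emitted after every <parallelization_level> commands, which
# caps the number of concurrently running GATK processes.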
sample_counter = 0
for sample_id in sample_info:
BAM_file = sample_info[sample_id]["BAM_path"]
pool_id = sample_info[sample_id]["pool_id"]
interval_file = pool_info[pool_id]["variant_interval_file"]
pileup_path = os.path.join(pool_info[pool_id]["pileup_dir"], sample_id + ".pileup")
pileup_stdout_path = os.path.join(pool_info[pool_id]["pileup_dir"], sample_id + "_pileup_stdout.log")
pileup_stderr_path = os.path.join(pool_info[pool_id]["pileup_dir"], sample_id + "_pileup_stderr.log")
print " Creating a GATK Pileup command for sample \"" + sample_id + "\" and variants from pool \"" + pool_id + "\""
pileup_commands_file.write(r"""${JAVA_8} -jar ${GATK_JAR} \
-T Pileup \
-R ${REFERENCE_FASTA} \
-I """ + BAM_file + r""" \
-L """ + interval_file + r""" \
-verbose \
-drf DuplicateRead \
-o """ + pileup_path + r""" \
> """ + pileup_stdout_path + r""" \
2> """ + pileup_stderr_path + r""" &""" + "\n\n")
sample_counter += 1
# respect the set parallelization level threshold
if (sample_counter % parallel_process_limit == 0):
pileup_commands_file.write(r"""wait""" + "\n\n")
# unless already present, append a 'wait' command to the end of the bash script
if (sample_counter % parallel_process_limit != 0):
pileup_commands_file.write(r"""wait""" + "\n\n")
pileup_commands_file.close()
print " Running the GATK pileup commands"
# redirect the script's output via file handles (shell ">"/"2>" tokens are not valid Popen arguments)
pileup_stdout_log = open(pileup_commands_stdout_file_path, "w")
pileup_stderr_log = open(pileup_commands_stderr_file_path, "w")
pileup_process = subprocess.Popen(["bash", pileup_commands_file_path],
                                  stdout=pileup_stdout_log, stderr=pileup_stderr_log)
pileup_process.wait()
pileup_stdout_log.close()
pileup_stderr_log.close()
print "Step 3: calculating IBPC values"
# open the file for result output
IBPC_results_file = open(IBPC_results_file_path, "w")
# write the result file header
IBPC_results_file.write("\t".join(["SAMPLE_ID", "POOL_ID", "POOL_SIZE",
"CHROMOSOME", "POSITION", "REF", "ALT",
"SAMPLE_ALL_READS", "SAMPLE_VARIANT_READS",
"SAMPLE_VARIANT_AF", "IBPC_ALL_READS",
"IBPC_VARIANT_READS", "IBPC_VARIANT_AF",
"CONPAIR_CONTAMINATION_ESTIMATE_TUMOR"]) + "\n")
# calculate all allelic fractions and IBPC support fraction values
for sample_id in sample_info:
if (sample_info[sample_id]["contaminant_only"] == "Y"):
print " Skipping contaminant-only sample: " + sample_id
else:
pool_id = sample_info[sample_id]["pool_id"]
pool_size = pool_info[pool_id]["pool_size"]
VCF_path = sample_info[sample_id]["VCF_path"]
individual = sample_info[sample_id]["individual"]
pileup_path = os.path.join(pool_info[pool_id]["pileup_dir"], sample_id + ".pileup")
contamination_value = sample_info[sample_id]["contamination_value"]
pool_sample_list = pool_info[pool_id]["samples"]
print " Calculating IBPC values for sample \"" + sample_id + "\" (pool \"" + pool_id + "\"); other samples in the pool: " + ", ".join(pool_sample_list)
# extract variants of the given sample (IBPC values will only be calculated for these variants)
var_dict = {}
VAR_LIST_FILE = open(VCF_path, "r")
for line in VAR_LIST_FILE:
line_s = line.strip().split("\t")
if (line[0] == "#"):
continue
chr = line_s[0]
pos = line_s[1]
ref = line_s[3]
alt = line_s[4]
var_pos = chr + ":" + pos
var_dict[var_pos] = {"chr": chr, "pos": pos,
"ref": ref, "alt": alt,
"sample_all_reads": 0, "sample_alt_reads": 0,
"IBPC_all_reads": 0, "IBPC_alt_reads": 0}
VAR_LIST_FILE.close()
# skip to the next sample if no variants are to be annotated for this sample
if (len(var_dict) == 0):
continue
# calculate AF values in the sample of origin
update_read_support_values(var_dict, pileup_path, "sample_all_reads", "sample_alt_reads")
# check for variant support in other samples in the pool,
# unless they originate from the same individual
for complementary_sample_id in pool_sample_list:
complementary_individual = sample_info[complementary_sample_id]["individual"]
if (individual == complementary_individual):
print " " + complementary_sample_id + " will not be used for IBPC calculation"
else:
print " " + complementary_sample_id + " will be used for IBPC calculation"
complementary_pileup_path = os.path.join(pool_info[pool_id]["pileup_dir"], complementary_sample_id + ".pileup")
update_read_support_values(var_dict, complementary_pileup_path, "IBPC_all_reads", "IBPC_alt_reads")
# output results for each variant of the processed sample
for var_pos in var_dict:
chr = var_dict[var_pos]["chr"]
pos = var_dict[var_pos]["pos"]
ref = var_dict[var_pos]["ref"]
alt = var_dict[var_pos]["alt"]
sample_all_reads = var_dict[var_pos]["sample_all_reads"]
sample_alt_reads = var_dict[var_pos]["sample_alt_reads"]
sample_variant_AF = 0
if (sample_all_reads > 0):
sample_variant_AF = round(float(sample_alt_reads)/sample_all_reads, 4)
IBPC_all_reads = var_dict[var_pos]["IBPC_all_reads"]
IBPC_alt_reads = var_dict[var_pos]["IBPC_alt_reads"]
IBPC_variant_AF = 0
if (IBPC_all_reads > 0):
IBPC_variant_AF = round(float(IBPC_alt_reads)/IBPC_all_reads, 4)
IBPC_results_file.write("\t".join([sample_id, pool_id, pool_size, chr, pos, ref, alt, str(sample_all_reads), str(sample_alt_reads), str(sample_variant_AF),
str(IBPC_all_reads), str(IBPC_alt_reads), str(IBPC_variant_AF), contamination_value]) + "\n")
IBPC_results_file.close()