-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathfastq_stats.py
More file actions
141 lines (120 loc) · 6.48 KB
/
fastq_stats.py
File metadata and controls
141 lines (120 loc) · 6.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/python
########################################################################################
# A general purpose batch FASTQ parsing script.
#
# Dependencies: core
#
# Gregg Thomas, Fall 2019
########################################################################################
import sys, os, random, argparse
sys.path.append(sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/lib/"))
import core, fastqlib as fql
####################
def initCSV(globs):
with open(globs['outcsv'], "w") as csvfile:
header = ["File"];
if globs['reads']:
header.append("Num.reads");
if globs['lens']:
header.append("Avg.read.len");
if globs['reads'] and globs['lens'] and globs['genome_size']:
header.append("Coverage");
if globs['bc']:
header.extend(("A", "T", "C", "G", "N"));
if globs['qual']:
header.extend(("Avg.qual", "Avg.qual.first.5"));
csvfile.write(",".join(header) + "\n");
####################
if __name__ == '__main__':
# Main is necessary for multiprocessing to work on Windows.
parser = argparse.ArgumentParser(description="A general purpose FASTA editing script.");
parser.add_argument("-i", dest="input", help="A directory containing FASTQ formatted files, a single FASTQ file, or a pair of FASTQ files (separated by a semicolon).");
parser.add_argument("-g", dest="genome_size", help="The size of the original genome. If specified, coverage will be calculated.", type=int, default=False);
parser.add_argument("-p", dest="procs", help="The number of processes the script should use. Default: 1.", type=int, default=1);
parser.add_argument("--reads", dest="reads", help="Set to count the number of reads in each file.", default=False, action="store_true");
parser.add_argument("--readlen", dest="lens", help="Set to calculate the average read length in each file.", default=False, action="store_true");
parser.add_argument("--basecomp", dest="base_comp", help="Set to count the base compositions in each file.", default=False, action="store_true");
parser.add_argument("--qual", dest="qual", help="Set to calculate the average base quality in the file and across reads.", default=False, action="store_true");
parser.add_argument("--all", dest="all", help="Sets all of --reads, --readlen, --basecomp, --qual to True.", default=False, action="store_true");
parser.add_argument("-outcsv", dest="outcsv", help="CSV output file for easy parsing (only log info will be printed to screen).", default=False);
parser.add_argument("-outtxt", dest="outtxt", help="Text output file for easy reading.", default=False);
parser.add_argument("--summary", dest="summary", help="When input is a dirctory with multiple files, set this to print only combined statistics from all files.", default=False, action="store_true");
parser.add_argument("--overwrite", dest="overwrite", help="Set this to indicate you wish to overwrite files specified by -outcsv and -outtxt if they already exist. WARNING: This means the original contents of the file will be deleted.", default=False, action="store_true");
#parser.add_argument("--header", dest="header", help="Set to extract the header info for each file.", default=False, action="store_true");
args = parser.parse_args();
# Input option definitions.
paired = False;
dirflag = False;
if os.path.isdir(args.input):
dirflag = args.input;
infiles = [ os.path.join(args.input, f) for f in os.listdir(args.input) if any(ext in f for ext in ["fastq","fq"]) ];
if any("_R2_" in f for f in infiles):
paired = True;
pairs = [];
if paired:
for f in infiles:
if "R2" in f:
continue;
pairs.append([f, f.replace("_R1_","_R2_")]);
infiles = pairs;
else:
infiles = [[f] for f in infiles];
# Check if input is paired.
# If input is a directory.
else:
if ";" in args.input:
paired = True;
dirflag = args.input;
f = args.input.split(";");
if not all(os.path.isfile(i) for i in f):
sys.exit(core.errorOut(1, "Cannot find input file/directory (-i)."));
if "_R1_" in f[0]:
f1, f2 = f[0], f[1];
else:
f1, f2 = f[1], f[0];
infiles = [[f1, f2]]
# If input is paired.
else:
if not os.path.isfile(args.input):
sys.exit(core.errorOut(1, "Cannot find input file/directory (-i)."));
infiles = [[args.input]];
# Prepare input files.
#print(infiles);
if args.procs < 1 or type(args.procs) != int:
sys.exit(core.errorOut(2, "Number of processes (-p) must be an integer > 0."));
# Make sure specified number of procs is a positive integer.
if not args.overwrite:
if args.outcsv and os.path.isfile(args.outcsv):
sys.exit(core.errorOut(3, "Specified output CSV file (-outcsv) already exists. Specify a different file name or use --overwrite to overwrite the file's contents."));
if args.outtxt and os.path.isfile(args.outtxt):
sys.exit(core.errorOut(4, "Specified output text file (-outtxt) already exists. Specify a different file name or use --overwrite to overwrite the file's contents."));
# Check output files.
if args.all:
args.reads, args.lens, args.base_comp, args.qual = True, True, True, True;
if args.summary and not dirflag:
print(" #" + core.getTime() + " * MESSAGE: The --summaryonly and --nosummary flags only work when input is a directory with multiple fastq files. Ignoring for this run.");
args.summary = False;
globs = {
'reads' : args.reads,
'lens' : args.lens,
'bc' : args.base_comp,
'qual' : args.qual,
'genome_size' : args.genome_size,
'dirflag' : dirflag,
'paired' : paired,
'outcsv' : args.outcsv,
'outtxt' : args.outtxt,
'procs' : args.procs,
'summary' : args.summary,
'pyv' : sys.version[0]
}
print("# =======================================================================");
print("#\t\t\t" + core.getDateTime());
print("# Parsing FASTQ files in:\t" + args.input);
if globs['outcsv']:
initCSV(globs);
if globs['outtxt']:
with open(globs['outtxt'], "w") as txtfile:
txtfile.write("");
# Prepping the output files.
fql.countReads(infiles, globs);