-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjtemplate.py
More file actions
237 lines (183 loc) · 7.5 KB
/
jtemplate.py
File metadata and controls
237 lines (183 loc) · 7.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
def write_qc (sample, path_pre, config_param):
script_path = path_pre + "/scripts/"+sample
rawfq_path = path_pre +"/data/"+sample+"/raw"
qc_FILE = open (script_path+"/run_qc.sh", "w")
qc_script = """#!/bin/bash
#$ -cwd
#$ -V
#$ -N fastq_QC
#$ -pe shared 1
#$ -l highp,h_data=4G,time=33:00:00
#$ -m bea
## path: """+script_path+"""
## Usage: Submit this script thru qsub: qsub run_qc.sh
cd """+rawfq_path+"""
cat *fastq* | fastqc /dev/stdin -o ../qc_report/
"""
qc_FILE.write(qc_script)
def write_trim (sample, path_pre, config_param):
script_path = path_pre + "/scripts/"+sample
rawfq_path = path_pre +"/data/"+sample+"/raw"
trim_path = path_pre+"/data/"+sample+"/trim"
barcode_path = path_pre +"/data/"+sample+"/barcode"
trim_FILE = open (script_path+"/run_trim.sh", "w")
trim_script = """#!/bin/bash
#$ -cwd
#$ -V
#$ -N trim_reads
#$ -pe shared 1
#$ -l highp,h_data=4G,time=24:00:00
#$ -m bea
## path: """+script_path+"""
## Usage: Submit this script thru qsub: qsub run_trim.sh
cd """+rawfq_path+"""
for fastq_f in *fastq*; do
outfile=${fastq_f}
if [[ ${outfile} == *.gz ]]; then
outfile=`echo ${outfile} | rev| cut -c 4- |rev`; fi
seqtk trimfq -e 5 ${fastq_f} > """+trim_path+"""/$outfile;
done
"""
trim_FILE.write(trim_script)
def write_demulti (sample, path_pre, config_param):
script_path = path_pre + "/scripts/"+sample
trim_path = path_pre+"/data/"+sample+"/trim"
barcode_path = path_pre +"/data/"+sample+"/barcode"
stack_demulti_FILE = open (script_path+"/Stacks/run_processTags.sh", "w")
stack_demulti = """#!/bin/bash
#$ -cwd
#$ -V
#$ -N stack_process
#$ -pe shared 1
#$ -l highp,h_data=4G,time=33:00:00
#$ -m bea
## path: """+script_path+"""/Stacks
## Usage: Submit this script thru qsub: qsub run_processTags.sh
# base directory
WKDIR="""+path_pre+"""
mkdir -p ${WKDIR}/analysis/"""+sample+"""/Stacks/processTags
# making the stacks' barcode
awk '{print $1}' """+barcode_path+""" > ${WKDIR}/data/"""+sample+"""/RAD_barcode
# parameters for process_radtags
# -p: input directory - make sure you only have the fastq or fastq.gz files in the folder (no SampleSheet or anything else)
# -o: output directory
# -b: barcode directory
# -e: restriction enzyme
# -r: rescue RAD-TAGS and barcode
# -c: clean data, remove uncalled base
# -q: remove low quality reads
# -i: input file type (gzfastq for gzip)
# -s (note: we need to figure out the best limit and add it below, or now we have set it to 20): set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10).
process_radtags -p """+trim_path+""" -o ${WKDIR}/analysis/"""+sample+"""/Stacks/processTags -b ${WKDIR}/data/"""+sample+"""/RAD_barcode -e 'sbfI' -r -c -q
# relabel fq barcode file to something more meaningful i.e. the sample name
awk -v basePath=${WKDIR}/analysis/"""+sample+"""/Stacks/processTags '{print "mv "basePath"/sample_"$1".fq "basePath"/sample_"$2".fq"}' ${WKDIR}/data/"""+sample+"""/barcode |bash """
stack_demulti_FILE.write(stack_demulti)
def write_stack_core (sample, path_pre, config_param):
script_path = path_pre + "/scripts/"+sample
stack_catalog_FILE = open (script_path+"/Stacks/run_stackCatalog.sh", "w")
stack_catalog = """#!/bin/bash
#$ -cwd
#$ -V
#$ -N stackCatalog_process
#$ -pe shared 6
#$ -l highp,h_data=4G,time=96:00:00
#$ -m bea
## path: """+script_path+"""/Stacks
###Usage: Submit this script thru qsub: qsub run_stackCatalog.sh
# -m: min read depth
# -M: max bp mismatch between loci for an indiv
# -N: max bp mismatch allowed secondary reads to primary stack (default:M+2)
# -n: max bp mismatch between loci when building catalog
# -S: disable mysql recording
# -b: batch id #
# -T: number of threads
# -t: remove highly rep seq
# -s: path for fastq indiv samples
# -o: output path
WKDIR="""+path_pre+"""
mkdir -p ${WKDIR}/analysis/"""+sample+"""/Stacks/denovo
# make a list of input files
inputL=`ls ${WKDIR}/analysis/"""+sample+"""/Stacks/processTags/*.fq | awk '{printf "-s "$1 " "}'`
echo "nohup denovo_map.pl -m 6 -M 2 -n 2 -S -b 1 -T 6 -t -o ${WKDIR}/analysis/"""+sample+"""/Stacks/denovo" $inputL " >${WKDIR}/analysis/"""+sample+"""/Stacks/denovo/nohup.out" | bash
"""
stack_catalog_FILE.write(stack_catalog)
def write_pyrad (sample, path_pre, config_param):
script_path = path_pre + "/scripts/"+sample
pyrad_FILE = open (script_path+"/pyRAD/run_pyRAD.sh", "w")
pyrad_script = """#!/bin/bash
#$ -cwd
#$ -V
#$ -N pyrad_process
#$ -pe shared 4
#$ -l highp,h_data=4G,time=96:00:00
#$ -m bea
## path: """+script_path+"""/pyRAD
###Usage: Submit this script thru qsub: qsub run_pyRAD.sh
#run all of the steps 1 to 7
WKDIR=="""+path_pre+"""
module load python/2.7
pyRAD -p $WKDIR/scripts/"""+sample+"""/pyRAD/params.txt -s 234567
"""
pyrad_FILE.write(pyrad_script)
def write_blat (sample, path_pre, config_param):
script_path = path_pre + "/scripts/"+sample
blat_FILE = open (script_path+"/blat/run_blat.sh", "w")
blat_script = """#!/bin/bash
#$ -cwd
#$ -V
#$ -N align_RNA_RAD
#$ -l highp,h_data=4G,time=46:00:00
#$ -m bea
## path: """+script_path+"""/blat
###Usage: Submit this script thru qsub: qsub run_blat.sh
WKDIR="""+path_pre+"""
awk '{ split($8,con,","); for (i=1; i <= length(con); i++) {split(con[i], id, "_"); cons[id[1]]++}
if (length(cons)>80) {a+=1; print ">" a "_" $3 "_" length(cons) "\\n" $9} delete cons}' ${WKDIR}/analysis/"""+sample+"""/Stacks/denovo/batch_1.catalog.tags.tsv > ${WKDIR}/analysis/"""+sample+"""/blat/RAD_consen.fa
cd ${WKDIR}/data/"""+sample+"""/rna
for i in *.fa; do
/u/local/apps/blat/34/bin/blat -q=rna -out=pslx ${WKDIR}/analysis/"""+sample+"""/blat/RAD_consen.fa $i ${WKDIR}/analysis/"""+sample+"""/blat/align_${i}.out; done
"""
blat_FILE.write(blat_script)
def write_align_summary (sample, path_pre, config_param):
script_path = path_pre + "/scripts/"+sample
align_summary_FILE = open (script_path+"/blat/gen_tbl.sh", "w")
script = """#!/bin/bash
## path: """+script_path+"""/blat
###Usage: Submit this script thru qsub: bash gen_tbl.sh
WKDIR="""+path_pre+"""
OUTDIR=${WKDIR}/analysis/"""+sample+"""/blat/tbl
mkdir -p ${OUTDIR}
cd ${WKDIR}/data/"""+sample+"""/rna
# grab read numbers
for i in *.fa;
do numline=`wc -l $i|awk '{print $1/2}'`
echo $i $numline;
done > ${OUTDIR}/num_reads.data
# grab number of reads aligned once and more than once
cd ${WKDIR}/analysis/"""+sample+"""/blat
for i in *.out;
do numline=`awk ' {NR>5 && d[$10]++}
END{
for(i in d) {
if(d[i]==1){ct_once++}
else{ct_more++}}
print ct_once "\\t" ct_more}' $i`;
fileN=`echo $i | sed 's/align_//;s/.out//'`
echo $fileN $numline;
done > ${OUTDIR}/num_align.data
cd ${OUTDIR}
join -1 1 -2 1 ${OUTDIR}/num_reads.data ${OUTDIR}/num_align.data |awk 'BEGIN{print "RNA File | # of reads | # of reads aligned once | # of reads aligned more than once"; print "---|:---:|:---:|:---:";} {OFS=" | "; CONVFMT="%0.3f"; print $1,$2, $3 " ( " $3*100/$2 " %)", $4 " (" $4*100/$2" % )" }'
> ${WKDIR}/analysis/"""+sample+"""/blat/alignment_report.md
cat ${WKDIR}/analysis/"""+sample+"""/blat/*.out > ${OUTDIR}/align.out.concat
for i in 30 50 70 90; do
awk '{if($5==0 && $2 < 3 && $1 > '$i'){if(e[$14"_"$16"_"$17]++==0) d[$14]++; }}
END{for (k in d) {print k"\\t"d[k] }}' ${OUTDIR}/align.out.concat > ${OUTDIR}/align_concat_${i}.tbl; done
for i in 30 50 70 90; do
echo -n ">"$i" bp matches ";
for d in 1 5 10 50 100; do
awk '$2>'$d'' ${OUTDIR}/align_concat_${i}.tbl |wc -l |awk '{printf " | "$1 " (" "%3.1f" "%)",$1*100/38070}';
done
echo "";
done > ${WKDIR}/analysis/"""+sample+"""/blat/RAD_RNA_stat.md
"""
align_summary_FILE.write(script)