-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathdownload
More file actions
227 lines (184 loc) · 6.76 KB
/
download
File metadata and controls
227 lines (184 loc) · 6.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/bin/bash
#
#Author: Florian Breitwieser
#Script: centrifuge-download
#Site: https://github.com/infphilo/centrifuge/blob/master/centrifuge-download
#Revised by Gaoyang Li and Dengfeng Guan
set -eu -o pipefail
exists() {
command -v "$1" >/dev/null 2>&1
}
if hash wget 2>/dev/null; then
DL_PROG="wget -N --reject=index.html -qO"
else
DL_PROG="curl -s -o"
fi
export DL_PROG
#########################################################
## Functions
function download_n_process() {
IFS=$'\t' read -r TAXID FILEPATH <<< "$1"
NAME=`basename $FILEPATH .gz`
[[ -f "$LIBDIR/$DOMAIN/$NAME.gz" ]] || $DL_PROG "$LIBDIR/$DOMAIN/$NAME.gz" "$FILEPATH" || { printf "\nError downloading $FILEPATH!\n" >&2 && exit 1; }
gunzip -f "$LIBDIR/$DOMAIN/$NAME.gz" ||{ printf "\nError gunzipping $LIBDIR/$DOMAIN/$NAME.gz [ downloaded from $FILEPATH ]!\n" >&2 && exit 255; }
sed -i "s/^>/>tid|$TAXID|ref|/" $LIBDIR/$DOMAIN/$NAME
echo done
}
export -f download_n_process
ceol=`tput el` # terminfo clr_eol
function count {
typeset C=0
while read L; do
if [[ "$L" == "done" ]]; then
C=$(( C + 1 ))
_progress=$(( (${C}*100/${1}*100)/100 ))
_done=$(( (${_progress}*4)/10 ))
_left=$(( 40-$_done ))
# Build progressbar string lengths
_done=$(printf "%${_done}s")
_left=$(printf "%${_left}s")
printf "\rProgress : [${_done// /#}${_left// /-}] ${_progress}%% $C/$1" 1>&2
else
echo "$L"
fi
done
}
function check_or_mkdir_no_fail {
#echo -n "Creating $1 ... " >&2
if [[ -d $1 && ! -n `find $1 -prune -empty -type d` ]]; then
echo "Directory exists already! Continuing" >&2
return `true`
else
#echo "Done" >&2
mkdir -p $1
return `true`
fi
}
## Check if GNU parallel exists
command -v parallel >/dev/null 2>&1 && PARALLEL=1 || PARALLEL=0
ALL_GENOMES="bacteria, viral, archaea"
ALL_DATABASES="refseq, genbank, taxonomy"
ALL_ASSEMBLY_LEVELS="Complete\ Genome, Chromosome, Scaffold, Contig"
ALL_REFSEQ_CATEGORY="representitive\ genome, reference\ genome, na"
## Option parsing
DATABASE="refseq"
ASSEMBLY_LEVEL="Complete Genome"
REFSEQ_CATEGORY=""
TAXID=""
DOWNLOAD_GI_MAP=0
BASE_DIR="."
N_PROC=1
CHANGE_HEADER=0
DOWNLOAD_RNA=0
DO_DUST=0
FILTER_UNPLACED=0
USAGE="
Program: download
Usage:
`basename $0` [<options>] <database>
Basic:
<database> STR One of refseq, genbank, or taxonomy:
- use refseq or genbank for genomic sequences,
- taxonomy for taxonomy mappings.
Opinions:
-o FOLDER Folder to which the files are downloaded['$BASE_DIR'].
-P INT Number of processes when downloading['$N_PROC'].
Opinions: (WHEN USING refseq OR genbank as <database>)
-d STR What domain to download.
- One or more of ${ALL_GENOMES}(comma separated).
-a STR Only download specified assembly level['$ASSEMBLY_LEVEL'].
- One of ($ALL_ASSEMBLY_LEVELS).
-c STR only download genomes the specified refseq category[any].
- ($ALL_REFSEQ_CATEGORY).
-g Download GI map.
"
# arguments: $OPTFIND (current index), $OPTARG (argument for option), $OPTERR (bash-specific)
while getopts "o:P:d:a:c:g" OPT "$@"; do
case $OPT in
o) BASE_DIR="$OPTARG" ;;
P) N_PROC="$OPTARG" ;;
d) DOMAINS=${OPTARG//,/ } ;;
a) ASSEMBLY_LEVEL="$OPTARG" ;;
c) REFSEQ_CATEGORY="$OPTARG" ;;
g) DOWNLOAD_GI_MAP=1 ;;
\?) echo "Invalid option: -$OPTARG" >&2
exit 1
;;
:) echo "Option -$OPTARG requires an argument." >&2
exit 1
;;
esac
done
#echo $REFSEQ_CATEGORY
#echo $ASSEMBLY_LEVEL
shift $((OPTIND-1))
[[ $# -eq 1 ]] || { printf "$USAGE" >&2 && exit 1; };
DATABASE=$1
#### TAXONOMY DOWNLOAD
FTP="ftp://ftp.ncbi.nih.gov"
if [[ "$DATABASE" == "taxonomy" ]]; then
echo "Downloading NCBI taxonomy ... " >&2
if check_or_mkdir_no_fail "$BASE_DIR"; then
cd "$BASE_DIR" > /dev/null
if [[ "$DOWNLOAD_GI_MAP" == "1" ]]; then
$DL_PROG gi_taxid_nucl.dmp.gz $FTP/pub/taxonomy/gi_taxid_nucl.dmp.gz
gunzip -c gi_taxid_nucl.dmp.gz | sed 's/^/gi|/' > gi_taxid_nucl.map
else
$DL_PROG taxdump.tar.gz $FTP/pub/taxonomy/taxdump.tar.gz
tar -zxvf taxdump.tar.gz nodes.dmp
tar -zxvf taxdump.tar.gz names.dmp
rm taxdump.tar.gz
fi
cd - > /dev/null
fi
exit 0
fi
#### REFSEQ/GENBANK DOWNLOAD
export LIBDIR="$BASE_DIR"
#export CHANGE_HEADER="$CHANGE_HEADER"
## Fields in the assembly_summary.txt file
REFSEQ_CAT_FIELD=5
TAXID_FIELD=6
SPECIES_TAXID_FIELD=7
VERSION_STATUS_FIELD=11
ASSEMBLY_LEVEL_FIELD=12
FTP_PATH_FIELD=20
AWK_QUERY="\$$ASSEMBLY_LEVEL_FIELD==\"$ASSEMBLY_LEVEL\" && \$$VERSION_STATUS_FIELD==\"latest\""
[[ "$REFSEQ_CATEGORY" != "" ]] && AWK_QUERY="$AWK_QUERY && \$$REFSEQ_CAT_FIELD==\"$REFSEQ_CATEGORY\""
TAXID=${TAXID//,/|}
[[ "$TAXID" != "" ]] && AWK_QUERY="$AWK_QUERY && match(\$$TAXID_FIELD,\"^($TAXID)\$\")"
#echo "$AWK_QUERY" >&2
#echo "Downloading genomes for $DOMAINS at assembly level $ASSEMBLY_LEVEL" >&2
if exists wget; then
wget -qO- --no-remove-listing ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/ > /dev/null
else
curl -s ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/ > .listing
fi
#if [[ "$CHANGE_HEADER" == "1" ]]; then
#echo "Modifying header to include taxonomy ID" >&2
#fi
for DOMAIN in $DOMAINS; do
if [[ ! `grep " $DOMAIN" .listing` ]]; then
echo "$DOMAIN is not a valid domain - use one of the following:" >&2
grep '^d' .listing | sed 's/.* //'
exit 1
fi
#if [[ "$CHANGE_HEADER" != "1" ]]; then
#echo "Writing taxonomy ID to sequence ID map to STDOUT" >&2
#[[ -f "$LIBDIR/$DOMAIN.map" ]] && rm "$LIBDIR/$DOMAIN.map"
#fi
export DOMAIN=$DOMAIN
check_or_mkdir_no_fail $LIBDIR/$DOMAIN
FULL_ASSEMBLY_SUMMARY_FILE="$LIBDIR/$DOMAIN/assembly_summary.txt"
ASSEMBLY_SUMMARY_FILE="$LIBDIR/$DOMAIN/assembly_summary_filtered.txt"
#echo "Downloading and filtering the assembly_summary.txt file ..." >&2
$DL_PROG "$FULL_ASSEMBLY_SUMMARY_FILE" ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/$DOMAIN/assembly_summary.txt > "$FULL_ASSEMBLY_SUMMARY_FILE"
echo $AWK_QUERY
awk -F "\t" "BEGIN {OFS=\"\t\"} $AWK_QUERY" "$FULL_ASSEMBLY_SUMMARY_FILE" > "$ASSEMBLY_SUMMARY_FILE"
N_EXPECTED=`cat "$ASSEMBLY_SUMMARY_FILE" | wc -l`
[[ $N_EXPECTED -gt 0 ]] || { echo "Domain $DOMAIN has no genomes with specified filter." >&2; exit 1; }
echo "Downloading $N_EXPECTED $DOMAIN genomes at assembly level $ASSEMBLY_LEVEL ... (will take a while)" >&2
cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" | sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process "$@"' _ | count $N_EXPECTED
echo >&2
done