deSAMBA/download at master · hitbc/deSAMBA

227 lines (184 loc) · 6.76 KB
#!/bin/bash
#Author: Florian Breitwieser
#Script: centrifuge-download
#Site: https://github.com/infphilo/centrifuge/blob/master/centrifuge-download
#Revised by Gaoyang Li and Dengfeng Guan
set -eu -o pipefail

#######################################
# Report whether a command can be resolved by the shell.
# Arguments: $1 - command name to look up
# Returns:   0 if the command is available, non-zero otherwise
#######################################
exists() {
  command -v "$1" >/dev/null 2>&1
}

# Pick the downloader once: wget when available, curl otherwise.
# DL_PROG is a command plus flags kept in one string; callers expand it
# unquoted as:  $DL_PROG <output-file> <url>
# It is exported because download_n_process runs in child bash processes.
if hash wget 2>/dev/null; then
	DL_PROG="wget -N --reject=index.html -qO"
else
	DL_PROG="curl -s -o"
fi
export DL_PROG
#########################################################
## Functions
#######################################
# Fetch one gzipped FASTA file, decompress it and prefix every sequence
# header with "tid|<taxid>|ref|".
# Globals:   LIBDIR, DOMAIN, DL_PROG (read)
# Arguments: $1 - "<taxid><TAB><url of the .gz file>"
# Outputs:   the word "done" on stdout once the file has been processed;
#            error messages on stderr
# Returns:   exits 1 when the download fails, 255 when gunzip fails
#######################################
function download_n_process() {
    local taxid filepath name target
    IFS=$'\t' read -r taxid filepath <<< "$1"
    name=$(basename -- "$filepath" .gz)
    target="$LIBDIR/$DOMAIN/$name"

    # Skip the download when the compressed file is already present.
    if [[ ! -f "$target.gz" ]]; then
        # shellcheck disable=SC2086 -- DL_PROG holds a command plus its flags
        $DL_PROG "$target.gz" "$filepath" \
            || { printf '\nError downloading %s!\n' "$filepath" >&2; exit 1; }
    fi
    gunzip -f "$target.gz" \
        || { printf '\nError gunzipping %s [ downloaded from %s ]!\n' "$target.gz" "$filepath" >&2; exit 255; }
    sed -i "s/^>/>tid|$taxid|ref|/" "$target"
    echo done
}
export -f download_n_process
# terminfo clr_eol; left empty when tput cannot resolve it so that a missing
# or unknown TERM does not abort the script under `set -e`
ceol=$(tput el 2>/dev/null || true)

#######################################
# Progress filter: consume lines from stdin, count every line equal to
# "done" and redraw a 40-column progress bar on stderr; every other line is
# passed through unchanged on stdout.
# Arguments: $1 - number of "done" lines expected in total
# Outputs:   progress bar on stderr, non-"done" lines on stdout
#######################################
function count {
   local total=${1:?count requires the expected total}
   local -r width=40
   local completed=0 line percent filled bar_done bar_left
   while IFS= read -r line; do
      if [[ "$line" == "done" ]]; then
        completed=$(( completed + 1 ))
        # A total of 0 would divide by zero; treat it as already complete.
        if (( total > 0 )); then
          percent=$(( completed * 100 / total ))
        else
          percent=100
        fi
        filled=$(( percent * width / 100 ))
        # More "done" lines than expected must not yield a negative remainder.
        if (( filled > width )); then
          filled=$width
        fi
        # Build progressbar string lengths
        printf -v bar_done '%*s' "$filled" ''
        printf -v bar_left '%*s' "$(( width - filled ))" ''
        printf '\rProgress : [%s%s] %d%% %d/%s' \
          "${bar_done// /#}" "${bar_left// /-}" "$percent" "$completed" "$total" 1>&2
      else
        printf '%s\n' "$line"
      fi
   done
}
#######################################
# Make sure directory $1 exists. An existing, non-empty directory is accepted
# with a notice on stderr; otherwise the directory (and parents) is created.
# Arguments: $1 - directory path
# Outputs:   notice on stderr when the directory already has content
# Returns:   0 when the directory is usable, mkdir's status if creation fails
#######################################
function check_or_mkdir_no_fail {
    local dir=$1
    #echo -n "Creating $1 ... " >&2
    # find prints the path only when it is an empty directory, so empty
    # output here means "exists and already has content".
    if [[ -d "$dir" && -z "$(find "$dir" -prune -empty -type d)" ]]; then
        echo "Directory exists already! Continuing" >&2
        return 0
    fi
    #echo "Done" >&2
    mkdir -p -- "$dir"
}
## Check if GNU parallel exists
if command -v parallel >/dev/null 2>&1; then
    PARALLEL=1
else
    PARALLEL=0
fi

# Human-readable lists of accepted values, shown in the usage text.
ALL_GENOMES="bacteria, viral, archaea"
ALL_DATABASES="refseq, genbank, taxonomy"
ALL_ASSEMBLY_LEVELS="Complete\ Genome, Chromosome, Scaffold, Contig"
ALL_REFSEQ_CATEGORY="representative\ genome, reference\ genome, na"

## Option parsing
# Defaults; the script runs under `set -u`, so every setting that is read
# later needs a value here.
DATABASE="refseq"
ASSEMBLY_LEVEL="Complete Genome"
REFSEQ_CATEGORY=""
# No option sets TAXID in this script, so keep any value inherited from the
# environment and otherwise start empty.
TAXID="${TAXID:-}"
DOWNLOAD_GI_MAP=0
BASE_DIR="."
N_PROC=1
CHANGE_HEADER=0
DOWNLOAD_RNA=0
FILTER_UNPLACED=0
# Help text printed when the command line is malformed. It is built once,
# here, so the bracketed defaults show the values in effect at this point.
USAGE="
Program:   download
Usage:
    $(basename -- "$0") [<options>] <database>

    <database>  STR    One of refseq, genbank, or taxonomy:
                         - use refseq or genbank for genomic sequences,
                         - taxonomy for taxonomy mappings.
  Options:
    -o  FOLDER  Folder to which the files are downloaded['$BASE_DIR'].
    -P  INT     Number of processes when downloading['${N_PROC:-1}'].
  Options: (WHEN USING refseq OR genbank as <database>)
    -d  STR     What domain to download. 
                 - One or more of ${ALL_GENOMES}(comma separated).
    -a  STR     Only download specified assembly level['$ASSEMBLY_LEVEL'].
                 - One of ($ALL_ASSEMBLY_LEVELS).
    -c  STR     only download genomes the specified refseq category[any].
                 - ($ALL_REFSEQ_CATEGORY). 
    -g          Download GI map.
"
# getopts variables: $OPTIND (index of the next argument to process),
# $OPTARG (argument of the current option, or the offending option letter
# when an error is reported in silent mode)
#######################################
# Parse the command-line options into the global settings.
# Globals:   BASE_DIR, N_PROC, DOMAINS, ASSEMBLY_LEVEL, REFSEQ_CATEGORY,
#            DOWNLOAD_GI_MAP (written); OPTIND (advanced by getopts so the
#            caller can shift the parsed options away)
# Arguments: the script's command-line arguments
# Outputs:   error message on stderr for an unknown option or a missing
#            option argument
# Returns:   exits 1 on an invalid option or a missing option argument
#######################################
parse_options() {
    local opt
    # The leading ':' selects getopts' silent error mode: only then is the
    # ':' arm reachable and $OPTARG set to the offending letter in both
    # error arms (otherwise $OPTARG is unset there and trips `set -u`).
    while getopts ":o:P:d:a:c:g" opt "$@"; do
        case "$opt" in
            o) BASE_DIR="$OPTARG" ;;
            P) N_PROC="$OPTARG" ;;
            # comma separated list -> space separated list
            d) DOMAINS=${OPTARG//,/ } ;;
            a) ASSEMBLY_LEVEL="$OPTARG" ;;
            c) REFSEQ_CATEGORY="$OPTARG" ;;
            g) DOWNLOAD_GI_MAP=1 ;;
            \?) echo "Invalid option: -$OPTARG" >&2
                exit 1
                ;;
            :) echo "Option -$OPTARG requires an argument." >&2
               exit 1
               ;;
        esac
    done
}
parse_options "$@"
#echo $REFSEQ_CATEGORY
#echo $ASSEMBLY_LEVEL
# Drop the options consumed by getopts; exactly one positional argument,
# the database name, must remain.
shift $((OPTIND-1))
if [[ $# -ne 1 ]]; then
    # Print the help text as data, never as a printf format string.
    printf '%s' "$USAGE" >&2
    exit 1
fi
DATABASE=$1
#### TAXONOMY DOWNLOAD
FTP="ftp://ftp.ncbi.nih.gov"
if [[ "$DATABASE" == "taxonomy" ]]; then
  echo "Downloading NCBI taxonomy ... " >&2
  if ! check_or_mkdir_no_fail "$BASE_DIR"; then
    exit 1
  fi
  cd "$BASE_DIR" > /dev/null
  if [[ "$DOWNLOAD_GI_MAP" == "1" ]]; then
    # GI -> taxid map; every line gets a "gi|" prefix.
    # shellcheck disable=SC2086 -- DL_PROG holds a command plus its flags
    $DL_PROG gi_taxid_nucl.dmp.gz "$FTP/pub/taxonomy/gi_taxid_nucl.dmp.gz"
    gunzip -c gi_taxid_nucl.dmp.gz | sed 's/^/gi|/' > gi_taxid_nucl.map
  else
    # Only nodes.dmp and names.dmp are kept from the taxonomy dump; both are
    # extracted in a single pass over the archive.
    # shellcheck disable=SC2086 -- DL_PROG holds a command plus its flags
    $DL_PROG taxdump.tar.gz "$FTP/pub/taxonomy/taxdump.tar.gz"
    tar -zxvf taxdump.tar.gz nodes.dmp names.dmp
    rm taxdump.tar.gz
  fi
  cd - > /dev/null
  # "taxonomy" is not a genome database: stop here instead of falling through
  # to the refseq/genbank download below.
  exit 0
fi
#### REFSEQ/GENBANK DOWNLOAD
export LIBDIR="$BASE_DIR"
#export CHANGE_HEADER="$CHANGE_HEADER"

## Fields in the assembly_summary.txt file
REFSEQ_CAT_FIELD=5
TAXID_FIELD=6
SPECIES_TAXID_FIELD=7
VERSION_STATUS_FIELD=11
ASSEMBLY_LEVEL_FIELD=12
FTP_PATH_FIELD=20

#######################################
# Build the awk condition that selects rows of assembly_summary.txt.
# The values are spliced into the awk program text, so they must not
# contain double quotes or backslashes.
# Globals:   ASSEMBLY_LEVEL_FIELD, VERSION_STATUS_FIELD, REFSEQ_CAT_FIELD,
#            TAXID_FIELD (read)
# Arguments: $1 - assembly level to match exactly
#            $2 - refseq category to match exactly, or "" for any
#            $3 - '|'-separated taxonomy IDs, or "" for any
# Outputs:   the awk condition on stdout
#######################################
build_awk_query() {
    local level=$1 category=$2 taxids=$3
    local query
    # Always restrict to the requested assembly level and the latest version.
    query="\$$ASSEMBLY_LEVEL_FIELD==\"$level\" && \$$VERSION_STATUS_FIELD==\"latest\""
    if [[ -n "$category" ]]; then
        query="$query && \$$REFSEQ_CAT_FIELD==\"$category\""
    fi
    if [[ -n "$taxids" ]]; then
        query="$query && match(\$$TAXID_FIELD,\"^($taxids)\$\")"
    fi
    printf '%s\n' "$query"
}

# TAXID may never have been assigned; default it so `set -u` does not abort,
# then turn the comma separated list into a regex alternation.
TAXID=${TAXID:-}
TAXID=${TAXID//,/|}
AWK_QUERY=$(build_awk_query "$ASSEMBLY_LEVEL" "$REFSEQ_CATEGORY" "$TAXID")
#echo "$AWK_QUERY" >&2
#echo "Downloading genomes for $DOMAINS at assembly level $ASSEMBLY_LEVEL" >&2
# Fetch the directory listing of the chosen database into ./.listing, which
# the domain validation below greps.
if command -v wget >/dev/null 2>&1; then
	# --no-remove-listing makes wget keep the raw FTP listing as ./.listing;
	# the page it prints to stdout is not needed.
	wget -qO- --no-remove-listing "ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/" > /dev/null
else
	curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/" > .listing
fi
#if [[ "$CHANGE_HEADER" == "1" ]]; then
    #echo "Modifying header to include taxonomy ID" >&2
# Download the genomes of every requested domain: validate the domain
# against the listing, fetch and filter assembly_summary.txt, then download
# the selected genomes in parallel while showing a progress bar.
if [[ -z "${DOMAINS:-}" ]]; then
    echo "No domain given - use -d with one or more comma separated domains." >&2
    exit 1
fi
# shellcheck disable=SC2086 -- DOMAINS is a space separated list
for DOMAIN in $DOMAINS; do
    if ! grep -q -- " $DOMAIN" .listing; then
        echo "$DOMAIN is not a valid domain - use one of the following:" >&2
        grep '^d' .listing  | sed 's/.* //'
        exit 1
    fi
    #if [[ "$CHANGE_HEADER" != "1" ]]; then
        #echo "Writing taxonomy ID to sequence ID map to STDOUT" >&2
        #[[ -f "$LIBDIR/$DOMAIN.map" ]] && rm "$LIBDIR/$DOMAIN.map"
    # download_n_process runs in child shells and reads DOMAIN from the
    # environment.
    export DOMAIN
    check_or_mkdir_no_fail "$LIBDIR/$DOMAIN"
    FULL_ASSEMBLY_SUMMARY_FILE="$LIBDIR/$DOMAIN/assembly_summary.txt"
    ASSEMBLY_SUMMARY_FILE="$LIBDIR/$DOMAIN/assembly_summary_filtered.txt"
    #echo "Downloading and filtering the assembly_summary.txt file ..." >&2
    # The downloader writes the file itself (-O / -o); redirecting stdout onto
    # the same path would only truncate it a second time.
    # shellcheck disable=SC2086 -- DL_PROG holds a command plus its flags
    $DL_PROG "$FULL_ASSEMBLY_SUMMARY_FILE" "ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/$DOMAIN/assembly_summary.txt"
    printf '%s\n' "$AWK_QUERY"
    awk -F "\t" "BEGIN {OFS=\"\t\"} $AWK_QUERY" "$FULL_ASSEMBLY_SUMMARY_FILE" > "$ASSEMBLY_SUMMARY_FILE"
    N_EXPECTED=$(wc -l < "$ASSEMBLY_SUMMARY_FILE")
    # Arithmetic expansion strips the padding some wc implementations add.
    N_EXPECTED=$(( N_EXPECTED ))
    [[ $N_EXPECTED -gt 0 ]] || { echo "Domain $DOMAIN has no genomes with specified filter." >&2; exit 1; }
    echo "Downloading $N_EXPECTED $DOMAIN genomes at assembly level $ASSEMBLY_LEVEL ... (will take a while)" >&2
    # Each row becomes "<taxid>TAB<ftp dir>/<basename>_genomic.fna.gz" and is
    # handed, NUL-delimited, to one download_n_process call per row.
    cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" \
        | sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' \
        | tr '\n' '\0' \
        | xargs -0 -n1 -P "${N_PROC:-1}" bash -c 'download_n_process "$@"' _ \
        | count "$N_EXPECTED"
    echo >&2
done
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

download

Latest commit

History

download

File metadata and controls