-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathretrieve_assembled_reads.py
More file actions
46 lines (43 loc) · 1.72 KB
/
retrieve_assembled_reads.py
File metadata and controls
46 lines (43 loc) · 1.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from joblib import Parallel, delayed
import os
import ssl
import sys
from urllib.request import urlopen
from shutil import copyfileobj
from tqdm import tqdm
def download_read(accession, output_dir):
    """Download a single assembly file into ``output_dir``.

    Only links whose text contains ``"contigs"`` are fetched; any other link
    is skipped. The file is saved under the basename of the link.

    Args:
        accession: URL of the file to download.
        output_dir: Existing directory the file is written into.

    Returns:
        ``[accession]`` when the file was downloaded, otherwise an empty
        list (the link was skipped or the request failed).
    """
    # Currently only assemblies are downloaded, not read sets, for efficiency.
    if "contigs" not in accession:
        return []

    destination = os.path.join(output_dir, os.path.basename(accession))
    # Stream into a side file first so an interrupted transfer never leaves a
    # truncated file under the final name.
    partial = destination + ".part"
    try:
        # NOTE(security): this turns off HTTPS certificate verification for
        # every later urllib request in this process. Kept as-is because the
        # downloads may rely on it; review before using on untrusted networks.
        ssl._create_default_https_context = ssl._create_unverified_context
        with urlopen(accession) as in_stream, open(partial, 'wb') as out_file:
            copyfileobj(in_stream, out_file)
        os.replace(partial, destination)
    except Exception:
        # Best-effort download: report the failure and let the caller decide
        # whether to retry. KeyboardInterrupt/SystemExit are not swallowed.
        try:
            os.remove(partial)
        except OSError:
            # Nothing was written (or it is already gone).
            pass
        sys.stderr.write("\nRequest failed with: " + accession + "\n")
        return []
    return [accession]
# Driver: download every assembly ("contigs") link listed in fastq_links.txt
# into output_dir, eight at a time, then retry whatever failed.

# Directory that receives the downloaded files.
output_dir = "retrieved_ena_reads"
os.makedirs(output_dir, exist_ok=True)

# The link file holds one download link per line.
with open("fastq_links.txt", "r") as f:
    run_accessions = f.read().splitlines()

# Keep only assembly links. Duplicates are dropped (order preserved) so the
# same file is never fetched twice and the completion check below can succeed.
cleaned_accessions = list(
    dict.fromkeys(line for line in run_accessions if "contigs" in line)
)

# Number of links per batch and number of parallel workers.
n_jobs = 8
job_list = [
    cleaned_accessions[i:i + n_jobs]
    for i in range(0, len(cleaned_accessions), n_jobs)
]

# Links that have been downloaded successfully.
results = set()
for job in tqdm(job_list):
    outcomes = Parallel(n_jobs=n_jobs)(
        delayed(download_read)(access, output_dir) for access in job
    )
    for outcome in outcomes:
        # A successful download yields a one-element list; a failure yields
        # an empty list or None. Flatten so the set holds plain link strings.
        if outcome:
            results.update(outcome)

# Retry failed links sequentially, but only for a bounded number of rounds so
# a permanently dead link cannot hang the script.
max_retry_rounds = 5
for _ in range(max_retry_rounds):
    pending = [access for access in cleaned_accessions if access not in results]
    if not pending:
        break
    for access in pending:
        sys.stderr.write("\nRerequesting: " + str(access) + "\n")
        success = download_read(access, output_dir)
        if success:
            sys.stderr.write("\nRetrieval was successful for: " + str(access) + "\n")
            results.add(success[0])

# Report anything still missing and signal the failure through the exit code.
failed = [access for access in cleaned_accessions if access not in results]
if failed:
    sys.stderr.write(
        "\nGiving up on " + str(len(failed)) + " link(s) after "
        + str(max_retry_rounds) + " retry rounds:\n" + "\n".join(failed) + "\n"
    )
    sys.exit(1)