-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathESC_search_ENA.py
More file actions
64 lines (61 loc) · 2.97 KB
/
ESC_search_ENA.py
File metadata and controls
64 lines (61 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""Search for term in EBI to get accessions that link to genomic metadata in XML format"""
from joblib import Parallel, delayed
import requests
from tqdm import tqdm
import xmltodict
# Load the identifiers to look up: one identifier per line of the input file.
with open("ENA_READS_SUPPRESSED.txt") as id_file:
    Ids = id_file.read().splitlines()
_EBI_SEARCH_URL = "https://www.ebi.ac.uk/ebisearch/ws/rest/{}?query={}&fields=acc,description,name&size=100"
_ENA_EMBL_URL = "https://www.ebi.ac.uk/ena/browser/api/embl/{}?lineLimit=1000"


def _append_line(path, text):
    """Append *text* followed by a newline to the file at *path*."""
    with open(path, "a") as handle:
        handle.write(text + "\n")


def _record_hit(Id, accession):
    """Log a resolved *Id* and the accession it resolved to."""
    _append_line("ESC_FOUND.txt", Id)
    _append_line("ESC_SEARCH_FOUND_ACCESSIONS.txt", accession)


def _record_miss(Id):
    """Log an unresolved *Id*; ids containing "NC_" go to the NCBI assemblies list."""
    if "NC_" in Id:
        _append_line("ESC_NCBI_ASSEMBLIES.txt", Id)
    else:
        _append_line("ESC_NOT_FOUND.txt", Id)


def _ebi_search(domain, Id):
    """Query one EBI Search *domain* for *Id* and return the parsed XML as a dict."""
    # "#" would otherwise start a URL fragment and truncate the query string.
    url = _EBI_SEARCH_URL.format(domain, Id.replace("#", "%23"))
    return dict(xmltodict.parse(requests.get(url).text))


def search_ENA(Id):
    """Resolve one identifier to an accession via EBI Search and log the outcome.

    The identifier is first looked up in the ``sra-sample`` domain. If that
    reports zero hits, the ``wgs_masters`` domain is tried; on a hit there, the
    EMBL flat file of the matching entry is fetched from the ENA browser API
    and scanned for a ``BioSample`` line.

    Every outcome is appended to a text file in the working directory:

    * ``ESC_FOUND.txt`` (the id) and ``ESC_SEARCH_FOUND_ACCESSIONS.txt`` (the
      accession or BioSample id) when the id is resolved;
    * ``ESC_NCBI_ASSEMBLIES.txt`` when ``wgs_masters`` has no usable entry and
      the id contains ``"NC_"``;
    * ``ESC_NOT_FOUND.txt`` for every other miss or lookup failure.

    Args:
        Id: Identifier to search for. The placeholder ``"ND"`` is skipped
            without any lookup or log entry.

    Returns:
        The ``@id`` of the ``sra-sample`` entry, or the BioSample id taken from
        the EMBL record of the ``wgs_masters`` entry; ``None`` when the id is
        skipped, not found, or the lookup fails.

    Raises:
        Errors from the first (``sra-sample``) request or from parsing its
        response propagate to the caller; later failures are logged as misses.
    """
    if Id == "ND":
        return None

    accession_metadata = _ebi_search("sra-sample", Id)
    try:
        if accession_metadata["result"]["hitCount"] != "0":
            access = accession_metadata["result"]["entries"]["entry"]["@id"]
            _record_hit(Id, access)
            return access

        # No sample hit: fall back to the WGS master records.
        accession_metadata = _ebi_search("wgs_masters", Id)
        if accession_metadata["result"]["hitCount"] == "0":
            _record_miss(Id)
            return None

        try:
            access = accession_metadata["result"]["entries"]["entry"]["@id"]
        except (KeyError, TypeError):
            # The entry is not a single mapping (presumably several hits, which
            # xmltodict would return as a list -- TODO confirm). Log the miss
            # once and stop here; continuing would use an unbound ``access``
            # and log the same id a second time.
            _record_miss(Id)
            return None

        biosample = None
        for resp in requests.get(_ENA_EMBL_URL.format(access)).text.splitlines():
            if "BioSample" in resp:
                # Fifth space-separated token with dots removed; the last
                # matching line wins.
                # NOTE(review): loop nesting was reconstructed from a source
                # that had lost its indentation -- confirm last-vs-first match.
                biosample = resp.split(" ")[4].replace(".", "")
        if biosample is None:
            # The record carries no BioSample line, so there is nothing to report.
            _append_line("ESC_NOT_FOUND.txt", Id)
            return None

        _record_hit(Id, biosample)
        return biosample
    except Exception:
        # Worker boundary: any lookup or parsing failure is logged as a miss so
        # that one bad id does not abort the whole batch. KeyboardInterrupt and
        # SystemExit are deliberately not caught.
        _append_line("ESC_NOT_FOUND.txt", Id)
        return None
# Cut the identifier list into consecutive batches of 20.
job_list = []
for start in range(0, len(Ids), 20):
    job_list.append(Ids[start:start + 20])

# Resolve each batch with 20 joblib workers, collecting the per-id results
# (accession, BioSample id or None) in input order; tqdm reports progress
# one batch at a time.
foundAccessions = []
for batch in tqdm(job_list):
    workers = Parallel(n_jobs=20)
    foundAccessions.extend(workers(delayed(search_ENA)(entry) for entry in batch))