From e335961865366c8d8c3e77194621fbe194abea40 Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Wed, 8 Dec 2021 14:10:14 -0500 Subject: [PATCH 01/13] Added base functionality for SciQuery module. --- py3/SciServer/Config.py | 2 + py3/SciServer/SciQuery.py | 352 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 354 insertions(+) create mode 100644 py3/SciServer/SciQuery.py diff --git a/py3/SciServer/Config.py b/py3/SciServer/Config.py index 0a7818c..afb993e 100644 --- a/py3/SciServer/Config.py +++ b/py3/SciServer/Config.py @@ -36,6 +36,8 @@ version = "sciserver-v2.1.0" #sciserver release version ComputeJobDirectoryFile = "/home/idies/jobs.path" #the path to the file in the "Docker job container" that shows the directory path where the asynchronous compute job is being executed. ComputeUrl = "https://apps.sciserver.org/compute" +SciqueryURL = "https://apps.sciserver.org/sciquery" +ComputeWorkDir = "/home/idies/workspace/" def _load_config(filename): if os.path.exists(filename): diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py new file mode 100644 index 0000000..ef59e6d --- /dev/null +++ b/py3/SciServer/SciQuery.py @@ -0,0 +1,352 @@ +from SciServer import Authentication, Config, Files, Jobs +import pandas as pd +import numpy as np +import json +import requests +from functools import lru_cache +from datetime import datetime + +def _get_default_rdb_domain(): + rdb_domains = Jobs.getRDBComputeDomainsNames() + if len(rdb_domains) > 0: + return rdb_domains[0] + else: + raise Exception("There are no rdbComputeDomains available for the user."); + + +class OutputTargetType: + """ + Contains a set of allowed database output types. + """ + FILE_JSON = "FILE_JSON" + FILE_CSV = "FILE_CSV" + DATABASE_TABLE = "TABLE" + + +class FileOutput: + """ + Defines the output of a database query to a file. 
+ """ + def __init__(self, target_name: str = "result.json", target_type: str = OutputTargetType.FILE_JSON, + statement_indexes: list = [1]): + """ + :param target_name: name of the file (string), such as "result.json" + :param target_type: type (string) of the file containing the query result(s) (e.g., "FILE_JSON"). As set of possible values is given by the static members of class 'SciQuery.OutputTargetType' + :param statement_indexes: list of integers or integer. Each integer value denotes the index or position (>=1) of the sql statements whithin the input query, whose resultset is going to be written into this OutputTarget + """ + if type(target_name) != str or type(target_type) != str: + raise ValueError("Invalid type(s) for input parameter(s) 'target_name' or 'target_type'") + self.target_name = target_name + self.target_type = target_type + self.set_statement_indexes(statement_indexes) + + def set_statement_indexes(self, statement_indexes: list = [1]): + """ + Sets the index(es) of the sql statement(s) whithin the input query, whose resultset(s) is(are) going to be written into this OutputTarget. + :param statement_indexes: list of integers, which are the indices (starting with 1) of the sql statements whithin the input query, whose resultsets are going to be written into this OutputTarget. + """ + if type(statement_indexes) != list: + statement_indexes = [statement_indexes] + for index in statement_indexes: + if type(index) != int or index <= 0: + raise ValueError("Invalid type for input parameter 'statement_indexes'") + self.statement_indexes = [i for i in sorted(set(statement_indexes))] + return self + + @classmethod + def get_default(cls): + """ + Gets a OutputTarget object filled with default values: JSON output file where only the 1st SQL statement of the query is written in it. 
+ """ + cls.target_name = "result.json" + cls.target_type = OutputTargetType.FILE_JSON + cls.statement_indexes = [1] + return cls + + def __str__(self): + return "File Output of target_name = {}, target_type= {}, statement_indexes = {}".format(self.target_name, + self.target_type, + self.statement_indexes) + + def __repr__(self): + return "FileOutput(target_name = {}, target_type= {}, statement_indexes = {})".format(self.target_name, + self.target_type, + self.statement_indexes) + + +class DatabaseTableOutput: + """ + Defines the output of a database query to a database table + """ + def __init__(self, table: str = "resultTable", database: str = "", rdb_domain: str = "", schema: str = "", + statement_indexes: list = [1]): + """ + :param table: name of the database table (string), such as "resultTable" + :param database: name of the database (string) where the output table in created. If it is owned explicitly by a user, then it should follow the pattern "mydb:username" + :param rdb_domain: name (string) of the relational database (RDB) compute domain that contains the database. Name of such domains available to the user is returned by the function Jobs.getRDBComputeDomainNames(). + :param schema: database schema (string) + :param statement_indexes: list of integers or integer. 
Each integer value denotes the index or position (>=1) of the sql statements whithin the input query, whose resultset is going to be written into this OutputTarget + """ + if type(table) != str or type(rdb_domain) != str or type(schema) != str: + raise ValueError("Invalid type(s) for input parameter(s) 'target_name' or 'target_type'") + + self.table = table + self.database = database if database != "" else "mydb:" + Authentication.keystoneUser.userName + self.rdb_domain = rdb_domain if rdb_domain != "" else _get_default_rdb_domain() + self.schema = schema + self.set_statement_indexes(statement_indexes) + self.target_name = ".".join([rdb_domain, database, schema, table]) + self.target_type = OutputTargetType.DATABASE_TABLE + + def set_statement_indexes(self, statement_indexes: list = [1]): + """ + Sets the index(es) of the sql statement(s) whithin the input query, whose resultset(s) is(are) going to be written into this OutputTarget. + :param statement_indexes: list of integers, which are the indices (starting with 1) of the sql statements whithin the input query, whose resultsets are going to be written into this OutputTarget. + """ + if type(statement_indexes) != list: + statement_indexes = [statement_indexes] + for index in statement_indexes: + if type(index) != int or index <= 0: + raise ValueError("Invalid type for input parameter 'statement_indexes'") + self.statement_indexes = [i for i in sorted(set(statement_indexes))] + return self + + @classmethod + def get_default(cls): + """ + Gets a OutputTarget object filled with default values: JSON output file where only the 1st SQL statement of the query is written in it. 
+ """ + cls.target_name = "resultTable" + cls.database = "mydb:" + Authentication.keystoneUser.userName + cls.rdb_domain = _get_default_rdb_domain() + cls.schema = "" + cls.target_type = OutputTargetType.DATABASE_TABLE + cls.statement_indexes = [1] + return cls + + def __str__(self): + return "Database Table Output of table = {}, database= {}, rdb_domain = {}, schema = {}, statement_indexes = {}".format( + self.table, self.database, self.rdb_domain, self.schema, self.statement_indexes) + + def __repr__(self): + return "DatabaseTableOutput(table = {}, database= {}, rdb_domain = {}, schema = {}, statement_indexes = {})".format( + self.table, self.database, self.rdb_domain, self.schema, self.statement_indexes) + + +class RDBJob: + """ + Contains the definition of an RDB job + """ + def __init__(self, job): + """ + :param job: can be the job ID (string), or the + """ + if type(job) != dict: + job = Jobs.getJobDescription(job) + for k, v in job.items(): + setattr(self, k, v) + + def get_output_targets(self): + output_targets = {} + for t in self.targets: + i = (t['location'], t['type']) + if i not in output_targets: + output_targets[i] = [t['resultNumber']] + else: + output_targets[i] = output_targets[i].append(t['resultNumber']) + + targets = [] + for k in output_targets: + if k[1] == OutputTargetType.DATABASE_TABLE: + p = k[0].split(".") + targets.append(DatabaseTableOutput(table=p[3], database=p[1], rdb_domain=p[0], schema=p[2], + statement_indexes=output_targets[k])) + else: + targets.append(FileOutput(target_name=k[0], target_type=k[1], statement_indexes=output_targets[k])) + + # output_targets = [ OutputTarget(l,t,output_targets[(l,t)]) for l,t in output_targets] + # return output_targets + return targets + + def get_results_folder_path(self): + path = ":".join(self.resultsFolderURI.split(":")[1:]) + if not path.startswith(Config.ComputeWorkDir): + path = Config.ComputeWorkDir + path[1:] + return path + + def get_output_target_path(self, output_target): + if 
output_target.target_type == OutputTargetType.DATABASE_TABLE: + raise ValueError("Output target is not a file but a database") + return self.get_results_folder_path() + output_target.target_name + + def get_fileservice_folder_path(self): + path = ":".join(self.resultsFolderURI.split(":")[1:]) + if path.startswith(Config.ComputeWorkDir): + path = "/" + path.replace(Config.ComputeWorkDir, "", 1) + return path + + def get_start_time(self): + return datetime.fromtimestamp(self.startTime / 1000.0) + + def get_end_time(self): + return datetime.fromtimestamp(self.endTime / 1000.0) + + def get_duration(self): + return self.duration + + def __str__(self): + return "RDB Job of id = {}".format(self.id) + + def __repr__(self): + return "RDBJob(id = {})".format(self.id) + + + +@lru_cache(128) +def _get_file_service(file_service_id=""): + print(file_service_id) + file_services = Files.getFileServices(verbose=False) + for file_service in file_services: + if file_service["name"] == file_service_id or file_service["identifier"] == file_service_id: + return file_service + + if len(file_services) > 0: + return file_services[0] + else: + raise Exception("No fileservices available for the user") + + +def submitQueryJob(sqlQuery, + rdbComputeDomain=None, + databaseContextName=None, + output_targets=FileOutput.get_default(), + resultsFolderPath="", + jobAlias="", + file_service_name=""): + """ + Submits a sql query for execution (as an asynchronous job) inside a relational database (RDB) compute domain. + + :param sqlQuery: sql query (string) + :param rdbComputeDomain: object (dictionary) that defines a relational database (RDB) compute domain. A list of + these kind of objects available to the user is returned by the function Jobs.getRDBComputeDomains(). + :param databaseContextName: database context name (string) on which the sql query is executed. + :param output_targets: object of type SciQuery.OutputTarget defining the output of one or multiple statements + within the input query. 
Could also be a list of OutputTarget objects. + :param resultsFolderPath: full path to results folder (string) where query output tables are written into. + E.g.: /home/idies/workspace/rootVOlume/username/userVolume/jobsFolder . If not set, + then a default folder will be set automatically. + :param jobAlias: alias (string) of job, defined by the user. + :param file_service_name: name or uuid (string) of FileService where the results folder (resultsFolderPath) is + going to be created. If not defined, then the first available FileService is chosen by default. + :return: the ID (integer) that labels the job. + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if + the HTTP request to the SciQuery API returns an error, or if the volumes defined by the user are not available in + the Docker compute domain. + :example: job_id = SciQuery.submitQueryJob('select 1';,None, None, 'myQueryResults', 'myNewJob') + + .. seealso:: Jobs.submitNotebookJob, Jobs.submitShellCommandJob, Jobs.getJobStatus, Jobs.getDockerComputeDomains, + Jobs.cancelJob + """ + + token = Authentication.getToken() + if token is not None and token != "": + + if Config.isSciServerComputeEnvironment(): + taskName = "Compute.SciScript-Python.Sciquery.submitQueryJob" + else: + taskName = "SciScript-Python.Sciquery.submitQueryJob" + + if rdbComputeDomain is None: + rdbComputeDomains = Jobs.getRDBComputeDomains(); + if len(rdbComputeDomains) > 0: + rdbComputeDomain = rdbComputeDomains[0]; + else: + raise Exception("There are no rdbComputeDomains available for the user."); + + if databaseContextName is None: + databaseContexts = rdbComputeDomain.get('databaseContexts'); + if len(databaseContexts) > 0: + databaseContextName = databaseContexts[0].get('name') + else: + raise Exception("rbdComputeDomain has no database contexts available for the user."); + + if type(output_targets) != list: + output_targets = [output_targets] + + targets = [] + for 
target in output_targets: + for index in target.statement_indexes: + targets.append({'location': target.target_name, 'type': target.target_type, 'resultNumber': index}) + + rdbDomainId = rdbComputeDomain.get('id'); + + file_service = _get_file_service(file_service_name) + resultsFolderPath = file_service['identifier'] + ":" + resultsFolderPath + + dockerJobModel = { + "inputSql": sqlQuery, + "submitterDID": jobAlias, + "databaseContextName": databaseContextName, + "rdbDomainId": rdbDomainId, + "targets": targets, + "resultsFolderURI": resultsFolderPath + } + + print(dockerJobModel) + + data = json.dumps(dockerJobModel).encode() + url = Config.SciqueryURL + "/api/jobs/" + str(rdbDomainId) + "?TaskName=" + taskName; + headers = {'X-Auth-Token': token, "Content-Type": "application/json"} + print(url) + res = requests.post(url, data=data, headers=headers, stream=True) + + if res.status_code < 200 or res.status_code >= 300: + raise Exception("Error when submitting a job to the SciQuery API.\nHttp Response from SciQuery API " + + "returned status code " + str(res.status_code) + ":\n" + res.content.decode()); + else: + return res.content.decode() + else: + raise Exception("User token is not defined. First log into SciServer.") + + +def execute_query(query, + rdb_compute_domain=None, + database_context=None, + results_folder_path="", + job_alias="", + poll_time=0.2, + file_service_name=""): + output_target = FileOutput("result1.json", OutputTargetType.FILE_JSON).set_statement_indexes([1]) + + jobId = submitQueryJob(sqlQuery=query, rdbComputeDomain=rdb_compute_domain, databaseContextName=database_context, + output_targets=output_target, + resultsFolderPath=results_folder_path, + jobAlias=job_alias, file_service_name=file_service_name) + + job_status = Jobs.waitForJob(jobId, verbose=False, pollTime=poll_time) + job = RDBJob(jobId) + if job.status > 32: + messages = ". 
".join(job.messages) if len(job.messages) > 0 else "" + if (job.status == 64): + raise Exception("Query ended with an error. " + messages) + if (job.status == 128): + raise Exception("Query was cancelled. " + messages) + + if Config.isSciServerComputeEnvironment(): + file_path = job.get_output_target_path(output_target) + with open(file_path, ) as f: + j = json.load(f) + else: + file_service_id = job.resultsFolderURI.split(":")[0] + file_service = _get_file_service(file_service_id) + path = job.get_fileservice_folder_path(output_target) + s = Files.download(file_service, path, format="txt", quiet=True) + j = json.loads(s) + + data = np.asarray(j['Result'][0]['Data']) + column_names = j['Result'][0]['ColumnNames'] + name = j['Result'][0]['TableName'] + + df = pd.DataFrame(data=data, columns=column_names) + df.name = name + return df From 81d299ed8abee09a21b96527c79ba19801774394 Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Fri, 17 Dec 2021 06:40:11 -0500 Subject: [PATCH 02/13] Added more functions to SciQuery module and improved docs. Removed RDB references in Jobs module. --- py3/SciServer/Jobs.py | 161 +------------------------- py3/SciServer/SciQuery.py | 231 ++++++++++++++++++++++++++++++++------ 2 files changed, 199 insertions(+), 193 deletions(-) diff --git a/py3/SciServer/Jobs.py b/py3/SciServer/Jobs.py index 6a372ef..525598e 100644 --- a/py3/SciServer/Jobs.py +++ b/py3/SciServer/Jobs.py @@ -18,7 +18,7 @@ def getDockerComputeDomains(): :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. :example: dockerComputeDomains = Jobs.getDockerComputeDomains(); - .. seealso:: Jobs.submitShellCommandJob, Jobs.getJobStatus, Jobs.getRDBComputeDomains, Jobs.cancelJob + .. 
seealso:: Jobs.submitShellCommandJob, Jobs.getJobStatus, Jobs.cancelJob """ token = Authentication.getToken() if token is not None and token != "": @@ -72,7 +72,7 @@ def getDockerComputeDomainFromName(dockerComputeDomainName, dockerComputeDomains :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. :example: dockerComputeDomain = Jobs.getDockerComputeDomainFromName('dockerComputeDomainAtJHU'); - .. seealso:: Jobs.getDockerComputeDomains, Jobs.getRDBComputeDomains, Jobs.getRDBComputeDomainFromName + .. seealso:: Jobs.getDockerComputeDomains """ if dockerComputeDomainName is None: raise Exception("dockerComputeDomainName is not defined.") @@ -90,83 +90,6 @@ def getDockerComputeDomainFromName(dockerComputeDomainName, dockerComputeDomains raise Exception("DockerComputeDomain of name '" + dockerComputeDomainName + "' is not available or does not exist."); -def getRDBComputeDomains(): - """ - Gets a list of all registered Relational Database (RDB) compute domains that the user has access to. - - :return: a list of dictionaries, each one containing the definition of an RDB compute domain. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. - :example: rdbComputeDomains = Jobs.getRDBComputeDomains(); - - .. 
seealso:: Jobs.submitShellCommandJob, Jobs.getJobStatus, Jobs.getDockerComputeDomains, Jobs.cancelJob - """ - token = Authentication.getToken() - if token is not None and token != "": - - if Config.isSciServerComputeEnvironment(): - taskName = "Compute.SciScript-Python.Jobs.getRDBComputeDomains" - else: - taskName = "SciScript-Python.Jobs.getRDBComputeDomains" - - url = Config.RacmApiURL + "/jobm/rest/computedomains/rdb?TaskName=" + taskName - headers = {'X-Auth-Token': token, "Content-Type": "application/json"} - res = requests.get(url, headers=headers, stream=True) - if res.status_code != 200: - raise Exception("Error when getting RDB Compute Domains from JOBM API.\nHttp Response from JOBM API returned status code " + str(res.status_code) + ":\n" + res.content.decode()); - else: - return json.loads(res.content.decode()) - else: - raise Exception("User token is not defined. First log into SciServer.") - - -def getRDBComputeDomainsNames(rdbComputeDomains=None): - """ - Returns the names of the RDB compute domains available to the user. - - :param rdbComputeDomains: a list of rdbComputeDomain objects (dictionaries), as returned by Jobs.getRDBComputeDomains(). If not set, then an extra internal call to Jobs.getRDBComputeDomains() is made. - :return: an array of strings, each being the name of a rdb compute domain available to the user. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the RACM API returns an error. - :example: dockerComputeDomainsNames = Files.getDockerComputeDomainsNames(); - - .. 
seealso:: Files.getRDBComputeDomains - """ - if rdbComputeDomains is None: - rdbComputeDomains = getRDBComputeDomains(); - - rdbComputeDomainsNames = []; - for rdbComputeDomain in rdbComputeDomains: - rdbComputeDomainsNames.append(rdbComputeDomain.get('name')) - - return rdbComputeDomainsNames; - - -def getRDBComputeDomainFromName(rdbComputeDomainName, rdbComputeDomains = None): - """ - Returns an RDBComputeDomain object, given its registered name. - - :param rdbComputeDomainName: name of the RDBComputeDomainName, as shown within the results of Jobs.getRDBComputeDomains() - :param rdbComputeDomains: a list of rdbComputeDomain objects (dictionaries), as returned by Jobs.getRDBComputeDomains(). If not set, then an extra internal call to Jobs.getRDBComputeDomains() is made. - :return: an RDBComputeDomain object (dictionary) that defines an RDB compute domain. A list of these kind of objects available to the user is returned by the function Jobs.getRDBComputeDomains(). - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. - :example: rdbComputeDomain = Jobs.getRDBComputeDomainFromName('rdbComputeDomainAtJHU'); - - .. 
seealso:: Jobs.getDockerComputeDomains, Jobs.getRDBComputeDomains, Jobs.getDockerComputeDomainFromName - """ - if rdbComputeDomainName is None: - raise Exception("rdbComputeDomainName is not defined.") - else: - if rdbComputeDomains is None: - rdbComputeDomains = getRDBComputeDomains(); - - if rdbComputeDomains.__len__() > 0: - for rdbComputeDomain in rdbComputeDomains: - if rdbComputeDomainName == rdbComputeDomain.get('name'): - return rdbComputeDomain; - else: - raise Exception("There are no RDBComputeDomains available for the user."); - - raise Exception("RDBComputeDomain of name '" + rdbComputeDomainName + "' is not available or does not exist."); - def getJobsList(top=10, open=None, start=None, end=None, type='all'): """ @@ -511,86 +434,6 @@ def submitShellCommandJob(shellCommand, dockerComputeDomain = None, dockerImageN else: raise Exception("User token is not defined. First log into SciServer.") -def submitRDBQueryJob(sqlQuery, rdbComputeDomain=None, databaseContextName = None, resultsName='queryResults', resultsFolderPath="", jobAlias = ""): - """ - Submits a sql query for execution (as an asynchronous job) inside a relational database (RDB) compute domain. - - :param sqlQuery: sql query (string) - :param rdbComputeDomain: object (dictionary) that defines a relational database (RDB) compute domain. A list of these kind of objects available to the user is returned by the function Jobs.getRDBComputeDomains(). - :param databaseContextName: database context name (string) on which the sql query is executed. - :param resultsName: name (string) of the table or file (without file type ending) that contains the query result. In case the sql query has multiple statements, should be set to a list of names (e.g., ['result1','result2']). - :param resultsFolderPath: full path to results folder (string) where query output tables are written into. E.g.: /home/idies/workspace/rootVOlume/username/userVolume/jobsFolder . 
If not set, then a default folder will be set automatically. - :param jobAlias: alias (string) of job, defined by the user. - :return: a dictionary containing the definition of the submitted job. - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if the HTTP request to the JOBM API returns an error, or if the volumes defined by the user are not available in the Docker compute domain. - :example: job = Jobs.submitRDBQueryJob('select 1';,None, None, 'myQueryResults', 'myNewJob') - - .. seealso:: Jobs.submitNotebookJob, Jobs.submitShellCommandJob, Jobs.getJobStatus, Jobs.getDockerComputeDomains, Jobs.cancelJob - """ - - token = Authentication.getToken() - if token is not None and token != "": - - if Config.isSciServerComputeEnvironment(): - taskName = "Compute.SciScript-Python.Jobs.submitRDBQueryJob" - else: - taskName = "SciScript-Python.Jobs.submitRDBQueryJob" - - if rdbComputeDomain is None: - rdbComputeDomains = getRDBComputeDomains(); - if rdbComputeDomains .__len__() > 0: - rdbComputeDomain = rdbComputeDomains[0]; - else: - raise Exception("There are no rdbComputeDomains available for the user."); - - if databaseContextName is None: - databaseContexts = rdbComputeDomain.get('databaseContexts'); - if databaseContexts.__len__() > 0: - databaseContextName = databaseContexts[0].get('name') - else: - raise Exception("rbdComputeDomain has no database contexts available for the user."); - - targets = []; - if type(resultsName) == str: - targets.append({'location': resultsName, 'type': 'FILE_CSV', 'resultNumber': 1}); - elif type(resultsName) == list: - if len(set(resultsName)) != len(resultsName): - raise Exception("Elements of parameter 'resultsName' must be unique"); - - for i in range(len(resultsName)): - if type(resultsName[i]) == str: - targets.append({'location': resultsName[i], 'type': 'FILE_CSV', 'resultNumber': i+1}); - else: - raise Exception("Elements of array 'resultsName' are not strings"); - 
- else: - raise Exception("Type of parameter 'resultsName' is not supported"); - - - rdbDomainId = rdbComputeDomain.get('id'); - - dockerJobModel = { - "inputSql": sqlQuery, - "submitterDID": jobAlias, - "databaseContextName": databaseContextName, - "rdbDomainId": rdbDomainId, - "targets": targets, - "resultsFolderURI":resultsFolderPath - } - - data = json.dumps(dockerJobModel).encode() - url = Config.RacmApiURL + "/jobm/rest/jobs/rdb?TaskName="+taskName; - headers = {'X-Auth-Token': token, "Content-Type": "application/json"} - res = requests.post(url, data=data, headers=headers, stream=True) - - if res.status_code != 200: - raise Exception("Error when submitting a job to the JOBM API.\nHttp Response from JOBM API returned status code " + str(res.status_code) + ":\n" + res.content.decode()); - else: - return (json.loads(res.content.decode())).get('id') - else: - raise Exception("User token is not defined. First log into SciServer.") - - def cancelJob(jobId): """ Cancels the execution of a job. 
diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py index ef59e6d..1dc8d28 100644 --- a/py3/SciServer/SciQuery.py +++ b/py3/SciServer/SciQuery.py @@ -7,7 +7,7 @@ from datetime import datetime def _get_default_rdb_domain(): - rdb_domains = Jobs.getRDBComputeDomainsNames() + rdb_domains = getRDBComputeDomainNames() if len(rdb_domains) > 0: return rdb_domains[0] else: @@ -204,7 +204,6 @@ def __repr__(self): @lru_cache(128) def _get_file_service(file_service_id=""): - print(file_service_id) file_services = Files.getFileServices(verbose=False) for file_service in file_services: if file_service["name"] == file_service_id or file_service["identifier"] == file_service_id: @@ -219,33 +218,32 @@ def _get_file_service(file_service_id=""): def submitQueryJob(sqlQuery, rdbComputeDomain=None, databaseContextName=None, - output_targets=FileOutput.get_default(), + outputTargets=FileOutput.get_default(), resultsFolderPath="", jobAlias="", - file_service_name=""): + fileServiceName=""): """ Submits a sql query for execution (as an asynchronous job) inside a relational database (RDB) compute domain. :param sqlQuery: sql query (string) :param rdbComputeDomain: object (dictionary) that defines a relational database (RDB) compute domain. A list of - these kind of objects available to the user is returned by the function Jobs.getRDBComputeDomains(). + these kind of objects available to the user is returned by the function 'getRDBComputeDomains'. :param databaseContextName: database context name (string) on which the sql query is executed. - :param output_targets: object of type SciQuery.OutputTarget defining the output of one or multiple statements + :param outputTargets: object of type SciQuery.OutputTarget defining the output of one or multiple statements within the input query. Could also be a list of OutputTarget objects. :param resultsFolderPath: full path to results folder (string) where query output tables are written into. 
E.g.: /home/idies/workspace/rootVOlume/username/userVolume/jobsFolder . If not set, then a default folder will be set automatically. :param jobAlias: alias (string) of job, defined by the user. - :param file_service_name: name or uuid (string) of FileService where the results folder (resultsFolderPath) is + :param fileServiceName: name or uuid (string) of FileService where the results folder (resultsFolderPath) is going to be created. If not defined, then the first available FileService is chosen by default. :return: the ID (integer) that labels the job. :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if the HTTP request to the SciQuery API returns an error, or if the volumes defined by the user are not available in the Docker compute domain. - :example: job_id = SciQuery.submitQueryJob('select 1';,None, None, 'myQueryResults', 'myNewJob') + :example: job_id = SciQuery.submitQueryJob('select 1;') - .. seealso:: Jobs.submitNotebookJob, Jobs.submitShellCommandJob, Jobs.getJobStatus, Jobs.getDockerComputeDomains, - Jobs.cancelJob + .. 
seealso:: SciQuery.submitQueryJob, SciQuery.getJobStatus, SciQuery.getJob """ token = Authentication.getToken() @@ -257,7 +255,7 @@ def submitQueryJob(sqlQuery, taskName = "SciScript-Python.Sciquery.submitQueryJob" if rdbComputeDomain is None: - rdbComputeDomains = Jobs.getRDBComputeDomains(); + rdbComputeDomains = getRDBComputeDomains(); if len(rdbComputeDomains) > 0: rdbComputeDomain = rdbComputeDomains[0]; else: @@ -270,17 +268,17 @@ def submitQueryJob(sqlQuery, else: raise Exception("rbdComputeDomain has no database contexts available for the user."); - if type(output_targets) != list: - output_targets = [output_targets] + if type(outputTargets) != list: + outputTargets = [outputTargets] targets = [] - for target in output_targets: + for target in outputTargets: for index in target.statement_indexes: targets.append({'location': target.target_name, 'type': target.target_type, 'resultNumber': index}) rdbDomainId = rdbComputeDomain.get('id'); - file_service = _get_file_service(file_service_name) + file_service = _get_file_service(fileServiceName) resultsFolderPath = file_service['identifier'] + ":" + resultsFolderPath dockerJobModel = { @@ -292,12 +290,9 @@ def submitQueryJob(sqlQuery, "resultsFolderURI": resultsFolderPath } - print(dockerJobModel) - data = json.dumps(dockerJobModel).encode() url = Config.SciqueryURL + "/api/jobs/" + str(rdbDomainId) + "?TaskName=" + taskName; headers = {'X-Auth-Token': token, "Content-Type": "application/json"} - print(url) res = requests.post(url, data=data, headers=headers, stream=True) if res.status_code < 200 or res.status_code >= 300: @@ -309,19 +304,40 @@ def submitQueryJob(sqlQuery, raise Exception("User token is not defined. 
First log into SciServer.") -def execute_query(query, - rdb_compute_domain=None, - database_context=None, - results_folder_path="", - job_alias="", +def executeQuery(sqlQuery, + rdbComputeDomain=None, + databaseContextName=None, + resultsFolderPath="", + jobAlias="", poll_time=0.2, - file_service_name=""): + fileServiceName=""): + """ + Returns the query result (as a Pandas data frame) of a sql query submitted as a job to a relational database (RDB) compute domain. + + :param sqlQuery: sql query (string) + :param rdbComputeDomain: object (dictionary) that defines a relational database (RDB) compute domain. A list of + these kind of objects available to the user is returned by the function 'getRDBComputeDomains'. + :param databaseContextName: database context name (string) on which the sql query is executed. + :param resultsFolderPath: full path to results folder (string) where query output tables are written into. + E.g.: /home/idies/workspace/rootVOlume/username/userVolume/jobsFolder . If not set, + then a default folder will be set automatically. + :param jobAlias: alias (string) of job, defined by the user. + :param fileServiceName: name or uuid (string) of FileService where the results folder (resultsFolderPath) is + going to be created. If not defined, then the first available FileService is chosen by default. + :return: Pandas data frame containing the result of the query. + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if + the HTTP request to the SciQuery API returns an error. + :example: df = SciQuery.executeQuery('select 1;') + + .. 
seealso:: SciQuery.submitQueryJob, SciQuery.getJobStatus, SciQuery.getJob + Jobs.cancelJob + """ output_target = FileOutput("result1.json", OutputTargetType.FILE_JSON).set_statement_indexes([1]) - jobId = submitQueryJob(sqlQuery=query, rdbComputeDomain=rdb_compute_domain, databaseContextName=database_context, - output_targets=output_target, - resultsFolderPath=results_folder_path, - jobAlias=job_alias, file_service_name=file_service_name) + jobId = submitQueryJob(sqlQuery=sqlQuery, rdbComputeDomain=rdbComputeDomain, databaseContextName=databaseContextName, + outputTargets=output_target, + resultsFolderPath=resultsFolderPath, + jobAlias=jobAlias, fileServiceName=fileServiceName) job_status = Jobs.waitForJob(jobId, verbose=False, pollTime=poll_time) job = RDBJob(jobId) @@ -343,10 +359,157 @@ def execute_query(query, s = Files.download(file_service, path, format="txt", quiet=True) j = json.loads(s) - data = np.asarray(j['Result'][0]['Data']) - column_names = j['Result'][0]['ColumnNames'] - name = j['Result'][0]['TableName'] - - df = pd.DataFrame(data=data, columns=column_names) - df.name = name + result=j['Result'][0] + df=pd.DataFrame(result['Data'],columns=result['ColumnNames']) + df.name = result['TableName'] return df + + +def getRDBComputeDomains(): + """ + Gets a list of all registered Relational Database (RDB) compute domains that the user has access to. + + :return: a list of dictionaries, each one containing the definition of an RDB compute domain. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. + :example: rdb_compute_domains = SciQuery.getRDBComputeDomains(); + + .. 
seealso:: SciQuery.executeQuery, SciQuery.submitQueryJob + """ + token = Authentication.getToken() + if token is not None and token != "": + + if Config.isSciServerComputeEnvironment(): + taskName = "Compute.SciScript-Python.SciQuery.get_compute_domains" + else: + taskName = "SciScript-Python.SciQuery.get_compute_domains" + + url = Config.RacmApiURL + "/jobm/rest/computedomains/rdb?TaskName=" + taskName + headers = {'X-Auth-Token': token, "Content-Type": "application/json"} + res = requests.get(url, headers=headers, stream=True) + if res.status_code != 200: + raise Exception("Error when getting RDB Compute Domains from JOBM API.\nHttp Response from JOBM API returned status code " + str(res.status_code) + ":\n" + res.content.decode()); + else: + return json.loads(res.content.decode()) + else: + raise Exception("User token is not defined. First log into SciServer.") + + +def getRDBComputeDomainNames(rdbComputeDomains=None): + """ + Returns the names of the RDB compute domains available to the user. + + :param rdbComputeDomains: a list of rdbComputeDomain objects (dictionaries), as returned by Jobs.getRDBComputeDomains(). If not set, then an extra internal call to Jobs.getRDBComputeDomains() is made. + :return: an array of strings, each being the name of a rdb compute domain available to the user. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the RACM API returns an error. + :example: dockerComputeDomainsNames = Files.getDockerComputeDomainsNames(); + + .. 
seealso:: Files.getRDBComputeDomains + """ + if rdbComputeDomains is None: + rdbComputeDomains = getRDBComputeDomains(); + + rdbComputeDomainsNames = []; + for rdbComputeDomain in rdbComputeDomains: + rdbComputeDomainsNames.append(rdbComputeDomain.get('name')) + + return rdbComputeDomainsNames; + +def getRDBComputeDomainFromName(rdbComputeDomainName, rdbComputeDomains = None): + """ + Returns an RDBComputeDomain object, given its registered name. + + :param rdbComputeDomainName: name of the RDBComputeDomainName, as shown within the results of Jobs.getRDBComputeDomains() + :param rdbComputeDomains: a list of rdbComputeDomain objects (dictionaries), as returned by Jobs.getRDBComputeDomains(). If not set, then an extra internal call to Jobs.getRDBComputeDomains() is made. + :return: an RDBComputeDomain object (dictionary) that defines an RDB compute domain. A list of these kind of objects available to the user is returned by the function Jobs.getRDBComputeDomains(). + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. + :example: rdbComputeDomain = SciQuery.getRDBComputeDomainFromName(rdbComputeDomainName); + + .. seealso:: SciQuery.getRDBComputeDomains + """ + if rdbComputeDomainName is None: + raise Exception("rdbComputeDomainName is not defined.") + else: + if rdbComputeDomains is None: + rdbComputeDomains = getRDBComputeDomains(); + + if rdbComputeDomains.__len__() > 0: + for rdbComputeDomain in rdbComputeDomains: + if rdbComputeDomainName == rdbComputeDomain.get('name'): + return rdbComputeDomain; + else: + raise Exception("There are no RDBComputeDomains available for the user."); + + raise Exception("RDBComputeDomain of name '" + rdbComputeDomainName + "' is not available or does not exist."); + +def getJobsList(top=10, open=None, start=None, end=None): + """ + Gets the list of SciQuery Jobs submitted by the user. 
+ + :param top: top number of jobs (integer) returned. If top=None, then all jobs are returned. + :param open: If set to 'True', then only returns jobs that have not finished executing and wrapped up (status <= FINISHED). If set to 'False' then only returnes jobs that are still running. If set to 'None', then returns both finished and unfinished jobs. + :param start: The earliest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. If set to 'None', then there is no lower bound on date. + :param end: The latest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. If set to 'None', then there is no upper bound on date. + :return: a list of dictionaries, each one containing the definition of a submitted job. + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request to the JOBM API returns an error. + :example: jobs = SciQuery.getJobsList(top=2); + + .. seealso:: SciQuery,getJob, SciQuery.getJobStatus, SciQuery.cancelJob + """ + job_dict_list = Jobs.getJobsList(top=top, open=open, start=start, end=end, type='rdb') + rdb_job_list = [] + for job_dict in job_dict_list: + rdb_job_list.append(RDBJob(job_dict)) + return rdb_job_list + +def getJob(jobId): + """ + Gets the definition of the job as a RDBJob object. + + :param jobId: Id of job + :return: RDBJob object containing the description or definition of the job. + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request to the JOBM API returns an error. + :example: job = SciQuery.getJob(jobId) + + .. seealso:: SciQuery.getJob, SciQuery.cancelJob, SciQuery.submitQueryJob + """ + return RDBJob(Jobs.getJobDescription(jobId)) + +def getJobStatus(jobId): + """ + Gets a dictionary with the job status as an integer value, together with its semantic meaning. 
The integer value is a power of 2, that is, 1:PENDING, 2:QUEUED, 4:ACCEPTED, 8:STARTED, 16:FINISHED, 32:SUCCESS, 64:ERROR and 128:CANCELED + + :param jobId: Id of job (integer). + :return: dictionary with the integer value of the job status, as well as its semantic meaning. + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request to the JOBM API returns an error. + :example: status = SciQuery.getJobStatus(jobId) + + .. seealso:: SciQuery.cancelJob, SciQuery.waitForJob, SciQuery.getJob, SciQuery.cancelJob + """ + return Jobs.getJobStatus(jobId) + +def cancelJob(jobId): + """ + Cancels the execution of a job. + + :param jobId: Id of the job (integer) + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if the HTTP request to the JOBM API returns an error. + :example: SciQuery.cancelJob(jobId); + + .. seealso:: SciQuery.getJobStatus, SciQuery.getJobDescription + """ + Jobs.cancelJob(jobId) + +def waitForJob(jobId, verbose=False, pollTime = 5): + """ + Queries the job status regularly and waits until the job is completed. + + :param jobId: id of job (integer) + :param verbose: if True, will print "wait" messages on the screen while the job is still running. If False, will suppress the printing of messages on the screen. + :param pollTime: idle time interval (integer, in seconds) before querying again for the job status. Minimum value allowed is 5 seconds. + :return: After the job is finished, returns a dictionary object containing the job status. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. + :example: jobStatus = SciQuery.waitForJob(jobId) + + .. 
seealso:: SciQuery.getJobStatus, SciQuery.getJobDescription + """ + return Jobs.waitForJob(jobId=jobId, verbose=verbose, pollTime = pollTime) \ No newline at end of file From 05aa804f5c16660a9f7c5b41a32a95b50e82c023 Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Thu, 24 Feb 2022 09:09:08 -0500 Subject: [PATCH 03/13] Added metadata functions and improved docs. --- py3/SciServer/SciQuery.py | 517 +++++++++++++++++++++++++++++++++++--- 1 file changed, 480 insertions(+), 37 deletions(-) diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py index 1dc8d28..c1eddae 100644 --- a/py3/SciServer/SciQuery.py +++ b/py3/SciServer/SciQuery.py @@ -31,8 +31,10 @@ def __init__(self, target_name: str = "result.json", target_type: str = OutputTa statement_indexes: list = [1]): """ :param target_name: name of the file (string), such as "result.json" - :param target_type: type (string) of the file containing the query result(s) (e.g., "FILE_JSON"). As set of possible values is given by the static members of class 'SciQuery.OutputTargetType' - :param statement_indexes: list of integers or integer. Each integer value denotes the index or position (>=1) of the sql statements whithin the input query, whose resultset is going to be written into this OutputTarget + :param target_type: type (string) of the file containing the query result(s) (e.g., "FILE_JSON"). + As set of possible values is given by the static members of class 'SciQuery.OutputTargetType' + :param statement_indexes: list of integers or integer. 
Each integer value denotes the index or position (>=1) + of the sql statements whithin the input query, whose resultset is going to be written into this OutputTarget """ if type(target_name) != str or type(target_type) != str: raise ValueError("Invalid type(s) for input parameter(s) 'target_name' or 'target_type'") @@ -42,8 +44,10 @@ def __init__(self, target_name: str = "result.json", target_type: str = OutputTa def set_statement_indexes(self, statement_indexes: list = [1]): """ - Sets the index(es) of the sql statement(s) whithin the input query, whose resultset(s) is(are) going to be written into this OutputTarget. - :param statement_indexes: list of integers, which are the indices (starting with 1) of the sql statements whithin the input query, whose resultsets are going to be written into this OutputTarget. + Sets the index(es) of the sql statement(s) whithin the input query, whose resultset(s) is(are) going to + be written into this OutputTarget. + :param statement_indexes: list of integers, which are the indices (starting with 1) of the sql statements + within the input query, whose resultsets are going to be written into this OutputTarget. """ if type(statement_indexes) != list: statement_indexes = [statement_indexes] @@ -56,7 +60,8 @@ def set_statement_indexes(self, statement_indexes: list = [1]): @classmethod def get_default(cls): """ - Gets a OutputTarget object filled with default values: JSON output file where only the 1st SQL statement of the query is written in it. + Gets a OutputTarget object filled with default values: JSON output file where only the 1st SQL statement of + the query is written in it. 
""" cls.target_name = "result.json" cls.target_type = OutputTargetType.FILE_JSON @@ -82,10 +87,13 @@ def __init__(self, table: str = "resultTable", database: str = "", rdb_domain: s statement_indexes: list = [1]): """ :param table: name of the database table (string), such as "resultTable" - :param database: name of the database (string) where the output table in created. If it is owned explicitly by a user, then it should follow the pattern "mydb:username" - :param rdb_domain: name (string) of the relational database (RDB) compute domain that contains the database. Name of such domains available to the user is returned by the function Jobs.getRDBComputeDomainNames(). + :param database: name of the database (string) where the output table in created. If it is owned explicitly by + a user, then it should follow the pattern "mydb:username" + :param rdb_domain: name (string) of the relational database (RDB) compute domain that contains the database. + Name of such domains available to the user is returned by the function Jobs.getRDBComputeDomainNames(). :param schema: database schema (string) - :param statement_indexes: list of integers or integer. Each integer value denotes the index or position (>=1) of the sql statements whithin the input query, whose resultset is going to be written into this OutputTarget + :param statement_indexes: list of integers or integer. 
Each integer value denotes the index or position (>=1) + of the sql statements whithin the input query, whose resultset is going to be written into this OutputTarget """ if type(table) != str or type(rdb_domain) != str or type(schema) != str: raise ValueError("Invalid type(s) for input parameter(s) 'target_name' or 'target_type'") @@ -100,8 +108,10 @@ def __init__(self, table: str = "resultTable", database: str = "", rdb_domain: s def set_statement_indexes(self, statement_indexes: list = [1]): """ - Sets the index(es) of the sql statement(s) whithin the input query, whose resultset(s) is(are) going to be written into this OutputTarget. - :param statement_indexes: list of integers, which are the indices (starting with 1) of the sql statements whithin the input query, whose resultsets are going to be written into this OutputTarget. + Sets the index(es) of the sql statement(s) whithin the input query, whose resultset(s) is(are) going to be + written into this OutputTarget. + :param statement_indexes: list of integers, which are the indices (starting with 1) of the sql statements + within the input query, whose resultsets are going to be written into this OutputTarget. """ if type(statement_indexes) != list: statement_indexes = [statement_indexes] @@ -114,7 +124,8 @@ def set_statement_indexes(self, statement_indexes: list = [1]): @classmethod def get_default(cls): """ - Gets a OutputTarget object filled with default values: JSON output file where only the 1st SQL statement of the query is written in it. + Gets a OutputTarget object filled with default values: JSON output file where only the 1st SQL statement of + the query is written in it. 
""" cls.target_name = "resultTable" cls.database = "mydb:" + Authentication.keystoneUser.userName @@ -125,12 +136,12 @@ def get_default(cls): return cls def __str__(self): - return "Database Table Output of table = {}, database= {}, rdb_domain = {}, schema = {}, statement_indexes = {}".format( - self.table, self.database, self.rdb_domain, self.schema, self.statement_indexes) + return "Database Table Output of table= {}, database= {}, rdb_domain= {}, schema= {}, statement_indexes= {}"\ + .format(self.table, self.database, self.rdb_domain, self.schema, self.statement_indexes) def __repr__(self): - return "DatabaseTableOutput(table = {}, database= {}, rdb_domain = {}, schema = {}, statement_indexes = {})".format( - self.table, self.database, self.rdb_domain, self.schema, self.statement_indexes) + return "DatabaseTableOutput(table= {}, database= {}, rdb_domain= {}, schema= {}, statement_indexes= {})"\ + .format(self.table, self.database, self.rdb_domain, self.schema, self.statement_indexes) class RDBJob: @@ -294,7 +305,6 @@ def submitQueryJob(sqlQuery, url = Config.SciqueryURL + "/api/jobs/" + str(rdbDomainId) + "?TaskName=" + taskName; headers = {'X-Auth-Token': token, "Content-Type": "application/json"} res = requests.post(url, data=data, headers=headers, stream=True) - if res.status_code < 200 or res.status_code >= 300: raise Exception("Error when submitting a job to the SciQuery API.\nHttp Response from SciQuery API " + "returned status code " + str(res.status_code) + ":\n" + res.content.decode()); @@ -312,7 +322,8 @@ def executeQuery(sqlQuery, poll_time=0.2, fileServiceName=""): """ - Returns the query result (as a Pandas data frame) of a sql query submitted as a job to a relational database (RDB) compute domain. + Returns the query result (as a Pandas data frame) of a sql query submitted as a job to a + relational database (RDB) compute domain. 
:param sqlQuery: sql query (string) :param rdbComputeDomain: object (dictionary) that defines a relational database (RDB) compute domain. A list of @@ -370,7 +381,8 @@ def getRDBComputeDomains(): Gets a list of all registered Relational Database (RDB) compute domains that the user has access to. :return: a list of dictionaries, each one containing the definition of an RDB compute domain. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. :example: rdb_compute_domains = SciQuery.getRDBComputeDomains(); .. seealso:: SciQuery.executeQuery, SciQuery.submitQueryJob @@ -387,7 +399,8 @@ def getRDBComputeDomains(): headers = {'X-Auth-Token': token, "Content-Type": "application/json"} res = requests.get(url, headers=headers, stream=True) if res.status_code != 200: - raise Exception("Error when getting RDB Compute Domains from JOBM API.\nHttp Response from JOBM API returned status code " + str(res.status_code) + ":\n" + res.content.decode()); + raise Exception("Error when getting RDB Compute Domains from JOBM API.\nHttp Response from JOBM API returned" + " status code " + str(res.status_code) + ":\n" + res.content.decode()); else: return json.loads(res.content.decode()) else: @@ -398,9 +411,11 @@ def getRDBComputeDomainNames(rdbComputeDomains=None): """ Returns the names of the RDB compute domains available to the user. - :param rdbComputeDomains: a list of rdbComputeDomain objects (dictionaries), as returned by Jobs.getRDBComputeDomains(). If not set, then an extra internal call to Jobs.getRDBComputeDomains() is made. + :param rdbComputeDomains: a list of rdbComputeDomain objects (dictionaries), as returned by + Jobs.getRDBComputeDomains(). 
If not set, then an extra internal call to Jobs.getRDBComputeDomains() is made. :return: an array of strings, each being the name of a rdb compute domain available to the user. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the RACM API returns an error. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the RACM API returns an error. :example: dockerComputeDomainsNames = Files.getDockerComputeDomainsNames(); .. seealso:: Files.getRDBComputeDomains @@ -418,10 +433,14 @@ def getRDBComputeDomainFromName(rdbComputeDomainName, rdbComputeDomains = None): """ Returns an RDBComputeDomain object, given its registered name. - :param rdbComputeDomainName: name of the RDBComputeDomainName, as shown within the results of Jobs.getRDBComputeDomains() - :param rdbComputeDomains: a list of rdbComputeDomain objects (dictionaries), as returned by Jobs.getRDBComputeDomains(). If not set, then an extra internal call to Jobs.getRDBComputeDomains() is made. - :return: an RDBComputeDomain object (dictionary) that defines an RDB compute domain. A list of these kind of objects available to the user is returned by the function Jobs.getRDBComputeDomains(). - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. + :param rdbComputeDomainName: name of the RDBComputeDomainName, as shown within the results of + Jobs.getRDBComputeDomains() + :param rdbComputeDomains: a list of rdbComputeDomain objects (dictionaries), as returned by + Jobs.getRDBComputeDomains(). If not set, then an extra internal call to Jobs.getRDBComputeDomains() is made. + :return: an RDBComputeDomain object (dictionary) that defines an RDB compute domain. 
A list of these kind of objects + available to the user is returned by the function Jobs.getRDBComputeDomains(). + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. :example: rdbComputeDomain = SciQuery.getRDBComputeDomainFromName(rdbComputeDomainName); .. seealso:: SciQuery.getRDBComputeDomains @@ -446,11 +465,16 @@ def getJobsList(top=10, open=None, start=None, end=None): Gets the list of SciQuery Jobs submitted by the user. :param top: top number of jobs (integer) returned. If top=None, then all jobs are returned. - :param open: If set to 'True', then only returns jobs that have not finished executing and wrapped up (status <= FINISHED). If set to 'False' then only returnes jobs that are still running. If set to 'None', then returns both finished and unfinished jobs. - :param start: The earliest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. If set to 'None', then there is no lower bound on date. - :param end: The latest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. If set to 'None', then there is no upper bound on date. + :param open: If set to 'True', then only returns jobs that have not finished executing and wrapped up + (status <= FINISHED). If set to 'False' then only returnes jobs that are still running. If set to 'None', + then returns both finished and unfinished jobs. + :param start: The earliest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. + If set to 'None', then there is no lower bound on date. + :param end: The latest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. + If set to 'None', then there is no upper bound on date. :return: a list of dictionaries, each one containing the definition of a submitted job. 
- :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request to the JOBM API returns an error. + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request + to the JOBM API returns an error. :example: jobs = SciQuery.getJobsList(top=2); .. seealso:: SciQuery,getJob, SciQuery.getJobStatus, SciQuery.cancelJob @@ -467,7 +491,8 @@ def getJob(jobId): :param jobId: Id of job :return: RDBJob object containing the description or definition of the job. - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request to the JOBM API returns an error. + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request + to the JOBM API returns an error. :example: job = SciQuery.getJob(jobId) .. seealso:: SciQuery.getJob, SciQuery.cancelJob, SciQuery.submitQueryJob @@ -476,11 +501,13 @@ def getJob(jobId): def getJobStatus(jobId): """ - Gets a dictionary with the job status as an integer value, together with its semantic meaning. The integer value is a power of 2, that is, 1:PENDING, 2:QUEUED, 4:ACCEPTED, 8:STARTED, 16:FINISHED, 32:SUCCESS, 64:ERROR and 128:CANCELED + Gets a dictionary with the job status as an integer value, together with its semantic meaning. The integer value is + a power of 2, that is, 1:PENDING, 2:QUEUED, 4:ACCEPTED, 8:STARTED, 16:FINISHED, 32:SUCCESS, 64:ERROR, 128:CANCELED :param jobId: Id of job (integer). :return: dictionary with the integer value of the job status, as well as its semantic meaning. - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request to the JOBM API returns an error. + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request + to the JOBM API returns an error. 
:example: status = SciQuery.getJobStatus(jobId) .. seealso:: SciQuery.cancelJob, SciQuery.waitForJob, SciQuery.getJob, SciQuery.cancelJob @@ -492,7 +519,8 @@ def cancelJob(jobId): Cancels the execution of a job. :param jobId: Id of the job (integer) - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if the HTTP request to the JOBM API returns an error. + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if + the HTTP request to the JOBM API returns an error. :example: SciQuery.cancelJob(jobId); .. seealso:: SciQuery.getJobStatus, SciQuery.getJobDescription @@ -504,12 +532,427 @@ def waitForJob(jobId, verbose=False, pollTime = 5): Queries the job status regularly and waits until the job is completed. :param jobId: id of job (integer) - :param verbose: if True, will print "wait" messages on the screen while the job is still running. If False, will suppress the printing of messages on the screen. - :param pollTime: idle time interval (integer, in seconds) before querying again for the job status. Minimum value allowed is 5 seconds. + :param verbose: if True, will print "wait" messages on the screen while the job is still running. If False, it will + suppress the printing of messages on the screen. + :param pollTime: idle time interval (integer, in seconds) before querying again for the job status. Minimum value + allowed is 5 seconds. :return: After the job is finished, returns a dictionary object containing the job status. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. 
:example: jobStatus = SciQuery.waitForJob(jobId) .. seealso:: SciQuery.getJobStatus, SciQuery.getJobDescription """ - return Jobs.waitForJob(jobId=jobId, verbose=verbose, pollTime = pollTime) \ No newline at end of file + return Jobs.waitForJob(jobId=jobId, verbose=verbose, pollTime = pollTime) + + +def getDatabasesMetadata(rdbComputeDomain, format="pandas"): + """ + Gets metadata (name and description) of databases in an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) that + defines a RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" for + a dictionary object. + :return: pandas dataframe or dict (depending of the value of 'param') with associated metadata. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: metadata = SciQuery.getDatabasesMetadata(rdbComputeDomainName); + + .. seealso:: SciQuery.getDatabaseNames + """ + + if type(rdbComputeDomain) == str: + rdbComputeDomain = getRDBComputeDomainFromName(rdbComputeDomain) + databaseContexts = rdbComputeDomain.get("databaseContexts") + if format == "dict": + return databaseContexts + columnNames = ['database_name', 'database_description'] + data = [] + for i in range(len(databaseContexts)): + data.append([databaseContexts[i]['name'], databaseContexts[i]['description']]) + df = pd.DataFrame(data=data, columns=columnNames) + return df + + +def getDatabaseNames(rdbComputeDomain): + """ + Gets a list of the names of databases in an RDBComputeDomain + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) that + defines an RDBComputeDomain. 
A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :return: array of database names (strings) + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: dbnames = SciQuery.getDatabaseNames(rdbComputeDomainName); + + .. seealso:: SciQuery.getDatabasesMetadata + """ + databases = getDatabasesMetadata(rdbComputeDomain, format="pandas") + return [name for name in databases['database_name']] + +def getRDBComputeDomainsMetadata(format="pandas", includeDatabases=False): + """ + Gets metadata related to all relational database (RDB) compute domains (RDBComputeDomains) available to the user. + + :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" for + a dictionary object. + :param includeDatabases: Boolean parameter. If True, it will return metadata related to all available databases in + each RDBComputeDomain as well. + :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: metadata = SciQuery.getRDBComputeDomainsMetadata(); + + .. 
seealso:: SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames + """ + rdbComputeDomains = getRDBComputeDomains() + if format == "dict": + return rdbComputeDomains + columnNames = ['domain_name', 'domain_description'] + if includeDatabases: + columnNames.append("database_name") + columnNames.append("database_description") + data = [] + for i in range(len(rdbComputeDomains)): + domName = rdbComputeDomains[i].get('name') + domDescr = rdbComputeDomains[i].get('description') + dbs = rdbComputeDomains[i].get('databaseContexts') + if includeDatabases: + for j in range(len(dbs)): + data.append([domName, domDescr, dbs[j].get('name'), dbs[j].get('description')]) + else: + data.append([domName, domDescr]) + + df = pd.DataFrame(data=data, columns=columnNames) + return df + + + + +class MetadataType: + """ + Contains a set of metadata types. + """ + TABLES = "TABLES" + VIEWS = "VIEWS" + COLUMNS = "COLUMNS" + ROUTINES = "ROUTINES" + CONSTRAINTS = "CONSTRAINTS" + PARAMETERS = "PARAMETERS" + + +def _getMetadata(rdbComputeDomain, databaseContextName, resourceName="", metadataType=None, format="pandas"): + """ + Utility function for the use of other metadata functions. + + :param rdbComputeDomain: object (dictionary) that defines a relational database (RDB) compute domain. A list of + these kind of objects available to the user is returned by the function 'getRDBComputeDomains'. + :param databaseContextName: database context name (string) on which the sql query is executed. + :return: the ID (integer) that labels the job. + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if + the HTTP request to the SciQuery API returns an error, or if the volumes defined by the user are not available in + the Docker compute domain. + :example: job_id = SciQuery.submitQueryJob('select 1;') + + .. 
seealso:: SciQuery.submitQueryJob, SciQuery.getJobStatus, SciQuery.getJob + """ + + token = Authentication.getToken() + if token is not None and token != "": + + if metadataType not in [a for a in dir(MetadataType) if not a.startswith("__")]: + raise ValueError("Invalid value of metadataType paramter") + + if format not in ["pandas", "dict"]: + raise ValueError("Invalid value of format paramter") + + if Config.isSciServerComputeEnvironment(): + taskName = "Compute.SciScript-Python.Sciquery.getMetadata_" + metadataType + else: + taskName = "SciScript-Python.Sciquery.getMetadata_" + metadataType + + if type(rdbComputeDomain) == str: + rdbComputeDomain = getRDBComputeDomainFromName(rdbComputeDomain) + + rdbComputeDomainId = rdbComputeDomain.get("id") + + url = Config.SciqueryURL + "/api/metadata/{0}/{1}/".format(rdbComputeDomainId, databaseContextName); + if metadataType == MetadataType.TABLES: + url += "tables" + elif metadataType == MetadataType.VIEWS: + url += "views" + elif metadataType == MetadataType.ROUTINES: + url += "routines" + elif metadataType == MetadataType.COLUMNS: + url += "{0}/{1}".format(resourceName, "columns") + elif metadataType == MetadataType.PARAMETERS: + url += "{0}/{1}".format(resourceName, "parameters") + elif metadataType == MetadataType.CONSTRAINTS: + url += "{0}/{1}".format(resourceName, "constraints") + else: + raise ValueError("Wrong metadataType parameter value of " + metadataType) + + url += "?taskName=" + taskName + + headers = {'X-Auth-Token': token} + res = requests.get(url, headers=headers, stream=True) + + if res.status_code < 200 or res.status_code >= 300: + raise Exception("Error when getting metadata from SciQuery API.\nHttp Response from SciQuery API " + + "returned status code " + str(res.status_code) + ":\n" + res.content.decode()); + else: + res = json.loads(res.content.decode()) + result = res['Result'][0] + if format == "pandas": + df = pd.DataFrame(result['Data'], columns=[c.upper() for c in result['ColumnNames']]) + 
df.name = result['TableName'] + return df + else: + return result + else: + raise Exception("User token is not defined. First log into SciServer.") + + +def getTablesMetadata(rdbComputeDomain, databaseContextName, format="pandas"): + """ + Gets metadata related to tables in a particular database belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) + that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, + or "dict" for a dictionary object. + :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: metadata = SciQuery.getTablesMetadata(rdbComputeDomain, databaseContextName) + + .. seealso:: SciQuery.getTableNames, SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames + """ + return _getMetadata(rdbComputeDomain, databaseContextName, metadataType=MetadataType.TABLES, format=format) + +def getTableNames(rdbComputeDomain, databaseContextName): + """ + Gets a list of the names of tables in a particular database belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) + that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. 
+ :return: array of table names (strings) + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: tablenames = getTableNames(rdbComputeDomain, databaseContextName) + + .. seealso:: SciQuery.getTablesMetadata, SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames + """ + tables = getTablesMetadata(rdbComputeDomain, databaseContextName, format="pandas") + return [name for name in tables['TABLE_NAME']] + + +def getViewsMetadata(rdbComputeDomain, databaseContextName, format="pandas"): + """ + Gets metadata related to views in a particular database belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) that + defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" for + a dictionary object. + :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: metadata = SciQuery.getViewsMetadata(rdbComputeDomain, databaseContextName) + + .. seealso:: SciQuery.getTablesMetadata, SciQuery.getViewNames, SciQuery.getDatabasesMetadata + """ + return _getMetadata(rdbComputeDomain, databaseContextName, metadataType=MetadataType.VIEWS, format=format) + + +def getViewNames(rdbComputeDomain, databaseContextName): + """ + Gets a list of the names of views in a particular database belonging to an RDBComputeDomain. 
+ + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) + that defines an RDBComputeDomain. A list of + these kind of objects available to the user is returned by the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :return: array of view names (strings) + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: viewnames = SciQuery.getViewNames(rdbComputeDomain, databaseContextName) + + .. seealso:: SciQuery.getViewsMetadata, SciQuery.getTablesMetadata, SciQuery.getDatabasesMetadata + """ + tables = getViewsMetadata(rdbComputeDomain, databaseContextName, format="pandas") + return [name for name in tables['TABLE_NAME']] + + +def getRoutinesMetadata(rdbComputeDomain, databaseContextName, format="pandas"): + """ + Gets metadata related to routines or functions in a particular database belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) that + defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" for + a dictionary object. + :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: metadata = SciQuery.getRoutinesMetadata(rdbComputeDomain, databaseContextName) + + .. 
seealso:: SciQuery.getTablesMetadata, SciQuery.getViewsMetadata, SciQuery.getDatabasesMetadata + """ + return _getMetadata(rdbComputeDomain, databaseContextName, metadataType=MetadataType.ROUTINES, format=format) + + +def getRoutineNames(rdbComputeDomain, databaseContextName): + """ + Gets a list of the names of routines or functions in a particular database belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) + that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :return: array of routine names (strings) + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: routinenames = getRoutineNames(rdbComputeDomain, databaseContextName) + + .. seealso:: SciQuery.getTableNames, SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames + """ + routines = getRoutinesMetadata(rdbComputeDomain, databaseContextName, format="pandas") + return [routineName for routineName in routines['ROUTINE_NAME']] + + +def getColumnsMetadata(rdbComputeDomain, databaseContextName, tableName, format="pandas"): + """ + Gets metadata related to columns in a particular database table belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) + that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :param tableName: name (string) of the database table. + :param format: String that defines the returned format. 
Either "pandas" for a pandas dataframe, or "dict" for + a dictionary object. + :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: metadata = SciQuery.getColumnsMetadata(rdbComputeDomain, databaseContextName, tableName) + + .. seealso:: SciQuery.getTablesMetadata, SciQuery.getViewsMetadata, SciQuery.getDatabasesMetadata + """ + return _getMetadata(rdbComputeDomain, databaseContextName, tableName, metadataType=MetadataType.COLUMNS, + format=format) + + +def getColumnNames(rdbComputeDomain, databaseContextName, tableName): + """ + Gets a list of the names of table columns in a particular database belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) + that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :param tableName: name (string) of the database table + :return: array of column names (strings) + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: columnnames = SciQuery.getColumnNames(rdbComputeDomain, databaseContextName, tableName) + + .. 
seealso:: SciQuery.getTableNames, SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames + """ + columns = getColumnsMetadata(rdbComputeDomain, databaseContextName, tableName, format="pandas") + return [columnName for columnName in columns['COLUMN_NAME']] + + +def getConstraintsMetadata(rdbComputeDomain, databaseContextName, tableName, format="pandas"): + """ + Gets metadata related to table constraints in a particular database table belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) + that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :param tableName: name (string) of the database table. + :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" for + a dictionary object. + :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: metadata = SciQuery.getConstraintsMetadata(rdbComputeDomain, databaseContextName, tableName) + + .. seealso:: SciQuery.getTablesMetadata, SciQuery.getColumnsMetadata, SciQuery.getDatabasesMetadata + """ + return _getMetadata(rdbComputeDomain, databaseContextName, tableName, metadataType=MetadataType.CONSTRAINTS, + format=format) + + +def getConstraintNames(rdbComputeDomain, databaseContextName, tableName): + """ + Gets a list of the names of table constraints in a particular database belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) + that defines an RDBComputeDomain. 
A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :param tableName: name (string) of the database table. + :return: array of constraint names (strings) + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: names = SciQuery.getConstraintNames(rdbComputeDomain, databaseContextName, tableName) + + .. seealso:: SciQuery.getTableNames, SciQuery.getColumnNames, SciQuery.getDatabasesMetadata + """ + constraints = getConstraintsMetadata(rdbComputeDomain, databaseContextName, tableName, format="pandas") + return [constraintName for constraintName in constraints['CONSTRAINT_NAME']] + + +def getRoutineParametersMetadata(rdbComputeDomain, databaseContextName, routineName, format="pandas"): + """ + Gets metadata related to routine parameters in a particular database belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) + that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :param routineName: name (string) of the routine or function. + :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" + for a dictionary object. + :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. 
+ :example: metadata = SciQuery.getRoutineParametersMetadata(rdbComputeDomain, databaseContextName, routineName) + + .. seealso:: SciQuery.getTablesMetadata, SciQuery.getColumnsMetadata, SciQuery.getDatabasesMetadata + """ + return _getMetadata(rdbComputeDomain, databaseContextName, routineName, metadataType=MetadataType.PARAMETERS, + format=format) + + +def getRoutineParameterNames(rdbComputeDomain, databaseContextName, routineName): + """ + Gets a list of the names of routine parameters in a particular database belonging to an RDBComputeDomain. + + :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) + that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by + the function 'getRDBComputeDomains'. + :param databaseContextName: name (string) of the database. + :param routineName: name (string) of the routine or function. + :return: array of parameter names (strings) + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: names = SciQuery.getRoutineParameterNames(rdbComputeDomain, databaseContextName, routineName) + + .. seealso:: SciQuery.getTableNames, SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames + """ + parameters = getRoutineParametersMetadata(rdbComputeDomain, databaseContextName, routineName, format="pandas") + return [parametersName for parametersName in parameters['SPECIFIC_NAME']] From 2decb39e9c38919734bcc0b629416f4f2909c71a Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Tue, 8 Mar 2022 10:52:16 -0500 Subject: [PATCH 04/13] Fixed job messages in executeQuery function. 
--- py3/SciServer/SciQuery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py index c1eddae..80bcb37 100644 --- a/py3/SciServer/SciQuery.py +++ b/py3/SciServer/SciQuery.py @@ -353,7 +353,7 @@ def executeQuery(sqlQuery, job_status = Jobs.waitForJob(jobId, verbose=False, pollTime=poll_time) job = RDBJob(jobId) if job.status > 32: - messages = ". ".join(job.messages) if len(job.messages) > 0 else "" + messages = ". ".join([j.get('content') for j in job.messages if j.get('content') is not None]) if (job.status == 64): raise Exception("Query ended with an error. " + messages) if (job.status == 128): From 905faf5fd8a77dc3791a2c20e62bc1e6374181e4 Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Thu, 2 Jun 2022 09:38:22 -0400 Subject: [PATCH 05/13] Pythonifying and adding functionality with more classes. --- py3/SciServer/Config.py | 9 +- py3/SciServer/SciQuery.py | 2007 +++++++++++++++++++++++-------------- 2 files changed, 1256 insertions(+), 760 deletions(-) diff --git a/py3/SciServer/Config.py b/py3/SciServer/Config.py index afb993e..b241298 100644 --- a/py3/SciServer/Config.py +++ b/py3/SciServer/Config.py @@ -22,6 +22,8 @@ - **Config.ComputeUrl**: defines the base URL of the Compute webapp (string). E.g., "https://apps.sciserver.org/compute". +- **Config.SciqueryURL**: defines the base URL of the SciQuery web API (string). E.g., "https://apps.sciserver.org/sciquery-api". + - **Config.version**: defines the SciServer release version tag (string), to which this package belongs. E.g., "sciserver-v1.9.3" """ # URLs for accessing SciServer web services (API endpoints) @@ -36,16 +38,17 @@ version = "sciserver-v2.1.0" #sciserver release version ComputeJobDirectoryFile = "/home/idies/jobs.path" #the path to the file in the "Docker job container" that shows the directory path where the asynchronous compute job is being executed. 
ComputeUrl = "https://apps.sciserver.org/compute" -SciqueryURL = "https://apps.sciserver.org/sciquery" +SciqueryURL = "https://apps.sciserver.org/sciquery-api" ComputeWorkDir = "/home/idies/workspace/" + def _load_config(filename): if os.path.exists(filename): with open(filename) as f: _config_data = json.load(f) global CasJobsRESTUri, AuthenticationURL, SciDriveHost, SkyQueryUrl, SkyServerWSurl global RacmApiURL, DataRelease, KeystoneTokenPath, version, ComputeJobDirectoryFile - global ComputeUrl + global ComputeUrl, SciqueryURL, ComputeWorkDir CasJobsRESTUri = _config_data.get('CasJobsRESTUri', CasJobsRESTUri) AuthenticationURL = _config_data.get('AuthenticationURL', AuthenticationURL) SciDriveHost = _config_data.get('SciDriveHost', SciDriveHost) @@ -57,6 +60,8 @@ def _load_config(filename): version = _config_data.get('version', version) ComputeJobDirectoryFile = _config_data.get('ComputeJobDirectoryFile', ComputeJobDirectoryFile) ComputeUrl = _config_data.get('ComputeUrl', ComputeUrl) + SciqueryURL = _config_data.get('SciqueryURL', SciqueryURL) + ComputeWorkDir = _config_data.get('ComputeWorkDir', ComputeWorkDir) _CONFIG_DIR = os.environ.get('XDG_CONFIG_HOME', os.path.join(os.path.expanduser('~'), '.config')) _SCISERVER_SYSTEM_CONFIG_DIR = '/etc/' # will not likely exist on non *nix systems diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py index 80bcb37..cec1a25 100644 --- a/py3/SciServer/SciQuery.py +++ b/py3/SciServer/SciQuery.py @@ -1,20 +1,17 @@ from SciServer import Authentication, Config, Files, Jobs import pandas as pd -import numpy as np import json import requests -from functools import lru_cache +import cachetools.func +from collections.abc import Iterable from datetime import datetime - -def _get_default_rdb_domain(): - rdb_domains = getRDBComputeDomainNames() - if len(rdb_domains) > 0: - return rdb_domains[0] - else: - raise Exception("There are no rdbComputeDomains available for the user."); +import warnings +from pathlib import 
Path +from typing import Union, List +import time -class OutputTargetType: +class OutputType: """ Contains a set of allowed database output types. """ @@ -23,103 +20,203 @@ class OutputTargetType: DATABASE_TABLE = "TABLE" -class FileOutput: +@cachetools.func.ttl_cache(maxsize=128, ttl=120) +def _get_file_service(file_service: str = None): + file_services = Files.getFileServices(verbose=False) + if file_service is None: + if len(file_services) > 0: + return file_services[0] + raise Exception("No file services available for the user.") + else: + for fs in file_services: + if file_service == fs.get("name") or file_service == fs.get("identifier"): + return fs + raise Exception("Unable to find fileService") + + +class Output: """ - Defines the output of a database query to a file. + Base class for output objects """ - def __init__(self, target_name: str = "result.json", target_type: str = OutputTargetType.FILE_JSON, - statement_indexes: list = [1]): - """ - :param target_name: name of the file (string), such as "result.json" - :param target_type: type (string) of the file containing the query result(s) (e.g., "FILE_JSON"). - As set of possible values is given by the static members of class 'SciQuery.OutputTargetType' - :param statement_indexes: list of integers or integer. 
Each integer value denotes the index or position (>=1) - of the sql statements whithin the input query, whose resultset is going to be written into this OutputTarget - """ - if type(target_name) != str or type(target_type) != str: - raise ValueError("Invalid type(s) for input parameter(s) 'target_name' or 'target_type'") - self.target_name = target_name - self.target_type = target_type + def __init__(self, + name: str = "output.json", + output_type: str = OutputType.FILE_JSON, + statement_indexes: Union[int, List[int]] = 1): + + if type(name) != str: + raise TypeError("Invalid type for input parameter 'name'.") + if type(output_type) != str: + raise TypeError("Invalid type for input parameter 'output_type'.") + if type(statement_indexes) not in [list, int]: + raise TypeError("Invalid type for input parameter 'statement_indexes'.") + self.name = name + self.output_type = output_type + self.statement_indexes = None self.set_statement_indexes(statement_indexes) - def set_statement_indexes(self, statement_indexes: list = [1]): + def set_statement_indexes(self, statement_indexes: Union[int, List[int]] = 1): """ - Sets the index(es) of the sql statement(s) whithin the input query, whose resultset(s) is(are) going to - be written into this OutputTarget. - :param statement_indexes: list of integers, which are the indices (starting with 1) of the sql statements - within the input query, whose resultsets are going to be written into this OutputTarget. + Sets the index(es) of the sql statement(s) within the input query, whose result-set(s) is(are) going to be + written into this Output. + :param statement_indexes: integer or list of integers, which are the indices (starting with 1) of the sql + statements within the input query, whose resultsets are going to be written into this Output. 
""" - if type(statement_indexes) != list: + if not isinstance(statement_indexes, Iterable): statement_indexes = [statement_indexes] for index in statement_indexes: if type(index) != int or index <= 0: - raise ValueError("Invalid type for input parameter 'statement_indexes'") + raise TypeError("Invalid type for input parameter 'statement_indexes'") self.statement_indexes = [i for i in sorted(set(statement_indexes))] - return self + + def __str__(self): + return "Output of name = {}, type= {}, statement_indexes = {}".format(self.name, self.output_type, + self.statement_indexes) + def __repr__(self): + return "Output(name = {}, type= {}, statement_indexes = {})".format(self.name, self.output_type, + self.statement_indexes) + + +class FileOutput(Output): + """ + Defines the output of a database query into a file. + """ + + def __init__(self, + name: str = "output.json", + output_type: str = OutputType.FILE_JSON, + statement_indexes: Union[int, List[int]] = 1, + file_service: str = None): + """ + :param name: name of the file (string), such as "result.json" + :param output_type: type (string) of the file containing the query result(s) (e.g., "FILE_JSON"). + As set of possible values is given by the static members of class 'SciQuery.OutputTargetType' + :param statement_indexes: list of integers or integer. Each integer value denotes the index or position (>=1) + of the sql statements within the input query, whose resultset is going to be written into this OutputTarget + :param file_service: string denoting name or identifier of file service where the output file is written into. 
+ """ + if file_service: + file_service = FileOutput.find_file_service(file_service) + self.file_service_name = file_service['name'] + self.file_service_identifier = file_service['identifier'] + else: + self.file_service_name = None + self.file_service_identifier = None + + if not name: + raise NameError("Input parameter name cannot be empty or None") + name = name.rstrip("/") + file_path = Path(name) + if name == file_path.name: # means no path included in 'name' input parameter + self.file_base_path = None + self.file = name + self.path = None + self.file_service_path = None + else: + if not name.startswith(Config.ComputeWorkDir): + file_path = Path(Config.ComputeWorkDir + name) # in case it is relative path + self.file_base_path = str(file_path.parent) + self.file_base_path = self.file_base_path if self.file_base_path.endswith("/") \ + else self.file_base_path + "/" + self.file = file_path.name + self.path = self.file_base_path + self.file + self.file_service_path = self.path.replace(Config.ComputeWorkDir, "", 1) + name = file_path.name + + super().__init__(name, output_type, statement_indexes) + + def get_path(self) -> str: + if self.path: + return self.path + else: + raise Exception("Attribute 'file_base_path' is not set.") + @classmethod def get_default(cls): """ - Gets a OutputTarget object filled with default values: JSON output file where only the 1st SQL statement of + Gets an Output object filled with default values: JSON output file where only the 1st SQL statement of the query is written in it. 
""" - cls.target_name = "result.json" - cls.target_type = OutputTargetType.FILE_JSON - cls.statement_indexes = [1] - return cls + return cls("result.json", OutputType.FILE_JSON, 1) + + @staticmethod + def build_file_base_path(top_volume: str = "Temporary", + user_volume: str = "scratch", + user_volume_owner_name: str = "", + relative_path: str = "sciqueryjobs", + add_date_ending: bool = False) -> str: + if not top_volume: + raise NameError("Input parameter top_volume cannot be empty or None") + + if add_date_ending: + now = datetime.now() + date1 = now.strftime("%Y-%m-%d") + date2 = now.strftime("%Hh%Mm%S.%fs") + relative_path = "{0}/{1}/{2}".format(relative_path.rstrip('/'), date1, date2) + + if user_volume: + if not user_volume_owner_name: + user_volume_owner_name = SciQuery.get_user().userName + path = str(Path(Config.ComputeWorkDir, top_volume, user_volume_owner_name, user_volume, relative_path)) + else: + path = str(Path(Config.ComputeWorkDir, top_volume, relative_path)) + return path if path.endswith("/") else path + "/" + + @staticmethod + def find_file_service(file_service: Union[str, dict] = None) -> dict: + if isinstance(file_service, dict): + file_service = file_service.get("identifier") + return _get_file_service(file_service) def __str__(self): - return "File Output of target_name = {}, target_type= {}, statement_indexes = {}".format(self.target_name, - self.target_type, - self.statement_indexes) + return f"File Output of name = {self.name}, type= {self.output_type}, statement_indexes = " \ + f"{self.statement_indexes}" def __repr__(self): - return "FileOutput(target_name = {}, target_type= {}, statement_indexes = {})".format(self.target_name, - self.target_type, - self.statement_indexes) + return f"FileOutput(name= {self.name}, type= {self.output_type}, statement_indexes = {self.statement_indexes})" -class DatabaseTableOutput: +class DatabaseTableOutput(Output): """ - Defines the output of a database query to a database table + Defines the output of a 
database query into a database table """ - def __init__(self, table: str = "resultTable", database: str = "", rdb_domain: str = "", schema: str = "", - statement_indexes: list = [1]): + + def __init__(self, + table: str = "resultTable", + database: str = None, + statement_indexes: Union[int, List[int]] = 1, + rdb_compute_domain: str = None, + schema: str = ""): + """ - :param table: name of the database table (string), such as "resultTable" + :param table: name of the output database table (string), such as "resultTable" :param database: name of the database (string) where the output table in created. If it is owned explicitly by - a user, then it should follow the pattern "mydb:username" - :param rdb_domain: name (string) of the relational database (RDB) compute domain that contains the database. + a user, then it should follow the pattern "mydb:" + :param statement_indexes: list of integers or integer. Each integer value denotes the index or position (>=1) + :param rdb_compute_domain: name (string) of the relational database (RDB) compute domain that contains the + database, or object of class RDBComputeDomain corresponding to it. Name of such domains available to the user is returned by the function Jobs.getRDBComputeDomainNames(). :param schema: database schema (string) - :param statement_indexes: list of integers or integer. 
Each integer value denotes the index or position (>=1) - of the sql statements whithin the input query, whose resultset is going to be written into this OutputTarget + of the sql statements within the input query, whose resultset is going to be written into this OutputTarget """ - if type(table) != str or type(rdb_domain) != str or type(schema) != str: - raise ValueError("Invalid type(s) for input parameter(s) 'target_name' or 'target_type'") + if type(table) != str or type(schema) != str: + raise TypeError("Input parameter(s) 'table' or 'schema' should be of type string.") + + domain = RDBComputeDomains.get_default_rdb_compute_domain() if not rdb_compute_domain else rdb_compute_domain + if not database: + if type(domain) == str: + database = SciQuery.get_rdb_compute_domains().get_rdb_compute_domain(domain).get_default_database().name + else: + database = domain.get_default_database().name + domain = domain.name + self.table = table - self.database = database if database != "" else "mydb:" + Authentication.keystoneUser.userName - self.rdb_domain = rdb_domain if rdb_domain != "" else _get_default_rdb_domain() + self.database = database + self.rdb_compute_domain_name = domain self.schema = schema - self.set_statement_indexes(statement_indexes) - self.target_name = ".".join([rdb_domain, database, schema, table]) - self.target_type = OutputTargetType.DATABASE_TABLE - - def set_statement_indexes(self, statement_indexes: list = [1]): - """ - Sets the index(es) of the sql statement(s) whithin the input query, whose resultset(s) is(are) going to be - written into this OutputTarget. - :param statement_indexes: list of integers, which are the indices (starting with 1) of the sql statements - within the input query, whose resultsets are going to be written into this OutputTarget. 
- """ - if type(statement_indexes) != list: - statement_indexes = [statement_indexes] - for index in statement_indexes: - if type(index) != int or index <= 0: - raise ValueError("Invalid type for input parameter 'statement_indexes'") - self.statement_indexes = [i for i in sorted(set(statement_indexes))] - return self + name = ".".join([self.rdb_compute_domain_name, self.database, self.schema, self.table]) + super().__init__(name, OutputType.DATABASE_TABLE, statement_indexes) @classmethod def get_default(cls): @@ -127,832 +224,1226 @@ def get_default(cls): Gets a OutputTarget object filled with default values: JSON output file where only the 1st SQL statement of the query is written in it. """ - cls.target_name = "resultTable" - cls.database = "mydb:" + Authentication.keystoneUser.userName - cls.rdb_domain = _get_default_rdb_domain() - cls.schema = "" - cls.target_type = OutputTargetType.DATABASE_TABLE - cls.statement_indexes = [1] - return cls + return cls(table = "resultTable", + database = RDBComputeDomains.get_default_rdb_compute_domain().get_default_database().name, + rdb_compute_domain = RDBComputeDomains.get_default_rdb_compute_domain().name, + schema = "", + statement_indexes = [1]) def __str__(self): - return "Database Table Output of table= {}, database= {}, rdb_domain= {}, schema= {}, statement_indexes= {}"\ - .format(self.table, self.database, self.rdb_domain, self.schema, self.statement_indexes) + return "Database Table Output of table= {}, database= {}, rdb_compute_domain_name= {}, schema= {}, " \ + "statement_indexes= {}".format(self.table, self.database, self.rdb_compute_domain_name, self.schema, + self.statement_indexes) def __repr__(self): - return "DatabaseTableOutput(table= {}, database= {}, rdb_domain= {}, schema= {}, statement_indexes= {})"\ - .format(self.table, self.database, self.rdb_domain, self.schema, self.statement_indexes) + return "DatabaseTableOutput(table= {}, database= {}, rdb_compute_domain_name= {}, schema= {}, " \ + 
"statement_indexes= {})".format(self.table, self.database, self.rdb_compute_domain_name, self.schema, + self.statement_indexes) + + +class Outputs(list): + """ + Contains a list of output objects, defining database query resultset outputs. + """ + + def __init__(self, *outputs): + super().__init__() + for output in outputs: + outs = output if isinstance(output, Iterable) else [output] + for out in outs: + self.append(out) + + def append(self, obj): + if isinstance(obj, Output): + super().append(obj) + else: + raise NameError("Input object is not a subclass of the 'Output' class.") + + def get_target_list(self, file_base_path: str = None, file_service: str = None): + targets = [] + fs = FileOutput.find_file_service(file_service) + for output in self: + for index in output.statement_indexes: + location = output.name + if output.output_type != OutputType.DATABASE_TABLE: # for files + if not output.file_service_identifier: + file_service_identifier = fs.get('identifier') + else: + file_service_identifier = output.file_service_identifier + + if not output.file_base_path: + if file_base_path: + location = file_base_path.rstrip("/") + "/" + output.file + else: + location = FileOutput.build_file_base_path().rstrip("/") + "/" + output.file + else: + location = output.file_base_path.rstrip("/") + "/" + output.file + + location = file_service_identifier + ":" + location; + targets.append({'location': location, 'type': output.output_type, 'resultNumber': index}) + return targets + + @staticmethod + def get_default(): + return Outputs(FileOutput(name="result.json", output_type=OutputType.FILE_JSON, statement_indexes = [1])) class RDBJob: """ - Contains the definition of an RDB job + Contains the definition of a job consisting on a query run in a Relational Database (RDB) """ + _JOB_STATUS_MAP = {1: "PENDING", 2: "QUEUED", 4: "ACCEPTED", 8: "STARTED", 16: "FINISHED", 32: "SUCCESS", + 64: "ERROR", 128: "CANCELED"} + def __init__(self, job): """ - :param job: can be the job ID 
(string), or the + :param job: can be the job ID (string), or a dictionary containing all the attributes of an RDBJob object. """ if type(job) != dict: job = Jobs.getJobDescription(job) - for k, v in job.items(): - setattr(self, k, v) - def get_output_targets(self): + self.id = job.get('id') + self.alias = job.get('submitterDID') if job.get('submitterDID') is not None else job.get('alias') + self._submitter_trust_id = job.get('submitterTrustId') if job.get('submitterTrustId') is not None else \ + job.get('_submitter_trust_id') + self._run_by_uuid = job.get('runByUUID') if job.get('runByUUID') is not None else job.get('_run_by_uuid') + self.submission_time = self._get_datetime(job.get('submissionTime') if job.get('submissionTime') is not None + else job.get('submission_time')) + self.start_time = self._get_datetime(job.get('startTime') if job.get('startTime') is not None + else job.get('start_time')) + self.end_time = self._get_datetime(job.get('endTime') if job.get('endTime') is not None + else job.get('end_time')) + self.duration = job.get('duration') + self.timeout = job.get('timeout') + self._messages = job.get('messages') if job.get('messages') is not None else job.get('_messages') + self.message_list = [m.get("content") for m in self._messages] if self._messages is not None else [] + self.status = job.get('status') + self.status_string = RDBJob.get_job_status(job.get('status')) + self._results_folder_uri = job.get('resultsFolderURI') if job.get('resultsFolderURI') is not None \ + else job.get('_results_folder_uri') + self._type = job.get('type') if job.get('type') is not None else job.get('_type') + self.user_name = job.get('username') if job.get('username') is not None else job.get('user_name') + self.input_sql = job.get('inputSql') if job.get('inputSql') is not None else job.get('input_sql') + self.targets = job.get('targets') + self.database_name = job.get('databaseContextName') \ + if job.get('databaseContextName') is not None else job.get('database_name') 
+ self._rdb_resource_context_uuid = job.get('rdbResourceContextUUID') \ + if job.get('rdbResourceContextUUID') is not None else job.get('_rdb_resource_context_uuid') + self.rdb_compute_domain_name = job.get('rdbDomainName') if job.get('rdbDomainName') is not None \ + else job.get('rdb_compute_domain_name') + self.rdb_compute_domain_id = job.get('rdbDomainId') if job.get('rdbDomainId') is not None \ + else job.get('rdb_compute_domain_id') + self.outputs = self._get_outputs() + self.get_job_status = self._get_job_status_string + + def get_metadata(self, result_format="pandas") -> pd.DataFrame: + data = [] + column_names = [] + if result_format == "pandas": + for attr, value in self.__dict__.items(): + import inspect + if not attr.startswith("_") and not inspect.ismethod(value): # no private members nor methods + data.append(value) + column_names.append(attr) + df = pd.DataFrame([data], columns=column_names) + df.name = str(RDBJob) + return df + elif result_format == "dict": + return self.__dict__ + else: + raise Exception("Invalid value for input parameter 'result_format'.") + + @staticmethod + def get_job_status(status: int) -> str: + return RDBJob._JOB_STATUS_MAP.get(status) + + @staticmethod + def get_job(job_id: int): + return RDBJob(Jobs.getJobDescription(job_id)) + + def cancel(self): + Jobs.cancelJob(self.id) + + def refresh(self): + self.__init__(Jobs.getJobDescription(self.id)) + + def _get_job_status_string(self) -> str: + return RDBJob.get_job_status(self.status) + + def _get_outputs(self) -> Outputs: output_targets = {} for t in self.targets: i = (t['location'], t['type']) if i not in output_targets: output_targets[i] = [t['resultNumber']] else: - output_targets[i] = output_targets[i].append(t['resultNumber']) + output_targets[i].append(t['resultNumber']) - targets = [] + outputs = Outputs() for k in output_targets: - if k[1] == OutputTargetType.DATABASE_TABLE: + if k[1] == OutputType.DATABASE_TABLE: p = k[0].split(".") - 
targets.append(DatabaseTableOutput(table=p[3], database=p[1], rdb_domain=p[0], schema=p[2], + outputs.append(DatabaseTableOutput(table=p[3], database=p[1], rdb_compute_domain=p[0], schema=p[2], statement_indexes=output_targets[k])) else: - targets.append(FileOutput(target_name=k[0], target_type=k[1], statement_indexes=output_targets[k])) + file_parts = k[0].split(":") + file_service_identifier = file_parts[0] + name = file_parts[1] + outputs.append(FileOutput(name=name, output_type=k[1], statement_indexes=output_targets[k], + file_service=file_service_identifier)) + + return outputs + + def _get_output_from_index(self, ind: int): + if ind > len(self.outputs) - 1: + raise ValueError("Index is outside of the index range in the outputs list.") + return self.outputs[ind] + + def get_output_path(self, output: Union[Output, int] = 0) -> str: + out = self._get_output_from_index(output) if isinstance(output, int) else output + if out.output_type == OutputType.DATABASE_TABLE: + raise TypeError("Output is not a file but a database") + return out.get_path() + + def get_output_as_string(self, output: Union[Output, int, str] = None): + if not isinstance(output, str): + out = self._get_output_from_index(output) if isinstance(output, int) else output + file_path = self.get_output_path(out) + else: + file_path = output - # output_targets = [ OutputTarget(l,t,output_targets[(l,t)]) for l,t in output_targets] - # return output_targets - return targets + if Config.isSciServerComputeEnvironment(): + with open(file_path, ) as f: + data = f.read() + else: + if isinstance(output, str): + raise Exception(f"Cannot find file_path {output} in local file system.") + + fs = FileOutput.find_file_service(out.file_service_identifier) + path = out.file_service_path + data = Files.download(fs, path, format="txt", quiet=True) + + return data + + def get_json_output(self, output: Union[Output, int, str] = 0) -> dict: + data_dict = json.loads(self.get_output_as_string(output)) + return 
data_dict.get("Result") + + def get_dataframe_from_output(self, output: Union[Output, int] = 0, result_index: int = 0) -> list: + out = self._get_output_from_index(output) if isinstance(output, int) else output + if out.output_type == OutputType.FILE_JSON: + result = self.get_json_output(out)[result_index] + df = pd.DataFrame(result['Data'], columns=result['ColumnNames']) + df.name = result['TableName'] + elif out.output_type == OutputType.FILE_CSV: + df = pd.read_csv(out.get_path(), skiprows=1) + elif out.output_type == OutputType.DATABASE_TABLE: + sq = SciQuery(rdb_compute_domain=out.rdb_compute_domain_name, database=out.database) + query = f"select * from {out.table};" + df = sq.execute_query(query) + else: + raise Exception(f"Output type {out.output_type} not supported") + return df - def get_results_folder_path(self): - path = ":".join(self.resultsFolderURI.split(":")[1:]) - if not path.startswith(Config.ComputeWorkDir): - path = Config.ComputeWorkDir + path[1:] - return path + def _get_datetime(self, time): + return datetime.fromtimestamp(time / 1000.0) if time is not None else None - def get_output_target_path(self, output_target): - if output_target.target_type == OutputTargetType.DATABASE_TABLE: - raise ValueError("Output target is not a file but a database") - return self.get_results_folder_path() + output_target.target_name + def __str__(self): + return "RDB Job of id={}".format(self.id) - def get_fileservice_folder_path(self): - path = ":".join(self.resultsFolderURI.split(":")[1:]) - if path.startswith(Config.ComputeWorkDir): - path = "/" + path.replace(Config.ComputeWorkDir, "", 1) - return path + def __repr__(self): + return "RDBJob(id={})".format(self.id) - def get_start_time(self): - return datetime.fromtimestamp(self.startTime / 1000.0) - def get_end_time(self): - return datetime.fromtimestamp(self.endTime / 1000.0) +class Database: + """ + Defines a database context where users can run sql queries. 
+ """ - def get_duration(self): - return self.duration + def __init__(self, rdb_compute_domain: Union[str, int, dict], database: Union[str, int, dict]): + """ + :param rdb_compute_domain: Parameter that identifies the relation database domain or environment that + contains the database. Could be either its name (string), ID (integer), or a dictionary containing + the attributes of the domain. + :param database: defines the database. Can be either the database name (string), ID (integer), or a dictionary + containing all the attributes of an object of class Database. + """ + if type(database) not in [str, int, dict]: + raise TypeError("Invalid type for input parameter 'database'.") - def __str__(self): - return "RDB Job of id = {}".format(self.id) + if type(rdb_compute_domain) not in [str, int, dict]: + raise TypeError("Invalid type for input parameter 'rdb_compute_domain'.") - def __repr__(self): - return "RDBJob(id = {})".format(self.id) + if type(rdb_compute_domain) != dict: + domain = RDBComputeDomain(rdb_compute_domain) + else: + domain = rdb_compute_domain + if type(database) != dict: + dbs = domain.get('databaseContexts') or domain.get('databases') + if type(database) == str: + database = [db for db in dbs if db.get('name') == database] + else: + database = [db for db in dbs if db.get('id') == database] + if len(database) == 0: + raise NameError("Unable to find database.") + else: + database = database[0] + + self.id = database.get('id') + self._racm_id = database.get('_racm_id') if database.get('_racm_id') is not None else database.get('racmId') + self.name = database.get('name') if database.get('name') is not None else database.get('contextName') + self.description = database.get('description') + self.vendor = database.get('vendor') + self.schemas = database.get('dbSchemas') if database.get('dbSchemas') is not None else database.get('schemas') + self.rdb_compute_domain_name = domain.get('name') if domain.get('name') is not None else \ + 
domain.get('displayName') + self.rdb_compute_domain_id = domain.get('id') + + def get_metadata(self) -> pd.DataFrame: + data = [] + column_names = ['database_name', 'database_description', 'database_vendor', 'database_id', + 'rdb_compute_domain_name', 'rdb_compute_domain_id'] + data.append([self.name, self.description, self.vendor, self.id, self.rdb_compute_domain_name, + self.rdb_compute_domain_id]) + df = pd.DataFrame(data=data, columns=column_names, index=[self.id]) + df = df.astype({"rdb_compute_domain_id": int, "database_id": int}) + return df -@lru_cache(128) -def _get_file_service(file_service_id=""): - file_services = Files.getFileServices(verbose=False) - for file_service in file_services: - if file_service["name"] == file_service_id or file_service["identifier"] == file_service_id: - return file_service + def __str__(self): + return "Database of name={}, id={} and rdb_compute_domain_name={}".format(self.name, self.id, + self.rdb_compute_domain_name) - if len(file_services) > 0: - return file_services[0] - else: - raise Exception("No fileservices available for the user") + def __repr__(self): + return "Database(name={}, id={}, rdb_compute_domain_name={})".format(self.name, self.id, + self.rdb_compute_domain_name) -def submitQueryJob(sqlQuery, - rdbComputeDomain=None, - databaseContextName=None, - outputTargets=FileOutput.get_default(), - resultsFolderPath="", - jobAlias="", - fileServiceName=""): +class RDBComputeDomain: """ - Submits a sql query for execution (as an asynchronous job) inside a relational database (RDB) compute domain. - - :param sqlQuery: sql query (string) - :param rdbComputeDomain: object (dictionary) that defines a relational database (RDB) compute domain. A list of - these kind of objects available to the user is returned by the function 'getRDBComputeDomains'. - :param databaseContextName: database context name (string) on which the sql query is executed. 
- :param outputTargets: object of type SciQuery.OutputTarget defining the output of one or multiple statements - within the input query. Could also be a list of OutputTarget objects. - :param resultsFolderPath: full path to results folder (string) where query output tables are written into. - E.g.: /home/idies/workspace/rootVOlume/username/userVolume/jobsFolder . If not set, - then a default folder will be set automatically. - :param jobAlias: alias (string) of job, defined by the user. - :param fileServiceName: name or uuid (string) of FileService where the results folder (resultsFolderPath) is - going to be created. If not defined, then the first available FileService is chosen by default. - :return: the ID (integer) that labels the job. - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if - the HTTP request to the SciQuery API returns an error, or if the volumes defined by the user are not available in - the Docker compute domain. - :example: job_id = SciQuery.submitQueryJob('select 1;') - - .. seealso:: SciQuery.submitQueryJob, SciQuery.getJobStatus, SciQuery.getJob + Defines a domain or environment with databases that users are able to query. """ - token = Authentication.getToken() - if token is not None and token != "": + def __init__(self, rdb_compute_domain: Union[str, int, dict]): + """ + Creates an instance of an RDBComputeDomain, which defines a domain or environment with databases that users. + are able to query. + :param rdb_compute_domain: Parameter that identifies the domain. Could be either its name (string), + ID (integer), or a dictionary containing all the attributes of the domain. 
+ """ + if type(rdb_compute_domain) not in [str, int, dict]: + raise TypeError("Invalid type for input parameter 'rdb_compute_domain'.") + + if type(rdb_compute_domain) != dict: + domains = SciQuery.get_rdb_compute_domains("dict") + if type(rdb_compute_domain) == str: + domain = [d for d in domains if d.get('name') == rdb_compute_domain] + elif type(rdb_compute_domain) == int: + domain = [d for d in domains if d.get('id') == rdb_compute_domain] + else: + raise TypeError("Invalid type for input parameter 'rdb_compute_domain'.") - if Config.isSciServerComputeEnvironment(): - taskName = "Compute.SciScript-Python.Sciquery.submitQueryJob" + if len(domain) > 0: + rdb_compute_domain = domain[0] + else: + raise NameError("Unable to find rdbComputeDomain {0}.".format(rdb_compute_domain)) + + self.id = rdb_compute_domain.get('id') + self._racm_id = rdb_compute_domain.get('_racm_id') if rdb_compute_domain.get('_racm_id') is not None else \ + rdb_compute_domain.get('racmId') + self.name = rdb_compute_domain.get('name') if rdb_compute_domain.get('name') is not None else \ + rdb_compute_domain.get('displayName') + self.description = rdb_compute_domain.get('description') + dbs = [] + databases = rdb_compute_domain.get('dbContexts') if \ + rdb_compute_domain.get('dbContexts') is not None else rdb_compute_domain.get('databases') + for db_name, db_dict in databases.items(): + dbs.append(Database(rdb_compute_domain, db_dict)) + self.databases = dbs + + def get_database_names(self) -> list: + """ + Gets a list of the names of databases in an RDBComputeDomain + + :return: list of database names (strings) + :example: dbnames = SciQuery.get_database_names(rdbComputeDomainName); + .. 
seealso:: SciQuery.get_databases_metadata + """ + return [db.name for db in self.databases] + + def get_database(self, database: Union[str, int, dict, Database]) -> Database: + if type(database) == str: + dbs = [db for db in self.databases if db.name == database] + elif type(database) == int: + dbs = [db for db in self.databases if db.id == database] + elif type(database) == dict: + dbs = [db for db in self.databases if db.id == database.get('id')] + elif isinstance(database, Database): + return self.get_database(database.id) + else: + raise TypeError("Invalid type for input parameter 'database'.") + if len(dbs) == 0: + raise NameError("Database not found in list of available databases.") + else: + return dbs[0] + + def get_default_database(self) -> Database: + dbs = [db for db in self.databases if db.name == SciQuery.get_mydb_name()] + if len(dbs) > 0: + return dbs[0] + elif len(self.databases) > 0: + return self.databases[0] else: - taskName = "SciScript-Python.Sciquery.submitQueryJob" + raise Exception("No default database available.") + + def get_metadata(self, do_include_databases: bool = False) -> pd.DataFrame: + + column_names = ['rdb_compute_domain_name', 'rdb_compute_domain_description', 'rdb_compute_domain_id'] + data = [[self.name, self.description, self.id]] + domain_metadata = pd.DataFrame(data=data, columns=column_names) + + if do_include_databases: + db_metadata = self.get_databases_metadata() + # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.join.html + domain_metadata = pd.merge(domain_metadata, db_metadata, how="outer", + left_on=["rdb_compute_domain_id", "rdb_compute_domain_name"], + right_on=["rdb_compute_domain_id", "rdb_compute_domain_name"]) + domain_metadata.sort_values(by=['rdb_compute_domain_name', 'database_name'], inplace=True) + domain_metadata = domain_metadata.astype({"database_id": int}) + else: + domain_metadata.sort_values(by=['rdb_compute_domain_name'], inplace=True) - if rdbComputeDomain is None: - 
rdbComputeDomains = getRDBComputeDomains(); - if len(rdbComputeDomains) > 0: - rdbComputeDomain = rdbComputeDomains[0]; - else: - raise Exception("There are no rdbComputeDomains available for the user."); + domain_metadata = domain_metadata.astype({"rdb_compute_domain_id": int}) + return domain_metadata - if databaseContextName is None: - databaseContexts = rdbComputeDomain.get('databaseContexts'); - if len(databaseContexts) > 0: - databaseContextName = databaseContexts[0].get('name') - else: - raise Exception("rbdComputeDomain has no database contexts available for the user."); + def get_databases_metadata(self) -> pd.DataFrame: + """ + Gets metadata of the databases in this RDBComputeDomain. - if type(outputTargets) != list: - outputTargets = [outputTargets] + :return: pandas dataframe with associated metadata. + .. seealso:: SciQuery.get_database_names + """ + dfs = [db.get_metadata() for db in self.databases] + dfs = pd.concat(dfs, ignore_index=True) + dfs.sort_values(by="database_name", inplace=True) + return dfs - targets = [] - for target in outputTargets: - for index in target.statement_indexes: - targets.append({'location': target.target_name, 'type': target.target_type, 'resultNumber': index}) + def __str__(self): + return "RDBComputeDomain of name={} and id={}".format(self.name, self.id) - rdbDomainId = rdbComputeDomain.get('id'); + def __repr__(self): + return "RDBComputeDomain(name={}, id={})".format(self.name, self.id) - file_service = _get_file_service(fileServiceName) - resultsFolderPath = file_service['identifier'] + ":" + resultsFolderPath - dockerJobModel = { - "inputSql": sqlQuery, - "submitterDID": jobAlias, - "databaseContextName": databaseContextName, - "rdbDomainId": rdbDomainId, - "targets": targets, - "resultsFolderURI": resultsFolderPath - } +class RDBComputeDomains(list): + """ + Defines a list of RDBComputeDomains, which are domains or environments with databases that users are able to query. 
+ """ + def __init__(self, rdb_compute_domains: Union[Iterable, RDBComputeDomain]): + """ + :param rdb_compute_domains: Parameter that identifies a list of RDBComputeDomain objects. + Could be either single RDBComputeDomain object, or an iterable containing multiple RDBComputeDomain objects. + """ + super().__init__() + domains = rdb_compute_domains if isinstance(rdb_compute_domains, Iterable) else [rdb_compute_domains] + for d in domains: + if isinstance(d, RDBComputeDomain): + self.append(d) + else: + raise NameError("Input object is not of class RDBComputeDomain.") + + def get_rdb_compute_domain(self, rdb_compute_domain: Union[str, int, dict, RDBComputeDomain]) -> RDBComputeDomain: + if type(rdb_compute_domain) == str: + domains = [d for d in self if d.name == rdb_compute_domain] + elif type(rdb_compute_domain) == int: + domains = [d for d in self if d.id == rdb_compute_domain] + elif type(rdb_compute_domain) == dict: + domains = [d for d in self if d.id == rdb_compute_domain.get('id')] + elif isinstance(rdb_compute_domain, RDBComputeDomain): + return self.get_rdb_compute_domain(rdb_compute_domain.id) + else: + raise TypeError("Invalid type for input parameter 'rdb_compute_domain'.") - data = json.dumps(dockerJobModel).encode() - url = Config.SciqueryURL + "/api/jobs/" + str(rdbDomainId) + "?TaskName=" + taskName; - headers = {'X-Auth-Token': token, "Content-Type": "application/json"} - res = requests.post(url, data=data, headers=headers, stream=True) - if res.status_code < 200 or res.status_code >= 300: - raise Exception("Error when submitting a job to the SciQuery API.\nHttp Response from SciQuery API " + - "returned status code " + str(res.status_code) + ":\n" + res.content.decode()); + if len(domains) == 0: + raise NameError("RDBComputeDomain not found in list of available rdbComputeDomains") else: - return res.content.decode() - else: - raise Exception("User token is not defined. 
First log into SciServer.") + return domains[0] + + def get_default_rdb_compute_domain(self) -> RDBComputeDomain: + domains = [domain for domain in self if len(domain.databases) > 0] + if len(domains) > 0: + doms = [dom for dom in domains if dom.get_default_database().name == SciQuery.get_mydb_name()] + if len(doms) > 0: + return doms[0] + return domains[0] + else: + raise Exception("No RDBComputeDomain available with a database.") -def executeQuery(sqlQuery, - rdbComputeDomain=None, - databaseContextName=None, - resultsFolderPath="", - jobAlias="", - poll_time=0.2, - fileServiceName=""): +class SciQuery: """ - Returns the query result (as a Pandas data frame) of a sql query submitted as a job to a - relational database (RDB) compute domain. - - :param sqlQuery: sql query (string) - :param rdbComputeDomain: object (dictionary) that defines a relational database (RDB) compute domain. A list of - these kind of objects available to the user is returned by the function 'getRDBComputeDomains'. - :param databaseContextName: database context name (string) on which the sql query is executed. - :param resultsFolderPath: full path to results folder (string) where query output tables are written into. - E.g.: /home/idies/workspace/rootVOlume/username/userVolume/jobsFolder . If not set, - then a default folder will be set automatically. - :param jobAlias: alias (string) of job, defined by the user. - :param fileServiceName: name or uuid (string) of FileService where the results folder (resultsFolderPath) is - going to be created. If not defined, then the first available FileService is chosen by default. - :return: Pandas data frame containing the result of the query. - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if - the HTTP request to the SciQuery API returns an error. - :example: df = SciQuery.executeQuery('select 1;') - - .. 
seealso:: SciQuery.submitQueryJob, SciQuery.getJobStatus, SciQuery.getJob - Jobs.cancelJob + Instance of the SciQuery app for querying relational databases. """ - output_target = FileOutput("result1.json", OutputTargetType.FILE_JSON).set_statement_indexes([1]) - - jobId = submitQueryJob(sqlQuery=sqlQuery, rdbComputeDomain=rdbComputeDomain, databaseContextName=databaseContextName, - outputTargets=output_target, - resultsFolderPath=resultsFolderPath, - jobAlias=jobAlias, fileServiceName=fileServiceName) - - job_status = Jobs.waitForJob(jobId, verbose=False, pollTime=poll_time) - job = RDBJob(jobId) - if job.status > 32: - messages = ". ".join([j.get('content') for j in job.messages if j.get('content') is not None]) - if (job.status == 64): - raise Exception("Query ended with an error. " + messages) - if (job.status == 128): - raise Exception("Query was cancelled. " + messages) - - if Config.isSciServerComputeEnvironment(): - file_path = job.get_output_target_path(output_target) - with open(file_path, ) as f: - j = json.load(f) - else: - file_service_id = job.resultsFolderURI.split(":")[0] - file_service = _get_file_service(file_service_id) - path = job.get_fileservice_folder_path(output_target) - s = Files.download(file_service, path, format="txt", quiet=True) - j = json.loads(s) + def __init__(self, + rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None, + database: Union[str, int, dict, Database] = None, + file_service: Union[str, dict] = None, + results_base_path: str = None, + outputs: Outputs = None, + verbose: bool = True, + hard_fail: bool = False): + """ + Creates instance of SciQuery class. + + :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries. + Can be either the domain's name (string), ID (integer), an object of class RDBComputeDomain, or a dictionary + containing all the attributes of an object of class RDBComputeDomain. If set to None, a default value will be + assigned to it. 
+ :param database: defines the database where the queries are executed in. + Can be either the database name (string), ID (integer), an object of class Database, or a dictionary containing + all the attributes of an object of class Database. If set to None, a default value will be assigned to it. + :param file_service: a File Service defines an available file system where query result sets can be written + into. This parameter can be it name or identifier (string), or a dictionary defining a file service. + If set to None, a default value will be assigned to it. + :param results_base_path: base path (string) of the directory where the query results are written into. + Can be constructed by using FileOutput.build_file_base_path(). If set to None, a default value will be assigned + to it at the moment of running a sql query. + :param outputs: Defines the query(ies) output(s). Can be an object derived from the Output base class (such as + FileOutput or DatabaseTableOutput), or a list of those. If set to None, a default value (json file output) + will be assigned to it. + :param verbose: Boolean parameter. If True, warning messages will be written in case of errors, in the case when + the hard_fail parameter is set to False. If False, nothing will be written. + :param hard_fail: Boolean parameter. If True, exceptions will be raised in case of errors during instantiation. + If False, then no exceptions are raised, and warnings might be showed instead + (depending on the value of the verbose parameter). 
+ """ - result=j['Result'][0] - df=pd.DataFrame(result['Data'],columns=result['ColumnNames']) - df.name = result['TableName'] - return df + self.user = SciQuery.get_user() + self.verbose = verbose + self.hard_fail = hard_fail + self._file_service = None + self._results_base_path = None + self._outputs = None + self._rdb_compute_domains = None + self._rdb_compute_domain = None + self._database = None + self.refresh_date = None + self.set(rdb_compute_domain, database, file_service, results_base_path, outputs, verbose, hard_fail) + + @staticmethod + def get_token() -> str: + token = Authentication.getToken() + if token is None or token == "": + raise Exception("User not has not logged into SciServer. Use 'Authentication.login'.") + return token + + @staticmethod + def get_user() -> Authentication.KeystoneUser: + return Authentication.getKeystoneUserWithToken(SciQuery.get_token()) + + def set(self, + rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None, + database: Union[str, int, dict, Database] = None, + file_service: Union[str, dict] = None, + results_base_path: str = None, + outputs: Outputs = None, + verbose: bool = None, + hard_fail: bool = None): + """ + Sets or refreshes the parameters in the SciQuery object, all at once. + + :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries. + Can be either the domain's name (string), ID (integer), an object of class RDBComputeDomain, or a dictionary + containing all the attributes of an object of class RDBComputeDomain. If set to None, the current value + is refreshed. + :param database: defines the database where the queries are executed in. + Can be either the database name (string), ID (integer), an object of class Database, or a dictionary containing + all the attributes of an object of class Database. If set to None, the current value is refreshed. 
+ :param file_service: a File Service defines an available file system where query result sets can be written + into. This parameter can be it name or identifier (string), or a dictionary defining a file service. + If set to None, the current value is refreshed. + :param results_base_path: base path (string) of the directory where the query results are written into. + Can be constructed by using FileOutput.build_file_base_path(). + :param outputs: Defines the query(ies) output(s). Can be a list of Output objects, + or a single object of class Outputs. If set to None, a default value (json file output) will be assigned to it. + :param verbose: Boolean parameter. If True, warning messages will be written in case of errors, in the case when + the hard_fail parameter is set to False. If False, nothing will be written. + :param hard_fail: Boolean parameter. If True, exceptions will be raised in case of errors during instantiation. + If False, then no exceptions are raised, and warnings might be showed instead + (depending on the value of the verbose parameter). + """ + self.verbose = verbose if verbose else self.verbose + self.hard_fail = hard_fail if hard_fail else self.hard_fail + + # set or refresh current _rdb_compute_domains + try: + self.rdb_compute_domains = SciQuery.get_rdb_compute_domains('class') + except Exception as ex: + self._handle_exception(NameError(ex), "Unable to set or refresh rdb_compute_domains.") + # nothing else to do: + return + + try: + if self.rdb_compute_domain is None: + self.rdb_compute_domain = rdb_compute_domain if rdb_compute_domain else \ + self.get_default_rdb_compute_domain() + else: + self.rdb_compute_domain = rdb_compute_domain if rdb_compute_domain else self.rdb_compute_domain + except Exception as ex: + self._handle_exception(NameError(ex), "Unable to set or refresh rdb_compute_domain.") -def getRDBComputeDomains(): - """ - Gets a list of all registered Relational Database (RDB) compute domains that the user has access to. 
+ try: + if self.database is None: + self.database = database if database else self.get_default_database() + else: + self.database = database if database else self.database + except Exception as ex: + self._handle_exception(NameError(ex), "Unable to set or refresh database.") - :return: a list of dictionaries, each one containing the definition of an RDB compute domain. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: rdb_compute_domains = SciQuery.getRDBComputeDomains(); + try: + if self.file_service is None: + self.file_service = file_service if file_service else self.get_default_file_service() + else: + self.file_service = file_service if file_service else self.file_service + except Exception as ex: + self._handle_exception(NameError(ex), "Unable to set or refresh file_service.") - .. seealso:: SciQuery.executeQuery, SciQuery.submitQueryJob - """ - token = Authentication.getToken() - if token is not None and token != "": + try: + if self.outputs is None: + self.outputs = outputs if outputs else self.get_default_outputs() + else: + self.outputs = outputs if outputs else self.outputs + except Exception as ex: + self._handle_exception(NameError(ex), "Unable to set or refresh outputs.") + + try: + self.results_base_path = results_base_path + except Exception as ex: + self._handle_exception(NameError(ex), "Unable to set results_base_path.") + + self.refresh_date = datetime.now() + + def _handle_exception(self, exception: Exception, extra_message: str = ""): + + message = extra_message + " Error: " + str(exception) if extra_message else str(exception) + if self.hard_fail: + exception.message = message + raise exception + elif self.verbose: + warnings.warn(message) + + def refresh(self): + self.set(verbose=self.verbose, hard_fail=self.hard_fail) + + @staticmethod + def get_mydb_name(owner_name: str = None) -> str: + 
if not owner_name: + owner_name = SciQuery.get_user().userName + return "mydb:" + owner_name + + @staticmethod + def get_rdb_compute_domains(result_format: str = 'class') -> RDBComputeDomains: + """ + Gets a list of all registered Relational Database (RDB) compute domains that the user has access to. + + :param result_format: If set to "class", then the returned value will be of class RDBComputeDomains. + If set to "dict", then the return value will be a list of dictionaries, each of them containing the attributes + of an RDBComputeDomain object. + :return: an object of class RDBComputeDomains, or a list of dictionaries, each of them containing the attributes + of an RDBComputeDomain object. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that + purpose). Throws an exception if the HTTP request to the JOBM API returns an error. + :example: rdb_compute_domains = SciQuery.get_rdb_compute_domains(); + """ + token = SciQuery.get_user().token if Config.isSciServerComputeEnvironment(): - taskName = "Compute.SciScript-Python.SciQuery.get_compute_domains" + task_name = "Compute.SciScript-Python.SciQuery.get_rdb_compute_domains" else: - taskName = "SciScript-Python.SciQuery.get_compute_domains" + task_name = "SciScript-Python.SciQuery.get_rdb_compute_domains" - url = Config.RacmApiURL + "/jobm/rest/computedomains/rdb?TaskName=" + taskName + url = Config.SciqueryURL + "/api/info/domain?TaskName=" + task_name headers = {'X-Auth-Token': token, "Content-Type": "application/json"} res = requests.get(url, headers=headers, stream=True) if res.status_code != 200: - raise Exception("Error when getting RDB Compute Domains from JOBM API.\nHttp Response from JOBM API returned" - " status code " + str(res.status_code) + ":\n" + res.content.decode()); + raise Exception( + "Error when getting RDB Compute Domains from JOBM API.\nHttp Response from JOBM API returned" + " status code " + str(res.status_code) + ":\n" + 
res.content.decode()) else: - return json.loads(res.content.decode()) - else: - raise Exception("User token is not defined. First log into SciServer.") + arr = json.loads(res.content.decode()) + if result_format == 'class': + return RDBComputeDomains([RDBComputeDomain(d) for d in arr]) + else: + return arr + # rdb_compute_domains --------------------------------------------------- -def getRDBComputeDomainNames(rdbComputeDomains=None): - """ - Returns the names of the RDB compute domains available to the user. + @property + def rdb_compute_domains(self) -> RDBComputeDomains: + return self._rdb_compute_domains - :param rdbComputeDomains: a list of rdbComputeDomain objects (dictionaries), as returned by - Jobs.getRDBComputeDomains(). If not set, then an extra internal call to Jobs.getRDBComputeDomains() is made. - :return: an array of strings, each being the name of a rdb compute domain available to the user. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the RACM API returns an error. - :example: dockerComputeDomainsNames = Files.getDockerComputeDomainsNames(); + @rdb_compute_domains.setter + def rdb_compute_domains(self, rdb_compute_domains: RDBComputeDomains): + if not isinstance(rdb_compute_domains, RDBComputeDomains): + raise Exception("'rdb_compute_domains' should be of class RDBComputeDomains.") + self._rdb_compute_domains = rdb_compute_domains - .. 
seealso:: Files.getRDBComputeDomains - """ - if rdbComputeDomains is None: - rdbComputeDomains = getRDBComputeDomains(); + # rdb_compute_domain --------------------------------------------------- - rdbComputeDomainsNames = []; - for rdbComputeDomain in rdbComputeDomains: - rdbComputeDomainsNames.append(rdbComputeDomain.get('name')) + @property + def rdb_compute_domain(self) -> RDBComputeDomain: + return self._rdb_compute_domain - return rdbComputeDomainsNames; + @rdb_compute_domain.setter + def rdb_compute_domain(self, rdb_compute_domain: Union[str, int, dict, RDBComputeDomain]): + if rdb_compute_domain is None: + raise Exception("'rdb_compute_domain' cannot be set to None.") + self._rdb_compute_domain = self.get_rdb_compute_domain(rdb_compute_domain) -def getRDBComputeDomainFromName(rdbComputeDomainName, rdbComputeDomains = None): - """ - Returns an RDBComputeDomain object, given its registered name. - - :param rdbComputeDomainName: name of the RDBComputeDomainName, as shown within the results of - Jobs.getRDBComputeDomains() - :param rdbComputeDomains: a list of rdbComputeDomain objects (dictionaries), as returned by - Jobs.getRDBComputeDomains(). If not set, then an extra internal call to Jobs.getRDBComputeDomains() is made. - :return: an RDBComputeDomain object (dictionary) that defines an RDB compute domain. A list of these kind of objects - available to the user is returned by the function Jobs.getRDBComputeDomains(). - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: rdbComputeDomain = SciQuery.getRDBComputeDomainFromName(rdbComputeDomainName); - - .. 
seealso:: SciQuery.getRDBComputeDomains - """ - if rdbComputeDomainName is None: - raise Exception("rdbComputeDomainName is not defined.") - else: - if rdbComputeDomains is None: - rdbComputeDomains = getRDBComputeDomains(); + def get_rdb_compute_domain(self, rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None) \ + -> RDBComputeDomain: + """ + Returns an object of class RDBComputeDomain, either defined by the input name or identifiers, or that + which is set in the SciQuery instance. + + :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries. + Can be either the domain's name (string), ID (integer), an object of class RDBComputeDomain, or a dictionary + containing all the attributes of an object of class RDBComputeDomain. If set to None, then the currently set + value of rdb_compute_domain in the SciQuery object is returned. + :return: Object of class RDBComputeDomain. + """ + if rdb_compute_domain is None: + return self._rdb_compute_domain + return self.rdb_compute_domains.get_rdb_compute_domain(rdb_compute_domain) - if rdbComputeDomains.__len__() > 0: - for rdbComputeDomain in rdbComputeDomains: - if rdbComputeDomainName == rdbComputeDomain.get('name'): - return rdbComputeDomain; - else: - raise Exception("There are no RDBComputeDomains available for the user."); + def get_default_rdb_compute_domain(self): + return self.rdb_compute_domains.get_default_rdb_compute_domain() - raise Exception("RDBComputeDomain of name '" + rdbComputeDomainName + "' is not available or does not exist."); + # database --------------------------------------------------- -def getJobsList(top=10, open=None, start=None, end=None): - """ - Gets the list of SciQuery Jobs submitted by the user. - - :param top: top number of jobs (integer) returned. If top=None, then all jobs are returned. - :param open: If set to 'True', then only returns jobs that have not finished executing and wrapped up - (status <= FINISHED). 
If set to 'False' then only returnes jobs that are still running. If set to 'None', - then returns both finished and unfinished jobs. - :param start: The earliest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. - If set to 'None', then there is no lower bound on date. - :param end: The latest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. - If set to 'None', then there is no upper bound on date. - :return: a list of dictionaries, each one containing the definition of a submitted job. - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request - to the JOBM API returns an error. - :example: jobs = SciQuery.getJobsList(top=2); - - .. seealso:: SciQuery,getJob, SciQuery.getJobStatus, SciQuery.cancelJob - """ - job_dict_list = Jobs.getJobsList(top=top, open=open, start=start, end=end, type='rdb') - rdb_job_list = [] - for job_dict in job_dict_list: - rdb_job_list.append(RDBJob(job_dict)) - return rdb_job_list + @property + def database(self) -> Database: + return self._database -def getJob(jobId): - """ - Gets the definition of the job as a RDBJob object. + @database.setter + def database(self, database: Union[str, int, dict, Database]): + if database is None: + raise Exception("'database' cannot be set to None.") + self._database = self.get_database(database, self.rdb_compute_domain) - :param jobId: Id of job - :return: RDBJob object containing the description or definition of the job. - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request - to the JOBM API returns an error. 
- :example: job = SciQuery.getJob(jobId) + def get_database(self, + database: Union[str, int, dict, Database] = None, + rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None) -> Database: + """ + Returns an object of class Database, either defined by the input name or identifiers, or that + which is set in the SciQuery instance. + + + :param database: identifies the database, which this function returns as an object of class Database. + Can be either the database name (string), ID (integer), an object of class Database, or a dictionary containing + all the attributes of an object of class Database. If set to None, then the currently set value of database in + the SciQuery object is returned. + :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries, + and that contains the database. Can be either the domain's name (string), ID (integer), an object of class + RDBComputeDomain, or a dictionary containing all the attributes of an object of class RDBComputeDomain. + If set to None, then the currently set value of rdb_compute_domain in the SciQuery object is internally used. + :return: Object of class Database + """ + if database is None: + return self._database + return self.get_rdb_compute_domain(rdb_compute_domain).get_database(database) - .. seealso:: SciQuery.getJob, SciQuery.cancelJob, SciQuery.submitQueryJob - """ - return RDBJob(Jobs.getJobDescription(jobId)) + def get_default_database(self, rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None) -> Database: + domain = self.get_default_rdb_compute_domain() if rdb_compute_domain is None \ + else self.get_rdb_compute_domain(rdb_compute_domain) + return domain.get_default_database() -def getJobStatus(jobId): - """ - Gets a dictionary with the job status as an integer value, together with its semantic meaning. 
The integer value is - a power of 2, that is, 1:PENDING, 2:QUEUED, 4:ACCEPTED, 8:STARTED, 16:FINISHED, 32:SUCCESS, 64:ERROR, 128:CANCELED + # file_service --------------------------------------------------- - :param jobId: Id of job (integer). - :return: dictionary with the integer value of the job status, as well as its semantic meaning. - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error, and if the HTTP request - to the JOBM API returns an error. - :example: status = SciQuery.getJobStatus(jobId) + @property + def file_service(self) -> dict: + return self._file_service - .. seealso:: SciQuery.cancelJob, SciQuery.waitForJob, SciQuery.getJob, SciQuery.cancelJob - """ - return Jobs.getJobStatus(jobId) + @file_service.setter + def file_service(self, file_service: Union[str, dict]): + if file_service is None: + raise Exception("'file_service' cannot be set to None.") + self._file_service = self.get_file_service(file_service) -def cancelJob(jobId): - """ - Cancels the execution of a job. + def get_file_service(self, file_service: Union[str, dict] = None) -> dict: + """ + Returns the definition of a file service as a dictionary, either defined by the input name or identifiers, + or that which is set in the SciQuery instance. - :param jobId: Id of the job (integer) - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if - the HTTP request to the JOBM API returns an error. - :example: SciQuery.cancelJob(jobId); + :param file_service: name or identifier (string) of a file service, or the dictionary with its definition. + If set to None, then the currently set value of file_service in the SciQuery object is returned. + :return: dictionary with the definition of a file service. + """ + if file_service is None: + return self._file_service + return FileOutput.find_file_service(file_service) - .. 
seealso:: SciQuery.getJobStatus, SciQuery.getJobDescription - """ - Jobs.cancelJob(jobId) + def get_default_file_service(self) -> dict: + return FileOutput.find_file_service() -def waitForJob(jobId, verbose=False, pollTime = 5): - """ - Queries the job status regularly and waits until the job is completed. - - :param jobId: id of job (integer) - :param verbose: if True, will print "wait" messages on the screen while the job is still running. If False, it will - suppress the printing of messages on the screen. - :param pollTime: idle time interval (integer, in seconds) before querying again for the job status. Minimum value - allowed is 5 seconds. - :return: After the job is finished, returns a dictionary object containing the job status. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: jobStatus = SciQuery.waitForJob(jobId) - - .. seealso:: SciQuery.getJobStatus, SciQuery.getJobDescription - """ - return Jobs.waitForJob(jobId=jobId, verbose=verbose, pollTime = pollTime) + # results_base_path --------------------------------------------------- + @property + def results_base_path(self) -> str: + return self._results_base_path -def getDatabasesMetadata(rdbComputeDomain, format="pandas"): - """ - Gets metadata (name and description) of databases in an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) that - defines a RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" for - a dictionary object. - :return: pandas dataframe or dict (depending of the value of 'param') with associated metadata. 
- :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: metadata = SciQuery.getDatabasesMetadata(rdbComputeDomainName); - - .. seealso:: SciQuery.getDatabaseNames - """ + @results_base_path.setter + def results_base_path(self, results_base_path: str): + # if results_base_path is None or not results_base_path.startswith(Config.ComputeWorkDir): + # raise Exception(f"The string 'results_base_path' must start with {Config.ComputeWorkDir}") + self._results_base_path = results_base_path - if type(rdbComputeDomain) == str: - rdbComputeDomain = getRDBComputeDomainFromName(rdbComputeDomain) - databaseContexts = rdbComputeDomain.get("databaseContexts") - if format == "dict": - return databaseContexts - columnNames = ['database_name', 'database_description'] - data = [] - for i in range(len(databaseContexts)): - data.append([databaseContexts[i]['name'], databaseContexts[i]['description']]) - df = pd.DataFrame(data=data, columns=columnNames) - return df + def get_results_base_path(self) -> str: + return self._results_base_path + def get_default_results_base_path(self, add_date_ending=True) -> str: + return FileOutput.build_file_base_path(add_date_ending=add_date_ending) -def getDatabaseNames(rdbComputeDomain): - """ - Gets a list of the names of databases in an RDBComputeDomain + # outputs ------------------------------------------------------------- - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) that - defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :return: array of database names (strings) - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). 
- Throws an exception if the HTTP request to the JOBM API returns an error. - :example: dbnames = SciQuery.getDatabaseNames(rdbComputeDomainName); + @property + def outputs(self) -> Outputs: + return self._outputs - .. seealso:: SciQuery.getDatabasesMetadata - """ - databases = getDatabasesMetadata(rdbComputeDomain, format="pandas") - return [name for name in databases['database_name']] + @outputs.setter + def outputs(self, outputs: Union[Outputs, Output]): + if outputs is None: + raise Exception("'outputs' cannot be set to None.") + self._outputs = self.get_outputs(outputs) -def getRDBComputeDomainsMetadata(format="pandas", includeDatabases=False): - """ - Gets metadata related to all relational database (RDB) compute domains (RDBComputeDomains) available to the user. - - :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" for - a dictionary object. - :param includeDatabases: Boolean parameter. If True, it will return metadata related to all available databases in - each RDBComputeDomain as well. - :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: metadata = SciQuery.getRDBComputeDomainsMetadata(); - - .. 
seealso:: SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames - """ - rdbComputeDomains = getRDBComputeDomains() - if format == "dict": - return rdbComputeDomains - columnNames = ['domain_name', 'domain_description'] - if includeDatabases: - columnNames.append("database_name") - columnNames.append("database_description") - data = [] - for i in range(len(rdbComputeDomains)): - domName = rdbComputeDomains[i].get('name') - domDescr = rdbComputeDomains[i].get('description') - dbs = rdbComputeDomains[i].get('databaseContexts') - if includeDatabases: - for j in range(len(dbs)): - data.append([domName, domDescr, dbs[j].get('name'), dbs[j].get('description')]) + def get_outputs(self, outputs: Union[Outputs, Output] = None) -> Outputs: + """ + Returns an object of class Outputs, either defined by the inputs parameters, or that + which is set in the SciQuery instance. + + :param outputs: object of class Outputs, or iterable of output objects. If set to None, then the currently + set value of outputs in the SciQuery object is returned. + :return: object of class Outputs. + """ + if outputs is None: + return self.outputs + return Outputs(outputs) + + def get_default_outputs(self) -> Outputs: + return Outputs.get_default() + + # --------------------------------------------------------------------------------------------- + # Running Queries ----------------------------------------------------------------------------- + # --------------------------------------------------------------------------------------------- + + def submit_query_job(self, + sql_query: str, + database: Union[str, int, dict, Database] = None, + outputs: Union[Outputs, Output] = None, + results_base_path: str = None, + rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None, + file_service: str = None, + job_alias: str = "") -> int: + """ + Submits a sql query for execution (as an asynchronous job) inside a relational database (RDB) compute domain. 
+ + :param sql_query: sql query (string) + :param database: defines the database where the sql query is executed in. + Can be either the database name (string), ID (integer), an object of class Database, or a dictionary containing + all the attributes of an object of class Database. If set to None, then the current value of the database field + in this SciQuery instance will be used. + :param outputs: Defines the query(ies) output(s). Can be an object derived from the Output base class (such as + FileOutput or DatabaseTableOutput), or a list of those. If set to None, then the current value of the outputs + field in this SciQuery instance will be used. + :param results_base_path: full path to results folder (string) where query output tables are written into. + E.g.: /home/idies/workspace/rootVOlume/username/userVolume/jobsFolder . If set to None, then its current value + in this SciQuery instance will be used. If that value is None, then a default folder will be set automatically. + :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries, + and that contains the database. Can be either the domain's name (string), ID (integer), an object of class + RDBComputeDomain, or a dictionary containing all the attributes of an object of class RDBComputeDomain. + If set to None, then the currently set value of rdb_compute_domain in the SciQuery object is internally used. + :param file_service: a File Service defines an available file system where query result sets can be written + into. This parameter can be its name or identifier (string), or a dictionary defining a file service. + If set to None, then the currently set value of file_service in the SciQuery object is internally used. + :param job_alias: alias (string) of job, defined by the user. + :return: the ID (string) that labels the job. 
+ """ + + domain = self.get_rdb_compute_domain(rdb_compute_domain) + db = self.get_database(database, domain) + fs = self.get_file_service(file_service) + outputs = self.get_outputs(outputs) + results_base_path = results_base_path if results_base_path else self.get_results_base_path() + if not results_base_path: + results_base_path = self.get_default_results_base_path() + + targets = outputs.get_target_list(results_base_path, fs.get('identifier')) + + job_model = { + "inputSql": sql_query, + "submitterDID": job_alias, + "databaseContextName": db.name, + "rdbDomainId": domain.id, + "targets": targets, + "resultsFolderURI": fs['identifier'] + ":" + results_base_path + } + + if Config.isSciServerComputeEnvironment(): + task_name = "Compute.SciScript-Python.SciQuery.submit_query_job" else: - data.append([domName, domDescr]) + task_name = "SciScript-Python.SciQuery.submit_query_job" - df = pd.DataFrame(data=data, columns=columnNames) - return df + data = json.dumps(job_model).encode() + url = Config.SciqueryURL + "/api/jobs/" + str(domain._racm_id) + "?TaskName=" + task_name; + headers = {'X-Auth-Token': self.user.token, "Content-Type": "application/json"} + res = requests.post(url, data=data, headers=headers, stream=True) + if res.status_code < 200 or res.status_code >= 300: + raise Exception("Error when submitting a job to the SciQuery API.\nHttp Response from SciQuery API " + + "returned status code " + str(res.status_code) + ":\n" + res.content.decode()); + else: + return res.content.decode() + def execute_query(self, + sql_query, + database: Union[str, int, dict, Database] = None, + results_base_path: str = None, + rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None, + job_alias: str = "", + poll_time: float = 1.0, + file_service: str = None) -> pd.DataFrame: + """ + Returns the query result (as a Pandas data frame) of a sql query submitted as a job to a + relational database (RDB) compute domain. 
+ + :param sql_query: sql query (string) + :param database: defines the database where the sql query is executed in. + Can be either the database name (string), ID (integer), an object of class Database, or a dictionary containing + all the attributes of an object of class Database. If set to None, then the current value of the database field + in this SciQuery instance will be used. + :param results_base_path: full path to results folder (string) where query output tables are written into. + E.g.: /home/idies/workspace/rootVOlume/username/userVolume/jobsFolder . If set to None, then its current value in + this SciQuery instance will be used. If that value is None, then a default folder will be set automatically. + :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries, + and that contains the database. Can be either the domain's name (string), ID (integer), an object of class + RDBComputeDomain, or a dictionary containing all the attributes of an object of class RDBComputeDomain. + If set to None, then the currently set value of rdb_compute_domain in the SciQuery object is internally used. + :param job_alias: alias (string) of job, defined by the user. + :param poll_time: time (float) in seconds between consecutive requests for updates in the jobs status. + :param file_service: a File Service defines an available file system where query result sets can be written + into. This parameter can be its name or identifier (string), or a dictionary defining a file service. + If set to None, then the currently set value of file_service in the SciQuery object is internally used. + :return: Pandas data frame containing the result of the query. 
+ """ + output = FileOutput("result1.json", OutputType.FILE_JSON, 1) + job_alias = job_alias if job_alias else "synchronous query" + job_id = self.submit_query_job(sql_query=sql_query, rdb_compute_domain=rdb_compute_domain, database=database, + outputs=output, + results_base_path=results_base_path, + job_alias=job_alias, + file_service=file_service) + + job = self.wait_for_job(job_id, verbose=False, poll_time=poll_time) + if job.status > 32: + messages = ". ".join(job.message_list) + if (job.status == 64): + raise Exception("Query ended with an error. " + messages) + if (job.status == 128): + raise Exception("Query was cancelled. " + messages) + + df = job.get_dataframe_from_output(0) + return df + + @staticmethod + def get_jobs_list(top=5, open=None, start=None, end=None, result_format="pandas") \ + -> Union[pd.DataFrame, list]: + """ + Gets the list of SciQuery Jobs submitted by the user. + + :param top: top number of jobs (integer) returned. If top=None, then all jobs are returned. + :param open: If set to 'True', then only returns jobs that have not finished executing and wrapped up + (status <= FINISHED). If set to 'False' then only returns jobs that are still running. If set to 'None', + then returns both finished and unfinished jobs. + :param start: The earliest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. + If set to 'None', then there is no lower bound on date. + :param end: The latest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. + If set to 'None', then there is no upper bound on date. + :param result_format: string defining the return format. "pandas" for a pandas dataframe and "list" + for a list of RDBJob objects. + :return: pandas dataframe, or list of RDBJob objects or, each containing the definition of a submitted job. 
+ """ + job_dict_list = Jobs.getJobsList(top=top, open=open, start=start, end=end, type='rdb') + jobs = [] + for job_dict in job_dict_list: + j = RDBJob(job_dict) + jobs.append(j if format == "list" else j.get_metadata()) + if result_format == "pandas": + jobs = pd.concat(jobs, ignore_index=True) + return jobs + + @staticmethod + def get_job(job_id): + """ + Gets the definition of the job as a RDBJob object. + :param job_id: Id of job + :return: RDBJob object containing the description or definition of the job. + """ + return RDBJob.get_job(job_id) + @staticmethod + def get_job_status(job_id): + """ + Gets a dictionary with the job status as an integer value, together with its semantic meaning. The integer value is + a power of 2, that is, 1:PENDING, 2:QUEUED, 4:ACCEPTED, 8:STARTED, 16:FINISHED, 32:SUCCESS, 64:ERROR, 128:CANCELED -class MetadataType: - """ - Contains a set of metadata types. - """ - TABLES = "TABLES" - VIEWS = "VIEWS" - COLUMNS = "COLUMNS" - ROUTINES = "ROUTINES" - CONSTRAINTS = "CONSTRAINTS" - PARAMETERS = "PARAMETERS" + :param job_id: Id of job (integer). + :return: dictionary with the integer value of the job status, as well as its semantic meaning. + """ + return Jobs.getJobStatus(job_id) + @staticmethod + def cancel_job(job): + """ + Cancels the execution of a job. -def _getMetadata(rdbComputeDomain, databaseContextName, resourceName="", metadataType=None, format="pandas"): - """ - Utility function for the use of other metadata functions. - - :param rdbComputeDomain: object (dictionary) that defines a relational database (RDB) compute domain. A list of - these kind of objects available to the user is returned by the function 'getRDBComputeDomains'. - :param databaseContextName: database context name (string) on which the sql query is executed. - :return: the ID (integer) that labels the job. - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. 
Throws an exception if - the HTTP request to the SciQuery API returns an error, or if the volumes defined by the user are not available in - the Docker compute domain. - :example: job_id = SciQuery.submitQueryJob('select 1;') - - .. seealso:: SciQuery.submitQueryJob, SciQuery.getJobStatus, SciQuery.getJob - """ + :param job_id: Id of the job (integer) + :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if + the HTTP request to the JOBM API returns an error. + :example: SciQuery.cancelJob(jobId); - token = Authentication.getToken() - if token is not None and token != "": + .. seealso:: SciQuery.get_job_status, SciQuery.getJobDescription + """ + if isinstance(job, str): + Jobs.cancelJob(job) + elif isinstance(job, RDBJob): + job.cancel() + else: + raise NameError("Invalid type for input parameter 'job'.") - if metadataType not in [a for a in dir(MetadataType) if not a.startswith("__")]: - raise ValueError("Invalid value of metadataType paramter") + def wait_for_job(self, job_id, verbose = False, poll_time=1.0): + """ + Queries the job status regularly and waits until the job is completed. + + :param job_id: id of job (integer) + :param verbose: if True, will print "wait" messages on the screen while the job is still running. If False, it + will suppress the printing of messages on the screen. + :param poll_time: idle time interval (integer, in seconds) before querying again for the job status. Minimum + value allowed is 0.1 seconds. + :return: After the job is finished, returns a dictionary object containing the job status. + :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that + purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. + :example: jobStatus = SciQuery.wait_for_job(jobId) + + .. 
seealso:: SciQuery.get_job_status, SciQuery.getJobDescription + """ + #return Jobs.waitForJob(jobId=jobId, verbose=verbose, pollTime=poll_time) + min_poll_time = 0.1 # in seconds + while True: + if verbose: + print("Waiting...") + job_desc = Jobs.getJobDescription(job_id) + if job_desc.get("status") >= 32: + if verbose: + print("Done!") + return RDBJob(job_desc) + else: + time.sleep(max(min_poll_time, poll_time)) - if format not in ["pandas", "dict"]: - raise ValueError("Invalid value of format paramter") + ### METADATA - if Config.isSciServerComputeEnvironment(): - taskName = "Compute.SciScript-Python.Sciquery.getMetadata_" + metadataType + def get_rdb_compute_domains_metadata(self, do_include_databases=False): + """ + Gets metadata related to all relational database (RDB) compute domains (RDBComputeDomains) available. + + :param do_include_databases: Boolean parameter. If True, it will return metadata related to all available + databases in each RDBComputeDomain as well. + :return: pandas dataframe containing associated metadata. + """ + dfs = [] + for domain in self.rdb_compute_domains: + dfs.append(domain.get_metadata(do_include_databases)) + + df = pd.concat(dfs, ignore_index=True) + df.sort_values(by="rdb_compute_domain_name", inplace=True) + return df + + def get_rdb_compute_domain_names(self): + """ + Returns the names of the RDB compute domains available to the user. + + :return: an array of strings, each being the name of a rdb compute domain available to the user. + """ + return [d.name for d in self.rdb_compute_domains] + + def get_rdb_compute_domain_metadata(self, rdb_compute_domain=None, do_include_databases=False): + return self.get_rdb_compute_domain(rdb_compute_domain).get_metadata(do_include_databases) + + def get_databases_metadata(self, rdb_compute_domain=None): + """ + Gets metadata (name and description) of databases in an RDBComputeDomain. 
+ """ + rdb_compute_domain = self.get_rdb_compute_domain(rdb_compute_domain) + if isinstance(rdb_compute_domain, Iterable): + dfs = [d.get_databases_metadata() for d in rdb_compute_domain] + return pd.concat(dfs, ignore_index=True) else: - taskName = "SciScript-Python.Sciquery.getMetadata_" + metadataType + return rdb_compute_domain.get_databases_metadata() + + def get_database_metadata(self, database=None, rdb_compute_domain=None): + rdb_compute_domain = self.get_rdb_compute_domain(rdb_compute_domain) + database = self.get_database(database, rdb_compute_domain) + return database.get_metadata() - if type(rdbComputeDomain) == str: - rdbComputeDomain = getRDBComputeDomainFromName(rdbComputeDomain) + def get_database_names(self, rdb_compute_domain=None): + """ + Gets a list of the names of databases in an RDBComputeDomain - rdbComputeDomainId = rdbComputeDomain.get("id") + :return: array of database names (strings) + """ + rdb_compute_domain = self.get_rdb_compute_domain(rdb_compute_domain) + return rdb_compute_domain.get_database_names() - url = Config.SciqueryURL + "/api/metadata/{0}/{1}/".format(rdbComputeDomainId, databaseContextName); - if metadataType == MetadataType.TABLES: + def _get_metadata(self, rdb_compute_domain, database, resource_name="", metadata_type=""): + """ + Utility function for the use of other metadata functions. 
+ """ + + rdb_compute_domain = self.get_rdb_compute_domain(rdb_compute_domain) + database = self.get_database(database, rdb_compute_domain.id) + + if metadata_type not in [t for t in dir(_MetadataType) if not t.startswith("__")]: + raise TypeError("Invalid type for input parameter 'metadata_type'.") + + if Config.isSciServerComputeEnvironment(): + task_name = "Compute.SciScript-Python.Sciquery.get_metadata_" + metadata_type + else: + task_name = "SciScript-Python.Sciquery.get_metadata_" + metadata_type + + url = Config.SciqueryURL + "/api/metadata/{0}/{1}/".format(rdb_compute_domain._racm_id, database.name); + if metadata_type == _MetadataType.TABLES: url += "tables" - elif metadataType == MetadataType.VIEWS: + elif metadata_type == _MetadataType.VIEWS: url += "views" - elif metadataType == MetadataType.ROUTINES: + elif metadata_type == _MetadataType.ROUTINES: url += "routines" - elif metadataType == MetadataType.COLUMNS: - url += "{0}/{1}".format(resourceName, "columns") - elif metadataType == MetadataType.PARAMETERS: - url += "{0}/{1}".format(resourceName, "parameters") - elif metadataType == MetadataType.CONSTRAINTS: - url += "{0}/{1}".format(resourceName, "constraints") + elif metadata_type == _MetadataType.COLUMNS: + url += "{0}/{1}".format(resource_name, "columns") + elif metadata_type == _MetadataType.PARAMETERS: + url += "{0}/{1}".format(resource_name, "parameters") + elif metadata_type == _MetadataType.CONSTRAINTS: + url += "{0}/{1}".format(resource_name, "constraints") else: - raise ValueError("Wrong metadataType parameter value of " + metadataType) + raise ValueError("Wrong metadata_type parameter value of " + metadata_type) - url += "?taskName=" + taskName - - headers = {'X-Auth-Token': token} + url += "?taskName=" + task_name + headers = {'X-Auth-Token': self.user.token} res = requests.get(url, headers=headers, stream=True) if res.status_code < 200 or res.status_code >= 300: raise Exception("Error when getting metadata from SciQuery API.\nHttp Response 
from SciQuery API " + - "returned status code " + str(res.status_code) + ":\n" + res.content.decode()); + "returned status code " + str(res.status_code) + ":\n" + res.content.decode()) else: res = json.loads(res.content.decode()) result = res['Result'][0] - if format == "pandas": - df = pd.DataFrame(result['Data'], columns=[c.upper() for c in result['ColumnNames']]) - df.name = result['TableName'] - return df - else: - return result - else: - raise Exception("User token is not defined. First log into SciServer.") - - -def getTablesMetadata(rdbComputeDomain, databaseContextName, format="pandas"): - """ - Gets metadata related to tables in a particular database belonging to an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) - that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, - or "dict" for a dictionary object. - :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: metadata = SciQuery.getTablesMetadata(rdbComputeDomain, databaseContextName) - - .. seealso:: SciQuery.getTableNames, SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames - """ - return _getMetadata(rdbComputeDomain, databaseContextName, metadataType=MetadataType.TABLES, format=format) - -def getTableNames(rdbComputeDomain, databaseContextName): - """ - Gets a list of the names of tables in a particular database belonging to an RDBComputeDomain. 
- - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) - that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :return: array of table names (strings) - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: tablenames = getTableNames(rdbComputeDomain, databaseContextName) - - .. seealso:: SciQuery.getTablesMetadata, SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames - """ - tables = getTablesMetadata(rdbComputeDomain, databaseContextName, format="pandas") - return [name for name in tables['TABLE_NAME']] - - -def getViewsMetadata(rdbComputeDomain, databaseContextName, format="pandas"): - """ - Gets metadata related to views in a particular database belonging to an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) that - defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" for - a dictionary object. - :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: metadata = SciQuery.getViewsMetadata(rdbComputeDomain, databaseContextName) - - .. 
seealso:: SciQuery.getTablesMetadata, SciQuery.getViewNames, SciQuery.getDatabasesMetadata - """ - return _getMetadata(rdbComputeDomain, databaseContextName, metadataType=MetadataType.VIEWS, format=format) - - -def getViewNames(rdbComputeDomain, databaseContextName): - """ - Gets a list of the names of views in a particular database belonging to an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) - that defines an RDBComputeDomain. A list of - these kind of objects available to the user is returned by the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :return: array of view names (strings) - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: viewnames = SciQuery.getViewNames(rdbComputeDomain, databaseContextName) - - .. seealso:: SciQuery.getViewsMetadata, SciQuery.getTablesMetadata, SciQuery.getDatabasesMetadata - """ - tables = getViewsMetadata(rdbComputeDomain, databaseContextName, format="pandas") - return [name for name in tables['TABLE_NAME']] + df = pd.DataFrame(result['Data'], columns=[c.upper() for c in result['ColumnNames']]) + df.name = result['TableName'] + return df + def get_tables_metadata(self, database=None, rdb_compute_domain=None): + """ + Gets metadata related to tables in a particular database belonging to an RDBComputeDomain. + """ + return self._get_metadata(rdb_compute_domain, database, metadata_type=_MetadataType.TABLES) -def getRoutinesMetadata(rdbComputeDomain, databaseContextName, format="pandas"): - """ - Gets metadata related to routines or functions in a particular database belonging to an RDBComputeDomain. 
- - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) that - defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" for - a dictionary object. - :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: metadata = SciQuery.getRoutinesMetadata(rdbComputeDomain, databaseContextName) - - .. seealso:: SciQuery.getTablesMetadata, SciQuery.getViewsMetadata, SciQuery.getDatabasesMetadata - """ - return _getMetadata(rdbComputeDomain, databaseContextName, metadataType=MetadataType.ROUTINES, format=format) + def get_table_names(self, database=None, rdb_compute_domain=None): + """ + Gets a list of the names of tables in a particular database belonging to an RDBComputeDomain. + """ + tables = self.get_tables_metadata(database, rdb_compute_domain) + return [name for name in tables['TABLE_NAME']] + def get_views_metadata(self, database=None, rdb_compute_domain=None): + """ + Gets metadata related to views in a particular database belonging to an RDBComputeDomain. + """ + return self._get_metadata(rdb_compute_domain, database, metadata_type=_MetadataType.VIEWS) -def getRoutineNames(rdbComputeDomain, databaseContextName): - """ - Gets a list of the names of routines or functions in a particular database belonging to an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) - that defines an RDBComputeDomain. 
A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :return: array of routine names (strings) - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: routinenames = getRoutineNames(rdbComputeDomain, databaseContextName) - - .. seealso:: SciQuery.getTableNames, SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames - """ - routines = getRoutinesMetadata(rdbComputeDomain, databaseContextName, format="pandas") - return [routineName for routineName in routines['ROUTINE_NAME']] + def get_view_names(self, database=None, rdb_compute_domain=None): + """ + Gets a list of the names of views in a particular database belonging to an RDBComputeDomain. + """ + tables = self.get_views_metadata(database, rdb_compute_domain) + return [name for name in tables['TABLE_NAME']] + def get_routines_metadata(self, database=None, rdb_compute_domain=None): + """ + Gets metadata related to routines or functions in a particular database belonging to an RDBComputeDomain. + """ + return self._get_metadata(rdb_compute_domain, database, metadata_type=_MetadataType.ROUTINES) -def getColumnsMetadata(rdbComputeDomain, databaseContextName, tableName, format="pandas"): - """ - Gets metadata related to columns in a particular database table belonging to an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) - that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :param tableName: name (string) of the database table. - :param format: String that defines the returned format. 
Either "pandas" for a pandas dataframe, or "dict" for - a dictionary object. - :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: metadata = SciQuery.getColumnsMetadata(rdbComputeDomain, databaseContextName, tableName) - - .. seealso:: SciQuery.getTablesMetadata, SciQuery.getViewsMetadata, SciQuery.getDatabasesMetadata - """ - return _getMetadata(rdbComputeDomain, databaseContextName, tableName, metadataType=MetadataType.COLUMNS, - format=format) + def get_routine_names(self, database=None, rdb_compute_domain=None): + """ + Gets a list of the names of routines or functions in a particular database belonging to an RDBComputeDomain. + """ + routines = self.get_routines_metadata(database, rdb_compute_domain) + return [routine_name for routine_name in routines['ROUTINE_NAME']] + def get_columns_metadata(self, table_name, database=None, rdb_compute_domain=None): + """ + Gets metadata related to columns in a particular database table belonging to an RDBComputeDomain. + """ + return self._get_metadata(rdb_compute_domain, database, table_name, metadata_type=_MetadataType.COLUMNS) -def getColumnNames(rdbComputeDomain, databaseContextName, tableName): - """ - Gets a list of the names of table columns in a particular database belonging to an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) - that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. 
- :param tableName: name (string) of the database table - :return: array of column names (strings) - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: columnnames = SciQuery.getColumnNames(rdbComputeDomain, databaseContextName, tableName) - - .. seealso:: SciQuery.getTableNames, SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames - """ - columns = getColumnsMetadata(rdbComputeDomain, databaseContextName, tableName, format="pandas") - return [columnName for columnName in columns['COLUMN_NAME']] + def get_column_names(self, table_name, database=None, rdb_compute_domain=None): + """ + Gets a list of the names of table columns in a particular database belonging to an RDBComputeDomain. + """ + columns = self.get_columns_metadata(table_name, database, rdb_compute_domain) + return [columnName for columnName in columns['COLUMN_NAME']] + def get_constraints_metadata(self, table_name, database=None, rdb_compute_domain=None): + """ + Gets metadata related to table constraints in a particular database table belonging to an RDBComputeDomain. + """ + return self._get_metadata(rdb_compute_domain, database, table_name, metadata_type=_MetadataType.CONSTRAINTS) -def getConstraintsMetadata(rdbComputeDomain, databaseContextName, tableName, format="pandas"): - """ - Gets metadata related to table constraints in a particular database table belonging to an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) - that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :param tableName: name (string) of the database table. - :param format: String that defines the returned format. 
Either "pandas" for a pandas dataframe, or "dict" for - a dictionary object. - :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: metadata = SciQuery.getConstraintsMetadata(rdbComputeDomain, databaseContextName, tableName) - - .. seealso:: SciQuery.getTablesMetadata, SciQuery.getColumnsMetadata, SciQuery.getDatabasesMetadata - """ - return _getMetadata(rdbComputeDomain, databaseContextName, tableName, metadataType=MetadataType.CONSTRAINTS, - format=format) + def get_constraint_names(self, table_name, database=None, rdb_compute_domain=None): + """ + Gets a list of the names of table constraints in a particular database belonging to an RDBComputeDomain. + """ + constraints = self.get_constraints_metadata(table_name, database, rdb_compute_domain) + return [constraintName for constraintName in constraints['CONSTRAINT_NAME']] + def get_routine_parameters_metadata(self, routine_name, database=None, rdb_compute_domain=None): + """ + Gets metadata related to routine parameters in a particular database belonging to an RDBComputeDomain. + """ + return self._get_metadata(rdb_compute_domain, database, routine_name, metadata_type=_MetadataType.PARAMETERS) -def getConstraintNames(rdbComputeDomain, databaseContextName, tableName): - """ - Gets a list of the names of table constraints in a particular database belonging to an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) - that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :param tableName: name (string) of the database table. 
- :return: array of constraint names (strings) - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: names = SciQuery.getConstraintNames(rdbComputeDomain, databaseContextName, tableName) - - .. seealso:: SciQuery.getTableNames, SciQuery.getColumnNames, SciQuery.getDatabasesMetadata - """ - constraints = getConstraintsMetadata(rdbComputeDomain, databaseContextName, tableName, format="pandas") - return [constraintName for constraintName in constraints['CONSTRAINT_NAME']] + def get_routine_parameter_names(self, routine_name, database=None, rdb_compute_domain=None): + """ + Gets a list of the names of routine parameters in a particular database belonging to an RDBComputeDomain. + """ + parameters = self.get_routine_parameters_metadata(routine_name, database, rdb_compute_domain) + return [name for name in parameters['PARAMETER_NAME']] + def __str__(self): + return "SciQuery instance with rdb_compute_domains = {})".format(self._rdb_compute_domains) -def getRoutineParametersMetadata(rdbComputeDomain, databaseContextName, routineName, format="pandas"): - """ - Gets metadata related to routine parameters in a particular database belonging to an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) - that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :param routineName: name (string) of the routine or function. - :param format: String that defines the returned format. Either "pandas" for a pandas dataframe, or "dict" - for a dictionary object. - :return: pandas dataframe or dict (depending of the value of 'param') containing associated metadata. 
- :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: metadata = SciQuery.getRoutineParametersMetadata(rdbComputeDomain, databaseContextName, routineName) - - .. seealso:: SciQuery.getTablesMetadata, SciQuery.getColumnsMetadata, SciQuery.getDatabasesMetadata - """ - return _getMetadata(rdbComputeDomain, databaseContextName, routineName, metadataType=MetadataType.PARAMETERS, - format=format) + def __repr__(self): + return "SciQuery(rdb_compute_domains = {})".format(self._rdb_compute_domains) -def getRoutineParameterNames(rdbComputeDomain, databaseContextName, routineName): +class _MetadataType: """ - Gets a list of the names of routine parameters in a particular database belonging to an RDBComputeDomain. - - :param rdbComputeDomain: Name (string) of a relational database (RDB) compute domain, or an object (dictionary) - that defines an RDBComputeDomain. A list of these kind of objects available to the user is returned by - the function 'getRDBComputeDomains'. - :param databaseContextName: name (string) of the database. - :param routineName: name (string) of the routine or function. - :return: array of parameter names (strings) - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. - :example: names = SciQuery.getRoutineParameterNames(rdbComputeDomain, databaseContextName, routineName) - - .. seealso:: SciQuery.getTableNames, SciQuery.getDatabasesMetadata, Sciquery.getDatabaseNames + Contains a set of metadata types. 
""" - parameters = getRoutineParametersMetadata(rdbComputeDomain, databaseContextName, routineName, format="pandas") - return [parametersName for parametersName in parameters['SPECIFIC_NAME']] + TABLES = "TABLES" + VIEWS = "VIEWS" + COLUMNS = "COLUMNS" + ROUTINES = "ROUTINES" + CONSTRAINTS = "CONSTRAINTS" + PARAMETERS = "PARAMETERS" From a30d075853fc155c160e9fd9345d25897372dba8 Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Fri, 3 Jun 2022 18:51:32 -0400 Subject: [PATCH 06/13] fixed missing token --- py3/SciServer/SciQuery.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py index cec1a25..4ce4f25 100644 --- a/py3/SciServer/SciQuery.py +++ b/py3/SciServer/SciQuery.py @@ -740,7 +740,10 @@ def get_token() -> str: @staticmethod def get_user() -> Authentication.KeystoneUser: - return Authentication.getKeystoneUserWithToken(SciQuery.get_token()) + token = SciQuery.get_token() + user = Authentication.getKeystoneUserWithToken(token) + user.token = token + return user def set(self, rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None, From 72c427af456ac25b29f35139741db6ed0326e4f9 Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Fri, 3 Jun 2022 18:51:57 -0400 Subject: [PATCH 07/13] assign missing token --- py3/SciServer/Authentication.py | 1 + 1 file changed, 1 insertion(+) diff --git a/py3/SciServer/Authentication.py b/py3/SciServer/Authentication.py index a02f6c7..e4af767 100644 --- a/py3/SciServer/Authentication.py +++ b/py3/SciServer/Authentication.py @@ -66,6 +66,7 @@ def getKeystoneUserWithToken(token): ksu = KeystoneUser() ksu.userName = responseJson["token"]["user"]["name"] ksu.id = responseJson["token"]["user"]["id"] + ksu.token = token keystoneUser.token = token; keystoneUser.userName = ksu.userName keystoneUser.id = ksu.id From cc02c0e378d05260bd4aaaf09e1a54c9b91e095d Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Sun, 5 Jun 2022 10:45:28 -0400 Subject: [PATCH 08/13] adding 
poll_time to SciQuery class. --- py3/SciServer/SciQuery.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py index 4ce4f25..a61ca8b 100644 --- a/py3/SciServer/SciQuery.py +++ b/py3/SciServer/SciQuery.py @@ -722,6 +722,7 @@ def __init__(self, self.user = SciQuery.get_user() self.verbose = verbose self.hard_fail = hard_fail + self.poll_time = 1.0 self._file_service = None self._results_base_path = None self._outputs = None @@ -1114,7 +1115,6 @@ def execute_query(self, results_base_path: str = None, rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None, job_alias: str = "", - poll_time: float = 1.0, file_service: str = None) -> pd.DataFrame: """ Returns the query result (as a Pandas data frame) of a sql query submitted as a job to a @@ -1133,7 +1133,6 @@ def execute_query(self, RDBComputeDomain, or a dictionary containing all the attributes of an object of class RDBComputeDomain. If set to None, then the currently set value of rdb_compute_domain in the SciQuery object is internally used. :param job_alias: alias (string) of job, defined by the user. - :param poll_time: time (float) in seconds between consecutive requests for updates in the jobs status. :param file_service: a File Service defines an available file system where query result sets can be written into. This parameter can be its name or identifier (string), or a dictionary defining a file service. If set to None, then the currently set value of file_service in the SciQuery object is internally used. @@ -1146,8 +1145,9 @@ def execute_query(self, results_base_path=results_base_path, job_alias=job_alias, file_service=file_service) - - job = self.wait_for_job(job_id, verbose=False, poll_time=poll_time) + if self.verbose: + print("Query was submitted as a job with id = " + job_id) + job = self.wait_for_job(job_id, verbose=False, poll_time=self.poll_time) if job.status > 32: messages = ". 
".join(job.message_list) if (job.status == 64): @@ -1242,7 +1242,6 @@ def wait_for_job(self, job_id, verbose = False, poll_time=1.0): .. seealso:: SciQuery.get_job_status, SciQuery.getJobDescription """ - #return Jobs.waitForJob(jobId=jobId, verbose=verbose, pollTime=poll_time) min_poll_time = 0.1 # in seconds while True: if verbose: From a3b6328ac9be292b22b2fbcf4859d857d35972ea Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Mon, 6 Jun 2022 10:55:04 -0400 Subject: [PATCH 09/13] Improve messages in wait_for_job --- py3/SciServer/SciQuery.py | 40 ++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py index a61ca8b..1eb2edc 100644 --- a/py3/SciServer/SciQuery.py +++ b/py3/SciServer/SciQuery.py @@ -429,7 +429,7 @@ def get_json_output(self, output: Union[Output, int, str] = 0) -> dict: data_dict = json.loads(self.get_output_as_string(output)) return data_dict.get("Result") - def get_dataframe_from_output(self, output: Union[Output, int] = 0, result_index: int = 0) -> list: + def get_dataframe_from_output(self, output: Union[Output, int] = 0, result_index: int = 0) -> pd.DataFrame: out = self._get_output_from_index(output) if isinstance(output, int) else output if out.output_type == OutputType.FILE_JSON: result = self.get_json_output(out)[result_index] @@ -692,7 +692,9 @@ def __init__(self, results_base_path: str = None, outputs: Outputs = None, verbose: bool = True, - hard_fail: bool = False): + hard_fail: bool = False, + poll_time: float = 1.0 + ): """ Creates instance of SciQuery class. @@ -717,6 +719,7 @@ def __init__(self, :param hard_fail: Boolean parameter. If True, exceptions will be raised in case of errors during instantiation. If False, then no exceptions are raised, and warnings might be showed instead (depending on the value of the verbose parameter). + :param poll_time: time (float) in seconds between consecutive requests for updates in the jobs status. 
""" self.user = SciQuery.get_user() @@ -753,7 +756,9 @@ def set(self, results_base_path: str = None, outputs: Outputs = None, verbose: bool = None, - hard_fail: bool = None): + hard_fail: bool = None, + poll_time: float = None + ): """ Sets or refreshes the parameters in the SciQuery object, all at once. @@ -776,10 +781,12 @@ def set(self, :param hard_fail: Boolean parameter. If True, exceptions will be raised in case of errors during instantiation. If False, then no exceptions are raised, and warnings might be showed instead (depending on the value of the verbose parameter). + :param poll_time: time (float) in seconds between consecutive requests for updates in the jobs status. """ self.verbose = verbose if verbose else self.verbose self.hard_fail = hard_fail if hard_fail else self.hard_fail + self.poll_time = poll_time if poll_time else self.poll_time # set or refresh current _rdb_compute_domains try: @@ -1147,12 +1154,12 @@ def execute_query(self, file_service=file_service) if self.verbose: print("Query was submitted as a job with id = " + job_id) - job = self.wait_for_job(job_id, verbose=False, poll_time=self.poll_time) + job = self.wait_for_job(job_id, verbose=False) if job.status > 32: messages = ". ".join(job.message_list) - if (job.status == 64): + if job.status == 64: raise Exception("Query ended with an error. " + messages) - if (job.status == 128): + if job.status == 128: raise Exception("Query was cancelled. " + messages) df = job.get_dataframe_from_output(0) @@ -1225,34 +1232,29 @@ def cancel_job(job): else: raise NameError("Invalid type for input parameter 'job'.") - def wait_for_job(self, job_id, verbose = False, poll_time=1.0): + def wait_for_job(self, job_id, verbose = False): """ Queries the job status regularly and waits until the job is completed. :param job_id: id of job (integer) :param verbose: if True, will print "wait" messages on the screen while the job is still running. If False, it will suppress the printing of messages on the screen. 
- :param poll_time: idle time interval (integer, in seconds) before querying again for the job status. Minimum - value allowed is 0.1 seconds. - :return: After the job is finished, returns a dictionary object containing the job status. + :return: After the job is finished, returns an object of class RDBJob, containing the job definition. :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that purpose). Throws an exception if the HTTP request to the JOBM API returns an error. - :example: jobStatus = SciQuery.wait_for_job(jobId) - - .. seealso:: SciQuery.get_job_status, SciQuery.getJobDescription """ - min_poll_time = 0.1 # in seconds + t = max(0.1, self.poll_time) + wait_message = "Waiting" while True: - if verbose: - print("Waiting...") job_desc = Jobs.getJobDescription(job_id) if job_desc.get("status") >= 32: - if verbose: - print("Done!") return RDBJob(job_desc) else: - time.sleep(max(min_poll_time, poll_time)) + if verbose: + wait_message += "." + print(wait_message, end="\r") + time.sleep(t) ### METADATA From db67c7a3ddd0db40095669a275353ad1df6b645b Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Wed, 8 Jun 2022 13:53:27 -0400 Subject: [PATCH 10/13] fixed docs, updated minimum polling time. 
--- py3/SciServer/SciQuery.py | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py index 1eb2edc..83cc690 100644 --- a/py3/SciServer/SciQuery.py +++ b/py3/SciServer/SciQuery.py @@ -725,7 +725,7 @@ def __init__(self, self.user = SciQuery.get_user() self.verbose = verbose self.hard_fail = hard_fail - self.poll_time = 1.0 + self.poll_time = poll_time self._file_service = None self._results_base_path = None self._outputs = None @@ -733,7 +733,7 @@ def __init__(self, self._rdb_compute_domain = None self._database = None self.refresh_date = None - self.set(rdb_compute_domain, database, file_service, results_base_path, outputs, verbose, hard_fail) + self.set(rdb_compute_domain, database, file_service, results_base_path, outputs, verbose, hard_fail, poll_time) @staticmethod def get_token() -> str: @@ -864,9 +864,6 @@ def get_rdb_compute_domains(result_format: str = 'class') -> RDBComputeDomains: of an RDBComputeDomain object. :return: an object of class RDBComputeDomains, or a list of dictionaries, each of them containing the attributes of an RDBComputeDomain object. - :raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that - purpose). Throws an exception if the HTTP request to the JOBM API returns an error. 
- :example: rdb_compute_domains = SciQuery.get_rdb_compute_domains(); """ token = SciQuery.get_user().token @@ -880,8 +877,8 @@ def get_rdb_compute_domains(result_format: str = 'class') -> RDBComputeDomains: res = requests.get(url, headers=headers, stream=True) if res.status_code != 200: raise Exception( - "Error when getting RDB Compute Domains from JOBM API.\nHttp Response from JOBM API returned" - " status code " + str(res.status_code) + ":\n" + res.content.decode()) + "Error when getting RDB Compute Domains from the SciQuery API.\nHttp Response from the SciQuery API " + "returned status code " + str(res.status_code) + ":\n" + res.content.decode()) else: arr = json.loads(res.content.decode()) if result_format == 'class': @@ -1205,10 +1202,11 @@ def get_job(job_id): @staticmethod def get_job_status(job_id): """ - Gets a dictionary with the job status as an integer value, together with its semantic meaning. The integer value is - a power of 2, that is, 1:PENDING, 2:QUEUED, 4:ACCEPTED, 8:STARTED, 16:FINISHED, 32:SUCCESS, 64:ERROR, 128:CANCELED + Gets a dictionary with the job status as an integer value, together with its semantic meaning. The integer value + is a power of 2, that is, + 1:PENDING, 2:QUEUED, 4:ACCEPTED, 8:STARTED, 16:FINISHED, 32:SUCCESS, 64:ERROR, 128:CANCELED - :param job_id: Id of job (integer). + :param job_id: Id of job (string). :return: dictionary with the integer value of the job status, as well as its semantic meaning. """ return Jobs.getJobStatus(job_id) @@ -1218,12 +1216,7 @@ def cancel_job(job): """ Cancels the execution of a job. - :param job_id: Id of the job (integer) - :raises: Throws an exception if the HTTP request to the Authentication URL returns an error. Throws an exception if - the HTTP request to the JOBM API returns an error. - :example: SciQuery.cancelJob(jobId); - - .. seealso:: SciQuery.get_job_status, SciQuery.getJobDescription + :param job: id of the job (string), or object of class RDBJob. 
""" if isinstance(job, str): Jobs.cancelJob(job) @@ -1232,7 +1225,7 @@ def cancel_job(job): else: raise NameError("Invalid type for input parameter 'job'.") - def wait_for_job(self, job_id, verbose = False): + def wait_for_job(self, job_id, verbose=False): """ Queries the job status regularly and waits until the job is completed. @@ -1244,7 +1237,7 @@ def wait_for_job(self, job_id, verbose = False): purpose). Throws an exception if the HTTP request to the JOBM API returns an error. """ - t = max(0.1, self.poll_time) + t = max(0.5, self.poll_time) wait_message = "Waiting" while True: job_desc = Jobs.getJobDescription(job_id) @@ -1256,7 +1249,7 @@ def wait_for_job(self, job_id, verbose = False): print(wait_message, end="\r") time.sleep(t) - ### METADATA + # METADATA ------------------------------------------------------------------------------------------------- def get_rdb_compute_domains_metadata(self, do_include_databases=False): """ @@ -1326,7 +1319,7 @@ def _get_metadata(self, rdb_compute_domain, database, resource_name="", metadata else: task_name = "SciScript-Python.Sciquery.get_metadata_" + metadata_type - url = Config.SciqueryURL + "/api/metadata/{0}/{1}/".format(rdb_compute_domain._racm_id, database.name); + url = Config.SciqueryURL + "/api/metadata/{0}/{1}/".format(rdb_compute_domain._racm_id, database.name) if metadata_type == _MetadataType.TABLES: url += "tables" elif metadata_type == _MetadataType.VIEWS: From 4fbb613713bfe46e9789e056cef5f24d6f605022 Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Wed, 19 Oct 2022 13:36:26 -0400 Subject: [PATCH 11/13] Adding SciQuery demo notebook. 
--- Examples/Sciquery_Demo(long).ipynb | 1175 ++++++++++++++++++++++++++++ 1 file changed, 1175 insertions(+) create mode 100644 Examples/Sciquery_Demo(long).ipynb diff --git a/Examples/Sciquery_Demo(long).ipynb b/Examples/Sciquery_Demo(long).ipynb new file mode 100644 index 0000000..1806cc8 --- /dev/null +++ b/Examples/Sciquery_Demo(long).ipynb @@ -0,0 +1,1175 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from SciServer.SciQuery import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1) Instantiation: different ways" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.0 Defining name of database and domain, for future queries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "database_name = 'BestDR16'\n", + "rdb_compute_domain_name = 'IDIES-SQLServer (Short queue)'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.1 Creating instance with default values/settings:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq = SciQuery()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.2 Ways of looking at preset instance values:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(f\"Default Domian = {sq.rdb_compute_domain}\" )\n", + "print(f\"Default Domian = {sq.get_rdb_compute_domain()}\" )\n", + "print(f\"Default Database = {sq.database}\" )\n", + "print(f\"Default Database = {sq.get_database()}\" )\n", + "print(f\"Default Output = {sq.outputs}\" )\n", + "print(f\"Default Output = {sq.get_outputs()}\" )\n", + "print(f\"Default File service = {sq.file_service.get('name')}\" )\n", + "print(f\"Default File service = 
{sq.get_file_service().get('name')}\" )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.3 Several ways of setting values:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 1.3.1 By name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.rdb_compute_domain = rdb_compute_domain_name\n", + "sq.database = database_name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 1.3.2 By object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rdb_compute_domain = sq.get_rdb_compute_domain(rdb_compute_domain_name)\n", + "sq.rdb_compute_domain = rdb_compute_domain\n", + "database = sq.get_database(database_name)\n", + "sq.database = database" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 1.3.3 Altogether by name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq = SciQuery()\n", + "sq.set(rdb_compute_domain_name, database_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 1.3.4 Altogether by object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq = SciQuery()\n", + "sq.set(rdb_compute_domain, database)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 1.3.5 During instantiation, by name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq = SciQuery(rdb_compute_domain_name, database_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 1.3.6 During instantiation, by object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq = SciQuery(rdb_compute_domain, database)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.4 Updating the information stored in SciQuery if there are external changes, like being allowed to query new domains/databases:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.refresh()\n", + "print(sq.refresh_date)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2) Exploring stored objects and associated metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.get_rdb_compute_domain_names()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.get_rdb_compute_domains_metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.get_rdb_compute_domains_metadata(do_include_databases=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "SciQuery.get_rdb_compute_domains()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.rdb_compute_domains" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rdb_compute_domain = sq.get_rdb_compute_domain(rdb_compute_domain_name)\n", + "rdb_compute_domain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rdb_compute_domain.get_metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + 
"source": [ + "rdb_compute_domain.get_database_names()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rdb_compute_domain.get_databases_metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rdb_compute_domain.databases" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "database = rdb_compute_domain.get_database(database_name)\n", + "database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "database.get_metadata()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3) Detailed Database Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.rdb_compute_domain = rdb_compute_domain_name\n", + "sq.database = database_name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "table_names = sq.get_table_names()\n", + "#table_names = sq.get_table_names(database, rdb_compute_domain)\n", + "print(table_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.get_tables_metadata()\n", + "# or \n", + "sq.get_tables_metadata(database, rdb_compute_domain)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "table_name = table_names[0]\n", + "print(sq.get_column_names(table_name))\n", + "# or \n", + "print(sq.get_column_names(table_name, database, rdb_compute_domain))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"sq.get_columns_metadata(table_name)\n", + "# or \n", + "sq.get_columns_metadata(table_name, database, rdb_compute_domain)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(sq.get_constraint_names(table_name))\n", + "# or \n", + "print(sq.get_constraint_names(table_name, database, rdb_compute_domain))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sq.get_constraints_metadata(table_name)\n", + "# or \n", + "sq.get_constraints_metadata(table_name, database, rdb_compute_domain)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "routine_names = sq.get_routine_names()\n", + "# or \n", + "routine_names = sq.get_routine_names(database, rdb_compute_domain)\n", + "print(routine_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sq.get_routines_metadata()\n", + "# or \n", + "sq.get_routines_metadata(database, rdb_compute_domain)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "routine_name = routine_names[0]\n", + "print(sq.get_routine_parameter_names(routine_name))\n", + "# or \n", + "print(sq.get_routine_parameter_names(routine_name, database, rdb_compute_domain))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sq.get_routine_parameters_metadata(routine_name)\n", + "# or \n", + "sq.get_routine_parameters_metadata(routine_name, database, rdb_compute_domain)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(sq.get_view_names())\n", + "# or \n", + "print(sq.get_view_names(database, rdb_compute_domain))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], 
+ "source": [ + "sq.get_views_metadata()\n", + "# or \n", + "sq.get_views_metadata(database, rdb_compute_domain)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4) Executing Synchronous Queries to Databases " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.1 Setting database and compute domain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.set(database=database_name, rdb_compute_domain=rdb_compute_domain_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.2 Defining Single SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sql_query = \"SELECT TOP 2 specObjID, ra, dec, z FROM specobj\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.3 Getting query result as pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "df = sq.execute_query(sql_query)\n", + "# or\n", + "#df = sq.execute_query(sql_query, database_name, rdb_compute_domain=rdb_compute_domain_name)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5) Submitting query as a batch job" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.1 Using default JSON file output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "jobid = sq.submit_query_job(sql_query=sql_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.2 Getting current job status" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.get_job_status(jobid)" + ] + }, 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.3 Waiting for job to finish before getting it back" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job = sq.wait_for_job(jobid, verbose=True)\n", + "job\n", + "\n", + "# or after job is " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.4 Getting job object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job = sq.get_job(jobid)\n", + "job" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.5 Getting list of submitted jobs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sq.get_jobs_list(top = 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.6 Getting job object and checking its properties" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job = sq.get_job(jobid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.get_job_status()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.get_metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "outputs = job.outputs\n", + "outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.get_output_path()\n", + "# or \n", + "job.get_output_path(0)\n", + "# or\n", + "job.get_output_path(job.outputs[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, 
+ "outputs": [], + "source": [ + "job.get_json_output()\n", + "# or\n", + "job.get_json_output(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.get_dataframe_from_output(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.7 Cancelling job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "jobid = sq.submit_query_job(sql_query=sql_query)\n", + "sq.cancel_job(jobid)\n", + "\n", + "# or \n", + "\n", + "#jobid = sq.submit_query_job(sql_query=sql_query)\n", + "#job = sq.get_job(jobid)\n", + "#job.cancel()\n", + "\n", + "job = sq.wait_for_job(jobid)\n", + "print(job.status_string)\n", + "print(sq.get_job_status(jobid))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "results_base_path = '/home/idies/workspace/Temporary/{}/scratch/sciqueryjobs/dir/'.format(SciQuery.get_user().userName)\n", + "# or\n", + "results_base_path = FileOutput.build_file_base_path(top_volume=\"Temporary\", \n", + " user_volume=\"scratch\",\n", + " user_volume_owner_name=SciQuery.get_user().userName, \n", + " relative_path=\"sciqueryjobs/dir/\", \n", + " add_date_ending=False)\n", + "results_base_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "name = results_base_path + \"result.json\"\n", + "output = FileOutput(name=name, output_type=OutputType.FILE_JSON, statement_indexes=1)\n", + "print(output.file)\n", + "print(output.file_base_path)\n", + "print(output.path)\n", + "print(output.get_path())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "jobid = sq.submit_query_job(sql_query=sql_query, outputs=output)\n", + "\n", + "# or \n", + "#jobid = 
sq.submit_query_job(sql_query=sql_query, results_base_path=results_base_path)\n", + "\n", + "# or \n", + "#sq.set_results_base_path(results_base_path)\n", + "#jobid = sq.submit_query_job(sql_query=sql_query)\n", + "\n", + "job = sq.wait_for_job(jobid, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.outputs[0].get_path()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "out = job.outputs[0]\n", + "out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.get_dataframe_from_output(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.get_json_output(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.9 Submitting query as batch job, with output to table in mydb database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sql_query = \"SELECT TOP 2 specObjID, ra, dec, z FROM specobj\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "output_table = \"results12e\"\n", + "output_rdb_compute_domain_name = rdb_compute_domain_name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# setting output database directly to mydb:\n", + "\n", + "output_database_name = SciQuery.get_mydb_name() \n", + "print(output_database_name)\n", + "\n", + "# verifying that it is in fact a database contained in the output rdb compute domain:\n", + "\n", + "print(output_database_name in sq.get_database_names(output_rdb_compute_domain_name))\n", + 
"print(sq.get_rdb_compute_domain(output_rdb_compute_domain_name).get_database(output_database_name).name)\n", + "\n", + "# or verifying that it is in fact the default database in the output rdb compute domain:\n", + "\n", + "output_database_name = sq.get_rdb_compute_domain(output_rdb_compute_domain_name).get_default_database().name\n", + "print(output_database_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "output = DatabaseTableOutput(table=output_table, database=output_database_name, rdb_compute_domain=output_rdb_compute_domain_name)\n", + "output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "jobid = sq.submit_query_job(sql_query=sql_query, outputs=output)\n", + "job = sq.wait_for_job(jobid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(job.message_list)\n", + "print(job.targets)\n", + "job.get_metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# looking at the contents of the output table\n", + "\n", + "job.get_dataframe_from_output(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# or directly making a query to see the contents:\n", + "\n", + "sql_query = \"SELECT * FROM \" + output_table\n", + "sq.execute_query(sql_query, database= output_database_name, rdb_compute_domain= output_rdb_compute_domain_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.10 Submitting a multi-query sql statement as a batch job, with multiple outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sql_query = \"SELECT TOP 2 
specObjID, ra,dec, z FROM specobj; SELECT TOP 2 specobjid, petromag_u, petromag_r FROM specphoto;\"\n", + "job_alias = \"multi-query\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "out1 = FileOutput(name=\"query1.json\", output_type=OutputType.FILE_JSON, statement_indexes=1)\n", + "out2 = FileOutput(name=\"query2.json\", output_type=OutputType.FILE_JSON, statement_indexes=2)\n", + "out3 = FileOutput(name=\"query12.json\", output_type=OutputType.FILE_JSON, statement_indexes=[1,2])\n", + "\n", + "outputs = Outputs(out1, out2, out3) \n", + "# or\n", + "outputs = Outputs([out1, out2, out3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "results_base_path = '/home/idies/workspace/Temporary/{}/scratch/sciqueryjobs/'.format(SciQuery.get_user().userName)\n", + "# or\n", + "sq.results_base_path = results_base_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "jobid = sq.submit_query_job(sql_query=sql_query, outputs=outputs, results_base_path=results_base_path, job_alias=job_alias)\n", + "job = sq.wait_for_job(jobid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.get_json_output(0)\n", + "# or\n", + "job.get_json_output(job.outputs[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.get_json_output(1)\n", + "# or\n", + "job.get_json_output(job.outputs[1])" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "job.get_json_output(2)\n", + "# or\n", + "job.get_json_output(job.outputs[2])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8 (py38)", + "language": "python", + "name": "py38" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 076f2439d56337d0d13b7857fc2e3449f8c5b990 Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Wed, 26 Oct 2022 15:34:12 -0400 Subject: [PATCH 12/13] Add write_job_id parameter in execute_query function. Updated setting of path in FileOutput. --- py3/SciServer/SciQuery.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py index 83cc690..66245fe 100644 --- a/py3/SciServer/SciQuery.py +++ b/py3/SciServer/SciQuery.py @@ -6,7 +6,7 @@ from collections.abc import Iterable from datetime import datetime import warnings -from pathlib import Path +from pathlib import PurePosixPath from typing import Union, List import time @@ -105,7 +105,7 @@ def __init__(self, if not name: raise NameError("Input parameter name cannot be empty or None") name = name.rstrip("/") - file_path = Path(name) + file_path = PurePosixPath(name) if name == file_path.name: # means no path included in 'name' input parameter self.file_base_path = None self.file = name @@ -113,7 +113,7 @@ def __init__(self, self.file_service_path = None else: if not name.startswith(Config.ComputeWorkDir): - file_path = Path(Config.ComputeWorkDir + name) # in case it is relative path + file_path = 
PurePosixPath(Config.ComputeWorkDir + name) # in case it is relative path self.file_base_path = str(file_path.parent) self.file_base_path = self.file_base_path if self.file_base_path.endswith("/") \ else self.file_base_path + "/" @@ -157,9 +157,9 @@ def build_file_base_path(top_volume: str = "Temporary", if user_volume: if not user_volume_owner_name: user_volume_owner_name = SciQuery.get_user().userName - path = str(Path(Config.ComputeWorkDir, top_volume, user_volume_owner_name, user_volume, relative_path)) + path = str(PurePosixPath(Config.ComputeWorkDir, top_volume, user_volume_owner_name, user_volume, relative_path)) else: - path = str(Path(Config.ComputeWorkDir, top_volume, relative_path)) + path = str(PurePosixPath(Config.ComputeWorkDir, top_volume, relative_path)) return path if path.endswith("/") else path + "/" @staticmethod @@ -440,7 +440,7 @@ def get_dataframe_from_output(self, output: Union[Output, int] = 0, result_index elif out.output_type == OutputType.DATABASE_TABLE: sq = SciQuery(rdb_compute_domain=out.rdb_compute_domain_name, database=out.database) query = f"select * from {out.table};" - df = sq.execute_query(query) + df = sq.execute_query(query, write_job_id=False) else: raise Exception(f"Output type {out.output_type} not supported") return df @@ -1119,7 +1119,8 @@ def execute_query(self, results_base_path: str = None, rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None, job_alias: str = "", - file_service: str = None) -> pd.DataFrame: + file_service: str = None, + write_job_id = True) -> pd.DataFrame: """ Returns the query result (as a Pandas data frame) of a sql query submitted as a job to a relational database (RDB) compute domain. @@ -1140,6 +1141,10 @@ def execute_query(self, :param file_service: a File Service defines an available file system where query result sets can be written into. This parameter can be its name or identifier (string), or a dictionary defining a file service. 
If set to None, then the currently set value of file_service in the SciQuery object is internally used. + :param write_job_id: if True, the job id will be written on the screen, just before returning the result. + The job id won't be written if write_job_id = False. + into. This parameter can be its name or identifier (string), or a dictionary defining a file service. + If set to None, then the currently set value of file_service in the SciQuery object is internally used. :return: Pandas data frame containing the result of the query. """ output = FileOutput("result1.json", OutputType.FILE_JSON, 1) @@ -1149,7 +1154,7 @@ def execute_query(self, results_base_path=results_base_path, job_alias=job_alias, file_service=file_service) - if self.verbose: + if write_job_id: print("Query was submitted as a job with id = " + job_id) job = self.wait_for_job(job_id, verbose=False) if job.status > 32: From ec85792cdfcdcc2bde390b1d146403caa404a388 Mon Sep 17 00:00:00 2001 From: mtaghiza Date: Tue, 25 Jun 2024 08:58:41 -0400 Subject: [PATCH 13/13] Updating documentation. --- README.md | 4 + docs_sphinx/SciServer.rst | 8 + docs_sphinx/conf.py | 9 +- py2/SciServer/Config.py | 2 +- py2/SciServer/__init__.py | 2 +- py2/setup.py | 2 +- py3/SciServer/Config.py | 2 +- py3/SciServer/SciQuery.py | 370 +++++++++++++++++++++++++------------- py3/SciServer/__init__.py | 7 +- py3/setup.py | 2 +- requirements.txt | 1 + 11 files changed, 277 insertions(+), 132 deletions(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index 6b7b5af..c23868c 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ Some SciServer tools you can access with this package: * [CasJobs](http://skyserver.sdss.org/CasJobs): Database storage and querying. +* SciQuery: Advanced Database storage and querying. + * [SciDrive](http://www.scidrive.org/): Drag-and-drop file storage and sharing. * [SkyServer](http://skyserver.sdss.org/): Access to the SDSS astronomical survey. 
@@ -37,6 +39,8 @@ Authors: Gerard Lemson, Manuchehr Taghizadeh-Popp. ## 3) Installation: +First, add required packages with `pip install -r requirements.txt` + There are 2 possibilities: automatic or manual installation. ### a) Automatic Installation and Update: diff --git a/docs_sphinx/SciServer.rst b/docs_sphinx/SciServer.rst index fa71e05..e42ad9e 100644 --- a/docs_sphinx/SciServer.rst +++ b/docs_sphinx/SciServer.rst @@ -14,6 +14,14 @@ SciServer.CasJobs module :undoc-members: :show-inheritance: +SciServer.SciQuery module +------------------------- + +.. automodule:: SciServer.SciQuery + :members: + :undoc-members: + :show-inheritance: + SciServer.Config module ----------------------- diff --git a/docs_sphinx/conf.py b/docs_sphinx/conf.py index 63600e8..381fc37 100644 --- a/docs_sphinx/conf.py +++ b/docs_sphinx/conf.py @@ -48,16 +48,16 @@ # General information about the project. project = 'SciScript-Python' -copyright = '2018, SciServer' +copyright = '2024, SciServer' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '2.1.0' +version = '2.2.0' # The full version, including alpha/beta/rc tags. -release = '2.1.0' +release = '2.2.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -182,6 +182,7 @@ # Output file base name for HTML help builder. htmlhelp_basename = 'sphinxdoc' +autoclass_content = 'both' # -- Options for LaTeX output --------------------------------------------- @@ -268,7 +269,7 @@ epub_title = '.' epub_author = 'Gerard Lemson, Manuchehr Taghizadeh-Popp' epub_publisher = 'Gerard Lemson, Manuchehr Taghizadeh-Popp' -epub_copyright = '2018, SciServer' +epub_copyright = '2024, SciServer' # The basename for the epub file. It defaults to the project name. #epub_basename = '.' 
diff --git a/py2/SciServer/Config.py b/py2/SciServer/Config.py index 3c78c03..c2b0c70 100644 --- a/py2/SciServer/Config.py +++ b/py2/SciServer/Config.py @@ -31,7 +31,7 @@ RacmApiURL = "https://apps.sciserver.org/racm" DataRelease = "DR14" KeystoneTokenPath = "/home/idies/keystone.token" #the path to the file containing the user's keystone token is hardcoded in the sciserver-compute environment -version = "sciserver-v2.1.0" #sciserver release version +version = "sciserver-v2.2.0" #sciserver release version ComputeJobDirectoryFile = "/home/idies/jobs.path" #the path to the file in the "Docker job container" that shows the directory path where the asynchronous compute job is being executed. def _load_config(filename): diff --git a/py2/SciServer/__init__.py b/py2/SciServer/__init__.py index bf3877c..79179af 100644 --- a/py2/SciServer/__init__.py +++ b/py2/SciServer/__init__.py @@ -41,7 +41,7 @@ **Authors**: Gerard Lemson , Manuchehr Taghizadeh-Popp -**Version**: sciserver-v2.1.0 +**Version**: sciserver-v2.2.0 """ diff --git a/py2/setup.py b/py2/setup.py index b61abb2..26e8abf 100644 --- a/py2/setup.py +++ b/py2/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages setup( name = "SciServer", - version = "2.1.0", + version = "2.2.0", packages = find_packages(), ) \ No newline at end of file diff --git a/py3/SciServer/Config.py b/py3/SciServer/Config.py index b241298..557c195 100644 --- a/py3/SciServer/Config.py +++ b/py3/SciServer/Config.py @@ -35,7 +35,7 @@ RacmApiURL = "https://apps.sciserver.org/racm" DataRelease = "DR15" KeystoneTokenPath = "/home/idies/keystone.token" #the path to the file containing the user's keystone token is hardcoded in the sciserver-compute environment -version = "sciserver-v2.1.0" #sciserver release version +version = "sciserver-v2.2.0" #sciserver release version ComputeJobDirectoryFile = "/home/idies/jobs.path" #the path to the file in the "Docker job container" that shows the directory path where the asynchronous compute job 
is being executed. ComputeUrl = "https://apps.sciserver.org/compute" SciqueryURL = "https://apps.sciserver.org/sciquery-api" diff --git a/py3/SciServer/SciQuery.py b/py3/SciServer/SciQuery.py index 66245fe..76edf39 100644 --- a/py3/SciServer/SciQuery.py +++ b/py3/SciServer/SciQuery.py @@ -35,14 +35,14 @@ def _get_file_service(file_service: str = None): class Output: - """ - Base class for output objects - """ + def __init__(self, name: str = "output.json", output_type: str = OutputType.FILE_JSON, statement_indexes: Union[int, List[int]] = 1): - + """ + Base class for output objects, including files or database tables. + """ if type(name) != str: raise TypeError("Invalid type for input parameter 'name'.") if type(output_type) != str: @@ -58,8 +58,9 @@ def set_statement_indexes(self, statement_indexes: Union[int, List[int]] = 1): """ Sets the index(es) of the sql statement(s) within the input query, whose result-set(s) is(are) going to be written into this Output. + :param statement_indexes: integer or list of integers, which are the indices (starting with 1) of the sql - statements within the input query, whose resultsets are going to be written into this Output. + statements within the input query, whose resultsets are going to be written into this Output. """ if not isinstance(statement_indexes, Iterable): statement_indexes = [statement_indexes] @@ -77,9 +78,6 @@ def __repr__(self): class FileOutput(Output): - """ - Defines the output of a database query into a file. - """ def __init__(self, name: str = "output.json", @@ -87,11 +85,13 @@ def __init__(self, statement_indexes: Union[int, List[int]] = 1, file_service: str = None): """ + Defines the output of a database query into a file. + :param name: name of the file (string), such as "result.json" :param output_type: type (string) of the file containing the query result(s) (e.g., "FILE_JSON"). 
- As set of possible values is given by the static members of class 'SciQuery.OutputTargetType' + A set of possible values is given by the static members of class 'SciQuery.OutputTargetType' :param statement_indexes: list of integers or integer. Each integer value denotes the index or position (>=1) - of the sql statements within the input query, whose resultset is going to be written into this OutputTarget + of the sql statements within the input query, whose resultset is going to be written into this OutputTarget :param file_service: string denoting name or identifier of file service where the output file is written into. """ if file_service: @@ -125,6 +125,9 @@ def __init__(self, super().__init__(name, output_type, statement_indexes) def get_path(self) -> str: + """ + Get file output path on SciServer's FileSystem + """ if self.path: return self.path else: @@ -134,7 +137,7 @@ def get_path(self) -> str: @classmethod def get_default(cls): """ - Gets an Output object filled with default values: JSON output file where only the 1st SQL statement of + Gets a default FileOutput object filled with default values: JSON output file where only the 1st SQL statement of the query is written in it. """ return cls("result.json", OutputType.FILE_JSON, 1) @@ -145,6 +148,16 @@ def build_file_base_path(top_volume: str = "Temporary", user_volume_owner_name: str = "", relative_path: str = "sciqueryjobs", add_date_ending: bool = False) -> str: + """ + Builds the base path for an output file located in the SciServer filesystem. + + :param top_volume: name (str) of top volume in SciServer's filesystem, such as "Temporary" or "Storage". + :param user_volume: name (str) of user volume in SciServer's filesystem. + :param user_volume_owner_name: name (str) of user volume owner in SciServer's filesystem. + :param relative_path: relative path (str) after /// + :param add_date_ending: if True, then the relative path is built from the current date. 
+ :return: path string + """ if not top_volume: raise NameError("Input parameter top_volume cannot be empty or None") @@ -164,6 +177,9 @@ def build_file_base_path(top_volume: str = "Temporary", @staticmethod def find_file_service(file_service: Union[str, dict] = None) -> dict: + """ + Gets FileService info + """ if isinstance(file_service, dict): file_service = file_service.get("identifier") return _get_file_service(file_service) @@ -177,9 +193,6 @@ def __repr__(self): class DatabaseTableOutput(Output): - """ - Defines the output of a database query into a database table - """ def __init__(self, table: str = "resultTable", @@ -189,15 +202,17 @@ def __init__(self, schema: str = ""): """ + Defines the output of a database query into a database table. + :param table: name of the output database table (string), such as "resultTable" :param database: name of the database (string) where the output table in created. If it is owned explicitly by - a user, then it should follow the pattern "mydb:" + a user, then it should follow the pattern "mydb:" :param statement_indexes: list of integers or integer. Each integer value denotes the index or position (>=1) :param rdb_compute_domain: name (string) of the relational database (RDB) compute domain that contains the - database, or object of class RDBComputeDomain corresponding to it. - Name of such domains available to the user is returned by the function Jobs.getRDBComputeDomainNames(). + database, or object of class RDBComputeDomain corresponding to it. + Name of such domains available to the user is returned by the function Jobs.getRDBComputeDomainNames(). 
:param schema: database schema (string) - of the sql statements within the input query, whose resultset is going to be written into this OutputTarget + of the sql statements within the input query, whose resultset is going to be written into this OutputTarget """ if type(table) != str or type(schema) != str: raise TypeError("Input parameter(s) 'table' or 'schema' should be of type string.") @@ -242,11 +257,11 @@ def __repr__(self): class Outputs(list): - """ - Contains a list of output objects, defining database query resultset outputs. - """ def __init__(self, *outputs): + """ + Contains a list of output objects, defining database query result outputs. + """ super().__init__() for output in outputs: outs = output if isinstance(output, Iterable) else [output] @@ -254,12 +269,18 @@ def __init__(self, *outputs): self.append(out) def append(self, obj): + """ + Appends an Output object to this list. + """ if isinstance(obj, Output): super().append(obj) else: raise NameError("Input object is not a subclass of the 'Output' class.") def get_target_list(self, file_base_path: str = None, file_service: str = None): + """ + Gets list of output targets. + """ targets = [] fs = FileOutput.find_file_service(file_service) for output in self: @@ -285,18 +306,21 @@ def get_target_list(self, file_base_path: str = None, file_service: str = None): @staticmethod def get_default(): + """ + Gets an OutputList with one element, consisting of a JSON FileOutput object. + """ return Outputs(FileOutput(name="result.json", output_type=OutputType.FILE_JSON, statement_indexes = [1])) class RDBJob: - """ - Contains the definition of a job consisting on a query run in a Relational Database (RDB) - """ + _JOB_STATUS_MAP = {1: "PENDING", 2: "QUEUED", 4: "ACCEPTED", 8: "STARTED", 16: "FINISHED", 32: "SUCCESS", 64: "ERROR", 128: "CANCELED"} def __init__(self, job): """ + Contains the definition of a job consisting on a query run in a Relational Database (RDB). 
+ :param job: can be the job ID (string), or a dictionary containing all the attributes of an RDBJob object. """ if type(job) != dict: @@ -337,6 +361,9 @@ def __init__(self, job): self.get_job_status = self._get_job_status_string def get_metadata(self, result_format="pandas") -> pd.DataFrame: + """ + Gets this RDBJob's metadata as a Pandas DataFrame. + """ data = [] column_names = [] if result_format == "pandas": @@ -355,16 +382,28 @@ def get_metadata(self, result_format="pandas") -> pd.DataFrame: @staticmethod def get_job_status(status: int) -> str: + """ + Gets job status string from its integer representation. + """ return RDBJob._JOB_STATUS_MAP.get(status) @staticmethod def get_job(job_id: int): + """ + Gets RDBJob object from its Id. + """ return RDBJob(Jobs.getJobDescription(job_id)) def cancel(self): + """ + Cancels this RDBJob. + """ Jobs.cancelJob(self.id) def refresh(self): + """ + Refreshes metadata and info of this RDBJob. + """ self.__init__(Jobs.getJobDescription(self.id)) def _get_job_status_string(self) -> str: @@ -400,12 +439,18 @@ def _get_output_from_index(self, ind: int): return self.outputs[ind] def get_output_path(self, output: Union[Output, int] = 0) -> str: + """ + Gets output path on SciServer's filesystem, if output type is a file. + """ out = self._get_output_from_index(output) if isinstance(output, int) else output if out.output_type == OutputType.DATABASE_TABLE: raise TypeError("Output is not a file but a database") return out.get_path() def get_output_as_string(self, output: Union[Output, int, str] = None): + """ + Gets content of output file in SciServer's filesystem as a string. 
+ """ if not isinstance(output, str): out = self._get_output_from_index(output) if isinstance(output, int) else output file_path = self.get_output_path(out) @@ -426,10 +471,16 @@ def get_output_as_string(self, output: Union[Output, int, str] = None): return data def get_json_output(self, output: Union[Output, int, str] = 0) -> dict: + """ + Gets content of output file in SciServer's filesystem as a dictionary. + """ data_dict = json.loads(self.get_output_as_string(output)) return data_dict.get("Result") def get_dataframe_from_output(self, output: Union[Output, int] = 0, result_index: int = 0) -> pd.DataFrame: + """ + Gets query output as a Pandas DataFrame. + """ out = self._get_output_from_index(output) if isinstance(output, int) else output if out.output_type == OutputType.FILE_JSON: result = self.get_json_output(out)[result_index] @@ -456,17 +507,16 @@ def __repr__(self): class Database: - """ - Defines a database context where users can run sql queries. - """ def __init__(self, rdb_compute_domain: Union[str, int, dict], database: Union[str, int, dict]): """ + Defines a database context where users can run sql queries. + :param rdb_compute_domain: Parameter that identifies the relation database domain or environment that - contains the database. Could be either its name (string), ID (integer), or a dictionary containing - the attributes of the domain. + contains the database. Could be either its name (string), ID (integer), or a dictionary containing + the attributes of the domain. :param database: defines the database. Can be either the database name (string), ID (integer), or a dictionary - containing all the attributes of an object of class Database. + containing all the attributes of an object of class Database. 
""" if type(database) not in [str, int, dict]: raise TypeError("Invalid type for input parameter 'database'.") @@ -502,6 +552,9 @@ def __init__(self, rdb_compute_domain: Union[str, int, dict], database: Union[st self.rdb_compute_domain_id = domain.get('id') def get_metadata(self) -> pd.DataFrame: + """ + Gets Database metadata. + """ data = [] column_names = ['database_name', 'database_description', 'database_vendor', 'database_id', 'rdb_compute_domain_name', 'rdb_compute_domain_id'] @@ -521,16 +574,14 @@ def __repr__(self): class RDBComputeDomain: - """ - Defines a domain or environment with databases that users are able to query. - """ def __init__(self, rdb_compute_domain: Union[str, int, dict]): """ Creates an instance of an RDBComputeDomain, which defines a domain or environment with databases that users. are able to query. + :param rdb_compute_domain: Parameter that identifies the domain. Could be either its name (string), - ID (integer), or a dictionary containing all the attributes of the domain. + ID (integer), or a dictionary containing all the attributes of the domain. """ if type(rdb_compute_domain) not in [str, int, dict]: raise TypeError("Invalid type for input parameter 'rdb_compute_domain'.") @@ -564,15 +615,19 @@ def __init__(self, rdb_compute_domain: Union[str, int, dict]): def get_database_names(self) -> list: """ - Gets a list of the names of databases in an RDBComputeDomain + Gets a list of the names of databases in an RDBComputeDomain. :return: list of database names (strings) :example: dbnames = SciQuery.get_database_names(rdbComputeDomainName); + .. 
seealso:: SciQuery.get_databases_metadata """ return [db.name for db in self.databases] def get_database(self, database: Union[str, int, dict, Database]) -> Database: + """ + Gets Database + """ if type(database) == str: dbs = [db for db in self.databases if db.name == database] elif type(database) == int: @@ -589,6 +644,9 @@ def get_database(self, database: Union[str, int, dict, Database]) -> Database: return dbs[0] def get_default_database(self) -> Database: + """ + Gets default database. + """ dbs = [db for db in self.databases if db.name == SciQuery.get_mydb_name()] if len(dbs) > 0: return dbs[0] @@ -598,6 +656,9 @@ def get_default_database(self) -> Database: raise Exception("No default database available.") def get_metadata(self, do_include_databases: bool = False) -> pd.DataFrame: + """ + Gets metadata of this RDBComputeDomain. + """ column_names = ['rdb_compute_domain_name', 'rdb_compute_domain_description', 'rdb_compute_domain_id'] data = [[self.name, self.description, self.id]] @@ -622,6 +683,7 @@ def get_databases_metadata(self) -> pd.DataFrame: Gets metadata of the databases in this RDBComputeDomain. :return: pandas dataframe with associated metadata. + .. seealso:: SciQuery.get_database_names """ dfs = [db.get_metadata() for db in self.databases] @@ -637,13 +699,14 @@ def __repr__(self): class RDBComputeDomains(list): - """ - Defines a list of RDBComputeDomains, which are domains or environments with databases that users are able to query. - """ + def __init__(self, rdb_compute_domains: Union[Iterable, RDBComputeDomain]): """ + Defines a list of RDBComputeDomains, which are domains or environments with databases that users are able to + query. + :param rdb_compute_domains: Parameter that identifies a list of RDBComputeDomain objects. - Could be either single RDBComputeDomain object, or an iterable containing multiple RDBComputeDomain objects. + Could be either single RDBComputeDomain object, or an iterable containing multiple RDBComputeDomain objects. 
""" super().__init__() domains = rdb_compute_domains if isinstance(rdb_compute_domains, Iterable) else [rdb_compute_domains] @@ -654,6 +717,9 @@ def __init__(self, rdb_compute_domains: Union[Iterable, RDBComputeDomain]): raise NameError("Input object is not of class RDBComputeDomain.") def get_rdb_compute_domain(self, rdb_compute_domain: Union[str, int, dict, RDBComputeDomain]) -> RDBComputeDomain: + """ + Gets RDBComputeDomain from this list + """ if type(rdb_compute_domain) == str: domains = [d for d in self if d.name == rdb_compute_domain] elif type(rdb_compute_domain) == int: @@ -671,6 +737,9 @@ def get_rdb_compute_domain(self, rdb_compute_domain: Union[str, int, dict, RDBCo return domains[0] def get_default_rdb_compute_domain(self) -> RDBComputeDomain: + """ + Gets default RDBComputeDomain from this list. + """ domains = [domain for domain in self if len(domain.databases) > 0] if len(domains) > 0: doms = [dom for dom in domains if dom.get_default_database().name == SciQuery.get_mydb_name()] @@ -682,9 +751,7 @@ def get_default_rdb_compute_domain(self) -> RDBComputeDomain: class SciQuery: - """ - Instance of the SciQuery app for querying relational databases. - """ + def __init__(self, rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None, database: Union[str, int, dict, Database] = None, @@ -696,29 +763,30 @@ def __init__(self, poll_time: float = 1.0 ): """ - Creates instance of SciQuery class. + Created an instance of the SciQuery app for querying relational databases. :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries. - Can be either the domain's name (string), ID (integer), an object of class RDBComputeDomain, or a dictionary - containing all the attributes of an object of class RDBComputeDomain. If set to None, a default value will be - assigned to it. 
+ Can be either the domain's name (string), ID (integer), an object of class RDBComputeDomain, or a dictionary + containing all the attributes of an object of class RDBComputeDomain. If set to None, a default value will + be assigned to it. :param database: defines the database where the queries are executed in. - Can be either the database name (string), ID (integer), an object of class Database, or a dictionary containing - all the attributes of an object of class Database. If set to None, a default value will be assigned to it. + Can be either the database name (string), ID (integer), an object of class Database, or a dictionary + containing all the attributes of an object of class Database. If set to None, a default value will be + assigned to it. :param file_service: a File Service defines an available file system where query result sets can be written - into. This parameter can be it name or identifier (string), or a dictionary defining a file service. - If set to None, a default value will be assigned to it. + into. This parameter can be it name or identifier (string), or a dictionary defining a file service. + If set to None, a default value will be assigned to it. :param results_base_path: base path (string) of the directory where the query results are written into. - Can be constructed by using FileOutput.build_file_base_path(). If set to None, a default value will be assigned - to it at the moment of running a sql query. + Can be constructed by using FileOutput.build_file_base_path(). If set to None, a default value will be + assigned to it at the moment of running a sql query. :param outputs: Defines the query(ies) output(s). Can be an object derived from the Output base class (such as - FileOutput or DatabaseTableOutput), or a list of those. If set to None, a default value (json file output) - will be assigned to it. + FileOutput or DatabaseTableOutput), or a list of those. If set to None, a default value (json file output) + will be assigned to it. 
:param verbose: Boolean parameter. If True, warning messages will be written in case of errors, in the case when - the hard_fail parameter is set to False. If False, nothing will be written. + the hard_fail parameter is set to False. If False, nothing will be written. :param hard_fail: Boolean parameter. If True, exceptions will be raised in case of errors during instantiation. - If False, then no exceptions are raised, and warnings might be showed instead - (depending on the value of the verbose parameter). + If False, then no exceptions are raised, and warnings might be showed instead + (depending on the value of the verbose parameter). :param poll_time: time (float) in seconds between consecutive requests for updates in the jobs status. """ @@ -737,6 +805,9 @@ def __init__(self, @staticmethod def get_token() -> str: + """ + Gets user's auth token. + """ token = Authentication.getToken() if token is None or token == "": raise Exception("User not has not logged into SciServer. Use 'Authentication.login'.") @@ -744,6 +815,9 @@ def get_token() -> str: @staticmethod def get_user() -> Authentication.KeystoneUser: + """ + Gets logged Keystone user info. + """ token = SciQuery.get_token() user = Authentication.getKeystoneUserWithToken(token) user.token = token @@ -763,24 +837,26 @@ def set(self, Sets or refreshes the parameters in the SciQuery object, all at once. :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries. - Can be either the domain's name (string), ID (integer), an object of class RDBComputeDomain, or a dictionary - containing all the attributes of an object of class RDBComputeDomain. If set to None, the current value - is refreshed. + Can be either the domain's name (string), ID (integer), an object of class RDBComputeDomain, or a dictionary + containing all the attributes of an object of class RDBComputeDomain. If set to None, the current value + is refreshed. 
:param database: defines the database where the queries are executed in. - Can be either the database name (string), ID (integer), an object of class Database, or a dictionary containing - all the attributes of an object of class Database. If set to None, the current value is refreshed. + Can be either the database name (string), ID (integer), an object of class Database, or a dictionary + containing all the attributes of an object of class Database. If set to None, the current value is + refreshed. :param file_service: a File Service defines an available file system where query result sets can be written - into. This parameter can be it name or identifier (string), or a dictionary defining a file service. - If set to None, the current value is refreshed. + into. This parameter can be it name or identifier (string), or a dictionary defining a file service. + If set to None, the current value is refreshed. :param results_base_path: base path (string) of the directory where the query results are written into. - Can be constructed by using FileOutput.build_file_base_path(). + Can be constructed by using FileOutput.build_file_base_path(). :param outputs: Defines the query(ies) output(s). Can be a list of Output objects, - or a single object of class Outputs. If set to None, a default value (json file output) will be assigned to it. + or a single object of class Outputs. If set to None, a default value (json file output) will be assigned to + it. :param verbose: Boolean parameter. If True, warning messages will be written in case of errors, in the case when - the hard_fail parameter is set to False. If False, nothing will be written. + the hard_fail parameter is set to False. If False, nothing will be written. :param hard_fail: Boolean parameter. If True, exceptions will be raised in case of errors during instantiation. - If False, then no exceptions are raised, and warnings might be showed instead - (depending on the value of the verbose parameter). 
+ If False, then no exceptions are raised, and warnings might be showed instead + (depending on the value of the verbose parameter). :param poll_time: time (float) in seconds between consecutive requests for updates in the jobs status. """ @@ -846,10 +922,16 @@ def _handle_exception(self, exception: Exception, extra_message: str = ""): warnings.warn(message) def refresh(self): + """ + Refreshes SciQuery instance. + """ self.set(verbose=self.verbose, hard_fail=self.hard_fail) @staticmethod def get_mydb_name(owner_name: str = None) -> str: + """ + Returns name of mydb based on the owner's name. + """ if not owner_name: owner_name = SciQuery.get_user().userName return "mydb:" + owner_name @@ -860,10 +942,10 @@ def get_rdb_compute_domains(result_format: str = 'class') -> RDBComputeDomains: Gets a list of all registered Relational Database (RDB) compute domains that the user has access to. :param result_format: If set to "class", then the returned value will be of class RDBComputeDomains. - If set to "dict", then the return value will be a list of dictionaries, each of them containing the attributes - of an RDBComputeDomain object. - :return: an object of class RDBComputeDomains, or a list of dictionaries, each of them containing the attributes - of an RDBComputeDomain object. + If set to "dict", then the return value will be a list of dictionaries, each of them containing the + attributes of an RDBComputeDomain object. + :return: an object of class RDBComputeDomains, or a list of dictionaries, each of them containing the + attributes of an RDBComputeDomain object. """ token = SciQuery.get_user().token @@ -890,6 +972,9 @@ def get_rdb_compute_domains(result_format: str = 'class') -> RDBComputeDomains: @property def rdb_compute_domains(self) -> RDBComputeDomains: + """ + Property defining the RDBComputeDomains available in this SciQuery instance. 
+ """ return self._rdb_compute_domains @rdb_compute_domains.setter @@ -902,6 +987,9 @@ def rdb_compute_domains(self, rdb_compute_domains: RDBComputeDomains): @property def rdb_compute_domain(self) -> RDBComputeDomain: + """ + Property defining the selected RDBComputeDomain in this SciQuery instance. + """ return self._rdb_compute_domain @rdb_compute_domain.setter @@ -917,9 +1005,9 @@ def get_rdb_compute_domain(self, rdb_compute_domain: Union[str, int, dict, RDBCo which is set in the SciQuery instance. :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries. - Can be either the domain's name (string), ID (integer), an object of class RDBComputeDomain, or a dictionary - containing all the attributes of an object of class RDBComputeDomain. If set to None, then the currently set - value of rdb_compute_domain in the SciQuery object is returned. + Can be either the domain's name (string), ID (integer), an object of class RDBComputeDomain, or a dictionary + containing all the attributes of an object of class RDBComputeDomain. If set to None, then the currently set + value of rdb_compute_domain in the SciQuery object is returned. :return: Object of class RDBComputeDomain. """ if rdb_compute_domain is None: @@ -927,12 +1015,18 @@ def get_rdb_compute_domain(self, rdb_compute_domain: Union[str, int, dict, RDBCo return self.rdb_compute_domains.get_rdb_compute_domain(rdb_compute_domain) def get_default_rdb_compute_domain(self): + """ + Gets default RDBComputeDomain. + """ return self.rdb_compute_domains.get_default_rdb_compute_domain() # database --------------------------------------------------- @property def database(self) -> Database: + """ + Property defining the selected database in this SciQuery instance. 
+ """ return self._database @database.setter @@ -948,15 +1042,15 @@ def get_database(self, Returns an object of class Database, either defined by the input name or identifiers, or that which is set in the SciQuery instance. - :param database: identifies the database, which this function returns as an object of class Database. - Can be either the database name (string), ID (integer), an object of class Database, or a dictionary containing - all the attributes of an object of class Database. If set to None, then the currently set value of database in - the SciQuery object is returned. + Can be either the database name (string), ID (integer), an object of class Database, or a dictionary + containing all the attributes of an object of class Database. If set to None, then the currently set value + of database in the SciQuery object is returned. :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries, - and that contains the database. Can be either the domain's name (string), ID (integer), an object of class - RDBComputeDomain, or a dictionary containing all the attributes of an object of class RDBComputeDomain. - If set to None, then the currently set value of rdb_compute_domain in the SciQuery object is internally used. + and that contains the database. Can be either the domain's name (string), ID (integer), an object of class + RDBComputeDomain, or a dictionary containing all the attributes of an object of class RDBComputeDomain. + If set to None, then the currently set value of rdb_compute_domain in the SciQuery object is internally + used. :return: Object of class Database """ if database is None: @@ -964,6 +1058,9 @@ def get_database(self, return self.get_rdb_compute_domain(rdb_compute_domain).get_database(database) def get_default_database(self, rdb_compute_domain: Union[str, int, dict, RDBComputeDomain] = None) -> Database: + """ + Gets default database. 
+ """ domain = self.get_default_rdb_compute_domain() if rdb_compute_domain is None \ else self.get_rdb_compute_domain(rdb_compute_domain) return domain.get_default_database() @@ -972,6 +1069,9 @@ def get_default_database(self, rdb_compute_domain: Union[str, int, dict, RDBComp @property def file_service(self) -> dict: + """ + Property defining the selected FileService available in this SciQuery instance. + """ return self._file_service @file_service.setter @@ -983,10 +1083,10 @@ def file_service(self, file_service: Union[str, dict]): def get_file_service(self, file_service: Union[str, dict] = None) -> dict: """ Returns the definition of a file service as a dictionary, either defined by the input name or identifiers, - or that which is set in the SciQuery instance. + or that which is set in the SciQuery instance. :param file_service: name or identifier (string) of a file service, or the dictionary with its definition. - If set to None, then the currently set value of file_service in the SciQuery object is returned. + If set to None, then the currently set value of file_service in the SciQuery object is returned. :return: dictionary with the definition of a file service. """ if file_service is None: @@ -994,12 +1094,18 @@ def get_file_service(self, file_service: Union[str, dict] = None) -> dict: return FileOutput.find_file_service(file_service) def get_default_file_service(self) -> dict: + """ + Gets default FileService. + """ return FileOutput.find_file_service() # results_base_path --------------------------------------------------- @property def results_base_path(self) -> str: + """ + Property defining the file output base path in SciServer's filesystem for this SciQuery instance. + """ return self._results_base_path @results_base_path.setter @@ -1009,15 +1115,24 @@ def results_base_path(self, results_base_path: str): self._results_base_path = results_base_path def get_results_base_path(self) -> str: + """ + Gets results base path on SciServer's filesystem.
+ """ return self._results_base_path def get_default_results_base_path(self, add_date_ending=True) -> str: + """ + Gets default results base path on SciServer's filesystem. + """ return FileOutput.build_file_base_path(add_date_ending=add_date_ending) # outputs ------------------------------------------------------------- @property def outputs(self) -> Outputs: + """ + Property defining a list of query result Output objects. + """ return self._outputs @outputs.setter @@ -1032,7 +1147,7 @@ def get_outputs(self, outputs: Union[Outputs, Output] = None) -> Outputs: which is set in the SciQuery instance. :param outputs: object of class Outputs, or iterable of output objects. If set to None, then the currently - set value of outputs in the SciQuery object is returned. + set value of outputs in the SciQuery object is returned. :return: object of class Outputs. """ if outputs is None: @@ -1040,6 +1155,9 @@ def get_outputs(self, outputs: Union[Outputs, Output] = None) -> Outputs: return Outputs(outputs) def get_default_outputs(self) -> Outputs: + """ + Gets default query outputs. + """ return Outputs.get_default() # --------------------------------------------------------------------------------------------- @@ -1059,22 +1177,24 @@ def submit_query_job(self, :param sql_query: sql query (string) :param database: defines the database where the sql query is executed in. - Can be either the database name (string), ID (integer), an object of class Database, or a dictionary containing - all the attributes of an object of class Database. If set to None, then the current value of the database field - in this SciQuery instance will be used. + Can be either the database name (string), ID (integer), an object of class Database, or a dictionary + containing all the attributes of an object of class Database. If set to None, then the current value of + the database field in this SciQuery instance will be used. :param outputs: Defines the query(ies) output(s). 
Can be an object derived from the Output base class (such as - FileOutput or DatabaseTableOutput), or a list of those. If set to None, then the current value of the outputs - field in this SciQuery instance will be used. + FileOutput or DatabaseTableOutput), or a list of those. If set to None, then the current value of the + outputs field in this SciQuery instance will be used. :param results_base_path: full path to results folder (string) where query output tables are written into. - E.g.: /home/idies/workspace/rootVOlume/username/userVolume/jobsFolder . If set to None, then its current value - in this SciQuery instance will be used. If that value is None, then a default folder will be set automatically. + E.g.: /home/idies/workspace/rootVolume/username/userVolume/jobsFolder . If set to None, then its current + value in this SciQuery instance will be used. If that value is None, then a default folder will be set + automatically. :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries, - and that contains the database. Can be either the domain's name (string), ID (integer), an object of class - RDBComputeDomain, or a dictionary containing all the attributes of an object of class RDBComputeDomain. - If set to None, then the currently set value of rdb_compute_domain in the SciQuery object is internally used. + and that contains the database. Can be either the domain's name (string), ID (integer), an object of class + RDBComputeDomain, or a dictionary containing all the attributes of an object of class RDBComputeDomain. + If set to None, then the currently set value of rdb_compute_domain in the SciQuery object is internally + used. :param file_service: a File Service defines an available file system where query result sets can be written - into. This parameter can be its name or identifier (string), or a dictionary defining a file service. 
- If set to None, then the currently set value of file_service in the SciQuery object is internally used. + into. This parameter can be its name or identifier (string), or a dictionary defining a file service. + If set to None, then the currently set value of file_service in the SciQuery object is internally used. :param job_alias: alias (string) of job, defined by the user. :return: the ID (string) that labels the job. """ @@ -1127,24 +1247,26 @@ def execute_query(self, :param sql_query: sql query (string) :param database: defines the database where the sql query is executed in. - Can be either the database name (string), ID (integer), an object of class Database, or a dictionary containing - all the attributes of an object of class Database. If set to None, then the current value of the database field - in this SciQuery instance will be used. + Can be either the database name (string), ID (integer), an object of class Database, or a dictionary + containing all the attributes of an object of class Database. If set to None, then the current value of + the database field in this SciQuery instance will be used. :param results_base_path: full path to results folder (string) where query output tables are written into. - E.g.: /home/idies/workspace/rootVOlume/username/userVolume/jobsFolder . If set to None, then its current value in - this SciQuery instance will be used. If that value is None, then a default folder will be set automatically. + E.g.: /home/idies/workspace/rootVolume/username/userVolume/jobsFolder . If set to None, then its current + value in this SciQuery instance will be used. If that value is None, then a default folder will be set + automatically. :param rdb_compute_domain: defines a domain or environment of multiple databases where users can run queries, - and that contains the database.
Can be either the domain's name (string), ID (integer), an object of class - RDBComputeDomain, or a dictionary containing all the attributes of an object of class RDBComputeDomain. - If set to None, then the currently set value of rdb_compute_domain in the SciQuery object is internally used. + and that contains the database. Can be either the domain's name (string), ID (integer), an object of class + RDBComputeDomain, or a dictionary containing all the attributes of an object of class RDBComputeDomain. + If set to None, then the currently set value of rdb_compute_domain in the SciQuery object is internally used. :param job_alias: alias (string) of job, defined by the user. :param file_service: a File Service defines an available file system where query result sets can be written - into. This parameter can be its name or identifier (string), or a dictionary defining a file service. - If set to None, then the currently set value of file_service in the SciQuery object is internally used. + into. This parameter can be its name or identifier (string), or a dictionary defining a file service. + If set to None, then the currently set value of file_service in the SciQuery object is internally used. :param write_job_id: if True, the job id will be written on the screen, just before returning the result. - The job id won;t be written if write_job_id = False. - into. This parameter can be its name or identifier (string), or a dictionary defining a file service. - If set to None, then the currently set value of file_service in the SciQuery object is internally used. + The job id won't be written if write_job_id = False. + into. This parameter can be its name or identifier (string), or a dictionary defining a file service. + If set to None, then the currently set value of file_service in the SciQuery object is internally used. + :return: Pandas data frame containing the result of the query.
""" output = FileOutput("result1.json", OutputType.FILE_JSON, 1) @@ -1175,14 +1297,14 @@ def get_jobs_list(top=5, open=None, start=None, end=None, result_format="pandas" :param top: top number of jobs (integer) returned. If top=None, then all jobs are returned. :param open: If set to 'True', then only returns jobs that have not finished executing and wrapped up - (status <= FINISHED). If set to 'False' then only returns jobs that are still running. If set to 'None', - then returns both finished and unfinished jobs. + (status <= FINISHED). If set to 'False' then only returns jobs that are still running. If set to 'None', + then returns both finished and unfinished jobs. :param start: The earliest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. - If set to 'None', then there is no lower bound on date. + If set to 'None', then there is no lower bound on date. :param end: The latest date (inclusive) to search for jobs, in string format yyyy-MM-dd hh:mm:ss.SSS. - If set to 'None', then there is no upper bound on date. + If set to 'None', then there is no upper bound on date. :param result_format: string defining the return format. "pandas" for a pandas dataframe and "list" - for a list of RDBJob objects. + for a list of RDBJob objects. :return: pandas dataframe, or list of RDBJob objects or, each containing the definition of a submitted job. """ job_dict_list = Jobs.getJobsList(top=top, open=open, start=start, end=end, type='rdb') @@ -1236,11 +1358,11 @@ def wait_for_job(self, job_id, verbose=False): :param job_id: id of job (integer) :param verbose: if True, will print "wait" messages on the screen while the job is still running. If False, it - will suppress the printing of messages on the screen. + will suppress the printing of messages on the screen. :return: After the job is finished, returns an object of class RDBJob, containing the job definition. 
:raises: Throws an exception if the user is not logged into SciServer (use Authentication.login for that - purpose). - Throws an exception if the HTTP request to the JOBM API returns an error. + purpose). + Throws an exception if the HTTP request to the JOBM API returns an error. """ t = max(0.5, self.poll_time) wait_message = "Waiting" @@ -1261,7 +1383,7 @@ def get_rdb_compute_domains_metadata(self, do_include_databases=False): Gets metadata related to all relational database (RDB) compute domains (RDBComputeDomains) available. :param do_include_databases: Boolean parameter. If True, it will return metadata related to all available - databases in each RDBComputeDomain as well. + databases in each RDBComputeDomain as well. :return: pandas dataframe containing associated metadata. """ dfs = [] @@ -1281,6 +1403,9 @@ def get_rdb_compute_domain_names(self): return [d.name for d in self.rdb_compute_domains] def get_rdb_compute_domain_metadata(self, rdb_compute_domain=None, do_include_databases=False): + """ + Gets metadata of an rdb_compute_domain. + """ return self.get_rdb_compute_domain(rdb_compute_domain).get_metadata(do_include_databases) def get_databases_metadata(self, rdb_compute_domain=None): @@ -1295,13 +1420,16 @@ def get_databases_metadata(self, rdb_compute_domain=None): return rdb_compute_domain.get_databases_metadata() def get_database_metadata(self, database=None, rdb_compute_domain=None): + """ + Gets database metadata. + """ rdb_compute_domain = self.get_rdb_compute_domain(rdb_compute_domain) database = self.get_database(database, rdb_compute_domain) return database.get_metadata() def get_database_names(self, rdb_compute_domain=None): """ - Gets a list of the names of databases in an RDBComputeDomain + Gets a list of the names of the databases in this RDBComputeDomain. 
:return: array of database names (strings) """ diff --git a/py3/SciServer/__init__.py b/py3/SciServer/__init__.py index bf3877c..87f1fcb 100644 --- a/py3/SciServer/__init__.py +++ b/py3/SciServer/__init__.py @@ -9,7 +9,10 @@ \t\tAlthough some tools accept anonymous access, you can use Authentication.login to login and access the tools and your own data and environment (after registering in the Login Portal). If you are running this package in a Jupyter Notebook in the SciServer-Compute environment, the use of Authentication.login is not necessary since it's done automatically. * `CasJobs `_: Database storage and querying. -\t\tYou can have access big databases, as well as save your data tables in your own database called 'MyDB'. The user can run synchronous or asynchronous SQL queries and get the result back as an R data-frame (using CasJobs.executeQuery or CasJobs.submitJob, respectively). Uploading of CSV files or R data-frames into a database table can be done using CasJobs.uploadCSVToTable and CasJobs.uploadDataFrameToTable, respectively. +\t\tYou can access big databases, as well as save your data tables in your own database called 'MyDB'. The user can run synchronous or asynchronous SQL queries and get the result back as a Pandas data-frame (using CasJobs.executeQuery or CasJobs.submitJob, respectively). Uploading of CSV files or data-frames into a database table can be done using CasJobs.uploadCSVToTable and CasJobs.uploadDataFrameToTable, respectively. + +* `SciQuery`: Advanced database storage and querying. +\t\tSciQuery expands the capabilities of CasJobs, as it is designed to work with databases loaded in several database management systems beyond MS SQL Server, such as PostgreSQL and MySQL. Query result tables can also be directly written as JSON or CSV files in the SciServer filesystem. * `SciDrive `_: Drag-and-drop file storage and sharing.
\t\tYou can create directories in SciDrive using SciDrive.createContainer, upload a file to SciDrive using SciDrive.upload, and share its URL with your collaborators by using SciDrive.publicUrl. @@ -41,7 +44,7 @@ **Authors**: Gerard Lemson , Manuchehr Taghizadeh-Popp -**Version**: sciserver-v2.1.0 +**Version**: sciserver-v2.2.0 """ diff --git a/py3/setup.py b/py3/setup.py index b61abb2..26e8abf 100644 --- a/py3/setup.py +++ b/py3/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages setup( name = "SciServer", - version = "2.1.0", + version = "2.2.0", packages = find_packages(), ) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cb22f83 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +cachetools \ No newline at end of file