diff --git a/constants.py b/constants.py
index ca78c4a78cf72d584544124fa2b17aa5ee4f2733..5aaf27d31f66a390f6ceb2d3cd05974991029430 100644
--- a/constants.py
+++ b/constants.py
@@ -53,3 +53,5 @@ DELETE_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_
 HOST_DATA_DIR='src_data'
 CONTAINER_DATA_DIR_ROOT='/project_data'
+GALAXY_LIBRARY_NAME = 'Project Data'
+GALAXY_LIBRARY_DESC = 'Data for current genome annotation project'
diff --git a/gga_load_data.py b/gga_load_data.py
index 74634d6d5f9d158a39ff01a30df82d8b45336bc5..d1bb0640f8a262a3bc4e08f417c9e3a5fbc6f4bf 100755
--- a/gga_load_data.py
+++ b/gga_load_data.py
@@ -69,12 +69,11 @@ class LoadData(speciesData.SpeciesData):
         """
         logging.debug("Getting 'Homo sapiens' ID in chado database")
-        get_sapiens_id_job_output_dataset_id = utilities.run_tool_and_get_single_output_dataset_id(
+        get_sapiens_id_json_output = utilities.run_tool_and_download_single_output_dataset(
            self.instance,
            tool_id=constants.GET_ORGANISMS_TOOL,  # If this version is not found, Galaxy will use the one that is found
            history_id=self.history_id,
            tool_inputs={"genus": "Homo", "species": "sapiens"})
-        get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output_dataset_id)
         logging.info("Deleting Homo 'sapiens' in the instance's chado database")
         try:
@@ -114,7 +113,7 @@ class LoadData(speciesData.SpeciesData):
 
         data_dir_root=os.path.join(self.get_species_dir(), constants.HOST_DATA_DIR)
 
-        instance = GalaxyInstance(url=self.instance_url,
+        gio = GalaxyInstance(url=self.instance_url,
                                  email=self.config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
                                  password=self.config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]
                                  )
@@ -129,14 +128,14 @@ class LoadData(speciesData.SpeciesData):
 
         if folders:
             # Delete pre-existing lib (probably created by a previous call)
-            existing = instance.libraries.get_previews(name='Project Data')
+            existing = gio.libraries.get_previews(name=constants.GALAXY_LIBRARY_NAME)
             for lib in existing:
                 if not lib.deleted:
-                    logging.info('Pre-existing "Project Data" library %s found, removing it' % lib.id)
-                    instance.libraries.delete(lib.id)
+                    logging.info('Pre-existing {0} library {1} found, removing it'.format(constants.GALAXY_LIBRARY_NAME, lib.id))
+                    gio.libraries.delete(lib.id)
 
-        logging.info("Creating new 'Project Data' library")
-        prj_lib = instance.libraries.create('Project Data', 'Data for current genome annotation project')
+        logging.info("Creating new %s library" % constants.GALAXY_LIBRARY_NAME)
+        prj_lib = gio.libraries.create(constants.GALAXY_LIBRARY_NAME, constants.GALAXY_LIBRARY_DESC)
         self.library_id = prj_lib.id  # project data folder/library
         logging.info("Library for {0}: {1}".format(self.full_name, self.library_id))
@@ -280,7 +279,7 @@ class LoadData(speciesData.SpeciesData):
             logging.info("Did not find metadata in %s " % meta_file)
         return self.get_bam_label(dirname, bam_file)
 
-    def create_galaxy_instance(self):
+    def set_galaxy_instance(self):
         """
         Test the connection to the galaxy instance for the current organism
         Exit if we cannot connect to the instance
@@ -364,18 +363,16 @@ if __name__ == "__main__":
     # Parse the config yaml file
     load_data_for_current_species.config = config
     # Set the instance url attribute -- Does not work with localhost on scratch (ALB)
-    load_data_for_current_species.instance_url = "http://localhost:{0}/sp/{1}_{2}/galaxy/".format(
+    load_data_for_current_species.instance_url = "http://localhost:{0}/sp/{1}/galaxy/".format(
        load_data_for_current_species.config[constants.CONF_ALL_HTTP_PORT],
-        load_data_for_current_species.genus_lowercase,
-        load_data_for_current_species.species)
+        load_data_for_current_species.genus_species)
 
     # Check the galaxy container state and proceed if the galaxy services are up and running
-    if utilities.check_galaxy_state(genus_lowercase=load_data_for_current_species.genus_lowercase,
-                                    species=load_data_for_current_species.species,
+    if utilities.check_galaxy_state(network_name=load_data_for_current_species.genus_species,
                                     script_dir=load_data_for_current_species.script_dir):
 
         # Create the Galaxy instance
-        load_data_for_current_species.instance = load_data_for_current_species.create_galaxy_instance()
+        load_data_for_current_species.instance = load_data_for_current_species.set_galaxy_instance()
 
         # Load the datasets into a galaxy library
         logging.info("Setting up library for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))
diff --git a/phaoexplorer_constants.py b/phaoexplorer_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..229b216d6f0267af1f69e84ff0c5fb45f75548f8
--- /dev/null
+++ b/phaoexplorer_constants.py
@@ -0,0 +1,40 @@
+### Workflows
+
+WORKFLOW_LOAD_FASTA_GFF_JBROWSE = "load_fasta_gff_jbrowse"
+WORKFLOW_BLAST = "blast"
+WORKFLOW_INTERPRO = "interpro"
+WORKFLOW_VALID_TYPES = [WORKFLOW_LOAD_FASTA_GFF_JBROWSE, WORKFLOW_BLAST, WORKFLOW_INTERPRO]
+
+### Galaxy tools
+
+ADD_ORGANISM_TOOL_NAME = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/"
+ADD_ORGANISM_TOOL_VERSION = "2.3.4+galaxy0"
+ADD_ORGANISM_TOOL_ID = ADD_ORGANISM_TOOL_NAME + ADD_ORGANISM_TOOL_VERSION
+ADD_ORGANISM_TOOL_CHANGESET_REVISION = "1f12b9650028"
+
+ADD_ANALYSIS_TOOL_NAME = "toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/"
+ADD_ANALYSIS_TOOL_VERSION = "2.3.4+galaxy0"
+ADD_ANALYSIS_TOOL_ID = ADD_ANALYSIS_TOOL_NAME + ADD_ANALYSIS_TOOL_VERSION
+ADD_ANALYSIS_TOOL_CHANGESET_REVISION = "10b2b1c70e69"
+ADD_ANALYSIS_TOOL_PARAM_PROGRAM = "Performed by Genoscope"
+ADD_ANALYSIS_TOOL_PARAM_DATE = "2021-02-24"
+
+GET_ORGANISMS_TOOL_NAME = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/"
+GET_ORGANISMS_TOOL_VERSION = "2.3.4+galaxy0"
+GET_ORGANISMS_TOOL_ID = GET_ORGANISMS_TOOL_NAME + GET_ORGANISMS_TOOL_VERSION
+GET_ORGANISMS_TOOL_CHANGESET_REVISION = "831229e6cda2"
+
+GET_ANALYSES_TOOL_NAME = "toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/"
+GET_ANALYSES_TOOL_VERSION = "2.3.4+galaxy0"
+GET_ANALYSES_TOOL_ID = GET_ANALYSES_TOOL_NAME + GET_ANALYSES_TOOL_VERSION
+GET_ANALYSES_TOOL_CHANGESET_REVISION = "a867923f555e"
+
+ANALYSIS_SYNC_TOOL_NAME = "toolshed.g2.bx.psu.edu/repos/gga/tripal_analysis_sync/analysis_sync/"
+ANALYSIS_SYNC_TOOL_VERSION = "3.2.1.0"
+ANALYSIS_SYNC_TOOL_ID = ANALYSIS_SYNC_TOOL_NAME + ANALYSIS_SYNC_TOOL_VERSION
+ANALYSIS_SYNC_TOOL_CHANGESET_REVISION = "f487ff676088"
+
+ORGANISM_SYNC_TOOL_NAME = "toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/"
+ORGANISM_SYNC_TOOL_VERSION = "3.2.1.0"
+ORGANISM_SYNC_TOOL_ID = ORGANISM_SYNC_TOOL_NAME + ORGANISM_SYNC_TOOL_VERSION
+ORGANISM_SYNC_TOOL_CHANGESET_REVISION = "afd5d92745fb"
diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py
index 13e50ff7017b3e3df7710d1b6e256d1fbf57588a..4928e73207940f0fee9c164819e77c5cf7eb6153 100755
--- a/run_workflow_phaeoexplorer.py
+++ b/run_workflow_phaeoexplorer.py
@@ -15,6 +15,8 @@ from bioblend import galaxy
 import utilities
 import speciesData
+import constants
+import phaoexplorer_constants
 
 """
 gga_init.py
@@ -22,7 +24,6 @@ gga_init.py
 Usage: $ python3 gga_init.py -i input_example.yml --config [config file] [OPTIONS]
 """
-
 class RunWorkflow(speciesData.SpeciesData):
     """
     Run a workflow into the galaxy instance's history of a given species
@@ -34,77 +35,69 @@ class RunWorkflow(speciesData.SpeciesData):
     """
 
-    def set_get_history(self):
+    def __init__(self, parameters_dictionary):
+
+        super().__init__(parameters_dictionary)
+
+        self.chado_species_name = " ".join(utilities.filter_empty_not_empty_items(
+            [self.species, self.strain, self.sex])["not_empty"])
+
+        self.abbreviation = self.genus_uppercase[0] + ". " + self.chado_species_name
+
+        self.common = self.name
+        if self.common_name is not None and self.common_name != "":
+            self.common = self.common_name
+
+        self.history_name = str(self.genus_species)
+
+        self.genome_analysis_name = "genome v{0} of {1}".format(self.genome_version, self.full_name)
+        self.genome_analysis_programversion = "genome v{0}".format(self.genome_version)
+        self.genome_analysis_sourcename = self.full_name
+
+        self.ogs_analysis_name = "OGS{0} of {1}".format(self.ogs_version, self.full_name)
+        self.ogs_analysis_programversion = "OGS{0}".format(self.ogs_version)
+        self.ogs_analysis_sourcename = self.full_name
+
+    def set_history(self):
         """
         Create or set the working history to the current species one
 
         :return:
         """
         try:
-            histories = self.instance.histories.get_histories(name=str(self.genus_species))
+            histories = self.instance.histories.get_histories(name=self.history_name)
             self.history_id = histories[0]["id"]
-            logging.debug("History ID set for {0}: {1}".format(self.full_name, self.history_id))
+            logging.debug("History ID set for {0}: {1}".format(self.history_name, self.history_id))
         except IndexError:
-            logging.info("Creating history for %s" % self.full_name)
-            self.instance.histories.create_history(name=str(self.full_name))
-            histories = self.instance.histories.get_histories(name=str(self.genus_species))
-            self.history_id = histories[0]["id"]
-            logging.debug("History ID set for {0}: {1}".format(self.full_name, self.history_id))
+            logging.info("Creating history for %s" % self.history_name)
+            history = self.instance.histories.create_history(name=self.history_name)
+            self.history_id = history["id"]
+            logging.debug("History ID set for {0}: {1}".format(self.history_name, self.history_id))
 
         return self.history_id
 
-    def get_instance_attributes(self):
-        """
-        retrieves instance attributes:
-        - working history ID
-        - libraries ID (there should only be one library!)
- - datasets IDs - - :return: - """ - - self.set_get_history() - - logging.debug("History ID: %s" % self.history_id) - libraries = self.instance.libraries.get_libraries() # normally only one library - library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library - logging.debug("Library ID: %s" % self.library_id) - instance_source_data_folders = self.instance.libraries.get_folders(library_id=library_id) - - return {"history_id": self.history_id, "library_id": library_id} - - - def connect_to_instance(self): + def set_galaxy_instance(self): """ Test the connection to the galaxy instance for the current organism Exit if we cannot connect to the instance """ - # logging.debug("Connecting to the galaxy instance (%s)" % self.instance_url) + logging.debug("Connecting to the galaxy instance (%s)" % self.instance_url) self.instance = galaxy.GalaxyInstance(url=self.instance_url, - email=self.config["galaxy_default_admin_email"], - password=self.config["galaxy_default_admin_password"] + email=self.config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL], + password=self.config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD] ) - self.instance.histories.get_histories() try: self.instance.histories.get_histories() except bioblend.ConnectionError: - logging.critical("Cannot connect to galaxy instance (%s)" % self.instance_url) + logging.critical("Cannot connect to galaxy instance (%s) " % self.instance_url) sys.exit() else: - # logging.debug("Successfully connected to galaxy instance (%s) " % self.instance_url) - return 1 - - - - def return_instance(self): - + logging.debug("Successfully connected to galaxy instance (%s) " % self.instance_url) return self.instance - - def install_changesets_revisions_for_individual_tools(self): """ @@ -115,270 +108,164 @@ class RunWorkflow(speciesData.SpeciesData): :return: """ - self.connect_to_instance() + self.set_galaxy_instance() logging.info("Validating installed individual tools versions and changesets") - # Verify that the add_organism and add_analysis versions are correct in the toolshed - add_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.4+galaxy0") - add_analysis_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.4+galaxy0") - get_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0") - get_analysis_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.4+galaxy0") + # Verify that the add_organism and add_analysis versions are correct in the instance - # changeset for 2.3.4+galaxy0 has to be manually found because there is no way to get the wanted changeset of a non installed tool via bioblend + add_organism_tool = self.instance.tools.show_tool(phaoexplorer_constants.ADD_ORGANISM_TOOL_ID) + add_analysis_tool = self.instance.tools.show_tool(phaoexplorer_constants.ADD_ANALYSIS_TOOL_ID) + get_organisms_tool = self.instance.tools.show_tool(phaoexplorer_constants.GET_ORGANISMS_TOOL_ID) + get_analyses_tool = self.instance.tools.show_tool(phaoexplorer_constants.GET_ANALYSES_TOOL_ID) + analysis_sync_tool = self.instance.tools.show_tool(phaoexplorer_constants.ANALYSIS_SYNC_TOOL_ID) + organism_sync_tool = self.instance.tools.show_tool(phaoexplorer_constants.ORGANISM_SYNC_TOOL_ID) + + # changeset for 2.3.4+galaxy0 has to be 
manually found because there is no way to get the wanted changeset of a non installed tool via bioblend # except for workflows (.ga) that already contain the changeset revisions inside the steps ids - - if get_organism_tool["version"] != "2.3.4+galaxy0": - toolshed_dict = get_organism_tool["tool_shed_repository"] - logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) - changeset_revision = "831229e6cda2" - name = toolshed_dict["name"] - owner = toolshed_dict["owner"] - toolshed = "https://" + toolshed_dict["tool_shed"] - logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name)) - - self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, - changeset_revision=changeset_revision, - install_tool_dependencies=True, - install_repository_dependencies=False, - install_resolver_dependencies=True) - - if get_analysis_tool["version"] != "2.3.4+galaxy0": - toolshed_dict = changeset_revision["tool_shed_repository"] - logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) - changeset_revision = "a867923f555e" - name = toolshed_dict["name"] - owner = toolshed_dict["owner"] - toolshed = "https://" + toolshed_dict["tool_shed"] - logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name)) - - self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, - changeset_revision=changeset_revision, - install_tool_dependencies=True, - install_repository_dependencies=False, - install_resolver_dependencies=True) - - if add_organism_tool["version"] != "2.3.4+galaxy0": - toolshed_dict = add_organism_tool["tool_shed_repository"] - logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) - changeset_revision = "1f12b9650028" - name = toolshed_dict["name"] - owner = toolshed_dict["owner"] - toolshed = "https://" + toolshed_dict["tool_shed"] - logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name)) - - self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, - changeset_revision=changeset_revision, - install_tool_dependencies=True, - install_repository_dependencies=False, - install_resolver_dependencies=True) - - if add_analysis_tool["version"] != "2.3.4+galaxy0": - toolshed_dict = add_analysis_tool["tool_shed_repository"] - logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) - changeset_revision = "10b2b1c70e69" - name = toolshed_dict["name"] - owner = toolshed_dict["owner"] - toolshed = "https://" + toolshed_dict["tool_shed"] - logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name)) - - self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, - changeset_revision=changeset_revision, - install_tool_dependencies=True, - install_repository_dependencies=False, - install_resolver_dependencies=True) - - - sync_analysis_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/tripal_analysis_sync/analysis_sync/3.2.1.0") - sync_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/3.2.1.0") - - if sync_analysis_tool["version"] != "3.2.1.0": - toolshed_dict = sync_analysis_tool["tool_shed_repository"] - logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) - changeset_revision = "f487ff676088" - name = toolshed_dict["name"] - owner = 
toolshed_dict["owner"] - toolshed = "https://" + toolshed_dict["tool_shed"] - logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name)) - - self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, - changeset_revision=changeset_revision, - install_tool_dependencies=True, - install_repository_dependencies=False, - install_resolver_dependencies=True) - - if sync_organism_tool["version"] != "3.2.1.0": - toolshed_dict = sync_organism_tool["tool_shed_repository"] - logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) - changeset_revision = "afd5d92745fb" - name = toolshed_dict["name"] - owner = toolshed_dict["owner"] - toolshed = "https://" + toolshed_dict["tool_shed"] - logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name)) - - self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, - changeset_revision=changeset_revision, - install_tool_dependencies=True, - install_repository_dependencies=False, - install_resolver_dependencies=True) + utilities.install_repository_revision(current_version=get_organisms_tool["version"], + toolshed_dict=get_organisms_tool["tool_shed_repository"], + version_to_install=phaoexplorer_constants.GET_ORGANISMS_TOOL_VERSION, + changeset_revision=phaoexplorer_constants.GET_ORGANISMS_TOOL_CHANGESET_REVISION, + instance=self.instance) + + utilities.install_repository_revision(current_version=get_analyses_tool["version"], + toolshed_dict=get_analyses_tool["tool_shed_repository"], + version_to_install=phaoexplorer_constants.GET_ANALYSES_TOOL_VERSION, + changeset_revision=phaoexplorer_constants.GET_ANALYSES_TOOL_CHANGESET_REVISION, + instance=self.instance) + + utilities.install_repository_revision(current_version=add_organism_tool["version"], + toolshed_dict=add_organism_tool["tool_shed_repository"], + version_to_install=phaoexplorer_constants.ADD_ORGANISM_TOOL_VERSION, + changeset_revision=phaoexplorer_constants.ADD_ORGANISM_TOOL_CHANGESET_REVISION, + instance=self.instance) + + utilities.install_repository_revision(current_version=add_analysis_tool["version"], + toolshed_dict=add_analysis_tool["tool_shed_repository"], + version_to_install=phaoexplorer_constants.ADD_ANALYSIS_TOOL_VERSION, + changeset_revision=phaoexplorer_constants.ADD_ANALYSIS_TOOL_CHANGESET_REVISION, + instance=self.instance) + + utilities.install_repository_revision(current_version=analysis_sync_tool["version"], + toolshed_dict=analysis_sync_tool["tool_shed_repository"], + version_to_install=phaoexplorer_constants.ANALYSIS_SYNC_TOOL_VERSION, + changeset_revision=phaoexplorer_constants.ANALYSIS_SYNC_TOOL_CHANGESET_REVISION, + instance=self.instance) + + utilities.install_repository_revision(current_version=organism_sync_tool["version"], + toolshed_dict=organism_sync_tool["tool_shed_repository"], + version_to_install=phaoexplorer_constants.ORGANISM_SYNC_TOOL_VERSION, + changeset_revision=phaoexplorer_constants.ORGANISM_SYNC_TOOL_CHANGESET_REVISION, + instance=self.instance) logging.info("Success: individual tools versions and changesets validated") + def add_analysis(self, name, programversion, sourcename): + add_analysis_tool_dataset = utilities.run_tool_and_download_single_output_dataset( + instance=self.instance, + tool_id=phaoexplorer_constants.ADD_ANALYSIS_TOOL_ID, + history_id=self.history_id, + tool_inputs={"name": name, + "program": phaoexplorer_constants.ADD_ANALYSIS_TOOL_PARAM_PROGRAM, + "programversion": 
programversion, + "sourcename": sourcename, + "date_executed": phaoexplorer_constants.ADD_ANALYSIS_TOOL_PARAM_DATE}) + analysis_dict = json.loads(add_analysis_tool_dataset) + analysis_id = str(analysis_dict["analysis_id"]) - def tripal_synchronize_organism_analyses(self): - """ - """ - show_tool_tripal_sync = self.instance.tools.show_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/3.2.1.0", io_details=True) - org_sync = "toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/3.2.1.0" - org_sync = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/3.2.1.0", - history_id=self.history_id, - tool_inputs={"organism_id": "2"}) - org_sync_job_out = org_sync["outputs"] - - - def add_organism_ogs_genome_analyses(self): - """ - Add OGS and genome vX analyses to Chado database - Required for Chado Load Tripal Synchronize workflow (which should be ran as the first workflow) - Called outside workflow for practical reasons (Chado add doesn't have an input link for analysis or organism) - - :return: - - """ + return analysis_id - self.connect_to_instance() - self.set_get_history() + def sync_analysis(self, analysis_id): - tool_version = "2.3.4+galaxy0" + time.sleep(60) + utilities.run_tool( + instance=self.instance, + tool_id=phaoexplorer_constants.ANALYSIS_SYNC_TOOL_ID, + history_id=self.history_id, + tool_inputs={"analysis_id": analysis_id}) - get_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0") + def add_organism_and_sync(self): - get_organisms = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/%s" % tool_version, + get_organisms_tool_dataset = utilities.run_tool_and_download_single_output_dataset( + instance=self.instance, + tool_id=phaoexplorer_constants.GET_ORGANISMS_TOOL_ID, history_id=self.history_id, - tool_inputs={}) - - time.sleep(10) # Ensure the tool has had time to complete - org_outputs = get_organisms["outputs"] # Outputs from the get_organism tool - org_job_out_id = org_outputs[0]["id"] # ID of the get_organism output dataset (list of dicts) - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) # Download the dataset - org_output = json.loads(org_json_output) # Turn the dataset into a list for parsing + tool_inputs={}, + time_sleep=10 + ) + organisms_dict_list = json.loads(get_organisms_tool_dataset) # Turn the dataset into a list for parsing org_id = None # Look up list of outputs (dictionaries) - for organism_output_dict in org_output: - if organism_output_dict["genus"] == self.genus and organism_output_dict["species"] == "{0} {1}".format(self.species, self.sex): - correct_organism_id = str(organism_output_dict["organism_id"]) # id needs to be a str to be recognized by chado tools - org_id = str(correct_organism_id) - + for org_dict in organisms_dict_list: + if org_dict["genus"] == self.genus_uppercase and org_dict["species"] == self.chado_species_name: + org_id = str(org_dict["organism_id"]) # id needs to be a str to be recognized by chado tools if org_id is None: - if self.common == "" or self.common is None: - add_org_job = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - 
"species": self.chado_species_name, - "common": self.abbreviation}) - org_job_out_id = add_org_job["outputs"][0]["id"] - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) - org_output = json.loads(org_json_output) - org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools - else: - add_org_job = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.common}) - org_job_out_id = add_org_job["outputs"][0]["id"] - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) - org_output = json.loads(org_json_output) - org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools + add_organism_tool_dataset = utilities.run_tool_and_download_single_output_dataset( + instance=self.instance, + tool_id=phaoexplorer_constants.ADD_ORGANISM_TOOL_ID, + history_id=self.history_id, + tool_inputs={"abbr": self.abbreviation, + "genus": self.genus_uppercase, + "species": self.chado_species_name, + "common": self.common}) + organism_dict = json.loads(add_organism_tool_dataset) + org_id = str(organism_dict["organism_id"]) # id needs to be a str to be recognized by chado tools # Synchronize newly added organism in Tripal logging.info("Synchronizing organism %s in Tripal" % self.full_name) time.sleep(60) - org_sync = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/3.2.1.0", - history_id=self.history_id, - tool_inputs={"organism_id": org_id}) + utilities.run_tool( + instance=self.instance, + tool_id=phaoexplorer_constants.ORGANISM_SYNC_TOOL_ID, + history_id=self.history_id, + tool_inputs={"organism_id": org_id}) + return org_id - # Analyses (genome + OGS) - get_analyses = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/%s" % tool_version, + def get_analyses(self): + + get_analyses_tool_dataset = utilities.run_tool_and_download_single_output_dataset( + instance=self.instance, + tool_id=phaoexplorer_constants.GET_ANALYSES_TOOL_ID, history_id=self.history_id, - tool_inputs={}) + tool_inputs={}, + time_sleep=10 + ) + analyses_dict_list = json.loads(get_analyses_tool_dataset) + return analyses_dict_list - time.sleep(10) - analysis_outputs = get_analyses["outputs"] - analysis_job_out_id = analysis_outputs[0]["id"] - analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id) - analysis_output = json.loads(analysis_json_output) + def add_analysis_and_sync(self, analyses_dict_list, analysis_name, analysis_programversion, analysis_sourcename): + """ + Add one analysis to Chado database + Required for Chado Load Tripal Synchronize workflow (which should be ran as the first workflow) + Called outside workflow for practical reasons (Chado add doesn't have an input link for analysis or organism) + """ - ogs_analysis_id = None - genome_analysis_id = None + analysis_id = None # Look up list of outputs (dictionaries) - for analysis_output_dict in analysis_output: - if analysis_output_dict["name"] == self.full_name_lowercase + " OGS" + self.ogs_version: - ogs_analysis_id = str(analysis_output_dict["analysis_id"]) - if analysis_output_dict["name"] == self.full_name_lowercase + " genome v" + 
self.genome_version: - genome_analysis_id = str(analysis_output_dict["analysis_id"]) + for analyses_dict in analyses_dict_list: + if analyses_dict["name"] == analysis_name: + analysis_id = str(analyses_dict["analysis_id"]) + if analysis_id is None: + analysis_id = self.add_analysis( + name=analysis_name, + programversion=analysis_programversion, + sourcename=analysis_sourcename + ) - if ogs_analysis_id is None: - add_ogs_analysis_job = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"name": self.full_name_lowercase + " OGS" + self.ogs_version, - "program": "Performed by Genoscope", - "programversion": str(self.sex + " OGS" + self.ogs_version), - "sourcename": "Genoscope", - "date_executed": self.date}) - analysis_outputs = add_ogs_analysis_job["outputs"] - analysis_job_out_id = analysis_outputs[0]["id"] - analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id) - analysis_output = json.loads(analysis_json_output) - ogs_analysis_id = str(analysis_output["analysis_id"]) - - # Synchronize OGS analysis in Tripal - logging.info("Synchronizing OGS%s analysis in Tripal" % self.ogs_version) - time.sleep(60) - ogs_analysis_sync = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/tripal_analysis_sync/analysis_sync/3.2.1.0", - history_id=self.history_id, - tool_inputs={"analysis_id": ogs_analysis_id}) - - if genome_analysis_id is None: - add_genome_analysis_job = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"name": self.full_name_lowercase + " genome v" + self.genome_version, - "program": "Performed by Genoscope", - "programversion": str(self.sex + "genome v" + self.genome_version), - "sourcename": "Genoscope", - "date_executed": self.date}) - analysis_outputs = add_genome_analysis_job["outputs"] - analysis_job_out_id = analysis_outputs[0]["id"] - analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id) - analysis_output = json.loads(analysis_json_output) - genome_analysis_id = str(analysis_output["analysis_id"]) - - # Synchronize genome analysis in Tripal - logging.info("Synchronizing genome v%s analysis in Tripal" % self.genome_version) - time.sleep(60) - genome_analysis_sync = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/tripal_analysis_sync/analysis_sync/3.2.1.0", - history_id=self.history_id, - tool_inputs={"analysis_id": genome_analysis_id}) - - # print({"org_id": org_id, "genome_analysis_id": genome_analysis_id, "ogs_analysis_id": ogs_analysis_id}) - return({"org_id": org_id, "genome_analysis_id": genome_analysis_id, "ogs_analysis_id": ogs_analysis_id}) + # Synchronize analysis in Tripal + logging.info("Synchronizing analysis %s in Tripal" % analysis_name) + self.sync_analysis(analysis_id=analysis_id) + return(analysis_id) def add_organism_blastp_analysis(self): """ @@ -390,8 +277,8 @@ class RunWorkflow(speciesData.SpeciesData): """ - self.connect_to_instance() - self.set_get_history() + self.set_galaxy_instance() + self.set_history() tool_version = "2.3.4+galaxy0" @@ -418,30 +305,17 @@ class RunWorkflow(speciesData.SpeciesData): if org_id is None: - if self.common == "" or self.common is None: - add_org_job = self.instance.tools.run_tool( - 
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.abbreviation}) - org_job_out_id = add_org_job["outputs"][0]["id"] - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) - org_output = json.loads(org_json_output) - org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools - else: - add_org_job = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.common}) - org_job_out_id = add_org_job["outputs"][0]["id"] - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) - org_output = json.loads(org_json_output) - org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools + add_org_job = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, + history_id=self.history_id, + tool_inputs={"abbr": self.abbreviation, + "genus": self.genus_uppercase, + "species": self.chado_species_name, + "common": self.common}) + org_job_out_id = add_org_job["outputs"][0]["id"] + org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) + org_output = json.loads(org_json_output) + org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools # Synchronize newly added organism in Tripal logging.info("Synchronizing organism %s in Tripal" % self.full_name) @@ -505,8 +379,8 @@ class RunWorkflow(speciesData.SpeciesData): """ - self.connect_to_instance() - self.set_get_history() + self.set_galaxy_instance() + self.set_history() tool_version = "2.3.4+galaxy0" @@ -533,30 +407,17 @@ class RunWorkflow(speciesData.SpeciesData): if org_id is None: - if self.common == "" or self.common is None: - add_org_job = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.abbreviation}) - org_job_out_id = add_org_job["outputs"][0]["id"] - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) - org_output = json.loads(org_json_output) - org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools - else: - add_org_job = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.common}) - org_job_out_id = add_org_job["outputs"][0]["id"] - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) - org_output = json.loads(org_json_output) - org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools + add_org_job = self.instance.tools.run_tool( + 
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, + history_id=self.history_id, + tool_inputs={"abbr": self.abbreviation, + "genus": self.genus_uppercase, + "species": self.chado_species_name, + "common": self.common}) + org_job_out_id = add_org_job["outputs"][0]["id"] + org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) + org_output = json.loads(org_json_output) + org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools # Synchronize newly added organism in Tripal logging.info("Synchronizing organism %s in Tripal" % self.full_name) @@ -663,25 +524,23 @@ class RunWorkflow(speciesData.SpeciesData): # Instanciate the instance gio = GalaxyInstance(url=self.instance_url, - email=self.config["galaxy_default_admin_email"], - password=self.config["galaxy_default_admin_password"]) + email=self.config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL], + password=self.config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]) - prj_lib = gio.libraries.get_previews(name="Project Data") + prj_lib = gio.libraries.get_previews(name=constants.GALAXY_LIBRARY_NAME) library_id = prj_lib[0].id + folder_dict_list = self.instance.libraries.get_folders(library_id=str(library_id)) - instance_source_data_folders = self.instance.libraries.get_folders(library_id=str(library_id)) - - folders_ids = {} - folder_name = "" + folders_id_dict = {} # Loop over the folders in the library and map folders names to their IDs - for i in instance_source_data_folders: - folders_ids[i["name"]] = i["id"] + for folder_dict in folder_dict_list: + folders_id_dict[folder_dict["name"]] = folder_dict["id"] # Iterating over the folders to find datasets and map datasets to their IDs - for k, v in folders_ids.items(): - if k == "/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version): - sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) + for folder_name, folder_id in folders_id_dict.items(): + if folder_name == "/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version): + sub_folder_content = self.instance.folders.show_folder(folder_id=folder_id, contents=True) for k2, v2 in sub_folder_content.items(): for e in v2: if type(e) == dict: @@ -689,8 +548,8 @@ class RunWorkflow(speciesData.SpeciesData): self.datasets["genome_file"] = e["ldda_id"] self.datasets_name["genome_file"] = e["name"] - if k == "/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version): - sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) + if folder_name == "/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version): + sub_folder_content = self.instance.folders.show_folder(folder_id=folder_id, contents=True) for k2, v2 in sub_folder_content.items(): for e in v2: if type(e) == dict: @@ -776,68 +635,6 @@ class RunWorkflow(speciesData.SpeciesData): "blastp_hda_id": blastp_hda_id, "interproscan_hda_id": interproscan_hda_id} - - def get_datasets_hda_ids(self): - """ - Get the hda IDs of the datasets imported into an history - - As some tools will not work using the input datasets ldda IDs we need to retrieve the datasets IDs imported - into an history - - - :return: - """ - - # List of all datasets in the instance (including outputs from jobs) - # "limit" and "offset" options *may* be used to restrict search to specific datasets but since - # there is no way to know which imported datasets are the correct ones depending on history 
content - # it's not currently used - history_datasets_li = self.instance.datasets.get_datasets() - - genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None - interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None - - # Match files imported in history names vs library datasets names to assign their respective hda_id - for dataset_dict in history_datasets_li: - if dataset_dict["history_id"] == self.history_id: - if dataset_dict["name"] == self.datasets_name["genome_file"] and dataset_dict["id"] not in imported_datasets_ids: - genome_dataset_hda_id = dataset_dict["id"] - elif dataset_dict["name"] == self.datasets_name["proteins_file"] and dataset_dict["id"] not in imported_datasets_ids: - proteins_datasets_hda_id = dataset_dict["id"] - elif dataset_dict["name"] == self.datasets_name["transcripts_file"] and dataset_dict["id"] not in imported_datasets_ids: - transcripts_dataset_hda_id = dataset_dict["id"] - elif dataset_dict["name"] == self.datasets_name["gff_file"] and dataset_dict["id"] not in imported_datasets_ids: - gff_dataset_hda_id = dataset_dict["id"] - if "interproscan_file" in self.datasets_name.keys(): - if dataset_dict["name"] == self.datasets_name["interproscan_file"] and dataset_dict["id"] not in imported_datasets_ids: - interproscan_dataset_hda_id = dataset_dict["id"] - if "blast_diamond_file" in self.datasets_name.keys(): - if dataset_dict["name"] == self.datasets_name["blastp_file"] and dataset_dict["id"] not in imported_datasets_ids: - blastp_dataset_hda_id = dataset_dict["id"] - - logging.debug("Genome dataset hda id: %s" % genome_dataset_hda_id) - logging.debug("Proteins dataset hda ID: %s" % proteins_datasets_hda_id) - logging.debug("Transcripts dataset hda ID: %s" % transcripts_dataset_hda_id) - logging.debug("GFF dataset hda ID: %s" % gff_dataset_hda_id) - logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id) - logging.debug("Blastp Diamond dataset hda ID: %s" % blastp_dataset_hda_id) - - # Add datasets IDs to already imported IDs (so we don't assign all the wrong IDs to the next organism if there is one) - imported_datasets_ids.append(genome_dataset_hda_id) - imported_datasets_ids.append(transcripts_dataset_hda_id) - imported_datasets_ids.append(proteins_datasets_hda_id) - imported_datasets_ids.append(gff_dataset_hda_id) - imported_datasets_ids.append(interproscan_dataset_hda_id) - imported_datasets_ids.append(blastp_dataset_hda_id) - - # Return a dict made of the hda ids - return {"genome_hda_id": genome_dataset_hda_id, "transcripts_hda_id": transcripts_dataset_hda_id, - "proteins_hda_id": proteins_datasets_hda_id, "gff_hda_id": gff_dataset_hda_id, - "interproscan_hda_id": interproscan_dataset_hda_id, - "blastp_hda_id": blastp_dataset_hda_id, - "imported_datasets_ids": imported_datasets_ids} - - def run_workflow(workflow_path, workflow_parameters, datamap, config, input_species_number): """ Run a workflow in galaxy @@ -904,8 +701,7 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type): run_workflow_for_current_organism = RunWorkflow(parameters_dictionary=sp_dict) # Verifying the galaxy container is running - if utilities.check_galaxy_state(genus_lowercase=run_workflow_for_current_organism.genus_lowercase, - species=run_workflow_for_current_organism.species, + if utilities.check_galaxy_state(network_name=run_workflow_for_current_organism.genus_species, script_dir=run_workflow_for_current_organism.script_dir): # Setting some of the instance attributes @@ 
-917,31 +713,34 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type): # Parse the config yaml file run_workflow_for_current_organism.config = config # Set the instance url attribute --> TODO: the localhost rule in the docker-compose still doesn't work on scratchgmodv1 - run_workflow_for_current_organism.instance_url = "http://localhost:{0}/sp/{1}_{2}/galaxy/".format( - run_workflow_for_current_organism.config["http_port"], - run_workflow_for_current_organism.genus_lowercase, - run_workflow_for_current_organism.species) - + run_workflow_for_current_organism.instance_url = "http://localhost:{0}/sp/{1}/galaxy/".format( + run_workflow_for_current_organism.config[constants.CONF_ALL_HTTP_PORT], + run_workflow_for_current_organism.genus_species) - if workflow_type == "load_fasta_gff_jbrowse": - run_workflow_for_current_organism.connect_to_instance() - history_id = run_workflow_for_current_organism.set_get_history() + if workflow_type == phaoexplorer_constants.WORKFLOW_LOAD_FASTA_GFF_JBROWSE: + run_workflow_for_current_organism.set_galaxy_instance() + history_id = run_workflow_for_current_organism.set_history() run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools() - ids = run_workflow_for_current_organism.add_organism_ogs_genome_analyses() - - org_id = None - genome_analysis_id = None - ogs_analysis_id = None - org_id = ids["org_id"] - genome_analysis_id = ids["genome_analysis_id"] - ogs_analysis_id = ids["ogs_analysis_id"] - instance_attributes = run_workflow_for_current_organism.get_instance_attributes() - hda_ids = run_workflow_for_current_organism.import_datasets_into_history() - strain_sex = "{0}_{1}".format(run_workflow_for_current_organism.strain, run_workflow_for_current_organism.sex) - genus_species = run_workflow_for_current_organism.genus_species + analyses_dict_list = run_workflow_for_current_organism.get_analyses() + + org_id = run_workflow_for_current_organism.add_organism_and_sync() + genome_analysis_id = run_workflow_for_current_organism.add_analysis_and_sync( + analyses_dict_list=analyses_dict_list, + analysis_name=run_workflow_for_current_organism.genome_analysis_name, + analysis_programversion=run_workflow_for_current_organism.genome_analysis_programversion, + analysis_sourcename=run_workflow_for_current_organism.genome_analysis_sourcename + ) + ogs_analysis_id = run_workflow_for_current_organism.add_analysis_and_sync( + analyses_dict_list=analyses_dict_list, + analysis_name=run_workflow_for_current_organism.ogs_analysis_name, + analysis_programversion=run_workflow_for_current_organism.ogs_analysis_programversion, + analysis_sourcename=run_workflow_for_current_organism.ogs_analysis_sourcename + ) + + hda_ids = run_workflow_for_current_organism.import_datasets_into_history() # Create the dictionary holding all attributes needed to connect to the galaxy instance attributes = {"genus": run_workflow_for_current_organism.genus, @@ -954,35 +753,31 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type): "org_id": org_id, "genome_analysis_id": genome_analysis_id, "ogs_analysis_id": ogs_analysis_id, - "instance_attributes": instance_attributes, "hda_ids": hda_ids, "history_id": history_id, "instance": run_workflow_for_current_organism.instance, "instance_url": run_workflow_for_current_organism.instance_url, - "email": config["galaxy_default_admin_email"], - "password": config["galaxy_default_admin_password"]} + "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL], + "password": 
config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
 
-        sp_workflow_dict[genus_species] = {strain_sex: attributes}
+        sp_workflow_dict[run_workflow_for_current_organism.genus_species] = {run_workflow_for_current_organism.strain_sex: attributes}
 
     else:
-        logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.full_name)
+        logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.genus_species)
         sys.exit()
 
     return sp_workflow_dict
 
     if workflow_type == "blast":
-        run_workflow_for_current_organism.connect_to_instance()
+        run_workflow_for_current_organism.set_galaxy_instance()
 
-        history_id = run_workflow_for_current_organism.set_get_history()
+        history_id = run_workflow_for_current_organism.set_history()
         run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
         ids = run_workflow_for_current_organism.add_organism_blastp_analysis()
 
-        org_id = None
         org_id = ids["org_id"]
-        blastp_analysis_id = None
         blastp_analysis_id = ids["blastp_analysis_id"]
-        instance_attributes = run_workflow_for_current_organism.get_instance_attributes()
         hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
 
         strain_sex = "{0}_{1}".format(run_workflow_for_current_organism.strain, run_workflow_for_current_organism.sex)
@@ -998,30 +793,26 @@
                       "strain": run_workflow_for_current_organism.strain,
                       "org_id": org_id,
                       "blastp_analysis_id": blastp_analysis_id,
-                      "instance_attributes": instance_attributes,
                       "hda_ids": hda_ids,
                       "history_id": history_id,
                       "instance": run_workflow_for_current_organism.instance,
                       "instance_url": run_workflow_for_current_organism.instance_url,
-                      "email": config["galaxy_default_admin_email"],
-                      "password": config["galaxy_default_admin_password"]}
+                      "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
+                      "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
 
         sp_workflow_dict[genus_species] = {strain_sex: attributes}
 
     if workflow_type == "interpro":
-        run_workflow_for_current_organism.connect_to_instance()
+        run_workflow_for_current_organism.set_galaxy_instance()
 
-        history_id = run_workflow_for_current_organism.set_get_history()
+        history_id = run_workflow_for_current_organism.set_history()
         run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
         ids = run_workflow_for_current_organism.add_organism_interproscan_analysis()
 
-        org_id = None
         org_id = ids["org_id"]
-        interpro_analysis_id = None
         interpro_analysis_id = ids["interpro_analysis_id"]
-        instance_attributes = run_workflow_for_current_organism.get_instance_attributes()
         hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
 
         strain_sex = "{0}_{1}".format(run_workflow_for_current_organism.strain, run_workflow_for_current_organism.sex)
@@ -1037,13 +828,12 @@
                       "strain": run_workflow_for_current_organism.strain,
                       "org_id": org_id,
                       "interpro_analysis_id": interpro_analysis_id,
-                      "instance_attributes": instance_attributes,
                       "hda_ids": hda_ids,
                       "history_id": history_id,
                       "instance": run_workflow_for_current_organism.instance,
                       "instance_url": run_workflow_for_current_organism.instance_url,
-                      "email": config["galaxy_default_admin_email"],
-                      "password": config["galaxy_default_admin_password"]}
+                      "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
+                      "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
         sp_workflow_dict[genus_species] = {strain_sex: attributes}
@@ -1137,46 +927,44 @@ if __name__ == "__main__":
         bioblend_logger.setLevel(logging.INFO)
 
     # Parsing the config file if provided, using the default config otherwise
-    if not args.config:
-        args.config = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), "config")
-    else:
-        args.config = os.path.abspath(args.config)
-
     if args.config:
         config_file = os.path.abspath(args.config)
     else:
         config_file = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), constants.DEFAULT_CONFIG)
     config = utilities.parse_config(config_file)
 
+    main_dir = None
     if not args.main_directory:
-        args.main_directory = os.getcwd()
+        main_dir = os.getcwd()
    else:
-        args.main_directory = os.path.abspath(args.main_directory)
+        main_dir = os.path.abspath(args.main_directory)
 
     sp_dict_list = utilities.parse_input(args.input)
 
-    workflow_valid_types = ["load_fasta_gff_jbrowse", "blast", "interpro"]
-
-    workflow_type = None
-
     # Checking if user specified a workflow to run
     if not args.workflow:
         logging.critical("No workflow type specified, exiting")
         sys.exit()
-    elif args.workflow in workflow_valid_types:
+    elif args.workflow in phaoexplorer_constants.WORKFLOW_VALID_TYPES:
         workflow_type = args.workflow
-        logging.info("Workflow type set to %s" % workflow_type)
+        logging.info("Workflow type set to '%s'" % workflow_type)
+    else:
+        logging.critical("Workflow type %s is not supported, exiting" % args.workflow)
+        sys.exit()
 
     script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
-    config = utilities.parse_config(args.config)
     all_sp_workflow_dict = {}
-
-    if workflow_type == "load_fasta_gff_jbrowse":
+    if workflow_type == phaoexplorer_constants.WORKFLOW_LOAD_FASTA_GFF_JBROWSE:
         for sp_dict in sp_dict_list:
 
             # Add and retrieve all analyses/organisms for the current input species and add their IDs to the input dictionary
-            current_sp_workflow_dict = create_sp_workflow_dict(sp_dict, main_dir=args.main_directory, config=config, workflow_type="load_fasta_gff_jbrowse")
+            current_sp_workflow_dict = create_sp_workflow_dict(
+                sp_dict,
+                main_dir=main_dir,
+                config=config,
+                workflow_type=phaoexplorer_constants.WORKFLOW_LOAD_FASTA_GFF_JBROWSE)
 
             current_sp_key = list(current_sp_workflow_dict.keys())[0]
             current_sp_value = list(current_sp_workflow_dict.values())[0]
diff --git a/speciesData.py b/speciesData.py
index c012a46856e2c57d2cb3df83dfe6a2968b3c40b0..83626b70e1289a72eff98172a08af91962c490ec 100755
--- a/speciesData.py
+++ b/speciesData.py
@@ -96,8 +96,12 @@ class SpeciesData:
         self.genus_lowercase = self.genus.lower()
         self.species_lowercase = self.species.lower()
+        self.strain_lowercase = self.strain.lower()
+        self.sex_lowercase = self.sex.lower()
+
         self.genus_uppercase = self.genus[0].upper() + self.genus_lowercase[1:]
         self.genus_species = "{0}_{1}".format(self.genus_lowercase, self.species_lowercase)
+        self.strain_sex = "{0}_{1}".format(self.strain_lowercase, self.sex_lowercase)
         self.full_name = ' '.join(utilities.filter_empty_not_empty_items([self.genus_uppercase, self.species, self.strain, self.sex])["not_empty"])
         self.full_name_lowercase = self.full_name.lower()
diff --git a/utilities.py b/utilities.py
index af7a5ea2b054f756db0cf8b10032bec43bd16209..68f59cfe70f7d0850e74c16b0ca32772d6dad751 100755
--- a/utilities.py
+++ b/utilities.py
@@ -8,6 +8,7 @@ import os
 import subprocess
 import bioblend
 import constants
+import time
 
 
 def load_yaml(yaml_file):
@@ -76,7 +77,7 @@ def filter_empty_not_empty_items(li):
     return filtered_dict
 
-def check_galaxy_state(genus_lowercase, species, script_dir):
+def check_galaxy_state(network_name, script_dir):
     """
     Read the logs of the galaxy container for the current species to check if the service is "ready"
@@ -92,7 +93,7 @@ def check_galaxy_state(genus_lowercase, species, script_dir):
         os.chmod("%s/serexec" % script_dir, 0o0755)
     except PermissionError:
         logging.warning("serexec permissions incorrect in %s" % script_dir)
-    galaxy_logs = subprocess.run(["%s/serexec" % script_dir, "{0}_{1}_galaxy".format(genus_lowercase, species),
+    galaxy_logs = subprocess.run(["%s/serexec" % script_dir, "{0}_galaxy".format(network_name),
                                  "supervisorctl", "status", "galaxy:"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     if "galaxy:galaxy_web RUNNING" in str(galaxy_logs.stdout) \
             and "galaxy:handler0 RUNNING" in str(galaxy_logs.stdout) \
@@ -199,12 +200,31 @@ def run_tool(instance, tool_id, history_id, tool_inputs):
 
     return output_dict
 
-def run_tool_and_get_single_output_dataset_id(instance, tool_id, history_id, tool_inputs):
+def run_tool_and_download_single_output_dataset(instance, tool_id, history_id, tool_inputs, time_sleep=None):
 
     output_dict = run_tool(instance, tool_id, history_id, tool_inputs)
+    if time_sleep is not None:
+        time.sleep(time_sleep)
     single_output_dataset_id = output_dict["outputs"][0]["id"]
+    dataset = instance.datasets.download_dataset(dataset_id=single_output_dataset_id)
 
-    return single_output_dataset_id
+    return dataset
+
+def install_repository_revision(current_version, toolshed_dict, version_to_install, changeset_revision, instance):
+
+    if current_version != version_to_install:
+        name = toolshed_dict["name"]
+        owner = toolshed_dict["owner"]
+        toolshed = "https://" + toolshed_dict["tool_shed"]
+        logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name))
+
+        instance.toolshed.install_repository_revision(tool_shed_url=toolshed,
+                                                      name=name,
+                                                      owner=owner,
+                                                      changeset_revision=changeset_revision,
+                                                      install_tool_dependencies=True,
+                                                      install_repository_dependencies=False,
+                                                      install_resolver_dependencies=True)
 
 def create_org_param_dict_from_constants():
     """
@@ -234,7 +254,6 @@
     org_param_dict["org_param_data_blastx_path"] = constants.ORG_PARAM_DATA_BLASTX_PATH
     org_param_dict["org_param_data_genome_version"] = constants.ORG_PARAM_DATA_GENOME_VERSION
     org_param_dict["org_param_data_ogs_version"] = constants.ORG_PARAM_DATA_OGS_VERSION
-    org_param_dict["org_param_data_performed_by"] = constants.ORG_PARAM_DATA_PERFORMED_BY
     org_param_dict["org_param_services"] = constants.ORG_PARAM_SERVICES
     org_param_dict["org_param_services_blast"] = constants.ORG_PARAM_SERVICES_BLAST
     org_param_dict["org_param_services_go"] = constants.ORG_PARAM_SERVICES_GO
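
Reviewer note: the repeated "show tool, compare version, install changeset" blocks are now funneled through utilities.install_repository_revision(). A minimal sketch of how one pinned tool is validated with the new helper and a phaoexplorer_constants triple; the instance URL and admin credentials below are placeholders, not values from this changeset (real values come from the config file):

    import utilities
    import phaoexplorer_constants
    from bioblend import galaxy

    # Placeholder connection details (hypothetical); mirrors set_galaxy_instance()
    instance = galaxy.GalaxyInstance(url="http://localhost:8080/sp/genus_species/galaxy/",
                                     email="admin@example.org",
                                     password="admin-password")

    # show_tool() returns a dict holding "version" and "tool_shed_repository"
    add_organism_tool = instance.tools.show_tool(phaoexplorer_constants.ADD_ORGANISM_TOOL_ID)

    # Install the pinned changeset revision only if the installed version differs
    utilities.install_repository_revision(
        current_version=add_organism_tool["version"],
        toolshed_dict=add_organism_tool["tool_shed_repository"],
        version_to_install=phaoexplorer_constants.ADD_ORGANISM_TOOL_VERSION,
        changeset_revision=phaoexplorer_constants.ADD_ORGANISM_TOOL_CHANGESET_REVISION,
        instance=instance)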
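
Similarly, run_tool_and_download_single_output_dataset() collapses the old run/sleep/download sequences into a single call. A sketch of the Chado get-organisms round trip as the refactored callers use it, assuming instance is connected as above and history_id comes from set_history(); with the time_sleep=None default, the argument may be omitted when the output is available immediately:

    import json

    organisms_json = utilities.run_tool_and_download_single_output_dataset(
        instance=instance,
        tool_id=phaoexplorer_constants.GET_ORGANISMS_TOOL_ID,
        history_id=history_id,
        tool_inputs={},   # empty filter: list every organism in the Chado database
        time_sleep=10)    # give the job time to finish before downloading its output

    # The tool emits a JSON list of organism dicts; ids must be str for the chado tools
    organisms = json.loads(organisms_json)
    org_id = next((str(org["organism_id"]) for org in organisms
                   if org["genus"] == "Homo" and org["species"] == "sapiens"), None)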