From 8fb9904ade5ca4e5808826c0c0ddd10c09401084 Mon Sep 17 00:00:00 2001 From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr> Date: Tue, 27 Apr 2021 11:11:52 +0200 Subject: [PATCH] workflow v2 initial commit --- gga_load_data.py | 47 +++++++++-- run_workflow_phaeoexplorer.py | 146 ++++++++++++++++++++++++++++++++-- speciesData.py | 4 +- 3 files changed, 181 insertions(+), 16 deletions(-) diff --git a/gga_load_data.py b/gga_load_data.py index 856d043..e349e78 100755 --- a/gga_load_data.py +++ b/gga_load_data.py @@ -59,20 +59,18 @@ class LoadData(speciesData.SpeciesData): """ Create or set the working history to the current species one - TODO - move to utilities? - :return: """ try: - histories = self.instance.histories.get_histories(name=str(self.full_name)) + histories = self.instance.histories.get_histories(name=str(self.genus_species)) self.history_id = histories[0]["id"] - logging.info("History for {0}: {1}".format(self.full_name, self.history_id)) + logging.debug("History ID set for {0}: {1}".format(self.full_name, self.history_id)) except IndexError: logging.info("Creating history for %s" % self.full_name) - self.instance.histories.create_history(name=str(self.full_name)) - histories = self.instance.histories.get_histories(name=str(self.full_name)) + self.instance.histories.create_history(name=str(self.genus_species)) + histories = self.instance.histories.get_histories(name=str(self.genus_species)) self.history_id = histories[0]["id"] - logging.info("History for {0}: {1}".format(self.full_name, self.history_id)) + logging.debug("History ID set for {0}: {1}".format(self.full_name, self.history_id)) return self.history_id @@ -82,6 +80,41 @@ class LoadData(speciesData.SpeciesData): Will do nothing if H. 
sapiens isn't in the database """ + + get_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.3") + delete_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.4+galaxy0") + + if delete_organism_tool["version"] != "2.3.4+galaxy0": + toolshed_dict = delete_organism_tool["tool_shed_repository"] + logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) + changeset_revision = "b1aa4f9d82fe" + name = toolshed_dict["name"] + owner = toolshed_dict["owner"] + toolshed = "https://" + toolshed_dict["tool_shed"] + logging.warning("Installing changeset for %s" % toolshed_dict["name"]) + + self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, + changeset_revision=changeset_revision, + install_tool_dependencies=True, + install_repository_dependencies=False, + install_resolver_dependencies=True) + + + if get_organism_tool["version"] != "2.3.3": + toolshed_dict = get_organism_tool["tool_shed_repository"] + logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) + changeset_revision = "b07279b5f3bf" + name = toolshed_dict["name"] + owner = toolshed_dict["owner"] + toolshed = "https://" + toolshed_dict["tool_shed"] + logging.warning("Installing changeset for %s" % toolshed_dict["name"]) + + self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, + changeset_revision=changeset_revision, + install_tool_dependencies=True, + install_repository_dependencies=False, + install_resolver_dependencies=True) + logging.debug("Getting 'Homo sapiens' ID in instance's chado database") get_sapiens_id_job = self.instance.tools.run_tool( tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0", diff --git a/run_workflow_phaeoexplorer.py 
b/run_workflow_phaeoexplorer.py index 49f3a2a..753803f 100755 --- a/run_workflow_phaeoexplorer.py +++ b/run_workflow_phaeoexplorer.py @@ -38,17 +38,18 @@ class RunWorkflow(speciesData.SpeciesData): """ Create or set the working history to the current species one - :return: """ try: - histories = self.instance.histories.get_histories(name=str(self.full_name)) + histories = self.instance.histories.get_histories(name=str(self.genus_species)) self.history_id = histories[0]["id"] + logging.debug("History ID set for {0}: {1}".format(self.full_name, self.history_id)) except IndexError: logging.info("Creating history for %s" % self.full_name) - self.instance.histories.create_history(name=str(self.full_name)) + self.instance.histories.create_history(name=str(self.genus_species)) - histories = self.instance.histories.get_histories(name=str(self.full_name)) + histories = self.instance.histories.get_histories(name=str(self.genus_species)) self.history_id = histories[0]["id"] + logging.debug("History ID set for {0}: {1}".format(self.full_name, self.history_id)) return self.history_id @@ -175,6 +176,14 @@ class RunWorkflow(speciesData.SpeciesData): logging.info("Tools versions and changesets from workflow validated") + + def return_instance(self): + + + return self.instance + + + def install_changesets_revisions_for_individual_tools(self): """ This function is used to verify that installed tools called outside workflows have the correct versions and changesets @@ -198,9 +207,9 @@ class RunWorkflow(speciesData.SpeciesData): # except for workflows (.ga) that already contain the changeset revisions inside the steps ids if get_organism_tool["version"] != "2.3.3": + toolshed_dict = get_organism_tool["tool_shed_repository"] logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) changeset_revision = "b07279b5f3bf" - toolshed_dict = get_organism_tool["tool_shed_repository"] name = toolshed_dict["name"] owner = toolshed_dict["owner"] toolshed = "https://" + toolshed_dict["tool_shed"] @@ -212,9 +221,9 @@ class
RunWorkflow(speciesData.SpeciesData): install_resolver_dependencies=True) if get_analysis_tool["version"] != "2.3.3": + toolshed_dict = get_analysis_tool["tool_shed_repository"] logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) changeset_revision = "c7be2feafd73" - toolshed_dict = changeset_revision["tool_shed_repository"] name = toolshed_dict["name"] owner = toolshed_dict["owner"] toolshed = "https://" + toolshed_dict["tool_shed"] @@ -226,9 +235,9 @@ class RunWorkflow(speciesData.SpeciesData): install_resolver_dependencies=True) if add_organism_tool["version"] != "2.3.3": + toolshed_dict = add_organism_tool["tool_shed_repository"] logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) changeset_revision = "680a1fe3c266" - toolshed_dict = add_organism_tool["tool_shed_repository"] name = toolshed_dict["name"] owner = toolshed_dict["owner"] toolshed = "https://" + toolshed_dict["tool_shed"] @@ -240,9 +249,9 @@ class RunWorkflow(speciesData.SpeciesData): install_resolver_dependencies=True) if add_analysis_tool["version"] != "2.3.3": + toolshed_dict = add_analysis_tool["tool_shed_repository"] logging.warning("Changeset for %s is not installed" % toolshed_dict["name"]) changeset_revision = "43c36801669f" - toolshed_dict = add_analysis_tool["tool_shed_repository"] name = toolshed_dict["name"] owner = toolshed_dict["owner"] toolshed = "https://" + toolshed_dict["tool_shed"] @@ -826,6 +835,16 @@ if __name__ == "__main__": sp_dict_list = utilities.parse_input(args.input) + + org_ids = [] + genome_analysis_ids = [] + ogs_analysis_ids = [] + hda_ids_list = [] + instance_attributes_list = [] + + instance_url_2org = None + + for sp_dict in sp_dict_list: # Creating an instance of the RunWorkflow object for the current organism @@ -860,6 +879,119 @@ if __name__ == "__main__": run_workflow_for_current_organism.genus_lowercase, run_workflow_for_current_organism.species) + instance_url_2org =
run_workflow_for_current_organism.instance_url + + # Get the instance attribute from the object for future connections + instance = run_workflow_for_current_organism.instance + + if "2org" in str(workflow): + logging.info("Executing workflow %s" % workflow) + + + run_workflow_for_current_organism.connect_to_instance() + run_workflow_for_current_organism.set_get_history() + run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools() + run_workflow_for_current_organism.install_changesets_revisions_from_workflow(workflow_path=workflow) + run_workflow_for_current_organism.add_organism_ogs_genome_analyses() + org_id = run_workflow_for_current_organism.get_organism_id() + org_ids.append(org_id) + genome_analysis_id = run_workflow_for_current_organism.get_genome_analysis_id() + genome_analysis_ids.append(genome_analysis_id) + ogs_analysis_id = run_workflow_for_current_organism.get_ogs_analysis_id() + ogs_analysis_ids.append(ogs_analysis_id) + instance_attributes = run_workflow_for_current_organism.get_instance_attributes() + instance_attributes_list.append(instance_attributes) + + + # Import datasets into history and retrieve their hda IDs + # TODO: can be simplified with direct access to the folder contents via the full path (no loop required) + + hda_ids = run_workflow_for_current_organism.import_datasets_into_history() + hda_ids_list.append(hda_ids) + + + # TODO: Exclude the workflow invocation from the loop + # Extract instance url from one, attributes from both in lists ?
+ + + # Source files association (ordered by their IDs in the workflow) + GENOME_FASTA_FILE_ORG1 = "0" + GFF_FILE_ORG1 = "1" + PROTEINS_FASTA_FILE_ORG1 = "2" + GENOME_FASTA_FILE_ORG2 = "3" + GFF_FILE_ORG2 = "4" + PROTEINS_FASTA_FILE_ORG2 = "5" + + LOAD_FASTA_ORG1 = "6" + JBROWSE_ORG1 = "7" + JBROWSE_ORG2 = "8" + + LOAD_GFF_ORG1 = "9" + JBROWSE_CONTAINER = "10" + SYNC_FEATURES_ORG1 = "11" + + LOAD_FASTA_ORG2 = "12" + LOAD_GFF_ORG2 = "13" + + SYNC_FEATURES_ORG2 = "14" + POPULATE_MAT_VIEWS = "15" + INDEX_TRIPAL_DATA = "16" + + workflow_parameters = {} + + workflow_parameters[GENOME_FASTA_FILE_ORG1] = {} + workflow_parameters[GFF_FILE_ORG1] = {} + workflow_parameters[PROTEINS_FASTA_FILE_ORG1] = {} + workflow_parameters[GENOME_FASTA_FILE_ORG2] = {} + workflow_parameters[GFF_FILE_ORG2] = {} + workflow_parameters[PROTEINS_FASTA_FILE_ORG2] = {} + + + # ORGANISM 1 + workflow_parameters[LOAD_FASTA_ORG1] = {"organism": org_ids[0], + "analysis_id": genome_analysis_ids[0], + "do_update": "true"} + # Change "do_update": "true" to "do_update": "false" in above parameters to prevent appending/updates to the fasta file in chado + # WARNING: It is safer to never update it and just change the genome/ogs versions in the config + workflow_parameters[JBROWSE_ORG1] = {} + workflow_parameters[LOAD_GFF_ORG1] = {"organism": org_ids[0], "analysis_id": ogs_analysis_ids[0]} + workflow_parameters[SYNC_FEATURES_ORG1] = {"organism_id": org_ids[0]} + workflow_parameters[JBROWSE_CONTAINER] = {} + + + # ORGANISM 2 + workflow_parameters[LOAD_FASTA_ORG2] = {"organism": org_ids[1], + "analysis_id": genome_analysis_ids[1], + "do_update": "true"} + # Change "do_update": "true" to "do_update": "false" in above parameters to prevent appending/updates to the fasta file in chado + # WARNING: It is safer to never update it and just change the genome/ogs versions in the config + workflow_parameters[LOAD_GFF_ORG2] = {"organism": org_ids[1], "analysis_id": ogs_analysis_ids[1]} + workflow_parameters[JBROWSE_ORG2]
= {} + workflow_parameters[SYNC_FEATURES_ORG2] = {"organism_id": org_ids[1]} + + workflow_parameters[SYNC_GENOME_ANALYSIS_INTO_TRIPAL] = {"analysis_id": ogs_analysis_ids[0]} + workflow_parameters[SYNC_OGS_ANALYSIS_INTO_TRIPAL] = {"analysis_id": genome_analysis_ids[0]} + workflow_parameters[SYNC_FEATURES_INTO_TRIPAL] = {"organism_id": org_ids[0]} + + # POPULATE + INDEX DATA + workflow_parameters[POPULATE_MAT_VIEWS] = {} + workflow_parameters[INDEX_TRIPAL_DATA] = {} + + # Datamap for input datasets - dataset source (type): ldda (LibraryDatasetDatasetAssociation) + run_workflow_for_current_organism.datamap = {} + + run_workflow_for_current_organism.datamap[GENOME_FASTA_FILE_ORG1] = {"src": "hda", "id": hda_ids_list[0]["genome_hda_id"]} + run_workflow_for_current_organism.datamap[GFF_FILE_ORG1] = {"src": "hda", "id": hda_ids_list[0]["gff_hda_id"]} + run_workflow_for_current_organism.datamap[PROTEINS_FASTA_FILE_ORG1] = {"src": "hda", "id": hda_ids_list[0]["proteins_hda_id"]} + + run_workflow_for_current_organism.datamap[GENOME_FASTA_FILE_ORG2] = {"src": "hda", "id": hda_ids_list[1]["genome_hda_id"]} + run_workflow_for_current_organism.datamap[GFF_FILE_ORG2] = {"src": "hda", "id": hda_ids_list[1]["gff_hda_id"]} + run_workflow_for_current_organism.datamap[PROTEINS_FASTA_FILE_ORG2] = {"src": "hda", "id": hda_ids_list[1]["proteins_hda_id"]} + + + + logging.info("OK: Workflow invoked") + # If input workflow is Chado_load_Tripal_synchronize.ga diff --git a/speciesData.py b/speciesData.py index a2fcbdb..caccfda 100755 --- a/speciesData.py +++ b/speciesData.py @@ -20,6 +20,7 @@ class SpeciesData: self.parameters_dictionary = parameters_dictionary self.species = parameters_dictionary["description"]["species"].replace("(", "_").replace(")", "_").replace("-", "_") self.genus = parameters_dictionary["description"]["genus"].replace("(", "_").replace(")", "_").replace("-", "_") + self.genus_species = "{0}_{1}".format(self.genus.lower(), self.species.lower()) self.strain =
parameters_dictionary["description"]["strain"].replace("(", "_").replace(")", "_").replace("-", "_") self.sex = parameters_dictionary["description"]["sex"].replace("(", "_").replace(")", "_").replace("-", "_") self.common = parameters_dictionary["description"]["common_name"].replace("(", "_").replace(")", "_").replace("-", "_") @@ -58,7 +59,6 @@ class SpeciesData: self.full_name_lowercase = self.full_name.lower() self.abbreviation = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase[0], self.species, self.strain, self.sex])["not_empty"]) - self.genus_species = self.genus_lowercase + "_" + self.species self.instance_url = None self.instance = None self.history_id = None @@ -80,7 +80,7 @@ class SpeciesData: self.datasets = dict() self.config = None # Custom config used to set environment variables inside containers self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase.lower(), self.species.lower(), self.strain.lower(), self.sex.lower()])["not_empty"]) - self.species_folder_name = self.species_folder_name .replace("-", "_") + self.species_folder_name = self.species_folder_name .replace("-", "_").replace('__', '_') self.existing_folders_cache = {} self.bam_metadata_cache = {} -- GitLab