diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py
index 661ea13ca3b60fd6a14b9082b355b16f6141b025..36a44675f37f4a3a0f32836b543e3b41f9369793 100755
--- a/run_workflow_phaeoexplorer.py
+++ b/run_workflow_phaeoexplorer.py
@@ -495,20 +495,120 @@ class RunWorkflow(speciesData.SpeciesData):
         # print({"org_id": org_id, "genome_analysis_id": genome_analysis_id, "ogs_analysis_id": ogs_analysis_id})
         return({"org_id": org_id, "blastp_analysis_id": blastp_analysis_id})
 
-    def add_interproscan_analysis(self):
+    def add_organism_interproscan_analysis(self):
         """
+        Make sure the organism and its Interproscan analysis exist in the Chado database
+
+        Looks up the organism and the "Interproscan on <organism> OGS<version>" analysis with the chado
+        "get" tools, adds the missing ones with the chado "add" tools and synchronizes the newly added
+        entries in Tripal
+        Called outside workflow for practical reasons (Chado add doesn't have an input link for analysis or organism)
+
+        :return: dict holding the organism ID ("org_id") and the Interproscan analysis ID
+                 ("interpro_analysis_id"), both as str
+        """
 
-        # Add Interpro analysis to chado
-        logging.info("Adding Interproscan analysis to the instance's chado database")
-        self.instance.tools.run_tool(
-            tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.4+galaxy0",
+        # The tools below are run through self.instance, in the history self.history_id
+        self.connect_to_instance()
+        self.set_get_history()
+
+        tool_version = "2.3.4+galaxy0"
+
+        # List the organisms already present in Chado
+        get_organisms = self.instance.tools.run_tool(
+            tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/%s" % tool_version,
+            history_id=self.history_id,
+            tool_inputs={})
+
+        time.sleep(10)  # Ensure the tool has had time to complete
+        org_outputs = get_organisms["outputs"]  # Outputs from the get_organism tool
+        org_job_out_id = org_outputs[0]["id"]  # ID of the get_organism output dataset (list of dicts)
+        org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id)  # Download the dataset
+        org_output = json.loads(org_json_output)  # Turn the dataset into a list for parsing
+
+        org_id = None
+
+        # Look up list of outputs (dictionaries)
+        for organism_output_dict in org_output:
+            if organism_output_dict["genus"] == self.genus and organism_output_dict["species"] == "{0} {1}".format(self.species, self.sex):
+                org_id = str(organism_output_dict["organism_id"])  # id needs to be a str to be recognized by chado tools
+
+
+        if org_id is None:
+            # Fall back on the abbreviation when no common name is set
+            if self.common == "" or self.common is None:
+                common_name = self.abbreviation
+            else:
+                common_name = self.common
+            # The lookup above found nothing: add the organism to Chado
+            add_org_job = self.instance.tools.run_tool(
+                tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version,
+                history_id=self.history_id,
+                tool_inputs={"abbr": self.abbreviation,
+                             "genus": self.genus_uppercase,
+                             "species": self.chado_species_name,
+                             "common": common_name})
+            org_job_out_id = add_org_job["outputs"][0]["id"]
+            org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id)
+            org_output = json.loads(org_json_output)
+            org_id = str(org_output["organism_id"])  # id needs to be a str to be recognized by chado tools
+
+            # Synchronize newly added organism in Tripal
+            logging.info("Synchronizing organism %s in Tripal" % self.full_name)
+            # Fixed delay before the sync (presumably lets the add job settle - TODO confirm)
+            time.sleep(60)
+            org_sync = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/3.2.1.0",
+                                                    history_id=self.history_id,
+                                                    tool_inputs={"organism_id": org_id})
+
+
+        # List the analyses already present in Chado
+        get_analyses = self.instance.tools.run_tool(
+            tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/%s" % tool_version,
             history_id=self.history_id,
-            tool_inputs={"name": "InterproScan on OGS%s" % self.ogs_version,
-                         "program": "InterproScan",
-                         "programversion": "OGS%s" % self.ogs_version,
-                         "sourcename": "Genoscope",
-                         "date_executed": self.date})
+            tool_inputs={})
+
+        time.sleep(10)
+        analysis_outputs = get_analyses["outputs"]
+        analysis_job_out_id = analysis_outputs[0]["id"]
+        analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id)
+        analysis_output = json.loads(analysis_json_output)
+
+        interpro_analysis_id = None
+
+        # Look up list of outputs (dictionaries)
+        for analysis_output_dict in analysis_output:
+            if analysis_output_dict["name"] == "Interproscan on " + self.full_name_lowercase + " OGS" + self.ogs_version:
+                interpro_analysis_id = str(analysis_output_dict["analysis_id"])
+
+
+        # Add the analysis only when the lookup above did not find it
+        if interpro_analysis_id is None:
+            add_interproscan_analysis_job = self.instance.tools.run_tool(
+                tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" % tool_version,
+                history_id=self.history_id,
+                tool_inputs={"name": "Interproscan on " + self.full_name_lowercase + " OGS" + self.ogs_version,
+                             "program": "Performed by Genoscope",
+                             "programversion": str(self.sex + " OGS" + self.ogs_version),
+                             "sourcename": "Genoscope",
+                             "date_executed": self.date})
+            analysis_outputs = add_interproscan_analysis_job["outputs"]
+            analysis_job_out_id = analysis_outputs[0]["id"]
+            analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id)
+            analysis_output = json.loads(analysis_json_output)
+            interpro_analysis_id = str(analysis_output["analysis_id"])
+
+            # Synchronize newly added Interproscan analysis in Tripal
+            logging.info("Synchronizing Interproscan OGS%s analysis in Tripal" % self.ogs_version)
+            # Fixed delay before the sync (presumably lets the add job settle - TODO confirm)
+            time.sleep(60)
+            interproscan_analysis_sync = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/tripal_analysis_sync/analysis_sync/3.2.1.0",
+                                                                      history_id=self.history_id,
+                                                                      tool_inputs={"analysis_id": interpro_analysis_id})
+
+        # Both IDs are str (see the lookups/additions above)
+        return({"org_id": org_id, "interpro_analysis_id": interpro_analysis_id})
 
     def get_interpro_analysis_id(self):
 
@@ -628,6 +728,8 @@ class RunWorkflow(speciesData.SpeciesData):
                 proteins_hda_id = dataset_id
             if dataset_name == "{0}_OGS{1}_blastp.xml".format(self.dataset_prefix, self.ogs_version):
                 blastp_hda_id = dataset_id
+            if dataset_name == "{0}_OGS{1}_interproscan.xml".format(self.dataset_prefix, self.ogs_version):
+                interproscan_hda_id = dataset_id
 
         # Import each dataset into history if it is not imported
 
@@ -675,67 +777,6 @@ class RunWorkflow(speciesData.SpeciesData):
                 "interproscan_hda_id": interproscan_hda_id}
 
 
-    def get_datasets_hda_ids(self):
-        """
-        Get the hda IDs of the datasets imported into an history
-
-        As some tools will not work using the input datasets ldda IDs we need to retrieve the datasets IDs imported
-        into an history
-
-
-        :return:
-        """
-
-        # List of all datasets in the instance (including outputs from jobs)
-        # "limit" and "offset" options *may* be used to restrict search to specific datasets but since
-        # there is no way to know which imported datasets are the correct ones depending on history content
-        # it's not currently used
-        history_datasets_li = self.instance.datasets.get_datasets()
-
-        genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None
-        interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None
-
-        # Match files imported in history names vs library datasets names to assign their respective hda_id
-        for dataset_dict in history_datasets_li:
-            if dataset_dict["history_id"] == self.history_id:
-                if dataset_dict["name"] ==
self.datasets_name["genome_file"] and dataset_dict["id"] not in imported_datasets_ids: - genome_dataset_hda_id = dataset_dict["id"] - elif dataset_dict["name"] == self.datasets_name["proteins_file"] and dataset_dict["id"] not in imported_datasets_ids: - proteins_datasets_hda_id = dataset_dict["id"] - elif dataset_dict["name"] == self.datasets_name["transcripts_file"] and dataset_dict["id"] not in imported_datasets_ids: - transcripts_dataset_hda_id = dataset_dict["id"] - elif dataset_dict["name"] == self.datasets_name["gff_file"] and dataset_dict["id"] not in imported_datasets_ids: - gff_dataset_hda_id = dataset_dict["id"] - if "interproscan_file" in self.datasets_name.keys(): - if dataset_dict["name"] == self.datasets_name["interproscan_file"] and dataset_dict["id"] not in imported_datasets_ids: - interproscan_dataset_hda_id = dataset_dict["id"] - if "blast_diamond_file" in self.datasets_name.keys(): - if dataset_dict["name"] == self.datasets_name["blastp_file"] and dataset_dict["id"] not in imported_datasets_ids: - blastp_dataset_hda_id = dataset_dict["id"] - - logging.debug("Genome dataset hda id: %s" % genome_dataset_hda_id) - logging.debug("Proteins dataset hda ID: %s" % proteins_datasets_hda_id) - logging.debug("Transcripts dataset hda ID: %s" % transcripts_dataset_hda_id) - logging.debug("GFF dataset hda ID: %s" % gff_dataset_hda_id) - logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id) - logging.debug("Blastp Diamond dataset hda ID: %s" % blastp_dataset_hda_id) - - # Add datasets IDs to already imported IDs (so we don't assign all the wrong IDs to the next organism if there is one) - imported_datasets_ids.append(genome_dataset_hda_id) - imported_datasets_ids.append(transcripts_dataset_hda_id) - imported_datasets_ids.append(proteins_datasets_hda_id) - imported_datasets_ids.append(gff_dataset_hda_id) - imported_datasets_ids.append(interproscan_dataset_hda_id) - imported_datasets_ids.append(blastp_dataset_hda_id) - - # Return a dict made of 
the hda ids
-        return {"genome_hda_id": genome_dataset_hda_id, "transcripts_hda_id": transcripts_dataset_hda_id,
-                "proteins_hda_id": proteins_datasets_hda_id, "gff_hda_id": gff_dataset_hda_id,
-                "interproscan_hda_id": interproscan_dataset_hda_id,
-                "blastp_hda_id": blastp_dataset_hda_id,
-                "imported_datasets_ids": imported_datasets_ids}
-
-
 def run_workflow(workflow_path, workflow_parameters, datamap, config, input_species_number):
     """
     Run a workflow in galaxy
@@ -906,6 +947,45 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type):
             sp_workflow_dict[genus_species] = {strain_sex: attributes}
+
+        if workflow_type == "interpro":
+            run_workflow_for_current_organism.connect_to_instance()
+
+            history_id = run_workflow_for_current_organism.set_get_history()
+
+            run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
+            ids = run_workflow_for_current_organism.add_organism_interproscan_analysis()
+
+            # IDs of the organism and of its Interproscan analysis,
+            # as returned by add_organism_interproscan_analysis()
+            org_id = ids["org_id"]
+            interpro_analysis_id = ids["interpro_analysis_id"]
+            instance_attributes = run_workflow_for_current_organism.get_instance_attributes()
+            hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
+
+            strain_sex = "{0}_{1}".format(run_workflow_for_current_organism.strain, run_workflow_for_current_organism.sex)
+            genus_species = run_workflow_for_current_organism.genus_species
+
+            # Create the dictionary holding all attributes needed to connect to the galaxy instance
+            attributes = {"genus": run_workflow_for_current_organism.genus,
+                          "species": run_workflow_for_current_organism.species,
+                          "genus_species": run_workflow_for_current_organism.genus_species,
+                          "full_name": run_workflow_for_current_organism.full_name,
+                          "species_folder_name": run_workflow_for_current_organism.species_folder_name,
+                          "sex": run_workflow_for_current_organism.sex,
+                          "strain": run_workflow_for_current_organism.strain,
+                          "org_id": org_id,
+                          "interpro_analysis_id": interpro_analysis_id,
+                          "instance_attributes": instance_attributes,
+                          "hda_ids": hda_ids,
+                          "history_id": history_id,
+                          "instance": run_workflow_for_current_organism.instance,
+                          "instance_url": run_workflow_for_current_organism.instance_url,
+                          "email": config["galaxy_default_admin_email"],
+                          "password": config["galaxy_default_admin_password"]}
+
+            sp_workflow_dict[genus_species] = {strain_sex: attributes}
+
     else:
         logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.full_name)
         sys.exit()
@@ -1164,13 +1244,6 @@ if __name__ == "__main__":
                 else:
                     jbrowse_menu_url_org = config["jbrowse_menu_url"] + "/sp/{genus_sp}/feature/{Genus}/{species}/mRNA/{id}".format(genus_sp=org_genus_species, Genus=org_genus[0].upper() + org_genus[1:], species=org_species, id="{id}")
 
-                # show_tool_add_organism = instance.tools.show_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.4+galaxy0", io_details=True)
-                # print(show_tool_add_organism)
-                # show_jbrowse_tool = instance.tools.show_tool(tool_id="toolshed.g2.bx.psu.edu/repos/iuc/jbrowse/jbrowse/1.16.11+galaxy0", io_details=True)
-                # print(show_jbrowse_tool)
-                # show_jbrowse_container_tool = instance.tools.show_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/jbrowse_to_container/jbrowse_to_container/0.5.1", io_details=True)
-                # print(show_jbrowse_container_tool)
-
                 # Replace values in the workflow dictionary
                 workflow_dict["steps"]["4"]["tool_state"] = workflow_dict["steps"]["4"]["tool_state"].replace("__MENU_URL_ORG__", jbrowse_menu_url_org)
                 workflow_dict["steps"]["6"]["tool_state"] = workflow_dict["steps"]["6"]["tool_state"].replace("__DISPLAY_NAME_ORG__", org_full_name).replace("__UNIQUE_ID_ORG__", org_species_folder_name)
@@ -1190,7 +1263,7 @@ if __name__ == "__main__":
                         logging.warning("Error finding workflow %s" % workflow_name)
 
                     # Finally, invoke the workflow alogn with its datamap, parameters and the history in which to invoke it
-                    #
instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True) + instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True) logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url)) @@ -1426,7 +1499,7 @@ if __name__ == "__main__": logging.warning("Error finding workflow %s" % workflow_name) # Finally, invoke the workflow alogn with its datamap, parameters and the history in which to invoke it - # instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True) + instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True) logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url)) @@ -1541,7 +1614,7 @@ if __name__ == "__main__": logging.warning("Error finding workflow %s" % workflow_name) # Finally, invoke the workflow alogn with its datamap, parameters and the history in which to invoke it - # instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True) + instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True) logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url)) @@ -1695,7 +1768,276 @@ if __name__ == "__main__": logging.warning("Error finding workflow %s" % workflow_name) # Finally, invoke 
the workflow alogn with its datamap, parameters and the history in which to invoke it
-                # instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True)
+                instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True)
+
+                logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url))
+
+
+    if workflow_type in ("interpro", "interproscan", "inteproscan"):
+        for sp_dict in sp_dict_list:
+
+            # Add and retrieve all analyses/organisms for the current input species and add their IDs to the input dictionary
+            current_sp_workflow_dict = create_sp_workflow_dict(sp_dict, main_dir=args.main_directory, config=config, workflow_type="interpro")
+
+            current_sp_key = list(current_sp_workflow_dict.keys())[0]
+            current_sp_value = list(current_sp_workflow_dict.values())[0]
+            current_sp_strain_sex_key = list(current_sp_value.keys())[0]
+            current_sp_strain_sex_value = list(current_sp_value.values())[0]
+
+            # Add the species dictionary to the complete dictionary
+            # This dictionary contains every organism present in the input file
+            # Its structure is the following:
+            # {genus species: {strain1_sex1: {variables_key: variables_values}, strain1_sex2: {variables_key: variables_values}}}
+            if not current_sp_key in all_sp_workflow_dict.keys():
+                all_sp_workflow_dict[current_sp_key] = current_sp_value
+            else:
+                all_sp_workflow_dict[current_sp_key][current_sp_strain_sex_key] = current_sp_strain_sex_value
+
+        # Invoke one workflow per "genus species" entry (k: genus species, v: its strain_sex dictionaries)
+        for k, v in all_sp_workflow_dict.items():
+            if len(list(v.keys())) == 1:
+                logging.info("Input organism %s: 1 species detected in input dictionary" % k)
+
+                # Set workflow path (1 organism)
+                workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-load_interproscan_1org_v1.ga")
+
+                # Instance object required variables
+                instance_url, email, password = None, None, None
+
+                # Set the galaxy instance variables
+                for k2, v2 in v.items():
+                    instance_url = v2["instance_url"]
+                    email = v2["email"]
+                    password = v2["password"]
+
+                instance = galaxy.GalaxyInstance(url=instance_url, email=email, password=password)
+
+                # Check if the versions of tools specified in the workflow are installed in galaxy
+                install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
+
+                organisms_key_names = list(v.keys())
+                org_dict = v[organisms_key_names[0]]
+
+                history_id = org_dict["history_id"]
+
+                # Organism attributes
+                org_genus = org_dict["genus"]
+                org_species = org_dict["species"]
+                org_genus_species = org_dict["genus_species"]
+                org_species_folder_name = org_dict["species_folder_name"]
+                org_full_name = org_dict["full_name"]
+                org_strain = org_dict["strain"]
+                org_sex = org_dict["sex"]
+                org_org_id = org_dict["org_id"]
+                # create_sp_workflow_dict() stores the analysis ID under the "interpro_analysis_id" key
+                org_interproscan_analysis_id = org_dict["interpro_analysis_id"]
+                org_interproscan_hda_id = org_dict["hda_ids"]["interproscan_hda_id"]
+
+                # Store these values into a dict for parameters logging/validation
+                org_parameters_dict = {
+                    "org_genus": org_genus,
+                    "org_species": org_species,
+                    "org_genus_species": org_genus_species,
+                    "org_species_folder_name": org_species_folder_name,
+                    "org_full_name": org_full_name,
+                    "org_strain": org_strain,
+                    "org_sex": org_sex,
+                    "org_org_id": org_org_id,
+                    "org_interproscan_analysis_id": org_interproscan_analysis_id,
+                    "org_interproscan_hda_id": org_interproscan_hda_id,
+                }
+
+                # Look for empty parameters values, throw a critical error if a parameter value is invalid
+                for param_name, param_value in org_parameters_dict.items():
+                    if param_value is None or param_value == "":
+                        logging.critical("Empty parameter value found for organism {0} (parameter: {1}, parameter value: {2})".format(org_full_name, param_name, param_value))
+                        sys.exit()
+
+                INTERPRO_FILE = "0"
+                LOAD_INTERPRO_FILE = "1"
+                POPULATE_MAT_VIEWS = "2"
+                INDEX_TRIPAL_DATA = "3"
+
+                # Set the workflow parameters (individual tools runtime parameters in the workflow)
+                workflow_parameters = {}
+                workflow_parameters[INTERPRO_FILE] = {}
+                workflow_parameters[LOAD_INTERPRO_FILE] = {"analysis_id": org_interproscan_analysis_id, "organism_id": org_org_id}
+                workflow_parameters[POPULATE_MAT_VIEWS] = {}
+                workflow_parameters[INDEX_TRIPAL_DATA] = {}
+
+                datamap = {}
+                datamap[INTERPRO_FILE] = {"src": "hda", "id": org_interproscan_hda_id}
+
+                with open(workflow_path, 'r') as ga_in_file:
+                    # Store the decoded json dictionary
+                    workflow_dict = json.load(ga_in_file)
+                    workflow_name = workflow_dict["name"]
+
+                    # Import the workflow in galaxy as a dict
+                    instance.workflows.import_workflow_dict(workflow_dict=workflow_dict)
+                    # Get its attributes
+                    workflow_attributes = instance.workflows.get_workflows(name=workflow_name)
+                    # Then get its ID (required to invoke the workflow)
+                    workflow_id = workflow_attributes[0]["id"]  # Index 0 is the most recently imported workflow (the one we want)
+                    show_workflow = instance.workflows.show_workflow(workflow_id=workflow_id)
+                    # Check if the workflow is found
+                    try:
+                        logging.debug("Workflow ID: %s" % workflow_id)
+                    except bioblend.ConnectionError:
+                        logging.warning("Error finding workflow %s" % workflow_name)
+
+                    # Finally, invoke the workflow alogn with its datamap, parameters and the history in which to invoke it
+                    instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True)
 
                 logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url))
+
+
+            if len(list(v.keys())) == 2:
+                logging.info("Input organism %s: 2 species detected in input dictionary" % k)
+
+                # Set workflow path (2 organisms)
+                workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-load_interproscan_2org_v1.ga")
+
+                # Instance object required variables
+                instance_url, email, password = None, None, None
+
+                # Set the galaxy instance variables
+                for k2, v2 in v.items():
+                    instance_url = v2["instance_url"]
+                    email = v2["email"]
+                    password = v2["password"]
+
+                instance = galaxy.GalaxyInstance(url=instance_url, email=email, password=password)
+
+                # Check if the versions of tools specified in the workflow are installed in galaxy
+                install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
+
+                organisms_key_names = list(v.keys())
+                org1_dict = v[organisms_key_names[0]]
+                org2_dict = v[organisms_key_names[1]]
+
+                history_id = org1_dict["history_id"]
+
+                # Organism 1 attributes
+                org1_genus = org1_dict["genus"]
+                org1_species = org1_dict["species"]
+                org1_genus_species = org1_dict["genus_species"]
+                org1_species_folder_name = org1_dict["species_folder_name"]
+                org1_full_name = org1_dict["full_name"]
+                org1_strain = org1_dict["strain"]
+                org1_sex = org1_dict["sex"]
+                org1_org_id = org1_dict["org_id"]
+                org1_interproscan_analysis_id = org1_dict["interpro_analysis_id"]
+                org1_interproscan_hda_id = org1_dict["hda_ids"]["interproscan_hda_id"]
+
+                # Store these values into a dict for parameters logging/validation
+                org1_parameters_dict = {
+                    "org1_genus": org1_genus,
+                    "org1_species": org1_species,
+                    "org1_genus_species": org1_genus_species,
+                    "org1_species_folder_name": org1_species_folder_name,
+                    "org1_full_name": org1_full_name,
+                    "org1_strain": org1_strain,
+                    "org1_sex": org1_sex,
+                    "org1_org_id": org1_org_id,
+                    "org1_interproscan_analysis_id": org1_interproscan_analysis_id,
+                    "org1_interproscan_hda_id": org1_interproscan_hda_id,
+                }
+
+                # Look for empty parameters values, throw a critical error if a parameter value is invalid
+                for param_name, param_value in org1_parameters_dict.items():
+                    if param_value is None or param_value == "":
+                        logging.critical("Empty parameter value found for organism {0} (parameter: {1}, parameter value: {2})".format(org1_full_name, param_name, param_value))
+                        sys.exit()
+
+                # Organism 2 attributes
+                org2_genus = org2_dict["genus"]
+                org2_species = org2_dict["species"]
+                org2_genus_species = org2_dict["genus_species"]
+                org2_species_folder_name = org2_dict["species_folder_name"]
+                org2_full_name = org2_dict["full_name"]
+                org2_strain = org2_dict["strain"]
+                org2_sex = org2_dict["sex"]
+                org2_org_id = org2_dict["org_id"]
+                org2_interproscan_analysis_id = org2_dict["interpro_analysis_id"]
+                org2_interproscan_hda_id = org2_dict["hda_ids"]["interproscan_hda_id"]
+
+                # Store these values into a dict for parameters logging/validation
+                org2_parameters_dict = {
+                    "org2_genus": org2_genus,
+                    "org2_species": org2_species,
+                    "org2_genus_species": org2_genus_species,
+                    "org2_species_folder_name": org2_species_folder_name,
+                    "org2_full_name": org2_full_name,
+                    "org2_strain": org2_strain,
+                    "org2_sex": org2_sex,
+                    "org2_org_id": org2_org_id,
+                    "org2_interproscan_analysis_id": org2_interproscan_analysis_id,
+                    "org2_interproscan_hda_id": org2_interproscan_hda_id,
+                }
+
+                # Look for empty parameters values, throw a critical error if a parameter value is invalid
+                for param_name, param_value in org2_parameters_dict.items():
+                    if param_value is None or param_value == "":
+                        logging.critical("Empty parameter value found for organism {0} (parameter: {1}, parameter value: {2})".format(org2_full_name, param_name, param_value))
+                        sys.exit()
+
+                # Source files association (ordered by their IDs in the workflow)
+                # WARNING: Be very careful about how the workflow is "organized" (i.e the order of the steps/datasets, check the .ga if there is any error)
+                INTERPRO_FILE_ORG1 = "0"
+                INTERPRO_FILE_ORG2 = "1"
+                LOAD_INTERPRO_FILE_ORG1 = "2"
+                LOAD_INTERPRO_FILE_ORG2 = "3"
+                POPULATE_MAT_VIEWS = "4"
+                INDEX_TRIPAL_DATA = "5"
+
+                # Set the workflow parameters (individual tools runtime parameters in the workflow)
+                workflow_parameters = {}
+
+                # Input files have no parameters (they are set via assigning the hda IDs in the datamap parameter of the bioblend method)
+                workflow_parameters[INTERPRO_FILE_ORG1] = {}
+                workflow_parameters[INTERPRO_FILE_ORG2] = {}
+
+                # Organism 1
+                workflow_parameters[LOAD_INTERPRO_FILE_ORG1] = {"organism_id": org1_org_id,
+                                                                "analysis_id": org1_interproscan_analysis_id}
+
+                # Organism 2
+                workflow_parameters[LOAD_INTERPRO_FILE_ORG2] = {"organism_id": org2_org_id,
+                                                                "analysis_id": org2_interproscan_analysis_id}
+
+                workflow_parameters[POPULATE_MAT_VIEWS] = {}
+                workflow_parameters[INDEX_TRIPAL_DATA] = {}
+
+                # Set datamap (mapping of input files in the workflow)
+                datamap = {}
+
+                # Organism 1
+                datamap[INTERPRO_FILE_ORG1] = {"src": "hda", "id": org1_interproscan_hda_id}
+
+                # Organism 2
+                datamap[INTERPRO_FILE_ORG2] = {"src": "hda", "id": org2_interproscan_hda_id}
+
+                with open(workflow_path, 'r') as ga_in_file:
+                    # Store the decoded json dictionary
+                    workflow_dict = json.load(ga_in_file)
+                    workflow_name = workflow_dict["name"]
+
+                    # Import the workflow in galaxy as a dict
+                    instance.workflows.import_workflow_dict(workflow_dict=workflow_dict)
+                    # Get its attributes
+                    workflow_attributes = instance.workflows.get_workflows(name=workflow_name)
+                    # Then get its ID (required to invoke the workflow)
+                    workflow_id = workflow_attributes[0]["id"]  # Index 0 is the most recently imported workflow (the one we want)
+                    show_workflow = instance.workflows.show_workflow(workflow_id=workflow_id)
+                    # Check if the workflow is found
+                    try:
+                        logging.debug("Workflow ID: %s" % workflow_id)
+                    except bioblend.ConnectionError:
+                        logging.warning("Error finding workflow %s" % workflow_name)
+
+                    # Finally, invoke the workflow alogn with its datamap, parameters and the history in which to invoke it
+                    instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True)
+
+                logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs
state".format(workflow_name, instance_url)) diff --git a/workflows_phaeoexplorer/Galaxy-Workflow-load_interproscan_1org_v1.ga b/workflows_phaeoexplorer/Galaxy-Workflow-load_interproscan_1org_v1.ga new file mode 100644 index 0000000000000000000000000000000000000000..d05f0c843520ef23f3a476e4a870dd37268a118b --- /dev/null +++ b/workflows_phaeoexplorer/Galaxy-Workflow-load_interproscan_1org_v1.ga @@ -0,0 +1,216 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "", + "format-version": "0.1", + "name": "load_inteproscan_1org1_v1", + "steps": { + "0": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "inteproscan file org1" + } + ], + "label": "inteproscan file org1", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 319.1999969482422, + "height": 82.19999694824219, + "left": 287, + "right": 487, + "top": 237, + "width": 200, + "x": 287, + "y": 237 + }, + "tool_id": null, + "tool_state": "{\"optional\": false}", + "tool_version": null, + "type": "data_input", + "uuid": "3b13466d-1b81-475a-b652-183ed9d24bfa", + "workflow_outputs": [ + { + "label": null, + "output_name": "output", + "uuid": "aff82f29-1971-4283-bfe8-cdab4857a215" + } + ] + }, + "1": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_load_interpro/load_interpro/2.3.6+galaxy0", + "errors": null, + "id": 1, + "input_connections": { + "input": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "analysis_id" + }, + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "input" + }, + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "organism_id" + }, + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "wait_for" + } + ], + "label": "interproscan 
load org 1", + "name": "Chado load InterProScan results", + "outputs": [ + { + "name": "results", + "type": "json" + } + ], + "position": { + "bottom": 371.3999938964844, + "height": 164.39999389648438, + "left": 595, + "right": 795, + "top": 207, + "width": 200, + "x": 595, + "y": 207 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_load_interpro/load_interpro/2.3.6+galaxy0", + "tool_shed_repository": { + "changeset_revision": "1e54f2717e74", + "name": "chado_load_interpro", + "owner": "gga", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"analysis_id\": {\"__class__\": \"RuntimeValue\"}, \"input\": {\"__class__\": \"RuntimeValue\"}, \"match_on_name\": \"false\", \"organism_id\": {\"__class__\": \"RuntimeValue\"}, \"parse_go\": \"false\", \"psql_target\": {\"method\": \"remote\", \"__current_case__\": 0}, \"query_type\": \"polypeptide\", \"re_name\": \"\", \"skip_missing\": \"false\", \"wait_for\": {\"__class__\": \"RuntimeValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.3.6+galaxy0", + "type": "tool", + "uuid": "a084abdc-a0f4-4670-a486-4aed4e0f61fa", + "workflow_outputs": [ + { + "label": null, + "output_name": "results", + "uuid": "dbaab44d-1494-4fbe-bd15-455f7cbf7307" + } + ] + }, + "2": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/gga/tripal_db_populate_mviews/db_populate_mviews/3.2.1.0", + "errors": null, + "id": 2, + "input_connections": { + "wait_for": { + "id": 1, + "output_name": "results" + } + }, + "inputs": [], + "label": null, + "name": "Populate materialized views", + "outputs": [ + { + "name": "results", + "type": "txt" + } + ], + "position": { + "bottom": 360.3999938964844, + "height": 154.39999389648438, + "left": 876, + "right": 1076, + "top": 206, + "width": 200, + "x": 876, + "y": 206 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/gga/tripal_db_populate_mviews/db_populate_mviews/3.2.1.0", + 
"tool_shed_repository": { + "changeset_revision": "3c08f32a3dc1", + "name": "tripal_db_populate_mviews", + "owner": "gga", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"mview\": \"\", \"wait_for\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "3.2.1.0", + "type": "tool", + "uuid": "2acbe412-1318-46e8-a178-72f50df36a07", + "workflow_outputs": [ + { + "label": "Populate Tripal materialized view(s)", + "output_name": "results", + "uuid": "08a4eec2-b95a-4c75-8116-b0fc9477812f" + } + ] + }, + "3": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/gga/tripal_db_index/db_index/3.2.1.1", + "errors": null, + "id": 3, + "input_connections": { + "wait_for": { + "id": 2, + "output_name": "results" + } + }, + "inputs": [], + "label": null, + "name": "Index Tripal data", + "outputs": [ + { + "name": "results", + "type": "txt" + } + ], + "position": { + "bottom": 340.6000061035156, + "height": 113.60000610351562, + "left": 1150, + "right": 1350, + "top": 227, + "width": 200, + "x": 1150, + "y": 227 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/gga/tripal_db_index/db_index/3.2.1.1", + "tool_shed_repository": { + "changeset_revision": "d55a39f12dda", + "name": "tripal_db_index", + "owner": "gga", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"expose\": {\"do_expose\": \"no\", \"__current_case__\": 0}, \"queues\": \"10\", \"table\": {\"mode\": \"website\", \"__current_case__\": 0}, \"tokenizer\": \"standard\", \"wait_for\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "3.2.1.1", + "type": "tool", + "uuid": "42d782ff-9e6b-49d8-bd17-37dc67f31a18", + "workflow_outputs": [ + { + "label": "Index Tripal data", + "output_name": "results", + "uuid": "8b38b48f-5d4e-49e6-ad70-748ad64dc901" + } + ] + } + }, + "tags": [], + "uuid": "9912daac-c00e-4d5e-a93e-a747bd1b3499", + "version": 
3 +} \ No newline at end of file diff --git a/workflows_phaeoexplorer/Galaxy-Workflow-load_interproscan_2org_v1.ga b/workflows_phaeoexplorer/Galaxy-Workflow-load_interproscan_2org_v1.ga new file mode 100644 index 0000000000000000000000000000000000000000..e15b1885b432f45ae64e9f3d9079537dd602d5ae --- /dev/null +++ b/workflows_phaeoexplorer/Galaxy-Workflow-load_interproscan_2org_v1.ga @@ -0,0 +1,335 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "", + "format-version": "0.1", + "name": "load_interproscan_2org_v1", + "steps": { + "0": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "interproscan file org 1" + } + ], + "label": "interproscan file org 1", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 350.1999969482422, + "height": 82.19999694824219, + "left": 414, + "right": 614, + "top": 268, + "width": 200, + "x": 414, + "y": 268 + }, + "tool_id": null, + "tool_state": "{\"optional\": false}", + "tool_version": null, + "type": "data_input", + "uuid": "bfce9a38-df8a-46ef-af2e-390a4982ebfc", + "workflow_outputs": [ + { + "label": null, + "output_name": "output", + "uuid": "d5d96afa-8dcc-4322-89fa-df56fd503d89" + } + ] + }, + "1": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "interproscan file org 2" + } + ], + "label": "interproscan file org 2", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 451.1999969482422, + "height": 82.19999694824219, + "left": 437, + "right": 637, + "top": 369, + "width": 200, + "x": 437, + "y": 369 + }, + "tool_id": null, + "tool_state": "{\"optional\": false}", + "tool_version": null, + "type": "data_input", + "uuid": "45f57c7f-f08c-4b3b-879d-1acd246868fb", + "workflow_outputs": [ + { + "label": null, + "output_name": "output", + "uuid": "f5e4ae10-fdbf-47e9-99d8-0644bf8bb46f" + } + ] + }, 
+ "2": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_load_interpro/load_interpro/2.3.6+galaxy0", + "errors": null, + "id": 2, + "input_connections": { + "input": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "analysis_id" + }, + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "input" + }, + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "organism_id" + }, + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "wait_for" + } + ], + "label": "inteproscan load org 1", + "name": "Chado load InterProScan results", + "outputs": [ + { + "name": "results", + "type": "json" + } + ], + "position": { + "bottom": 365.3999938964844, + "height": 164.39999389648438, + "left": 790, + "right": 990, + "top": 201, + "width": 200, + "x": 790, + "y": 201 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_load_interpro/load_interpro/2.3.6+galaxy0", + "tool_shed_repository": { + "changeset_revision": "1e54f2717e74", + "name": "chado_load_interpro", + "owner": "gga", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"analysis_id\": {\"__class__\": \"RuntimeValue\"}, \"input\": {\"__class__\": \"RuntimeValue\"}, \"match_on_name\": \"false\", \"organism_id\": {\"__class__\": \"RuntimeValue\"}, \"parse_go\": \"false\", \"psql_target\": {\"method\": \"remote\", \"__current_case__\": 0}, \"query_type\": \"polypeptide\", \"re_name\": \"\", \"skip_missing\": \"false\", \"wait_for\": {\"__class__\": \"RuntimeValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.3.6+galaxy0", + "type": "tool", + "uuid": "e6c509b4-ea16-4fda-b171-3dc781a54759", + "workflow_outputs": [ + { + "label": null, + "output_name": "results", + "uuid": 
"bfff8893-5c92-45a0-88ca-594b33a8b4e2" + } + ] + }, + "3": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_load_interpro/load_interpro/2.3.6+galaxy0", + "errors": null, + "id": 3, + "input_connections": { + "input": { + "id": 1, + "output_name": "output" + }, + "wait_for": { + "id": 2, + "output_name": "results" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "analysis_id" + }, + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "input" + }, + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "organism_id" + }, + { + "description": "runtime parameter for tool Chado load InterProScan results", + "name": "wait_for" + } + ], + "label": "interproscan load org 2", + "name": "Chado load InterProScan results", + "outputs": [ + { + "name": "results", + "type": "json" + } + ], + "position": { + "bottom": 555.3999938964844, + "height": 164.39999389648438, + "left": 819, + "right": 1019, + "top": 391, + "width": 200, + "x": 819, + "y": 391 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_load_interpro/load_interpro/2.3.6+galaxy0", + "tool_shed_repository": { + "changeset_revision": "1e54f2717e74", + "name": "chado_load_interpro", + "owner": "gga", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"analysis_id\": {\"__class__\": \"RuntimeValue\"}, \"input\": {\"__class__\": \"RuntimeValue\"}, \"match_on_name\": \"false\", \"organism_id\": {\"__class__\": \"RuntimeValue\"}, \"parse_go\": \"false\", \"psql_target\": {\"method\": \"remote\", \"__current_case__\": 0}, \"query_type\": \"polypeptide\", \"re_name\": \"\", \"skip_missing\": \"false\", \"wait_for\": {\"__class__\": \"RuntimeValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.3.6+galaxy0", + "type": "tool", + "uuid": "1832aceb-851c-4020-9c07-8193e8b2299a", + 
"workflow_outputs": [ + { + "label": null, + "output_name": "results", + "uuid": "be9f442f-7f19-4b84-94cf-9c2c28d9a6a5" + } + ] + }, + "4": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/gga/tripal_db_populate_mviews/db_populate_mviews/3.2.1.0", + "errors": null, + "id": 4, + "input_connections": { + "wait_for": { + "id": 3, + "output_name": "results" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Populate materialized views", + "name": "wait_for" + } + ], + "label": null, + "name": "Populate materialized views", + "outputs": [ + { + "name": "results", + "type": "txt" + } + ], + "position": { + "bottom": 459.3999938964844, + "height": 154.39999389648438, + "left": 1118, + "right": 1318, + "top": 305, + "width": 200, + "x": 1118, + "y": 305 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/gga/tripal_db_populate_mviews/db_populate_mviews/3.2.1.0", + "tool_shed_repository": { + "changeset_revision": "3c08f32a3dc1", + "name": "tripal_db_populate_mviews", + "owner": "gga", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"mview\": \"\", \"wait_for\": {\"__class__\": \"RuntimeValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "3.2.1.0", + "type": "tool", + "uuid": "a819f33b-e566-43e5-a467-e3410ce431ec", + "workflow_outputs": [ + { + "label": "Populate Tripal materialized view(s)", + "output_name": "results", + "uuid": "96483655-995e-4d4e-b8c6-7f602c974ee3" + } + ] + }, + "5": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/gga/tripal_db_index/db_index/3.2.1.1", + "errors": null, + "id": 5, + "input_connections": { + "wait_for": { + "id": 4, + "output_name": "results" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Index Tripal data", + "name": "wait_for" + } + ], + "label": null, + "name": "Index Tripal data", + "outputs": [ + { + "name": "results", + "type": "txt" + } + ], + "position": { + "bottom": 
442.6000061035156, + "height": 113.60000610351562, + "left": 1382, + "right": 1582, + "top": 329, + "width": 200, + "x": 1382, + "y": 329 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/gga/tripal_db_index/db_index/3.2.1.1", + "tool_shed_repository": { + "changeset_revision": "d55a39f12dda", + "name": "tripal_db_index", + "owner": "gga", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"expose\": {\"do_expose\": \"no\", \"__current_case__\": 0}, \"queues\": \"10\", \"table\": {\"mode\": \"website\", \"__current_case__\": 0}, \"tokenizer\": \"standard\", \"wait_for\": {\"__class__\": \"RuntimeValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "3.2.1.1", + "type": "tool", + "uuid": "489483da-c7c7-49a5-b5d4-353db41b1240", + "workflow_outputs": [ + { + "label": "Index Tripal data", + "output_name": "results", + "uuid": "3df4fd46-5dd9-49d6-b0d2-24425d9ba91e" + } + ] + } + }, + "tags": [], + "uuid": "ac96d70e-60b6-4455-8d14-9219d0674950", + "version": 1 +} \ No newline at end of file