diff --git a/gga_get_data.py b/gga_get_data.py index 6ff524b29333ee5470ce146b54976e6e3d448c5a..d57f0fb51fafcc374c9394834f602ded41d0aafe 100755 --- a/gga_get_data.py +++ b/gga_get_data.py @@ -124,12 +124,18 @@ class GetData(speciesData.SpeciesData): # search_excluded_datasets = ["interpro_path", "orthofinder_path", "blastp_path", "blastx_path"] # # These datasets will not be searched if missing in the input file + dataset_shortname = "" + if self.sex is not None and self.sex != "": + dataset_shortname = self.genus[0].lower() + "_" + self.species.lower() + "_" + self.sex[0].lower() + else: + dataset_shortname = self.genus[0].lower() + "_" + self.species.lower() + # Copy datasets in the organism src_data dir tree correct folder for k, v in datasets_to_get.items(): if v: # If dataset is not present in input file, skip copy if k in genome_datasets: logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir)) - genome_fname = "v%s.fasta" % self.genome_version + genome_fname = "{0}_v{1}.fasta".format(dataset_shortname, self.genome_version) try: shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, genome_fname)) except Exception as exc: @@ -137,19 +143,19 @@ class GetData(speciesData.SpeciesData): elif k in annotation_datasets: dataset_fname = "" if k == "gff_path": - dataset_fname = "OGS%s.gff" % self.ogs_version + dataset_fname = "{0}_OGS{1}.gff".format(dataset_shortname, self.ogs_version) elif k == "transcripts_path": - dataset_fname = "OGS%s_transcripts.fasta" % self.ogs_version + dataset_fname = "{0}_OGS{1}_transcripts.fasta".format(dataset_shortname, self.ogs_version) elif k == "proteins_path": - dataset_fname = "OGS%s_proteins.fasta" % self.ogs_version + dataset_fname = "{0}_OGS{1}_proteins.fasta".format(dataset_shortname, self.ogs_version) elif k == "orthofinder_path": - dataset_fname = "OGS%s_orthofinder.tsv" % self.ogs_version + dataset_fname = "{0}_OGS{1}_orthofinder.tsv".format(dataset_shortname, self.ogs_version) elif k == "interpro_path": - dataset_fname = "OGS%s_interproscan.xml" % self.ogs_version + dataset_fname = "{0}_OGS{1}_interproscan.xml".format(dataset_shortname, self.ogs_version) elif k == "blastp_path": - dataset_fname = "OGS%s_blastp.xml" % self.ogs_version + dataset_fname = "{0}_OGS{1}_blastp.xml".format(dataset_shortname, self.ogs_version) elif k == "blastx_path": - dataset_fname = "OGS%s_blastx.xml" % self.ogs_version + dataset_fname = "{0}_OGS{1}_blastx.xml".format(dataset_shortname, self.ogs_version) logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir)) try: shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, dataset_fname)) diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py index 9b21871fb3f170511ea94730495dabd70f5dbb49..78856471f248463241e8d5e809527513709cdd37 100755 --- a/run_workflow_phaeoexplorer.py +++ b/run_workflow_phaeoexplorer.py @@ -169,6 +169,7 @@ class RunWorkflow(speciesData.SpeciesData): name = toolshed_dict["name"] owner = toolshed_dict["owner"] toolshed = "https://" + toolshed_dict["tool_shed"] + logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name)) self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, changeset_revision=changeset_revision, @@ -183,6 +184,7 @@ class RunWorkflow(speciesData.SpeciesData): name = toolshed_dict["name"] owner = toolshed_dict["owner"] toolshed = "https://" + toolshed_dict["tool_shed"] + logging.warning("Installing 
changeset revision {0} for {1}".format(changeset_revision, name)) self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, changeset_revision=changeset_revision, @@ -197,6 +199,7 @@ class RunWorkflow(speciesData.SpeciesData): name = toolshed_dict["name"] owner = toolshed_dict["owner"] toolshed = "https://" + toolshed_dict["tool_shed"] + logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name)) self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, changeset_revision=changeset_revision, @@ -211,7 +214,8 @@ class RunWorkflow(speciesData.SpeciesData): name = toolshed_dict["name"] owner = toolshed_dict["owner"] toolshed = "https://" + toolshed_dict["tool_shed"] - logging.warning("Installing changeset revision %s for add_analysis" % changeset_revision) + logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name)) + self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, changeset_revision=changeset_revision, install_tool_dependencies=True, @@ -247,118 +251,112 @@ class RunWorkflow(speciesData.SpeciesData): self.set_get_history() tool_version = "2.3.4+galaxy0" - # Add organism (species) to chado - logging.info("Adding organism to the instance's chado database") - if self.common == "" or self.common is None: - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.abbreviation}) - else: - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.common}) - # Add OGS analysis to chado - logging.info("Adding OGS analysis to the instance's chado database") - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"name": self.full_name_lowercase + " OGS" + self.ogs_version, - "program": "Performed by Genoscope", - "programversion": str(self.sex + " OGS" + self.ogs_version), - "sourcename": "Genoscope", - "date_executed": self.date}) - # Add genome analysis to chado - logging.info("Adding genome analysis to the instance's chado database") - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" % tool_version, + get_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0") + + get_organisms = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/%s" % tool_version, history_id=self.history_id, - tool_inputs={"name": self.full_name_lowercase + " genome v" + self.genome_version, - "program": "Performed by Genoscope", - "programversion": str(self.sex + "genome v" + self.genome_version), - "sourcename": "Genoscope", - "date_executed": self.date}) + tool_inputs={}) - # # TODO: check output of get_organism --> if empty or wrong --> rerun --> else: go next - # # Get 
organism and analyses IDs (runtime inputs for workflow) - # time.sleep(3) - # # Get the ID for the current organism in chado - # org = self.instance.tools.run_tool( - # tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/%s" % tool_version, - # history_id=self.history_id, - # tool_inputs={"abbr": self.abbreviation, - # "genus": self.genus_uppercase, - # "species": self.chado_species_name, - # "common": self.common}) - - # time.sleep(3) - # # Run tool again (sometimes the tool doesn't return anything despite the organism already being in the db) - # org = self.instance.tools.run_tool( - # tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/%s" % tool_version, - # history_id=self.history_id, - # tool_inputs={"abbr": self.abbreviation, - # "genus": self.genus_uppercase, - # "species": self.chado_species_name, - # "common": self.common}) - - # org_job_out = org["outputs"][0]["id"] - # org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out) - # try: - # org_output = json.loads(org_json_output)[0] - # self.org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools - # except IndexError: - # logging.critical("No organism matching " + self.full_name + " exists in the instance's chado database") - # sys.exit() - - - def get_genome_analysis_id(self): - """ - """ + time.sleep(10) # Ensure the tool has had time to complete + org_outputs = get_organisms["outputs"] # Outputs from the get_organism tool + org_job_out_id = org_outputs[0]["id"] # ID of the get_organism output dataset (list of dicts) + org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) # Download the dataset + org_output = json.loads(org_json_output) # Turn the dataset into a list for parsing - # Get the ID for the genome analysis in chado - genome_analysis = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.4+galaxy0", - history_id=self.history_id, - tool_inputs={"name": self.full_name_lowercase + " genome v" + self.genome_version}) - genome_analysis_job_out = genome_analysis["outputs"][0]["id"] - genome_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=genome_analysis_job_out) - try: - genome_analysis_output = json.loads(genome_analysis_json_output)[0] - self.genome_analysis_id = str(genome_analysis_output["analysis_id"]) - except IndexError as exc: - logging.critical("no matching genome analysis exists in the instance's chado database") - sys.exit(exc) + org_id = None - return self.genome_analysis_id + # Look up list of outputs (dictionaries) + for organism_output_dict in org_output: + if organism_output_dict["genus"] == self.genus and organism_output_dict["species"] == "{0} {1}".format(self.species, self.sex): + correct_organism_id = str(organism_output_dict["organism_id"]) # id needs to be a str to be recognized by chado tools + org_id = str(correct_organism_id) - def get_ogs_analysis_id(self): - """ - """ - # Get the ID for the OGS analysis in chado - ogs_analysis = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.4+galaxy0", + if org_id is None: + if self.common == "" or self.common is None: + add_org_job = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, + history_id=self.history_id, + 
tool_inputs={"abbr": self.abbreviation, + "genus": self.genus_uppercase, + "species": self.chado_species_name, + "common": self.abbreviation}) + org_job_out_id = add_org_job["outputs"][0]["id"] + org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) + org_output = json.loads(org_json_output) + org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools + else: + add_org_job = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, + history_id=self.history_id, + tool_inputs={"abbr": self.abbreviation, + "genus": self.genus_uppercase, + "species": self.chado_species_name, + "common": self.common}) + org_job_out_id = add_org_job["outputs"][0]["id"] + org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id) + org_output = json.loads(org_json_output) + org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools + + + get_analyses = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/%s" % tool_version, history_id=self.history_id, - tool_inputs={"name": self.full_name_lowercase + " OGS" + self.ogs_version}) - ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"] - ogs_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=ogs_analysis_job_out) - try: - ogs_analysis_output = json.loads(ogs_analysis_json_output)[0] - self.ogs_analysis_id = str(ogs_analysis_output["analysis_id"]) - except IndexError as exc: - logging.critical("No matching OGS analysis exists in the instance's chado database") - sys.exit(exc) + tool_inputs={}) - return self.ogs_analysis_id + time.sleep(10) + analysis_outputs = get_analyses["outputs"] + analysis_job_out_id = analysis_outputs[0]["id"] + analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id) + analysis_output = json.loads(analysis_json_output) + + ogs_analysis_id = None + genome_analysis_id = None + + # Look up list of outputs (dictionaries) + for analysis_output_dict in analysis_output: + if analysis_output_dict["name"] == self.full_name_lowercase + " OGS" + self.ogs_version: + ogs_analysis_id = str(analysis_output_dict["analysis_id"]) + if analysis_output_dict["name"] == self.full_name_lowercase + " genome v" + self.genome_version: + genome_analysis_id = str(analysis_output_dict["analysis_id"]) + + + if ogs_analysis_id is None: + add_ogs_analysis_job = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" % tool_version, + history_id=self.history_id, + tool_inputs={"name": self.full_name_lowercase + " OGS" + self.ogs_version, + "program": "Performed by Genoscope", + "programversion": str(self.sex + " OGS" + self.ogs_version), + "sourcename": "Genoscope", + "date_executed": self.date}) + analysis_outputs = add_ogs_analysis_job["outputs"] + analysis_job_out_id = analysis_outputs[0]["id"] + analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id) + analysis_output = json.loads(analysis_json_output) + ogs_analysis_id = str(analysis_output["analysis_id"]) + + if genome_analysis_id is None: + add_genome_analysis_job = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" 
% tool_version, + history_id=self.history_id, + tool_inputs={"name": self.full_name_lowercase + " genome v" + self.genome_version, + "program": "Performed by Genoscope", + "programversion": str(self.sex + "genome v" + self.genome_version), + "sourcename": "Genoscope", + "date_executed": self.date}) + analysis_outputs = add_genome_analysis_job["outputs"] + analysis_job_out_id = analysis_outputs[0]["id"] + analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id) + analysis_output = json.loads(analysis_json_output) + genome_analysis_id = str(analysis_output["analysis_id"]) + + logging.debug({"org_id": org_id, "genome_analysis_id": genome_analysis_id, "ogs_analysis_id": ogs_analysis_id}) + return({"org_id": org_id, "genome_analysis_id": genome_analysis_id, "ogs_analysis_id": ogs_analysis_id}) def add_interproscan_analysis(self): @@ -510,17 +508,7 @@ class RunWorkflow(speciesData.SpeciesData): return invocation_report - - - def get_datasets_ldda_ids(self): - """ - Get and return the ldda_ids (and names) for the datasets in the library - """ - - return 0 - - - def import_datasets_into_history(self, imported_datasets_ids): + def import_datasets_into_history(self): """ Find datasets in a library, get their ID and import them into the current history if they are not already @@ -545,7 +533,6 @@ class RunWorkflow(speciesData.SpeciesData): folders_ids[i["name"]] = i["id"] # Iterating over the folders to find datasets and map datasets to their IDs - logging.debug("Datasets IDs: ") for k, v in folders_ids.items(): if k == "/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version): sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) @@ -555,7 +542,6 @@ class RunWorkflow(speciesData.SpeciesData): if e["name"].endswith(".fasta"): self.datasets["genome_file"] = e["ldda_id"] self.datasets_name["genome_file"] = e["name"] - logging.debug("\tGenome file:\t" + e["name"] + ": " + e["ldda_id"]) if k == "/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version): sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) @@ -563,59 +549,86 @@ class RunWorkflow(speciesData.SpeciesData): for e in v2: if type(e) == dict: if "transcripts" in e["name"]: + # the attributes datasets is set in the function get_instance_attributes() self.datasets["transcripts_file"] = e["ldda_id"] self.datasets_name["transcripts_file"] = e["name"] - logging.debug("\tTranscripts file:\t" + e["name"] + ": " + e["ldda_id"]) elif "proteins" in e["name"]: self.datasets["proteins_file"] = e["ldda_id"] self.datasets_name["proteins_file"] = e["name"] - logging.debug("\tProteins file:\t" + e["name"] + ": " + e["ldda_id"]) elif "gff" in e["name"]: self.datasets["gff_file"] = e["ldda_id"] self.datasets_name["gff_file"] = e["name"] - logging.debug("\tGFF file:\t" + e["name"] + ": " + e["ldda_id"]) elif "interpro" in e["name"]: self.datasets["interproscan_file"] = e["ldda_id"] self.datasets_name["interproscan_file"] = e["name"] - logging.debug("\tInterproscan file:\t" + e["name"] + ": " + e["ldda_id"]) elif "blastp" in e["name"]: self.datasets["blast_diamond_file"] = e["ldda_id"] self.datasets_name["blast_diamond_file"] = e["name"] - logging.debug("\tBlastp diamond file:\t" + e["name"] + ": " + e["ldda_id"]) - logging.debug("Uploading datasets into history %s" % self.history_id) + dataset_shortname = "" + if self.sex is not None and self.sex != "": + dataset_shortname = self.genus[0].lower() + "_" + self.species.lower() + "_" + 
self.sex[0].lower() + else: + dataset_shortname = self.genus[0].lower() + "_" + self.species.lower() + + history_datasets_li = self.instance.datasets.get_datasets() + genome_hda_id, gff_hda_id, transcripts_hda_id, proteins_hda_id, blast_diamond_hda_id, interproscan_hda_id = None, None, None, None, None, None + + # Finding datasets in history (matches datasets name, most specific suffixes first so the genome fasta is not matched by the transcripts/proteins files) + for dataset in history_datasets_li: + dataset_name = dataset["name"] + if dataset_shortname in dataset_name: + dataset_id = dataset["id"] + if dataset_name.endswith("transcripts.fasta"): + transcripts_hda_id = dataset_id + elif dataset_name.endswith("proteins.fasta"): + proteins_hda_id = dataset_id + elif dataset_name.endswith(".fasta"): + genome_hda_id = dataset_id + elif dataset_name.endswith(".gff"): + gff_hda_id = dataset_id + elif dataset_name.endswith("interproscan.xml"): + interproscan_hda_id = dataset_id + elif dataset_name.endswith("blastp.xml"): + blast_diamond_hda_id = dataset_id + + # Import each dataset into history if it is not imported + logging.debug("Uploading datasets into history %s" % self.history_id) - first_hda_ids = self.get_datasets_hda_ids(imported_datasets_ids=imported_datasets_ids) - - if first_hda_ids["genome_hda_id"] is not None: - self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"]) - if first_hda_ids["gff_hda_id"] is not None: - self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"]) - if first_hda_ids["transcripts_hda_id"] is not None: - self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"]) - if first_hda_ids["proteins_hda_id"] is not None: - self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"]) - if first_hda_ids["interproscan_hda_id"] is None: + if genome_hda_id is None: + genome_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"]) + genome_hda_id = genome_dataset_upload["id"] + if gff_hda_id is None: + gff_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"]) + gff_hda_id = gff_dataset_upload["id"] + if transcripts_hda_id is None: + transcripts_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"]) + transcripts_hda_id = transcripts_dataset_upload["id"] + if proteins_hda_id is None: + proteins_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"]) + proteins_hda_id = proteins_dataset_upload["id"] + if interproscan_hda_id is None: try: - self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"]) + interproscan_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"]) + interproscan_hda_id = interproscan_dataset_upload["id"] except Exception as exc: logging.debug("Interproscan file not found in library (history: {0})".format(self.history_id)) - if first_hda_ids["blast_diamond_hda_id"] is None: + if blast_diamond_hda_id is None: try: - self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"]) + blast_diamond_dataset_upload = 
self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"]) + blast_diamond_hda_id = blast_diamond_dataset_upload["id"] except Exception as exc: logging.debug("Blastp file not found in library (history: {0})".format(self.history_id)) - # _datasets = self.instance.datasets.get_datasets() - # with open(os.path.join(self.main_dir, "datasets_ids.json"), "w") as datasets_ids_outfile: - # datasets_ids_outfile.write(str(_datasets)) - # Return a dict made of the hda ids - return self.get_datasets_hda_ids(imported_datasets_ids=first_hda_ids["imported_datasets_ids"]) + return {"genome_hda_id": genome_hda_id, + "gff_hda_id": gff_hda_id, + "transcripts_hda_id": transcripts_hda_id, + "proteins_hda_id": proteins_hda_id, + "blast_diamond_hda_id": blast_diamond_hda_id, + "interproscan_hda_id": interproscan_hda_id} - def get_datasets_hda_ids(self, imported_datasets_ids): + def get_datasets_hda_ids(self): """ Get the hda IDs of the datasets imported into an history @@ -677,81 +690,6 @@ class RunWorkflow(speciesData.SpeciesData): "imported_datasets_ids": imported_datasets_ids} - def get_organism_id(self): - """ - Retrieve current organism ID - Will try to add it to Chado if the organism ID can't be found - - :return: - """ - - tool_version = "2.3.4+galaxy0" - time.sleep(3) - - # # Get the ID for the current organism in chado - # org = self.instance.tools.run_tool( - # tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.3", - # history_id=self.history_id, - # tool_inputs={"abbr": self.abbreviation, - # "genus": self.genus_uppercase, - # "species": self.chado_species_name, - # "common": self.common}) - - # time.sleep(3) - - # Run tool again (sometimes the tool doesn't return anything despite the organism already being in the db) - org = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0", - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.common}) - - org_job_out = org["outputs"][0]["id"] - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out) - try: - org_output = json.loads(org_json_output)[0] - self.org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools - except IndexError: - logging.warning("No organism matching " + self.full_name + " exists in the instance's chado database, adding it") - if self.common == "" or self.common is None: - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.abbreviation}) - else: - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version, - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.common}) - # Run tool again (sometimes the tool doesn't return anything despite the organism already being in the db) - org = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0", - 
history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus_uppercase, - "species": self.chado_species_name, - "common": self.common}) - - org_job_out = org["outputs"][0]["id"] - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out) - try: - org_output = json.loads(org_json_output)[0] - self.org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools - except IndexError: - logging.critical("Cannot add {0} as an organism in Chado, please check the galaxy instance {1}".format(self.full_name, self.instance_url)) - sys.exit() - - return self.org_id - - def run_workflow(workflow_path, workflow_parameters, datamap, config, input_species_number): """ Run a workflow in galaxy @@ -810,9 +748,7 @@ def run_workflow(workflow_path, workflow_parameters, datamap, config, input_spec - - -def create_sp_workflow_dict(sp_dict, main_dir, config, imported_datasets_ids): +def create_sp_workflow_dict(sp_dict, main_dir, config): """ """ @@ -847,13 +783,16 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, imported_datasets_ids): history_id = run_workflow_for_current_organism.set_get_history() run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools() - run_workflow_for_current_organism.add_organism_ogs_genome_analyses() - - org_id = run_workflow_for_current_organism.get_organism_id() - genome_analysis_id = run_workflow_for_current_organism.get_genome_analysis_id() - ogs_analysis_id = run_workflow_for_current_organism.get_ogs_analysis_id() + ids = run_workflow_for_current_organism.add_organism_ogs_genome_analyses() + + org_id = None + genome_analysis_id = None + ogs_analysis_id = None + org_id = ids["org_id"] + genome_analysis_id = ids["genome_analysis_id"] + ogs_analysis_id = ids["ogs_analysis_id"] instance_attributes = run_workflow_for_current_organism.get_instance_attributes() - hda_ids = run_workflow_for_current_organism.import_datasets_into_history(imported_datasets_ids=imported_datasets_ids) + hda_ids = run_workflow_for_current_organism.import_datasets_into_history() strain_sex = "{0}_{1}".format(run_workflow_for_current_organism.strain, run_workflow_for_current_organism.sex) genus_species = run_workflow_for_current_organism.genus_species @@ -914,13 +853,12 @@ def install_changesets_revisions_from_workflow(instance, workflow_path): # (If it's not installed, the show_tool version returned will be a default version with the suffix "XXXX+0") if show_tool["version"] != v["tool_version"]: # If it doesn't match, proceed to install of the correct changeset revision - # logging.warning("Tool versions don't match for {0} (changeset installed: {1} | changeset required: {2}). 
Installing changeset revision {3}...".format(v["tool_shed_repository"]["name"], show_tool["changeset_revision"], v["tool_shed_repository"]["changeset_revision"], v["tool_shed_repository"]["changeset_revision"])) toolshed = "https://" + v["tool_shed_repository"]["tool_shed"] name = v["tool_shed_repository"]["name"] owner = v["tool_shed_repository"]["owner"] changeset_revision = v["tool_shed_repository"]["changeset_revision"] - - logging.debug("Installing changeset {0} for tool {1}".format(changeset_revision, name)) + + logging.warning("Installed tool versions for tool {0} do not match the version required by the specified workflow, installing changeset {1}".format(name, changeset_revision)) # Install changeset instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner, @@ -928,6 +866,12 @@ def install_changesets_revisions_from_workflow(instance, workflow_path): install_tool_dependencies=True, install_repository_dependencies=False, install_resolver_dependencies=True) + else: + toolshed = "https://" + v["tool_shed_repository"]["tool_shed"] + name = v["tool_shed_repository"]["name"] + owner = v["tool_shed_repository"]["owner"] + changeset_revision = v["tool_shed_repository"]["changeset_revision"] + logging.debug("Installed tool versions for tool {0} match the version in the specified workflow (changeset {1})".format(name, changeset_revision)) logging.info("Tools versions and changesets from workflow validated") @@ -992,17 +936,10 @@ if __name__ == "__main__": config = utilities.parse_config(args.config) all_sp_workflow_dict = {} - - # IDs of already imported datasets (useful in case there are several species within the same instance) - # TODO: Not a very smart way to filter datasets as the list will grow at every input species, whereas - # we only need to do the ID lookup for a single galaxy instance --> Possible issue where we encounter 2 identical IDs in - # different instances - imported_datasets_ids = [] - for sp_dict in sp_dict_list: # Add and retrieve all analyses/organisms for the current input species and add their IDs to the input dictionary - current_sp_workflow_dict = create_sp_workflow_dict(sp_dict, main_dir=args.main_directory, config=config, imported_datasets_ids=imported_datasets_ids) + current_sp_workflow_dict = create_sp_workflow_dict(sp_dict, main_dir=args.main_directory, config=config) current_sp_key = list(current_sp_workflow_dict.keys())[0] current_sp_value = list(current_sp_workflow_dict.values())[0] @@ -1087,6 +1024,29 @@ if __name__ == "__main__": org1_transcripts_hda_id = org1_dict["hda_ids"]["transcripts_hda_id"] org1_proteins_hda_id = org1_dict["hda_ids"]["proteins_hda_id"] org1_gff_hda_id = org1_dict["hda_ids"]["gff_hda_id"] + + # Store these values into a dict for parameters logging/validation + org1_parameters_dict = { + "org1_genus": org1_genus, + "org1_species": org1_species, + "org1_genus_species": org1_genus_species, + "org1_species_folder_name": org1_species_folder_name, + "org1_full_name": org1_full_name, + "org1_strain": org1_strain, + "org1_sex": org1_sex, + "org1_org_id": org1_org_id, + "org1_genome_analysis_id": org1_genome_analysis_id, + "org1_ogs_analysis_id": org1_ogs_analysis_id, + "org1_transcripts_hda_id": org1_transcripts_hda_id, + "org1_proteins_hda_id": org1_proteins_hda_id, + "org1_gff_hda_id": org1_gff_hda_id, + } + + # Look for empty parameters values, throw a critical error if a parameter value is invalid + for param_name, param_value in org1_parameters_dict.items(): + if param_value is None or param_value 
== "": + logging.critical("Empty parameter found for organism {0} (parameter: {1}, parameter value: {2})".format(org1_full_name, param_name, param_value)) + sys.exit() # Organism 2 attributes org2_genus = org2_dict["genus"] @@ -1103,7 +1063,30 @@ if __name__ == "__main__": org2_transcripts_hda_id = org2_dict["hda_ids"]["transcripts_hda_id"] org2_proteins_hda_id = org2_dict["hda_ids"]["proteins_hda_id"] org2_gff_hda_id = org2_dict["hda_ids"]["gff_hda_id"] + + # Store these values into a dict for parameters logging/validation + org2_parameters_dict = { + "org2_genus": org2_genus, + "org2_species": org2_species, + "org2_genus_species": org2_genus_species, + "org2_species_folder_name": org2_species_folder_name, + "org2_full_name": org2_full_name, + "org2_strain": org2_strain, + "org2_sex": org2_sex, + "org2_org_id": org2_org_id, + "org2_genome_analysis_id": org2_genome_analysis_id, + "org2_ogs_analysis_id": org2_ogs_analysis_id, + "org2_transcripts_hda_id": org2_transcripts_hda_id, + "org2_proteins_hda_id": org2_proteins_hda_id, + "org2_gff_hda_id": org2_gff_hda_id, + } + # Look for empty parameters values, throw a critical error if a parameter value is invalid + logging.info("Parameters for organism 2 (%s)" % org2_full_name) + for param_name, param_value in org2_parameters_dict.items(): + if param_value is None or param_value == "": + logging.critical("Empty parameter found for organism {0} (parameter: {1}, parameter value: {2})".format(org2_full_name, param_name, param_value)) + sys.exit() # Source files association (ordered by their IDs in the workflow) GENOME_FASTA_FILE_ORG1 = "0" @@ -1177,50 +1160,54 @@ if __name__ == "__main__": datamap[GFF_FILE_ORG2] = {"src": "hda", "id": org2_gff_hda_id} datamap[PROTEINS_FASTA_FILE_ORG2] = {"src": "hda", "id": org2_proteins_hda_id} - with open(workflow_path, 'r') as ga_in_file: + # with open(workflow_path, 'r') as ga_in_file: - # Store the decoded json dictionary - workflow_dict = json.load(ga_in_file) - workflow_name = workflow_dict["name"] + # # Store the decoded json dictionary + # workflow_dict = json.load(ga_in_file) + # workflow_name = workflow_dict["name"] - # For the Jbrowse tool, we unfortunately have to manually edit the parameters instead of setting them - # as runtime values, using runtime parameters makes the tool throw an internal critical error ("replace not found" error) - # Scratchgmod test: need "http" (or "https"), the hostname (+ port) - if "menu_url" not in config.keys(): - jbrowse_menu_url_org1 = "https://{hostname}/sp/{genus_sp}/feature/{Genus}/{species}/mRNA/{id}".format(hostname=config["hostname"], genus_sp=org1_genus_species, Genus=org1_genus[0].upper() + org1_genus[1:], species=org1_species, id="\{id\}") - jbrowse_menu_url_org2 = "https://{hostname}/sp/{genus_sp}/feature/{Genus}/{species}/mRNA/{id}".format(hostname=config["hostname"], genus_sp=org2_genus_species, Genus=org2_genus[0].upper() + org2_genus[1:], species=org2_species, id="\{id\}") - else: - jbrowse_menu_url_org1 = config["menu_url"] - jbrowse_menu_url_org2 = jbrowse_menu_url_org1 + # # For the Jbrowse tool, we unfortunately have to manually edit the parameters instead of setting them + # # as runtime values, using runtime parameters makes the tool throw an internal critical error ("replace not found" error) + # # Scratchgmod test: need "http" (or "https"), the hostname (+ port) + # if "menu_url" not in config.keys(): + # jbrowse_menu_url_org1 = "https://{hostname}/sp/{genus_sp}/feature/{Genus}/{species}/mRNA/{id}".format(hostname=config["hostname"], 
genus_sp=org1_genus_species, Genus=org1_genus[0].upper() + org1_genus[1:], species=org1_species, id="\{id\}") + # jbrowse_menu_url_org2 = "https://{hostname}/sp/{genus_sp}/feature/{Genus}/{species}/mRNA/{id}".format(hostname=config["hostname"], genus_sp=org2_genus_species, Genus=org2_genus[0].upper() + org2_genus[1:], species=org2_species, id="\{id\}") + # else: + # jbrowse_menu_url_org1 = config["menu_url"] + # jbrowse_menu_url_org2 = jbrowse_menu_url_org1 + + # # Replace values in the workflow dictionary + # workflow_dict["steps"]["7"]["tool_state"] = workflow_dict["steps"]["7"]["tool_state"].replace("__MENU_URL_ORG1__", jbrowse_menu_url_org1) + # workflow_dict["steps"]["8"]["tool_state"] = workflow_dict["steps"]["8"]["tool_state"].replace("__MENU_URL_ORG2__", jbrowse_menu_url_org2) + # # The UNIQUE_ID is specific to a combination genus_species_strain_sex so every combination should have its unique workflow + # # in galaxy --> define a naming method for these workflows + # workflow_dict["steps"]["10"]["tool_state"] = workflow_dict["steps"]["10"]["tool_state"].replace("__FULL_NAME_ORG1__", org1_full_name).replace("__UNIQUE_ID_ORG1__", org1_species_folder_name) + # workflow_dict["steps"]["10"]["tool_state"] = workflow_dict["steps"]["10"]["tool_state"].replace("__FULL_NAME_ORG2__", org2_full_name).replace("__UNIQUE_ID_ORG2__", org2_species_folder_name) + + # # Import the workflow in galaxy as a dict + # instance.workflows.import_workflow_dict(workflow_dict=workflow_dict) + + # # Get its attributes + # workflow_attributes = instance.workflows.get_workflows(name=workflow_name) + # # Then get its ID (required to invoke the workflow) + # workflow_id = workflow_attributes[0]["id"] # Index 0 is the most recently imported workflow (the one we want) + # show_workflow = instance.workflows.show_workflow(workflow_id=workflow_id) + # # Check if the workflow is found + # try: + # logging.debug("Workflow ID: %s" % workflow_id) + # except bioblend.ConnectionError: + # logging.warning("Error finding workflow %s" % workflow_name) + + # # Finally, invoke the workflow alogn with its datamap, parameters and the history in which to invoke it + # instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True) + + # logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url)) - # Replace values in the workflow dictionary - workflow_dict["steps"]["7"]["tool_state"] = workflow_dict["steps"]["7"]["tool_state"].replace("__MENU_URL_ORG1__", jbrowse_menu_url_org1) - workflow_dict["steps"]["8"]["tool_state"] = workflow_dict["steps"]["8"]["tool_state"].replace("__MENU_URL_ORG2__", jbrowse_menu_url_org2) - # The UNIQUE_ID is specific to a combination genus_species_strain_sex so every combination should have its unique workflow - # in galaxy --> define a naming method for these workflows - workflow_dict["steps"]["10"]["tool_state"] = workflow_dict["steps"]["10"]["tool_state"].replace("__FULL_NAME_ORG1__", org1_full_name).replace("__UNIQUE_ID_ORG1__", org1_species_folder_name) - workflow_dict["steps"]["10"]["tool_state"] = workflow_dict["steps"]["10"]["tool_state"].replace("__FULL_NAME_ORG2__", org2_full_name).replace("__UNIQUE_ID_ORG2__", org2_species_folder_name) - print(workflow_dict) - # Import the workflow in galaxy as a dict - instance.workflows.import_workflow_dict(workflow_dict=workflow_dict) - # Get its attributes - 
workflow_attributes = instance.workflows.get_workflows(name=workflow_name) - # Then get its ID (required to invoke the workflow) - workflow_id = workflow_attributes[0]["id"] # Index 0 is the most recently imported workflow (the one we want) - show_workflow = instance.workflows.show_workflow(workflow_id=workflow_id) - # Check if the workflow is found - try: - logging.debug("Workflow ID: %s" % workflow_id) - except bioblend.ConnectionError: - logging.warning("Error finding workflow %s" % workflow_name) - # Finally, invoke the workflow alogn with its datamap, parameters and the history in which to invoke it - instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True) - logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url)) # Get the instance attribute from the object for future connections
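Editor's note: the two files above rebuild the same dataset_shortname prefix (first letter of the genus, species, and optionally the first letter of the sex) in both gga_get_data.py and import_datasets_into_history(). A minimal sketch of how that logic could be shared is shown below; the helper name build_dataset_shortname is illustrative and does not exist in the repository.

def build_dataset_shortname(genus, species, sex=None):
    """Return the short dataset prefix used to name copied datasets, e.g. 'g_species' or 'g_species_f'."""
    shortname = genus[0].lower() + "_" + species.lower()
    # Only append the sex component when a non-empty sex is provided,
    # mirroring the guarded branch used in both scripts.
    if sex:
        shortname += "_" + sex[0].lower()
    return shortname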
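Editor's note: add_organism_ogs_genome_analyses() relies on fixed time.sleep(10) calls before downloading the JSON output of the Chado get_organisms/get_analyses tools. A more robust pattern is to poll the output dataset's state through bioblend before downloading it, as sketched below. This assumes a bioblend GalaxyInstance (self.instance in the class above) and a JSON tool output; the helper name, timeout and interval values are illustrative.

import json
import time

def wait_and_parse_json_output(instance, dataset_id, timeout=300, interval=5):
    """Poll a Galaxy dataset until it is ready, then download and parse it as JSON.

    instance: a bioblend.galaxy.GalaxyInstance
    dataset_id: id of the tool output dataset, e.g. get_organisms["outputs"][0]["id"]
    """
    waited = 0
    while waited < timeout:
        state = instance.datasets.show_dataset(dataset_id)["state"]
        if state == "ok":
            # Same download + parse steps as in the diff above
            return json.loads(instance.datasets.download_dataset(dataset_id))
        if state == "error":
            raise RuntimeError("Dataset {0} ended in error state".format(dataset_id))
        time.sleep(interval)
        waited += interval
    raise TimeoutError("Dataset {0} was not ready after {1}s".format(dataset_id, timeout))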
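Editor's note: the org1/org2 blocks in the __main__ section repeat the same dict-plus-loop validation of workflow parameters. The pattern could be expressed once along the following lines; the function name and the choice to raise instead of calling sys.exit() are illustrative assumptions, not code from the repository.

import logging

def check_required_parameters(parameters, organism_full_name):
    """Log and fail on any empty value in a dict of workflow parameters for one organism."""
    for param_name, param_value in parameters.items():
        if param_value is None or param_value == "":
            logging.critical("Empty parameter found for organism {0} (parameter: {1}, parameter value: {2})".format(
                organism_full_name, param_name, param_value))
            raise ValueError("Missing required parameter: {0}".format(param_name))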