diff --git a/misc/banner.png b/misc/banner.png
index 06f7b085b4ede50ea7d91dfde8e30a2a88ff821a..2be0af67b7bd67b76a4f7d13eee392d4bd6f6bf7 100644
Binary files a/misc/banner.png and b/misc/banner.png differ
diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py
index f3fd8e726252261b1403907c7cacaa1a6e3c8242..2857feb8b19ee81bbc17f13cc9e3adfb0a2fe307 100644
--- a/run_workflow_phaeoexplorer.py
+++ b/run_workflow_phaeoexplorer.py
@@ -447,7 +447,7 @@ class RunWorkflow(speciesData.SpeciesData):
         try:
             blast_diamond_analysis_output = json.loads(blast_diamond_analysis_json_output)[0]
             self.blast_diamond_analysis_id = str(blast_diamond_analysis_output["analysis_id"])
-        except IndexErro as exc:
+        except IndexError as exc:
             logging.critical("No matching InterproScan analysis exists in the instance's chado database")
             sys.exit(exc)
 
@@ -551,83 +551,89 @@ class RunWorkflow(speciesData.SpeciesData):
         instance_source_data_folders = self.instance.libraries.get_folders(library_id=str(library_id))
 
         folders_ids = {}
-        current_folder_name = ""
+        folder_name = ""
+        # Loop over the folders in the library and map folders names to their IDs
         for i in instance_source_data_folders:
-            for k, v in i.items():
-                if k == "name":
-                    folders_ids[v] = 0
-                    current_folder_name = v
-                if k == "id":
-                    folders_ids[current_folder_name] = v
+            folders_ids[i["name"]] = i["id"]
 
-        history_datasets_li = self.instance.datasets.get_datasets()
-        genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None
-        interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None
-
-        # Store datasets that are in the history already in a dict
-        hda_ids = self.get_datasets_hda_ids()
 
         # Iterating over the folders to find datasets and map datasets to their IDs
         logging.debug("Datasets IDs: ")
         for k, v in folders_ids.items():
-            if k == "/genome":
+            # print(self.full_name)
+            # print(self.species_folder_name)
+            # print(k)
+            # print(v)
+            if k == "/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version):
                 sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
-                final_sub_folder_content = self.instance.folders.show_folder(folder_id=sub_folder_content["folder_contents"][0]["id"], contents=True)
-                for k2, v2 in final_sub_folder_content.items():
+                for k2, v2 in sub_folder_content.items():
                     for e in v2:
                         if type(e) == dict:
+                            # print(e)
                             if e["name"].endswith(".fa"):
                                 self.datasets["genome_file"] = e["ldda_id"]
-                                logging.debug("Genome file:\t" + e["name"] + ": " + e["ldda_id"])
-            if k == "/annotation":
+                                self.datasets_name["genome_file"] = e["name"]
+                                logging.debug("\tGenome file:\t" + e["name"] + ": " + e["ldda_id"])
+                                # print("Genome file:\t" + e["name"] + ": " + e["ldda_id"])
+
+            # print(self.species_folder_name)
+            if k == "/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version):
                 sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
-                final_sub_folder_content = self.instance.folders.show_folder(folder_id=sub_folder_content["folder_contents"][0]["id"], contents=True)
-                for k2, v2 in final_sub_folder_content.items():
+                for k2, v2 in sub_folder_content.items():
                     for e in v2:
                         if type(e) == dict:
                             if "transcripts" in e["name"]:
                                 self.datasets["transcripts_file"] = e["ldda_id"]
-                                logging.debug("Transcripts file:\t" + e["name"] + ": " + e["ldda_id"])
+                                self.datasets_name["transcripts_file"] = e["name"]
+                                logging.debug("\tTranscripts file:\t" + e["name"] + ": " + e["ldda_id"])
                             elif "proteins.fa" in e["name"]:
self.datasets["proteins_file"] = e["ldda_id"] - logging.debug("Proteins file:\t" + e["name"] + ": " + e["ldda_id"]) + self.datasets_name["proteins_file"] = e["name"] + logging.debug("\tProteins file:\t" + e["name"] + ": " + e["ldda_id"]) elif "gff" in e["name"]: self.datasets["gff_file"] = e["ldda_id"] - logging.debug("GFF file:\t" + e["name"] + ": " + e["ldda_id"]) + self.datasets_name["gff_file"] = e["name"] + logging.debug("\tGFF file:\t" + e["name"] + ": " + e["ldda_id"]) elif "Interpro" in e["name"]: self.datasets["interproscan_file"] = e["ldda_id"] - logging.debug("Interproscan file:\t" + e["name"] + ": " + e["ldda_id"]) + self.datasets_name["interproscan_file"] = e["name"] + logging.debug("\tInterproscan file:\t" + e["name"] + ": " + e["ldda_id"]) elif "diamond-blastp" in e["name"]: self.datasets["blast_diamond_file"] = e["ldda_id"] - logging.debug("Blastp diamond file:\t" + e["name"] + ": " + e["ldda_id"]) + self.datasets_name["blast_diamond_file"] = e["name"] + logging.debug("\tBlastp diamond file:\t" + e["name"] + ": " + e["ldda_id"]) logging.debug("Uploading datasets into history %s" % self.history_id) # Import each dataset into history if it is not imported - if hda_ids["genome_hda_id"] is None: + + first_hda_ids = self.get_datasets_hda_ids() + + if first_hda_ids["genome_hda_id"] is None: self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"]) - if hda_ids["gff_hda_id"] is None: + if first_hda_ids["gff_hda_id"] is None: self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"]) - if hda_ids["transcripts_hda_id"] is None: + if first_hda_ids["transcripts_hda_id"] is None: self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"]) - if hda_ids["proteins_hda_id"] is None: + if first_hda_ids["proteins_hda_id"] is None: self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"]) - if hda_ids["interproscan_hda_id"] is None: + if first_hda_ids["interproscan_hda_id"] is None: try: self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"]) except Exception as exc: - logging.debug("Interproscan file not found in library (history: {0})\n{1}".format(self.history_id, exc)) - if hda_ids["blast_diamond_hda_id"] is None: + logging.debug("Interproscan file not found in library (history: {0})".format(self.history_id)) + if first_hda_ids["blast_diamond_hda_id"] is None: try: self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"]) except Exception as exc: - logging.debug("Blastp file not found in library (history: {0})\n{1}".format(self.history_id, exc)) + logging.debug("Blastp file not found in library (history: {0})".format(self.history_id)) # _datasets = self.instance.datasets.get_datasets() # with open(os.path.join(self.main_dir, "datasets_ids.json"), "w") as datasets_ids_outfile: # datasets_ids_outfile.write(str(_datasets)) - return {"history_id": self.history_id, "library_id": library_id, "datasets": self.datasets} + # Return a dict made of the hda ids + return self.get_datasets_hda_ids() def get_datasets_hda_ids(self): @@ -641,109 +647,46 @@ class RunWorkflow(speciesData.SpeciesData): :return: """ - # Create an object of the galaxy instance - gio = 
-                             email=self.config["galaxy_default_admin_email"],
-                             password=self.config["galaxy_default_admin_password"])
-
-        prj_lib = gio.libraries.get_previews(name="Project Data")
-        library_id = prj_lib[0].id
-
-        instance_source_data_folders = self.instance.libraries.get_folders(library_id=str(library_id))
-
-        folders_ids = {}
-        current_folder_name = ""
-        # Loop over the folders in the library and map folders names to their IDs
-        for i in instance_source_data_folders:
-            for k, v in i.items():
-                if k == "name":
-                    folders_ids[v] = 0
-                    current_folder_name = v
-                if k == "id":
-                    folders_ids[current_folder_name] = v
-
-        history_datasets_li = self.instance.datasets.get_datasets()
-        genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None
-        interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None
-
-        # Iterating over the folders to find datasets and map datasets to their IDs
-        # TODO: Add a required matching of the version (genome/ogs)
-        logging.debug("Datasets IDs: ")
-        for k, v in folders_ids.items():
-            if k == "/genome":
-                sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
-                final_sub_folder_content = self.instance.folders.show_folder(folder_id=sub_folder_content["folder_contents"][0]["id"], contents=True)
-                for k2, v2 in final_sub_folder_content.items():
-                    for e in v2:
-                        if type(e) == dict:
-                            if e["name"].endswith(".fa"):
-                                self.datasets["genome_file"] = e["ldda_id"]
-                                self.datasets_name["genome_file"] = e["name"]
-                                logging.debug("Genome file:\t" + e["name"] + ": " + e["ldda_id"])
-            if k == "/annotation":
-                sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
-                final_sub_folder_content = self.instance.folders.show_folder(folder_id=sub_folder_content["folder_contents"][0]["id"], contents=True)
-                for k2, v2 in final_sub_folder_content.items():
-                    for e in v2:
-                        if type(e) == dict:
-                            if "transcripts" in e["name"]:
-                                self.datasets["transcripts_file"] = e["ldda_id"]
-                                self.datasets_name["transcripts_file"] = e["name"]
-                                logging.debug("Transcripts file:\t" + e["name"] + ": " + e["ldda_id"])
-                            elif "proteins.fa" in e["name"]:
-                                self.datasets["proteins_file"] = e["ldda_id"]
-                                self.datasets_name["proteins_file"] = e["name"]
-                                logging.debug("Proteins file:\t" + e["name"] + ": " + e["ldda_id"])
-                            elif "gff" in e["name"]:
-                                self.datasets["gff_file"] = e["ldda_id"]
-                                self.datasets_name["gff_file"] = e["name"]
-                                logging.debug("GFF file:\t" + e["name"] + ": " + e["ldda_id"])
-                            elif "Interpro" in e["name"]:
-                                self.datasets["interproscan_file"] = e["ldda_id"]
-                                self.datasets_name["interproscan_file"] = e["name"]
-                                logging.debug("Interproscan file:\t" + e["name"] + ": " + e["ldda_id"])
-                            elif "diamond-blastp" in e["name"]:
-                                self.datasets["blast_diamond_file"] = e["ldda_id"]
-                                self.datasets_name["blast_diamond_file"] = e["name"]
-                                logging.debug("Blastp diamond file:\t" + e["name"] + ": " + e["ldda_id"])
-
         # List of all datasets in the instance (including outputs from jobs)
         # "limit" and "offset" options *may* be used to restrict search to specific datasets but since
         # there is no way to know which imported datasets are the correct ones depending on history content
         # it's not currently used
         history_datasets_li = self.instance.datasets.get_datasets()
+        print(history_datasets_li)
 
         genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None
        interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None
 
-        # Match files imported in history names vs library datasets names to assign their respective hda_id 
+        # Match files imported in history names vs library datasets names to assign their respective hda_id
         for dataset_dict in history_datasets_li:
-            if dataset_dict["name"] == self.datasets_name["genome_file"]:
-                genome_dataset_hda_id = dataset_dict["id"]
-                logging.debug("Genome dataset hda id: %s" % genome_dataset_hda_id)
-            elif dataset_dict["name"] == self.datasets_name["proteins_file"]:
-                proteins_datasets_hda_id = dataset_dict["id"]
-                logging.debug("Proteins dataset hda ID: %s" % proteins_datasets_hda_id)
-            elif dataset_dict["name"] == self.datasets_name["transcripts_file"]:
-                transcripts_dataset_hda_id = dataset_dict["id"]
-                logging.debug("Transcripts dataset hda ID: %s" % transcripts_dataset_hda_id)
-            elif dataset_dict["name"] == self.datasets_name["gff_file"]:
-                gff_dataset_hda_id = dataset_dict["id"]
-                logging.debug("gff dataset hda ID: %s" % gff_dataset_hda_id)
-
-            if "interproscan_file" in self.datasets_name.keys():
-                if dataset_dict["name"] == self.datasets_name["interproscan_file"]:
-                    interproscan_dataset_hda_id = dataset_dict["id"]
-                    logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id)
-            if "blast_diamond_file" in self.datasets_name.keys():
-                if dataset_dict["name"] == self.datasets_name["blast_diamond_file"]:
-                    blast_diamond_dataset_hda_id = dataset_dict["id"]
-                    logging.debug("Blast Diamond dataset hda ID: %s" % gff_dataset_hda_id)
+            if dataset_dict["history_id"] == self.history_id:
+                if dataset_dict["name"] == self.datasets_name["genome_file"]:
+                    genome_dataset_hda_id = dataset_dict["id"]
+                    logging.debug("Genome dataset hda id: %s" % genome_dataset_hda_id)
+                elif dataset_dict["name"] == self.datasets_name["proteins_file"]:
+                    proteins_datasets_hda_id = dataset_dict["id"]
+                    logging.debug("Proteins dataset hda ID: %s" % proteins_datasets_hda_id)
+                elif dataset_dict["name"] == self.datasets_name["transcripts_file"]:
+                    transcripts_dataset_hda_id = dataset_dict["id"]
+                    logging.debug("Transcripts dataset hda ID: %s" % transcripts_dataset_hda_id)
+                elif dataset_dict["name"] == self.datasets_name["gff_file"]:
+                    gff_dataset_hda_id = dataset_dict["id"]
+                    logging.debug("GFF dataset hda ID: %s" % gff_dataset_hda_id)
+
+                if "interproscan_file" in self.datasets_name.keys():
+                    if dataset_dict["name"] == self.datasets_name["interproscan_file"]:
+                        interproscan_dataset_hda_id = dataset_dict["id"]
+                        logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id)
+                if "blast_diamond_file" in self.datasets_name.keys():
+                    if dataset_dict["name"] == self.datasets_name["blast_diamond_file"]:
+                        blast_diamond_dataset_hda_id = dataset_dict["id"]
+                        logging.debug("Blast Diamond dataset hda ID: %s" % gff_dataset_hda_id)
 
         # Return a dict made of the hda ids
-        return{"genome_hda_id": genome_dataset_hda_id, "transcripts_hda_id": transcripts_dataset_hda_id,
-              "proteins_hda_id": proteins_datasets_hda_id, "gff_hda_id": gff_dataset_hda_id,
-              "interproscan_hda_id": interproscan_dataset_hda_id, "blast_diamond_hda_id": blast_diamond_dataset_hda_id}
+        return {"genome_hda_id": genome_dataset_hda_id, "transcripts_hda_id": transcripts_dataset_hda_id,
+                "proteins_hda_id": proteins_datasets_hda_id, "gff_hda_id": gff_dataset_hda_id,
+                "interproscan_hda_id": interproscan_dataset_hda_id,
+                "blast_diamond_hda_id": blast_diamond_dataset_hda_id}
 
     def get_organism_id(self):
         """
@@ -753,9 +696,9 @@ class RunWorkflow(speciesData.SpeciesData):
 
         :return:
         """
+        tool_version = "2.3.3"
"2.3.3" time.sleep(3) - # Get the ID for the current organism in chado org = self.instance.tools.run_tool( tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.3", @@ -925,8 +868,7 @@ if __name__ == "__main__": run_workflow_for_current_organism.get_instance_attributes() # Import datasets into history and retrieve their hda IDs - run_workflow_for_current_organism.import_datasets_into_history() - hda_ids = run_workflow_for_current_organism.get_datasets_hda_ids() + hda_ids = run_workflow_for_current_organism.import_datasets_into_history() # DEBUG # run_workflow_for_current_organism.get_invocation_report(workflow_name="Chado load Tripal synchronize") @@ -976,6 +918,8 @@ if __name__ == "__main__": # run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "hda", # "id": hda_ids["gff_hda_id"]} + # Ensures galaxy has time to retrieve + time.sleep(60) # Run the Chado load Tripal sync workflow with the parameters set above run_workflow_for_current_organism.run_workflow(workflow_path=workflow, workflow_parameters=workflow_parameters, diff --git a/speciesData.py b/speciesData.py index a52b0e504015366d0b5dfee129d13e556b1d4cda..dc6d2a46ec8c00bf9d72b38af43fd841d374203a 100644 --- a/speciesData.py +++ b/speciesData.py @@ -51,6 +51,10 @@ class SpeciesData: self.genus_uppercase = self.genus[0].upper() + self.genus[1:] self.chado_species_name = "{0} {1}".format(self.species, self.sex) self.full_name = ' '.join(utilities.filter_empty_not_empty_items([self.genus_uppercase, self.species, self.strain, self.sex])["not_empty"]) + self.full_name = self.full_name.replace("__", "_").replace("_ ", "_").replace(" _", "_") + if self.full_name.endswith("_") or self.full_name.endswith(" "): + self.full_name = self.full_name[0:-2] + self.full_name_lowercase = self.full_name.lower() self.abbreviation = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase[0], self.species, self.strain, self.sex])["not_empty"]) self.genus_species = self.genus_lowercase + "_" + self.species diff --git a/templates/gspecies_compose_template.yml.j2 b/templates/gspecies_compose_template.yml.j2 index c943c27b190351e600d7b7bf9beb59995a08aa3f..d6ca429d4db6da271b512e79824f860b1adc3c2e 100644 --- a/templates/gspecies_compose_template.yml.j2 +++ b/templates/gspecies_compose_template.yml.j2 @@ -50,7 +50,9 @@ services: TRIPAL_ENABLE_MODULES: "tripal_analysis_blast tripal_analysis_interpro tripal_analysis_go tripal_rest_api tripal_elasticsearch" SITE_NAME: "{{ Genus_species }}" ELASTICSEARCH_HOST: elasticsearch.{{ genus_species }} - ENABLE_JBROWSE: /jbrowse/?data=data/{{ genus_species_sex }} # WARNING --> the variable "sex" shouldn't be in the compose file (all strains/sex are supposed to be in the same jbrowse) + ENABLE_JBROWSE: /jbrowse/?data=data/{{ genus_species_sex }} + # This ENABLE_JBROWSE variable should point to the "best assembly" by default --> tag it in the input file and use it to define this variable correctly (also called + # unique id in the jbrowse tool parameters == both have to be identical) ENABLE_APOLLO: 0 ENABLE_BLAST: 1 ENABLE_DOWNLOAD: 1