diff --git a/gga_get_data.py b/gga_get_data.py
index 73821ff989209c6552431bc747707ff1f293b8b0..4494f9ab1b6924e9175ae7d05d2ea352a9ed35a7 100644
--- a/gga_get_data.py
+++ b/gga_get_data.py
@@ -66,7 +66,7 @@ class GetData(speciesData.SpeciesData):
             logging.critical("Cannot access " + self.species_dir)
             sys.exit(0)
 
-        organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.genome_version))
+        organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
         organism_genome_dir = os.path.abspath("./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version))
 
         datasets_to_get = {"genome_path": self.genome_path,
@@ -93,7 +93,7 @@ class GetData(speciesData.SpeciesData):
                 try:
                     shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, os.path.basename(v)))
                 except Exception as exc:
-                    logging.warning("Could not copy {0} - {1} - Exit Code {2})".format(k, v, exc))
+                    logging.warning("Could not copy {0} ({1}) - Exit Code: {2})".format(k, v, exc))
             elif k in annotation_datasets:
                 logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
                 try:
diff --git a/gga_load_data.py b/gga_load_data.py
index 1c20b9079ced3ee672d5a51b14d523b8bb93a39d..2d83b83186f55f71901145a362b7a0bac97ae078 100644
--- a/gga_load_data.py
+++ b/gga_load_data.py
@@ -279,7 +279,6 @@ class LoadData(speciesData.SpeciesData):
             email=self.config["galaxy_default_admin_email"],
             password=self.config["galaxy_default_admin_password"]
         )
-        print(self.instance)
         self.instance.histories.get_histories()
 
         logging.info("Connecting to the galaxy instance...")
@@ -370,15 +369,11 @@ if __name__ == "__main__":
         # Parse the config yaml file
         load_data_for_current_species.config = utilities.parse_config(args.config)
         # Set the instance url attribute -- Does not work with localhost on scratch (ALB)
-        for env_variable, value in load_data_for_current_species.config.items():
-            if env_variable == "hostname":
-                load_data_for_current_species.instance_url = "http://{0}:8888/sp/{1}_{2}/galaxy/".format(
-                    value, load_data_for_current_species.genus_lowercase, load_data_for_current_species.species)
-                break
-            else:
-                load_data_for_current_species.instance_url = "http://localhost:8888/sp/{0}_{1}/galaxy/".format(
-                    load_data_for_current_species.genus_lowercase,
-                    load_data_for_current_species.species)
+        print(load_data_for_current_species.config["http_port"])
+        load_data_for_current_species.instance_url = "http://scratchgmodv1:{0}/sp/{1}_{2}/galaxy/".format(
+            load_data_for_current_species.config["http_port"],
+            load_data_for_current_species.genus_lowercase,
+            load_data_for_current_species.species)
diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py
index fafd4131f3b7579092e12541a1577d6f9ef7e05b..f3fd8e726252261b1403907c7cacaa1a6e3c8242 100644
--- a/run_workflow_phaeoexplorer.py
+++ b/run_workflow_phaeoexplorer.py
@@ -186,16 +186,9 @@ class RunWorkflow(speciesData.SpeciesData):
 
         # Verify that the add_organism and add_analysis versions are correct in the toolshed
         add_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.3")
-        add_analysis_tool= self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3")
-
-        # print(add_organism_tool)
-        # print(add_analysis_tool)
-
+        add_analysis_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3")
self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3") get_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.3") get_analysis_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.3") - - # print(get_organism_tool) - # print(get_analysis_tool) # changeset for 2.3.3 has to be manually found because there is no way to get the wanted changeset of a non installed tool via bioblend # except for workflows (.ga) that already contain the changeset revisions inside the steps ids @@ -530,6 +523,16 @@ class RunWorkflow(speciesData.SpeciesData): return invocation_report + + + def get_datasets_ldda_ids(self): + """ + Get and return the ldda_ids (and names) for the datasets in the library + """ + + return 0 + + def import_datasets_into_history(self): """ Find datasets in a library, get their ID and import them into the current history if they are not already @@ -562,28 +565,8 @@ class RunWorkflow(speciesData.SpeciesData): genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None - # Check for existing datasets for current organism (should have been run separately for mutliple organisms instances) - for dataset_dict in history_datasets_li[0:5]: # Limit of datasets is 6 atm - # Datasets imports should be ordered correctly - if dataset_dict["name"].endswith("proteins.fa"): - proteins_datasets_hda_id = dataset_dict["id"] - logging.debug("Proteins dataset hda ID: %s" % proteins_datasets_hda_id) - elif dataset_dict["name"].endswith("transcripts-gff.fa"): - transcripts_dataset_hda_id = dataset_dict["id"] - logging.debug("Transcripts dataset hda ID: %s" % transcripts_dataset_hda_id) - elif dataset_dict["name"].endswith(".gff"): - gff_dataset_hda_id = dataset_dict["id"] - logging.debug("gff dataset hda ID: %s" % gff_dataset_hda_id) - elif "Interpro" in dataset_dict["name"]: - interproscan_dataset_hda_id = dataset_dict["id"] - logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id) - elif "diamond-blastp" in dataset_dict["name"]: - blast_diamond_dataset_hda_id = dataset_dict["id"] - logging.debug("Blast Diamond dataset hda ID: %s" % gff_dataset_hda_id) - else: - genome_dataset_hda_id = dataset_dict["id"] - logging.debug("Genome dataset hda id: %s" % genome_dataset_hda_id) - + # Store datasets that are in the history already in a dict + hda_ids = self.get_datasets_hda_ids() # Iterating over the folders to find datasets and map datasets to their IDs logging.debug("Datasets IDs: ") @@ -619,37 +602,37 @@ class RunWorkflow(speciesData.SpeciesData): self.datasets["blast_diamond_file"] = e["ldda_id"] logging.debug("Blastp diamond file:\t" + e["name"] + ": " + e["ldda_id"]) - logging.info("Uploading datasets into history %s" % self.history_id) - if genome_dataset_hda_id is None: + logging.debug("Uploading datasets into history %s" % self.history_id) + # Import each dataset into history if it is not imported + if hda_ids["genome_hda_id"] is None: self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"]) - if gff_dataset_hda_id is None: + if hda_ids["gff_hda_id"] is None: self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"]) 
-        if transcripts_dataset_hda_id is None:
+        if hda_ids["transcripts_hda_id"] is None:
             self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
-        if proteins_datasets_hda_id is None:
+        if hda_ids["proteins_hda_id"] is None:
             self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])
-        if interproscan_dataset_hda_id is None:
+        if hda_ids["interproscan_hda_id"] is None:
             try:
                 self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"])
             except Exception as exc:
                 logging.debug("Interproscan file not found in library (history: {0})\n{1}".format(self.history_id, exc))
-        if blast_diamond_dataset_hda_id is None:
+        if hda_ids["blast_diamond_hda_id"] is None:
             try:
                 self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"])
             except Exception as exc:
                 logging.debug("Blastp file not found in library (history: {0})\n{1}".format(self.history_id, exc))
 
-        _datasets = self.instance.datasets.get_datasets()
-        with open(os.path.join(self.main_dir, "datasets_ids.json"), "w") as datasets_ids_outfile:
-            datasets_ids_outfile.write(str(_datasets))
+        # _datasets = self.instance.datasets.get_datasets()
+        # with open(os.path.join(self.main_dir, "datasets_ids.json"), "w") as datasets_ids_outfile:
+        #     datasets_ids_outfile.write(str(_datasets))
 
         return {"history_id": self.history_id, "library_id": library_id, "datasets": self.datasets}
 
+
     def get_datasets_hda_ids(self):
         """
         Get the hda IDs of the datasets imported into an history
-        The most "recent" imports will be prioritized
-
         As some tools will not work using the input datasets ldda IDs we need to retrieve the datasets IDs imported
         into an history
@@ -658,6 +641,72 @@ class RunWorkflow(speciesData.SpeciesData):
         :return:
         """
 
+        # Create an object of the galaxy instance
+        gio = GalaxyInstance(url=self.instance_url,
+                             email=self.config["galaxy_default_admin_email"],
+                             password=self.config["galaxy_default_admin_password"])
+
+        prj_lib = gio.libraries.get_previews(name="Project Data")
+        library_id = prj_lib[0].id
+
+        instance_source_data_folders = self.instance.libraries.get_folders(library_id=str(library_id))
+
+        folders_ids = {}
+        current_folder_name = ""
+        # Loop over the folders in the library and map folders names to their IDs
+        for i in instance_source_data_folders:
+            for k, v in i.items():
+                if k == "name":
+                    folders_ids[v] = 0
+                    current_folder_name = v
+                if k == "id":
+                    folders_ids[current_folder_name] = v
+
+        history_datasets_li = self.instance.datasets.get_datasets()
+        genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None
+        interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None
+
+        # Iterating over the folders to find datasets and map datasets to their IDs
+        # TODO: Add a required matching of the version (genome/ogs)
+        logging.debug("Datasets IDs: ")
+        for k, v in folders_ids.items():
+            if k == "/genome":
+                sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
+                final_sub_folder_content = self.instance.folders.show_folder(folder_id=sub_folder_content["folder_contents"][0]["id"], contents=True)
+                for k2, v2 in final_sub_folder_content.items():
+                    for e in v2:
+                        if type(e) == dict:
+                            if e["name"].endswith(".fa"):
+                                self.datasets["genome_file"] = e["ldda_id"]
self.datasets_name["genome_file"] = e["name"] + logging.debug("Genome file:\t" + e["name"] + ": " + e["ldda_id"]) + if k == "/annotation": + sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) + final_sub_folder_content = self.instance.folders.show_folder(folder_id=sub_folder_content["folder_contents"][0]["id"], contents=True) + for k2, v2 in final_sub_folder_content.items(): + for e in v2: + if type(e) == dict: + if "transcripts" in e["name"]: + self.datasets["transcripts_file"] = e["ldda_id"] + self.datasets_name["transcripts_file"] = e["name"] + logging.debug("Transcripts file:\t" + e["name"] + ": " + e["ldda_id"]) + elif "proteins.fa" in e["name"]: + self.datasets["proteins_file"] = e["ldda_id"] + self.datasets_name["proteins_file"] = e["name"] + logging.debug("Proteins file:\t" + e["name"] + ": " + e["ldda_id"]) + elif "gff" in e["name"]: + self.datasets["gff_file"] = e["ldda_id"] + self.datasets_name["gff_file"] = e["name"] + logging.debug("GFF file:\t" + e["name"] + ": " + e["ldda_id"]) + elif "Interpro" in e["name"]: + self.datasets["interproscan_file"] = e["ldda_id"] + self.datasets_name["interproscan_file"] = e["name"] + logging.debug("Interproscan file:\t" + e["name"] + ": " + e["ldda_id"]) + elif "diamond-blastp" in e["name"]: + self.datasets["blast_diamond_file"] = e["ldda_id"] + self.datasets_name["blast_diamond_file"] = e["name"] + logging.debug("Blastp diamond file:\t" + e["name"] + ": " + e["ldda_id"]) + # List of all datasets in the instance (including outputs from jobs) # "limit" and "offset" options *may* be used to restrict search to specific datasets but since # there is no way to know which imported datasets are the correct ones depending on history content @@ -667,32 +716,29 @@ class RunWorkflow(speciesData.SpeciesData): genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None - genome_dataset_hda_id = history_datasets_li[3]["id"] - gff_dataset_hda_id = history_datasets_li[2]["id"] - transcripts_dataset_hda_id = history_datasets_li[1]["id"] - proteins_datasets_hda_id = history_datasets_li[0]["id"] - - for dataset_dict in history_datasets_li[0:5]: - # Datasets imports should be the last jobs in history if the function calls are in correct order - # If not, add the function call "get_datasets_hda_ids()" just after "import_datasets_into_history()" - if dataset_dict["name"].endswith("proteins.fa"): + # Match files imported in history names vs library datasets names to assign their respective hda_id + for dataset_dict in history_datasets_li: + if dataset_dict["name"] == self.datasets_name["genome_file"]: + genome_dataset_hda_id = dataset_dict["id"] + logging.debug("Genome dataset hda id: %s" % genome_dataset_hda_id) + elif dataset_dict["name"] == self.datasets_name["proteins_file"]: proteins_datasets_hda_id = dataset_dict["id"] logging.debug("Proteins dataset hda ID: %s" % proteins_datasets_hda_id) - elif dataset_dict["name"].endswith("transcripts-gff.fa"): + elif dataset_dict["name"] == self.datasets_name["transcripts_file"]: transcripts_dataset_hda_id = dataset_dict["id"] logging.debug("Transcripts dataset hda ID: %s" % transcripts_dataset_hda_id) - elif dataset_dict["name"].endswith(".gff"): + elif dataset_dict["name"] == self.datasets_name["gff_file"]: gff_dataset_hda_id = dataset_dict["id"] logging.debug("gff dataset hda ID: %s" % gff_dataset_hda_id) - elif "Interpro" in dataset_dict["name"]: - 
-                interproscan_dataset_hda_id = dataset_dict["id"]
-                logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id)
-            elif "diamond-blastp" in dataset_dict["name"]:
-                blast_diamond_dataset_hda_id = dataset_dict["id"]
-                logging.debug("Blast Diamond dataset hda ID: %s" % gff_dataset_hda_id)
-            else:
-                genome_dataset_hda_id = dataset_dict["id"]
-                logging.debug("Genome dataset hda id: %s" % genome_dataset_hda_id)
+
+            if "interproscan_file" in self.datasets_name.keys():
+                if dataset_dict["name"] == self.datasets_name["interproscan_file"]:
+                    interproscan_dataset_hda_id = dataset_dict["id"]
+                    logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id)
+            if "blast_diamond_file" in self.datasets_name.keys():
+                if dataset_dict["name"] == self.datasets_name["blast_diamond_file"]:
+                    blast_diamond_dataset_hda_id = dataset_dict["id"]
+                    logging.debug("Blast Diamond dataset hda ID: %s" % gff_dataset_hda_id)
 
         # Return a dict made of the hda ids
         return{"genome_hda_id": genome_dataset_hda_id, "transcripts_hda_id": transcripts_dataset_hda_id,
@@ -851,7 +897,7 @@ if __name__ == "__main__":
         # Parse the config yaml file
         run_workflow_for_current_organism.config = utilities.parse_config(args.config)
         # Set the instance url attribute
-        run_workflow_for_current_organism.instance_url = "http://localhost:{0}/sp/{1}_{2}/galaxy/".format(
+        run_workflow_for_current_organism.instance_url = "http://scratchgmodv1:{0}/sp/{1}_{2}/galaxy/".format(
             run_workflow_for_current_organism.config["http_port"],
             run_workflow_for_current_organism.genus_lowercase,
             run_workflow_for_current_organism.species)
diff --git a/speciesData.py b/speciesData.py
index 287c8ccf92a839743cb5ec71b0a7a23ff3f65acc..a52b0e504015366d0b5dfee129d13e556b1d4cda 100644
--- a/speciesData.py
+++ b/speciesData.py
@@ -11,7 +11,7 @@ from _datetime import datetime
 class SpeciesData:
     """
     This class contains attributes and functions to interact with the galaxy container of the GGA environment
-    Parent class of LoadData, DeploySpeciesStack and RunWorkflow
+    Parent class of LoadData, GetData, DeploySpeciesStack, GgaPreprocess and RunWorkflow
 
@@ -37,6 +37,7 @@ class SpeciesData:
         else:
             self.ogs_version = parameters_dictionary["data"]["ogs_version"]
 
+        # TODO: catch blocks if key is absent in input
         self.genome_path = parameters_dictionary["data"]["genome_path"]
         self.transcripts_path = parameters_dictionary["data"]["transcripts_path"]
         self.proteins_path = parameters_dictionary["data"]["proteins_path"]
@@ -66,6 +67,7 @@ class SpeciesData:
         self.ogs_analysis_id = None
         self.tool_panel = None
         self.datasets = dict()
+        self.datasets_name = dict()
         self.source_files = dict()
         self.workflow_name = None
         self.metadata = dict()
diff --git a/templates/gspecies_compose_template.yml.j2 b/templates/gspecies_compose_template.yml.j2
index 6bf0adcc41e19a1b29b5e1179f17ccfcae945e30..c943c27b190351e600d7b7bf9beb59995a08aa3f 100644
--- a/templates/gspecies_compose_template.yml.j2
+++ b/templates/gspecies_compose_template.yml.j2
@@ -50,7 +50,7 @@ services:
       TRIPAL_ENABLE_MODULES: "tripal_analysis_blast tripal_analysis_interpro tripal_analysis_go tripal_rest_api tripal_elasticsearch"
       SITE_NAME: "{{ Genus_species }}"
       ELASTICSEARCH_HOST: elasticsearch.{{ genus_species }}
-      ENABLE_JBROWSE: /jbrowse/?data=data/{{ genus_species_sex }}
+      ENABLE_JBROWSE: /jbrowse/?data=data/{{ genus_species_sex }} # WARNING --> the variable "sex" shouldn't be in the compose file (all strains/sex are supposed to be in the same jbrowse)
       ENABLE_APOLLO: 0
       ENABLE_BLAST: 1
       ENABLE_DOWNLOAD: 1
@@ -159,10 +159,11 @@ services:
         # - "traefik.http.routers.{{ genus_species }}-galaxy.entryPoints=webs"
         - "traefik.http.routers.{{ genus_species }}-galaxy.entryPoints=web" #lg
         # - "traefik.http.routers.{{ genus_species }}-galaxy.middlewares=sp-auth,sp-app-trailslash,sp-app-prefix"
-        - "traefik.http.routers.{{ genus_species }}-gga_load-galaxy.rule=(Host(`localhost`) && PathPrefix(`/sp/{{ genus_species }}/galaxy`))"
-        - "traefik.http.routers.{{ genus_species }}-gga_load-galaxy.entryPoints=web"
         - "traefik.http.routers.{{ genus_species }}-galaxy.middlewares=sp-app-trailslash,sp-app-prefix" #lg
         - "traefik.http.services.{{ genus_species }}-galaxy.loadbalancer.server.port=80"
+        - "traefik.http.routers.{{ genus_species }}-gga_load-galaxy.rule=(Host(`localhost`) && PathPrefix(`/sp/{{ genus_species }}/galaxy`))"
+        - "traefik.http.routers.{{ genus_species }}-gga_load-galaxy.entryPoints=web"
+
       restart_policy:
         condition: on-failure
         delay: 5s