From 5e33708d6955d46190f2a23970e86167b807cb9d Mon Sep 17 00:00:00 2001
From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr>
Date: Wed, 10 Mar 2021 17:48:24 +0100
Subject: [PATCH] Fix hda ID assignment in run_workflow_phaeoexplorer

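Dataset hda IDs are now resolved by matching dataset names against the
library dataset names stored in the new SpeciesData.datasets_name dict,
instead of relying on dataset positions in the history dataset list.
import_datasets_into_history() reuses get_datasets_hda_ids() to skip
datasets already present in the history. The Galaxy instance URLs in
gga_load_data.py and run_workflow_phaeoexplorer.py now point at the
scratchgmodv1 host using the http_port value from the config, and
gga_get_data.py builds the annotation directory path from ogs_version
instead of genome_version.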
---
 gga_get_data.py                            |   4 +-
 gga_load_data.py                           |  15 +-
 run_workflow_phaeoexplorer.py              | 172 +++++++++++++--------
 speciesData.py                             |   4 +-
 templates/gspecies_compose_template.yml.j2 |   7 +-
 5 files changed, 123 insertions(+), 79 deletions(-)

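Note (review annotation, not part of the commit): below is a minimal sketch of
the name-based hda ID matching that get_datasets_hda_ids() now performs instead
of positional lookups in the history dataset list. The dataset names and IDs are
made-up placeholders; only the matching logic mirrors the patched code.

import logging

# "history_datasets_li" stands in for the output of
# self.instance.datasets.get_datasets(), and "datasets_name" for the dict
# filled while walking the "Project Data" library folders.
history_datasets_li = [
    {"name": "genus_species_v1.0.fa", "id": "hda_0001"},
    {"name": "genus_species_OGS1.0_proteins.fa", "id": "hda_0002"},
    {"name": "genus_species_OGS1.0.gff", "id": "hda_0003"},
]
datasets_name = {
    "genome_file": "genus_species_v1.0.fa",
    "proteins_file": "genus_species_OGS1.0_proteins.fa",
    "gff_file": "genus_species_OGS1.0.gff",
}

hda_ids = {"genome_hda_id": None, "proteins_hda_id": None, "gff_hda_id": None}

# Assign an hda ID only when the history dataset name equals the
# corresponding library dataset name.
for dataset_dict in history_datasets_li:
    if dataset_dict["name"] == datasets_name["genome_file"]:
        hda_ids["genome_hda_id"] = dataset_dict["id"]
    elif dataset_dict["name"] == datasets_name["proteins_file"]:
        hda_ids["proteins_hda_id"] = dataset_dict["id"]
    elif dataset_dict["name"] == datasets_name["gff_file"]:
        hda_ids["gff_hda_id"] = dataset_dict["id"]

logging.debug("hda IDs: %s" % hda_ids)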
diff --git a/gga_get_data.py b/gga_get_data.py
index 73821ff..4494f9a 100644
--- a/gga_get_data.py
+++ b/gga_get_data.py
@@ -66,7 +66,7 @@ class GetData(speciesData.SpeciesData):
             logging.critical("Cannot access " + self.species_dir)
             sys.exit(0)
 
-        organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.genome_version))
+        organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
         organism_genome_dir = os.path.abspath("./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version))
 
         datasets_to_get = {"genome_path": self.genome_path,
@@ -93,7 +93,7 @@ class GetData(speciesData.SpeciesData):
                     try:
                         shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, os.path.basename(v)))
                     except Exception as exc:
-                        logging.warning("Could not copy {0} - {1} - Exit Code {2})".format(k, v, exc))
+                        logging.warning("Could not copy {0} ({1}) - Exit Code: {2})".format(k, v, exc))
                 elif k in annotation_datasets:
                     logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
                     try:
diff --git a/gga_load_data.py b/gga_load_data.py
index 1c20b90..2d83b83 100644
--- a/gga_load_data.py
+++ b/gga_load_data.py
@@ -279,7 +279,6 @@ class LoadData(speciesData.SpeciesData):
                                               email=self.config["galaxy_default_admin_email"],
                                               password=self.config["galaxy_default_admin_password"]
                                               )
-        print(self.instance)
         self.instance.histories.get_histories()
         logging.info("Connecting to the galaxy instance...")
 
@@ -370,15 +369,11 @@ if __name__ == "__main__":
         # Parse the config yaml file
         load_data_for_current_species.config = utilities.parse_config(args.config)
         # Set the instance url attribute -- Does not work with localhost on scratch (ALB)
-        for env_variable, value in load_data_for_current_species.config.items():
-            if env_variable == "hostname":
-                load_data_for_current_species.instance_url = "http://{0}:8888/sp/{1}_{2}/galaxy/".format(
-                    value, load_data_for_current_species.genus_lowercase, load_data_for_current_species.species)
-                break
-            else:
-                load_data_for_current_species.instance_url = "http://localhost:8888/sp/{0}_{1}/galaxy/".format(
-                    load_data_for_current_species.genus_lowercase,
-                    load_data_for_current_species.species)
+        logging.debug("http_port: %s" % load_data_for_current_species.config["http_port"])
+        load_data_for_current_species.instance_url = "http://scratchgmodv1:{0}/sp/{1}_{2}/galaxy/".format(
+                load_data_for_current_species.config["http_port"],
+                load_data_for_current_species.genus_lowercase,
+                load_data_for_current_species.species)
 
         
 
diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py
index fafd413..f3fd8e7 100644
--- a/run_workflow_phaeoexplorer.py
+++ b/run_workflow_phaeoexplorer.py
@@ -186,16 +186,9 @@ class RunWorkflow(speciesData.SpeciesData):
 
         # Verify that the add_organism and add_analysis versions are correct in the toolshed
         add_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.3")
-        add_analysis_tool= self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3")
-        
-        # print(add_organism_tool)
-        # print(add_analysis_tool)
-
+        add_analysis_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3")
         get_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.3")
         get_analysis_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.3")
-        
-        # print(get_organism_tool)
-        # print(get_analysis_tool)
 
         # changeset for 2.3.3 has to be manually found because there is no way to get the wanted changeset of a non installed tool via bioblend 
         # except for workflows (.ga) that already contain the changeset revisions inside the steps ids
@@ -530,6 +523,16 @@ class RunWorkflow(speciesData.SpeciesData):
 
         return invocation_report
 
+
+
+    def get_datasets_ldda_ids(self):
+        """
+        Get and return the ldda_ids (and names) for the datasets in the library (placeholder, not implemented yet)
+        """
+
+        return 0
+
+
     def import_datasets_into_history(self):
         """
         Find datasets in a library, get their ID and import them into the current history if they are not already
@@ -562,28 +565,8 @@ class RunWorkflow(speciesData.SpeciesData):
         genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None
         interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None
 
-        # Check for existing datasets for current organism (should have been run separately for mutliple organisms instances)
-        for dataset_dict in history_datasets_li[0:5]:  # Limit of datasets is 6 atm
-            # Datasets imports should be ordered correctly
-            if dataset_dict["name"].endswith("proteins.fa"):
-                proteins_datasets_hda_id = dataset_dict["id"]
-                logging.debug("Proteins dataset hda ID: %s" % proteins_datasets_hda_id)
-            elif dataset_dict["name"].endswith("transcripts-gff.fa"):
-                transcripts_dataset_hda_id = dataset_dict["id"]
-                logging.debug("Transcripts dataset hda ID: %s" % transcripts_dataset_hda_id)
-            elif dataset_dict["name"].endswith(".gff"):
-                gff_dataset_hda_id = dataset_dict["id"]
-                logging.debug("gff dataset hda ID: %s" % gff_dataset_hda_id)
-            elif "Interpro" in dataset_dict["name"]:
-                interproscan_dataset_hda_id = dataset_dict["id"]
-                logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id)
-            elif "diamond-blastp" in dataset_dict["name"]:
-                blast_diamond_dataset_hda_id = dataset_dict["id"]
-                logging.debug("Blast Diamond dataset hda ID: %s" % gff_dataset_hda_id)
-            else:
-                genome_dataset_hda_id = dataset_dict["id"]
-                logging.debug("Genome dataset hda id: %s" % genome_dataset_hda_id)
-
+        # Get the hda IDs of the datasets that are already in the history
+        hda_ids = self.get_datasets_hda_ids()
 
         # Iterating over the folders to find datasets and map datasets to their IDs
         logging.debug("Datasets IDs: ")
@@ -619,37 +602,37 @@ class RunWorkflow(speciesData.SpeciesData):
                                 self.datasets["blast_diamond_file"] = e["ldda_id"]
                                 logging.debug("Blastp diamond file:\t" + e["name"] + ": " + e["ldda_id"])
 
-        logging.info("Uploading datasets into history %s" % self.history_id)
-        if genome_dataset_hda_id is None:
+        logging.debug("Uploading datasets into history %s" % self.history_id)
+        # Import each dataset into the history if it has not been imported already
+        if hda_ids["genome_hda_id"] is None:
             self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"])
-        if gff_dataset_hda_id is None:
+        if hda_ids["gff_hda_id"] is None:
             self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"])
-        if transcripts_dataset_hda_id is None:
+        if hda_ids["transcripts_hda_id"] is None:
             self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
-        if proteins_datasets_hda_id is None:
+        if hda_ids["proteins_hda_id"] is None:
             self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])
-        if interproscan_dataset_hda_id is None:
+        if hda_ids["interproscan_hda_id"] is None:
             try:
                 self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"])
             except Exception as exc:
                 logging.debug("Interproscan file not found in library (history: {0})\n{1}".format(self.history_id, exc))
-        if blast_diamond_dataset_hda_id is None:
+        if hda_ids["blast_diamond_hda_id"] is None:
             try:
                 self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"])
             except Exception as exc:
                 logging.debug("Blastp file not found in library (history: {0})\n{1}".format(self.history_id, exc))
 
-        _datasets = self.instance.datasets.get_datasets()
-        with open(os.path.join(self.main_dir, "datasets_ids.json"), "w") as datasets_ids_outfile:
-            datasets_ids_outfile.write(str(_datasets))
+        # _datasets = self.instance.datasets.get_datasets()
+        # with open(os.path.join(self.main_dir, "datasets_ids.json"), "w") as datasets_ids_outfile:
+        #     datasets_ids_outfile.write(str(_datasets))
 
         return {"history_id": self.history_id, "library_id": library_id, "datasets": self.datasets}
 
+
     def get_datasets_hda_ids(self):
         """
         Get the hda IDs of the datasets imported into an history
-        The most "recent" imports will be prioritized
-
 
         As some tools will not work using the input datasets ldda IDs we need to retrieve the datasets IDs imported
         into an history
@@ -658,6 +641,72 @@ class RunWorkflow(speciesData.SpeciesData):
         :return:
         """
 
+        # Create a GalaxyInstance object to access the data libraries
+        gio = GalaxyInstance(url=self.instance_url,
+                             email=self.config["galaxy_default_admin_email"],
+                             password=self.config["galaxy_default_admin_password"])
+
+        prj_lib = gio.libraries.get_previews(name="Project Data")
+        library_id = prj_lib[0].id
+
+        instance_source_data_folders = self.instance.libraries.get_folders(library_id=str(library_id))
+
+        folders_ids = {}
+        current_folder_name = ""
+        # Loop over the folders in the library and map folder names to their IDs
+        for i in instance_source_data_folders:
+            for k, v in i.items():
+                if k == "name":
+                    folders_ids[v] = 0
+                    current_folder_name = v
+                if k == "id":
+                    folders_ids[current_folder_name] = v
+
+        history_datasets_li = self.instance.datasets.get_datasets()
+        genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None
+        interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None
+
+        # Iterating over the folders to find datasets and map datasets to their IDs
+        # TODO: also require a match on the genome/OGS version
+        logging.debug("Datasets IDs: ")
+        for k, v in folders_ids.items():
+            if k == "/genome":
+                sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
+                final_sub_folder_content = self.instance.folders.show_folder(folder_id=sub_folder_content["folder_contents"][0]["id"], contents=True)
+                for k2, v2 in final_sub_folder_content.items():
+                    for e in v2:
+                        if type(e) == dict:
+                            if e["name"].endswith(".fa"):
+                                self.datasets["genome_file"] = e["ldda_id"]
+                                self.datasets_name["genome_file"] = e["name"]
+                                logging.debug("Genome file:\t" + e["name"] + ": " + e["ldda_id"])
+            if k == "/annotation":
+                sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
+                final_sub_folder_content = self.instance.folders.show_folder(folder_id=sub_folder_content["folder_contents"][0]["id"], contents=True)
+                for k2, v2 in final_sub_folder_content.items():
+                    for e in v2:
+                        if type(e) == dict:
+                            if "transcripts" in e["name"]:
+                                self.datasets["transcripts_file"] = e["ldda_id"]
+                                self.datasets_name["transcripts_file"] = e["name"]
+                                logging.debug("Transcripts file:\t" + e["name"] + ": " + e["ldda_id"])
+                            elif "proteins.fa" in e["name"]:
+                                self.datasets["proteins_file"] = e["ldda_id"]
+                                self.datasets_name["proteins_file"] = e["name"]
+                                logging.debug("Proteins file:\t" + e["name"] + ": " + e["ldda_id"])
+                            elif "gff" in e["name"]:
+                                self.datasets["gff_file"] = e["ldda_id"]
+                                self.datasets_name["gff_file"] = e["name"]
+                                logging.debug("GFF file:\t" + e["name"] + ": " + e["ldda_id"])
+                            elif "Interpro" in e["name"]:
+                                self.datasets["interproscan_file"] = e["ldda_id"]
+                                self.datasets_name["interproscan_file"] = e["name"]
+                                logging.debug("Interproscan file:\t" + e["name"] + ": " + e["ldda_id"])
+                            elif "diamond-blastp" in e["name"]:
+                                self.datasets["blast_diamond_file"] = e["ldda_id"]
+                                self.datasets_name["blast_diamond_file"] = e["name"]
+                                logging.debug("Blastp diamond file:\t" + e["name"] + ": " + e["ldda_id"])
+
         # List of all datasets in the instance (including outputs from jobs)
         # "limit" and "offset" options *may* be used to restrict search to specific datasets but since
         # there is no way to know which imported datasets are the correct ones depending on history content
@@ -667,32 +716,29 @@ class RunWorkflow(speciesData.SpeciesData):
         genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None
         interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None
 
-        genome_dataset_hda_id = history_datasets_li[3]["id"]
-        gff_dataset_hda_id = history_datasets_li[2]["id"]
-        transcripts_dataset_hda_id = history_datasets_li[1]["id"]
-        proteins_datasets_hda_id = history_datasets_li[0]["id"]
-
-        for dataset_dict in history_datasets_li[0:5]:
-            # Datasets imports should be the last jobs in history if the function calls are in correct order
-            # If not, add the function call "get_datasets_hda_ids()" just after "import_datasets_into_history()"
-            if dataset_dict["name"].endswith("proteins.fa"):
+        # Match the names of the datasets imported into the history against the library dataset names to assign their hda_id
+        for dataset_dict in history_datasets_li:
+            if dataset_dict["name"] == self.datasets_name["genome_file"]:
+                genome_dataset_hda_id = dataset_dict["id"]
+                logging.debug("Genome dataset hda id: %s" % genome_dataset_hda_id)
+            elif dataset_dict["name"] == self.datasets_name["proteins_file"]:
                 proteins_datasets_hda_id = dataset_dict["id"]
                 logging.debug("Proteins dataset hda ID: %s" % proteins_datasets_hda_id)
-            elif dataset_dict["name"].endswith("transcripts-gff.fa"):
+            elif dataset_dict["name"] == self.datasets_name["transcripts_file"]:
                 transcripts_dataset_hda_id = dataset_dict["id"]
                 logging.debug("Transcripts dataset hda ID: %s" % transcripts_dataset_hda_id)
-            elif dataset_dict["name"].endswith(".gff"):
+            elif dataset_dict["name"] == self.datasets_name["gff_file"]:
                 gff_dataset_hda_id = dataset_dict["id"]
                 logging.debug("gff dataset hda ID: %s" % gff_dataset_hda_id)
-            elif "Interpro" in dataset_dict["name"]:
-                interproscan_dataset_hda_id = dataset_dict["id"]
-                logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id)
-            elif "diamond-blastp" in dataset_dict["name"]:
-                blast_diamond_dataset_hda_id = dataset_dict["id"]
-                logging.debug("Blast Diamond dataset hda ID: %s" % gff_dataset_hda_id)
-            else:
-                genome_dataset_hda_id = dataset_dict["id"]
-                logging.debug("Genome dataset hda id: %s" % genome_dataset_hda_id)
+
+            if "interproscan_file" in self.datasets_name.keys():
+                if dataset_dict["name"] == self.datasets_name["interproscan_file"]:
+                    interproscan_dataset_hda_id = dataset_dict["id"]
+                    logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id)
+            if "blast_diamond_file" in self.datasets_name.keys():
+                if dataset_dict["name"] == self.datasets_name["blast_diamond_file"]:
+                    blast_diamond_dataset_hda_id = dataset_dict["id"]
+                    logging.debug("Blast Diamond dataset hda ID: %s" % gff_dataset_hda_id)
 
         # Return a dict made of the hda ids
         return{"genome_hda_id": genome_dataset_hda_id, "transcripts_hda_id": transcripts_dataset_hda_id,
@@ -851,7 +897,7 @@ if __name__ == "__main__":
             # Parse the config yaml file
             run_workflow_for_current_organism.config = utilities.parse_config(args.config)
             # Set the instance url attribute
-            run_workflow_for_current_organism.instance_url = "http://localhost:{0}/sp/{1}_{2}/galaxy/".format(
+            run_workflow_for_current_organism.instance_url = "http://scratchgmodv1:{0}/sp/{1}_{2}/galaxy/".format(
                 run_workflow_for_current_organism.config["http_port"], 
                 run_workflow_for_current_organism.genus_lowercase,
                 run_workflow_for_current_organism.species)
diff --git a/speciesData.py b/speciesData.py
index 287c8cc..a52b0e5 100644
--- a/speciesData.py
+++ b/speciesData.py
@@ -11,7 +11,7 @@ from _datetime import datetime
 class SpeciesData:
     """
     This class contains attributes and functions to interact with the galaxy container of the GGA environment
-    Parent class of LoadData, DeploySpeciesStack and RunWorkflow
+    Parent class of LoadData, GetData, DeploySpeciesStack, GgaPreprocess and RunWorkflow
 
     """
 
@@ -37,6 +37,7 @@ class SpeciesData:
         else:
             self.ogs_version = parameters_dictionary["data"]["ogs_version"]
 
+        # TODO: add try/except blocks in case a key is missing from the input
         self.genome_path = parameters_dictionary["data"]["genome_path"]
         self.transcripts_path = parameters_dictionary["data"]["transcripts_path"]
         self.proteins_path = parameters_dictionary["data"]["proteins_path"]
@@ -66,6 +67,7 @@ class SpeciesData:
         self.ogs_analysis_id = None
         self.tool_panel = None
         self.datasets = dict()
+        self.datasets_name = dict()
         self.source_files = dict()
         self.workflow_name = None
         self.metadata = dict()
diff --git a/templates/gspecies_compose_template.yml.j2 b/templates/gspecies_compose_template.yml.j2
index 6bf0adc..c943c27 100644
--- a/templates/gspecies_compose_template.yml.j2
+++ b/templates/gspecies_compose_template.yml.j2
@@ -50,7 +50,7 @@ services:
             TRIPAL_ENABLE_MODULES: "tripal_analysis_blast tripal_analysis_interpro tripal_analysis_go tripal_rest_api tripal_elasticsearch"
             SITE_NAME: "{{ Genus_species }}"
             ELASTICSEARCH_HOST: elasticsearch.{{ genus_species }}
-            ENABLE_JBROWSE: /jbrowse/?data=data/{{ genus_species_sex }}
+            ENABLE_JBROWSE: /jbrowse/?data=data/{{ genus_species_sex }} # WARNING: the "sex" variable should not appear in the compose file (all strains/sexes are supposed to share the same JBrowse)
             ENABLE_APOLLO: 0
             ENABLE_BLAST: 1
             ENABLE_DOWNLOAD: 1
@@ -159,10 +159,11 @@ services:
 #            - "traefik.http.routers.{{ genus_species }}-galaxy.entryPoints=webs"
             - "traefik.http.routers.{{ genus_species }}-galaxy.entryPoints=web" #lg
 #            - "traefik.http.routers.{{ genus_species }}-galaxy.middlewares=sp-auth,sp-app-trailslash,sp-app-prefix"
-            - "traefik.http.routers.{{ genus_species }}-gga_load-galaxy.rule=(Host(`localhost`) && PathPrefix(`/sp/{{ genus_species }}/galaxy`))"
-            - "traefik.http.routers.{{ genus_species }}-gga_load-galaxy.entryPoints=web"
             - "traefik.http.routers.{{ genus_species }}-galaxy.middlewares=sp-app-trailslash,sp-app-prefix" #lg
             - "traefik.http.services.{{ genus_species }}-galaxy.loadbalancer.server.port=80"
+            - "traefik.http.routers.{{ genus_species }}-gga_load-galaxy.rule=(Host(`localhost`) && PathPrefix(`/sp/{{ genus_species }}/galaxy`))"
+            - "traefik.http.routers.{{ genus_species }}-gga_load-galaxy.entryPoints=web"
+
           restart_policy:
             condition: on-failure
             delay: 5s
-- 
GitLab