From 8d36dd970b44d4310801be8b2e9a557bc8967500 Mon Sep 17 00:00:00 2001
From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr>
Date: Fri, 19 Mar 2021 12:54:00 +0100
Subject: [PATCH] src_data dir tree created in gga_get_data

---
 gga_get_data.py               | 60 +++++++++++++++++++++++++++++++++++
 gga_init.py                   | 34 ++------------------
 run_workflow_phaeoexplorer.py | 28 ++++++++++------
 speciesData.py                |  1 -
 utilities.py                  |  5 +++
 5 files changed, 85 insertions(+), 43 deletions(-)

diff --git a/gga_get_data.py b/gga_get_data.py
index 4494f9a..3b92591 100644
--- a/gga_get_data.py
+++ b/gga_get_data.py
@@ -53,6 +53,45 @@ class GetData(speciesData.SpeciesData):
         return 1
 
 
+
+    def make_directory_tree(self):
+        """
+        Generate the src_data directory tree for an organism
+
+        :return:
+        """
+
+        os.chdir(self.main_dir)
+
+        try:
+            os.chdir(self.species_dir)
+        except OSError as exc:
+            logging.critical("Cannot access %s" % self.genus_species)
+            sys.exit(exc)
+
+        # Creation (or updating) of the src_data directory tree
+        try:
+            os.mkdir("./src_data")
+        except FileExistsError:
+            logging.debug("'src_data' directory already exist for %s" % self.full_name)
+        except PermissionError as exc:
+            logging.critical("Insufficient permission to create src_data directory tree")
+            sys.exit(exc)
+
+        # List of all the directories to create in src_data
+        src_data_dirs_li = ["./src_data", "./src_data/annotation", "./src_data/genome", "./src_data/tracks",
+                            "./src_data/annotation/%s" % self.species_folder_name,
+                            "./src_data/genome/%s" % self.species_folder_name,
+                            "./src_data/annotation/{0}/OGS{1}/".format(self.species_folder_name, self.ogs_version),
+                            "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version)]
+        make_dirs(dir_paths_li=src_data_dirs_li)
+
+        # Return to main directory
+        os.chdir(self.main_dir)
+
+        logging.info("src_data directory tree generated for %s" % self.full_name)
+
+
     def get_source_data_files_from_path(self):
         """
         Find source data files and copy them into the src_data dir tree
@@ -106,6 +145,27 @@ class GetData(speciesData.SpeciesData):
         os.chdir(self.main_dir)
 
 
+def make_dirs(dir_paths_li):
+    """
+    Create the directories from a list of paths, ignoring the ones that already exist
+
+    :param dir_paths_li: list of directory paths to create
+    :return: list of the created (or already existing) directory paths
+    """
+    created_dir_paths_li = []
+
+    for dir_path in dir_paths_li:
+        try:
+            os.mkdir(dir_path)
+        except FileExistsError:
+            logging.debug("%s directory already exists" % dir_path)
+        except PermissionError as exc:
+            logging.critical("Insufficient permission to create %s" % dir_path)
+            sys.exit(exc)
+        created_dir_paths_li.append(dir_path)
+
+    return created_dir_paths_li
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction "
                                                  "with galaxy instances for GGA"
diff --git a/gga_init.py b/gga_init.py
index 8613d0e..30ec55d 100644
--- a/gga_init.py
+++ b/gga_init.py
@@ -85,23 +85,6 @@ class DeploySpeciesStack(speciesData.SpeciesData):
             logging.critical("Cannot edit NginX conf file")
             sys.exit(exc)
 
-        # Creation (or updating) of the src_data directory tree
-        try:
-            os.mkdir("./src_data")
-        except FileExistsError:
-            logging.debug("'src_data' directory already exist for %s" % self.full_name)
-        except PermissionError as exc:
-            logging.critical("Insufficient permission to create src_data directory tree")
-            sys.exit(exc)
-
-        # List of all the directories to create in src_data
-        src_data_dirs_li = ["./src_data", "./src_data/annotation", "./src_data/genome", "./src_data/tracks",
-                            "./src_data/annotation/%s" % self.species_folder_name,
-                            "./src_data/genome/%s" % self.species_folder_name,
-                            "./src_data/annotation/{0}/OGS{1}/".format(self.species_folder_name, self.ogs_version),
-                            "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version)]
-        make_dirs(dir_paths_li=src_data_dirs_li)
-
         # Return to main directory
         os.chdir(self.main_dir)
 
@@ -161,6 +144,7 @@ class DeploySpeciesStack(speciesData.SpeciesData):
 
     def make_orthology_compose_files(self):
         """
+        Create/update orthology compose files
 
         :return:
         """
@@ -324,6 +308,7 @@ def deploy_stacks(input_list, main_dir, deploy_traefik):
     os.chdir(main_dir)
 
     # Get species for which to deploy the stacks
+    # Use the get_unique_species_list function from utilities to deploy one stack per "species" (i.e. genus_species)
     to_deploy_species_li = utilities.get_unique_species_list(sp_dict_list=input_list)
 
     if deploy_traefik:
@@ -354,21 +339,6 @@ def deploy_stacks(input_list, main_dir, deploy_traefik):
                                             stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=".")
     os.chdir(main_dir)
 
-    # # Using deploy.sh script (obsolete)
-    # Launch and update docker stacks
-    # noinspection PyArgumentList
-    # deploy_stacks_popen = subprocess.Popen(["sh", self.script_dir + "/deploy.sh", self.genus_species,
-    #                                         self.main_dir + "/traefik"],
-    #                                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-    #                                        universal_newlines=True)
-    #
-    # for stdout_line in iter(deploy_stacks_popen.stdout.readline, ""):
-    #     if "daemon" in stdout_line:  # Ignore swarm init error output
-    #         pass
-    #     else:
-    #         logging.info("\t%s" % stdout_line.strip())
-    # deploy_stacks_popen.stdout.close()
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction "
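
Note on deploy_stacks() above: get_unique_species_list() is what guarantees one stack per genus_species, but its body is not part of this patch. A plausible shape, assuming each input dictionary exposes "genus" and "species" keys (the key names are an assumption, not taken from this patch):

    # Hypothetical sketch of the deduplication deploy_stacks() relies on.
    def get_unique_species_list(sp_dict_list):
        unique_species = {}  # dicts preserve insertion order in Python >= 3.7
        for sp_dict in sp_dict_list:
            genus_species = "{0}_{1}".format(sp_dict["genus"].lower(), sp_dict["species"].lower())
            unique_species.setdefault(genus_species, sp_dict)
        return list(unique_species.keys())
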
diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py
index 2857feb..4f73193 100644
--- a/run_workflow_phaeoexplorer.py
+++ b/run_workflow_phaeoexplorer.py
@@ -252,6 +252,18 @@ class RunWorkflow(speciesData.SpeciesData):
         logging.info("Individual tools versions and changesets validated")
 
 
+
+    def tripal_synchronize_organism_analyses(self):
+        """
+        Synchronize the organism and its analyses into Tripal (WIP, not called in the main routine yet)
+        """
+        # TODO: organism_id is hardcoded, it should be retrieved from the Chado database instead
+        org_sync = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/3.2.1.0",
+                                                history_id=self.history_id,
+                                                tool_inputs={"organism_id": "2"})
+        return org_sync["outputs"]
+
+
     def add_organism_ogs_genome_analyses(self):
         """
         Add OGS and genome vX analyses to Chado database
@@ -477,6 +489,8 @@ class RunWorkflow(speciesData.SpeciesData):
             # as runtime values, using runtime parameters makes the tool throw an internal critical error ("replace not found" error)
             if workflow_name == "Jbrowse":
                 workflow_dict["steps"]["2"]["tool_state"] = workflow_dict["steps"]["2"]["tool_state"].replace("__MENU_URL__", "https://{host}:{port}/sp/{genus_sp}/feature/{Genus}/{species}/{id}".format(host=self.config["host"], port=self.config["https_port"], genus_sp=self.genus_species, Genus=self.genus_uppercase, species=self.species, id="{id}"))
+                # The UNIQUE_ID is specific to a genus_species_strain_sex combination, so every combination should have its own workflow
+                # in galaxy --> TODO: define a naming scheme for these workflows
                 workflow_dict["steps"]["3"]["tool_state"] = workflow_dict["steps"]["3"]["tool_state"].replace("__FULL_NAME__", self.full_name).replace("__UNIQUE_ID__", self.abbreviation)
 
             # Import the workflow in galaxy as a dict
@@ -561,23 +575,16 @@ class RunWorkflow(speciesData.SpeciesData):
         # Iterating over the folders to find datasets and map datasets to their IDs
         logging.debug("Datasets IDs: ")
         for k, v in folders_ids.items():
-            # print(self.full_name)
-            # print(self.species_folder_name)
-            # print(k)
-            # print(v)
             if k == "/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version):
                 sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
                 for k2, v2 in sub_folder_content.items():
                     for e in v2:
                         if type(e) == dict:
-                            # print(e)
                             if e["name"].endswith(".fa"):
                                 self.datasets["genome_file"] = e["ldda_id"]
                                 self.datasets_name["genome_file"] = e["name"]
                                 logging.debug("\tGenome file:\t" + e["name"] + ": " + e["ldda_id"])
-                                # print("Genome file:\t" + e["name"] + ": " + e["ldda_id"])
 
-            # print(self.species_folder_name)
             if k == "/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version):
                 sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
                 for k2, v2 in sub_folder_content.items():
@@ -652,7 +659,6 @@ class RunWorkflow(speciesData.SpeciesData):
         # there is no way to know which imported datasets are the correct ones depending on history content
         # it's not currently used
         history_datasets_li = self.instance.datasets.get_datasets()
-        print(history_datasets_li)
 
         genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None
         interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None
@@ -839,7 +845,7 @@ if __name__ == "__main__":
 
             # Parse the config yaml file
             run_workflow_for_current_organism.config = utilities.parse_config(args.config)
-            # Set the instance url attribute
+            # Set the instance url attribute --> TODO: the localhost rule in the docker-compose still doesn't work on scratchgmodv1
             run_workflow_for_current_organism.instance_url = "http://scratchgmodv1:{0}/sp/{1}_{2}/galaxy/".format(
                 run_workflow_for_current_organism.config["http_port"], 
                 run_workflow_for_current_organism.genus_lowercase,
@@ -863,11 +869,13 @@ if __name__ == "__main__":
                 run_workflow_for_current_organism.get_genome_analysis_id()
                 run_workflow_for_current_organism.get_ogs_analysis_id()
 
+                # run_workflow_for_current_organism.tripal_synchronize_organism_analyses()
 
                 # Get the attributes of the instance and project data files
                 run_workflow_for_current_organism.get_instance_attributes()
 
                 # Import datasets into history and retrieve their hda IDs
+                # TODO: can be simplified with direct access to the folder contents via the full path (no loop required)
                 hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
 
                 # DEBUG
@@ -918,7 +926,7 @@ if __name__ == "__main__":
                 # run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "hda",
                 #                                                        "id": hda_ids["gff_hda_id"]}
 
-                # Ensures galaxy has time to retrieve
+                # Give Galaxy time to ingest the datasets (fixed delay; see the polling sketch after this hunk)
                 time.sleep(60)
                 # Run the Chado load Tripal sync workflow with the parameters set above
                 run_workflow_for_current_organism.run_workflow(workflow_path=workflow,
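
Note on the fixed time.sleep(60) above: a polling loop gives the same guarantee without the arbitrary delay. A sketch, assuming hda_ids holds the history dataset ids returned by import_datasets_into_history() (wait_for_datasets is a hypothetical helper; bioblend's DatasetClient.show_dataset() returns a dict with a "state" field):

    # Sketch only: wait until every imported dataset reaches the "ok" state.
    import time

    def wait_for_datasets(instance, dataset_ids, timeout=600, interval=5):
        deadline = time.time() + timeout
        pending = set(dataset_ids)
        while pending and time.time() < deadline:
            for dataset_id in list(pending):
                state = instance.datasets.show_dataset(dataset_id)["state"]
                if state == "ok":
                    pending.discard(dataset_id)
                elif state == "error":
                    raise RuntimeError("Dataset %s failed to import" % dataset_id)
            if pending:
                time.sleep(interval)
        if pending:
            raise TimeoutError("Datasets not ready after %ss: %s" % (timeout, ", ".join(pending)))
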
diff --git a/speciesData.py b/speciesData.py
index dc6d2a4..238b28e 100644
--- a/speciesData.py
+++ b/speciesData.py
@@ -80,7 +80,6 @@ class SpeciesData:
         self.datasets = dict()
         self.config = None
         # Custom config used to set environment variables inside containers, defaults to the one in the repo
-        self.source_data_dir = parameters_dictionary["data"]["parent_directory"]
         self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase.lower(), self.species.lower(), self.sex.lower()])["not_empty"])
         self.existing_folders_cache = {}
         self.bam_metadata_cache = {}
diff --git a/utilities.py b/utilities.py
index 6885afb..f9f561d 100644
--- a/utilities.py
+++ b/utilities.py
@@ -170,6 +170,11 @@ def get_unique_species_dict(sp_dict_list):
     return unique_species_dict
 
 
+def get_unique_analysis(sp_dict_list):
+    """
+    Placeholder: gather the unique analyses from the input species dictionaries (not implemented yet)
+    """
+    return 0
 
 def write_metadata(metadata_file, metadata_dict):
     """
-- 
GitLab