Loraine Gueguen · e9abe404 · 19e70cf7 · 7c6df170 · 79116521 · c2b5e5a2
--- a/gga_load_data.py

+ 41

− 32
+++ b/gga_load_data.py

+ 41

− 32
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+import re

 import bioblend
 import argparse
 @@ -10,6 +11,7 @@ import time
 import json
 import yaml

+from bioblend import galaxy
 from bioblend.galaxy.objects import GalaxyInstance

 import utilities
 @@ -27,6 +29,8 @@ Do not call this script before the galaxy container is ready
 GET_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0"
 DELETE_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.4+galaxy0"

+HOST_DATA_DIR='src_data'
+CONTAINER_DATA_DIR_ROOT='/project_data'

 class LoadData(speciesData.SpeciesData):
    """
 @@ -67,25 +71,22 @@ class LoadData(speciesData.SpeciesData):
        """

        logging.debug("Getting 'Homo sapiens' ID in chado database")
-        get_sapiens_id_job = self.instance.tools.run_tool(
+        get_sapiens_id_job_output_dataset_id = utilities.run_tool_and_get_single_output_dataset_id(self.instance,
            tool_id=GET_ORGANISMS_TOOL,
            history_id=self.history_id,
            tool_inputs={"genus": "Homo", "species": "sapiens"})
-        get_sapiens_id_job_output_dataset_id = get_sapiens_id_job["outputs"][0]["id"]
        get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output_dataset_id)
+
+        logging.debug("Deleting Homo 'sapiens' in the instance's chado database")
        try:
-            logging.debug("Deleting Homo 'sapiens' in the instance's chado database")
            get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0]
-            sapiens_id = str(
-                get_sapiens_id_final_output["organism_id"])  # needs to be str to be recognized by the chado tool
-            self.instance.tools.run_tool(
+            sapiens_id = str(get_sapiens_id_final_output["organism_id"])  # needs to be str to be recognized by the chado tool
+            utilities.run_tool(
                tool_id=DELETE_ORGANISMS_TOOL,
                history_id=self.history_id,
-                tool_inputs={"organism": str(sapiens_id)})
-        except bioblend.ConnectionError:
-            logging.debug("Homo sapiens isn't in the instance's chado database (bioblend.ConnectionError)")
+                tool_inputs={"organism": sapiens_id})
        except IndexError:
-            logging.debug("Homo sapiens isn't in the instance's chado database (IndexError)")
+            logging.error("Homo sapiens isn't in the instance's chado database (IndexError)")
            pass

    def purge_histories(self):
 @@ -98,13 +99,12 @@ class LoadData(speciesData.SpeciesData):
        """

        histories = self.instance.histories.get_histories()
-        self.instance.histories.get_histories(deleted=False)
        for h in histories:
            self.instance.histories.delete_history(history_id=h["id"])

        return histories

-    def setup_library(self, instance):
+    def setup_library(self):
        """
        Create a "Project Data" library in galaxy, mirroring the "src_data" folder of the current organism
        directory tree
 @@ -112,12 +112,18 @@ class LoadData(speciesData.SpeciesData):
        :return:
        """

-        self.goto_species_dir()
+        data_dir_root=os.path.join(self.get_species_dir(), HOST_DATA_DIR)

+        instance = GalaxyInstance(url=self.instance_url,
+                                              email=self.config["galaxy_default_admin_email"],
+                                              password=self.config["galaxy_default_admin_password"]
+                                              )
+
+        logging.info("Looking for project data in %s" % data_dir_root)
        folders = dict()
        post_renaming = {}

-        for root, dirs, files in os.walk("./src_data", followlinks=True):
+        for root, dirs, files in os.walk(data_dir_root, followlinks=True):
            file_list = [os.path.join(root, filename) for filename in files]
            folders[root] = file_list

 @@ -136,7 +142,7 @@ class LoadData(speciesData.SpeciesData):

            for fname, files in folders.items():
                if fname and files:
-                    folder_name = fname[len("./src_data") + 1:]
+                    folder_name = re.sub(data_dir_root + "/", "", fname)
                    logging.info("Creating folder: %s" % folder_name)
                    folder = self.create_deep_folder(prj_lib, folder_name)

 @@ -178,9 +184,12 @@ class LoadData(speciesData.SpeciesData):
                            logging.info("Skipping useless file '%s'" % single_file)
                            continue

-                        logging.info("Adding file '%s' with type '%s' and name '%s'" % (single_file, ftype, clean_name))
+                        single_file_relative_path = re.sub(data_dir_root, CONTAINER_DATA_DIR_ROOT, single_file)
+                        single_file_path_in_container=os.path.join(CONTAINER_DATA_DIR_ROOT, single_file_relative_path)
+
+                        logging.info("Adding file '%s' with type '%s' and name '%s'" % (single_file_path_in_container, ftype, clean_name))
                        datasets = prj_lib.upload_from_galaxy_fs(
-                            single_file,
+                            single_file_path_in_container,
                            folder=folder,
                            link_data_only='link_to_files',
                            file_type=ftype,
 @@ -200,19 +209,19 @@ class LoadData(speciesData.SpeciesData):
        logging.info("Waiting for import jobs to finish... please wait")

        # Checking job state (only necessary if ran using SLURM)
-        # while True:
-        #     try:
-        #         # "C" state means the job is completed, no need to wait for it
-        #         ret = subprocess.check_output("squeue | grep -v \"C debug\" | grep -v \"JOBID\" || true",
-        #                                       shell=True)
-        #         if not len(ret):
-        #             break
-        #         time.sleep(3)
-        #     except subprocess.CalledProcessError as inst:
-        #         if inst.returncode == 153:  # queue is empty
-        #             break
-        #         else:
-        #             raise
+        while True:
+            try:
+                # "C" state means the job is completed, no need to wait for it
+                ret = subprocess.check_output("squeue | grep -v \"C debug\" | grep -v \"JOBID\" || true",
+                                              shell=True)
+                if not len(ret):
+                    break
+                time.sleep(3)
+            except subprocess.CalledProcessError as inst:
+                if inst.returncode == 153:  # queue is empty
+                    break
+                else:
+                    raise

        time.sleep(10)

 @@ -279,7 +288,7 @@ class LoadData(speciesData.SpeciesData):
        """

        logging.info("Connecting to the galaxy instance (%s)" % self.instance_url)
-        self.instance = GalaxyInstance(url=self.instance_url,
+        self.instance = galaxy.GalaxyInstance(url=self.instance_url,
                                              email=self.config["galaxy_default_admin_email"],
                                              password=self.config["galaxy_default_admin_password"]
                                              )
 @@ -369,7 +378,7 @@ if __name__ == "__main__":

            # Load the datasets into a galaxy library
            logging.info("Setting up library for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))
-            load_data_for_current_species.setup_library(load_data_for_current_species.instance)
+            load_data_for_current_species.setup_library()
            logging.debug("Successfully set up library in galaxy for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))

            # Set or get the history for the current organism