From 43540a85d2841122a2c46a34b8aac353b01269fb Mon Sep 17 00:00:00 2001
From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr>
Date: Mon, 24 Feb 2020 17:51:37 +0100
Subject: [PATCH] Data loading from project data into history

---
 IDEAS.md        |  13 +++++
 README.md       |   4 +-
 filetransfer.py |  19 +++++++
 main.py         | 115 ++++++++++++++++++++++++++++++-------
 runner.py       |  13 +++++
 workflow.py     | 147 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 290 insertions(+), 21 deletions(-)

diff --git a/IDEAS.md b/IDEAS.md
index e69de29..10fc614 100644
--- a/IDEAS.md
+++ b/IDEAS.md
@@ -0,0 +1,13 @@
+## Project: Phaeoexplorer GGAuto
+
+### Features:
+    - Automatic import of data into the proper Galaxy instance (to be used in conjunction with Docker Swarm)
+    - Automatic execution of the necessary workflows
+    - Updating existing Galaxy instances with additional data (e.g. RNA-seq, Hi-C data, more genomes?)
+    - Docker Swarm integration for an all-in-one deployment/integration tool?
+    - Write a .ga file (Galaxy workflow) to automatically load data and execute analyses
+### Requirements:
+    - Metadata file where the user describes the data/species/etc. (see the example entry below):
+        - Species + strain names
+        - Who ordered the data, who sequenced it, who assembled it, who did the annotation (if any)?
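+    - Example metadata entry (sketch only; the keys below are the ones the current scripts read from the parameters dict, the values are placeholders):
+
+      ```json
+      {
+          "genus": "Genus",
+          "species": "species",
+          "strain": "strain",
+          "sex": "female",
+          "common": "common name",
+          "performed by": "institute or person",
+          "genome version": "1.0",
+          "ogs version": "1.0"
+      }
+      ```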
diff --git a/README.md b/README.md
index 8fa50d5..17a381d 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
 # gga_load_data
 
-Bioblend script to load data into GGA
\ No newline at end of file
+Bioblend-based script to load data into GGA
+
+TODO: Metadata in /projet/sbr/phaeoexplorer/
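+
+Hypothetical usage sketch (the actual entry point and its arguments are defined
+in main.py and may differ):
+
+    python main.py <metadata.json>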
diff --git a/filetransfer.py b/filetransfer.py
index e69de29..9dae890 100644
--- a/filetransfer.py
+++ b/filetransfer.py
@@ -0,0 +1,19 @@
+"""
+SFTP helper class for downloading files from the server (not needed in production)
+"""
+import subprocess
+
+
+# TODO: reimplement with pysftp (see the sketch below)
+
+class FileTransfer:
+
+    def connect(self, user, host, password):
+        # Open an interactive sftp session as user@host. sftp does not accept
+        # a password on the command line, so key-based auth is assumed here;
+        # the password parameter is kept for the planned pysftp implementation.
+        return subprocess.Popen(["sftp", user + "@" + host])
+
+    def file(self):
+        # placeholder for a single-file download helper
+        return 0
+
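+# Minimal pysftp-based sketch for the TODO above (assumes the pysftp package
+# is installed; host, credentials and paths are placeholders):
+#
+#   import pysftp
+#
+#   with pysftp.Connection(host, username=user, password=password) as sftp:
+#       sftp.get(remote_path, local_path)  # download one file from the server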
diff --git a/main.py b/main.py
index bf999a9..8a46fc1 100644
--- a/main.py
+++ b/main.py
@@ -82,10 +82,10 @@ def main():
 
         # Connect to the galaxy instance of the current species TODO: API key connection issues
         gi = galaxy.GalaxyInstance(url=instance_url,
-                                   key="291ec50b48dd1f006985b32e39bc3696",
+                                   key="0e993414b2f876515e74dd890f16ffc7",
                                    email="alebars@sb-roscoff.fr",
                                    password="pouet",
-                                   verify=True)
+                                   verify=False)  # testing setup: skip SSL certificate verification
 
         """
         This part creates the current species directory and go to it
@@ -117,6 +117,7 @@ def main():
             hl = gi.histories.get_histories()
         except bb.ConnectionError:
             print("Cannot connect to GGA instance @ " + instance_url)
+
         else:
             print("Successfully connected to instance " + instance_url)
 
@@ -148,34 +149,108 @@ def main():
         # generate workflow file and run it in the galaxy instance
         workflow = Workflow(parameters_dict=sp_dict, instance=gi)
         print("Generating custom workflow for " + genus_species_strain)
-        workflow.generate(working_directory=wd, main_directory=main_dir)
-        wf_id = workflow.store()
-        hi_id = gi.histories.get_histories()[0]["id"]
-        print("Workflow id: " + wf_id)
-        print("History id: " + hi_id)
-        wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
-        print(wf_show["inputs"])
-
-        # workflow.port()
-        li_id = gi.libraries.get_libraries()[0]["id"]
-        # gi.folders.update_folder()  # TODO: add method to enable data updates
-        tsi = gi.toolshed.get_repositories()  # tool shed repo access point
-        # print(gi.users.get_users())  # TODO: users management
-        # print(gi.libraries.show_library(li_id))
-        fo_gi = gi.libraries.get_folders(library_id=li_id)  # data location
+        wf_str = workflow.generate(working_directory=wd, main_directory=main_dir)  # workflow as a JSON string
+
+        gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
+        hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version))
+        hi_id = hi[0]["id"]
+        # print(hi)
+        li = gi.libraries.get_libraries()  # only one library: the project data library
+        # print(li)
+        li_id = li[0]["id"]
+        # print(li_id)
+        fo_gi = gi.libraries.get_folders(library_id=li_id)  # data location (project data)
+        # print(fo_gi)
         fo_id = {}
         current_fo_name = ""
-        print(fo_gi)
         # folders ids: access to data to run the first tools
         for i in fo_gi:
             for k, v in i.items():
                 if k == "name":
-                    fo_id[k] = 0
-                    current_fo_name = k
+                    fo_id[v] = 0
+                    current_fo_name = v
                 if k == "id":
                     fo_id[current_fo_name] = v
+        print("IDs: ")
+        datasets = {}
+        for k, v in fo_id.items():
+            print("\t" + k + ": " + v)
+            if k == "/genome":
+                sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
+                # print(sub_folder_content)
+                for k2, v2 in sub_folder_content.items():
+                    # print(k2)
+                    # print(v2)
+                    for e in v2:
+                        if isinstance(e, dict):
+                            if e["name"].endswith(".fa"):
+                                datasets["genome_file"] = e["ldda_id"]
+                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
+            elif k == "/annotation/" + genus_species:
+                sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
+                # print(sub_folder_content)
+                for k2, v2 in sub_folder_content.items():
+                    # print(k2)
+                    # print(v2)
+                    for e in v2:
+                        if isinstance(e, dict):
+                            if e["name"].endswith("transcripts-gff.fa"):
+                                datasets["transcripts_file"] = e["ldda_id"]
+                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
+                            elif e["name"].endswith("proteins.fasta"):
+                                datasets["proteins_file"] = e["ldda_id"]
+                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
+                            elif e["name"].endswith(".gff"):
+                                datasets["gff_file"] = e["ldda_id"]
+                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
+
+        current_hi = gi.histories.get_current_history()["id"]
+        print("Current history ID: " + current_hi)
+        gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["genome_file"])
+        gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["gff_file"])
+        gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["transcripts_file"])
+        gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["proteins_file"])
+        # print(gi.tools.get_tool_panel())
+
+        # print(wf_str)
+        wf_dict = json.loads(wf_str)  # TODO: move this conversion into the Workflow class
+        gi.workflows.import_workflow_dict(workflow_dict=wf_dict)  # the API expects a dict; eval() on the string doesn't work
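+        # Possible next step (sketch, not wired up yet): invoke the imported
+        # workflow on the datasets loaded above, e.g.
+        #   gi.workflows.invoke_workflow(workflow_id=..., history_id=hi_id,
+        #                                inputs={...})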
+
 
+        # # PRE FILLED METHOD
+        # wf_id = workflow.store()
+        # hi_id = gi.histories.get_histories()[0]["id"]
+        # print("Workflow id: " + wf_id)
+        # print("History id: " + hi_id)
+        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
+        # # print(wf_show["inputs"])  # ->no inputs
+        # # workflow.port()
+        # li_id = gi.libraries.get_libraries()[0]["id"]
+        # # gi.folders.update_folder()  # TODO: add method to enable data updates
+        # tsi = gi.toolshed.get_repositories()  # tool shed repo access point
+        # # print(gi.users.get_users())  # TODO: users management
+        # # print(gi.libraries.show_library(li_id))
+        # # TODO: create a NEW history, import data to galaxy from library (src_data)
+        # fo_gi = gi.libraries.get_folders(library_id=li_id)  # data location
+        # fo_id = {}
+        # current_fo_name = ""
+        # # print(fo_gi)
+        # # folders ids: access to data to run the first tools
+        # for i in fo_gi:
+        #     for k, v in i.items():
+        #         if k == "name":
+        #             fo_id[v] = 0
+        #             current_fo_name = v
+        #         if k == "id":
+        #             fo_id[current_fo_name] = v
+        # print("Folders id: ")
+        # for k, v in fo_id.items():
+        #     print("\t" + k + ": " + v)
+        # workflow.show()
+        # # gi.workflows.run_workflow(workflow_id=wf_id)  # pre filled workflow, use the set on runtime approach instead
 
+        # for testing: history purging is configured in ~/config/galaxy.yml.docker_sample
+        # gi.histories.delete_history(history_id=hi_id, purge=True)
         os.chdir(main_dir)
         print("\n")
 
diff --git a/runner.py b/runner.py
index e69de29..9bdfb67 100644
--- a/runner.py
+++ b/runner.py
@@ -0,0 +1,13 @@
+import os
+from bioblend.galaxy import GalaxyInstance
+
+"""
+Tool utilities (tools managed by GGA/GMOD) for job submission in Galaxy using bioblend
+"""
+
+
+class Runner:
+
+    def __init__(self, instance):
+        self.instance = instance  # the bioblend GalaxyInstance used for job submission
+
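+    # Sketch of the intended job submission (assumption, not implemented yet):
+    #   self.instance.tools.run_tool(history_id=..., tool_id=..., tool_inputs={...})
+    # run_tool is the bioblend ToolClient method for launching a tool job.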
diff --git a/workflow.py b/workflow.py
index e69de29..f658f6a 100644
--- a/workflow.py
+++ b/workflow.py
@@ -0,0 +1,147 @@
+import os
+import json
+
+import bioblend
+from bioblend.galaxy import GalaxyInstance
+
+"""
+Workflow creation for the generation and visualization of data and analysis outputs
+"""
+
+
+class Workflow:
+
+    def __init__(self, parameters_dict, instance):
+        self.instance = instance
+        self.parameters_dict = parameters_dict
+        self.genus = parameters_dict["genus"]
+        self.species = parameters_dict["species"]
+        self.strain = parameters_dict["strain"]
+        self.common = parameters_dict["common"]
+        self.performed = parameters_dict["performed by"]
+        self.genome_version = parameters_dict["genome version"]
+        self.ogs_version = parameters_dict["ogs version"]
+        self.sex = parameters_dict["sex"]
+        self.custom_ga_file = None
+        self.custom_ga_file_path = None
+        self.preset_ga_file = None
+        if self.strain != "":
+            self.abbr = self.genus[0].lower() + "_" + self.species + "_" + self.strain
+            self.full = "_".join([self.genus, self.species, self.strain, self.sex])
+        else:
+            self.abbr = self.genus[0].lower() + "_" + self.species
+            self.full = "_".join([self.genus, self.species, self.sex])  # skip the empty strain field
+        self.workflow = None
+
+    def generate(self, working_directory, main_directory):
+        """
+        Generation of a galaxy workflow using the defined parameters in the .json input file
+        Output format is .ga (basically a .json for galaxy)
+
+        :param working_directory: 
+        :param main_directory: 
+        :return: 
+        """
+        # template workflow as a string
+        # template_workflow_str = '"{"uuid": "ea9c3050-416f-4098-a7ff-b05c952bcd73", "tags": [], "format-version": "0.1", "name": "test", "version": 2, "steps": {"0": {"tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2", "tool_version": "2.3.2", "outputs": [{"type": "txt", "name": "results"}], "workflow_outputs": [{"output_name": "results", "uuid": "d9b75b03-49e7-4a81-a67c-eaf6a9671905", "label": null}], "input_connections": {}, "tool_state": "{\"__page__\": null, \"__rerun_remap_job_id__\": null, \"organism\": \"\\\"2\\\"\", \"psql_target\": \"{\\\"__current_case__\\\": 0, \\\"method\\\": \\\"remote\\\"}\"}", "id": 0, "tool_shed_repository": {"owner": "gga", "changeset_revision": "13da56fdaeb1", "name": "chado_organism_delete_organisms", "tool_shed": "toolshed.g2.bx.psu.edu"}, "uuid": "f70569bc-9ac0-441a-a2d8-b547086a5bdf", "errors": null, "name": "Chado organism delete", "post_job_actions": {}, "label": "$ORGADELETE", "inputs": [], "position": {"top": 362, "left": 200}, "annotation": "", "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2", "type": "tool"}, "1": {"tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2", "tool_version": "2.3.2", "outputs": [{"type": "json", "name": "results"}], "workflow_outputs": [{"output_name": "results", "uuid": "8ca0b891-0f01-4787-9b7f-57105dc303b0", "label": null}], "input_connections": {}, "tool_state": "{\"comment\": \"\\\"\\\"\", \"__page__\": null, \"__rerun_remap_job_id__\": null, \"psql_target\": \"{\\\"__current_case__\\\": 0, \\\"method\\\": \\\"remote\\\"}\", \"common\": \"\\\"$COMMON\\\"\", \"genus\": \"\\\"$GENUS\\\"\", \"species\": \"\\\"$SPECIES\\\"\", \"abbr\": \"\\\"$ABBR\\\"\"}", "id": 1, "tool_shed_repository": {"owner": "gga", "changeset_revision": "0f4956cec445", "name": "chado_organism_add_organism", "tool_shed": "toolshed.g2.bx.psu.edu"}, "uuid": "24f0e175-f932-4e48-8b42-a53d9a432d5e", "errors": null, "name": "Chado organism add", "post_job_actions": {}, "label": "$ORGADD", "inputs": [], "position": {"top": 361, "left": 467.5}, "annotation": "", "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2", "type": "tool"}, "2": {"tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", "tool_version": "2.3.2", "outputs": [{"type": "json", "name": "results"}], "workflow_outputs": [{"output_name": "results", "uuid": "8fa0e728-8803-4800-93b4-70f906f95f87", "label": null}], "input_connections": {}, "tool_state": "{\"__page__\": null, \"name\": \"\\\"$GENOME\\\"\", \"sourceuri\": \"\\\"\\\"\", \"sourcename\": \"\\\"\\\"\", \"__rerun_remap_job_id__\": null, \"programversion\": \"\\\"\\\"\", \"psql_target\": \"{\\\"__current_case__\\\": 0, \\\"method\\\": \\\"remote\\\"}\", \"sourceversion\": \"\\\"\\\"\", \"program\": \"\\\"$PERFORMEDBY\\\"\", \"algorithm\": \"\\\"\\\"\", \"date_executed\": \"\\\"\\\"\", \"description\": \"\\\"\\\"\"}", "id": 2, "tool_shed_repository": {"owner": "gga", "changeset_revision": "3a1f3c9b755b", "name": "chado_analysis_add_analysis", "tool_shed": "toolshed.g2.bx.psu.edu"}, "uuid": "76cbbd55-f1ac-4e48-be3c-c7bbda5add4c", "errors": null, "name": "Chado analysis add", "post_job_actions": {}, "label": "$ADDGENOME", "inputs": [], "position": {"top": 307, "left": 690}, "annotation": "", "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", "type": "tool"}, "3": {"tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", "tool_version": "2.3.2", "outputs": [{"type": "json", "name": "results"}], "workflow_outputs": [{"output_name": "results", "uuid": "5e7da027-0723-4077-8885-2dbe51cb5dda", "label": null}], "input_connections": {}, "tool_state": "{\"__page__\": null, \"name\": \"\\\"$OGS\\\"\", \"sourceuri\": \"\\\"\\\"\", \"sourcename\": \"\\\"\\\"\", \"__rerun_remap_job_id__\": null, \"programversion\": \"\\\"\\\"\", \"psql_target\": \"{\\\"__current_case__\\\": 0, \\\"method\\\": \\\"remote\\\"}\", \"sourceversion\": \"\\\"\\\"\", \"program\": \"\\\"$PERFORMEDBY\\\"\", \"algorithm\": \"\\\"\\\"\", \"date_executed\": \"\\\"\\\"\", \"description\": \"\\\"\\\"\"}", "id": 3, "tool_shed_repository": {"owner": "gga", "changeset_revision": "3a1f3c9b755b", "name": "chado_analysis_add_analysis", "tool_shed": "toolshed.g2.bx.psu.edu"}, "uuid": "4d1ffee4-00b2-445d-b630-b7b774c17873", "errors": null, "name": "Chado analysis add", "post_job_actions": {}, "label": "$ADDOGS", "inputs": [], "position": {"top": 395, "left": 697}, "annotation": "", "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", "type": "tool"}, "4": {"tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_feature_load_fasta/feature_load_fasta/2.3.2", "tool_version": "2.3.2", "outputs": [{"type": "json", "name": "results"}], "workflow_outputs": [{"output_name": "results", "uuid": "737dddc9-ae1b-463d-99fa-d9176053594d", "label": null}], "input_connections": {}, "tool_state": "{\"do_update\": \"\\\"false\\\"\", \"relationships\": \"{\\\"__current_case__\\\": 0, \\\"rel_type\\\": \\\"none\\\"}\", \"ext_db\": \"{\\\"db\\\": \\\"\\\", \\\"re_db_accession\\\": \\\"\\\"}\", \"analysis_id\": \"\\\"4\\\"\", \"re_uniquename\": \"\\\"\\\"\", \"match_on_name\": \"\\\"false\\\"\", \"__page__\": null, \"__rerun_remap_job_id__\": null, \"psql_target\": \"{\\\"__current_case__\\\": 0, \\\"method\\\": \\\"remote\\\"}\", \"re_name\": \"\\\"\\\"\", \"fasta\": \"{\\\"__class__\\\": \\\"RuntimeValue\\\"}\", \"wait_for\": \"{\\\"__class__\\\": \\\"RuntimeValue\\\"}\", \"organism\": \"\\\"2\\\"\", \"sequence_type\": \"\\\"contig\\\"\"}", "id": 4, "tool_shed_repository": {"owner": "gga", "changeset_revision": "1421dbc33a92", "name": "chado_feature_load_fasta", "tool_shed": "toolshed.g2.bx.psu.edu"}, "uuid": "3d417ced-fc48-4c04-8a92-fdb7b9fecafc", "errors": null, "name": "Chado load fasta", "post_job_actions": {}, "label": "$LOADFASTA", "inputs": [{"name": "fasta", "description": "runtime parameter for tool Chado load fasta"}, {"name": "wait_for", "description": "runtime parameter for tool Chado load fasta"}], "position": {"top": 306, "left": 933.5}, "annotation": "", "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_feature_load_fasta/feature_load_fasta/2.3.2", "type": "tool"}}, "annotation": "", "a_galaxy_workflow": "true"}"'
+
+        os.chdir(path=working_directory)
+        self.preset_ga_file = os.path.join(main_directory, "preset_workflow.ga")
+        if self.strain != "":
+            self.custom_ga_file = "_".join([self.genus, self.species, self.strain]) + "_workflow.ga"
+        else:
+            self.custom_ga_file = "_".join([self.genus, self.species]) + "_workflow.ga"
+        self.custom_ga_file_path = os.path.abspath(self.custom_ga_file)
+        print("Workflow file @ " + self.custom_ga_file_path)
+        with open(self.preset_ga_file, 'r') as ga_in_file:
+            # str(readlines()) wraps the content in "['...']" and doubles every
+            # backslash; the replace and slicing below undo that wrapping
+            ga_in = str(ga_in_file.readlines())
+            ga_in = ga_in.replace("$OGS", "OGS")
+            ga_in = ga_in.replace("$VERSION", self.ogs_version)
+            ga_in = ga_in.replace("$GENUS", self.genus)
+            ga_in = ga_in.replace("$SPECIES", self.species)
+            ga_in = ga_in.replace("$ABBR", self.abbr)
+            ga_in = ga_in.replace("$STRAIN", self.strain)
+            ga_in = ga_in.replace("$PERFORMEDBY", self.performed)
+            ga_in = ga_in.replace("$COMMON", self.common)
+            ga_in = ga_in.replace("$ORGA", self.full)
+            ga_in = ga_in.replace("$ADDAN", "Add analysis")
+            ga_in = ga_in.replace("\\\\", "\\")  # restore the single backslashes of the .ga file
+            workflow_name = '"name": "' + self.full + '"'
+            ga_in = ga_in.replace('"name": "preset"', workflow_name)
+            ga_in = ga_in[2:]  # strip the leading "['" left by str(readlines())
+            ga_in = ga_in[:-4]  # strip the trailing "\n']"
+            self.workflow = ga_in
+        # write the customised workflow to disk so port() can import it from file
+        with open(self.custom_ga_file, 'w') as ga_out_file:
+            ga_out_file.write(ga_in)
+        return ga_in
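+
+    # Typical usage (sketch): ga_str = wf.generate(wd, main_dir), then import
+    # the workflow either as a dict via dict_port() or from the written .ga
+    # file via port()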
+
+    def dict_port(self):
+        """
+        Import the generated workflow into the Galaxy instance as a dict
+
+        :return: True on success, False on connection failure
+        """
+        try:
+            # self.workflow holds a JSON string; import_workflow_dict expects a dict
+            self.instance.workflows.import_workflow_dict(workflow_dict=json.loads(self.workflow))
+        except bioblend.ConnectionError:
+            return False
+        return True
+
+    def port(self):
+        """
+        Import the workflow into the Galaxy instance from its .ga file
+        Importing from a string doesn't work (it MUST be a dict) -> see dict_port()
+
+        :return: True on success, False on connection failure
+        """
+        try:
+            self.instance.workflows.import_workflow_from_local_path(self.custom_ga_file_path)
+        except bioblend.ConnectionError:
+            return False
+        else:
+            return True
+
+    def show(self):
+        """
+        Print the instance's main workflow to stdout and return it (dict form)
+
+        :return: the workflow description dict
+        """
+        workflow_id = self.instance.workflows.get_workflows()[0]['id']
+        workflow = self.instance.workflows.show_workflow(workflow_id=workflow_id)
+        print(workflow)
+        return workflow
+
+    def store(self):
+        """
+        Return the id of the workflow stored in the Galaxy instance
+
+        :return: the workflow id
+        """
+        workflow_id = self.instance.workflows.get_workflows()[0]['id']
+        return workflow_id
+
+
+    def delete(self):
+        """
+        TODO: delete the workflow from the Galaxy instance
+
+        :return:
+        """
+        return None
+
+
+    def run(self):
+        """
+        TODO: run the custom workflow in the Galaxy instance, with the input
+        datasets passed as a list at runtime
+
+        :return:
+        """
+        wf_id = self.store()  # id of the stored workflow (show() only prints it)
+        datamap = {"genus": self.genus, "species": self.species, "strain": self.strain, "abbr": self.abbr,
+                   "full": self.full, "common": self.common, "ogs_version": self.ogs_version,
+                   "genome_version": self.genome_version, "sex": self.sex, "performed": self.performed}
+        return None
+
+    # def add_step(self, step_position, description, name):
+    #     """
+    #     TODO: add a step to the workflow (data loading into chado for example)
+    #
+    #     :param workflow:
+    #     :return:
+    #     """
+    #     return None
-- 
GitLab