From a9daa4b9acd094b1d9e902feb3fff7e7cf4bd731 Mon Sep 17 00:00:00 2001
From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr>
Date: Mon, 9 Mar 2020 09:31:19 +0100
Subject: [PATCH] .ga correct editing for jbrowse workflow

---
 main.py     | 208 +++++++++++++++++++++++++++++++---------------------
 workflow.py |  21 ++++--
 2 files changed, 139 insertions(+), 90 deletions(-)

diff --git a/main.py b/main.py
index 7b06b6c..fa23d79 100644
--- a/main.py
+++ b/main.py
@@ -5,9 +5,9 @@ import argparse
 import os
 import sys
 import subprocess
+import logging
+import re
 import json
-import urllib3 as ul
-from chado import ChadoInstance
 from workflow import Workflow
 from toolrunner import ToolRunner
 """
@@ -17,22 +17,12 @@
 python3 ~/PycharmProjects/ggauto/gga_load_data/main.py ~/PycharmProjects/ggauto/
 """
 
 
-class Autoload:
-    """
-    TODO: turn main into an object
-    """
-    def __init__(self, json_in):
-        self.json_in = json_in
-
-    def main(self):
-        return None
-
-
 def main():
     parser = argparse.ArgumentParser(description="Input genus, species, strain, version")
     parser.add_argument("json", type=str, help="Input JSON file")
     parser.add_argument("--just-load", help="Only load data into galaxy, does not create nor run analyses in galaxy")
+    parser.add_argument("-v", "--verbose", help="Increase output verbosity")
 
     # CLI stuff
     # parser.add_argument("--name", help="Sample species name, format: genus-species",type=str)
@@ -42,6 +32,8 @@ def main():
 
     user_arguments = parser.parse_args()
 
+    logging.basicConfig(level=logging.INFO)
+
     # List that will hold all dicts from the JSON input file, containing parameters for each species
     sp_dict_list = []
 
@@ -81,17 +73,20 @@ def main():
         # Test adress, change to abims-gga.sb-roscoff.fr/sp/ in production
         instance_url = "http://localhost/sp/" + genus_lower + "_" + species + "/galaxy/"
 
-        print("Species: " + genus + " " + species + " (" + common + ")"
-              "\nStrain: " + strain +
-              "\nAccessing instance " + instance_url)
+        print("Species: " + genus + " " + species + " (" + common + ")" +
+              "\nStrain: " + strain +
+              "\nAccessing instance " + instance_url)
 
-        # Connect to the galaxy instance of the current species TODO: API key connection issues
+        # Connect to the galaxy instance of the current species TODO: connection issues (galaxy side)
         gi = galaxy.GalaxyInstance(url=instance_url,
                                    key="3b36455cb16b4d0e4348e2c42f4bb934",
                                    email="alebars@sb-roscoff.fr",
                                    password="pouet",
                                    verify=True)
+        # admin_email = os.environ.get('GALAXY_DEFAULT_ADMIN_USER', 'admin@galaxy.org')
+        # admin_pass = os.environ.get('GALAXY_DEFAULT_ADMIN_PASSWORD', 'admin')
+
 
         """
         This part creates the current species directory and go to it
         If it already exists, just move to it
@@ -118,22 +113,62 @@ def main():
         #                            password="****")
 
         # Check connection to the current instance
+        print("Testing connection to the galaxy instance")
         try:
             hl = gi.histories.get_histories()
         except bb.ConnectionError:
-            print("Cannot connect to GGA instance @ " + instance_url)
+            print("Cannot connect to galaxy instance @ " + instance_url)
         else:
-            print("Successfully connected to instance " + instance_url)
+            print("Successfully connected to galaxy instance @ " + instance_url)
 
-        # TODO: FTP stuff to retrieve the datasets (used in testing, not needed for production)
+        # TODO: FTP/symlink stuff to retrieve the datasets + change headers in pep.fasta
+        setup_data_libraries_cl = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
 
         # try:
         #     os.mkdir("./src_data")
         # except FileExistsError:
         #     print("src_data folder already exists for " + genus_species_strain)
+        #     print("Loading data into galaxy...")
+        #     try:
+        #         setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
+        #         print("Output from setup_data_libraries.py")
+        #         print(setup_data_libraries.communicate())
+        #     except bb.ConnectionError:
+        #         print("Cannot load data into container for " + genus_species_strain)
+        #         break
+        #     else:
+        #         print("Data successfully loaded into docker container for " + genus_species_strain)
         # else:
         #     print("src_data folder created for " + genus_species_strain)
-        #
+        #     try:
+        #         setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
+        #         print("Output from setup_data_libraries.py")
+        #         print(setup_data_libraries.communicate())
+        #     except bb.ConnectionError:
+        #         print("Cannot load data into container for " + genus_species_strain)
+        #         break
+        #     else:
+        #         print("Data successfully loaded into docker container for " + genus_species_strain)
+
+        genome_dir, annotation_dir = None, None
+        for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
+            if "annotation/" in d:
+                annotation_dir = d
+                annotation_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
+                print("src_data annotation file(s):")
+                print(str('\t' + file) for file in annotation_dir_files)
+            elif "genome/" in d:
+                genome_dir = d
+                genome_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
+                print("src_data genome file(s):")
+                print(str('\t' + file) for file in genome_dir_files)
+
+
+
+        modify_pep_headers = ["sh /usr/local/genome2/mmo/scripts/phaeoexplorer/phaeoexplorer-change_pep_fasta_header.sh"]
+
+
         # TODO: load the data into the current species directory and load it into galaxy instance
         # setup_data_libraries_cl = \
         #     "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
@@ -150,7 +185,7 @@ def main():
 
         # generate workflow file and run it in the galaxy instance
-        gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
+        # gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
         hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version))
         hi_id = hi[0]["id"]
         li = gi.libraries.get_libraries()  # only one library
 
@@ -186,7 +221,7 @@ def main():
         for k2, v2 in sub_folder_content.items():
             for e in v2:
                 if type(e) == dict:
-                    # TODO: manage several files of the same type
+                    # TODO: manage several files of the same type and versions
                     if e["name"].endswith("transcripts-gff.fa"):
                         datasets["transcripts_file"] = e["ldda_id"]
                         print("\t\t" + e["name"] + ": " + e["ldda_id"])
@@ -196,6 +231,9 @@ def main():
                     elif e["name"].endswith(".gff"):
                         datasets["gff_file"] = e["ldda_id"]
                         print("\t\t" + e["name"] + ": " + e["ldda_id"])
+                    elif e["name"].endswith("MALE"):
+                        datasets["gff_file"] = e["ldda_id"]
+                        print("\t\t" + e["name"] + ": " + e["ldda_id"])
 
         current_hi_id = gi.histories.get_current_history()["id"]
         print("History ID: " + current_hi_id)
@@ -204,76 +242,80 @@ def main():
         gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["transcripts_file"])
         gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["proteins_file"])
 
-        # Delete Homo sapiens from Chado database
-        toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
-        sapiens_id = None
-        sapiens = toolrunner.get_sapiens_id()
-        sapiens_job_out = sapiens["outputs"][0]["id"]
-        sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
-        try:
-            sapiens_output = json.loads(sapiens_json_output)[0]
-            sapiens_id = str(sapiens_output["organism_id"])  # needs to be str to be recognized by the chado tool
-            toolrunner.delete_sapiens(hs_id=sapiens_id)
-        except bb.ConnectionError:
-            print("Homo sapiens isn't in the database")
+        # ---------------------------------------------------------------------
+        # Galaxy instance interaction
+        # ---------------------------------------------------------------------
 
-        # Workflow generation
+        # # Delete Homo sapiens from Chado database
+        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
+        # sapiens = toolrunner.get_sapiens_id()
+        # sapiens_job_out = sapiens["outputs"][0]["id"]
+        # sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
+        # try:
+        #     sapiens_output = json.loads(sapiens_json_output)[0]
+        #     sapiens_id = str(sapiens_output["organism_id"])  # needs to be str to be recognized by the chado tool
+        #     toolrunner.delete_sapiens(hs_id=sapiens_id)
+        # except bb.ConnectionError:
+        #     print("Homo sapiens isn't in the database")
+        # except IndexError:
+        #     pass
+        #
+        # # Workflow generation
         workflow = Workflow(parameters_dict=sp_dict, instance=gi, history_id = current_hi_id)
-        wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
-
-        tools = gi.tools.get_tool_panel()  # tools panel -> alternative to wf
-        # print(tools)
-
-        wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
-
-        gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
-        wf_name = workflow.get_workflow_name()
-        wf_attr = gi.workflows.get_workflows(name=wf_name)
-        wf_id = wf_attr[0]["id"]
-        wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
-        print("Workflow ID: " + wf_id)
-
-        toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
-        # toolrunner.purge_organisms()
-
-        # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)
-
-        wf_params = workflow.set_main_workflow_parameters(datasets=datasets)
-        print("Inputs:")
-        print(wf_show["inputs"])
-
-        datamap = dict()
-        datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
-        datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
-        datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
-        datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}
+        # wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
+        #
+        # tools = gi.tools.get_tool_panel()  # tools panel -> alternative to wf
+        # # print(tools)
+        #
+        # wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
+        #
+        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
+        # wf_name = workflow.get_workflow_name()
+        # wf_attr = gi.workflows.get_workflows(name=wf_name)
+        # wf_id = wf_attr[0]["id"]
+        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
+        # print("Workflow ID: " + wf_id)
+        #
+        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
+        # # toolrunner.purge_organisms()
+        #
+        # # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)
+        #
+        # wf_params = workflow.set_main_workflow_parameters(datasets=datasets)
+        # # print("Inputs:")
+        # # print(wf_show["inputs"])
+        #
+        # datamap = dict()
+        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
+        # datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
+        # datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
+        # datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}
         #
         # gi.workflows.invoke_workflow(workflow_id=wf_id,
         #                              history_id=current_hi_id,
         #                              params=wf_params,
         #                              inputs=datamap)
+        # gi.workflows.delete_workflow(workflow_id=wf_id)
+        #
+        # datamap = dict()
+        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
+        # datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
         #
-        gi.workflows.delete_workflow(workflow_id=wf_id)
-
-        datamap = dict()
-        datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
-        datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
-
         wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="jbrowse")
         wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
-
-        gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
-        wf_attr = gi.workflows.get_workflows(name="jbrowse")
-        wf_id = wf_attr[0]["id"]
-        wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
-        print("Jbrowse workflow ID: " + wf_id)
-        wf_params = workflow.set_jbrowse_workflow_parameters()
-
-        gi.workflows.invoke_workflow(workflow_id=wf_id,
-                                     history_id=current_hi_id,
-                                     params=wf_params,
-                                     inputs=datamap)
-        gi.workflows.delete_workflow(workflow_id=wf_id)
+        #
+        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
+        # wf_attr = gi.workflows.get_workflows(name="jbrowse")
+        # wf_id = wf_attr[0]["id"]
+        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
+        # print("Jbrowse workflow ID: " + wf_id)
+        # wf_params = workflow.set_jbrowse_workflow_parameters()
+        #
+        # gi.workflows.invoke_workflow(workflow_id=wf_id,
+        #                              history_id=current_hi_id,
+        #                              params=wf_params,
+        #                              inputs=datamap)
+        # gi.workflows.delete_workflow(workflow_id=wf_id)
 
         # remove active instance history for testing, purge configured @ ~/config/galaxy.yml.docker_sample
         # gi.histories.delete_history(history_id=current_hi_id, purge=True)
diff --git a/workflow.py b/workflow.py
index 16fff71..90cf75f 100644
--- a/workflow.py
+++ b/workflow.py
@@ -2,6 +2,7 @@ import os
 from bioblend.galaxy import GalaxyInstance
 from toolrunner import ToolRunner
 import json
+import logging
 """
 Workflow creation for generation and visualization of data and analyses output
 """
@@ -9,6 +10,8 @@ Workflow creation for generation and visualization of data and analyses output
 
 class Workflow:
 
+    logging.basicConfig(level=logging.INFO)
+
     def __init__(self, parameters_dict, instance, history_id):
         self.history_id = history_id
         self.instance = instance
@@ -59,18 +62,22 @@ class Workflow:
         # print("Workflow file @ " + self.custom_ga_file_path)
         with open(self.preset_ga_file, 'r') as ga_in_file:
             ga_in = str(ga_in_file.readlines())
+            ga_in = ga_in.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUEID\\\\\\\\\\\\"}',
+                                  str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"')
+            ga_in = ga_in.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"',
+                                  str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"')
             ga_in = ga_in.replace("\\\\", "\\")  # to restore the correct amount of backslashes in the workflow string before import
-            ga_in = ga_in.replace('"name": "NAME"', str('"name": "' + self.genus.lower()[0] + self.species) + '"')
-            ga_in = ga_in.replace('{"unique_id": "UNIQUEID"}', str('{"unique_id": "' + self.genus + " " + self.species) + '"')
+            ga_in = ga_in.replace("\\\\\\\\\\\\", "\\\\\\")
             ga_in = ga_in.replace('http://localhost/sp/undaria_pinnatifida/feature/Undaria/pinnatifida/mRNA/{id}"',
                                   "http://localhost/sp/" + self.genus.lower()[0] + self.genus[1:] + " " + self.species + "/feature/" + self.genus + "/mRNA/{id}")
             # ga_in = ga_in.replace('"index\\\": \\\"false', '"index\\\": \\\"true')
             # workflow_name = '"name": "' + self.full + '"'
             # ga_in = ga_in.replace('"name": "preset_workflow"', '"name": "preset_workflow"')
             # print(workflow_name)
-            ga_in = ga_in[2:-2]
+            ga_in = ga_in[2:-2]  # if the line under doesn't outputs a correct json
             # ga_in = ga_in[:-2]  # if the line above doesn't outputs a correct json
             self.workflow = ga_in
+            print(ga_in)
             return ga_in
 
     def set_main_workflow_parameters(self, datasets):
@@ -98,7 +105,7 @@ class Workflow:
             org_id = str(org_output["organism_id"])  # needs to be str to be recognized by the chado tool
             global_org_id = org_id
         except IndexError:
-            print("No organism matching " + self.full + " exists in the Chado database")
+            logging.info("No organism matching " + self.full + " exists in the Chado database")
 
         ogs_analysis = toolrunner.get_ogs_analysis()
         ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"]
@@ -108,7 +115,7 @@ class Workflow:
             ogs_analysis_id = str(ogs_analysis_output["analysis_id"])  # needs to be str to be recognized by the chado tool
             global_ogs_id = ogs_analysis_id
         except IndexError:
-            print("No matching OGS analysis exists in the Chado database")
+            logging.info("No matching OGS analysis exists in the Chado database")
 
         genome_analysis = toolrunner.get_genome_analysis()
         genome_analysis_job_out = genome_analysis["outputs"][0]["id"]
@@ -118,7 +125,7 @@ class Workflow:
             genome_analysis_id = str(genome_analysis_output["analysis_id"])  # needs to be str to be recognized by the chado tool
             global_genome_id = genome_analysis_id
         except IndexError:
-            print("No matching genome analysis exists in the Chado database")
+            logging.info("No matching genome analysis exists in the Chado database")
 
         params = dict()
         params["0"] = {}
@@ -194,7 +201,7 @@ class Workflow:
         :return:
         """
         workflow_id = self.instance.workflows.get_workflows()[0]['id']
-        return print(self.instance.workflows.show_workflow(workflow_id=workflow_id))
+        return logging.info(self.instance.workflows.show_workflow(workflow_id=workflow_id))
 
     def store(self):
         """
-- 
GitLab