From a9daa4b9acd094b1d9e902feb3fff7e7cf4bd731 Mon Sep 17 00:00:00 2001
From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr>
Date: Mon, 9 Mar 2020 09:31:19 +0100
Subject: [PATCH] .ga correct editing for jbrowse workflow

---
 main.py     | 208 +++++++++++++++++++++++++++++++---------------------
 workflow.py |  21 ++++--
 2 files changed, 139 insertions(+), 90 deletions(-)

diff --git a/main.py b/main.py
index 7b06b6c..fa23d79 100644
--- a/main.py
+++ b/main.py
@@ -5,9 +5,9 @@ import argparse
 import os
 import sys
 import subprocess
+import logging
+import re
 import json
-import urllib3 as ul
-from chado import ChadoInstance
 from workflow import Workflow
 from toolrunner import ToolRunner
 """
@@ -17,22 +17,12 @@
 python3 ~/PycharmProjects/ggauto/gga_load_data/main.py ~/PycharmProjects/ggauto/
 """
 
 
-class Autoload:
-    """
-    TODO: turn main into an object
-    """
-    def __init__(self, json_in):
-        self.json_in = json_in
-
-    def main(self):
-        return None
-
-
 def main():
     parser = argparse.ArgumentParser(description="Input genus, species, strain, version")
     parser.add_argument("json", type=str, help="Input JSON file")
     parser.add_argument("--just-load", help="Only load data into galaxy, does not create nor run analyses in galaxy")
+    parser.add_argument("-v", "--verbose", help="Increase output verbosity")
 
     # CLI stuff
     # parser.add_argument("--name", help="Sample species name, format: genus-species",type=str)
@@ -42,6 +32,8 @@ def main():
 
     user_arguments = parser.parse_args()
 
+    logging.basicConfig(level=logging.INFO)
+
     # List that will hold all dicts from the JSON input file, containing parameters for each species
     sp_dict_list = []
 
@@ -81,17 +73,20 @@ def main():
         # Test adress, change to abims-gga.sb-roscoff.fr/sp/ in production
         instance_url = "http://localhost/sp/" + genus_lower + "_" + species + "/galaxy/"
 
-        print("Species: " + genus + " " + species + " (" + common + ")"
-              "\nStrain: " + strain +
-              "\nAccessing instance " + instance_url)
+        print("Species: " + genus + " " + species + " (" + common + ")" +
+              "\nStrain: " + strain +
+              "\nAccessing instance " + instance_url)
 
-        # Connect to the galaxy instance of the current species TODO: API key connection issues
+        # Connect to the galaxy instance of the current species TODO: connection issues (galaxy side)
         gi = galaxy.GalaxyInstance(url=instance_url,
                                    key="3b36455cb16b4d0e4348e2c42f4bb934",
                                    email="alebars@sb-roscoff.fr",
                                    password="pouet",
                                    verify=True)
+        # admin_email = os.environ.get('GALAXY_DEFAULT_ADMIN_USER', 'admin@galaxy.org')
+        # admin_pass = os.environ.get('GALAXY_DEFAULT_ADMIN_PASSWORD', 'admin')
+
 
         """
         This part creates the current species directory and go to it
         If it already exists, just move to it
@@ -118,22 +113,62 @@ def main():
         #                            password="****")
 
         # Check connection to the current instance
+        print("Testing connection to the galaxy instance")
         try:
             hl = gi.histories.get_histories()
         except bb.ConnectionError:
-            print("Cannot connect to GGA instance @ " + instance_url)
+            print("Cannot connect to galaxy instance @ " + instance_url)
         else:
-            print("Successfully connected to instance " + instance_url)
+            print("Successfully connected to galaxy instance @ " + instance_url)
 
-        # TODO: FTP stuff to retrieve the datasets (used in testing, not needed for production)
+        # TODO: FTP/symlink stuff to retrieve the datasets + change headers in pep.fasta
+        setup_data_libraries_cl = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
 
         # try:
         #     os.mkdir("./src_data")
         # except FileExistsError:
         #     print("src_data folder already exists for " + genus_species_strain)
+        #     print("Loading data into galaxy...")
+        #     try:
+        #         setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
+        #         print("Output from setup_data_libraries.py")
+        #         print(setup_data_libraries.communicate())
+        #     except bb.ConnectionError:
+        #         print("Cannot load data into container for " + genus_species_strain)
+        #         break
+        #     else:
+        #         print("Data successfully loaded into docker container for " + genus_species_strain)
         # else:
         #     print("src_data folder created for " + genus_species_strain)
-        #
+        #     try:
+        #         setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
+        #         print("Output from setup_data_libraries.py")
+        #         print(setup_data_libraries.communicate())
+        #     except bb.ConnectionError:
+        #         print("Cannot load data into container for " + genus_species_strain)
+        #         break
+        #     else:
+        #         print("Data successfully loaded into docker container for " + genus_species_strain)
+
+        genome_dir, annotation_dir = None, None
+        for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
+            if "annotation/" in d:
+                annotation_dir = d
+                annotation_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
+                print("src_data annotation file(s):")
+                print(str('\t' + file) for file in annotation_dir_files)
+            elif "genome/" in d:
+                genome_dir = d
+                genome_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
+                print("src_data genome file(s):")
+                print(str('\t' + file) for file in genome_dir_files)
+
+
+
+        modify_pep_headers = ["sh /usr/local/genome2/mmo/scripts/phaeoexplorer/phaeoexplorer-change_pep_fasta_header.sh"]
+
+
         # TODO: load the data into the current species directory and load it into galaxy instance
         # setup_data_libraries_cl = \
         #     "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
@@ -150,7 +185,7 @@ def main():
 
         # generate workflow file and run it in the galaxy instance
-        gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
+        # gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
         hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version))
         hi_id = hi[0]["id"]
         li = gi.libraries.get_libraries()  # only one library
 
@@ -186,7 +221,7 @@ def main():
         for k2, v2 in sub_folder_content.items():
             for e in v2:
                 if type(e) == dict:
-                    # TODO: manage several files of the same type
+                    # TODO: manage several files of the same type and versions
                     if e["name"].endswith("transcripts-gff.fa"):
                         datasets["transcripts_file"] = e["ldda_id"]
                         print("\t\t" + e["name"] + ": " + e["ldda_id"])
@@ -196,6 +231,9 @@ def main():
                     elif e["name"].endswith(".gff"):
                         datasets["gff_file"] = e["ldda_id"]
                         print("\t\t" + e["name"] + ": " + e["ldda_id"])
+                    elif e["name"].endswith("MALE"):
+                        datasets["gff_file"] = e["ldda_id"]
+                        print("\t\t" + e["name"] + ": " + e["ldda_id"])
 
         current_hi_id = gi.histories.get_current_history()["id"]
         print("History ID: " + current_hi_id)
@@ -204,76 +242,80 @@ def main():
         gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["transcripts_file"])
         gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["proteins_file"])
 
-        # Delete Homo sapiens from Chado database
-        toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
-        sapiens_id = None
-        sapiens = toolrunner.get_sapiens_id()
-        sapiens_job_out = sapiens["outputs"][0]["id"]
-        sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
-        try:
-            sapiens_output = json.loads(sapiens_json_output)[0]
-            sapiens_id = str(sapiens_output["organism_id"])  # needs to be str to be recognized by the chado tool
-            toolrunner.delete_sapiens(hs_id=sapiens_id)
-        except bb.ConnectionError:
-            print("Homo sapiens isn't in the database")
+        # ---------------------------------------------------------------------
+        # Galaxy instance interaction
+        # ---------------------------------------------------------------------
 
-        # Workflow generation
+        # # Delete Homo sapiens from Chado database
+        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
+        # sapiens = toolrunner.get_sapiens_id()
+        # sapiens_job_out = sapiens["outputs"][0]["id"]
+        # sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
+        # try:
+        #     sapiens_output = json.loads(sapiens_json_output)[0]
+        #     sapiens_id = str(sapiens_output["organism_id"])  # needs to be str to be recognized by the chado tool
+        #     toolrunner.delete_sapiens(hs_id=sapiens_id)
+        # except bb.ConnectionError:
+        #     print("Homo sapiens isn't in the database")
+        # except IndexError:
+        #     pass
+        #
+        # # Workflow generation
         workflow = Workflow(parameters_dict=sp_dict, instance=gi, history_id = current_hi_id)
-        wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
-
-        tools = gi.tools.get_tool_panel()  # tools panel -> alternative to wf
-        # print(tools)
-
-        wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
-
-        gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
-        wf_name = workflow.get_workflow_name()
-        wf_attr = gi.workflows.get_workflows(name=wf_name)
-        wf_id = wf_attr[0]["id"]
-        wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
-        print("Workflow ID: " + wf_id)
-
-        toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
-        # toolrunner.purge_organisms()
-
-        # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)
-
-        wf_params = workflow.set_main_workflow_parameters(datasets=datasets)
-        print("Inputs:")
-        print(wf_show["inputs"])
-
-        datamap = dict()
-        datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
-        datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
-        datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
-        datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}
+        # wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
+        #
+        # tools = gi.tools.get_tool_panel()  # tools panel -> alternative to wf
+        # # print(tools)
+        #
+        # wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
+        #
+        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
+        # wf_name = workflow.get_workflow_name()
+        # wf_attr = gi.workflows.get_workflows(name=wf_name)
+        # wf_id = wf_attr[0]["id"]
+        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
+        # print("Workflow ID: " + wf_id)
+        #
+        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
+        # # toolrunner.purge_organisms()
+        #
+        # # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)
+        #
+        # wf_params = workflow.set_main_workflow_parameters(datasets=datasets)
+        # # print("Inputs:")
+        # # print(wf_show["inputs"])
+        #
+        # datamap = dict()
+        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
+        # datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
+        # datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
+        # datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}
         #
         # gi.workflows.invoke_workflow(workflow_id=wf_id,
         #                              history_id=current_hi_id,
         #                              params=wf_params,
         #                              inputs=datamap)
+        # gi.workflows.delete_workflow(workflow_id=wf_id)
+        #
+        # datamap = dict()
+        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
+        # datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
         #
-        gi.workflows.delete_workflow(workflow_id=wf_id)
-
-        datamap = dict()
-        datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
-        datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
-
         wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="jbrowse")
         wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
-
-        gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
-        wf_attr = gi.workflows.get_workflows(name="jbrowse")
-        wf_id = wf_attr[0]["id"]
-        wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
-        print("Jbrowse workflow ID: " + wf_id)
-        wf_params = workflow.set_jbrowse_workflow_parameters()
-
-        gi.workflows.invoke_workflow(workflow_id=wf_id,
-                                     history_id=current_hi_id,
-                                     params=wf_params,
-                                     inputs=datamap)
-        gi.workflows.delete_workflow(workflow_id=wf_id)
+        #
+        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
+        # wf_attr = gi.workflows.get_workflows(name="jbrowse")
+        # wf_id = wf_attr[0]["id"]
+        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
+        # print("Jbrowse workflow ID: " + wf_id)
+        # wf_params = workflow.set_jbrowse_workflow_parameters()
+        #
+        # gi.workflows.invoke_workflow(workflow_id=wf_id,
+        #                              history_id=current_hi_id,
+        #                              params=wf_params,
+        #                              inputs=datamap)
+        # gi.workflows.delete_workflow(workflow_id=wf_id)
 
         # remove active instance history for testing, purge configured @ ~/config/galaxy.yml.docker_sample
         # gi.histories.delete_history(history_id=current_hi_id, purge=True)
diff --git a/workflow.py b/workflow.py
index 16fff71..90cf75f 100644
--- a/workflow.py
+++ b/workflow.py
@@ -2,6 +2,7 @@ import os
 from bioblend.galaxy import GalaxyInstance
 from toolrunner import ToolRunner
 import json
+import logging
 """
 Workflow creation for generation and visualization of data and analyses output
 """
@@ -9,6 +10,8 @@ Workflow creation for generation and visualization of data and analyses output
 
 class Workflow:
 
+    logging.basicConfig(level=logging.INFO)
+
     def __init__(self, parameters_dict, instance, history_id):
         self.history_id = history_id
         self.instance = instance
@@ -59,18 +62,22 @@ class Workflow:
         # print("Workflow file @ " + self.custom_ga_file_path)
         with open(self.preset_ga_file, 'r') as ga_in_file:
             ga_in = str(ga_in_file.readlines())
+            ga_in = ga_in.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUEID\\\\\\\\\\\\"}',
+                                  str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"')
+            ga_in = ga_in.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"',
+                                  str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"')
             ga_in = ga_in.replace("\\\\", "\\")  # to restore the correct amount of backslashes in the workflow string before import
-            ga_in = ga_in.replace('"name": "NAME"', str('"name": "' + self.genus.lower()[0] + self.species) + '"')
-            ga_in = ga_in.replace('{"unique_id": "UNIQUEID"}', str('{"unique_id": "' + self.genus + " " + self.species) + '"')
+            ga_in = ga_in.replace("\\\\\\\\\\\\", "\\\\\\")
             ga_in = ga_in.replace('http://localhost/sp/undaria_pinnatifida/feature/Undaria/pinnatifida/mRNA/{id}"',
                                   "http://localhost/sp/" + self.genus.lower()[0] + self.genus[1:] + " " + self.species + "/feature/" + self.genus + "/mRNA/{id}")
             # ga_in = ga_in.replace('"index\\\": \\\"false', '"index\\\": \\\"true')
             # workflow_name = '"name": "' + self.full + '"'
             # ga_in = ga_in.replace('"name": "preset_workflow"', '"name": "preset_workflow"')
             # print(workflow_name)
-            ga_in = ga_in[2:-2]
+            ga_in = ga_in[2:-2]  # if the line under doesn't outputs a correct json
             # ga_in = ga_in[:-2]  # if the line above doesn't outputs a correct json
             self.workflow = ga_in
+            print(ga_in)
             return ga_in
 
     def set_main_workflow_parameters(self, datasets):
@@ -98,7 +105,7 @@ class Workflow:
             org_id = str(org_output["organism_id"])  # needs to be str to be recognized by the chado tool
             global_org_id = org_id
         except IndexError:
-            print("No organism matching " + self.full + " exists in the Chado database")
+            logging.info("No organism matching " + self.full + " exists in the Chado database")
 
         ogs_analysis = toolrunner.get_ogs_analysis()
         ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"]
@@ -108,7 +115,7 @@ class Workflow:
             ogs_analysis_id = str(ogs_analysis_output["analysis_id"])  # needs to be str to be recognized by the chado tool
             global_ogs_id = ogs_analysis_id
         except IndexError:
-            print("No matching OGS analysis exists in the Chado database")
+            logging.info("No matching OGS analysis exists in the Chado database")
 
         genome_analysis = toolrunner.get_genome_analysis()
         genome_analysis_job_out = genome_analysis["outputs"][0]["id"]
@@ -118,7 +125,7 @@ class Workflow:
             genome_analysis_id = str(genome_analysis_output["analysis_id"])  # needs to be str to be recognized by the chado tool
             global_genome_id = genome_analysis_id
         except IndexError:
-            print("No matching genome analysis exists in the Chado database")
+            logging.info("No matching genome analysis exists in the Chado database")
 
         params = dict()
         params["0"] = {}
@@ -194,7 +201,7 @@ class Workflow:
         :return:
         """
         workflow_id = self.instance.workflows.get_workflows()[0]['id']
-        return print(self.instance.workflows.show_workflow(workflow_id=workflow_id))
+        return logging.info(self.instance.workflows.show_workflow(workflow_id=workflow_id))
 
     def store(self):
         """
-- 
GitLab