diff --git a/IDEAS.md b/IDEAS.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..10fc614e58570268dcf9cf2f5b84a1916b123d40 100644
--- a/IDEAS.md
+++ b/IDEAS.md
@@ -0,0 +1,13 @@
+## Project: Phaeoexplorer GGAuto
+
+### Features:
+ - Automatic import of data into the proper Galaxy instance (to use in conjunction with Docker swarm)
+ - Automatic execution of the necessary workflows
+ - Updating existing Galaxy instances with additional data (e.g. RNA-seq, Hi-C data, more genomes?)
+ - Docker swarm integration for an all-in-one deployment/integration tool?
+ - Write a .ga file (Galaxy workflow) to automatically load data and execute analyses
+
+### Requirements:
+ - Metadata file in which the user describes the data/species/etc. (see the sketch after this diff):
+   - Species and strain names
+   - Who ordered the data, who sequenced it, who assembled it, who did the annotation (if any)?
\ No newline at end of file
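
Note: the metadata file format is not pinned down anywhere in this diff. A minimal sketch of what it could look like, reusing the keys that Workflow.__init__ already reads from parameters_dict; all values are placeholders, and the provenance items from the list above (who ordered/sequenced/assembled/annotated the data) would be additional keys:

    {
        "genus": "Genus",
        "species": "species",
        "strain": "strain1",
        "sex": "male",
        "common": "common name",
        "genome version": "1.0",
        "ogs version": "1.0",
        "performed by": "person or institute"
    }
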
+ "_" + genome_version)) + hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version)) + hi_id = hi[0]["id"] + # print(hi) + li = gi.libraries.get_libraries() # only one library + # print(li) + li_id = gi.libraries.get_libraries()[0]["id"] # project data folder/library + # print(li_id) + fo_gi = gi.libraries.get_folders(library_id=li_id) # data location (project data) + # print(fo_gi) fo_id = {} current_fo_name = "" - print(fo_gi) # folders ids: access to data to run the first tools for i in fo_gi: for k, v in i.items(): if k == "name": - fo_id[k] = 0 - current_fo_name = k + fo_id[v] = 0 + current_fo_name = v if k == "id": fo_id[current_fo_name] = v + print("IDs: ") + datasets = {} + for k, v in fo_id.items(): + print("\t" + k + ": " + v) + if k == "/genome": + sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True) + # print(sub_folder_content) + for k2, v2 in sub_folder_content.items(): + # print(k2) + # print(v2) + for e in v2: + if type(e) == dict: + if e["name"].endswith(".fa"): + datasets["genome_file"] = e["ldda_id"] + print("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif k == "/annotation/" + genus_species: + sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True) + # print(sub_folder_content) + for k2, v2 in sub_folder_content.items(): + # print(k2) + # print(v2) + for e in v2: + if type(e) == dict: + if e["name"].endswith("transcripts-gff.fa"): + datasets["transcripts_file"] = e["ldda_id"] + print("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif e["name"].endswith("proteins.fasta"): + datasets["proteins_file"] = e["ldda_id"] + print("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif e["name"].endswith(".gff"): + datasets["gff_file"] = e["ldda_id"] + print("\t\t" + e["name"] + ": " + e["ldda_id"]) + + current_hi = gi.histories.get_current_history()["id"] + print("History ID:" + current_hi) + gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["genome_file"]) + gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["gff_file"]) + gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["transcripts_file"]) + gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["proteins_file"]) + # print(gi.tools.get_tool_panel()) + + # print(wf_dict) + wf_dict_2 = json.loads(wf_dict) # add to workflow class + gi.workflows.import_workflow_dict(workflow_dict=wf_dict_2) # doesn't work with eval() + + # # PRE FILLED METHOD + # wf_id = workflow.store() + # hi_id = gi.histories.get_histories()[0]["id"] + # print("Workflow id: " + wf_id) + # print("History id: " + hi_id) + # wf_show = gi.workflows.show_workflow(workflow_id=wf_id) + # # print(wf_show["inputs"]) # ->no inputs + # # workflow.port() + # li_id = gi.libraries.get_libraries()[0]["id"] + # # gi.folders.update_folder() # TODO: add method to enable data updates + # tsi = gi.toolshed.get_repositories() # tool shed repo access point + # # print(gi.users.get_users()) # TODO: users management + # # print(gi.libraries.show_library(li_id)) + # # TODO: create a NEW history, import data to galaxy from library (src_data) + # fo_gi = gi.libraries.get_folders(library_id=li_id) # data location + # fo_id = {} + # current_fo_name = "" + # # print(fo_gi) + # # folders ids: access to data to run the first tools + # for i in fo_gi: + # for k, v in i.items(): + # if k == "name": + # fo_id[v] = 0 + # current_fo_name = v + # if k == "id": + # fo_id[current_fo_name] = v + # print("Folders id: ") + 
diff --git a/main.py b/main.py
index bf999a9e80bc58347b05dbfc8846b09d95ba00b6..8a46fc14d31997fc6ae43f38a54717816a6440df 100644
--- a/main.py
+++ b/main.py
@@ -82,10 +82,10 @@ def main():
     # Connect to the galaxy instance of the current species TODO: API key connection issues
     gi = galaxy.GalaxyInstance(url=instance_url,
-                               key="291ec50b48dd1f006985b32e39bc3696",
+                               key="0e993414b2f876515e74dd890f16ffc7",
                                email="alebars@sb-roscoff.fr",
                                password="pouet",
-                               verify=True)
+                               verify=False)
 
     """
     This part creates the current species directory and goes to it
     """
@@ -117,6 +117,7 @@
         hl = gi.histories.get_histories()
     except bb.ConnectionError:
         print("Cannot connect to GGA instance @ " + instance_url)
+    else:
         print("Successfully connected to instance " + instance_url)
 
@@ -148,34 +149,108 @@
     # generate workflow file and run it in the galaxy instance
     workflow = Workflow(parameters_dict=sp_dict, instance=gi)
     print("Generating custom workflow for " + genus_species_strain)
-    workflow.generate(working_directory=wd, main_directory=main_dir)
-    wf_id = workflow.store()
-    hi_id = gi.histories.get_histories()[0]["id"]
-    print("Workflow id: " + wf_id)
-    print("History id: " + hi_id)
-    wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
-    print(wf_show["inputs"])
-
-    # workflow.port()
-    li_id = gi.libraries.get_libraries()[0]["id"]
-    # gi.folders.update_folder()  # TODO: add method to enable data updates
-    tsi = gi.toolshed.get_repositories()  # tool shed repo access point
-    # print(gi.users.get_users())  # TODO: users management
-    # print(gi.libraries.show_library(li_id))
-    fo_gi = gi.libraries.get_folders(library_id=li_id)  # data location
+    wf_dict = workflow.generate(working_directory=wd, main_directory=main_dir)
+
+    gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
+    hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version))
+    hi_id = hi[0]["id"]
+    # print(hi)
+    li = gi.libraries.get_libraries()  # only one library
+    # print(li)
+    li_id = gi.libraries.get_libraries()[0]["id"]  # project data folder/library
+    # print(li_id)
+    fo_gi = gi.libraries.get_folders(library_id=li_id)  # data location (project data)
+    # print(fo_gi)
     fo_id = {}
     current_fo_name = ""
-    print(fo_gi)
     # folders ids: access to data to run the first tools
     for i in fo_gi:
         for k, v in i.items():
             if k == "name":
-                fo_id[k] = 0
-                current_fo_name = k
+                fo_id[v] = 0
+                current_fo_name = v
             if k == "id":
                 fo_id[current_fo_name] = v
+    print("IDs: ")
+    datasets = {}
+    for k, v in fo_id.items():
+        print("\t" + k + ": " + v)
+        if k == "/genome":
+            sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
+            # print(sub_folder_content)
+            for k2, v2 in sub_folder_content.items():
+                # print(k2)
+                # print(v2)
+                for e in v2:
+                    if type(e) == dict:
+                        if e["name"].endswith(".fa"):
+                            datasets["genome_file"] = e["ldda_id"]
+                            print("\t\t" + e["name"] + ": " + e["ldda_id"])
+        elif k == "/annotation/" + genus_species:
+            sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
+            # print(sub_folder_content)
+            for k2, v2 in sub_folder_content.items():
+                # print(k2)
+                # print(v2)
+                for e in v2:
+                    if type(e) == dict:
+                        if e["name"].endswith("transcripts-gff.fa"):
+                            datasets["transcripts_file"] = e["ldda_id"]
+                            print("\t\t" + e["name"] + ": " + e["ldda_id"])
+                        elif e["name"].endswith("proteins.fasta"):
+                            datasets["proteins_file"] = e["ldda_id"]
+                            print("\t\t" + e["name"] + ": " + e["ldda_id"])
+                        elif e["name"].endswith(".gff"):
+                            datasets["gff_file"] = e["ldda_id"]
+                            print("\t\t" + e["name"] + ": " + e["ldda_id"])
+
+    current_hi = gi.histories.get_current_history()["id"]
+    print("History ID: " + current_hi)
+    gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["genome_file"])
+    gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["gff_file"])
+    gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["transcripts_file"])
+    gi.histories.upload_dataset_from_library(history_id=hi_id, lib_dataset_id=datasets["proteins_file"])
+    # print(gi.tools.get_tool_panel())
+
+    # print(wf_dict)
+    wf_dict_2 = json.loads(wf_dict)  # TODO: move this into the Workflow class
+    gi.workflows.import_workflow_dict(workflow_dict=wf_dict_2)  # doesn't work with eval()
+
+    # # PRE-FILLED METHOD
+    # wf_id = workflow.store()
+    # hi_id = gi.histories.get_histories()[0]["id"]
+    # print("Workflow id: " + wf_id)
+    # print("History id: " + hi_id)
+    # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
+    # # print(wf_show["inputs"])  # -> no inputs
+    # # workflow.port()
+    # li_id = gi.libraries.get_libraries()[0]["id"]
+    # # gi.folders.update_folder()  # TODO: add method to enable data updates
+    # tsi = gi.toolshed.get_repositories()  # tool shed repo access point
+    # # print(gi.users.get_users())  # TODO: users management
+    # # print(gi.libraries.show_library(li_id))
+    # # TODO: create a NEW history, import data to galaxy from library (src_data)
+    # fo_gi = gi.libraries.get_folders(library_id=li_id)  # data location
+    # fo_id = {}
+    # current_fo_name = ""
+    # # print(fo_gi)
+    # # folders ids: access to data to run the first tools
+    # for i in fo_gi:
+    #     for k, v in i.items():
+    #         if k == "name":
+    #             fo_id[v] = 0
+    #             current_fo_name = v
+    #         if k == "id":
+    #             fo_id[current_fo_name] = v
+    # print("Folders id: ")
+    # for k, v in fo_id.items():
+    #     print("\t" + k + ": " + v)
+    # workflow.show()
+    # # gi.workflows.run_workflow(workflow_id=wf_id)  # pre-filled workflow, use the set-at-runtime approach instead
+    # for testing, purge configured @ ~/config/galaxy.yml.docker_sample
+    # gi.histories.delete_history(history_id=hi_id, purge=True)
 
     os.chdir(main_dir)
     print("\n")
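
Note: the name/id bookkeeping in the folder loop above can be collapsed, since every entry of fo_gi is a dict that already carries both "name" and "id". The extension matching could be factored the same way; find_dataset below is a hypothetical helper, and "folder_contents" is assumed to be the key under which gi.folders.show_folder(..., contents=True) returns its entries (consistent with the e["ldda_id"] accesses above).

    # Map each library folder's name to its id in one pass
    fo_id = {folder["name"]: folder["id"] for folder in fo_gi}


    def find_dataset(gi, folder_id, suffix):
        # Return the library dataset id of the first entry whose name ends
        # with the given suffix, or None when nothing matches
        contents = gi.folders.show_folder(folder_id=folder_id, contents=True)
        for entry in contents["folder_contents"]:
            if entry["name"].endswith(suffix):
                return entry["ldda_id"]
        return None
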
"toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2", "tool_version": "2.3.2", "outputs": [{"type": "json", "name": "results"}], "workflow_outputs": [{"output_name": "results", "uuid": "8ca0b891-0f01-4787-9b7f-57105dc303b0", "label": null}], "input_connections": {}, "tool_state": "{\"comment\": \"\\\"\\\"\", \"__page__\": null, \"__rerun_remap_job_id__\": null, \"psql_target\": \"{\\\"__current_case__\\\": 0, \\\"method\\\": \\\"remote\\\"}\", \"common\": \"\\\"$COMMON\\\"\", \"genus\": \"\\\"$GENUS\\\"\", \"species\": \"\\\"$SPECIES\\\"\", \"abbr\": \"\\\"$ABBR\\\"\"}", "id": 1, "tool_shed_repository": {"owner": "gga", "changeset_revision": "0f4956cec445", "name": "chado_organism_add_organism", "tool_shed": "toolshed.g2.bx.psu.edu"}, "uuid": "24f0e175-f932-4e48-8b42-a53d9a432d5e", "errors": null, "name": "Chado organism add", "post_job_actions": {}, "label": "$ORGADD", "inputs": [], "position": {"top": 361, "left": 467.5}, "annotation": "", "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2", "type": "tool"}, "2": {"tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", "tool_version": "2.3.2", "outputs": [{"type": "json", "name": "results"}], "workflow_outputs": [{"output_name": "results", "uuid": "8fa0e728-8803-4800-93b4-70f906f95f87", "label": null}], "input_connections": {}, "tool_state": "{\"__page__\": null, \"name\": \"\\\"$GENOME\\\"\", \"sourceuri\": \"\\\"\\\"\", \"sourcename\": \"\\\"\\\"\", \"__rerun_remap_job_id__\": null, \"programversion\": \"\\\"\\\"\", \"psql_target\": \"{\\\"__current_case__\\\": 0, \\\"method\\\": \\\"remote\\\"}\", \"sourceversion\": \"\\\"\\\"\", \"program\": \"\\\"$PERFORMEDBY\\\"\", \"algorithm\": \"\\\"\\\"\", \"date_executed\": \"\\\"\\\"\", \"description\": \"\\\"\\\"\"}", "id": 2, "tool_shed_repository": {"owner": "gga", "changeset_revision": "3a1f3c9b755b", "name": "chado_analysis_add_analysis", "tool_shed": "toolshed.g2.bx.psu.edu"}, "uuid": "76cbbd55-f1ac-4e48-be3c-c7bbda5add4c", "errors": null, "name": "Chado analysis add", "post_job_actions": {}, "label": "$ADDGENOME", "inputs": [], "position": {"top": 307, "left": 690}, "annotation": "", "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", "type": "tool"}, "3": {"tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", "tool_version": "2.3.2", "outputs": [{"type": "json", "name": "results"}], "workflow_outputs": [{"output_name": "results", "uuid": "5e7da027-0723-4077-8885-2dbe51cb5dda", "label": null}], "input_connections": {}, "tool_state": "{\"__page__\": null, \"name\": \"\\\"$OGS\\\"\", \"sourceuri\": \"\\\"\\\"\", \"sourcename\": \"\\\"\\\"\", \"__rerun_remap_job_id__\": null, \"programversion\": \"\\\"\\\"\", \"psql_target\": \"{\\\"__current_case__\\\": 0, \\\"method\\\": \\\"remote\\\"}\", \"sourceversion\": \"\\\"\\\"\", \"program\": \"\\\"$PERFORMEDBY\\\"\", \"algorithm\": \"\\\"\\\"\", \"date_executed\": \"\\\"\\\"\", \"description\": \"\\\"\\\"\"}", "id": 3, "tool_shed_repository": {"owner": "gga", "changeset_revision": "3a1f3c9b755b", "name": "chado_analysis_add_analysis", "tool_shed": "toolshed.g2.bx.psu.edu"}, "uuid": "4d1ffee4-00b2-445d-b630-b7b774c17873", "errors": null, "name": "Chado analysis add", "post_job_actions": {}, "label": "$ADDOGS", "inputs": [], "position": {"top": 395, "left": 697}, "annotation": "", "content_id": 
"toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", "type": "tool"}, "4": {"tool_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_feature_load_fasta/feature_load_fasta/2.3.2", "tool_version": "2.3.2", "outputs": [{"type": "json", "name": "results"}], "workflow_outputs": [{"output_name": "results", "uuid": "737dddc9-ae1b-463d-99fa-d9176053594d", "label": null}], "input_connections": {}, "tool_state": "{\"do_update\": \"\\\"false\\\"\", \"relationships\": \"{\\\"__current_case__\\\": 0, \\\"rel_type\\\": \\\"none\\\"}\", \"ext_db\": \"{\\\"db\\\": \\\"\\\", \\\"re_db_accession\\\": \\\"\\\"}\", \"analysis_id\": \"\\\"4\\\"\", \"re_uniquename\": \"\\\"\\\"\", \"match_on_name\": \"\\\"false\\\"\", \"__page__\": null, \"__rerun_remap_job_id__\": null, \"psql_target\": \"{\\\"__current_case__\\\": 0, \\\"method\\\": \\\"remote\\\"}\", \"re_name\": \"\\\"\\\"\", \"fasta\": \"{\\\"__class__\\\": \\\"RuntimeValue\\\"}\", \"wait_for\": \"{\\\"__class__\\\": \\\"RuntimeValue\\\"}\", \"organism\": \"\\\"2\\\"\", \"sequence_type\": \"\\\"contig\\\"\"}", "id": 4, "tool_shed_repository": {"owner": "gga", "changeset_revision": "1421dbc33a92", "name": "chado_feature_load_fasta", "tool_shed": "toolshed.g2.bx.psu.edu"}, "uuid": "3d417ced-fc48-4c04-8a92-fdb7b9fecafc", "errors": null, "name": "Chado load fasta", "post_job_actions": {}, "label": "$LOADFASTA", "inputs": [{"name": "fasta", "description": "runtime parameter for tool Chado load fasta"}, {"name": "wait_for", "description": "runtime parameter for tool Chado load fasta"}], "position": {"top": 306, "left": 933.5}, "annotation": "", "content_id": "toolshed.g2.bx.psu.edu/repos/gga/chado_feature_load_fasta/feature_load_fasta/2.3.2", "type": "tool"}}, "annotation": "", "a_galaxy_workflow": "true"}"' + + os.chdir(path=working_directory) + self.preset_ga_file = main_directory + "preset_workflow.ga" + if self.strain != "": + self.custom_ga_file = "_".join([self.genus, self.species, self.strain]) + "_workflow.ga" + self.custom_ga_file_path = os.path.abspath(self.custom_ga_file) + print("Workflow file @ " + self.custom_ga_file_path) + else: + self.custom_ga_file = "_".join([self.genus, self.species]) + "_workflow.ga" + self.custom_ga_file_path = os.path.abspath(self.custom_ga_file) + print("Workflow file @ " + self.custom_ga_file_path) + with open(self.preset_ga_file, 'r') as ga_in_file: + ga_in = str(ga_in_file.readlines()) + ga_in = ga_in.replace("$OGS", "OGS") + ga_in = ga_in.replace("$VERSION", self.ogs_version) + ga_in = ga_in.replace("$GENUS", self.genus) + ga_in = ga_in.replace("$SPECIES", self.species) + ga_in = ga_in.replace("$ABBR", self.abbr) + ga_in = ga_in.replace("$STRAIN", self.strain) + ga_in = ga_in.replace("$PERFORMEDBY", self.performed) + ga_in = ga_in.replace("$COMMON", self.common) + ga_in = ga_in.replace("$ORGA", self.full) + ga_in = ga_in.replace("$ADDAN", "Add analysis") + ga_in = ga_in.replace("\\\\", "\\") # restore the correct amount of backslashes in the ga file + workflow_name = '"name": "' + self.full + '"' + ga_in = ga_in.replace('"name": "preset"', workflow_name) + ga_in = ga_in[2:] + ga_in = ga_in[:-4] + self.workflow = ga_in + return ga_in + + def dict_port(self): + """ + + :return: + """ + + try: + self.instance.workflows.import_workflow_dict(workflow_dict=self.workflow) + except ConnectionError: + return False + return True + + def port(self): + """ + Import workflow into a galaxy instance + Importing from string doesnt work (MUST be dict) -> TODO: handle dict import + :return: + """ + try: 
+            self.instance.workflows.import_workflow_from_local_path(self.custom_ga_file_path)
+        except ConnectionError:
+            return False
+        else:
+            return True
+
+    def show(self):
+        """
+        Print the instance's main workflow to stdout (dict form) and return it
+
+        :return: the workflow dict
+        """
+        workflow_id = self.instance.workflows.get_workflows()[0]['id']
+        workflow = self.instance.workflows.show_workflow(workflow_id=workflow_id)
+        print(workflow)
+        return workflow
+
+    def store(self):
+        """
+        Return the id of the workflow stored in the instance
+
+        :return: the workflow id
+        """
+        workflow_id = self.instance.workflows.get_workflows()[0]['id']
+        return workflow_id
+
+    def delete(self):
+        """
+        TODO: delete the workflow from the instance
+
+        :return:
+        """
+        return None
+
+    def run(self):
+        """
+        Run the custom workflow in a Galaxy instance
+        Input datasets are given as a list
+        TODO: still a stub; the datamap below is built but never submitted (see the sketch after this diff)
+
+        :return:
+        """
+        wf_id = self.store()
+        datamap = {"genus": self.genus, "species": self.species, "strain": self.strain, "abbr": self.abbr,
+                   "full": self.full, "common": self.common, "ogs_version": self.ogs_version,
+                   "genome_version": self.genome_version, "sex": self.sex, "performed": self.performed}
+        return None
+
+    # def add_step(self, step_position, description, name):
+    #     """
+    #     TODO: add a step to the workflow (data loading into chado for example)
+    #
+    #     :param workflow:
+    #     :return:
+    #     """
+    #     return None
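
Note: run() above builds a datamap but never submits it. A sketch of the "set at runtime" approach the main.py comments point to; invoke_workflow, its inputs= mapping and the "ldda" source are real bioblend/Galaxy API, while the input indices and dataset keys are placeholders tied to the generated .ga file.

    from bioblend.galaxy import GalaxyInstance


    def run_workflow(instance, history_id, datasets):
        # Invoke the first stored workflow, feeding it library datasets;
        # "0"/"1" are the workflow's input indices and depend on how the
        # generated .ga file declares its inputs
        wf_id = instance.workflows.get_workflows()[0]["id"]
        inputs = {"0": {"id": datasets["genome_file"], "src": "ldda"},
                  "1": {"id": datasets["gff_file"], "src": "ldda"}}
        return instance.workflows.invoke_workflow(workflow_id=wf_id,
                                                  inputs=inputs,
                                                  history_id=history_id)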