From 48cdd5a77e1a40cc23c40f931f49743475338d18 Mon Sep 17 00:00:00 2001
From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr>
Date: Thu, 9 Apr 2020 16:41:08 +0200
Subject: [PATCH] Delete main.py

---
 main.py | 352 --------------------------------------------------------
 1 file changed, 352 deletions(-)
 delete mode 100644 main.py

diff --git a/main.py b/main.py
deleted file mode 100644
index faf1140..0000000
--- a/main.py
+++ /dev/null
@@ -1,352 +0,0 @@
-from bioblend import galaxy
-import bioblend.galaxy.objects as bbo
-import bioblend as bb
-import argparse
-import os
-import sys
-import subprocess
-import logging
-import re
-import json
-from workflow import Workflow
-from toolrunner import ToolRunner
-"""
-TODO: script description
-python3 ~/PycharmProjects/ggauto/gga_load_data/main.py ~/PycharmProjects/ggauto/gga_load_data/dataloader.json
-
-"""
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description="Input genus, species, strain, version")
-    parser.add_argument("json", type=str, help="Input JSON file")
-    parser.add_argument("--just-load", help="Only load data into galaxy, does not create nor run analyses in galaxy")
-    parser.add_argument("-v", "--verbose", help="Increase output verbosity")
-
-    # CLI stuff
-    # parser.add_argument("--name", help="Sample species name, format: genus-species",type=str)
-    # parser.add_argument("--strain", help="Strain of the sample species", type=str)
-    # parser.add_argument("--version", help="Data version (e.g 1.0, 1.2, ...)", type=str)
-    # parser.add_argument("--common-name", help="Vernacular/common name of the species",type=str)
-
-    user_arguments = parser.parse_args()
-
-    logging.basicConfig(level=logging.INFO)
-
-    # List that will hold all dicts from the JSON input file, containing parameters for each species
-    sp_dict_list = []
-
-    # JSON parsing and loading
-    with open(user_arguments.json, 'r') as infile:
-        json_sp_dict = json.load(infile)
-        json_sp_dump = json.dumps(json_sp_dict, indent=4, sort_keys=True)
-        for json_sp in json_sp_dict:
-            sp_dict_list.append(json_sp)
-
-    # Get variables from the current species dict
-    for sp_dict in sp_dict_list:
-        sp_params_list = []
-        genus = sp_dict["genus"]
-        genus_lower = genus[0].lower() + genus[1:]
-        species = sp_dict["species"]
-        genus_species = genus_lower + "_" + species
-        common = sp_dict["common"]
-        strain = sp_dict["strain"]
-        sex = sp_dict["sex"]
-        if strain != "":
-            genus_species_strain = genus_species + "_" + strain
-        else:
-            genus_species_strain = genus_species
-        ogs_version = sp_dict["ogs version"]
-        genome_version = sp_dict["genome version"]
-        performed_by = sp_dict["performed by"]
-
-        # CLI stuff
-        # genus = user_arguments.name.split('-')[0]
-        # genus_lower = genus[0].lower().genus[1:]
-        # genus_upper = genus[0].upper() + genus[1:]
-        # species = user_arguments.name.split('-')[1]
-        # strain = user_arguments.strain
-        # vernacular = user_arguments.common_name
-
-        # TODO: prompt y/n asking for the validity of info
-        # Test adress, change to abims-gga.sb-roscoff.fr/sp/ in production
-        instance_url = "http://localhost/sp/" + genus_lower + "_" + species + "/galaxy/"
-
-        print("Species: " + genus + " " + species + " (" + common + ")" +
-              "\nStrain: " + strain +
-              "\nAccessing instance " + instance_url)
-
-        # Connect to the galaxy instance of the current species TODO: connection issues (galaxy side)
-        gi = galaxy.GalaxyInstance(url=instance_url,
-                                   key="3b36455cb16b4d0e4348e2c42f4bb934",
-                                   email="alebars@sb-roscoff.fr",
-                                   password="pouet",
-                                   verify=True)
-
-        # admin_email = os.environ.get('GALAXY_DEFAULT_ADMIN_USER', 'admin@galaxy.org')
-        # admin_pass = os.environ.get('GALAXY_DEFAULT_ADMIN_PASSWORD', 'admin')
-
-        """
-        This part creates the current species directory and go to it
-        If it already exists, just move to it
-        To be expanded when docker-swarm is implemented (or all config files are ready), not useful for now
-        """
-        main_dir = os.getcwd() + "/"
-        sp_dir = os.path.join(main_dir, genus_species) + "/"
-
-        try:
-            os.mkdir(sp_dir)
-        except FileExistsError:
-            print("Directory " + sp_dir + " already exists")
-        try:
-            os.chdir(sp_dir)
-            wd = os.getcwd()
-        except OSError:
-            print("Cannot access " + sp_dir + ", run with higher privileges")
-            break
-
-        # Production instance example TODO: secure pswd and API key + manage API keys
-        # gi = galaxy.GalaxyInstance(url="http://abims-gga.sb-roscoff.fr/sp/ectocarpus_species1/galaxy/",
-        #                            key="84dfbee3c0efa9155518f01fbeff57c8",
-        #                            email="gga@sb-roscoff.fr",
-        #                            password="****")
-
-        # Check connection to the current instance
-        print("Testing connection to the galaxy instance")
-        try:
-            hl = gi.histories.get_histories()
-        except bb.ConnectionError:
-            print("Cannot connect to galaxy instance @ " + instance_url)
-
-        else:
-            print("Successfully connected to galaxy instance @ " + instance_url)
-
-        # TODO: FTP/symlink stuff to retrieve the datasets + change headers in pep.fasta
-
-        # ---------------------------------------------------------------------
-        # src_data directory tree creation
-        # ---------------------------------------------------------------------
-
-        src_data_folders = ["annotation", "genome"]
-        species_folder_name = "_".join([genus_lower, species, strain, sex])
-        try:
-            os.mkdir("./src_data")
-            os.mkdir("./src_data/annotation")
-            os.mkdir("./src_data/genome")
-            os.mkdir("./src_data/annotation/" + species_folder_name)
-            os.mkdir("./src_data/genome/" + species_folder_name)
-        except FileExistsError:
-            print("src_data directory tree already exists")
-            pass
-        except PermissionError:
-            print("Insufficient permission to create src_data directory tree")
-
-        # ---------------------------------------------------------------------
-        # Data import into galaxy
-        # ---------------------------------------------------------------------
-
-        source_files = dict()
-        annotation_dir, genome_dir = None, None
-        for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
-            if "annotation/" in d:
-                annotation_dir = d
-                for f in os.listdir(d):
-                    if f.endswith("proteins.fasta"):
-                        source_files["proteins_file"] = os.path.join(d, f)
-                    elif f.endswith("transcripts-gff.fa"):
-                        source_files["transcripts_file"] = os.path.join(d, f)
-                    elif f.endswith(".gff"):
-                        source_files["gff_file"] = os.path.join(d, f)
-                # annotation_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
-            elif "genome/" in d:
-                genome_dir = d
-                for f in os.listdir(d):
-                    if f.endswith(".fa"):
-                        source_files["genome_file"] = os.path.join(d, f)
-                # genome_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
-        print("Source files found:")
-        for k, v in source_files.items():
-            print("\t" + k + "\t" + v)
-
-        # Changing headers in the *proteins.fasta file from >mRNA* to >protein*
-        # production version
-        modify_pep_headers = ["/usr/local/genome2/mmo/scripts/phaeoexplorer/phaeoexplorer-change_pep_fasta_header.sh",
-                              source_files["proteins_file"]]
-        # test version
-        modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh",
-                              source_files["proteins_file"]]
-        print("Changing fasta headers in " + source_files["proteins_file"])
-        subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir)
-
-        # src_data cleaning
-        if os.path.exists(annotation_dir + "outfile"):
-            subprocess.run(["mv", annotation_dir + "/outfile", source_files["proteins_file"]],
-                           stdout=subprocess.PIPE,
-                           cwd=annotation_dir)
-        if os.path.exists(annotation_dir + "gmon.out"):
-            subprocess.run(["rm", annotation_dir + "/gmon.out"],
-                           stdout=subprocess.PIPE,
-                           cwd=annotation_dir)
-
-        # TODO: load the data into the current species directory and load it into galaxy instance
-        setup_data_libraries = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
-        try:
-            print("Loading data into the galaxy container")
-            subprocess.run(setup_data_libraries,
-                           stdout=subprocess.PIPE,
-                           shell=True)
-        except subprocess.CalledProcessError:
-            print("Cannot load data into container for " + genus_species_strain)
-            break
-        else:
-            print("Data successfully loaded into docker container for " + genus_species_strain)
-
-        # generate workflow file and run it in the galaxy instance
-
-        # gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
-        hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version))
-        hi_id = hi[0]["id"]
-        li = gi.libraries.get_libraries()  # only one library
-        li_id = gi.libraries.get_libraries()[0]["id"]  # project data folder/library
-        fo_gi = gi.libraries.get_folders(library_id=li_id)  # data location (project data)
-
-        fo_id = {}
-        current_fo_name = ""
-        # folders ids: access to data to run the first tools
-        for i in fo_gi:
-            for k, v in i.items():
-                if k == "name":
-                    fo_id[v] = 0
-                    current_fo_name = v
-                if k == "id":
-                    fo_id[current_fo_name] = v
-        print("Folders and datasets IDs: ")
-        datasets = dict()
-        for k, v in fo_id.items():
-            print("\t" + k + ": " + v)
-            if k == "/genome":
-                sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
-                for k2, v2 in sub_folder_content.items():
-                    for e in v2:
-                        if type(e) == dict:
-                            if e["name"].endswith(".fa"):
-                                datasets["genome_file"] = e["ldda_id"]
-                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
-            elif k == "/annotation/" + genus_species:
-                sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
-                for k2, v2 in sub_folder_content.items():
-                    for e in v2:
-                        if type(e) == dict:
-                            # TODO: manage several files of the same type and versions
-                            if e["name"].endswith("transcripts-gff.fa"):
-                                datasets["transcripts_file"] = e["ldda_id"]
-                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
-                            elif e["name"].endswith("proteins.fasta"):
-                                datasets["proteins_file"] = e["ldda_id"]
-                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
-                            elif e["name"].endswith(".gff"):
-                                datasets["gff_file"] = e["ldda_id"]
-                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
-                            elif e["name"].endswith("MALE"):
-                                datasets["gff_file"] = e["ldda_id"]
-                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
-
-        current_hi_id = gi.histories.get_current_history()["id"]
-        print("History ID: " + current_hi_id)
-        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["genome_file"])
-        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["gff_file"])
-        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["transcripts_file"])
-        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["proteins_file"])
-
-        toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
-        # toolrunner.show_pannel()  # show tools pannel (with tool_id and versions)
-
-        # ---------------------------------------------------------------------
-        # Galaxy instance interaction
-        # ---------------------------------------------------------------------
-
-        # # Delete Homo sapiens from Chado database
-        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
-        # sapiens = toolrunner.get_sapiens_id()
-        # sapiens_job_out = sapiens["outputs"][0]["id"]
-        # sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
-        # try:
-        #     sapiens_output = json.loads(sapiens_json_output)[0]
-        #     sapiens_id = str(sapiens_output["organism_id"])  # needs to be str to be recognized by the chado tool
-        #     toolrunner.delete_sapiens(hs_id=sapiens_id)
-        # except bb.ConnectionError:
-        #     print("Homo sapiens isn't in the database")
-        # except IndexError:
-        #     pass
-        #
-        # # Workflow generation
-        workflow = Workflow(parameters_dict=sp_dict, instance=gi, history_id = current_hi_id)
-        # wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
-        #
-        # tools = gi.tools.get_tool_panel()  # tools panel -> alternative to wf
-        # # print(tools)
-        #
-        # wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
-        #
-        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
-        # wf_name = workflow.get_workflow_name()
-        # wf_attr = gi.workflows.get_workflows(name=wf_name)
-        # wf_id = wf_attr[0]["id"]
-        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
-        # print("Workflow ID: " + wf_id)
-        #
-        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
-        # # toolrunner.purge_organisms()
-        #
-        # # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)
-        #
-        # wf_params = workflow.set_main_workflow_parameters(datasets=datasets)
-        # # print("Inputs:")
-        # # print(wf_show["inputs"])
-        #
-        # datamap = dict()
-        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
-        # datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
-        # datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
-        # datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}
-        #
-        # gi.workflows.invoke_workflow(workflow_id=wf_id,
-        #                              history_id=current_hi_id,
-        #                              params=wf_params,
-        #                              inputs=datamap)
-        # gi.workflows.delete_workflow(workflow_id=wf_id)
-        #
-        # datamap = dict()
-        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
-        # datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
-        #
-        wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="main")
-        wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
-        #
-        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
-        # wf_attr = gi.workflows.get_workflows(name="jbrowse")
-        # wf_id = wf_attr[0]["id"]
-        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
-        # print("Jbrowse workflow ID: " + wf_id)
-        # wf_params = workflow.set_jbrowse_workflow_parameters()
-        #
-        # allow_tool_state_correction makes galaxy fill missing tool states,
-        # because workflow was edited outside of galaxy with only some inputs (precaution parameter)
-        # gi.workflows.invoke_workflow(workflow_id=wf_id,
-        #                              history_id=current_hi_id,
-        #                              params=wf_params,
-        #                              inputs=datamap,
-        #                              allow_tool_state_corrections=True)
-        # gi.workflows.delete_workflow(workflow_id=wf_id)
-
-        # remove active instance history for testing, purge configured @ ~/config/galaxy.yml.docker_sample
-        # gi.histories.delete_history(history_id=current_hi_id, purge=True)
-
-        os.chdir(main_dir)
-        print("\n")
-
-
-if __name__ == "__main__":
-    main()
--
GitLab
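
For reference, the deleted main.py drove each per-species Galaxy instance through bioblend before loading any data. Below is a minimal sketch of that connection-check pattern, assuming a hypothetical instance URL and placeholder API key (the real script built the URL from its JSON input and used hard-coded credentials); it is an illustration of the bioblend calls used, not the project's actual loader.

from bioblend import galaxy
import bioblend

# Hypothetical values for illustration only
instance_url = "http://localhost/sp/genus_species/galaxy/"
api_key = "REPLACE_WITH_API_KEY"

gi = galaxy.GalaxyInstance(url=instance_url, key=api_key)

# Same connection check as the deleted script: try listing histories
try:
    histories = gi.histories.get_histories()
except bioblend.ConnectionError:
    print("Cannot connect to galaxy instance @ " + instance_url)
else:
    print("Successfully connected to galaxy instance @ " + instance_url)
    # The script then located the project data library and its folders
    library_id = gi.libraries.get_libraries()[0]["id"]
    folders = gi.libraries.get_folders(library_id=library_id)
    print("Found " + str(len(folders)) + " folders in the data library")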