from bioblend import galaxy
import bioblend.galaxy.objects as bbo
import bioblend as bb
import argparse
import os
import sys
import subprocess
import json
import urllib3 as ul
from chado import ChadoInstance
from workflow import Workflow
from filetransfer import FileTransfer
from toolrunner import ToolRunner
from webscrap import WebScrap

"""
TODO: script description
python3 ~/PycharmProjects/ggauto/gga_load_data/main.py ~/PycharmProjects/ggauto/gga_load_data/dataloader.json
"""


class Autoload:
    """
    TODO: turn main into an object
    """

    def __init__(self, json_in):
        self.json_in = json_in

    def main(self):
        return None


def main():
    parser = argparse.ArgumentParser(description="Input genus, species, strain, version")
    parser.add_argument("json", type=str, help="Input JSON file")
    parser.add_argument("--just-load", help="Only load data into galaxy, does not create nor run analyses in galaxy")
    # CLI stuff
    # parser.add_argument("--name", help="Sample species name, format: genus-species", type=str)
    # parser.add_argument("--strain", help="Strain of the sample species", type=str)
    # parser.add_argument("--version", help="Data version (e.g 1.0, 1.2, ...)", type=str)
    # parser.add_argument("--common-name", help="Vernacular/common name of the species", type=str)

    user_arguments = parser.parse_args()

    # List that will hold all dicts from the JSON input file, containing parameters for each species
    sp_dict_list = []

    # JSON parsing and loading
    with open(user_arguments.json, 'r') as infile:
        json_sp_dict = json.load(infile)
        json_sp_dump = json.dumps(json_sp_dict, indent=4, sort_keys=True)
        for json_sp in json_sp_dict:
            sp_dict_list.append(json_sp)
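    # Example of the expected input JSON (a minimal sketch inferred from the keys read in the
    # per-species loop below; the values shown are illustrative only, not part of this repo):
    #
    # [
    #     {
    #         "genus": "Ectocarpus",
    #         "species": "species1",
    #         "common": "brown alga",
    #         "strain": "",
    #         "ogs version": "1.0",
    #         "genome version": "1.0",
    #         "performed by": ""
    #     }
    # ]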
    # Get variables from the current species dict
    for sp_dict in sp_dict_list:
        sp_params_list = []
        genus = sp_dict["genus"]
        genus_lower = genus[0].lower() + genus[1:]
        species = sp_dict["species"]
        genus_species = genus_lower + "_" + species
        common = sp_dict["common"]
        strain = sp_dict["strain"]
        if strain != "":
            genus_species_strain = genus_species + "_" + strain
        else:
            genus_species_strain = genus_species
        ogs_version = sp_dict["ogs version"]
        genome_version = sp_dict["genome version"]
        performed_by = sp_dict["performed by"]

        # CLI stuff
        # genus = user_arguments.name.split('-')[0]
        # genus_lower = genus[0].lower() + genus[1:]
        # genus_upper = genus[0].upper() + genus[1:]
        # species = user_arguments.name.split('-')[1]
        # strain = user_arguments.strain
        # vernacular = user_arguments.common_name

        # TODO: prompt y/n asking for the validity of info

        # Test address, change to abims-gga.sb-roscoff.fr/sp/ in production
        instance_url = "http://localhost/sp/" + genus_lower + "_" + species + "/galaxy/"

        print("Species: " + genus + " " + species + " (" + common + ")" +
              "\nStrain: " + strain +
              "\nAccessing instance " + instance_url)

        # Connect to the galaxy instance of the current species TODO: API key connection issues
        gi = galaxy.GalaxyInstance(url=instance_url,
                                   key="3b36455cb16b4d0e4348e2c42f4bb934",
                                   email="alebars@sb-roscoff.fr",
                                   password="pouet",
                                   verify=True)

        """
        This part creates the current species directory and moves into it.
        If it already exists, just move into it.
        To be expanded when docker-swarm is implemented (or all config files are ready), not useful for now
        """
        main_dir = os.getcwd() + "/"
        sp_dir = os.path.join(main_dir, genus_species) + "/"

        try:
            os.mkdir(sp_dir)
        except FileExistsError:
            print("Directory " + sp_dir + " already exists")
        try:
            os.chdir(sp_dir)
            wd = os.getcwd()
        except OSError:
            print("Cannot access " + sp_dir + ", run with higher privileges")
            break

        # Production instance example TODO: secure password and API key + manage API keys
        # gi = galaxy.GalaxyInstance(url="http://abims-gga.sb-roscoff.fr/sp/ectocarpus_species1/galaxy/",
        #                            key="84dfbee3c0efa9155518f01fbeff57c8",
        #                            email="gga@sb-roscoff.fr",
        #                            password="****")

        # Check connection to the current instance
        try:
            hl = gi.histories.get_histories()
        except bb.ConnectionError:
            print("Cannot connect to GGA instance @ " + instance_url)
        else:
            print("Successfully connected to instance " + instance_url)

        # TODO: FTP stuff to retrieve the datasets (used in testing, not needed for production)
        # try:
        #     os.mkdir("./src_data")
        # except FileExistsError:
        #     print("src_data folder already exists for " + genus_species_strain)
        # else:
        #     print("src_data folder created for " + genus_species_strain)

        # TODO: load the data into the current species directory and load it into galaxy instance
        # setup_data_libraries_cl = \
        #     "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
        #
        # try:
        #     setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
        #     # output message from the data loading script
        #     setup_data_libraries_output = setup_data_libraries.communicate()
        # except Exception:
        #     print("Cannot load data into container for " + genus_species_strain)
        #     break
        # else:
        #     print("Data successfully loaded into docker container for " + genus_species_strain)

        # generate workflow file and run it in the galaxy instance
        gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
        hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version))
        hi_id = hi[0]["id"]
        li = gi.libraries.get_libraries()  # only one library
        li_id = gi.libraries.get_libraries()[0]["id"]  # project data folder/library
        fo_gi = gi.libraries.get_folders(library_id=li_id)  # data location (project data)
        fo_id = {}
        current_fo_name = ""
        # folders ids: access to data to run the first tools
        for i in fo_gi:
            for k, v in i.items():
                if k == "name":
                    fo_id[v] = 0
                    current_fo_name = v
                if k == "id":
                    fo_id[current_fo_name] = v

        # TODO: turn data id parsing into a function
        print("Folders and datasets IDs: ")
        datasets = dict()
        for k, v in fo_id.items():
            print("\t" + k + ": " + v)
            if k == "/genome":
                sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
                for k2, v2 in sub_folder_content.items():
                    for e in v2:
                        if type(e) == dict:
                            if e["name"].endswith(".fa"):
                                datasets["genome_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
            elif k == "/annotation/" + genus_species:
                sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
                for k2, v2 in sub_folder_content.items():
                    for e in v2:
                        if type(e) == dict:
                            # TODO: manage several files of the same type
                            if e["name"].endswith("transcripts-gff.fa"):
                                datasets["transcripts_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
                            elif e["name"].endswith("proteins.fasta"):
                                datasets["proteins_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
                            elif e["name"].endswith(".gff"):
                                datasets["gff_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
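        # A possible shape for the TODO above ("turn data id parsing into a function"):
        # a minimal sketch, kept commented out; the function and argument names are
        # illustrative only and not part of the current script.
        #
        # def parse_folder_datasets(instance, folder_id, suffix_to_key):
        #     """Return {key: ldda_id} for folder datasets whose name ends with a known suffix."""
        #     found = {}
        #     contents = instance.folders.show_folder(folder_id=folder_id, contents=True)
        #     for entries in contents.values():
        #         for entry in entries:
        #             if isinstance(entry, dict):
        #                 for suffix, key in suffix_to_key.items():
        #                     if entry["name"].endswith(suffix):
        #                         found[key] = entry["ldda_id"]
        #     return found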
        current_hi_id = gi.histories.get_current_history()["id"]
        print("History ID: " + current_hi_id)
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["genome_file"])
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["gff_file"])
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["transcripts_file"])
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["proteins_file"])

        # Workflow generation
        workflow = Workflow(parameters_dict=sp_dict, instance=gi, history_id=current_hi_id)
        wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")

        tools = gi.tools.get_tool_panel()  # tools panel -> alternative to wf
        # print(tools)

        wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()

        gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
        wf_name = workflow.get_workflow_name()
        wf_attr = gi.workflows.get_workflows(name=wf_name)
        wf_id = wf_attr[0]["id"]
        wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
        print("Workflow ID: " + wf_id)

        toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        # toolrunner.purge_organisms()

        # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)

        wf_params = workflow.set_main_workflow_parameters(datasets=datasets)

        print("Inputs:")
        print(wf_show["inputs"])

        # Map the main workflow input steps (by index) to the datasets gathered above
        datamap = dict()
        datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
        datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
        datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
        datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}

        # gi.workflows.invoke_workflow(workflow_id=wf_id,
        #                              history_id=current_hi_id,
        #                              params=wf_params,
        #                              inputs=datamap)
        # gi.workflows.delete_workflow(workflow_id=wf_id)

        datamap = dict()
        datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
        datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}

        wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="jbrowse")
        wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
        gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
        wf_attr = gi.workflows.get_workflows(name="jbrowse")
        wf_id = wf_attr[0]["id"]
        wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
        print("Jbrowse workflow ID: " + wf_id)
        wf_params = workflow.set_jbrowse_workflow_parameters()

        gi.workflows.invoke_workflow(workflow_id=wf_id,
                                     history_id=current_hi_id,
                                     params=wf_params,
                                     inputs=datamap)
        gi.workflows.delete_workflow(workflow_id=wf_id)

        # remove active instance history for testing, purge configured @ ~/config/galaxy.yml.docker_sample
        # gi.histories.delete_history(history_id=current_hi_id, purge=True)

        os.chdir(main_dir)
        print("\n")


if __name__ == "__main__":
    main()