from bioblend import galaxy
import bioblend.galaxy.objects as bbo
import bioblend as bb
import argparse
import os
import sys
import subprocess
import logging
import re
import json
from workflow import Workflow
from toolrunner import ToolRunner
"""
TODO: script description
python3 ~/PycharmProjects/ggauto/gga_load_data/main.py ~/PycharmProjects/ggauto/gga_load_data/dataloader.json

"""


def main():

    parser = argparse.ArgumentParser(description="Load data into per-species GGA Galaxy instances, "
                                                 "from a JSON file describing the species")
    parser.add_argument("json", type=str, help="Input JSON file")
    parser.add_argument("--just-load",
                        action="store_true",
                        help="Only load data into Galaxy, do not create nor run analyses in Galaxy")
    parser.add_argument("-v", "--verbose",
                        action="store_true",
                        help="Increase output verbosity")

    # CLI stuff
    # parser.add_argument("--name", help="Sample species name, format: genus-species",type=str)
    # parser.add_argument("--strain", help="Strain of the sample species", type=str)
    # parser.add_argument("--version", help="Data version (e.g 1.0, 1.2, ...)", type=str)
    # parser.add_argument("--common-name", help="Vernacular/common name of the species",type=str)

    user_arguments = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    # List that will hold all dicts from the JSON input file, containing parameters for each species
    sp_dict_list = []

    # JSON parsing and loading
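    # The input JSON is expected to be a list of per-species dicts; keys read below:
    # "genus", "species", "common", "strain", "sex", "ogs version", "genome version", "performed by"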
    with open(user_arguments.json, 'r') as infile:
        json_sp_dict = json.load(infile)
        for json_sp in json_sp_dict:
            sp_dict_list.append(json_sp)

    # Get variables from the current species dict
    for sp_dict in sp_dict_list:
        sp_params_list = []
        genus = sp_dict["genus"]
        genus_lower = genus[0].lower() + genus[1:]
        species = sp_dict["species"]
        genus_species = genus_lower + "_" + species
        common = sp_dict["common"]
        strain = sp_dict["strain"]
        sex = sp_dict["sex"]
        if strain != "":
            genus_species_strain = genus_species + "_" + strain
        else:
            genus_species_strain = genus_species
        ogs_version = sp_dict["ogs version"]
        genome_version = sp_dict["genome version"]
        performed_by = sp_dict["performed by"]

        # CLI stuff
        # genus = user_arguments.name.split('-')[0]
        # genus_lower = genus[0].lower().genus[1:]
        # genus_upper = genus[0].upper() + genus[1:]
        # species = user_arguments.name.split('-')[1]
        # strain = user_arguments.strain
        # vernacular = user_arguments.common_name

        # TODO: prompt y/n asking for the validity of info
        # Test address, change to abims-gga.sb-roscoff.fr/sp/ in production
        instance_url = "http://localhost/sp/" + genus_species + "/galaxy/"

        print("Species: " + genus + " " + species + " (" + common + ")" +
                     "\nStrain: " + strain +
                     "\nAccessing instance " + instance_url)

        # Connect to the galaxy instance of the current species TODO: connection issues (galaxy side)
        gi = galaxy.GalaxyInstance(url=instance_url,
                                   key="3b36455cb16b4d0e4348e2c42f4bb934",
                                   email="alebars@sb-roscoff.fr",
                                   password="pouet",
                                   verify=True)

        # admin_email = os.environ.get('GALAXY_DEFAULT_ADMIN_USER', 'admin@galaxy.org')
        # admin_pass = os.environ.get('GALAXY_DEFAULT_ADMIN_PASSWORD', 'admin')

        """
        This part creates the current species directory and go to it
        If it already exists, just move to it
        To be expanded when docker-swarm is implemented (or all config files are ready), not useful for now
        """
        main_dir = os.getcwd() + "/"
        sp_dir = os.path.join(main_dir, genus_species) + "/"

        try:
            os.mkdir(sp_dir)
        except FileExistsError:
            print("Directory " + sp_dir + " already exists")
        try:
            os.chdir(sp_dir)
            wd = os.getcwd()
        except OSError:
            print("Cannot access " + sp_dir + ", run with higher privileges")
            break

        # Production instance example TODO: secure password and API key + manage API keys
        # gi = galaxy.GalaxyInstance(url="http://abims-gga.sb-roscoff.fr/sp/ectocarpus_species1/galaxy/",
        #                            key="84dfbee3c0efa9155518f01fbeff57c8",
        #                            email="gga@sb-roscoff.fr",
        #                            password="****")

        # Check connection to the current instance
        print("Testing connection to the galaxy instance")
        try:
            hl = gi.histories.get_histories()
        except bb.ConnectionError:
            print("Cannot connect to galaxy instance @ " + instance_url)
        else:
            print("Successfully connected to galaxy instance @ " + instance_url)

        # TODO: FTP/symlink stuff to retrieve the datasets + change headers in pep.fasta

        # ---------------------------------------------------------------------
        # src_data directory tree creation
        # ---------------------------------------------------------------------

        src_data_folders = ["annotation", "genome"]
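        # Expected on-disk layout: src_data/{annotation,genome}/<genus>_<species>_<strain>_<sex>/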
        species_folder_name = "_".join([genus_lower, species, strain, sex])
        try:
            if os.path.isdir("./src_data"):
                print("src_data directory tree already exists")
            os.makedirs("./src_data/annotation/" + species_folder_name, exist_ok=True)
            os.makedirs("./src_data/genome/" + species_folder_name, exist_ok=True)
        except PermissionError:
            print("Insufficient permission to create src_data directory tree")

        # ---------------------------------------------------------------------
        # Data import into galaxy
        # ---------------------------------------------------------------------

        source_files = dict()
        annotation_dir, genome_dir = None, None
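        # Walk src_data and index the source files by type; the keys ("genome_file", "gff_file",
        # "proteins_file", "transcripts_file") are reused below when mapping library datasets to workflow inputs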
        for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
            if "annotation/" in d:
                annotation_dir = d
                for f in os.listdir(d):
                    if f.endswith("proteins.fasta"):
                        source_files["proteins_file"] = os.path.join(d, f)
                    elif f.endswith("transcripts-gff.fa"):
                        source_files["transcripts_file"] = os.path.join(d, f)
                    elif f.endswith(".gff"):
                        source_files["gff_file"] = os.path.join(d, f)
                # annotation_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
            elif "genome/" in d:
                genome_dir = d
                for f in os.listdir(d):
                    if f.endswith(".fa"):
                        source_files["genome_file"] = os.path.join(d, f)
                # genome_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
                print("Source files found:")
        for k, v in source_files.items():
            print("\t" + k + "\t" + v)

        # Changing headers in the *proteins.fasta file from >mRNA* to >protein*
        # production version:
        # modify_pep_headers = ["/usr/local/genome2/mmo/scripts/phaeoexplorer/phaeoexplorer-change_pep_fasta_header.sh",
        #                       source_files["proteins_file"]]
        # test version:
        modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh",
                              source_files["proteins_file"]]
        print("Changing fasta headers in " + source_files["proteins_file"])
        subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir)
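        # The header-change script is assumed to write its result to "outfile" in the annotation
        # directory; it is moved over the original proteins file during the cleanup below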

        # src_data cleaning
        if os.path.exists(os.path.join(annotation_dir, "outfile")):
            subprocess.run(["mv", os.path.join(annotation_dir, "outfile"), source_files["proteins_file"]],
                           stdout=subprocess.PIPE,
                           cwd=annotation_dir)
        if os.path.exists(os.path.join(annotation_dir, "gmon.out")):
            subprocess.run(["rm", os.path.join(annotation_dir, "gmon.out")],
                           stdout=subprocess.PIPE,
                           cwd=annotation_dir)

        # TODO: load the data into the current species directory and load it into galaxy instance
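        # setup_data_libraries.py is assumed to live inside the Galaxy container and to register the
        # mounted src_data tree as Galaxy data libraries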
        setup_data_libraries = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
        try:
            print("Loading data into the galaxy container")
            subprocess.run(setup_data_libraries,
                           stdout=subprocess.PIPE,
                           shell=True,
                           check=True)
        except subprocess.CalledProcessError:
            print("Cannot load data into container for " + genus_species_strain)
            break
        else:
            print("Data successfully loaded into docker container for " + genus_species_strain)

        # generate workflow file and run it in the galaxy instance

        # gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
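        # The target history is assumed to already exist under the name "<genus_species_strain>_<genome_version>",
        # and the instance to hold a single data library whose folders mirror the src_data tree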
        hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version))
        hi_id = hi[0]["id"]
        li = gi.libraries.get_libraries()  # only one library expected
        li_id = li[0]["id"]  # project data folder/library
        fo_gi = gi.libraries.get_folders(library_id=li_id)  # data location (project data)

        fo_id = {}
        # folder name -> folder id: needed to access the data when running the first tools
        for folder in fo_gi:
            fo_id[folder["name"]] = folder["id"]
        print("Folders and datasets IDs: ")
        datasets = dict()
        for k, v in fo_id.items():
            print("\t" + k + ": " + v)
            if k == "/genome":
                sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
                for k2, v2 in sub_folder_content.items():
                    for e in v2:
                        if isinstance(e, dict):
                            if e["name"].endswith(".fa"):
                                datasets["genome_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
            elif k == "/annotation/" + genus_species:
                sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
                for k2, v2 in sub_folder_content.items():
                    for e in v2:
                        if isinstance(e, dict):
                            # TODO: manage several files of the same type and versions
                            if e["name"].endswith("transcripts-gff.fa"):
                                datasets["transcripts_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
                            elif e["name"].endswith("proteins.fasta"):
                                datasets["proteins_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
                            elif e["name"].endswith(".gff"):
                                datasets["gff_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
                            elif e["name"].endswith("MALE"):
                                datasets["gff_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])

        current_hi_id = gi.histories.get_current_history()["id"]
        print("History ID: " + current_hi_id)
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["genome_file"])
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["gff_file"])
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["transcripts_file"])
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["proteins_file"])

        toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        # toolrunner.show_pannel()  # show the tools panel (with tool_id and versions)

        # ---------------------------------------------------------------------
        # Galaxy instance interaction
        # ---------------------------------------------------------------------

        # # Delete Homo sapiens from Chado database
        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        # sapiens = toolrunner.get_sapiens_id()
        # sapiens_job_out = sapiens["outputs"][0]["id"]
        # sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
        # try:
        #     sapiens_output = json.loads(sapiens_json_output)[0]
        #     sapiens_id = str(sapiens_output["organism_id"])  # needs to be str to be recognized by the chado tool
        #     toolrunner.delete_sapiens(hs_id=sapiens_id)
        # except bb.ConnectionError:
        #     print("Homo sapiens isn't in the database")
        # except IndexError:
        #     pass
        #
        # # Workflow generation
        workflow = Workflow(parameters_dict=sp_dict, instance=gi, history_id=current_hi_id)
        # wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
        #
        # tools = gi.tools.get_tool_panel()  # tools panel -> alternative to wf
        # # print(tools)
        #
        # wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
        #
        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
        # wf_name = workflow.get_workflow_name()
        # wf_attr = gi.workflows.get_workflows(name=wf_name)
        # wf_id = wf_attr[0]["id"]
        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
        # print("Workflow ID: " + wf_id)
        #
        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        # # toolrunner.purge_organisms()
        #
        # # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)
        #
        # wf_params = workflow.set_main_workflow_parameters(datasets=datasets)
        # # print("Inputs:")
        # # print(wf_show["inputs"])
        #
        # datamap = dict()
        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
        # datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
        # datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
        # datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}
        #
        # gi.workflows.invoke_workflow(workflow_id=wf_id,
        #                              history_id=current_hi_id,
        #                              params=wf_params,
        #                              inputs=datamap)
        # gi.workflows.delete_workflow(workflow_id=wf_id)
        #
        # datamap = dict()
        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
        # datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
        #
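        # workflow.generate() (local workflow module) is assumed to return the workflow description
        # as a JSON string, hence the json.loads() below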
        wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="main")
        wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
        #
        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
        # wf_attr = gi.workflows.get_workflows(name="jbrowse")
        # wf_id = wf_attr[0]["id"]
        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
        # print("Jbrowse workflow ID: " + wf_id)
        # wf_params = workflow.set_jbrowse_workflow_parameters()
        #
        # allow_tool_state_corrections makes Galaxy fill in missing tool states, because the workflow
        # was edited outside of Galaxy with only some inputs set (precautionary parameter)
        # gi.workflows.invoke_workflow(workflow_id=wf_id,
        #                              history_id=current_hi_id,
        #                              params=wf_params,
        #                              inputs=datamap,
        #                              allow_tool_state_corrections=True)
        # gi.workflows.delete_workflow(workflow_id=wf_id)

        # remove active instance history for testing, purge configured @ ~/config/galaxy.yml.docker_sample
        # gi.histories.delete_history(history_id=current_hi_id, purge=True)

        os.chdir(main_dir)
        print("\n")


if __name__ == "__main__":
    main()