# workflow.py
import os
from bioblend.galaxy import GalaxyInstance
from toolrunner import ToolRunner
import json
import logging
"""
Workflow creation for generation and visualization of data and analyses output
"""


class Workflow:
    """
    Customise a preset Galaxy workflow (.ga file) for one organism and import it into a galaxy instance

    The organism description is read once from ``parameters_dict``; the other methods use it to fill the
    placeholders of the preset workflow and to build the per-step parameters of the workflow
    """

    logging.basicConfig(level=logging.INFO)

    def __init__(self, parameters_dict, instance, history_id):
        """
        Store the organism description and the handles used to talk to galaxy

        :param parameters_dict: dict providing the keys "genus", "species", "strain", "common", "performed by",
                                "genome version", "ogs version", "sex" and "date"
        :param instance: object whose ``workflows`` and ``datasets`` attributes are used by the other methods
                         (presumably a bioblend GalaxyInstance - confirm against the caller)
        :param history_id: stored as-is and handed to ToolRunner
        """
        self.history_id = history_id
        self.instance = instance
        self.parameters_dict = parameters_dict
        self.genus = parameters_dict["genus"]
        self.species = parameters_dict["species"]
        self.strain = parameters_dict["strain"]
        self.common = parameters_dict["common"]
        self.performed = parameters_dict["performed by"]
        self.genome_version = parameters_dict["genome version"]
        self.ogs_version = parameters_dict["ogs version"]
        self.sex = parameters_dict["sex"]
        self.date = parameters_dict["date"]
        self.custom_ga_file = None
        self.custom_ga_file_path = None
        self.preset_ga_file = None
        # genus with its first letter lowercased, e.g. "undaria_pinnatifida"
        self.sp_url = self.genus[0].lower() + self.genus[1:] + "_" + self.species

        abbr_parts = [self.genus[0].lower(), self.species]
        full_parts = [self.genus, self.species]
        if self.strain != "":
            # the strain is only part of the names when one was given, so that an empty strain
            # does not leave a doubled underscore behind
            abbr_parts.append(self.strain)
            full_parts.append(self.strain)
        full_parts.append(self.sex)
        self.abbr = "_".join(abbr_parts)
        self.full = "_".join(full_parts)
        self.workflow = None

    def generate(self, working_directory, main_directory, workflow_name):
        """
        Generation of a galaxy workflow using the defined parameters in the .json input file
        Output format is a json string (also stored in self.workflow)

        Side effects: changes the current working directory to working_directory and sets
        self.preset_ga_file, self.custom_ga_file and self.custom_ga_file_path
        (the custom file name is only computed, nothing is written to it here)

        :param working_directory: directory to change into before anything else is done
        :param main_directory: directory containing the preset file "Galaxy-Workflow-<workflow_name>.ga"
        :param workflow_name: name of the preset workflow, used to build the preset file name
        :return: content of the preset workflow with the organism placeholders replaced
        """
        # TODO: store workflow as a var instead of file (once it runs smoothly)

        os.chdir(path=working_directory)
        # os.path.join gives the same path as plain concatenation when main_directory ends with a separator
        # and also works when it does not
        self.preset_ga_file = os.path.join(main_directory, "Galaxy-Workflow-" + workflow_name + ".ga")

        name_parts = [self.genus, self.species]
        if self.strain != "":
            name_parts.append(self.strain)
        self.custom_ga_file = "_".join(name_parts) + "_workflow.ga"
        self.custom_ga_file_path = os.path.abspath(self.custom_ga_file)

        with open(self.preset_ga_file, 'r') as ga_in_file:
            # read the raw text: no repr()/slicing round trip, so multi-line files and files ending
            # with a newline are handled too
            ga_in = ga_in_file.read()
        logging.debug(ga_in)

        # a double quote escaped twice (three backslashes, then the quote): this is how the placeholders
        # below are written in the .ga file
        nested_quote = '\\\\\\"'

        unique_id_key = nested_quote + "unique_id" + nested_quote + ": "
        ga_in = ga_in.replace(
            "{" + unique_id_key + nested_quote + "UNIQUE_ID" + nested_quote + "}",
            # the closing brace is part of the placeholder and has to be written back
            "{" + unique_id_key + nested_quote + self.genus + " " + self.species + nested_quote + "}")

        name_key = nested_quote + "name" + nested_quote + ": "
        ga_in = ga_in.replace(
            name_key + nested_quote + "NAME" + nested_quote,
            name_key + nested_quote + self.genus.lower()[0] + self.species + nested_quote)

        # same layout as the placeholder url: sp/<genus_species>/feature/<Genus>/<species>/mRNA/{id}
        ga_in = ga_in.replace(
            'http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}',
            "/".join(["http://localhost/sp",
                      self.genus.lower()[0] + self.genus[1:] + "_" + self.species,
                      "feature", self.genus, self.species, "mRNA", "{id}"]))

        self.workflow = ga_in
        return ga_in

    def _first_record_id(self, tool_output, id_key, not_found_message):
        """
        Download the first output dataset of a tool run and read an id from its first json record

        :param tool_output: dict returned by a ToolRunner call, read as tool_output["outputs"][0]["id"]
        :param id_key: key to read in the first record of the downloaded json list
        :param not_found_message: message logged (info) when the downloaded list is empty
        :return: the id as a str, or None when the list is empty
        """
        dataset_id = tool_output["outputs"][0]["id"]
        raw_json = self.instance.datasets.download_dataset(dataset_id=dataset_id)
        try:
            first_record = json.loads(raw_json)[0]
        except IndexError:
            logging.info(not_found_message)
            return None
        return str(first_record[id_key])  # needs to be str to be recognized by the chado tool

    def set_main_workflow_parameters(self, datasets):
        """
        Add the organism and its OGS/genome analyses through ToolRunner, read their ids back
        and build the parameters of the main workflow, keyed by step number (as a str)

        An id that could not be found is left as None in the returned parameters

        :param datasets: currently unused
        :return: dict of step number -> dict of tool parameters
        """
        # TODO: move tool calls to main/autoload

        toolrunner = ToolRunner(parameters_dict=self.parameters_dict, instance=self.instance, history=self.history_id)

        toolrunner.add_organism()
        toolrunner.add_ogs()
        toolrunner.add_genome()

        org_id = self._first_record_id(
            toolrunner.get_organism(), "organism_id",
            "No organism matching " + self.full + " exists in the Chado database")
        ogs_analysis_id = self._first_record_id(
            toolrunner.get_ogs_analysis(), "analysis_id",
            "No matching OGS analysis exists in the Chado database")
        genome_analysis_id = self._first_record_id(
            toolrunner.get_genome_analysis(), "analysis_id",
            "No matching genome analysis exists in the Chado database")

        return {
            "0": {},
            "1": {},
            "2": {},
            "3": {},
            # the do_update parameter is to prevent assertion errors when loading the file,
            # should always be set to "true"
            "4": {"organism": org_id,
                  "analysis_id": genome_analysis_id,
                  "do_update": "true"},
            "5": {"organism": org_id,
                  "analysis_id": ogs_analysis_id},
            "6": {"organism_id": org_id},
            "7": {"analysis_id": ogs_analysis_id},
            "8": {"analysis_id": genome_analysis_id},
            "9": {"organism_id": org_id},
        }

    def set_jbrowse_workflow_parameters(self):
        """
        Build the parameters of the jbrowse workflow, keyed by step number (as a str)

        Every step is currently left empty: jbrowse tools parameters are inside nested dictionaries,
        read tool inputs before adding or modifying anything

        :return: dict of step number -> dict of tool parameters
        """
        params = {str(step): {} for step in range(4)}
        return params

    def set_datamap(self):
        """
        Placeholder, not implemented yet

        :return: None
        """
        return None

    def dict_port(self):
        """
        Import workflow into a galaxy instance from a json dict

        self.workflow is a json string after generate(): it is decoded first so that a dict is handed
        to import_workflow_dict. Any other value is passed through unchanged

        :return: True once the import call returned, False when a ConnectionError was raised
        """
        workflow = self.workflow
        if isinstance(workflow, str):
            workflow = json.loads(workflow)
        try:
            self.instance.workflows.import_workflow_dict(workflow_dict=workflow)
        # NOTE(review): this is the builtin ConnectionError; bioblend appears to raise its own
        # bioblend.ConnectionError, which would not be caught here - confirm
        except ConnectionError:
            return False
        return True

    def port(self):
        """
        Import workflow into a galaxy instance from a local file (self.custom_ga_file_path)

        :return: True once the import call returned, False when a ConnectionError was raised
        """
        try:
            self.instance.workflows.import_workflow_from_local_path(self.custom_ga_file_path)
        # NOTE(review): builtin ConnectionError, see if bioblend's own exception should be caught - confirm
        except ConnectionError:
            return False
        return True

    def get_workflow_name(self):
        """
        Name of the imported workflow

        :return: the fixed name "preset_workflow"
        """
        return "preset_workflow"

    def show(self):
        """
        Log (info) the first workflow listed by the instance (dict form)

        Raises IndexError when the instance lists no workflow

        :return: None
        """
        workflow_id = self.instance.workflows.get_workflows()[0]['id']
        logging.info(self.instance.workflows.show_workflow(workflow_id=workflow_id))
        return None

    def store(self):
        """
        Id of the first workflow listed by the instance

        Raises IndexError when the instance lists no workflow

        :return: workflow id
        """
        return self.instance.workflows.get_workflows()[0]['id']

    def delete(self):
        """
        Delete custom workflow (placeholder, not implemented yet)

        :return: None
        """
        return None

    def run(self, datamap, params):
        """
        Run the custom workflow into a galaxy instance (placeholder, not implemented yet)

        :param datamap: currently unused
        :param params: currently unused
        :return: None
        """
        return None

    # def add_step(self, step_position, description, name):
    #     """
    #     TODO: add a step to the workflow (data loading into chado for example)
    #
    #     :param workflow:
    #     :return:
    #     """
    #     return None