diff --git a/constants.py b/constants.py
index 5aaf27d31f66a390f6ceb2d3cd05974991029430..cfe86b22d2382968a7cf23e3e0e2b7078b2a8c0b 100644
--- a/constants.py
+++ b/constants.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
 # Constants used in the input yaml
 ORG_PARAM_NAME = "name"
 ORG_PARAM_DESC = "description"
@@ -24,7 +27,6 @@
 ORG_PARAM_SERVICES = "services"
 ORG_PARAM_SERVICES_BLAST = "blast"
 ORG_PARAM_SERVICES_GO = "go"
-
 # Constants used in the config yaml file
 CONF_ALL_HOSTNAME = "hostname"
 CONF_ALL_HTTP_PORT = "http_port"
@@ -45,12 +47,23 @@
 CONF_TRIPAL_THEME_NAME = "tripal_theme_name"
 CONF_TRIPAL_THEME_GIT_CLONE = "tripal_theme_git_clone"
 CONF_JBROWSE_MENU_URL = "jbrowse_menu_url"
+# Data
+FILENAME_SUFFIX_TRANSCRIPTS = "transcripts_gff.fasta"
+FILENAME_SUFFIX_PROTEINS = "proteins.fasta"
+FILENAME_SUFFIX_INTERPRO = "interproscan.xml"
+FILENAME_SUFFIX_BLASTP = "diamond_blastp_vs_uniref90.xml" # Temporary constant: this value should be in the organism input file
+FILENAME_SUFFIX_BLASTX = "diamond_blastx_vs_uniref90.xml" # Temporary constant: this value should be in the organism input file
+FILENAME_SUFFIX_ORTHOFINDER = "orthologous_one2one_vs_Ec32.tsv" # Temporary constant: this value should be in the organism input file
+DATA_DATE = "2021-02-24" # Temporary constant: this value should be in the organism input file, for each data
+
 # default config file
 DEFAULT_CONFIG = "examples/config"
 
+# Galaxy tools
 GET_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0"
 DELETE_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.4+galaxy0"
 
+# Galaxy library
 HOST_DATA_DIR='src_data'
 CONTAINER_DATA_DIR_ROOT='/project_data'
 GALAXY_LIBRARY_NAME = 'Project Data'
diff --git a/gga_get_data.py b/gga_get_data.py
index 992e5c6c46f59d0bfdd2cecf14f99d3be21f0a95..f46d73303434c1adf7dbd9df1c9b3f2482c6d8fc 100755
--- a/gga_get_data.py
+++ b/gga_get_data.py
@@ -101,7 +101,7 @@ class GetData(speciesData.SpeciesData):
         for k, v in genome_datasets.items():
             if v: # If dataset is not present in input file, skip copy
                 logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir))
-                genome_fname = "{0}_v{1}.fasta".format(self.dataset_prefix, self.genome_version)
+                genome_fname = self.genome_filename
                 try:
                     shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, genome_fname))
                 except Exception as exc:
@@ -111,19 +111,19 @@ class GetData(speciesData.SpeciesData):
             if v: # If dataset is not present in input file, skip copy
                 dataset_fname = ""
                 if k == constants.ORG_PARAM_DATA_GFF_PATH:
-                    dataset_fname = "{0}_OGS{1}_{2}.gff".format(self.dataset_prefix, self.ogs_version, self.get_last_modified_time_string(os.path.abspath(v)))
+                    dataset_fname = self.gff_filename
                 elif k == constants.ORG_PARAM_DATA_TRANSCRIPTS_PATH:
-                    dataset_fname = "{0}_OGS{1}_transcripts.fasta".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.transcripts_filename
                 elif k == constants.ORG_PARAM_DATA_PROTEINS_PATH:
-                    dataset_fname = "{0}_OGS{1}_proteins.fasta".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.proteins_filename
                 elif k == constants.ORG_PARAM_DATA_ORTHOFINDER_PATH:
-                    dataset_fname = "{0}_OGS{1}_orthofinder.tsv".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.orthofinder_filename
                 elif k == constants.ORG_PARAM_DATA_INTERPRO_PATH:
-                    dataset_fname = "{0}_OGS{1}_interproscan.xml".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.interpro_filename
                 elif k == constants.ORG_PARAM_DATA_BLASTP_PATH:
-                    dataset_fname = "{0}_OGS{1}_blastp.xml".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.blastp_filename
                 elif k == constants.ORG_PARAM_DATA_BLASTX_PATH:
-                    dataset_fname = "{0}_OGS{1}_blastx.xml".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.blastx_filename
                 logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
                 try:
                     shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, dataset_fname))
diff --git a/phaoexplorer_constants.py b/phaoexplorer_constants.py
index 229b216d6f0267af1f69e84ff0c5fb45f75548f8..68d6c88398a215b88887c748ecedb58257f68fc1 100644
--- a/phaoexplorer_constants.py
+++ b/phaoexplorer_constants.py
@@ -1,3 +1,8 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import constants
+
 ### Workflows
 WORKFLOW_LOAD_FASTA_GFF_JBROWSE = "load_fasta_gff_jbrowse"
 
@@ -17,7 +22,7 @@
 ADD_ANALYSIS_TOOL_VERSION = "2.3.4+galaxy0"
 ADD_ANALYSIS_TOOL_ID= ADD_ANALYSIS_TOOL_NAME + ADD_ANALYSIS_TOOL_VERSION
 ADD_ANALYSIS_TOOL_CHANGESET_REVISION = "10b2b1c70e69"
 ADD_ANALYSIS_TOOL_PARAM_PROGRAM = "Performed by Genoscope"
-ADD_ANALYSIS_TOOL_PARAM_DATE = "2021-02-24"
+ADD_ANALYSIS_TOOL_PARAM_DATE = constants.DATA_DATE
 
 GET_ORGANISMS_TOOL_NAME = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/"
 GET_ORGANISMS_TOOL_VERSION = "2.3.4+galaxy0"
diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py
index 4928e73207940f0fee9c164819e77c5cf7eb6153..1193b9516928cc77b25b20ae63cceba975fbf470 100755
--- a/run_workflow_phaeoexplorer.py
+++ b/run_workflow_phaeoexplorer.py
@@ -24,6 +24,13 @@
 gga_init.py
 
 Usage: $ python3 gga_init.py -i input_example.yml --config [config file] [OPTIONS]
 """
 
+class RunWorkflowParam:
+
+    def __init__(self, genus_species, strain_sex, param_dict):
+        self.genus_species = genus_species
+        self.strain_sex = strain_sex
+        self.param_dict = param_dict
+
 class RunWorkflow(speciesData.SpeciesData):
     """
     Run a workflow into the galaxy instance's history of a given species
@@ -108,8 +115,6 @@ class RunWorkflow(speciesData.SpeciesData):
         :return:
         """
 
-        self.set_galaxy_instance()
-
         logging.info("Validating installed individual tools versions and changesets")
 
         # Verify that the add_organism and add_analysis versions are correct in the instance
@@ -514,7 +519,6 @@ class RunWorkflow(speciesData.SpeciesData):
 
         return invocation_report
 
-
     def import_datasets_into_history(self):
         """
         Find datasets in a library, get their ID and import them into the current history if they are not already
@@ -541,99 +545,100 @@ class RunWorkflow(speciesData.SpeciesData):
         for folder_name, folder_id in folders_id_dict.items():
             if folder_name == "/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version):
                 sub_folder_content = self.instance.folders.show_folder(folder_id=folder_id, contents=True)
-                for k2, v2 in sub_folder_content.items():
-                    for e in v2:
+                for value in sub_folder_content.values():
+                    for e in value:
                         if type(e) == dict:
-                            if e["name"].endswith(".fasta"):
-                                self.datasets["genome_file"] = e["ldda_id"]
-                                self.datasets_name["genome_file"] = e["name"]
+                            if e["name"].endswith(self.genome_filename):
+                                genome_ldda_id = e["ldda_id"]
 
             if folder_name == "/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version):
                 sub_folder_content = self.instance.folders.show_folder(folder_id=folder_id, contents=True)
-                for k2, v2 in sub_folder_content.items():
-                    for e in v2:
+                for value in sub_folder_content.values():
+                    for e in value:
                         if type(e) == dict:
-                            if "transcripts" in e["name"]:
-                                self.datasets["transcripts_file"] = e["ldda_id"]
-                                self.datasets_name["transcripts_file"] = e["name"]
-                            elif "proteins" in e["name"]:
-                                self.datasets["proteins_file"] = e["ldda_id"]
-                                self.datasets_name["proteins_file"] = e["name"]
-                            elif "gff" in e["name"]:
-                                self.datasets["gff_file"] = e["ldda_id"]
-                                self.datasets_name["gff_file"] = e["name"]
-                            elif "interpro" in e["name"]:
-                                self.datasets["interproscan_file"] = e["ldda_id"]
-                                self.datasets_name["interproscan_file"] = e["name"]
-                            elif "blastp" in e["name"]:
-                                self.datasets["blastp_file"] = e["ldda_id"]
-                                self.datasets_name["blastp_file"] = e["name"]
-
-
-        history_datasets_li = self.instance.datasets.get_datasets()
+                            ldda_name = e["name"]
+                            ldda_id = e["ldda_id"]
+                            if ldda_name.endswith(self.transcripts_filename):
+                                transcripts_ldda_id = ldda_id
+                            elif ldda_name.endswith(self.proteins_filename):
+                                proteins_ldda_id = ldda_id
+                            elif ldda_name.endswith(self.gff_filename):
+                                gff_ldda_id = ldda_id
+                            elif ldda_name.endswith(self.interpro_filename):
+                                interpro_ldda_id = ldda_id
+                            elif ldda_name.endswith(self.blastp_filename):
+                                blastp_ldda_id = ldda_id
+                            elif ldda_name.endswith(self.blastx_filename):
+                                blastx_ldda_id = ldda_id
+
+        hda_list = self.instance.datasets.get_datasets(self.history_id)
 
-        genome_hda_id, gff_hda_id, transcripts_hda_id, proteins_hda_id, blastp_hda_id, interproscan_hda_id = None, None, None, None, None, None
+        genome_hda_id, gff_hda_id, transcripts_hda_id, proteins_hda_id, blastp_hda_id, blastx_hda_id, interproscan_hda_id = None, None, None, None, None, None, None
 
-        # Finding datasets in history (matching datasets names)
-        for dataset in history_datasets_li:
-            dataset_name = dataset["name"]
-            dataset_id = dataset["id"]
-            if dataset_name == "{0}_v{1}.fasta".format(self.dataset_prefix, self.genome_version):
-                genome_hda_id = dataset_id
-            if dataset_name == "{0}_OGS{1}_{2}.gff".format(self.dataset_prefix, self.ogs_version, self.date):
-                gff_hda_id = dataset_id
-            if dataset_name == "{0}_OGS{1}_transcripts.fasta".format(self.dataset_prefix, self.ogs_version):
-                transcripts_hda_id = dataset_id
-            if dataset_name == "{0}_OGS{1}_proteins.fasta".format(self.dataset_prefix, self.ogs_version):
-                proteins_hda_id = dataset_id
-            if dataset_name == "{0}_OGS{1}_blastp.xml".format(self.dataset_prefix, self.ogs_version):
-                blastp_hda_id = dataset_id
-            if dataset_name == "{0}_OGS{1}_interproscan.xml".format(self.dataset_prefix, self.ogs_version):
-                interproscan_hda_id = dataset_id
+        for hda in hda_list:
+            hda_name = hda["name"]
+            hda_id = hda["id"]
+            if hda_name == self.genome_filename:
+                genome_hda_id = hda_id
+            if hda_name == self.gff_filename:
+                gff_hda_id = hda_id
+            if hda_name == self.transcripts_filename:
+                transcripts_hda_id = hda_id
+            if hda_name == self.proteins_filename:
+                proteins_hda_id = hda_id
+            if hda_name == self.blastp_filename:
+                blastp_hda_id = hda_id
+            if hda_name == self.blastx_filename:
+                blastx_hda_id = hda_id
+            if hda_name == self.interpro_filename:
+                interproscan_hda_id = hda_id
 
-        # Import each dataset into history if it is not imported
         logging.debug("Uploading datasets into history %s" % self.history_id)
 
         if genome_hda_id is None:
-            genome_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"])
+            genome_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=genome_ldda_id)
             genome_hda_id = genome_dataset_upload["id"]
         if gff_hda_id is None:
-            gff_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"])
+            gff_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=gff_ldda_id)
             gff_hda_id = gff_dataset_upload["id"]
-        if transcripts_hda_id is None:
-            transcripts_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
-            transcripts_hda_id = transcripts_dataset_upload["id"]
         if proteins_hda_id is None:
-            proteins_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])
+            proteins_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=proteins_ldda_id)
             proteins_hda_id = proteins_dataset_upload["id"]
+        if transcripts_hda_id is None:
+            transcripts_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=transcripts_ldda_id)
+            transcripts_hda_id = transcripts_dataset_upload["id"]
         if interproscan_hda_id is None:
             try:
-                interproscan_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"])
+                interproscan_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=interpro_ldda_id)
                 interproscan_hda_id = interproscan_dataset_upload["id"]
             except Exception as exc:
                 logging.debug("Interproscan file not found in library (history: {0})".format(self.history_id))
         if blastp_hda_id is None:
             try:
-                blastp_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blastp_file"])
+                blastp_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=blastp_ldda_id)
                 blastp_hda_id = blastp_dataset_upload["id"]
             except Exception as exc:
                 logging.debug("blastp file not found in library (history: {0})".format(self.history_id))
+        if blastx_hda_id is None:
+            try:
+                blastx_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=blastx_ldda_id)
+                blastx_hda_id = blastx_dataset_upload["id"]
+            except Exception as exc:
+                logging.debug("blastx file not found in library (history: {0})".format(self.history_id))
+
+        hda_ids = {"genome_hda_id": genome_hda_id,
+                   "gff_hda_id": gff_hda_id,
+                   "transcripts_hda_id": transcripts_hda_id,
+                   "proteins_hda_id": proteins_hda_id,
+                   "blastp_hda_id": blastp_hda_id,
+                   "blastx_hda_id": blastx_hda_id,
+                   "interproscan_hda_id": interproscan_hda_id}
 
         # logging.debug("History dataset IDs (hda_id) for %s:" % self.full_name)
-        # logging.debug({"genome_hda_id": genome_hda_id,
-        #                "gff_hda_id": gff_hda_id,
-        #                "transcripts_hda_id": transcripts_hda_id,
-        #                "proteins_hda_id": proteins_hda_id,
-        #                "blastp_hda_id": blastp_hda_id,
-        #                "interproscan_hda_id": interproscan_hda_id})
+        # logging.debug(hda_ids)
 
         # Return a dict made of the hda ids
-        return {"genome_hda_id": genome_hda_id,
-                "gff_hda_id": gff_hda_id,
-                "transcripts_hda_id": transcripts_hda_id,
-                "proteins_hda_id": proteins_hda_id,
-                "blastp_hda_id": blastp_hda_id,
-                "interproscan_hda_id": interproscan_hda_id}
+        return hda_ids
 
 def run_workflow(workflow_path, workflow_parameters, datamap, config, input_species_number):
     """
@@ -693,16 +698,19 @@ def run_workflow(workflow_path, workflow_parameters, datamap, config, input_spec
 
 
-def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type):
+def get_sp_workflow_param(sp_dict, main_dir, config, workflow_type):
     """
     """
 
-    sp_workflow_dict = {}
-
     run_workflow_for_current_organism = RunWorkflow(parameters_dictionary=sp_dict)
 
     # Verifying the galaxy container is running
-    if utilities.check_galaxy_state(network_name=run_workflow_for_current_organism.genus_species,
-                                    script_dir=run_workflow_for_current_organism.script_dir):
+    if not utilities.check_galaxy_state(network_name=run_workflow_for_current_organism.genus_species,
+                                        script_dir=run_workflow_for_current_organism.script_dir):
+        logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.genus_species)
+        sys.exit()
+
+    else:
         # Setting some of the instance attributes
         run_workflow_for_current_organism.main_dir = main_dir
@@ -717,13 +725,12 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type):
             run_workflow_for_current_organism.config[constants.CONF_ALL_HTTP_PORT],
             run_workflow_for_current_organism.genus_species)
 
+        run_workflow_for_current_organism.set_galaxy_instance()
+        history_id = run_workflow_for_current_organism.set_history()
+        run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
 
         if workflow_type == phaoexplorer_constants.WORKFLOW_LOAD_FASTA_GFF_JBROWSE:
 
-            run_workflow_for_current_organism.set_galaxy_instance()
-            history_id = run_workflow_for_current_organism.set_history()
-            run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
-
             analyses_dict_list = run_workflow_for_current_organism.get_analyses()
 
             org_id = run_workflow_for_current_organism.add_organism_and_sync()
@@ -743,7 +750,7 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type):
             hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
 
             # Create the dictionary holding all attributes needed to connect to the galaxy instance
-            attributes = {"genus": run_workflow_for_current_organism.genus,
+            param = {"genus": run_workflow_for_current_organism.genus,
                           "species": run_workflow_for_current_organism.species,
                           "genus_species": run_workflow_for_current_organism.genus_species,
                           "full_name": run_workflow_for_current_organism.full_name,
@@ -760,87 +767,65 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type):
                          "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
                          "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
 
-            sp_workflow_dict[run_workflow_for_current_organism.genus_species] = {run_workflow_for_current_organism.genus_species.strain_sex: attributes}
-
-    else:
-        logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.genus_species)
-        sys.exit()
+        if workflow_type == "blast":
 
-    return sp_workflow_dict
+            ids = run_workflow_for_current_organism.add_organism_blastp_analysis()
 
-    if workflow_type == "blast":
-        run_workflow_for_current_organism.set_galaxy_instance()
+            org_id = ids["org_id"]
+            blastp_analysis_id = ids["blastp_analysis_id"]
+            hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
 
-        history_id = run_workflow_for_current_organism.set_history()
+            # Create the dictionary holding all attributes needed to connect to the galaxy instance
+            param = {"genus": run_workflow_for_current_organism.genus,
+                     "species": run_workflow_for_current_organism.species,
+                     "genus_species": run_workflow_for_current_organism.genus_species,
+                     "full_name": run_workflow_for_current_organism.full_name,
+                     "species_folder_name": run_workflow_for_current_organism.species_folder_name,
+                     "sex": run_workflow_for_current_organism.sex,
+                     "strain": run_workflow_for_current_organism.strain,
+                     "org_id": org_id,
+                     "blastp_analysis_id": blastp_analysis_id,
+                     "hda_ids": hda_ids,
+                     "history_id": history_id,
+                     "instance": run_workflow_for_current_organism.instance,
+                     "instance_url": run_workflow_for_current_organism.instance_url,
+                     "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
+                     "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
 
-        run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
-        ids = run_workflow_for_current_organism.add_organism_blastp_analysis()
-
-        org_id = ids["org_id"]
-        blastp_analysis_id = ids["blastp_analysis_id"]
-        hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
-
-        strain_sex = "{0}_{1}".format(run_workflow_for_current_organism.strain, run_workflow_for_current_organism.sex)
-        genus_species = run_workflow_for_current_organism.genus_species
-
-        # Create the dictionary holding all attributes needed to connect to the galaxy instance
-        attributes = {"genus": run_workflow_for_current_organism.genus,
-                      "species": run_workflow_for_current_organism.species,
-                      "genus_species": run_workflow_for_current_organism.genus_species,
-                      "full_name": run_workflow_for_current_organism.full_name,
-                      "species_folder_name": run_workflow_for_current_organism.species_folder_name,
-                      "sex": run_workflow_for_current_organism.sex,
-                      "strain": run_workflow_for_current_organism.strain,
-                      "org_id": org_id,
-                      "blastp_analysis_id": blastp_analysis_id,
-                      "hda_ids": hda_ids,
-                      "history_id": history_id,
-                      "instance": run_workflow_for_current_organism.instance,
-                      "instance_url": run_workflow_for_current_organism.instance_url,
-                      "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
-                      "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
-
-        sp_workflow_dict[genus_species] = {strain_sex: attributes}
+        if workflow_type == "interpro":
 
-    if workflow_type == "interpro":
-        run_workflow_for_current_organism.set_galaxy_instance()
+            ids = run_workflow_for_current_organism.add_organism_interproscan_analysis()
 
-        history_id = run_workflow_for_current_organism.set_history()
+            org_id = ids["org_id"]
+            interpro_analysis_id = ids["interpro_analysis_id"]
+            hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
 
-        run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
-        ids = run_workflow_for_current_organism.add_organism_interproscan_analysis()
-
-        org_id = ids["org_id"]
-        interpro_analysis_id = ids["interpro_analysis_id"]
-        hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
-
-        strain_sex = "{0}_{1}".format(run_workflow_for_current_organism.strain, run_workflow_for_current_organism.sex)
-        genus_species = run_workflow_for_current_organism.genus_species
-
-        # Create the dictionary holding all attributes needed to connect to the galaxy instance
-        attributes = {"genus": run_workflow_for_current_organism.genus,
-                      "species": run_workflow_for_current_organism.species,
-                      "genus_species": run_workflow_for_current_organism.genus_species,
-                      "full_name": run_workflow_for_current_organism.full_name,
-                      "species_folder_name": run_workflow_for_current_organism.species_folder_name,
-                      "sex": run_workflow_for_current_organism.sex,
-                      "strain": run_workflow_for_current_organism.strain,
-                      "org_id": org_id,
-                      "interpro_analysis_id": interpro_analysis_id,
-                      "hda_ids": hda_ids,
-                      "history_id": history_id,
-                      "instance": run_workflow_for_current_organism.instance,
-                      "instance_url": run_workflow_for_current_organism.instance_url,
-                      "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
-                      "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
-
-        sp_workflow_dict[genus_species] = {strain_sex: attributes}
+            # Create the dictionary holding all attributes needed to connect to the galaxy instance
+            param = {"genus": run_workflow_for_current_organism.genus,
+                     "species": run_workflow_for_current_organism.species,
+                     "genus_species": run_workflow_for_current_organism.genus_species,
+                     "full_name": run_workflow_for_current_organism.full_name,
+                     "species_folder_name": run_workflow_for_current_organism.species_folder_name,
+                     "sex": run_workflow_for_current_organism.sex,
+                     "strain": run_workflow_for_current_organism.strain,
+                     "org_id": org_id,
+                     "interpro_analysis_id": interpro_analysis_id,
+                     "hda_ids": hda_ids,
+                     "history_id": history_id,
+                     "instance": run_workflow_for_current_organism.instance,
+                     "instance_url": run_workflow_for_current_organism.instance_url,
+                     "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
+                     "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
 
-    else:
-        logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.full_name)
-        sys.exit()
+        sp_wf_param = RunWorkflowParam(
+            genus_species=run_workflow_for_current_organism.genus_species,
+            strain_sex=run_workflow_for_current_organism.strain_sex,
+            param_dict=param
+        )
+
+        return sp_wf_param
 
 
 def install_changesets_revisions_from_workflow(instance, workflow_path):
@@ -957,29 +942,31 @@ if __name__ == "__main__":
         for sp_dict in sp_dict_list:
 
             # Add and retrieve all analyses/organisms for the current input species and add their IDs to the input dictionary
-            current_sp_workflow_dict = create_sp_workflow_dict(
+            sp_workflow_attributes = get_sp_workflow_param(
                 sp_dict,
                 main_dir=main_dir,
                 config=config,
                 workflow_type=phaoexplorer_constants.WORKFLOW_LOAD_FASTA_GFF_JBROWSE)
 
-            current_sp_key = list(current_sp_workflow_dict.keys())[0]
-            current_sp_value = list(current_sp_workflow_dict.values())[0]
-            current_sp_strain_sex_key = list(current_sp_value.keys())[0]
-            current_sp_strain_sex_value = list(current_sp_value.values())[0]
+            current_sp_genus_species = sp_workflow_attributes.genus_species
+            current_sp_strain_sex = sp_workflow_attributes.strain_sex
+            current_sp_strain_sex_attributes_dict = sp_workflow_attributes.param_dict
 
             # Add the species dictionary to the complete dictionary
             # This dictionary contains every organism present in the input file
             # Its structure is the following:
             # {genus species: {strain1_sex1: {variables_key: variables_values}, strain1_sex2: {variables_key: variables_values}}}
-            if not current_sp_key in all_sp_workflow_dict.keys():
-                all_sp_workflow_dict[current_sp_key] = current_sp_value
+            if not current_sp_genus_species in all_sp_workflow_dict.keys():
+                all_sp_workflow_dict[current_sp_genus_species] = {current_sp_strain_sex: current_sp_strain_sex_attributes_dict}
             else:
-                all_sp_workflow_dict[current_sp_key][current_sp_strain_sex_key] = current_sp_strain_sex_value
+                if not current_sp_strain_sex in all_sp_workflow_dict[current_sp_genus_species].keys():
+                    all_sp_workflow_dict[current_sp_genus_species][current_sp_strain_sex] = current_sp_strain_sex_attributes_dict
+                else:
+                    logging.error("Duplicate organism with 'genus_species' = '{0}' and 'strain_sex' = '{1}'".format(current_sp_genus_species, current_sp_strain_sex))
 
-        for k, v in all_sp_workflow_dict.items():
-            if len(list(v.keys())) == 1:
-                logging.info("Input organism %s: 1 species detected in input dictionary" % k)
+        for species, strains in all_sp_workflow_dict.items():
+            if len(list(strains.keys())) == 1:
+                logging.info("Input species %s: 1 strain detected in input dictionary" % species)
 
                 # Set workflow path (1 organism)
                 workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-chado_load_tripal_synchronize_jbrowse_1org_v4.ga")
@@ -988,7 +975,7 @@ if __name__ == "__main__":
                 instance_url, email, password = None, None, None
 
                 # Set the galaxy instance variables
-                for k2, v2 in v.items():
+                for k2, v2 in strains.items():
                     instance_url = v2["instance_url"]
                     email = v2["email"]
                     password = v2["password"]
@@ -998,11 +985,8 @@ if __name__ == "__main__":
                 # Check if the versions of tools specified in the workflow are installed in galaxy
                 install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
-                organism_key_name = list(v.keys())
-                org_dict = v[organisms_key_name[0]]
-
-                # print("\n")
-                # print(org_dict)
+                organisms_key_name = list(strains.keys())
+                org_dict = strains[organisms_key_name[0]]
 
                 history_id = org_dict["history_id"]
@@ -1121,9 +1105,9 @@ if __name__ == "__main__":
 
                 logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url))
 
-            if len(list(v.keys())) == 2:
+            if len(list(strains.keys())) == 2:
 
-                logging.info("Input organism %s: 2 species detected in input dictionary" % k)
+                logging.info("Input species %s: 2 strains detected in input dictionary" % species)
 
                 # Set workflow path (2 organisms)
                 workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-chado_load_tripal_synchronize_jbrowse_2org_v4.ga")
@@ -1132,7 +1116,7 @@ if __name__ == "__main__":
                 instance_url, email, password = None, None, None
 
                 # Set the galaxy instance variables
-                for k2, v2 in v.items():
+                for k2, v2 in strains.items():
                     instance_url = v2["instance_url"]
                     email = v2["email"]
                     password = v2["password"]
@@ -1143,9 +1127,9 @@ if __name__ == "__main__":
                 install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
                 # Get key names from the current organism (item 1 = organism 1, item 2 = organism 2)
-                organisms_key_names = list(v.keys())
-                org1_dict = v[organisms_key_names[0]]
-                org2_dict = v[organisms_key_names[1]]
+                organisms_key_names = list(strains.keys())
+                org1_dict = strains[organisms_key_names[0]]
+                org2_dict = strains[organisms_key_names[1]]
 
                 history_id = org1_dict["history_id"]
@@ -1360,24 +1344,23 @@ if __name__ == "__main__":
         for sp_dict in sp_dict_list:
 
             # Add and retrieve all analyses/organisms for the current input species and add their IDs to the input dictionary
-            current_sp_workflow_dict = create_sp_workflow_dict(sp_dict, main_dir=args.main_directory, config=config, workfow_type="blast")
+            sp_workflow_attributes = get_sp_workflow_param(sp_dict, main_dir=args.main_directory, config=config, workflow_type="blast")
 
-            current_sp_key = list(current_sp_workflow_dict.keys())[0]
-            current_sp_value = list(current_sp_workflow_dict.values())[0]
-            current_sp_strain_sex_key = list(current_sp_value.keys())[0]
-            current_sp_strain_sex_value = list(current_sp_value.values())[0]
+            current_sp_genus_species = sp_workflow_attributes.genus_species
+            current_sp_strain_sex = sp_workflow_attributes.strain_sex
+            current_sp_strain_sex_attributes_dict = sp_workflow_attributes.param_dict
 
             # Add the species dictionary to the complete dictionary
             # This dictionary contains every organism present in the input file
            # Its structure is the following:
             # {genus species: {strain1_sex1: {variables_key: variables_values}, strain1_sex2: {variables_key: variables_values}}}
-            if not current_sp_key in all_sp_workflow_dict.keys():
-                all_sp_workflow_dict[current_sp_key] = current_sp_value
+            if not current_sp_genus_species in all_sp_workflow_dict.keys():
+                all_sp_workflow_dict[current_sp_genus_species] = {current_sp_strain_sex: current_sp_strain_sex_attributes_dict}
             else:
-                all_sp_workflow_dict[current_sp_key][current_sp_strain_sex_key] = current_sp_strain_sex_value
+                all_sp_workflow_dict[current_sp_genus_species][current_sp_strain_sex] = current_sp_strain_sex_attributes_dict
 
-            if len(list(v.keys())) == 1:
-                logging.info("Input organism %s: 1 species detected in input dictionary" % k)
+            if len(list(strains.keys())) == 1:
+                logging.info("Input species %s: 1 strain detected in input dictionary" % species)
 
                 # Set workflow path (1 organism)
                 workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-load_blast_results_1org_v1.ga")
@@ -1386,7 +1370,7 @@ if __name__ == "__main__":
                 instance_url, email, password = None, None, None
 
                 # Set the galaxy instance variables
-                for k2, v2 in v.items():
+                for k2, v2 in strains.items():
                     instance_url = v2["instance_url"]
                     email = v2["email"]
                     password = v2["password"]
@@ -1396,8 +1380,8 @@ if __name__ == "__main__":
                 # Check if the versions of tools specified in the workflow are installed in galaxy
                 install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
-                organism_key_name = list(v.keys())
-                org_dict = v[organisms_key_name[0]]
+                organisms_key_name = list(strains.keys())
+                org_dict = strains[organisms_key_name[0]]
 
                 history_id = org_dict["history_id"]
@@ -1473,9 +1457,9 @@ if __name__ == "__main__":
 
-            if len(list(v.keys())) == 2:
+            if len(list(strains.keys())) == 2:
 
-                logging.info("Input organism %s: 2 species detected in input dictionary" % k)
+                logging.info("Input species %s: 2 strains detected in input dictionary" % species)
 
                 # Set workflow path (2 organisms)
                 workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-load_blast_results_2org_v1.ga")
@@ -1484,7 +1468,7 @@ if __name__ == "__main__":
                 instance_url, email, password = None, None, None
 
                 # Set the galaxy instance variables
-                for k2, v2 in v.items():
+                for k2, v2 in strains.items():
                     instance_url = v2["instance_url"]
                     email = v2["email"]
                     password = v2["password"]
@@ -1494,9 +1478,9 @@ if __name__ == "__main__":
                 # Check if the versions of tools specified in the workflow are installed in galaxy
                 install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
-                organisms_key_names = list(v.keys())
-                org1_dict = v[organisms_key_names[0]]
-                org2_dict = v[organisms_key_names[1]]
+                organisms_key_names = list(strains.keys())
+                org1_dict = strains[organisms_key_names[0]]
+                org2_dict = strains[organisms_key_names[1]]
 
                 history_id = org1_dict["history_id"]
@@ -1630,24 +1614,23 @@ if __name__ == "__main__":
         for sp_dict in sp_dict_list:
 
             # Add and retrieve all analyses/organisms for the current input species and add their IDs to the input dictionary
-            current_sp_workflow_dict = create_sp_workflow_dict(sp_dict, main_dir=args.main_directory, config=config, workfow_type="blast")
+            sp_workflow_attributes = get_sp_workflow_param(sp_dict, main_dir=args.main_directory, config=config, workflow_type="blast")
 
-            current_sp_key = list(current_sp_workflow_dict.keys())[0]
-            current_sp_value = list(current_sp_workflow_dict.values())[0]
-            current_sp_strain_sex_key = list(current_sp_value.keys())[0]
-            current_sp_strain_sex_value = list(current_sp_value.values())[0]
+            current_sp_genus_species = sp_workflow_attributes.genus_species
+            current_sp_strain_sex = sp_workflow_attributes.strain_sex
+            current_sp_strain_sex_attributes_dict = sp_workflow_attributes.param_dict
 
             # Add the species dictionary to the complete dictionary
             # This dictionary contains every organism present in the input file
             # Its structure is the following:
             # {genus species: {strain1_sex1: {variables_key: variables_values}, strain1_sex2: {variables_key: variables_values}}}
-            if not current_sp_key in all_sp_workflow_dict.keys():
-                all_sp_workflow_dict[current_sp_key] = current_sp_value
+            if not current_sp_genus_species in all_sp_workflow_dict.keys():
+                all_sp_workflow_dict[current_sp_genus_species] = {current_sp_strain_sex: current_sp_strain_sex_attributes_dict}
             else:
-                all_sp_workflow_dict[current_sp_key][current_sp_strain_sex_key] = current_sp_strain_sex_value
+                all_sp_workflow_dict[current_sp_genus_species][current_sp_strain_sex] = current_sp_strain_sex_attributes_dict
 
-            if len(list(v.keys())) == 1:
-                logging.info("Input organism %s: 1 species detected in input dictionary" % k)
+            if len(list(strains.keys())) == 1:
+                logging.info("Input species %s: 1 strain detected in input dictionary" % species)
 
                 # Set workflow path (1 organism)
                 workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-load_blast_results_1org_v1.ga")
@@ -1656,7 +1640,7 @@ if __name__ == "__main__":
                 instance_url, email, password = None, None, None
 
                 # Set the galaxy instance variables
-                for k2, v2 in v.items():
+                for k2, v2 in strains.items():
                     instance_url = v2["instance_url"]
                     email = v2["email"]
                     password = v2["password"]
@@ -1666,8 +1650,8 @@ if __name__ == "__main__":
                 # Check if the versions of tools specified in the workflow are installed in galaxy
                 install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
-                organism_key_name = list(v.keys())
-                org_dict = v[organisms_key_name[0]]
+                organisms_key_name = list(strains.keys())
+                org_dict = strains[organisms_key_name[0]]
 
                 history_id = org_dict["history_id"]
@@ -1743,9 +1727,9 @@ if __name__ == "__main__":
 
-            if len(list(v.keys())) == 2:
+            if len(list(strains.keys())) == 2:
 
-                logging.info("Input organism %s: 2 species detected in input dictionary" % k)
+                logging.info("Input species %s: 2 strains detected in input dictionary" % species)
 
                 # Set workflow path (2 organisms)
                 workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-load_blast_results_2org_v1.ga")
@@ -1754,7 +1738,7 @@ if __name__ == "__main__":
                 instance_url, email, password = None, None, None
 
                 # Set the galaxy instance variables
-                for k2, v2 in v.items():
+                for k2, v2 in strains.items():
                     instance_url = v2["instance_url"]
                     email = v2["email"]
                     password = v2["password"]
@@ -1764,9 +1748,9 @@ if __name__ == "__main__":
                 # Check if the versions of tools specified in the workflow are installed in galaxy
                 install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
-                organisms_key_names = list(v.keys())
-                org1_dict = v[organisms_key_names[0]]
-                org2_dict = v[organisms_key_names[1]]
+                organisms_key_names = list(strains.keys())
+                org1_dict = strains[organisms_key_names[0]]
+                org2_dict = strains[organisms_key_names[1]]
 
                 history_id = org1_dict["history_id"]
diff --git a/speciesData.py b/speciesData.py
index 83626b70e1289a72eff98172a08af91962c490ec..6cc5bb0e2941ec83e0dfd002080d39086b46b990 100755
--- a/speciesData.py
+++ b/speciesData.py
@@ -116,6 +116,15 @@ class SpeciesData:
         else:
             self.dataset_prefix = self.genus[0].lower() + "_" + self.species_lowercase
 
+        self.genome_filename = "{0}_v{1}.fasta".format(self.dataset_prefix, self.genome_version)
+        self.gff_filename = "{0}_OGS{1}_{2}.gff".format(self.dataset_prefix, self.ogs_version, constants.DATA_DATE)
+        self.transcripts_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_TRANSCRIPTS)
+        self.proteins_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_PROTEINS)
+        self.interpro_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_INTERPRO)
+        self.blastp_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_BLASTP)
+        self.blastx_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_BLASTX)
+        self.orthofinder_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_ORTHOFINDER)
+
         # Bioblend/Chado IDs for an organism analyses/organisms/datasets/history/library
         self.org_id = None
         self.genome_analysis_id = None
@@ -131,8 +140,6 @@ class SpeciesData:
         self.species_dir = None
         self.tool_panel = None
 
-        self.datasets = dict()
-        self.datasets_name = dict()
         self.source_files = dict()
         self.workflow_name = None
         self.metadata = dict()