From 8d36dd970b44d4310801be8b2e9a557bc8967500 Mon Sep 17 00:00:00 2001 From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr> Date: Fri, 19 Mar 2021 12:54:00 +0100 Subject: [PATCH] src_data dir tree created in gga_get_data --- gga_get_data.py | 60 +++++++++++++++++++++++++++++++++++ gga_init.py | 34 ++------------------ run_workflow_phaeoexplorer.py | 28 ++++++++++------ speciesData.py | 1 - utilities.py | 5 +++ 5 files changed, 85 insertions(+), 43 deletions(-) diff --git a/gga_get_data.py b/gga_get_data.py index 4494f9a..3b92591 100644 --- a/gga_get_data.py +++ b/gga_get_data.py @@ -53,6 +53,45 @@ class GetData(speciesData.SpeciesData): return 1 + + def make_directory_tree(self): + """ + Generate the directory tree for an organism + + :return: + """ + + os.chdir(self.main_dir) + + try: + os.chdir(self.species_dir) + except OSError as exc: + logging.critical("Cannot access %s" % self.genus_species) + sys.exit(exc) + + # Creation (or updating) of the src_data directory tree + try: + os.mkdir("./src_data") + except FileExistsError: + logging.debug("'src_data' directory already exist for %s" % self.full_name) + except PermissionError as exc: + logging.critical("Insufficient permission to create src_data directory tree") + sys.exit(exc) + + # List of all the directories to create in src_data + src_data_dirs_li = ["./src_data", "./src_data/annotation", "./src_data/genome", "./src_data/tracks", + "./src_data/annotation/%s" % self.species_folder_name, + "./src_data/genome/%s" % self.species_folder_name, + "./src_data/annotation/{0}/OGS{1}/".format(self.species_folder_name, self.ogs_version), + "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version)] + make_dirs(dir_paths_li=src_data_dirs_li) + + # Return to main directory + os.chdir(self.main_dir) + + logging.info("src_data directory tree generated for %s" % self.full_name) + + def get_source_data_files_from_path(self): """ Find source data files and copy them into the src_data dir tree @@ -106,6 +145,27 @@ class GetData(speciesData.SpeciesData): os.chdir(self.main_dir) +def make_dirs(dir_paths_li): + """ + Recursively create directories from a list of paths with a try-catch condition + + :param dir_paths_li: + :return: + """ + created_dir_paths_li = [] + + for dir_path in dir_paths_li: + try: + os.mkdir(dir_path) + except FileExistsError: + logging.debug("%s directory already exists" % dir_path) + except PermissionError as exc: + logging.critical("Insufficient permission to create %s" % dir_path) + sys.exit(exc) + created_dir_paths_li.append(dir_path) + + return created_dir_paths_li + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction " "with galaxy instances for GGA" diff --git a/gga_init.py b/gga_init.py index 8613d0e..30ec55d 100644 --- a/gga_init.py +++ b/gga_init.py @@ -85,23 +85,6 @@ class DeploySpeciesStack(speciesData.SpeciesData): logging.critical("Cannot edit NginX conf file") sys.exit(exc) - # Creation (or updating) of the src_data directory tree - try: - os.mkdir("./src_data") - except FileExistsError: - logging.debug("'src_data' directory already exist for %s" % self.full_name) - except PermissionError as exc: - logging.critical("Insufficient permission to create src_data directory tree") - sys.exit(exc) - - # List of all the directories to create in src_data - src_data_dirs_li = ["./src_data", "./src_data/annotation", "./src_data/genome", "./src_data/tracks", - "./src_data/annotation/%s" % self.species_folder_name, - "./src_data/genome/%s" % self.species_folder_name, - "./src_data/annotation/{0}/OGS{1}/".format(self.species_folder_name, self.ogs_version), - "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version)] - make_dirs(dir_paths_li=src_data_dirs_li) - # Return to main directory os.chdir(self.main_dir) @@ -161,6 +144,7 @@ class DeploySpeciesStack(speciesData.SpeciesData): def make_orthology_compose_files(self): """ + Create/update orthology compose files :return: """ @@ -324,6 +308,7 @@ def deploy_stacks(input_list, main_dir, deploy_traefik): os.chdir(main_dir) # Get species for which to deploy the stacks + # Uses the get_unique_species_list method from utilities to deploy a stack only for the "species" level (i.e genus_species) to_deploy_species_li = utilities.get_unique_species_list(sp_dict_list=input_list) if deploy_traefik: @@ -354,21 +339,6 @@ def deploy_stacks(input_list, main_dir, deploy_traefik): stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=".") os.chdir(main_dir) - # # Using deploy.sh script (obsolete) - # Launch and update docker stacks - # noinspection PyArgumentList - # deploy_stacks_popen = subprocess.Popen(["sh", self.script_dir + "/deploy.sh", self.genus_species, - # self.main_dir + "/traefik"], - # stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - # universal_newlines=True) - # - # for stdout_line in iter(deploy_stacks_popen.stdout.readline, ""): - # if "daemon" in stdout_line: # Ignore swarm init error output - # pass - # else: - # logging.info("\t%s" % stdout_line.strip()) - # deploy_stacks_popen.stdout.close() - if __name__ == "__main__": parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction " diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py index 2857feb..4f73193 100644 --- a/run_workflow_phaeoexplorer.py +++ b/run_workflow_phaeoexplorer.py @@ -252,6 +252,18 @@ class RunWorkflow(speciesData.SpeciesData): logging.info("Individual tools versions and changesets validated") + + def tripal_synchronize_organism_analyses(self): + """ + """ + show_tool_tripal_sync = self.instance.tools.show_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/3.2.1.0", io_details=True) + org_sync = "toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/3.2.1.0" + org_sync = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/tripal_organism_sync/organism_sync/3.2.1.0", + history_id=self.history_id, + tool_inputs={"organism_id": "2"}) + org_sync_job_out = org_sync["outputs"] + + def add_organism_ogs_genome_analyses(self): """ Add OGS and genome vX analyses to Chado database @@ -477,6 +489,8 @@ class RunWorkflow(speciesData.SpeciesData): # as runtime values, using runtime parameters makes the tool throw an internal critical error ("replace not found" error) if workflow_name == "Jbrowse": workflow_dict["steps"]["2"]["tool_state"] = workflow_dict["steps"]["2"]["tool_state"].replace("__MENU_URL__", "https://{host}:{port}/sp/{genus_sp}/feature/{Genus}/{species}/{id}".format(host=self.config["host"], port=self.config["https_port"], genus_sp=self.genus_species, Genus=self.genus_uppercase, species=self.species, id="{id}")) + # The UNIQUE_ID is specific to a combination genus_species_strain_sex so every combination should have its unique workflow + # in galaxy --> define a naming method for these workflows workflow_dict["steps"]["3"]["tool_state"] = workflow_dict["steps"]["3"]["tool_state"].replace("__FULL_NAME__", self.full_name).replace("__UNIQUE_ID__", self.abbreviation) # Import the workflow in galaxy as a dict @@ -561,23 +575,16 @@ class RunWorkflow(speciesData.SpeciesData): # Iterating over the folders to find datasets and map datasets to their IDs logging.debug("Datasets IDs: ") for k, v in folders_ids.items(): - # print(self.full_name) - # print(self.species_folder_name) - # print(k) - # print(v) if k == "/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version): sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) for k2, v2 in sub_folder_content.items(): for e in v2: if type(e) == dict: - # print(e) if e["name"].endswith(".fa"): self.datasets["genome_file"] = e["ldda_id"] self.datasets_name["genome_file"] = e["name"] logging.debug("\tGenome file:\t" + e["name"] + ": " + e["ldda_id"]) - # print("Genome file:\t" + e["name"] + ": " + e["ldda_id"]) - # print(self.species_folder_name) if k == "/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version): sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) for k2, v2 in sub_folder_content.items(): @@ -652,7 +659,6 @@ class RunWorkflow(speciesData.SpeciesData): # there is no way to know which imported datasets are the correct ones depending on history content # it's not currently used history_datasets_li = self.instance.datasets.get_datasets() - print(history_datasets_li) genome_dataset_hda_id, gff_dataset_hda_id, transcripts_dataset_hda_id, proteins_datasets_hda_id = None, None, None, None interproscan_dataset_hda_id, blast_diamond_dataset_hda_id = None, None @@ -839,7 +845,7 @@ if __name__ == "__main__": # Parse the config yaml file run_workflow_for_current_organism.config = utilities.parse_config(args.config) - # Set the instance url attribute + # Set the instance url attribute --> TODO: the localhost rule in the docker-compose still doesn't work on scratchgmodv1 run_workflow_for_current_organism.instance_url = "http://scratchgmodv1:{0}/sp/{1}_{2}/galaxy/".format( run_workflow_for_current_organism.config["http_port"], run_workflow_for_current_organism.genus_lowercase, @@ -863,11 +869,13 @@ if __name__ == "__main__": run_workflow_for_current_organism.get_genome_analysis_id() run_workflow_for_current_organism.get_ogs_analysis_id() + # run_workflow_for_current_organism.tripal_synchronize_organism_analyses() # Get the attributes of the instance and project data files run_workflow_for_current_organism.get_instance_attributes() # Import datasets into history and retrieve their hda IDs + # TODO: can be simplified with direct access to the folder contents via the full path (no loop required) hda_ids = run_workflow_for_current_organism.import_datasets_into_history() # DEBUG @@ -918,7 +926,7 @@ if __name__ == "__main__": # run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "hda", # "id": hda_ids["gff_hda_id"]} - # Ensures galaxy has time to retrieve + # Ensures galaxy has had time to retrieve datasets time.sleep(60) # Run the Chado load Tripal sync workflow with the parameters set above run_workflow_for_current_organism.run_workflow(workflow_path=workflow, diff --git a/speciesData.py b/speciesData.py index dc6d2a4..238b28e 100644 --- a/speciesData.py +++ b/speciesData.py @@ -80,7 +80,6 @@ class SpeciesData: self.datasets = dict() self.config = None # Custom config used to set environment variables inside containers, defaults to the one in the repo - self.source_data_dir = parameters_dictionary["data"]["parent_directory"] self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase.lower(), self.species.lower(), self.sex.lower()])["not_empty"]) self.existing_folders_cache = {} self.bam_metadata_cache = {} diff --git a/utilities.py b/utilities.py index 6885afb..f9f561d 100644 --- a/utilities.py +++ b/utilities.py @@ -170,6 +170,11 @@ def get_unique_species_dict(sp_dict_list): return unique_species_dict +def get_unique_analysis(sp_dict_list): + + + return 0 + def write_metadata(metadata_file, metadata_dict): """ -- GitLab