diff --git a/gga_get_data.py b/gga_get_data.py
index f936657600ce098e1a9359ff524390e3467ffc70..e9eb11e50fbd4470d739872a6f86fbffd3ac833b 100644
--- a/gga_get_data.py
+++ b/gga_get_data.py
@@ -55,11 +55,7 @@ class GetData(speciesData.SpeciesData):
 
     def get_source_data_files_from_path(self):
         """
-        Find source data files in the parent_directory
-        Link data files
-
-        TODO: manage access to the "parent directory" subdirectories properly
-        TODO: implement search/tests for individual file paths
+        Find source data files and copy them into the src_data dir tree
 
        :return:
        """
@@ -92,11 +88,17 @@ class GetData(speciesData.SpeciesData):
         # Copy dataset in the organism src_data dir tree correct folder
         for k, v in datasets_to_get.items():
             if k in genome_datasets:
-                shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, os.path.basename(v)))
-                logging.info("Copied {0} into {1}".format(v, organism_genome_dir))
+                logging.info("Copying {0} into {1}".format(v, organism_genome_dir))
+                try:
+                    shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, os.path.basename(v)))
+                except Exception as exc:
+                    logging.warning("Could not copy {0} ({1})".format(v, exc))
             elif k in annotation_datasets:
-                shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, os.path.basename(v)))
-                logging.info("Copied {0} into {1}".format(v, organism_annotation_dir))
+                logging.info("Copying {0} into {1}".format(v, organism_annotation_dir))
+                try:
+                    shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, os.path.basename(v)))
+                except Exception as exc:
+                    logging.warning("Could not copy {0} ({1})".format(v, exc))
             else:
                 pass
 
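# Reviewer sketch (not repository code): the hunk above switches from "copy then log" to
# "log then copy inside try/except", so one unreadable file no longer aborts the whole run.
# A minimal, self-contained version of that pattern; the helper name copy_dataset and the
# example paths are hypothetical.
import logging
import os
import shutil

def copy_dataset(src_path, dest_dir):
    """Copy one source data file into dest_dir, logging a warning instead of raising on failure."""
    logging.info("Copying {0} into {1}".format(src_path, dest_dir))
    try:
        shutil.copyfile(os.path.abspath(src_path), os.path.join(dest_dir, os.path.basename(src_path)))
    except OSError as exc:
        logging.warning("Could not copy {0} ({1})".format(src_path, exc))

# Example: copy_dataset("/data/genome_v1.fa", "./src_data/genome/genus_species/v1.0")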
diff --git a/gga_init.py b/gga_init.py
index 95ee78171a08c8cb3c89304601b368854a3ab415..9c990af4106b4846d5b1c83fb71119bd5fa1ed7c 100644
--- a/gga_init.py
+++ b/gga_init.py
@@ -28,7 +28,6 @@
 TODO
 """
 
-
 class DeploySpeciesStack(speciesData.SpeciesData):
     """
     Child of SpeciesData
@@ -40,7 +39,7 @@ class DeploySpeciesStack(speciesData.SpeciesData):
 
     def make_directory_tree(self):
         """
-        Generate the directory tree for an organism and move datasets into src_data
+        Generate the directory tree for an organism
 
        :return:
        """
@@ -73,9 +72,12 @@ class DeploySpeciesStack(speciesData.SpeciesData):
             else:
                 logging.debug("Using default banner for Tripal pages")
                 self.config.pop("banner_path", None)
+        else:
+            logging.debug("Using default banner for Tripal pages")
+            self.config.pop("banner_path", None)
 
         # Create nginx dirs and write/re-write nginx conf
-        self.make_dirs(dir_paths_li=["./nginx", "./nginx/conf"])
+        make_dirs(dir_paths_li=["./nginx", "./nginx/conf"])
         try:
             with open(os.path.abspath("./nginx/conf/default.conf"), 'w') as conf:
                 conf.write("server {\n\tlisten 80;\n\tserver_name ~.;\n\tlocation /download/ {\n\t\talias /project_data/; \n\t\tautoindex on;\n\t}\n}")  # The species nginx conf
@@ -92,46 +94,21 @@ class DeploySpeciesStack(speciesData.SpeciesData):
             logging.critical("Insufficient permission to create src_data directory tree")
             sys.exit(exc)
 
-
-        print(self.strain)
-        print(self.species_folder_name)
-
         # List of all the directories to create in src_data
         src_data_dirs_li = ["./src_data", "./src_data/annotation", "./src_data/genome", "./src_data/tracks",
                             "./src_data/annotation/%s" % self.species_folder_name,
                             "./src_data/genome/%s" % self.species_folder_name,
                             "./src_data/annotation/{0}/OGS{1}/".format(self.species_folder_name, self.ogs_version),
                             "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version)]
-        self.make_dirs(dir_paths_li=src_data_dirs_li)
+        make_dirs(dir_paths_li=src_data_dirs_li)
 
         # Return to main directory
         os.chdir(self.main_dir)
 
         logging.info("Directory tree generated for %s" % self.full_name)
 
-    @staticmethod
-    def make_dirs(dir_paths_li):
-        """
-        Recursively create directories from a list of paths with a try-catch condition
-
-        :param dir_paths_li:
-        :return:
-        """
-        created_dir_paths_li = []
-
-        for dir_path in dir_paths_li:
-            try:
-                os.mkdir(dir_path)
-            except FileExistsError:
-                logging.debug("%s directory already exists" % dir_path)
-            except PermissionError as exc:
-                logging.critical("Insufficient permission to create %s" % dir_path)
-                sys.exit(exc)
-            created_dir_paths_li.append(dir_path)
-
-        return created_dir_paths_li
-
-    def make_compose_files(self, force=False):
+    def make_compose_files(self):
         """
         Create a formatted copy of the template compose file inside a species directory tree
 
@@ -167,80 +144,105 @@ class DeploySpeciesStack(speciesData.SpeciesData):
             gspecies_compose_file.write(gspecies_compose_output)
 
         # Create the volumes (directory) of the species docker-compose file
-        self.create_mounts(working_dir=".")
+        create_mounts(working_dir=".", main_dir=self.main_dir)
 
         # Return to main directory
         os.chdir(self.main_dir)
 
-    def make_traefik_compose_files(self):
+
+    def make_orthology_compose_files(self):
         """
-        Create or update the traefik docker-compose file and authelia conf files
-        Will only write new authelia conf files if the argument "--overwrite-all" is specified or
-        the authelia directory doesn't contain conf files
 
        :return:
        """
 
         os.chdir(self.main_dir)
 
+        make_dirs(["./orthology", "./orthology/src_data", "./orthology/src_data/genomes",
+                   "./orthology/src_data/gff", "./orthology/src_data/newicks", "./orthology/src_data/proteomes"])
+
+
+def make_dirs(dir_paths_li):
+    """
+    Recursively create directories from a list of paths with a try-catch condition
+
+    :param dir_paths_li:
+    :return:
+    """
+    created_dir_paths_li = []
+
+    for dir_path in dir_paths_li:
+        try:
+            os.mkdir(dir_path)
+        except FileExistsError:
+            logging.debug("%s directory already exists" % dir_path)
+        except PermissionError as exc:
+            logging.critical("Insufficient permission to create %s" % dir_path)
+            sys.exit(exc)
+        created_dir_paths_li.append(dir_path)
+
+    return created_dir_paths_li
+
+def make_traefik_compose_files(config, main_dir):
+    """
+    Create or update the traefik directory, docker-compose file and authelia conf files
+    Only called when the argument "--traefik" is specified
+
+    :param config:
+    :param main_dir:
+    :return:
+    """
+
+    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
+    render_vars = config
+
+    os.chdir(main_dir)
+
     # Create directory tree
-        self.make_dirs(["./traefik", "./traefik/authelia"])
+    make_dirs(["./traefik", "./traefik/authelia"])
 
     # Render and try to write the traefik docker-compose file
     # This new docker-compose file will not overwrite the one already present in the traefik dir
     # unless the argument "--overwrite-all" is specified
     # Jinja2 templating, handled using the python "jinja2" module
-        file_loader = FileSystemLoader(self.script_dir + "/templates")
+    file_loader = FileSystemLoader(script_dir + "/templates")
     env = Environment(loader=file_loader)
 
-        if not os.path.isfile("./traefik/docker-compose.yml") or force:
+    if not os.path.isfile("./traefik/docker-compose.yml"):
         traefik_compose_template = env.get_template("traefik_compose_template.yml.j2")
         traefik_compose_output = traefik_compose_template.render(render_vars)
-        with open(os.path.join(self.main_dir, "docker-compose.yml"), 'w') as traefik_compose_file:
+        with open(os.path.join(main_dir, "traefik/docker-compose.yml"), 'w') as traefik_compose_file:
             logging.info("Writing traefik docker-compose.yml")
             traefik_compose_file.truncate(0)
             traefik_compose_file.write(traefik_compose_output)
 
-        if self.config["authelia_config_path"]:
-            if not self.config["authelia_config_path"] == "" or not self.config["authelia_config_path"] == "/path/to/authelia/config":
-                if os.path.isfile(os.path.abspath(self.config["authelia_config_path"])):
+    if config["authelia_config_path"]:
+        if not config["authelia_config_path"] == "" or not config["authelia_config_path"] == "/path/to/authelia/config":
+            if os.path.isfile(os.path.abspath(config["authelia_config_path"])):
                 try:
-                    shutil.copy(os.path.abspath(self.config["authelia_config_path"]), "./traefik/authelia")
+                    shutil.copy(os.path.abspath(config["authelia_config_path"]), "./traefik/authelia")
                 except Exception as exc:
-                    logging.critical("Cannot copy custom Authelia config file (%s)" % self.config["authelia_config_path"])
+                    logging.critical("Cannot copy custom Authelia config file (%s)" % config["authelia_config_path"])
                     sys.exit(exc)
             else:
-                logging.critical("Custom Authelia config file not found (%s)" % self.config["authelia_config_path"])
+                logging.critical("Custom Authelia config file not found (%s)" % config["authelia_config_path"])
 
     # Path to the authelia users in the repo
-        authelia_users_path = self.script_dir + "/templates/authelia_users_template.yml"
+    authelia_users_path = script_dir + "/templates/authelia_users_template.yml"
     # Copy authelia "users" file
-        if not os.path.isfile("./traefik/authelia/users.yml") or force:
+    if not os.path.isfile("./traefik/authelia/users.yml"):
         shutil.copy(authelia_users_path, "./traefik/authelia/users.yml")
 
     # Create the mounts for the traefik and authelia services
-        traefik_dir = os.path.abspath(os.path.join(self.main_dir, "traefik"))
-        if not os.path.isdir(os.path.join(traefik_dir, "docker_data")) or force:
-            self.create_mounts(working_dir=traefik_dir)
+    traefik_dir = os.path.abspath(os.path.join(main_dir, "traefik"))
+    if not os.path.isdir(os.path.join(traefik_dir, "docker_data")):
+        create_mounts(working_dir=traefik_dir, main_dir=main_dir)
 
     # Return to main directory
-        os.chdir(self.main_dir)
-
-
-    def make_orthology_compose_files(self):
-        """
-
-        :return:
-        """
-
-        os.chdir(self.main_dir)
-
-        self.make_dirs["./orthology", "./orthology/src_data", "./orthology/src_data/genomes",
-                       "./orthology/src_data/gff", "./orthology/src_data/newicks", "./orthology/src_data/proteomes"]
-
+    os.chdir(main_dir)
 
-    def create_mounts(self, working_dir):
+def create_mounts(working_dir, main_dir):
     """
     Create the folders (volumes) required by a container (to see required volumes, check their compose file)
 
@@ -285,12 +287,11 @@ class DeploySpeciesStack(speciesData.SpeciesData):
 
         # Go back to the "main" directory
         try:
-            os.chdir(os.path.abspath(self.main_dir))
+            os.chdir(os.path.abspath(main_dir))
         except OSError as exc:
-            logging.critical("Cannot access %s, exiting" % self.main_dir)
+            logging.critical("Cannot access %s, exiting" % main_dir)
             sys.exit(exc)
 
-
 def deploy_stacks(input_list, main_dir):
     """
     This function first deploys/redeploys the traefik stack, then deploys/redeploys the organism stack, then redeploys the traefik stack
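# Reviewer sketch (not repository code): the new module-level make_traefik_compose_files()
# above boils down to rendering a Jinja2 template with the parsed config and writing the
# result into the traefik directory. Minimal standalone version of that rendering step; the
# template name comes from the diff, but the render variables shown here are assumptions.
import os
import sys
from jinja2 import Environment, FileSystemLoader

script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
env = Environment(loader=FileSystemLoader(script_dir + "/templates"))
template = env.get_template("traefik_compose_template.yml.j2")
rendered = template.render({"hostname": "example.org", "dashboard_port": "8001"})  # assumed keys
with open("./traefik/docker-compose.yml", "w") as compose_file:
    compose_file.write(rendered)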
(default=False)", + parser.add_argument("--traefik", + help="Initialize/wverwrite traefik directory all docker-compose and conf files in the traefik and authelia directories (default=False)", action="store_true") args = parser.parse_args() @@ -392,6 +393,11 @@ if __name__ == "__main__": sp_dict_list = utilities.parse_input(os.path.abspath(args.input)) + # Create traefik directory and compose files if specified + if args.traefik: + config = utilities.parse_config(args.config) + make_traefik_compose_files(config=config, main_dir=main_dir) + logging.info("Deploying stacks for organisms in input file %s" % args.input) for sp_dict in sp_dict_list: @@ -428,7 +434,7 @@ if __name__ == "__main__": logging.info("Successfully generated the directory tree for %s" % deploy_stack_for_current_organism.full_name) # Make compose files - deploy_stack_for_current_organism.make_compose_files(force=args.overwrite_all) + deploy_stack_for_current_organism.make_compose_files() logging.info("Successfully generated the docker-compose files for %s" % deploy_stack_for_current_organism.full_name) logging.info("Deploying stacks") diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py index 27ec5973e5f9fdaea84e4e6f2a21699bab55efc9..ead3c936153421ee3b5ad000712a256df72a605d 100644 --- a/run_workflow_phaeoexplorer.py +++ b/run_workflow_phaeoexplorer.py @@ -183,7 +183,8 @@ class RunWorkflow(speciesData.SpeciesData): "sourcename": "Genoscope", "date_executed": self.date}) - # Add Interpro analysis to chado + # Add Interpro analysis to chado + logging.info("Adding Interproscan analysis to the instance's chado database") self.instance.tools.run_tool( tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3", history_id=self.history_id, @@ -194,7 +195,8 @@ class RunWorkflow(speciesData.SpeciesData): "date_executed": self.date}) - # Add Blastp (diamond) analysis to chado + # Add Blastp (diamond) analysis to chado + logging.info("Adding Blastp Diamond analysis to the instance's chado database") self.instance.tools.run_tool( tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3", history_id=self.history_id, @@ -255,7 +257,6 @@ class RunWorkflow(speciesData.SpeciesData): history_id=self.history_id, params=workflow_parameters, inputs=datamap, - inputs_by="", allow_tool_state_corrections=True) logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, self.instance_url)) @@ -310,7 +311,7 @@ class RunWorkflow(speciesData.SpeciesData): folders_ids[current_folder_name] = v # Iterating over the folders to find datasets and map datasets to their IDs - logging.info("Datasets IDs: ") + logging.debug("Datasets IDs: ") for k, v in folders_ids.items(): if k == "/genome": sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) @@ -320,29 +321,33 @@ class RunWorkflow(speciesData.SpeciesData): if type(e) == dict: if e["name"].endswith(".fa"): self.datasets["genome_file"] = e["ldda_id"] - logging.debug("\t" + e["name"] + ": " + e["ldda_id"]) + logging.debug("Genome file:\t" + e["name"] + ": " + e["ldda_id"]) if k == "/annotation": sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) final_sub_folder_content = self.instance.folders.show_folder(folder_id=sub_folder_content["folder_contents"][0]["id"], contents=True) for k2, v2 in final_sub_folder_content.items(): for e in v2: + # try: + # print(e["name"]) + # 
@@ -310,7 +311,7 @@ class RunWorkflow(speciesData.SpeciesData):
             folders_ids[current_folder_name] = v
 
         # Iterating over the folders to find datasets and map datasets to their IDs
-        logging.info("Datasets IDs: ")
+        logging.debug("Datasets IDs: ")
         for k, v in folders_ids.items():
             if k == "/genome":
                 sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
@@ -320,29 +321,33 @@ class RunWorkflow(speciesData.SpeciesData):
                     if type(e) == dict:
                         if e["name"].endswith(".fa"):
                             self.datasets["genome_file"] = e["ldda_id"]
-                            logging.debug("\t" + e["name"] + ": " + e["ldda_id"])
+                            logging.debug("Genome file:\t" + e["name"] + ": " + e["ldda_id"])
 
             if k == "/annotation":
                 sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
                 final_sub_folder_content = self.instance.folders.show_folder(folder_id=sub_folder_content["folder_contents"][0]["id"], contents=True)
                 for k2, v2 in final_sub_folder_content.items():
                     for e in v2:
+                        # try:
+                        #     print(e["name"])
+                        # except TypeError:
+                        #     print("TypeError")
                         if type(e) == dict:
                             # TODO: manage genome and ogs versions (differentiate between the correct folders using self.config)
                             if "transcripts" in e["name"]:
                                 self.datasets["transcripts_file"] = e["ldda_id"]
-                                logging.debug("\t" + e["name"] + ": " + e["ldda_id"])
-                            elif "proteins" in e["name"]:
+                                logging.debug("Transcripts file:\t" + e["name"] + ": " + e["ldda_id"])
+                            elif "proteins.fa" in e["name"]:
                                 self.datasets["proteins_file"] = e["ldda_id"]
-                                logging.debug("\t" + e["name"] + ": " + e["ldda_id"])
+                                logging.debug("Proteins file:\t" + e["name"] + ": " + e["ldda_id"])
                             elif "gff" in e["name"]:
                                 self.datasets["gff_file"] = e["ldda_id"]
-                                logging.debug("\t" + e["name"] + ": " + e["ldda_id"])
-                            elif e["name"].endswith(".xml") and e["name"].startswith("Interpro"):
+                                logging.debug("GFF file:\t" + e["name"] + ": " + e["ldda_id"])
+                            elif "Interpro" in e["name"]:
                                 self.datasets["interproscan_file"] = e["ldda_id"]
-                                logging.debug("\t" + e["name"] + ": " + e["ldda_id"])
-                            elif e["name"].endswith(".xml") and "diamond" in e["name"]:
+                                logging.debug("Interproscan file:\t" + e["name"] + ": " + e["ldda_id"])
+                            elif "diamond-blastp" in e["name"]:
                                 self.datasets["blast_diamond_file"] = e["ldda_id"]
-                                logging.debug("\t" + e["name"] + ": " + e["ldda_id"])
+                                logging.debug("Blastp diamond file:\t" + e["name"] + ": " + e["ldda_id"])
 
         logging.info("Uploading datasets into history %s" % self.history_id)
         self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"])
@@ -397,10 +402,10 @@ class RunWorkflow(speciesData.SpeciesData):
             elif dataset_dict["name"].endswith(".gff"):
                 gff_dataset_hda_id = dataset_dict["id"]
                 logging.debug("gff dataset hda ID: %s" % gff_dataset_hda_id)
-            elif dataset_dict["name"].endwsith(".xml") and dataset["name"].startswith("Interpro"):
+            elif "Interpro" in dataset_dict["name"]:
                 interproscan_dataset_hda_id = dataset_dict["id"]
                 logging.debug("InterproScan dataset hda ID: %s" % gff_dataset_hda_id)
-            elif dataset_dict["name"].endswith(".xml") and "diamond" in dataset_dict["name"]:
+            elif "diamond-blastp" in dataset_dict["name"]:
                 blast_diamond_dataset_hda_id = dataset_dict["id"]
                 logging.debug("Blast Diamond dataset hda ID: %s" % gff_dataset_hda_id)
             else:
@@ -614,6 +619,7 @@ if __name__ == "__main__":
             # Import datasets into history and retrieve their hda IDs
             run_workflow_for_current_organism.import_datasets_into_history()
             hda_ids = run_workflow_for_current_organism.get_datasets_hda_ids()
+            # run_workflow_for_current_organism.get_invocation_report(workflow_name="Chado load Tripal synchronize")
 
             # Explicit workflow parameter names
             GENOME_FASTA_FILE = "0"
@@ -634,6 +640,15 @@ if __name__ == "__main__":
             workflow_parameters[GFF_FILE] = {}
             workflow_parameters[PROTEINS_FASTA_FILE] = {}
             workflow_parameters[TRANSCRIPTS_FASTA_FILE] = {}
+
+            print(run_workflow_for_current_organism.org_id)
+            print(run_workflow_for_current_organism.genome_analysis_id)
+            print(run_workflow_for_current_organism.ogs_analysis_id)
+            print(hda_ids["genome_hda_id"])
+            print(hda_ids["gff_hda_id"])
+            print(hda_ids["proteins_hda_id"])
+            print(hda_ids["transcripts_hda_id"])
+
             workflow_parameters[LOAD_FASTA_IN_CHADO] = {"organism": run_workflow_for_current_organism.org_id,
                                                         "analysis_id": run_workflow_for_current_organism.genome_analysis_id,
                                                         "do_update": "true"}
@@ -648,17 +663,16 @@ if __name__ == "__main__":
 
             # Datamap for input datasets - dataset source (type): ldda (LibraryDatasetDatasetAssociation)
             run_workflow_for_current_organism.datamap = {}
-            run_workflow_for_current_organism.datamap[GENOME_FASTA_FILE] = {"src": "ldda", "id": run_workflow_for_current_organism.datasets["genome_file"]}
-            run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "ldda", "id": run_workflow_for_current_organism.datasets["gff_file"]}
-            run_workflow_for_current_organism.datamap[PROTEINS_FASTA_FILE] = {"src": "ldda", "id": run_workflow_for_current_organism.datasets["proteins_file"]}
-            run_workflow_for_current_organism.datamap[TRANSCRIPTS_FASTA_FILE] = {"src": "ldda", "id": run_workflow_for_current_organism.datasets["transcripts_file"]}
+            run_workflow_for_current_organism.datamap[GENOME_FASTA_FILE] = {"src": "hda", "id": hda_ids["genome_hda_id"]}
+            run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "hda", "id": hda_ids["gff_hda_id"]}
+            run_workflow_for_current_organism.datamap[PROTEINS_FASTA_FILE] = {"src": "hda", "id": hda_ids["proteins_hda_id"]}
+            run_workflow_for_current_organism.datamap[TRANSCRIPTS_FASTA_FILE] = {"src": "hda", "id": hda_ids["transcripts_hda_id"]}
 
             run_workflow_for_current_organism.datamap = {}
-            run_workflow_for_current_organism.datamap[GENOME_FASTA_FILE] = {"src": "ldda", "id":
+            run_workflow_for_current_organism.datamap[GENOME_FASTA_FILE] = {"src": "hda", "id":
                                                                                 run_workflow_for_current_organism.datasets["genome_file"]}
-            run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "ldda",
-                                                                   "id": run_workflow_for_current_organism.datasets[
-                                                                       "gff_file"]}
+            run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "hda",
+                                                                   "id": hda_ids["gff_hda_id"]}
 
             # Run the Chado load Tripal sync workflow with the parameters set above
             run_workflow_for_current_organism.run_workflow(workflow_path=workflow,
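# Reviewer sketch (not repository code): once the datasets are in the history, the datamap
# built above maps workflow step indices to hda ids and is handed to bioblend together with
# the tool parameters. Assuming run_workflow() wraps WorkflowClient.invoke_workflow(), the
# underlying call is roughly the following; step indices and ids are placeholders.
from bioblend import galaxy

instance = galaxy.GalaxyInstance(url="http://localhost:8080", key="<api-key>")
datamap = {"0": {"src": "hda", "id": "<genome_hda_id>"},
           "1": {"src": "hda", "id": "<gff_hda_id>"}}
workflow_parameters = {"4": {"organism": "<org_id>",
                             "analysis_id": "<genome_analysis_id>",
                             "do_update": "true"}}
instance.workflows.invoke_workflow(workflow_id="<workflow_id>",
                                   history_id="<history_id>",
                                   inputs=datamap,
                                   params=workflow_parameters,
                                   allow_tool_state_corrections=True)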
@@ -739,7 +753,7 @@ if __name__ == "__main__":
 
             run_workflow_for_current_organism.datamap = {}
-            run_workflow_for_current_organism.datamap[INTERPRO_FILE] = {"src": "hda", "id": hda_ids["interproscan_hda_id"]}
+            run_workflow_for_current_organism.datamap[INTERPRO_FILE] = {"src": "hda", "id": run_workflow_for_current_organism.hda_ids["interproscan_hda_id"]}
 
             # Run Interproscan workflow
             run_workflow_for_current_organism.run_workflow(workflow_path=workflow,
diff --git a/speciesData.py b/speciesData.py
index a099078fa620fa73232c101f14ebbd43da9fd298..287c8ccf92a839743cb5ec71b0a7a23ff3f65acc 100644
--- a/speciesData.py
+++ b/speciesData.py
@@ -18,11 +18,11 @@ class SpeciesData:
     def __init__(self, parameters_dictionary):
         # self.config_dictionary = None
         self.parameters_dictionary = parameters_dictionary
-        self.species = parameters_dictionary["description"]["species"]
-        self.genus = parameters_dictionary["description"]["genus"]
-        self.strain = parameters_dictionary["description"]["strain"]
-        self.sex = parameters_dictionary["description"]["sex"]
-        self.common = parameters_dictionary["description"]["common_name"]
+        self.species = parameters_dictionary["description"]["species"].replace("(", "_").replace(")", "_")
+        self.genus = parameters_dictionary["description"]["genus"].replace("(", "_").replace(")", "_")
+        self.strain = parameters_dictionary["description"]["strain"].replace("(", "_").replace(")", "_")
+        self.sex = parameters_dictionary["description"]["sex"].replace("(", "_").replace(")", "_")
+        self.common = parameters_dictionary["description"]["common_name"].replace("(", "_").replace(")", "_")
         self.date = datetime.today().strftime("%Y-%m-%d")
         self.origin = parameters_dictionary["description"]["origin"]
@@ -70,14 +70,16 @@ class SpeciesData:
         self.workflow_name = None
         self.metadata = dict()
         self.api_key = None
-        self.datasets = dict()  # API key used to communicate with the galaxy instance. Cannot be used to do user-tied actions
+        self.datasets = dict()
         self.config = None  # Custom config used to set environment variables inside containers, defaults to the one in the repo
-        if parameters_dictionary["data"]["parent_directory"] == "" or parameters_dictionary["data"]["parent_directory"] == "/path/to/closest/parent/dir":
-            self.source_data_dir = "/shared/projects/phaeoexplorer/"  # Testing path for phaeoexplorer data - TODO: REMOVE IN PRODUCTION
-        else:
-            self.source_data_dir = parameters_dictionary["data"]["parent_directory"]
+        self.source_data_dir = parameters_dictionary["data"]["parent_directory"]
         self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase.lower(), self.species.lower(), self.sex.lower()])["not_empty"])
         self.existing_folders_cache = {}
         self.bam_metadata_cache = {}
+
+        # # Sanitize str attributes
+        # for var in vars(self):
+        #     for attr in var if type(attr) == str:
+        #         attr = attr.replace("(", "_").replace(")", "_")
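# Reviewer sketch (not repository code): the commented-out "Sanitize str attributes" loop
# added above is not valid Python as written ("for attr in var if ..." is not a statement).
# A working version of the same idea, applied after __init__, could look like this:
def sanitize_attributes(obj):
    """Replace parentheses in every string attribute of obj, mirroring the per-field .replace() calls."""
    for name, value in list(vars(obj).items()):
        if isinstance(value, str):
            setattr(obj, name, value.replace("(", "_").replace(")", "_"))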
diff --git a/workflows/Chado_load_Tripal_synchronize.ga b/workflows/Chado_load_Tripal_synchronize.ga
index 6571c65e4bbb10d7f7d66f5c7076cf74ebf09c2f..a04e7afff8cc0d0b0616fefd611fe3a24c66e862 100644
--- a/workflows/Chado_load_Tripal_synchronize.ga
+++ b/workflows/Chado_load_Tripal_synchronize.ga
@@ -2,7 +2,7 @@
     "a_galaxy_workflow": "true",
     "annotation": "",
     "format-version": "0.1",
-    "name": "Chado load Tripal synchronize (imported from uploaded file)",
+    "name": "Chado load Tripal synchronize",
     "steps": {
         "0": {
             "annotation": "",