diff --git a/README.md b/README.md index 2d077bd2f1eac0e814e30ee1a337fab541f1a690..fd0b6243472147a79e44c59142e3c9b803c72f0d 100755 --- a/README.md +++ b/README.md @@ -149,8 +149,9 @@ Directory tree structure: [BSD 3-Clause](./LICENSE) -## Acknowledgments +## Contributors -[Anthony Bretaudeau](https://github.com/abretaud) - -[Matéo Boudet](https://github.com/mboudet) \ No newline at end of file +- [Matéo Boudet](https://github.com/mboudet) +- [Anthony Bretaudeau](https://github.com/abretaud) +- [Loraine Brillet-Guéguen](https://github.com/loraine-gueguen) +- [Arthur Le Bars](https://gitlab.com/Troubardours) \ No newline at end of file diff --git a/gga_get_data.py b/gga_get_data.py index 872d5c3a21d41eb6af1c9a547a262f806699525d..1c14ed3a3e674fc773dbc81ef6bc4bd583a4208e 100755 --- a/gga_get_data.py +++ b/gga_get_data.py @@ -1,22 +1,13 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import bioblend import argparse import os -import subprocess import logging import sys -import fnmatch import time -import json -import re -import stat import shutil -from bioblend.galaxy.objects import GalaxyInstance -from bioblend import galaxy - import utilities import speciesData @@ -36,24 +27,6 @@ class GetData(speciesData.SpeciesData): """ - def goto_species_dir(self): - """ - Go to the species directory (starting from the main dir) - - :return: - """ - - os.chdir(self.main_dir) - species_dir = os.path.join(self.main_dir, self.genus_species) + "/" - try: - os.chdir(species_dir) - except OSError: - logging.critical("Cannot access %s" % species_dir) - sys.exit(0) - return 1 - - - def make_directory_tree(self): """ Generate the directory tree for an organism diff --git a/gga_init.py b/gga_init.py index 673df13b3e83a6540410ddcfebcf303031e6d36c..8ec255f81fe9a4a69e7b48808a20d534ecdfafa2 100755 --- a/gga_init.py +++ b/gga_init.py @@ -11,7 +11,7 @@ import yaml import shutil from pathlib import Path -from jinja2 import Template, Environment, FileSystemLoader +from jinja2 import Environment, FileSystemLoader import utilities import speciesData diff --git a/gga_load_data.py b/gga_load_data.py index 32c5eaa1c9ac944e99f35bfeccc3a56861a04820..0dc2c796f861b87fdd26defcb49dae4887cb3a17 100755 --- a/gga_load_data.py +++ b/gga_load_data.py @@ -1,21 +1,19 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +import re import bioblend import argparse import os -import subprocess import logging import sys -import fnmatch import time import json -import re -import stat -import shutil +import yaml +import subprocess -from bioblend.galaxy.objects import GalaxyInstance from bioblend import galaxy +from bioblend.galaxy.objects import GalaxyInstance import utilities import speciesData @@ -23,11 +21,17 @@ import speciesData """ gga_load_data.py -Usage: $ python3 gga_init.py -i input_example.yml --config config.yml [OPTIONS] +Usage: $ python3 gga_load_data.py -i input_example.yml --config config.yml [OPTIONS] Do not call this script before the galaxy container is ready """ +# If this version if not found, Galaxy will use the one that is found +GET_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0" +DELETE_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.4+galaxy0" + +HOST_DATA_DIR='src_data' +CONTAINER_DATA_DIR_ROOT='/project_data' class LoadData(speciesData.SpeciesData): """ @@ -39,23 +43,7 @@ class LoadData(speciesData.SpeciesData): """ - def goto_species_dir(self): - """ - Go to the species directory (starting from the main dir) - - :return: - """ - - os.chdir(self.main_dir) - species_dir = os.path.join(self.main_dir, self.genus_species) + "/" - try: - os.chdir(species_dir) - except OSError: - logging.critical("Cannot access %s" % species_dir) - sys.exit(0) - return 1 - - def set_get_history(self): + def get_history(self): """ Create or set the working history to the current species one @@ -63,13 +51,15 @@ class LoadData(speciesData.SpeciesData): """ try: histories = self.instance.histories.get_histories(name=str(self.genus_species)) - self.history_id = histories[0]["id"] - logging.debug("History ID set for {0} {1}: {2}".format(self.genus, self.species, self.history_id)) + if len(histories) == 1: + self.history_id = histories[0]["id"] + logging.debug("History ID set for {0} {1}: {2}".format(self.genus, self.species, self.history_id)) + else: + logging.critical("Multiple histories exists for {1}: {2}".format(self.genus, self.species)) except IndexError: logging.info("Creating history for {0} {1}".format(self.genus, self.species)) - self.instance.histories.create_history(name=str(self.genus_species)) - histories = self.instance.histories.get_histories(name=str(self.genus_species)) - self.history_id = histories[0]["id"] + hist_dict = self.instance.histories.create_history(name=str(self.genus_species)) + self.history_id = hist_dict["id"] logging.debug("History ID set for {0} {1}: {2}".format(self.genus, self.species, self.history_id)) return self.history_id @@ -81,32 +71,23 @@ class LoadData(speciesData.SpeciesData): """ - get_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0") - delete_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.4+galaxy0") - - get_organism_tool_version = get_organism_tool["version"] - delete_organism_tool_version = delete_organism_tool["version"] - - logging.debug("Getting 'Homo sapiens' ID in instance's chado database") - get_sapiens_id_job = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/%s" % get_organism_tool_version, + logging.debug("Getting 'Homo sapiens' ID in chado database") + get_sapiens_id_job_output_dataset_id = utilities.run_tool_and_get_single_output_dataset_id(self.instance, + tool_id=GET_ORGANISMS_TOOL, history_id=self.history_id, tool_inputs={"genus": "Homo", "species": "sapiens"}) - get_sapiens_id_job_output = get_sapiens_id_job["outputs"][0]["id"] - get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output) + get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output_dataset_id) + + logging.debug("Deleting Homo 'sapiens' in the instance's chado database") try: - logging.debug("Deleting Homo 'sapiens' in the instance's chado database") get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0] - sapiens_id = str( - get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/%s" % delete_organism_tool_version, + sapiens_id = str(get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool + utilities.run_tool( + tool_id=DELETE_ORGANISMS_TOOL, history_id=self.history_id, - tool_inputs={"organism": str(sapiens_id)}) - except bioblend.ConnectionError: - logging.debug("Homo sapiens isn't in the instance's chado database (bioblend.ConnectionError)") + tool_inputs={"organism": sapiens_id}) except IndexError: - logging.debug("Homo sapiens isn't in the instance's chado database (IndexError)") + logging.error("Homo sapiens isn't in the instance's chado database (IndexError)") pass def purge_histories(self): @@ -119,7 +100,6 @@ class LoadData(speciesData.SpeciesData): """ histories = self.instance.histories.get_histories() - self.instance.histories.get_histories(deleted=False) for h in histories: self.instance.histories.delete_history(history_id=h["id"]) @@ -133,38 +113,40 @@ class LoadData(speciesData.SpeciesData): :return: """ - self.goto_species_dir() + data_dir_root=os.path.join(self.get_species_dir(), HOST_DATA_DIR) - # Delete pre-existing lib (probably created by a previous call) - gio = GalaxyInstance(url=self.instance_url, - email=self.config["galaxy_default_admin_email"], - password=self.config["galaxy_default_admin_password"]) + instance = GalaxyInstance(url=self.instance_url, + email=self.config["galaxy_default_admin_email"], + password=self.config["galaxy_default_admin_password"] + ) + logging.info("Looking for project data in %s" % data_dir_root) folders = dict() post_renaming = {} - for root, dirs, files in os.walk("./src_data", followlinks=True): + for root, dirs, files in os.walk(data_dir_root, followlinks=True): file_list = [os.path.join(root, filename) for filename in files] folders[root] = file_list if folders: # Delete pre-existing lib (probably created by a previous call) - existing = gio.libraries.get_previews(name='Project Data') + existing = instance.libraries.get_previews(name='Project Data') for lib in existing: if not lib.deleted: logging.info('Pre-existing "Project Data" library %s found, removing it' % lib.id) - gio.libraries.delete(lib.id) + instance.libraries.delete(lib.id) logging.info("Creating new 'Project Data' library") - prj_lib = gio.libraries.create('Project Data', 'Data for current genome annotation project') + prj_lib = instance.libraries.create('Project Data', 'Data for current genome annotation project') self.library_id = prj_lib.id # project data folder/library logging.info("Library for {0}: {1}".format(self.full_name, self.library_id)) for fname, files in folders.items(): if fname and files: - folder_name = fname[len("./src_data") + 1:] + folder_name = re.sub(data_dir_root + "/", "", fname) logging.info("Creating folder: %s" % folder_name) folder = self.create_deep_folder(prj_lib, folder_name) + for single_file in files: ftype = 'auto' @@ -203,11 +185,16 @@ class LoadData(speciesData.SpeciesData): logging.info("Skipping useless file '%s'" % single_file) continue - logging.info("Adding file '%s' with type '%s' and name '%s'" % (single_file, ftype, clean_name)) - datasets = prj_lib.upload_from_local( - path=single_file, + single_file_relative_path = re.sub(data_dir_root, CONTAINER_DATA_DIR_ROOT, single_file) + single_file_path_in_container=os.path.join(CONTAINER_DATA_DIR_ROOT, single_file_relative_path) + + logging.info("Adding file '%s' with type '%s' and name '%s'" % (single_file_path_in_container, ftype, clean_name)) + datasets = prj_lib.upload_from_galaxy_fs( + single_file_path_in_container, folder=folder, - file_type=ftype + link_data_only='link_to_files', + file_type=ftype, + tag_using_filenames=False ) # Rename dataset @@ -219,10 +206,10 @@ class LoadData(speciesData.SpeciesData): time.sleep(1) - # Wait for uploads to complete - logging.info("Waiting for import jobs to finish... please wait") - - # Checking job state (only necessary if ran using SLURM) + # # Wait for uploads to complete + # logging.info("Waiting for import jobs to finish... please wait") + # + # # Checking job state (only necessary if ran using SLURM) # while True: # try: # # "C" state means the job is completed, no need to wait for it @@ -236,8 +223,8 @@ class LoadData(speciesData.SpeciesData): # break # else: # raise - - time.sleep(10) + # + # time.sleep(10) # Batch renaming --> Throws a critical error at the moment # logging.info("Import finished, now renaming datasets with pretty names") @@ -272,7 +259,29 @@ class LoadData(speciesData.SpeciesData): return new_folder - def connect_to_instance(self): + def get_bam_label(self, dirname, bam_file): + + bam_id = bam_file + if bam_id.endswith('.bam'): + bam_id = bam_id[:-4] + + if dirname in self.bam_metadata_cache: + if bam_id in self.bam_metadata_cache[dirname] and 'label' in self.bam_metadata_cache[dirname][bam_id] and self.bam_metadata_cache[dirname][bam_id]['label']: + return self.bam_metadata_cache[dirname][bam_id]['label'] + else: + return None + else: + meta_file = os.path.join(dirname, 'metadata.yml') + if os.path.exists(meta_file): + with open(meta_file) as f: + self.bam_metadata_cache[dirname] = yaml.safe_load(f) + logging.info("Found metadata in %s " % meta_file) + else: + self.bam_metadata_cache[dirname] = {} + logging.info("Did not find metadata in %s " % meta_file) + return self.get_bam_label(dirname, bam_file) + + def create_galaxy_instance(self): """ Test the connection to the galaxy instance for the current organism Exit if we cannot connect to the instance @@ -284,7 +293,6 @@ class LoadData(speciesData.SpeciesData): email=self.config["galaxy_default_admin_email"], password=self.config["galaxy_default_admin_password"] ) - self.instance.histories.get_histories() try: self.instance.histories.get_histories() @@ -294,19 +302,7 @@ class LoadData(speciesData.SpeciesData): else: logging.info("Successfully connected to galaxy instance (%s) " % self.instance_url) - - - -def get_species_to_load(sp_dict_list): - """ - """ - - - - utilities.get_unique_species_list(sp_dict_list) - - - return 1 + return self.instance if __name__ == "__main__": @@ -338,64 +334,56 @@ if __name__ == "__main__": else: logging.basicConfig(level=logging.INFO) - # Parsing the config file if provided, using the default config otherwise - if not args.config: - args.config = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), "config") - else: - args.config = os.path.abspath(args.config) + config_file = os.path.abspath(args.config) + config = utilities.parse_config(config_file) + main_dir = None if not args.main_directory: - args.main_directory = os.getcwd() + main_dir = os.getcwd() else: - args.main_directory = os.path.abspath(args.main_directory) + main_dir = os.path.abspath(args.main_directory) sp_dict_list = utilities.parse_input(args.input) unique_sp_dict_list = utilities.get_unique_species_dict_list(sp_dict_list=sp_dict_list) - for sp_dict in unique_sp_dict_list: # Creating an instance of load_data_for_current_species object load_data_for_current_species = LoadData(parameters_dictionary=sp_dict) # Starting - logging.info("gga_load_data.py called for %s" % load_data_for_current_species.full_name) + logging.info("gga_load_data.py called for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species)) # Setting some of the instance attributes - load_data_for_current_species.main_dir = args.main_directory + load_data_for_current_species.main_dir = main_dir load_data_for_current_species.species_dir = os.path.join(load_data_for_current_species.main_dir, load_data_for_current_species.genus_species + "/") # Parse the config yaml file - load_data_for_current_species.config = utilities.parse_config(args.config) + load_data_for_current_species.config = config # Set the instance url attribute -- Does not work with localhost on scratch (ALB) load_data_for_current_species.instance_url = "http://localhost:{0}/sp/{1}_{2}/galaxy/".format( load_data_for_current_species.config["http_port"], load_data_for_current_species.genus_lowercase, load_data_for_current_species.species) - - # Check the galaxy container state and proceed if the galaxy services are up and running if utilities.check_galaxy_state(genus_lowercase=load_data_for_current_species.genus_lowercase, species=load_data_for_current_species.species, script_dir=load_data_for_current_species.script_dir): - # Load config file - load_data_for_current_species.config = utilities.parse_config(args.config) - - # Testing connection to the instance - load_data_for_current_species.connect_to_instance() + # Create the Galaxy instance + load_data_for_current_species.instance = load_data_for_current_species.create_galaxy_instance() # Load the datasets into a galaxy library logging.info("Setting up library for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species)) load_data_for_current_species.setup_library() - logging.info("Successfully set up library in galaxy for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species)) + logging.debug("Successfully set up library in galaxy for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species)) # Set or get the history for the current organism - load_data_for_current_species.set_get_history() + load_data_for_current_species.get_history() # Remove H. sapiens from database if here # TODO: set a dedicated history for removing H. sapiens (instead of doing it into a species history) @@ -403,7 +391,6 @@ if __name__ == "__main__": # logging.info("Importing datasets into history for %s" % load_data_for_current_species.full_name) # load_data_for_current_species.import_datasets_into_history() # Option "--load-history" - # load_data_for_current_species.purge_histories() # Testing purposes logging.info("Data successfully loaded and imported for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species)) diff --git a/speciesData.py b/speciesData.py index f2b97026d4d929a39e21c79b31058f498f253622..bb13d11c2c2979e6193b8a0f3408962f22aab910 100755 --- a/speciesData.py +++ b/speciesData.py @@ -4,6 +4,7 @@ import os import sys import utilities +import logging from _datetime import datetime @@ -15,6 +16,30 @@ class SpeciesData: """ + def get_species_dir(self): + + species_dir = None + if os.path.isdir(self.main_dir) and not self.genus_species is None: + species_dir = os.path.join(self.main_dir, self.genus_species) + else: + logging.error("Cannot set species dir with '{0}/{1}'".format(self.main_dir,self.genus_species)) + return species_dir + + def goto_species_dir(self): + """ + Go to the species directory (starting from the main dir) + + :return: + """ + + species_dir = self.get_species_dir() + try: + os.chdir(species_dir) + except OSError: + logging.critical("Cannot access %s" % species_dir) + sys.exit(0) + return 1 + def clean_string(self, string): if not string is None and string != "": clean_string = string.replace(" ", "_").replace("-", "_").replace("(", "").replace(")", "").replace("'", "").strip() diff --git a/utilities.py b/utilities.py index 93c3e72d49d6709548402e87e3b14c2cd3945b9d..e633843af5a59c26f1109a9573420ae8f8ee06c6 100755 --- a/utilities.py +++ b/utilities.py @@ -6,6 +6,7 @@ import logging import sys import os import subprocess +import bioblend def parse_config(config_file): @@ -172,3 +173,24 @@ def get_unique_species_dict_list(sp_dict_list): unique_species_list_of_dict.append(v) return unique_species_list_of_dict + +def run_tool(instance, tool_id, history_id, tool_inputs): + + output_dict = None + try: + logging.debug("Running tool {0} with tool inputs: {1}".format(tool_id, tool_inputs)) + output_dict = instance.tools.run_tool( + tool_id=tool_id, + history_id=history_id, + tool_inputs=tool_inputs) + except bioblend.ConnectionError: + logging.error("Unexpected HTTP response (bioblend.ConnectionError) when running tool {0} with tool inputs: {1}".format(tool_id, tool_inputs)) + + return output_dict + +def run_tool_and_get_single_output_dataset_id(instance, tool_id, history_id, tool_inputs): + + output_dict = run_tool(instance, tool_id, history_id, tool_inputs) + single_output_dataset_id = output_dict["outputs"][0]["id"] + + return single_output_dataset_id \ No newline at end of file