#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
gga_get_data.py

Create the 'src_data' directory tree of every organism described in the input
file and copy the organism datasets into it.

Usage: $ python3 gga_get_data.py input_example.yml [OPTIONS]
"""

import argparse
import logging
import os
import shutil
import sys

import constants
import speciesData
import utilities


class GetData(speciesData.SpeciesData):
    """
    Child of SpeciesData

    Contains methods and attributes to copy data into the src_data subfolders of an organism
    """

    def make_directory_tree(self):
        """
        Generate the src_data directory tree for an organism

        The tree is created inside ``self.species_dir``. The working directory
        is changed while the tree is built and is set back to ``self.main_dir``
        before returning.

        The process exits (via ``sys.exit``) when the species directory cannot
        be entered or when a directory cannot be created for lack of permission.

        :return:
        """
        os.chdir(self.main_dir)
        try:
            os.chdir(self.species_dir)
        except OSError as exc:
            logging.critical("Cannot access %s", self.genus_species)
            sys.exit(exc)

        # All the directories to create (or reuse) in src_data, parents first
        src_data_dirs_li = [
            "./src_data",
            "./src_data/annotation",
            "./src_data/genome",
            "./src_data/tracks",
            "./src_data/annotation/%s" % self.species_folder_name,
            "./src_data/genome/%s" % self.species_folder_name,
            "./src_data/annotation/{0}/OGS{1}/".format(self.species_folder_name, self.ogs_version),
            "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version),
        ]
        try:
            make_dirs(dir_paths_li=src_data_dirs_li)
        finally:
            # Return to main directory, even if directory creation failed
            os.chdir(self.main_dir)

        logging.info("src_data directory tree generated for %s", self.full_name)

    def get_source_data_files_from_path(self):
        """
        Find source data files and copy them into the src_data dir tree

        The genome dataset goes to ``src_data/genome/<species>/v<genome_version>``
        and every annotation dataset goes to
        ``src_data/annotation/<species>/OGS<ogs_version>``. Datasets whose path
        is empty are skipped. A copy that fails is logged as a warning and does
        not stop the remaining copies.

        The process exits with a non-zero status when the species directory
        cannot be entered.

        :return:
        """
        try:
            os.chdir(self.species_dir)
        except OSError as exc:
            logging.critical("Cannot access %s", self.species_dir)
            # Exit with a failure status (sys.exit(0) would report success)
            sys.exit(exc)

        try:
            organism_annotation_dir = os.path.abspath(
                "./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
            organism_genome_dir = os.path.abspath(
                "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version))

            # One row per dataset:
            # (input file key, source path, destination directory, attribute holding the destination filename)
            # The filename attribute is only read when the dataset is actually copied.
            datasets = [
                (constants.ORG_PARAM_DATA_GENOME_PATH, self.genome_path,
                 organism_genome_dir, "genome_filename"),
                (constants.ORG_PARAM_DATA_GFF_PATH, self.gff_path,
                 organism_annotation_dir, "gff_filename"),
                (constants.ORG_PARAM_DATA_TRANSCRIPTS_PATH, self.transcripts_path,
                 organism_annotation_dir, "transcripts_filename"),
                (constants.ORG_PARAM_DATA_PROTEINS_PATH, self.proteins_path,
                 organism_annotation_dir, "proteins_filename"),
                (constants.ORG_PARAM_DATA_INTERPRO_PATH, self.interpro_path,
                 organism_annotation_dir, "interpro_filename"),
                (constants.ORG_PARAM_DATA_ORTHOFINDER_PATH, self.orthofinder_path,
                 organism_annotation_dir, "orthofinder_filename"),
                (constants.ORG_PARAM_DATA_BLASTP_PATH, self.blastp_path,
                 organism_annotation_dir, "blastp_filename"),
                (constants.ORG_PARAM_DATA_BLASTX_PATH, self.blastx_path,
                 organism_annotation_dir, "blastx_filename"),
            ]

            # Copy datasets in the organism src_data dir tree correct folder
            for key, src_path, dest_dir, fname_attr in datasets:
                if not src_path:
                    # If dataset is not present in input file, skip copy
                    continue
                _copy_dataset(key, src_path, dest_dir, getattr(self, fname_attr))
        finally:
            os.chdir(self.main_dir)


def _copy_dataset(key, src_path, dest_dir, dest_fname):
    """
    Copy a single dataset file into a destination directory

    The copy is best-effort: any failure is logged as a warning instead of
    being raised, so that one bad dataset does not prevent copying the others.

    NOTE(review): a relative src_path is resolved against the current working
    directory, which is the species directory when called from
    GetData.get_source_data_files_from_path - confirm this is intended.

    :param key: name of the dataset parameter, used in log messages only
    :param src_path: path of the file to copy
    :param dest_dir: directory receiving the copy
    :param dest_fname: name given to the copied file
    :return: True if the file was copied, False otherwise
    """
    logging.info("Copying {0} ({1}) into {2}".format(key, src_path, dest_dir))
    try:
        shutil.copyfile(os.path.abspath(src_path), os.path.join(dest_dir, dest_fname))
    except Exception as exc:  # deliberately broad: the copy must never abort the run
        logging.warning("Could not copy {0} ({1}) - Exit Code: {2}".format(key, src_path, exc))
        return False
    return True


def make_dirs(dir_paths_li):
    """
    Create directories (and any missing parent directory) from a list of paths

    A path that already exists is left untouched and reported at debug level.
    The process exits (via ``sys.exit``) when a directory cannot be created
    for lack of permission.

    :param dir_paths_li: paths of the directories to create
    :return: list of the processed paths, whether newly created or already existing
    """
    created_dir_paths_li = []

    for dir_path in dir_paths_li:
        try:
            os.makedirs(dir_path)
        except FileExistsError:
            logging.debug("%s directory already exists", dir_path)
        except PermissionError as exc:
            logging.critical("Insufficient permission to create %s", dir_path)
            sys.exit(exc)
        created_dir_paths_li.append(dir_path)

    return created_dir_paths_li


def main():
    """
    Parse the command line, then build the src_data tree and copy the datasets
    of every organism found in the input file

    :return:
    """
    parser = argparse.ArgumentParser(description="Create 'src_data' tree and add data files")
    parser.add_argument("input",
                        type=str,
                        help="Input file (yml)")
    parser.add_argument("-v", "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.add_argument("--main-directory",
                        type=str,
                        help="Where the stack containers will be located, defaults to working directory")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    if args.main_directory:
        main_directory = os.path.abspath(args.main_directory)
    else:
        main_directory = os.getcwd()

    sp_dict_list = utilities.parse_input(args.input)

    for sp_dict in sp_dict_list:
        # Creating an instance of get_data_for_current_species object
        get_data_for_current_species = GetData(parameters_dictionary=sp_dict)

        # Starting
        logging.info("gga_get_data.py called for %s", get_data_for_current_species.full_name)

        # Setting some of the instance attributes
        get_data_for_current_species.main_dir = main_directory
        get_data_for_current_species.species_dir = os.path.join(
            get_data_for_current_species.main_dir,
            get_data_for_current_species.genus_species + "/")

        # Create src_data directory tree
        get_data_for_current_species.make_directory_tree()

        # Retrieve datasets
        logging.info("Finding and copying datasets for %s", get_data_for_current_species.full_name)
        get_data_for_current_species.get_source_data_files_from_path()
        logging.info("Sucessfully copied datasets for %s", get_data_for_current_species.full_name)


if __name__ == "__main__":
    main()