Skip to content
Snippets Groups Projects
gga_get_data.py 7.73 KiB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import os
import logging
import sys
import shutil

import utilities
import speciesData
import constants

""" 
gga_get_data.py

Usage: $ python3 gga_get_data.py input_example.yml [OPTIONS]

"""


class GetData(speciesData.SpeciesData):
    """
    Child of SpeciesData

    Contains methods and attributes to copy data into the src_data subfolders of an organism

    Both public methods change the process working directory to ``self.species_dir`` while
    they run and switch back to ``self.main_dir`` before returning normally.
    """

    def make_directory_tree(self):
        """
        Generate the directory tree for an organism

        Creates (or completes) the following tree inside ``self.species_dir``::

            src_data/
                annotation/<species_folder_name>/OGS<ogs_version>/
                genome/<species_folder_name>/v<genome_version>/
                tracks/

        Directories that already exist are left untouched. The program exits through
        ``sys.exit`` when the species directory cannot be entered or when a directory cannot
        be created for lack of permission.

        :return: None
        """

        os.chdir(self.main_dir)

        try:
            os.chdir(self.species_dir)
        except OSError as exc:
            logging.critical("Cannot access %s" % self.genus_species)
            sys.exit(exc)

        # Creation (or updating) of the src_data directory tree
        try:
            os.mkdir("./src_data")
        except FileExistsError:
            logging.debug("'src_data' directory already exist for %s" % self.full_name)
        except PermissionError as exc:
            logging.critical("Insufficient permission to create src_data directory tree")
            sys.exit(exc)

        # List of all the directories to create in src_data.
        # Parents are listed before their children because make_dirs creates them one by one, in order.
        src_data_dirs_li = ["./src_data", "./src_data/annotation", "./src_data/genome", "./src_data/tracks",
                            "./src_data/annotation/%s" % self.species_folder_name,
                            "./src_data/genome/%s" % self.species_folder_name,
                            "./src_data/annotation/{0}/OGS{1}/".format(self.species_folder_name, self.ogs_version),
                            "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version)]
        make_dirs(dir_paths_li=src_data_dirs_li)

        # Return to main directory
        os.chdir(self.main_dir)

        logging.info("src_data directory tree generated for %s" % self.full_name)

    def get_source_data_files_from_path(self):
        """
        Find source data files and copy them into the src_data dir tree

        The genome file is copied into ``src_data/genome/<species_folder_name>/v<genome_version>``
        and every annotation dataset into
        ``src_data/annotation/<species_folder_name>/OGS<ogs_version>``, each one under the
        matching ``self.<dataset>_filename`` name. Datasets whose source path is empty are
        skipped. A failed copy is logged as a warning and does not stop the remaining copies.

        Source paths are made absolute after changing into ``self.species_dir``, so relative
        paths are resolved against that directory.

        The program exits with a failure status when ``self.species_dir`` cannot be entered.

        :return: None
        """

        try:
            os.chdir(self.species_dir)
        except OSError as exc:
            logging.critical("Cannot access " + self.species_dir)
            # Exit with a failure status: an exit code of 0 would report success to the caller
            sys.exit(exc)

        organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
        organism_genome_dir = os.path.abspath("./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version))

        # One row per dataset: (input file key, source path, destination directory,
        # name of the instance attribute holding the destination file name).
        # The file name attribute is only read for datasets that are actually copied.
        datasets = (
            (constants.ORG_PARAM_DATA_GENOME_PATH, self.genome_path, organism_genome_dir, "genome_filename"),
            (constants.ORG_PARAM_DATA_GFF_PATH, self.gff_path, organism_annotation_dir, "gff_filename"),
            (constants.ORG_PARAM_DATA_TRANSCRIPTS_PATH, self.transcripts_path, organism_annotation_dir, "transcripts_filename"),
            (constants.ORG_PARAM_DATA_PROTEINS_PATH, self.proteins_path, organism_annotation_dir, "proteins_filename"),
            (constants.ORG_PARAM_DATA_INTERPRO_PATH, self.interpro_path, organism_annotation_dir, "interpro_filename"),
            (constants.ORG_PARAM_DATA_ORTHOFINDER_PATH, self.orthofinder_path, organism_annotation_dir, "orthofinder_filename"),
            (constants.ORG_PARAM_DATA_BLASTP_PATH, self.blastp_path, organism_annotation_dir, "blastp_filename"),
            (constants.ORG_PARAM_DATA_BLASTX_PATH, self.blastx_path, organism_annotation_dir, "blastx_filename"),
        )

        # Copy datasets in the organism src_data dir tree correct folder
        for dataset_key, source_path, target_dir, filename_attr in datasets:
            if not source_path:  # If dataset is not present in input file, skip copy
                continue
            target_fname = getattr(self, filename_attr)
            logging.info("Copying {0} ({1}) into {2}".format(dataset_key, source_path, target_dir))
            try:
                shutil.copyfile(os.path.abspath(source_path), os.path.join(target_dir, target_fname))
            except Exception as exc:
                # Best effort: report the failed dataset and carry on with the next one
                logging.warning("Could not copy {0} ({1}) - Exit Code: {2}".format(dataset_key, source_path, exc))

        os.chdir(self.main_dir)


def make_dirs(dir_paths_li):
    """
    Create every directory of a list of paths, one after the other

    Each path is created with a single ``os.mkdir`` call, so a parent directory has to exist
    already or appear earlier in the list than its children.

    A directory that already exists is only reported at debug level. A permission error is
    logged as critical and terminates the program through ``sys.exit``.

    :param dir_paths_li: iterable of directory paths to create, in creation order
    :return: list of all the paths that were processed, whether they were newly created or
             already existed
    """

    def _mkdir_and_report(path):
        # Create a single directory and hand the path back so it ends up in the result list
        try:
            os.mkdir(path)
        except FileExistsError:
            logging.debug("%s directory already exists" % path)
        except PermissionError as err:
            logging.critical("Insufficient permission to create %s" % path)
            sys.exit(err)
        return path

    return [_mkdir_and_report(path) for path in dir_paths_li]
    
if __name__ == "__main__":
    # Script entry point: build the src_data tree and copy the datasets of every
    # species described in the input file.
    parser = argparse.ArgumentParser(description="Create 'src_data' tree and add data files")

    parser.add_argument("input",
                        type=str,
                        help="Input file (yml)")

    parser.add_argument("-v", "--verbose",
                        help="Increase output verbosity",
                        action="store_true")

    parser.add_argument("--main-directory",
                        type=str,
                        help="Where the stack containers will be located, defaults to working directory")

    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    # Always work with an absolute main directory: GetData methods change the working
    # directory, so a relative path would stop being valid after the first chdir.
    if args.main_directory:
        args.main_directory = os.path.abspath(args.main_directory)
    else:
        args.main_directory = os.getcwd()

    sp_dict_list = utilities.parse_input(args.input)

    for sp_dict in sp_dict_list:

        # Creating an instance of get_data_for_current_species object
        get_data_for_current_species = GetData(parameters_dictionary=sp_dict)

        # Starting
        logging.info("gga_get_data.py called for %s" % get_data_for_current_species.full_name)

        # Setting some of the instance attributes
        get_data_for_current_species.main_dir = args.main_directory
        get_data_for_current_species.species_dir = os.path.join(get_data_for_current_species.main_dir,
                                                                 get_data_for_current_species.genus_species +
                                                                 "/")
        # create src_data directory tree
        get_data_for_current_species.make_directory_tree()
        # Retrieve datasets
        logging.info("Finding and copying datasets for %s" % get_data_for_current_species.full_name)
        get_data_for_current_species.get_source_data_files_from_path()
        logging.info("Sucessfully copied datasets for %s" % get_data_for_current_species.full_name)