-
Loraine Gueguen authored5bbc05a7
gga_get_data.py 7.73 KiB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import os
import logging
import sys
import shutil
import utilities
import speciesData
import constants
"""
gga_get_data.py
Usage: $ python3 gga_get_data.py input_example.yml [OPTIONS]
"""
class GetData(speciesData.SpeciesData):
    """
    Child of SpeciesData

    Contains methods and attributes to copy data into the src_data subfolders of an organism
    """

    def make_directory_tree(self):
        """
        Generate the src_data directory tree for an organism

        The working directory is switched to ``self.species_dir`` while the
        directories are created and is set back to ``self.main_dir`` afterwards,
        even when the creation fails. The program exits if the species directory
        cannot be entered or if src_data cannot be created for lack of permission.

        :return:
        """
        os.chdir(self.main_dir)

        try:
            os.chdir(self.species_dir)
        except OSError as exc:
            logging.critical("Cannot access %s", self.genus_species)
            sys.exit(exc)

        try:
            # Creation (or updating) of the src_data directory tree
            try:
                os.mkdir("./src_data")
            except FileExistsError:
                logging.debug("'src_data' directory already exist for %s", self.full_name)
            except PermissionError as exc:
                logging.critical("Insufficient permission to create src_data directory tree")
                sys.exit(exc)

            # List of all the directories to create in src_data
            # (parents are listed before their children)
            src_data_dirs_li = [
                "./src_data",
                "./src_data/annotation",
                "./src_data/genome",
                "./src_data/tracks",
                "./src_data/annotation/%s" % self.species_folder_name,
                "./src_data/genome/%s" % self.species_folder_name,
                "./src_data/annotation/{0}/OGS{1}/".format(self.species_folder_name, self.ogs_version),
                "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version),
            ]
            make_dirs(dir_paths_li=src_data_dirs_li)
        finally:
            # Return to main directory, whatever happened above
            os.chdir(self.main_dir)

        logging.info("src_data directory tree generated for %s", self.full_name)

    def get_source_data_files_from_path(self):
        """
        Find source data files and copy them into the src_data dir tree

        Datasets whose path attribute is empty are skipped. A failed copy is only
        logged as a warning, the remaining datasets are still processed.
        The working directory is switched to ``self.species_dir`` during the copy,
        so relative dataset paths are resolved against the species directory.
        The program exits with an error status if that directory cannot be entered.

        :return:
        """
        try:
            os.chdir(self.species_dir)
        except OSError as exc:
            logging.critical("Cannot access %s", self.species_dir)
            # Exit with an error status (this used to be 0, which reported a success)
            sys.exit(exc)

        try:
            organism_annotation_dir = os.path.abspath(
                "./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
            organism_genome_dir = os.path.abspath(
                "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version))

            # (dataset key, source path, name of the attribute holding the target file name, target dir)
            # The file name attribute is only read when the dataset is actually copied
            datasets = (
                (constants.ORG_PARAM_DATA_GENOME_PATH, self.genome_path,
                 "genome_filename", organism_genome_dir),
                (constants.ORG_PARAM_DATA_GFF_PATH, self.gff_path,
                 "gff_filename", organism_annotation_dir),
                (constants.ORG_PARAM_DATA_TRANSCRIPTS_PATH, self.transcripts_path,
                 "transcripts_filename", organism_annotation_dir),
                (constants.ORG_PARAM_DATA_PROTEINS_PATH, self.proteins_path,
                 "proteins_filename", organism_annotation_dir),
                (constants.ORG_PARAM_DATA_INTERPRO_PATH, self.interpro_path,
                 "interpro_filename", organism_annotation_dir),
                (constants.ORG_PARAM_DATA_ORTHOFINDER_PATH, self.orthofinder_path,
                 "orthofinder_filename", organism_annotation_dir),
                (constants.ORG_PARAM_DATA_BLASTP_PATH, self.blastp_path,
                 "blastp_filename", organism_annotation_dir),
                (constants.ORG_PARAM_DATA_BLASTX_PATH, self.blastx_path,
                 "blastx_filename", organism_annotation_dir),
            )

            # Copy datasets in the organism src_data dir tree correct folder
            for dataset_key, source_path, fname_attr, target_dir in datasets:
                if not source_path:  # If dataset is not present in input file, skip copy
                    continue
                self._copy_dataset(dataset_key, source_path, target_dir, getattr(self, fname_attr))
        finally:
            os.chdir(self.main_dir)

    def _copy_dataset(self, dataset_key, source_path, target_dir, target_fname):
        """
        Copy one dataset file into target_dir under the name target_fname

        :param dataset_key: name of the dataset, only used in the log messages
        :param source_path: path of the file to copy (resolved against the working directory)
        :param target_dir: directory receiving the copy
        :param target_fname: file name of the copy
        :return:
        """
        logging.info("Copying %s (%s) into %s", dataset_key, source_path, target_dir)
        try:
            shutil.copyfile(os.path.abspath(source_path), os.path.join(target_dir, target_fname))
        except Exception as exc:
            # Kept broad on purpose: a dataset that cannot be copied must not
            # prevent the other datasets from being copied
            logging.warning("Could not copy %s (%s) - Exit Code: %s", dataset_key, source_path, exc)
def make_dirs(dir_paths_li):
    """
    Recursively create directories from a list of paths with a try-catch condition

    Missing parent directories are created as well. A path that already exists
    is left untouched and only reported at debug level. A permission error is
    logged as critical and ends the program.

    :param dir_paths_li: iterable of directory paths to create
    :return: list of the processed paths, in input order (paths that already
        existed are included too)
    """
    created_dir_paths_li = []

    for dir_path in dir_paths_li:
        try:
            # os.makedirs also creates the missing intermediate directories,
            # os.mkdir would raise an uncaught FileNotFoundError for them
            os.makedirs(dir_path)
        except FileExistsError:
            logging.debug("%s directory already exists", dir_path)
        except PermissionError as exc:
            logging.critical("Insufficient permission to create %s", dir_path)
            sys.exit(exc)
        created_dir_paths_li.append(dir_path)

    return created_dir_paths_li
if __name__ == "__main__":
    # Command line interface: one positional input file and two optional settings
    parser = argparse.ArgumentParser(description="Create 'src_data' tree and add data files")
    parser.add_argument("input",
                        type=str,
                        help="Input file (yml)")
    parser.add_argument("-v", "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.add_argument("--main-directory",
                        type=str,
                        help="Where the stack containers will be located, defaults to working directory")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    # Always work with an absolute main directory, the GetData methods chdir back to it
    if args.main_directory:
        args.main_directory = os.path.abspath(args.main_directory)
    else:
        args.main_directory = os.getcwd()

    sp_dict_list = utilities.parse_input(args.input)

    for sp_dict in sp_dict_list:
        # Creating an instance of get_data_for_current_species object
        get_data_for_current_species = GetData(parameters_dictionary=sp_dict)

        # Starting (the message used to name gga_load_data.py, which is another script)
        logging.info("gga_get_data.py called for %s", get_data_for_current_species.full_name)

        # Setting some of the instance attributes
        get_data_for_current_species.main_dir = args.main_directory
        get_data_for_current_species.species_dir = os.path.join(
            get_data_for_current_species.main_dir,
            get_data_for_current_species.genus_species + "/")

        # create src_data directory tree
        get_data_for_current_species.make_directory_tree()

        # Retrieve datasets
        logging.info("Finding and copying datasets for %s", get_data_for_current_species.full_name)
        get_data_for_current_species.get_source_data_files_from_path()
        logging.info("Sucessfully copied datasets for %s", get_data_for_current_species.full_name)