Loraine Gueguen
--- a/gga_get_data.py 0 → 100755

+ 237

− 0
+++ b/gga_get_data.py 0 → 100755

+ 237

− 0
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import bioblend
+import argparse
+import os
+import subprocess
+import logging
+import sys
+import fnmatch
+import time
+import json
+import re
+import stat
+import shutil
+
+from bioblend.galaxy.objects import GalaxyInstance
+from bioblend import galaxy
+
+import utilities
+import speciesData
+
+""" 
+gga_get_data.py
+
+Usage: $ python3 gga_get_data.py input_example.yml [OPTIONS]
+
+"""
+
+
class GetData(speciesData.SpeciesData):
    """
    Child of SpeciesData

    Contains methods and attributes to copy data into the src_data subfolders of an organism

    The methods below read ``main_dir`` and ``species_dir`` from the instance; both
    must be assigned by the caller before any of them is invoked.
    """

    def goto_species_dir(self):
        """
        Go to the species directory (starting from the main dir)

        :return: 1 once the working directory is the species directory
        :raises SystemExit: with a non-zero status if the species directory cannot be entered
        """

        os.chdir(self.main_dir)
        species_dir = os.path.join(self.main_dir, self.genus_species) + "/"
        try:
            os.chdir(species_dir)
        except OSError as exc:
            logging.critical("Cannot access %s" % species_dir)
            # Exit with the error (non-zero status): this is a failure, not a clean stop
            sys.exit(exc)
        return 1

    def make_directory_tree(self):
        """
        Generate the src_data directory tree for an organism inside its species directory

        The working directory is changed to the species directory while the tree is
        built and set back to the main directory afterwards.

        :return:
        :raises SystemExit: if the species directory cannot be entered or a directory
                            cannot be created for lack of permission
        """

        os.chdir(self.main_dir)

        try:
            os.chdir(self.species_dir)
        except OSError as exc:
            logging.critical("Cannot access %s" % self.genus_species)
            sys.exit(exc)

        # Creation (or updating) of the src_data directory tree
        try:
            os.mkdir("./src_data")
        except FileExistsError:
            logging.debug("'src_data' directory already exist for %s" % self.full_name)
        except PermissionError as exc:
            logging.critical("Insufficient permission to create src_data directory tree")
            sys.exit(exc)

        # List of all the directories to create in src_data (parents listed before children)
        src_data_dirs_li = ["./src_data", "./src_data/annotation", "./src_data/genome", "./src_data/tracks",
                            "./src_data/annotation/%s" % self.species_folder_name,
                            "./src_data/genome/%s" % self.species_folder_name,
                            "./src_data/annotation/{0}/OGS{1}/".format(self.species_folder_name, self.ogs_version),
                            "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version)]
        make_dirs(dir_paths_li=src_data_dirs_li)

        # Return to main directory
        os.chdir(self.main_dir)

        logging.info("src_data directory tree generated for %s" % self.full_name)

    def get_source_data_files_from_path(self):
        """
        Copy the source data files listed on the instance into the src_data dir tree

        Each dataset whose path attribute is set is copied under a normalized file name
        into the genome or annotation folder of the organism. A failed copy is logged as
        a warning and does not stop the remaining copies.

        :return:
        :raises SystemExit: with a non-zero status if the species directory cannot be entered
        """

        try:
            os.chdir(self.species_dir)
        except OSError as exc:
            logging.critical("Cannot access " + self.species_dir)
            # Exit with the error (non-zero status): this is a failure, not a clean stop
            sys.exit(exc)

        organism_annotation_dir = os.path.abspath(
            "./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
        organism_genome_dir = os.path.abspath(
            "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version))

        # dataset name -> (source path, destination directory, destination file name)
        # TODO: Where to store blast results?
        datasets_to_get = {
            "genome_path": (self.genome_path, organism_genome_dir,
                            "v%s.fasta" % self.genome_version),
            "gff_path": (self.gff_path, organism_annotation_dir,
                         "OGS%s.gff" % self.ogs_version),
            "transcripts_path": (self.transcripts_path, organism_annotation_dir,
                                 "OGS%s_transcripts.fasta" % self.ogs_version),
            "proteins_path": (self.proteins_path, organism_annotation_dir,
                              "OGS%s_proteins.fasta" % self.ogs_version),
            "interpro_path": (self.interpro_path, organism_annotation_dir,
                              "OGS%s_interproscan.xml" % self.ogs_version),
            "orthofinder_path": (self.orthofinder_path, organism_annotation_dir,
                                 "OGS%s_orthofinder.tsv" % self.ogs_version),
            "blastp_path": (self.blastp_path, organism_annotation_dir,
                            "OGS%s_blastp.xml" % self.ogs_version),
            "blastx_path": (self.blastx_path, organism_annotation_dir,
                            "OGS%s_blastx.xml" % self.ogs_version),
        }

        # Copy datasets in the organism src_data dir tree correct folder
        for dataset_name, (source_path, dest_dir, dest_fname) in datasets_to_get.items():
            if not source_path:  # If dataset is not present in input file, skip copy
                continue
            logging.info("Copying {0} ({1}) into {2}".format(dataset_name, source_path, dest_dir))
            try:
                # Relative source paths are resolved against the species directory
                # because of the chdir above.
                # NOTE(review): confirm this is the intended base directory.
                shutil.copyfile(os.path.abspath(source_path), os.path.join(dest_dir, dest_fname))
            except Exception as exc:
                # Best effort: report the failure and carry on with the other datasets
                logging.warning("Could not copy {0} ({1}) - Exit Code: {2}".format(dataset_name, source_path, exc))

        os.chdir(self.main_dir)
+
+
def make_dirs(dir_paths_li):
    """
    Create every directory from a list of paths, including missing parent directories

    Paths that already exist are left untouched and reported at debug level.

    :param dir_paths_li: iterable of directory paths to create
    :return: list of the processed paths in input order (pre-existing ones included)
    :raises SystemExit: if a directory cannot be created for lack of permission
    """
    processed_dir_paths_li = []

    for dir_path in dir_paths_li:
        try:
            # makedirs (unlike mkdir) also creates the missing intermediate directories
            os.makedirs(dir_path)
        except FileExistsError:
            logging.debug("%s directory already exists" % dir_path)
        except PermissionError as exc:
            logging.critical("Insufficient permission to create %s" % dir_path)
            sys.exit(exc)
        processed_dir_paths_li.append(dir_path)

    return processed_dir_paths_li
+    
if __name__ == "__main__":
    # Command-line entry point: build the src_data tree and copy the datasets
    # for every species described in the input file.
    parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction "
                                                 "with galaxy instances for GGA"
                                                 ", following the protocol @ "
                                                 "http://gitlab.sb-roscoff.fr/abims/e-infra/gga")

    parser.add_argument("input",
                        type=str,
                        help="Input file (yml)")

    # store_true: verbosity is off by default and switched on by the flag
    parser.add_argument("-v", "--verbose",
                        help="Increase output verbosity",
                        action="store_true")

    parser.add_argument("--main-directory",
                        type=str,
                        help="Where the stack containers will be located, defaults to working directory")

    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
    # Keep urllib3 quiet even in verbose mode
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    if args.main_directory:
        args.main_directory = os.path.abspath(args.main_directory)
    else:
        args.main_directory = os.getcwd()

    sp_dict_list = utilities.parse_input(args.input)

    for sp_dict in sp_dict_list:

        # One GetData instance per species entry of the input file
        get_data_for_current_species = GetData(parameters_dictionary=sp_dict)

        # Starting
        logging.info("gga_get_data.py called for %s" % get_data_for_current_species.full_name)

        # Setting the instance attributes the GetData methods rely on
        get_data_for_current_species.main_dir = args.main_directory
        get_data_for_current_species.species_dir = os.path.join(get_data_for_current_species.main_dir,
                                                                 get_data_for_current_species.genus_species +
                                                                 "/")
        # Create src_data directory tree
        get_data_for_current_species.make_directory_tree()

        # Retrieve datasets
        logging.info("Finding and copying datasets for %s" % get_data_for_current_species.full_name)
        get_data_for_current_species.get_source_data_files_from_path()
        logging.info("Sucessfully copied datasets for %s" % get_data_for_current_species.full_name)
+