#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
gga_load_data.py

Usage: $ python3 gga_load_data.py example.yml [OPTIONS]
"""

import argparse
import logging
import os
import shutil
import subprocess
import sys
from datetime import datetime

import bioblend
import bioblend.galaxy.objects
from bioblend import galaxy

import speciesData
import utilities


class LoadData(speciesData.SpeciesData):
    """
    Load data from the src_data subfolders into the galaxy instance's history of a given species
    """

    def modify_fasta_headers(self):
        """
        Change the fasta headers before integration.

        Moves into the species directory, locates the source files under
        ``src_data`` and runs the header-rewriting shell scripts shipped in
        ``<main_dir>/gga_load_data/utils`` on the proteins and transcripts
        fasta files.

        Exits the program (status 1) when the species directory cannot be
        entered or when the proteins/transcripts file cannot be found.

        :return:
        """
        try:
            os.chdir(self.species_dir)
        except OSError:
            logging.info("Cannot access " + self.species_dir + ", run with higher privileges")
            logging.info("Fatal error: exit")
            sys.exit(1)

        annotation_dir = self._find_source_files()

        logging.debug("source files found:")
        for k, v in self.source_files.items():
            logging.debug("\t" + k + "\t" + v)

        # Both files are required below; fail with an explicit message rather
        # than with a bare KeyError.
        missing = [k for k in ("proteins_file", "transcripts_file") if k not in self.source_files]
        if missing:
            logging.critical("Missing source file(s) in %s/src_data: %s", os.getcwd(), ", ".join(missing))
            sys.exit(1)

        utils_dir = os.path.join(str(self.main_dir), "gga_load_data", "utils")

        # Changing headers in the *proteins.fasta file from >mRNA* to >protein*
        self._rewrite_headers(os.path.join(utils_dir, "phaeoexplorer-change_pep_fasta_header.sh"),
                              self.source_files["proteins_file"],
                              annotation_dir)

        # The transcript script must be given the transcripts file (it was
        # previously handed the proteins file a second time).
        self._rewrite_headers(os.path.join(utils_dir, "phaeoexplorer-change_transcript_fasta_header.sh"),
                              self.source_files["transcripts_file"],
                              annotation_dir)

        # src_data cleaning
        gmon_file = os.path.join(annotation_dir, "gmon.out")
        if os.path.exists(gmon_file):
            os.remove(gmon_file)

    def _find_source_files(self):
        """
        Walk ``./src_data`` and record the source files in ``self.source_files``.

        Keys set (when a matching file exists): ``proteins_file``,
        ``transcripts_file``, ``gff_file`` and ``genome_file``.

        :return: the last visited directory whose path contains "annotation/",
                 or None if there is none
        """
        self.source_files = dict()
        annotation_dir = None
        for d, _subdirs, filenames in os.walk(os.path.join(os.getcwd(), "src_data")):
            if "annotation/" in d:
                annotation_dir = d
                for f in filenames:
                    if f.endswith("proteins.fasta"):
                        self.source_files["proteins_file"] = os.path.join(d, f)
                    elif f.endswith("transcripts-gff.fa"):
                        self.source_files["transcripts_file"] = os.path.join(d, f)
                    elif f.endswith(".gff"):
                        self.source_files["gff_file"] = os.path.join(d, f)
            elif "genome/" in d:
                for f in filenames:
                    if f.endswith(".fa"):
                        self.source_files["genome_file"] = os.path.join(d, f)
        return annotation_dir

    @staticmethod
    def _rewrite_headers(script, fasta_file, work_dir):
        """
        Run one header-rewriting script on a fasta file from within work_dir.

        :param script: path to the shell script to execute
        :param fasta_file: path of the fasta file passed to the script
        :param work_dir: directory the script is run from
        :return:
        """
        logging.info("Changing fasta headers: " + fasta_file)
        result = subprocess.run([script, fasta_file], stdout=subprocess.PIPE, cwd=work_dir)
        if result.returncode != 0:
            # Best effort, as before: report the failure but keep going.
            logging.warning("%s exited with status %d", script, result.returncode)

        # NOTE(review): presumably the scripts may leave their result in a file
        # named "outfile" in the working directory; when present it replaces
        # the file that was just processed -- confirm against the scripts.
        leftover = os.path.join(work_dir, "outfile")
        if os.path.exists(leftover):
            shutil.move(leftover, fasta_file)

    def setup_data_libraries(self):
        """
        Load data into the galaxy container with the galaxy_data_libs_SI.py script
        (run through the ../serexec command), then call get_species_history_id()

        :return:
        """
        try:
            logging.info("Loading data into the galaxy container")
            # check=True is required for a failing command to raise
            # CalledProcessError; without it the except branch was unreachable.
            subprocess.run("../serexec genus_species_galaxy /tool_deps/_conda/bin/python /opt/galaxy_data_libs_SI.py",
                           shell=True,
                           check=True)
        except subprocess.CalledProcessError:
            logging.info("Cannot load data into the galaxy container for " + self.full_name)
        else:
            logging.info("Data successfully loaded into the galaxy container for " + self.full_name)

        self.get_species_history_id()
        # self.get_instance_attributes()
        #
        # # import all datasets into current history
        # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"])
        # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"])
        # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
        # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])

    def generate_blast_banks(self):
        """
        Automatically generate blast banks for a species and commit

        Not implemented yet.

        :return:
        """

    def connect_to_instance(self):
        """
        Test the connection to the galaxy instance for the current organism
        Exit (status 1) if it cannot connect to the instance
        """
        # NOTE(review): credentials are hard-coded and TLS verification is
        # disabled -- consider reading them from the configuration instead.
        self.instance = galaxy.GalaxyInstance(url=self.instance_url,
                                              email="gga@sb-roscoff.fr",
                                              password="password",
                                              verify=False)

        logging.info("Connecting to the galaxy instance ...")
        try:
            self.instance.histories.get_histories()
            self.tool_panel = self.instance.tools.get_tool_panel()
        except bioblend.ConnectionError:
            logging.critical("Cannot connect to galaxy instance @ " + self.instance_url)
            sys.exit(1)
        else:
            logging.info("Successfully connected to galaxy instance @ " + self.instance_url)
        # NOTE(review): creates a history named "FOO" on every successful
        # connection; looks like a debugging leftover -- confirm before removing.
        self.instance.histories.create_history(name="FOO")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction "
                                                 "with galaxy instances for GGA"
                                                 ", following the protocol @ "
                                                 "http://gitlab.sb-roscoff.fr/abims/e-infra/gga")

    parser.add_argument("input",
                        type=str,
                        help="Input file (yml)")

    # store_true: verbosity is off unless -v is given (store_false inverted it)
    parser.add_argument("-v", "--verbose",
                        help="Increase output verbosity",
                        action="store_true")

    # The script reads args.dir, so the option has to be declared.
    # NOTE(review): the current working directory is assumed as default.
    parser.add_argument("-d", "--dir",
                        type=str,
                        default=os.getcwd(),
                        help="Directory containing the gga_load_data folder "
                             "(defaults to the current working directory)")

    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    logging.info("Load data: start")
    sp_dict_list = utilities.parse_input(args.input)

    # Resolve once, before modify_fasta_headers() changes the working
    # directory, so a relative path means the same thing for every species.
    main_dir = os.path.abspath(args.dir)

    for sp_dict in sp_dict_list:
        o = LoadData(parameters_dictionary=sp_dict)
        o.main_dir = main_dir
        o.modify_fasta_headers()
        logging.info("Successfully formatted files headers " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex)
        # o.setup_data_libraries()
        # logging.info("Successfully set up data libraries in galaxy for " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex)

    logging.info("Load data: done")