Skip to content
Snippets Groups Projects

Release 2.0 (merge dev to master)

Merged Loraine Gueguen requested to merge dev into master
1 file
+ 7
1
Compare changes
  • Side-by-side
  • Inline
+ 47
80
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import bioblend
import argparse
import os
import subprocess
import logging
import sys
import fnmatch
import time
import json
import re
import stat
import shutil
from bioblend.galaxy.objects import GalaxyInstance
from bioblend import galaxy
import utilities
import speciesData
import constants
"""
gga_get_data.py
@@ -36,24 +28,6 @@ class GetData(speciesData.SpeciesData):
"""
def goto_species_dir(self):
    """
    Go to the species directory (starting from the main dir)

    The working directory is first set to ``self.main_dir`` and then to the
    ``self.genus_species`` sub-directory located directly below it.

    :return: 1 once the working directory is the species directory
    :raises SystemExit: with exit status 1 when the species directory
        cannot be entered
    """
    os.chdir(self.main_dir)
    species_dir = os.path.join(self.main_dir, self.genus_species) + "/"
    try:
        os.chdir(species_dir)
    except OSError:
        logging.critical("Cannot access %s" % species_dir)
        # Exit with a non-zero status: this is a failure, and status 0 would
        # tell the calling shell/pipeline that everything went fine.
        sys.exit(1)
    return 1
def make_directory_tree(self):
"""
Generate the directory tree for an organism
@@ -91,6 +65,12 @@ class GetData(speciesData.SpeciesData):
logging.info("src_data directory tree generated for %s" % self.full_name)
def get_last_modified_time_string(self, filePath):
    """
    Give the last modification date of a file, with format '20190130'

    :param filePath: path of the file whose modification time is read
    :return: the modification date (local time) as a 'YYYYMMDD' string
    """
    mtime_struct = time.localtime(os.path.getmtime(filePath))
    return time.strftime("%Y%m%d", mtime_struct)
def get_source_data_files_from_path(self):
"""
@@ -108,55 +88,47 @@ class GetData(speciesData.SpeciesData):
organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
organism_genome_dir = os.path.abspath("./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version))
datasets_to_get = {"genome_path": self.genome_path,
"gff_path": self.gff_path,
"transcripts_path": self.transcripts_path,
"proteins_path": self.proteins_path,
"interpro_path": self.interpro_path,
"orthofinder_path": self.orthofinder_path,
"blastp_path": self.blastp_path,
"blastx_path": self.blastx_path}
genome_datasets = ["genome_path"]
annotation_datasets = ["gff_path", "transcripts_path", "proteins_path", "orthofinder_path", "interpro_path", "blastp_path", "blastx_path"]
# Where to store blast results?
# search_excluded_datasets = ["interpro_path", "orthofinder_path", "blastp_path", "blastx_path"]
# # These datasets will not be searched if missing in the input file
genome_datasets = {constants.ORG_PARAM_DATA_GENOME_PATH: self.genome_path}
annotation_datasets = {constants.ORG_PARAM_DATA_GFF_PATH: self.gff_path,
constants.ORG_PARAM_DATA_TRANSCRIPTS_PATH: self.transcripts_path,
constants.ORG_PARAM_DATA_PROTEINS_PATH: self.proteins_path,
constants.ORG_PARAM_DATA_INTERPRO_PATH: self.interpro_path,
constants.ORG_PARAM_DATA_ORTHOFINDER_PATH: self.orthofinder_path,
constants.ORG_PARAM_DATA_BLASTP_PATH: self.blastp_path,
constants.ORG_PARAM_DATA_BLASTX_PATH: self.blastx_path}
# Copy datasets in the organism src_data dir tree correct folder
for k, v in datasets_to_get.items():
for k, v in genome_datasets.items():
if v: # If dataset is not present in input file, skip copy
if k in genome_datasets:
logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir))
genome_fname = "v%s.fasta" % self.genome_version
try:
shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, genome_fname))
except Exception as exc:
logging.warning("Could not copy {0} ({1}) - Exit Code: {2})".format(k, v, exc))
elif k in annotation_datasets:
dataset_fname = ""
if k == "gff_path":
dataset_fname = "OGS%s.gff" % self.ogs_version
elif k == "transcripts_path":
dataset_fname = "OGS%s_transcripts.fasta" % self.ogs_version
elif k == "proteins_path":
dataset_fname = "OGS%s_proteins.fasta" % self.ogs_version
elif k == "orthofinder_path":
dataset_fname = "OGS%s_orthofinder.tsv" % self.ogs_version
elif k == "interpro_path":
dataset_fname = "OGS%s_interproscan.xml" % self.ogs_version
elif k == "blastp_path":
dataset_fname = "OGS%s_blastp.xml" % self.ogs_version
elif k == "blastx_path":
dataset_fname = "OGS%s_blastx.xml" % self.ogs_version
logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
try:
shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, dataset_fname))
except Exception as exc:
logging.warning("Could not copy {0} ({1}) - Exit Code: {2}".format(k, v, exc))
else:
pass
logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir))
genome_fname = "{0}_v{1}.fasta".format(self.dataset_prefix, self.genome_version)
try:
shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, genome_fname))
except Exception as exc:
logging.warning("Could not copy {0} ({1}) - Exit Code: {2})".format(k, v, exc))
for k, v in annotation_datasets.items():
if v: # If dataset is not present in input file, skip copy
dataset_fname = ""
if k == constants.ORG_PARAM_DATA_GFF_PATH:
dataset_fname = "{0}_OGS{1}_{2}.gff".format(self.dataset_prefix, self.ogs_version, self.get_last_modified_time_string(os.path.abspath(v)))
elif k == constants.ORG_PARAM_DATA_TRANSCRIPTS_PATH:
dataset_fname = "{0}_OGS{1}_transcripts.fasta".format(self.dataset_prefix, self.ogs_version)
elif k == constants.ORG_PARAM_DATA_PROTEINS_PATH:
dataset_fname = "{0}_OGS{1}_proteins.fasta".format(self.dataset_prefix, self.ogs_version)
elif k == constants.ORG_PARAM_DATA_ORTHOFINDER_PATH:
dataset_fname = "{0}_OGS{1}_orthofinder.tsv".format(self.dataset_prefix, self.ogs_version)
elif k == constants.ORG_PARAM_DATA_INTERPRO_PATH:
dataset_fname = "{0}_OGS{1}_interproscan.xml".format(self.dataset_prefix, self.ogs_version)
elif k == constants.ORG_PARAM_DATA_BLASTP_PATH:
dataset_fname = "{0}_OGS{1}_blastp.xml".format(self.dataset_prefix, self.ogs_version)
elif k == constants.ORG_PARAM_DATA_BLASTX_PATH:
dataset_fname = "{0}_OGS{1}_blastx.xml".format(self.dataset_prefix, self.ogs_version)
logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
try:
shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, dataset_fname))
except Exception as exc:
logging.warning("Could not copy {0} ({1}) - Exit Code: {2}".format(k, v, exc))
os.chdir(self.main_dir)
@@ -183,10 +155,7 @@ def make_dirs(dir_paths_li):
return created_dir_paths_li
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction "
"with galaxy instances for GGA"
", following the protocol @ "
"http://gitlab.sb-roscoff.fr/abims/e-infra/gga")
parser = argparse.ArgumentParser(description="Create 'src_data' tree and add data files")
parser.add_argument("input",
type=str,
@@ -194,7 +163,7 @@ if __name__ == "__main__":
parser.add_argument("-v", "--verbose",
help="Increase output verbosity",
action="store_false")
action="store_true")
parser.add_argument("--main-directory",
type=str,
@@ -206,7 +175,6 @@ if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
logging.getLogger("urllib3").setLevel(logging.WARNING)
if not args.main_directory:
args.main_directory = os.getcwd()
@@ -234,4 +202,3 @@ if __name__ == "__main__":
logging.info("Finding and copying datasets for %s" % get_data_for_current_species.full_name)
get_data_for_current_species.get_source_data_files_from_path()
logging.info("Sucessfully copied datasets for %s" % get_data_for_current_species.full_name)
Loading