Skip to content
Snippets Groups Projects

Release_2.0 next

Merged Loraine Gueguen requested to merge release_2.0 into dev
3 files
+ 25
19
Compare changes
  • Side-by-side
  • Inline
Files
3
+ 40
51
@@ -10,6 +10,7 @@ import shutil
import utilities
import speciesData
import constants
"""
gga_get_data.py
@@ -87,55 +88,47 @@ class GetData(speciesData.SpeciesData):
organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
organism_genome_dir = os.path.abspath("./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version))
datasets_to_get = {"genome_path": self.genome_path,
"gff_path": self.gff_path,
"transcripts_path": self.transcripts_path,
"proteins_path": self.proteins_path,
"interpro_path": self.interpro_path,
"orthofinder_path": self.orthofinder_path,
"blastp_path": self.blastp_path,
"blastx_path": self.blastx_path}
genome_datasets = ["genome_path"]
annotation_datasets = ["gff_path", "transcripts_path", "proteins_path", "orthofinder_path", "interpro_path", "blastp_path", "blastx_path"]
# Where to store blast results?
# search_excluded_datasets = ["interpro_path", "orthofinder_path", "blastp_path", "blastx_path"]
# These datasets will not be searched if missing in the input file
genome_datasets = {constants.ORG_PARAM_DATA_GENOME_PATH: self.genome_path}
annotation_datasets = {constants.ORG_PARAM_DATA_GFF_PATH: self.gff_path,
constants.ORG_PARAM_DATA_TRANSCRIPTS_PATH: self.transcripts_path,
constants.ORG_PARAM_DATA_PROTEINS_PATH: self.proteins_path,
constants.ORG_PARAM_DATA_INTERPRO_PATH: self.interpro_path,
constants.ORG_PARAM_DATA_ORTHOFINDER_PATH: self.orthofinder_path,
constants.ORG_PARAM_DATA_BLASTP_PATH: self.blastp_path,
constants.ORG_PARAM_DATA_BLASTX_PATH: self.blastx_path}
# Copy each dataset into the correct folder of the organism's src_data directory tree
for k, v in datasets_to_get.items():
for k, v in genome_datasets.items():
if v: # If dataset is not present in input file, skip copy
if k in genome_datasets:
logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir))
genome_fname = "{0}_v{1}.fasta".format(self.dataset_prefix, self.genome_version)
try:
shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, genome_fname))
except Exception as exc:
logging.warning("Could not copy {0} ({1}) - Exit Code: {2})".format(k, v, exc))
elif k in annotation_datasets:
dataset_fname = ""
if k == "gff_path":
dataset_fname = "{0}_OGS{1}_{2}.gff".format(self.dataset_prefix, self.ogs_version, self.get_last_modified_time_string(os.path.abspath(v)))
elif k == "transcripts_path":
dataset_fname = "{0}_OGS{1}_transcripts.fasta".format(self.dataset_prefix, self.ogs_version)
elif k == "proteins_path":
dataset_fname = "{0}_OGS{1}_proteins.fasta".format(self.dataset_prefix, self.ogs_version)
elif k == "orthofinder_path":
dataset_fname = "{0}_OGS{1}_orthofinder.tsv".format(self.dataset_prefix, self.ogs_version)
elif k == "interpro_path":
dataset_fname = "{0}_OGS{1}_interproscan.xml".format(self.dataset_prefix, self.ogs_version)
elif k == "blastp_path":
dataset_fname = "{0}_OGS{1}_blastp.xml".format(self.dataset_prefix, self.ogs_version)
elif k == "blastx_path":
dataset_fname = "{0}_OGS{1}_blastx.xml".format(self.dataset_prefix, self.ogs_version)
logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
try:
shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, dataset_fname))
except Exception as exc:
logging.warning("Could not copy {0} ({1}) - Exit Code: {2}".format(k, v, exc))
else:
pass
logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir))
genome_fname = "{0}_v{1}.fasta".format(self.dataset_prefix, self.genome_version)
try:
shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, genome_fname))
except Exception as exc:
logging.warning("Could not copy {0} ({1}) - Exit Code: {2})".format(k, v, exc))
for k, v in annotation_datasets.items():
if v: # If dataset is not present in input file, skip copy
dataset_fname = ""
if k == constants.ORG_PARAM_DATA_GFF_PATH:
dataset_fname = "{0}_OGS{1}_{2}.gff".format(self.dataset_prefix, self.ogs_version, self.get_last_modified_time_string(os.path.abspath(v)))
elif k == constants.ORG_PARAM_DATA_TRANSCRIPTS_PATH:
dataset_fname = "{0}_OGS{1}_transcripts.fasta".format(self.dataset_prefix, self.ogs_version)
elif k == constants.ORG_PARAM_DATA_PROTEINS_PATH:
dataset_fname = "{0}_OGS{1}_proteins.fasta".format(self.dataset_prefix, self.ogs_version)
elif k == constants.ORG_PARAM_DATA_ORTHOFINDER_PATH:
dataset_fname = "{0}_OGS{1}_orthofinder.tsv".format(self.dataset_prefix, self.ogs_version)
elif k == constants.ORG_PARAM_DATA_INTERPRO_PATH:
dataset_fname = "{0}_OGS{1}_interproscan.xml".format(self.dataset_prefix, self.ogs_version)
elif k == constants.ORG_PARAM_DATA_BLASTP_PATH:
dataset_fname = "{0}_OGS{1}_blastp.xml".format(self.dataset_prefix, self.ogs_version)
elif k == constants.ORG_PARAM_DATA_BLASTX_PATH:
dataset_fname = "{0}_OGS{1}_blastx.xml".format(self.dataset_prefix, self.ogs_version)
logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
try:
shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, dataset_fname))
except Exception as exc:
logging.warning("Could not copy {0} ({1}) - Exit Code: {2}".format(k, v, exc))
os.chdir(self.main_dir)
@@ -162,10 +155,7 @@ def make_dirs(dir_paths_li):
return created_dir_paths_li
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction "
"with galaxy instances for GGA"
", following the protocol @ "
"http://gitlab.sb-roscoff.fr/abims/e-infra/gga")
parser = argparse.ArgumentParser(description="Create 'src_data' tree and add data files")
parser.add_argument("input",
type=str,
@@ -212,4 +202,3 @@ if __name__ == "__main__":
logging.info("Finding and copying datasets for %s" % get_data_for_current_species.full_name)
get_data_for_current_species.get_source_data_files_from_path()
logging.info("Sucessfully copied datasets for %s" % get_data_for_current_species.full_name)
Loading