Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
gga_load_data
Manage
Activity
Members
Labels
Plan
Issues
12
Issue boards
Milestones
Wiki
Code
Merge requests
5
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
abims
e-infra
gga_load_data
Merge requests
!1
Release 1.0
Code
Review changes
Check out branch
Download
Patches
Plain diff
Merged
Release 1.0
dev
into
master
Overview
3
Commits
135
Pipelines
0
Changes
37
Merged
Loraine Gueguen
requested to merge
dev
into
master
4 years ago
Overview
3
Commits
135
Pipelines
0
Changes
6
Expand
👍
0
👎
0
Merge request reports
Viewing commit
7fb2d08e
Prev
Next
Show latest version
6 files
+
0
−
0
Inline
Compare changes
Side-by-side
Inline
Show whitespace changes
Show one file at a time
Files
6
Search (e.g. *.vue) (Ctrl+P)
7fb2d08e
make python scripts executable
· 7fb2d08e
Loraine Guéguen
authored
4 years ago
gga_get_data.py
0 → 100755
+
237
−
0
Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import
bioblend
import
argparse
import
os
import
subprocess
import
logging
import
sys
import
fnmatch
import
time
import
json
import
re
import
stat
import
shutil
from
bioblend.galaxy.objects
import
GalaxyInstance
from
bioblend
import
galaxy
import
utilities
import
speciesData
"""
gga_get_data.py
Usage: $ python3 gga_get_data.py input_example.yml [OPTIONS]
"""
class GetData(speciesData.SpeciesData):
    """
    Child of SpeciesData

    Contains methods and attributes to copy data into the src_data subfolders of an organism.

    The methods read ``main_dir`` and ``species_dir`` from the instance; in this script both
    are assigned by the caller after construction.
    """

    def goto_species_dir(self):
        """
        Go to the species directory (starting from the main dir)

        Exits the program with a non-zero status if the directory cannot be entered.

        :return: 1 once the working directory is the species directory
        """
        os.chdir(self.main_dir)
        species_dir = os.path.join(self.main_dir, self.genus_species) + "/"
        try:
            os.chdir(species_dir)
        except OSError as exc:
            logging.critical("Cannot access %s" % species_dir)
            # Passing the exception makes Python print it and exit with status 1;
            # exiting with 0 here would report a failure as a success
            sys.exit(exc)
        return 1

    def make_directory_tree(self):
        """
        Generate the directory tree for an organism

        Creates ``src_data`` and its annotation/genome/tracks subfolders inside the species
        directory, then changes the working directory back to the main directory.
        Exits the program if the species directory is not accessible or if a directory
        cannot be created because of insufficient permissions.

        :return:
        """
        os.chdir(self.main_dir)

        try:
            os.chdir(self.species_dir)
        except OSError as exc:
            logging.critical("Cannot access %s" % self.genus_species)
            sys.exit(exc)

        # Creation (or updating) of the src_data directory tree
        try:
            os.mkdir("./src_data")
        except FileExistsError:
            logging.debug("'src_data' directory already exist for %s" % self.full_name)
        except PermissionError as exc:
            logging.critical("Insufficient permission to create src_data directory tree")
            sys.exit(exc)

        # List of all the directories to create in src_data (parents listed before children)
        src_data_dirs_li = [
            "./src_data",
            "./src_data/annotation",
            "./src_data/genome",
            "./src_data/tracks",
            "./src_data/annotation/%s" % self.species_folder_name,
            "./src_data/genome/%s" % self.species_folder_name,
            "./src_data/annotation/{0}/OGS{1}/".format(self.species_folder_name, self.ogs_version),
            "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version),
        ]
        make_dirs(dir_paths_li=src_data_dirs_li)

        # Return to main directory
        os.chdir(self.main_dir)

        logging.info("src_data directory tree generated for %s" % self.full_name)

    def get_source_data_files_from_path(self):
        """
        Find source data files and copy them into the src_data dir tree

        Every dataset whose path attribute is set is copied under a normalized file name into
        the genome or annotation folder of the organism. Datasets without a path are skipped.
        A failed copy is logged as a warning and does not stop the remaining copies.
        Exits the program with a non-zero status if the species directory is not accessible.

        :return:
        """
        try:
            os.chdir(self.species_dir)
        except OSError as exc:
            logging.critical("Cannot access " + self.species_dir)
            # Non-zero exit status: this is a failure, not a normal termination
            sys.exit(exc)

        # The working directory is now the species directory, so these relative paths
        # (and any relative dataset path given as input) are resolved against it
        organism_annotation_dir = os.path.abspath(
            "./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
        organism_genome_dir = os.path.abspath(
            "./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version))

        # (dataset key, source path, destination directory, destination file name)
        datasets_to_get = [
            ("genome_path", self.genome_path, organism_genome_dir,
             "v%s.fasta" % self.genome_version),
            ("gff_path", self.gff_path, organism_annotation_dir,
             "OGS%s.gff" % self.ogs_version),
            ("transcripts_path", self.transcripts_path, organism_annotation_dir,
             "OGS%s_transcripts.fasta" % self.ogs_version),
            ("proteins_path", self.proteins_path, organism_annotation_dir,
             "OGS%s_proteins.fasta" % self.ogs_version),
            ("interpro_path", self.interpro_path, organism_annotation_dir,
             "OGS%s_interproscan.xml" % self.ogs_version),
            ("orthofinder_path", self.orthofinder_path, organism_annotation_dir,
             "OGS%s_orthofinder.tsv" % self.ogs_version),
            ("blastp_path", self.blastp_path, organism_annotation_dir,
             "OGS%s_blastp.xml" % self.ogs_version),
            ("blastx_path", self.blastx_path, organism_annotation_dir,
             "OGS%s_blastx.xml" % self.ogs_version),
        ]

        # Copy datasets in the organism src_data dir tree correct folder
        for dataset_key, source_path, target_dir, target_fname in datasets_to_get:
            if not source_path:
                # If dataset is not present in input file, skip copy
                continue
            logging.info("Copying {0} ({1}) into {2}".format(dataset_key, source_path, target_dir))
            try:
                shutil.copyfile(os.path.abspath(source_path), os.path.join(target_dir, target_fname))
            except Exception as exc:
                # Deliberately best-effort: report the failure and carry on with the next dataset
                logging.warning("Could not copy {0} ({1}) - Exit Code: {2}".format(dataset_key, source_path, exc))

        os.chdir(self.main_dir)
def make_dirs(dir_paths_li):
    """
    Recursively create directories from a list of paths with a try-catch condition

    Missing parent directories are created as needed. A path that already exists is only
    reported at debug level. Insufficient permissions stop the program.

    :param dir_paths_li: iterable of directory paths to create
    :return: list of the processed paths, in input order (already existing ones included)
    """
    created_dir_paths_li = []

    for dir_path in dir_paths_li:
        try:
            # makedirs (rather than mkdir) so the creation is really recursive
            os.makedirs(dir_path)
        except FileExistsError:
            logging.debug("%s directory already exists", dir_path)
        except PermissionError as exc:
            logging.critical("Insufficient permission to create %s", dir_path)
            sys.exit(exc)
        created_dir_paths_li.append(dir_path)

    return created_dir_paths_li
if __name__ == "__main__":
    # Command-line entry point: build the src_data tree and copy the datasets of every
    # species described in the input file
    parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction "
                                                 "with galaxy instances for GGA"
                                                 ", following the protocol @ "
                                                 "http://gitlab.sb-roscoff.fr/abims/e-infra/gga")

    parser.add_argument("input",
                        type=str,
                        help="Input file (yml)")

    # store_true: verbosity is off by default and -v turns it on, as the help text states
    # (store_false made DEBUG the default and -v the way to reduce it)
    parser.add_argument("-v", "--verbose",
                        help="Increase output verbosity",
                        action="store_true")

    parser.add_argument("--main-directory",
                        type=str,
                        help="Where the stack containers will be located, defaults to working directory")

    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    # Keep urllib3 quiet even when debugging
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    if not args.main_directory:
        args.main_directory = os.getcwd()
    else:
        args.main_directory = os.path.abspath(args.main_directory)

    sp_dict_list = utilities.parse_input(args.input)

    for sp_dict in sp_dict_list:

        # Creating an instance of get_data_for_current_species object
        get_data_for_current_species = GetData(parameters_dictionary=sp_dict)

        # Starting
        logging.info("gga_get_data.py called for %s" % get_data_for_current_species.full_name)

        # Setting some of the instance attributes
        get_data_for_current_species.main_dir = args.main_directory
        get_data_for_current_species.species_dir = os.path.join(get_data_for_current_species.main_dir,
                                                                get_data_for_current_species.genus_species +
                                                                "/")

        # create src_data directory tree
        get_data_for_current_species.make_directory_tree()

        # Retrieve datasets
        logging.info("Finding and copying datasets for %s" % get_data_for_current_species.full_name)
        get_data_for_current_species.get_source_data_files_from_path()

        logging.info("Sucessfully copied datasets for %s" % get_data_for_current_species.full_name)
Loading