Commit 48cdd5a7
Authored 4 years ago by Arthur Le Bars

Delete main.py

Parent: 65f38814
No related branches or tags contain this commit.
Merge request: !1 Release 1.0

Changes: 1 changed file
main.py: deleted (100644 → 0), 0 additions, 352 deletions
from bioblend import galaxy
import bioblend.galaxy.objects as bbo
import bioblend as bb
import argparse
import os
import sys
import subprocess
import logging
import re
import json
from workflow import Workflow
from toolrunner import ToolRunner

"""
TODO: script description
python3 ~/PycharmProjects/ggauto/gga_load_data/main.py ~/PycharmProjects/ggauto/gga_load_data/dataloader.json
"""


def main():
    parser = argparse.ArgumentParser(description="Input genus, species, strain, version")
    parser.add_argument("json", type=str, help="Input JSON file")
    parser.add_argument("--just-load", help="Only load data into galaxy, does not create nor run analyses in galaxy")
    parser.add_argument("-v", "--verbose", help="Increase output verbosity")

    # CLI stuff
    # parser.add_argument("--name", help="Sample species name, format: genus-species", type=str)
    # parser.add_argument("--strain", help="Strain of the sample species", type=str)
    # parser.add_argument("--version", help="Data version (e.g 1.0, 1.2, ...)", type=str)
    # parser.add_argument("--common-name", help="Vernacular/common name of the species", type=str)

    user_arguments = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    # List that will hold all dicts from the JSON input file, containing parameters for each species
    sp_dict_list = []

    # JSON parsing and loading
    with open(user_arguments.json, 'r') as infile:
        json_sp_dict = json.load(infile)
        json_sp_dump = json.dumps(json_sp_dict, indent=4, sort_keys=True)
        for json_sp in json_sp_dict:
            sp_dict_list.append(json_sp)

    # Get variables from the current species dict
    for sp_dict in sp_dict_list:
        sp_params_list = []
        genus = sp_dict["genus"]
        genus_lower = genus[0].lower() + genus[1:]
        species = sp_dict["species"]
        genus_species = genus_lower + "_" + species
        common = sp_dict["common"]
        strain = sp_dict["strain"]
        sex = sp_dict["sex"]
        if strain != "":
            genus_species_strain = genus_species + "_" + strain
        else:
            genus_species_strain = genus_species
        ogs_version = sp_dict["ogs version"]
        genome_version = sp_dict["genome version"]
        performed_by = sp_dict["performed by"]

        # CLI stuff
        # genus = user_arguments.name.split('-')[0]
        # genus_lower = genus[0].lower().genus[1:]
        # genus_upper = genus[0].upper() + genus[1:]
        # species = user_arguments.name.split('-')[1]
        # strain = user_arguments.strain
        # vernacular = user_arguments.common_name

        # TODO: prompt y/n asking for the validity of info

        # Test adress, change to abims-gga.sb-roscoff.fr/sp/ in production
        instance_url = "http://localhost/sp/" + genus_lower + "_" + species + "/galaxy/"

        print("Species: " + genus + " " + species + " (" + common + ")"
              + "\nStrain: " + strain
              + "\nAccessing instance " + instance_url)

        # Connect to the galaxy instance of the current species TODO: connection issues (galaxy side)
        gi = galaxy.GalaxyInstance(url=instance_url,
                                   key="3b36455cb16b4d0e4348e2c42f4bb934",
                                   email="alebars@sb-roscoff.fr",
                                   password="pouet",
                                   verify=True)
        # admin_email = os.environ.get('GALAXY_DEFAULT_ADMIN_USER', 'admin@galaxy.org')
        # admin_pass = os.environ.get('GALAXY_DEFAULT_ADMIN_PASSWORD', 'admin')

        """
        This part creates the current species directory and go to it
        If it already exists, just move to it
        To be expanded when docker-swarm is implemented (or all config files are ready), not useful for now
        """
        main_dir = os.getcwd() + "/"
        sp_dir = os.path.join(main_dir, genus_species) + "/"

        try:
            os.mkdir(sp_dir)
        except FileExistsError:
            print("Directory " + sp_dir + " already exists")
        try:
            os.chdir(sp_dir)
            wd = os.getcwd()
        except OSError:
            print("Cannot access " + sp_dir + ", run with higher privileges")
            break

        # Production instance example TODO: secure pswd and API key + manage API keys
        # gi = galaxy.GalaxyInstance(url="http://abims-gga.sb-roscoff.fr/sp/ectocarpus_species1/galaxy/",
        #                            key="84dfbee3c0efa9155518f01fbeff57c8",
        #                            email="gga@sb-roscoff.fr",
        #                            password="****")

        # Check connection to the current instance
        print("Testing connection to the galaxy instance")
        try:
            hl = gi.histories.get_histories()
        except bb.ConnectionError:
            print("Cannot connect to galaxy instance @ " + instance_url)
        else:
            print("Successfully connected to galaxy instance @ " + instance_url)

        # TODO: FTP/symlink stuff to retrieve the datasets + change headers in pep.fasta

        # ---------------------------------------------------------------------
        # src_data directory tree creation
        # ---------------------------------------------------------------------

        src_data_folders = ["annotation", "genome"]
        species_folder_name = "_".join([genus_lower, species, strain, sex])
        try:
            os.mkdir("./src_data")
            os.mkdir("./src_data/annotation")
            os.mkdir("./src_data/genome")
            os.mkdir("./src_data/annotation/" + species_folder_name)
            os.mkdir("./src_data/genome/" + species_folder_name)
        except FileExistsError:
            print("src_data directory tree already exists")
            pass
        except PermissionError:
            print("Insufficient permission to create src_data directory tree")

        # ---------------------------------------------------------------------
        # Data import into galaxy
        # ---------------------------------------------------------------------

        source_files = dict()
        annotation_dir, genome_dir = None, None
        for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
            if "annotation/" in d:
                annotation_dir = d
                for f in os.listdir(d):
                    if f.endswith("proteins.fasta"):
                        source_files["proteins_file"] = os.path.join(d, f)
                    elif f.endswith("transcripts-gff.fa"):
                        source_files["transcripts_file"] = os.path.join(d, f)
                    elif f.endswith(".gff"):
                        source_files["gff_file"] = os.path.join(d, f)
                # annotation_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
            elif "genome/" in d:
                genome_dir = d
                for f in os.listdir(d):
                    if f.endswith(".fa"):
                        source_files["genome_file"] = os.path.join(d, f)
                # genome_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]

        print("Source files found:")
        for k, v in source_files.items():
            print("\t" + k + "\t" + v)

        # Changing headers in the *proteins.fasta file from >mRNA* to >protein*
        # production version
        modify_pep_headers = ["/usr/local/genome2/mmo/scripts/phaeoexplorer/phaeoexplorer-change_pep_fasta_header.sh",
                              source_files["proteins_file"]]
        # test version
        modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh",
                              source_files["proteins_file"]]
        print("Changing fasta headers in " + source_files["proteins_file"])
        subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir)

        # src_data cleaning
        if os.path.exists(annotation_dir + "outfile"):
            subprocess.run(["mv", annotation_dir + "/outfile", source_files["proteins_file"]],
                           stdout=subprocess.PIPE,
                           cwd=annotation_dir)
        if os.path.exists(annotation_dir + "gmon.out"):
            subprocess.run(["rm", annotation_dir + "/gmon.out"],
                           stdout=subprocess.PIPE,
                           cwd=annotation_dir)

        # TODO: load the data into the current species directory and load it into galaxy instance

        setup_data_libraries = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
        try:
            print("Loading data into the galaxy container")
            subprocess.run(setup_data_libraries, stdout=subprocess.PIPE, shell=True)
        except subprocess.CalledProcessError:
            print("Cannot load data into container for " + genus_species_strain)
            break
        else:
            print("Data successfully loaded into docker container for " + genus_species_strain)

        # generate workflow file and run it in the galaxy instance
        # gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
        hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version))
        hi_id = hi[0]["id"]
        li = gi.libraries.get_libraries()  # only one library
        li_id = gi.libraries.get_libraries()[0]["id"]  # project data folder/library
        fo_gi = gi.libraries.get_folders(library_id=li_id)  # data location (project data)
        fo_id = {}
        current_fo_name = ""
        # folders ids: access to data to run the first tools
        for i in fo_gi:
            for k, v in i.items():
                if k == "name":
                    fo_id[v] = 0
                    current_fo_name = v
                if k == "id":
                    fo_id[current_fo_name] = v

        print("Folders and datasets IDs:")
        datasets = dict()
        for k, v in fo_id.items():
            print("\t" + k + ": " + v)
            if k == "/genome":
                sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
                for k2, v2 in sub_folder_content.items():
                    for e in v2:
                        if type(e) == dict:
                            if e["name"].endswith(".fa"):
                                datasets["genome_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
            elif k == "/annotation/" + genus_species:
                sub_folder_content = gi.folders.show_folder(folder_id=v, contents=True)
                for k2, v2 in sub_folder_content.items():
                    for e in v2:
                        if type(e) == dict:
                            # TODO: manage several files of the same type and versions
                            if e["name"].endswith("transcripts-gff.fa"):
                                datasets["transcripts_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
                            elif e["name"].endswith("proteins.fasta"):
                                datasets["proteins_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
                            elif e["name"].endswith(".gff"):
                                datasets["gff_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])
                            elif e["name"].endswith("MALE"):
                                datasets["gff_file"] = e["ldda_id"]
                                print("\t\t" + e["name"] + ": " + e["ldda_id"])

        current_hi_id = gi.histories.get_current_history()["id"]
        print("History ID: " + current_hi_id)
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["genome_file"])
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["gff_file"])
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["transcripts_file"])
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["proteins_file"])

        toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        # toolrunner.show_pannel()  # show tools pannel (with tool_id and versions)

        # ---------------------------------------------------------------------
        # Galaxy instance interaction
        # ---------------------------------------------------------------------

        # # Delete Homo sapiens from Chado database
        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        # sapiens = toolrunner.get_sapiens_id()
        # sapiens_job_out = sapiens["outputs"][0]["id"]
        # sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
        # try:
        #     sapiens_output = json.loads(sapiens_json_output)[0]
        #     sapiens_id = str(sapiens_output["organism_id"])  # needs to be str to be recognized by the chado tool
        #     toolrunner.delete_sapiens(hs_id=sapiens_id)
        # except bb.ConnectionError:
        #     print("Homo sapiens isn't in the database")
        # except IndexError:
        #     pass
        #
        # # Workflow generation
        workflow = Workflow(parameters_dict=sp_dict, instance=gi, history_id=current_hi_id)
        # wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
        #
        # tools = gi.tools.get_tool_panel()  # tools panel -> alternative to wf
        # # print(tools)
        #
        # wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
        #
        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
        # wf_name = workflow.get_workflow_name()
        # wf_attr = gi.workflows.get_workflows(name=wf_name)
        # wf_id = wf_attr[0]["id"]
        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
        # print("Workflow ID: " + wf_id)
        #
        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        # # toolrunner.purge_organisms()
        #
        # # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)
        #
        # wf_params = workflow.set_main_workflow_parameters(datasets=datasets)
        # # print("Inputs:")
        # # print(wf_show["inputs"])
        #
        # datamap = dict()
        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
        # datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
        # datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
        # datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}
        #
        # gi.workflows.invoke_workflow(workflow_id=wf_id,
        #                              history_id=current_hi_id,
        #                              params=wf_params,
        #                              inputs=datamap)
        # gi.workflows.delete_workflow(workflow_id=wf_id)
        #
        # datamap = dict()
        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
        # datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
        #
        wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="main")
        wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
        #
        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
        # wf_attr = gi.workflows.get_workflows(name="jbrowse")
        # wf_id = wf_attr[0]["id"]
        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
        # print("Jbrowse workflow ID: " + wf_id)
        # wf_params = workflow.set_jbrowse_workflow_parameters()
        #
        # allow_tool_state_correction makes galaxy fill missing tool states,
        # because workflow was edited outside of galaxy with only some inputs (precaution parameter)
        # gi.workflows.invoke_workflow(workflow_id=wf_id,
        #                              history_id=current_hi_id,
        #                              params=wf_params,
        #                              inputs=datamap,
        #                              allow_tool_state_corrections=True)
        # gi.workflows.delete_workflow(workflow_id=wf_id)

        # remove active instance history for testing, purge configured @ ~/config/galaxy.yml.docker_sample
        # gi.histories.delete_history(history_id=current_hi_id, purge=True)
        os.chdir(main_dir)
        print("\n")


if __name__ == "__main__":
    main()
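
Note on the input file: the deleted script was driven by a JSON file (dataloader.json in the module docstring) that json.load() must return as a list of per-species dicts, since main() iterates over it and indexes each entry by key. A minimal sketch of one such entry is shown below; the keys are exactly those read by main() above, while the values are hypothetical placeholders, not taken from this commit.

# Illustrative only: one entry of the list that dataloader.json would contain.
# Keys mirror the lookups in main(); values are placeholders.
example_sp_dict_list = [
    {
        "genus": "Ectocarpus",
        "species": "species1",
        "common": "",
        "strain": "",
        "sex": "MALE",
        "ogs version": "1.0",
        "genome version": "1.0",
        "performed by": "",
    }
]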