Commit a9daa4b9, authored 5 years ago by Arthur Le Bars
.ga correct editing for jbrowse workflow
Parent: de797749
Merge request: !1 (Release 1.0)
Showing 2 changed files:
main.py      +125 −83   (125 additions, 83 deletions)
workflow.py   +14 −7    (14 additions, 7 deletions)

139 additions and 90 deletions in total
main.py  (+125 −83)
...

@@ -5,9 +5,9 @@ import argparse
import os
import sys
import subprocess
import logging
import re
import json
import urllib3 as ul
from chado import ChadoInstance
from workflow import Workflow
from toolrunner import ToolRunner

"""

...
@@ -17,22 +17,12 @@ python3 ~/PycharmProjects/ggauto/gga_load_data/main.py ~/PycharmProjects/ggauto/
"""


class Autoload:
    """
    TODO: turn main into an object
    """
    def __init__(self, json_in):
        self.json_in = json_in

    def main(self):
        return None


def main():
    parser = argparse.ArgumentParser(description="Input genus, species, strain, version")
    parser.add_argument("json", type=str, help="Input JSON file")
    parser.add_argument("--just-load", help="Only load data into galaxy, does not create nor run analyses in galaxy")
    parser.add_argument("-v", "--verbose", help="Increase output verbosity")
    # CLI stuff
    # parser.add_argument("--name", help="Sample species name, format: genus-species", type=str)

...
@@ -42,6 +32,8 @@ def main():
    user_arguments = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    # List that will hold all dicts from the JSON input file, containing parameters for each species
    sp_dict_list = []

...
@@ -81,17 +73,20 @@ def main():
        # Test adress, change to abims-gga.sb-roscoff.fr/sp/ in production
        instance_url = "http://localhost/sp/" + genus_lower + "_" + species + "/galaxy/"

        print("Species: " + genus + " " + species + " (" + common + ")" "\nStrain: " + strain + "\nAccessing instance " + instance_url)
        print("Species: " + genus + " " + species + " (" + common + ")" + "\nStrain: " + strain + "\nAccessing instance " + instance_url)

        # Connect to the galaxy instance of the current species TODO: API key connection issues
        # Connect to the galaxy instance of the current species TODO: connection issues (galaxy side)
        gi = galaxy.GalaxyInstance(url=instance_url,
                                   key="3b36455cb16b4d0e4348e2c42f4bb934",
                                   email="alebars@sb-roscoff.fr",
                                   password="pouet",
                                   verify=True)
        # admin_email = os.environ.get('GALAXY_DEFAULT_ADMIN_USER', 'admin@galaxy.org')
        # admin_pass = os.environ.get('GALAXY_DEFAULT_ADMIN_PASSWORD', 'admin')

        """
        This part creates the current species directory and go to it
        If it already exists, just move to it

...
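The commented-out admin_email / admin_pass lines above hint at an environment-variable alternative to the hard-coded key, email and password. A minimal sketch of that approach, with made-up variable names (GALAXY_URL and GALAXY_API_KEY are illustrative assumptions, not part of this commit):

    # Sketch only: build the bioblend connection from environment variables
    # instead of literals embedded in the source.
    import os
    from bioblend import galaxy

    instance_url = os.environ.get("GALAXY_URL", "http://localhost/sp/genus_species/galaxy/")
    gi = galaxy.GalaxyInstance(url=instance_url,
                               key=os.environ.get("GALAXY_API_KEY"),
                               verify=True)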
@@ -118,22 +113,62 @@ def main():
        #                              password="****")

        # Check connection to the current instance
        print("Testing connection to the galaxy instance")
        try:
            hl = gi.histories.get_histories()
        except bb.ConnectionError:
            print("Cannot connect to GGA instance @ " + instance_url)
            print("Cannot connect to galaxy instance @ " + instance_url)
        else:
            print("Successfully connected to instance " + instance_url)
            print("Successfully connected to galaxy instance @ " + instance_url)

        # TODO: FTP stuff to retrieve the datasets (used in testing, not needed for production)
        # TODO: FTP/symlink stuff to retrieve the datasets + change headers in pep.fasta
        setup_data_libraries_cl = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"

        # try:
        #     os.mkdir("./src_data")
        # except FileExistsError:
        #     print("src_data folder already exists for " + genus_species_strain)
        #     print("Loading data into galaxy...")
        #     try:
        #         setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
        #         print("Output from setup_data_libraries.py")
        #         print(setup_data_libraries.communicate())
        #     except bb.ConnectionError:
        #         print("Cannot load data into container for " + genus_species_strain)
        #         break
        #     else:
        #         print("Data successfully loaded into docker container for " + genus_species_strain)
        # else:
        #     print("src_data folder created for " + genus_species_strain)
        #
        #     try:
        #         setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
        #         print("Output from setup_data_libraries.py")
        #         print(setup_data_libraries.communicate())
        #     except bb.ConnectionError:
        #         print("Cannot load data into container for " + genus_species_strain)
        #         break
        #     else:
        #         print("Data successfully loaded into docker container for " + genus_species_strain)

        genome_dir, annotation_dir = None, None
        for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
            if "annotation/" in d:
                annotation_dir = d
                annotation_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
                print("src_data annotation file(s):")
                print(str('\t' + file) for file in annotation_dir_files)
            elif "genome/" in d:
                genome_dir = d
                genome_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
                print("src_data genome file(s):")
                print(str('\t' + file) for file in genome_dir_files)

        modify_pep_headers = ["sh /usr/local/genome2/mmo/scripts/phaeoexplorer/phaeoexplorer-change_pep_fasta_header.sh"]

        # TODO: load the data into the current species directory and load it into galaxy instance
        # setup_data_libraries_cl = \
        #     "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"

...
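One caveat about the listing above: passing a generator expression straight to print(), as in print(str('\t' + file) for file in annotation_dir_files), prints the generator object itself rather than the file names. A small illustrative sketch of an equivalent listing that renders one name per line (not part of the commit):

    # Illustrative only: walk src_data and print regular files one per line.
    import os

    src_data = os.path.join(os.getcwd(), "src_data")
    for d, _, _ in os.walk(src_data):
        files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
        if files:
            print(d + ":")
            print("\n".join("\t" + f for f in files))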
@@ -150,7 +185,7 @@ def main():
        # generate workflow file and run it in the galaxy instance

        gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
        # gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
        hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version))
        hi_id = hi[0]["id"]
        li = gi.libraries.get_libraries()  # only one library

...
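For reference, bioblend's create_history() returns a dict describing the new history, so the id can usually be taken from the return value instead of a follow-up get_histories() lookup. A hedged sketch, assuming gi, genus_species_strain and genome_version as in the script:

    # Sketch: capture the history id from the create call's return value.
    new_history = gi.histories.create_history(name=genus_species_strain + "_" + genome_version)
    hi_id = new_history["id"]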
@@ -186,7 +221,7 @@ def main():
        for k2, v2 in sub_folder_content.items():
            for e in v2:
                if type(e) == dict:
                    # TODO: manage several files of the same type
                    # TODO: manage several files of the same type and versions
                    if e["name"].endswith("transcripts-gff.fa"):
                        datasets["transcripts_file"] = e["ldda_id"]
                        print("\t\t" + e["name"] + ": " + e["ldda_id"])

...
@@ -196,6 +231,9 @@ def main():
                    elif e["name"].endswith(".gff"):
                        datasets["gff_file"] = e["ldda_id"]
                        print("\t\t" + e["name"] + ": " + e["ldda_id"])
                    elif e["name"].endswith("MALE"):
                        datasets["gff_file"] = e["ldda_id"]
                        print("\t\t" + e["name"] + ": " + e["ldda_id"])

        current_hi_id = gi.histories.get_current_history()["id"]
        print("History ID: " + current_hi_id)

...
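The chained endswith() checks above amount to a suffix-to-dataset-key mapping. A compact, purely illustrative restatement of that logic (the suffixes and keys come from the diff; the helper itself is hypothetical):

    # Hypothetical helper: map a library entry to a dataset slot by filename suffix.
    SUFFIX_TO_KEY = {
        "transcripts-gff.fa": "transcripts_file",
        ".gff": "gff_file",
    }

    def classify(entry, datasets):
        for suffix, key in SUFFIX_TO_KEY.items():
            if entry["name"].endswith(suffix):
                datasets[key] = entry["ldda_id"]
                return key
        return None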
@@ -204,76 +242,80 @@ def main():
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["transcripts_file"])
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["proteins_file"])

        # Delete Homo sapiens from Chado database
        toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        sapiens_id = None
        sapiens = toolrunner.get_sapiens_id()
        sapiens_job_out = sapiens["outputs"][0]["id"]
        sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
        try:
            sapiens_output = json.loads(sapiens_json_output)[0]
            sapiens_id = str(sapiens_output["organism_id"])  # needs to be str to be recognized by the chado tool
            toolrunner.delete_sapiens(hs_id=sapiens_id)
        except bb.ConnectionError:
            print("Homo sapiens isn't in the database")

        # ---------------------------------------------------------------------
        # Galaxy instance interaction
        # ---------------------------------------------------------------------

        # Workflow generation
        # # Delete Homo sapiens from Chado database
        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        # sapiens = toolrunner.get_sapiens_id()
        # sapiens_job_out = sapiens["outputs"][0]["id"]
        # sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
        # try:
        #     sapiens_output = json.loads(sapiens_json_output)[0]
        #     sapiens_id = str(sapiens_output["organism_id"])  # needs to be str to be recognized by the chado tool
        #     toolrunner.delete_sapiens(hs_id=sapiens_id)
        # except bb.ConnectionError:
        #     print("Homo sapiens isn't in the database")
        # except IndexError:
        #     pass
        #
        # # Workflow generation
        workflow = Workflow(parameters_dict=sp_dict, instance=gi, history_id=current_hi_id)

        wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
        tools = gi.tools.get_tool_panel()  # tools panel -> alternative to wf
        # print(tools)

        wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()

        gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
        wf_name = workflow.get_workflow_name()
        wf_attr = gi.workflows.get_workflows(name=wf_name)
        wf_id = wf_attr[0]["id"]
        wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
        print("Workflow ID: " + wf_id)

        toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        # toolrunner.purge_organisms()

        # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)

        wf_params = workflow.set_main_workflow_parameters(datasets=datasets)

        print("Inputs:")
        print(wf_show["inputs"])

        datamap = dict()
        datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
        datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
        datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
        datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}

        # wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
        #
        # tools = gi.tools.get_tool_panel()  # tools panel -> alternative to wf
        #
        # print(tools)
        #
        # wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
        #
        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
        # wf_name = workflow.get_workflow_name()
        # wf_attr = gi.workflows.get_workflows(name=wf_name)
        # wf_id = wf_attr[0]["id"]
        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
        # print("Workflow ID: " + wf_id)
        #
        # toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
        # # toolrunner.purge_organisms()
        #
        # # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)
        #
        # wf_params = workflow.set_main_workflow_parameters(datasets=datasets)
        # # print("Inputs:")
        # # print(wf_show["inputs"])
        #
        # datamap = dict()
        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
        # datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
        # datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
        # datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}
        #
        # gi.workflows.invoke_workflow(workflow_id=wf_id,
        #                              history_id=current_hi_id,
        #                              params=wf_params,
        #                              inputs=datamap)
        # gi.workflows.delete_workflow(workflow_id=wf_id)
        #
        # datamap = dict()
        # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
        # datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
        #
        gi.workflows.delete_workflow(workflow_id=wf_id)

        datamap = dict()
        datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
        datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}

        wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="jbrowse")
        wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
        gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
        wf_attr = gi.workflows.get_workflows(name="jbrowse")
        wf_id = wf_attr[0]["id"]
        wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
        print("Jbrowse workflow ID: " + wf_id)
        wf_params = workflow.set_jbrowse_workflow_parameters()

        gi.workflows.invoke_workflow(workflow_id=wf_id,
                                     history_id=current_hi_id,
                                     params=wf_params,
                                     inputs=datamap)
        gi.workflows.delete_workflow(workflow_id=wf_id)

        # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
        # wf_attr = gi.workflows.get_workflows(name="jbrowse")
        # wf_id = wf_attr[0]["id"]
        # wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
        # print("Jbrowse workflow ID: " + wf_id)
        # wf_params = workflow.set_jbrowse_workflow_parameters()
        #
        # gi.workflows.invoke_workflow(workflow_id=wf_id,
        #                              history_id=current_hi_id,
        #                              params=wf_params,
        #                              inputs=datamap)
        # gi.workflows.delete_workflow(workflow_id=wf_id)

        # remove active instance history for testing, purge configured @ ~/config/galaxy.yml.docker_sample
        # gi.histories.delete_history(history_id=current_hi_id, purge=True)

...
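For context, the datamap handed to invoke_workflow() maps each workflow input index to a dataset reference, where "src": "hda" marks a history dataset id. A minimal sketch, assuming gi is a connected GalaxyInstance and the "jbrowse" workflow has already been imported (the placeholder ids are not real):

    # Sketch: look up the imported workflow and invoke it on two history datasets.
    wf_id = gi.workflows.get_workflows(name="jbrowse")[0]["id"]
    datamap = {
        "0": {"src": "hda", "id": "<genome dataset id>"},     # placeholder ids
        "1": {"src": "hda", "id": "<proteins dataset id>"},
    }
    invocation = gi.workflows.invoke_workflow(workflow_id=wf_id,
                                              history_id="<history id>",
                                              params={},
                                              inputs=datamap)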
workflow.py  (+14 −7)
...

@@ -2,6 +2,7 @@ import os
from bioblend.galaxy import GalaxyInstance
from toolrunner import ToolRunner
import json
import logging

"""
Workflow creation for generation and visualization of data and analyses output
"""

...
@@ -9,6 +10,8 @@ Workflow creation for generation and visualization of data and analyses output
class Workflow:

    logging.basicConfig(level=logging.INFO)

    def __init__(self, parameters_dict, instance, history_id):
        self.history_id = history_id
        self.instance = instance

...
@@ -59,18 +62,22 @@ class Workflow:
        # print("Workflow file @ " + self.custom_ga_file_path)
        with open(self.preset_ga_file, 'r') as ga_in_file:
            ga_in = str(ga_in_file.readlines())
            ga_in = ga_in.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUEID\\\\\\\\\\\\"}', str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"')
            ga_in = ga_in.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"', str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"')
            ga_in = ga_in.replace("\\\\", "\\")  # to restore the correct amount of backslashes in the workflow string before import
            ga_in = ga_in.replace('"name": "NAME"', str('"name": "' + self.genus.lower()[0] + self.species) + '"')
            ga_in = ga_in.replace('{"unique_id": "UNIQUEID"}', str('{"unique_id": "' + self.genus + " " + self.species) + '"')
            ga_in = ga_in.replace("\\\\\\\\\\\\", "\\\\\\")
            ga_in = ga_in.replace('http://localhost/sp/undaria_pinnatifida/feature/Undaria/pinnatifida/mRNA/{id}"', "http://localhost/sp/" + self.genus.lower()[0] + self.genus[1:] + " " + self.species + "/feature/" + self.genus + "/mRNA/{id}")
            # ga_in = ga_in.replace('"index\\\": \\\"false', '"index\\\": \\\"true')
            # workflow_name = '"name": "' + self.full + '"'
            # ga_in = ga_in.replace('"name": "preset_workflow"', '"name": "preset_workflow"')
            # print(workflow_name)
            ga_in = ga_in[2:-2]
            ga_in = ga_in[2:-2]  # if the line under doesn't outputs a correct json
            # ga_in = ga_in[:-2]  # if the line above doesn't outputs a correct json
            self.workflow = ga_in
            print(ga_in)
            return ga_in

    def set_main_workflow_parameters(self, datasets):

...
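For context on the backslash handling above: a .ga workflow stores each tool_state as a JSON-encoded string, so the file already contains backslash-escaped quotes, and reading it with str(ga_in_file.readlines()) wraps everything in a Python repr that doubles each backslash. A tiny self-contained illustration (the sample line is made up):

    # Illustration only: why the replace() targets match doubled backslashes.
    line = '"tool_state": "{\\"unique_id\\": \\"UNIQUEID\\"}"\n'  # as stored in a .ga file
    as_repr = str([line])                                         # what str(readlines()) yields
    print(as_repr)
    # prints: ['"tool_state": "{\\"unique_id\\": \\"UNIQUEID\\"}"\n']
    # so substitutions have to target the doubled form, and a later
    # replace("\\\\", "\\") collapses the extra backslashes again before
    # the string is parsed and imported into Galaxy.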
@@ -98,7 +105,7 @@ class Workflow:
            org_id = str(org_output["organism_id"])  # needs to be str to be recognized by the chado tool
            global_org_id = org_id
        except IndexError:
            print("No organism matching " + self.full + " exists in the Chado database")
            logging.info("No organism matching " + self.full + " exists in the Chado database")

        ogs_analysis = toolrunner.get_ogs_analysis()
        ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"]

...
@@ -108,7 +115,7 @@ class Workflow:
            ogs_analysis_id = str(ogs_analysis_output["analysis_id"])  # needs to be str to be recognized by the chado tool
            global_ogs_id = ogs_analysis_id
        except IndexError:
            print("No matching OGS analysis exists in the Chado database")
            logging.info("No matching OGS analysis exists in the Chado database")

        genome_analysis = toolrunner.get_genome_analysis()
        genome_analysis_job_out = genome_analysis["outputs"][0]["id"]

...
@@ -118,7 +125,7 @@ class Workflow:
            genome_analysis_id = str(genome_analysis_output["analysis_id"])  # needs to be str to be recognized by the chado tool
            global_genome_id = genome_analysis_id
        except IndexError:
            print("No matching genome analysis exists in the Chado database")
            logging.info("No matching genome analysis exists in the Chado database")

        params = dict()
        params["0"] = {}

...
@@ -194,7 +201,7 @@ class Workflow:
        :return:
        """
        workflow_id = self.instance.workflows.get_workflows()[0]['id']
        return print(self.instance.workflows.show_workflow(workflow_id=workflow_id))
        return logging.info(self.instance.workflows.show_workflow(workflow_id=workflow_id))

    def store(self):
        """

...