import argparse
import json
import logging
import os
import subprocess
import sys

import bioblend
from bioblend import galaxy


class Autoload:
"""
Cleaner version for gga_auto_load (to use in production).
This class possesses most useful parameters to interact with GGA as attributes (as defined in __init__), so new
methods can be more easily implemented by copying already existing ones (i.e add new analysis, run a workflow, ...)
To run the workflows, place them in the same directory as this script, and add the method + the workflow
parameters in the main invocation (at the end of the file)
"""
def __init__(self, species_parameters_dictionary: dict):
self.species_parameters_dictionary = species_parameters_dictionary
self.species = species_parameters_dictionary["species"]
self.genus = species_parameters_dictionary["genus"]
self.strain = species_parameters_dictionary["strain"]
self.sex = species_parameters_dictionary["sex"]
self.common = species_parameters_dictionary["common"]
self.date = species_parameters_dictionary["date"]
self.performed = species_parameters_dictionary["performed by"]
self.genome_version = species_parameters_dictionary["genome version"]
self.ogs_version = species_parameters_dictionary["ogs version"]
self.genus_lowercase = self.genus[0].lower() + self.genus[1:]
self.full_name = " ".join([self.genus_lowercase, self.species, self.strain, self.sex])
self.abbreviation = " ".join([self.genus_lowercase[0], self.species, self.strain, self.sex])
self.genus_species = self.genus_lowercase + "_" + self.species
        self.instance_url = "http://localhost/sp/" + self.genus_species + "/galaxy/"
        self.instance = None  # galaxy.GalaxyInstance, set just below
self.history_id = None
self.library_id = None
self.main_dir = None
self.species_dir = None
self.org_id = None
self.genome_analysis_id = None
self.ogs_analysis_id = None
self.tool_panel = None
# Test the connection to the galaxy instance for the current species
# Additionally set some class attributes
# TODO: auth issues with nginx
self.instance = galaxy.GalaxyInstance(url=self.instance_url,
key="3b36455cb16b4d0e4348e2c42f4bb934",
email="alebars@sb-roscoff.fr",
password="pouet",
verify=True)
logging.info("testing connection to the galaxy instance ...")
try:
self.instance.histories.get_histories()
self.tool_panel = self.instance.tools.get_tool_panel()
        except bioblend.ConnectionError:
            logging.error("cannot connect to galaxy instance @ " + self.instance_url)
            sys.exit(1)
else:
logging.info("successfully connected to galaxy instance @ " + self.instance_url)
self.main_dir = os.getcwd() + "/"
self.species_dir = os.path.join(self.main_dir, self.genus_species) + "/"
    def load_data_in_galaxy(self):
        """
        - create the src_data directory tree for the species
        - change headers in the pep file
        - load data into the galaxy container with the setup_data_libraries.py script

        :return:
        """
os.chdir(self.main_dir)
try:
os.mkdir(self.species_dir)
except FileExistsError:
logging.debug("directory " + self.species_dir + " already exists")
        try:
            os.chdir(self.species_dir)
        except OSError:
            logging.error("cannot access " + self.species_dir + ", run with higher privileges")
            sys.exit(1)
        species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex])
        try:
            os.mkdir("./src_data")
            os.mkdir("./src_data/annotation")
            os.mkdir("./src_data/genome")
            os.mkdir("./src_data/annotation/" + species_folder_name)
            os.mkdir("./src_data/genome/" + species_folder_name)
        except FileExistsError:
            logging.debug("src_data directory tree already exists")
        except PermissionError:
            logging.error("insufficient permission to create src_data directory tree")
# Data import into galaxy
source_files = dict()
annotation_dir, genome_dir = None, None
for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
if "annotation/" in d:
annotation_dir = d
for f in os.listdir(d):
if f.endswith("proteins.fasta"):
source_files["proteins_file"] = os.path.join(d, f)
elif f.endswith("transcripts-gff.fa"):
source_files["transcripts_file"] = os.path.join(d, f)
elif f.endswith(".gff"):
source_files["gff_file"] = os.path.join(d, f)
elif "genome/" in d:
genome_dir = d
for f in os.listdir(d):
if f.endswith(".fa"):
source_files["genome_file"] = os.path.join(d, f)
logging.debug("source files found:")
for k, v in source_files.items():
logging.debug("\t" + k + "\t" + v)
        # Changing headers in the *proteins.fasta file from >mRNA* to >protein*
        # production version
        # modify_pep_headers = ["/usr/local/genome2/mmo/scripts/phaeoexplorer/phaeoexplorer-change_pep_fasta_header.sh",
        #                       source_files["proteins_file"]]
        # test version
        modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh",
                              source_files["proteins_file"]]
logging.info("changing fasta headers in " + source_files["proteins_file"])
subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir)
        # src_data cleaning
        if os.path.exists(os.path.join(annotation_dir, "outfile")):
            subprocess.run(["mv", os.path.join(annotation_dir, "outfile"), source_files["proteins_file"]],
                           stdout=subprocess.PIPE,
                           cwd=annotation_dir)
        if os.path.exists(os.path.join(annotation_dir, "gmon.out")):
            subprocess.run(["rm", os.path.join(annotation_dir, "gmon.out")],
                           stdout=subprocess.PIPE,
                           cwd=annotation_dir)
        setup_data_libraries = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
        try:
            logging.info("loading data into the galaxy container")
            subprocess.run(setup_data_libraries,
                           stdout=subprocess.PIPE,
                           shell=True,
                           check=True)  # check=True so a non-zero exit status actually raises CalledProcessError
        except subprocess.CalledProcessError:
            logging.error("cannot load data into container for " + self.full_name)
else:
logging.info("data successfully loaded into docker container for " + self.full_name)
        # self.instance.histories.create_history(name=str(self.full_name + "_" + self.genome_version))
histories = self.instance.histories.get_histories(name=str(self.full_name + "_" + self.genome_version))
self.history_id = histories[0]["id"]
        libraries = self.instance.libraries.get_libraries()  # normally only one library
        self.library_id = libraries[0]["id"]  # project data folder/library
instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id)
folders_ids = {}
current_fo_name = ""
# folders ids: access to data to run the first tools
for i in instance_source_data_folders:
for k, v in i.items():
if k == "name":
folders_ids[v] = 0
current_fo_name = v
if k == "id":
folders_ids[current_fo_name] = v
logging.info("folders and datasets IDs: ")
datasets = dict()
for k, v in folders_ids.items():
logging.info("\t" + k + ": " + v)
if k == "/genome":
sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
for k2, v2 in sub_folder_content.items():
for e in v2:
                        if isinstance(e, dict):
if e["name"].endswith(".fa"):
datasets["genome_file"] = e["ldda_id"]
logging.info("\t\t" + e["name"] + ": " + e["ldda_id"])
elif k == "/annotation/" + self.genus_species:
sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
for k2, v2 in sub_folder_content.items():
for e in v2:
                        if isinstance(e, dict):
# TODO: manage several files of the same type and manage versions
if e["name"].endswith("transcripts-gff.fa"):
datasets["transcripts_file"] = e["ldda_id"]
logging.info("\t\t" + e["name"] + ": " + e["ldda_id"])
elif e["name"].endswith("proteins.fasta"):
datasets["proteins_file"] = e["ldda_id"]
logging.info("\t\t" + e["name"] + ": " + e["ldda_id"])
elif e["name"].endswith(".gff"):
datasets["gff_file"] = e["ldda_id"]
logging.info("\t\t" + e["name"] + ": " + e["ldda_id"])
elif e["name"].endswith("MALE"):
datasets["gff_file"] = e["ldda_id"]
logging.info("\t\t" + e["name"] + ": " + e["ldda_id"])
self.history_id = self.instance.histories.get_current_history()["id"]
logging.debug("history ID: " + self.history_id)
# import all datasets into current history
self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=datasets["genome_file"])
self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=datasets["gff_file"])
self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=datasets["transcripts_file"])
self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=datasets["proteins_file"])
def run_workflow(self, workflow_name, workflow_parameters):
"""
:param workflow_ga_file:
:param workflow_parameters:
:return:
"""
logging.debug("running workflow: " + str(workflow_name))
workflow_ga_file = self.main_dir + "Galaxy-Workflow-" + workflow_name + ".ga"
if self.strain != "":
custom_ga_file = "_".join([self.genus, self.species, self.strain]) + "_workflow.ga"
custom_ga_file_path = os.path.abspath(custom_ga_file)
else:
custom_ga_file = "_".join([self.genus, self.species]) + "_workflow.ga"
custom_ga_file_path = os.path.abspath(custom_ga_file)
with open(workflow_ga_file, 'r') as ga_in_file:
ga_in = str(ga_in_file.readlines())
ga_in = ga_in.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUE_ID\\\\\\\\\\\\"}',
str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"')
ga_in = ga_in.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"',
str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"')
ga_in = ga_in.replace("\\\\", "\\") # to restore the correct amount of backslashes in the workflow string before import
# test
            ga_in = ga_in.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}',
                                  "http://localhost/sp/" + self.genus_lowercase + "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}")
# production
# ga_in = ga_in.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}',
# "http://abims--gga.sb-roscoff.fr/sp/" + self.genus_lowercase + "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}")
            ga_in = ga_in[2:-2]  # drop the leading [" and trailing "] left over from str(readlines())
            # ga_in = ga_in[:-2]  # alternative trim if the line above doesn't output a correct json
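        # The customized workflow string is built above but never written out or run in this
        # version. A minimal sketch of the missing steps, assuming the standard bioblend
        # workflow API (import_workflow_from_local_path / invoke_workflow); the step-indexed
        # parameter mapping should be checked against the actual .ga file before relying on it.
        with open(custom_ga_file_path, 'w') as ga_out_file:
            ga_out_file.write(ga_in)
        workflow = self.instance.workflows.import_workflow_from_local_path(custom_ga_file_path)
        self.instance.workflows.invoke_workflow(workflow_id=workflow["id"],
                                                params=workflow_parameters,
                                                history_id=self.history_id)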
def init_instance(self):
"""
Galaxy instance startup in preparation for running workflows
- remove Homo sapiens from the chado database.
- add organism and analyses into the chado database
- get any other existing organisms IDs (mainly used for testing)
:return:
"""
        # Delete Homo sapiens from the Chado database
        get_sapiens_id_job = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2",
                                                          history_id=self.history_id,
                                                          tool_inputs={"genus": "Homo", "species": "sapiens"})
get_sapiens_id_job_output = get_sapiens_id_job["outputs"][0]["id"]
get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output)
try:
get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0]
sapiens_id = str(get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool
self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2",
history_id=self.history_id,
tool_inputs={"organism": str(sapiens_id)})
except bioblend.ConnectionError:
logging.debug("homo sapiens isn't in the database")
except IndexError:
pass
# Add organism (species) to chado
self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2",
history_id=self.history_id,
tool_inputs={"abbr": self.abbreviation,
"genus": self.genus,
"species": self.species,
"common": self.common})
# Add OGS analysis to chado
self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2",
history_id=self.history_id,
tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version,
"program": "Performed by Genoscope",
"programversion": str("OGS" + self.ogs_version),
"sourcename": "Genoscope",
"date_executed": self.date})
# Add genome analysis to chado
self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2",
history_id=self.history_id,
tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version,
"program": "Performed by Genoscope",
"programversion": str("genome v" + self.genome_version),
"sourcename": "Genoscope",
"date_executed": self.date})
        # Get the organism ID from chado
org = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2",
history_id=self.history_id,
tool_inputs={"genus": self.genus, "species": self.species})
org_job_out = org["outputs"][0]["id"]
org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out)
try:
org_output = json.loads(org_json_output)[0]
self.org_id = str(org_output["organism_id"]) # needs to be str to be recognized by chado tools
except IndexError:
logging.debug("no organism matching " + self.full_name + " exists in the Chado database")
ogs_analysis = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2",
history_id=self.history_id,
tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version})
ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"]
ogs_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=ogs_analysis_job_out)
try:
ogs_analysis_output = json.loads(ogs_analysis_json_output)[0]
self.ogs_analysis_id = str(ogs_analysis_output["analysis_id"]) # needs to be str to be recognized by chado tools
except IndexError:
logging.debug("no matching OGS analysis exists in the Chado database")
genome_analysis = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2",
history_id=self.history_id,
tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version})
genome_analysis_job_out = genome_analysis["outputs"][0]["id"]
genome_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=genome_analysis_job_out)
try:
genome_analysis_output = json.loads(genome_analysis_json_output)[0]
self.genome_analysis_id = str(genome_analysis_output["analysis_id"]) # needs to be str to be recognized by chado tools
except IndexError:
logging.debug("no matching genome analysis exists in the Chado database")
logging.info("finished initializing instance")
def clean_instance(self):
"""
TODO: function to purge the instance from analyses and organisms
:return:
"""
return None
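    # A possible shape for clean_instance, sketched from the chado organism get/delete tools
    # already used in init_instance (kept commented out since the method above is still a TODO;
    # purging analyses would need the matching analysis delete tool, which this script doesn't use):
    #
    # def clean_instance(self):
    #     org = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2",
    #                                        history_id=self.history_id,
    #                                        tool_inputs={"genus": self.genus, "species": self.species})
    #     org_output = json.loads(self.instance.datasets.download_dataset(dataset_id=org["outputs"][0]["id"]))[0]
    #     self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2",
    #                                  history_id=self.history_id,
    #                                  tool_inputs={"organism": str(org_output["organism_id"])})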
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Input genus, species, strain, version")
parser.add_argument("json", type=str, help="Input JSON file")
parser.add_argument("-v", "--verbose", help="Increase output verbosity")
parser.add_argument("--load-data", help="Create src_data directory tree and load data into galaxy")
parser.add_argument("--main-workflow", help="Run main workflow (initialize galaxy instance, load data into chado,"
"sync with tripal, create jbrowse and add organism to jbrowse")
args = parser.parse_args()
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
sp_dict_list = list()
with open(args.json, 'r') as infile:
json_sp_dict = json.load(infile)
for json_sp in json_sp_dict:
sp_dict_list.append(json_sp)
for sp_dict in sp_dict_list:
al = Autoload(species_parameters_dictionary=sp_dict)
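        # The --load-data flag is parsed above but was never acted on; wiring it to
        # load_data_in_galaxy matches the flag's help text (an assumption on intent).
        if args.load_data:
            al.load_data_in_galaxy()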
if args.main_workflow:
workflow_parameters = dict()
workflow_parameters["0"] = {}
workflow_parameters["1"] = {}
workflow_parameters["2"] = {}
workflow_parameters["3"] = {}
workflow_parameters["4"] = {"organism": al.org_id,
"analysis_id": al.genome_analysis_id,
"do_update": "true"} # the do_update parameter is to prevent assertion errors when loading the file, should always be set to "true"
workflow_parameters["5"] = {"organism": al.org_id,
"analysis_id": al.ogs_analysis_id}
workflow_parameters["6"] = {"organism_id": al.org_id}
workflow_parameters["7"] = {"analysis_id": al.ogs_analysis_id}
workflow_parameters["8"] = {"analysis_id": al.genome_analysis_id}
workflow_parameters["9"] = {"organism_id": al.org_id}
al.run_workflow(workflow_name="main", workflow_parameters=workflow_parameters)
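
    # Expected shape of the input JSON (one entry per species), inferred from the keys read
    # in Autoload.__init__; the values below are placeholders, not real data:
    #
    # [
    #   {
    #     "genus": "Genus",
    #     "species": "species",
    #     "strain": "strain",
    #     "sex": "female",
    #     "common": "common name",
    #     "date": "2020-01-01",
    #     "performed by": "name",
    #     "genome version": "1.0",
    #     "ogs version": "1.0"
    #   }
    # ]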