import bioblend
from bioblend import galaxy

import argparse
import os
import subprocess
import logging

import sys
import json
import yaml
import re
import metadata_generator, docker_compose_generator, table_parser
import fnmatch
import shutil
"""
gga_auto_load main script
Scripted integration of new data into GGA instances. The input is either a table-like file (csv, xls, ...) or a json file (TODO: yaml)
that describes the data to be integrated (genus, species, sex, strain, data); see data_example.json for an example of the correct syntax.
The script parses the input and takes care of everything, from creating the source files directory tree to running the GMOD tools
inside the organisms' Galaxy instances.
TODO: By default, the script does everything needed to obtain a functional instance from scratch. To bypass this behaviour, specify --update as a parameter.
The script can also be used to update an existing GGA instance with new data. For example, if you have an instance "genus_species"
with data for the male sex and want to add the female sex to the same GGA instance, create your configuration input file as you normally would and add the "--update"
argument when invoking the script.
TODO EOSC/Cloudification:
- keep in mind
- divide into 2 general-use scripts
- create docker stack via swarm

- load data into libraries (a method to load it at init, and a method/script to load it separately; galaxy_data_libs_SI may do this already?)
- find and copy data
- change file headers, etc. (ext scripts for data manipulation)
- generate blast banks and links
- generate and edit nginx confs
- generate dc and start the containers
- connect to instance and launch tools>workflows
- generate and update metadata
- exit
"""

class Autoload:
"""
Autoload class contains attributes and functions to interact with GGA

"""

def __init__(self, species_parameters_dictionary, args):
self.species_parameters_dictionary = species_parameters_dictionary

self.species = species_parameters_dictionary["species"]
self.genus = species_parameters_dictionary["genus"]
self.strain = species_parameters_dictionary["strain"]
self.sex = species_parameters_dictionary["sex"]
self.common = species_parameters_dictionary["common"]
self.date = species_parameters_dictionary["date"]

self.performed = species_parameters_dictionary["performed by"]
self.genome_version = species_parameters_dictionary["genome version"]
self.ogs_version = species_parameters_dictionary["ogs version"]
self.genus_lowercase = self.genus[0].lower() + self.genus[1:]
self.full_name = " ".join([self.genus_lowercase, self.species, self.strain, self.sex])
self.abbreviation = " ".join([self.genus_lowercase[0], self.species, self.strain, self.sex])
self.genus_species = self.genus_lowercase + "_" + self.species
self.instance_url = "http://localhost/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/" # testing!

self.history_id = None
self.library_id = None
self.script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))

self.main_dir = None
self.species_dir = None
self.org_id = None
self.genome_analysis_id = None
self.ogs_analysis_id = None
self.tool_panel = None
self.datasets = dict()
self.source_files = dict()
self.workflow_name = None

self.docker_compose_generator = None

self.source_data_dir = "/projet/sbr/phaeoexplorer"
# Directory/subdirectories where data files are located (fasta, gff, ...), point to a directory as close as possible to the source files

# Update the instance (in histories corresponding to the input) instead of creating a new one TODO: move this variable inside methods
self.api_key = "dev"
# Api key used to communicate with the galaxy instance. Set to "dev" for the moment TODO: find a way to create, store then use the api key safely
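# Possible approach for the TODO above (sketch only, not wired in): read the key from an
# environment variable so it never has to be hard-coded, e.g.
#     self.api_key = os.environ.get("GGA_API_KEY", "dev")  # "GGA_API_KEY" is a hypothetical variable name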

# Test the connection to the galaxy instance for the current organism
# Exit if we can't connect to the instance
self.instance = galaxy.GalaxyInstance(url=self.instance_url, key=self.api_key)

try:
self.instance.histories.get_histories()
self.tool_panel = self.instance.tools.get_tool_panel()
except bioblend.ConnectionError:
logging.info("cannot connect to galaxy instance @ " + self.instance_url)
sys.exit()
else:
logging.info("successfully connected to galaxy instance @ " + self.instance_url)

def get_source_data_files(self):  # NOTE: the original def line is not visible in this view; method name reconstructed
"""
Find and copy source data files to src_data directory tree
- recursively search for the correct files (within a fixed max depth)
- requires the organism src_data directory tree to already be properly created for the organism (run generate_dir_tree)
- the source files must have "transcripts", "proteins"/"pep", "genome" in their name, and a gff extension
"""
src_data_dir = os.path.join(self.species_dir, "src_data")
sp_regex = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+"  # example with VARIABLE
# The regex works using the species attribute (unique) --> regex is probably not necessary
sp_regex = ""
for i in self.species:
sp_regex = sp_regex + "(?=\w*" + i + ")"
sp_regex = sp_regex + "\w+"
re_dict = dict()
re_dict["gff"] = None
re_dict["transcripts"] = None
re_dict["proteins"] = None
re_dict["genome"] = None
for dirpath, dirnames, files in os.walk(self.source_data_dir):
for f in files:
if self.species in f and self.sex in f:
print("File found")

def generate_dir_tree(self):
"""
Generate the directory tree for an organism and move datasets into src_data
"""
os.chdir(self.main_dir)
self.main_dir = os.getcwd() + "/"
self.species_dir = os.path.join(self.main_dir, self.genus_species) + "/"

try:
os.mkdir(self.species_dir)
except FileExistsError:
logging.debug("directory " + self.species_dir + " already exists")
try:
os.chdir(self.species_dir)
working_dir = os.getcwd()
except OSError:
logging.info("cannot access " + self.species_dir + ", run with higher privileges")
sys.exit()
try:
os.mkdir("./nginx/")
os.mkdir("./nginx/conf")
with open(os.path.abspath("./nginx/conf/default.conf"), 'w') as conf:
conf.write("server {\n\tlisten 80;\n\tserver_name ~.;\n\tlocation /download/ {\n\t\talias /project_data/; \n\t\tautoindex on;\n\t}\n}") # the nginx conf
except FileExistsError:
logging.debug("nginx conf exists")
# src_data_folders = ["annotation", "genome"] # directories to generate

species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex])
organism_annotation_dir, organism_genome_dir = None, None
# Create src_data dir tree

try:
os.mkdir("./src_data")
os.mkdir("./src_data/annotation")
os.mkdir("./src_data/genome")
os.mkdir("./src_data/tracks")
os.mkdir("./src_data/annotation/" + species_folder_name)
os.mkdir("./src_data/genome/" + species_folder_name)
os.mkdir("./src_data/annotation/" + species_folder_name + "/OGS" + self.ogs_version)
os.mkdir("./src_data/genome/" + species_folder_name + "/v" + self.genome_version)
organism_annotation_dir = os.path.abspath("./src_data/annotation/" + species_folder_name + "/OGS" + self.ogs_version)
organism_genome_dir = os.path.abspath("./src_data/genome/" + species_folder_name + "/v" + self.genome_version)

except FileExistsError:
logging.info("The src_data directory tree already exists")

except PermissionError:
logging.info("Insufficient permission to create src_data directory tree")
sys.exit()
# Hard coded paths (find a way to get the files by adding an attribute "path_to_repo")
# Write with string?

stack_template_path = self.script_dir + "/templates/stack-organism.yml"
traefik_template_path = self.script_dir + "/templates/traefik.yml"
authelia_config_path = self.script_dir + "/templates/authelia_config.yml"
authelia_users_path = self.script_dir + "/templates/authelia_users.yml"
if self.sex and self.strain:
genus_species_strain_sex = self.genus.lower() + "_" + self.species + "_" + self.strain + "_" + self.sex
else:
genus_species_strain_sex = self.genus.lower() + "_" + self.species
with open(stack_template_path, 'r') as infile:
organism_content = list()
for line in infile:
# Replace placeholders by the genus and species
organism_content.append(

line.replace("genus_species", str(self.genus.lower() + "_" + self.species))
.replace("Genus species", str(self.genus + " " + self.species))
.replace("Genus/species", str(self.genus + "/" + self.species))
.replace("gspecies", str(self.genus.lower()[0] + self.species))
.replace("genus_species_strain_sex", genus_species_strain_sex))
with open("./docker-compose.yml", 'w') as outfile:
for line in organism_content:
outfile.write(line)

subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir)
try:
os.mkdir("../traefik")
os.mkdir("../traefik/authelia")

shutil.copy(authelia_config_path, "../traefik/authelia/configuration.yml")
shutil.copy(authelia_users_path, "../traefik/authelia/users.yml")
# with open(traefik_template_path, 'r') as infile:
# traefik_content = list()
# for line in infile:
# # Replace placeholders by the genus and species (there are none)
# traefik_content.append(
# line.replace("genus_species", str(self.genus.lower() + "_" + self.species)).replace("Genus species", str(self.genus + " " + self.species)).replace("Genus/species", str(self.genus + "/" + self.species)).replace("gspecies", str(self.genus.lower()[0] + self.species)).replace("genus_species_strain_sex", genus_species_strain_sex))
# with open("../traefik/docker-compose.yml", 'w') as outfile:
# for line in traefik_content:
# outfile.write(line)
subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir)

except FileExistsError:
logging.debug("SKIP: Traefik directory already exists")

# Create volumes for the containers (script written by A. Bretaudeau)
subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir)
# Find all files in source_data directory, to link the matching files in the src_data dir tree
# Can be turned into a generator for performance

# TODO: cp data files method in a separate function (for EOSC)
for dirpath, dirnames, files in os.walk(self.source_data_dir):
if "0" in str(dirpath): # ensures to take the correct files (other dirs hold files with the correct names, but I don't know if they are the same
for f in files:
try:
if fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + ".fa"):
logging.info("genome assembly file: " + str(f))
organism_genome_dir = organism_genome_dir + "/" + f
os.symlink(os.path.join(dirpath, f), organism_genome_dir)
organism_genome_dir = os.path.abspath("./src_data/genome/" + species_folder_name + "/v" + self.genome_version)
elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + ".gff"):
logging.info("gff file: " + str(f))
organism_annotation_dir = organism_annotation_dir + "/" + f
os.symlink(os.path.join(dirpath, f), organism_annotation_dir)
organism_annotation_dir = os.path.abspath("./src_data/annotation/" + species_folder_name + "/OGS" + self.ogs_version)
elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + "_transcripts-gff.fa"):
logging.info("transcripts file: " + str(f))
organism_annotation_dir = organism_annotation_dir + "/" + f
os.symlink(os.path.join(dirpath, f), organism_annotation_dir)
organism_annotation_dir = os.path.abspath("./src_data/annotation/" + species_folder_name + "/OGS" + self.ogs_version)
elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + "_proteins.fa"):
logging.info("proteins file: " + str(f))
organism_annotation_dir = organism_annotation_dir + "/" + f
os.symlink(os.path.join(dirpath, f), organism_annotation_dir)
organism_annotation_dir = os.path.abspath("./src_data/annotation/" + species_folder_name + "/OGS" + self.ogs_version)
except TypeError:
pass

# Launch and update docker stacks (cf docs) TODO: deploy method in a separate function (for EOSC)
# deploy_script_path = self.script_dir + "/deploy.sh"
# subprocess.call(["sh", deploy_script_path, self.genus_species])

def generate_nginx_confs(self):  # NOTE: the original def line is not visible in this view; method name reconstructed
"""
OBSOLETE: compose method
Generate (and update) nginx conf files to add new organisms from the proxy
:return:
"""
nginx_proxy_path = "" # nginx conf template for the main proxy (needs to be updated for each new organism integration)
nginx_organism_path = "" # nginx conf template for the current organism (used once)
docker_proxy_template_path = "" # dockerfile for the main proxy (used once)

def modify_fasta_headers(self):  # NOTE: the original def line is not visible in this view; method name reconstructed
"""
Change the fasta headers before integration.
:return:
"""
try:
os.chdir(self.species_dir)
working_dir = os.getcwd()
except OSError:
logging.info("cannot access " + self.species_dir + ", run with higher privileges")
sys.exit()

annotation_dir, genome_dir = None, None
for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
if "annotation/" in d:
annotation_dir = d
for f in os.listdir(d):
if f.endswith("proteins.fasta"):
self.source_files["proteins_file"] = os.path.join(d, f)

elif f.endswith("transcripts-gff.fa"):
self.source_files["transcripts_file"] = os.path.join(d, f)

elif f.endswith(".gff"):
self.source_files["gff_file"] = os.path.join(d, f)

elif "genome/" in d:
genome_dir = d
for f in os.listdir(d):
if f.endswith(".fa"):
self.source_files["genome_file"] = os.path.join(d, f)

logging.debug("source files found:")

for k, v in self.source_files.items():
logging.debug("\t" + k + "\t" + v)
# Changing headers in the *proteins.fasta file from >mRNA* to >protein*
# production version
modify_pep_headers = [str(self.main_dir) + "/gga_load_data/ext_scripts/phaeoexplorer-change_pep_fasta_header.sh",
self.source_files["proteins_file"]]
# test version
# modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh",
# self.source_files["proteins_file"]]
logging.info("changing fasta headers in " + self.source_files["proteins_file"])

subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir)
# production version
modify_transcript_headers = [str(self.main_dir) + "/gga_load_data/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh",
self.source_files["transcripts_file"]]
# test version
# modify_transcript_headers = ["/home/alebars/gga/phaeoexplorer-change_transcript_fasta_header.sh",
# self.source_files["transcripts_file"]]
logging.info("changing fasta headers in " + self.source_files["transcripts_file"])
subprocess.run(modify_transcript_headers, stdout=subprocess.PIPE, cwd=annotation_dir)

# src_data cleaning
if os.path.exists(annotation_dir + "/outfile"):
subprocess.run(["mv", annotation_dir + "/outfile", self.source_files["proteins_file"]],

stdout=subprocess.PIPE,
cwd=annotation_dir)
if os.path.exists(annotation_dir + "/gmon.out"):
subprocess.run(["rm", annotation_dir + "/gmon.out"],
stdout=subprocess.PIPE,
cwd=annotation_dir)
return None
def setup_data_libraries(self):
"""
- generate blast banks and docker-compose (TODO: separate function)
- load data into the galaxy container with the galaxy_data_libs_SI.py script
:return:
"""

try:
logging.info("loading data into the galaxy container")
subprocess.run("docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py",

Arthur Le Bars
committed
stdout=subprocess.PIPE,
shell=True)
except subprocess.CalledProcessError:
logging.info("cannot load data into container for " + self.full_name)
pass
else:
logging.info("data successfully loaded into docker container for " + self.full_name)
self.get_instance_attributes()
# self.history_id = self.instance.histories.get_current_history()["id"]
# import all datasets into current history
self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"])
self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"])
self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])
def get_instance_attributes(self):
"""
- libraries ID (there should only be one library!)
- datasets IDs
:return:
"""
histories = self.instance.histories.get_histories(name=str(self.full_name))

self.history_id = histories[0]["id"]
logging.debug("history ID: " + self.history_id)

libraries = self.instance.libraries.get_libraries() # normally only one library
self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library
logging.debug("library ID: " + self.history_id)

instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id)
folders_ids = {}

for i in instance_source_data_folders:
for k, v in i.items():
if k == "name":
folders_ids[v] = 0
current_folder_name = v  # NOTE: reconstructed; used to pair each folder name with its id below
if k == "id":
folders_ids[current_folder_name] = v  # NOTE: reconstructed; the original body of this branch is not visible in this view
for k, v in folders_ids.items():
logging.info("\t" + k + ": " + v)
if k == "/genome":
sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
for k2, v2 in sub_folder_content.items():
for e in v2:
if type(e) == dict:
if e["name"].endswith(".fa"):

Arthur Le Bars
committed
elif k == "/annotation/" + self.genus_species:
sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
for k2, v2 in sub_folder_content.items():
for e in v2:
if type(e) == dict:
# TODO: manage several files of the same type and manage versions
if e["name"].endswith("transcripts-gff.fa"):
self.datasets["transcripts_file"] = e["ldda_id"]

elif e["name"].endswith("proteins.fasta"):

Arthur Le Bars
committed
elif e["name"].endswith(".gff"):

Arthur Le Bars
committed
elif e["name"].endswith("MALE"):

Arthur Le Bars
committed
def run_workflow(self, workflow_name, workflow_parameters, datamap):

"""
Run the "main" workflow in the galaxy instance
- import data to library
- load fasta and gff
- sync with tripal
- add jbrowse + organism
- fill in the tripal views
TODO: map tool name to step id
:param workflow_name:

:param workflow_parameters:
:param datamap:
:return:
"""
logging.debug("running workflow: " + str(workflow_name))
workflow_ga_file = self.main_dir + "Galaxy-Workflow-" + workflow_name + ".ga"
if self.strain != "":
custom_ga_file = "_".join([self.genus, self.species, self.strain]) + "_workflow.ga"
custom_ga_file_path = os.path.abspath(custom_ga_file)
else:
custom_ga_file = "_".join([self.genus, self.species]) + "_workflow.ga"
custom_ga_file_path = os.path.abspath(custom_ga_file)
with open(workflow_ga_file, 'r') as ga_in_file:
workflow = str(ga_in_file.readlines())  # NOTE: reconstructed; the read was cut in this view (str() of the list matches the [2:-2] trim below)
workflow = workflow.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUE_ID\\\\\\\\\\\\"}',
str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"')
workflow = workflow.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"',
str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"')
workflow = workflow.replace("\\\\", "\\") # to restore the correct amount of backslashes in the workflow string before import

# test
workflow = workflow.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}',
"http://localhost/sp/" + self.genus_lowercase+ "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}")

# production
# workflow = workflow.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}',
# "http://abims--gga.sb-roscoff.fr/sp/" + self.genus_lowercase + "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}")
workflow = workflow[2:-2] # if the line under doesn't output a correct json
# workflow = workflow[:-2] # if the line above doesn't output a correct json
workflow_dict = json.loads(workflow)  # NOTE: reconstructed; the conversion of the cleaned string to a dict is not visible in this view
self.instance.workflows.import_workflow_dict(workflow_dict=workflow_dict)
self.workflow_name = workflow_name
workflow_attributes = self.instance.workflows.get_workflows(name=self.workflow_name)
workflow_id = workflow_attributes[0]["id"]
show_workflow = self.instance.workflows.show_workflow(workflow_id=workflow_id)
logging.debug("workflow ID: " + workflow_id)
logging.debug("inputs:")
logging.debug(show_workflow["inputs"])
self.instance.workflows.invoke_workflow(workflow_id=workflow_id,
history_id=self.history_id,
params=workflow_parameters,
inputs=datamap,
inputs_by="")
self.instance.workflows.delete_workflow(workflow_id=workflow_id)

def init_instance(self):
"""
Galaxy instance startup in preparation for running workflows
- remove Homo sapiens from the chado database.
- add organism and analyses into the chado database
- get any other existing organisms IDs before updating the galaxy instance

:return:
"""
self.instance.histories.create_history(name=str(self.full_name))
histories = self.instance.histories.get_histories(name=str(self.full_name))
self.history_id = histories[0]["id"]
logging.debug("history ID: " + self.history_id)
libraries = self.instance.libraries.get_libraries() # routine check: one library
self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library
logging.debug("library ID: " + self.history_id)
instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id)

# Delete Homo sapiens from Chado database
logging.info("getting sapiens ID in instance's chado database")

get_sapiens_id_job = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2",
history_id=self.history_id,
tool_inputs={"genus": "Homo", "species": "sapiens"})

get_sapiens_id_job_output = get_sapiens_id_job["outputs"][0]["id"]
get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output)
try:
logging.info("deleting Homo sapiens in the instance's chado database")

get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0]
sapiens_id = str(get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool
self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2",
history_id=self.history_id,
tool_inputs={"organism": str(sapiens_id)})
except bioblend.ConnectionError:
logging.debug("Homo sapiens isn't in the instance's chado database")

except IndexError:
logging.debug("Homo sapiens isn't in the instance's chado database")

pass
# Add organism (species) to chado
logging.info("adding organism to the instance's chado database")

self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2",
history_id=self.history_id,
tool_inputs={"abbr": self.abbreviation,
"genus": self.genus,
"species": self.species,
"common": self.common})
# Add OGS analysis to chado
logging.info("adding OGS analysis to the instance's chado database")

self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2",
history_id=self.history_id,
tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version,
"program": "Performed by Genoscope",
"programversion": str("OGS" + self.ogs_version),
"sourcename": "Genoscope",
"date_executed": self.date})
# Add genome analysis to chado
logging.info("adding genome analysis to the instance's chado database")

self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2",
history_id=self.history_id,
tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version,
"program": "Performed by Genoscope",
"programversion": str("genome v" + self.genome_version),
"sourcename": "Genoscope",
"date_executed": self.date})
self.get_organism_and_analyses_ids()
logging.info("finished initializing instance")
def get_organism_and_analyses_ids(self):
"""
Retrieve the current organism ID and the OGS and genome chado analyses IDs (needed to run some tools, as Tripal/Chado
doesn't accept organism/analyses names as valid inputs)
:return:
"""
# Get the ID for the current organism in chado
org = self.instance.tools.run_tool(
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2",
history_id=self.history_id,
tool_inputs={"genus": self.genus, "species": self.species})

org_job_out = org["outputs"][0]["id"]
org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out)
try:
org_output = json.loads(org_json_output)[0]
self.org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools

except IndexError:
logging.debug("no organism matching " + self.full_name + " exists in the instance's chado database")

# Get the ID for the OGS analysis in chado
ogs_analysis = self.instance.tools.run_tool(
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2",
history_id=self.history_id,
tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version})

ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"]
ogs_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=ogs_analysis_job_out)
try:
ogs_analysis_output = json.loads(ogs_analysis_json_output)[0]
self.ogs_analysis_id = str(ogs_analysis_output["analysis_id"])
except IndexError:
logging.debug("no matching OGS analysis exists in the instance's chado database")

# Get the ID for the genome analysis in chado
genome_analysis = self.instance.tools.run_tool(
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2",
history_id=self.history_id,
tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version})

genome_analysis_job_out = genome_analysis["outputs"][0]["id"]
genome_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=genome_analysis_job_out)
try:
genome_analysis_output = json.loads(genome_analysis_json_output)[0]
self.genome_analysis_id = str(genome_analysis_output["analysis_id"])

except IndexError:
logging.debug("no matching genome analysis exists in the instance's chado database")

def clean_instance(self):
"""
TODO: function to purge the instance from analyses and organisms
:return:
"""
return None
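# Sketch only (assumption, not the original implementation): purging could mirror the Homo sapiens
# removal in init_instance(), e.g. fetch the organism with the chado_organism_get_organisms tool,
# read its "organism_id" from the json output, then pass it to chado_organism_delete_organisms:
#     self.instance.tools.run_tool(
#         tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2",
#         history_id=self.history_id,
#         tool_inputs={"organism": self.org_id})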

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction with galaxy instances for GGA"
", following the protocol @ "
"http://gitlab.sb-roscoff.fr/abims/e-infra/gga")
# Dev arguments, TODO: remove in production branch!
parser.add_argument("--full",
help="Run everything, from src_data dir tree creation, moving data files (abims) into src_data,"
"modify headers (abims), generate blast banks (doesn't commit them: TODO), initialize GGA instance, load the data and run,"
" the main workflow. To update/add data to container, use --update in conjunction to --full (TODO)")
parser.add_argument("--init-instance",
help="Initialization of galaxy instance. Run first in an empty instance, DEV",
parser.add_argument("--load-data",
help="Create src_data directory tree, copy datasets to src_data, and load these datasets into the instance, DEV",
action="store_true")
parser.add_argument("--run-main",
help="Run main workflow (load data into chado, sync all with tripal, "
"index tripal data, populate materialized view, "
"create a jbrowse for the current genus_species_strain_sex and add organism to jbrowse")

parser.add_argument("--generate-docker-compose",
help="Generate docker-compose.yml for current species, DEV")
help="Find source files in source data dir and copy them to src_data, DEV, OBSOLETE",

Arthur Le Bars
committed
action="store_true")
# Production arguments
parser.add_argument("input", type=str, help="Input table (tabulated file that describes all data) or json file")
parser.add_argument("-v", "--verbose",
help="Increase output verbosity",
action="store_false")
parser.add_argument("--update",
help="Update an already integrated organisms with new data from input file, docker-compose.yml will not be re-generated"
", assuming the instances for the organisms are already generated and initialized",
help="Path of the main directory, either absolute or relative, defaults to current directory",
args = parser.parse_args()
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)

if str(args.input).endswith(".json"):

input_json = args.input
else:

tp = table_parser.TableParser()
logging.info("parsing input table")
tp.table = args.input
input_json = tp.parse_table(mode="simple", method="table_to_json")
sp_dict_list = list()

with open(input_json, 'r') as infile:
json_sp_dict = json.load(infile)
json_sp_dump = json.dumps(json_sp_dict, indent=4, sort_keys=True)
for json_sp in json_sp_dict:
sp_dict_list.append(json_sp)
for sp_dict in sp_dict_list:
al = Autoload(species_parameters_dictionary=sp_dict, args=args)
if args.init_instance:
logging.info("initializing the galaxy instance")
al.init_instance()
al.get_instance_attributes()
# metadata[genus_species_strain_sex]["initialized"] = True
if args.load_data:
logging.info("loading data into galaxy")
# metadata[genus_species_strain_sex]["data_loaded_in_instance"] = True
if args.run_main:
logging.info("running main workflow")
al.get_organism_and_analyses_ids()
workflow_parameters = dict()
workflow_parameters["0"] = {}
workflow_parameters["1"] = {}
workflow_parameters["2"] = {}
workflow_parameters["3"] = {}
workflow_parameters["4"] = {"organism": al.org_id,
"analysis_id": al.genome_analysis_id,
"do_update": "true"}
workflow_parameters["5"] = {"organism": al.org_id,
"analysis_id": al.ogs_analysis_id}
workflow_parameters["6"] = {"organism_id": al.org_id}
workflow_parameters["7"] = {"analysis_id": al.ogs_analysis_id}
workflow_parameters["8"] = {"analysis_id": al.genome_analysis_id}
workflow_parameters["9"] = {"organism_id": al.org_id}
workflow_parameters["10"] = {}
workflow_parameters["11"] = {}
al.datamap = dict()
al.datamap["0"] = {"src": "hda", "id": al.datasets["genome_file"]}
al.datamap["1"] = {"src": "hda", "id": al.datasets["gff_file"]}
al.datamap["2"] = {"src": "hda", "id": al.datasets["proteins_file"]}
al.datamap["3"] = {"src": "hda", "id": al.datasets["transcripts_file"]}
al.run_workflow(workflow_name="main", workflow_parameters=workflow_parameters, datamap=al.datamap)
# metadata[genus_species_strain_sex]["workflows_run"] = metadata[genus_species_strain_sex]["workflows_run"].append("main")
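# Hypothetical invocation (script name and flag combination assumed from the argparse definitions above):
#     python3 autoload.py my_species_table.json --init-instance --load-data --run-main -v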