Commit a9daa4b9 authored by Arthur Le Bars

.ga correct editing for jbrowse workflow

parent de797749
Merge request !1 "Release 1.0"
......@@ -5,9 +5,9 @@ import argparse
import os
import sys
import subprocess
import logging
import re
import json
import urllib3 as ul
from chado import ChadoInstance
from workflow import Workflow
from toolrunner import ToolRunner
"""
......@@ -17,22 +17,12 @@ python3 ~/PycharmProjects/ggauto/gga_load_data/main.py ~/PycharmProjects/ggauto/
"""
class Autoload:
"""
TODO: turn main into an object
"""
def __init__(self, json_in):
self.json_in = json_in
def main(self):
return None
def main():
parser = argparse.ArgumentParser(description="Input genus, species, strain, version")
parser.add_argument("json", type=str, help="Input JSON file")
parser.add_argument("--just-load", help="Only load data into galaxy, does not create nor run analyses in galaxy")
parser.add_argument("-v", "--verbose", help="Increase output verbosity")
# CLI stuff
# parser.add_argument("--name", help="Sample species name, format: genus-species",type=str)
......@@ -42,6 +32,8 @@ def main():
user_arguments = parser.parse_args()
logging.basicConfig(level=logging.INFO)
# List that will hold all dicts from the JSON input file, containing parameters for each species
sp_dict_list = []
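# Illustrative sketch (not part of this diff): the input JSON is assumed to be a list of
# per-species parameter dicts, which would be loaded along these lines before the per-species loop.
with open(user_arguments.json, 'r') as json_in:
    sp_dict_list = json.load(json_in)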
......@@ -81,17 +73,20 @@ def main():
# Test address, change to abims-gga.sb-roscoff.fr/sp/ in production
instance_url = "http://localhost/sp/" + genus_lower + "_" + species + "/galaxy/"
print("Species: " + genus + " " + species + " (" + common + ")"
"\nStrain: " + strain +
"\nAccessing instance " + instance_url)
print("Species: " + genus + " " + species + " (" + common + ")" +
"\nStrain: " + strain +
"\nAccessing instance " + instance_url)
# Connect to the galaxy instance of the current species TODO: API key connection issues
# Connect to the galaxy instance of the current species TODO: connection issues (galaxy side)
gi = galaxy.GalaxyInstance(url=instance_url,
key="3b36455cb16b4d0e4348e2c42f4bb934",
email="alebars@sb-roscoff.fr",
password="pouet",
verify=True)
# admin_email = os.environ.get('GALAXY_DEFAULT_ADMIN_USER', 'admin@galaxy.org')
# admin_pass = os.environ.get('GALAXY_DEFAULT_ADMIN_PASSWORD', 'admin')
"""
This part creates the current species directory and moves into it
If it already exists, it just moves into it
......@@ -118,22 +113,62 @@ def main():
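# Illustrative sketch (not part of this diff) of the directory handling described in the
# docstring above; the name species_dir is an assumption used only for this example.
species_dir = os.path.join(os.getcwd(), genus_species_strain)
os.makedirs(species_dir, exist_ok=True)  # create the species directory only if it is missing
os.chdir(species_dir)  # move into it either way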
# password="****")
# Check connection to the current instance
print("Testing connection to the galaxy instance")
try:
hl = gi.histories.get_histories()
except bb.ConnectionError:
print("Cannot connect to GGA instance @ " + instance_url)
print("Cannot connect to galaxy instance @ " + instance_url)
else:
print("Successfully connected to instance " + instance_url)
print("Successfully connected to galaxy instance @ " + instance_url)
# TODO: FTP stuff to retrieve the datasets (used in testing, not needed for production)
# TODO: FTP/symlink stuff to retrieve the datasets + change headers in pep.fasta
setup_data_libraries_cl = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
# try:
# os.mkdir("./src_data")
# except FileExistsError:
# print("src_data folder already exists for " + genus_species_strain)
# print("Loading data into galaxy...")
# try:
# setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
# print("Output from setup_data_libraries.py")
# print(setup_data_libraries.communicate())
# except bb.ConnectionError:
# print("Cannot load data into container for " + genus_species_strain)
# break
# else:
# print("Data successfully loaded into docker container for " + genus_species_strain)
# else:
# print("src_data folder created for " + genus_species_strain)
#
# try:
# setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
# print("Output from setup_data_libraries.py")
# print(setup_data_libraries.communicate())
# except bb.ConnectionError:
# print("Cannot load data into container for " + genus_species_strain)
# break
# else:
# print("Data successfully loaded into docker container for " + genus_species_strain)
genome_dir, annotation_dir = None, None
for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
if "annotation/" in d:
annotation_dir = d
annotation_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
print("src_data annotation file(s):")
print("\n".join("\t" + f for f in annotation_dir_files))
elif "genome/" in d:
genome_dir = d
genome_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
print("src_data genome file(s):")
print("\n".join("\t" + f for f in genome_dir_files))
modify_pep_headers = ["sh /usr/local/genome2/mmo/scripts/phaeoexplorer/phaeoexplorer-change_pep_fasta_header.sh"]
# TODO: load the data into the current species directory and load it into galaxy instance
# setup_data_libraries_cl = \
# "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
......@@ -150,7 +185,7 @@ def main():
# generate workflow file and run it in the galaxy instance
gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
# gi.histories.create_history(name=str(genus_species_strain + "_" + genome_version))
hi = gi.histories.get_histories(name=str(genus_species_strain + "_" + genome_version))
hi_id = hi[0]["id"]
li = gi.libraries.get_libraries() # only one library
......@@ -186,7 +221,7 @@ def main():
for k2, v2 in sub_folder_content.items():
for e in v2:
if type(e) == dict:
# TODO: manage several files of the same type
# TODO: manage several files of the same type and versions
if e["name"].endswith("transcripts-gff.fa"):
datasets["transcripts_file"] = e["ldda_id"]
print("\t\t" + e["name"] + ": " + e["ldda_id"])
......@@ -196,6 +231,9 @@ def main():
elif e["name"].endswith(".gff"):
datasets["gff_file"] = e["ldda_id"]
print("\t\t" + e["name"] + ": " + e["ldda_id"])
elif e["name"].endswith("MALE"):
datasets["gff_file"] = e["ldda_id"]
print("\t\t" + e["name"] + ": " + e["ldda_id"])
current_hi_id = gi.histories.get_current_history()["id"]
print("History ID: " + current_hi_id)
......@@ -204,76 +242,80 @@ def main():
gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["transcripts_file"])
gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["proteins_file"])
# Delete Homo sapiens from Chado database
toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
sapiens_id = None
sapiens = toolrunner.get_sapiens_id()
sapiens_job_out = sapiens["outputs"][0]["id"]
sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
try:
sapiens_output = json.loads(sapiens_json_output)[0]
sapiens_id = str(sapiens_output["organism_id"]) # needs to be str to be recognized by the chado tool
toolrunner.delete_sapiens(hs_id=sapiens_id)
except bb.ConnectionError:
print("Homo sapiens isn't in the database")
# ---------------------------------------------------------------------
# Galaxy instance interaction
# ---------------------------------------------------------------------
# Workflow generation
# # Delete Homo sapiens from Chado database
# toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
# sapiens = toolrunner.get_sapiens_id()
# sapiens_job_out = sapiens["outputs"][0]["id"]
# sapiens_json_output = gi.datasets.download_dataset(dataset_id=sapiens_job_out)
# try:
# sapiens_output = json.loads(sapiens_json_output)[0]
# sapiens_id = str(sapiens_output["organism_id"]) # needs to be str to be recognized by the chado tool
# toolrunner.delete_sapiens(hs_id=sapiens_id)
# except bb.ConnectionError:
# print("Homo sapiens isn't in the database")
# except IndexError:
# pass
#
# # Workflow generation
workflow = Workflow(parameters_dict=sp_dict, instance=gi, history_id = current_hi_id)
wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
tools = gi.tools.get_tool_panel() # tools panel -> alternative to wf
# print(tools)
wf_dict = json.loads(wf_dict_json) # doesn't work with eval()
gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
wf_name = workflow.get_workflow_name()
wf_attr = gi.workflows.get_workflows(name=wf_name)
wf_id = wf_attr[0]["id"]
wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
print("Workflow ID: " + wf_id)
toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
# toolrunner.purge_organisms()
# wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)
wf_params = workflow.set_main_workflow_parameters(datasets=datasets)
print("Inputs:")
print(wf_show["inputs"])
datamap = dict()
datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}
# wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="preset_workflow")
#
# tools = gi.tools.get_tool_panel() # tools panel -> alternative to wf
# # print(tools)
#
# wf_dict = json.loads(wf_dict_json) # doesn't work with eval()
#
# gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
# wf_name = workflow.get_workflow_name()
# wf_attr = gi.workflows.get_workflows(name=wf_name)
# wf_id = wf_attr[0]["id"]
# wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
# print("Workflow ID: " + wf_id)
#
# toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
# # toolrunner.purge_organisms()
#
# # wf_o = bbo.Workflow(wf_dict=wf_dict, gi=gi)
#
# wf_params = workflow.set_main_workflow_parameters(datasets=datasets)
# # print("Inputs:")
# # print(wf_show["inputs"])
#
# datamap = dict()
# datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
# datamap["1"] = {"src": "hda", "id": datasets["gff_file"]}
# datamap["2"] = {"src": "hda", "id": datasets["proteins_file"]}
# datamap["3"] = {"src": "hda", "id": datasets["transcripts_file"]}
#
# gi.workflows.invoke_workflow(workflow_id=wf_id,
# history_id=current_hi_id,
# params=wf_params,
# inputs=datamap)
# gi.workflows.delete_workflow(workflow_id=wf_id)
#
# datamap = dict()
# datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
# datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
#
gi.workflows.delete_workflow(workflow_id=wf_id)
datamap = dict()
datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="jbrowse")
wf_dict = json.loads(wf_dict_json) # doesn't work with eval()
gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
wf_attr = gi.workflows.get_workflows(name="jbrowse")
wf_id = wf_attr[0]["id"]
wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
print("Jbrowse workflow ID: " + wf_id)
wf_params = workflow.set_jbrowse_workflow_parameters()
gi.workflows.invoke_workflow(workflow_id=wf_id,
history_id=current_hi_id,
params=wf_params,
inputs=datamap)
gi.workflows.delete_workflow(workflow_id=wf_id)
#
# gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
# wf_attr = gi.workflows.get_workflows(name="jbrowse")
# wf_id = wf_attr[0]["id"]
# wf_show = gi.workflows.show_workflow(workflow_id=wf_id)
# print("Jbrowse workflow ID: " + wf_id)
# wf_params = workflow.set_jbrowse_workflow_parameters()
#
# gi.workflows.invoke_workflow(workflow_id=wf_id,
# history_id=current_hi_id,
# params=wf_params,
# inputs=datamap)
# gi.workflows.delete_workflow(workflow_id=wf_id)
# remove active instance history for testing, purge configured @ ~/config/galaxy.yml.docker_sample
# gi.histories.delete_history(history_id=current_hi_id, purge=True)
......
......@@ -2,6 +2,7 @@ import os
from bioblend.galaxy import GalaxyInstance
from toolrunner import ToolRunner
import json
import logging
"""
Workflow creation for the generation and visualization of data and analysis outputs
"""
......@@ -9,6 +10,8 @@ Workflow creation for generation and visualization of data and analyses output
class Workflow:
logging.basicConfig(level=logging.INFO)
def __init__(self, parameters_dict, instance, history_id):
self.history_id = history_id
self.instance = instance
......@@ -59,18 +62,22 @@ class Workflow:
# print("Workflow file @ " + self.custom_ga_file_path)
with open(self.preset_ga_file, 'r') as ga_in_file:
ga_in = str(ga_in_file.readlines())
ga_in = ga_in.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUEID\\\\\\\\\\\\"}',
str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"')
ga_in = ga_in.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"',
str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"')
ga_in = ga_in.replace("\\\\", "\\") # to restore the correct amount of backslashes in the workflow string before import
ga_in = ga_in.replace('"name": "NAME"', str('"name": "' + self.genus.lower()[0] + self.species) + '"')
ga_in = ga_in.replace('{"unique_id": "UNIQUEID"}', str('{"unique_id": "' + self.genus + " " + self.species) + '"')
ga_in = ga_in.replace("\\\\\\\\\\\\", "\\\\\\")
ga_in = ga_in.replace('http://localhost/sp/undaria_pinnatifida/feature/Undaria/pinnatifida/mRNA/{id}"',
"http://localhost/sp/" + self.genus.lower()[0] + self.genus[1:] + " " + self.species + "/feature/" + self.genus + "/mRNA/{id}")
# ga_in = ga_in.replace('"index\\\": \\\"false', '"index\\\": \\\"true')
# workflow_name = '"name": "' + self.full + '"'
# ga_in = ga_in.replace('"name": "preset_workflow"', '"name": "preset_workflow"')
# print(workflow_name)
ga_in = ga_in[2:-2]
ga_in = ga_in[2:-2]  # if the line below doesn't output correct json
# ga_in = ga_in[:-2]  # if the line above doesn't output correct json
self.workflow = ga_in
print(ga_in)
return ga_in
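# Illustrative sketch (not part of this commit): a hypothetical helper that avoids the
# escaped-backslash juggling above by parsing the .ga file as JSON and substituting the
# placeholders ("UNIQUEID", "NAME") inside each step's tool_state string.
def generate_from_json(self):
    with open(self.preset_ga_file, 'r') as ga_in_file:
        ga = json.load(ga_in_file)  # a .ga workflow file is plain JSON
    for step in ga.get("steps", {}).values():
        state = step.get("tool_state")
        if state:
            state = state.replace("UNIQUEID", self.genus + " " + self.species)
            state = state.replace("NAME", self.genus.lower()[0] + self.species)
            step["tool_state"] = state
    self.workflow = json.dumps(ga)
    return self.workflow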
def set_main_workflow_parameters(self, datasets):
......@@ -98,7 +105,7 @@ class Workflow:
org_id = str(org_output["organism_id"]) # needs to be str to be recognized by the chado tool
global_org_id = org_id
except IndexError:
print("No organism matching " + self.full + " exists in the Chado database")
logging.info("No organism matching " + self.full + " exists in the Chado database")
ogs_analysis = toolrunner.get_ogs_analysis()
ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"]
......@@ -108,7 +115,7 @@ class Workflow:
ogs_analysis_id = str(ogs_analysis_output["analysis_id"]) # needs to be str to be recognized by the chado tool
global_ogs_id = ogs_analysis_id
except IndexError:
print("No matching OGS analysis exists in the Chado database")
logging.info("No matching OGS analysis exists in the Chado database")
genome_analysis = toolrunner.get_genome_analysis()
genome_analysis_job_out = genome_analysis["outputs"][0]["id"]
......@@ -118,7 +125,7 @@ class Workflow:
genome_analysis_id = str(genome_analysis_output["analysis_id"]) # needs to be str to be recognized by the chado tool
global_genome_id = genome_analysis_id
except IndexError:
print("No matching genome analysis exists in the Chado database")
logging.info("No matching genome analysis exists in the Chado database")
params = dict()
params["0"] = {}
......@@ -194,7 +201,7 @@ class Workflow:
:return:
"""
workflow_id = self.instance.workflows.get_workflows()[0]['id']
return print(self.instance.workflows.show_workflow(workflow_id=workflow_id))
return logging.info(self.instance.workflows.show_workflow(workflow_id=workflow_id))
def store(self):
"""
......