Commit adf8df17 authored by Arthur Le Bars

autoload.py: script implementation, easier to understand and use. Small fixes to workflow.py and main.py.
parent a9daa4b9
Merge request !1: Release 1.0
This diff is collapsed.
#!/usr/bin/env bash
\ No newline at end of file
@@ -53,6 +53,7 @@ def main():
     genus_species = genus_lower + "_" + species
     common = sp_dict["common"]
     strain = sp_dict["strain"]
+    sex = sp_dict["sex"]
     if strain != "":
         genus_species_strain = genus_species + "_" + strain
     else:
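For context, a minimal sketch of the per-species parameter dict this hunk reads from; only the "common", "strain" and "sex" keys are confirmed by the hunk, the "genus"/"species" keys and all values are hypothetical (the real dict is built elsewhere in the script):

    # hypothetical sp_dict; keys "genus" and "species" and the values are assumptions
    sp_dict = {"genus": "Undaria", "species": "pinnatifida",
               "common": "wakame", "strain": "", "sex": "female"}
    genus_lower = sp_dict["genus"].lower()
    species = sp_dict["species"]
    genus_species = genus_lower + "_" + species  # "undaria_pinnatifida"
    strain = sp_dict["strain"]
    # an empty strain falls back to the bare genus_species name
    genus_species_strain = genus_species + "_" + strain if strain != "" else genus_species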
@@ -123,65 +124,84 @@ def main():
     print("Successfully connected to galaxy instance @ " + instance_url)

     # TODO: FTP/symlink stuff to retrieve the datasets + change headers in pep.fasta
-    setup_data_libraries_cl = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
-    # try:
-    #     os.mkdir("./src_data")
-    # except FileExistsError:
-    #     print("src_data folder already exists for " + genus_species_strain)
-    #     print("Loading data into galaxy...")
-    #     try:
-    #         setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
-    #         print("Output from setup_data_libraries.py")
-    #         print(setup_data_libraries.communicate())
-    #     except bb.ConnectionError:
-    #         print("Cannot load data into container for " + genus_species_strain)
-    #         break
-    #     else:
-    #         print("Data successfully loaded into docker container for " + genus_species_strain)
-    # else:
-    #     print("src_data folder created for " + genus_species_strain)
-    #     try:
-    #         setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
-    #         print("Output from setup_data_libraries.py")
-    #         print(setup_data_libraries.communicate())
-    #     except bb.ConnectionError:
-    #         print("Cannot load data into container for " + genus_species_strain)
-    #         break
-    #     else:
-    #         print("Data successfully loaded into docker container for " + genus_species_strain)
-    genome_dir, annotation_dir = None, None
+    # ---------------------------------------------------------------------
+    # src_data directory tree creation
+    # ---------------------------------------------------------------------
+    src_data_folders = ["annotation", "genome"]
+    species_folder_name = "_".join([genus_lower, species, strain, sex])
+    try:
+        os.mkdir("./src_data")
+        os.mkdir("./src_data/annotation")
+        os.mkdir("./src_data/genome")
+        os.mkdir("./src_data/annotation/" + species_folder_name)
+        os.mkdir("./src_data/genome/" + species_folder_name)
+    except FileExistsError:
+        print("src_data directory tree already exists")
+        pass
+    except PermissionError:
+        print("Insufficient permission to create src_data directory tree")
+
+    # ---------------------------------------------------------------------
+    # Data import into galaxy
+    # ---------------------------------------------------------------------
+    source_files = dict()
+    annotation_dir, genome_dir = None, None
     for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
         if "annotation/" in d:
             annotation_dir = d
-            annotation_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
-            print("src_data annotation file(s):")
-            print(str('\t' + file) for file in annotation_dir_files)
+            for f in os.listdir(d):
+                if f.endswith("proteins.fasta"):
+                    source_files["proteins_file"] = os.path.join(d, f)
+                elif f.endswith("transcripts-gff.fa"):
+                    source_files["transcripts_file"] = os.path.join(d, f)
+                elif f.endswith(".gff"):
+                    source_files["gff_file"] = os.path.join(d, f)
+            # annotation_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
         elif "genome/" in d:
             genome_dir = d
-            genome_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
-            print("src_data genome file(s):")
-            print(str('\t' + file) for file in genome_dir_files)
+            for f in os.listdir(d):
+                if f.endswith(".fa"):
+                    source_files["genome_file"] = os.path.join(d, f)
+            # genome_dir_files = [f for f in os.listdir(d) if os.path.isfile(os.path.join(d, f))]
+
+    print("Source files found:")
+    for k, v in source_files.items():
+        print("\t" + k + "\t" + v)

-    modify_pep_headers = ["sh /usr/local/genome2/mmo/scripts/phaeoexplorer/phaeoexplorer-change_pep_fasta_header.sh"]
+    # Changing headers in the *proteins.fasta file from >mRNA* to >protein*
+    # production version
+    modify_pep_headers = ["/usr/local/genome2/mmo/scripts/phaeoexplorer/phaeoexplorer-change_pep_fasta_header.sh",
+                          source_files["proteins_file"]]
+    # test version
+    modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh",
+                          source_files["proteins_file"]]
+    print("Changing fasta headers in " + source_files["proteins_file"])
+    subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir)
+
+    # src_data cleaning
+    if os.path.exists(annotation_dir + "outfile"):
+        subprocess.run(["mv", annotation_dir + "/outfile", source_files["proteins_file"]],
+                       stdout=subprocess.PIPE,
+                       cwd=annotation_dir)
+    if os.path.exists(annotation_dir + "gmon.out"):
+        subprocess.run(["rm", annotation_dir + "/gmon.out"],
+                       stdout=subprocess.PIPE,
+                       cwd=annotation_dir)

     # TODO: load the data into the current species directory and load it into galaxy instance
-    # setup_data_libraries_cl = \
-    #     "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
-    #
-    # try:
-    #     setup_data_libraries = subprocess.Popen(setup_data_libraries_cl.split(), stdout=subprocess.PIPE)
-    #     # output message from the data loading script
-    #     setup_data_libraries_output = setup_data_libraries.communicate()
-    # except Exception:
-    #     print("Cannot load data into container for " + genus_species_strain)
-    #     break
-    # else:
-    #     print("Data successfully loaded into docker container for " + genus_species_strain)
+    setup_data_libraries = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
+    try:
+        print("Loading data into the galaxy container")
+        subprocess.run(setup_data_libraries,
+                       stdout=subprocess.PIPE,
+                       shell=True)
+    except subprocess.CalledProcessError:
+        print("Cannot load data into container for " + genus_species_strain)
+        break
+    else:
+        print("Data successfully loaded into docker container for " + genus_species_strain)

     # generate workflow file and run it in the galaxy instance
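A caveat on the new data-loading block: subprocess.run() raises subprocess.CalledProcessError only when it is called with check=True, so the except branch above can never trigger as written. A minimal sketch of the checked variant, using the same command string (shown outside the surrounding species loop, hence no break):

    import subprocess

    setup_data_libraries = "docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py"
    try:
        print("Loading data into the galaxy container")
        # check=True turns a non-zero exit status into a CalledProcessError
        subprocess.run(setup_data_libraries, stdout=subprocess.PIPE, shell=True, check=True)
    except subprocess.CalledProcessError:
        print("Cannot load data into container")
    else:
        print("Data successfully loaded into docker container")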
@@ -202,8 +222,6 @@ def main():
             current_fo_name = v
         if k == "id":
             fo_id[current_fo_name] = v
-    # TODO: turn data id parsing into a function
     print("Folders and datasets IDs: ")
     datasets = dict()
     for k, v in fo_id.items():
@@ -242,6 +260,9 @@ def main():
     gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["transcripts_file"])
     gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=datasets["proteins_file"])
+    toolrunner = ToolRunner(parameters_dict=sp_dict, instance=gi, history=current_hi_id)
+    # toolrunner.show_pannel()  # show tools panel (with tool_id and versions)

     # ---------------------------------------------------------------------
     # Galaxy instance interaction
     # ---------------------------------------------------------------------
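For reference, a self-contained sketch of the library-to-history upload step above, with placeholder URL, API key and library dataset IDs (the real script parses the IDs from the data library's folder tree, as shown in the hunk):

    from bioblend.galaxy import GalaxyInstance

    gi = GalaxyInstance(url="http://localhost:8080", key="<api_key>")  # placeholder credentials
    current_hi_id = gi.histories.create_history(name="undaria_pinnatifida")["id"]
    # placeholder IDs; normally filled in by the folder/dataset ID parsing above
    datasets = {"genome_file": "<lib_dataset_id>", "proteins_file": "<lib_dataset_id>"}
    for lib_dataset_id in datasets.values():
        # copy each library dataset into the working history
        gi.histories.upload_dataset_from_library(history_id=current_hi_id, lib_dataset_id=lib_dataset_id)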
@@ -301,7 +322,7 @@ def main():
     # datamap["0"] = {"src": "hda", "id": datasets["genome_file"]}
     # datamap["1"] = {"src": "hda", "id": datasets["proteins_file"]}
     #
-    wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="jbrowse")
+    wf_dict_json = workflow.generate(working_directory=wd, main_directory=main_dir, workflow_name="main")
     wf_dict = json.loads(wf_dict_json)  # doesn't work with eval()
     #
     # gi.workflows.import_workflow_dict(workflow_dict=wf_dict)
@@ -311,10 +332,13 @@ def main():
     # print("Jbrowse workflow ID: " + wf_id)
     # wf_params = workflow.set_jbrowse_workflow_parameters()
     #
+    # allow_tool_state_corrections makes galaxy fill in missing tool states, because the
+    # workflow was edited outside of galaxy with only some inputs (precaution parameter)
     # gi.workflows.invoke_workflow(workflow_id=wf_id,
     #                              history_id=current_hi_id,
     #                              params=wf_params,
-    #                              inputs=datamap)
+    #                              inputs=datamap,
+    #                              allow_tool_state_corrections=True)
     # gi.workflows.delete_workflow(workflow_id=wf_id)

     # remove active instance history for testing, purge configured @ ~/config/galaxy.yml.docker_sample
...
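Assuming gi, wf_dict, datasets and current_hi_id from the steps above, the commented-out invocation path would look roughly like this; allow_tool_state_corrections lets galaxy fill in tool state that is missing because the .ga file was edited outside of galaxy:

    # import the generated workflow dict, then invoke it on the current history
    wf_id = gi.workflows.import_workflow_dict(workflow_dict=wf_dict)["id"]
    datamap = {"0": {"src": "hda", "id": datasets["genome_file"]},
               "1": {"src": "hda", "id": datasets["proteins_file"]}}
    gi.workflows.invoke_workflow(workflow_id=wf_id,
                                 history_id=current_hi_id,
                                 inputs=datamap,
                                 allow_tool_state_corrections=True)
    gi.workflows.delete_workflow(workflow_id=wf_id)  # cleanup, as in the commented code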
File moved
@@ -62,14 +62,15 @@ class Workflow:
         # print("Workflow file @ " + self.custom_ga_file_path)
         with open(self.preset_ga_file, 'r') as ga_in_file:
             ga_in = str(ga_in_file.readlines())
-            ga_in = ga_in.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUEID\\\\\\\\\\\\"}',
+            print(ga_in)
+            ga_in = ga_in.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUE_ID\\\\\\\\\\\\"}',
                                   str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"')
             ga_in = ga_in.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"',
                                   str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"')
             ga_in = ga_in.replace("\\\\", "\\")  # to restore the correct amount of backslashes in the workflow string before import
-            ga_in = ga_in.replace("\\\\\\\\\\\\", "\\\\\\")
-            ga_in = ga_in.replace('http://localhost/sp/undaria_pinnatifida/feature/Undaria/pinnatifida/mRNA/{id}"',
-                                  "http://localhost/sp/" + self.genus.lower()[0] + self.genus[1:] + " " + self.species + "/feature/" + self.genus + "/mRNA/{id}")
+            # ga_in = ga_in.replace("\\\\\\\\\\\\", "\\\\\\")
+            ga_in = ga_in.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}',
+                                  "http://localhost/sp/" + self.genus.lower()[0] + self.genus[1:] + "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}")
             # ga_in = ga_in.replace('"index\\\": \\\"false', '"index\\\": \\\"true')
             # workflow_name = '"name": "' + self.full + '"'
             # ga_in = ga_in.replace('"name": "preset_workflow"', '"name": "preset_workflow"')
@@ -77,7 +78,7 @@ class Workflow:
             ga_in = ga_in[2:-2]  # if the line below doesn't output correct json
             # ga_in = ga_in[:-2]  # if the line above doesn't output correct json
             self.workflow = ga_in
-            print(ga_in)
+            # print(ga_in)
             return ga_in

     def set_main_workflow_parameters(self, datasets):
...
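The escaped-backslash replacements above are fragile. Since a .ga workflow file is plain JSON, one alternative (a sketch, with a hypothetical file name and workflow name) is to edit the parsed dict directly and re-serialize it for import, leaving string surgery only for placeholders nested inside tool_state values:

    import json

    with open("preset_workflow.ga", "r") as ga_in_file:
        wf = json.load(ga_in_file)  # parse the .ga file as JSON instead of readlines()
    wf["name"] = "upinnatifida"  # hypothetical: genus initial + species
    ga_in = json.dumps(wf)  # serialized dict, ready for import via json.loads(ga_in)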