Commit 13e0e68a authored by Arthur Le Bars's avatar Arthur Le Bars


Pipeline v1.0 ready; missing orthology data and template workflows for blastp (diamond) and interpro. The data preprocessing script for phaeoexplorer is available in the phaeoexplorer-tools repo (but is not usable as a standalone tool)
parent 16785175
1 merge request!1Release 1.0
......@@ -53,70 +53,70 @@ class GetData(speciesData.SpeciesData):
return 1
def batch_modify_fasta_headers(self):
"""
Change the fasta headers before integration, so that the indexing tool in Galaxy
interprets the headers correctly and doesn't throw an error
The function uses the class attribute "source_datasets", pointing to files in the Galaxy
library, to find the fasta files that need their headers formatted
:return:
"""
proteins_file = None
proteins_outfile = None
annotation_dir = None
organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
self.goto_species_dir()
for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
if "annotation" in d and self.species_folder_name in d and self.ogs_version in d:
for f in os.listdir(d):
if "proteins" in f:
proteins_file = os.path.join(d, f)
proteins_outfile = os.path.join(d, "outfile_proteins.fa")
annotation_dir = os.path.abspath(d)
# Formatting the headers
if proteins_file is not None:
self.format_fasta_headers(infile=proteins_file,
outfile=proteins_outfile,
pattern="^>mRNA",
repl=">protein")
if os.path.exists(proteins_outfile):
# Overwrite the original proteins file with the reformatted one; os.replace
# removes the temporary outfile as part of the move, so the separate "rm" call
# (which always failed after the old "mv") is no longer needed
os.replace(proteins_outfile, proteins_file)
else:
logging.warning("Skipping proteins fasta headers formatting (FileNotFound)")
@staticmethod
def format_fasta_headers(infile, outfile, pattern, repl):
"""
Format the fasta headers of a given file, given a matching pattern and a replacement string
:param infile:
:param outfile:
:param pattern:
:param repl:
:return:
"""
# Stream the file line by line and let context managers close the handles
with open(infile, 'r') as fasta_in, open(outfile, 'w') as fasta_out:
for line in fasta_in:
fasta_out.write(re.sub(pattern, repl, line))
# def batch_modify_fasta_headers(self):
# """
# Change the fasta headers before integration, so that the indexing tool in galaxy interprets the headers
# correctly and doesn't throw an error
# The function will use the class attribute "source_datasets", pointing to files in the galaxy
# library to find the fasta files that need their headers formatted
# :return:
# """
# proteins_file = None
# proteins_outfile = None
# annotation_dir = None
# organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
# self.goto_species_dir()
# for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
# if "annotation" in d and self.species_folder_name in d and self.ogs_version in d:
# for f in os.listdir(d):
# if "proteins" in f:
# proteins_file = os.path.join(d, f)
# proteins_outfile = os.path.join(d, "outfile_proteins.fa")
# annotation_dir = os.path.abspath(d)
# # Formatting the headers
# if proteins_file is not None:
# self.format_fasta_headers(infile=proteins_file,
# outfile=proteins_outfile,
# pattern="^>mRNA",
# repl=">protein")
# if os.path.exists(annotation_dir + "/outfile_proteins.fa"):
# subprocess.call(["mv", annotation_dir + "/outfile_proteins.fa", proteins_file],
# stdout=subprocess.PIPE,
# stderr=subprocess.PIPE,
# cwd=annotation_dir)
# subprocess.call(["rm", annotation_dir + "/outfile_proteins.fa"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=annotation_dir)
# else:
# logging.warning("Skipping proteins fasta headers formatting (FileNotFound)")
# @staticmethod
# def format_fasta_headers(infile, outfile, pattern, repl):
# """
# Format the fasta headers of a given file, given a matching pattern and a replacement string
# :param infile:
# :param outfile:
# :param pattern:
# :param repl:
# :return:
# """
# infile = open(infile, 'r')
# outfile = open(outfile, 'w')
# lines = infile.readlines()
# for line in lines:
# line_out = re.sub(pattern, repl, line)
# outfile.write(line_out)
# infile.close()
# outfile.close()
def get_source_data_files_from_path(self):
"""
......@@ -297,8 +297,8 @@ if __name__ == "__main__":
logging.info("Sucessfully copied datasets for %s" % get_data_for_current_species.full_name)
# Format fasta headers (proteins)
logging.info("Formatting fasta file headers for %s" % get_data_for_current_species.full_name)
get_data_for_current_species.batch_modify_fasta_headers()
logging.info("Successfully formatted fasta file headers for %s" % get_data_for_current_species.full_name)
# logging.info("Formatting fasta file headers for %s" % get_data_for_current_species.full_name)
# get_data_for_current_species.batch_modify_fasta_headers()
# logging.info("Successfully formatted fasta file headers for %s" % get_data_for_current_species.full_name)
logging.info("Data successfully loaded and imported for %s" % get_data_for_current_species.full_name)
......@@ -21,6 +21,11 @@ gga_init.py
Usage: $ python3 gga_init.py -i input_example.yml --config config.yml [OPTIONS]
TODO
- Move the traefik directory tree creation and file writing out of the species loop (make make_dirs() a module-level
function, and add a module-level function similar to make_compose_files() for traefik and authelia; see the sketch below)
"""
......@@ -82,11 +87,15 @@ class DeploySpeciesStack(speciesData.SpeciesData):
try:
os.mkdir("./src_data")
except FileExistsError:
logging.debug("src_data folder already exist for %s" % self.full_name)
logging.debug("'src_data' directory already exist for %s" % self.full_name)
except PermissionError as exc:
logging.critical("Insufficient permission to create src_data directory tree")
sys.exit(exc)
logging.debug("Strain: %s" % self.strain)
logging.debug("Species folder name: %s" % self.species_folder_name)
# List of all the directories to create in src_data
src_data_dirs_li = ["./src_data", "./src_data/annotation", "./src_data/genome", "./src_data/tracks",
"./src_data/annotation/%s" % self.species_folder_name,
......@@ -160,12 +169,32 @@ class DeploySpeciesStack(speciesData.SpeciesData):
# Create the volumes (directory) of the species docker-compose file
self.create_mounts(working_dir=".")
# Return to main directory
os.chdir(self.main_dir)
def make_traefik_compose_files(self):
"""
Create or update the traefik docker-compose file and authelia conf files
Will only write new authelia conf files if the argument "--overwrite-all" is specified or
the authelia directory doesn't contain conf files
:return:
"""
# Proceed to the traefik and authelia directories
os.chdir(self.main_dir)
# Create directory tree
self.make_dirs(["./traefik", "./traefik/authelia"])
# Render and try to write the traefik docker-compose file
# This new docker-compose file will not overwrite the one already present in the traefik dir
# unless the argument "--overwrite-all" is specified
# Jinja2 templating, handled using the python "jinja2" module
file_loader = FileSystemLoader(self.script_dir + "/templates")
env = Environment(loader=file_loader)
if not os.path.isfile("./traefik/docker-compose.yml") or force:
traefik_compose_template = env.get_template("traefik_compose_template.yml.j2")
traefik_compose_output = traefik_compose_template.render(render_vars)
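# The Jinja2 rendering pattern used here, condensed into a standalone sketch
# (render_vars is defined elsewhere in this method; the values below are hypothetical):
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("templates"))  # directory holding the *.j2 templates
template = env.get_template("traefik_compose_template.yml.j2")
render_vars = {"hostname": "localhost"}  # hypothetical template variables
with open("./traefik/docker-compose.yml", "w") as compose_file:
    compose_file.write(template.render(render_vars))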
......@@ -199,6 +228,7 @@ class DeploySpeciesStack(speciesData.SpeciesData):
# Return to main directory
os.chdir(self.main_dir)
def create_mounts(self, working_dir):
"""
Create the folders (volumes) required by a container (to see required volumes, check their compose file)
......@@ -249,41 +279,59 @@ class DeploySpeciesStack(speciesData.SpeciesData):
logging.critical("Cannot access %s, exiting" % self.main_dir)
sys.exit(exc)
def deploy_stack(self, input_list):
"""
Call the script "deploy.sh" used to initiliaze the swarm cluster if needed and
launch/update the current organism's stack
This script first try to deploy the traefik stack, then deploy the organism stack, then update the traefik stack
The stacks are updated if already deployed
:return:
"""
to_deploy_species_li = []
def deploy_stacks(input_list, main_dir):
"""
This function first deploys/redeploys the traefik stack, then deploys/redeploys the organism stacks, then redeploys the traefik stack
This function is executed outside the "main" loop over the input species
:return:
"""
# Launch and update docker stacks
# noinspection PyArgumentList
# deploy_stacks_popen = subprocess.Popen(["sh", self.script_dir + "/deploy.sh", self.genus_species,
# self.main_dir + "/traefik"],
# stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
# universal_newlines=True)
#
# for stdout_line in iter(deploy_stacks_popen.stdout.readline, ""):
# if "daemon" in stdout_line: # Ignore swarm init error output
# pass
# else:
# logging.info("\t%s" % stdout_line.strip())
# deploy_stacks_popen.stdout.close()
main_dir = os.path.abspath(main_dir)
os.chdir(main_dir)
# Get species for which to deploy the stacks
to_deploy_species_li = utilities.get_species_to_deploy(sp_dict_list=input_list)
# Create the swarm cluster if needed
subprocess.call(["docker", "swarm", "init"],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=main_dir)
# Deploy traefik stack
os.chdir("./traefik")
subprocess.call(["docker", "stack", "deploy", "-c", "./docker-compose.yml", "traefik"],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=".")
os.chdir(main_dir)
# Deploy individual species stacks
for sp in to_deploy_species_li:
os.chdir(sp)
subprocess.call(["docker", "stack", "deploy", "-c", "./docker-compose.yml", "{0}_{1}".format(sp.split("_")[0], sp.split("_")[1])],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=".")
logging.info("Deployed %s stack" % sp)
os.chdir(main_dir)
# Update traefik stack
os.chdir("./traefik")
subprocess.call(["docker", "stack", "deploy", "-c", "./docker-compose.yml", "traefik"],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=".")
os.chdir(main_dir)
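# Note: the subprocess.call invocations above pipe stdout/stderr and ignore return
# codes, so a failed "docker stack deploy" passes silently. A hedged variant that
# surfaces failures (a sketch, not part of this commit):
import subprocess

def checked_stack_deploy(compose_dir, stack_name):
    # check_call raises CalledProcessError on a non-zero exit code
    subprocess.check_call(["docker", "stack", "deploy", "-c", "./docker-compose.yml", stack_name],
                          cwd=compose_dir)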
# # Using deploy.sh script (obsolete)
# Launch and update docker stacks
# noinspection PyArgumentList
# deploy_stacks_popen = subprocess.Popen(["sh", self.script_dir + "/deploy.sh", self.genus_species,
# self.main_dir + "/traefik"],
# stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
# universal_newlines=True)
#
# for stdout_line in iter(deploy_stacks_popen.stdout.readline, ""):
# if "daemon" in stdout_line: # Ignore swarm init error output
# pass
# else:
# logging.info("\t%s" % stdout_line.strip())
# deploy_stacks_popen.stdout.close()
if __name__ == "__main__":
......@@ -332,9 +380,6 @@ if __name__ == "__main__":
main_dir = os.path.abspath(args.main_directory)
sp_dict_list = utilities.parse_input(os.path.abspath(args.input))
logging.debug(sp_dict_list)
utilities.get_species_to_deploy(sp_dict_list=sp_dict_list)
logging.info("Deploying stacks for organisms in input file %s" % args.input)
for sp_dict in sp_dict_list:
......@@ -375,14 +420,6 @@ if __name__ == "__main__":
deploy_stack_for_current_organism.make_compose_files(force=args.overwrite_all)
logging.info("Successfully generated the docker-compose files for %s" % deploy_stack_for_current_organism.full_name)
# Deploy the stack
logging.info("Deploying stack for %s..." % deploy_stack_for_current_organism.full_name)
# deploy_stack_for_current_organism.deploy_stack()
logging.info("Successfully deployed stack for %s" % deploy_stack_for_current_organism.full_name)
logging.info("Stack deployed for %s" % deploy_stack_for_current_organism.full_name)
# TODO: if GENUS#1 == GENUS#2 AND SP#1 == SP#2 --> skip init, continue to next item and only deploy once the loop is done
# TODO: reload traefik outside loop
logging.info("Deploying stacks")
deploy_stacks(input_list=sp_dict_list, main_dir=main_dir)
logging.info("All stacks deployed for organisms in input file %s" % args.input)
......@@ -183,11 +183,34 @@ class RunWorkflow(speciesData.SpeciesData):
"sourcename": "Genoscope",
"date_executed": self.date})
# Add Interpro analysis to chado
self.instance.tools.run_tool(
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3",
history_id=self.history_id,
tool_inputs={"name": "InterproScan on OGS%s" % self.ogs_version,
"program": "InterproScan",
"programversion": "OGS%s" % self.ogs_version,
"sourcename": "Genoscope",
"date_executed": self.date})
# Add Blastp (diamond) analysis to chado
self.instance.tools.run_tool(
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3",
history_id=self.history_id,
tool_inputs={"name": "Diamond on OGS%s" % self.ogs_version,
"program": "Diamond",
"programversion": "OGS%s" % self.ogs_version,
"sourcename": "Genoscope",
"date_executed": self.date})
# Also get the organism and analyses IDs
self.get_organism_and_analyses_ids()
logging.info("Finished initializing instance")
def run_workflow(self, workflow_path, workflow_parameters, workflow_name, datamap):
"""
Run a workflow in Galaxy
......@@ -641,6 +664,23 @@ if __name__ == "__main__":
datamap=run_workflow_for_current_organism.datamap,
workflow_name="Jbrowse")
elif "Interpro" in str(workflow):
logging.info("Executing workflow 'Interproscan")
run_workflow_for_current_organism.connect_to_instance()
run_workflow_for_current_organism.set_get_history()
# run_workflow_for_current_organism.get_species_history_id()
# Get the attributes of the instance and project data files
run_workflow_for_current_organism.get_instance_attributes()
run_workflow_for_current_organism.get_organism_and_analyses_ids()
# Import datasets into history and retrieve their hda IDs
run_workflow_for_current_organism.import_datasets_into_history()
run_workflow_for_current_organism.get_datasets_hda_ids()
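# The Interproscan template workflow itself is still missing in this commit (see the commit
# message); once available, the invocation would presumably mirror the Jbrowse branch above,
# e.g. (hypothetical parameters):
# run_workflow_for_current_organism.run_workflow(workflow_path=workflow,
#                                                workflow_parameters=workflow_parameters,
#                                                datamap=run_workflow_for_current_organism.datamap,
#                                                workflow_name="Interpro")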
else:
logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.full_name)
sys.exit()
......@@ -82,5 +82,3 @@ class SpeciesData:
self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase, self.species, self.strain, self.sex])["not_empty"])
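# e.g. genus_lowercase="ectocarpus", species="siliculosus", strain="", sex="male" would
# presumably yield "ectocarpus_siliculosus_male" (hypothetical values; empty items are filtered out)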
self.existing_folders_cache = {}
self.bam_metadata_cache = {}
# self.do_update = False # Update the instance (in histories corresponding to the input) instead of creating a new one
# self.species_name_regex_litteral = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # Placeholder regex for file matching
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
import argparse
import pickle
import logging
def orthologs_transfer(manual_annotation_file, orthofinder_file, gff_file, outdir):
"""
Transfer descriptions from a manual annotation file to a GFF file, using an Orthofinder output file
to map the corresponding IDs between the two files
:param manual_annotation_file:
:param orthofinder_file:
:param gff_file:
:param outdir:
:return:
"""
manual_annotations_dict, orthofinder_dict = {}, {}
with open(orthofinder_file, 'r') as orthofinder:
next(orthofinder) # Skip header
# Mapping the species' mRNA IDs to their Ec-32 ortholog IDs
logging.info("Mapping Orthofinder's IDs to manual annotation's IDs...")
for orthofinder_line in orthofinder:
orthofinder_line_split = orthofinder_line.rstrip().split("\t")
orthofinder_dict[orthofinder_line_split[1].split(" ")[0]] = orthofinder_line_split[2].split(" ")[0]
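# e.g. an Orthofinder line "OG0000001<TAB>mRNA.1 ...<TAB>Ec-32_g1 ..." maps
# orthofinder_dict["mRNA.1"] = "Ec-32_g1" (hypothetical IDs, assuming this column layout)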
with open(manual_annotation_file, 'r') as manual_annotation:
# Mapping manual annotation IDs to their descriptions
logging.info("Mapping manual annotation IDs to descriptions...")
next(manual_annotation) # Skip header
for manual_annotation_line in manual_annotation:
manual_annotation_line_split = manual_annotation_line.rstrip().split("\t")
manual_annotations_dict[manual_annotation_line_split[0]] = ";".join([manual_annotation_line_split[0], manual_annotation_line_split[6]])
# Opening GFF, appending manual annotation's description to matching IDs
logging.info("Transferring manual descriptions to the GFF file...")
output_filename = str(os.path.join(os.path.abspath(outdir), os.path.basename(gff_file).split(".")[0]) + "_TRANSFERED.gff")
with open(output_filename, 'w') as output_file: # 'w' truncates any previous file content
with open(gff_file, 'r') as gff:
for gff_line in gff:
if "ID=mRNA" in gff_line: # Look for mRNA items
gff_line_id = gff_line.split("\t")[8].split("=")[1].split(";")[0] # GFF file ID (matches manual annotation IDs)
if gff_line_id in orthofinder_dict.keys(): # The gff ID is supposed to match a unique ID (key) in the orthofinder file
try:
manual_annotation_value = manual_annotations_dict[orthofinder_dict[gff_line_id]] # Manual annotation description of the matching ortholog
gff_line = "{0};ec32_ortholog={1};ec32_ortholog_description={2}\n".format(gff_line.strip(), orthofinder_dict[gff_line_id], manual_annotation_value)
except KeyError: # Some orthologs may be missing from the manual annotation file; write the line unmodified
pass
output_file.write(gff_line)
logging.info("Finished transferring descriptions for %s" % str(os.path.basename(gff_file)))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Orthologs_transfer between a reference manual annotation and Orthofinder outputs. \
\n\nRun example: python3 ec32_orthologs_transfer.py -d /path/to/desired/dir/ -o orthofinder_file.tsv \
-g gff_file.gff -m manual_annotation_file.txt")
parser.add_argument("-o", "--orthofinder",
help="Orthofinder output (tabulated)")
parser.add_argument("-m", "--manual-annotation",
help="Manual annotation file (tabulated) or pickle file containing the dicionary (automatically generated when you input a new manual annotation file)")
parser.add_argument("-g", "--gff",
help="GFF input file (tabulated)")
parser.add_argument("-d", "--out-dir",
help="Output directory")
args = parser.parse_args()
# WARNING for shell use case:
# The script doesn't check if the input files are valid.
# Files must be put in this order: $1=orthofinder file, $2=manual annotation file, $3=GFF file, $4=optional output directory
if not args.manual_annotation:
logging.error("Please input a manual annotation file or a pickled manual annotation dictionary")
sys.exit()
if not args.orthofinder:
logging.error("Please input an orthofinder output file")
sys.exit()
if not args.gff:
logging.error("Please input a gff file")
sys.exit()
if not args.out_dir:
args.out_dir = "."
orthologs_transfer(manual_annotation_file=args.manual_annotation,
orthofinder_file=args.orthofinder,
gff_file=args.gff,
outdir=args.out_dir)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
import argparse
import logging
def hectar_transfer(hectar_file, orthofinder_file, gff_file, outdir):
"""
Transfer HECTAR predicted targeting categories from a HECTAR output file to a GFF file, using an Orthofinder
output file to map the corresponding IDs between the two files
:param hectar_file:
:param orthofinder_file:
:param gff_file:
:param outdir:
:return:
"""
orthofinder_dict = {}
with open(orthofinder_file, 'r') as orthofinder:
next(orthofinder) # Skip header
# Mapping orthofinder's IDs to hectar IDs
logging.info("Mapping Orthofinder's IDs to hectar IDs...")
for orthofinder_line in orthofinder:
orthofinder_line_split = orthofinder_line.rstrip().split("\t")
orthofinder_dict[orthofinder_line_split[1].split(" ")[0]] = orthofinder_line_split[2].split(" ")[0]
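# NB: orthofinder_dict is built here but not read below; GFF IDs are matched directly
# against the hectar IDs collected in hectar_dict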
hectar_dict = {}
with open(hectar_file, 'r') as hectar:
# Mapping hectar IDs to their predicted targeting categories
logging.info("Mapping hectar IDs to descriptions...")
next(hectar) # Skip header
for hectar_line in hectar:
hectar_line_split = hectar_line.rstrip().split("\t")
hectar_dict[hectar_line_split[0].split(" ")[0]] = hectar_line_split[1]
# Opening GFF, appending hectar description to matching IDs
logging.info("Transferring manual descriptions to the GFF file...")
output_filename = str(os.path.join(os.path.abspath(outdir), os.path.basename(gff_file).split(".")[0]) + "_TRANSFERED_HECTAR.gff")
with open(output_filename, 'w') as output_file: # 'w' truncates any previous file content
with open(gff_file, 'r') as gff:
for gff_line in gff:
if "ID=mRNA" in gff_line: # Look for mRNA items
gff_line_id = gff_line.split("\t")[8].split("=")[1].split(";")[0] # GFF file ID (matches hectar IDs)
if gff_line_id in hectar_dict.keys(): # The gff ID is supposed to match a unique ID (key) in the hectar file
try:
hectar_value = hectar_dict[gff_line_id] # Find the corresponding hectar category to transfer
gff_line = "{0};HECTAR_predicted_targeting_category={1}\n".format(gff_line.strip(), hectar_value)
except KeyError: # In case a value is missing from the hectar file; write the line unmodified
pass
output_file.write(gff_line)
logging.info("Finished transferring descriptions for %s" % str(os.path.basename(gff_file)))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Transfer of HECTAR predicted targeting categories to a GFF file, using Orthofinder outputs. \
\n\nRun example: python3 ec32_orthologs_transfer.py -d /path/to/desired/dir/ -i hectar_file.tsv \
-o orthofinder_file.tsv -g gff_file.gff")
parser.add_argument("-i", "--hectar",
help="Hectar output (tabulated)")
parser.add_argument("-o", "--orthofinder",
help="Manual annotation file (tabulated) or pickle file containing the dicionary (automatically generated when you input a new manual annotation file)")
parser.add_argument("-g", "--gff",
help="GFF input file (tabulated)")
parser.add_argument("-d", "--out-dir",
help="Output directory")
args = parser.parse_args()
# WARNING for shell use case:
# The script doesn't check if the input files are valid.
# Files must be put in this order: $1=orthofinder file, $2=hectar file, $3=GFF file, $4=optional output directory
if not args.hectar:
logging.error("Please input a hectar file")
sys.exit()
if not args.orthofinder:
logging.error("Please input an orthofinder output file")
sys.exit()
if not args.gff:
logging.error("Please input a gff file")
sys.exit()
if not args.out_dir:
args.out_dir = "."
hectar_transfer(hectar_file=args.hectar,
orthofinder_file=args.orthofinder,
gff_file=args.gff,
outdir=args.out_dir)