diff --git a/.gitignore b/.gitignore index 07e2dd14ac2543f1386b88911f859a1ded6a9954..17e708b3152da298ad47c73921399accf781c0b9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,107 @@ -__pycache__ -.idea -phaeoexplorer_test.json -example.json -example.xlsx -*.bak -undaria_pinnatifida +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# IDE stuff +.idea \ No newline at end of file diff --git a/__pycache__/docker_compose_generator.cpython-36.pyc b/__pycache__/docker_compose_generator.cpython-36.pyc deleted file mode 100644 index f0bfefaa1f33103cb7eb2bf92aff23cbcba9ed3b..0000000000000000000000000000000000000000 Binary files a/__pycache__/docker_compose_generator.cpython-36.pyc and /dev/null differ diff --git a/__pycache__/docker_compose_generator.cpython-38.pyc b/__pycache__/docker_compose_generator.cpython-38.pyc deleted file mode 100644 index 1201041cefa429a4343e2513ccd1c57b142f50b8..0000000000000000000000000000000000000000 Binary files a/__pycache__/docker_compose_generator.cpython-38.pyc and /dev/null differ diff --git a/__pycache__/metadata_generator.cpython-36.pyc b/__pycache__/metadata_generator.cpython-36.pyc deleted file mode 100644 index 19eb173b57afdd0bdc0683e4ed77949e298cd055..0000000000000000000000000000000000000000 Binary files a/__pycache__/metadata_generator.cpython-36.pyc and /dev/null differ diff --git a/__pycache__/metadata_generator.cpython-38.pyc b/__pycache__/metadata_generator.cpython-38.pyc deleted file mode 100644 index eed2e9474897c4805dc636277bafade56ab8a337..0000000000000000000000000000000000000000 Binary files a/__pycache__/metadata_generator.cpython-38.pyc and /dev/null differ diff --git a/__pycache__/table_parser.cpython-36.pyc b/__pycache__/table_parser.cpython-36.pyc deleted file mode 100644 index 7272a5afeb889befe0494f81c89160a22c94c6fb..0000000000000000000000000000000000000000 Binary files a/__pycache__/table_parser.cpython-36.pyc and /dev/null differ diff --git a/__pycache__/table_parser.cpython-38.pyc b/__pycache__/table_parser.cpython-38.pyc deleted file mode 100644 index 57f034e254bb97b21195f7f449cc828c52038074..0000000000000000000000000000000000000000 Binary files a/__pycache__/table_parser.cpython-38.pyc and /dev/null differ diff --git a/create_input_instance.py b/create_input_instance.py index 
9ef50aaff9db6a5111f59dceaae63bf24616f220..d01dd88084e6cdaa3fdeaeb3080e43b6d353f128 100644 --- a/create_input_instance.py +++ b/create_input_instance.py @@ -13,9 +13,8 @@ from datetime import datetime """ create_input_instance.py -Create an object containing the data input from the yml file as attributes -This object is then fed to the other scripts -It is to avoid having several times the same code in several files +Create an object containing the data input from the yml file as attributes, which is then fed to the other scripts +This object is created using the data in the input yml file """ @@ -23,7 +22,7 @@ It is to avoid having several times the same code in several files def parse_input(input_file): """ Parse the yml input file to extract data to create the SpeciesData objects - Return a list of dictionaries. Each dictionary contains all the data + Return a list of dictionaries. Each dictionary contains data tied to a species :param input_file: :return: @@ -40,24 +39,58 @@ def parse_input(input_file): try: yaml_dict = yaml.safe_load(stream) for k, v in yaml_dict.items(): + if k == "config": + continue parsed_sp_dict_list.append(v) - except yaml.YAMLError as exc: - logging.debug(exc) + except yaml.YAMLError: + logging.critical("YAMLError raised") + sys.exit() return parsed_sp_dict_list +def parse_args(): + parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction " + "with galaxy instances for GGA" + ", following the protocol @ " + "http://gitlab.sb-roscoff.fr/abims/e-infra/gga") + + parser.add_argument("-i", "--input", + help="Input file (yml)") + + parser.add_argument("-v", "--verbose", + help="Increase output verbosity", + action="store_true") + + parser.add_argument("--deploy-stacks", + help="Create and deploy the stacks of services", + action="store_true") + + parser.add_argument("--load-data", + help="Create src_data directory tree, copy datasets to src_data, and load these datasets " + "into the instance, DEV", + action="store_true") + + parser.add_argument("--run-workflow", + help="Run main workflow (load data into chado, sync all with tripal, " + "index tripal data, populate materialized view, " + "create a jbrowse for the current genus_species_strain_sex and add organism to jbrowse") + + args = parser.parse_args() + + return args + class SpeciesData: """ This class contains attributes and functions to interact with the galaxy container of the GGA environment - + Parent class of LoadData, DeploySpeciesStack and RunWorkflow """ - def __init__(self, parameters_dictionary, args): + def __init__(self, parameters_dictionary): self.parameters_dictionary = parameters_dictionary - self.args = args + self.args = parse_args() # Not a good design self.species = parameters_dictionary["description"]["species"] self.genus = parameters_dictionary["description"]["genus"] self.strain = parameters_dictionary["description"]["strain"] @@ -105,57 +138,8 @@ class SpeciesData: self.do_update = False # Update the instance (in histories corresponding to the input) instead of creating a new one // TODO: move this variable inside methods self.api_key = "dev" # API key used to communicate with the galaxy instance. Set to "dev" for the moment. 
Cannot be used to do user-tied actions self.args = args - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction with galaxy instances for GGA" - ", following the protocol @ " - "http://gitlab.sb-roscoff.fr/abims/e-infra/gga") - - # Dev arguments, TODO: remove in production branch! - parser.add_argument("--full", - help="Run everything, from src_data dir tree creation, moving data files (abims) into src_data," - "modify headers (abims), generate blast banks (doesn't commit them: TODO), initialize GGA instance, load the data and run," - " the main workflow. To update/add data to container, use --update in conjunction to --full (TODO)") - - parser.add_argument("--init-instance", - help="Initialization of galaxy instance. Run first in an empty instance, DEV", - action="store_true") - - parser.add_argument("--deploy-stacks", - help="Create and deploy the stacks of services", - action="store_true") - - parser.add_argument("--load-data", - help="Create src_data directory tree, copy datasets to src_data, and load these datasets into the instance, DEV", - action="store_true") - - parser.add_argument("--run-workflow", - help="Run main workflow (load data into chado, sync all with tripal, " - "index tripal data, populate materialized view, " - "create a jbrowse for the current genus_species_strain_sex and add organism to jbrowse") - - - # Production arguments - parser.add_argument("input", type=str, help="Input file (yml)") - - parser.add_argument("-v", "--verbose", - help="Increase output verbosity", - action="store_false") - - parser.add_argument("--update", - help="Update an already integrated organisms with new data from input file, docker-compose.yml will not be re-generated" - ", assuming the instances for the organisms are already generated and initialized", - action="store_false") - - parser.add_argument("--dir", - help="Path of the main directory, either absolute or relative, defaults to current directory", - default=os.getcwd()) - - args = parser.parse_args() - - if args.verbose: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - + if self.args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) diff --git a/deploy_stacks.py b/deploy_stacks.py index 73b766e502cd646361b7bacd1b1d6d281e0fccba..45a89d013f02a2a7d800ea96c02bd4b96eb3779a 100755 --- a/deploy_stacks.py +++ b/deploy_stacks.py @@ -1,64 +1,31 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -import bioblend -import bioblend.galaxy.objects -from bioblend import galaxy + import argparse import os import subprocess import logging import sys -import json import yaml import re -import metadata_generator -import docker_compose_generator -import table_parser +from gga_autoload.gga_load_data import table_parser import fnmatch import shutil from datetime import datetime -import create_input_instance + """ deploy_stacks.py - - -TODO: -- add config file (inside repo or outside with argument -- update existing history -- clean/delete instance? -- delete stack? -- commit the files for blast banks. - -TODO EOSC/Cloudification: -- divide into 2 general-use scripts - - create docker swarm, stacks, etc... (docker side) - - load data into libraries (method to load it at init, and a method/script to load it separately (galaxy side) (alb: galaxy_data_libs_SI does this already?) 
- -STEPS: -- read input (yml, maybe xlsx later) -- create dir_tree -- DONE -- find and copy data -- DONE -- change file headers, etc.. (ext scripts for data manipulation) -- IN PROGRESS -- generate blast banks and links -- NOT DONE -- generate and edit nginx confs -- DONE -- generate dc and start the containers -- IN PROGRESS -- connect to instance and launch tools>workflows -- IN PROGRESS -- generate and update metadata -- IN PROGRESS - - -NOTES: -- A master API key cannot be used, as some functions are tied to a user (like creating an history), so the access to the - galaxy instance must be done using email and password (definable in yml_example_input.yml) - +Usage: $ python3 deploy_stacks.py -i example.yml [OPTIONS] """ def parse_input(input_file): """ - Parse the yml, json or tabulated input in order to set attributes for the Autoload class + Parse the yml input file to extract data to create the SpeciesData objects + Return a list of dictionaries. Each dictionary contains data tied to a species :param input_file: :return: @@ -75,24 +42,24 @@ def parse_input(input_file): try: yaml_dict = yaml.safe_load(stream) for k, v in yaml_dict.items(): + if k == "config": + pass parsed_sp_dict_list.append(v) - except yaml.YAMLError as exc: - logging.debug(exc) + except yaml.YAMLError as exit_code: + logging.critical(exit_code + " (YAML input file might be incorrect)") + sys.exit() return parsed_sp_dict_list - - -class DeploySpeciesStacks: +class DeploySpeciesStack: """ - The class DeploySpeciesStacks + Deploy a stack of services for a given species """ - def __init__(self, parameters_dictionary, args): + def __init__(self, parameters_dictionary): self.parameters_dictionary = parameters_dictionary - self.args = args self.species = parameters_dictionary["description"]["species"] self.genus = parameters_dictionary["description"]["genus"] self.strain = parameters_dictionary["description"]["strain"] @@ -115,7 +82,8 @@ class DeploySpeciesStacks: self.full_name = " ".join([self.genus_uppercase, self.species, self.strain, self.sex]) self.abbreviation = " ".join([self.genus_lowercase[0], self.species, self.strain, self.sex]) self.genus_species = self.genus_lowercase + "_" + self.species - self.instance_url = "http://scratchgmodv1:8888/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/" # Testing with localhost/scratchgmodv1 + self.instance_url = "http://scratchgmodv1:8888/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/" + # Testing with localhost/scratchgmodv1 self.instance = None self.history_id = None self.library_id = None @@ -129,59 +97,24 @@ class DeploySpeciesStacks: self.datasets = dict() self.source_files = dict() self.workflow_name = None - self.docker_compose_generator = None self.metadata = dict() - self.api_key = "dev" # TODO: set the key in config file --> saved for later (master api key access actions are limited) + self.api_key = "master" # TODO: set the key in config file --> saved for later (master api key access actions are limited) if parameters_dictionary["data"]["parent_directory"] == "" or parameters_dictionary["data"]["parent_directory"] == "/path/to/closest/parent/dir": self.source_data_dir = "/projet/sbr/phaeoexplorer/" # Testing path for phaeoexplorer data else: self.source_data_dir = parameters_dictionary["data"]["parent_directory"] - # Directory/subdirectories where data files are located (fasta, gff, ...), point to a directory as close as possible to the source files + # Directory/subdirectories where data files are located (fasta, gff, ...) 
self.do_update = False - # Update the instance (in histories corresponding to the input) instead of creating a new one // TODO: move this variable inside methods - self.api_key = "dev" - # API key used to communicate with the galaxy instance. Set to "dev" for the moment // TODO: find a way to create, store then use the api key safely - - - # def get_source_data(self, max_depth): - # """ - # TODO: saved for later just in case - # - # Find and copy source data files to src_data directory tree - # - recursively search for the correct files (within a fixed max depth) - # - requires the organism src_data directory tree to already be properly created for the organism (run generate_dir_tree) - # - the source files must have "transcripts", "proteins"/"pep", "genome" in their name, and a gff extension - # - # """ - # src_data_dir = os.path.join(self.species_dir, "/src_data") - # sp_regex = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # example with VARIABLE - # - # # The regex works using the species attribute (unique) --> regex is probably not necessary - # sp_regex = "" - # for i in self.species: - # sp_regex = sp_regex + "?=\w*" + i + ")" - # sp_regex = sp_regex + ")\w+" - # re_dict = dict() - # re_dict["gff"] = None - # re_dict["transcripts"] = None - # re_dict["proteins"] = None - # re_dict["genome"] = None - # reg = None - # - # for dirpath, dirnames, files in os.walk(self.source_data_dir): - # for f in files: - # if self.species and self.sex in f: - # logging.info("File found") - - - - - def generate_dir_tree(self): + # Update the instance (in histories corresponding to the input) instead of creating a new one + self.api_key = "master" + # API key used to communicate with the galaxy instance. Cannot be used to do user-tied actions + self.species_name_regex_litteral = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # Placeholder re + + + def make_directory_tree(self): """ Generate the directory tree for an organism and move datasets into src_data - TODO: DOCKER -- this is the one the "docker" parts of the script - :return: """ @@ -213,14 +146,37 @@ class DeploySpeciesStacks: # self.species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex]) organism_annotation_dir, organism_genome_dir = None, None - # Create src_data dir tree + # Creation (or updating) of the src_data directory tree + # Depth 0-1 try: os.mkdir("./src_data") os.mkdir("./src_data/annotation") os.mkdir("./src_data/genome") os.mkdir("./src_data/tracks") + except FileExistsError: + if self.do_update: + logging.info("Updating src_data directory tree") + else: + logging.debug("The src_data directory tree already exists") + except PermissionError: + logging.critical("Insufficient permission to create src_data directory tree") + sys.exit() + + # Depth 2 + try: os.mkdir("./src_data/annotation/" + self.species_folder_name) os.mkdir("./src_data/genome/" + self.species_folder_name) + except FileExistsError: + if self.do_update: + logging.info("Updating src_data directory tree") + else: + logging.debug("The src_data directory tree already exists") + except PermissionError: + logging.critical("Insufficient permission to create src_data directory tree") + sys.exit() + + # Depth 3 + try: os.mkdir("./src_data/annotation/" + self.species_folder_name + "/OGS" + self.ogs_version) os.mkdir("./src_data/genome/" + self.species_folder_name + "/v" + self.genome_version) organism_annotation_dir = os.path.abspath("./src_data/annotation/" + self.species_folder_name + "/OGS" + 
self.genome_version) @@ -234,6 +190,12 @@ class DeploySpeciesStacks: logging.critical("Insufficient permission to create src_data directory tree") sys.exit() + + def make_compose_files(self): + """ + + :return: + """ # Path to the templates used to generate the custom docker-compose files for an input species stack_template_path = self.script_dir + "/templates/stack-organism.yml" traefik_template_path = self.script_dir + "/templates/traefik.yml" @@ -248,20 +210,27 @@ class DeploySpeciesStacks: with open(stack_template_path, 'r') as infile: organism_content = list() for line in infile: - # One-liner to replace placeholders by the genus and species + # Replace placeholders in the compose file organism_content.append( - line.replace("genus_species", str(self.genus.lower() + "_" + self.species)).replace("Genus species", str(self.genus_uppercase + " " + self.species)).replace("Genus/species", str(self.genus_uppercase + "/" + self.species)).replace("gspecies", str( self.genus.lower()[0] + self.species)).replace("genus_species_strain_sex", genus_species_strain_sex)) + line.replace("genus_species", str(self.genus.lower() + "_" + self.species)).replace("Genus species", + str( + self.genus_uppercase + " " + self.species)).replace( + "Genus/species", str(self.genus_uppercase + "/" + self.species)).replace("gspecies", str( + self.genus.lower()[0] + self.species)).replace("genus_species_strain_sex", + genus_species_strain_sex)) with open("./docker-compose.yml", 'w') as outfile: for line in organism_content: outfile.write(line) - subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Create mounts for the containers + subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir, + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Create mounts for the containers try: os.mkdir("../traefik") os.mkdir("../traefik/authelia") shutil.copy(authelia_config_path, "../traefik/authelia/configuration.yml") shutil.copy(authelia_users_path, "../traefik/authelia/users.yml") # TODO: custom users (add a config file?) 
- subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Create mounts for the containers + subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir, + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Create mounts for the containers except FileExistsError: logging.debug("Traefik directory already exists") try: @@ -271,11 +240,9 @@ class DeploySpeciesStacks: subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir) - def get_source_data_files_from_path(self): """ - Find all files in source_data directory, to link the matching files in the src_data dir tree - + Link data files :return: """ @@ -290,7 +257,7 @@ class DeploySpeciesStacks: organism_genome_dir = os.path.abspath("./src_data/genome/" + self.species_folder_name + "/v" + self.genome_version) for dirpath, dirnames, files in os.walk(self.source_data_dir): - if "0" in str(dirpath): # Ensures to take the correct files (other dirs hold files with the correct names, but I don't know if they are the same) #alb + if "0" in str(dirpath): # Ensures to take the correct files (other dirs hold files with the correct names, but I don't know if they are the same), this is for Phaeoexplorer only for f in files: if "Contaminants" not in str(f): try: @@ -322,7 +289,6 @@ class DeploySpeciesStacks: logging.warning("Error raised (NotADirectoryError)") - def deploy_stack(self): """ Call the script "deploy.sh" used to initiliaze the swarm cluster if needed and launch/update the stack @@ -330,458 +296,10 @@ class DeploySpeciesStacks: :return: """ # Launch and update docker stacks (cf docs) - # TODO: add a fail condition? subprocess.call(["sh", self.script_dir + "/deploy.sh", self.genus_species, self.main_dir + "/traefik"]) - - - def modify_fasta_headers(self): - """ - Change the fasta headers before integration. 
- - :return: - """ - - try: - os.chdir(self.species_dir) - working_dir = os.getcwd() - except OSError: - logging.info("Cannot access " + self.species_dir + ", run with higher privileges") - logging.info("Fatal error: exit") - sys.exit() - self.source_files = dict() - annotation_dir, genome_dir = None, None - for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]: - if "annotation/" in d: - annotation_dir = d - for f in os.listdir(d): - if f.endswith("proteins.fasta"): - self.source_files["proteins_file"] = os.path.join(d, f) - elif f.endswith("transcripts-gff.fa"): - self.source_files["transcripts_file"] = os.path.join(d, f) - elif f.endswith(".gff"): - self.source_files["gff_file"] = os.path.join(d, f) - elif "genome/" in d: - genome_dir = d - for f in os.listdir(d): - if f.endswith(".fa"): - self.source_files["genome_file"] = os.path.join(d, f) - logging.debug("source files found:") - for k, v in self.source_files.items(): - logging.debug("\t" + k + "\t" + v) - - # Changing headers in the *proteins.fasta file from >mRNA* to >protein* - # production version - modify_pep_headers = [str(self.main_dir) + "/gga_load_data/ext_scripts/phaeoexplorer-change_pep_fasta_header.sh", - self.source_files["proteins_file"]] - # test version - # modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh", - # self.source_files["proteins_file"]] - logging.info("Changing fasta headers: " + self.source_files["proteins_file"]) - subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir) - # production version - modify_pep_headers = [str(self.main_dir) + "/gga_load_data/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh", - self.source_files["proteins_file"]] - # test version - # modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_transcript_fasta_header.sh", - # self.source_files["proteins_file"]] - logging.info("Changing fasta headers: " + self.source_files["transcripts_file"]) - subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir) - - # src_data cleaning - if os.path.exists(annotation_dir + "outfile"): - subprocess.run(["mv", annotation_dir + "/outfile", self.source_files["proteins_file"]], - stdout=subprocess.PIPE, - cwd=annotation_dir) - if os.path.exists(annotation_dir + "gmon.out"): - subprocess.run(["rm", annotation_dir + "/gmon.out"], - stdout=subprocess.PIPE, - cwd=annotation_dir) - - - - - def generate_blast_banks(self): - """ - TODO - Automatically generate blast banks for a species - TODO: auto commit the files? 
- - :return: - """ - - - def connect_to_instance(self): - """ - TODO: move in init/access - TODO: password - Test the connection to the galaxy instance for the current organism - Exit if it cannot connect to the instance - """ - self.instance = galaxy.GalaxyInstance(url=self.instance_url, email="gga@sb-roscoff.fr", password="password", verify=False) - logging.info("Connecting to the galaxy instance ...") - try: - self.instance.histories.get_histories() - self.tool_panel = self.instance.tools.get_tool_panel() - except bioblend.ConnectionError: - logging.critical("Cannot connect to galaxy instance @ " + self.instance_url) - sys.exit() - else: - logging.info("Successfully connected to galaxy instance @ " + self.instance_url) - self.instance.histories.create_history(name="FOO") - - - - - - - def setup_data_libraries(self): - """ - - generate blast banks and docker-compose - - load data into the galaxy container with the galaxy_data_libs_SI.py script - - :return: - """ - - try: - logging.info("Loading data into the galaxy container") - subprocess.run("../serexec genus_species_galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py", - shell=True) - except subprocess.CalledProcessError: - logging.info("Cannot load data into the galaxy container for " + self.full_name) - pass - else: - logging.info("Data successfully loaded into the galaxy container for " + self.full_name) - - self.get_species_history_id() - # self.get_instance_attributes() - # - # # import all datasets into current history - # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"]) - # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"]) - # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"]) - # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"]) - - - - - - def get_species_history_id(self): - """ - Set and return the current species history id in its galaxy instance - - :return: - """ - histories = self.instance.histories.get_histories(name=str(self.full_name)) - self.history_id = histories[0]["id"] - self.instance.histories.show_history(history_id=self.history_id) - - return self.history_id - - - - - def create_species_history(self): - histories = self.instance.histories.get_histories(name=str(self.full_name)) - print("\n" + str(histories) + "\n" + self.full_name + "\n") - if not histories: - self.instance.histories.create_history(name="FOO") - print("Created history!") - - - - - - def get_instance_attributes(self): - """ - retrieves instance attributes: - - working history ID - - libraries ID (there should only be one library!) 
- - datasets IDs - - :return: - """ - histories = self.instance.histories.get_histories(name=str(self.full_name)) - self.history_id = histories[0]["id"] - logging.debug("history ID: " + self.history_id) - libraries = self.instance.libraries.get_libraries() # normally only one library - self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library - logging.debug("library ID: " + self.history_id) - instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id) - - folders_ids = {} - current_folder_name = "" - for i in instance_source_data_folders: - for k, v in i.items(): - if k == "name": - folders_ids[v] = 0 - current_folder_name = v - if k == "id": - folders_ids[current_folder_name] = v - logging.info("Folders and datasets IDs: ") - self.datasets = dict() - for k, v in folders_ids.items(): - logging.info("\t" + k + ": " + v) - if k == "/genome": - sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) - for k2, v2 in sub_folder_content.items(): - for e in v2: - if type(e) == dict: - if e["name"].endswith(".fa"): - self.datasets["genome_file"] = e["ldda_id"] - logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) - elif k == "/annotation/" + self.genus_species: - sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) - for k2, v2 in sub_folder_content.items(): - for e in v2: - if type(e) == dict: - # TODO: manage several files of the same type and manage versions - if e["name"].endswith("transcripts-gff.fa"): - self.datasets["transcripts_file"] = e["ldda_id"] - logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) - elif e["name"].endswith("proteins.fasta"): - self.datasets["proteins_file"] = e["ldda_id"] - logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) - elif e["name"].endswith(".gff"): - self.datasets["gff_file"] = e["ldda_id"] - logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) - elif e["name"].endswith("MALE"): - self.datasets["gff_file"] = e["ldda_id"] - logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) - - - - - - - def init_instance(self): - """ - Galaxy instance startup in preparation for running workflows - - remove Homo sapiens from the chado database. 
- - add organism and analyses into the chado database --> separate - - get any other existing organisms IDs before updating the galaxy instance --> separate - - TODO: move the library and analysis/data stuff to a separate function - :return: - """ - - self.connect_to_instance() - self.get_species_history_id() - histories = self.instance.histories.get_histories(name=str(self.full_name)) - # Create the first history - if not histories: - self.instance.histories.create_history(name=str(self.full_name)) - self.history_id = histories[0]["id"] - logging.debug("history ID: " + self.history_id) - # libraries = self.instance.libraries.get_libraries() # routine check: one library - # self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library - logging.debug("library ID: " + self.history_id) - instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id) - - # Delete Homo sapiens from Chado database - logging.debug("Getting 'Homo sapiens' ID in instance's chado database") - get_sapiens_id_job = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2", - history_id=self.history_id, - tool_inputs={"genus": "Homo", "species": "sapiens"}) - get_sapiens_id_job_output = get_sapiens_id_job["outputs"][0]["id"] - get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output) - try: - logging.debug("Deleting Homo 'sapiens' in the instance's chado database") - get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0] - sapiens_id = str( - get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2", - history_id=self.history_id, - tool_inputs={"organism": str(sapiens_id)}) - except bioblend.ConnectionError: - logging.debug("Homo sapiens isn't in the instance's chado database") - except IndexError: - logging.debug("Homo sapiens isn't in the instance's chado database") - pass - - # TODO: the following actions should be done in a separate function (in case if the user wants to do everything him/herself -- for EOSC) - # Add organism (species) to chado - logging.info("Adding organism to the instance's chado database") - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2", - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus, - "species": self.species, - "common": self.common}) - # Add OGS analysis to chado - logging.info("Adding OGS analysis to the instance's chado database") - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", - history_id=self.history_id, - tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version, - "program": "Performed by Genoscope", - "programversion": str("OGS" + self.ogs_version), - "sourcename": "Genoscope", - "date_executed": self.date}) - - # Add genome analysis to chado - logging.info("Adding genome analysis to the instance's chado database") - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", - history_id=self.history_id, - tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version, 
- "program": "Performed by Genoscope", - "programversion": str("genome v" + self.genome_version), - "sourcename": "Genoscope", - "date_executed": self.date}) - self.get_organism_and_analyses_ids() - logging.info("Finished initializing instance") - - - - - - - - def run_workflow(self, workflow_name, workflow_parameters, datamap): - """ - Run the "main" workflow in the galaxy instance - - import data to library - - load fasta and gff - - sync with tripal - - add jbrowse + organism - - fill in the tripal views - - TODO: map tool name to step id - :param workflow_name: - :param workflow_parameters: - :param datamap: - :return: - """ - - logging.debug("running workflow: " + str(workflow_name)) - workflow_ga_file = self.main_dir + "Galaxy-Workflow-" + workflow_name + ".ga" - if self.strain != "": - custom_ga_file = "_".join([self.genus, self.species, self.strain]) + "_workflow.ga" - custom_ga_file_path = os.path.abspath(custom_ga_file) - else: - custom_ga_file = "_".join([self.genus, self.species]) + "_workflow.ga" - custom_ga_file_path = os.path.abspath(custom_ga_file) - with open(workflow_ga_file, 'r') as ga_in_file: - workflow = str(ga_in_file.readlines()) - # ugly fix for the jbrowse parameters - workflow = workflow.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUE_ID\\\\\\\\\\\\"}', - str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"') - workflow = workflow.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"', - str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"') - workflow = workflow.replace("\\\\", "\\") # to restore the correct amount of backslashes in the workflow string before import - # test - workflow = workflow.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}', - "http://localhost/sp/" + self.genus_lowercase+ "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}") - # production - # workflow = workflow.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}', - # "http://abims--gga.sb-roscoff.fr/sp/" + self.genus_lowercase + "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}") - workflow = workflow[2:-2] # if the line under doesn't output a correct json - # workflow = workflow[:-2] # if the line above doesn't output a correct json - - workflow_dict = json.loads(workflow) - - self.instance.workflows.import_workflow_dict(workflow_dict=workflow_dict) - self.workflow_name = workflow_name - workflow_attributes = self.instance.workflows.get_workflows(name=self.workflow_name) - workflow_id = workflow_attributes[0]["id"] - show_workflow = self.instance.workflows.show_workflow(workflow_id=workflow_id) - logging.debug("Workflow ID: " + workflow_id) - - logging.debug("Inputs:") - logging.debug(show_workflow["Inputs"]) - self.instance.workflows.invoke_workflow(workflow_id=workflow_id, - history_id=self.history_id, - params=workflow_parameters, - inputs=datamap, - inputs_by="") - self.instance.workflows.delete_workflow(workflow_id=workflow_id) - - - - - - - def load_data_in_galaxy(self): - """ - Function to load the src_data folder in galaxy - - :return: - """ - - logging.info("Loading data in galaxy") - - return None - - - - - - def get_organism_and_analyses_ids(self): - """ - Retrieve current organism ID and OGS and genome chado analyses IDs (needed to run some tools as Tripal/Chado - doesn't accept organism/analyses names as valid inputs - - :return: - """ - # Get the ID for the 
current organism in chado - org = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2", - history_id=self.history_id, - tool_inputs={"genus": self.genus, "species": self.species}) - org_job_out = org["outputs"][0]["id"] - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out) - try: - org_output = json.loads(org_json_output)[0] - self.org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools - except IndexError: - logging.debug("no organism matching " + self.full_name + " exists in the instance's chado database") - - # Get the ID for the OGS analysis in chado - ogs_analysis = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2", - history_id=self.history_id, - tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version}) - ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"] - ogs_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=ogs_analysis_job_out) - try: - ogs_analysis_output = json.loads(ogs_analysis_json_output)[0] - self.ogs_analysis_id = str(ogs_analysis_output["analysis_id"]) - except IndexError: - logging.debug("no matching OGS analysis exists in the instance's chado database") - - # Get the ID for the genome analysis in chado - genome_analysis = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2", - history_id=self.history_id, - tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version}) - genome_analysis_job_out = genome_analysis["outputs"][0]["id"] - genome_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=genome_analysis_job_out) - try: - genome_analysis_output = json.loads(genome_analysis_json_output)[0] - self.genome_analysis_id = str(genome_analysis_output["analysis_id"]) - except IndexError: - logging.debug("no matching genome analysis exists in the instance's chado database") - - - - - def clean_instance(self): - """ - TODO: method to purge the instance from analyses and organisms - :return: - """ - return None - - - - - def filter_empty_not_empty_items(li): ret = {"empty": [], "not_empty": []} for i in li: @@ -793,42 +311,18 @@ def filter_empty_not_empty_items(li): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction with galaxy instances for GGA" + parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction " + "with galaxy instances for GGA" ", following the protocol @ " "http://gitlab.sb-roscoff.fr/abims/e-infra/gga") - # Dev arguments, TODO: remove in production branch! - parser.add_argument("--full", - help="Run everything, from src_data dir tree creation, moving data files (abims) into src_data," - "modify headers (abims), generate blast banks (doesn't commit them: TODO), initialize GGA instance, load the data and run," - " the main workflow. To update/add data to container, use --update in conjunction to --full (TODO)") - parser.add_argument("--init-instance", - help="Initialization of galaxy instance. 
Run first in an empty instance, DEV", - action="store_true") - parser.add_argument("--load-data", - help="Create src_data directory tree, copy datasets to src_data, and load these datasets into the instance, DEV", - action="store_true") - parser.add_argument("--run-main", - help="Run main workflow (load data into chado, sync all with tripal, " - "index tripal data, populate materialized view, " - "create a jbrowse for the current genus_species_strain_sex and add organism to jbrowse") - parser.add_argument("--generate-docker-compose", - help="Generate docker-compose.yml for current species, DEV") - parser.add_argument("--link-source", - help="Find source files in source data dir and copy them to src_data, DEV, OBSOLETE", - action="store_true") - - # Production arguments - parser.add_argument("input", type=str, help="Input file (yml)") + + parser.add_argument("input", + type=str, + help="Input file (yml)") + parser.add_argument("-v", "--verbose", help="Increase output verbosity", action="store_false") - parser.add_argument("--update", - help="Update an already integrated organisms with new data from input file, docker-compose.yml will not be re-generated" - ", assuming the instances for the organisms are already generated and initialized", - action="store_false") - parser.add_argument("--dir", - help="Path of the main directory, either absolute or relative, defaults to current directory", - default=os.getcwd()) args = parser.parse_args() @@ -837,93 +331,23 @@ if __name__ == "__main__": else: logging.basicConfig(level=logging.INFO) - logging.info("Start") + logging.info("Deploy stacks: start") sp_dict_list = parse_input(args.input) for sp_dict in sp_dict_list: - al = Autoload(parameters_dictionary=sp_dict, args=args) - al.main_dir = os.path.abspath(args.dir) - if args.load_data: - """ - Full workflow - TODO: change later (docker side / load data side / galaxy side) - """ - # al.generate_dir_tree() - # logging.info("Successfully generated the directory tree for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) - # - # # al.get_source_data_files_from_path() - # logging.info("Successfully retrieved source data files for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) - # - # al.deploy_stack() - # logging.info("Successfully deployed containers stack for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) - # - al.connect_to_instance() - logging.info("Connected to instance") - # - # al.create_species_history() - # logging.info("Created a history") - # - # al.setup_data_libraries() - # logging.info("Setting up data libraries") - - # al.init_instance() - # logging.info("Successfully initialized instance for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) - - # al.setup_data_libraries() - # logging.info("Successfully set up data libraries in galaxy for " + al.genus[0].upper() + ". 
" + al.species + " " + al.strain + " " + al.sex) - - - # if args.init_instance: - # logging.info(" Initializing the galaxy instance") - # al.init_instance() - # al.get_instance_attributes() - # # metadata[genus_species_strain_sex]["initialized"] = True - # if args.load_data: - # logging.info("Loading data into galaxy") - # # al.load_data() - # # metadata[genus_species_strain_sex]["data_loaded_in_instance"] = True - # if args.run_main: - # logging.info("Running main workflow") - # al.get_organism_and_analyses_ids() - # workflow_parameters = dict() - # workflow_parameters["0"] = {} - # workflow_parameters["1"] = {} - # workflow_parameters["2"] = {} - # workflow_parameters["3"] = {} - # workflow_parameters["4"] = {"organism": al.org_id, - # "analysis_id": al.genome_analysis_id, - # "do_update": "true"} - # workflow_parameters["5"] = {"organism": al.org_id, - # "analysis_id": al.ogs_analysis_id} - # workflow_parameters["6"] = {"organism_id": al.org_id} - # workflow_parameters["7"] = {"analysis_id": al.ogs_analysis_id} - # workflow_parameters["8"] = {"analysis_id": al.genome_analysis_id} - # workflow_parameters["9"] = {"organism_id": al.org_id} - # workflow_parameters["10"] = {} - # workflow_parameters["11"] = {} - # - # al.datamap = dict() - # al.datamap["0"] = {"src": "hda", "id": al.datasets["genome_file"]} - # al.datamap["1"] = {"src": "hda", "id": al.datasets["gff_file"]} - # al.datamap["2"] = {"src": "hda", "id": al.datasets["proteins_file"]} - # al.datamap["3"] = {"src": "hda", "id": al.datasets["transcripts_file"]} - # - # al.run_workflow(workflow_name="main", workflow_parameters=workflow_parameters, datamap=al.datamap) - # # metadata[genus_species_strain_sex]["workflows_run"] = metadata[genus_species_strain_sex]["workflows_run"].append("main") - # - # if args.link_source: - # print('DEV') - # al.generate_dir_tree() - # print(al.main_dir) - # print(al.species_dir) - - logging.info("Exit") - - - -def main(species_data): - """ - "Main" function + o = DeploySpeciesStack(parameters_dictionary=sp_dict) + o.main_dir = os.path.abspath(args.dir) + + # dss.make_directory_tree() + # logging.info("Successfully generated the directory tree for " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex) + + dss.make_compose_files() + logging.info("Successfully generated the directory tree for " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex) + + # dss.get_source_data_files_from_path() + # logging.info("Successfully retrieved source data files for " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex) + + # dss.deploy_stack() + # logging.info("Successfully deployed containers stack for " + o.genus[0].upper() + ". 
" + o.species + " " + o.strain + " " + o.sex) + + logging.info("Deploy stacks: done") - :return: - """ - print("OK") \ No newline at end of file diff --git a/docker_compose_generator.py b/docker_compose_generator.py index d5fe776c921dee8eda64c9c151a7ab478a26be01..81fdcc3495f97fc5f8e27bac53499e2ab91f6f6a 100755 --- a/docker_compose_generator.py +++ b/docker_compose_generator.py @@ -6,23 +6,9 @@ import logging # import json """ -docker-compose.yml generator -The method "generate" works for both docker-compose architecture (old), or docker stack (new) -This method will write a formatted docker-compose.yml for the specified organism (only requires genus and species) - -Made to work in the integration streamlined script "deploy_stacks.py" but can be used as a standalone (either with a CLI -or in another python file as a module) - -Dockerfiles are specific to genus-species: a same organism can have several strains and sexes integrated, but only one -set of applications are used (see metadata files for details about what strains/sexes have been integrated for -an organism) +docker-compose_generator.py -TODO: write the whole yml dict from scratch (would allow the script to be more reusable into the future and make it -more customizable while being clearer (instead of the default yml string or input docker-compose template) - -TODO: read json - -API master key or galaxy: MASTER_API_KEY: XXXXXXX (alphanum, user prompt or git env variable) +This method will write a formatted docker-compose.yml for the specified organism (only requires genus and species) """ diff --git a/examples/example_input.json b/examples/json_example_input.json similarity index 100% rename from examples/example_input.json rename to examples/json_example_input.json diff --git a/examples/yml_example_input.yml b/examples/yml_example_input.yml index af0fe1213e1ac36cd0011d8bf74cbe6097f74c20..10395abcdb72178d522afc026caf829170ce7336 100644 --- a/examples/yml_example_input.yml +++ b/examples/yml_example_input.yml @@ -3,13 +3,13 @@ # list of species for which the script will have to create these stacks/load data into galaxy/run workflows # Add new config option using a config scalar - -config: # Simple config part, allowing the user to create his/her own admin account (default is gga) - # WARNING: not supported currently, as the default connection is using the gga account - admin: +config: + admins: # Add admin account WARNING: not supported currently, as the default connection through a web browser is using the gga account username: "nflantier" # Desired admin username password: "blanquette" # Desired admin password - email: "noel.flantier@galaxy.org" # Desired admin email + email: "noel.flantier@mail.com" # Desired admin email + master_api_key: "master" # Master API key is useless at the moment + url_prefix: "http://localhost/ # URL prefix to forward ectocarpus_sp1: # Dummy value the user gives to designate the species (isn't used by the script) # Species description, leave blank if unknown or you don't want it to be used diff --git a/setup_data_libraries.py b/galaxy_data_libs_SI.py similarity index 100% rename from setup_data_libraries.py rename to galaxy_data_libs_SI.py diff --git a/load_data.py b/load_data.py index 4f3fbbdad58fe245a2277b8c8a5e2c94d9e03972..dd5cb3da1417d773c75e40318891cfe810f89b94 100644 --- a/load_data.py +++ b/load_data.py @@ -5,25 +5,262 @@ import bioblend import bioblend.galaxy.objects from bioblend import galaxy +import argparse +import os +import subprocess import logging import sys -import deploy_stacks 
-import create_input_instance +import yaml +import re +from datetime import datetime -""" +""" load_data.py -Find source data files using the information provided in the input file. -Copy these source data files over into the src_data directory +Usage: $ python3 deploy_stacks.py -i example.yml [OPTIONS] +""" + -Load the data into Galaxy using the script provided by Anthony Bretaudeau (setup_data_libraries) -Also create/update the species history (TODO: Updating history) +def parse_input(input_file): + """ + Parse the yml input file to extract data to create the SpeciesData objects + Return a list of dictionaries. Each dictionary contains data tied to a species + :param input_file: + :return: + """ -""" + parsed_sp_dict_list = [] + + if str(input_file).endswith("yml") or str(input_file).endswith("yaml"): + logging.debug("Input format used: YAML") + else: + logging.critical("Error, please input a YAML file") + sys.exit() + with open(input_file, 'r') as stream: + try: + yaml_dict = yaml.safe_load(stream) + for k, v in yaml_dict.items(): + if k == "config": + pass + parsed_sp_dict_list.append(v) + except yaml.YAMLError as exit_code: + logging.critical(exit_code + " (YAML input file might be incorrect)") + sys.exit() + return parsed_sp_dict_list + + +class LoadData: + """ + Load data from the src_data subfolders into the galaxy instance's history of a given species + + """ + + def __init__(self, parameters_dictionary): + self.parameters_dictionary = parameters_dictionary + self.species = parameters_dictionary["description"]["species"] + self.genus = parameters_dictionary["description"]["genus"] + self.strain = parameters_dictionary["description"]["strain"] + self.sex = parameters_dictionary["description"]["sex"] + self.common = parameters_dictionary["description"]["common_name"] + self.date = datetime.today().strftime("%Y-%m-%d") + self.origin = parameters_dictionary["description"]["origin"] + self.performed = parameters_dictionary["data"]["performed_by"] + if parameters_dictionary["data"]["genome_version"] == "": + self.genome_version = "1.0" + else: + self.genome_version = parameters_dictionary["data"]["genome_version"] + if parameters_dictionary["data"]["ogs_version"] == "": + self.ogs_version = "1.0" + else: + self.ogs_version = parameters_dictionary["data"]["ogs_version"] + self.genus_lowercase = self.genus[0].lower() + self.genus[1:] + self.genus_uppercase = self.genus[0].upper() + self.genus[1:] + self.species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex]) + self.full_name = " ".join([self.genus_uppercase, self.species, self.strain, self.sex]) + self.abbreviation = " ".join([self.genus_lowercase[0], self.species, self.strain, self.sex]) + self.genus_species = self.genus_lowercase + "_" + self.species + self.instance_url = "http://scratchgmodv1:8888/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/" + # Testing with localhost/scratchgmodv1 + self.instance = None + self.history_id = None + self.library_id = None + self.script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) + self.main_dir = None + self.species_dir = None + self.org_id = None + self.genome_analysis_id = None + self.ogs_analysis_id = None + self.tool_panel = None + self.datasets = dict() + self.source_files = dict() + self.workflow_name = None + self.metadata = dict() + self.api_key = "master" # TODO: set the key in config file --> saved for later (master api key access actions are limited) + if parameters_dictionary["data"]["parent_directory"] == "" or 
parameters_dictionary["data"]["parent_directory"] == "/path/to/closest/parent/dir": + self.source_data_dir = "/projet/sbr/phaeoexplorer/" # Testing path for phaeoexplorer data + else: + self.source_data_dir = parameters_dictionary["data"]["parent_directory"] + # Directory/subdirectories where data files are located (fasta, gff, ...) + self.do_update = False + # Update the instance (in histories corresponding to the input) instead of creating a new one + self.api_key = "master" + # API key used to communicate with the galaxy instance. Cannot be used to do user-tied actions + self.species_name_regex_litteral = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # Placeholder re + + + def modify_fasta_headers(self): + """ + Change the fasta headers before integration. + + :return: + """ + + try: + os.chdir(self.species_dir) + working_dir = os.getcwd() + except OSError: + logging.info("Cannot access " + self.species_dir + ", run with higher privileges") + logging.info("Fatal error: exit") + sys.exit() + self.source_files = dict() + annotation_dir, genome_dir = None, None + for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]: + if "annotation/" in d: + annotation_dir = d + for f in os.listdir(d): + if f.endswith("proteins.fasta"): + self.source_files["proteins_file"] = os.path.join(d, f) + elif f.endswith("transcripts-gff.fa"): + self.source_files["transcripts_file"] = os.path.join(d, f) + elif f.endswith(".gff"): + self.source_files["gff_file"] = os.path.join(d, f) + elif "genome/" in d: + genome_dir = d + for f in os.listdir(d): + if f.endswith(".fa"): + self.source_files["genome_file"] = os.path.join(d, f) + logging.debug("source files found:") + for k, v in self.source_files.items(): + logging.debug("\t" + k + "\t" + v) + + # Changing headers in the *proteins.fasta file from >mRNA* to >protein* + # production version + modify_pep_headers = [str(self.main_dir) + "/gga_load_data/utils/phaeoexplorer-change_pep_fasta_header.sh", + self.source_files["proteins_file"]] + # test version + # modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh", + # self.source_files["proteins_file"]] + logging.info("Changing fasta headers: " + self.source_files["proteins_file"]) + subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir) + # production version + modify_pep_headers = [str(self.main_dir) + "/gga_load_data/utils/phaeoexplorer-change_transcript_fasta_header.sh", + self.source_files["proteins_file"]] + # test version + # modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_transcript_fasta_header.sh", + # self.source_files["proteins_file"]] + logging.info("Changing fasta headers: " + self.source_files["transcripts_file"]) + subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir) + + # src_data cleaning + if os.path.exists(annotation_dir + "outfile"): + subprocess.run(["mv", annotation_dir + "/outfile", self.source_files["proteins_file"]], + stdout=subprocess.PIPE, + cwd=annotation_dir) + if os.path.exists(annotation_dir + "gmon.out"): + subprocess.run(["rm", annotation_dir + "/gmon.out"], + stdout=subprocess.PIPE, + cwd=annotation_dir) + + + def setup_data_libraries(self): + """ + - generate blast banks and docker-compose + - load data into the galaxy container with the galaxy_data_libs_SI.py script + + :return: + """ + + try: + logging.info("Loading data into the galaxy container") + subprocess.run("../serexec genus_species_galaxy /tool_deps/_conda/bin/python /opt/galaxy_data_libs_SI.py", 
shell=True) + except subprocess.CalledProcessError: + logging.info("Cannot load data into the galaxy container for " + self.full_name) + pass + else: + logging.info("Data successfully loaded into the galaxy container for " + self.full_name) + + self.get_species_history_id() + # self.get_instance_attributes() + # + # # import all datasets into current history + # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"]) + # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"]) + # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"]) + # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"]) + + + + def generate_blast_banks(self): + """ + Automatically generate blast banks for a species and commit + + :return: + """ + + + def connect_to_instance(self): + """ + Test the connection to the galaxy instance for the current organism + Exit if it cannot connect to the instance + """ + self.instance = galaxy.GalaxyInstance(url=self.instance_url, email="gga@sb-roscoff.fr", password="password", + verify=False) + logging.info("Connecting to the galaxy instance ...") + try: + self.instance.histories.get_histories() + self.tool_panel = self.instance.tools.get_tool_panel() + except bioblend.ConnectionError: + logging.critical("Cannot connect to galaxy instance @ " + self.instance_url) + sys.exit() + else: + logging.info("Successfully connected to galaxy instance @ " + self.instance_url) + self.instance.histories.create_history(name="FOO") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction " + "with galaxy instances for GGA" + ", following the protocol @ " + "http://gitlab.sb-roscoff.fr/abims/e-infra/gga") + + parser.add_argument("input", + type=str, + help="Input file (yml)") + + parser.add_argument("-v", "--verbose", + help="Increase output verbosity", + action="store_false") + + args = parser.parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + logging.info("Load data: start") + sp_dict_list = parse_input(args.input) + for sp_dict in sp_dict_list: + o = LoadData(parameters_dictionary=sp_dict) + o.main_dir = os.path.abspath(args.dir) + o.modify_fasta_headers() + logging.info("Successfully formatted files headers " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex) + # o.setup_data_libraries() + # logging.info("Successfully set up data libraries in galaxy for " + o.genus[0].upper() + ". 
" + o.species + " " + o.strain + " " + o.sex) + logging.info("Load data: done") diff --git a/run_workflow.py b/run_workflow.py index 836e3e88982589c26a669cd7248b9997bfa67f9a..00e0c8222e858faaae658a18db7cd3b18fd3c74b 100644 --- a/run_workflow.py +++ b/run_workflow.py @@ -1,2 +1,464 @@ #!/usr/bin/python # -*- coding: utf-8 -*- + + +import bioblend +import bioblend.galaxy.objects +from bioblend import galaxy +import argparse +import os +import subprocess +import logging +import sys +import yaml +import re +from gga_autoload.gga_load_data import metadata_generator + +""" +deploy_stacks.py + +Usage: $ python3 deploy_stacks.py -i example.yml [OPTIONS] +""" + + +def parse_input(input_file): + """ + Parse the yml input file to extract data to create the SpeciesData objects + Return a list of dictionaries. Each dictionary contains data tied to a species + + :param input_file: + :return: + """ + + parsed_sp_dict_list = [] + + if str(input_file).endswith("yml") or str(input_file).endswith("yaml"): + logging.debug("Input format used: YAML") + else: + logging.critical("Error, please input a YAML file") + sys.exit() + with open(input_file, 'r') as stream: + try: + yaml_dict = yaml.safe_load(stream) + for k, v in yaml_dict.items(): + if k == "config": + pass + parsed_sp_dict_list.append(v) + except yaml.YAMLError as exit_code: + logging.critical(exit_code + " (YAML input file might be incorrect)") + sys.exit() + return parsed_sp_dict_list + + +class RunWorkflow: + """ + Run a workflow into the galaxy instance's history of a given species + + """ + + def __init__(self, parameters_dictionary): + self.parameters_dictionary = parameters_dictionary + self.species = parameters_dictionary["description"]["species"] + self.genus = parameters_dictionary["description"]["genus"] + self.strain = parameters_dictionary["description"]["strain"] + self.sex = parameters_dictionary["description"]["sex"] + self.common = parameters_dictionary["description"]["common_name"] + self.date = datetime.today().strftime("%Y-%m-%d") + self.origin = parameters_dictionary["description"]["origin"] + self.performed = parameters_dictionary["data"]["performed_by"] + if parameters_dictionary["data"]["genome_version"] == "": + self.genome_version = "1.0" + else: + self.genome_version = parameters_dictionary["data"]["genome_version"] + if parameters_dictionary["data"]["ogs_version"] == "": + self.ogs_version = "1.0" + else: + self.ogs_version = parameters_dictionary["data"]["ogs_version"] + self.genus_lowercase = self.genus[0].lower() + self.genus[1:] + self.genus_uppercase = self.genus[0].upper() + self.genus[1:] + self.species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex]) + self.full_name = " ".join([self.genus_uppercase, self.species, self.strain, self.sex]) + self.abbreviation = " ".join([self.genus_lowercase[0], self.species, self.strain, self.sex]) + self.genus_species = self.genus_lowercase + "_" + self.species + self.instance_url = "http://scratchgmodv1:8888/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/" + # Testing with localhost/scratchgmodv1 + self.instance = None + self.history_id = None + self.library_id = None + self.script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) + self.main_dir = None + self.species_dir = None + self.org_id = None + self.genome_analysis_id = None + self.ogs_analysis_id = None + self.tool_panel = None + self.datasets = dict() + self.source_files = dict() + self.workflow_name = None + self.metadata = dict() + self.api_key = "master" + if 
parameters_dictionary["data"]["parent_directory"] == "" or parameters_dictionary["data"]["parent_directory"] == "/path/to/closest/parent/dir": + self.source_data_dir = "/projet/sbr/phaeoexplorer/" # Testing path for phaeoexplorer data + else: + self.source_data_dir = parameters_dictionary["data"]["parent_directory"] + # Directory/subdirectories where data files are located (fasta, gff, ...) + self.do_update = False + # Update the instance (in histories corresponding to the input) instead of creating a new one + self.api_key = "master" + # API key used to communicate with the galaxy instance. Cannot be used to do user-tied actions + self.species_name_regex_litteral = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # Placeholder re + + + def get_species_history_id(self): + """ + Set and return the current species history id in its galaxy instance + + :return: + """ + histories = self.instance.histories.get_histories(name=str(self.full_name)) + self.history_id = histories[0]["id"] + self.instance.histories.show_history(history_id=self.history_id) + + return self.history_id + + + def create_species_history(self): + histories = self.instance.histories.get_histories(name=str(self.full_name)) + print("\n" + str(histories) + "\n" + self.full_name + "\n") + if not histories: + self.instance.histories.create_history(name="FOO") + print("Created history!") + + + def get_instance_attributes(self): + """ + retrieves instance attributes: + - working history ID + - libraries ID (there should only be one library!) + - datasets IDs + + :return: + """ + histories = self.instance.histories.get_histories(name=str(self.full_name)) + self.history_id = histories[0]["id"] + logging.debug("history ID: " + self.history_id) + libraries = self.instance.libraries.get_libraries() # normally only one library + self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library + logging.debug("library ID: " + self.history_id) + instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id) + + folders_ids = {} + current_folder_name = "" + for i in instance_source_data_folders: + for k, v in i.items(): + if k == "name": + folders_ids[v] = 0 + current_folder_name = v + if k == "id": + folders_ids[current_folder_name] = v + logging.info("Folders and datasets IDs: ") + self.datasets = dict() + for k, v in folders_ids.items(): + logging.info("\t" + k + ": " + v) + if k == "/genome": + sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) + for k2, v2 in sub_folder_content.items(): + for e in v2: + if type(e) == dict: + if e["name"].endswith(".fa"): + self.datasets["genome_file"] = e["ldda_id"] + logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif k == "/annotation/" + self.genus_species: + sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) + for k2, v2 in sub_folder_content.items(): + for e in v2: + if type(e) == dict: + # TODO: manage several files of the same type and manage versions + if e["name"].endswith("transcripts-gff.fa"): + self.datasets["transcripts_file"] = e["ldda_id"] + logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif e["name"].endswith("proteins.fasta"): + self.datasets["proteins_file"] = e["ldda_id"] + logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif e["name"].endswith(".gff"): + self.datasets["gff_file"] = e["ldda_id"] + logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif e["name"].endswith("MALE"): + 
self.datasets["gff_file"] = e["ldda_id"] + logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + + + def init_instance(self): + """ + Galaxy instance startup in preparation for running workflows + - remove Homo sapiens from the chado database. + - add organism and analyses into the chado database --> separate + - get any other existing organisms IDs before updating the galaxy instance --> separate + + TODO: move the library and analysis/data stuff to a separate function + :return: + """ + + self.connect_to_instance() + self.get_species_history_id() + histories = self.instance.histories.get_histories(name=str(self.full_name)) + # Create the first history + if not histories: + self.instance.histories.create_history(name=str(self.full_name)) + self.history_id = histories[0]["id"] + logging.debug("history ID: " + self.history_id) + # libraries = self.instance.libraries.get_libraries() # routine check: one library + # self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library + logging.debug("library ID: " + self.history_id) + instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id) + + # Delete Homo sapiens from Chado database + logging.debug("Getting 'Homo sapiens' ID in instance's chado database") + get_sapiens_id_job = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2", + history_id=self.history_id, + tool_inputs={"genus": "Homo", "species": "sapiens"}) + get_sapiens_id_job_output = get_sapiens_id_job["outputs"][0]["id"] + get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output) + try: + logging.debug("Deleting Homo 'sapiens' in the instance's chado database") + get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0] + sapiens_id = str( + get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool + self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2", + history_id=self.history_id, + tool_inputs={"organism": str(sapiens_id)}) + except bioblend.ConnectionError: + logging.debug("Homo sapiens isn't in the instance's chado database") + except IndexError: + logging.debug("Homo sapiens isn't in the instance's chado database") + pass + + # TODO: the following actions should be done in a separate function (in case if the user wants to do everything him/herself -- for EOSC) + # Add organism (species) to chado + logging.info("Adding organism to the instance's chado database") + self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2", + history_id=self.history_id, + tool_inputs={"abbr": self.abbreviation, + "genus": self.genus, + "species": self.species, + "common": self.common}) + # Add OGS analysis to chado + logging.info("Adding OGS analysis to the instance's chado database") + self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", + history_id=self.history_id, + tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version, + "program": "Performed by Genoscope", + "programversion": str("OGS" + self.ogs_version), + "sourcename": "Genoscope", + "date_executed": self.date}) + + # Add genome analysis to chado + logging.info("Adding genome analysis to the instance's chado database") + 
self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", + history_id=self.history_id, + tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version, + "program": "Performed by Genoscope", + "programversion": str("genome v" + self.genome_version), + "sourcename": "Genoscope", + "date_executed": self.date}) + self.get_organism_and_analyses_ids() + logging.info("Finished initializing instance") + + + def run_workflow(self, workflow_name, workflow_parameters, datamap): + """ + Run the "main" workflow in the galaxy instance + - import data to library + - load fasta and gff + - sync with tripal + - add jbrowse + organism + - fill in the tripal views + + TODO: map tool name to step id + :param workflow_name: + :param workflow_parameters: + :param datamap: + :return: + """ + + logging.debug("running workflow: " + str(workflow_name)) + workflow_ga_file = self.main_dir + "Galaxy-Workflow-" + workflow_name + ".ga" + if self.strain != "": + custom_ga_file = "_".join([self.genus, self.species, self.strain]) + "_workflow.ga" + custom_ga_file_path = os.path.abspath(custom_ga_file) + else: + custom_ga_file = "_".join([self.genus, self.species]) + "_workflow.ga" + custom_ga_file_path = os.path.abspath(custom_ga_file) + with open(workflow_ga_file, 'r') as ga_in_file: + workflow = str(ga_in_file.readlines()) + # ugly fix for the jbrowse parameters + workflow = workflow.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUE_ID\\\\\\\\\\\\"}', + str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"') + workflow = workflow.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"', + str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"') + workflow = workflow.replace("\\\\", "\\") # to restore the correct amount of backslashes in the workflow string before import + # test + workflow = workflow.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}', + "http://localhost/sp/" + self.genus_lowercase+ "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}") + # production + # workflow = workflow.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}', + # "http://abims--gga.sb-roscoff.fr/sp/" + self.genus_lowercase + "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}") + workflow = workflow[2:-2] # if the line under doesn't output a correct json + # workflow = workflow[:-2] # if the line above doesn't output a correct json + + workflow_dict = json.loads(workflow) + + self.instance.workflows.import_workflow_dict(workflow_dict=workflow_dict) + self.workflow_name = workflow_name + workflow_attributes = self.instance.workflows.get_workflows(name=self.workflow_name) + workflow_id = workflow_attributes[0]["id"] + show_workflow = self.instance.workflows.show_workflow(workflow_id=workflow_id) + logging.debug("Workflow ID: " + workflow_id) + + logging.debug("Inputs:") + logging.debug(show_workflow["Inputs"]) + self.instance.workflows.invoke_workflow(workflow_id=workflow_id, + history_id=self.history_id, + params=workflow_parameters, + inputs=datamap, + inputs_by="") + self.instance.workflows.delete_workflow(workflow_id=workflow_id) + + + def get_organism_and_analyses_ids(self): + """ + Retrieve current organism ID and OGS and genome chado analyses IDs (needed to run some tools as Tripal/Chado + doesn't accept organism/analyses names 
as valid inputs + + :return: + """ + # Get the ID for the current organism in chado + org = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2", + history_id=self.history_id, + tool_inputs={"genus": self.genus, "species": self.species}) + org_job_out = org["outputs"][0]["id"] + org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out) + try: + org_output = json.loads(org_json_output)[0] + self.org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools + except IndexError: + logging.debug("no organism matching " + self.full_name + " exists in the instance's chado database") + + # Get the ID for the OGS analysis in chado + ogs_analysis = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2", + history_id=self.history_id, + tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version}) + ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"] + ogs_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=ogs_analysis_job_out) + try: + ogs_analysis_output = json.loads(ogs_analysis_json_output)[0] + self.ogs_analysis_id = str(ogs_analysis_output["analysis_id"]) + except IndexError: + logging.debug("no matching OGS analysis exists in the instance's chado database") + + # Get the ID for the genome analysis in chado + genome_analysis = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2", + history_id=self.history_id, + tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version}) + genome_analysis_job_out = genome_analysis["outputs"][0]["id"] + genome_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=genome_analysis_job_out) + try: + genome_analysis_output = json.loads(genome_analysis_json_output)[0] + self.genome_analysis_id = str(genome_analysis_output["analysis_id"]) + except IndexError: + logging.debug("no matching genome analysis exists in the instance's chado database") + + + def connect_to_instance(self): + """ + TODO: move in init/access + TODO: password + Test the connection to the galaxy instance for the current organism + Exit if it cannot connect to the instance + """ + self.instance = galaxy.GalaxyInstance(url=self.instance_url, email="gga@sb-roscoff.fr", password="password", + verify=False) + logging.info("Connecting to the galaxy instance ...") + try: + self.instance.histories.get_histories() + self.tool_panel = self.instance.tools.get_tool_panel() + except bioblend.ConnectionError: + logging.critical("Cannot connect to galaxy instance @ " + self.instance_url) + sys.exit() + else: + logging.info("Successfully connected to galaxy instance @ " + self.instance_url) + self.instance.histories.create_history(name="FOO") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction " + "with galaxy instances for GGA" + ", following the protocol @ " + "http://gitlab.sb-roscoff.fr/abims/e-infra/gga") + + parser.add_argument("input", + type=str, + help="Input file (yml)") + + parser.add_argument("-v", "--verbose", + help="Increase output verbosity", + action="store_false") + + args = parser.parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + logging.info("Start") + 
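+    # connect_to_instance() above hard-codes an email/password pair (see the TODO in its docstring).
+    # A possible alternative, assuming the master API key configured in the stack templates
+    # (GALAXY_CONFIG_MASTER_API_KEY) is acceptable for non user-tied actions, would be to build the
+    # client inside connect_to_instance() as:
+    #
+    #     self.instance = galaxy.GalaxyInstance(url=self.instance_url, key=self.api_key, verify=False)
+    #
+    # bioblend accepts either an API key or an email/password pair when creating a GalaxyInstance.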
sp_dict_list = parse_input(args.input)
+
+    for sp_dict in sp_dict_list:
+        o = RunWorkflow(parameters_dictionary=sp_dict)
+        # TODO: the parser above only defines "input" and "--verbose"; add --dir, --init-instance,
+        # --load-data and --run-main options before relying on them below
+        o.main_dir = os.path.abspath(args.dir)
+        if args.init_instance:
+            logging.info("Initializing the galaxy instance")
+            o.init_instance()
+            o.get_instance_attributes()
+            # metadata[genus_species_strain_sex]["initialized"] = True
+        if args.load_data:
+            logging.info("Loading data into galaxy")
+            # o.load_data()
+            # metadata[genus_species_strain_sex]["data_loaded_in_instance"] = True
+        if args.run_main:
+            logging.info("Running main workflow")
+            o.get_organism_and_analyses_ids()
+            workflow_parameters = dict()
+            workflow_parameters["0"] = {}
+            workflow_parameters["1"] = {}
+            workflow_parameters["2"] = {}
+            workflow_parameters["3"] = {}
+            workflow_parameters["4"] = {"organism": o.org_id,
+                                        "analysis_id": o.genome_analysis_id,
+                                        "do_update": "true"}
+            workflow_parameters["5"] = {"organism": o.org_id,
+                                        "analysis_id": o.ogs_analysis_id}
+            workflow_parameters["6"] = {"organism_id": o.org_id}
+            workflow_parameters["7"] = {"analysis_id": o.ogs_analysis_id}
+            workflow_parameters["8"] = {"analysis_id": o.genome_analysis_id}
+            workflow_parameters["9"] = {"organism_id": o.org_id}
+            workflow_parameters["10"] = {}
+            workflow_parameters["11"] = {}
+
+            o.datamap = dict()
+            o.datamap["0"] = {"src": "hda", "id": o.datasets["genome_file"]}
+            o.datamap["1"] = {"src": "hda", "id": o.datasets["gff_file"]}
+            o.datamap["2"] = {"src": "hda", "id": o.datasets["proteins_file"]}
+            o.datamap["3"] = {"src": "hda", "id": o.datasets["transcripts_file"]}
+
+            o.run_workflow(workflow_name="main", workflow_parameters=workflow_parameters, datamap=o.datamap)
+            # metadata[genus_species_strain_sex]["workflows_run"] = metadata[genus_species_strain_sex]["workflows_run"].append("main")
diff --git a/table_parser.py b/table_parser.py
index 9e55ecd3eb51c9dd0d755d2a4532d798c576df52..9314b91cedb9e7f6809d5524822d282aad2d2532 100755
--- a/table_parser.py
+++ b/table_parser.py
@@ -1,80 +1,79 @@
-import os
-import sys
-import pandas # xlrd required for excel files reading
-import numpy
-import json
-import argparse
-import logging
-from datetime import datetime
-
-"""
-OBSOLETE
-
-Input parser script. 
-Does not work for ods spreadsheets (save as xls or xlsx instead) --> need to handle with pandas_ods_reader (requires ezodf, lxml) -Does not support multiple sheets (TODO: "integration" and "update" sheets (1 and 2)) -See example toy table (toy_table.xls) - -TODO: move this script inside autoload - -standalone usage: python3 table_parser.py <tabulated_file> -d <directory_to_write_json_to (default: cwd)> -""" - - -class TableParser: - - def __init__(self, table_file, dir): - self.dir = os.path.abspath(args.dir) - self.table_file = table_file - self.method = None # TODO: instant launch or just parse (standalone) - self.extension = None - self.meta = dict() - self.json_file = None - - def parse_table(self, extension): - if extension == "xls": - pandas_table = pandas.DataFrame(pandas.read_excel(self.table_file)) - elif extension == "csv": - pandas_table = pandas.DataFrame(pandas.read_csv(self.table_file)) - else: - logging.info("wrong format: input tabulated file cannot be read (supported formats: xls, xlsx, csv)") - sys.exit() - pandas_table = pandas_table.replace(numpy.nan, "", regex=True) - - for char in " ,.()-/": - pandas_table = pandas_table.replace("\\" + char, "_", regex=True) - pandas_table = pandas_table.replace("\\__", "_", regex=True) - pandas_table.loc[pandas_table["genome version"] == "", "genome version"] = "1.0" - pandas_table.loc[pandas_table["ogs version"] == "", "ogs version"] = "1.0" - pandas_table.loc[pandas_table["version"] == "", "version"] = "1.0" - pandas_table.loc[pandas_table["date"] == "", "date"] = datetime.today().strftime("%Y-%m-%d") - with open(os.path.join(self.dir, self.json_file), 'w') as json_file: - json_file.truncate(0) - json_content = list() - for organism in range(0, len(pandas_table.index)): - organism_dict = pandas_table.iloc[organism].to_dict() - for k, v in organism_dict.items(): - v = str(v).split(" ") - v = "_".join(v) - v = v.replace("__", "_") - if v.endswith("_"): - v = v[:-1] - json_content.append(organism_dict) - json.dump(json_content, json_file, indent=4) - - def write_json(self, data, filename): - with open(filename, 'w') as f: - json.dump(data, f, indent=4) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Table parser for phaeoexplorer data") - parser.add_argument("input", type=str, help="input table") - parser.add_argument("-d", "--dir", type=str, help="Where to write the output json file that is be used for integration", default = os.getcwd()) - args = parser.parse_args() - - if args.input.endswith("xlsx") or args.input.endswith("xls"): - tp = TableParser(table_file=args.input, dir=args.dir) - tp.extension = args.input.split(".")[1] - tp.json_file = tp.dir + "/dataloader_" + datetime.today().strftime("%Y%m%d") + ".json" - tp.parse_table(extension="xls") +import os +import sys +import pandas # xlrd required for excel files reading +import numpy +import json +import argparse +import logging +from datetime import datetime + +""" +!! OBSOLETE !! + +Input parser script. 
+Does not work for ods spreadsheets (save as xls or xlsx instead) --> need to handle with pandas_ods_reader (requires ezodf, lxml) +Does not support multiple sheets (TODO: "integration" and "update" sheets (1 and 2)) +See example toy table (toy_table.xls) + +standalone usage: python3 table_parser.py <tabulated_file> -d <directory_to_write_json_to (default: cwd)> + +""" + + +class TableParser: + + def __init__(self, table_file, dir): + self.dir = os.path.abspath(args.dir) + self.table_file = table_file + self.method = None # TODO: instant launch or just parse (standalone) + self.extension = None + self.meta = dict() + self.json_file = None + + def parse_table(self, extension): + if extension == "xls": + pandas_table = pandas.DataFrame(pandas.read_excel(self.table_file)) + elif extension == "csv": + pandas_table = pandas.DataFrame(pandas.read_csv(self.table_file)) + else: + logging.info("wrong format: input tabulated file cannot be read (supported formats: xls, xlsx, csv)") + sys.exit() + pandas_table = pandas_table.replace(numpy.nan, "", regex=True) + + for char in " ,.()-/": + pandas_table = pandas_table.replace("\\" + char, "_", regex=True) + pandas_table = pandas_table.replace("\\__", "_", regex=True) + pandas_table.loc[pandas_table["genome version"] == "", "genome version"] = "1.0" + pandas_table.loc[pandas_table["ogs version"] == "", "ogs version"] = "1.0" + pandas_table.loc[pandas_table["version"] == "", "version"] = "1.0" + pandas_table.loc[pandas_table["date"] == "", "date"] = datetime.today().strftime("%Y-%m-%d") + with open(os.path.join(self.dir, self.json_file), 'w') as json_file: + json_file.truncate(0) + json_content = list() + for organism in range(0, len(pandas_table.index)): + organism_dict = pandas_table.iloc[organism].to_dict() + for k, v in organism_dict.items(): + v = str(v).split(" ") + v = "_".join(v) + v = v.replace("__", "_") + if v.endswith("_"): + v = v[:-1] + json_content.append(organism_dict) + json.dump(json_content, json_file, indent=4) + + def write_json(self, data, filename): + with open(filename, 'w') as f: + json.dump(data, f, indent=4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Table parser for phaeoexplorer data") + parser.add_argument("input", type=str, help="input table") + parser.add_argument("-d", "--dir", type=str, help="Where to write the output json file that is be used for integration", default = os.getcwd()) + args = parser.parse_args() + + if args.input.endswith("xlsx") or args.input.endswith("xls"): + tp = TableParser(table_file=args.input, dir=args.dir) + tp.extension = args.input.split(".")[1] + tp.json_file = tp.dir + "/dataloader_" + datetime.today().strftime("%Y%m%d") + ".json" + tp.parse_table(extension="xls") diff --git a/templates/compose-template.yml b/templates/compose-template.yml index 590923cd3b27ff536dd15d51931524783b5c52a3..b3b85789b9cb74224cef40cffd643f35c6a9eb38 100755 --- a/templates/compose-template.yml +++ b/templates/compose-template.yml @@ -81,7 +81,7 @@ services: galaxy: image: quay.io/galaxy-genome-annotation/docker-galaxy-annotation:gmod volumes: - - ../galaxy_data_libs_SI.py:/opt/setup_data_libraries.py + - ../galaxy_data_libs_SI.py:/opt/galaxy_data_libs_SI.py - ./docker_data/galaxy:/export - ./src_data/:/project_data:ro #- /groups/XXX/:/groups/XXX/:ro # We do this when we have symlinks in src_data pointing to /groups/XXX/... 
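Editor's note: the rewritten table_parser.py above normalizes the spreadsheet in three steps: blank out NaN cells, replace separator characters with underscores, and default missing versions/dates. The sketch below is a hedged, standalone version of that normalization (column names and the output call are assumptions taken from the script) which, unlike the loop in parse_table, writes the cleaned value back into the per-organism dict before dumping it.

import json
from datetime import datetime

import numpy
import pandas  # xlrd required for excel files reading


def normalize_table(path):
    # Read the spreadsheet and blank out missing cells
    table = pandas.read_excel(path).replace(numpy.nan, "", regex=True)
    # Replace separator characters with underscores, then collapse doubled underscores
    for char in " ,.()-/":
        table = table.replace("\\" + char, "_", regex=True)
    table = table.replace("\\__", "_", regex=True)
    # Default missing versions and dates, only if the column is present
    defaults = (("genome version", "1.0"),
                ("ogs version", "1.0"),
                ("version", "1.0"),
                ("date", datetime.today().strftime("%Y-%m-%d")))
    for column, default in defaults:
        if column in table.columns:
            table.loc[table[column] == "", column] = default
    # Build one dict per organism, storing the cleaned values
    organisms = []
    for _, row in table.iterrows():
        organisms.append({k: str(v).replace(" ", "_").replace("__", "_").rstrip("_")
                          for k, v in row.to_dict().items()})
    return organisms


# Example usage (toy_table.xls is the example table shipped with the repo):
# print(json.dumps(normalize_table("toy_table.xls"), indent=4))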
diff --git a/templates/stack-organism.yml b/templates/stack-organism.yml index 519b96f55d309b748c52b0b33d231727dce72870..103757f4df201e28cfde0b22c11e2105a3bc267c 100644 --- a/templates/stack-organism.yml +++ b/templates/stack-organism.yml @@ -112,7 +112,7 @@ services: galaxy: image: quay.io/galaxy-genome-annotation/docker-galaxy-annotation:gmod volumes: - - ../galaxy_data_libs_SI.py:/opt/setup_data_libraries.py + - ../galaxy_data_libs_SI.py:/opt/galaxy_data_libs_SI.py - ./docker_data/galaxy/:/export/ - ./src_data/:/project_data/:ro #- /groups/XXX/:/groups/XXX/:ro # We do this when we have symlinks in src_data pointing to /groups/XXX/... @@ -130,7 +130,7 @@ services: GALAXY_DEFAULT_ADMIN_USER: "gga" GALAXY_DEFAULT_ADMIN_PASSWORD: "password" GALAXY_CONFIG_ADMIN_USERS: "admin@galaxy.org, gga@sb-roscoff.fr, lgueguen@sb-roscoff.fr, alebars@sb-roscoff.fr" # admin@galaxy.org is the default (leave it), gogepp@bipaa is a shared ldap user we use to connect - GALAXY_CONFIG_MASTER_API_KEY: "dev" + GALAXY_CONFIG_MASTER_API_KEY: "master" ENABLE_FIX_PERMS: 0 PROXY_PREFIX: /sp/genus_species/galaxy GALAXY_TRIPAL_URL: http://tripal.genus_species/tripal/ diff --git a/ext_scripts/__init__.py b/utils/__init__.py similarity index 100% rename from ext_scripts/__init__.py rename to utils/__init__.py diff --git a/ext_scripts/blastdb.py b/utils/blastdb.py similarity index 100% rename from ext_scripts/blastdb.py rename to utils/blastdb.py diff --git a/ext_scripts/common-stringSubsitute.py b/utils/common-stringSubsitute.py similarity index 97% rename from ext_scripts/common-stringSubsitute.py rename to utils/common-stringSubsitute.py index c32a177b83f45b3ee68c45151c3bb36147561d7a..c4d22a9fe017a03feb3b276047924353fd864406 100755 --- a/ext_scripts/common-stringSubsitute.py +++ b/utils/common-stringSubsitute.py @@ -1,37 +1,37 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import argparse -import os -import re -import sys - -# Return the file obtained by replacing the occurrences of pattern by the replacement string. -#Â Use of python method re.sub() -# python common-stringSubsitute.py -f file -p pattern -r replacement_string -# ex : python common-stringSubsitute.py -f file -p '(tRNA)(\w{3})(\w{3})' -r '\g<1>-\g<2>(\g<3>)' - -if __name__ == '__main__': - - #Get arguments - parser = argparse.ArgumentParser(description="Return the file obtained by replacing the occurrences of pattern by the replacement string. Use of python method re.sub(). Example: python common-stringSubsitute.py -f file -p '(tRNA)(\w{3})(\w{3})' -r '\g<1>-\g<2>(\g<3>)'") - parser.add_argument('-i','--infile', help='Input file', required=True) - parser.add_argument('-o','--outfile', help='Output file', default='outfile') - parser.add_argument('-p','--pattern', help='Pattern string to be replaced', required=True) - parser.add_argument('-r','--repl', help='Replacement string', required=True) - args = parser.parse_args() - - infilename=args.infile - outfilename=args.outfile - pattern=args.pattern - repl=args.repl - - infile=open(infilename,'r') - outfile=open(outfilename,'w') - - lines=infile.readlines() - - for line in lines : - line_out=re.sub(pattern,repl,line) - outfile.write(line_out) - +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import argparse +import os +import re +import sys + +# Return the file obtained by replacing the occurrences of pattern by the replacement string. 
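+# A hedged variant of the same substitution using context managers, so both files are closed even
+# if re.sub() raises (sketch only, same --infile/--outfile/--pattern/--repl arguments as below):
+#
+#     with open(infilename, 'r') as infile, open(outfilename, 'w') as outfile:
+#         for line in infile:
+#             outfile.write(re.sub(pattern, repl, line))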
+#Â Use of python method re.sub() +# python common-stringSubsitute.py -f file -p pattern -r replacement_string +# ex : python common-stringSubsitute.py -f file -p '(tRNA)(\w{3})(\w{3})' -r '\g<1>-\g<2>(\g<3>)' + +if __name__ == '__main__': + + #Get arguments + parser = argparse.ArgumentParser(description="Return the file obtained by replacing the occurrences of pattern by the replacement string. Use of python method re.sub(). Example: python common-stringSubsitute.py -f file -p '(tRNA)(\w{3})(\w{3})' -r '\g<1>-\g<2>(\g<3>)'") + parser.add_argument('-i','--infile', help='Input file', required=True) + parser.add_argument('-o','--outfile', help='Output file', default='outfile') + parser.add_argument('-p','--pattern', help='Pattern string to be replaced', required=True) + parser.add_argument('-r','--repl', help='Replacement string', required=True) + args = parser.parse_args() + + infilename=args.infile + outfilename=args.outfile + pattern=args.pattern + repl=args.repl + + infile=open(infilename,'r') + outfile=open(outfilename,'w') + + lines=infile.readlines() + + for line in lines : + line_out=re.sub(pattern,repl,line) + outfile.write(line_out) + outfile.close() \ No newline at end of file diff --git a/ext_scripts/phaeoexplorer-change_pep_fasta_header.sh b/utils/phaeoexplorer-change_pep_fasta_header.sh similarity index 96% rename from ext_scripts/phaeoexplorer-change_pep_fasta_header.sh rename to utils/phaeoexplorer-change_pep_fasta_header.sh index 0de7b9b7bada4edb88dff1d6422c34c1bfbbd4e8..3cf614f745bfaef03725038f7bb9fac84a00011b 100755 --- a/ext_scripts/phaeoexplorer-change_pep_fasta_header.sh +++ b/utils/phaeoexplorer-change_pep_fasta_header.sh @@ -1,17 +1,17 @@ -#!/usr/bin/env bash - -INFILE=$1 -OUTFILE=tmpfile - -FILE_HEADER_START=$(grep ">" $INFILE | cut -c 1-6 | sort | uniq) -HEADER_START_STRING=">mRNA." - -if [[ "$FILE_HEADER_START" == "$HEADER_START_STRING" ]] -then - /usr/local/genome2/mmo/scripts/common/common-stringSubstitute.py -i $INFILE -o $OUTFILE -p '^>mRNA' -r '>protein' - mv $OUTFILE $INFILE - echo "'>mRNA' replaced by '>protein'" -else - echo "Abort. Not all headers start with '>mRNA.':" - echo "$FILE_HEADER_START" +#!/usr/bin/env bash + +INFILE=$1 +OUTFILE=tmpfile + +FILE_HEADER_START=$(grep ">" $INFILE | cut -c 1-6 | sort | uniq) +HEADER_START_STRING=">mRNA." + +if [[ "$FILE_HEADER_START" == "$HEADER_START_STRING" ]] +then + /usr/local/genome2/mmo/scripts/common/common-stringSubstitute.py -i $INFILE -o $OUTFILE -p '^>mRNA' -r '>protein' + mv $OUTFILE $INFILE + echo "'>mRNA' replaced by '>protein'" +else + echo "Abort. 
Not all headers start with '>mRNA.':" + echo "$FILE_HEADER_START" fi \ No newline at end of file diff --git a/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh b/utils/phaeoexplorer-change_transcript_fasta_header.sh similarity index 100% rename from ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh rename to utils/phaeoexplorer-change_transcript_fasta_header.sh diff --git a/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh.bak b/utils/phaeoexplorer-change_transcript_fasta_header.sh.bak similarity index 97% rename from ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh.bak rename to utils/phaeoexplorer-change_transcript_fasta_header.sh.bak index 196675b503a42188dce58d3b930e1b804aab6868..12ce4e56544070af8daddcb3f981b7e0dc81f3fd 100755 --- a/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh.bak +++ b/utils/phaeoexplorer-change_transcript_fasta_header.sh.bak @@ -1,7 +1,7 @@ -#!/usr/bin/env bash - -INFILE=$1 -OUTFILE=tmpfile -/home/fr2424/sib/alebars/gga_load_data/ext_scripts/common-stringSubsitute.py -i $INFILE -o $OUTFILE -p '^>\d+ mRNA' -r '>mRNA' -mv $OUTFILE $INFILE -echo "'>[0-9]+ mRNA' replaced by '>mRNA' in $1" +#!/usr/bin/env bash + +INFILE=$1 +OUTFILE=tmpfile +/home/fr2424/sib/alebars/gga_load_data/ext_scripts/common-stringSubsitute.py -i $INFILE -o $OUTFILE -p '^>\d+ mRNA' -r '>mRNA' +mv $OUTFILE $INFILE +echo "'>[0-9]+ mRNA' replaced by '>mRNA' in $1"
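
Editor's note: the two header-renaming helpers above shell out to the string-substitution script with hard-coded paths. The following is a hedged, self-contained Python sketch of the same checks and substitutions (the file names are placeholders; the header conventions, ">mRNA." for protein files and ">[0-9]+ mRNA" for transcript files, are taken from the scripts above). It is a sketch of the logic, not a drop-in replacement for the shell scripts.

import re
import sys


def rename_fasta_headers(path, pattern, replacement, expected_prefix=None):
    """Rewrite fasta headers in place, mirroring phaeoexplorer-change_*_fasta_header.sh."""
    with open(path) as handle:
        lines = handle.readlines()
    headers = [line for line in lines if line.startswith(">")]
    # Abort if the headers do not match the expected convention (as the pep script does)
    if expected_prefix is not None and not all(h.startswith(expected_prefix) for h in headers):
        sys.exit("Abort. Not all headers start with " + repr(expected_prefix) + " in " + path)
    with open(path, "w") as handle:
        for line in lines:
            handle.write(re.sub(pattern, replacement, line) if line.startswith(">") else line)


# '>mRNA.xxx' becomes '>protein.xxx' in the proteins file,
# '>123 mRNA xxx' becomes '>mRNA xxx' in the transcripts file (placeholder file names).
rename_fasta_headers("proteins.fasta", r"^>mRNA", ">protein", expected_prefix=">mRNA.")
rename_fasta_headers("transcripts-gff.fa", r"^>\d+ mRNA", ">mRNA")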