From 0ab13844b2a8491d04757ea06373b7213b5a939a Mon Sep 17 00:00:00 2001 From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr> Date: Fri, 4 Sep 2020 11:12:02 +0200 Subject: [PATCH] Deploy/load parts, docker deploy stacks fixes --- .gitignore | 7 + README.md | 147 ++-- .../docker_compose_generator.cpython-38.pyc | Bin 2644 -> 2644 bytes __pycache__/table_parser.cpython-38.pyc | Bin 2694 -> 2694 bytes deploy.sh | 27 +- autoload.py => deploy_stacks.py | 722 +++++++++++------- docker_compose_generator.py | 4 +- examples/example.yml | 60 ++ examples/phaeoexplorer_test.json | 2 + load_data.py | 26 + requirements.txt | 2 +- ...ta_libraries.py => setup_data_libraries.py | 2 +- table_parser.py | 9 +- templates/stack-organism.yml | 3 +- templates/traefik.yml | 2 +- 15 files changed, 643 insertions(+), 370 deletions(-) create mode 100644 .gitignore rename autoload.py => deploy_stacks.py (54%) create mode 100644 examples/example.yml create mode 100644 load_data.py rename ext_scripts/setup_data_libraries.py => setup_data_libraries.py (99%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..07e2dd1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__ +.idea +phaeoexplorer_test.json +example.json +example.xlsx +*.bak +undaria_pinnatifida diff --git a/README.md b/README.md index 8cc6536..590e82b 100755 --- a/README.md +++ b/README.md @@ -1,30 +1,27 @@ # gga_load_data (WIP) -Automated integration of new organisms into GGA instances. +Automated integration of new organisms into GGA environments as a form of a docker stack of services. ## Description: -This script is made for automatically integrating new organisms into GGA instances as part of the phaeexplorer project. -As input, the script either takes a tabulated file (xls, xlsx or csv) or a json file describing the organism for which it has to create/update instances. -For each organism to be integrated, the script needs at least its genus and species (strain, sex, genome and annotation files versions are optional, but the two later will be set to the default version of 1.0, and the two former will be set as empty and will not being considered during the integration process). -See example datasets (example.json and example.xlsx) for an example of what information can be described and the correct formatting of these input files. The script should then take of everything (for phaeoexplorer organisms), from generating the directory tree to running workflows and tools in the galaxy instance. +Automatically generate functional GGA environments from a descriptive input file. +See example datasets (example.json, example.yml or example.xlsx) for an example of what information can be described +and the correct formatting of these input files. -## TODO: -- ready the script for production (prepare production arguments) + remove dev args for master -- metadata -- call the scripts for formatting data and generate blast banks +"gga_load_data" in its current version is divided in three (automated) parts: +- Create the stacks of services for the input organisms (orchestrated using docker swarm, with traefik used as a networking interface between the different stacks) +- Load the organisms datasets into the galaxy instance +- Remotely run a custom workflow in galaxy ## Metadata files (WIP): -The script also generates a metadata file in the directory of the newly integrated species, summing up what actions were taken for this organism (see meta_toy.yaml for -the kind of information it can contain). 
It also creates another metadata files in the main directory (where you put all the organisms you have integrated), which contains the sum of all metadata files from all integrated organisms. These metadata files are also updated when updating an existing instance. - -## nginx conf (WIP): -The default.conf will be automatically generated (automatic port affectation), APIs will be able to bypass authentication (for bioblend, a master key -is set at the creation of the docker-compose.yml of the organisms) +A metadata file will be generated to summarize what actions have previously been taken inside a stack. ## Directory tree: -For every input organism, the script will create the following directories structure, or try to update it if it already exists. -It will update the files in the main directory to account for the new organisms that are getting integrated. +For every input organism, a dedicated directory is created. The script will create this directory and all subdirectories +required. + +If the user is adding new data to a species (for example adding another strain/sex's datasets to the same species), the directory tree will be updated +Directory tree structure: ``` /main_directory | @@ -37,19 +34,25 @@ It will update the files in the main directory to account for the new organisms | |---/nginx | | |---/conf | | |---/default.conf -| | -| |---/src_data -| | |---/genome -| | | |---/genus1_species1_strain_sex -| | | |---/vX.X -| | | |---/genus_species_vX.X.fasta -| | | -| | |---/annotation -| | | |---/genus1_species1_strain_sex -| | | |---/OGSX.X -| | | |---/OGSX.X.gff -| | | |---/OGSX.X_pep.fasta -| | | |---/OGSX.X_transcripts.fasta +| | +| |---/blast +| | |---/banks.yml +| | |---/links.yml +| | +| |---/docker_data # Data used internally by docker (do not delete!) +| | +| |---/src_data +| | |---/genome +| | | |---/genus1_species1_strain_sex +| | | |---/vX.X +| | | |---/genus_species_vX.X.fasta +| | | +| | |---/annotation +| | | |---/genus1_species1_strain_sex +| | | |---/OGSX.X +| | | |---/OGSX.X.gff +| | | |---/OGSX.X_pep.fasta +| | | |---/OGSX.X_transcripts.fasta | | | | | |---/tracks | | |---/genus1_species1_strain_sex @@ -59,52 +62,60 @@ It will update the files in the main directory to account for the new organisms | | | |---/docker-compose.yml | | -| |---/metada_genus1_species1.yml +| |---/metada_genus1_species1.yml (WIP) | |---/metadata.yml | |---/traefik - |---/authelia - |---/users.yml - |---/configuration.yml + |---/docker-compose.yml + |---/authelia + |---/users.yml + |---/configuration.yml ``` ## Steps: For each input organism: -1) parsing the tabulated input -2) create the docker-compose.yml for the organism (+ default.conf and edit main_proxy nginx default.conf for docker-compose docker configuration) -3) create the directory tree structure (if it already exists, only create the required directories) -4) gather files in the "source data" directory tree, can recursively search the directory (by default, the source-data folder is fixed for phaeoexplorer data, this default fixed directory can be set in the attributes of the Autoload class in autoload.py. 
-5) link the source files to the organism correct src_data folders -6) modify headers in the transcripts and protein fasta files -7) generate blast banks (no commit) -8) start the containers -9) connect to the galaxy instance -10) run data integration galaxy steps (see @ http://gitlab.sb-roscoff.fr/abims/e-infra/gga) -11) generate and update metadata files +1) Create the directory tree structure (if it already exists, only create the required subdirectories) +2) Create the dockerfile for the organism and deploy the stack of services. If the dockerfile exists, skips this step +3) Gather source data files as specified in the input, can recursively search the directory (fully automated for local phaeoexplorer data) +4) Link the source files to the organism correct src_data folders +5) (*Optional*) Modify headers in the transcripts and protein fasta files +6) (*Optional*) TODO: Generate blast banks (no commit) +7) (*Optional*) Connect to the galaxy instance +8) (*Optional*) Run data integration galaxy steps (see @ http://gitlab.sb-roscoff.fr/abims/e-infra/gga) +9) (*Optional*) TODO: Generate and update metadata files + +## Usage: +```WIP``` -## Usage (production): -In progress +## Current limitations +When deploying the stack of services, the galaxy service takes a long time to be ready (around 2 hours of wait time). -## Requirements: -bioblend==0.13.0 -boto==2.49.0 -certifi==2019.11.28 -cffi==1.14.0 -chardet==3.0.4 -cryptography==2.8 -idna==2.9 -numpy==1.18.1 -pandas==1.0.3 -pycparser==2.20 -pyOpenSSL==19.1.0 -PySocks==1.7.1 -python-dateutil==2.8.1 -pytz==2019.3 -PyYAML==5.3.1 -requests==2.23.0 -requests-toolbelt==0.9.1 -six==1.14.0 -urllib3==1.25.7 -xlrd==1.2.0 +For the moment, the stacks deployment and the data loading into galaxy should be run separately (only once the galaxy service is ready). 
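
One way to avoid guessing when the galaxy service is up is to poll its root URL before starting the data loading step. The sketch below is an illustration only, not part of this patch: the `http://localhost:8888/sp/genus_species/galaxy/` URL is an assumption based on the ports published in `templates/traefik.yml` and the `PROXY_PREFIX` set in `templates/stack-organism.yml`, and setups with authentication enabled (authelia) may answer with something other than HTTP 200 even when galaxy is ready.

```
#!/usr/bin/env python3
# Readiness probe for the galaxy service (illustration, not part of this patch).
# Assumes traefik publishes the stack on port 8888 and galaxy is served under
# /sp/<genus_species>/galaxy/ (see templates/traefik.yml and templates/stack-organism.yml).
import time
import requests


def wait_for_galaxy(url, timeout=3 * 60 * 60, interval=60):
    """Poll the galaxy root URL until it answers with HTTP 200 or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=10).status_code == 200:
                return True
        except requests.exceptions.RequestException:
            pass  # stack still starting, keep waiting
        time.sleep(interval)
    return False


if __name__ == "__main__":
    url = "http://localhost:8888/sp/genus_species/galaxy/"
    print("galaxy ready" if wait_for_galaxy(url) else "galaxy still not answering")
```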
+ +To check the status of the galaxy service, run ```$ docker service logs -f genus_species_galaxy``` + +## Requirements (*temporary*): +``` +bioblend==0.14.0 +boto==2.49.0 +certifi==2019.11.28 +cffi==1.14.0 +chardet==3.0.4 +cryptography==2.8 +idna==2.9 +numpy==1.18.1 +pandas==1.0.3 +pycparser==2.20 +pyOpenSSL==19.1.0 +PySocks==1.7.1 +python-dateutil==2.8.1 +pytz==2019.3 +PyYAML==5.3.1 +requests==2.23.0 +requests-toolbelt==0.9.1 +six==1.14.0 +urllib3==1.25.7 +xlrd==1.2.0 +``` \ No newline at end of file diff --git a/__pycache__/docker_compose_generator.cpython-38.pyc b/__pycache__/docker_compose_generator.cpython-38.pyc index 3519791c440b492aee228a3785e1d32a58d2896a..1201041cefa429a4343e2513ccd1c57b142f50b8 100644 GIT binary patch delta 48 zcmca2az%tUl$V!_0SM&Q`^W#=$m_$(7_~WywVIJpVe>V%BTS4clV@{gFe*%D;R*l% DK`;#z delta 48 zcmca2az%tUl$V!_0SLqtRN{Ya<n>`?ytO%swVIJpaq~5{BTS5{lV@{gFe*-F;R*l% DGqnuS diff --git a/__pycache__/table_parser.cpython-38.pyc b/__pycache__/table_parser.cpython-38.pyc index 906e86c557512b57afb6d437208935bfd7159922..57f034e254bb97b21195f7f449cc828c52038074 100644 GIT binary patch delta 84 zcmZn@Z4>1U<>lpK0D{Mie(}s3dDk&9N^Rc5<jKOQG?|V4JEP;|f9%El5<msTY(Rp8 jiGzbfescrIM@CTrK0OXb9!3F12}TYk2_P-RD8~o@?U4?& delta 84 zcmZn@Z4>1U<>lpK00J=um3W_xyz7`4#W(L^@?>F@pUlSoozY?PKlWmNQJ{iiHXy;l j#KFO#xVeGjBcmuUp8*FW52FC11S1EN1dtYDlw$+{!x0V~ diff --git a/deploy.sh b/deploy.sh index 96d895e..16f4a33 100755 --- a/deploy.sh +++ b/deploy.sh @@ -1,10 +1,21 @@ #!/usr/bin/env bash -CURDIR="$PWD" -DOCKER_NETWORKS=$(docker network ls) -DOCKER_STACKS=$(docker stack ls) -docker network create --driver overlay ingress -docker network create --driver overlay traefikbig -docker swarm init -# SEPARATE traefik and gspecies !!! + +if ! docker swarm init | grep -q 'Error' +then + docker swarm init +fi + +cd $2 docker stack deploy -c docker-compose.yml traefik -docker stack deploy -c docker-compose.yml $1 \ No newline at end of file +cd .. + +if docker network ls | grep -q 'traefik' +then + docker stack deploy -c docker-compose.yml $1 +else + docker stack deploy -c docker-compose.yml $1 +fi + +cd $2 +docker stack deploy -c docker-compose.yml traefik +cd .. \ No newline at end of file diff --git a/autoload.py b/deploy_stacks.py similarity index 54% rename from autoload.py rename to deploy_stacks.py index 5041385..86a87b6 100755 --- a/autoload.py +++ b/deploy_stacks.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import bioblend import bioblend.galaxy.objects from bioblend import galaxy @@ -9,66 +11,115 @@ import sys import json import yaml import re -import metadata_generator, docker_compose_generator, table_parser +import metadata_generator +import docker_compose_generator +import table_parser import fnmatch import shutil +from datetime import datetime """ -gga_auto_load main script +gga_load_data main script Scripted integration of new data into GGA instances. The input is either a table-like (csv, xls, ...) or a json (TODO: yaml) file that describes what data is to be integrated (genus, species, sex, strain, data), see data_example.json for an example of the correct syntax. The script will parse the input and take care of everything, from source files directory tree creation to running the gmod tools inside the galaxy instances of organisms. -TODO: By default, the script will do everything needed to have a functional instance from scratch. If you want to bypass this behavior, +By default, the script will do everything needed to have a functional instance from scratch. 
If you want to bypass this behavior, you have to specify --update as a parameter. The script can also be used to update an existing GGA instance with new data. For example, you have an instance "genus_species" with data for the male sex and want to add the female sex to the same GGA instance. To do this, create your configuration input file as you would normally, and add the "--update" argument when invoking the script. +TODO: +- add config file (inside repo or outside with argument +- update existing history +- clean/delete instance +- delete stack +- commit the files for blast banks + TODO EOSC/Cloudification: -- keep in mind - divide into 2 general-use scripts - - create docker stack via swarm - - load data into libraries (method to load it at init, and a method/script to load it separately (galaxy_data_libs_SI does this already?) + - create docker swarm, stacks, etc... (docker side) + - load data into libraries (method to load it at init, and a method/script to load it separately (galaxy side) (alb: galaxy_data_libs_SI does this already?) STEPS: -- read input (xls or json) -- create dir_tree -- find and copy data -- change file headers, etc.. (ext scripts for data manipulation) -- generate blast banks and links -- generate and edit nginx confs -- generate dc and start the containers -- connect to instance and launch tools>workflows -- generate and update metadata -- exit +- read input (yml, maybe xlsx later) +- create dir_tree -- DONE +- find and copy data -- DONE +- change file headers, etc.. (ext scripts for data manipulation) -- IN PROGRESS +- generate blast banks and links -- NOT DONE +- generate and edit nginx confs -- DONE +- generate dc and start the containers -- IN PROGRESS +- connect to instance and launch tools>workflows -- IN PROGRESS +- generate and update metadata -- IN PROGRESS + + +NOTES: +- A master API key cannot be used, as some functions are tied to a user (like creating an history), so the access to the + galaxy instance must be done using email and password + """ +def parse_input(input_file): + """ + Parse the yml, json or tabulated input in order to set attributes for the Autoload class + + :param input_file: + :return: + """ + + parsed_sp_dict_list = [] + + if str(input_file).endswith("yml") or str(input_file).endswith("yaml"): + logging.debug("Input format used: YAML") + else: + logging.critical("Error, please input a YAML file") + sys.exit() + with open(input_file, 'r') as stream: + try: + yaml_dict = yaml.safe_load(stream) + for k, v in yaml_dict.items(): + parsed_sp_dict_list.append(v) + except yaml.YAMLError as exc: + logging.debug(exc) + return parsed_sp_dict_list + + class Autoload: """ - Autoload class contains attributes and functions to interact with GGA + The "Autoload" class contains attributes and functions to interact with the galaxy container of the GGA environment + + """ - def __init__(self, species_parameters_dictionary, args): - self.species_parameters_dictionary = species_parameters_dictionary + def __init__(self, parameters_dictionary, args): + self.parameters_dictionary = parameters_dictionary self.args = args - self.species = species_parameters_dictionary["species"] - self.genus = species_parameters_dictionary["genus"] - self.strain = species_parameters_dictionary["strain"] - self.sex = species_parameters_dictionary["sex"] - self.common = species_parameters_dictionary["common"] - self.date = species_parameters_dictionary["date"] - self.origin = species_parameters_dictionary["origin"] - self.performed = 
species_parameters_dictionary["performed by"] - self.genome_version = species_parameters_dictionary["genome version"] - self.ogs_version = species_parameters_dictionary["ogs version"] + self.species = parameters_dictionary["description"]["species"] + self.genus = parameters_dictionary["description"]["genus"] + self.strain = parameters_dictionary["description"]["strain"] + self.sex = parameters_dictionary["description"]["sex"] + self.common = parameters_dictionary["description"]["common_name"] + self.date = datetime.today().strftime("%Y-%m-%d") + self.origin = parameters_dictionary["description"]["origin"] + self.performed = parameters_dictionary["data"]["performed_by"] + if parameters_dictionary["data"]["genome_version"] == "": + self.genome_version = "1.0" + else: + self.genome_version = parameters_dictionary["data"]["genome_version"] + if parameters_dictionary["data"]["ogs_version"] == "": + self.ogs_version = "1.0" + else: + self.ogs_version = parameters_dictionary["data"]["ogs_version"] self.genus_lowercase = self.genus[0].lower() + self.genus[1:] + self.genus_uppercase = self.genus[0].upper() + self.genus[1:] + self.species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex]) self.full_name = " ".join([self.genus_lowercase, self.species, self.strain, self.sex]) self.abbreviation = " ".join([self.genus_lowercase[0], self.species, self.strain, self.sex]) self.genus_species = self.genus_lowercase + "_" + self.species - self.instance_url = "http://localhost/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/" # testing! + self.instance_url = "http://scratchgmodv1:8888/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/" # Testing with localhost/scratchgmodv1 self.instance = None self.history_id = None self.library_id = None @@ -84,89 +135,83 @@ class Autoload: self.workflow_name = None self.docker_compose_generator = None self.metadata = dict() - self.source_data_dir = "/projet/sbr/phaeoexplorer" + self.api_key = "dev" # TODO: set the key in config file --> saved for later (master api key access actions are limited) + if parameters_dictionary["data"]["parent_directory"] == "" or parameters_dictionary["data"]["parent_directory"] == "/path/to/closest/parent/dir": + self.source_data_dir = "/projet/sbr/phaeoexplorer/" # Testing path for phaeoexplorer data + else: + self.source_data_dir = parameters_dictionary["data"]["parent_directory"] # Directory/subdirectories where data files are located (fasta, gff, ...), point to a directory as close as possible to the source files self.do_update = False - # Update the instance (in histories corresponding to the input) instead of creating a new one TODO: move this variable inside methods + # Update the instance (in histories corresponding to the input) instead of creating a new one // TODO: move this variable inside methods self.api_key = "dev" - # Api key used to communicate with the galaxy instance. Set to "dev" for the moment TODO: find a way to create, store then use the api key safely + # API key used to communicate with the galaxy instance. 
Set to "dev" for the moment // TODO: find a way to create, store then use the api key safely + + + # def get_source_data(self, max_depth): + # """ + # TODO: saved for later just in case + # + # Find and copy source data files to src_data directory tree + # - recursively search for the correct files (within a fixed max depth) + # - requires the organism src_data directory tree to already be properly created for the organism (run generate_dir_tree) + # - the source files must have "transcripts", "proteins"/"pep", "genome" in their name, and a gff extension + # + # """ + # src_data_dir = os.path.join(self.species_dir, "/src_data") + # sp_regex = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # example with VARIABLE + # + # # The regex works using the species attribute (unique) --> regex is probably not necessary + # sp_regex = "" + # for i in self.species: + # sp_regex = sp_regex + "?=\w*" + i + ")" + # sp_regex = sp_regex + ")\w+" + # re_dict = dict() + # re_dict["gff"] = None + # re_dict["transcripts"] = None + # re_dict["proteins"] = None + # re_dict["genome"] = None + # reg = None + # + # for dirpath, dirnames, files in os.walk(self.source_data_dir): + # for f in files: + # if self.species and self.sex in f: + # logging.info("File found") - def connect_to_instance(self): - """ - Test the connection to the galaxy instance for the current organism - Exit if we can't connect to the instance - """ - self.instance = galaxy.GalaxyInstance(url=self.instance_url, key=self.api_key) - logging.info("connection to the galaxy instance ...") - try: - self.instance.histories.get_histories() - self.tool_panel = self.instance.tools.get_tool_panel() - except bioblend.ConnectionError: - logging.info("cannot connect to galaxy instance @ " + self.instance_url) - sys.exit() - else: - logging.info("successfully connected to galaxy instance @ " + self.instance_url) - - def get_source_data(self, max_depth): + def generate_dir_tree(self): """ - OBSOLETE + Generate the directory tree for an organism and move datasets into src_data - Find and copy source data files to src_data directory tree - - recursively search for the correct files (within a fixed max depth) - - requires the organism src_data directory tree to already be properly created for the organism (run generate_dir_tree) - - the source files must have "transcripts", "proteins"/"pep", "genome" in their name, and a gff extension + TODO: DOCKER -- this is the one the "docker" parts of the script + :return: """ - src_data_dir = os.path.join(self.species_dir, "/src_data") - - sp_regex = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # example with VARIABLE - - # The regex works using the species attribute (unique) --> regex is probably not necessary - sp_regex = "" - for i in self.species: - sp_regex = sp_regex + "?=\w*" + i + ")" - sp_regex = sp_regex + ")\w+" - re_dict = dict() - re_dict["gff"] = None - re_dict["transcripts"] = None - re_dict["proteins"] = None - re_dict["genome"] = None - reg = None - - for dirpath, dirnames, files in os.walk(self.source_data_dir): - for f in files: - if self.species and self.sex in f: - print("File found") - def generate_dir_tree(self): - """ - Generate the directory tree for an organism and move datasets into src_data - """ - os.chdir(self.main_dir) self.main_dir = os.getcwd() + "/" self.species_dir = os.path.join(self.main_dir, self.genus_species) + "/" try: os.mkdir(self.species_dir) except FileExistsError: - logging.debug("directory " + self.species_dir + " already exists") 
+ logging.debug("Directory " + self.species_dir + " already exists") try: os.chdir(self.species_dir) working_dir = os.getcwd() except OSError: - logging.info("cannot access " + self.species_dir + ", run with higher privileges") + logging.critical("Cannot access " + self.species_dir + ", run with higher privileges") sys.exit() try: os.mkdir("./nginx/") os.mkdir("./nginx/conf") with open(os.path.abspath("./nginx/conf/default.conf"), 'w') as conf: - conf.write("server {\n\tlisten 80;\n\tserver_name ~.;\n\tlocation /download/ {\n\t\talias /project_data/; \n\t\tautoindex on;\n\t}\n}") # the nginx conf + conf.write("server {\n\tlisten 80;\n\tserver_name ~.;\n\tlocation /download/ {\n\t\talias /project_data/; \n\t\tautoindex on;\n\t}\n}") # The species nginx conf except FileExistsError: - logging.debug("nginx conf exists") + logging.debug("NginX conf exists") - # src_data_folders = ["annotation", "genome"] # directories to generate - species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex]) + # src_data_folders = ["annotation", "genome"] # The directories to generate + not_empty_attributes = filter_empty_not_empty_items([self.genus_lowercase, self.species, self.strain, self.sex])["not_empty"] + self.species_folder_name = "_".join(not_empty_attributes) + # self.species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex]) organism_annotation_dir, organism_genome_dir = None, None # Create src_data dir tree @@ -175,23 +220,22 @@ class Autoload: os.mkdir("./src_data/annotation") os.mkdir("./src_data/genome") os.mkdir("./src_data/tracks") - os.mkdir("./src_data/annotation/" + species_folder_name) - os.mkdir("./src_data/genome/" + species_folder_name) - os.mkdir("./src_data/annotation/" + species_folder_name + "/OGS" + self.ogs_version) - os.mkdir("./src_data/genome/" + species_folder_name + "/v" + self.genome_version) - organism_annotation_dir = os.path.abspath("./src_data/annotation/" + species_folder_name + "/OGS" + self.genome_version) - organism_genome_dir = os.path.abspath("./src_data/genome/" + species_folder_name + "/v" + self.genome_version) + os.mkdir("./src_data/annotation/" + self.species_folder_name) + os.mkdir("./src_data/genome/" + self.species_folder_name) + os.mkdir("./src_data/annotation/" + self.species_folder_name + "/OGS" + self.ogs_version) + os.mkdir("./src_data/genome/" + self.species_folder_name + "/v" + self.genome_version) + organism_annotation_dir = os.path.abspath("./src_data/annotation/" + self.species_folder_name + "/OGS" + self.genome_version) + organism_genome_dir = os.path.abspath("./src_data/genome/" + self.species_folder_name + "/v" + self.genome_version) except FileExistsError: if self.do_update: logging.info("Updating src_data directory tree") else: - logging.info("The src_data directory tree already exists") + logging.debug("The src_data directory tree already exists") except PermissionError: - logging.info("Insufficient permission to create src_data directory tree") + logging.critical("Insufficient permission to create src_data directory tree") sys.exit() - # Hard coded paths (find a way to get the files by adding an attribute "path_to_repo") - # Write with string? 
+ # Path to the templates used to generate the custom docker-compose files for an input species stack_template_path = self.script_dir + "/templates/stack-organism.yml" traefik_template_path = self.script_dir + "/templates/traefik.yml" authelia_config_path = self.script_dir + "/templates/authelia_config.yml" @@ -205,82 +249,86 @@ class Autoload: with open(stack_template_path, 'r') as infile: organism_content = list() for line in infile: - # Replace placeholders by the genus and species + # One-liner to replace placeholders by the genus and species organism_content.append( line.replace("genus_species", str(self.genus.lower() + "_" + self.species)).replace("Genus species", str(self.genus + " " + self.species)).replace("Genus/species", str(self.genus + "/" + self.species)).replace("gspecies", str( self.genus.lower()[0] + self.species)).replace("genus_species_strain_sex", genus_species_strain_sex)) with open("./docker-compose.yml", 'w') as outfile: for line in organism_content: outfile.write(line) - subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir) + subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Create mounts for the containers try: os.mkdir("../traefik") os.mkdir("../traefik/authelia") shutil.copy(authelia_config_path, "../traefik/authelia/configuration.yml") - shutil.copy(authelia_users_path, "../traefik/authelia/users.yml") - # with open(traefik_template_path, 'r') as infile: - # traefik_content = list() - # for line in infile: - # # Replace placeholders by the genus and species (there are none) - # traefik_content.append( - # line.replace("genus_species", str(self.genus.lower() + "_" + self.species)).replace("Genus species", str(self.genus + " " + self.species)).replace("Genus/species", str(self.genus + "/" + self.species)).replace("gspecies", str(self.genus.lower()[0] + self.species)).replace("genus_species_strain_sex", genus_species_strain_sex)) - # with open("../traefik/docker-compose.yml", 'w') as outfile: - # for line in traefik_content: - # outfile.write(line) - subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir) + shutil.copy(authelia_users_path, "../traefik/authelia/users.yml") # TODO: custom users (add a config file?) + subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Create mounts for the containers except FileExistsError: - logging.debug("SKIP: Traefik directory already exists") + logging.debug("Traefik directory already exists") + try: + shutil.copy(traefik_template_path, "../traefik/docker-compose.yml") + except FileExistsError: + logging.debug("Traefik compose file already exists") + subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir) + def get_source_data_files(self): + """ + Find all files in source_data directory, to link the matching files in the src_data dir tree + :return: + """ - # Create volumes for the containers (script written by A. 
Bretaudeau) - subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir) + try: + os.chdir(self.species_dir) + working_dir = os.getcwd() + except OSError: + logging.critical("Cannot access " + self.species_dir + ", run with higher privileges") + sys.exit() + organism_annotation_dir = os.path.abspath("./src_data/annotation/" + self.species_folder_name + "/OGS" + self.genome_version) + organism_genome_dir = os.path.abspath("./src_data/genome/" + self.species_folder_name + "/v" + self.genome_version) - # Find all files in source_data directory, to link the matching files in the src_data dir tree - # Can be turned into a generator for performance - # TODO: cp data files method in a separate function (for EOSC) for dirpath, dirnames, files in os.walk(self.source_data_dir): - if "0" in str(dirpath): # ensures to take the correct files (other dirs hold files with the correct names, but I don't know if they are the same - for f in files: + if "0" in str(dirpath): # Ensures to take the correct files (other dirs hold files with the correct names, but I don't know if they are the same) #alb + for f in files: + if "Contaminants" not in str(f): try: if fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + ".fa"): - logging.info("genome assembly file: " + str(f)) + logging.info("Genome assembly file - " + str(f)) organism_genome_dir = organism_genome_dir + "/" + f os.symlink(os.path.join(dirpath, f), organism_genome_dir) - organism_genome_dir = os.path.abspath("./src_data/genome/" + species_folder_name + "/v" + self.genome_version) + organism_genome_dir = os.path.abspath("./src_data/genome/" + self.species_folder_name + "/v" + self.genome_version) elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + ".gff"): - logging.info("gff file: " + str(f)) + logging.info("GFF file - " + str(f)) organism_annotation_dir = organism_annotation_dir + "/" + f os.symlink(os.path.join(dirpath, f), organism_annotation_dir) - organism_annotation_dir = os.path.abspath("./src_data/annotation/" + species_folder_name + "/OGS" + self.genome_version) + organism_annotation_dir = os.path.abspath("./src_data/annotation/" + self.species_folder_name + "/OGS" + self.genome_version) elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + "_transcripts-gff.fa"): - logging.info("transcripts file: " + str(f)) + logging.info("Transcripts file - " + str(f)) organism_annotation_dir = organism_annotation_dir + "/" + f os.symlink(os.path.join(dirpath, f), organism_annotation_dir) - organism_annotation_dir = os.path.abspath("./src_data/annotation/" + species_folder_name + "/OGS" + self.genome_version) + organism_annotation_dir = os.path.abspath("./src_data/annotation/" + self.species_folder_name + "/OGS" + self.genome_version) elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + "_proteins.fa"): - logging.info("proteins file: " + str(f)) + logging.info("Proteins file - " + str(f)) organism_annotation_dir = organism_annotation_dir + "/" + f os.symlink(os.path.join(dirpath, f), organism_annotation_dir) - organism_annotation_dir = os.path.abspath("./src_data/annotation/" + species_folder_name + "/OGS" + self.genome_version) + organism_annotation_dir = os.path.abspath("./src_data/annotation/" + self.species_folder_name + "/OGS" + self.genome_version) + except FileExistsError: + logging.warning("Error raised (FileExistsError)") except TypeError: - pass - - # Launch and update docker stacks (cf docs) TODO: deploy method in a separate function (for 
EOSC) - # deploy_script_path = self.script_dir + "/deploy.sh" - # subprocess.call(["sh", deploy_script_path, self.genus_species]) + logging.warning("Error raised (TypeError)") + except NotADirectoryError: + logging.warning("Error raised (NotADirectoryError)") - def write_nginx_conf(self): + def deploy_stack(self): """ - OBSOLETE: compose method - Generate (and update nginx) conf files to add new organisms from the proxy + Call the script "deploy.sh" used to initiliaze the swarm cluster if needed and launch/update the stack :return: """ - nginx_proxy_path = "" # nginx conf template for the main proxy (needs to be updated for each new organism integration) - nginx_organism_path = "" # nginx conf template for the current organism (used once) - docker_proxy_template_path = "" # dockerfile for the main proxy (used once) + # Launch and update docker stacks (cf docs) + # TODO: add a fail condition? + subprocess.call(["sh", self.script_dir + "/deploy.sh", self.genus_species, self.main_dir + "/traefik"]) def modify_fasta_headers(self): """ @@ -293,7 +341,8 @@ class Autoload: os.chdir(self.species_dir) working_dir = os.getcwd() except OSError: - logging.info("cannot access " + self.species_dir + ", run with higher privileges") + logging.info("Cannot access " + self.species_dir + ", run with higher privileges") + logging.info("Fatal error: exit") sys.exit() self.source_files = dict() annotation_dir, genome_dir = None, None @@ -323,7 +372,7 @@ class Autoload: # test version # modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh", # self.source_files["proteins_file"]] - logging.info("changing fasta headers in " + self.source_files["proteins_file"]) + logging.info("Changing fasta headers: " + self.source_files["proteins_file"]) subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir) # production version modify_pep_headers = [str(self.main_dir) + "/gga_load_data/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh", @@ -331,7 +380,7 @@ class Autoload: # test version # modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_transcript_fasta_header.sh", # self.source_files["proteins_file"]] - logging.info("changing fasta headers in " + self.source_files["transcripts_file"]) + logging.info("Changing fasta headers: " + self.source_files["transcripts_file"]) subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir) # src_data cleaning @@ -346,37 +395,99 @@ class Autoload: def generate_blast_banks(self): """ - TODO Generate BLAST banks for the species + TODO + Automatically generate blast banks for a species + TODO: auto commit the files? 
+ + :return: + """ + + # @commit_files + def generate_blast_banks_and_commit(self): + """ + TODO + + :return: + """ + return None + + def commit_files(self): + """ + TODO + Commit files to a git repo + Commits to the gga repo for phaeoexplorer + TODO: add repo to config file + + :return: """ + return None + def connect_to_instance(self): + """ + TODO: move in init/access + + Test the connection to the galaxy instance for the current organism + Exit if it cannot connect to the instance + """ + self.instance = galaxy.GalaxyInstance(url=self.instance_url, email="admin@galaxy.org", password="password", verify=False) + logging.info("Connecting to the galaxy instance ...") + try: + self.instance.histories.get_histories() + self.tool_panel = self.instance.tools.get_tool_panel() + except bioblend.ConnectionError: + logging.critical("Cannot connect to galaxy instance @ " + self.instance_url) + sys.exit() + else: + logging.info("Successfully connected to galaxy instance @ " + self.instance_url) + self.instance.histories.create_history(name=str(self.full_name)) + + def setup_data_libraries(self): """ - - generate blast banks and docker-compose (TODO: separate function) + - generate blast banks and docker-compose - load data into the galaxy container with the galaxy_data_libs_SI.py script :return: """ try: - logging.info("loading data into the galaxy container") - subprocess.run("docker-compose exec galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py", - stdout=subprocess.PIPE, + logging.info("Loading data into the galaxy container") + subprocess.run("../serexec genus_species_galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py", shell=True) except subprocess.CalledProcessError: - logging.info("cannot load data into container for " + self.full_name) + logging.info("Cannot load data into the galaxy container for " + self.full_name) pass else: - logging.info("data successfully loaded into docker container for " + self.full_name) + logging.info("Data successfully loaded into the galaxy container for " + self.full_name) + + self.get_species_history_id() + # self.get_instance_attributes() + # + # # import all datasets into current history + # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"]) + # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"]) + # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"]) + # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"]) + + def get_species_history_id(self): + """ + Set and return the current species history id in its galaxy instance + + :return: + """ + histories = self.instance.histories.get_histories(name=str(self.full_name)) + self.history_id = histories[0]["id"] + self.instance.histories.show_history(history_id=self.history_id) - self.get_instance_attributes() - # self.history_id = self.instance.histories.get_current_history()["id"] + return self.history_id - # import all datasets into current history - self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"]) - self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"]) - self.instance.histories.upload_dataset_from_library(history_id=self.history_id, 
lib_dataset_id=self.datasets["transcripts_file"]) - self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"]) + def create_species_history(self): + histories = self.instance.histories.get_histories(name=str(self.full_name)) + print("\n" + str(histories) + "\n" + self.full_name + "\n") + if not histories: + self.instance.histories.create_history(name=str(self.full_name)) + print("Created history!") def get_instance_attributes(self): """ @@ -404,7 +515,7 @@ class Autoload: current_folder_name = v if k == "id": folders_ids[current_folder_name] = v - logging.info("folders and datasets IDs: ") + logging.info("Folders and datasets IDs: ") self.datasets = dict() for k, v in folders_ids.items(): logging.info("\t" + k + ": " + v) @@ -435,6 +546,87 @@ class Autoload: self.datasets["gff_file"] = e["ldda_id"] logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + def init_instance(self): + """ + Galaxy instance startup in preparation for running workflows + - remove Homo sapiens from the chado database. + - add organism and analyses into the chado database --> separate + - get any other existing organisms IDs before updating the galaxy instance --> separate + + TODO: move the library and analysis/data stuff to a separate function + :return: + """ + + self.connect_to_instance() + self.get_species_history_id() + histories = self.instance.histories.get_histories(name=str(self.full_name)) + # Create the first history + if not histories: + self.instance.histories.create_history(name=str(self.full_name)) + self.history_id = histories[0]["id"] + logging.debug("history ID: " + self.history_id) + # libraries = self.instance.libraries.get_libraries() # routine check: one library + # self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library + logging.debug("library ID: " + self.history_id) + instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id) + + # Delete Homo sapiens from Chado database + logging.debug("Getting 'Homo sapiens' ID in instance's chado database") + get_sapiens_id_job = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2", + history_id=self.history_id, + tool_inputs={"genus": "Homo", "species": "sapiens"}) + get_sapiens_id_job_output = get_sapiens_id_job["outputs"][0]["id"] + get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output) + try: + logging.debug("Deleting Homo 'sapiens' in the instance's chado database") + get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0] + sapiens_id = str( + get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool + self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2", + history_id=self.history_id, + tool_inputs={"organism": str(sapiens_id)}) + except bioblend.ConnectionError: + logging.debug("Homo sapiens isn't in the instance's chado database") + except IndexError: + logging.debug("Homo sapiens isn't in the instance's chado database") + pass + + # TODO: the following actions should be done in a separate function (in case if the user wants to do everything him/herself -- for EOSC) + # Add organism (species) to chado + logging.info("Adding organism to the instance's chado database") + self.instance.tools.run_tool( + 
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2", + history_id=self.history_id, + tool_inputs={"abbr": self.abbreviation, + "genus": self.genus, + "species": self.species, + "common": self.common}) + # Add OGS analysis to chado + logging.info("Adding OGS analysis to the instance's chado database") + self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", + history_id=self.history_id, + tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version, + "program": "Performed by Genoscope", + "programversion": str("OGS" + self.ogs_version), + "sourcename": "Genoscope", + "date_executed": self.date}) + + # Add genome analysis to chado + logging.info("Adding genome analysis to the instance's chado database") + self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", + history_id=self.history_id, + tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version, + "program": "Performed by Genoscope", + "programversion": str("genome v" + self.genome_version), + "sourcename": "Genoscope", + "date_executed": self.date}) + self.get_organism_and_analyses_ids() + logging.info("Finished initializing instance") + def run_workflow(self, workflow_name, workflow_parameters, datamap): """ Run the "main" workflow in the galaxy instance @@ -483,10 +675,10 @@ class Autoload: workflow_attributes = self.instance.workflows.get_workflows(name=self.workflow_name) workflow_id = workflow_attributes[0]["id"] show_workflow = self.instance.workflows.show_workflow(workflow_id=workflow_id) - logging.debug("workflow ID: " + workflow_id) + logging.debug("Workflow ID: " + workflow_id) - logging.debug("inputs:") - logging.debug(show_workflow["inputs"]) + logging.debug("Inputs:") + logging.debug(show_workflow["Inputs"]) self.instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=self.history_id, params=workflow_parameters, @@ -494,75 +686,16 @@ class Autoload: inputs_by="") self.instance.workflows.delete_workflow(workflow_id=workflow_id) - def init_instance(self): + def load_data_in_galaxy(self): """ - Galaxy instance startup in preparation for running workflows - - remove Homo sapiens from the chado database. 
- - add organism and analyses into the chado database - - get any other existing organisms IDs before updating the galaxy instance + Function to load the src_data folder in galaxy :return: """ - self.instance.histories.create_history(name=str(self.full_name)) - histories = self.instance.histories.get_histories(name=str(self.full_name)) - self.history_id = histories[0]["id"] - logging.debug("history ID: " + self.history_id) - libraries = self.instance.libraries.get_libraries() # routine check: one library - self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library - logging.debug("library ID: " + self.history_id) - instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id) - - # Delete Homo sapiens from Chado database - logging.info("getting sapiens ID in instance's chado database") - get_sapiens_id_job = self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2", - history_id=self.history_id, - tool_inputs={"genus": "Homo", "species": "sapiens"}) - get_sapiens_id_job_output = get_sapiens_id_job["outputs"][0]["id"] - get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output) - try: - logging.info("deleting Homo sapiens in the instance's chado database") - get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0] - sapiens_id = str(get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool - self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2", - history_id=self.history_id, - tool_inputs={"organism": str(sapiens_id)}) - except bioblend.ConnectionError: - logging.debug("Homo sapiens isn't in the instance's chado database") - except IndexError: - logging.debug("Homo sapiens isn't in the instance's chado database") - pass - - # Add organism (species) to chado - logging.info("adding organism to the instance's chado database") - self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2", - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus, - "species": self.species, - "common": self.common}) - # Add OGS analysis to chado - logging.info("adding OGS analysis to the instance's chado database") - self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", - history_id=self.history_id, - tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version, - "program": "Performed by Genoscope", - "programversion": str("OGS" + self.ogs_version), - "sourcename": "Genoscope", - "date_executed": self.date}) - - # Add genome analysis to chado - logging.info("adding genome analysis to the instance's chado database") - self.instance.tools.run_tool(tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", - history_id=self.history_id, - tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version, - "program": "Performed by Genoscope", - "programversion": str("genome v" + self.genome_version), - "sourcename": "Genoscope", - "date_executed": self.date}) + logging.info("Loading data in galaxy") - self.get_organism_and_analyses_ids() - logging.info("finished initializing instance") + return None def 
get_organism_and_analyses_ids(self): """ @@ -610,15 +743,24 @@ class Autoload: except IndexError: logging.debug("no matching genome analysis exists in the instance's chado database") - def clean_instance(self): """ - TODO: function to purge the instance from analyses and organisms + TODO: method to purge the instance from analyses and organisms :return: """ return None +def filter_empty_not_empty_items(li): + ret = {"empty": [], "not_empty": []} + for i in li: + if i is None or i == "": + ret["empty"].append(i) + else: + ret["not_empty"].append(i) + return ret + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction with galaxy instances for GGA" ", following the protocol @ " @@ -645,7 +787,7 @@ if __name__ == "__main__": action="store_true") # Production arguments - parser.add_argument("input", type=str, help="Input table (tabulated file that describes all data) or json file") + parser.add_argument("input", type=str, help="Input file (yml)") parser.add_argument("-v", "--verbose", help="Increase output verbosity", action="store_false") @@ -664,68 +806,80 @@ if __name__ == "__main__": else: logging.basicConfig(level=logging.INFO) - if str(args.input).endswith(".json"): - print("JSON") - input_json = args.input - else: - print("TABLE") - tp = table_parser.TableParser() - logging.info("parsing input table") - tp.table = args.input - input_json = tp.parse_table(mode="simple", method="table_to_json") - sp_dict_list = list() - with open(input_json, 'r') as infile: - json_sp_dict = json.load(infile) - json_sp_dump = json.dumps(json_sp_dict, indent=4, sort_keys=True) - for json_sp in json_sp_dict: - sp_dict_list.append(json_sp) - - metadata = {} + logging.info("Start") + sp_dict_list = parse_input(args.input) for sp_dict in sp_dict_list: - al = Autoload(species_parameters_dictionary=sp_dict, args=args) + al = Autoload(parameters_dictionary=sp_dict, args=args) al.main_dir = os.path.abspath(args.dir) if args.load_data: + al.generate_dir_tree() - if args.init_instance: - logging.info("initializing the galaxy instance") - al.init_instance() - al.get_instance_attributes() - # metadata[genus_species_strain_sex]["initialized"] = True - if args.load_data: - logging.info("loading data into galaxy") - # al.load_data() - # metadata[genus_species_strain_sex]["data_loaded_in_instance"] = True - if args.run_main: - logging.info("running main workflow") - al.get_organism_and_analyses_ids() - workflow_parameters = dict() - workflow_parameters["0"] = {} - workflow_parameters["1"] = {} - workflow_parameters["2"] = {} - workflow_parameters["3"] = {} - workflow_parameters["4"] = {"organism": al.org_id, - "analysis_id": al.genome_analysis_id, - "do_update": "true"} - workflow_parameters["5"] = {"organism": al.org_id, - "analysis_id": al.ogs_analysis_id} - workflow_parameters["6"] = {"organism_id": al.org_id} - workflow_parameters["7"] = {"analysis_id": al.ogs_analysis_id} - workflow_parameters["8"] = {"analysis_id": al.genome_analysis_id} - workflow_parameters["9"] = {"organism_id": al.org_id} - workflow_parameters["10"] = {} - workflow_parameters["11"] = {} - - al.datamap = dict() - al.datamap["0"] = {"src": "hda", "id": al.datasets["genome_file"]} - al.datamap["1"] = {"src": "hda", "id": al.datasets["gff_file"]} - al.datamap["2"] = {"src": "hda", "id": al.datasets["proteins_file"]} - al.datamap["3"] = {"src": "hda", "id": al.datasets["transcripts_file"]} - - al.run_workflow(workflow_name="main", workflow_parameters=workflow_parameters, 
datamap=al.datamap) - # metadata[genus_species_strain_sex]["workflows_run"] = metadata[genus_species_strain_sex]["workflows_run"].append("main") - - if args.link_source: - print('SOURCE DATA HANDLE') - al.generate_dir_tree() - print(al.main_dir) - print(al.species_dir) + logging.info("Successfully generated the directory tree for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) + + al.get_source_data_files() + logging.info("Successfully retrieved source data files for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) + + al.deploy_stack() + logging.info("Successfully deployed containers stack for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) + + # al.connect_to_instance() + # logging.info("Connected to instance") + # + # al.create_species_history() + # logging.info("Created a history") + # + # al.setup_data_libraries() + # logging.info("Setting up data libraries") + + # al.init_instance() + # logging.info("Successfully initialized instance for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) + + # al.setup_data_libraries() + # logging.info("Successfully set up data libraries in galaxy for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) + + + # if args.init_instance: + # logging.info(" Initializing the galaxy instance") + # al.init_instance() + # al.get_instance_attributes() + # # metadata[genus_species_strain_sex]["initialized"] = True + # if args.load_data: + # logging.info("Loading data into galaxy") + # # al.load_data() + # # metadata[genus_species_strain_sex]["data_loaded_in_instance"] = True + # if args.run_main: + # logging.info("Running main workflow") + # al.get_organism_and_analyses_ids() + # workflow_parameters = dict() + # workflow_parameters["0"] = {} + # workflow_parameters["1"] = {} + # workflow_parameters["2"] = {} + # workflow_parameters["3"] = {} + # workflow_parameters["4"] = {"organism": al.org_id, + # "analysis_id": al.genome_analysis_id, + # "do_update": "true"} + # workflow_parameters["5"] = {"organism": al.org_id, + # "analysis_id": al.ogs_analysis_id} + # workflow_parameters["6"] = {"organism_id": al.org_id} + # workflow_parameters["7"] = {"analysis_id": al.ogs_analysis_id} + # workflow_parameters["8"] = {"analysis_id": al.genome_analysis_id} + # workflow_parameters["9"] = {"organism_id": al.org_id} + # workflow_parameters["10"] = {} + # workflow_parameters["11"] = {} + # + # al.datamap = dict() + # al.datamap["0"] = {"src": "hda", "id": al.datasets["genome_file"]} + # al.datamap["1"] = {"src": "hda", "id": al.datasets["gff_file"]} + # al.datamap["2"] = {"src": "hda", "id": al.datasets["proteins_file"]} + # al.datamap["3"] = {"src": "hda", "id": al.datasets["transcripts_file"]} + # + # al.run_workflow(workflow_name="main", workflow_parameters=workflow_parameters, datamap=al.datamap) + # # metadata[genus_species_strain_sex]["workflows_run"] = metadata[genus_species_strain_sex]["workflows_run"].append("main") + # + # if args.link_source: + # print('DEV') + # al.generate_dir_tree() + # print(al.main_dir) + # print(al.species_dir) + + logging.info("Exit") diff --git a/docker_compose_generator.py b/docker_compose_generator.py index aa51d14..d5fe776 100755 --- a/docker_compose_generator.py +++ b/docker_compose_generator.py @@ -7,10 +7,10 @@ import logging """ docker-compose.yml generator -The method "generate" works for both docker-compose architecture (old), or docker stacks (new) +The method "generate" works for both 
docker-compose architecture (old), or docker stack (new) This method will write a formatted docker-compose.yml for the specified organism (only requires genus and species) -Made to work in the integration streamlined script "autoload.py" but can be used as a standalone (either with a CLI +Made to work in the integration streamlined script "deploy_stacks.py" but can be used as a standalone (either with a CLI or in another python file as a module) Dockerfiles are specific to genus-species: a same organism can have several strains and sexes integrated, but only one diff --git a/examples/example.yml b/examples/example.yml new file mode 100644 index 0000000..ff06661 --- /dev/null +++ b/examples/example.yml @@ -0,0 +1,60 @@ +# Config file for the automated creation GGA docker stacks +# The file consists in a list of species for which the script will have to create stacks and load data into galaxy + +ectocarpus_sp1: # Dummy value the user gives to designate the species (isn't used by the script) + # Species description, leave blank if unknown or you don't want it to be used + # These parameters are used to set up the various urls and adresses in different containers + # The script requires at least the genus to be specified + description: + genus: "ectocarpus" # Mandatory! + species: "sp1" + sex: "male" + strain: "" + common_name: "" + origin: "" + # Data files. + # WARNING: The paths must be absolute paths! + # If any path is left blank and the "parent_directory" scalar is specified, this directory and ALL its subdirectories will be + # scanned for files corresponding to the description provided for the species (i.e if the user specified + # the sex and strain, the script will look for files containing the genus, species, sex and strain of the species) + # If no file corresponding to the description is found, this path will be considered empty and the script will + # proceed to the next step (create the directory tree for the GGA docker stack) + # If a path is left blank and the "parent_directory" scalar is also blank, no file will be loaded for this "path" scalar + # If the files are not named using this nomenclature, please provide all the paths in the corresponding scalars below + data: + # "parent_directory": (optional) directory from where to search files if a "***_path" scalar is empty + # NOTE: Try to set a parent directory "close" to the data files so as not to increase runtime + # If empty (""), the script will not search for files and no dataset will be loaded for the corresponding scalar + parent_directory: "/path/to/closest/parent/dir" + # "***_path": path to the file (optional if parent_directory is set and species "description" scalars are precised) + genome_path: "/path/to/fasta" + transcripts_path: "/path/to/fasta" + proteins_path: "/path/to/fasta" + gff_path: "/path/to/gff" + # If the user has several genomes to upload to galaxy, the next scalar is used by the script to differentiate + # between these different versions and name directories according to it. 
+ # If left empty, the genome will be considered version "1.0" + genome_version: "1.0" + # Same as genome version, but for the analysis + ogs_version: "" + performed_by: "" + +# Second example without the comments doc +ectocarpus_sp2: + description: + genus: "ectocarpus" + species: "sp2" + sex: "male" + strain: "" + common_name: "" + origin: "" + data: + parent_directory: "/path/to/closest/parent/dir" + genome_path: "/path/to/fasta" + transcripts_path: "/path/to/fasta" + proteins_path: "/path/to/fasta" + gff_path: "/path/to/gff" + genome_version: "1.0" + ogs_version: "1.0" + performed_by: "" + diff --git a/examples/phaeoexplorer_test.json b/examples/phaeoexplorer_test.json index abf3792..bd281b9 100644 --- a/examples/phaeoexplorer_test.json +++ b/examples/phaeoexplorer_test.json @@ -26,3 +26,5 @@ "date" : "2020-08-03" } ] + + diff --git a/load_data.py b/load_data.py new file mode 100644 index 0000000..ed53876 --- /dev/null +++ b/load_data.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +import bioblend +import bioblend.galaxy.objects +from bioblend import galaxy +import logging +import sys +import deploy_stacks + + +""" +load_data.py + +Load data in a galaxy container. + + + +""" + + +class LoadData: + def __init__(self, autoload_instance): + self.instance = None + + + diff --git a/requirements.txt b/requirements.txt index 35c00c4..3a3610f 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -bioblend==0.13.0 +bioblend==0.14.0 boto==2.49.0 certifi==2019.11.28 cffi==1.14.0 diff --git a/ext_scripts/setup_data_libraries.py b/setup_data_libraries.py similarity index 99% rename from ext_scripts/setup_data_libraries.py rename to setup_data_libraries.py index a81de1a..c46c548 100644 --- a/ext_scripts/setup_data_libraries.py +++ b/setup_data_libraries.py @@ -66,7 +66,7 @@ class DataLibLoader: url = "http://localhost" # The environment variables are set by the parent container admin_email = os.environ.get('GALAXY_DEFAULT_ADMIN_USER', 'admin@galaxy.org') - admin_pass = os.environ.get('GALAXY_DEFAULT_ADMIN_PASSWORD', 'admin') + admin_pass = os.environ.get('GALAXY_DEFAULT_ADMIN_PASSWORD', 'password') # Establish connection to galaxy instance gio = GalaxyInstance(url=url, email=admin_email, password=admin_pass) diff --git a/table_parser.py b/table_parser.py index 66e4346..9e55ecd 100755 --- a/table_parser.py +++ b/table_parser.py @@ -8,16 +8,19 @@ import logging from datetime import datetime """ -Table parser for phaeoexplorer data. Tested with xls and xlsx input format, should work with csv format as well +OBSOLETE + +Input parser script. 
Does not work for ods spreadsheets (save as xls or xlsx instead) --> need to handle with pandas_ods_reader (requires ezodf, lxml) Does not support multiple sheets (TODO: "integration" and "update" sheets (1 and 2)) See example toy table (toy_table.xls) -TODO: move it inside autoload +TODO: move this script inside autoload standalone usage: python3 table_parser.py <tabulated_file> -d <directory_to_write_json_to (default: cwd)> """ + class TableParser: def __init__(self, table_file, dir): @@ -59,8 +62,6 @@ class TableParser: json_content.append(organism_dict) json.dump(json_content, json_file, indent=4) - - def write_json(self, data, filename): with open(filename, 'w') as f: json.dump(data, f, indent=4) diff --git a/templates/stack-organism.yml b/templates/stack-organism.yml index 0e20fe9..b836476 100644 --- a/templates/stack-organism.yml +++ b/templates/stack-organism.yml @@ -126,7 +126,8 @@ services: GALAXY_CONFIG_ALLOW_LIBRARY_PATH_PASTE: "True" GALAXY_CONFIG_USE_REMOTE_USER: "True" GALAXY_CONFIG_REMOTE_USER_MAILDOMAIN: "sb-roscoff.fr" - GALAXY_CONFIG_ADMIN_USERS: "admin@galaxy.org,gga@sb-roscoff.fr,lgueguen@sb-roscoff.fr, alebars@sb-roscoff.fr" # admin@galaxy.org is the default (leave it), gogepp@bipaa is a shared ldap user we use to connect + GALAXY_DEFAULT_ADMIN_PASSWORD: "password" + GALAXY_CONFIG_ADMIN_USERS: "admin@galaxy.org, gga@sb-roscoff.fr, lgueguen@sb-roscoff.fr, alebars@sb-roscoff.fr" # admin@galaxy.org is the default (leave it), gogepp@bipaa is a shared ldap user we use to connect GALAXY_CONFIG_MASTER_API_KEY: "dev" ENABLE_FIX_PERMS: 0 PROXY_PREFIX: /sp/genus_species/galaxy diff --git a/templates/traefik.yml b/templates/traefik.yml index cb4a6b4..f47766c 100755 --- a/templates/traefik.yml +++ b/templates/traefik.yml @@ -15,7 +15,7 @@ services: - "--entryPoints.webs.address=:443" - "--entryPoints.webs.forwardedHeaders.trustedIPs=192.168.1.133" # The ips of our upstream proxies: eci ports: -# - 8001:8080 # added by lg to debug, for dashboard + - 8001:8080 # added by lg to debug, for dashboard - 8888:80 - 8889:443 networks: -- GitLab
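
For reference, the docker-side half of this patch (directory tree, compose file generation, source data linking, stack deployment) can also be driven programmatically by mirroring the `__main__` block of `deploy_stacks.py`. The sketch below is an illustration under assumptions: the `examples/example.yml` input path, the current working directory as the main directory, and an empty `argparse.Namespace` for the unused `args` parameter are all placeholders, and the galaxy interaction steps are left out on purpose.

```
#!/usr/bin/env python3
# Sketch of driving the deployment steps from another script (illustration only).
# Mirrors the __main__ block of deploy_stacks.py; paths below are assumptions.
import os
from argparse import Namespace

import deploy_stacks

sp_dict_list = deploy_stacks.parse_input("examples/example.yml")
for sp_dict in sp_dict_list:
    al = deploy_stacks.Autoload(parameters_dictionary=sp_dict, args=Namespace())
    al.main_dir = os.path.abspath(".")   # parent dir that will hold genus_species/ and traefik/
    al.generate_dir_tree()               # directory tree, nginx conf, docker-compose from templates
    al.get_source_data_files()           # symlink genome/annotation files found under parent_directory
    al.deploy_stack()                    # swarm init (if needed) and docker stack deploy via deploy.sh
```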