Commit 87f9fb47 authored by Arthur Le Bars

Merge branch 'dev' of http://gitlab.sb-roscoff.fr/abims/e-infra/gga_load_data into workflow_v2

parents 8154c570 28a1d904
2 merge requests: !11 Workflow v2, !9 Release 2.0 (merge dev to master)
# Input file for the automated creation of GGA docker stacks
# The file consists of a list of species for which the script will create these stacks, load data into Galaxy and run workflows
# This file is internally turned into a list of dictionaries by the scripts
citrus_sinensis: # Dummy key to designate the species (not used by the script)
- name: citrus_sinensis
description:
# Species description, leave blank if unknown or you don't want it to be used
# These parameters are used to set up the various URLs and addresses in the different containers
@@ -13,11 +12,8 @@ citrus_sinensis:
strain: ""
common_name: ""
origin: ""
# the sex and strain, the script will look for files containing the genus, species, sex and strain of the species)
# If no file corresponding to the description is found, this path will be considered empty and the script will
# proceed to the next step (create the directory tree for the GGA docker stack)
data:
# Sequence of paths to the different datasets to copy and import into the galaxy container (as a shared library)
# Paths to the different datasets to copy and import into the galaxy container (as a shared library)
# Must be absolute paths to the dataset
genome_path: "/path/to/repo/examples/src_data/genome/v1.0/Citrus_sinensis-scaffold00001.fasta" # Mandatory!
transcripts_path: "/path/to/repo/examples/src_data/annotation/v1.0/Citrus_sinensis-orange1.1g015632m.g.fasta" # Mandatory!
@@ -35,8 +31,8 @@ citrus_sinensis:
ogs_version: "1.0"
performed_by: ""
services:
# Describe what optional services to deploy for the stack
# By default, only tripal, tripaldb and galaxy services will be deployed
# List the optional services to be deployed in the stack
# By default, only tripal, tripaldb, galaxy, jbrowse and elasticsearch services will be deployed
blast: "False"
wiki: "False"
apollo: "False"
\ No newline at end of file
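As the header comments note, the script turns this file into a list of per-species dictionaries. A minimal sketch of that loading step, assuming PyYAML and an input laid out like the example above (the helper name load_species_entries is illustrative, not part of the repository):

import logging
import sys

import yaml  # PyYAML


def load_species_entries(input_file):
    # Parse the YAML input and return the per-species dictionaries as a flat list
    with open(input_file, "r") as stream:
        try:
            content = yaml.safe_load(stream)
        except yaml.YAMLError as err:
            logging.critical("Input file is not in YAML format")
            sys.exit(err)
    # Top-level keys such as "citrus_sinensis" are only labels; the values carry the data
    entries = []
    for label, value in content.items():
        entries.extend(value if isinstance(value, list) else [value])
    return entries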
@@ -91,6 +91,12 @@ class GetData(speciesData.SpeciesData):
logging.info("src_data directory tree generated for %s" % self.full_name)
def get_last_modified_time_string(self, filePath):
# Return the last modification date of the file as a 'YYYYMMDD' string, e.g. '20190130'
lastModifiedTimestamp = os.path.getmtime(filePath)
lastModifiedTimeStructure = time.localtime(lastModifiedTimestamp)
lastModifiedDate = time.strftime("%Y%m%d", lastModifiedTimeStructure)
return lastModifiedDate
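# Illustrative usage (paths are placeholders, not part of the committed file): for an
# annotation file last modified on 30 January 2019,
#   self.get_last_modified_time_string("/path/to/annotation.gff")
# returns "20190130", the suffix appended to the renamed gff dataset below.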
def get_source_data_files_from_path(self):
"""
@@ -137,7 +143,7 @@ class GetData(speciesData.SpeciesData):
elif k in annotation_datasets:
dataset_fname = ""
if k == "gff_path":
dataset_fname = "{0}_OGS{1}.gff".format(self.dataset_prefix, self.ogs_version)
dataset_fname = "{0}_OGS{1}_{2}.gff".format(self.dataset_prefix, self.ogs_version, self.get_last_modified_time_string(os.path.abspath(v)))
elif k == "transcripts_path":
dataset_fname = "{0}_OGS{1}_transcripts.fasta".format(self.dataset_prefix, self.ogs_version)
elif k == "proteins_path":
@@ -194,7 +200,7 @@ if __name__ == "__main__":
parser.add_argument("-v", "--verbose",
help="Increase output verbosity",
action="store_false")
action="store_true")
parser.add_argument("--main-directory",
type=str,
@@ -206,7 +212,6 @@ if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
logging.getLogger("urllib3").setLevel(logging.WARNING)
if not args.main_directory:
args.main_directory = os.getcwd()
@@ -62,11 +62,10 @@ class DeploySpeciesStack(speciesData.SpeciesData):
# use the default banner instead
if "banner_path" in self.config.keys():
if not config["banner_path"] == "" and os.path.isfile(os.path.abspath(config["banner_path"])):
banner_dest_path = os.path.abspath("./banner.png")
logging.info("Custom banner path: %s" % self.config["banner_path"])
# if os.path.samefile(os.path.abspath(config["banner_path"]), banner_dest_path):
# shutil.copy(os.path.abspath(self.config["banner_path"]), banner_dest_path)
shutil.copy(os.path.abspath(self.config["banner_path"]), banner_dest_path)
banner_dest_path = os.path.join(self.species_dir, os.path.abspath("banner.png"))
if not os.path.isfile(banner_dest_path) or not os.path.samefile(os.path.abspath(config["banner_path"]),banner_dest_path):
os.symlink(os.path.abspath(self.config["banner_path"]), banner_dest_path)
logging.info("Custom banner added: symlink from %s" % self.config["banner_path"])
else:
logging.debug("Using default banner for Tripal pages")
self.config.pop("banner_path", None)
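# Editorial note, not part of the committed file: os.symlink() raises FileExistsError
# when the destination already exists, and os.path.samefile() follows symlinks, so the
# isfile()/samefile() check above skips re-linking when banner.png already points to
# the configured banner.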
@@ -321,7 +321,7 @@ if __name__ == "__main__":
parser.add_argument("-v", "--verbose",
help="Increase output verbosity",
action="store_false")
action="store_true")
parser.add_argument("--config",
type=str,
@@ -337,7 +337,6 @@ if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
logging.getLogger("urllib3").setLevel(logging.WARNING)
# Parsing the config file if provided, using the default config otherwise
if not args.config:
@@ -23,7 +23,7 @@ class SpeciesData:
return string
def __init__(self, parameters_dictionary):
# self.config_dictionary = None
self.name = parameters_dictionary["name"]
self.parameters_dictionary = parameters_dictionary
parameters_dictionary_description=parameters_dictionary["description"]
self.species = self.clean_string(parameters_dictionary_description["species"])
@@ -59,9 +59,6 @@ class SpeciesData:
self.genus_uppercase = self.genus[0].upper() + self.genus[1:]
self.chado_species_name = "{0} {1}".format(self.species, self.sex)
self.full_name = ' '.join(utilities.filter_empty_not_empty_items([self.genus_uppercase, self.species, self.strain, self.sex])["not_empty"])
self.full_name = self.full_name.replace("__", "_").replace("_ ", "_").replace(" _", "_")
if self.full_name.endswith("_") or self.full_name.endswith(" "):
self.full_name = self.full_name[0:-2]
self.full_name_lowercase = self.full_name.lower()
self.abbreviation = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase[0], self.species, self.strain, self.sex])["not_empty"])
@@ -96,14 +93,8 @@ class SpeciesData:
self.api_key = None # API key used to communicate with the galaxy instance. Cannot be used to do user-tied actions
self.datasets = dict()
self.config = None # Custom config used to set environment variables inside containers
self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase.lower(), self.species.lower(), self.strain.lower(), self.sex.lower()])["not_empty"])
self.species_folder_name = self.species_folder_name .replace("-", "_").replace('__', '_').replace("(", "_").replace(")", "_")
if self.species_folder_name.endswith("_"):
self.species_folder_name = self.species_folder_name[0:-2]
self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items(
[self.genus_lowercase.lower(), self.species.lower(), self.strain.lower(),
self.sex.lower()])["not_empty"])
self.existing_folders_cache = {}
self.bam_metadata_cache = {}
# # Sanitize str attributes
# for var in vars(self):
# for attr in var if type(attr) == str:
# attr = attr.replace("(", "_").replace(")", "_")
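full_name, abbreviation and species_folder_name above are all built with utilities.filter_empty_not_empty_items, whose definition is not shown in this diff. Judging from how it is called, a minimal sketch could look like the following (an assumption about its behaviour, not the repository's actual implementation):

def filter_empty_not_empty_items(items):
    # Split a list into empty and non-empty entries so callers can join
    # only the informative parts of a species name or folder name
    result = {"empty": [], "not_empty": []}
    for item in items:
        if item is None or str(item).strip() == "":
            result["empty"].append(item)
        else:
            result["not_empty"].append(item)
    return result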
{{ name }}:
- name: {{ name }}
description:
genus: {{ genus }}
species: {{ species }}
@@ -13,7 +13,7 @@
genome_path: {{ genome_path }}
transcripts_path: {{ transcripts_path }}
proteins_path: {{ proteins_path }}
gff_path: {{ gff_path }} # Mandatory!
gff_path: {{ gff_path }}
interpro_path: {{ interpro_path }}
orthofinder_path: {{ orthofinder_path }}
blastp_path: {{ blastp_path }}
@@ -44,14 +44,12 @@ def parse_input(input_file):
:return:
"""
parsed_sp_dict_list = []
sp_dict_list = []
try:
with open(input_file, 'r') as stream:
try:
yaml_dict = yaml.safe_load(stream)
for k, v in yaml_dict.items():
parsed_sp_dict_list.append(v)
sp_dict_list = yaml.safe_load(stream)
except yaml.YAMLError as err:
logging.critical("Input file is not in YAML format")
sys.exit(err)
@@ -62,7 +60,7 @@ def parse_input(input_file):
logging.critical("The specified input file cannot be read (%s)" % input_file)
sys.exit()
return parsed_sp_dict_list
return sp_dict_list
def filter_empty_not_empty_items(li):
@@ -125,13 +123,9 @@ def get_species_history_id(instance, full_name):
def get_gspecies_string_from_sp_dict(sp_dict):
gspecies = ""
for k, v in sp_dict.items():
for k2, v2 in v.items():
if k2 == "genus":
gspecies = gspecies.lower() + v2
elif k2 == "species":
gspecies = gspecies.lower() + "_" + v2
genus = sp_dict["description"]["genus"]
species = sp_dict["description"]["species"]
gspecies = genus.lower() + "_" + species.lower()
return gspecies
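# Illustrative call, not part of the committed file (input shape assumed from the
# example YAML): with sp_dict["description"] == {"genus": "Citrus", "species": "sinensis"},
# the rewritten function returns "citrus_sinensis".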
def get_unique_species_str_list(sp_dict_list):
@@ -166,7 +160,6 @@ def get_unique_species_dict_list(sp_dict_list):
unique_species_dict = {}
unique_species_list_of_dict = []
unique_species_genus_species = get_unique_species_str_list(sp_dict_list=sp_dict_list)
for sp in sp_dict_list:
gspecies = get_gspecies_string_from_sp_dict(sp)