From 5f7757d4375d2241e6bc411cff7ac3f07e662e90 Mon Sep 17 00:00:00 2001
From: Loraine Gueguen <loraine.gueguen@sb-roscoff.fr>
Date: Fri, 28 May 2021 08:47:44 +0200
Subject: [PATCH] Set data file and dataset name as constants. Refactor run_wf
 (WIP)

---
 constants.py                  |  15 +-
 gga_get_data.py               |  16 +-
 phaoexplorer_constants.py     |   7 +-
 run_workflow_phaeoexplorer.py | 392 ++++++++++++++++------------------
 speciesData.py                |  11 +-
 5 files changed, 225 insertions(+), 216 deletions(-)

diff --git a/constants.py b/constants.py
index 5aaf27d..cfe86b2 100644
--- a/constants.py
+++ b/constants.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
 # Constants used in the input yaml
 ORG_PARAM_NAME = "name"
 ORG_PARAM_DESC = "description"
@@ -24,7 +27,6 @@ ORG_PARAM_SERVICES = "services"
 ORG_PARAM_SERVICES_BLAST = "blast"
 ORG_PARAM_SERVICES_GO = "go"
 
-
 # Constants used in the config yaml file
 CONF_ALL_HOSTNAME = "hostname"
 CONF_ALL_HTTP_PORT = "http_port"
@@ -45,12 +47,23 @@ CONF_TRIPAL_THEME_NAME = "tripal_theme_name"
 CONF_TRIPAL_THEME_GIT_CLONE = "tripal_theme_git_clone"
 CONF_JBROWSE_MENU_URL = "jbrowse_menu_url"
 
+# Data
+FILENAME_SUFFIX_TRANSCRIPTS = "transcripts_gff.fasta"
+FILENAME_SUFFIX_PROTEINS = "proteins.fasta"
+FILENAME_SUFFIX_INTERPRO = "interproscan.xml"
+FILENAME_SUFFIX_BLASTP = "diamond_blastp_vs_uniref90.xml" # Temporary constant: this value should be in the organism input file
+FILENAME_SUFFIX_BLASTX = "diamond_blastx_vs_uniref90.xml" # Temporary constant: this value should be in the organism input file
+FILENAME_SUFFIX_ORTHOFINDER = "orthologous_one2one_vs_Ec32.tsv" # Temporary constant: this value should be in the organism input file
+DATA_DATE = "2021-02-24" # Temporary constant: this value should be in the organism input file, for each data
+
 # default config file
 DEFAULT_CONFIG = "examples/config"
 
+# Galaxy tools
 GET_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0"
 DELETE_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.4+galaxy0"
 
+# Galaxy library
 HOST_DATA_DIR='src_data'
 CONTAINER_DATA_DIR_ROOT='/project_data'
 GALAXY_LIBRARY_NAME = 'Project Data'
diff --git a/gga_get_data.py b/gga_get_data.py
index 992e5c6..f46d733 100755
--- a/gga_get_data.py
+++ b/gga_get_data.py
@@ -101,7 +101,7 @@ class GetData(speciesData.SpeciesData):
         for k, v in genome_datasets.items():
             if v:  # If dataset is not present in input file, skip copy
                 logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir))
-                genome_fname = "{0}_v{1}.fasta".format(self.dataset_prefix, self.genome_version)
+                genome_fname = self.genome_filename
                 try:
                     shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, genome_fname))
                 except Exception as exc:
@@ -111,19 +111,19 @@ class GetData(speciesData.SpeciesData):
             if v:  # If dataset is not present in input file, skip copy
                 dataset_fname = ""
                 if k == constants.ORG_PARAM_DATA_GFF_PATH:
-                    dataset_fname = "{0}_OGS{1}_{2}.gff".format(self.dataset_prefix, self.ogs_version, self.get_last_modified_time_string(os.path.abspath(v)))
+                    dataset_fname = self.gff_filename
                 elif k == constants.ORG_PARAM_DATA_TRANSCRIPTS_PATH:
-                    dataset_fname = "{0}_OGS{1}_transcripts.fasta".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.transcripts_filename
                 elif k == constants.ORG_PARAM_DATA_PROTEINS_PATH:
-                    dataset_fname = "{0}_OGS{1}_proteins.fasta".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.proteins_filename
                 elif k == constants.ORG_PARAM_DATA_ORTHOFINDER_PATH:
-                    dataset_fname = "{0}_OGS{1}_orthofinder.tsv".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.orthofinder_filename
                 elif k == constants.ORG_PARAM_DATA_INTERPRO_PATH:
-                    dataset_fname = "{0}_OGS{1}_interproscan.xml".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.interpro_filename
                 elif k == constants.ORG_PARAM_DATA_BLASTP_PATH:
-                    dataset_fname = "{0}_OGS{1}_blastp.xml".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.blastp_filename
                 elif k == constants.ORG_PARAM_DATA_BLASTX_PATH:
-                    dataset_fname = "{0}_OGS{1}_blastx.xml".format(self.dataset_prefix, self.ogs_version)
+                    dataset_fname = self.blastx_filename
                 logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
                 try:
                     shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, dataset_fname))
diff --git a/phaoexplorer_constants.py b/phaoexplorer_constants.py
index 229b216..68d6c88 100644
--- a/phaoexplorer_constants.py
+++ b/phaoexplorer_constants.py
@@ -1,3 +1,8 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import constants
+
 ### Workflows
 
 WORKFLOW_LOAD_FASTA_GFF_JBROWSE = "load_fasta_gff_jbrowse"
@@ -17,7 +22,7 @@ ADD_ANALYSIS_TOOL_VERSION = "2.3.4+galaxy0"
 ADD_ANALYSIS_TOOL_ID= ADD_ANALYSIS_TOOL_NAME + ADD_ANALYSIS_TOOL_VERSION
 ADD_ANALYSIS_TOOL_CHANGESET_REVISION = "10b2b1c70e69"
 ADD_ANALYSIS_TOOL_PARAM_PROGRAM = "Performed by Genoscope"
-ADD_ANALYSIS_TOOL_PARAM_DATE = "2021-02-24"
+ADD_ANALYSIS_TOOL_PARAM_DATE = constants.DATA_DATE
 
 GET_ORGANISMS_TOOL_NAME = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/"
 GET_ORGANISMS_TOOL_VERSION = "2.3.4+galaxy0"
diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py
index 4928e73..1193b95 100755
--- a/run_workflow_phaeoexplorer.py
+++ b/run_workflow_phaeoexplorer.py
@@ -24,6 +24,13 @@ gga_init.py
 Usage: $ python3 gga_init.py -i input_example.yml --config [config file] [OPTIONS]
 """
 
+class RunWorkflowParam:
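+    """
+    Container holding, for one strain/sex of a species, the parameters needed to
+    run a workflow on its galaxy instance (genus_species, strain_sex and the
+    dictionary of organism/instance attributes)
+    """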
+
+    def __init__(self, genus_species, strain_sex, param_dict):
+        self.genus_species = genus_species
+        self.strain_sex = strain_sex
+        self.param_dict = param_dict
+
 class RunWorkflow(speciesData.SpeciesData):
     """
     Run a workflow into the galaxy instance's history of a given species
@@ -108,8 +115,6 @@ class RunWorkflow(speciesData.SpeciesData):
         :return:
         """
 
-        self.set_galaxy_instance()
-
         logging.info("Validating installed individual tools versions and changesets")
 
         # Verify that the add_organism and add_analysis versions are correct in the instance
@@ -514,7 +519,6 @@ class RunWorkflow(speciesData.SpeciesData):
 
         return invocation_report
 
-
     def import_datasets_into_history(self):
         """
         Find datasets in a library, get their ID and import them into the current history if they are not already
@@ -541,99 +545,100 @@ class RunWorkflow(speciesData.SpeciesData):
         for folder_name, folder_id in folders_id_dict.items():
             if folder_name == "/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version):
                 sub_folder_content = self.instance.folders.show_folder(folder_id=folder_id, contents=True)
-                for k2, v2 in sub_folder_content.items():
-                    for e in v2:
+                for value in sub_folder_content.values():
+                    for e in value:
                         if type(e) == dict:
-                            if e["name"].endswith(".fasta"):
-                                self.datasets["genome_file"] = e["ldda_id"]
-                                self.datasets_name["genome_file"] = e["name"]
+                            if e["name"].endswith(self.genome_filename):
+                                genome_ldda_id = e["ldda_id"]
 
             if folder_name == "/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version):
                 sub_folder_content = self.instance.folders.show_folder(folder_id=folder_id, contents=True)
-                for k2, v2 in sub_folder_content.items():
-                    for e in v2:
+                for value in sub_folder_content.values():
+                    for e in value:
                         if type(e) == dict:
-                            if "transcripts" in e["name"]:
-                                self.datasets["transcripts_file"] = e["ldda_id"]
-                                self.datasets_name["transcripts_file"] = e["name"]
-                            elif "proteins" in e["name"]:
-                                self.datasets["proteins_file"] = e["ldda_id"]
-                                self.datasets_name["proteins_file"] = e["name"]
-                            elif "gff" in e["name"]:
-                                self.datasets["gff_file"] = e["ldda_id"]
-                                self.datasets_name["gff_file"] = e["name"]
-                            elif "interpro" in e["name"]:
-                                self.datasets["interproscan_file"] = e["ldda_id"]
-                                self.datasets_name["interproscan_file"] = e["name"]
-                            elif "blastp" in e["name"]:
-                                self.datasets["blastp_file"] = e["ldda_id"]
-                                self.datasets_name["blastp_file"] = e["name"]
-
-
-        history_datasets_li = self.instance.datasets.get_datasets()
+                            ldda_name = e["name"]
+                            ldda_id = e["ldda_id"]
+                            if ldda_name.endswith(self.transcripts_filename):
+                                transcripts_ldda_id = ldda_id
+                            elif ldda_name.endswith(self.proteins_filename):
+                                proteins_ldda_id = ldda_id
+                            elif ldda_name.endswith(self.gff_filename):
+                                gff_ldda_id = ldda_id
+                            elif ldda_name.endswith(self.interpro_filename):
+                                interpro_ldda_id = ldda_id
+                            elif ldda_name.endswith(self.blastp_filename):
+                                blastp_ldda_id = ldda_id
+                            elif ldda_name.endswith(self.blastx_filename):
+                                blastx_ldda_id = ldda_id
+
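+        # Note: the *_ldda_id variables above are only bound when a matching dataset
+        # is found in the library; missing optional datasets (interproscan, blastp,
+        # blastx) are caught by the try/except blocks further down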
+        hda_list = self.instance.datasets.get_datasets(history_id=self.history_id)
-        genome_hda_id, gff_hda_id, transcripts_hda_id, proteins_hda_id, blastp_hda_id, interproscan_hda_id = None, None, None, None, None, None
+        genome_hda_id, gff_hda_id, transcripts_hda_id, proteins_hda_id, blastp_hda_id, blastx_hda_id, interproscan_hda_id = None, None, None, None, None, None, None
-
         # Finding datasets in history (matching datasets names)
-        for dataset in history_datasets_li:
-            dataset_name = dataset["name"]
-            dataset_id = dataset["id"]
-            if dataset_name == "{0}_v{1}.fasta".format(self.dataset_prefix, self.genome_version):
-                genome_hda_id = dataset_id
-            if dataset_name == "{0}_OGS{1}_{2}.gff".format(self.dataset_prefix, self.ogs_version, self.date):
-                gff_hda_id = dataset_id
-            if dataset_name == "{0}_OGS{1}_transcripts.fasta".format(self.dataset_prefix, self.ogs_version):
-                transcripts_hda_id = dataset_id
-            if dataset_name == "{0}_OGS{1}_proteins.fasta".format(self.dataset_prefix, self.ogs_version):
-                proteins_hda_id = dataset_id
-            if dataset_name == "{0}_OGS{1}_blastp.xml".format(self.dataset_prefix, self.ogs_version):
-                blastp_hda_id = dataset_id
-            if dataset_name == "{0}_OGS{1}_interproscan.xml".format(self.dataset_prefix, self.ogs_version):
-                interproscan_hda_id = dataset_id
+        for hda in hda_list:
+            hda_name = hda["name"]
+            hda_id = hda["id"]
+            if hda_name == self.genome_filename:
+                genome_hda_id = hda_id
+            if hda_name == self.gff_filename:
+                gff_hda_id = hda_id
+            if hda_name == self.transcripts_filename:
+                transcripts_hda_id = hda_id
+            if hda_name == self.proteins_filename:
+                proteins_hda_id = hda_id
+            if hda_name == self.blastp_filename:
+                blastp_hda_id = hda_id
+            if hda_name == self.blastx_filename:
+                blastx_hda_id = hda_id
+            if hda_name == self.interpro_filename:
+                interproscan_hda_id = hda_id
 
-                    
         # Import each dataset into history if it is not imported
         logging.debug("Uploading datasets into history %s" % self.history_id)
 
         if genome_hda_id is None:
-            genome_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"])
+            genome_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=genome_ldda_id)
             genome_hda_id = genome_dataset_upload["id"]
         if gff_hda_id is  None:
-            gff_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"])
+            gff_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=gff_ldda_id)
             gff_hda_id = gff_dataset_upload["id"]
-        if transcripts_hda_id is None:
-            transcripts_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
-            transcripts_hda_id = transcripts_dataset_upload["id"]
         if proteins_hda_id is None:
-            proteins_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])
+            proteins_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=proteins_ldda_id)
             proteins_hda_id = proteins_dataset_upload["id"]
+        if transcripts_hda_id is None:
+            transcripts_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=transcripts_ldda_id)
+            transcripts_hda_id = transcripts_dataset_upload["id"]
         if interproscan_hda_id is None:
             try:
-                interproscan_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"])
+                interproscan_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=interpro_ldda_id)
                 interproscan_hda_id = interproscan_dataset_upload["id"]
             except Exception as exc:
                 logging.debug("Interproscan file not found in library (history: {0})".format(self.history_id))
         if blastp_hda_id is None:
             try:
-                blastp_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blastp_file"])
+                blastp_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=blastp_ldda_id)
                 blastp_hda_id = blastp_dataset_upload["id"]
             except Exception as exc:
                 logging.debug("blastp file not found in library (history: {0})".format(self.history_id))
+        if blastx_hda_id is None:
+            try:
+                blastx_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=blastx_ldda_id)
+                blastx_hda_id = blastx_dataset_upload["id"]
+            except Exception as exc:
+                logging.debug("blastp file not found in library (history: {0})".format(self.history_id))
+
+        hda_ids = {"genome_hda_id": genome_hda_id,
+                   "gff_hda_id": gff_hda_id,
+                   "transcripts_hda_id": transcripts_hda_id,
+                   "proteins_hda_id": proteins_hda_id,
+                   "blastp_hda_id": blastp_hda_id,
+                   "blastx_hda_id": blastx_hda_id,
+                   "interproscan_hda_id": interproscan_hda_id}
 
         # logging.debug("History dataset IDs (hda_id) for %s:" % self.full_name)
-        # logging.debug({"genome_hda_id": genome_hda_id,
-        #         "gff_hda_id": gff_hda_id,
-        #         "transcripts_hda_id": transcripts_hda_id,
-        #         "proteins_hda_id": proteins_hda_id,
-        #         "blastp_hda_id": blastp_hda_id,
-        #         "interproscan_hda_id": interproscan_hda_id})
+        # logging.debug(hda_ids)
 
         # Return a dict made of the hda ids
-        return {"genome_hda_id": genome_hda_id, 
-                "gff_hda_id": gff_hda_id, 
-                "transcripts_hda_id": transcripts_hda_id, 
-                "proteins_hda_id": proteins_hda_id, 
-                "blastp_hda_id": blastp_hda_id,
-                "interproscan_hda_id": interproscan_hda_id}
+        return hda_ids
 
 def run_workflow(workflow_path, workflow_parameters, datamap, config, input_species_number):
     """
@@ -693,16 +698,19 @@ def run_workflow(workflow_path, workflow_parameters, datamap, config, input_spec
 
 
 
-def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type):
+def get_sp_workflow_param(sp_dict, main_dir, config, workflow_type):
     """
     """
 
-    sp_workflow_dict = {}
     run_workflow_for_current_organism = RunWorkflow(parameters_dictionary=sp_dict)
 
     # Verifying the galaxy container is running
-    if utilities.check_galaxy_state(network_name=run_workflow_for_current_organism.genus_species,
+    if not utilities.check_galaxy_state(network_name=run_workflow_for_current_organism.genus_species,
                                     script_dir=run_workflow_for_current_organism.script_dir):
+        logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.genus_species)
+        sys.exit()
+
+    else:
 
         # Setting some of the instance attributes
         run_workflow_for_current_organism.main_dir = main_dir
@@ -717,13 +725,12 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type):
             run_workflow_for_current_organism.config[constants.CONF_ALL_HTTP_PORT],
             run_workflow_for_current_organism.genus_species)
 
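+        # Setup common to every workflow type: connect to the galaxy instance,
+        # create/reuse the species history and check individual tool versions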
+        run_workflow_for_current_organism.set_galaxy_instance()
+        history_id = run_workflow_for_current_organism.set_history()
+        run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
 
         if workflow_type == phaoexplorer_constants.WORKFLOW_LOAD_FASTA_GFF_JBROWSE:
 
-            run_workflow_for_current_organism.set_galaxy_instance()
-            history_id = run_workflow_for_current_organism.set_history()
-            run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
-
             analyses_dict_list = run_workflow_for_current_organism.get_analyses()
 
             org_id = run_workflow_for_current_organism.add_organism_and_sync()
@@ -743,7 +750,7 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type):
             hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
 
             # Create the dictionary holding all attributes needed to connect to the galaxy instance
-            attributes = {"genus": run_workflow_for_current_organism.genus,
+            param = {"genus": run_workflow_for_current_organism.genus,
                           "species": run_workflow_for_current_organism.species,
                           "genus_species": run_workflow_for_current_organism.genus_species,
                           "full_name": run_workflow_for_current_organism.full_name,
@@ -760,87 +767,65 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, workflow_type):
                           "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
                           "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
 
-            sp_workflow_dict[run_workflow_for_current_organism.genus_species] = {run_workflow_for_current_organism.genus_species.strain_sex: attributes}
 
-        else:
-            logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.genus_species)
-            sys.exit()
+        if workflow_type == "blast":
 
-        return sp_workflow_dict
+            ids = run_workflow_for_current_organism.add_organism_blastp_analysis()
 
-    if workflow_type == "blast":
-        run_workflow_for_current_organism.set_galaxy_instance()
+            org_id = ids["org_id"]
+            blastp_analysis_id = ids["blastp_analysis_id"]
+            hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
 
-        history_id = run_workflow_for_current_organism.set_history()
+            # Create the dictionary holding all attributes needed to connect to the galaxy instance
+            param = {"genus": run_workflow_for_current_organism.genus,
+                          "species": run_workflow_for_current_organism.species,
+                          "genus_species": run_workflow_for_current_organism.genus_species,
+                          "full_name": run_workflow_for_current_organism.full_name,
+                          "species_folder_name": run_workflow_for_current_organism.species_folder_name,
+                          "sex": run_workflow_for_current_organism.sex,
+                          "strain": run_workflow_for_current_organism.strain,
+                          "org_id": org_id,
+                          "blastp_analysis_id": blastp_analysis_id,
+                          "hda_ids": hda_ids,
+                          "history_id": history_id,
+                          "instance": run_workflow_for_current_organism.instance,
+                          "instance_url": run_workflow_for_current_organism.instance_url,
+                          "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
+                          "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
 
-        run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
-        ids = run_workflow_for_current_organism.add_organism_blastp_analysis()
-
-        org_id = ids["org_id"]
-        blastp_analysis_id = ids["blastp_analysis_id"]
-        hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
-
-        strain_sex = "{0}_{1}".format(run_workflow_for_current_organism.strain, run_workflow_for_current_organism.sex)
-        genus_species = run_workflow_for_current_organism.genus_species
-
-        # Create the dictionary holding all attributes needed to connect to the galaxy instance
-        attributes = {"genus": run_workflow_for_current_organism.genus,
-                      "species": run_workflow_for_current_organism.species,
-                      "genus_species": run_workflow_for_current_organism.genus_species,
-                      "full_name": run_workflow_for_current_organism.full_name,
-                      "species_folder_name": run_workflow_for_current_organism.species_folder_name,
-                      "sex": run_workflow_for_current_organism.sex,
-                      "strain": run_workflow_for_current_organism.strain,
-                      "org_id": org_id,
-                      "blastp_analysis_id": blastp_analysis_id,
-                      "hda_ids": hda_ids,
-                      "history_id": history_id,
-                      "instance": run_workflow_for_current_organism.instance,
-                      "instance_url": run_workflow_for_current_organism.instance_url,
-                      "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
-                      "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
-
-        sp_workflow_dict[genus_species] = {strain_sex: attributes}
 
+        if workflow_type == "interpro":
 
-    if workflow_type == "interpro":
-        run_workflow_for_current_organism.set_galaxy_instance()
+            ids = run_workflow_for_current_organism.add_organism_interproscan_analysis()
 
-        history_id = run_workflow_for_current_organism.set_history()
+            org_id = ids["org_id"]
+            interpro_analysis_id = ids["interpro_analysis_id"]
+            hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
 
-        run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
-        ids = run_workflow_for_current_organism.add_organism_interproscan_analysis()
-
-        org_id = ids["org_id"]
-        interpro_analysis_id = ids["interpro_analysis_id"]
-        hda_ids = run_workflow_for_current_organism.import_datasets_into_history()
-
-        strain_sex = "{0}_{1}".format(run_workflow_for_current_organism.strain, run_workflow_for_current_organism.sex)
-        genus_species = run_workflow_for_current_organism.genus_species
-
-        # Create the dictionary holding all attributes needed to connect to the galaxy instance
-        attributes = {"genus": run_workflow_for_current_organism.genus,
-                      "species": run_workflow_for_current_organism.species,
-                      "genus_species": run_workflow_for_current_organism.genus_species,
-                      "full_name": run_workflow_for_current_organism.full_name,
-                      "species_folder_name": run_workflow_for_current_organism.species_folder_name,
-                      "sex": run_workflow_for_current_organism.sex,
-                      "strain": run_workflow_for_current_organism.strain,
-                      "org_id": org_id,
-                      "interpro_analysis_id": interpro_analysis_id,
-                      "hda_ids": hda_ids,
-                      "history_id": history_id,
-                      "instance": run_workflow_for_current_organism.instance,
-                      "instance_url": run_workflow_for_current_organism.instance_url,
-                      "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
-                      "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
-
-        sp_workflow_dict[genus_species] = {strain_sex: attributes}
+            # Create the dictionary holding all attributes needed to connect to the galaxy instance
+            param = {"genus": run_workflow_for_current_organism.genus,
+                          "species": run_workflow_for_current_organism.species,
+                          "genus_species": run_workflow_for_current_organism.genus_species,
+                          "full_name": run_workflow_for_current_organism.full_name,
+                          "species_folder_name": run_workflow_for_current_organism.species_folder_name,
+                          "sex": run_workflow_for_current_organism.sex,
+                          "strain": run_workflow_for_current_organism.strain,
+                          "org_id": org_id,
+                          "interpro_analysis_id": interpro_analysis_id,
+                          "hda_ids": hda_ids,
+                          "history_id": history_id,
+                          "instance": run_workflow_for_current_organism.instance,
+                          "instance_url": run_workflow_for_current_organism.instance_url,
+                          "email": config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
+                          "password": config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]}
 
-    else:
-        logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.full_name)
-        sys.exit()
 
+        sp_wf_param = RunWorkflowParam(
+            genus_species=run_workflow_for_current_organism.genus_species,
+            strain_sex=run_workflow_for_current_organism.strain_sex,
+            param_dict=param
+        )
+        return sp_wf_param
 
 
 def install_changesets_revisions_from_workflow(instance, workflow_path):
@@ -957,29 +942,31 @@ if __name__ == "__main__":
         for sp_dict in sp_dict_list:
 
             # Add and retrieve all analyses/organisms for the current input species and add their IDs to the input dictionary
-            current_sp_workflow_dict = create_sp_workflow_dict(
+            sp_workflow_attributes = get_sp_workflow_param(
                 sp_dict,
                 main_dir=main_dir,
                 config=config,
                 workflow_type=phaoexplorer_constants.WORKFLOW_LOAD_FASTA_GFF_JBROWSE)
 
-            current_sp_key = list(current_sp_workflow_dict.keys())[0]
-            current_sp_value = list(current_sp_workflow_dict.values())[0]
-            current_sp_strain_sex_key = list(current_sp_value.keys())[0]
-            current_sp_strain_sex_value = list(current_sp_value.values())[0]
+            current_sp_genus_species = sp_workflow_attributes.genus_species
+            current_sp_strain_sex = sp_workflow_attributes.strain_sex
+            current_sp_strain_sex_attributes_dict = sp_workflow_attributes.param_dict
 
             # Add the species dictionary to the complete dictionary
             # This dictionary contains every organism present in the input file
             # Its structure is the following:
             # {genus species: {strain1_sex1: {variables_key: variables_values}, strain1_sex2: {variables_key: variables_values}}}
-            if not current_sp_key in all_sp_workflow_dict.keys():
-                all_sp_workflow_dict[current_sp_key] = current_sp_value
+            if not current_sp_genus_species in all_sp_workflow_dict.keys():
+                all_sp_workflow_dict[current_sp_genus_species] = {current_sp_strain_sex: current_sp_strain_sex_attributes_dict}
             else:
-                all_sp_workflow_dict[current_sp_key][current_sp_strain_sex_key] = current_sp_strain_sex_value
+                if not current_sp_strain_sex in all_sp_workflow_dict[current_sp_genus_species].keys():
+                    all_sp_workflow_dict[current_sp_genus_species][current_sp_strain_sex] = current_sp_strain_sex_attributes_dict
+                else:
+                    logging.error("Duplicate organism with 'genus_species' = '{0}' and 'strain_sex' = '{1}'".format(current_sp_genus_species, current_sp_strain_sex))
 
-        for k, v in all_sp_workflow_dict.items():
-            if len(list(v.keys())) == 1:
-                logging.info("Input organism %s: 1 species detected in input dictionary" % k)
+        for species, strains in all_sp_workflow_dict.items():
+            if len(list(strains.keys())) == 1:
+                logging.info("Input species %s: 1 strain detected in input dictionary" % species)
 
                 # Set workflow path (1 organism)
                 workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-chado_load_tripal_synchronize_jbrowse_1org_v4.ga")
@@ -988,7 +975,7 @@ if __name__ == "__main__":
                 instance_url, email, password = None, None, None
 
                 # Set the galaxy instance variables
-                for k2, v2 in v.items():
+                for k2, v2 in strains.items():
                     instance_url = v2["instance_url"]
                     email = v2["email"]
                     password = v2["password"]
@@ -998,11 +985,8 @@ if __name__ == "__main__":
                 # Check if the versions of tools specified in the workflow are installed in galaxy
                 install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
-                organism_key_name = list(v.keys())
-                org_dict = v[organisms_key_name[0]]
-
-                # print("\n")
-                # print(org_dict)
+                organisms_key_name = list(strains.keys())
+                org_dict = strains[organisms_key_name[0]]
 
                 history_id = org_dict["history_id"]
 
@@ -1121,9 +1105,9 @@ if __name__ == "__main__":
                     logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url))
 
 
-            if len(list(v.keys())) == 2:
+            if len(list(strains.keys())) == 2:
 
-                logging.info("Input organism %s: 2 species detected in input dictionary" % k)
+                logging.info("Input organism %s: 2 species detected in input dictionary" % species)
 
                 # Set workflow path (2 organisms)
                 workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-chado_load_tripal_synchronize_jbrowse_2org_v4.ga")
@@ -1132,7 +1116,7 @@ if __name__ == "__main__":
                 instance_url, email, password = None, None, None
 
                 # Set the galaxy instance variables
-                for k2, v2 in v.items():
+                for k2, v2 in strains.items():
                     instance_url = v2["instance_url"]
                     email = v2["email"]
                     password = v2["password"]
@@ -1143,9 +1127,9 @@ if __name__ == "__main__":
                 install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
                 # Get key names from the current organism (item 1 = organism 1, item 2 = organism 2)
-                organisms_key_names = list(v.keys())
-                org1_dict = v[organisms_key_names[0]]
-                org2_dict = v[organisms_key_names[1]]
+                organisms_key_names = list(strains.keys())
+                org1_dict = strains[organisms_key_names[0]]
+                org2_dict = strains[organisms_key_names[1]]
 
                 history_id = org1_dict["history_id"]
 
@@ -1360,24 +1344,24 @@ if __name__ == "__main__":
         for sp_dict in sp_dict_list:
 
             # Add and retrieve all analyses/organisms for the current input species and add their IDs to the input dictionary
-            current_sp_workflow_dict = create_sp_workflow_dict(sp_dict, main_dir=args.main_directory, config=config, workfow_type="blast")
+            sp_workflow_attributes = get_sp_workflow_param(sp_dict, main_dir=args.main_directory, config=config, workflow_type="blast")
 
-            current_sp_key = list(current_sp_workflow_dict.keys())[0]
-            current_sp_value = list(current_sp_workflow_dict.values())[0]
-            current_sp_strain_sex_key = list(current_sp_value.keys())[0]
-            current_sp_strain_sex_value = list(current_sp_value.values())[0]
+            current_sp_genus_species = sp_workflow_attributes.genus_species
+            current_sp_strain_sex = sp_workflow_attributes.strain_sex
+            current_sp_strain_sex_attributes_dict = sp_workflow_attributes.param_dict
 
             # Add the species dictionary to the complete dictionary
             # This dictionary contains every organism present in the input file
             # Its structure is the following:
             # {genus species: {strain1_sex1: {variables_key: variables_values}, strain1_sex2: {variables_key: variables_values}}}
-            if not current_sp_key in all_sp_workflow_dict.keys():
-                all_sp_workflow_dict[current_sp_key] = current_sp_value
+            if not current_sp_genus_species in all_sp_workflow_dict.keys():
+                all_sp_workflow_dict[current_sp_genus_species] = {current_sp_strain_sex: current_sp_strain_sex_attributes_dict}
             else:
-                all_sp_workflow_dict[current_sp_key][current_sp_strain_sex_key] = current_sp_strain_sex_value
+                all_sp_workflow_dict[current_sp_genus_species][current_sp_strain_sex] = current_sp_strain_sex_attributes_dict
 
-        if len(list(v.keys())) == 1:
-            logging.info("Input organism %s: 1 species detected in input dictionary" % k)
+        if len(list(strains.keys())) == 1:
+            logging.info("Input organism %s: 1 species detected in input dictionary" % species)
 
             # Set workflow path (1 organism)
             workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-load_blast_results_1org_v1.ga")
@@ -1386,7 +1370,7 @@ if __name__ == "__main__":
             instance_url, email, password = None, None, None
 
             # Set the galaxy instance variables
-            for k2, v2 in v.items():
+            for k2, v2 in strains.items():
                 instance_url = v2["instance_url"]
                 email = v2["email"]
                 password = v2["password"]
@@ -1396,8 +1380,8 @@ if __name__ == "__main__":
             # Check if the versions of tools specified in the workflow are installed in galaxy
             install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
-            organism_key_name = list(v.keys())
-            org_dict = v[organisms_key_name[0]]
+            organisms_key_name = list(strains.keys())
+            org_dict = strains[organisms_key_name[0]]
 
             history_id = org_dict["history_id"]
 
@@ -1473,9 +1457,9 @@ if __name__ == "__main__":
 
 
 
-        if len(list(v.keys())) == 2:
+        if len(list(strains.keys())) == 2:
 
-            logging.info("Input organism %s: 2 species detected in input dictionary" % k)
+            logging.info("Input organism %s: 2 species detected in input dictionary" % species)
 
             # Set workflow path (2 organisms)
             workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-load_blast_results_2org_v1.ga")
@@ -1484,7 +1468,7 @@ if __name__ == "__main__":
             instance_url, email, password = None, None, None
 
             # Set the galaxy instance variables
-            for k2, v2 in v.items():
+            for k2, v2 in strains.items():
                 instance_url = v2["instance_url"]
                 email = v2["email"]
                 password = v2["password"]
@@ -1494,9 +1478,9 @@ if __name__ == "__main__":
             # Check if the versions of tools specified in the workflow are installed in galaxy
             install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
-            organisms_key_names = list(v.keys())
-            org1_dict = v[organisms_key_names[0]]
-            org2_dict = v[organisms_key_names[1]]
+            organisms_key_names = list(strains.keys())
+            org1_dict = strains[organisms_key_names[0]]
+            org2_dict = strains[organisms_key_names[1]]
 
             history_id = org1_dict["history_id"]
 
@@ -1630,24 +1614,24 @@ if __name__ == "__main__":
         for sp_dict in sp_dict_list:
 
             # Add and retrieve all analyses/organisms for the current input species and add their IDs to the input dictionary
-            current_sp_workflow_dict = create_sp_workflow_dict(sp_dict, main_dir=args.main_directory, config=config, workfow_type="blast")
+            sp_workflow_attributes = get_sp_workflow_param(sp_dict, main_dir=args.main_directory, config=config, workflow_type="blast")
 
-            current_sp_key = list(current_sp_workflow_dict.keys())[0]
-            current_sp_value = list(current_sp_workflow_dict.values())[0]
-            current_sp_strain_sex_key = list(current_sp_value.keys())[0]
-            current_sp_strain_sex_value = list(current_sp_value.values())[0]
+            current_sp_genus_species = sp_workflow_attributes.genus_species
+            current_sp_strain_sex = sp_workflow_attributes.strain_sex
+            current_sp_strain_sex_attributes_dict = sp_workflow_attributes.param_dict
 
             # Add the species dictionary to the complete dictionary
             # This dictionary contains every organism present in the input file
             # Its structure is the following:
             # {genus species: {strain1_sex1: {variables_key: variables_values}, strain1_sex2: {variables_key: variables_values}}}
-            if not current_sp_key in all_sp_workflow_dict.keys():
-                all_sp_workflow_dict[current_sp_key] = current_sp_value
+            if not current_sp_genus_species in all_sp_workflow_dict.keys():
+                all_sp_workflow_dict[current_sp_genus_species] = {current_sp_strain_sex: current_sp_strain_sex_attributes_dict}
             else:
-                all_sp_workflow_dict[current_sp_key][current_sp_strain_sex_key] = current_sp_strain_sex_value
+                all_sp_workflow_dict[current_sp_genus_species][current_sp_strain_sex] = current_sp_strain_sex_attributes_dict
 
-        if len(list(v.keys())) == 1:
-            logging.info("Input organism %s: 1 species detected in input dictionary" % k)
+        if len(list(strains.keys())) == 1:
+            logging.info("Input organism %s: 1 species detected in input dictionary" % species)
 
             # Set workflow path (1 organism)
             workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-load_blast_results_1org_v1.ga")
@@ -1656,7 +1640,7 @@ if __name__ == "__main__":
             instance_url, email, password = None, None, None
 
             # Set the galaxy instance variables
-            for k2, v2 in v.items():
+            for k2, v2 in strains.items():
                 instance_url = v2["instance_url"]
                 email = v2["email"]
                 password = v2["password"]
@@ -1666,8 +1650,8 @@ if __name__ == "__main__":
             # Check if the versions of tools specified in the workflow are installed in galaxy
             install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
-            organism_key_name = list(v.keys())
-            org_dict = v[organisms_key_name[0]]
+            organisms_key_name = list(strains.keys())
+            org_dict = strains[organisms_key_name[0]]
 
             history_id = org_dict["history_id"]
 
@@ -1743,9 +1727,9 @@ if __name__ == "__main__":
 
 
 
-        if len(list(v.keys())) == 2:
+        if len(list(strains.keys())) == 2:
 
-            logging.info("Input organism %s: 2 species detected in input dictionary" % k)
+            logging.info("Input organism %s: 2 species detected in input dictionary" % species)
 
             # Set workflow path (2 organisms)
             workflow_path = os.path.join(os.path.abspath(script_dir), "workflows_phaeoexplorer/Galaxy-Workflow-load_blast_results_2org_v1.ga")
@@ -1754,7 +1738,7 @@ if __name__ == "__main__":
             instance_url, email, password = None, None, None
 
             # Set the galaxy instance variables
-            for k2, v2 in v.items():
+            for k2, v2 in strains.items():
                 instance_url = v2["instance_url"]
                 email = v2["email"]
                 password = v2["password"]
@@ -1764,9 +1748,9 @@ if __name__ == "__main__":
             # Check if the versions of tools specified in the workflow are installed in galaxy
             install_changesets_revisions_from_workflow(workflow_path=workflow_path, instance=instance)
 
-            organisms_key_names = list(v.keys())
-            org1_dict = v[organisms_key_names[0]]
-            org2_dict = v[organisms_key_names[1]]
+            organisms_key_names = list(strains.keys())
+            org1_dict = strains[organisms_key_names[0]]
+            org2_dict = strains[organisms_key_names[1]]
 
             history_id = org1_dict["history_id"]
 
diff --git a/speciesData.py b/speciesData.py
index 83626b7..6cc5bb0 100755
--- a/speciesData.py
+++ b/speciesData.py
@@ -116,6 +116,15 @@ class SpeciesData:
         else:
             self.dataset_prefix = self.genus[0].lower() + "_" + self.species_lowercase
 
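+        # File names of the species datasets, built from the dataset prefix,
+        # the genome/OGS versions and the filename suffix constants in constants.py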
+        self.genome_filename = "{0}_v{1}.fasta".format(self.dataset_prefix, self.genome_version)
+        self.gff_filename = "{0}_OGS{1}_{2}.gff".format(self.dataset_prefix, self.ogs_version, constants.DATA_DATE)
+        self.transcripts_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_TRANSCRIPTS)
+        self.proteins_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_PROTEINS)
+        self.interpro_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_INTERPRO)
+        self.blastp_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_BLASTP)
+        self.blastx_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_BLASTX)
+        self.orthofinder_filename = "{0}_OGS{1}_{2}".format(self.dataset_prefix, self.ogs_version, constants.FILENAME_SUFFIX_ORTHOFINDER)
+
         # Bioblend/Chado IDs for an organism analyses/organisms/datasets/history/library
         self.org_id = None
         self.genome_analysis_id = None
@@ -131,8 +140,6 @@ class SpeciesData:
         self.species_dir = None
 
         self.tool_panel = None
-        self.datasets = dict()
-        self.datasets_name = dict()
         self.source_files = dict()
         self.workflow_name = None
         self.metadata = dict()
-- 
GitLab