From 9dd73617466e9734fc53c6d4b9c6e7be7950bb03 Mon Sep 17 00:00:00 2001
From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr>
Date: Mon, 22 Feb 2021 16:02:48 +0100
Subject: [PATCH] fixed workflow invocation in run_workflow and dataset copying
 in gga_get_data

---
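Notes:

gga_get_data.py now skips datasets that are left empty in the input
file instead of attempting a copy that would fail. Note also that
str.format positional indices are zero-based, so three arguments are
referenced as {0}, {1} and {2} in the copy-failure warnings. A minimal,
self-contained sketch of the pattern; the paths, dataset names and
target directory below are hypothetical:

    import logging
    import os
    import shutil

    datasets_to_get = {"genome_file": "/data/genome.fasta",  # hypothetical path
                       "gff_file": ""}                        # empty -> skipped
    genome_datasets = ["genome_file"]
    organism_genome_dir = "/tmp/src_data/genome"              # hypothetical target

    for k, v in datasets_to_get.items():
        if not v:  # dataset not present in the input file, skip the copy
            continue
        if k in genome_datasets:
            logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir))
            try:
                shutil.copyfile(os.path.abspath(v),
                                os.path.join(organism_genome_dir, os.path.basename(v)))
            except Exception as exc:
                logging.warning("Could not copy {0} ({1}) - {2}".format(k, v, exc))
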
 gga_get_data.py               | 53 +++++++++++------------------------
 run_workflow_phaeoexplorer.py | 34 +++++++++++-----------
 2 files changed, 32 insertions(+), 55 deletions(-)
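
run_workflow_phaeoexplorer.py now treats the interproscan and blast
diamond datasets as optional: a failed library-to-history upload is
logged at debug level instead of aborting the run. A minimal sketch of
that pattern, assuming a bioblend GalaxyInstance and the library
dataset (ldda) ids gathered earlier; the function name and its
arguments are placeholders:

    import logging

    def load_optional_datasets(instance, history_id, datasets):
        # instance is a bioblend GalaxyInstance; datasets maps dataset
        # names to library dataset (ldda) ids, as in self.datasets.
        for name in ("interproscan_file", "blast_diamond_file"):
            try:
                instance.histories.upload_dataset_from_library(
                    history_id=history_id,
                    lib_dataset_id=datasets[name])
            except Exception as exc:
                # Optional dataset: record the failure and keep going.
                logging.debug("%s could not be loaded in history %s (%s)",
                              name, history_id, exc)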

diff --git a/gga_get_data.py b/gga_get_data.py
index 07c1b04..339daf5 100644
--- a/gga_get_data.py
+++ b/gga_get_data.py
@@ -23,7 +23,7 @@ import speciesData
 """ 
 gga_get_data.py
 
-Usage: $ python3 gga_get_data.py -i input_example.yml --config config.yml [OPTIONS]
+Usage: $ python3 gga_get_data.py -i input_example.yml [OPTIONS]
 
 """
 
@@ -87,20 +87,21 @@ class GetData(speciesData.SpeciesData):
 
         # Copy datasets in the organism src_data dir tree correct folder
         for k, v in datasets_to_get.items():
-            if k in genome_datasets:
-                logging.info("Copying {0} into {1}".format(v, organism_genome_dir))
-                try:
-                    shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, os.path.basename(v)))
-                except Exception as exc:
-                    logging.warning("Could not copy {1} ({2})".format(v, exc))
-            elif k in annotation_datasets:
-                logging.info("Copying {0} into {1}".format(v, organism_annotation_dir))
-                try:
-                    shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, os.path.basename(v)))
-                except Exception as exc:
-                    logging.warning("Could not copy {1} ({2})".format(v, exc))
-            else:
-                pass
+            if v:  # If dataset is not present in input file, skip copy
+                if k in genome_datasets:
+                    logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir))
+                    try:
+                        shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, os.path.basename(v)))
+                    except Exception as exc:
+                        logging.warning("Could not copy {1} - {2} - Exit Code {3})".format(k, v, exc))
+                elif k in annotation_datasets:
+                    logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
+                    try:
+                        shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, os.path.basename(v)))
+                    except Exception as exc:
+                        logging.warning("Could not copy {1} ({2}) - Exit Code: {3}".format(k, v, exc))
+                else:
+                    pass
 
         os.chdir(self.main_dir)
 
@@ -119,10 +120,6 @@ if __name__ == "__main__":
                         help="Increase output verbosity",
                         action="store_false")
 
-    parser.add_argument("--config",
-                        type=str,
-                        help="Config path, default to the 'config' file inside the script repository")
-
     parser.add_argument("--main-directory",
                         type=str,
                         help="Where the stack containers will be located, defaults to working directory")
@@ -135,12 +132,6 @@ if __name__ == "__main__":
         logging.basicConfig(level=logging.INFO)
     logging.getLogger("urllib3").setLevel(logging.WARNING)
 
-    # Parsing the config file if provided, using the default config otherwise
-    if not args.config:
-        args.config = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), "config")
-    else:
-        args.config = os.path.abspath(args.config)
-
     if not args.main_directory:
         args.main_directory = os.getcwd()
     else:
@@ -162,26 +153,14 @@ if __name__ == "__main__":
                                                                  get_data_for_current_species.genus_species +
                                                                  "/")
 
-        # Parse the config yaml file
-        get_data_for_current_species.config = utilities.parse_config(args.config)
-        
         # Change serexec permissions in repo
         try:
             os.chmod("%s/serexec" % get_data_for_current_species.script_dir, 0o0777)
         except PermissionError:
             logging.critical("Cannot access %s, exiting" % get_data_for_current_species.script_dir)
 
-        # Load config file
-        get_data_for_current_species.config = utilities.parse_config(args.config)
-
         # Retrieve datasets
         logging.info("Finding and copying datasets for %s" % get_data_for_current_species.full_name)
         get_data_for_current_species.get_source_data_files_from_path()
         logging.info("Sucessfully copied datasets for %s" % get_data_for_current_species.full_name)
-        
-        # Format fasta headers (proteins)
-        # logging.info("Formatting fasta files headers %s " % get_data_for_current_species.full_name)
-        # get_data_for_current_species.batch_modify_fasta_headers()
-        # logging.info("Successfully formatted files headers %s " % get_data_for_current_species.full_name)
 
-        logging.info("Data successfully copied in src_data for %s" % get_data_for_current_species.full_name)
diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py
index ead3c93..70f6387 100644
--- a/run_workflow_phaeoexplorer.py
+++ b/run_workflow_phaeoexplorer.py
@@ -64,11 +64,10 @@ class RunWorkflow(speciesData.SpeciesData):
 
         self.set_get_history()
 
-        logging.debug("History ID: " + self.history_id)
+        logging.debug("History ID: %s" % self.history_id)
         libraries = self.instance.libraries.get_libraries()  # normally only one library
-
         library_id = self.instance.libraries.get_libraries()[0]["id"]  # project data folder/library
-        logging.debug("Library ID: " + self.history_id)
+        logging.debug("Library ID: %s" % self.library_id)
         instance_source_data_folders = self.instance.libraries.get_folders(library_id=library_id)
 
         # Access folders via their absolute path
@@ -99,6 +98,7 @@ class RunWorkflow(speciesData.SpeciesData):
                     if ".gff" in d["name"]:
                         annotation_gff_ldda_id = d["ldda_id"]
 
+        # Minimum datasets required to populate Tripal views; the workflow will fail if these files are not assigned in the input file
         self.datasets["genome_file"] = genome_fasta_ldda_id
         self.datasets["gff_file"] = annotation_gff_ldda_id
         self.datasets["proteins_file"] = annotation_proteins_ldda_id
@@ -354,8 +354,14 @@ class RunWorkflow(speciesData.SpeciesData):
         self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"])
         self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
         self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])
-        self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"])
-        self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"])
+        try:
+            self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"])
+        except Exception as exc:
+            logging.debug("Interproscan file could not be loaded in history {0} ({1})".format(self.history_id, exc))
+        try:
+            self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"])
+        except Exception as exc:
+            logging.debug("Blastp file could not be loaded in history {0} ({1})".format(self.history_id, exc))
 
         _datasets = self.instance.datasets.get_datasets()
         with open(os.path.join(self.main_dir, "datasets_ids.json"), "w") as datasets_ids_outfile:
@@ -641,14 +647,6 @@ if __name__ == "__main__":
                 workflow_parameters[PROTEINS_FASTA_FILE] = {}
                 workflow_parameters[TRANSCRIPTS_FASTA_FILE] = {}
 
-                print(run_workflow_for_current_organism.org_id)
-                print(run_workflow_for_current_organism.genome_analysis_id)
-                print(run_workflow_for_current_organism.ogs_analysis_id)
-                print(hda_ids["genome_hda_id"])
-                print(hda_ids["gff_hda_id"])
-                print(hda_ids["proteins_hda_id"])
-                print(hda_ids["transcripts_hda_id"])
-
                 workflow_parameters[LOAD_FASTA_IN_CHADO] = {"organism": run_workflow_for_current_organism.org_id,
                                                             "analysis_id": run_workflow_for_current_organism.genome_analysis_id,
                                                             "do_update": "true"}
@@ -668,11 +666,11 @@ if __name__ == "__main__":
                 run_workflow_for_current_organism.datamap[PROTEINS_FASTA_FILE] = {"src": "hda", "id": hda_ids["proteins_hda_id"]}
                 run_workflow_for_current_organism.datamap[TRANSCRIPTS_FASTA_FILE] = {"src": "hda", "id": hda_ids["transcripts_hda_id"]}
 
-                run_workflow_for_current_organism.datamap = {}
-                run_workflow_for_current_organism.datamap[GENOME_FASTA_FILE] = {"src": "hda", "id":
-                    run_workflow_for_current_organism.datasets["genome_file"]}
-                run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "hda",
-                                                                       "id": hda_ids["gff_hda_id"]}
+                # run_workflow_for_current_organism.datamap = {}
+                # run_workflow_for_current_organism.datamap[GENOME_FASTA_FILE] = {"src": "hda", "id":
+                #     run_workflow_for_current_organism.datasets["genome_file"]}
+                # run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "hda",
+                #                                                        "id": hda_ids["gff_hda_id"]}
 
                 # Run the Chado load Tripal sync workflow with the parameters set above
                 run_workflow_for_current_organism.run_workflow(workflow_path=workflow,
-- 
GitLab