diff --git a/gga_get_data.py b/gga_get_data.py
index 07c1b0414d852a1da009480931be900cc723eefa..339daf5e3ca0ea946a5d9105f1d198b194197a8f 100644
--- a/gga_get_data.py
+++ b/gga_get_data.py
@@ -23,7 +23,7 @@ import speciesData
 
 """
 gga_get_data.py
 
-Usage: $ python3 gga_get_data.py -i input_example.yml --config config.yml [OPTIONS]
+Usage: $ python3 gga_get_data.py -i input_example.yml [OPTIONS]
 
 """
@@ -87,20 +87,21 @@ class GetData(speciesData.SpeciesData):
 
         # Copy datasets in the organism src_data dir tree correct folder
         for k, v in datasets_to_get.items():
-            if k in genome_datasets:
-                logging.info("Copying {0} into {1}".format(v, organism_genome_dir))
-                try:
-                    shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, os.path.basename(v)))
-                except Exception as exc:
-                    logging.warning("Could not copy {1} ({2})".format(v, exc))
-            elif k in annotation_datasets:
-                logging.info("Copying {0} into {1}".format(v, organism_annotation_dir))
-                try:
-                    shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, os.path.basename(v)))
-                except Exception as exc:
-                    logging.warning("Could not copy {1} ({2})".format(v, exc))
-            else:
-                pass
+            if v:  # If dataset is not present in input file, skip copy
+                if k in genome_datasets:
+                    logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir))
+                    try:
+                        shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, os.path.basename(v)))
+                    except Exception as exc:
+                        logging.warning("Could not copy {0} ({1}) - {2}".format(k, v, exc))
+                elif k in annotation_datasets:
+                    logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
+                    try:
+                        shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, os.path.basename(v)))
+                    except Exception as exc:
+                        logging.warning("Could not copy {0} ({1}) - {2}".format(k, v, exc))
+                else:
+                    pass
 
 
         os.chdir(self.main_dir)
@@ -119,10 +120,6 @@ if __name__ == "__main__":
                         help="Increase output verbosity",
                         action="store_false")
 
-    parser.add_argument("--config",
-                        type=str,
-                        help="Config path, default to the 'config' file inside the script repository")
-
     parser.add_argument("--main-directory",
                         type=str,
                         help="Where the stack containers will be located, defaults to working directory")
@@ -135,12 +132,6 @@ if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
     logging.getLogger("urllib3").setLevel(logging.WARNING)
 
-    # Parsing the config file if provided, using the default config otherwise
-    if not args.config:
-        args.config = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), "config")
-    else:
-        args.config = os.path.abspath(args.config)
-
     if not args.main_directory:
         args.main_directory = os.getcwd()
     else:
@@ -162,26 +153,14 @@ if __name__ == "__main__":
                                                                get_data_for_current_species.genus_species +
                                                                "/")
 
-    # Parse the config yaml file
-    get_data_for_current_species.config = utilities.parse_config(args.config)
-
     # Change serexec permissions in repo
     try:
         os.chmod("%s/serexec" % get_data_for_current_species.script_dir, 0o0777)
     except PermissionError:
         logging.critical("Cannot access %s, exiting" % get_data_for_current_species.script_dir)
 
-    # Load config file
-    get_data_for_current_species.config = utilities.parse_config(args.config)
-
     # Retrieve datasets
     logging.info("Finding and copying datasets for %s" % get_data_for_current_species.full_name)
     get_data_for_current_species.get_source_data_files_from_path()
     logging.info("Sucessfully copied datasets for %s" % get_data_for_current_species.full_name)
-
-    # Format fasta headers (proteins)
-    # logging.info("Formatting fasta files headers %s " % get_data_for_current_species.full_name)
-    # get_data_for_current_species.batch_modify_fasta_headers()
-    # logging.info("Successfully formatted files headers %s " % get_data_for_current_species.full_name)
-
     logging.info("Data successfully copied in src_data for %s" % get_data_for_current_species.full_name)
diff --git a/run_workflow_phaeoexplorer.py b/run_workflow_phaeoexplorer.py
index ead3c936153421ee3b5ad000712a256df72a605d..70f6387e49591e7ea61949e632d814f43b0eeb43 100644
--- a/run_workflow_phaeoexplorer.py
+++ b/run_workflow_phaeoexplorer.py
@@ -64,11 +64,10 @@ class RunWorkflow(speciesData.SpeciesData):
 
         self.set_get_history()
 
-        logging.debug("History ID: " + self.history_id)
+        logging.debug("History ID: %s" % self.history_id)
         libraries = self.instance.libraries.get_libraries()  # normally only one library
-        library_id = self.instance.libraries.get_libraries()[0]["id"]  # project data folder/library
-        logging.debug("Library ID: " + self.history_id)
+        logging.debug("Library ID: %s" % self.library_id)
         instance_source_data_folders = self.instance.libraries.get_folders(library_id=library_id)
 
         # Access folders via their absolute path
 
@@ -99,6 +98,7 @@ class RunWorkflow(speciesData.SpeciesData):
             if ".gff" in d["name"]:
                 annotation_gff_ldda_id = d["ldda_id"]
 
+        # Minimum datasets to populate tripal views --> will not work if these files are not assigned in the input file
         self.datasets["genome_file"] = genome_fasta_ldda_id
         self.datasets["gff_file"] = annotation_gff_ldda_id
         self.datasets["proteins_file"] = annotation_proteins_ldda_id
@@ -354,8 +354,14 @@ class RunWorkflow(speciesData.SpeciesData):
         self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"])
         self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
         self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])
-        self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"])
-        self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"])
+        try:
+            self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"])
+        except Exception as exc:
+            logging.debug("Interproscan file could not be loaded in history {0} ({1})".format(self.history_id, exc))
+        try:
+            self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"])
+        except Exception as exc:
+            logging.debug("Blastp file could not be loaded in history {0} ({1})".format(self.history_id, exc))
 
         _datasets = self.instance.datasets.get_datasets()
         with open(os.path.join(self.main_dir, "datasets_ids.json"), "w") as datasets_ids_outfile:
@@ -641,14 +647,6 @@ if __name__ == "__main__":
         workflow_parameters[PROTEINS_FASTA_FILE] = {}
         workflow_parameters[TRANSCRIPTS_FASTA_FILE] = {}
 
-        print(run_workflow_for_current_organism.org_id)
-        print(run_workflow_for_current_organism.genome_analysis_id)
-        print(run_workflow_for_current_organism.ogs_analysis_id)
-        print(hda_ids["genome_hda_id"])
-        print(hda_ids["gff_hda_id"])
-        print(hda_ids["proteins_hda_id"])
-        print(hda_ids["transcripts_hda_id"])
-
         workflow_parameters[LOAD_FASTA_IN_CHADO] = {"organism": run_workflow_for_current_organism.org_id,
                                                     "analysis_id": run_workflow_for_current_organism.genome_analysis_id,
                                                     "do_update": "true"}
@@ -668,11 +666,11 @@ if __name__ == "__main__":
         run_workflow_for_current_organism.datamap[PROTEINS_FASTA_FILE] = {"src": "hda", "id": hda_ids["proteins_hda_id"]}
         run_workflow_for_current_organism.datamap[TRANSCRIPTS_FASTA_FILE] = {"src": "hda", "id": hda_ids["transcripts_hda_id"]}
 
-        run_workflow_for_current_organism.datamap = {}
-        run_workflow_for_current_organism.datamap[GENOME_FASTA_FILE] = {"src": "hda", "id":
-            run_workflow_for_current_organism.datasets["genome_file"]}
-        run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "hda",
-                                                               "id": hda_ids["gff_hda_id"]}
+        # run_workflow_for_current_organism.datamap = {}
+        # run_workflow_for_current_organism.datamap[GENOME_FASTA_FILE] = {"src": "hda", "id":
+        #     run_workflow_for_current_organism.datasets["genome_file"]}
+        # run_workflow_for_current_organism.datamap[GFF_FILE] = {"src": "hda",
+        #                                                        "id": hda_ids["gff_hda_id"]}
 
         # Run the Chado load Tripal sync workflow with the parameters set above
         run_workflow_for_current_organism.run_workflow(workflow_path=workflow,