diff --git a/gga_get_data.py b/gga_get_data.py index 5724b9b42c5a9073011831929c4dbdab4e08ac5f..f936657600ce098e1a9359ff524390e3467ffc70 100644 --- a/gga_get_data.py +++ b/gga_get_data.py @@ -53,71 +53,6 @@ class GetData(speciesData.SpeciesData): return 1 - # def batch_modify_fasta_headers(self): - # """ - # Change the fasta headers before integration, so that the indexing tool in galaxy interprets the headers - # correctly and doesn't throw an error - - # The function will use the class attribute "source_datasets", pointing to files in the galaxy - # library to find the fasta files that need their headers formatted - - # :return: - # """ - - # proteins_file = None - # proteins_outfile = None - # annotation_dir = None - # organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version)) - - # self.goto_species_dir() - - # for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]: - # if "annotation" in d and self.species_folder_name in d and self.ogs_version in d: - # for f in os.listdir(d): - # if "proteins" in f: - # proteins_file = os.path.join(d, f) - # proteins_outfile = os.path.join(d, "outfile_proteins.fa") - # annotation_dir = os.path.abspath(d) - # # Formatting the headers - # if proteins_file is not None: - # self.format_fasta_headers(infile=proteins_file, - # outfile=proteins_outfile, - # pattern="^>mRNA", - # repl=">protein") - # if os.path.exists(annotation_dir + "/outfile_proteins.fa"): - # subprocess.call(["mv", annotation_dir + "/outfile_proteins.fa", proteins_file], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE, - # cwd=annotation_dir) - # subprocess.call(["rm", annotation_dir + "/outfile_proteins.fa"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=annotation_dir) - - # else: - # logging.warning("Skipping proteins fasta headers formatting (FileNotFound)") - - # @staticmethod - # def format_fasta_headers(infile, outfile, pattern, repl): - # """ - # Format the fasta headers of a given file, given a matching pattern and a replacement string - - # :param infile: - # :param outfile: - # :param pattern: - # :param repl: - # :return: - # """ - - # infile = open(infile, 'r') - # outfile = open(outfile, 'w') - - # lines = infile.readlines() - - # for line in lines: - # line_out = re.sub(pattern, repl, line) - # outfile.write(line_out) - - # infile.close() - # outfile.close() - def get_source_data_files_from_path(self): """ Find source data files in the parent_directory @@ -154,16 +89,6 @@ class GetData(speciesData.SpeciesData): search_excluded_datasets = ["interpro_path", "orthofinder_path", "blastp_path", "blastx_path"] # These datasets will not be searched if missing in the input file - # Automatically find the "main" genome and annotation datasets (genome, transcripts, proteome, gff) - # Triggers when a dataset path is empty - # Exits after a single iteration to not replicate the search - # This is VERY specific to phaeoexplorer, as the search depends on how the folders and datasets are named - for k, v in datasets_to_get.items(): - if k not in search_excluded_datasets and v == "": - print("Dataset not specified (%s), searching datasets" % k) - self.find_dataset_from_source_data_parent_dir() - break - # Copy dataset in the organism src_data dir tree correct folder for k, v in datasets_to_get.items(): if k in genome_datasets: @@ -177,50 +102,6 @@ class GetData(speciesData.SpeciesData): os.chdir(self.main_dir) - def find_dataset_from_source_data_parent_dir(self): - """ - "Fail case" func if a dataset isn't specified in the input file. This func will search the specified "parent directory" for files matching - the current species - - Highly specific to the phaeoexplorer project! - Doesn't work for interpro, orthofinder and blast datasets at the moment, those absolutely need to be written in the input file - - :return - """ - - organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.genome_version)) - organism_genome_dir = os.path.abspath("./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version)) - - for dirpath, dirnames, files in os.walk(self.source_data_dir): - if self.genus_upper and self.species in str(dirpath): - for f in files: - if "Contaminants" not in str(f): - try: - if fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + ".fa"): - logging.info("Genome assembly file found - " + str(f)) - self.genome_path = os.path.abspath(f) - elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + ".gff"): - logging.info("GFF file - " + str(f)) - self.gff_path = os.path.abspath(f) - elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + "_transcripts-gff.fa"): - logging.info("Transcripts file - " + str(f)) - self.transcripts_path = os.path.abspath(f) - elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + "_proteins.fa"): - logging.info("Proteins file - " + str(f)) - self.proteins_path = os.path.abspath(f) - except Exception as exc: - logging.debug("Error raised %s" % exc) - - def generate_blast_banks(self): - """ - TODO - Do we need to generate blast banks? - - :return: - """ - - return 0 - if __name__ == "__main__": parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction " diff --git a/speciesData.py b/speciesData.py index db78196a11efa6080f192ade6c0ca42611841189..a099078fa620fa73232c101f14ebbd43da9fd298 100644 --- a/speciesData.py +++ b/speciesData.py @@ -49,7 +49,6 @@ class SpeciesData: self.genus_lowercase = self.genus[0].lower() + self.genus[1:] self.genus_uppercase = self.genus[0].upper() + self.genus[1:] self.chado_species_name = "{0} {1}".format(self.species, self.sex) - self.species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex]) self.full_name = ' '.join(utilities.filter_empty_not_empty_items([self.genus_uppercase, self.species, self.strain, self.sex])["not_empty"]) self.full_name_lowercase = self.full_name.lower() self.abbreviation = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase[0], self.species, self.strain, self.sex])["not_empty"]) @@ -79,6 +78,6 @@ class SpeciesData: self.source_data_dir = "/shared/projects/phaeoexplorer/" # Testing path for phaeoexplorer data - TODO: REMOVE IN PRODUCTION else: self.source_data_dir = parameters_dictionary["data"]["parent_directory"] - self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase, self.species, self.strain, self.sex])["not_empty"]) + self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase.lower(), self.species.lower(), self.sex.lower()])["not_empty"]) self.existing_folders_cache = {} self.bam_metadata_cache = {}