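speciesData: assign the correct species_folder_name attribute (lowercased genus, species and sex, without strain); remove the automatic dataset search and commented-out code from gga_get_data.py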

c825e6be · Arthur Le Bars · b33c4b5e · c825e6be · c825e6be
Commit c825e6be authored 4 years ago by Arthur Le Bars
--- a/gga_get_data.py
+++ b/gga_get_data.py
@@ -53,71 +53,6 @@ class GetData(speciesData.SpeciesData):
        return 1


-    # def batch_modify_fasta_headers(self):
-    #     """
-    #     Change the fasta headers before integration, so that the indexing tool in galaxy interprets the headers
-    #     correctly and doesn't throw an error
-
-    #     The function will use the class attribute "source_datasets", pointing to files in the galaxy
-    #     library to find the fasta files that need their headers formatted
-
-    #     :return:
-    #     """
-
-    #     proteins_file = None
-    #     proteins_outfile = None
-    #     annotation_dir = None
-    #     organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
-
-    #     self.goto_species_dir()
-
-    #     for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
-    #         if "annotation" in d and self.species_folder_name in d and self.ogs_version in d:
-    #             for f in os.listdir(d):
-    #                 if "proteins" in f:
-    #                     proteins_file = os.path.join(d, f)
-    #                     proteins_outfile = os.path.join(d, "outfile_proteins.fa")
-    #                     annotation_dir = os.path.abspath(d)
-    #     # Formatting the headers
-    #     if proteins_file is not None:
-    #         self.format_fasta_headers(infile=proteins_file,
-    #                                   outfile=proteins_outfile,
-    #                                   pattern="^>mRNA",
-    #                                   repl=">protein")
-    #         if os.path.exists(annotation_dir + "/outfile_proteins.fa"):
-    #             subprocess.call(["mv", annotation_dir + "/outfile_proteins.fa", proteins_file],
-    #                            stdout=subprocess.PIPE,
-    #                            stderr=subprocess.PIPE,
-    #                            cwd=annotation_dir)
-    #             subprocess.call(["rm", annotation_dir + "/outfile_proteins.fa"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=annotation_dir)
-
-    #     else:
-    #         logging.warning("Skipping proteins fasta headers formatting (FileNotFound)")
-
-    # @staticmethod
-    # def format_fasta_headers(infile, outfile, pattern, repl):
-    #     """
-    #     Format the fasta headers of a given file, given a matching pattern and a replacement string
-
-    #     :param infile:
-    #     :param outfile:
-    #     :param pattern:
-    #     :param repl:
-    #     :return:
-    #     """
-
-    #     infile = open(infile, 'r')
-    #     outfile = open(outfile, 'w')
-
-    #     lines = infile.readlines()
-
-    #     for line in lines:
-    #         line_out = re.sub(pattern, repl, line)
-    #         outfile.write(line_out)
-
-    #     infile.close()
-    #     outfile.close()
-
    def get_source_data_files_from_path(self):
        """
        Find source data files in the parent_directory
@@ -154,16 +89,6 @@ class GetData(speciesData.SpeciesData):
        search_excluded_datasets = ["interpro_path", "orthofinder_path", "blastp_path", "blastx_path"]  
        # These datasets will not be searched if missing in the input file

-        # Automatically find the "main" genome and annotation datasets (genome, transcripts, proteome, gff)
-        # Triggers when a dataset path is empty
-        # Exits after a single iteration to not replicate the search
-        # This is VERY specific to phaeoexplorer, as the search depends on how the folders and datasets are named 
-        for k, v in datasets_to_get.items():
-            if k not in search_excluded_datasets and v == "":
-                print("Dataset not specified (%s), searching datasets" % k)
-                self.find_dataset_from_source_data_parent_dir()
-                break
-
        # Copy dataset in the organism src_data dir tree correct folder
        for k, v in datasets_to_get.items():
            if k in genome_datasets:
@@ -177,50 +102,6 @@ class GetData(speciesData.SpeciesData):

        os.chdir(self.main_dir)

-    def find_dataset_from_source_data_parent_dir(self):
-        """
-        "Fail case" func if a dataset isn't specified in the input file. This func will search the specified "parent directory" for files matching
-        the current species
-        
-        Highly specific to the phaeoexplorer project!
-        Doesn't work for interpro, orthofinder and blast datasets at the moment, those absolutely need to be written in the input file 
-
-        :return
-        """
-
-        organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.genome_version))
-        organism_genome_dir = os.path.abspath("./src_data/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version))
-
-        for dirpath, dirnames, files in os.walk(self.source_data_dir):
-            if self.genus_upper and self.species in str(dirpath):
-                for f in files:
-                    if "Contaminants" not in str(f):
-                        try:
-                            if fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + ".fa"):
-                                logging.info("Genome assembly file found - " + str(f))
-                                self.genome_path = os.path.abspath(f)
-                            elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + ".gff"):
-                                logging.info("GFF file - " + str(f))
-                                self.gff_path = os.path.abspath(f)
-                            elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + "_transcripts-gff.fa"):
-                                logging.info("Transcripts file - " + str(f))
-                                self.transcripts_path = os.path.abspath(f)
-                            elif fnmatch.fnmatch(f, "*" + self.species[1:] + "_" + self.sex.upper() + "_proteins.fa"):
-                                logging.info("Proteins file - " + str(f))
-                                self.proteins_path = os.path.abspath(f)
-                        except Exception as exc:
-                            logging.debug("Error raised %s" % exc)
-
-    def generate_blast_banks(self):
-        """
-        TODO
-        Do we need to generate blast banks?
-
-        :return:
-        """
-
-        return 0
-

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction "

--- a/speciesData.py
+++ b/speciesData.py
@@ -49,7 +49,6 @@ class SpeciesData:
        self.genus_lowercase = self.genus[0].lower() + self.genus[1:]
        self.genus_uppercase = self.genus[0].upper() + self.genus[1:]
        self.chado_species_name = "{0} {1}".format(self.species, self.sex)
-        self.species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex])
        self.full_name = ' '.join(utilities.filter_empty_not_empty_items([self.genus_uppercase, self.species, self.strain, self.sex])["not_empty"])
        self.full_name_lowercase = self.full_name.lower()
        self.abbreviation = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase[0], self.species, self.strain, self.sex])["not_empty"])
@@ -79,6 +78,6 @@ class SpeciesData:
            self.source_data_dir = "/shared/projects/phaeoexplorer/"  # Testing path for phaeoexplorer data - TODO: REMOVE IN PRODUCTION
        else:
            self.source_data_dir = parameters_dictionary["data"]["parent_directory"]
-        self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase, self.species, self.strain, self.sex])["not_empty"])
+        self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase.lower(), self.species.lower(), self.sex.lower()])["not_empty"])
        self.existing_folders_cache = {}
        self.bam_metadata_cache = {}