Delete blastdb.py

d4395ecb · Arthur Le Bars · c0111d89 · c0111d89
Commit d4395ecb authored 4 years ago by Arthur Le Bars
--- a/blastdb.py
+++ b/blastdb.py
-#!/usr/bin/env python
-
-from __future__ import print_function
-
-import argparse
-import collections
-import json
-import logging as log
-import os
-import sys
-
-from shutil import copyfile
-from subprocess import call
-
-
-class BlastBank:
-    """Description of one fasta file and of the blast bank built from it.
-
-    Besides storing the constructor arguments, an instance derives:
-
-    - ``org``: ``raw_org`` passed through ``prettify``.
-    - ``fasta``: ``data_dir_root/rel_path/fasta_file_name``.
-    - ``dest_path``: ``db_dir_root/path/rel_path/fasta_file_name`` with the
-      file extension stripped.
-    - ``title``: sanitized ``rel_path`` + ``_`` + file name without extension.
-    - ``pretty_name``: human readable label built from the organism and the
-      relative path, with a suffix guessed from the file name.
-    - ``first_id``: first line of the fasta file minus its first character
-      and trailing whitespace.
-    - ``sort_key``: string used to sort the list of banks.
-    """
-
-    def __init__(self, raw_org, data_dir_root, rel_path, fasta_file_name, db_dir_root, seq_type, path, is_multi):
-        self.raw_org = raw_org
-        self.org = prettify(raw_org)
-        self.data_dir_root = data_dir_root
-        self.rel_path = rel_path
-        self.fasta_file_name = fasta_file_name
-        self.db_dir_root = db_dir_root
-        self.seq_type = seq_type
-        # http://bipaa.genouest.org/sp/xxx/ Can be the same as raw_org, or
-        # something else when having multiple genomes.
-        self.path = path
-        self.is_multi = is_multi
-
-        self.fasta = os.path.join(data_dir_root, rel_path, fasta_file_name)
-        full_dest = os.path.join(db_dir_root, path, rel_path, fasta_file_name)
-        self.dest_path = os.path.splitext(full_dest)[0]
-        base_name = os.path.splitext(fasta_file_name)[0]
-        self.title = sanitize(rel_path + '_' + base_name)
-
-        if is_multi:
-            # Swap the first two path components before prettifying, but
-            # only when the path has more than two components.
-            parts = rel_path.split('/')
-            if len(parts) > 2:
-                parts[0], parts[1] = parts[1], parts[0]
-            self.pretty_name = prettify('/'.join(parts), True)
-        else:
-            self.pretty_name = self.org + ' ' + prettify(rel_path, False)
-
-        with open(self.fasta, 'r') as handle:
-            header = handle.readline()
-        self.first_id = header[1:].rstrip()
-
-        # Guess a label suffix from the (lower-cased) file name.
-        lowered = fasta_file_name.lower()
-        if seq_type == 'nucl':
-            if any(tag in lowered for tag in ('transcript', 'cdna')):
-                self.pretty_name += " transcripts"
-            elif 'cds' in lowered:
-                self.pretty_name += " CDS"
-        elif any(tag in lowered for tag in ('protein', 'pep', 'proteome')) or fasta_file_name.endswith('.faa'):
-            self.pretty_name += " proteins"
-
-        # Just a stupid/hacky string used for sorting bank list
-        prefix = 'a_' if 'genome' in self.title else 'b_'
-        self.sort_key = prefix + self.pretty_name
-
-    def __str__(self):
-        """Return the string form of a dict holding the main attributes."""
-        summary = {
-            'raw_org': self.raw_org,
-            'org': self.org,
-            'data_dir_root': self.data_dir_root,
-            'rel_path': self.rel_path,
-            'fasta_file_name': self.fasta_file_name,
-            'db_dir_root': self.db_dir_root,
-            'seq_type': self.seq_type,
-            'path': self.path,
-            'fasta': self.fasta,
-            'dest_path': self.dest_path,
-            'title': self.title,
-            'pretty_name': self.pretty_name,
-        }
-        return str(summary)
-
-
-def main(args):
-    """Build a blast bank for every fasta file below ``src_data``.
-
-    Walks ``src_data`` (relative to the current directory), creates one
-    ``BlastBank`` per fasta file, runs ``makeblastdb`` on each, then writes
-    ``blast/banks.yml`` and ``blast/links.yml``. In dry-run mode the two
-    yml documents are printed on stdout instead of being written.
-
-    :param args: parsed command line; reads ``multi_org``, ``dest``,
-        ``ignore``, ``dry_run``, ``no_parse_seqids`` and ``apollo``.
-    :raises Exception: if ``src_data`` is missing, or if a fasta file is
-        less than two directories deep in multi-organism mode.
-    """
-
-    genome_path = os.path.basename(os.getcwd())
-    data_dir_root = os.path.abspath(os.path.join('src_data'))
-    if not os.path.isdir(data_dir_root):
-        raise Exception("Could not find data dir: %s" % data_dir_root)
-
-    db_dir_root = os.path.abspath(args.dest)
-
-    ignore_list = ['func_annot', "apollo_source"]
-    if args.ignore:
-        ignore_list += args.ignore
-
-    fasta_extensions = ('.fasta', '.fa', '.fna', '.faa')
-
-    # Looking for files
-    log.info("Looking for fasta files in %s:" % data_dir_root)
-    banks = []
-    for root, dirs, files in os.walk(data_dir_root, followlinks=True):
-        rel_path = root[len(data_dir_root) + 1:]
-
-        if any(ign in rel_path for ign in ignore_list):
-            continue  # skip useless data
-
-        for filename in files:
-            # The name is taken from the resolved path, as before.
-            f = os.path.basename(os.path.realpath(os.path.join(root, filename)))
-            if not f.endswith(fasta_extensions):
-                continue
-
-            if args.multi_org:
-                path_parts = rel_path.split('/')
-                if len(path_parts) < 2:
-                    # Used to fail with a bare IndexError.
-                    raise Exception("Cannot guess the organism of '%s': in multi-organism mode fasta files must be at least two directories below %s" % (os.path.join(root, filename), data_dir_root))
-                genome_name = path_parts[1]
-            else:
-                genome_name = genome_path
-
-            if 'protein' in f or 'pep.' in f or 'proteome' in f or f.endswith('.faa'):
-                seq_type = 'prot'
-            else:
-                seq_type = 'nucl'
-            new_bank = BlastBank(genome_name, data_dir_root, rel_path, f, db_dir_root, seq_type, genome_path, args.multi_org)
-            log.info("Found '%s' of type: %s" % (new_bank.fasta, new_bank.seq_type))
-            banks.append(new_bank)
-
-    if not banks:
-        log.info("No fasta file found.")
-    else:
-        for b in banks:
-            makeblastdb(b, args.dry_run, args.no_parse_seqids)
-
-    banks.sort(key=lambda x: x.sort_key)
-    nuc_list = _bank_labels([b for b in banks if b.seq_type == 'nucl'])
-    prot_list = _bank_labels([b for b in banks if b.seq_type != 'nucl'])
-
-    yml_dir = os.path.abspath('blast')
-    yml_file_path = os.path.join(yml_dir, 'banks.yml')
-    links_file_path = os.path.join(yml_dir, 'links.yml')
-
-    log.info("List of bank names (to use in links.yml):")
-    write_titles(banks)
-
-    if args.dry_run:
-        log.info("Would write bank list in '%s'" % yml_file_path)
-        write_yml(sys.stdout, nuc_list, prot_list)
-        log.info("Would write links.yml in '%s'" % links_file_path)
-        write_links_yml(sys.stdout, banks, args.apollo)
-        return
-
-    log.info("Writing bank list in '%s'" % yml_file_path)
-    if not os.path.exists(yml_dir):
-        os.makedirs(yml_dir, mode=0o755)
-    # Context managers make sure both files are flushed and closed.
-    with open(yml_file_path, 'w') as yml_file:
-        write_yml(yml_file, nuc_list, prot_list)
-
-    log.info("Writing automatic links to links.yml in '%s'" % links_file_path)
-    if os.path.exists(links_file_path):
-        log.info("Making backup of previous links.yml to '%s'" % (links_file_path + '.back'))
-        copyfile(links_file_path, links_file_path + '.back')
-    with open(links_file_path, 'w') as links_yml_file:
-        write_links_yml(links_yml_file, banks, args.apollo)
-
-
-def _bank_labels(banks):
-    """Map each bank's dest_path to a label that is unique in ``banks``.
-
-    The label is the bank's pretty name; when that label is already taken
-    by a previous bank, the fasta file name is appended in parentheses.
-    The previous code looked the pretty name up among the dict keys (which
-    are dest paths), so the disambiguation never happened.
-    """
-    labels = collections.OrderedDict()
-    used = set()
-    for b in banks:
-        label = b.pretty_name
-        if label in used:
-            label = "%s (%s)" % (b.pretty_name, b.fasta_file_name)
-        used.add(label)
-        labels[b.dest_path] = label
-    return labels
-
-
-def write_yml(yml_file, nuc_list, prot_list):
-    """Print the ``genouest_blast`` bank list document to ``yml_file``.
-
-    :param yml_file: writable file-like object receiving the output.
-    :param nuc_list: mapping of bank path to label for nucleic banks.
-    :param prot_list: mapping of bank path to label for proteic banks.
-
-    Keys and values are serialized with ``json.dumps``; an empty mapping
-    is rendered as ``~``.
-    """
-
-    entry_separator = "\n                "
-
-    def render(entries):
-        if not entries:
-            return "~"
-        return entry_separator.join(['%s: %s' % (json.dumps(path), json.dumps(label)) for path, label in entries.items()])
-
-    lines = (
-        "genouest_blast:",
-        "    db_provider:",
-        "        list:",
-        "            nucleic:",
-        "                %s" % render(nuc_list),
-        "            proteic:",
-        "                %s" % render(prot_list),
-    )
-    for line in lines:
-        print(line, file=yml_file)
-
-
-def write_links_yml(yml_file, banks, apollo):
-    """Print one links.yml entry per bank to ``yml_file``.
-
-    :param yml_file: writable file-like object receiving the output.
-    :param banks: iterable of bank objects; reads ``pretty_name``,
-        ``seq_type``, ``org``, ``path`` and ``title``.
-    :param apollo: when true, genome banks also get an Apollo link.
-
-    Protein banks link to a polypeptide feature page, banks whose title
-    contains ``genome`` link to jbrowse, all other banks link to an mRNA
-    feature page. Every branch yields a link, so no bank is skipped.
-    """
-
-    for bank in banks:
-        print("", file=yml_file)
-        print("# %s" % (bank.pretty_name), file=yml_file)
-
-        if bank.seq_type == 'prot':
-            link = _feature_link(bank, 'polypeptide')
-        elif 'genome' in bank.title:
-            dataset_id = _jbrowse_dataset_id(bank.org)
-            # Plain concatenation: the URL contains a literal '%2F'.
-            link = '<a href="http://abims-gga.sb-roscoff.fr/sp/' + bank.path + '/jbrowse/?data=data%2F' + dataset_id + '&loc={id}{jbrowse_track}">{id}</a>'
-            if apollo:
-                link += ' <a href="http://abims-gga.sb-roscoff.fr/sp/' + bank.path + '/apollo/annotator/loadLink?loc={id}:1{apollo_track}">Apollo</a>'
-        else:
-            link = _feature_link(bank, 'mRNA')
-
-        print("%s:" % (bank.title), file=yml_file)
-        print("    db: '%s'" % (bank.title), file=yml_file)
-        print("    '*': '%s'" % (link), file=yml_file)
-
-
-def _feature_link(bank, feature_type):
-    """Return the feature page URL template for ``bank``.
-
-    The organism name is split on whitespace: the first two words are
-    joined with '/', any remaining words are appended joined with '-'.
-    """
-    spl = bank.org.split()
-    sp_str = '/'.join(spl[:2])
-    if len(spl) > 2:
-        sp_str += '-' + '-'.join(spl[2:])
-    return 'http://abims-gga.sb-roscoff.fr/sp/%s/feature/%s/%s/{id}' % (bank.path, sp_str, feature_type)
-
-
-def _jbrowse_dataset_id(org):
-    """Return the jbrowse dataset id derived from an organism name."""
-    dataset_id = org.lower()
-    spl = dataset_id.split()
-    if len(spl) == 2:  # Genus species => gspecies
-        return spl[0][:1] + spl[1]
-    if len(spl) == 3:  # Genus species strain1 => gsstrain1
-        return spl[0][:1] + spl[1][:1] + spl[2]
-    # Genus species some garbage => genus_species_some_garbage
-    return dataset_id.replace(' ', '_')
-
-
-def write_titles(banks):
-    """Print, for each bank, its pretty name, title and first sequence id."""
-
-    template = "'%s' -> '%s'      [%s]"
-    for entry in banks:
-        fields = (entry.pretty_name, entry.title, entry.first_id)
-        print(template % fields)
-
-
-def makeblastdb(bank, dry_run, no_parse_seqids):
-    """Run ``makeblastdb`` to format ``bank.fasta`` into ``bank.dest_path``.
-
-    :param bank: bank object; reads ``fasta``, ``dest_path``, ``seq_type``
-        and ``title``.
-    :param dry_run: when true, only log what would be done.
-    :param no_parse_seqids: when true, omit the ``-parse_seqids`` option.
-    :raises RuntimeError: if makeblastdb exits with a non-zero status or
-        is killed by a signal.
-
-    Exits the program with status 1 if the command cannot be started.
-    """
-    log.info("Formatting bank: %s  --->  %s" % (bank.fasta, bank.dest_path))
-    dest_dir = os.path.realpath(os.path.join(bank.dest_path, '..'))
-    if not os.path.exists(dest_dir):
-        log.info("Creating folder: %s" % dest_dir)
-        if not dry_run:
-            os.makedirs(dest_dir, mode=0o755)
-
-    # Argument list instead of a shell string: paths containing quotes or
-    # other shell metacharacters are passed through untouched.
-    cmd = ['makeblastdb', '-in', bank.fasta, '-dbtype', bank.seq_type]
-    if not no_parse_seqids:
-        cmd.append('-parse_seqids')
-    cmd += ['-out', bank.dest_path, '-title', bank.title]
-    log.info("Running: %s" % ' '.join(cmd))
-    if dry_run:
-        return
-
-    try:
-        retcode = call(cmd)
-    except OSError as e:
-        # Format the exception: concatenating it to a str raises TypeError.
-        print("Execution failed: %s" % e, file=sys.stderr)
-        sys.exit(1)
-
-    # subprocess reports death by signal as a negative return code.
-    if retcode < 0:
-        raise RuntimeError("makeblastdb was terminated by signal %d" % -retcode)
-    if retcode > 0:
-        raise RuntimeError("makeblastdb exited with status %d" % retcode)
-
-
-def prettify(name, capital=True):
-    """Turn a path-like name into a human readable label.
-
-    Underscores and slashes are replaced by spaces. When ``capital`` is
-    true the first character is upper-cased; an empty name is returned
-    unchanged instead of raising IndexError.
-    """
-    name = name.replace('_', ' ').replace('/', ' ')
-    if capital and name:
-        name = name[0].upper() + name[1:]
-
-    return name
-
-
-def sanitize(name):
-    """Return ``name`` lower-cased, with spaces and slashes turned into '_'."""
-    cleaned = name.lower()
-    for separator in (' ', '/'):
-        cleaned = cleaned.replace(separator, '_')
-
-    return cleaned
-
-
-if __name__ == '__main__':
-    # Command line entry point: parse the options, set up logging, run main().
-    parser = argparse.ArgumentParser(
-        description='Generate blast databanks and update blast forms.'
-    )
-    parser.add_argument("-v", "--verbose", help="Increase output verbosity.",
-                        action="store_true")
-    parser.add_argument("-d", "--dry-run", help="Dry run: no modification will be done, for testing purpose.",
-                        action="store_true")
-    parser.add_argument("-m", "--multi-org", help="Add this flag if there are multiple organisms in src_data.",
-                        action="store_true")
-    parser.add_argument("-a", "--apollo", help="Add this flag to generate links to apollo.",
-                        action="store_true")
-    parser.add_argument("-p", "--no-parse-seqids", help="Don't use the makeblastdb -parse_seqids option (use this in case you have strange looking sequence ids that make html files unreadable)",
-                        action="store_true")
-    parser.add_argument("--ignore", help='Files or directories to ignore', nargs='*')
-    parser.add_argument("dest", help="Destination directory (not including the genome name, should be mounted on compute nodes)")
-
-    args = parser.parse_args()
-    # basicConfig() does nothing once the root logger has a handler, so the
-    # level must be chosen before the one and only call; calling it a second
-    # time with DEBUG left --verbose without effect.
-    log.basicConfig(level=log.DEBUG if args.verbose else log.INFO)
-
-    main(args)