#!/usr/bin/env python from __future__ import print_function import argparse import collections import json import logging as log import os import sys from shutil import copyfile from subprocess import call class BlastBank: def __init__(self, raw_org, data_dir_root, rel_path, fasta_file_name, db_dir_root, seq_type, path, is_multi): self.raw_org = raw_org self.org = prettify(raw_org) self.data_dir_root = data_dir_root self.rel_path = rel_path self.fasta_file_name = fasta_file_name self.db_dir_root = db_dir_root self.seq_type = seq_type self.path = path # http://bipaa.genouest.org/sp/xxx/ Can be the same as raw_org, or something else when having multiple genomes. self.is_multi = is_multi self.fasta = os.path.join(data_dir_root, rel_path, fasta_file_name) self.dest_path = os.path.splitext(os.path.join(db_dir_root, self.path, rel_path, fasta_file_name))[0] self.title = sanitize(rel_path + '_' + os.path.splitext(self.fasta_file_name)[0]) if self.is_multi: fake_path = rel_path.split('/') if len(fake_path) > 2: fake_path = [fake_path[1]] + [fake_path[0]] + fake_path[2:] fake_path = '/'.join(fake_path) self.pretty_name = prettify(fake_path, True) else: self.pretty_name = self.org + ' ' + prettify(rel_path, False) with open(self.fasta, 'r') as f: self.first_id = f.readline()[1:].rstrip() if self.seq_type == 'nucl': if 'transcript' in self.fasta_file_name.lower() or 'cdna' in self.fasta_file_name.lower(): self.pretty_name += " transcripts" elif 'cds' in self.fasta_file_name.lower(): self.pretty_name += " CDS" else: if 'protein' in self.fasta_file_name.lower() or 'pep' in self.fasta_file_name.lower() or 'proteome' in self.fasta_file_name.lower() or self.fasta_file_name.endswith('.faa'): self.pretty_name += " proteins" # Just a stupid/hacky string used for sorting bank list self.sort_key = 'a_' if 'genome' in self.title else 'b_' self.sort_key += self.pretty_name def __str__(self): return str({ 'raw_org': self.raw_org, 'org': self.org, 'data_dir_root': self.data_dir_root, 'rel_path': self.rel_path, 'fasta_file_name': self.fasta_file_name, 'db_dir_root': self.db_dir_root, 'seq_type': self.seq_type, 'path': self.path, 'fasta': self.fasta, 'dest_path': self.dest_path, 'title': self.title, 'pretty_name': self.pretty_name, }) def main(args): genome_path = os.path.basename(os.getcwd()) if not args.multi_org: genome_name = genome_path data_dir_root = os.path.abspath(os.path.join('src_data')) if not os.path.isdir(data_dir_root): raise Exception("Could not find data dir: %s" % data_dir_root) db_dir_root = os.path.abspath(args.dest) ignore_list = ['func_annot', "apollo_source"] if args.ignore: ignore_list += args.ignore # Looking for files log.info("Looking for fasta files in %s:" % data_dir_root) banks = [] for root, dirs, files in os.walk(data_dir_root, followlinks=True): file_list = [os.path.realpath(os.path.join(root, filename)) for filename in files] rel_path = root[len(data_dir_root) + 1:] skip_current = False for ign in ignore_list: if ign in rel_path: skip_current = True if not skip_current: # skip useless data for f in file_list: f = os.path.basename(f) if f.endswith('.fasta') or f.endswith('.fa') or f.endswith('.fna') or f.endswith('.faa'): if args.multi_org: genome_name = rel_path.split('/')[1] if 'protein' in f or 'pep.' in f or 'proteome' in f or f.endswith('.faa'): seq_type = 'prot' else: seq_type = 'nucl' new_bank = BlastBank(genome_name, data_dir_root, rel_path, f, db_dir_root, seq_type, genome_path, args.multi_org) log.info("Found '%s' of type: %s" % (new_bank.fasta, new_bank.seq_type)) banks.append(new_bank) if not banks: log.info("No fasta file found.") else: for b in banks: makeblastdb(b, args.dry_run, args.no_parse_seqids) nuc_list = collections.OrderedDict() prot_list = collections.OrderedDict() banks.sort(key=lambda x: x.sort_key) for b in banks: if b.seq_type == 'nucl': if b.pretty_name not in nuc_list: nuc_list[b.dest_path] = b.pretty_name else: nuc_list[b.dest_path] = "%s (%s)" % (b.pretty_name, b.fasta_file_name) else: if b.pretty_name not in prot_list: prot_list[b.dest_path] = b.pretty_name else: prot_list[b.dest_path] = "%s (%s)" % (b.pretty_name, b.fasta_file_name) yml_dir = os.path.abspath('blast') yml_file_path = os.path.abspath(os.path.join(yml_dir, 'banks.yml')) links_file_path = os.path.abspath(os.path.join(yml_dir, 'links.yml')) if not args.dry_run: log.info("List of bank names (to use in links.yml):") write_titles(banks) log.info("Writing bank list in '%s'" % yml_file_path) if not os.path.exists(yml_dir): os.makedirs(yml_dir, mode=0o755) yml_file = open(yml_file_path, 'w') write_yml(yml_file, nuc_list, prot_list) log.info("Writing automatic links to links.yml in '%s'" % links_file_path) if os.path.exists(links_file_path): log.info("Making backup of previous links.yml to '%s'" % (links_file_path + '.back')) copyfile(links_file_path, links_file_path + '.back') links_yml_file = open(links_file_path, 'w') write_links_yml(links_yml_file, banks, args.apollo) else: log.info("List of bank names (to use in links.yml):") write_titles(banks) log.info("Would write bank list in '%s'" % yml_file_path) write_yml(sys.stdout, nuc_list, prot_list) log.info("Would write links.yml in '%s'" % links_file_path) write_links_yml(sys.stdout, banks, args.apollo) def write_yml(yml_file, nuc_list, prot_list): nuc = "~" prot = "~" if nuc_list: nuc = "\n ".join(['%s: %s' % (json.dumps(k), json.dumps(v)) for k, v in nuc_list.items()]) if prot_list: prot = "\n ".join(['%s: %s' % (json.dumps(k), json.dumps(v)) for k, v in prot_list.items()]) print("genouest_blast:", file=yml_file) print(" db_provider:", file=yml_file) print(" list:", file=yml_file) print(" nucleic:", file=yml_file) print(" %s" % nuc, file=yml_file) print(" proteic:", file=yml_file) print(" %s" % prot, file=yml_file) def write_links_yml(yml_file, banks, apollo): for bank in banks: print("", file=yml_file) print("# %s" % (bank.pretty_name), file=yml_file) link = '' if bank.seq_type == 'prot': spl = bank.org.split() if len(spl) > 2: sp_str = '/'.join(spl[:2]) sp_str += '-' + '-'.join(spl[2:]) else: sp_str = '/'.join(spl) link = 'http://abims-gga.sb-roscoff.fr/sp/%s/feature/%s/polypeptide/{id}' % (bank.path, sp_str) elif 'genome' in bank.title: dataset_id = bank.org.lower() spl = dataset_id.split() if len(spl) == 2: # Genus species => gspecies dataset_id = spl[0][:1] + spl[1] elif len(spl) == 3: # Genus species strain1 => gsstrain1 dataset_id = spl[0][:1] + spl[1][:1] + spl[2] else: # Genus species some garbage => genus_species_some_garbage dataset_id = dataset_id.replace(' ', '_') if apollo: link = '<a href="http://abims-gga.sb-roscoff.fr/sp/' + bank.path + '/jbrowse/?data=data%2F' + dataset_id + '&loc={id}{jbrowse_track}">{id}</a> <a href="http://abims-gga.sb-roscoff.fr/sp/' + bank.path + '/apollo/annotator/loadLink?loc={id}:1{apollo_track}">Apollo</a>' else: link = '<a href="http://abims-gga.sb-roscoff.fr/sp/' + bank.path + '/jbrowse/?data=data%2F' + dataset_id + '&loc={id}{jbrowse_track}">{id}</a>' else: spl = bank.org.split() if len(spl) > 2: sp_str = '/'.join(spl[:2]) sp_str += '-' + '-'.join(spl[2:]) else: sp_str = '/'.join(spl) link = 'http://abims-gga.sb-roscoff.fr/sp/%s/feature/%s/mRNA/{id}' % (bank.path, sp_str) if link: print("%s:" % (bank.title), file=yml_file) print(" db: '%s'" % (bank.title), file=yml_file) print(" '*': '%s'" % (link), file=yml_file) else: print("# Skipped", file=yml_file) def write_titles(banks): for bank in banks: print("'%s' -> '%s' [%s]" % (bank.pretty_name, bank.title, bank.first_id)) def makeblastdb(bank, dry_run, no_parse_seqids): log.info("Formatting bank: %s ---> %s" % (bank.fasta, bank.dest_path)) dest_dir = os.path.realpath(os.path.join(bank.dest_path, '..')) if not os.path.exists(dest_dir): log.info("Creating folder: %s" % dest_dir) if not dry_run: os.makedirs(dest_dir, mode=0o755) parse = "-parse_seqids" if no_parse_seqids: parse = "" cmd = "makeblastdb -in '%s' -dbtype '%s' %s -out '%s' -title '%s'" % (bank.fasta, bank.seq_type, parse, bank.dest_path, bank.title) log.info("Running: %s" % cmd) if not dry_run: try: retcode = call(cmd, shell=True) if retcode != 0: raise RuntimeError("Child was terminated by signal " + str(retcode)) except OSError as e: print("Execution failed:" + e, file=sys.stderr) sys.exit(1) def prettify(name, capital=True): name = name.replace('_', ' ') name = name.replace('/', ' ') if capital: name = name[0].upper() + name[1:] return name def sanitize(name): name = name.lower() name = name.replace(' ', '_') name = name.replace('/', '_') return name if __name__ == '__main__': parser = argparse.ArgumentParser( description='Generate blast databanks and update blast forms.' ) parser.add_argument("-v", "--verbose", help="Increase output verbosity.", action="store_true") parser.add_argument("-d", "--dry-run", help="Dry run: no modification will be done, for testing purpose.", action="store_true") parser.add_argument("-m", "--multi-org", help="Add this flag if there are multiple organisms in src_data.", action="store_true") parser.add_argument("-a", "--apollo", help="Add this flag to generate links to apollo.", action="store_true") parser.add_argument("-p", "--no-parse-seqids", help="Don't use the makeblastdb -parse_seqids option (use this in case you have strange looking sequence ids that make html files unreadable)", action="store_true") parser.add_argument("--ignore", help='Files or directories to ignore', nargs='*') parser.add_argument("dest", help="Destination directory (not including the genome name, should be mounted on compute nodes)") args = parser.parse_args() log.basicConfig(level=log.INFO) if args.verbose: log.basicConfig(level=log.DEBUG) main(args)