-
Arthur Le Bars authoredeaa18fbb
blastdb.py 11.68 KiB
#!/usr/bin/env python
from __future__ import print_function
import argparse
import collections
import json
import logging as log
import os
import sys
from shutil import copyfile
from subprocess import call
class BlastBank:
def __init__(self, raw_org, data_dir_root, rel_path, fasta_file_name, db_dir_root, seq_type, path, is_multi):
self.raw_org = raw_org
self.org = prettify(raw_org)
self.data_dir_root = data_dir_root
self.rel_path = rel_path
self.fasta_file_name = fasta_file_name
self.db_dir_root = db_dir_root
self.seq_type = seq_type
self.path = path # http://bipaa.genouest.org/sp/xxx/ Can be the same as raw_org, or something else when having multiple genomes.
self.is_multi = is_multi
self.fasta = os.path.join(data_dir_root, rel_path, fasta_file_name)
self.dest_path = os.path.splitext(os.path.join(db_dir_root, self.path, rel_path, fasta_file_name))[0]
self.title = sanitize(rel_path + '_' + os.path.splitext(self.fasta_file_name)[0])
if self.is_multi:
fake_path = rel_path.split('/')
if len(fake_path) > 2:
fake_path = [fake_path[1]] + [fake_path[0]] + fake_path[2:]
fake_path = '/'.join(fake_path)
self.pretty_name = prettify(fake_path, True)
else:
self.pretty_name = self.org + ' ' + prettify(rel_path, False)
with open(self.fasta, 'r') as f:
self.first_id = f.readline()[1:].rstrip()
if self.seq_type == 'nucl':
if 'transcript' in self.fasta_file_name.lower() or 'cdna' in self.fasta_file_name.lower():
self.pretty_name += " transcripts"
elif 'cds' in self.fasta_file_name.lower():
self.pretty_name += " CDS"
else:
if 'protein' in self.fasta_file_name.lower() or 'pep' in self.fasta_file_name.lower() or 'proteome' in self.fasta_file_name.lower() or self.fasta_file_name.endswith('.faa'):
self.pretty_name += " proteins"
# Just a stupid/hacky string used for sorting bank list
self.sort_key = 'a_' if 'genome' in self.title else 'b_'
self.sort_key += self.pretty_name
def __str__(self):
return str({
'raw_org': self.raw_org,
'org': self.org,
'data_dir_root': self.data_dir_root,
'rel_path': self.rel_path,
'fasta_file_name': self.fasta_file_name,
'db_dir_root': self.db_dir_root,
'seq_type': self.seq_type,
'path': self.path,
'fasta': self.fasta,
'dest_path': self.dest_path,
'title': self.title,
'pretty_name': self.pretty_name,
})
def main(args):
genome_path = os.path.basename(os.getcwd())
if not args.multi_org:
genome_name = genome_path
data_dir_root = os.path.abspath(os.path.join('src_data'))
if not os.path.isdir(data_dir_root):
raise Exception("Could not find data dir: %s" % data_dir_root)
db_dir_root = os.path.abspath(args.dest)
ignore_list = ['func_annot', "apollo_source"]
if args.ignore:
ignore_list += args.ignore
# Looking for files
log.info("Looking for fasta files in %s:" % data_dir_root)
banks = []
for root, dirs, files in os.walk(data_dir_root, followlinks=True):
file_list = [os.path.realpath(os.path.join(root, filename)) for filename in files]
rel_path = root[len(data_dir_root) + 1:]
skip_current = False
for ign in ignore_list:
if ign in rel_path:
skip_current = True
if not skip_current: # skip useless data
for f in file_list:
f = os.path.basename(f)
if f.endswith('.fasta') or f.endswith('.fa') or f.endswith('.fna') or f.endswith('.faa'):
if args.multi_org:
genome_name = rel_path.split('/')[1]
if 'protein' in f or 'pep.' in f or 'proteome' in f or f.endswith('.faa'):
seq_type = 'prot'
else:
seq_type = 'nucl'
new_bank = BlastBank(genome_name, data_dir_root, rel_path, f, db_dir_root, seq_type, genome_path, args.multi_org)
log.info("Found '%s' of type: %s" % (new_bank.fasta, new_bank.seq_type))
banks.append(new_bank)
if not banks:
log.info("No fasta file found.")
else:
for b in banks:
makeblastdb(b, args.dry_run, args.no_parse_seqids)
nuc_list = collections.OrderedDict()
prot_list = collections.OrderedDict()
banks.sort(key=lambda x: x.sort_key)
for b in banks:
if b.seq_type == 'nucl':
if b.pretty_name not in nuc_list:
nuc_list[b.dest_path] = b.pretty_name
else:
nuc_list[b.dest_path] = "%s (%s)" % (b.pretty_name, b.fasta_file_name)
else:
if b.pretty_name not in prot_list:
prot_list[b.dest_path] = b.pretty_name
else:
prot_list[b.dest_path] = "%s (%s)" % (b.pretty_name, b.fasta_file_name)
yml_dir = os.path.abspath('blast')
yml_file_path = os.path.abspath(os.path.join(yml_dir, 'banks.yml'))
links_file_path = os.path.abspath(os.path.join(yml_dir, 'links.yml'))
if not args.dry_run:
log.info("List of bank names (to use in links.yml):")
write_titles(banks)
log.info("Writing bank list in '%s'" % yml_file_path)
if not os.path.exists(yml_dir):
os.makedirs(yml_dir, mode=0o755)
yml_file = open(yml_file_path, 'w')
write_yml(yml_file, nuc_list, prot_list)
log.info("Writing automatic links to links.yml in '%s'" % links_file_path)
if os.path.exists(links_file_path):
log.info("Making backup of previous links.yml to '%s'" % (links_file_path + '.back'))
copyfile(links_file_path, links_file_path + '.back')
links_yml_file = open(links_file_path, 'w')
write_links_yml(links_yml_file, banks, args.apollo)
else:
log.info("List of bank names (to use in links.yml):")
write_titles(banks)
log.info("Would write bank list in '%s'" % yml_file_path)
write_yml(sys.stdout, nuc_list, prot_list)
log.info("Would write links.yml in '%s'" % links_file_path)
write_links_yml(sys.stdout, banks, args.apollo)
def write_yml(yml_file, nuc_list, prot_list):
nuc = "~"
prot = "~"
if nuc_list:
nuc = "\n ".join(['%s: %s' % (json.dumps(k), json.dumps(v)) for k, v in nuc_list.items()])
if prot_list:
prot = "\n ".join(['%s: %s' % (json.dumps(k), json.dumps(v)) for k, v in prot_list.items()])
print("genouest_blast:", file=yml_file)
print(" db_provider:", file=yml_file)
print(" list:", file=yml_file)
print(" nucleic:", file=yml_file)
print(" %s" % nuc, file=yml_file)
print(" proteic:", file=yml_file)
print(" %s" % prot, file=yml_file)
def write_links_yml(yml_file, banks, apollo):
for bank in banks:
print("", file=yml_file)
print("# %s" % (bank.pretty_name), file=yml_file)
link = ''
if bank.seq_type == 'prot':
spl = bank.org.split()
if len(spl) > 2:
sp_str = '/'.join(spl[:2])
sp_str += '-' + '-'.join(spl[2:])
else:
sp_str = '/'.join(spl)
link = 'http://abims-gga.sb-roscoff.fr/sp/%s/feature/%s/polypeptide/{id}' % (bank.path, sp_str)
elif 'genome' in bank.title:
dataset_id = bank.org.lower()
spl = dataset_id.split()
if len(spl) == 2: # Genus species => gspecies
dataset_id = spl[0][:1] + spl[1]
elif len(spl) == 3: # Genus species strain1 => gsstrain1
dataset_id = spl[0][:1] + spl[1][:1] + spl[2]
else: # Genus species some garbage => genus_species_some_garbage
dataset_id = dataset_id.replace(' ', '_')
if apollo:
link = '<a href="http://abims-gga.sb-roscoff.fr/sp/' + bank.path + '/jbrowse/?data=data%2F' + dataset_id + '&loc={id}{jbrowse_track}">{id}</a> <a href="http://abims-gga.sb-roscoff.fr/sp/' + bank.path + '/apollo/annotator/loadLink?loc={id}:1{apollo_track}">Apollo</a>'
else:
link = '<a href="http://abims-gga.sb-roscoff.fr/sp/' + bank.path + '/jbrowse/?data=data%2F' + dataset_id + '&loc={id}{jbrowse_track}">{id}</a>'
else:
spl = bank.org.split()
if len(spl) > 2:
sp_str = '/'.join(spl[:2])
sp_str += '-' + '-'.join(spl[2:])
else:
sp_str = '/'.join(spl)
link = 'http://abims-gga.sb-roscoff.fr/sp/%s/feature/%s/mRNA/{id}' % (bank.path, sp_str)
if link:
print("%s:" % (bank.title), file=yml_file)
print(" db: '%s'" % (bank.title), file=yml_file)
print(" '*': '%s'" % (link), file=yml_file)
else:
print("# Skipped", file=yml_file)
def write_titles(banks):
for bank in banks:
print("'%s' -> '%s' [%s]" % (bank.pretty_name, bank.title, bank.first_id))
def makeblastdb(bank, dry_run, no_parse_seqids):
log.info("Formatting bank: %s ---> %s" % (bank.fasta, bank.dest_path))
dest_dir = os.path.realpath(os.path.join(bank.dest_path, '..'))
if not os.path.exists(dest_dir):
log.info("Creating folder: %s" % dest_dir)
if not dry_run:
os.makedirs(dest_dir, mode=0o755)
parse = "-parse_seqids"
if no_parse_seqids:
parse = ""
cmd = "makeblastdb -in '%s' -dbtype '%s' %s -out '%s' -title '%s'" % (bank.fasta, bank.seq_type, parse, bank.dest_path, bank.title)
log.info("Running: %s" % cmd)
if not dry_run:
try:
retcode = call(cmd, shell=True)
if retcode != 0:
raise RuntimeError("Child was terminated by signal " + str(retcode))
except OSError as e:
print("Execution failed:" + e, file=sys.stderr)
sys.exit(1)
def prettify(name, capital=True):
name = name.replace('_', ' ')
name = name.replace('/', ' ')
if capital:
name = name[0].upper() + name[1:]
return name
def sanitize(name):
name = name.lower()
name = name.replace(' ', '_')
name = name.replace('/', '_')
return name
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Generate blast databanks and update blast forms.'
)
parser.add_argument("-v", "--verbose", help="Increase output verbosity.",
action="store_true")
parser.add_argument("-d", "--dry-run", help="Dry run: no modification will be done, for testing purpose.",
action="store_true")
parser.add_argument("-m", "--multi-org", help="Add this flag if there are multiple organisms in src_data.",
action="store_true")
parser.add_argument("-a", "--apollo", help="Add this flag to generate links to apollo.",
action="store_true")
parser.add_argument("-p", "--no-parse-seqids", help="Don't use the makeblastdb -parse_seqids option (use this in case you have strange looking sequence ids that make html files unreadable)",
action="store_true")
parser.add_argument("--ignore", help='Files or directories to ignore', nargs='*')
parser.add_argument("dest", help="Destination directory (not including the genome name, should be mounted on compute nodes)")
args = parser.parse_args()
log.basicConfig(level=log.INFO)
if args.verbose:
log.basicConfig(level=log.DEBUG)
main(args)