#!/usr/bin/env python3
import argparse
import json
import logging
import os
import re
import sys
import time

import yaml
from bioblend.galaxy.objects import GalaxyInstance

import constants
import speciesData
import utilities
import utilities_bioblend

"""
Usage: $ python3 gga_load_data.py -i input_example.yml --config config.yml [OPTIONS]

Do not call this script before the galaxy container is ready
"""

"""
Child of SpeciesData

Contains methods and attributes to copy data into the galaxy instance's library of this given organism

Optional data file formatting
"""
def __init__(self, parameters_dictionary):
    # NOTE(review): this reads as a method of a class deriving from
    # speciesData.SpeciesData (see the super() call here and the
    # LoadData(parameters_dictionary=...) instantiation in __main__);
    # the enclosing class statement is not visible in this chunk —
    # confirm against the full file.
    #
    # Per-run caches: library folders already created (keyed by path,
    # used by create_deep_folder) and parsed metadata.yml content per
    # directory (used by get_bam_label).
    self.existing_folders_cache = dict()
    self.bam_metadata_cache = dict()
    super().__init__(parameters_dictionary)

def remove_homo_sapiens_from_db(self):
    """
    Run the GMOD tool to remove the "Homo sapiens" default organism from the original database.

    Will do nothing if H. sapiens isn't in the database (the organism lookup
    then returns an empty list and the IndexError is caught below).
    """
    logging.debug("Getting 'Homo sapiens' ID in chado database")
    get_sapiens_id_json_output = utilities_bioblend.run_tool_and_download_single_output_dataset(
        tool_id=constants.GET_ORGANISMS_TOOL,  # If this version is not found, Galaxy will use the one that is found
        history_id=self.history_id,
        tool_inputs={"genus": "Homo", "species": "sapiens"})

    logging.info("Deleting Homo 'sapiens' in the instance's chado database")
    try:
        get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0]
        # Needs to be str to be recognized by the chado tool
        sapiens_id = str(get_sapiens_id_final_output["organism_id"])
        # NOTE(review): the tail of this call was lost in the scraped source;
        # reconstructed tool_inputs per the delete tool's expected "organism"
        # parameter — confirm against utilities_bioblend.run_tool and the
        # chado delete-organism tool's form.
        utilities_bioblend.run_tool(
            tool_id=constants.DELETE_ORGANISMS_TOOL,
            history_id=self.history_id,
            tool_inputs={"organism": sapiens_id})
    except IndexError:
        # Empty organism list: H. sapiens was not in the database, nothing to do.
        logging.error("Homo sapiens isn't in the instance's chado database (IndexError)")
def purge_histories(self):
    """
    Delete every history of the Galaxy instance (testing purposes only).

    :return: the list of history records that were deleted
    """
    history_client = self.instance.histories
    all_histories = history_client.get_histories()
    for record in all_histories:
        history_client.delete_history(history_id=record["id"])
    return all_histories
def setup_library(self):
    """
    Create a "Project Data" library in galaxy, mirroring the "src_data" folder of the
    current organism directory tree.

    Any pre-existing library with the same name is deleted first. Files are linked
    (not copied) into the library; the Galaxy datatype and a prettified dataset
    name are guessed from each file's extension.

    :return: None
    """
    data_dir_root = os.path.join(self.get_species_dir(), constants.HOST_DATA_DIR)

    gio = GalaxyInstance(url=self.instance_url,
                         email=self.config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
                         password=self.config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]
                         )

    logging.info("Looking for project data in %s" % data_dir_root)
    folders = dict()
    for root, dirs, files in os.walk(data_dir_root, followlinks=True):
        file_list = [os.path.join(root, filename) for filename in files]
        folders[root] = file_list

    if folders:
        # Delete pre-existing lib (probably created by a previous call)
        existing = gio.libraries.get_previews(name=constants.GALAXY_LIBRARY_NAME)
        for lib in existing:
            if not lib.deleted:
                # Bug fix: the original format string was '{1}}' (stray closing
                # brace), which makes str.format() raise ValueError.
                logging.info('Pre-existing {0} library {1} found, removing it'.format(constants.GALAXY_LIBRARY_NAME, lib.id))
                gio.libraries.delete(lib.id)

        logging.info("Creating new %s library" % constants.GALAXY_LIBRARY_NAME)
        prj_lib = gio.libraries.create(constants.GALAXY_LIBRARY_NAME, constants.GALAXY_LIBRARY_DESC)
        self.library_id = prj_lib.id  # project data folder/library
        logging.info("Library for {0}: {1}".format(self.full_name, self.library_id))

        for fname, files in folders.items():
            if fname and files:
                # Folder path relative to the data root, recreated inside the library
                folder_name = re.sub(re.compile(data_dir_root + "/"), "", str(fname))
                logging.info("Creating folder: %s" % folder_name)
                folder = self.create_deep_folder(prj_lib, folder_name)

                for single_file in files:
                    # Guess the Galaxy datatype and display name from the extension
                    ftype = 'auto'
                    clean_name = os.path.basename(single_file)
                    clean_name = clean_name.replace('_', ' ')  # Not a good idea for files with a complex name (solution --> rename file or remove the replace)

                    if single_file.endswith('.bam'):
                        ftype = 'bam'
                        bam_label = self.get_bam_label(fname, os.path.basename(single_file))
                        if bam_label:
                            clean_name = bam_label
                        else:
                            clean_name = os.path.splitext(clean_name)[0]
                        if clean_name.endswith("Aligned.sortedByCoord.out"):  # Stupid thing for many local bam files
                            clean_name = clean_name[:-25]
                    elif single_file.endswith(('.fasta', '.fa', '.faa', '.fna')):
                        ftype = 'fasta'
                    elif single_file.endswith(('.gff', '.gff3')):
                        ftype = 'gff3'
                        clean_name = os.path.splitext(clean_name)[0]
                    elif single_file.endswith('.xml'):
                        ftype = 'xml'
                    elif single_file.endswith('.bw'):
                        ftype = 'bigwig'
                    elif single_file.endswith('.gaf'):
                        ftype = 'tabular'
                    elif single_file.endswith('_tree.txt'):
                        # We don't want to pollute the logs with 20000 useless lines
                        logging.debug("Skipping useless file '%s'" % single_file)
                        continue
                    elif single_file.endswith('.tar.gz') and 'newick' in fname:
                        # Must be tested before the generic .tar.gz skip below
                        ftype = 'tar'
                    elif single_file.endswith(('.bai', '.tar.gz', '.tar.bz2', '.raw', '.pdf')):
                        logging.info("Skipping useless file '%s'" % single_file)
                        continue

                    single_file_relative_path = re.sub(data_dir_root, constants.CONTAINER_DATA_DIR_ROOT, single_file)
                    # NOTE(review): single_file_relative_path already starts with
                    # CONTAINER_DATA_DIR_ROOT, so when it is absolute os.path.join()
                    # returns it unchanged — confirm this double-prefix is intended.
                    single_file_path_in_container = os.path.join(constants.CONTAINER_DATA_DIR_ROOT, single_file_relative_path)

                    logging.info("Adding file '%s' with type '%s' and name '%s'" % (single_file_path_in_container, ftype, clean_name))
                    datasets = prj_lib.upload_from_galaxy_fs(
                        single_file_path_in_container,
                        folder=folder,
                        link_data_only='link_to_files',
                        file_type=ftype,
                        tag_using_filenames=False
                    )
                    # Renaming datasets right after import does not stick (names get
                    # erased by Galaxy's metadata generation) and LibraryDataset is
                    # not subscriptable, so no post-import rename is attempted here.
                    # Throttle uploads instead of polling the job queue.
                    time.sleep(1)

    logging.info("Finished importing data")
def create_deep_folder(self, prj_lib, path, parent_folder=None, deep_name=""):
"""
Create a folder inside a folder in a galaxy library
Recursive
:param prj_lib:
:param path:
:param parent_folder:
:param deep_name:
:return:
"""
segments = path.split(os.sep)
deeper_name = os.sep.join([deep_name, segments[0]])
if deeper_name in self.existing_folders_cache:
new_folder = self.existing_folders_cache[deeper_name]
else:
new_folder = prj_lib.create_folder(segments[0], base_folder=parent_folder)
self.existing_folders_cache[deeper_name] = new_folder
if len(segments) > 1:
new_folder = self.create_deep_folder(prj_lib, os.sep.join(segments[1:]), new_folder, deeper_name)
return new_folder
def get_bam_label(self, dirname, bam_file):
    """
    Return the display label of a BAM file, looked up in the directory's metadata.

    The directory's ``metadata.yml`` is parsed at most once and memoized in
    ``self.bam_metadata_cache``; on a cache miss the file is loaded (or an empty
    dict is stored when absent) and the lookup is retried via a recursive call.

    :param dirname: directory holding the BAM file and its metadata.yml
    :param bam_file: BAM file name; a trailing '.bam' is stripped for the lookup
    :return: the non-empty label string if present, otherwise None
    """
    bam_id = bam_file[:-4] if bam_file.endswith('.bam') else bam_file

    if dirname not in self.bam_metadata_cache:
        # First visit for this directory: populate the cache, then retry.
        meta_file = os.path.join(dirname, 'metadata.yml')
        if os.path.exists(meta_file):
            with open(meta_file) as handle:
                self.bam_metadata_cache[dirname] = yaml.safe_load(handle)
            logging.info("Found metadata in %s " % meta_file)
        else:
            self.bam_metadata_cache[dirname] = {}
            logging.info("Did not find metadata in %s " % meta_file)
        return self.get_bam_label(dirname, bam_file)

    records = self.bam_metadata_cache[dirname]
    if bam_id in records and 'label' in records[bam_id] and records[bam_id]['label']:
        return records[bam_id]['label']
    return None

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Load data into Galaxy library")
    parser.add_argument("input",
                        type=str,
                        help="Input file (yml)")
    # Bug fix: the scraped source lost the tail of this call; args.verbose is
    # used as a boolean below, so the flag needs action="store_true".
    parser.add_argument("-v", "--verbose",
                        action="store_true",
                        help="Increase output verbosity")
    parser.add_argument("--config",
                        type=str,
                        help="Config path, default to 'examples/config.yml'")
    parser.add_argument("--main-directory",
                        type=str,
                        help="Where the stack containers will be located, defaults to working directory")

    args = parser.parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Parsing the config file if provided, using the default config otherwise
    if args.config:
        config_file = os.path.abspath(args.config)
    else:
        config_file = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), constants.DEFAULT_CONFIG)
    config = utilities.parse_config(config_file)

    # Stacks live in --main-directory, defaulting to the working directory
    if not args.main_directory:
        main_dir = os.getcwd()
    else:
        main_dir = os.path.abspath(args.main_directory)

    sp_dict_list = utilities.parse_input(args.input)
    unique_sp_dict_list = utilities.get_unique_species_dict_list(sp_dict_list=sp_dict_list)

    for sp_dict in unique_sp_dict_list:
        # Creating an instance of load_data_for_current_species object
        load_data_for_current_species = LoadData(parameters_dictionary=sp_dict)

        logging.info("gga_load_data.py called for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))

        # Setting some of the instance attributes
        load_data_for_current_species.main_dir = main_dir
        load_data_for_current_species.species_dir = os.path.join(load_data_for_current_species.main_dir,
                                                                 load_data_for_current_species.genus_species +
                                                                 "/")
        load_data_for_current_species.config = config
        # Set the instance url attribute -- Does not work with localhost on scratch (ALB)
        load_data_for_current_species.instance_url = "http://localhost:{0}/sp/{1}/galaxy/".format(
            load_data_for_current_species.config[constants.CONF_ALL_HTTP_PORT],
            load_data_for_current_species.genus_species)

        # Check the galaxy container state and proceed if the galaxy services are up and running
        if utilities_bioblend.check_galaxy_state(network_name=load_data_for_current_species.genus_species,
                                                 script_dir=load_data_for_current_species.script_dir):

            # Create the Galaxy instance
            load_data_for_current_species.instance = utilities_bioblend.get_galaxy_instance(
                instance_url=load_data_for_current_species.instance_url,
                email=load_data_for_current_species.config[constants.CONF_GALAXY_DEFAULT_ADMIN_EMAIL],
                password=load_data_for_current_species.config[constants.CONF_GALAXY_DEFAULT_ADMIN_PASSWORD]
            )

            # Load the datasets into a galaxy library
            logging.info("Setting up library for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))
            load_data_for_current_species.setup_library()
            logging.debug("Successfully set up library in galaxy for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))

            # Set or get the history for the current organism
            load_data_for_current_species.history_id = utilities_bioblend.get_history(
                instance=load_data_for_current_species.instance,
                history_name=load_data_for_current_species.history_name)

            # TODO: set a dedicated history for removing H. sapiens (instead of doing it into a species history)
            # load_data_for_current_species.remove_homo_sapiens_from_db()
            # load_data_for_current_species.purge_histories()  # Testing purposes

            logging.info("Data successfully loaded and imported for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))
        else:
            logging.critical("The galaxy container for {0} {1} is not ready yet".format(load_data_for_current_species.genus, load_data_for_current_species.species))