Skip to content
Snippets Groups Projects

Release 2.0

Merged Loraine Gueguen requested to merge release_2.0 into dev
Files
3
+ 41
32
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import bioblend
import argparse
@@ -10,6 +11,7 @@ import time
import json
import yaml
from bioblend import galaxy
from bioblend.galaxy.objects import GalaxyInstance
import utilities
@@ -27,6 +29,8 @@ Do not call this script before the galaxy container is ready
# Galaxy tool identifiers for the Chado organism tools; each is passed as the
# `tool_id` when the corresponding tool is run against the instance.
GET_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0"
DELETE_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.4+galaxy0"
# Name of the data folder that is joined onto the species directory to locate
# the project data to walk.
HOST_DATA_DIR = "src_data"
# Path prefix substituted for the host data directory when building the path
# handed to the library upload call.
CONTAINER_DATA_DIR_ROOT = "/project_data"
class LoadData(speciesData.SpeciesData):
"""
@@ -67,25 +71,22 @@ class LoadData(speciesData.SpeciesData):
"""
logging.debug("Getting 'Homo sapiens' ID in chado database")
get_sapiens_id_job = self.instance.tools.run_tool(
get_sapiens_id_job_output_dataset_id = utilities.run_tool_and_get_single_output_dataset_id(self.instance,
tool_id=GET_ORGANISMS_TOOL,
history_id=self.history_id,
tool_inputs={"genus": "Homo", "species": "sapiens"})
get_sapiens_id_job_output_dataset_id = get_sapiens_id_job["outputs"][0]["id"]
get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output_dataset_id)
logging.debug("Deleting Homo 'sapiens' in the instance's chado database")
try:
logging.debug("Deleting Homo 'sapiens' in the instance's chado database")
get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0]
sapiens_id = str(
get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool
self.instance.tools.run_tool(
sapiens_id = str(get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool
utilities.run_tool(
tool_id=DELETE_ORGANISMS_TOOL,
history_id=self.history_id,
tool_inputs={"organism": str(sapiens_id)})
except bioblend.ConnectionError:
logging.debug("Homo sapiens isn't in the instance's chado database (bioblend.ConnectionError)")
tool_inputs={"organism": sapiens_id})
except IndexError:
logging.debug("Homo sapiens isn't in the instance's chado database (IndexError)")
logging.error("Homo sapiens isn't in the instance's chado database (IndexError)")
pass
def purge_histories(self):
@@ -98,13 +99,12 @@ class LoadData(speciesData.SpeciesData):
"""
histories = self.instance.histories.get_histories()
self.instance.histories.get_histories(deleted=False)
for h in histories:
self.instance.histories.delete_history(history_id=h["id"])
return histories
def setup_library(self, instance):
def setup_library(self):
"""
Create a "Project Data" library in galaxy, mirroring the "src_data" folder of the current organism
directory tree
@@ -112,12 +112,18 @@ class LoadData(speciesData.SpeciesData):
:return:
"""
self.goto_species_dir()
data_dir_root=os.path.join(self.get_species_dir(), HOST_DATA_DIR)
instance = GalaxyInstance(url=self.instance_url,
email=self.config["galaxy_default_admin_email"],
password=self.config["galaxy_default_admin_password"]
)
logging.info("Looking for project data in %s" % data_dir_root)
folders = dict()
post_renaming = {}
for root, dirs, files in os.walk("./src_data", followlinks=True):
for root, dirs, files in os.walk(data_dir_root, followlinks=True):
file_list = [os.path.join(root, filename) for filename in files]
folders[root] = file_list
@@ -136,7 +142,7 @@ class LoadData(speciesData.SpeciesData):
for fname, files in folders.items():
if fname and files:
folder_name = fname[len("./src_data") + 1:]
folder_name = re.sub(data_dir_root + "/", "", fname)
logging.info("Creating folder: %s" % folder_name)
folder = self.create_deep_folder(prj_lib, folder_name)
@@ -178,9 +184,12 @@ class LoadData(speciesData.SpeciesData):
logging.info("Skipping useless file '%s'" % single_file)
continue
logging.info("Adding file '%s' with type '%s' and name '%s'" % (single_file, ftype, clean_name))
single_file_relative_path = re.sub(data_dir_root, CONTAINER_DATA_DIR_ROOT, single_file)
single_file_path_in_container=os.path.join(CONTAINER_DATA_DIR_ROOT, single_file_relative_path)
logging.info("Adding file '%s' with type '%s' and name '%s'" % (single_file_path_in_container, ftype, clean_name))
datasets = prj_lib.upload_from_galaxy_fs(
single_file,
single_file_path_in_container,
folder=folder,
link_data_only='link_to_files',
file_type=ftype,
@@ -200,19 +209,19 @@ class LoadData(speciesData.SpeciesData):
logging.info("Waiting for import jobs to finish... please wait")
# Checking job state (only necessary if run using SLURM)
# while True:
# try:
# # "C" state means the job is completed, no need to wait for it
# ret = subprocess.check_output("squeue | grep -v \"C debug\" | grep -v \"JOBID\" || true",
# shell=True)
# if not len(ret):
# break
# time.sleep(3)
# except subprocess.CalledProcessError as inst:
# if inst.returncode == 153: # queue is empty
# break
# else:
# raise
while True:
try:
# "C" state means the job is completed, no need to wait for it
ret = subprocess.check_output("squeue | grep -v \"C debug\" | grep -v \"JOBID\" || true",
shell=True)
if not len(ret):
break
time.sleep(3)
except subprocess.CalledProcessError as inst:
if inst.returncode == 153: # queue is empty
break
else:
raise
time.sleep(10)
@@ -279,7 +288,7 @@ class LoadData(speciesData.SpeciesData):
"""
logging.info("Connecting to the galaxy instance (%s)" % self.instance_url)
self.instance = GalaxyInstance(url=self.instance_url,
self.instance = galaxy.GalaxyInstance(url=self.instance_url,
email=self.config["galaxy_default_admin_email"],
password=self.config["galaxy_default_admin_password"]
)
@@ -369,7 +378,7 @@ if __name__ == "__main__":
# Load the datasets into a galaxy library
logging.info("Setting up library for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))
load_data_for_current_species.setup_library(load_data_for_current_species.instance)
load_data_for_current_species.setup_library()
logging.debug("Successfully set up library in galaxy for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))
# Set or get the history for the current organism
Loading