Skip to content
Snippets Groups Projects
Commit 73cc89a8 authored by Loraine Gueguen
Browse files

Remove unused imports. Factorize goto_species_dir(), create_galaxy_instance....

Remove unused imports. Factorize goto_species_dir(), create_galaxy_instance. Move constants at the beginning. Use upload_from_galaxy_fs() instead of upload_from_local(). Add get_bam_label()
parent 91c3500d
No related branches found
No related tags found
2 merge requests: !10 Release 2.0, !9 Release 2.0 (merge dev to master)
...@@ -4,18 +4,13 @@ ...@@ -4,18 +4,13 @@
import bioblend import bioblend
import argparse import argparse
import os import os
import subprocess
import logging import logging
import sys import sys
import fnmatch
import time import time
import json import json
import re import yaml
import stat
import shutil
from bioblend.galaxy.objects import GalaxyInstance from bioblend.galaxy.objects import GalaxyInstance
from bioblend import galaxy
import utilities import utilities
import speciesData import speciesData
...@@ -23,11 +18,15 @@ import speciesData ...@@ -23,11 +18,15 @@ import speciesData
""" """
gga_load_data.py gga_load_data.py
Usage: $ python3 gga_init.py -i input_example.yml --config config.yml [OPTIONS] Usage: $ python3 gga_load_data.py -i input_example.yml --config config.yml [OPTIONS]
Do not call this script before the galaxy container is ready Do not call this script before the galaxy container is ready
""" """
# If this version is not found, Galaxy will use the one that is found
GET_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0"
DELETE_ORGANISMS_TOOL = "toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.4+galaxy0"
class LoadData(speciesData.SpeciesData): class LoadData(speciesData.SpeciesData):
""" """
...@@ -39,23 +38,7 @@ class LoadData(speciesData.SpeciesData): ...@@ -39,23 +38,7 @@ class LoadData(speciesData.SpeciesData):
""" """
def goto_species_dir(self): def get_history(self):
"""
Go to the species directory (starting from the main dir)
:return:
"""
os.chdir(self.main_dir)
species_dir = os.path.join(self.main_dir, self.genus_species) + "/"
try:
os.chdir(species_dir)
except OSError:
logging.critical("Cannot access %s" % species_dir)
sys.exit(0)
return 1
def set_get_history(self):
""" """
Create or set the working history to the current species one Create or set the working history to the current species one
...@@ -63,13 +46,15 @@ class LoadData(speciesData.SpeciesData): ...@@ -63,13 +46,15 @@ class LoadData(speciesData.SpeciesData):
""" """
try: try:
histories = self.instance.histories.get_histories(name=str(self.genus_species)) histories = self.instance.histories.get_histories(name=str(self.genus_species))
self.history_id = histories[0]["id"] if len(histories) == 1:
logging.debug("History ID set for {0} {1}: {2}".format(self.genus, self.species, self.history_id)) self.history_id = histories[0]["id"]
logging.debug("History ID set for {0} {1}: {2}".format(self.genus, self.species, self.history_id))
else:
logging.critical("Multiple histories exists for {1}: {2}".format(self.genus, self.species))
except IndexError: except IndexError:
logging.info("Creating history for {0} {1}".format(self.genus, self.species)) logging.info("Creating history for {0} {1}".format(self.genus, self.species))
self.instance.histories.create_history(name=str(self.genus_species)) hist_dict = self.instance.histories.create_history(name=str(self.genus_species))
histories = self.instance.histories.get_histories(name=str(self.genus_species)) self.history_id = hist_dict["id"]
self.history_id = histories[0]["id"]
logging.debug("History ID set for {0} {1}: {2}".format(self.genus, self.species, self.history_id)) logging.debug("History ID set for {0} {1}: {2}".format(self.genus, self.species, self.history_id))
return self.history_id return self.history_id
...@@ -81,26 +66,20 @@ class LoadData(speciesData.SpeciesData): ...@@ -81,26 +66,20 @@ class LoadData(speciesData.SpeciesData):
""" """
get_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0") logging.debug("Getting 'Homo sapiens' ID in chado database")
delete_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.4+galaxy0")
get_organism_tool_version = get_organism_tool["version"]
delete_organism_tool_version = delete_organism_tool["version"]
logging.debug("Getting 'Homo sapiens' ID in instance's chado database")
get_sapiens_id_job = self.instance.tools.run_tool( get_sapiens_id_job = self.instance.tools.run_tool(
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/%s" % get_organism_tool_version, tool_id=GET_ORGANISMS_TOOL,
history_id=self.history_id, history_id=self.history_id,
tool_inputs={"genus": "Homo", "species": "sapiens"}) tool_inputs={"genus": "Homo", "species": "sapiens"})
get_sapiens_id_job_output = get_sapiens_id_job["outputs"][0]["id"] get_sapiens_id_job_output_dataset_id = get_sapiens_id_job["outputs"][0]["id"]
get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output) get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output_dataset_id)
try: try:
logging.debug("Deleting Homo 'sapiens' in the instance's chado database") logging.debug("Deleting Homo 'sapiens' in the instance's chado database")
get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0] get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0]
sapiens_id = str( sapiens_id = str(
get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool
self.instance.tools.run_tool( self.instance.tools.run_tool(
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/%s" % delete_organism_tool_version, tool_id=DELETE_ORGANISMS_TOOL,
history_id=self.history_id, history_id=self.history_id,
tool_inputs={"organism": str(sapiens_id)}) tool_inputs={"organism": str(sapiens_id)})
except bioblend.ConnectionError: except bioblend.ConnectionError:
...@@ -125,7 +104,7 @@ class LoadData(speciesData.SpeciesData): ...@@ -125,7 +104,7 @@ class LoadData(speciesData.SpeciesData):
return histories return histories
def setup_library(self): def setup_library(self, instance):
""" """
Create a "Project Data" library in galaxy, mirroring the "src_data" folder of the current organism Create a "Project Data" library in galaxy, mirroring the "src_data" folder of the current organism
directory tree directory tree
...@@ -135,11 +114,6 @@ class LoadData(speciesData.SpeciesData): ...@@ -135,11 +114,6 @@ class LoadData(speciesData.SpeciesData):
self.goto_species_dir() self.goto_species_dir()
# Delete pre-existing lib (probably created by a previous call)
gio = GalaxyInstance(url=self.instance_url,
email=self.config["galaxy_default_admin_email"],
password=self.config["galaxy_default_admin_password"])
folders = dict() folders = dict()
post_renaming = {} post_renaming = {}
...@@ -149,14 +123,14 @@ class LoadData(speciesData.SpeciesData): ...@@ -149,14 +123,14 @@ class LoadData(speciesData.SpeciesData):
if folders: if folders:
# Delete pre-existing lib (probably created by a previous call) # Delete pre-existing lib (probably created by a previous call)
existing = gio.libraries.get_previews(name='Project Data') existing = instance.libraries.get_previews(name='Project Data')
for lib in existing: for lib in existing:
if not lib.deleted: if not lib.deleted:
logging.info('Pre-existing "Project Data" library %s found, removing it' % lib.id) logging.info('Pre-existing "Project Data" library %s found, removing it' % lib.id)
gio.libraries.delete(lib.id) instance.libraries.delete(lib.id)
logging.info("Creating new 'Project Data' library") logging.info("Creating new 'Project Data' library")
prj_lib = gio.libraries.create('Project Data', 'Data for current genome annotation project') prj_lib = instance.libraries.create('Project Data', 'Data for current genome annotation project')
self.library_id = prj_lib.id # project data folder/library self.library_id = prj_lib.id # project data folder/library
logging.info("Library for {0}: {1}".format(self.full_name, self.library_id)) logging.info("Library for {0}: {1}".format(self.full_name, self.library_id))
...@@ -165,6 +139,7 @@ class LoadData(speciesData.SpeciesData): ...@@ -165,6 +139,7 @@ class LoadData(speciesData.SpeciesData):
folder_name = fname[len("./src_data") + 1:] folder_name = fname[len("./src_data") + 1:]
logging.info("Creating folder: %s" % folder_name) logging.info("Creating folder: %s" % folder_name)
folder = self.create_deep_folder(prj_lib, folder_name) folder = self.create_deep_folder(prj_lib, folder_name)
for single_file in files: for single_file in files:
ftype = 'auto' ftype = 'auto'
...@@ -204,10 +179,12 @@ class LoadData(speciesData.SpeciesData): ...@@ -204,10 +179,12 @@ class LoadData(speciesData.SpeciesData):
continue continue
logging.info("Adding file '%s' with type '%s' and name '%s'" % (single_file, ftype, clean_name)) logging.info("Adding file '%s' with type '%s' and name '%s'" % (single_file, ftype, clean_name))
datasets = prj_lib.upload_from_local( datasets = prj_lib.upload_from_galaxy_fs(
path=single_file, single_file,
folder=folder, folder=folder,
file_type=ftype link_data_only='link_to_files',
file_type=ftype,
tag_using_filenames=False
) )
# Rename dataset # Rename dataset
...@@ -272,7 +249,29 @@ class LoadData(speciesData.SpeciesData): ...@@ -272,7 +249,29 @@ class LoadData(speciesData.SpeciesData):
return new_folder return new_folder
def connect_to_instance(self): def get_bam_label(self, dirname, bam_file):
bam_id = bam_file
if bam_id.endswith('.bam'):
bam_id = bam_id[:-4]
if dirname in self.bam_metadata_cache:
if bam_id in self.bam_metadata_cache[dirname] and 'label' in self.bam_metadata_cache[dirname][bam_id] and self.bam_metadata_cache[dirname][bam_id]['label']:
return self.bam_metadata_cache[dirname][bam_id]['label']
else:
return None
else:
meta_file = os.path.join(dirname, 'metadata.yml')
if os.path.exists(meta_file):
with open(meta_file) as f:
self.bam_metadata_cache[dirname] = yaml.safe_load(f)
logging.info("Found metadata in %s " % meta_file)
else:
self.bam_metadata_cache[dirname] = {}
logging.info("Did not find metadata in %s " % meta_file)
return self.get_bam_label(dirname, bam_file)
def create_galaxy_instance(self):
""" """
Test the connection to the galaxy instance for the current organism Test the connection to the galaxy instance for the current organism
Exit if we cannot connect to the instance Exit if we cannot connect to the instance
...@@ -280,11 +279,10 @@ class LoadData(speciesData.SpeciesData): ...@@ -280,11 +279,10 @@ class LoadData(speciesData.SpeciesData):
""" """
logging.info("Connecting to the galaxy instance (%s)" % self.instance_url) logging.info("Connecting to the galaxy instance (%s)" % self.instance_url)
self.instance = galaxy.GalaxyInstance(url=self.instance_url, self.instance = GalaxyInstance(url=self.instance_url,
email=self.config["galaxy_default_admin_email"], email=self.config["galaxy_default_admin_email"],
password=self.config["galaxy_default_admin_password"] password=self.config["galaxy_default_admin_password"]
) )
self.instance.histories.get_histories()
try: try:
self.instance.histories.get_histories() self.instance.histories.get_histories()
...@@ -294,19 +292,7 @@ class LoadData(speciesData.SpeciesData): ...@@ -294,19 +292,7 @@ class LoadData(speciesData.SpeciesData):
else: else:
logging.info("Successfully connected to galaxy instance (%s) " % self.instance_url) logging.info("Successfully connected to galaxy instance (%s) " % self.instance_url)
return self.instance
def get_species_to_load(sp_dict_list):
"""
"""
utilities.get_unique_species_list(sp_dict_list)
return 1
if __name__ == "__main__": if __name__ == "__main__":
...@@ -338,64 +324,56 @@ if __name__ == "__main__": ...@@ -338,64 +324,56 @@ if __name__ == "__main__":
else: else:
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
# Parsing the config file if provided, using the default config otherwise config_file = os.path.abspath(args.config)
if not args.config: config = utilities.parse_config(config_file)
args.config = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), "config")
else:
args.config = os.path.abspath(args.config)
main_dir = None
if not args.main_directory: if not args.main_directory:
args.main_directory = os.getcwd() main_dir = os.getcwd()
else: else:
args.main_directory = os.path.abspath(args.main_directory) main_dir = os.path.abspath(args.main_directory)
sp_dict_list = utilities.parse_input(args.input) sp_dict_list = utilities.parse_input(args.input)
unique_sp_dict_list = utilities.get_unique_species_dict_list(sp_dict_list=sp_dict_list) unique_sp_dict_list = utilities.get_unique_species_dict_list(sp_dict_list=sp_dict_list)
for sp_dict in unique_sp_dict_list: for sp_dict in unique_sp_dict_list:
# Creating an instance of load_data_for_current_species object # Creating an instance of load_data_for_current_species object
load_data_for_current_species = LoadData(parameters_dictionary=sp_dict) load_data_for_current_species = LoadData(parameters_dictionary=sp_dict)
# Starting # Starting
logging.info("gga_load_data.py called for %s" % load_data_for_current_species.full_name) logging.info("gga_load_data.py called for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))
# Setting some of the instance attributes # Setting some of the instance attributes
load_data_for_current_species.main_dir = args.main_directory load_data_for_current_species.main_dir = main_dir
load_data_for_current_species.species_dir = os.path.join(load_data_for_current_species.main_dir, load_data_for_current_species.species_dir = os.path.join(load_data_for_current_species.main_dir,
load_data_for_current_species.genus_species + load_data_for_current_species.genus_species +
"/") "/")
# Parse the config yaml file # Parse the config yaml file
load_data_for_current_species.config = utilities.parse_config(args.config) load_data_for_current_species.config = config
# Set the instance url attribute -- Does not work with localhost on scratch (ALB) # Set the instance url attribute -- Does not work with localhost on scratch (ALB)
load_data_for_current_species.instance_url = "http://localhost:{0}/sp/{1}_{2}/galaxy/".format( load_data_for_current_species.instance_url = "http://localhost:{0}/sp/{1}_{2}/galaxy/".format(
load_data_for_current_species.config["http_port"], load_data_for_current_species.config["http_port"],
load_data_for_current_species.genus_lowercase, load_data_for_current_species.genus_lowercase,
load_data_for_current_species.species) load_data_for_current_species.species)
# Check the galaxy container state and proceed if the galaxy services are up and running # Check the galaxy container state and proceed if the galaxy services are up and running
if utilities.check_galaxy_state(genus_lowercase=load_data_for_current_species.genus_lowercase, if utilities.check_galaxy_state(genus_lowercase=load_data_for_current_species.genus_lowercase,
species=load_data_for_current_species.species, species=load_data_for_current_species.species,
script_dir=load_data_for_current_species.script_dir): script_dir=load_data_for_current_species.script_dir):
# Load config file # Create the Galaxy instance
load_data_for_current_species.config = utilities.parse_config(args.config) load_data_for_current_species.instance = load_data_for_current_species.create_galaxy_instance()
# Testing connection to the instance
load_data_for_current_species.connect_to_instance()
# Load the datasets into a galaxy library # Load the datasets into a galaxy library
logging.info("Setting up library for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species)) logging.info("Setting up library for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))
load_data_for_current_species.setup_library() load_data_for_current_species.setup_library(load_data_for_current_species.instance)
logging.info("Successfully set up library in galaxy for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species)) logging.debug("Successfully set up library in galaxy for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))
# Set or get the history for the current organism # Set or get the history for the current organism
load_data_for_current_species.set_get_history() load_data_for_current_species.get_history()
# Remove H. sapiens from database if here # Remove H. sapiens from database if here
# TODO: set a dedicated history for removing H. sapiens (instead of doing it into a species history) # TODO: set a dedicated history for removing H. sapiens (instead of doing it into a species history)
...@@ -403,7 +381,6 @@ if __name__ == "__main__": ...@@ -403,7 +381,6 @@ if __name__ == "__main__":
# logging.info("Importing datasets into history for %s" % load_data_for_current_species.full_name) # logging.info("Importing datasets into history for %s" % load_data_for_current_species.full_name)
# load_data_for_current_species.import_datasets_into_history() # Option "--load-history" # load_data_for_current_species.import_datasets_into_history() # Option "--load-history"
# load_data_for_current_species.purge_histories() # Testing purposes # load_data_for_current_species.purge_histories() # Testing purposes
logging.info("Data successfully loaded and imported for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species)) logging.info("Data successfully loaded and imported for {0} {1}".format(load_data_for_current_species.genus, load_data_for_current_species.species))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment