Commit 08074e9b authored by Arthur Le Bars

Reworked datasets and Chado ID assignment

parent 6af39982
2 merge requests: !9 Release 2.0 (merge dev to master), !5 Workflow v2
This commit is part of merge request !5.
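For context, the renaming introduced here prefixes every copied dataset with a per-organism shortname built from the genus initial, the species name and, when present, the sex initial. Below is a minimal, hypothetical sketch of the resulting file names; the genus/species/sex and version values are assumed example inputs, not taken from this repository.

# Hypothetical illustration of the naming scheme introduced by this commit
# (attribute values below are assumed example inputs).
genus, species, sex = "Mytilus", "edulis", "female"
genome_version, ogs_version = "1.0", "1.0"

parts = [genus[0].lower(), species.lower()]
if sex:
    parts.append(sex[0].lower())
dataset_shortname = "_".join(parts)  # "m_edulis_f"

genome_fname = "{0}_v{1}.fasta".format(dataset_shortname, genome_version)  # "m_edulis_f_v1.0.fasta"
gff_fname = "{0}_OGS{1}.gff".format(dataset_shortname, ogs_version)        # "m_edulis_f_OGS1.0.gff"
print(genome_fname, gff_fname)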
@@ -124,12 +124,18 @@ class GetData(speciesData.SpeciesData):

        # search_excluded_datasets = ["interpro_path", "orthofinder_path", "blastp_path", "blastx_path"]
        # # These datasets will not be searched if missing in the input file

+       dataset_shortname = ""
+       if self.sex is not None and self.sex != "":
+           dataset_shortname = self.genus[0].lower() + "_" + self.species.lower() + "_" + self.sex[0].lower()
+       else:
+           dataset_shortname = self.genus[0].lower() + "_" + self.species.lower()
+
        # Copy datasets into the correct folder of the organism's src_data dir tree
        for k, v in datasets_to_get.items():
            if v:  # If dataset is not present in input file, skip copy
                if k in genome_datasets:
                    logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_genome_dir))
-                   genome_fname = "v%s.fasta" % self.genome_version
+                   genome_fname = "{0}_v{1}.fasta".format(dataset_shortname, self.genome_version)
                    try:
                        shutil.copyfile(os.path.abspath(v), os.path.join(organism_genome_dir, genome_fname))
                    except Exception as exc:
@@ -137,19 +143,19 @@ class GetData(speciesData.SpeciesData):

                elif k in annotation_datasets:
                    dataset_fname = ""
                    if k == "gff_path":
-                       dataset_fname = "OGS%s.gff" % self.ogs_version
+                       dataset_fname = "{0}_OGS{1}.gff".format(dataset_shortname, self.ogs_version)
                    elif k == "transcripts_path":
-                       dataset_fname = "OGS%s_transcripts.fasta" % self.ogs_version
+                       dataset_fname = "{0}_OGS{1}_transcripts.fasta".format(dataset_shortname, self.ogs_version)
                    elif k == "proteins_path":
-                       dataset_fname = "OGS%s_proteins.fasta" % self.ogs_version
+                       dataset_fname = "{0}_OGS{1}_proteins.fasta".format(dataset_shortname, self.ogs_version)
                    elif k == "orthofinder_path":
-                       dataset_fname = "OGS%s_orthofinder.tsv" % self.ogs_version
+                       dataset_fname = "{0}_OGS{1}_orthofinder.tsv".format(dataset_shortname, self.ogs_version)
                    elif k == "interpro_path":
-                       dataset_fname = "OGS%s_interproscan.xml" % self.ogs_version
+                       dataset_fname = "{0}_OGS{1}_interproscan.xml".format(dataset_shortname, self.ogs_version)
                    elif k == "blastp_path":
-                       dataset_fname = "OGS%s_blastp.xml" % self.ogs_version
+                       dataset_fname = "{0}_OGS{1}_blastp.xml".format(dataset_shortname, self.ogs_version)
                    elif k == "blastx_path":
-                       dataset_fname = "OGS%s_blastx.xml" % self.ogs_version
+                       dataset_fname = "{0}_OGS{1}_blastx.xml".format(dataset_shortname, self.ogs_version)
                    logging.info("Copying {0} ({1}) into {2}".format(k, v, organism_annotation_dir))
                    try:
                        shutil.copyfile(os.path.abspath(v), os.path.join(organism_annotation_dir, dataset_fname))
...
@@ -169,6 +169,7 @@ class RunWorkflow(speciesData.SpeciesData):
        name = toolshed_dict["name"]
        owner = toolshed_dict["owner"]
        toolshed = "https://" + toolshed_dict["tool_shed"]
+       logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name))
        self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner,
                                                           changeset_revision=changeset_revision,
@@ -183,6 +184,7 @@ class RunWorkflow(speciesData.SpeciesData):
        name = toolshed_dict["name"]
        owner = toolshed_dict["owner"]
        toolshed = "https://" + toolshed_dict["tool_shed"]
+       logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name))
        self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner,
                                                           changeset_revision=changeset_revision,
@@ -197,6 +199,7 @@ class RunWorkflow(speciesData.SpeciesData):
        name = toolshed_dict["name"]
        owner = toolshed_dict["owner"]
        toolshed = "https://" + toolshed_dict["tool_shed"]
+       logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name))
        self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner,
                                                           changeset_revision=changeset_revision,
@@ -211,7 +214,8 @@ class RunWorkflow(speciesData.SpeciesData):
        name = toolshed_dict["name"]
        owner = toolshed_dict["owner"]
        toolshed = "https://" + toolshed_dict["tool_shed"]
-       logging.warning("Installing changeset revision %s for add_analysis" % changeset_revision)
+       logging.warning("Installing changeset revision {0} for {1}".format(changeset_revision, name))
        self.instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner,
                                                           changeset_revision=changeset_revision,
                                                           install_tool_dependencies=True,
@@ -247,118 +251,112 @@ class RunWorkflow(speciesData.SpeciesData):

        self.set_get_history()

        tool_version = "2.3.4+galaxy0"

-       # Add organism (species) to chado
-       logging.info("Adding organism to the instance's chado database")
-       if self.common == "" or self.common is None:
-           self.instance.tools.run_tool(
-               tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version,
-               history_id=self.history_id,
-               tool_inputs={"abbr": self.abbreviation,
-                            "genus": self.genus_uppercase,
-                            "species": self.chado_species_name,
-                            "common": self.abbreviation})
-       else:
-           self.instance.tools.run_tool(
-               tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version,
-               history_id=self.history_id,
-               tool_inputs={"abbr": self.abbreviation,
-                            "genus": self.genus_uppercase,
-                            "species": self.chado_species_name,
-                            "common": self.common})
-
-       # Add OGS analysis to chado
-       logging.info("Adding OGS analysis to the instance's chado database")
-       self.instance.tools.run_tool(
-           tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" % tool_version,
-           history_id=self.history_id,
-           tool_inputs={"name": self.full_name_lowercase + " OGS" + self.ogs_version,
-                        "program": "Performed by Genoscope",
-                        "programversion": str(self.sex + " OGS" + self.ogs_version),
-                        "sourcename": "Genoscope",
-                        "date_executed": self.date})
-
-       # Add genome analysis to chado
-       logging.info("Adding genome analysis to the instance's chado database")
-       self.instance.tools.run_tool(
-           tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" % tool_version,
-           history_id=self.history_id,
-           tool_inputs={"name": self.full_name_lowercase + " genome v" + self.genome_version,
-                        "program": "Performed by Genoscope",
-                        "programversion": str(self.sex + "genome v" + self.genome_version),
-                        "sourcename": "Genoscope",
-                        "date_executed": self.date})
-
-       # # TODO: check output of get_organism --> if empty or wrong --> rerun --> else: go next
-       # # Get organism and analyses IDs (runtime inputs for workflow)
-       # time.sleep(3)
-       # # Get the ID for the current organism in chado
-       # org = self.instance.tools.run_tool(
-       #     tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/%s" % tool_version,
-       #     history_id=self.history_id,
-       #     tool_inputs={"abbr": self.abbreviation,
-       #                  "genus": self.genus_uppercase,
-       #                  "species": self.chado_species_name,
-       #                  "common": self.common})
-       # time.sleep(3)
-       # # Run tool again (sometimes the tool doesn't return anything despite the organism already being in the db)
-       # org = self.instance.tools.run_tool(
-       #     tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/%s" % tool_version,
-       #     history_id=self.history_id,
-       #     tool_inputs={"abbr": self.abbreviation,
-       #                  "genus": self.genus_uppercase,
-       #                  "species": self.chado_species_name,
-       #                  "common": self.common})
-       # org_job_out = org["outputs"][0]["id"]
-       # org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out)
-       # try:
-       #     org_output = json.loads(org_json_output)[0]
-       #     self.org_id = str(org_output["organism_id"])  # id needs to be a str to be recognized by chado tools
-       # except IndexError:
-       #     logging.critical("No organism matching " + self.full_name + " exists in the instance's chado database")
-       #     sys.exit()
-
-   def get_genome_analysis_id(self):
-       """
-       """
-
-       # Get the ID for the genome analysis in chado
-       genome_analysis = self.instance.tools.run_tool(
-           tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.4+galaxy0",
-           history_id=self.history_id,
-           tool_inputs={"name": self.full_name_lowercase + " genome v" + self.genome_version})
-       genome_analysis_job_out = genome_analysis["outputs"][0]["id"]
-       genome_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=genome_analysis_job_out)
-       try:
-           genome_analysis_output = json.loads(genome_analysis_json_output)[0]
-           self.genome_analysis_id = str(genome_analysis_output["analysis_id"])
-       except IndexError as exc:
-           logging.critical("no matching genome analysis exists in the instance's chado database")
-           sys.exit(exc)
-       return self.genome_analysis_id
-
-   def get_ogs_analysis_id(self):
-       """
-       """
-
-       # Get the ID for the OGS analysis in chado
-       ogs_analysis = self.instance.tools.run_tool(
-           tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.4+galaxy0",
-           history_id=self.history_id,
-           tool_inputs={"name": self.full_name_lowercase + " OGS" + self.ogs_version})
-       ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"]
-       ogs_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=ogs_analysis_job_out)
-       try:
-           ogs_analysis_output = json.loads(ogs_analysis_json_output)[0]
-           self.ogs_analysis_id = str(ogs_analysis_output["analysis_id"])
-       except IndexError as exc:
-           logging.critical("No matching OGS analysis exists in the instance's chado database")
-           sys.exit(exc)
-       return self.ogs_analysis_id
+       get_organism_tool = self.instance.tools.show_tool("toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0")
+
+       get_organisms = self.instance.tools.run_tool(
+           tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/%s" % tool_version,
+           history_id=self.history_id,
+           tool_inputs={})
+
+       time.sleep(10)  # Ensure the tool has had time to complete
+       org_outputs = get_organisms["outputs"]  # Outputs from the get_organism tool
+       org_job_out_id = org_outputs[0]["id"]  # ID of the get_organism output dataset (list of dicts)
+       org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id)  # Download the dataset
+       org_output = json.loads(org_json_output)  # Turn the dataset into a list for parsing
+
+       org_id = None
+
+       # Look up list of outputs (dictionaries)
+       for organism_output_dict in org_output:
+           if organism_output_dict["genus"] == self.genus and organism_output_dict["species"] == "{0} {1}".format(self.species, self.sex):
+               correct_organism_id = str(organism_output_dict["organism_id"])  # id needs to be a str to be recognized by chado tools
+               org_id = str(correct_organism_id)
+
+       if org_id is None:
+           if self.common == "" or self.common is None:
+               add_org_job = self.instance.tools.run_tool(
+                   tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version,
+                   history_id=self.history_id,
+                   tool_inputs={"abbr": self.abbreviation,
+                                "genus": self.genus_uppercase,
+                                "species": self.chado_species_name,
+                                "common": self.abbreviation})
+               org_job_out_id = add_org_job["outputs"][0]["id"]
+               org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id)
+               org_output = json.loads(org_json_output)
+               org_id = str(org_output["organism_id"])  # id needs to be a str to be recognized by chado tools
+           else:
+               add_org_job = self.instance.tools.run_tool(
+                   tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version,
+                   history_id=self.history_id,
+                   tool_inputs={"abbr": self.abbreviation,
+                                "genus": self.genus_uppercase,
+                                "species": self.chado_species_name,
+                                "common": self.common})
+               org_job_out_id = add_org_job["outputs"][0]["id"]
+               org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out_id)
+               org_output = json.loads(org_json_output)
+               org_id = str(org_output["organism_id"])  # id needs to be a str to be recognized by chado tools
+
+       get_analyses = self.instance.tools.run_tool(
+           tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/%s" % tool_version,
+           history_id=self.history_id,
+           tool_inputs={})
+
+       time.sleep(10)
+       analysis_outputs = get_analyses["outputs"]
+       analysis_job_out_id = analysis_outputs[0]["id"]
+       analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id)
+       analysis_output = json.loads(analysis_json_output)
+
+       ogs_analysis_id = None
+       genome_analysis_id = None
+
+       # Look up list of outputs (dictionaries)
+       for analysis_output_dict in analysis_output:
+           if analysis_output_dict["name"] == self.full_name_lowercase + " OGS" + self.ogs_version:
+               ogs_analysis_id = str(analysis_output_dict["analysis_id"])
+           if analysis_output_dict["name"] == self.full_name_lowercase + " genome v" + self.genome_version:
+               genome_analysis_id = str(analysis_output_dict["analysis_id"])
+
+       if ogs_analysis_id is None:
+           add_ogs_analysis_job = self.instance.tools.run_tool(
+               tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" % tool_version,
+               history_id=self.history_id,
+               tool_inputs={"name": self.full_name_lowercase + " OGS" + self.ogs_version,
+                            "program": "Performed by Genoscope",
+                            "programversion": str(self.sex + " OGS" + self.ogs_version),
+                            "sourcename": "Genoscope",
+                            "date_executed": self.date})
+           analysis_outputs = add_ogs_analysis_job["outputs"]
+           analysis_job_out_id = analysis_outputs[0]["id"]
+           analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id)
+           analysis_output = json.loads(analysis_json_output)
+           ogs_analysis_id = str(analysis_output["analysis_id"])
+
+       if genome_analysis_id is None:
+           add_genome_analysis_job = self.instance.tools.run_tool(
+               tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/%s" % tool_version,
+               history_id=self.history_id,
+               tool_inputs={"name": self.full_name_lowercase + " genome v" + self.genome_version,
+                            "program": "Performed by Genoscope",
+                            "programversion": str(self.sex + " genome v" + self.genome_version),
+                            "sourcename": "Genoscope",
+                            "date_executed": self.date})
+           analysis_outputs = add_genome_analysis_job["outputs"]
+           analysis_job_out_id = analysis_outputs[0]["id"]
+           analysis_json_output = self.instance.datasets.download_dataset(dataset_id=analysis_job_out_id)
+           analysis_output = json.loads(analysis_json_output)
+           genome_analysis_id = str(analysis_output["analysis_id"])
+
+       logging.debug({"org_id": org_id, "genome_analysis_id": genome_analysis_id, "ogs_analysis_id": ogs_analysis_id})
+       return {"org_id": org_id, "genome_analysis_id": genome_analysis_id, "ogs_analysis_id": ogs_analysis_id}

    def add_interproscan_analysis(self):
@@ -510,17 +508,7 @@ class RunWorkflow(speciesData.SpeciesData):

        return invocation_report

-   def get_datasets_ldda_ids(self):
-       """
-       Get and return the ldda_ids (and names) for the datasets in the library
-       """
-
-       return 0
-
-   def import_datasets_into_history(self, imported_datasets_ids):
+   def import_datasets_into_history(self):
        """
        Find datasets in a library, get their ID and import them into the current history if they are not already
@@ -545,7 +533,6 @@ class RunWorkflow(speciesData.SpeciesData):
            folders_ids[i["name"]] = i["id"]

        # Iterating over the folders to find datasets and map datasets to their IDs
-       logging.debug("Datasets IDs: ")
        for k, v in folders_ids.items():
            if k == "/genome/{0}/v{1}".format(self.species_folder_name, self.genome_version):
                sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
@@ -555,7 +542,6 @@ class RunWorkflow(speciesData.SpeciesData):
                    if e["name"].endswith(".fasta"):
                        self.datasets["genome_file"] = e["ldda_id"]
                        self.datasets_name["genome_file"] = e["name"]
-                       logging.debug("\tGenome file:\t" + e["name"] + ": " + e["ldda_id"])
            if k == "/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version):
                sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True)
@@ -563,59 +549,86 @@ class RunWorkflow(speciesData.SpeciesData):
                for e in v2:
                    if type(e) == dict:
                        if "transcripts" in e["name"]:
-                           # the attributes datasets is set in the function get_instance_attributes()
                            self.datasets["transcripts_file"] = e["ldda_id"]
                            self.datasets_name["transcripts_file"] = e["name"]
-                           logging.debug("\tTranscripts file:\t" + e["name"] + ": " + e["ldda_id"])
                        elif "proteins" in e["name"]:
                            self.datasets["proteins_file"] = e["ldda_id"]
                            self.datasets_name["proteins_file"] = e["name"]
-                           logging.debug("\tProteins file:\t" + e["name"] + ": " + e["ldda_id"])
                        elif "gff" in e["name"]:
                            self.datasets["gff_file"] = e["ldda_id"]
                            self.datasets_name["gff_file"] = e["name"]
-                           logging.debug("\tGFF file:\t" + e["name"] + ": " + e["ldda_id"])
                        elif "interpro" in e["name"]:
                            self.datasets["interproscan_file"] = e["ldda_id"]
                            self.datasets_name["interproscan_file"] = e["name"]
-                           logging.debug("\tInterproscan file:\t" + e["name"] + ": " + e["ldda_id"])
                        elif "blastp" in e["name"]:
                            self.datasets["blast_diamond_file"] = e["ldda_id"]
                            self.datasets_name["blast_diamond_file"] = e["name"]
-                           logging.debug("\tBlastp diamond file:\t" + e["name"] + ": " + e["ldda_id"])

-       logging.debug("Uploading datasets into history %s" % self.history_id)
+       dataset_shortname = ""
+       if self.sex is not None and self.sex != "":
+           dataset_shortname = self.genus[0].lower() + "_" + self.species.lower() + "_" + self.sex[0].lower()
+       else:
+           dataset_shortname = self.genus[0].lower() + "_" + self.species.lower()
+
+       history_datasets_li = self.instance.datasets.get_datasets()
+       genome_hda_id, gff_hda_id, transcripts_hda_id, proteins_hda_id, blast_diamond_hda_id, interproscan_hda_id = None, None, None, None, None, None
+
+       # Finding datasets in history (matches datasets name)
+       for dataset in history_datasets_li:
+           dataset_name = dataset["name"]
+           if dataset_shortname in dataset_name:
+               dataset_id = dataset["id"]
+               if dataset_name.endswith("transcripts.fasta"):
+                   transcripts_hda_id = dataset_id
+               elif dataset_name.endswith("proteins.fasta"):
+                   proteins_hda_id = dataset_id
+               elif dataset_name.endswith(".fasta"):
+                   genome_hda_id = dataset_id
+               elif dataset_name.endswith(".gff"):
+                   gff_hda_id = dataset_id
+               elif dataset_name.endswith("blastp.xml"):
+                   blast_diamond_hda_id = dataset_id

        # Import each dataset into history if it is not imported
-       first_hda_ids = self.get_datasets_hda_ids(imported_datasets_ids=imported_datasets_ids)
-
-       if first_hda_ids["genome_hda_id"] is not None:
-           self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"])
-       if first_hda_ids["gff_hda_id"] is not None:
-           self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"])
-       if first_hda_ids["transcripts_hda_id"] is not None:
-           self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
-       if first_hda_ids["proteins_hda_id"] is not None:
-           self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])
-       if first_hda_ids["interproscan_hda_id"] is None:
+       logging.debug("Uploading datasets into history %s" % self.history_id)
+       if genome_hda_id is None:
+           genome_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"])
+           genome_hda_id = genome_dataset_upload["id"]
+       if gff_hda_id is None:
+           gff_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"])
+           gff_hda_id = gff_dataset_upload["id"]
+       if transcripts_hda_id is None:
+           transcripts_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
+           transcripts_hda_id = transcripts_dataset_upload["id"]
+       if proteins_hda_id is None:
+           proteins_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])
+           proteins_hda_id = proteins_dataset_upload["id"]
+       if interproscan_hda_id is None:
            try:
-               self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"])
+               interproscan_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["interproscan_file"])
+               interproscan_hda_id = interproscan_dataset_upload["id"]
            except Exception as exc:
                logging.debug("Interproscan file not found in library (history: {0})".format(self.history_id))
-       if first_hda_ids["blast_diamond_hda_id"] is None:
+       if blast_diamond_hda_id is None:
            try:
-               self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"])
+               blast_diamond_dataset_upload = self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["blast_diamond_file"])
+               blast_diamond_hda_id = blast_diamond_dataset_upload["id"]
            except Exception as exc:
                logging.debug("Blastp file not found in library (history: {0})".format(self.history_id))

-       # _datasets = self.instance.datasets.get_datasets()
-       # with open(os.path.join(self.main_dir, "datasets_ids.json"), "w") as datasets_ids_outfile:
-       #     datasets_ids_outfile.write(str(_datasets))
-
        # Return a dict made of the hda ids
-       return self.get_datasets_hda_ids(imported_datasets_ids=first_hda_ids["imported_datasets_ids"])
+       return {"genome_hda_id": genome_hda_id,
+               "gff_hda_id": gff_hda_id,
+               "transcripts_hda_id": transcripts_hda_id,
+               "proteins_hda_id": proteins_hda_id,
+               "blast_diamond_hda_id": blast_diamond_hda_id,
+               "interproscan_hda_id": interproscan_hda_id}

-   def get_datasets_hda_ids(self, imported_datasets_ids):
+   def get_datasets_hda_ids(self):
        """
        Get the hda IDs of the datasets imported into a history
@@ -677,81 +690,6 @@ class RunWorkflow(speciesData.SpeciesData):
                "imported_datasets_ids": imported_datasets_ids}

-   def get_organism_id(self):
-       """
-       Retrieve current organism ID
-       Will try to add it to Chado if the organism ID can't be found
-
-       :return:
-       """
-
-       tool_version = "2.3.4+galaxy0"
-       time.sleep(3)
-
-       # # Get the ID for the current organism in chado
-       # org = self.instance.tools.run_tool(
-       #     tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.3",
-       #     history_id=self.history_id,
-       #     tool_inputs={"abbr": self.abbreviation,
-       #                  "genus": self.genus_uppercase,
-       #                  "species": self.chado_species_name,
-       #                  "common": self.common})
-       # time.sleep(3)
-
-       # Run tool again (sometimes the tool doesn't return anything despite the organism already being in the db)
-       org = self.instance.tools.run_tool(
-           tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0",
-           history_id=self.history_id,
-           tool_inputs={"abbr": self.abbreviation,
-                        "genus": self.genus_uppercase,
-                        "species": self.chado_species_name,
-                        "common": self.common})
-
-       org_job_out = org["outputs"][0]["id"]
-       org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out)
-       try:
-           org_output = json.loads(org_json_output)[0]
-           self.org_id = str(org_output["organism_id"])  # id needs to be a str to be recognized by chado tools
-       except IndexError:
-           logging.warning("No organism matching " + self.full_name + " exists in the instance's chado database, adding it")
-           if self.common == "" or self.common is None:
-               self.instance.tools.run_tool(
-                   tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version,
-                   history_id=self.history_id,
-                   tool_inputs={"abbr": self.abbreviation,
-                                "genus": self.genus_uppercase,
-                                "species": self.chado_species_name,
-                                "common": self.abbreviation})
-           else:
-               self.instance.tools.run_tool(
-                   tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/%s" % tool_version,
-                   history_id=self.history_id,
-                   tool_inputs={"abbr": self.abbreviation,
-                                "genus": self.genus_uppercase,
-                                "species": self.chado_species_name,
-                                "common": self.common})
-           # Run tool again (sometimes the tool doesn't return anything despite the organism already being in the db)
-           org = self.instance.tools.run_tool(
-               tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.4+galaxy0",
-               history_id=self.history_id,
-               tool_inputs={"abbr": self.abbreviation,
-                            "genus": self.genus_uppercase,
-                            "species": self.chado_species_name,
-                            "common": self.common})
-           org_job_out = org["outputs"][0]["id"]
-           org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out)
-           try:
-               org_output = json.loads(org_json_output)[0]
-               self.org_id = str(org_output["organism_id"])  # id needs to be a str to be recognized by chado tools
-           except IndexError:
-               logging.critical("Cannot add {0} as an organism in Chado, please check the galaxy instance {1}".format(self.full_name, self.instance_url))
-               sys.exit()
-
-       return self.org_id
-

def run_workflow(workflow_path, workflow_parameters, datamap, config, input_species_number):
    """
    Run a workflow in galaxy
@@ -810,9 +748,7 @@ def run_workflow(workflow_path, workflow_parameters, datamap, config, input_spec

-def create_sp_workflow_dict(sp_dict, main_dir, config, imported_datasets_ids):
+def create_sp_workflow_dict(sp_dict, main_dir, config):
    """
    """
@@ -847,13 +783,16 @@ def create_sp_workflow_dict(sp_dict, main_dir, config, imported_datasets_ids):

    history_id = run_workflow_for_current_organism.set_get_history()

    run_workflow_for_current_organism.install_changesets_revisions_for_individual_tools()
-   run_workflow_for_current_organism.add_organism_ogs_genome_analyses()
-   org_id = run_workflow_for_current_organism.get_organism_id()
-   genome_analysis_id = run_workflow_for_current_organism.get_genome_analysis_id()
-   ogs_analysis_id = run_workflow_for_current_organism.get_ogs_analysis_id()
+   ids = run_workflow_for_current_organism.add_organism_ogs_genome_analyses()
+   org_id = None
+   genome_analysis_id = None
+   ogs_analysis_id = None
+   org_id = ids["org_id"]
+   genome_analysis_id = ids["genome_analysis_id"]
+   ogs_analysis_id = ids["ogs_analysis_id"]
    instance_attributes = run_workflow_for_current_organism.get_instance_attributes()
-   hda_ids = run_workflow_for_current_organism.import_datasets_into_history(imported_datasets_ids=imported_datasets_ids)
+   hda_ids = run_workflow_for_current_organism.import_datasets_into_history()

    strain_sex = "{0}_{1}".format(run_workflow_for_current_organism.strain, run_workflow_for_current_organism.sex)
    genus_species = run_workflow_for_current_organism.genus_species
@@ -914,13 +853,12 @@ def install_changesets_revisions_from_workflow(instance, workflow_path):

            # (If it's not installed, the show_tool version returned will be a default version with the suffix "XXXX+0")
            if show_tool["version"] != v["tool_version"]:
                # If it doesn't match, proceed to install of the correct changeset revision
-               # logging.warning("Tool versions don't match for {0} (changeset installed: {1} | changeset required: {2}). Installing changeset revision {3}...".format(v["tool_shed_repository"]["name"], show_tool["changeset_revision"], v["tool_shed_repository"]["changeset_revision"], v["tool_shed_repository"]["changeset_revision"]))
                toolshed = "https://" + v["tool_shed_repository"]["tool_shed"]
                name = v["tool_shed_repository"]["name"]
                owner = v["tool_shed_repository"]["owner"]
                changeset_revision = v["tool_shed_repository"]["changeset_revision"]
-               logging.debug("Installing changeset {0} for tool {1}".format(changeset_revision, name))
+               logging.warning("Installed tool versions for tool {0} do not match the version required by the specified workflow, installing changeset {1}".format(name, changeset_revision))

                # Install changeset
                instance.toolshed.install_repository_revision(tool_shed_url=toolshed, name=name, owner=owner,
@@ -928,6 +866,12 @@ def install_changesets_revisions_from_workflow(instance, workflow_path):
                                                              install_tool_dependencies=True,
                                                              install_repository_dependencies=False,
                                                              install_resolver_dependencies=True)
+           else:
+               toolshed = "https://" + v["tool_shed_repository"]["tool_shed"]
+               name = v["tool_shed_repository"]["name"]
+               owner = v["tool_shed_repository"]["owner"]
+               changeset_revision = v["tool_shed_repository"]["changeset_revision"]
+               logging.debug("Installed tool versions for tool {0} match the version in the specified workflow (changeset {1})".format(name, changeset_revision))

    logging.info("Tools versions and changesets from workflow validated")
@@ -992,17 +936,10 @@ if __name__ == "__main__":

    config = utilities.parse_config(args.config)

    all_sp_workflow_dict = {}

-   # IDs of already imported datasets (useful in case there are several species within the same instance)
-   # TODO: Not a very smart way to filter datasets as the list will grow at every input species, whereas
-   # we only need to do the ID lookup for a single galaxy instance --> Possible issue where we encounter 2 identical IDs in
-   # different instances
-   imported_datasets_ids = []
-
    for sp_dict in sp_dict_list:

        # Add and retrieve all analyses/organisms for the current input species and add their IDs to the input dictionary
-       current_sp_workflow_dict = create_sp_workflow_dict(sp_dict, main_dir=args.main_directory, config=config, imported_datasets_ids=imported_datasets_ids)
+       current_sp_workflow_dict = create_sp_workflow_dict(sp_dict, main_dir=args.main_directory, config=config)

        current_sp_key = list(current_sp_workflow_dict.keys())[0]
        current_sp_value = list(current_sp_workflow_dict.values())[0]
@@ -1087,6 +1024,29 @@ if __name__ == "__main__":
        org1_transcripts_hda_id = org1_dict["hda_ids"]["transcripts_hda_id"]
        org1_proteins_hda_id = org1_dict["hda_ids"]["proteins_hda_id"]
        org1_gff_hda_id = org1_dict["hda_ids"]["gff_hda_id"]

+       # Store these values into a dict for parameters logging/validation
+       org1_parameters_dict = {
+           "org1_genus": org1_genus,
+           "org1_species": org1_species,
+           "org1_genus_species": org1_genus_species,
+           "org1_species_folder_name": org1_species_folder_name,
+           "org1_full_name": org1_full_name,
+           "org1_strain": org1_strain,
+           "org1_sex": org1_sex,
+           "org1_org_id": org1_org_id,
+           "org1_genome_analysis_id": org1_genome_analysis_id,
+           "org1_ogs_analysis_id": org1_ogs_analysis_id,
+           "org1_transcripts_hda_id": org1_transcripts_hda_id,
+           "org1_proteins_hda_id": org1_proteins_hda_id,
+           "org1_gff_hda_id": org1_gff_hda_id,
+       }
+
+       # Look for empty parameters values, throw a critical error if a parameter value is invalid
+       for param_name, param_value in org1_parameters_dict.items():
+           if param_value is None or param_value == "":
+               logging.critical("Empty parameter found for organism {0} (parameter: {1}, parameter value: {2})".format(org1_full_name, param_name, param_value))
+               sys.exit()

        # Organism 2 attributes
        org2_genus = org2_dict["genus"]
@@ -1103,7 +1063,30 @@ if __name__ == "__main__":
        org2_transcripts_hda_id = org2_dict["hda_ids"]["transcripts_hda_id"]
        org2_proteins_hda_id = org2_dict["hda_ids"]["proteins_hda_id"]
        org2_gff_hda_id = org2_dict["hda_ids"]["gff_hda_id"]

+       # Store these values into a dict for parameters logging/validation
+       org2_parameters_dict = {
+           "org2_genus": org2_genus,
+           "org2_species": org2_species,
+           "org2_genus_species": org2_genus_species,
+           "org2_species_folder_name": org2_species_folder_name,
+           "org2_full_name": org2_full_name,
+           "org2_strain": org2_strain,
+           "org2_sex": org2_sex,
+           "org2_org_id": org2_org_id,
+           "org2_genome_analysis_id": org2_genome_analysis_id,
+           "org2_ogs_analysis_id": org2_ogs_analysis_id,
+           "org2_transcripts_hda_id": org2_transcripts_hda_id,
+           "org2_proteins_hda_id": org2_proteins_hda_id,
+           "org2_gff_hda_id": org2_gff_hda_id,
+       }
+
+       # Look for empty parameters values, throw a critical error if a parameter value is invalid
+       logging.info("Parameters for organism 2 (%s)" % org2_full_name)
+       for param_name, param_value in org2_parameters_dict.items():
+           if param_value is None or param_value == "":
+               logging.critical("Empty parameter found for organism {0} (parameter: {1}, parameter value: {2})".format(org2_full_name, param_name, param_value))
+               sys.exit()

        # Source files association (ordered by their IDs in the workflow)
        GENOME_FASTA_FILE_ORG1 = "0"
@@ -1177,50 +1160,54 @@ if __name__ == "__main__":

        datamap[GFF_FILE_ORG2] = {"src": "hda", "id": org2_gff_hda_id}
        datamap[PROTEINS_FASTA_FILE_ORG2] = {"src": "hda", "id": org2_proteins_hda_id}

-       with open(workflow_path, 'r') as ga_in_file:
-           # Store the decoded json dictionary
-           workflow_dict = json.load(ga_in_file)
-           workflow_name = workflow_dict["name"]
-
-           # For the Jbrowse tool, we unfortunately have to manually edit the parameters instead of setting them
-           # as runtime values, using runtime parameters makes the tool throw an internal critical error ("replace not found" error)
-           # Scratchgmod test: need "http" (or "https"), the hostname (+ port)
-           if "menu_url" not in config.keys():
-               jbrowse_menu_url_org1 = "https://{hostname}/sp/{genus_sp}/feature/{Genus}/{species}/mRNA/{id}".format(hostname=config["hostname"], genus_sp=org1_genus_species, Genus=org1_genus[0].upper() + org1_genus[1:], species=org1_species, id="\{id\}")
-               jbrowse_menu_url_org2 = "https://{hostname}/sp/{genus_sp}/feature/{Genus}/{species}/mRNA/{id}".format(hostname=config["hostname"], genus_sp=org2_genus_species, Genus=org2_genus[0].upper() + org2_genus[1:], species=org2_species, id="\{id\}")
-           else:
-               jbrowse_menu_url_org1 = config["menu_url"]
-               jbrowse_menu_url_org2 = jbrowse_menu_url_org1
-
-           # Replace values in the workflow dictionary
-           workflow_dict["steps"]["7"]["tool_state"] = workflow_dict["steps"]["7"]["tool_state"].replace("__MENU_URL_ORG1__", jbrowse_menu_url_org1)
-           workflow_dict["steps"]["8"]["tool_state"] = workflow_dict["steps"]["8"]["tool_state"].replace("__MENU_URL_ORG2__", jbrowse_menu_url_org2)
-           # The UNIQUE_ID is specific to a combination genus_species_strain_sex so every combination should have its unique workflow
-           # in galaxy --> define a naming method for these workflows
-           workflow_dict["steps"]["10"]["tool_state"] = workflow_dict["steps"]["10"]["tool_state"].replace("__FULL_NAME_ORG1__", org1_full_name).replace("__UNIQUE_ID_ORG1__", org1_species_folder_name)
-           workflow_dict["steps"]["10"]["tool_state"] = workflow_dict["steps"]["10"]["tool_state"].replace("__FULL_NAME_ORG2__", org2_full_name).replace("__UNIQUE_ID_ORG2__", org2_species_folder_name)
-
-           print(workflow_dict)
-
-           # Import the workflow in galaxy as a dict
-           instance.workflows.import_workflow_dict(workflow_dict=workflow_dict)
-           # Get its attributes
-           workflow_attributes = instance.workflows.get_workflows(name=workflow_name)
-           # Then get its ID (required to invoke the workflow)
-           workflow_id = workflow_attributes[0]["id"]  # Index 0 is the most recently imported workflow (the one we want)
-           show_workflow = instance.workflows.show_workflow(workflow_id=workflow_id)
-           # Check if the workflow is found
-           try:
-               logging.debug("Workflow ID: %s" % workflow_id)
-           except bioblend.ConnectionError:
-               logging.warning("Error finding workflow %s" % workflow_name)
-
-           # Finally, invoke the workflow alogn with its datamap, parameters and the history in which to invoke it
-           instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True)
-
-           logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url))
+       # with open(workflow_path, 'r') as ga_in_file:
+       #     # Store the decoded json dictionary
+       #     workflow_dict = json.load(ga_in_file)
+       #     workflow_name = workflow_dict["name"]
+
+       #     # For the Jbrowse tool, we unfortunately have to manually edit the parameters instead of setting them
+       #     # as runtime values, using runtime parameters makes the tool throw an internal critical error ("replace not found" error)
+       #     # Scratchgmod test: need "http" (or "https"), the hostname (+ port)
+       #     if "menu_url" not in config.keys():
+       #         jbrowse_menu_url_org1 = "https://{hostname}/sp/{genus_sp}/feature/{Genus}/{species}/mRNA/{id}".format(hostname=config["hostname"], genus_sp=org1_genus_species, Genus=org1_genus[0].upper() + org1_genus[1:], species=org1_species, id="\{id\}")
+       #         jbrowse_menu_url_org2 = "https://{hostname}/sp/{genus_sp}/feature/{Genus}/{species}/mRNA/{id}".format(hostname=config["hostname"], genus_sp=org2_genus_species, Genus=org2_genus[0].upper() + org2_genus[1:], species=org2_species, id="\{id\}")
+       #     else:
+       #         jbrowse_menu_url_org1 = config["menu_url"]
+       #         jbrowse_menu_url_org2 = jbrowse_menu_url_org1
+
+       #     # Replace values in the workflow dictionary
+       #     workflow_dict["steps"]["7"]["tool_state"] = workflow_dict["steps"]["7"]["tool_state"].replace("__MENU_URL_ORG1__", jbrowse_menu_url_org1)
+       #     workflow_dict["steps"]["8"]["tool_state"] = workflow_dict["steps"]["8"]["tool_state"].replace("__MENU_URL_ORG2__", jbrowse_menu_url_org2)
+       #     # The UNIQUE_ID is specific to a combination genus_species_strain_sex so every combination should have its unique workflow
+       #     # in galaxy --> define a naming method for these workflows
+       #     workflow_dict["steps"]["10"]["tool_state"] = workflow_dict["steps"]["10"]["tool_state"].replace("__FULL_NAME_ORG1__", org1_full_name).replace("__UNIQUE_ID_ORG1__", org1_species_folder_name)
+       #     workflow_dict["steps"]["10"]["tool_state"] = workflow_dict["steps"]["10"]["tool_state"].replace("__FULL_NAME_ORG2__", org2_full_name).replace("__UNIQUE_ID_ORG2__", org2_species_folder_name)
+
+       #     # Import the workflow in galaxy as a dict
+       #     instance.workflows.import_workflow_dict(workflow_dict=workflow_dict)
+       #     # Get its attributes
+       #     workflow_attributes = instance.workflows.get_workflows(name=workflow_name)
+       #     # Then get its ID (required to invoke the workflow)
+       #     workflow_id = workflow_attributes[0]["id"]  # Index 0 is the most recently imported workflow (the one we want)
+       #     show_workflow = instance.workflows.show_workflow(workflow_id=workflow_id)
+       #     # Check if the workflow is found
+       #     try:
+       #         logging.debug("Workflow ID: %s" % workflow_id)
+       #     except bioblend.ConnectionError:
+       #         logging.warning("Error finding workflow %s" % workflow_name)
+
+       #     # Finally, invoke the workflow alogn with its datamap, parameters and the history in which to invoke it
+       #     instance.workflows.invoke_workflow(workflow_id=workflow_id, history_id=history_id, params=workflow_parameters, inputs=datamap, allow_tool_state_corrections=True)
+
+       #     logging.info("Successfully imported and invoked workflow {0}, check the galaxy instance ({1}) for the jobs state".format(workflow_name, instance_url))

        # Get the instance attribute from the object for future connections
...