Commit 13e0e68a authored by Arthur Le Bars's avatar Arthur Le Bars


Pipeline v1.0 ready; missing orthology data and template workflows for blastp (diamond) and interpro. The data preprocessing script for phaeoexplorer is available in the phaeoexplorer-tools repo (but is not usable as a standalone tool)
parent 16785175
1 merge request!1Release 1.0
......@@ -53,70 +53,70 @@ class GetData(speciesData.SpeciesData):
return 1
def batch_modify_fasta_headers(self):
"""
Change the fasta headers before integration, so that the indexing tool in Galaxy
interprets the headers correctly and doesn't throw an error
The function uses the class attribute "source_datasets", pointing to files in the Galaxy
library, to find the fasta files that need their headers formatted
:return:
"""
proteins_file = None
proteins_outfile = None
annotation_dir = None
organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
self.goto_species_dir()
for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
if "annotation" in d and self.species_folder_name in d and self.ogs_version in d:
for f in os.listdir(d):
if "proteins" in f:
proteins_file = os.path.join(d, f)
proteins_outfile = os.path.join(d, "outfile_proteins.fa")
annotation_dir = os.path.abspath(d)
# Formatting the headers
if proteins_file is not None:
self.format_fasta_headers(infile=proteins_file,
outfile=proteins_outfile,
pattern="^>mRNA",
repl=">protein")
if os.path.exists(proteins_outfile):
# Overwrite the original proteins file with the reformatted one; os.replace
# removes the temporary outfile as part of the move, so the separate "rm" call
# (which always failed after the old "mv") is no longer needed
os.replace(proteins_outfile, proteins_file)
else:
logging.warning("Skipping proteins fasta headers formatting (FileNotFound)")
@staticmethod
def format_fasta_headers(infile, outfile, pattern, repl):
"""
Format the fasta headers of a given file, given a matching pattern and a replacement string
:param infile:
:param outfile:
:param pattern:
:param repl:
:return:
"""
# Stream the file line by line and let context managers close the handles
with open(infile, 'r') as fasta_in, open(outfile, 'w') as fasta_out:
for line in fasta_in:
fasta_out.write(re.sub(pattern, repl, line))
# def batch_modify_fasta_headers(self):
# """
# Change the fasta headers before integration, so that the indexing tool in galaxy interprets the headers
# correctly and doesn't throw an error
# The function will use the class attribute "source_datasets", pointing to files in the galaxy
# library to find the fasta files that need their headers formatted
# :return:
# """
# proteins_file = None
# proteins_outfile = None
# annotation_dir = None
# organism_annotation_dir = os.path.abspath("./src_data/annotation/{0}/OGS{1}".format(self.species_folder_name, self.ogs_version))
# self.goto_species_dir()
# for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
# if "annotation" in d and self.species_folder_name in d and self.ogs_version in d:
# for f in os.listdir(d):
# if "proteins" in f:
# proteins_file = os.path.join(d, f)
# proteins_outfile = os.path.join(d, "outfile_proteins.fa")
# annotation_dir = os.path.abspath(d)
# # Formatting the headers
# if proteins_file is not None:
# self.format_fasta_headers(infile=proteins_file,
# outfile=proteins_outfile,
# pattern="^>mRNA",
# repl=">protein")
# if os.path.exists(annotation_dir + "/outfile_proteins.fa"):
# subprocess.call(["mv", annotation_dir + "/outfile_proteins.fa", proteins_file],
# stdout=subprocess.PIPE,
# stderr=subprocess.PIPE,
# cwd=annotation_dir)
# subprocess.call(["rm", annotation_dir + "/outfile_proteins.fa"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=annotation_dir)
# else:
# logging.warning("Skipping proteins fasta headers formatting (FileNotFound)")
# @staticmethod
# def format_fasta_headers(infile, outfile, pattern, repl):
# """
# Format the fasta headers of a given file, given a matching pattern and a replacement string
# :param infile:
# :param outfile:
# :param pattern:
# :param repl:
# :return:
# """
# infile = open(infile, 'r')
# outfile = open(outfile, 'w')
# lines = infile.readlines()
# for line in lines:
# line_out = re.sub(pattern, repl, line)
# outfile.write(line_out)
# infile.close()
# outfile.close()
def get_source_data_files_from_path(self):
"""
......@@ -297,8 +297,8 @@ if __name__ == "__main__":
logging.info("Sucessfully copied datasets for %s" % get_data_for_current_species.full_name)
# Format fasta headers (proteins)
logging.info("Formatting fasta file headers for %s" % get_data_for_current_species.full_name)
get_data_for_current_species.batch_modify_fasta_headers()
logging.info("Successfully formatted fasta file headers for %s" % get_data_for_current_species.full_name)
# logging.info("Formatting fasta file headers for %s" % get_data_for_current_species.full_name)
# get_data_for_current_species.batch_modify_fasta_headers()
# logging.info("Successfully formatted fasta file headers for %s" % get_data_for_current_species.full_name)
logging.info("Data successfully loaded and imported for %s" % get_data_for_current_species.full_name)
......@@ -21,6 +21,11 @@ gga_init.py
Usage: $ python3 gga_init.py -i input_example.yml --config config.yml [OPTIONS]
TODO
- Move the traefik directory tree creation and file writing out of the species loop (make make_dirs() a module-level
function, and add a module-level function similar to make_compose_files() for traefik and authelia; see the sketch below)
"""
......@@ -82,11 +87,15 @@ class DeploySpeciesStack(speciesData.SpeciesData):
try:
os.mkdir("./src_data")
except FileExistsError:
logging.debug("src_data folder already exist for %s" % self.full_name)
logging.debug("'src_data' directory already exist for %s" % self.full_name)
except PermissionError as exc:
logging.critical("Insufficient permission to create src_data directory tree")
sys.exit(exc)
logging.debug("Strain: %s" % self.strain)
logging.debug("Species folder name: %s" % self.species_folder_name)
# List of all the directories to create in src_data
src_data_dirs_li = ["./src_data", "./src_data/annotation", "./src_data/genome", "./src_data/tracks",
"./src_data/annotation/%s" % self.species_folder_name,
......@@ -160,12 +169,32 @@ class DeploySpeciesStack(speciesData.SpeciesData):
# Create the volumes (directory) of the species docker-compose file
self.create_mounts(working_dir=".")
# Return to main directory
os.chdir(self.main_dir)
def make_traefik_compose_files(self):
"""
Create or update the traefik docker-compose file and authelia conf files
Will only write new authelia conf files if the argument "--overwrite-all" is specified or
the authelia directory doesn't contain conf files
:return:
"""
# Proceed to the traefik and authelia directories
os.chdir(self.main_dir)
# Create directory tree
self.make_dirs(["./traefik", "./traefik/authelia"])
# Render and try to write the traefik docker-compose file
# This new docker-compose file will not overwrite the one already present in the traefik dir
# unless the argument "--overwrite-all" is specified
# Jinja2 templating, handled using the python "jinja2" module
file_loader = FileSystemLoader(self.script_dir + "/templates")
env = Environment(loader=file_loader)
if not os.path.isfile("./traefik/docker-compose.yml") or force:
traefik_compose_template = env.get_template("traefik_compose_template.yml.j2")
traefik_compose_output = traefik_compose_template.render(render_vars)
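# The Jinja2 rendering pattern used here, condensed into a standalone sketch
# (render_vars is defined elsewhere in this method; the values below are hypothetical):
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("templates"))  # directory holding the *.j2 templates
template = env.get_template("traefik_compose_template.yml.j2")
render_vars = {"hostname": "localhost"}  # hypothetical template variables
with open("./traefik/docker-compose.yml", "w") as compose_file:
    compose_file.write(template.render(render_vars))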
......@@ -199,6 +228,7 @@ class DeploySpeciesStack(speciesData.SpeciesData):
# Return to main directory
os.chdir(self.main_dir)
def create_mounts(self, working_dir):
"""
Create the folders (volumes) required by a container (to see required volumes, check their compose file)
......@@ -249,41 +279,59 @@ class DeploySpeciesStack(speciesData.SpeciesData):
logging.critical("Cannot access %s, exiting" % self.main_dir)
sys.exit(exc)
def deploy_stack(self, input_list):
"""
Call the script "deploy.sh" used to initiliaze the swarm cluster if needed and
launch/update the current organism's stack
This script first try to deploy the traefik stack, then deploy the organism stack, then update the traefik stack
The stacks are updated if already deployed
:return:
"""
to_deploy_species_li = []
def deploy_stacks(input_list, main_dir):
"""
This function first deploys/redeploys the traefik stack, then deploys/redeploys the organism stacks, then redeploys the traefik stack
This function is executed outside the "main" loop over the input species
:return:
"""
# Launch and update docker stacks
# noinspection PyArgumentList
# deploy_stacks_popen = subprocess.Popen(["sh", self.script_dir + "/deploy.sh", self.genus_species,
# self.main_dir + "/traefik"],
# stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
# universal_newlines=True)
#
# for stdout_line in iter(deploy_stacks_popen.stdout.readline, ""):
# if "daemon" in stdout_line: # Ignore swarm init error output
# pass
# else:
# logging.info("\t%s" % stdout_line.strip())
# deploy_stacks_popen.stdout.close()
main_dir = os.path.abspath(main_dir)
os.chdir(main_dir)
# Get species for which to deploy the stacks
to_deploy_species_li = utilities.get_species_to_deploy(sp_dict_list=input_list)
# Create the swarm cluster if needed
subprocess.call(["docker", "swarm", "init"],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=main_dir)
# Deploy traefik stack
os.chdir("./traefik")
subprocess.call(["docker", "stack", "deploy", "-c", "./docker-compose.yml", "traefik"],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=".")
os.chdir(main_dir)
# Deploy individual species stacks
for sp in to_deploy_species_li:
os.chdir(sp)
subprocess.call(["docker", "stack", "deploy", "-c", "./docker-compose.yml", "{0}_{1}".format(sp.split("_")[0], sp.split("_")[1])],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=".")
logging.info("Deployed %s stack" % sp)
os.chdir(main_dir)
# Update traefik stack
os.chdir("./traefik")
subprocess.call(["docker", "stack", "deploy", "-c", "./docker-compose.yml", "traefik"],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=".")
os.chdir(main_dir)
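# Note: the subprocess.call invocations above pipe stdout/stderr and ignore return
# codes, so a failed "docker stack deploy" passes silently. A hedged variant that
# surfaces failures (a sketch, not part of this commit):
import subprocess

def checked_stack_deploy(compose_dir, stack_name):
    # check_call raises CalledProcessError on a non-zero exit code
    subprocess.check_call(["docker", "stack", "deploy", "-c", "./docker-compose.yml", stack_name],
                          cwd=compose_dir)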
# # Using deploy.sh script (obsolete)
# Launch and update docker stacks
# noinspection PyArgumentList
# deploy_stacks_popen = subprocess.Popen(["sh", self.script_dir + "/deploy.sh", self.genus_species,
# self.main_dir + "/traefik"],
# stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
# universal_newlines=True)
#
# for stdout_line in iter(deploy_stacks_popen.stdout.readline, ""):
# if "daemon" in stdout_line: # Ignore swarm init error output
# pass
# else:
# logging.info("\t%s" % stdout_line.strip())
# deploy_stacks_popen.stdout.close()
if __name__ == "__main__":
......@@ -332,9 +380,6 @@ if __name__ == "__main__":
main_dir = os.path.abspath(args.main_directory)
sp_dict_list = utilities.parse_input(os.path.abspath(args.input))
logging.debug(sp_dict_list)
utilities.get_species_to_deploy(sp_dict_list=sp_dict_list)
logging.info("Deploying stacks for organisms in input file %s" % args.input)
for sp_dict in sp_dict_list:
......@@ -375,14 +420,6 @@ if __name__ == "__main__":
deploy_stack_for_current_organism.make_compose_files(force=args.overwrite_all)
logging.info("Successfully generated the docker-compose files for %s" % deploy_stack_for_current_organism.full_name)
# Deploy the stack
logging.info("Deploying stack for %s..." % deploy_stack_for_current_organism.full_name)
# deploy_stack_for_current_organism.deploy_stack()
logging.info("Successfully deployed stack for %s" % deploy_stack_for_current_organism.full_name)
logging.info("Stack deployed for %s" % deploy_stack_for_current_organism.full_name)
# TODO: if GENUS#1 == GENUS#2 AND SP#1 == SP#2 --> skip init, continue to next item and only deploy once the loop is done
# TODO: reload traefik outside loop
logging.info("Deploying stacks")
deploy_stacks(input_list=sp_dict_list, main_dir=main_dir)
logging.info("All stacks deployed for organisms in input file %s" % args.input)
......@@ -183,11 +183,34 @@ class RunWorkflow(speciesData.SpeciesData):
"sourcename": "Genoscope",
"date_executed": self.date})
# Add Interpro analysis to chado
self.instance.tools.run_tool(
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3",
history_id=self.history_id,
tool_inputs={"name": "InterproScan on OGS%s" % self.ogs_version,
"program": "InterproScan",
"programversion": "OGS%s" % self.ogs_version,
"sourcename": "Genoscope",
"date_executed": self.date})
# Add Blastp (diamond) analysis to chado
self.instance.tools.run_tool(
tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.3",
history_id=self.history_id,
tool_inputs={"name": "Diamond on OGS%s" % self.ogs_version,
"program": "Diamond",
"programversion": "OGS%s" % self.ogs_version,
"sourcename": "Genoscope",
"date_executed": self.date})
# Also get the organism and analyses IDs
self.get_organism_and_analyses_ids()
logging.info("Finished initializing instance")
def run_workflow(self, workflow_path, workflow_parameters, workflow_name, datamap):
"""
Run a workflow in Galaxy
......@@ -641,6 +664,23 @@ if __name__ == "__main__":
datamap=run_workflow_for_current_organism.datamap,
workflow_name="Jbrowse")
elif "Interpro" in str(workflow):
logging.info("Executing workflow 'Interproscan")
run_workflow_for_current_organism.connect_to_instance()
run_workflow_for_current_organism.set_get_history()
# run_workflow_for_current_organism.get_species_history_id()
# Get the attributes of the instance and project data files
run_workflow_for_current_organism.get_instance_attributes()
run_workflow_for_current_organism.get_organism_and_analyses_ids()
# Import datasets into history and retrieve their hda IDs
run_workflow_for_current_organism.import_datasets_into_history()
run_workflow_for_current_organism.get_datasets_hda_ids()
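# The Interproscan template workflow itself is still missing in this commit (see the commit
# message); once available, the invocation would presumably mirror the Jbrowse branch above,
# e.g. (hypothetical parameters):
# run_workflow_for_current_organism.run_workflow(workflow_path=workflow,
#                                                workflow_parameters=workflow_parameters,
#                                                datamap=run_workflow_for_current_organism.datamap,
#                                                workflow_name="Interpro")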
else:
logging.critical("The galaxy container for %s is not ready yet!" % run_workflow_for_current_organism.full_name)
sys.exit()
......@@ -82,5 +82,3 @@ class SpeciesData:
self.species_folder_name = "_".join(utilities.filter_empty_not_empty_items([self.genus_lowercase, self.species, self.strain, self.sex])["not_empty"])
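# e.g. genus_lowercase="ectocarpus", species="siliculosus", strain="", sex="male" would
# presumably yield "ectocarpus_siliculosus_male" (hypothetical values; empty items are filtered out)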
self.existing_folders_cache = {}
self.bam_metadata_cache = {}
# self.do_update = False # Update the instance (in histories corresponding to the input) instead of creating a new one
# self.species_name_regex_litteral = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # Placeholder regex for file matching
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
import argparse
import pickle
import logging
def orthologs_transfer(manual_annotation_file, orthofinder_file, gff_file, outdir):
"""
Transfer descriptions from a manual annotation file to a GFF file, using an Orthofinder output file
to map the corresponding IDs between the two files
:param manual_annotation_file:
:param orthofinder_file:
:param gff_file:
:param outdir:
:return:
"""
manual_annotations_dict, orthofinder_dict = {}, {}
with open(orthofinder_file, 'r') as orthofinder:
next(orthofinder) # Skip header
# Mapping the species' mRNA IDs to their Ec-32 ortholog IDs
logging.info("Mapping Orthofinder's IDs to manual annotation's IDs...")
for orthofinder_line in orthofinder:
orthofinder_line_split = orthofinder_line.rstrip().split("\t")
orthofinder_dict[orthofinder_line_split[1].split(" ")[0]] = orthofinder_line_split[2].split(" ")[0]
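# e.g. an Orthofinder line "OG0000001<TAB>mRNA.1 ...<TAB>Ec-32_g1 ..." maps
# orthofinder_dict["mRNA.1"] = "Ec-32_g1" (hypothetical IDs, assuming this column layout)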
with open(manual_annotation_file, 'r') as manual_annotation:
# Mapping manual annotation IDs to their descriptions
logging.info("Mapping manual annotation IDs to descriptions...")
next(manual_annotation) # Skip header
for manual_annotation_line in manual_annotation:
manual_annotation_line_split = manual_annotation_line.rstrip().split("\t")
manual_annotations_dict[manual_annotation_line_split[0]] = ";".join([manual_annotation_line_split[0], manual_annotation_line_split[6]])
# Opening GFF, appending manual annotation's description to matching IDs
logging.info("Transferring manual descriptions to the GFF file...")
output_filename = str(os.path.join(os.path.abspath(outdir), os.path.basename(gff_file).split(".")[0]) + "_TRANSFERED.gff")
with open(output_filename, 'w') as output_file: # 'w' truncates any previous file content
with open(gff_file, 'r') as gff:
for gff_line in gff:
if "ID=mRNA" in gff_line: # Look for mRNA items
gff_line_id = gff_line.split("\t")[8].split("=")[1].split(";")[0] # GFF file ID (matches manual annotation IDs)
if gff_line_id in orthofinder_dict.keys(): # The gff ID is supposed to match a unique ID (key) in the orthofinder file
try:
manual_annotation_value = manual_annotations_dict[orthofinder_dict[gff_line_id]] # Manual annotation description of the matching ortholog
gff_line = "{0};ec32_ortholog={1};ec32_ortholog_description={2}\n".format(gff_line.strip(), orthofinder_dict[gff_line_id], manual_annotation_value)
except KeyError: # Some orthologs may be missing from the manual annotation file; write the line unmodified
pass
output_file.write(gff_line)
logging.info("Finished transferring descriptions for %s" % str(os.path.basename(gff_file)))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Orthologs_transfer between a reference manual annotation and Orthofinder outputs. \
\n\nRun example: python3 ec32_orthologs_transfer.py -d /path/to/desired/dir/ -o orthofinder_file.tsv \
-g gff_file.gff -m manual_annotation_file.txt")
parser.add_argument("-o", "--orthofinder",
help="Orthofinder output (tabulated)")
parser.add_argument("-m", "--manual-annotation",
help="Manual annotation file (tabulated) or pickle file containing the dicionary (automatically generated when you input a new manual annotation file)")
parser.add_argument("-g", "--gff",
help="GFF input file (tabulated)")
parser.add_argument("-d", "--out-dir",
help="Output directory")
args = parser.parse_args()
# WARNING for shell use case:
# The script doesn't check if the input files are valid.
# Files must be put in this order: $1=orthofinder file, $2=manual annotation file, $3=GFF file, $4=optional output directory
if not args.manual_annotation:
logging.error("Please input a manual annotation file or a pickled manual annotation dictionary")
sys.exit()
if not args.orthofinder:
logging.error("Please input an orthofinder output file")
sys.exit()
if not args.gff:
logging.error("Please input a gff file")
sys.exit()
if not args.out_dir:
args.out_dir = "."
orthologs_transfer(manual_annotation_file=args.manual_annotation,
orthofinder_file=args.orthofinder,
gff_file=args.gff,
outdir=args.out_dir)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
import argparse
import logging
def hectar_transfer(hectar_file, orthofinder_file, gff_file, outdir):
"""
Transfer HECTAR predicted targeting categories from a HECTAR output file to a GFF file, using an Orthofinder
output file to map the corresponding IDs between the two files
:param hectar_file:
:param orthofinder_file:
:param gff_file:
:param outdir:
:return:
"""
orthofinder_dict = {}
with open(orthofinder_file, 'r') as orthofinder:
next(orthofinder) # Skip header
# Mapping orthofinder's IDs to hectar IDs
logging.info("Mapping Orthofinder's IDs to hectar IDs...")
for orthofinder_line in orthofinder:
orthofinder_line_split = orthofinder_line.rstrip().split("\t")
orthofinder_dict[orthofinder_line_split[1].split(" ")[0]] = orthofinder_line_split[2].split(" ")[0]
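# NB: orthofinder_dict is built here but not read below; GFF IDs are matched directly
# against the hectar IDs collected in hectar_dict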
hectar_dict = {}
with open(hectar_file, 'r') as hectar:
# Mapping hectar IDs to their predicted targeting categories
logging.info("Mapping hectar IDs to descriptions...")
next(hectar) # Skip header
for hectar_line in hectar:
hectar_line_split = hectar_line.rstrip().split("\t")
hectar_dict[hectar_line_split[0].split(" ")[0]] = hectar_line_split[1]
# Opening GFF, appending hectar description to matching IDs
logging.info("Transferring manual descriptions to the GFF file...")
output_filename = str(os.path.join(os.path.abspath(outdir), os.path.basename(gff_file).split(".")[0]) + "_TRANSFERED_HECTAR.gff")
with open(output_filename, 'w') as output_file: # 'w' truncates any previous file content
with open(gff_file, 'r') as gff:
for gff_line in gff:
if "ID=mRNA" in gff_line: # Look for mRNA items
gff_line_id = gff_line.split("\t")[8].split("=")[1].split(";")[0] # GFF file ID (matches hectar IDs)
if gff_line_id in hectar_dict.keys(): # The gff ID is supposed to match a unique ID (key) in the hectar file
try:
hectar_value = hectar_dict[gff_line_id] # Find the corresponding hectar category to transfer
gff_line = "{0};HECTAR_predicted_targeting_category={1}\n".format(gff_line.strip(), hectar_value)
except KeyError: # In case a value is missing from the hectar file; write the line unmodified
pass
output_file.write(gff_line)
logging.info("Finished transferring descriptions for %s" % str(os.path.basename(gff_file)))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Transfer of HECTAR predicted targeting categories to a GFF file, using Orthofinder outputs. \
\n\nRun example: python3 ec32_orthologs_transfer.py -d /path/to/desired/dir/ -i hectar_file.tsv \
-o orthofinder_file.tsv -g gff_file.gff")
parser.add_argument("-i", "--hectar",
help="Hectar output (tabulated)")
parser.add_argument("-o", "--orthofinder",
help="Manual annotation file (tabulated) or pickle file containing the dicionary (automatically generated when you input a new manual annotation file)")
parser.add_argument("-g", "--gff",
help="GFF input file (tabulated)")
parser.add_argument("-d", "--out-dir",
help="Output directory")
args = parser.parse_args()
# WARNING for shell use case:
# The script doesn't check if the input files are valid.
# Files must be put in this order: $1=orthofinder file, $2=hectar file, $3=GFF file, $4=optional output directory
if not args.hectar:
logging.error("Please input a hectar file")
sys.exit()
if not args.orthofinder:
logging.error("Please input an orthofinder output file")
sys.exit()
if not args.gff:
logging.error("Please input a gff file")
sys.exit()
if not args.out_dir:
args.out_dir = "."
hectar_transfer(hectar_file=args.hectar,
orthofinder_file=args.orthofinder,
gff_file=args.gff,
outdir=args.out_dir)