import os import pandas # xlrd required for excel files reading import numpy import json import argparse import logging from datetime import datetime """ Table parser for phaeoexplorer data. Tested with xlsand xlsx input format, should work with csv format as well Does not work for ods spreadsheets (save as xls or xlsx instead) --> need to handle with pandas_ods_reader (requires ezodf, lxml) Does not support multiple sheets (TODO: "integration" and "update" sheets (1 and 2)) See example toy table (toy_table.xls) standalone usage: python3 table_parser.py <tabulated_file> -d <directory_to_write_json_to (default: cwd)> """ class TableParser: def __init__(self, table_file, dir): self.dir = os.path.abspath(args.dir) self.table_file = table_file self.method = None # TODO: instant launch or just parse (standalone) self.extension = None self.meta = dict() self.json_file = None def parse_table(self, extension): if extension == "xls": pandas_table = pandas.DataFrame(pandas.read_excel(self.table_file)) pandas_table = pandas_table.replace(numpy.nan, "", regex=True) for char in " ,.()-/": pandas_table = pandas_table.replace("\\" + char, "_", regex=True) pandas_table = pandas_table.replace("\\__", "_", regex=True) pandas_table.loc[pandas_table["genome version"] == "", "genome version"] = "1.0" pandas_table.loc[pandas_table["ogs version"] == "", "ogs version"] = "1.0" pandas_table.loc[pandas_table["version"] == "", "version"] = "1.0" pandas_table.loc[pandas_table["date"] == "", "date"] = datetime.today().strftime("%Y-%m-%d") with open(os.path.join(self.dir, self.json_file), 'w') as json_file: json_file.truncate(0) json_content = list() for organism in range(0, len(pandas_table.index)): organism_data = pandas_table.iloc[organism] organism_dict = organism_data.to_dict() for k, v in organism_dict.items(): v = str(v).split(" ") v = "_".join(v) v = v.replace("__", "_") if v.endswith("_"): v = v[:-1] json_content.append(organism_dict) json.dump(json_content, json_file, indent=4) elif extension == "csv": pandas_table = pandas.DataFrame(pandas.read_csv(self.table_file)) pandas_table = pandas_table.replace(numpy.nan, "", regex=True) for char in " ,.()-/": pandas_table = pandas_table.replace("\\" + char, "_", regex=True) pandas_table = pandas_table.replace("\\__", "_", regex=True) pandas_table.loc[pandas_table["genome version"] == "", "genome version"] = "1.0" pandas_table.loc[pandas_table["ogs version"] == "", "ogs version"] = "1.0" pandas_table.loc[pandas_table["version"] == "", "version"] = "1.0" pandas_table.loc[pandas_table["date"] == "", "date"] = datetime.today().strftime("%Y-%m-%d") with open(os.path.join(self.dir, self.json_file), 'w') as json_file: json_file.truncate(0) json_content = list() for organism in range(0, len(pandas_table.index)): organism_data = pandas_table.iloc[organism] organism_dict = organism_data.to_dict() for k, v in organism_dict.items(): v = str(v).split(" ") v = "_".join(v) v = v.replace("__", "_") if v.endswith("_"): v = v[:-1] json_content.append(organism_dict) json.dump(json_content, json_file, indent=4) else: logging.info("input tabulated file doesn't have the correct extension (supported extensions: xls, xlsx, csv)") def write_json(data, filename): with open(filename, 'w') as f: json.dump(data, f, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Table parser for phaeoexplorer data") parser.add_argument("input", type=str, help="input table") parser.add_argument("-d", "--dir", type=str, help="Where to write the output json file that is be used for integration", default = os.getcwd()) args = parser.parse_args() if args.input.endswith("xlsx") or args.input.endswith("xls"): tp = TableParser(table_file=args.input, dir=args.dir) tp.extension = args.input.split(".")[1] tp.json_file = tp.dir + "/dataloader_" + datetime.today().strftime("%Y%m%d") + ".json" tp.parse_table(extension="xls")