# table_parser.py

import os
import sys
import pandas  # xlrd required for excel files reading
import numpy
import json
import argparse
import logging
from datetime import datetime

"""
!! OBSOLETE !!

Input table parser script.
Does not work for ods spreadsheets (save them as xls or xlsx instead); supporting them would require pandas_ods_reader (which depends on ezodf and lxml).
Does not support multiple sheets (TODO: handle the "integration" and "update" sheets (1 and 2)).
See the example toy table (toy_table.xls).

standalone usage: python3 table_parser.py <tabulated_file> -d <directory_to_write_json_to (default: cwd)>

"""


class TableParser:
	"""
	Convert a tabulated organisms file (xls, xlsx or csv) into a JSON file.

	Each row of the table becomes one dictionary in a JSON list. The table is
	expected to contain the columns "genome version", "ogs version", "version"
	and "date"; empty cells in those columns receive default values.

	Attributes set by the constructor:
		dir: absolute path of the directory the JSON file is written to
		table_file: path of the input table
		method, extension, meta: placeholders, not used by the parsing itself
		json_file: name (or absolute path) of the JSON file to write; when left
			to None, parse_table falls back to "dataloader_<YYYYMMDD>.json"
	"""

	def __init__(self, table_file, dir):
		"""
		:param table_file: path of the tabulated input file
		:param dir: directory the JSON output is written to (name kept for
			backward compatibility even though it shadows the builtin)
		"""
		# Use the constructor argument rather than the script-level "args" global,
		# which only exists when this file is run as a standalone script
		self.dir = os.path.abspath(dir)
		self.table_file = table_file
		self.method = None  # TODO: instant launch or just parse (standalone)
		self.extension = None
		self.meta = dict()
		self.json_file = None

	@staticmethod
	def _clean_value(value):
		"""
		Normalise one string cell: spaces become underscores, a doubled
		underscore is collapsed and a single trailing underscore is dropped.
		Non-string cells are returned untouched so numbers stay numbers in the JSON.
		"""
		if not isinstance(value, str):
			return value
		cleaned = "_".join(value.split(" ")).replace("__", "_")
		return cleaned[:-1] if cleaned.endswith("_") else cleaned

	def parse_table(self, extension):
		"""
		Read the input table, normalise its content and dump it as JSON.

		:param extension: format of the input table: "xls" or "xlsx" (both read
			with pandas.read_excel) or "csv"
		:raises SystemExit: (status 1) when the extension is not supported
		:raises KeyError: when one of the columns "genome version",
			"ogs version", "version" or "date" is missing from the table
		"""
		if extension in ("xls", "xlsx"):
			pandas_table = pandas.read_excel(self.table_file)
		elif extension == "csv":
			pandas_table = pandas.read_csv(self.table_file)
		else:
			# Logged as an error (INFO is hidden by the default logging configuration)
			# and reported through a non-zero exit status
			logging.error("wrong format: input tabulated file cannot be read (supported formats: xls, xlsx, csv)")
			sys.exit(1)
		pandas_table = pandas_table.replace(numpy.nan, "", regex=True)

		# Every separator character becomes an underscore in a single pass,
		# then doubled underscores created by adjacent separators are collapsed
		pandas_table = pandas_table.replace(r"[ ,.()\-/]", "_", regex=True)
		pandas_table = pandas_table.replace(r"__", "_", regex=True)

		# Defaults are applied after the character replacement so that their
		# own "." and "-" characters are preserved
		defaults = {
			"genome version": "1.0",
			"ogs version": "1.0",
			"version": "1.0",
			"date": datetime.today().strftime("%Y-%m-%d"),
		}
		for column, default in defaults.items():
			pandas_table.loc[pandas_table[column] == "", column] = default

		# Build the whole content before opening the output file so that a
		# failure above does not leave an empty JSON file behind
		json_content = [
			{key: self._clean_value(value) for key, value in row.to_dict().items()}
			for _, row in pandas_table.iterrows()
		]

		json_name = self.json_file
		if json_name is None:
			json_name = "dataloader_" + datetime.today().strftime("%Y%m%d") + ".json"
		# os.path.join returns json_name unchanged when it is already an absolute path
		with open(os.path.join(self.dir, json_name), 'w') as json_file:
			json.dump(json_content, json_file, indent=4)

	def write_json(self, data, filename):
		"""
		Dump data as indented JSON into filename (overwriting it).

		:param data: JSON-serialisable object
		:param filename: path of the file to write
		"""
		with open(filename, 'w') as f:
			json.dump(data, f, indent=4)


if __name__ == "__main__":
	# Standalone entry point: parse the table given on the command line and
	# write "dataloader_<YYYYMMDD>.json" into the chosen directory
	parser = argparse.ArgumentParser(description="Table parser for phaeoexplorer data")
	parser.add_argument("input", type=str, help="input table")
	parser.add_argument("-d", "--dir", type=str, help="Where to write the output json file that is be used for integration", default=os.getcwd())
	args = parser.parse_args()

	# splitext keeps only the final suffix, so paths containing other dots
	# (e.g. "./tables/v1.2.xlsx") still yield the right extension
	input_extension = os.path.splitext(args.input)[1].lstrip(".").lower()
	# Maps the file extension to the reader name understood by parse_table;
	# both excel flavours go through its "xls" branch
	readers = {"xls": "xls", "xlsx": "xls", "csv": "csv"}
	if input_extension not in readers:
		# Report unsupported inputs instead of silently doing nothing
		parser.error("wrong format: input tabulated file cannot be read (supported formats: xls, xlsx, csv)")

	tp = TableParser(table_file=args.input, dir=args.dir)
	tp.extension = input_extension
	tp.json_file = os.path.join(tp.dir, "dataloader_" + datetime.today().strftime("%Y%m%d") + ".json")
	tp.parse_table(extension=readers[input_extension])