diff options
Diffstat (limited to 'include/orcus')
79 files changed, 16496 insertions, 0 deletions
diff --git a/include/orcus/Makefile.am b/include/orcus/Makefile.am new file mode 100644 index 0000000..865e8e6 --- /dev/null +++ b/include/orcus/Makefile.am @@ -0,0 +1,88 @@ +SUBDIRS = detail spreadsheet + +liborcusdir = $(includedir)/liborcus-@ORCUS_API_VERSION@/orcus +liborcus_HEADERS = \ + base64.hpp \ + cell_buffer.hpp \ + config.hpp \ + css_document_tree.hpp \ + css_parser.hpp \ + css_parser_base.hpp \ + css_selector.hpp \ + css_types.hpp \ + csv_parser.hpp \ + csv_parser_base.hpp \ + dom_tree.hpp \ + env.hpp \ + exception.hpp \ + format_detection.hpp \ + info.hpp \ + interface.hpp \ + json_document_tree.hpp \ + json_global.hpp \ + json_parser.hpp \ + json_parser_base.hpp \ + json_parser_thread.hpp \ + json_structure_tree.hpp \ + measurement.hpp \ + orcus_csv.hpp \ + orcus_json.hpp \ + orcus_xml.hpp \ + parser_base.hpp \ + parser_global.hpp \ + sax_parser.hpp \ + sax_parser_base.hpp \ + sax_ns_parser.hpp \ + sax_token_parser.hpp \ + sax_token_parser_thread.hpp \ + stream.hpp \ + string_pool.hpp \ + threaded_json_parser.hpp \ + threaded_sax_token_parser.hpp \ + tokens.hpp \ + types.hpp \ + xml_namespace.hpp \ + xml_structure_tree.hpp \ + xml_writer.hpp \ + yaml_document_tree.hpp \ + yaml_parser.hpp \ + yaml_parser_base.hpp \ + zip_archive.hpp \ + zip_archive_stream.hpp + +if WITH_ODS_FILTER + +liborcus_HEADERS += \ + orcus_ods.hpp \ + orcus_import_ods.hpp + +endif # WITH_ODS_FILTER + +if WITH_XLSX_FILTER + +liborcus_HEADERS += \ + orcus_xlsx.hpp \ + orcus_import_xlsx.hpp + +endif # WITH_XLSX_FILTER + +if WITH_XLS_XML_FILTER + +liborcus_HEADERS += \ + orcus_xls_xml.hpp + +endif # WITH_XLS_XML_FILTER + +if WITH_GNUMERIC_FILTER + +liborcus_HEADERS += \ + orcus_gnumeric.hpp + +endif # WITH_GNUMERIC_FILTER + +if WITH_PARQUET_FILTER + +liborcus_HEADERS += \ + orcus_parquet.hpp + +endif # WITH_PARQUET_FILTER diff --git a/include/orcus/Makefile.in b/include/orcus/Makefile.in new file mode 100644 index 0000000..0070b0f --- /dev/null +++ b/include/orcus/Makefile.in @@ 
-0,0 +1,828 @@ +# Makefile.in generated by automake 1.16.5 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2021 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes 
+am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +@WITH_ODS_FILTER_TRUE@am__append_1 = \ +@WITH_ODS_FILTER_TRUE@ orcus_ods.hpp \ +@WITH_ODS_FILTER_TRUE@ orcus_import_ods.hpp + +@WITH_XLSX_FILTER_TRUE@am__append_2 = \ +@WITH_XLSX_FILTER_TRUE@ orcus_xlsx.hpp \ +@WITH_XLSX_FILTER_TRUE@ orcus_import_xlsx.hpp + +@WITH_XLS_XML_FILTER_TRUE@am__append_3 = \ +@WITH_XLS_XML_FILTER_TRUE@ orcus_xls_xml.hpp + +@WITH_GNUMERIC_FILTER_TRUE@am__append_4 = \ +@WITH_GNUMERIC_FILTER_TRUE@ orcus_gnumeric.hpp + +@WITH_PARQUET_FILTER_TRUE@am__append_5 = \ +@WITH_PARQUET_FILTER_TRUE@ orcus_parquet.hpp + +subdir = include/orcus +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cxx_compile_stdcxx.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_17.m4 \ + $(top_srcdir)/m4/boost.m4 $(top_srcdir)/m4/libtool.m4 \ + $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \ + $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_valgrind_check.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__liborcus_HEADERS_DIST) \ + $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +AM_V_P 
= $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +SOURCES = +DIST_SOURCES = +RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \ + ctags-recursive dvi-recursive html-recursive info-recursive \ + install-data-recursive install-dvi-recursive \ + install-exec-recursive install-html-recursive \ + install-info-recursive install-pdf-recursive \ + install-ps-recursive install-recursive installcheck-recursive \ + installdirs-recursive pdf-recursive ps-recursive \ + tags-recursive uninstall-recursive +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__liborcus_HEADERS_DIST = base64.hpp cell_buffer.hpp config.hpp \ + css_document_tree.hpp css_parser.hpp css_parser_base.hpp \ + css_selector.hpp css_types.hpp csv_parser.hpp \ + csv_parser_base.hpp dom_tree.hpp env.hpp exception.hpp \ + format_detection.hpp info.hpp interface.hpp \ + json_document_tree.hpp json_global.hpp json_parser.hpp \ + json_parser_base.hpp json_parser_thread.hpp \ + json_structure_tree.hpp measurement.hpp orcus_csv.hpp \ + orcus_json.hpp orcus_xml.hpp parser_base.hpp parser_global.hpp \ + sax_parser.hpp sax_parser_base.hpp sax_ns_parser.hpp \ + sax_token_parser.hpp sax_token_parser_thread.hpp stream.hpp \ + string_pool.hpp threaded_json_parser.hpp \ + threaded_sax_token_parser.hpp tokens.hpp types.hpp \ + xml_namespace.hpp xml_structure_tree.hpp xml_writer.hpp \ + yaml_document_tree.hpp yaml_parser.hpp yaml_parser_base.hpp \ + zip_archive.hpp zip_archive_stream.hpp orcus_ods.hpp \ + orcus_import_ods.hpp orcus_xlsx.hpp orcus_import_xlsx.hpp \ + orcus_xls_xml.hpp orcus_gnumeric.hpp orcus_parquet.hpp +am__vpath_adj_setup = 
srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir = { \ + test -z "$$files" \ + || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ + || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ + $(am__cd) "$$dir" && rm -f $$files; }; \ + } +am__installdirs = "$(DESTDIR)$(liborcusdir)" +HEADERS = $(liborcus_HEADERS) +RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ + distclean-recursive maintainer-clean-recursive +am__recursive_targets = \ + $(RECURSIVE_TARGETS) \ + $(RECURSIVE_CLEAN_TARGETS) \ + $(am__extra_recursive_targets) +AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \ + distdir distdir-am +am__extra_recursive_targets = check-valgrind-recursive \ + check-valgrind-memcheck-recursive \ + check-valgrind-helgrind-recursive check-valgrind-drd-recursive \ + check-valgrind-sgcheck-recursive +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. 
+am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +DIST_SUBDIRS = $(SUBDIRS) +am__DIST_COMMON = $(srcdir)/Makefile.in +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +am__relativize = \ + dir0=`pwd`; \ + sed_first='s,^\([^/]*\)/.*$$,\1,'; \ + sed_rest='s,^[^/]*/*,,'; \ + sed_last='s,^.*/\([^/]*\)$$,\1,'; \ + sed_butlast='s,/*[^/]*$$,,'; \ + while test -n "$$dir1"; do \ + first=`echo "$$dir1" | sed -e "$$sed_first"`; \ + if test "$$first" != "."; then \ + if test "$$first" = ".."; then \ + dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ + dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ + else \ + first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ + if test "$$first2" = "$$first"; then \ + dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ + else \ + dir2="../$$dir2"; \ + fi; \ + dir0="$$dir0"/"$$first"; \ + fi; \ + fi; \ + dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ + done; \ + reldir="$$dir2" +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AS = @AS@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BOOST_CPPFLAGS = @BOOST_CPPFLAGS@ +BOOST_DATE_TIME_LDFLAGS = @BOOST_DATE_TIME_LDFLAGS@ +BOOST_DATE_TIME_LDPATH = @BOOST_DATE_TIME_LDPATH@ +BOOST_DATE_TIME_LIBS = @BOOST_DATE_TIME_LIBS@ +BOOST_FILESYSTEM_LDFLAGS = @BOOST_FILESYSTEM_LDFLAGS@ +BOOST_FILESYSTEM_LDPATH = @BOOST_FILESYSTEM_LDPATH@ +BOOST_FILESYSTEM_LIBS = @BOOST_FILESYSTEM_LIBS@ +BOOST_IOSTREAMS_LDFLAGS = 
@BOOST_IOSTREAMS_LDFLAGS@ +BOOST_IOSTREAMS_LDPATH = @BOOST_IOSTREAMS_LDPATH@ +BOOST_IOSTREAMS_LIBS = @BOOST_IOSTREAMS_LIBS@ +BOOST_LDPATH = @BOOST_LDPATH@ +BOOST_PROGRAM_OPTIONS_LDFLAGS = @BOOST_PROGRAM_OPTIONS_LDFLAGS@ +BOOST_PROGRAM_OPTIONS_LDPATH = @BOOST_PROGRAM_OPTIONS_LDPATH@ +BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@ +BOOST_ROOT = @BOOST_ROOT@ +BOOST_SYSTEM_LDFLAGS = @BOOST_SYSTEM_LDFLAGS@ +BOOST_SYSTEM_LDPATH = @BOOST_SYSTEM_LDPATH@ +BOOST_SYSTEM_LIBS = @BOOST_SYSTEM_LIBS@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CSCOPE = @CSCOPE@ +CTAGS = @CTAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DISTCHECK_CONFIGURE_FLAGS = @DISTCHECK_CONFIGURE_FLAGS@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +ENABLE_VALGRIND_drd = @ENABLE_VALGRIND_drd@ +ENABLE_VALGRIND_helgrind = @ENABLE_VALGRIND_helgrind@ +ENABLE_VALGRIND_memcheck = @ENABLE_VALGRIND_memcheck@ +ENABLE_VALGRIND_sgcheck = @ENABLE_VALGRIND_sgcheck@ +ETAGS = @ETAGS@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +HAVE_CXX17 = @HAVE_CXX17@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +IXION_REQUIRED_API_VERSION = @IXION_REQUIRED_API_VERSION@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBIXION_CFLAGS = @LIBIXION_CFLAGS@ +LIBIXION_LIBS = @LIBIXION_LIBS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MDDS_CFLAGS = @MDDS_CFLAGS@ +MDDS_LIBS = @MDDS_LIBS@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +ORCUS_API_VERSION = 
@ORCUS_API_VERSION@ +ORCUS_MAJOR_VERSION = @ORCUS_MAJOR_VERSION@ +ORCUS_MICRO_VERSION = @ORCUS_MICRO_VERSION@ +ORCUS_MINOR_VERSION = @ORCUS_MINOR_VERSION@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PARQUET_CFLAGS = @PARQUET_CFLAGS@ +PARQUET_LIBS = @PARQUET_LIBS@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +POW_LIB = @POW_LIB@ +PYTHON = @PYTHON@ +PYTHON_CFLAGS = @PYTHON_CFLAGS@ +PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@ +PYTHON_LIBS = @PYTHON_LIBS@ +PYTHON_PLATFORM = @PYTHON_PLATFORM@ +PYTHON_PREFIX = @PYTHON_PREFIX@ +PYTHON_VERSION = @PYTHON_VERSION@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VALGRIND = @VALGRIND@ +VALGRIND_ENABLED = @VALGRIND_ENABLED@ +VERSION = @VERSION@ +ZLIB_CFLAGS = @ZLIB_CFLAGS@ +ZLIB_LIBS = @ZLIB_LIBS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ 
+libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +pkgpyexecdir = @pkgpyexecdir@ +pkgpythondir = @pkgpythondir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +pyexecdir = @pyexecdir@ +pythondir = @pythondir@ +runstatedir = @runstatedir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +valgrind_enabled_tools = @valgrind_enabled_tools@ +valgrind_tools = @valgrind_tools@ +SUBDIRS = detail spreadsheet +liborcusdir = $(includedir)/liborcus-@ORCUS_API_VERSION@/orcus +liborcus_HEADERS = base64.hpp cell_buffer.hpp config.hpp \ + css_document_tree.hpp css_parser.hpp css_parser_base.hpp \ + css_selector.hpp css_types.hpp csv_parser.hpp \ + csv_parser_base.hpp dom_tree.hpp env.hpp exception.hpp \ + format_detection.hpp info.hpp interface.hpp \ + json_document_tree.hpp json_global.hpp json_parser.hpp \ + json_parser_base.hpp json_parser_thread.hpp \ + json_structure_tree.hpp measurement.hpp orcus_csv.hpp \ + orcus_json.hpp orcus_xml.hpp parser_base.hpp parser_global.hpp \ + sax_parser.hpp sax_parser_base.hpp sax_ns_parser.hpp \ + sax_token_parser.hpp sax_token_parser_thread.hpp stream.hpp \ + string_pool.hpp threaded_json_parser.hpp \ + threaded_sax_token_parser.hpp tokens.hpp types.hpp \ + xml_namespace.hpp xml_structure_tree.hpp xml_writer.hpp \ + yaml_document_tree.hpp yaml_parser.hpp yaml_parser_base.hpp \ + zip_archive.hpp zip_archive_stream.hpp $(am__append_1) \ + $(am__append_2) $(am__append_3) $(am__append_4) \ + $(am__append_5) +all: all-recursive + +.SUFFIXES: +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && 
$(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign include/orcus/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign include/orcus/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs +install-liborcusHEADERS: $(liborcus_HEADERS) + @$(NORMAL_INSTALL) + @list='$(liborcus_HEADERS)'; test -n "$(liborcusdir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(liborcusdir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(liborcusdir)" || exit 1; \ + fi; \ + for p in $$list; do \ + if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ + echo "$$d$$p"; \ + done | $(am__base_list) | \ + while read files; do \ + echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(liborcusdir)'"; \ + $(INSTALL_HEADER) $$files "$(DESTDIR)$(liborcusdir)" || exit $$?; \ + done + +uninstall-liborcusHEADERS: + @$(NORMAL_UNINSTALL) + @list='$(liborcus_HEADERS)'; test -n "$(liborcusdir)" || list=; \ + files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ + dir='$(DESTDIR)$(liborcusdir)'; $(am__uninstall_files_from_dir) + +# This directory's subdirectories are mostly 
independent; you can cd +# into them and run 'make' without going through this Makefile. +# To change the values of 'make' variables: instead of editing Makefiles, +# (1) if the variable is set in 'config.status', edit 'config.status' +# (which will cause the Makefiles to be regenerated when you run 'make'); +# (2) otherwise, pass the desired values on the 'make' command line. +$(am__recursive_targets): + @fail=; \ + if $(am__make_keepgoing); then \ + failcom='fail=yes'; \ + else \ + failcom='exit 1'; \ + fi; \ + dot_seen=no; \ + target=`echo $@ | sed s/-recursive//`; \ + case "$@" in \ + distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ + *) list='$(SUBDIRS)' ;; \ + esac; \ + for subdir in $$list; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + dot_seen=yes; \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || eval $$failcom; \ + done; \ + if test "$$dot_seen" = "no"; then \ + $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ + fi; test -z "$$fail" +check-valgrind-local: +check-valgrind-memcheck-local: +check-valgrind-helgrind-local: +check-valgrind-drd-local: +check-valgrind-sgcheck-local: + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-recursive +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ + include_option=--etags-include; \ + empty_fix=.; \ + else \ + include_option=--include; \ + empty_fix=; \ + fi; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ + test ! 
-f $$subdir/TAGS || \ + set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ + fi; \ + done; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-recursive + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-recursive + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d 
"$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done + @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ + $(am__make_dryrun) \ + || test -d "$(distdir)/$$subdir" \ + || $(MKDIR_P) "$(distdir)/$$subdir" \ + || exit 1; \ + dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ + $(am__relativize); \ + new_distdir=$$reldir; \ + dir1=$$subdir; dir2="$(top_distdir)"; \ + $(am__relativize); \ + new_top_distdir=$$reldir; \ + echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ + echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ + ($(am__cd) $$subdir && \ + $(MAKE) $(AM_MAKEFLAGS) \ + top_distdir="$$new_top_distdir" \ + distdir="$$new_distdir" \ + am__remove_distdir=: \ + am__skip_length_check=: \ + am__skip_mode_fix=: \ + distdir) \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-recursive +all-am: Makefile $(HEADERS) +installdirs: installdirs-recursive +installdirs-am: + for dir in "$(DESTDIR)$(liborcusdir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done +install: install-recursive +install-exec: install-exec-recursive +install-data: install-data-recursive +uninstall: uninstall-recursive + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-recursive +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; 
\ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +check-valgrind: check-valgrind-recursive + +check-valgrind-am: check-valgrind-local + +check-valgrind-drd: check-valgrind-drd-recursive + +check-valgrind-drd-am: check-valgrind-drd-local + +check-valgrind-helgrind: check-valgrind-helgrind-recursive + +check-valgrind-helgrind-am: check-valgrind-helgrind-local + +check-valgrind-memcheck: check-valgrind-memcheck-recursive + +check-valgrind-memcheck-am: check-valgrind-memcheck-local + +check-valgrind-sgcheck: check-valgrind-sgcheck-recursive + +check-valgrind-sgcheck-am: check-valgrind-sgcheck-local + +clean: clean-recursive + +clean-am: clean-generic clean-libtool mostlyclean-am + +distclean: distclean-recursive + -rm -f Makefile +distclean-am: clean-am distclean-generic distclean-tags + +dvi: dvi-recursive + +dvi-am: + +html: html-recursive + +html-am: + +info: info-recursive + +info-am: + +install-data-am: install-liborcusHEADERS + +install-dvi: install-dvi-recursive + +install-dvi-am: + +install-exec-am: + +install-html: install-html-recursive + +install-html-am: + +install-info: install-info-recursive + +install-info-am: + +install-man: + +install-pdf: install-pdf-recursive + +install-pdf-am: + +install-ps: install-ps-recursive + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-recursive + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: 
mostlyclean-recursive + +mostlyclean-am: mostlyclean-generic mostlyclean-libtool + +pdf: pdf-recursive + +pdf-am: + +ps: ps-recursive + +ps-am: + +uninstall-am: uninstall-liborcusHEADERS + +.MAKE: $(am__recursive_targets) install-am install-strip + +.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \ + check-am check-valgrind-am check-valgrind-drd-am \ + check-valgrind-drd-local check-valgrind-helgrind-am \ + check-valgrind-helgrind-local check-valgrind-local \ + check-valgrind-memcheck-am check-valgrind-memcheck-local \ + check-valgrind-sgcheck-am check-valgrind-sgcheck-local clean \ + clean-generic clean-libtool cscopelist-am ctags ctags-am \ + distclean distclean-generic distclean-libtool distclean-tags \ + distdir dvi dvi-am html html-am info info-am install \ + install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am \ + install-liborcusHEADERS install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs installdirs-am maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-generic \ + mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \ + uninstall-am uninstall-liborcusHEADERS + +.PRECIOUS: Makefile + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/include/orcus/base64.hpp b/include/orcus/base64.hpp new file mode 100644 index 0000000..44c7017 --- /dev/null +++ b/include/orcus/base64.hpp @@ -0,0 +1,37 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef __ORCUS_BASE64_HPP__ +#define __ORCUS_BASE64_HPP__ + +#include "env.hpp" +#include <cstdint> +#include <vector> +#include <string> + +namespace orcus { + +/** + * Decode a base64-encoded character sequence into a sequence of bytes. + * + * @param base64 encoded character sequence. + * @return decoded byte sequence. + */ +ORCUS_PSR_DLLPUBLIC std::vector<uint8_t> decode_from_base64(std::string_view base64); + +/** + * Encode a sequence of bytes into base64-encoded characters. + * + * @param input sequence of bytes to encode. + * @return base64-encoded character sequence representing the input bytes. + */ +ORCUS_PSR_DLLPUBLIC std::string encode_to_base64(const std::vector<uint8_t>& input); + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/cell_buffer.hpp b/include/orcus/cell_buffer.hpp new file mode 100644 index 0000000..60df728 --- /dev/null +++ b/include/orcus/cell_buffer.hpp @@ -0,0 +1,42 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef __ORCUS_CELL_BUFFER_HPP__ +#define __ORCUS_CELL_BUFFER_HPP__ + +#include "env.hpp" + +#include <string> + +namespace orcus { + +/** + * Temporary cell buffer used to decode encoded cell values. This is used in + * the sax, json and csv parsers. 
+ */ +class ORCUS_PSR_DLLPUBLIC cell_buffer +{ + std::string m_buffer; + size_t m_buf_size; +public: + cell_buffer(const cell_buffer&) = delete; + + cell_buffer(); + ~cell_buffer(); + + void append(const char* p, size_t len); + void reset(); + + std::string_view str() const; + + bool empty() const; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/config.hpp b/include/orcus/config.hpp new file mode 100644 index 0000000..17743e6 --- /dev/null +++ b/include/orcus/config.hpp @@ -0,0 +1,125 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_CONFIG_HPP +#define INCLUDED_ORCUS_CONFIG_HPP + +#include "orcus/env.hpp" +#include "orcus/types.hpp" + +#include <string> +#include <variant> + +namespace orcus { + +struct ORCUS_DLLPUBLIC config +{ + format_t input_format; + + /** + * Configuration settings specific to the CSV format. This struct must be + * POD. + */ + struct csv_config + { + /** Number of header rows to repeat in case of split. */ + size_t header_row_size; + + /** + * Whether or not to split oversized source data into multiple sheets + * in case it spills over. + */ + bool split_to_multiple_sheets; + }; + + // TODO: add config for other formats as needed. + using data_type = std::variant<csv_config>; + + /** + * Enable or disable runtime debug output to stdout or stderr. + */ + bool debug; + + /** + * Control whether or not to perform a strict check of the xml structure of + * a stream being parsed. When enabled, it throws an xml_structure_error + * exception when an incorrect xml structure is detected. 
+ */ + bool structure_check; + + data_type data; + + config(format_t input_format); +}; + +struct ORCUS_DLLPUBLIC json_config +{ + /** + * Path of the JSON file being parsed, in case the JSON string originates + * from a file. This parameter is required if external JSON files need to + * be resolved. Otherwise it's optional. + */ + std::string input_path; + + /** + * Path of the file to which output is written. Used only from the + * orcus-json command line tool. + */ + std::string output_path; + + /** + * Output format type. Used only from the orcus-json command line tool. + */ + dump_format_t output_format; + + /** + * Control whether or not to preserve the order of an object's child + * name/value pairs. By definition, JSON's object is an unordered set of + * name/value pairs, but in some cases preserving the original order may + * be desirable. + */ + bool preserve_object_order; + + /** + * Control whether or not to resolve JSON references to external files. + */ + bool resolve_references; + + /** + * When true, the document tree should allocate memory and hold copies of + * string values in the tree. When false, no extra memory is allocated + * for string values in the tree and the string values simply point to the + * original json string stream. + * + * In other words, when this option is set to false, the caller must + * ensure that the json string stream instance stays alive for the entire + * life cycle of the document tree. 
+ */ + bool persistent_string_values; + + json_config(); + ~json_config(); +}; + +struct ORCUS_DLLPUBLIC yaml_config +{ + enum class output_format_type { none, yaml, json }; + + std::string input_path; + std::string output_path; + + output_format_type output_format; + + yaml_config(); + ~yaml_config(); +}; + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/css_document_tree.hpp b/include/orcus/css_document_tree.hpp new file mode 100644 index 0000000..abbc65a --- /dev/null +++ b/include/orcus/css_document_tree.hpp @@ -0,0 +1,100 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_CSS_DOCUMENT_TREE_HPP +#define INCLUDED_ORCUS_CSS_DOCUMENT_TREE_HPP + +#include "orcus/css_selector.hpp" +#include "orcus/exception.hpp" + +#include <string> +#include <memory> + +namespace orcus { + +/** + * Class representing CSS rules. + */ +class ORCUS_DLLPUBLIC css_document_tree +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + + class insertion_error : public general_error + { + public: + insertion_error(const std::string& msg); + }; + + css_document_tree(const css_document_tree&) = delete; + + css_document_tree(); + css_document_tree(css_document_tree&& other); + ~css_document_tree(); + + css_document_tree& operator=(css_document_tree&& other); + + /** + * Load a raw string stream containing CSS rules to populate the document + * tree. + * + * @param stream raw CSS rules. + */ + void load(std::string_view stream); + + /** + * Insert or replace properties for given selector and pseudo element + * flags. + * + * @param selector selector to store properties for. + * @param pseudo_elem pseudo element flags for the last simple selector. 
+ * @param props new properties to insert. + */ + void insert_properties( + const css_selector_t& selector, + css::pseudo_element_t pseudo_elem, + const css_properties_t& props); + + /** + * Get properties associated with given selector and one or more pseudo + * elements. + * + * @param selector selector to get properties for. + * @param pseudo_elem pseudo element flags for the last simple selector. + * This value is a bitfield. + * + * @return const pointer to the property set instance, or NULL in case + * there are no properties for the given selector. + */ + const css_properties_t* get_properties( + const css_selector_t& selector, css::pseudo_element_t pseudo_elem) const; + + /** + * Get all sets of properties associated with given selector, for all + * pseudo element values. + * + * @param selector selector to get properties for. + * + * @return const pointer to the map of property sets with pseudo element + * values as the keys, or NULL in case there are no properties for + * the given selector. + */ + const css_pseudo_element_properties_t* + get_all_properties(const css_selector_t& selector) const; + + void dump() const; + + void swap(css_document_tree& other) noexcept; +}; + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/css_parser.hpp b/include/orcus/css_parser.hpp new file mode 100644 index 0000000..93bbc14 --- /dev/null +++ b/include/orcus/css_parser.hpp @@ -0,0 +1,883 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_CSS_PARSER_HPP +#define INCLUDED_ORCUS_CSS_PARSER_HPP + +#define ORCUS_DEBUG_CSS 0 + +#include "parser_global.hpp" +#include "css_parser_base.hpp" + +#include <cassert> +#include <algorithm> + +#if ORCUS_DEBUG_CSS +#include <iostream> +using std::cout; +using std::endl; +#endif + +namespace orcus { + +/** + * Empty handler for the CSS parser. Sub-class it and implement the necessary + * methods. + */ +class css_handler +{ +public: + /** + * Called upon encountering an at-rule. + * + * @param name name of the at-rule. + */ + void at_rule_name(std::string_view name) + { + (void)name; + } + + /** + * Called upon encountering a simple selector type. A simple selector may + * consist of + * + * @code{.txt} + * <type>.<class>#<id> + * @endcode + * + * and this function only passes the type part of the simple selector + * expression. + * + * @param type simple selector type. + */ + void simple_selector_type(std::string_view type) + { + (void)type; + } + + /** + * Called upon encountering a simple selector class. A simple selector may + * consist of + * + * @code{.txt} + * <type>.<class>#<id> + * @endcode + * + * and this function only passes the class part of the simple selector + * expression. + * + * @param cls simple selector class. + */ + void simple_selector_class(std::string_view cls) + { + (void)cls; + } + + /** + * Called upon encountering a pseudo element of a simple selector. For + * instance, given the following CSS block: + * + * @code{.css} + * p::first-line { + * color: blue; + * text-transform: uppercase; + * } + * @endcode + * + * the `first-line` part is the pseudo element of the selector named `p`. + * + * @param pe pseudo element of a simple selector. + */ + void simple_selector_pseudo_element(orcus::css::pseudo_element_t pe) + { + (void)pe; + } + + /** + * Called upon encountering a pseudo class of a simple selector. 
For + * instance, given the following CSS block: + * + * @code{.css} + * button:hover { + * color: blue; + * } + * @endcode + * + * the `hover` part is the pseudo class of the selector named `button`. + * + * @param pc pseudo class of a simple selector. + */ + void simple_selector_pseudo_class(orcus::css::pseudo_class_t pc) + { + (void)pc; + } + + /** + * Called upon encountering a simple selector id. A simple selector may + * consist of + * + * @code{.txt} + * <type>.<class>#<id> + * @endcode + * + * and this function only passes the id part of the simple selector + * expression. + * + * @param id simple selector id. + */ + void simple_selector_id(std::string_view id) + { + (void)id; + } + + /** + * Called at the end of a simple selector expression. + * + * @todo find out the difference between a simple selector and a selector, + * and document it. + */ + void end_simple_selector() {} + + /** + * Called at the end of a selector expression. + * + * @todo find out the difference between a simple selector and a selector, + * and document it. + */ + void end_selector() {} + + /** + * Called upon encountering a combinator. A combinator is an operator that + * combines other selectors. Given the following CSS block: + * + * @code{.css} + * div > p { + * background-color: yellow; + * } + * @endcode + * + * the `>` is the combinator that combines the `div` and `p` selectors. + * + * @param combinator type of combinator encountered. + */ + void combinator(orcus::css::combinator_t combinator) + { + (void)combinator; + } + + /** + * Called at each property name. + * + * @param name property name string. + */ + void property_name(std::string_view name) + { + (void)name; + } + + /** + * Called at each ordinary property value string. + * + * @param value value string. + */ + void value(std::string_view value) + { + (void)value; + } + + /** + * Called at each RGB color value of a property. 
+ * + * @param red value of red (0-255) + * @param green value of green (0-255) + * @param blue value of blue (0-255) + */ + void rgb(uint8_t red, uint8_t green, uint8_t blue) + { + (void)red; (void)green; (void)blue; + } + + /** + * Called at each RGB color value of a property with alpha transparency + * value. + * + * @param red value of red (0-255) + * @param green value of green (0-255) + * @param blue value of blue (0-255) + * @param alpha alpha transparency value (css_parser clamps it to 0.0-1.0) + */ + void rgba(uint8_t red, uint8_t green, uint8_t blue, double alpha) + { + (void)red; (void)green; (void)blue; (void)alpha; + } + + /** + * Called at each HSL color value of a property. + * + * @param hue hue. NOTE(review): css_parser clamps the parsed hue to 0-360 and passes it as a double, which does not fit uint8_t above 255 - confirm the intended range. + * @param sat saturation (css_parser clamps it to 0-100) + * @param light lightness (css_parser clamps it to 0-100) + */ + void hsl(uint8_t hue, uint8_t sat, uint8_t light) + { + (void)hue; (void)sat; (void)light; + } + + /** + * Called at each HSL color value of a property with alpha transparency + * value. + * + * @param hue hue (same range caveat as for hsl()) + * @param sat saturation (css_parser clamps it to 0-100) + * @param light lightness (css_parser clamps it to 0-100) + * @param alpha alpha value (css_parser clamps it to 0.0-1.0) + */ + void hsla(uint8_t hue, uint8_t sat, uint8_t light, double alpha) + { + (void)hue; (void)sat; (void)light; (void)alpha; + } + + /** + * Called at each URL value of a property. + * + * @param url URL value string. + */ + void url(std::string_view url) + { + (void)url; + } + + /** + * Called when the parsing begins. + */ + void begin_parse() {} + + /** + * Called when the parsing ends. + */ + void end_parse() {} + + /** + * Called at the beginning of each block. An opening brace '{' marks the + * beginning of a block. 
+ */ + void begin_block() {} + + /** + * Called at the end of each block. A closing brace '}' marks the end of + * a block. + */ + void end_block() {} + + /** + * Called at the beginning of a single property expression. Each property + * expression may consist of + * + * @code{.txt} + * <name> : <value>, ..., <value> + * @endcode + * + * terminated by either a `;` or `}`. 
+ */ + void begin_property() {} + + /** + * Called at the end of a single property expression. + */ + void end_property() {} +}; + +/** + * Parser for CSS documents. + * + * @tparam HandlerT Handler type with member functions for event callbacks. + * Refer to css_handler. + */ +template<typename HandlerT> +class css_parser : public css::parser_base +{ +public: + typedef HandlerT handler_type; + + css_parser(std::string_view content, handler_type& hdl); + void parse(); + +private: + // Handlers - at the time a handler is called the current position is + // expected to point to the first unprocessed non-blank character, and + // each handler must set the current position to the next unprocessed + // non-blank character when it finishes. A handler may also finish with the stream exhausted, so has_char() is checked before every cur_char() that follows a consuming call. + void rule(); + void at_rule_name(); + void simple_selector_name(); + void property_name(); + void property(); + void quoted_value(char c); + void value(); + void function_value(std::string_view v); + void function_rgb(bool alpha); + void function_hsl(bool alpha); + void function_url(); + void name_sep(); + void property_sep(); + void block(); + + handler_type& m_handler; +}; + +template<typename HandlerT> +css_parser<HandlerT>::css_parser(std::string_view content, handler_type& hdl) : + css::parser_base(content), m_handler(hdl) {} + +template<typename HandlerT> +void css_parser<HandlerT>::parse() +{ + shrink_stream(); + +#if ORCUS_DEBUG_CSS + std::cout << "compressed: '"; + const char* p = mp_char; + for (; p != mp_end; ++p) + std::cout << *p; + std::cout << "'" << std::endl; +#endif + m_handler.begin_parse(); + while (has_char()) + rule(); + m_handler.end_parse(); +} + +template<typename HandlerT> +void css_parser<HandlerT>::rule() +{ + // <selector name> , ... 
, <selector name> <block> + while (has_char()) + { + if (skip_comment()) + continue; + + char c = cur_char(); + if (is_alpha(c)) + { + simple_selector_name(); + continue; + } + + switch (c) + { + case '>': + set_combinator(c, css::combinator_t::direct_child); + break; + case '+': + set_combinator(c, css::combinator_t::next_sibling); + break; + case '.': + case '#': + case '@': + simple_selector_name(); + break; + case ',': + name_sep(); + break; + case '{': + reset_before_block(); + block(); + break; + default: + parse_error::throw_with("rule: failed to parse '", c, "'", offset()); + } + } +} + +template<typename HandlerT> +void css_parser<HandlerT>::at_rule_name() +{ + assert(has_char()); + assert(cur_char() == '@'); + next(); + char c = has_char() ? cur_char() : '\0'; + if (!is_alpha(c)) + throw parse_error("at_rule_name: first character of an at-rule name must be an alphabet.", offset()); + + const char* p; + size_t len; + identifier(p, len); + skip_blanks(); + + m_handler.at_rule_name({p, len}); +#if ORCUS_DEBUG_CSS + std::string foo(p, len); + std::cout << "at-rule name: " << foo.c_str() << std::endl; +#endif +} + +template<typename HandlerT> +void css_parser<HandlerT>::simple_selector_name() +{ + assert(has_char()); + char c = cur_char(); + if (c == '@') + { + // This is the name of an at-rule. + at_rule_name(); + return; + } + + if (m_simple_selector_count) + { +#if ORCUS_DEBUG_CSS + cout << "combinator: " << m_combinator << endl; +#endif + m_handler.combinator(m_combinator); + m_combinator = css::combinator_t::descendant; + } + assert(is_alpha(c) || c == '.' || c == '#'); + + const char* p = nullptr; + size_t n = 0; + +#if ORCUS_DEBUG_CSS + cout << "simple_selector_name: (" << m_simple_selector_count << ")"; +#endif + + if (c != '.' 
&& c != '#') + { + identifier(p, n); +#if ORCUS_DEBUG_CSS + std::string s(p, n); + cout << " type=" << s; +#endif + m_handler.simple_selector_type({p, n}); + } + + bool in_loop = true; + while (in_loop && has_char()) + { + switch (cur_char()) + { + case '.': + { + next(); + identifier(p, n); + m_handler.simple_selector_class({p, n}); +#if ORCUS_DEBUG_CSS + std::string s(p, n); + std::cout << " class=" << s; +#endif + } + break; + case '#': + { + next(); + identifier(p, n); + m_handler.simple_selector_id({p, n}); +#if ORCUS_DEBUG_CSS + std::string s(p, n); + std::cout << " id=" << s; +#endif + } + break; + case ':': + { + // This could be either a pseudo element or pseudo class. + next(); + if (has_char() && cur_char() == ':') + { + // pseudo element. + next(); + identifier(p, n); + css::pseudo_element_t elem = css::to_pseudo_element({p, n}); + if (!elem) + parse_error::throw_with( + "selector_name: unknown pseudo element '", {p, n}, "'", offset()); + + m_handler.simple_selector_pseudo_element(elem); + } + else + { + // pseudo class (or pseudo element in the older version of CSS). 
+ identifier(p, n); + css::pseudo_class_t pc = css::to_pseudo_class({p, n}); + if (!pc) + parse_error::throw_with( + "selector_name: unknown pseudo class '", {p, n}, "'", offset()); + + m_handler.simple_selector_pseudo_class(pc); + } + } + break; + default: + in_loop = false; + } + } + + m_handler.end_simple_selector(); + skip_comments_and_blanks(); + + ++m_simple_selector_count; + +#if ORCUS_DEBUG_CSS + std::cout << std::endl; +#endif +} + +template<typename HandlerT> +void css_parser<HandlerT>::property_name() +{ + // <identifier> + + assert(has_char()); + char c = cur_char(); + if (!is_alpha(c) && c != '.') + parse_error::throw_with( + "property_name: first character of a name must be an alphabet or a dot, but found '", c, "'", offset()); + + const char* p; + size_t len; + identifier(p, len); + skip_comments_and_blanks(); + + m_handler.property_name({p, len}); +#if ORCUS_DEBUG_CSS + std::string foo(p, len); + std::cout << "property name: " << foo.c_str() << std::endl; +#endif +} + +template<typename HandlerT> +void css_parser<HandlerT>::property() +{ + // <property name> : <value> , ... , <value> + + m_handler.begin_property(); + property_name(); + if (!has_char() || cur_char() != ':') + throw parse_error("property: ':' expected.", offset()); + next(); + skip_comments_and_blanks(); + + bool in_loop = true; + while (in_loop && has_char()) + { + value(); + char c = has_char() ? cur_char() : '\0'; + switch (c) + { + case ',': + { + // separated by commas. + next(); + skip_comments_and_blanks(); + } + break; + case ';': + case '}': + in_loop = false; + break; + default: + ; + } + } + + skip_comments_and_blanks(); + m_handler.end_property(); +} + +template<typename HandlerT> +void css_parser<HandlerT>::quoted_value(char c) +{ + // Parse until the end quote is reached. 
+ const char* p = nullptr; + size_t len = 0; + literal(p, len, c); + next(); + skip_blanks(); + + m_handler.value({p, len}); +#if ORCUS_DEBUG_CSS + std::string foo(p, len); + std::cout << "quoted value: " << foo.c_str() << std::endl; +#endif +} + +template<typename HandlerT> +void css_parser<HandlerT>::value() +{ + assert(has_char()); + char c = cur_char(); + if (c == '"' || c == '\'') + { + quoted_value(c); + return; + } + + std::string_view v = parse_value(); + if (v.empty()) + return; + + if (has_char() && cur_char() == '(') + { + function_value(v); + return; + } + + m_handler.value(v); + + skip_comments_and_blanks(); + +#if ORCUS_DEBUG_CSS + std::cout << "value: " << v << std::endl; +#endif +} + +template<typename HandlerT> +void css_parser<HandlerT>::function_value(std::string_view v) +{ + assert(cur_char() == '('); + css::property_function_t func = css::to_property_function(v); + if (func == css::property_function_t::unknown) + parse_error::throw_with("function_value: unknown function '", v, "'", offset()); + + // Move to the first character of the first argument. + next(); + skip_comments_and_blanks(); + + switch (func) + { + case css::property_function_t::rgb: + function_rgb(false); + break; + case css::property_function_t::rgba: + function_rgb(true); + break; + case css::property_function_t::hsl: + function_hsl(false); + break; + case css::property_function_t::hsla: + function_hsl(true); + break; + case css::property_function_t::url: + function_url(); + break; + default: + parse_error::throw_with("function_value: unhandled function '", v, "'", offset()); + } + + char c = has_char() ? 
cur_char() : '\0'; + if (c != ')') + parse_error::throw_with("function_value: ')' expected but '", c, "' found.", offset()); + + next(); + skip_comments_and_blanks(); +} + +template<typename HandlerT> +void css_parser<HandlerT>::function_rgb(bool alpha) +{ + // rgb(num, num, num) rgba(num, num, num, float) + + uint8_t vals[3]; + uint8_t* p = vals; + const uint8_t* plast = p + 2; + char c = 0; + + for (; ; ++p) + { + *p = parse_uint8(); + + skip_comments_and_blanks(); + + if (p == plast) + break; + + c = has_char() ? cur_char() : '\0'; + + if (c != ',') + parse_error::throw_with("function_rgb: ',' expected but '", c, "' found.", offset()); + + next(); + skip_comments_and_blanks(); + } + + if (alpha) + { + c = has_char() ? cur_char() : '\0'; + if (c != ',') + parse_error::throw_with("function_rgb: ',' expected but '", c, "' found.", offset()); + + next(); + skip_comments_and_blanks(); + + double alpha_val = parse_double_or_throw(); + skip_comments_and_blanks(); + alpha_val = std::clamp(alpha_val, 0.0, 1.0); + m_handler.rgba(vals[0], vals[1], vals[2], alpha_val); + } + else + m_handler.rgb(vals[0], vals[1], vals[2]); + +#if ORCUS_DEBUG_CSS + std::cout << "rgb"; + if (alpha) + std::cout << 'a'; + std::cout << '('; + p = vals; + const uint8_t* pend = plast + 1; + for (; p != pend; ++p) + std::cout << ' ' << (int)*p; + std::cout << " )" << std::endl; +#endif +} + +template<typename HandlerT> +void css_parser<HandlerT>::function_hsl(bool alpha) +{ + // hsl(num, percent, percent) hsla(num, percent, percent, float) + + double hue = 
parse_double_or_throw(); // narrowed to uint8_t when the handler declares hsl()/hsla() as css_handler does. NOTE(review): 256-360 do not fit in uint8_t - confirm the intended range. + hue = std::clamp(hue, 0.0, 360.0); + skip_comments_and_blanks(); + + char c = has_char() ? cur_char() : '\0'; + if (c != ',') + parse_error::throw_with("function_hsl: ',' expected but '", c, "' found.", offset()); + + next(); + skip_comments_and_blanks(); + + double sat = parse_percent(); + sat = std::clamp(sat, 0.0, 100.0); + skip_comments_and_blanks(); + + c = has_char() ? cur_char() : '\0'; + if (c != ',') + parse_error::throw_with("function_hsl: ',' expected but '", c, "' found.", offset()); + + next(); + skip_comments_and_blanks(); + + double light = parse_percent(); + light = std::clamp(light, 0.0, 100.0); + skip_comments_and_blanks(); + + if (!alpha) + { + m_handler.hsl(hue, sat, light); + return; + } + + c = has_char() ? cur_char() : '\0'; + if (c != ',') + parse_error::throw_with("function_hsl: ',' expected but '", c, "' found.", offset()); + + next(); + skip_comments_and_blanks(); + + double alpha_val = parse_double_or_throw(); + alpha_val = std::clamp(alpha_val, 0.0, 1.0); + skip_comments_and_blanks(); + m_handler.hsla(hue, sat, light, alpha_val); +} + +template<typename HandlerT> +void css_parser<HandlerT>::function_url() +{ + char c = has_char() ? cur_char() : '\0'; + + if (c == '"' || c == '\'') + { + // Quoted URL value. + const char* p; + size_t len; + literal(p, len, c); + next(); + skip_comments_and_blanks(); + m_handler.url({p, len}); +#if ORCUS_DEBUG_CSS + std::cout << "url(" << std::string(p, len) << ")" << std::endl; +#endif + return; + } + + // Unquoted URL value. 
+ const char* p; + size_t len; + skip_to_or_blank(p, len, ")"); + skip_comments_and_blanks(); + m_handler.url({p, len}); +#if ORCUS_DEBUG_CSS + std::cout << "url(" << std::string(p, len) << ")" << std::endl; +#endif +} + +template<typename HandlerT> +void css_parser<HandlerT>::name_sep() +{ + assert(cur_char() == ','); +#if ORCUS_DEBUG_CSS + std::cout << "," << std::endl; +#endif + next(); + skip_blanks(); + m_handler.end_selector(); +} + +template<typename HandlerT> +void css_parser<HandlerT>::property_sep() +{ +#if ORCUS_DEBUG_CSS + std::cout << ";" << std::endl; +#endif + next(); + skip_comments_and_blanks(); +} + +template<typename HandlerT> +void css_parser<HandlerT>::block() +{ + // '{' <property> ';' ... ';' <property> ';'(optional) '}' + + assert(cur_char() == '{'); +#if ORCUS_DEBUG_CSS + std::cout << "{" << std::endl; +#endif + m_handler.end_selector(); + m_handler.begin_block(); + + next(); + skip_comments_and_blanks(); + + // parse properties. + while (has_char()) + { + property(); + if (!has_char() || cur_char() != ';') + break; + property_sep(); + if (!has_char() || cur_char() == '}') + // ';' after the last property. This is optional but allowed. + break; + } + + if (!has_char() || cur_char() != '}') + throw parse_error("block: '}' expected.", offset()); + + m_handler.end_block(); + + next(); + skip_comments_and_blanks(); + +#if ORCUS_DEBUG_CSS + std::cout << "}" << std::endl; +#endif +} + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/css_parser_base.hpp b/include/orcus/css_parser_base.hpp new file mode 100644 index 0000000..4514269 --- /dev/null +++ b/include/orcus/css_parser_base.hpp @@ -0,0 +1,71 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_CSS_PARSER_BASE_HPP +#define INCLUDED_CSS_PARSER_BASE_HPP + +#include "orcus/env.hpp" +#include "orcus/css_types.hpp" +#include "orcus/exception.hpp" +#include "orcus/parser_base.hpp" + +#include <string> +#include <exception> + +namespace orcus { namespace css { + +class ORCUS_PSR_DLLPUBLIC parser_base : public ::orcus::parser_base +{ +public: + parser_base(std::string_view content); + +protected: + + void identifier(const char*& p, size_t& len, std::string_view extra = std::string_view{}); + uint8_t parse_uint8(); + + /** + * Parse an unquoted property value until a non-value character is + * reached. + * + * @return parsed value segment. + */ + std::string_view parse_value(); + double parse_percent(); + double parse_double_or_throw(); + + void literal(const char*& p, size_t& len, char quote); + void skip_to(const char*& p, size_t& len, char c); + + /** + * Skip until one of the specified characters or a blank character is reached. + * + * @param p pointer to the first character of the skipped character array. + * @param len length of the skipped character array. + * @param chars one or more characters that can end the skipping. 
+ */ + void skip_to_or_blank(const char*& p, size_t& len, std::string_view chars); + void skip_blanks(); + void skip_blanks_reverse(); + void shrink_stream(); + bool skip_comment(); + void comment(); + void skip_comments_and_blanks(); + void set_combinator(char c, css::combinator_t combinator); + void reset_before_block(); + +protected: + size_t m_simple_selector_count; ///< css_parser increments this after each simple selector and reports a combinator only when it is non-zero. + combinator_t m_combinator; ///< combinator css_parser reports before the next simple selector; it resets this to descendant afterwards. +}; + + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/css_selector.hpp b/include/orcus/css_selector.hpp new file mode 100644 index 0000000..1e41d54 --- /dev/null +++ b/include/orcus/css_selector.hpp @@ -0,0 +1,110 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_CSS_SELECTOR_HPP +#define INCLUDED_ORCUS_CSS_SELECTOR_HPP + +#include "env.hpp" +#include "css_types.hpp" + +#include <ostream> +#include <variant> +#include <vector> +#include <unordered_set> +#include <unordered_map> + +namespace orcus { + +struct ORCUS_DLLPUBLIC css_simple_selector_t +{ + typedef std::unordered_set<std::string_view> classes_type; + + std::string_view name; + std::string_view id; + classes_type classes; + css::pseudo_class_t pseudo_classes; + + css_simple_selector_t(); + + void clear(); + bool empty() const; + + bool operator== (const css_simple_selector_t& r) const; + bool operator!= (const css_simple_selector_t& r) const; + + struct hash + { + size_t operator() (const css_simple_selector_t& ss) const; + }; +}; + +struct ORCUS_DLLPUBLIC css_chained_simple_selector_t +{ + css::combinator_t combinator; + css_simple_selector_t simple_selector; + + bool operator== (const css_chained_simple_selector_t& r) const; + + css_chained_simple_selector_t(); + 
css_chained_simple_selector_t(const css_simple_selector_t& ss); + css_chained_simple_selector_t(css::combinator_t op, const css_simple_selector_t& ss); +}; + +/** + * Each CSS selector consists of a first simple selector followed by zero or more chained simple selectors. + */ +struct ORCUS_DLLPUBLIC css_selector_t +{ + typedef std::vector<css_chained_simple_selector_t> chained_type; + css_simple_selector_t first; + chained_type chained; + + void clear(); + + bool operator== (const css_selector_t& r) const; +}; + +/** + * Structure representing a single CSS property value. + */ +struct ORCUS_DLLPUBLIC css_property_value_t +{ + using value_type = std::variant<std::string_view, css::rgba_color_t, css::hsla_color_t>; + + css::property_value_t type; + value_type value; + + css_property_value_t(); + css_property_value_t(const css_property_value_t& r); + + /** + * Constructor that takes a string value. + * + * @param _str string value to store. This value should point to a string + * buffer that's already been interned. The caller is + * responsible for managing the life cycle of the source string + * buffer. 
+ */ + css_property_value_t(std::string_view _str); + + css_property_value_t& operator= (const css_property_value_t& r); + + void swap(css_property_value_t& r); +}; + +typedef std::unordered_map<std::string_view, std::vector<css_property_value_t>> css_properties_t; +typedef std::unordered_map<css::pseudo_element_t, css_properties_t> css_pseudo_element_properties_t; + +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const css_simple_selector_t& v); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const css_selector_t& v); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const css_property_value_t& v); + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/css_types.hpp b/include/orcus/css_types.hpp new file mode 100644 index 0000000..75386ea --- /dev/null +++ b/include/orcus/css_types.hpp @@ -0,0 +1,139 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_CSS_TYPES_HPP +#define INCLUDED_ORCUS_CSS_TYPES_HPP + +#include "env.hpp" + +#include <cstdlib> +#include <cstdint> +#include <string> + +namespace orcus { namespace css { + +enum class combinator_t +{ + /// `E F` where `F` is a descendant of `E`. + descendant, + /// `E > F` where `F` is a direct child of `E`. + direct_child, + /// `E + F` where `F` is a direct sibling of `E` and `E` precedes `F`. + next_sibling +}; + +/** + * List of functions used as property values. 
+ */ +enum class property_function_t +{ + unknown = 0, + hsl, + hsla, + rgb, + rgba, + url +}; + +enum class property_value_t +{ + none = 0, + string, + hsl, + hsla, + rgb, + rgba, + url +}; + +struct rgba_color_t +{ + uint8_t red; ///< 0 to 255 + uint8_t green; ///< 0 to 255 + uint8_t blue; ///< 0 to 255 + double alpha; +}; + +struct hsla_color_t +{ + uint8_t hue; ///< 0 to 255 + uint8_t saturation; ///< 0 to 255 + uint8_t lightness; ///< 0 to 255 + double alpha; +}; + +using pseudo_element_t = uint16_t; +using pseudo_class_t = uint64_t; + +ORCUS_PSR_DLLPUBLIC extern const pseudo_element_t pseudo_element_after; +ORCUS_PSR_DLLPUBLIC extern const pseudo_element_t pseudo_element_before; +ORCUS_PSR_DLLPUBLIC extern const pseudo_element_t pseudo_element_first_letter; +ORCUS_PSR_DLLPUBLIC extern const pseudo_element_t pseudo_element_first_line; +ORCUS_PSR_DLLPUBLIC extern const pseudo_element_t pseudo_element_selection; +ORCUS_PSR_DLLPUBLIC extern const pseudo_element_t pseudo_element_backdrop; + +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_active; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_checked; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_default; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_dir; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_disabled; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_empty; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_enabled; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_first; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_first_child; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_first_of_type; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_fullscreen; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_focus; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_hover; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t 
pseudo_class_indeterminate; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_in_range; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_invalid; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_lang; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_last_child; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_last_of_type; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_left; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_link; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_not; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_nth_child; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_nth_last_child; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_nth_last_of_type; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_nth_of_type; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_only_child; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_only_of_type; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_optional; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_out_of_range; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_read_only; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_read_write; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_required; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_right; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_root; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_scope; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_target; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_valid; +ORCUS_PSR_DLLPUBLIC extern const pseudo_class_t pseudo_class_visited; + +/** + * Convert a textual representation of a pseudo element into its numerical + * representation. 
+ */ +ORCUS_PSR_DLLPUBLIC pseudo_element_t to_pseudo_element(std::string_view s); + +/** + * Convert a textural representation of a pseudo class into its numerical + * representation. + */ +ORCUS_PSR_DLLPUBLIC pseudo_class_t to_pseudo_class(std::string_view s); + +ORCUS_PSR_DLLPUBLIC std::string pseudo_class_to_string(pseudo_class_t val); + +ORCUS_PSR_DLLPUBLIC property_function_t to_property_function(std::string_view s); + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/csv_parser.hpp b/include/orcus/csv_parser.hpp new file mode 100644 index 0000000..5cb9598 --- /dev/null +++ b/include/orcus/csv_parser.hpp @@ -0,0 +1,306 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef ORCUS_CSV_PARSER_HPP +#define ORCUS_CSV_PARSER_HPP + +#include "csv_parser_base.hpp" + +namespace orcus { + +class csv_handler +{ +public: + /** + * Called when the parser starts parsing a stream. + */ + void begin_parse() {} + + /** + * Called when the parser finishes parsing a stream. + */ + void end_parse() {} + + /** + * Called at the beginning of every row. + */ + void begin_row() {} + + /** + * Called at the end of every row. + */ + void end_row() {} + + /** + * Called after every cell is parsed. + * + * @param value cell content. + * @param transient when true, the text content has been converted and is + * stored in a temporary buffer. In such case, there is + * no guarantee that the text content remain available + * after the end of the call. When this value is false, + * the text content is guaranteed to be valid so long as + * the original CSV stream content is valid. 
+ */ + void cell(std::string_view value, bool transient) + { + (void)value; (void)transient; + } +}; + +/** + * Parser for CSV documents. + * + * @tparam HandlerT Hanlder type with member functions for event callbacks. + * Refer to csv_handler. + */ +template<typename HandlerT> +class csv_parser : public csv::parser_base +{ +public: + typedef HandlerT handler_type; + + csv_parser(std::string_view content, handler_type& hdl, const csv::parser_config& config); + void parse(); + +private: + + // handlers + void row(); + void cell(); + void quoted_cell(); + + void parse_cell_with_quote(const char* p0, size_t len0); + + /** + * Push cell value to the handler. + */ + void push_cell_value(const char* p, size_t n); + +private: + handler_type& m_handler; +}; + +template<typename _Handler> +csv_parser<_Handler>::csv_parser( + std::string_view content, handler_type& hdl, const csv::parser_config& config) : + csv::parser_base(content, config), m_handler(hdl) {} + +template<typename _Handler> +void csv_parser<_Handler>::parse() +{ +#if ORCUS_DEBUG_CSV + for (const char* p = mp_begin; p < mp_end; ++p) + std::cout << *p; + std::cout << std::endl; +#endif + + m_handler.begin_parse(); + while (has_char()) + row(); + m_handler.end_parse(); +} + +template<typename _Handler> +void csv_parser<_Handler>::row() +{ + m_handler.begin_row(); + while (true) + { + if (is_text_qualifier(cur_char())) + quoted_cell(); + else + cell(); + + if (!has_char()) + { + m_handler.end_row(); + return; + } + + char c = cur_char(); + if (c == '\n') + { + next(); +#if ORCUS_DEBUG_CSV + cout << "(LF)" << endl; +#endif + m_handler.end_row(); + return; + } + + if (!is_delim(c)) + throw orcus::parse_error("expected a delimiter", offset()); + + next(); + + if (m_config.trim_cell_value) + skip_blanks(); + + if (!has_char()) + { + m_handler.end_row(); + return; + } + } +} + +template<typename _Handler> +void csv_parser<_Handler>::cell() +{ + const char* p = mp_char; + size_t len = 0; + char c = cur_char(); + while 
(c != '\n' && !is_delim(c)) + { + ++len; + next(); + if (!has_char()) + break; + c = cur_char(); + } + + if (!len) + p = nullptr; + + push_cell_value(p, len); +} + +template<typename _Handler> +void csv_parser<_Handler>::quoted_cell() +{ +#if ORCUS_DEBUG_CSV + cout << "--- quoted cell" << endl; +#endif + char c = cur_char(); + assert(is_text_qualifier(c)); + next(); // Skip the opening quote. + if (!has_char()) + return; + + const char* p0 = mp_char; + size_t len = 1; + for (; has_char(); next(), ++len) + { + c = cur_char(); +#if ORCUS_DEBUG_CSV + cout << "'" << c << "'" << endl; +#endif + if (!is_text_qualifier(c)) + continue; + + // current char is a quote. Check if the next char is also a text + // qualifier. + + if (has_next() && is_text_qualifier(peek_char())) + { + next(); + parse_cell_with_quote(p0, len); + return; + } + + // Closing quote. + m_handler.cell({p0, len-1}, false); + next(); + skip_blanks(); + return; + } + + // Stream ended prematurely. Handle it gracefully. + m_handler.cell({p0, len}, false); +} + +template<typename _Handler> +void csv_parser<_Handler>::parse_cell_with_quote(const char* p0, size_t len0) +{ +#if ORCUS_DEBUG_CSV + using namespace std; + cout << "--- parse cell with quote" << endl; +#endif + assert(is_text_qualifier(cur_char())); + + // Push the preceding chars to the temp buffer. + m_cell_buf.reset(); + m_cell_buf.append(p0, len0); + + // Parse the rest, until the closing quote. + next(); + const char* p_cur = mp_char; + size_t cur_len = 0; + for (; has_char(); next(), ++cur_len) + { + char c = cur_char(); +#if ORCUS_DEBUG_CSV + cout << "'" << c << "'" << endl; +#endif + if (!is_text_qualifier(c)) + continue; + + if (has_next() && is_text_qualifier(peek_char())) + { + // double quotation. Copy the current segment to the cell buffer. + m_cell_buf.append(p_cur, cur_len); + + next(); // to the 2nd quote. + p_cur = mp_char; + cur_len = 0; + continue; + } + + // closing quote. 
Flush the current segment to the cell + // buffer, push the value to the handler, and exit normally. + m_cell_buf.append(p_cur, cur_len); + + m_handler.cell(m_cell_buf.str(), true); + next(); + skip_blanks(); + return; + } + + // Stream ended prematurely. + throw parse_error("stream ended prematurely while parsing quoted cell.", offset()); +} + +template<typename _Handler> +void csv_parser<_Handler>::push_cell_value(const char* p, size_t n) +{ + size_t len = n; + + if (m_config.trim_cell_value) + { + // Trim any leading blanks. + for (size_t i = 0; i < n; ++i, --len, ++p) + { + if (!is_blank(*p)) + break; + } + + // Trim any trailing blanks. + if (len) + { + const char* p_end = p + (len-1); + for (; p != p_end; --p_end, --len) + { + if (!is_blank(*p_end)) + break; + } + } + } + + m_handler.cell({p, len}, false); +#if ORCUS_DEBUG_CSV + if (len) + cout << "(cell:'" << std::string(p, len) << "')" << endl; + else + cout << "(cell:'')" << endl; +#endif +} + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/csv_parser_base.hpp b/include/orcus/csv_parser_base.hpp new file mode 100644 index 0000000..506d4e5 --- /dev/null +++ b/include/orcus/csv_parser_base.hpp @@ -0,0 +1,80 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
#ifndef CSV_PARSER_BASE_HPP
#define CSV_PARSER_BASE_HPP

#include "env.hpp"
#include "cell_buffer.hpp"
#include "parser_global.hpp"
#include "parser_base.hpp"

#include <cstdlib>
#include <cstring>
#include <exception>
#include <string>
#include <cassert>
#include <sstream>

// Set to a non-zero value to compile in the debug trace code that is guarded
// by ORCUS_DEBUG_CSV.
#define ORCUS_DEBUG_CSV 0

#if ORCUS_DEBUG_CSV
#include <iostream>
using std::cout;
using std::endl;
#endif

namespace orcus { namespace csv {

/**
 * Run-time configuration object for csv_parser.
 */
struct ORCUS_PSR_DLLPUBLIC parser_config
{
    /**
     * One or more characters that serve as cell boundaries.
     */
    std::string delimiters;

    /**
     * A single character used as a text quote value.
     */
    char text_qualifier;

    /**
     * When true, the value of each cell gets trimmed i.e. any leading or
     * trailing white spaces will get ignored.
     */
    bool trim_cell_value:1;

    // The initial values of the members above are set by this constructor,
    // which is defined outside of this header.
    parser_config();
};

/**
 * Base class of csv_parser that holds the parser configuration and the
 * character classification helpers which do not depend on the handler type.
 */
class ORCUS_PSR_DLLPUBLIC parser_base : public ::orcus::parser_base
{
protected:
    // Held by reference.  NOTE(review): presumably bound to the 'config'
    // constructor argument, in which case the caller's object must outlive
    // the parser - confirm in the implementation.
    const csv::parser_config& m_config;

    // Buffer available to derived parsers for assembling cell values.
    cell_buffer m_cell_buf;

protected:
    parser_base(std::string_view content, const parser_config& config);

    /**
     * This is different from the global 'is_blank' in that it doesn't treat
     * linefeed and carriage return characters as non-blanks.
     *
     * NOTE(review): the double negative above reads as if linefeed and
     * carriage return ARE treated as blanks here.  The intent is more likely
     * the opposite; confirm against the implementation and reword.
     */
    bool is_blank(char c) const;
    bool is_delim(char c) const;
    bool is_text_qualifier(char c) const;

    void skip_blanks();
};

}}

#endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +subdir = include/orcus/detail +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cxx_compile_stdcxx.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_17.m4 \ + $(top_srcdir)/m4/boost.m4 
$(top_srcdir)/m4/libtool.m4 \ + $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \ + $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_valgrind_check.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(liborcus_HEADERS) \ + $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +SOURCES = +DIST_SOURCES = +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir 
= { \ + test -z "$$files" \ + || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ + || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ + $(am__cd) "$$dir" && rm -f $$files; }; \ + } +am__installdirs = "$(DESTDIR)$(liborcusdir)" +HEADERS = $(liborcus_HEADERS) +am__extra_recursive_targets = check-valgrind-recursive \ + check-valgrind-memcheck-recursive \ + check-valgrind-helgrind-recursive check-valgrind-drd-recursive \ + check-valgrind-sgcheck-recursive +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +am__DIST_COMMON = $(srcdir)/Makefile.in +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AS = @AS@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BOOST_CPPFLAGS = @BOOST_CPPFLAGS@ +BOOST_DATE_TIME_LDFLAGS = @BOOST_DATE_TIME_LDFLAGS@ +BOOST_DATE_TIME_LDPATH = @BOOST_DATE_TIME_LDPATH@ +BOOST_DATE_TIME_LIBS = @BOOST_DATE_TIME_LIBS@ +BOOST_FILESYSTEM_LDFLAGS = @BOOST_FILESYSTEM_LDFLAGS@ +BOOST_FILESYSTEM_LDPATH = @BOOST_FILESYSTEM_LDPATH@ +BOOST_FILESYSTEM_LIBS = @BOOST_FILESYSTEM_LIBS@ +BOOST_IOSTREAMS_LDFLAGS = @BOOST_IOSTREAMS_LDFLAGS@ +BOOST_IOSTREAMS_LDPATH = @BOOST_IOSTREAMS_LDPATH@ +BOOST_IOSTREAMS_LIBS = @BOOST_IOSTREAMS_LIBS@ +BOOST_LDPATH = @BOOST_LDPATH@ +BOOST_PROGRAM_OPTIONS_LDFLAGS = @BOOST_PROGRAM_OPTIONS_LDFLAGS@ +BOOST_PROGRAM_OPTIONS_LDPATH = @BOOST_PROGRAM_OPTIONS_LDPATH@ +BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@ +BOOST_ROOT = @BOOST_ROOT@ +BOOST_SYSTEM_LDFLAGS = @BOOST_SYSTEM_LDFLAGS@ +BOOST_SYSTEM_LDPATH = @BOOST_SYSTEM_LDPATH@ +BOOST_SYSTEM_LIBS = @BOOST_SYSTEM_LIBS@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CSCOPE = @CSCOPE@ +CTAGS = @CTAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DISTCHECK_CONFIGURE_FLAGS = @DISTCHECK_CONFIGURE_FLAGS@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +ENABLE_VALGRIND_drd = @ENABLE_VALGRIND_drd@ +ENABLE_VALGRIND_helgrind = @ENABLE_VALGRIND_helgrind@ +ENABLE_VALGRIND_memcheck = @ENABLE_VALGRIND_memcheck@ 
+ENABLE_VALGRIND_sgcheck = @ENABLE_VALGRIND_sgcheck@ +ETAGS = @ETAGS@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +HAVE_CXX17 = @HAVE_CXX17@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +IXION_REQUIRED_API_VERSION = @IXION_REQUIRED_API_VERSION@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBIXION_CFLAGS = @LIBIXION_CFLAGS@ +LIBIXION_LIBS = @LIBIXION_LIBS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MDDS_CFLAGS = @MDDS_CFLAGS@ +MDDS_LIBS = @MDDS_LIBS@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +ORCUS_API_VERSION = @ORCUS_API_VERSION@ +ORCUS_MAJOR_VERSION = @ORCUS_MAJOR_VERSION@ +ORCUS_MICRO_VERSION = @ORCUS_MICRO_VERSION@ +ORCUS_MINOR_VERSION = @ORCUS_MINOR_VERSION@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PARQUET_CFLAGS = @PARQUET_CFLAGS@ +PARQUET_LIBS = @PARQUET_LIBS@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +POW_LIB = @POW_LIB@ +PYTHON = @PYTHON@ +PYTHON_CFLAGS = @PYTHON_CFLAGS@ +PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@ +PYTHON_LIBS = @PYTHON_LIBS@ +PYTHON_PLATFORM = @PYTHON_PLATFORM@ +PYTHON_PREFIX = @PYTHON_PREFIX@ +PYTHON_VERSION = @PYTHON_VERSION@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VALGRIND = @VALGRIND@ +VALGRIND_ENABLED = @VALGRIND_ENABLED@ +VERSION = @VERSION@ +ZLIB_CFLAGS = @ZLIB_CFLAGS@ +ZLIB_LIBS = @ZLIB_LIBS@ +abs_builddir = @abs_builddir@ 
+abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +pkgpyexecdir = @pkgpyexecdir@ +pkgpythondir = @pkgpythondir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +pyexecdir = @pyexecdir@ +pythondir = @pythondir@ +runstatedir = @runstatedir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +valgrind_enabled_tools = @valgrind_enabled_tools@ +valgrind_tools = @valgrind_tools@ +liborcusdir = $(includedir)/liborcus-@ORCUS_API_VERSION@/orcus/detail +liborcus_HEADERS = \ + parser_token_buffer.hpp \ + thread.hpp + +all: all-am + +.SUFFIXES: +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; 
\ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign include/orcus/detail/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign include/orcus/detail/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs +install-liborcusHEADERS: $(liborcus_HEADERS) + @$(NORMAL_INSTALL) + @list='$(liborcus_HEADERS)'; test -n "$(liborcusdir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(liborcusdir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(liborcusdir)" || exit 1; \ + fi; \ + for p in $$list; do \ + if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ + echo "$$d$$p"; \ + done | $(am__base_list) | \ + while read files; do \ + echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(liborcusdir)'"; \ + $(INSTALL_HEADER) $$files "$(DESTDIR)$(liborcusdir)" || exit $$?; \ + done + +uninstall-liborcusHEADERS: + @$(NORMAL_UNINSTALL) + @list='$(liborcus_HEADERS)'; test -n "$(liborcusdir)" || list=; \ + files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ + dir='$(DESTDIR)$(liborcusdir)'; $(am__uninstall_files_from_dir) +check-valgrind-local: +check-valgrind-memcheck-local: +check-valgrind-helgrind-local: +check-valgrind-drd-local: 
+check-valgrind-sgcheck-local: + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); 
fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(HEADERS) +installdirs: + for dir in "$(DESTDIR)$(liborcusdir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+check-valgrind: check-valgrind-am + +check-valgrind-am: check-valgrind-local + +check-valgrind-drd: check-valgrind-drd-am + +check-valgrind-drd-am: check-valgrind-drd-local + +check-valgrind-helgrind: check-valgrind-helgrind-am + +check-valgrind-helgrind-am: check-valgrind-helgrind-local + +check-valgrind-memcheck: check-valgrind-memcheck-am + +check-valgrind-memcheck-am: check-valgrind-memcheck-local + +check-valgrind-sgcheck: check-valgrind-sgcheck-am + +check-valgrind-sgcheck-am: check-valgrind-sgcheck-local + +clean: clean-am + +clean-am: clean-generic clean-libtool mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-generic distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: install-liborcusHEADERS + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-generic mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-liborcusHEADERS + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am check-valgrind-am \ + check-valgrind-drd-am check-valgrind-drd-local \ + check-valgrind-helgrind-am check-valgrind-helgrind-local \ + check-valgrind-local check-valgrind-memcheck-am \ + check-valgrind-memcheck-local check-valgrind-sgcheck-am \ + check-valgrind-sgcheck-local clean clean-generic clean-libtool \ + cscopelist-am ctags ctags-am distclean distclean-generic \ + distclean-libtool distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am 
install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-liborcusHEADERS install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am uninstall-liborcusHEADERS + +.PRECIOUS: Makefile + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/include/orcus/detail/parser_token_buffer.hpp b/include/orcus/detail/parser_token_buffer.hpp new file mode 100644 index 0000000..3b13bec --- /dev/null +++ b/include/orcus/detail/parser_token_buffer.hpp @@ -0,0 +1,188 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_DETAIL_THREAD_PARSER_TOKEN_BUFFER_HPP +#define INCLUDED_ORCUS_DETAIL_THREAD_PARSER_TOKEN_BUFFER_HPP + +#include "orcus/exception.hpp" + +#include <mutex> +#include <condition_variable> + +namespace orcus { namespace detail { namespace thread { + +/** + * Class that manages synchronization of parser tokens used in + * multi-threaded parsers. + */ +template<typename _TokensT> +class parser_token_buffer +{ + enum class state_type { parsing_progress, parsing_ended, parsing_aborted }; + + typedef _TokensT tokens_type; + + mutable std::mutex m_mtx_tokens; + std::condition_variable m_cv_tokens_empty; + std::condition_variable m_cv_tokens_ready; + + tokens_type m_tokens; // token buffer used to hand over tokens to the client. 
+ + size_t m_token_size_threshold; + const size_t m_max_token_size; + + state_type m_state; + + bool tokens_empty() const + { + std::lock_guard<std::mutex> lock(m_mtx_tokens); + return m_tokens.empty(); + } + + /** + * Only to be called from the parser thread. + * + * Wait until the processor thread takes the new tokens and makes the + * token buffer empty. + */ + void wait_until_tokens_empty() + { + std::unique_lock<std::mutex> lock(m_mtx_tokens); + while (!m_tokens.empty() && m_state == state_type::parsing_progress) + m_cv_tokens_empty.wait(lock); + + if (m_state == state_type::parsing_aborted) + throw detail::parsing_aborted_error(); + } + +public: + + parser_token_buffer(size_t min_token_size, size_t max_token_size) : + m_token_size_threshold(std::max<size_t>(min_token_size, 1)), + m_max_token_size(max_token_size), + m_state(state_type::parsing_progress) + { + if (m_token_size_threshold > m_max_token_size) + throw invalid_arg_error( + "initial token size threshold is already larger than the max token size."); + } + + /** + * Check the size of the parser token buffer, and if it exceeds specified + * threshold, move it to the client buffer. + * + * Call this from the parser thread. + * + * @param parser_tokens parser token buffer. + */ + void check_and_notify(tokens_type& parser_tokens) + { + if (parser_tokens.size() < m_token_size_threshold) + // Still below the threshold. + return; + + if (!tokens_empty()) + { + if (m_token_size_threshold < (m_max_token_size/2)) + { + // Double the threshold and continue to parse. + m_token_size_threshold *= 2; + return; + } + + // We cannot increase the threshold any more. Wait for the + // client to finish. + wait_until_tokens_empty(); + } + + std::unique_lock<std::mutex> lock(m_mtx_tokens); + m_tokens.swap(parser_tokens); + lock.unlock(); + m_cv_tokens_ready.notify_one(); + } + + /** + * Move the current parser token buffer to the client buffer, and signal + * the end of parsing. + * + * Call this from the parser thread. 
+ * + * @param parser_tokens parser token buffer. + */ + void notify_and_finish(tokens_type& parser_tokens) + { + // Wait until the client tokens get used up. + wait_until_tokens_empty(); + + { + std::lock_guard<std::mutex> lock(m_mtx_tokens); + m_tokens.swap(parser_tokens); + m_state = state_type::parsing_ended; + } + m_cv_tokens_ready.notify_one(); + } + + void abort() + { + { + std::lock_guard<std::mutex> lock(m_mtx_tokens); + m_tokens.clear(); + m_state = state_type::parsing_aborted; + } + m_cv_tokens_empty.notify_one(); + } + + /** + * Retrieve the tokens currently in the client token buffer. + * + * Call this from the client (non-parser) thread. + * + * @param tokens place to move the tokens in the client token buffer to. + * + * @return true if the parsing is still in progress, therefore more tokens + * are expected, false if this is the last set of tokens. + */ + bool next_tokens(tokens_type& tokens) + { + tokens.clear(); + + // Wait until the parser passes a new set of tokens. + std::unique_lock<std::mutex> lock(m_mtx_tokens); + while (m_tokens.empty() && m_state == state_type::parsing_progress) + m_cv_tokens_ready.wait(lock); + + // Get the new tokens and notify the parser. + tokens.swap(m_tokens); + state_type parsing_progress = m_state; // Make a copy so that lock can be released safely. + + lock.unlock(); + + m_cv_tokens_empty.notify_one(); + + return parsing_progress == state_type::parsing_progress; + } + + /** + * Return the current token size threshold. Call this only after the + * parsing has finished. + * + * @return current token size threshold. 
+ */ + size_t token_size_threshold() const + { + if (m_state == state_type::parsing_progress) + return 0; + + return m_token_size_threshold; + } +}; + +}}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/detail/thread.hpp b/include/orcus/detail/thread.hpp new file mode 100644 index 0000000..2d63dbd --- /dev/null +++ b/include/orcus/detail/thread.hpp @@ -0,0 +1,41 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_DETAIL_THREAD_HPP +#define INCLUDED_ORCUS_DETAIL_THREAD_HPP + +#include <thread> + +namespace orcus { namespace detail { namespace thread { + +/** + * Takes ownership of a std::thread and joins it on destruction. + */ +class scoped_guard +{ + std::thread m_thread; +public: + scoped_guard(std::thread thread) : m_thread(std::move(thread)) {} + scoped_guard(scoped_guard&& other) : m_thread(std::move(other.m_thread)) {} + + scoped_guard(const scoped_guard&) = delete; + scoped_guard& operator= (const scoped_guard&) = delete; + + ~scoped_guard() + { + // A moved-from guard holds a non-joinable thread; join() on it throws + // std::system_error, which would terminate inside this destructor. + if (m_thread.joinable()) + m_thread.join(); + } +}; + +}}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/dom_tree.hpp b/include/orcus/dom_tree.hpp new file mode 100644 index 0000000..68df0d0 --- /dev/null +++ b/include/orcus/dom_tree.hpp @@ -0,0 +1,134 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_DOM_TREE_HPP +#define INCLUDED_ORCUS_DOM_TREE_HPP + +#include "types.hpp" + +#include <vector> +#include <ostream> +#include <memory> + +namespace orcus { + +class xmlns_context; + +namespace sax { + +struct doctype_declaration; + +} + +namespace dom { + +class document_tree; + +enum class node_t : uint8_t +{ + unset, + declaration, + element, +}; + +struct ORCUS_DLLPUBLIC entity_name +{ + xmlns_id_t ns; + std::string_view name; + + entity_name(); + entity_name(std::string_view _name); + entity_name(xmlns_id_t _ns, std::string_view _name); + + bool operator== (const entity_name& other) const; + bool operator!= (const entity_name& other) const; +}; + +class ORCUS_DLLPUBLIC const_node +{ + friend class document_tree; + + struct impl; + std::unique_ptr<impl> mp_impl; + + const_node(std::unique_ptr<impl>&& _impl); +public: + const_node(); + const_node(const const_node& other); + const_node(const_node&& other); + + ~const_node(); + + node_t type() const; + + size_t child_count() const; + + const_node child(size_t index) const; + + entity_name name() const; + + std::string_view attribute(const entity_name& name) const; + std::string_view attribute(std::string_view name) const; + + size_t attribute_count() const; + + const_node parent() const; + + void swap(const_node& other); + + const_node& operator= (const const_node& other); + + bool operator== (const const_node& other) const; + bool operator!= (const const_node& other) const; +}; + +/** + * Ordinary DOM tree representing the content of an XML document. + */ +class ORCUS_DLLPUBLIC document_tree +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + document_tree(const document_tree&) = delete; + document_tree& operator= (const document_tree&) = delete; + + document_tree(xmlns_context& cxt); + document_tree(document_tree&& other); + ~document_tree(); + + /** + * Parse a given XML stream and build the content tree. + * + * @param strm XML stream. 
+ */ + void load(std::string_view strm); + + dom::const_node root() const; + + dom::const_node declaration(std::string_view name) const; + + /** + * Swap the content with another dom_tree instance. + * + * @param other the dom_tree instance to swap the content with. + */ + void swap(document_tree& other); + + const sax::doctype_declaration* get_doctype() const; + + void dump_compact(std::ostream& os) const; +}; + +} // namespace dom + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/env.hpp b/include/orcus/env.hpp new file mode 100644 index 0000000..47dc153 --- /dev/null +++ b/include/orcus/env.hpp @@ -0,0 +1,141 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef __ORCUS_ENV_HPP__ +#define __ORCUS_ENV_HPP__ + +// orcus + +#if defined _WIN32 || defined __CYGWIN__ + #if defined __MINGW32__ + #define ORCUS_DLLPUBLIC + #define ORCUS_DLLLOCAL + #elif defined __ORCUS_BUILDING_DLL + #ifdef __GNUC__ + #define ORCUS_DLLPUBLIC __attribute__ ((dllexport)) + #else + #define ORCUS_DLLPUBLIC __declspec(dllexport) + #endif + #elif defined __ORCUS_STATIC_LIB + #define ORCUS_DLLPUBLIC + #else + #ifdef __GNUC__ + #define ORCUS_DLLPUBLIC __attribute__ ((dllimport)) + #else + #define ORCUS_DLLPUBLIC __declspec(dllimport) + #endif + #endif + #define ORCUS_DLLLOCAL +#else + #if __GNUC__ >= 4 + #define ORCUS_DLLPUBLIC __attribute__ ((visibility ("default"))) + #define ORCUS_DLLLOCAL __attribute__ ((visibility ("hidden"))) + #else + #define ORCUS_DLLPUBLIC + #define ORCUS_DLLLOCAL + #endif +#endif + +// orcus-parser + +#if defined _WIN32 || defined __CYGWIN__ + #if defined __MINGW32__ + #define ORCUS_PSR_DLLPUBLIC + #define ORCUS_PSR_DLLLOCAL + #elif defined __ORCUS_PSR_BUILDING_DLL + 
#ifdef __GNUC__ + #define ORCUS_PSR_DLLPUBLIC __attribute__ ((dllexport)) + #else + #define ORCUS_PSR_DLLPUBLIC __declspec(dllexport) + #endif + #elif defined __ORCUS_PSR_STATIC_LIB + #define ORCUS_PSR_DLLPUBLIC + #else + #ifdef __GNUC__ + #define ORCUS_PSR_DLLPUBLIC __attribute__ ((dllimport)) + #else + #define ORCUS_PSR_DLLPUBLIC __declspec(dllimport) + #endif + #endif + #define ORCUS_PSR_DLLLOCAL +#else + #if __GNUC__ >= 4 + #define ORCUS_PSR_DLLPUBLIC __attribute__ ((visibility ("default"))) + #define ORCUS_PSR_DLLLOCAL __attribute__ ((visibility ("hidden"))) + #else + #define ORCUS_PSR_DLLPUBLIC + #define ORCUS_PSR_DLLLOCAL + #endif +#endif + +// orcus-spreadsheet-model + +#if defined _WIN32 || defined __CYGWIN__ + #if defined __MINGW32__ + #define ORCUS_SPM_DLLPUBLIC + #define ORCUS_SPM_DLLLOCAL + #elif defined __ORCUS_SPM_BUILDING_DLL + #ifdef __GNUC__ + #define ORCUS_SPM_DLLPUBLIC __attribute__ ((dllexport)) + #else + #define ORCUS_SPM_DLLPUBLIC __declspec(dllexport) + #endif + #elif defined __ORCUS_SPM_STATIC_LIB + #define ORCUS_SPM_DLLPUBLIC + #else + #ifdef __GNUC__ + #define ORCUS_SPM_DLLPUBLIC __attribute__ ((dllimport)) + #else + #define ORCUS_SPM_DLLPUBLIC __declspec(dllimport) + #endif + #endif + #define ORCUS_SPM_DLLLOCAL +#else + #if __GNUC__ >= 4 + #define ORCUS_SPM_DLLPUBLIC __attribute__ ((visibility ("default"))) + #define ORCUS_SPM_DLLLOCAL __attribute__ ((visibility ("hidden"))) + #else + #define ORCUS_SPM_DLLPUBLIC + #define ORCUS_SPM_DLLLOCAL + #endif +#endif + +// orcus-mso + +#if defined _WIN32 || defined __CYGWIN__ + #if defined __MINGW32__ + #define ORCUS_MSO_DLLPUBLIC + #define ORCUS_MSO_DLLLOCAL + #elif defined __ORCUS_MSO_BUILDING_DLL + #ifdef __GNUC__ + #define ORCUS_MSO_DLLPUBLIC __attribute__ ((dllexport)) + #else + #define ORCUS_MSO_DLLPUBLIC __declspec(dllexport) + #endif + #elif defined __ORCUS_MSO_STATIC_LIB + #define ORCUS_MSO_DLLPUBLIC + #else + #ifdef __GNUC__ + #define ORCUS_MSO_DLLPUBLIC __attribute__ ((dllimport)) + 
#else + #define ORCUS_MSO_DLLPUBLIC __declspec(dllimport) + #endif + #endif + #define ORCUS_MSO_DLLLOCAL +#else + #if __GNUC__ >= 4 + #define ORCUS_MSO_DLLPUBLIC __attribute__ ((visibility ("default"))) + #define ORCUS_MSO_DLLLOCAL __attribute__ ((visibility ("hidden"))) + #else + #define ORCUS_MSO_DLLPUBLIC + #define ORCUS_MSO_DLLLOCAL + #endif +#endif + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/exception.hpp b/include/orcus/exception.hpp new file mode 100644 index 0000000..5d1aa82 --- /dev/null +++ b/include/orcus/exception.hpp @@ -0,0 +1,152 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_EXCEPTION_HPP +#define INCLUDED_ORCUS_EXCEPTION_HPP + +#include <stdexcept> +#include <string> + +#include "env.hpp" + +namespace orcus { + +class ORCUS_PSR_DLLPUBLIC general_error : public std::exception +{ +public: + explicit general_error(std::string msg); + explicit general_error(std::string_view cls, std::string_view msg); + virtual ~general_error() noexcept; + virtual const char* what() const noexcept; + +protected: + void append_msg(const std::string& s); + +private: + std::string m_msg; +}; + +class ORCUS_PSR_DLLPUBLIC invalid_arg_error : public std::invalid_argument +{ +public: + explicit invalid_arg_error(const std::string& msg); + virtual ~invalid_arg_error() noexcept; +}; + +class ORCUS_PSR_DLLPUBLIC xml_structure_error : public general_error +{ +public: + explicit xml_structure_error(std::string msg); + virtual ~xml_structure_error() noexcept; +}; + +class ORCUS_PSR_DLLPUBLIC json_structure_error : public general_error +{ +public: + explicit json_structure_error(std::string msg); + virtual ~json_structure_error() noexcept; +}; + +class 
ORCUS_PSR_DLLPUBLIC invalid_map_error : public general_error +{ +public: + explicit invalid_map_error(std::string msg); + virtual ~invalid_map_error() noexcept; +}; + +class ORCUS_PSR_DLLPUBLIC value_error : public general_error +{ +public: + explicit value_error(std::string msg); + virtual ~value_error() noexcept; +}; + +/** + * Error indicating improper xpath syntax. + */ +class ORCUS_PSR_DLLPUBLIC xpath_error : public general_error +{ +public: + xpath_error(std::string msg); + virtual ~xpath_error() noexcept; +}; + +/** + * This gets thrown when a public interface method is expected to return a + * non-null pointer to another interface but actually returns a null pointer. + */ +class ORCUS_PSR_DLLPUBLIC interface_error : public general_error +{ +public: + interface_error(std::string msg); + virtual ~interface_error() noexcept; +}; + +/** + * Exception related to a parsing error that includes an offset in the stream + * where the error occurred. + */ +class ORCUS_PSR_DLLPUBLIC parse_error : public general_error +{ + std::ptrdiff_t m_offset; /// offset in the stream where the error occurred. + +protected: + parse_error(std::string_view cls, std::string_view msg, std::ptrdiff_t offset); + +public: + parse_error(std::string msg, std::ptrdiff_t offset); + + /** + * Get the offset in a stream associated with the error. + * + * @return offset in a stream where the error occurred. + */ + std::ptrdiff_t offset() const; + + static void throw_with( + std::string_view msg_before, char c, std::string_view msg_after, std::ptrdiff_t offset); + + static void throw_with( + std::string_view msg_before, std::string_view msg, std::string_view msg_after, std::ptrdiff_t offset); +}; + +/** + * This exception is thrown when SAX parser detects a malformed XML document. 
+ */ +class ORCUS_PSR_DLLPUBLIC malformed_xml_error : public parse_error +{ +public: + malformed_xml_error() = delete; + malformed_xml_error(std::string_view msg, std::ptrdiff_t offset); + virtual ~malformed_xml_error(); +}; + +/** + * Exception related to parsing of zip archive stream. + */ +class ORCUS_PSR_DLLPUBLIC zip_error : public general_error +{ +public: + zip_error(std::string_view msg); + virtual ~zip_error(); +}; + +namespace detail { + +/** + * Internal error used in multi-threaded parsing to signal that the parser + * thread has been aborted. + */ +class ORCUS_PSR_DLLPUBLIC parsing_aborted_error : public std::exception {}; + +} + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/format_detection.hpp b/include/orcus/format_detection.hpp new file mode 100644 index 0000000..f4754bc --- /dev/null +++ b/include/orcus/format_detection.hpp @@ -0,0 +1,52 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef ORCUS_FORMAT_DETECTION_HPP +#define ORCUS_FORMAT_DETECTION_HPP + +#include <orcus/env.hpp> +#include <orcus/types.hpp> + +#include <cstdlib> +#include <memory> + +namespace orcus { + +namespace iface { + +class import_filter; + +} + +namespace spreadsheet { namespace iface { + +class import_factory; + +}} + +/** + * Detect the format of a given document stream. + * + * @param strm document stream to detect the format of. + */ +ORCUS_DLLPUBLIC format_t detect(std::string_view strm); + +/** + * Create an instance of import_filter for a specified format. + * + * @param type Format type to create an instance of import_filter of. + * @param factory Pointer to an import factory instance. It must not be null. 
+ * + * @return Pointer to an instance of import_filter for specified format. + */ +ORCUS_DLLPUBLIC std::shared_ptr<iface::import_filter> create_filter( + format_t type, spreadsheet::iface::import_factory* factory); + +} // namespace orcus + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/info.hpp b/include/orcus/info.hpp new file mode 100644 index 0000000..61866fa --- /dev/null +++ b/include/orcus/info.hpp @@ -0,0 +1,23 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_INFO_HPP +#define INCLUDED_ORCUS_INFO_HPP + +#include "orcus/env.hpp" + +namespace orcus { + +ORCUS_DLLPUBLIC int get_version_major(); +ORCUS_DLLPUBLIC int get_version_minor(); +ORCUS_DLLPUBLIC int get_version_micro(); + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/interface.hpp b/include/orcus/interface.hpp new file mode 100644 index 0000000..b08a9ee --- /dev/null +++ b/include/orcus/interface.hpp @@ -0,0 +1,92 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_INTERFACE_HPP +#define INCLUDED_ORCUS_INTERFACE_HPP + +#include "orcus/env.hpp" +#include "orcus/types.hpp" + +#include <string> +#include <memory> + +namespace orcus { + +struct config; + +namespace iface { + +/** + * Base interface for import filters. 
+ */ +class ORCUS_DLLPUBLIC import_filter +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + import_filter(format_t input); + virtual ~import_filter(); + + /** + * Read the content of a file. + * + * @param filepath path to a local file. It must be a system path. + */ + virtual void read_file(std::string_view filepath) = 0; + + /** + * Read the content of an in-memory stream. + * + * @param stream in-memory stream to read from. + */ + virtual void read_stream(std::string_view stream) = 0; + + /** + * Get the name of a filter. + * + * @return name of a filter. + */ + virtual std::string_view get_name() const = 0; + + void set_config(const orcus::config& v); + const orcus::config& get_config() const; +}; + +/** + * Base interface for document content dumpers. + */ +class ORCUS_DLLPUBLIC document_dumper +{ +public: + virtual ~document_dumper(); + + /** + * Dump the content of a document in a specified format, either into set of + * multiple files, or a single file. + * + * @param format Output format type in which to dump the content. + * @param output Depending on the output format type, this can be either an + * output directory path where multiple output files get + * created, or an output file path where the content of the + * entire document gets dumped into. + */ + virtual void dump(dump_format_t format, const std::string& output) const = 0; + + /** + * Dump the content of a document in a specialized "check" format suitable + * for content verification. + * + * @param os output stream to write the transformed content to. 
+ */ + virtual void dump_check(std::ostream& os) const = 0; +}; + +}} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/json_document_tree.hpp b/include/orcus/json_document_tree.hpp new file mode 100644 index 0000000..e558c38 --- /dev/null +++ b/include/orcus/json_document_tree.hpp @@ -0,0 +1,504 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_JSON_DOCUMENT_TREE_HPP +#define INCLUDED_ORCUS_JSON_DOCUMENT_TREE_HPP + +#include "env.hpp" +#include "exception.hpp" + +#include <string> +#include <memory> +#include <vector> + +namespace orcus { + +struct json_config; + +namespace json { + +struct json_value; +struct document_resource; +class document_tree; + +/** + * Exception related to JSON document tree construction. + */ +class ORCUS_DLLPUBLIC document_error : public general_error +{ +public: + document_error(const std::string& msg); + virtual ~document_error(); +}; + +/** + * Exception that gets thrown due to ambiguity when you specify a braced + * list that can be interpreted either as a key-value pair inside an object + * or as values of an array. + */ +class ORCUS_DLLPUBLIC key_value_error : public document_error +{ +public: + key_value_error(const std::string& msg); + virtual ~key_value_error(); +}; + +enum class node_t : uint8_t +{ + /** node type is not set. */ + unset = 0, + /** JSON string node. A node of this type contains a string value. */ + string = 1, + /** JSON number node. A node of this type contains a numeric value. */ + number = 2, + /** + * JSON object node. A node of this type contains one or more key-value + * pairs. + */ + object = 3, + /** + * JSON array node. A node of this type contains one or more child nodes. 
+ */ + array = 4, + /** + * JSON boolean node containing a value of 'true'. + */ + boolean_true = 5, + /** + * JSON boolean node containing a value of 'false'. + */ + boolean_false = 6, + /** + * JSON node containing a 'null' value. + */ + null = 7, +}; + +namespace detail { namespace init { class node; }} + +class const_node; +class document_tree; + +class ORCUS_DLLPUBLIC const_node_iterator +{ + friend class const_node; + + struct impl; + std::unique_ptr<impl> mp_impl; + + const_node_iterator(const document_tree* doc, const const_node& v, bool begin); + +public: + const_node_iterator(); + const_node_iterator(const const_node_iterator& other); + ~const_node_iterator(); + + const const_node& operator*() const; + const const_node* operator->() const; + + const_node_iterator& operator++(); + const_node_iterator operator++(int); + + const_node_iterator& operator--(); + const_node_iterator operator--(int); + + bool operator== (const const_node_iterator& other) const; + bool operator!= (const const_node_iterator& other) const; + + const_node_iterator& operator= (const const_node_iterator& other); +}; + +/** + * Each node instance represents a JSON value stored in the document tree. + * It's immutable. + */ +class ORCUS_DLLPUBLIC const_node +{ + friend class document_tree; + friend class const_node_iterator; + +protected: + struct impl; + std::unique_ptr<impl> mp_impl; + + const_node(const document_tree* doc, json_value* jv); + const_node(std::unique_ptr<impl>&& p); +public: + const_node() = delete; + + const_node(const const_node& other); + const_node(const_node&& rhs); + ~const_node(); + + /** + * Get the type of a node. + * + * @return node type. + */ + node_t type() const; + + /** + * Get the number of child nodes if any. + * + * @return number of child nodes. + */ + size_t child_count() const; + + /** + * Get a list of keys stored in a JSON object node. + * + * @exception orcus::json::document_error if the node is not of the object + * type. 
+ * @return a list of keys. + */ + std::vector<std::string_view> keys() const; + + /** + * Get the key by index in a JSON object node. This method works only + * when the <b>preserve object order</b> option is set. + * + * @param index 0-based key index. + * + * @exception orcus::json::document_error if the node is not of the object + * type. + * + * @exception std::out_of_range if the index is equal to or greater than + * the number of keys stored in the node. + * + * @return key value. + */ + std::string_view key(size_t index) const; + + /** + * Query whether or not a particular key exists in a JSON object node. + * + * @param key key value. + * + * @return true if this object node contains the specified key, otherwise + * false. If this node is not of a JSON object type, false is + * returned. + */ + bool has_key(std::string_view key) const; + /** + * Get a child node by index. + * + * @param index 0-based index of a child node. + * + * @exception orcus::json::document_error if the node is not one of the + * object or array types. + * + * @exception std::out_of_range if the index is equal to or greater than + * the number of child nodes that the node has. + * + * @return child node instance. + */ + const_node child(size_t index) const; + + /** + * Get a child node by textual key value. + * + * @param key textual key value to get a child node by. + * + * @exception orcus::json::document_error if the node is not of the object + * type, or the node doesn't have the specified key. + * + * @return child node instance. + */ + const_node child(std::string_view key) const; + + /** + * Get the parent node. + * + * @exception orcus::json::document_error if the node doesn't have a parent + * node which implies that the node is a root node. + * + * @return parent node instance. + */ + const_node parent() const; + + /** + * Get the last child node. + * + * @exception orcus::json::document_error if the node is not of array type + * or node has no children. 
+ * + * @return last child node instance. + */ + const_node back() const; + + /** + * Get the string value of a JSON string node. + * + * @exception orcus::json::document_error if the node is not of the string + * type. + * + * @return string value. + */ + std::string_view string_value() const; + + /** + * Get the numeric value of a JSON number node. + * + * @exception orcus::json::document_error if the node is not of the number + * type. + * + * @return numeric value. + */ + double numeric_value() const; + + const_node& operator=(const const_node& other); + const_node& operator=(const_node&& other); + + /** + * Return an identifier of the JSON value object that the node + * represents. The identifier is derived directly from the memory address + * of the value object. + * + * @return identifier of the JSON value object. + */ + uintptr_t identity() const; + + const_node_iterator begin() const; + const_node_iterator end() const; +}; + +/** + * Each node instance represents a JSON value stored in the document tree. + * This class allows mutable operations. + */ +class ORCUS_DLLPUBLIC node : public const_node +{ + friend class document_tree; + + node(const document_tree* doc, json_value* jv); + node(const_node&& rhs); + +public: + node() = delete; + + node(const node& other); + node(node&& rhs); + ~node(); + + node& operator=(const node& other); + node& operator=(const detail::init::node& v); + node operator[](std::string_view key); + + /** + * Get a child node by index. + * + * @param index 0-based index of a child node. + * + * @exception orcus::json::document_error if the node is not one of the + * object or array types. + * + * @exception std::out_of_range if the index is equal to or greater than + * the number of child nodes that the node has. + * + * @return child node instance. + */ + node child(size_t index); + + /** + * Get a child node by textual key value. + * + * @param key textual key value to get a child node by. 
+ * + * @exception orcus::json::document_error if the node is not of the object + * type, or the node doesn't have the specified key. + * + * @return child node instance. + */ + node child(std::string_view key); + + /** + * Get the parent node. + * + * @exception orcus::json::document_error if the node doesn't have a parent + * node which implies that the node is a root node. + * + * @return parent node instance. + */ + node parent(); + + /** + * Get the last child node. + * + * @exception orcus::json::document_error if the node is not of array type + * or node has no children. + * + * @return last child node instance. + */ + node back(); + + /** + * Append a new node value to the end of the array. + * + * @exception orcus::json::document_error if the node is not of array + * type. + * @param v new node value to append to the end of the array. + */ + void push_back(const detail::init::node& v); +}; + +/** + * This class represents a JSON array, to be used to explicitly create an + * array instance during initialization. + */ +class ORCUS_DLLPUBLIC array +{ + friend class detail::init::node; + friend class document_tree; + + std::vector<detail::init::node> m_vs; +public: + array(); + array(const array&) = delete; + array(array&& other); + array(std::initializer_list<detail::init::node> vs); + ~array(); +}; + +/** + * This class represents a JSON object, primarily to be used to create an + * empty object instance. + */ +class ORCUS_DLLPUBLIC object +{ +public: + object(); + object(const object&) = delete; + object(object&& other); + ~object(); +}; + +namespace detail { namespace init { + +/** + * Node to store an initial value during document tree initialization. It's + * not meant to be instantiated explicitly. A value passed from the braced + * initialization list is implicitly converted to an instance of this class. 
+ */ +class ORCUS_DLLPUBLIC node +{ + friend class ::orcus::json::document_tree; + friend class ::orcus::json::node; + + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + node(double v); + node(int v); + node(bool b); + node(std::nullptr_t); + node(const char* p); + node(const std::string& s); + node(std::initializer_list<detail::init::node> vs); + node(json::array array); + node(json::object obj); + + node(const node& other) = delete; + node(node&& other); + ~node(); + + node& operator= (node other) = delete; + +private: + node_t type() const; + json_value* to_json_value(document_resource& res) const; + void store_to_node(document_resource& res, json_value* parent) const; +}; + +}} + +/** + * This class stores a parsed JSON document tree structure. + */ +class ORCUS_DLLPUBLIC document_tree +{ + friend class const_node; + friend class node; + + struct impl; + std::unique_ptr<impl> mp_impl; + + const document_resource& get_resource() const; + +public: + document_tree(); + document_tree(const document_tree&) = delete; + document_tree(document_tree&& other); + document_tree(document_resource& res); + document_tree(std::initializer_list<detail::init::node> vs); + document_tree(array vs); + document_tree(object obj); + ~document_tree(); + + document_tree& operator= (std::initializer_list<detail::init::node> vs); + document_tree& operator= (array vs); + document_tree& operator= (object obj); + + /** + * Load raw string stream containing a JSON structure to populate the + * document tree. + * + * @param stream stream containing a JSON structure. + * @param config configuration object. + */ + void load(std::string_view stream, const json_config& config); + + /** + * Get the root node of the document. + * + * @return root node of the document. + */ + json::const_node get_document_root() const; + + /** + * Get the root node of the document. + * + * @return root node of the document. 
+ */ + json::node get_document_root(); + + /** + * Dump the JSON document tree to string. + * + * @return a string representation of the JSON document tree. + */ + std::string dump() const; + + /** + * Dump the JSON document tree to an XML structure. + * + * @return a string containing an XML structure representing the JSON + * content. + */ + std::string dump_xml() const; + + /** + * Dump the JSON document tree as YAML output. + * + * @return string containing a YAML output representing the JSON document + * tree structure. + */ + std::string dump_yaml() const; + + /** + * Swap the content of the document with another document instance. + * + * @param other document instance to swap the content with. + */ + void swap(document_tree& other); +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/json_global.hpp b/include/orcus/json_global.hpp new file mode 100644 index 0000000..8c3a6e7 --- /dev/null +++ b/include/orcus/json_global.hpp @@ -0,0 +1,30 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_JSON_GLOBAL_HPP +#define INCLUDED_ORCUS_JSON_GLOBAL_HPP + +#include "orcus/env.hpp" + +#include <string> + +namespace orcus { namespace json { + +/** + * Properly escape an input string appropriate for json output. + * + * @param input string value to escape. + * + * @return escaped string value. 
+ */ +ORCUS_PSR_DLLPUBLIC std::string escape_string(const std::string& input); + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/json_parser.hpp b/include/orcus/json_parser.hpp new file mode 100644 index 0000000..b021ff8 --- /dev/null +++ b/include/orcus/json_parser.hpp @@ -0,0 +1,402 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_JSON_PARSER_HPP +#define INCLUDED_ORCUS_JSON_PARSER_HPP + +#include "json_parser_base.hpp" + +#include <cassert> +#include <cmath> + +namespace orcus { + +class json_handler +{ +public: + /** + * Called when the parsing begins. + */ + void begin_parse() {} + + /** + * Called when the parsing ends. + */ + void end_parse() {} + + /** + * Called when the opening brace of an array is encountered. + */ + void begin_array() {} + + /** + * Called when the closing brace of an array is encountered. + */ + void end_array() {} + + /** + * Called when the opening curly brace of an object is encountered. + */ + void begin_object() {} + + /** + * Called when a key value string of an object is encountered. + * + * @param key key value string. + * @param transient true if the string value is stored in a temporary + * buffer which is not guaranteed to hold the string + * value after the end of this callback. When false, the + * pointer points to somewhere in the JSON stream being + * parsed. + */ + void object_key(std::string_view key, bool transient) + { + (void)key; (void)transient; + } + + /** + * Called when the closing curly brace of an object is encountered. + */ + void end_object() {} + + /** + * Called when a boolean 'true' keyword is encountered. 
+ */ + void boolean_true() {} + + /** + * Called when a boolean 'false' keyword is encountered. + */ + void boolean_false() {} + + /** + * Called when a 'null' keyword is encountered. + */ + void null() {} + + /** + * Called when a string value is encountered. + * + * @param val string value. + * @param transient true if the string value is stored in a temporary + * buffer which is not guaranteed to hold the string + * value after the end of this callback. When false, the + * pointer points to somewhere in the JSON stream being + * parsed. + */ + void string(std::string_view val, bool transient) + { + (void)val; (void)transient; + } + + /** + * Called when a numeric value is encountered. + * + * @param val numeric value. + */ + void number(double val) + { + (void)val; + } +}; + +/** + * Parser for JSON documents. + * + * @tparam HandlerT Handler type with member functions for event callbacks. + * Refer to json_handler. + */ +template<typename HandlerT> +class json_parser : public json::parser_base +{ +public: + typedef HandlerT handler_type; + + /** + * Constructor. + * + * @param content string stream containing JSON string. + * @param hdl handler class instance. + */ + json_parser(std::string_view content, handler_type& hdl); + + /** + * Call this method to start parsing.
+ */ + void parse(); + +private: + void root_value(); + void value(); + void array(); + void end_array(); + void object(); + void number(); + void string(); + +private: + handler_type& m_handler; +}; + +template<typename _Handler> +json_parser<_Handler>::json_parser( + std::string_view content, handler_type& hdl) : + json::parser_base(content), m_handler(hdl) {} + +template<typename _Handler> +void json_parser<_Handler>::parse() +{ + m_handler.begin_parse(); + + skip_ws(); + if (has_char()) + root_value(); + else + throw parse_error("parse: no json content could be found in file", offset()); + + if (has_char()) + throw parse_error("parse: unexpected trailing string segment.", offset()); + + m_handler.end_parse(); +} + +template<typename _Handler> +void json_parser<_Handler>::root_value() +{ + char c = cur_char(); + + switch (c) + { + case '[': + array(); + break; + case '{': + object(); + break; + default: + parse_error::throw_with( + "root_value: either '[' or '{' was expected, but '", cur_char(), "' was found.", offset()); + } +} + +template<typename _Handler> +void json_parser<_Handler>::value() +{ + char c = cur_char(); + if (is_numeric(c)) + { + number(); + return; + } + + switch (c) + { + case '-': + number(); + break; + case '[': + array(); + break; + case '{': + object(); + break; + case 't': + parse_true(); + m_handler.boolean_true(); + break; + case 'f': + parse_false(); + m_handler.boolean_false(); + break; + case 'n': + parse_null(); + m_handler.null(); + break; + case '"': + string(); + break; + default: + parse_error::throw_with("value: failed to parse '", cur_char(), "'.", offset()); + } +} + +template<typename _Handler> +void json_parser<_Handler>::array() +{ + assert(cur_char() == '['); + + m_handler.begin_array(); + for (next(); has_char(); next()) + { + skip_ws(); + + if (cur_char() == ']') + { + end_array(); + return; + } + + value(); + skip_ws(); + + if (has_char()) + { + switch (cur_char()) + { + case ']': + end_array(); + return; + case ',': 
+ if (peek_char() == ']') + { + parse_error::throw_with( + "array: ']' expected but '", cur_char(), "' found.", offset() ); + } + continue; + default: + parse_error::throw_with( + "array: either ']' or ',' expected, but '", cur_char(), "' found.", offset()); + } + } + else + { + // needs to be handled here, + // we would call next() before checking again with has_char() which + // is already past the end + break; + } + } + + throw parse_error("array: failed to parse array.", offset()); +} + +template<typename _Handler> +void json_parser<_Handler>::end_array() +{ + m_handler.end_array(); + next(); + skip_ws(); +} + +template<typename _Handler> +void json_parser<_Handler>::object() +{ + assert(cur_char() == '{'); + + bool require_new_key = false; + m_handler.begin_object(); + for (next(); has_char(); next()) + { + skip_ws(); + if (!has_char()) + throw parse_error("object: stream ended prematurely before reaching a key.", offset()); + + switch (cur_char()) + { + case '}': + if (require_new_key) + { + parse_error::throw_with( + "object: new key expected, but '", cur_char(), "' found.", offset()); + } + m_handler.end_object(); + next(); + skip_ws(); + return; + case '"': + break; + default: + parse_error::throw_with( + "object: '\"' was expected, but '", cur_char(), "' found.", offset()); + } + require_new_key = false; + + parse_quoted_string_state res = parse_string(); + if (!res.str) + { + // Parsing was unsuccessful. 
+ if (res.length == parse_quoted_string_state::error_no_closing_quote) + throw parse_error("object: stream ended prematurely before reaching the closing quote of a key.", offset()); + else if (res.length == parse_quoted_string_state::error_illegal_escape_char) + parse_error::throw_with( + "object: illegal escape character '", cur_char(), "' in key value.", offset()); + else + throw parse_error("object: unknown error while parsing a key value.", offset()); + } + + m_handler.object_key({res.str, res.length}, res.transient); + + skip_ws(); + if (cur_char() != ':') + parse_error::throw_with( + "object: ':' was expected, but '", cur_char(), "' found.", offset()); + + next(); + skip_ws(); + + if (!has_char()) + throw parse_error("object: stream ended prematurely before reaching a value.", offset()); + + value(); + + skip_ws(); + if (!has_char()) + throw parse_error("object: stream ended prematurely before reaching either '}' or ','.", offset()); + + switch (cur_char()) + { + case '}': + m_handler.end_object(); + next(); + skip_ws(); + return; + case ',': + require_new_key = true; + continue; + default: + parse_error::throw_with( + "object: either '}' or ',' expected, but '", cur_char(), "' found.", offset()); + } + } + + throw parse_error("object: closing '}' was never reached.", offset()); +} + +template<typename _Handler> +void json_parser<_Handler>::number() +{ + assert(is_numeric(cur_char()) || cur_char() == '-'); + + double val = parse_double_or_throw(); + m_handler.number(val); + skip_ws(); +} + +template<typename _Handler> +void json_parser<_Handler>::string() +{ + parse_quoted_string_state res = parse_string(); + if (res.str) + { + m_handler.string({res.str, res.length}, res.transient); + return; + } + + // Parsing was unsuccessful. 
+ if (res.length == parse_quoted_string_state::error_no_closing_quote) + throw parse_error("string: stream ended prematurely before reaching the closing quote.", offset()); + else if (res.length == parse_quoted_string_state::error_illegal_escape_char) + parse_error::throw_with("string: illegal escape character '", cur_char(), "'.", offset()); + else + throw parse_error("string: unknown error.", offset()); +} + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/json_parser_base.hpp b/include/orcus/json_parser_base.hpp new file mode 100644 index 0000000..461808e --- /dev/null +++ b/include/orcus/json_parser_base.hpp @@ -0,0 +1,46 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_JSON_PARSER_BASE_HPP +#define INCLUDED_ORCUS_JSON_PARSER_BASE_HPP + +#include "parser_base.hpp" +#include "parser_global.hpp" +#include "exception.hpp" + +#include <memory> + +namespace orcus { namespace json { + +class ORCUS_PSR_DLLPUBLIC parser_base : public ::orcus::parser_base +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +protected: + + parser_base() = delete; + parser_base(const parser_base&) = delete; + parser_base& operator=(const parser_base&) = delete; + + parser_base(std::string_view content); + ~parser_base(); + + void skip_ws(); + void parse_true(); + void parse_false(); + void parse_null(); + double parse_double_or_throw(); + + parse_quoted_string_state parse_string(); +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/json_parser_thread.hpp b/include/orcus/json_parser_thread.hpp new file mode 100644 index 0000000..8328ef1 --- /dev/null +++ b/include/orcus/json_parser_thread.hpp @@ -0,0 +1,104 @@ +/* -*- Mode: 
C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_JSON_PARSER_THREAD_HPP +#define INCLUDED_ORCUS_JSON_PARSER_THREAD_HPP + +#include "env.hpp" +#include "types.hpp" + +#include <memory> +#include <vector> +#include <ostream> +#include <variant> + +namespace orcus { + +class string_pool; + +namespace json { + +struct ORCUS_PSR_DLLPUBLIC parser_stats +{ + size_t token_buffer_size_threshold; +}; + +enum class parse_token_t +{ + unknown, + begin_parse, + end_parse, + begin_array, + end_array, + begin_object, + object_key, + end_object, + boolean_true, + boolean_false, + null, + string, + number, + parse_error, +}; + +struct ORCUS_PSR_DLLPUBLIC parse_token +{ + using value_type = std::variant<std::string_view, parse_error_value_t, double>; + + parse_token_t type; + value_type value; + + parse_token(); + parse_token(parse_token_t _type); + parse_token(parse_token_t _type, std::string_view s); + parse_token(std::string_view s, std::ptrdiff_t offset); + parse_token(double value); + + parse_token(const parse_token& other); + + parse_token& operator= (parse_token) = delete; + + bool operator== (const parse_token& other) const; + bool operator!= (const parse_token& other) const; +}; + +typedef std::vector<parse_token> parse_tokens_t; + +ORCUS_PSR_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const parse_tokens_t& tokens); + +class ORCUS_PSR_DLLPUBLIC parser_thread +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + parser_thread(const char* p, size_t n, size_t min_token_size); + parser_thread(const char* p, size_t n, size_t min_token_size, size_t max_token_size); + ~parser_thread(); + + void start(); + + /** + * Wait until new set of tokens becomes available. + * + * @param tokens new set of tokens. 
+ * + * @return true if the parsing is still in progress (therefore more tokens + * to come), false if it's done i.e. this is the last token set. + */ + bool next_tokens(parse_tokens_t& tokens); + + parser_stats get_stats() const; + + void swap_string_pool(string_pool& pool); +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/json_structure_tree.hpp b/include/orcus/json_structure_tree.hpp new file mode 100644 index 0000000..ad77f5c --- /dev/null +++ b/include/orcus/json_structure_tree.hpp @@ -0,0 +1,137 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_JSON_STRUCTURE_TREE_HPP +#define INCLUDED_ORCUS_JSON_STRUCTURE_TREE_HPP + +#include "orcus/env.hpp" +#include "orcus/types.hpp" + +#include <ostream> +#include <memory> +#include <vector> +#include <functional> + +namespace orcus { namespace json { + +struct ORCUS_DLLPUBLIC table_range_t +{ + std::vector<std::string> paths; + std::vector<std::string> row_groups; +}; + +class ORCUS_DLLPUBLIC structure_tree +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + + enum class node_type : short { unknown = 0, array = 1, object = 2, object_key = 3, value = 4 }; + + struct node_properties + { + node_type type; + bool repeat; + }; + + class ORCUS_DLLPUBLIC walker + { + friend class structure_tree; + + struct impl; + std::unique_ptr<impl> mp_impl; + + walker(const structure_tree::impl* parent_impl); + public: + walker(); + walker(const walker& other); + ~walker(); + + /** + * Set the current position to the root node. Call get_node() to + * retrieve its properties. + */ + void root(); + + /** + * Move down to a child node at specified position.
Call + * child_count() to get the number of child nodes the current node + * has. A child node position is 0-based and must be less than the + * child count. + * + * @param child_pos 0-based index of the child node to move down to. + */ + void descend(size_t child_pos); + + /** + * Move up to the parent node of the current node. + */ + void ascend(); + + /** + * Return the number of child nodes the current node has. + * + * @return number of child nodes of the current node. + */ + size_t child_count() const; + + /** + * Get the properties of the current node. + */ + node_properties get_node() const; + + /** + * Build one or more field paths for the current value node. For a + * value node that is a child of an object, you'll always get one + * path, whereas for a value node that is a child of an array, you may + * get more than one field path. + * + * @return one or more field paths built for the current value node. + */ + std::vector<std::string> build_field_paths() const; + + /** + * Build a path for the parent of the current repeating node. A row + * group is an anchor to which repeating nodes get anchored. It is + * used to determine when to increment row position during mapping. + * + * @return path for the row group of the current repeating node. + */ + std::string build_row_group_path() const; + }; + + structure_tree(const structure_tree&) = delete; + structure_tree& operator= (const structure_tree&) = delete; + + structure_tree(); + ~structure_tree(); + + void parse(std::string_view stream); + + /** + * For now, normalizing a tree just means sorting child nodes. We may add + * other normalization stuff later.
+ */ + void normalize_tree(); + + void dump_compact(std::ostream& os) const; + + walker get_walker() const; + + using range_handler_type = std::function<void(table_range_t&&)>; + + void process_ranges(range_handler_type rh) const; +}; + +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, structure_tree::node_type nt); + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/measurement.hpp b/include/orcus/measurement.hpp new file mode 100644 index 0000000..7444ae0 --- /dev/null +++ b/include/orcus/measurement.hpp @@ -0,0 +1,41 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef ORCUS_MEASUREMENT_HPP +#define ORCUS_MEASUREMENT_HPP + +#include "types.hpp" +#include "env.hpp" + +#include <cstdlib> +#include <string> + +namespace orcus { + +ORCUS_DLLPUBLIC double to_double(std::string_view s, const char** p_parse_ended = nullptr); +ORCUS_DLLPUBLIC long to_long(std::string_view s, const char** p_parse_ended = nullptr); +ORCUS_DLLPUBLIC bool to_bool(std::string_view s); + +/** + * Parse a string value containing a part representing a numerical value + * optionally followed by a part representing a unit of measurement. + * + * Examples of such string value are: "1.234in", "0.34cm" and so on. + * + * @param str original string value. + * + * @return structure containing a numerical value and a unit of measurement + * that the original string value represents. 
+ */ +ORCUS_DLLPUBLIC length_t to_length(std::string_view str); + +ORCUS_DLLPUBLIC double convert(double value, length_unit_t unit_from, length_unit_t unit_to); + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/orcus_csv.hpp b/include/orcus/orcus_csv.hpp new file mode 100644 index 0000000..3e34c15 --- /dev/null +++ b/include/orcus/orcus_csv.hpp @@ -0,0 +1,41 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef ORCUS_ORCUS_CSV_HPP +#define ORCUS_ORCUS_CSV_HPP + +#include "interface.hpp" + +namespace orcus { + +namespace spreadsheet { namespace iface { + class import_factory; +}} + +class ORCUS_DLLPUBLIC orcus_csv : public iface::import_filter +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + orcus_csv() = delete; + orcus_csv(const orcus_csv&) = delete; + orcus_csv& operator=(const orcus_csv&) = delete; + + orcus_csv(spreadsheet::iface::import_factory* factory); + ~orcus_csv(); + + virtual void read_file(std::string_view filepath) override; + virtual void read_stream(std::string_view stream) override; + + virtual std::string_view get_name() const override; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/orcus_gnumeric.hpp b/include/orcus/orcus_gnumeric.hpp new file mode 100644 index 0000000..54f74a2 --- /dev/null +++ b/include/orcus/orcus_gnumeric.hpp @@ -0,0 +1,43 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef ORCUS_ORCUS_GNUMERIC_HPP +#define ORCUS_ORCUS_GNUMERIC_HPP + +#include "interface.hpp" + +#include <memory> + +namespace orcus { + +namespace spreadsheet { namespace iface { class import_factory; }} + +class ORCUS_DLLPUBLIC orcus_gnumeric : public iface::import_filter +{ + struct impl; + std::unique_ptr<impl> mp_impl; +public: + orcus_gnumeric() = delete; + orcus_gnumeric(const orcus_gnumeric&) = delete; + orcus_gnumeric& operator=(const orcus_gnumeric&) = delete; + + orcus_gnumeric(spreadsheet::iface::import_factory* factory); + ~orcus_gnumeric(); + + static bool detect(const unsigned char* blob, size_t size); + + virtual void read_file(std::string_view filepath) override; + + virtual void read_stream(std::string_view stream) override; + + virtual std::string_view get_name() const override; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/orcus_import_ods.hpp b/include/orcus/orcus_import_ods.hpp new file mode 100644 index 0000000..1a94d0b --- /dev/null +++ b/include/orcus/orcus_import_ods.hpp @@ -0,0 +1,32 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef ORCUS_ORCUS_IMPORT_ODS_HPP +#define ORCUS_ORCUS_IMPORT_ODS_HPP + +#include "interface.hpp" + +namespace orcus { + +namespace spreadsheet { namespace iface { + class import_styles; +}} + +class ORCUS_DLLPUBLIC import_ods +{ +public: + import_ods() = delete; + import_ods(const import_ods&) = delete; + import_ods& operator=(const import_ods&) = delete; + + static void read_styles(std::string_view s, spreadsheet::iface::import_styles* data); +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/orcus_import_xlsx.hpp b/include/orcus/orcus_import_xlsx.hpp new file mode 100644 index 0000000..8523299 --- /dev/null +++ b/include/orcus/orcus_import_xlsx.hpp @@ -0,0 +1,37 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_ORCUS_IMPORT_XLSX_HPP +#define INCLUDED_ORCUS_ORCUS_IMPORT_XLSX_HPP + +#include "interface.hpp" + +namespace orcus { + +namespace spreadsheet { namespace iface { + class import_table; + class import_reference_resolver; +}} + +class ORCUS_DLLPUBLIC import_xlsx +{ +public: + import_xlsx() = delete; + import_xlsx(const import_xlsx&) = delete; + import_xlsx& operator=(const import_xlsx&) = delete; + + static void read_table( + std::string_view s, + spreadsheet::iface::import_table& table, + spreadsheet::iface::import_reference_resolver& resolver); +}; + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/orcus_json.hpp b/include/orcus/orcus_json.hpp new file mode 100644 index 0000000..7ec487f --- /dev/null +++ b/include/orcus/orcus_json.hpp @@ -0,0 +1,73 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_ORCUS_JSON_HPP +#define INCLUDED_ORCUS_ORCUS_JSON_HPP + +#include "env.hpp" +#include "./spreadsheet/types.hpp" + +#include <memory> +#include <string_view> + +namespace orcus { + +namespace spreadsheet { namespace iface { + +class import_factory; + +}} + +class ORCUS_DLLPUBLIC orcus_json +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + + orcus_json(const orcus_json&) = delete; + orcus_json& operator= (const orcus_json&) = delete; + + orcus_json(spreadsheet::iface::import_factory* im_fact); + ~orcus_json(); + + void set_cell_link(std::string_view path, std::string_view sheet, spreadsheet::row_t row, spreadsheet::col_t col); + + void start_range( + std::string_view sheet, spreadsheet::row_t row, spreadsheet::col_t col, bool row_header); + + void append_field_link(std::string_view path, std::string_view label); + void set_range_row_group(std::string_view path); + void commit_range(); + + void append_sheet(std::string_view name); + + void read_stream(std::string_view stream); + + /** + * Read a JSON string that contains an entire set of mapping rules. + * + * This method also inserts all necessary sheets into the document model. + * + * @param stream JSON string. + */ + void read_map_definition(std::string_view stream); + + /** + * Read a JSON string, and detect and define mapping rules for one or more + * ranges. + * + * @param stream JSON string. + */ + void detect_map_definition(std::string_view stream); +}; + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/orcus_ods.hpp b/include/orcus/orcus_ods.hpp new file mode 100644 index 0000000..08eb197 --- /dev/null +++ b/include/orcus/orcus_ods.hpp @@ -0,0 +1,58 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_ORCUS_ODS_HPP +#define INCLUDED_ORCUS_ORCUS_ODS_HPP + +#include "orcus/spreadsheet/import_interface.hpp" +#include "orcus/env.hpp" +#include "interface.hpp" + +#include <memory> + +namespace orcus { + +namespace spreadsheet { namespace iface { class import_factory; }} + +struct orcus_ods_impl; +class zip_archive; +class zip_archive_stream; + +class ORCUS_DLLPUBLIC orcus_ods : public iface::import_filter +{ + orcus_ods(const orcus_ods&); // disabled + orcus_ods& operator= (const orcus_ods&); // disabled + +public: + orcus_ods(spreadsheet::iface::import_factory* factory); + ~orcus_ods(); + + static bool detect(const unsigned char* blob, size_t size); + + virtual void read_file(std::string_view filepath) override; + + virtual void read_stream(std::string_view stream) override; + + virtual std::string_view get_name() const override; + +private: + static void list_content(const zip_archive& archive); + void read_styles(const zip_archive& archive); + void read_content(const zip_archive& archive); + void read_content_xml(const unsigned char* p, size_t size); + + void read_file_impl(zip_archive_stream* stream); + +private: + struct impl; + std::unique_ptr<impl> mp_impl; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/orcus_parquet.hpp b/include/orcus/orcus_parquet.hpp new file mode 100644 index 0000000..1dccf45 --- /dev/null +++ b/include/orcus/orcus_parquet.hpp @@ -0,0 +1,41 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#pragma once + +#include "./interface.hpp" +#include "./spreadsheet/import_interface.hpp" + +namespace orcus { + +namespace spreadsheet { namespace iface { class import_factory; }} + +class ORCUS_DLLPUBLIC orcus_parquet : public iface::import_filter +{ +public: + orcus_parquet(const orcus_parquet&) = delete; + orcus_parquet& operator=(const orcus_parquet&) = delete; + + orcus_parquet(spreadsheet::iface::import_factory* factory); + ~orcus_parquet(); + + static bool detect(const unsigned char* blob, std::size_t size); + + virtual void read_file(std::string_view filepath) override; + + virtual void read_stream(std::string_view stream) override; + + virtual std::string_view get_name() const override; + +private: + class impl; + std::unique_ptr<impl> mp_impl; +}; + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/orcus_xls_xml.hpp b/include/orcus/orcus_xls_xml.hpp new file mode 100644 index 0000000..4534bfc --- /dev/null +++ b/include/orcus/orcus_xls_xml.hpp @@ -0,0 +1,43 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_ORCUS_XLS_XML_HPP +#define INCLUDED_ORCUS_ORCUS_XLS_XML_HPP + +#include "interface.hpp" +#include <memory> + +namespace orcus { + +namespace spreadsheet { namespace iface { class import_factory; }} + +struct orcus_xls_xml_impl; + +class ORCUS_DLLPUBLIC orcus_xls_xml : public iface::import_filter +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + orcus_xls_xml(spreadsheet::iface::import_factory* factory); + ~orcus_xls_xml(); + + orcus_xls_xml(const orcus_xls_xml&) = delete; + orcus_xls_xml& operator= (const orcus_xls_xml&) = delete; + + static bool detect(const unsigned char* blob, size_t size); + + virtual void read_file(std::string_view filepath) override; + virtual void read_stream(std::string_view stream) override; + + virtual std::string_view get_name() const override; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/orcus_xlsx.hpp b/include/orcus/orcus_xlsx.hpp new file mode 100644 index 0000000..68b01c0 --- /dev/null +++ b/include/orcus/orcus_xlsx.hpp @@ -0,0 +1,87 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_ORCUS_XLSX_HPP +#define INCLUDED_ORCUS_ORCUS_XLSX_HPP + +#include "interface.hpp" + +#include <memory> + +namespace orcus { + +namespace spreadsheet { namespace iface { class import_factory; }} + +struct xlsx_rel_sheet_info; +struct xlsx_rel_table_info; +struct xlsx_rel_pivot_cache_info; +struct xlsx_rel_pivot_cache_record_info; +struct orcus_xlsx_impl; +class xlsx_opc_handler; + +class ORCUS_DLLPUBLIC orcus_xlsx : public iface::import_filter +{ + friend class xlsx_opc_handler; + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + orcus_xlsx(spreadsheet::iface::import_factory* factory); + ~orcus_xlsx(); + + orcus_xlsx(const orcus_xlsx&) = delete; + orcus_xlsx& operator= (const orcus_xlsx&) = delete; + + static bool detect(const unsigned char* blob, size_t size); + + virtual void read_file(std::string_view filepath) override; + virtual void read_stream(std::string_view stream) override; + + virtual std::string_view get_name() const override; + +private: + + void set_formulas_to_doc(); + + void read_workbook(const std::string& dir_path, const std::string& file_name); + + /** + * Parse a sheet xml part that contains data stored in a single sheet. + */ + void read_sheet(const std::string& dir_path, const std::string& file_name, xlsx_rel_sheet_info* data); + + /** + * Parse sharedStrings.xml part that contains a list of strings referenced + * in the document. 
+ */ + void read_shared_strings(const std::string& dir_path, const std::string& file_name); + + void read_styles(const std::string& dir_path, const std::string& file_name); + + void read_table(const std::string& dir_path, const std::string& file_name, xlsx_rel_table_info* data); + + void read_pivot_cache_def( + const std::string& dir_path, const std::string& file_name, + const xlsx_rel_pivot_cache_info* data); + + void read_pivot_cache_rec( + const std::string& dir_path, const std::string& file_name, + const xlsx_rel_pivot_cache_record_info* data); + + void read_pivot_table(const std::string& dir_path, const std::string& file_name); + + void read_rev_headers(const std::string& dir_path, const std::string& file_name); + + void read_rev_log(const std::string& dir_path, const std::string& file_name); + + void read_drawing(const std::string& dir_path, const std::string& file_name); +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/orcus_xml.hpp b/include/orcus/orcus_xml.hpp new file mode 100644 index 0000000..f20466f --- /dev/null +++ b/include/orcus/orcus_xml.hpp @@ -0,0 +1,155 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_ORCUS_XML_HPP +#define INCLUDED_ORCUS_ORCUS_XML_HPP + +#include "env.hpp" +#include "spreadsheet/types.hpp" + +#include <ostream> +#include <memory> + +namespace orcus { + +class xmlns_repository; + +namespace spreadsheet { namespace iface { + class import_factory; + class export_factory; +}} + +class ORCUS_DLLPUBLIC orcus_xml +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + orcus_xml(const orcus_xml&) = delete; + orcus_xml& operator= (const orcus_xml&) = delete; + + orcus_xml(xmlns_repository& ns_repo, spreadsheet::iface::import_factory* im_fact, spreadsheet::iface::export_factory* ex_fact); + ~orcus_xml(); + + /** + * Define a namespace and its alias used in a map file. + * + * @param alias alias for the namespace. + * @param uri namespace value. + * @param default_ns whether or not to use this namespace as the default + * namespace. When this value is set to true, the + * namespace being set will be applied for all elements + * and attributes used in the paths without explicit + * namespace values. + */ + void set_namespace_alias(std::string_view alias, std::string_view uri, bool default_ns=false); + + /** + * Define a mapping of a single element or attribute to a single cell + * location. + * + * @param xpath path to the element or attribute to link. + * @param sheet name of the sheet containing the linked cell location. + * @param row row index (0-based) of the linked cell location. + * @param col column index (0-based) of the linked cell location. + */ + void set_cell_link(std::string_view xpath, std::string_view sheet, spreadsheet::row_t row, spreadsheet::col_t col); + + /** + * Initiate the mapping definition of a linked range. The definition will + * get committed when the {@link commit_range} method is called. + * + * @param sheet name of the sheet containing the linked cell location. + * @param row row index (0-based) of the linked cell location. + * @param col column index (0-based) of the linked cell location.
+ */ + void start_range(std::string_view sheet, spreadsheet::row_t row, spreadsheet::col_t col); + + /** + * Append a field that is mapped to a specified path in the XML document + * to the current linked range. + * + * @param xpath path to the element or attribute to link as a field. + * @param label custom header label to use in lieu of the name of the + * linked entity. + */ + void append_field_link(std::string_view xpath, std::string_view label); + + /** + * Set the element located in the specified path as a row group in the + * current linked range. + * + * If the element is defined as a row-group element, the row index will + * increment whenever that element closes. + * + * @param xpath path to the element to use as a row group element. + */ + void set_range_row_group(std::string_view xpath); + + /** + * Commit the mapping definition of the current range. + */ + void commit_range(); + + /** + * Append a new sheet to the spreadsheet document. + * + * @param name name of the sheet. + */ + void append_sheet(std::string_view name); + + /** + * Read the stream containing the source XML document. + * + * @param stream stream containing the content of the source XML document. + */ + void read_stream(std::string_view stream); + + /** + * Read an XML stream that contains an entire set of mapping rules. + * + * This method also inserts all necessary sheets into the document model. + * + * @param stream stream containing the XML string. + */ + void read_map_definition(std::string_view stream); + + /** + * Read a stream containing the source XML document, automatically detect + * all linkable ranges and import them one range per sheet. + * + * @param stream stream containing the source XML document. + */ + void detect_map_definition(std::string_view stream); + + /** + * Read a stream containing the source XML document, automatically detect + * all linkable ranges, and write a map definition file depicting the + * detected ranges. 
+ * + * @param stream stream containing the source XML document. + * @param out output stream to write the map definition file to. + */ + void write_map_definition(std::string_view stream, std::ostream& out) const; + + /** + * Write the linked cells and ranges in the spreadsheet document as an XML + * document using the same map definition rules used to load the content. + * + * Note that this requires the source XML document stream, as it re-uses + * parts of the source stream. + * + * @param stream stream containing the source XML document. + * @param out output stream to write the XML document to. + */ + void write(std::string_view stream, std::ostream& out) const; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/parser_base.hpp b/include/orcus/parser_base.hpp new file mode 100644 index 0000000..b3d99a1 --- /dev/null +++ b/include/orcus/parser_base.hpp @@ -0,0 +1,155 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_PARSER_BASE_HPP +#define INCLUDED_ORCUS_PARSER_BASE_HPP + +#include "env.hpp" +#include "exception.hpp" + +#include <string> +#include <cstdlib> +#include <cstddef> +#include <cassert> +#include <functional> + +namespace orcus { + +class ORCUS_PSR_DLLPUBLIC parser_base +{ +protected: + using numeric_parser_type = std::function<const char*(const char*, const char*, double&)>; + + const char* const mp_begin; + const char* mp_char; + const char* mp_end; + +private: + numeric_parser_type m_func_parse_numeric; + +protected: + parser_base(const char* p, size_t n); + + void set_numeric_parser(const numeric_parser_type& func) + { + m_func_parse_numeric = func; + } + + bool has_char() const + { + assert(mp_char <= mp_end); + return mp_char != mp_end; + } + + bool has_next() const + { + assert((mp_char+1) <= mp_end); + return (mp_char+1) != mp_end; + } + + void next(size_t inc=1) { mp_char += inc; } + + void prev(size_t dec=1); + + char cur_char() const { return *mp_char; } + + /** + * Peek a character at specified offset from the current position without + * advancing the current position. + * + * @note The caller <strong>must</strong> ensure that the specified offset + * position is a valid position. This method does not check its + * validity. + * + * @param offset offset from the current position to peek at. + * + * @return character at a specified offset position from the current + * position. + */ + char peek_char(std::size_t offset=1) const; + + /** + * Peek a segment of contiguous characters of a specified length starting + * from the current position. + * + * @note The caller <strong>must</strong> ensure that the specified + * substring segment is entirely valid. This method does not check + * its validity. + * + * @param length length of the segment to peek. + * + * @return segment of contiguous characters. 
+ */ + std::string_view peek_chars(std::size_t length) const; + + /** + * Skip an optional byte order mark at the current position of the stream. + * + * Currently we only check for UTF-8 BOM. + */ + void skip_bom(); + + void skip(std::string_view chars_to_skip); + + /** + * Skip all characters that are 0-32 in ASCII range + */ + void skip_space_and_control(); + + /** + * Parse and check next characters to see if it matches specified + * character sequence. + * + * @param expected sequence of characters to match against. + * + * @return true if it matches specified character sequence, false + * otherwise. + */ + bool parse_expected(std::string_view expected); + + /** + * Try to parse the next characters as double, or return NaN in case of + * failure. + * + * @return double value on success, or NaN on failure. + */ + double parse_double(); + + /** + * Determine the number of characters remaining <strong>after</strong> the + * current character. For instance, if the current character is on the + * last character in the stream, this method will return 0, whereas if + * it's on the first character, it will return the total length - 1. + * + * @return number of characters remaining after the current character. + */ + size_t remaining_size() const; + + /** + * Determine the number of characters available from the current character + * to the end of the buffer. The current character is included. + * + * @return number of characters available including the current character. + */ + size_t available_size() const + { + return std::distance(mp_char, mp_end); + } + + /** + * Return the current offset from the beginning of the character stream. + * + * @return current offset from the beginning of the character stream. 
+ */ + std::ptrdiff_t offset() const; +}; + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/parser_global.hpp b/include/orcus/parser_global.hpp new file mode 100644 index 0000000..bf5971b --- /dev/null +++ b/include/orcus/parser_global.hpp @@ -0,0 +1,153 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef ORCUS_PARSER_GLOBAL_HPP +#define ORCUS_PARSER_GLOBAL_HPP + +#include "env.hpp" + +#include <sstream> + +namespace orcus { + +class cell_buffer; + +enum class string_escape_char_t +{ + invalid, + valid, + control_char +}; + +/** + * Stores state of string parsing. Upon successful parsing the str points + * to the first character of the string and the length stores the size of + * the string. When the parsing fails, the str value becomes nullptr and + * the length stores the error code. + */ +struct parse_quoted_string_state +{ + ORCUS_PSR_DLLPUBLIC static const size_t error_no_closing_quote; + ORCUS_PSR_DLLPUBLIC static const size_t error_illegal_escape_char; + + const char* str; + size_t length; + + /** + * When true, the str pointer points to the temporary buffer storage + * provided by the caller instead of the original character stream. The + * caller must allocate memory and copy the value to it before the buffer + * content changes if the parsed string value needs to be stored. + * + * When false, str points to a position in the original stream, and the + * caller doens't need to allocate memory to store the string value as + * long as the original character stream is alive. + */ + bool transient; + + /** + * When true, the string contains at least one control character - a + * character whose value ranges between 0x00 and 0x1F. 
+ */ + bool has_control_character; +}; + +ORCUS_PSR_DLLPUBLIC bool is_blank(char c); +ORCUS_PSR_DLLPUBLIC bool is_alpha(char c); +ORCUS_PSR_DLLPUBLIC bool is_numeric(char c); + +/** + * Check if the characater is one of allowed characters. Note that you can + * only specify up to 16 allowed characters. + * + * @param c character to check. + * @param allowed string containing all allowed characters. + * + * @return true if the character is one of the allowed characters, false + * otherwise. + */ +ORCUS_PSR_DLLPUBLIC bool is_in(char c, std::string_view allowed); + +/** + * Parse a sequence of characters into a double-precision numeric value. + * + * @param p pointer to the first character to start parsing from. + * @param p_end pointer to the first character not allowed to parse. + * @param value output parameter to assign the matched value to. + * + * @return pointer to the first non-matching character. + */ +ORCUS_PSR_DLLPUBLIC const char* parse_numeric(const char* p, const char* p_end, double& value); + +/** + * Parse a sequence of characters into an integer value. + * + * @param p pointer to the first character to start parsing from. + * @param p_end pointer to the first character not allowed to parse. + * @param value output parameter to assign the matched value to. + * + * @return pointer to the first non-matching character. + * + * @note Use of this function should be eventually replaced with + * std::from_chars() once it becomes available. + */ +ORCUS_PSR_DLLPUBLIC const char* parse_integer(const char* p, const char* p_end, long& value); + +/** + * Two single-quote characters ('') represent one single-quote character. + */ +ORCUS_PSR_DLLPUBLIC parse_quoted_string_state parse_single_quoted_string( + const char*& p, size_t max_length, cell_buffer& buffer); + +/** + * Starting from the opening single quote position, parse string all the way + * to the closing quote. Two single-quote characters ('') will be + * interpreted as encoded one single-quote character. 
+ * + * @param p it should point to the opening single quote character. + * @param max_length maximum length to parse. + * + * @return address of the character immediately after the closing quote, or + * nullptr in case no closing quote is found. + */ +ORCUS_PSR_DLLPUBLIC const char* parse_to_closing_single_quote( + const char* p, size_t max_length); + +ORCUS_PSR_DLLPUBLIC parse_quoted_string_state parse_double_quoted_string( + const char*& p, size_t max_length, cell_buffer& buffer); + +/** + * Starting from the opening double quote position, parse string all the way + * to the closing quote. Two single-quote characters ('') will be + * interpreted as encoded one single-quote character. + * + * @param p it should point to the opening single quote character. + * @param max_length maximum length to parse. + * + * @return address of the character immediately after the closing quote, or + * nullptr in case no closing quote is found. + */ +ORCUS_PSR_DLLPUBLIC const char* parse_to_closing_double_quote( + const char* p, size_t max_length); + +/** + * Given a character that occurs immediately after the escape character '\', + * return what type this character is. + * + * @param c character that occurs immediately after the escape character + * '\'. + * + * @return enum value representing the type of escape character. + */ +ORCUS_PSR_DLLPUBLIC string_escape_char_t get_string_escape_char_type(char c); + +ORCUS_PSR_DLLPUBLIC std::string_view trim(std::string_view str); + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/sax_ns_parser.hpp b/include/orcus/sax_ns_parser.hpp new file mode 100644 index 0000000..f888fa2 --- /dev/null +++ b/include/orcus/sax_ns_parser.hpp @@ -0,0 +1,374 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SAX_NS_PARSER_HPP +#define INCLUDED_ORCUS_SAX_NS_PARSER_HPP + +#include "sax_parser.hpp" +#include "xml_namespace.hpp" + +#include <unordered_set> +#include <vector> +#include <algorithm> + +namespace orcus { + +struct sax_ns_parser_element +{ + /** Element namespace identifier. */ + xmlns_id_t ns; + /** Element namespace alias. */ + std::string_view ns_alias; + /** Element name. */ + std::string_view name; + /** Position of the opening brace '<'. */ + std::ptrdiff_t begin_pos; + /** Position immediately after the closing brace '>'. */ + std::ptrdiff_t end_pos; +}; + +struct sax_ns_parser_attribute +{ + /** Attribute namespace identifier. */ + xmlns_id_t ns; + /** Attribute namespace alias. */ + std::string_view ns_alias; + /** Attribute name. */ + std::string_view name; + /** Attribute value. */ + std::string_view value; + /** Whether or not the attribute value is transient. 
/**
 * Pair of a namespace string and a local name, usable as a key in unordered
 * containers via the nested hash functor.
 *
 * Both members are string views; this struct does not own the character
 * data they refer to.
 */
struct entity_name
{
    std::string_view ns;
    std::string_view name;

    entity_name(std::string_view _ns, std::string_view _name) :
        ns(_ns), name(_name) {}

    /** Two names are equal when both the namespace and the name match. */
    bool operator== (const entity_name& other) const
    {
        return other.ns == ns && other.name == name;
    }

    /**
     * Hash functor for entity_name.
     *
     * The two component hashes are combined in an order-sensitive way.  A
     * plain sum would make (a, b) and (b, a) always hash to the same value.
     */
    struct hash
    {
        std::size_t operator() (const entity_name& v) const
        {
            std::hash<std::string_view> hasher;

            std::size_t seed = hasher(v.ns);
            // Widely used hash-combine step: golden-ratio constant plus
            // shifted copies of the running seed.
            seed ^= hasher(v.name)
                + static_cast<std::size_t>(0x9e3779b97f4a7c15ULL)
                + (seed << 6) + (seed >> 2);
            return seed;
        }
    };
};
+ */ + void end_element(const orcus::sax_ns_parser_element& elem) + { + (void)elem; + } + + /** + * Called when a segment of a text content is parsed. Each text content + * is a direct child of an element, which may have multiple child contents + * when the element also has a child element that are direct sibling to + * the text contents or the text contents are splitted by a comment. + * + * @param val value of the text content. + * @param transient when true, the text content has been converted and is + * stored in a temporary buffer due to presence of one or + * more encoded characters, in which case <em>the passed + * text value needs to be either immediately converted to + * a non-text value or be interned within the scope of + * the callback</em>. + */ + void characters(std::string_view val, bool transient) + { + (void)val; + (void)transient; + } + + /** + * Called upon parsing of an attribute of a declaration. The value of an + * attribute is assumed to be transient thus should be consumed within the + * scope of this callback. + * + * @param name name of an attribute. + * @param val value of an attribute. + * + * @todo Perhaps we should pass the transient flag here as well like all the + * other places. + */ + void attribute(std::string_view name, std::string_view val) + { + (void)name; + (void)val; + } + + /** + * Called upon parsing of an attribute of an element. Note that <em>when + * the attribute's transient flag is set, the attribute value is stored in + * a temporary buffer due to a presence of encoded characters, and must be + * processed within the scope of the callback</em>. + * + * @param attr struct containing attribute information. + */ + void attribute(const orcus::sax_ns_parser_attribute& attr) + { + (void)attr; + } +}; + +/** + * SAX based XML parser with extra namespace handling. + * + * It uses an instance of xmlns_context passed by the caller to validate and + * convert namespace values into identifiers. 
The namespace identifier of + * each encountered element is always given even if one is not explicitly + * given. + * + * This parser keeps track of element scopes and detects non-matching element + * pairs. + * + * @tparam HandlerT Handler type with member functions for event callbacks. + * Refer to @ref sax_ns_handler. + */ +template<typename HandlerT> +class sax_ns_parser +{ +public: + typedef HandlerT handler_type; + + sax_ns_parser(std::string_view content, xmlns_context& ns_cxt, handler_type& handler); + ~sax_ns_parser() = default; + + /** + * Start parsing the document. + * + * @exception orcus::malformed_xml_error when it encounters a + * non-matching closing element. + */ + void parse(); + +private: + /** + * Re-route callbacks from the internal sax_parser into sax_ns_parser + * callbacks. + */ + class handler_wrapper + { + sax::detail::elem_scopes_type m_scopes; + sax::detail::ns_keys_type m_ns_keys; + sax::detail::entity_names_type m_attrs; + + sax_ns_parser_element m_elem; + sax_ns_parser_attribute m_attr; + + xmlns_context& m_ns_cxt; + handler_type& m_handler; + + bool m_declaration; + + public: + handler_wrapper(xmlns_context& ns_cxt, handler_type& handler) : m_ns_cxt(ns_cxt), m_handler(handler), m_declaration(false) {} + + void doctype(const sax::doctype_declaration& dtd) + { + m_handler.doctype(dtd); + } + + void start_declaration(std::string_view name) + { + m_declaration = true; + m_handler.start_declaration(name); + } + + void end_declaration(std::string_view name) + { + m_declaration = false; + m_handler.end_declaration(name); + } + + void start_element(const sax::parser_element& elem) + { + m_scopes.emplace_back(); + sax::detail::elem_scope& scope = m_scopes.back(); + scope.ns = m_ns_cxt.get(elem.ns); + scope.name = elem.name; + scope.ns_keys.swap(m_ns_keys); + + m_elem.ns = scope.ns; + m_elem.ns_alias = elem.ns; + m_elem.name = scope.name; + m_elem.begin_pos = elem.begin_pos; + m_elem.end_pos = elem.end_pos; + m_handler.start_element(m_elem); 
+ + m_attrs.clear(); + } + + void end_element(const sax::parser_element& elem) + { + sax::detail::elem_scope& scope = m_scopes.back(); + if (scope.ns != m_ns_cxt.get(elem.ns) || scope.name != elem.name) + throw malformed_xml_error("mis-matching closing element.", -1); + + m_elem.ns = scope.ns; + m_elem.ns_alias = elem.ns; + m_elem.name = scope.name; + m_elem.begin_pos = elem.begin_pos; + m_elem.end_pos = elem.end_pos; + m_handler.end_element(m_elem); + + // Pop all namespaces declared in this scope. + for (const std::string_view& key : scope.ns_keys) + m_ns_cxt.pop(key); + + m_scopes.pop_back(); + } + + void characters(std::string_view val, bool transient) + { + m_handler.characters(val, transient); + } + + void attribute(const sax::parser_attribute& attr) + { + if (m_declaration) + { + // XML declaration attribute. Pass it through to the handler without namespace. + m_handler.attribute(attr.name, attr.value); + return; + } + + if (m_attrs.count(sax::detail::entity_name(attr.ns, attr.name)) > 0) + throw malformed_xml_error( + "You can't define two attributes of the same name in the same element.", -1); + + m_attrs.insert(sax::detail::entity_name(attr.ns, attr.name)); + + if (attr.ns.empty() && attr.name == "xmlns") + { + // Default namespace + m_ns_cxt.push(std::string_view{}, attr.value); + m_ns_keys.insert(std::string_view{}); + return; + } + + if (attr.ns == "xmlns") + { + // Namespace alias + if (!attr.name.empty()) + { + m_ns_cxt.push(attr.name, attr.value); + m_ns_keys.insert(attr.name); + } + return; + } + + m_attr.ns = attr.ns.empty() ? 
XMLNS_UNKNOWN_ID : m_ns_cxt.get(attr.ns); + m_attr.ns_alias = attr.ns; + m_attr.name = attr.name; + m_attr.value = attr.value; + m_attr.transient = attr.transient; + m_handler.attribute(m_attr); + } + }; + +private: + handler_wrapper m_wrapper; + sax_parser<handler_wrapper> m_parser; +}; + +template<typename HandlerT> +sax_ns_parser<HandlerT>::sax_ns_parser( + std::string_view content, xmlns_context& ns_cxt, handler_type& handler) : + m_wrapper(ns_cxt, handler), m_parser(content, m_wrapper) +{ +} + +template<typename HandlerT> +void sax_ns_parser<HandlerT>::parse() +{ + m_parser.parse(); +} + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/sax_parser.hpp b/include/orcus/sax_parser.hpp new file mode 100644 index 0000000..f7283d2 --- /dev/null +++ b/include/orcus/sax_parser.hpp @@ -0,0 +1,576 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SAX_PARSER_HPP +#define INCLUDED_ORCUS_SAX_PARSER_HPP + +#include "sax_parser_base.hpp" + +#include <string_view> + +namespace orcus { + +struct sax_parser_default_config +{ + /** + * An integer value representing a baseline XML version. A value of 10 + * corresponds with version 1.0 whereas a value of 11 corresponds with + * version 1.1. + */ + static constexpr uint8_t baseline_version = 10; +}; + +class sax_handler +{ +public: + /** + * Called when a doctype declaration <!DOCTYPE ... > is encountered. + * + * @param dtd struct containing doctype declaration data. + */ + void doctype(const orcus::sax::doctype_declaration& dtd) + { + (void)dtd; + } + + /** + * Called when <?... is encountered, where the '...' may be an + * arbitraray dentifier. 
One common declaration is <?xml which is + * typically given at the start of an XML stream. + * + * @param decl name of the identifier. + */ + void start_declaration(std::string_view decl) + { + (void)decl; + } + + /** + * Called when the closing tag (>) of a <?... ?> is encountered. + * + * @param decl name of the identifier. + */ + void end_declaration(std::string_view decl) + { + (void)decl; + } + + /** + * Called at the start of each element. + * + * @param elem information of the element being parsed. + */ + void start_element(const orcus::sax::parser_element& elem) + { + (void)elem; + } + + /** + * Called at the end of each element. + * + * @param elem information of the element being parsed. + */ + void end_element(const orcus::sax::parser_element& elem) + { + (void)elem; + } + + /** + * Called when a segment of a text content is parsed. Each text content + * is a direct child of an element, which may have multiple child contents + * when the element also has a child element that are direct sibling to + * the text contents or the text contents are splitted by a comment. + * + * @param val value of the text content. + * @param transient when true, the text content has been converted and is + * stored in a temporary buffer due to presence of one or + * more encoded characters, in which case <em>the passed + * text value needs to be either immediately converted to + * a non-text value or be interned within the scope of + * the callback</em>. + */ + void characters(std::string_view val, bool transient) + { + (void)val; (void)transient; + } + + /** + * Called upon parsing of an attribute of an element. Note that <em>when + * the attribute's transient flag is set, the attribute value is stored in + * a temporary buffer due to presence of one or more encoded characters, + * and must be processed within the scope of the callback</em>. + * + * @param attr struct containing attribute information. 
+ */ + void attribute(const orcus::sax::parser_attribute& attr) + { + (void)attr; + } +}; + +/** + * SAX parser for XML documents. + * + * This parser is barebone in that it only parses the document and picks up + * all encountered elements and attributes without checking proper element + * pairs. The user is responsible for checking whether or not the document is + * well-formed in terms of element scopes. + * + * This parser additionally records the begin and end offset positions of each + * element. + * + * @tparam HandlerT Handler type with member functions for event callbacks. + * Refer to @ref sax_handler. + * @tparam ConfigT Parser configuration. + */ +template<typename HandlerT, typename ConfigT = sax_parser_default_config> +class sax_parser : public sax::parser_base +{ +public: + typedef HandlerT handler_type; + typedef ConfigT config_type; + + sax_parser(std::string_view content, handler_type& handler); + ~sax_parser() = default; + + void parse(); + +private: + + /** + * Parse XML header that occurs at the beginning of every XML stream i.e. + * <?xml version="..." encoding="..." 
?> + */ + void header(); + void body(); + void element(); + void element_open(std::ptrdiff_t begin_pos); + void element_close(std::ptrdiff_t begin_pos); + void special_tag(); + void declaration(const char* name_check); + void cdata(); + void doctype(); + void characters(); + void attribute(); + +private: + handler_type& m_handler; +}; + +template<typename HandlerT, typename ConfigT> +sax_parser<HandlerT,ConfigT>::sax_parser(std::string_view content, handler_type& handler) : + sax::parser_base(content.data(), content.size()), + m_handler(handler) +{ +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::parse() +{ + m_nest_level = 0; + mp_char = mp_begin; + header(); + skip_space_and_control(); + body(); + + assert(m_buffer_pos == 0); +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::header() +{ + // we don't handle multi byte encodings so we can just skip bom entry if exists. + skip_bom(); + + // Allow leading whitespace in the XML stream. + // TODO : Make this configurable since strictly speaking such an XML + // sttream is invalid. + skip_space_and_control(); + + if (!has_char() || cur_char() != '<') + throw malformed_xml_error("xml file must begin with '<'.", offset()); + + if (config_type::baseline_version >= 11) + { + // XML version 1.1 requires a header declaration whereas in 1.0 it's + // optional. + if (next_char_checked() != '?') + throw malformed_xml_error("xml file must begin with '<?'.", offset()); + + declaration("xml"); + } +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::body() +{ + while (has_char()) + { + if (cur_char() == '<') + { + element(); + if (!m_root_elem_open) + // Root element closed. Stop parsing. + return; + } + else if (m_nest_level) + // Call characters only when in xml hierarchy. 
+ characters(); + else + next(); + } +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::element() +{ + assert(cur_char() == '<'); + std::ptrdiff_t pos = offset(); + char c = next_char_checked(); + switch (c) + { + case '/': + element_close(pos); + return; + case '!': + special_tag(); + return; + case '?': + declaration(nullptr); + return; + } + + element_open(pos); +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::element_open(std::ptrdiff_t begin_pos) +{ + sax::parser_element elem; + element_name(elem, begin_pos); + + while (true) + { + skip_space_and_control(); + char c = cur_char_checked(); + if (c == '/') + { + // Self-closing element: <element/> + if (next_and_char() != '>') + throw malformed_xml_error("expected '/>' to self-close the element.", offset()); + next(); + elem.end_pos = offset(); + m_handler.start_element(elem); + reset_buffer_pos(); + m_handler.end_element(elem); + if (!m_nest_level) + m_root_elem_open = false; +#if ORCUS_DEBUG_SAX_PARSER + cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl; +#endif + return; + } + else if (c == '>') + { + // End of opening element: <element> + next(); + elem.end_pos = offset(); + nest_up(); + m_handler.start_element(elem); + reset_buffer_pos(); +#if ORCUS_DEBUG_SAX_PARSER + cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl; +#endif + return; + } + else + attribute(); + } +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::element_close(std::ptrdiff_t begin_pos) +{ + assert(cur_char() == '/'); + nest_down(); + next_check(); + sax::parser_element elem; + element_name(elem, begin_pos); + + if (cur_char() != '>') + throw malformed_xml_error("expected '>' to close the element.", offset()); + next(); + elem.end_pos = offset(); + + m_handler.end_element(elem); +#if ORCUS_DEBUG_SAX_PARSER + cout << "element_close: ns='" << elem.ns 
<< "', name='" << elem.name << "'" << endl; +#endif + if (!m_nest_level) + m_root_elem_open = false; +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::special_tag() +{ + assert(cur_char() == '!'); + // This can be either <![CDATA, <!--, or <!DOCTYPE. + size_t len = available_size(); + if (len < 2) + throw malformed_xml_error("special tag too short.", offset()); + + switch (next_and_char()) + { + case '-': + { + // Possibly comment. + if (next_and_char() != '-') + throw malformed_xml_error("comment expected.", offset()); + + len -= 2; + if (len < 3) + throw malformed_xml_error("malformed comment.", offset()); + + next(); + comment(); + } + break; + case '[': + { + // Possibly a CDATA. + expects_next("CDATA[", 6); + if (has_char()) + cdata(); + } + break; + case 'D': + { + // check if this is a DOCTYPE. + expects_next("OCTYPE", 6); + skip_space_and_control(); + if (has_char()) + doctype(); + } + break; + default: + throw malformed_xml_error("failed to parse special tag.", offset()); + } +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::declaration(const char* name_check) +{ + assert(cur_char() == '?'); + next_check(); + + // Get the declaration name first. + std::string_view decl_name; + name(decl_name); +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl; +#endif + + if (name_check && decl_name != name_check) + { + std::ostringstream os; + os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead."; + throw malformed_xml_error(os.str(), offset()); + } + + m_handler.start_declaration(decl_name); + skip_space_and_control(); + + // Parse the attributes. 
+ while (cur_char_checked() != '?') + { + attribute(); + skip_space_and_control(); + } + if (next_char_checked() != '>') + throw malformed_xml_error("declaration must end with '?>'.", offset()); + + m_handler.end_declaration(decl_name); + reset_buffer_pos(); + next(); +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl; +#endif +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::cdata() +{ + size_t len = available_size(); + assert(len > 3); + + // Parse until we reach ']]>'. + const char* p0 = mp_char; + size_t i = 0, match = 0; + for (char c = cur_char(); i < len; ++i, c = next_and_char()) + { + if (c == ']') + { + // Be aware that we may encounter a series of more than two ']' + // characters, in which case we'll only count the last two. + + if (match == 0) + // First ']' + ++match; + else if (match == 1) + // Second ']' + ++match; + } + else if (c == '>' && match == 2) + { + // Found ']]>'. + size_t cdata_len = i - 2; + m_handler.characters(std::string_view(p0, cdata_len), false); + next(); + return; + } + else + match = 0; + } + throw malformed_xml_error("malformed CDATA section.", offset()); +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::doctype() +{ + // Parse the root element first. + sax::doctype_declaration param; + name(param.root_element); + skip_space_and_control(); + + // Either PUBLIC or SYSTEM. 
+ size_t len = available_size(); + if (len < 6) + throw malformed_xml_error("DOCTYPE section too short.", offset()); + + param.keyword = sax::doctype_declaration::keyword_type::dtd_private; + char c = cur_char(); + if (c == 'P') + { + if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C') + throw malformed_xml_error("malformed DOCTYPE section.", offset()); + + param.keyword = sax::doctype_declaration::keyword_type::dtd_public; + } + else if (c == 'S') + { + if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M') + throw malformed_xml_error("malformed DOCTYPE section.", offset()); + } + + next_check(); + skip_space_and_control(); + + // Parse FPI. + value(param.fpi, false); + + has_char_throw("DOCTYPE section too short."); + skip_space_and_control(); + has_char_throw("DOCTYPE section too short."); + + if (cur_char() == '>') + { + // Optional URI not given. Exit. +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl; +#endif + m_handler.doctype(param); + next(); + return; + } + + // Parse optional URI. 
+ value(param.uri, false); + + has_char_throw("DOCTYPE section too short."); + skip_space_and_control(); + has_char_throw("DOCTYPE section too short."); + + if (cur_char() != '>') + throw malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset()); + +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl; +#endif + m_handler.doctype(param); + next(); +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::characters() +{ + const char* p0 = mp_char; + for (; has_char(); next()) + { + if (cur_char() == '<') + break; + + if (cur_char() == '&') + { + // Text span with one or more encoded characters. Parse using cell buffer. + cell_buffer& buf = get_cell_buffer(); + buf.reset(); + buf.append(p0, mp_char-p0); + characters_with_encoded_char(buf); + if (buf.empty()) + m_handler.characters(std::string_view{}, false); + else + m_handler.characters(buf.str(), true); + return; + } + } + + if (mp_char > p0) + { + std::string_view val(p0, mp_char-p0); + m_handler.characters(val, false); + } +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::attribute() +{ + sax::parser_attribute attr; + attribute_name(attr.ns, attr.name); + +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl; +#endif + + skip_space_and_control(); + + char c = cur_char_checked(); + if (c != '=') + { + std::ostringstream os; + os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')"; + throw malformed_xml_error(os.str(), offset()); + } + + next_check(); // skip the '='. + skip_space_and_control(); + + attr.transient = value(attr.value, true); + if (attr.transient) + // Value is stored in a temporary buffer. Push a new buffer. 
+ inc_buffer_pos(); + +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::attribute: value='" << attr.value << "'" << endl; +#endif + + m_handler.attribute(attr); +} + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/sax_parser_base.hpp b/include/orcus/sax_parser_base.hpp new file mode 100644 index 0000000..4dcfc07 --- /dev/null +++ b/include/orcus/sax_parser_base.hpp @@ -0,0 +1,207 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SAX_PARSER_BASE_HPP +#define INCLUDED_ORCUS_SAX_PARSER_BASE_HPP + +#include "env.hpp" +#include "cell_buffer.hpp" +#include "parser_global.hpp" +#include "parser_base.hpp" + +#include <cassert> +#include <cstdlib> +#include <exception> +#include <sstream> +#include <memory> + +#define ORCUS_DEBUG_SAX_PARSER 0 + +#if ORCUS_DEBUG_SAX_PARSER +#include <iostream> +using std::cout; +using std::endl; +#endif + +namespace orcus { namespace sax { + +/** + * Document type declaration passed by sax_parser to its handler's doctype() + * call. + */ +struct doctype_declaration +{ + enum class keyword_type { dtd_public, dtd_private }; + + keyword_type keyword; + std::string_view root_element; + std::string_view fpi; + std::string_view uri; +}; + +/** + * Given an encoded name (such as 'quot' and 'amp'), return a single + * character that corresponds with the name. The name shouldn't include the + * leading '&' and trailing ';'. + * + * @param p pointer to the first character of encoded name + * @param n length of encoded name + * + * @return single character that corresponds with the encoded name. '\0' is + * returned if decoding fails. 
+ */ +ORCUS_PSR_DLLPUBLIC char decode_xml_encoded_char(const char* p, size_t n); + +/** + * Given an encoded unicode value (such as #20A9), return a UTF-8 string + * that corresponds with the unicode value. The value shouldn't include the + * leading '&' and trailing ';'. + * + * @param p pointer to the first character of encoded name + * @param n length of encoded name + * + * @return string that corresponds with the encoded value. An empty string + * is returned if decoding fails. + */ +ORCUS_PSR_DLLPUBLIC std::string decode_xml_unicode_char(const char* p, size_t n); + +/** + * Element properties passed by sax_parser to its handler's open_element() + * and close_element() calls. + */ +struct parser_element +{ + /** Optional element namespace. It may be empty if it's not given. */ + std::string_view ns; + /** Element name. */ + std::string_view name; + /** Position of the opening brace '<'. */ + std::ptrdiff_t begin_pos; + /** Position immediately after the closing brace '>'. */ + std::ptrdiff_t end_pos; +}; + +/** + * Attribute properties passed by sax_parser to its handler's attribute() + * call. When an attribute value is "transient", it has been converted due to + * presence of encoded character(s) and has been stored in a temporary buffer. + * The handler must assume that the value will not survive after the callback + * function ends. + */ +struct parser_attribute +{ + /** Optional attribute namespace. It may be empty if it's not given. */ + std::string_view ns; + /** Attribute name. */ + std::string_view name; + /** Attribute value. */ + std::string_view value; + /** Whether or not the attribute value is in a temporary buffer. 
*/ + bool transient; +}; + +class ORCUS_PSR_DLLPUBLIC parser_base : public ::orcus::parser_base +{ + struct impl; + std::unique_ptr<impl> mp_impl; + + parser_base() = delete; + parser_base(const parser_base&) = delete; + parser_base& operator=(const parser_base&) = delete; +protected: + size_t m_nest_level; + size_t m_buffer_pos; + bool m_root_elem_open:1; + +protected: + parser_base(const char* content, size_t size); + ~parser_base(); + + void next_check() + { + next(); + if (!has_char()) + throw malformed_xml_error("xml stream ended prematurely.", offset()); + } + + void nest_up() { ++m_nest_level; } + void nest_down() + { + if (m_nest_level == 0) + throw malformed_xml_error("incorrect nesting in xml stream", offset()); + + --m_nest_level; + } + + void inc_buffer_pos(); + void reset_buffer_pos() { m_buffer_pos = 0; } + + void has_char_throw(const char* msg) const + { + if (!has_char()) + throw malformed_xml_error(msg, offset()); + } + + char cur_char_checked() const + { + if (!has_char()) + throw malformed_xml_error("xml stream ended prematurely.", offset()); + + return *mp_char; + } + + char next_and_char() + { + next(); +#if ORCUS_DEBUG_SAX_PARSER + if (mp_char >= mp_end) + throw malformed_xml_error("xml stream ended prematurely.", offset()); +#endif + return *mp_char; + } + + char next_char_checked() + { + next(); + if (!has_char()) + throw malformed_xml_error("xml stream ended prematurely.", offset()); + + return *mp_char; + } + + cell_buffer& get_cell_buffer(); + + void comment(); + + void expects_next(const char* p, size_t n); + + void parse_encoded_char(cell_buffer& buf); + void value_with_encoded_char(cell_buffer& buf, std::string_view& str, char quote_char); + + /** + * Parse quoted value. Note that the retrieved string may be stored in a + * temporary cell buffer if the decode parameter is true. Use the string + * immediately after this call before the buffer becomes invalid. 
+ * + * @note This method checks for valid stream; the caller doesn't need to + * check for valid stream before calling this method. + * + * @return true if the value is stored in temporary buffer, false + * otherwise. + */ + bool value(std::string_view& str, bool decode); + + void name(std::string_view& str); + void element_name(parser_element& elem, std::ptrdiff_t begin_pos); + void attribute_name(std::string_view& attr_ns, std::string_view& attr_name); + void characters_with_encoded_char(cell_buffer& buf); +}; + +}} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/sax_token_parser.hpp b/include/orcus/sax_token_parser.hpp new file mode 100644 index 0000000..867c8b5 --- /dev/null +++ b/include/orcus/sax_token_parser.hpp @@ -0,0 +1,186 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SAX_TOKEN_PARSER_HPP +#define INCLUDED_ORCUS_SAX_TOKEN_PARSER_HPP + +#include "sax_ns_parser.hpp" +#include "types.hpp" + +#include <vector> +#include <algorithm> +#include <functional> + +namespace orcus { + +class tokens; + +class ORCUS_PSR_DLLPUBLIC sax_token_handler_wrapper_base +{ +protected: + xml_declaration_t m_declaration; + xml_token_element_t m_elem; + const tokens& m_tokens; + + xml_token_t tokenize(std::string_view name) const; + void set_element(const sax_ns_parser_element& elem); + +public: + sax_token_handler_wrapper_base(const tokens& _tokens); + + void attribute(std::string_view name, std::string_view val); + void attribute(const sax_ns_parser_attribute& attr); +}; + +class sax_token_handler +{ +public: + + /** + * Called immediately after the entire XML declaration has been parsed. + * + * @param decl struct containing the attributes of the XML declaration. 
+ */ + void declaration(const orcus::xml_declaration_t& decl) + { + (void)decl; + } + + /** + * Called at the start of each element. + * + * @param elem struct containing the element's information as well as all + * the attributes that belong to the element. + */ + void start_element(const orcus::xml_token_element_t& elem) + { + (void)elem; + } + + /** + * Called at the end of each element. + * + * @param elem struct containing the element's information as well as all + * the attributes that belong to the element. + */ + void end_element(const orcus::xml_token_element_t& elem) + { + (void)elem; + } + + /** + * Called when a segment of a text content is parsed. Each text content + * is a direct child of an element, which may have multiple child contents + * when the element also has a child element that are direct sibling to + * the text contents or the text contents are splitted by a comment. + * + * @param val value of the text content. + * @param transient when true, the text content has been converted and is + * stored in a temporary buffer due to presence of one or + * more encoded characters, in which case <em>the passed + * text value needs to be either immediately converted to + * a non-text value or be interned within the scope of + * the callback</em>. + */ + void characters(std::string_view val, bool transient) + { + (void)val; (void)transient; + } +}; + +/** + * SAX parser that tokenizes element and attribute names while parsing. All + * pre-defined elements and attribute names are translated into integral + * identifiers via use of @ref tokens. The user of this class needs to + * provide a pre-defined set of element and attribute names at construction + * time. + * + * This parser internally uses @ref sax_ns_parser. + * + * @tparam HandlerT Handler type with member functions for event callbacks. + * Refer to @ref sax_token_handler. 
+ */ +template<typename HandlerT> +class sax_token_parser +{ +public: + typedef HandlerT handler_type; + + sax_token_parser( + std::string_view content, const tokens& _tokens, + xmlns_context& ns_cxt, handler_type& handler); + + ~sax_token_parser() = default; + + void parse(); + +private: + + /** + * Re-route callbacks from the internal sax_ns_parser into the + * sax_token_parser callbacks. + */ + class handler_wrapper : public sax_token_handler_wrapper_base + { + handler_type& m_handler; + + public: + handler_wrapper(const tokens& _tokens, handler_type& handler) : + sax_token_handler_wrapper_base(_tokens), m_handler(handler) {} + + void doctype(const sax::doctype_declaration&) {} + + void start_declaration(std::string_view) {} + + void end_declaration(std::string_view) + { + m_handler.declaration(m_declaration); + m_elem.attrs.clear(); + } + + void start_element(const sax_ns_parser_element& elem) + { + set_element(elem); + m_handler.start_element(m_elem); + m_elem.attrs.clear(); + } + + void end_element(const sax_ns_parser_element& elem) + { + set_element(elem); + m_handler.end_element(m_elem); + } + + void characters(std::string_view val, bool transient) + { + m_handler.characters(val, transient); + } + }; + +private: + handler_wrapper m_wrapper; + sax_ns_parser<handler_wrapper> m_parser; +}; + +template<typename HandlerT> +sax_token_parser<HandlerT>::sax_token_parser( + std::string_view content, const tokens& _tokens, xmlns_context& ns_cxt, handler_type& handler) : + m_wrapper(_tokens, handler), + m_parser(content, ns_cxt, m_wrapper) +{ +} + +template<typename HandlerT> +void sax_token_parser<HandlerT>::parse() +{ + m_parser.parse(); +} + +} // namespace orcus + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/sax_token_parser_thread.hpp b/include/orcus/sax_token_parser_thread.hpp new file mode 100644 index 0000000..b364573 --- /dev/null +++ b/include/orcus/sax_token_parser_thread.hpp @@ -0,0 +1,92 @@ +/* -*- Mode: C++; 
tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SAX_TOKEN_PARSER_THREAD_HPP +#define INCLUDED_ORCUS_SAX_TOKEN_PARSER_THREAD_HPP + +#include "env.hpp" +#include "types.hpp" + +#include <memory> +#include <variant> +#include <vector> +#include <ostream> + +namespace orcus { + +class tokens; +class xmlns_context; +class string_pool; +struct xml_token_element_t; + +namespace sax { + +enum class parse_token_t +{ + unknown, + start_element, + end_element, + characters, + parse_error, +}; + +struct ORCUS_PSR_DLLPUBLIC parse_token +{ + using value_type = std::variant<std::string_view, parse_error_value_t, const xml_token_element_t*>; + + parse_token_t type; + value_type value; + + parse_token(); + parse_token(std::string_view _characters); + parse_token(parse_token_t _type, const xml_token_element_t* _element); + parse_token(std::string_view msg, std::ptrdiff_t offset); + + parse_token(const parse_token& other); + + parse_token& operator= (parse_token) = delete; + + bool operator== (const parse_token& other) const; + bool operator!= (const parse_token& other) const; +}; + +typedef std::vector<parse_token> parse_tokens_t; + +ORCUS_PSR_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const parse_tokens_t& tokens); + +class ORCUS_PSR_DLLPUBLIC parser_thread +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + parser_thread(const char* p, size_t n, const orcus::tokens& tks, xmlns_context& ns_cxt, size_t min_token_size); + parser_thread(const char* p, size_t n, const orcus::tokens& tks, xmlns_context& ns_cxt, size_t min_token_size, size_t max_token_size); + ~parser_thread(); + + void start(); + + /** + * Wait until new set of tokens becomes available. + * + * @param tokens new set of tokens. 
+ * + * @return true if the parsing is still in progress (therefore more tokens + * to come), false if it's done i.e. this is the last token set. + */ + bool next_tokens(parse_tokens_t& tokens); + + void swap_string_pool(string_pool& pool); + + void abort(); +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/Makefile.am b/include/orcus/spreadsheet/Makefile.am new file mode 100644 index 0000000..b01bce7 --- /dev/null +++ b/include/orcus/spreadsheet/Makefile.am @@ -0,0 +1,26 @@ + +liborcusdir = $(includedir)/liborcus-@ORCUS_API_VERSION@/orcus/spreadsheet +liborcus_HEADERS = \ + types.hpp \ + view_types.hpp \ + export_interface.hpp \ + import_interface.hpp \ + import_interface_pivot.hpp \ + import_interface_styles.hpp \ + import_interface_view.hpp + +if BUILD_SPREADSHEET_MODEL + +liborcus_HEADERS += \ + auto_filter.hpp \ + config.hpp \ + document.hpp \ + document_types.hpp \ + factory.hpp \ + pivot.hpp \ + shared_strings.hpp \ + sheet.hpp \ + styles.hpp \ + view.hpp + +endif diff --git a/include/orcus/spreadsheet/Makefile.in b/include/orcus/spreadsheet/Makefile.in new file mode 100644 index 0000000..2331067 --- /dev/null +++ b/include/orcus/spreadsheet/Makefile.in @@ -0,0 +1,680 @@ +# Makefile.in generated by automake 1.16.5 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2021 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. 
+ +@SET_MAKE@ + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = 
: +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +@BUILD_SPREADSHEET_MODEL_TRUE@am__append_1 = \ +@BUILD_SPREADSHEET_MODEL_TRUE@ auto_filter.hpp \ +@BUILD_SPREADSHEET_MODEL_TRUE@ config.hpp \ +@BUILD_SPREADSHEET_MODEL_TRUE@ document.hpp \ +@BUILD_SPREADSHEET_MODEL_TRUE@ document_types.hpp \ +@BUILD_SPREADSHEET_MODEL_TRUE@ factory.hpp \ +@BUILD_SPREADSHEET_MODEL_TRUE@ pivot.hpp \ +@BUILD_SPREADSHEET_MODEL_TRUE@ shared_strings.hpp \ +@BUILD_SPREADSHEET_MODEL_TRUE@ sheet.hpp \ +@BUILD_SPREADSHEET_MODEL_TRUE@ styles.hpp \ +@BUILD_SPREADSHEET_MODEL_TRUE@ view.hpp + +subdir = include/orcus/spreadsheet +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cxx_compile_stdcxx.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_17.m4 \ + $(top_srcdir)/m4/boost.m4 $(top_srcdir)/m4/libtool.m4 \ + $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \ + $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_valgrind_check.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__liborcus_HEADERS_DIST) \ + $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +SOURCES = +DIST_SOURCES = +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__liborcus_HEADERS_DIST = types.hpp view_types.hpp \ + export_interface.hpp import_interface.hpp \ + 
import_interface_pivot.hpp import_interface_styles.hpp \ + import_interface_view.hpp auto_filter.hpp config.hpp \ + document.hpp document_types.hpp factory.hpp pivot.hpp \ + shared_strings.hpp sheet.hpp styles.hpp view.hpp +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir = { \ + test -z "$$files" \ + || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ + || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ + $(am__cd) "$$dir" && rm -f $$files; }; \ + } +am__installdirs = "$(DESTDIR)$(liborcusdir)" +HEADERS = $(liborcus_HEADERS) +am__extra_recursive_targets = check-valgrind-recursive \ + check-valgrind-memcheck-recursive \ + check-valgrind-helgrind-recursive check-valgrind-drd-recursive \ + check-valgrind-sgcheck-recursive +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. 
+am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +am__DIST_COMMON = $(srcdir)/Makefile.in +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AS = @AS@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BOOST_CPPFLAGS = @BOOST_CPPFLAGS@ +BOOST_DATE_TIME_LDFLAGS = @BOOST_DATE_TIME_LDFLAGS@ +BOOST_DATE_TIME_LDPATH = @BOOST_DATE_TIME_LDPATH@ +BOOST_DATE_TIME_LIBS = @BOOST_DATE_TIME_LIBS@ +BOOST_FILESYSTEM_LDFLAGS = @BOOST_FILESYSTEM_LDFLAGS@ +BOOST_FILESYSTEM_LDPATH = @BOOST_FILESYSTEM_LDPATH@ +BOOST_FILESYSTEM_LIBS = @BOOST_FILESYSTEM_LIBS@ +BOOST_IOSTREAMS_LDFLAGS = @BOOST_IOSTREAMS_LDFLAGS@ +BOOST_IOSTREAMS_LDPATH = @BOOST_IOSTREAMS_LDPATH@ +BOOST_IOSTREAMS_LIBS = @BOOST_IOSTREAMS_LIBS@ +BOOST_LDPATH = @BOOST_LDPATH@ +BOOST_PROGRAM_OPTIONS_LDFLAGS = @BOOST_PROGRAM_OPTIONS_LDFLAGS@ +BOOST_PROGRAM_OPTIONS_LDPATH = @BOOST_PROGRAM_OPTIONS_LDPATH@ +BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@ +BOOST_ROOT = @BOOST_ROOT@ +BOOST_SYSTEM_LDFLAGS = @BOOST_SYSTEM_LDFLAGS@ +BOOST_SYSTEM_LDPATH = @BOOST_SYSTEM_LDPATH@ +BOOST_SYSTEM_LIBS = @BOOST_SYSTEM_LIBS@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CSCOPE = @CSCOPE@ +CTAGS = @CTAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ 
+DISTCHECK_CONFIGURE_FLAGS = @DISTCHECK_CONFIGURE_FLAGS@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +ENABLE_VALGRIND_drd = @ENABLE_VALGRIND_drd@ +ENABLE_VALGRIND_helgrind = @ENABLE_VALGRIND_helgrind@ +ENABLE_VALGRIND_memcheck = @ENABLE_VALGRIND_memcheck@ +ENABLE_VALGRIND_sgcheck = @ENABLE_VALGRIND_sgcheck@ +ETAGS = @ETAGS@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +HAVE_CXX17 = @HAVE_CXX17@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +IXION_REQUIRED_API_VERSION = @IXION_REQUIRED_API_VERSION@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBIXION_CFLAGS = @LIBIXION_CFLAGS@ +LIBIXION_LIBS = @LIBIXION_LIBS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MDDS_CFLAGS = @MDDS_CFLAGS@ +MDDS_LIBS = @MDDS_LIBS@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +ORCUS_API_VERSION = @ORCUS_API_VERSION@ +ORCUS_MAJOR_VERSION = @ORCUS_MAJOR_VERSION@ +ORCUS_MICRO_VERSION = @ORCUS_MICRO_VERSION@ +ORCUS_MINOR_VERSION = @ORCUS_MINOR_VERSION@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PARQUET_CFLAGS = @PARQUET_CFLAGS@ +PARQUET_LIBS = @PARQUET_LIBS@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +POW_LIB = @POW_LIB@ +PYTHON = @PYTHON@ +PYTHON_CFLAGS = @PYTHON_CFLAGS@ +PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@ +PYTHON_LIBS = @PYTHON_LIBS@ 
+PYTHON_PLATFORM = @PYTHON_PLATFORM@ +PYTHON_PREFIX = @PYTHON_PREFIX@ +PYTHON_VERSION = @PYTHON_VERSION@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VALGRIND = @VALGRIND@ +VALGRIND_ENABLED = @VALGRIND_ENABLED@ +VERSION = @VERSION@ +ZLIB_CFLAGS = @ZLIB_CFLAGS@ +ZLIB_LIBS = @ZLIB_LIBS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +pkgpyexecdir = @pkgpyexecdir@ +pkgpythondir = @pkgpythondir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +pyexecdir = @pyexecdir@ +pythondir = @pythondir@ +runstatedir = @runstatedir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +valgrind_enabled_tools = @valgrind_enabled_tools@ +valgrind_tools = @valgrind_tools@ +liborcusdir = 
$(includedir)/liborcus-@ORCUS_API_VERSION@/orcus/spreadsheet +liborcus_HEADERS = types.hpp view_types.hpp export_interface.hpp \ + import_interface.hpp import_interface_pivot.hpp \ + import_interface_styles.hpp import_interface_view.hpp \ + $(am__append_1) +all: all-am + +.SUFFIXES: +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign include/orcus/spreadsheet/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign include/orcus/spreadsheet/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs +install-liborcusHEADERS: $(liborcus_HEADERS) + @$(NORMAL_INSTALL) + @list='$(liborcus_HEADERS)'; test -n "$(liborcusdir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(liborcusdir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(liborcusdir)" || exit 1; \ + fi; \ + for p in $$list; do \ + if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ + echo "$$d$$p"; \ + done | $(am__base_list) 
| \ + while read files; do \ + echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(liborcusdir)'"; \ + $(INSTALL_HEADER) $$files "$(DESTDIR)$(liborcusdir)" || exit $$?; \ + done + +uninstall-liborcusHEADERS: + @$(NORMAL_UNINSTALL) + @list='$(liborcus_HEADERS)'; test -n "$(liborcusdir)" || list=; \ + files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ + dir='$(DESTDIR)$(liborcusdir)'; $(am__uninstall_files_from_dir) +check-valgrind-local: +check-valgrind-memcheck-local: +check-valgrind-helgrind-local: +check-valgrind-drd-local: +check-valgrind-sgcheck-local: + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + 
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(HEADERS) +installdirs: + for dir in "$(DESTDIR)$(liborcusdir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + 
+distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +check-valgrind: check-valgrind-am + +check-valgrind-am: check-valgrind-local + +check-valgrind-drd: check-valgrind-drd-am + +check-valgrind-drd-am: check-valgrind-drd-local + +check-valgrind-helgrind: check-valgrind-helgrind-am + +check-valgrind-helgrind-am: check-valgrind-helgrind-local + +check-valgrind-memcheck: check-valgrind-memcheck-am + +check-valgrind-memcheck-am: check-valgrind-memcheck-local + +check-valgrind-sgcheck: check-valgrind-sgcheck-am + +check-valgrind-sgcheck-am: check-valgrind-sgcheck-local + +clean: clean-am + +clean-am: clean-generic clean-libtool mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-generic distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: install-liborcusHEADERS + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-generic mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-liborcusHEADERS + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am check-valgrind-am \ + check-valgrind-drd-am check-valgrind-drd-local \ + check-valgrind-helgrind-am check-valgrind-helgrind-local \ 
+ check-valgrind-local check-valgrind-memcheck-am \ + check-valgrind-memcheck-local check-valgrind-sgcheck-am \ + check-valgrind-sgcheck-local clean clean-generic clean-libtool \ + cscopelist-am ctags ctags-am distclean distclean-generic \ + distclean-libtool distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-liborcusHEADERS install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am uninstall-liborcusHEADERS + +.PRECIOUS: Makefile + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/include/orcus/spreadsheet/auto_filter.hpp b/include/orcus/spreadsheet/auto_filter.hpp new file mode 100644 index 0000000..b6f2959 --- /dev/null +++ b/include/orcus/spreadsheet/auto_filter.hpp @@ -0,0 +1,149 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_AUTO_FILTER_HPP +#define INCLUDED_ORCUS_SPREADSHEET_AUTO_FILTER_HPP + +#include "types.hpp" +#include "../env.hpp" + +#include <map> +#include <unordered_set> + +#include <ixion/address.hpp> + +namespace orcus { namespace spreadsheet { + +/** + * Data for a single column inside autofilter range. 
+ */ +struct ORCUS_SPM_DLLPUBLIC auto_filter_column_t +{ + using match_values_type = std::unordered_set<std::string_view>; + match_values_type match_values; + + auto_filter_column_t(); + auto_filter_column_t(const auto_filter_column_t& other); + auto_filter_column_t(auto_filter_column_t&& other); + ~auto_filter_column_t(); + + auto_filter_column_t& operator=(const auto_filter_column_t& other); + auto_filter_column_t& operator=(auto_filter_column_t&& other); + + void reset(); + void swap(auto_filter_column_t& r); +}; + +/** + * Data for a single autofilter entry. An autofilter can belong to either a + * sheet or a table. + */ +struct ORCUS_SPM_DLLPUBLIC auto_filter_t +{ + typedef std::map<col_t, auto_filter_column_t> columns_type; + + ixion::abs_range_t range; + + columns_type columns; + + auto_filter_t(); + auto_filter_t(const auto_filter_t& other); + auto_filter_t(auto_filter_t&& other); + ~auto_filter_t(); + + auto_filter_t& operator=(const auto_filter_t& other); + auto_filter_t& operator=(auto_filter_t&& other); + + void reset(); + void swap(auto_filter_t& r); + + /** + * Set column data to specified column index. + * + * @param col column index to associate the data to. + * @param data column data. + */ + void commit_column(col_t col, auto_filter_column_t data); +}; + +/** + * Single column entry in table. + */ +struct ORCUS_SPM_DLLPUBLIC table_column_t +{ + std::size_t identifier; + std::string_view name; + std::string_view totals_row_label; + totals_row_function_t totals_row_function; + + table_column_t(); + table_column_t(const table_column_t& other); + ~table_column_t(); + + table_column_t& operator=(const table_column_t& other); + + void reset(); +}; + +/** + * Table style information. 
+ */ +struct ORCUS_SPM_DLLPUBLIC table_style_t +{ + std::string_view name; + + bool show_first_column:1; + bool show_last_column:1; + bool show_row_stripes:1; + bool show_column_stripes:1; + + table_style_t(); + table_style_t(const table_style_t& other); + ~table_style_t(); + + table_style_t& operator=(const table_style_t& other); + + void reset(); +}; + +/** + * Single table entry. A table is a range in a spreadsheet that represents + * a single set of data that can be used as a data source. + */ +struct ORCUS_SPM_DLLPUBLIC table_t +{ + typedef std::vector<table_column_t> columns_type; + + size_t identifier; + + std::string_view name; + std::string_view display_name; + + ixion::abs_range_t range; + + size_t totals_row_count; + + auto_filter_t filter; + columns_type columns; + table_style_t style; + + table_t(); + table_t(const table_t& other); + table_t(table_t&& other); + ~table_t(); + + table_t& operator=(const table_t& other); + table_t& operator=(table_t&& other); + + void reset(); +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/config.hpp b/include/orcus/spreadsheet/config.hpp new file mode 100644 index 0000000..11eebfc --- /dev/null +++ b/include/orcus/spreadsheet/config.hpp @@ -0,0 +1,37 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_CONFIG_HPP +#define INCLUDED_ORCUS_SPREADSHEET_CONFIG_HPP + +#include "orcus/env.hpp" + +#include <cstdint> + +namespace orcus { namespace spreadsheet { + +struct ORCUS_SPM_DLLPUBLIC document_config +{ + /** + * Precision to use when converting numeric values to their string + * representations. A negative value indicates the precision is not being + * specified. 
+ */ + int8_t output_precision; + + document_config(); + document_config(const document_config& r); + ~document_config(); + + document_config& operator= (const document_config& r); +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/document.hpp b/include/orcus/spreadsheet/document.hpp new file mode 100644 index 0000000..4f20b6e --- /dev/null +++ b/include/orcus/spreadsheet/document.hpp @@ -0,0 +1,166 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_DOCUMENT_HPP +#define INCLUDED_ORCUS_SPREADSHEET_DOCUMENT_HPP + +#include "orcus/env.hpp" +#include "orcus/interface.hpp" +#include "orcus/spreadsheet/types.hpp" + +#include <ostream> +#include <memory> + +namespace ixion { + +class formula_name_resolver; +class model_context; +struct abs_address_t; + +} + +namespace orcus { + +class string_pool; +struct date_time_t; + +namespace spreadsheet { + +class shared_strings; +class styles; +class pivot_collection; +class sheet; +class import_factory; + +struct document_config; +struct table_t; + +namespace detail { + +struct document_impl; + +} + +/** + * Store spreadsheet document content. It uses the @p model_context class + * from the ixion library to store raw cell values required in the computation + * of formula expressions. + */ +class ORCUS_SPM_DLLPUBLIC document : public orcus::iface::document_dumper +{ + friend class sheet; + friend class import_factory; + +public: + document(const document&) = delete; + document& operator= (const document&) = delete; + + document(const range_size_t& sheet_size); + ~document(); + + /** See @ref iface::document_dumper. 
*/ + virtual void dump(dump_format_t format, const std::string& output) const override; + + /** See @ref iface::document_dumper. */ + virtual void dump_check(std::ostream& os) const override; + + shared_strings& get_shared_strings(); + const shared_strings& get_shared_strings() const; + + styles& get_styles(); + const styles& get_styles() const; + + pivot_collection& get_pivot_collection(); + const pivot_collection& get_pivot_collection() const; + + sheet* append_sheet(std::string_view sheet_name); + sheet* get_sheet(std::string_view sheet_name); + const sheet* get_sheet(std::string_view sheet_name) const; + sheet* get_sheet(sheet_t sheet_pos); + const sheet* get_sheet(sheet_t sheet_pos) const; + + /** + * Clear document content, to make it empty. + */ + void clear(); + + /** + * Calculate those formula cells that have been newly inserted and have + * not yet been calculated. + */ + void recalc_formula_cells(); + + sheet_t get_sheet_index(std::string_view name) const; + std::string_view get_sheet_name(sheet_t sheet_pos) const; + + /** + * Set a new name to a sheet. + * + * @param sheet_pos 0-based position of a sheet. + * @param name New name to set to a sheet. 
+ */ + void set_sheet_name(sheet_t sheet_pos, std::string name); + + range_size_t get_sheet_size() const; + void set_sheet_size(const range_size_t& sheet_size); + size_t get_sheet_count() const; + + void set_origin_date(int year, int month, int day); + date_time_t get_origin_date() const; + + void set_formula_grammar(formula_grammar_t grammar); + formula_grammar_t get_formula_grammar() const; + + const ixion::formula_name_resolver* get_formula_name_resolver(formula_ref_context_t cxt) const; + + ixion::model_context& get_model_context(); + const ixion::model_context& get_model_context() const; + + const document_config& get_config() const; + void set_config(const document_config& cfg); + + string_pool& get_string_pool(); + const string_pool& get_string_pool() const; + + /** + * Insert a new table object into the document. The document will take + * ownership of the inserted object after the call. The object will get + * inserted only when there is no pre-existing table object of the same + * name. The object not being inserted will be deleted. + * + * @param p table object to insert. + */ + void insert_table(table_t* p); + + /** + * Get a structure containing properties of a named table. + * + * @param name Name of the table. + * + * @return Pointer to the structure containing the properties of a named + * table, or @p nullptr if no such table exists for the given name. 
+ */ + const table_t* get_table(std::string_view name) const; + +private: + void dump_flat(const std::string& outdir) const; + void dump_html(const ::std::string& outdir) const; + void dump_json(const ::std::string& outdir) const; + void dump_csv(const std::string& outdir) const; + void dump_debug_state(const std::string& outdir) const; + + void finalize_import(); + void insert_dirty_cell(const ixion::abs_address_t& pos); + +private: + std::unique_ptr<detail::document_impl> mp_impl; +}; + +}} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/document_types.hpp b/include/orcus/spreadsheet/document_types.hpp new file mode 100644 index 0000000..b1a864f --- /dev/null +++ b/include/orcus/spreadsheet/document_types.hpp @@ -0,0 +1,77 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include "types.hpp" +#include <vector> + +namespace orcus { namespace spreadsheet { + +/** + * Stores a color value in ARGB format. Each color component ranges from 0 to + * 255 (8-bit). + */ +struct ORCUS_SPM_DLLPUBLIC color_t +{ + color_elem_t alpha; + color_elem_t red; + color_elem_t green; + color_elem_t blue; + + color_t(); + color_t(color_elem_t _red, color_elem_t _green, color_elem_t _blue); + color_t(color_elem_t _alpha, color_elem_t _red, color_elem_t _green, color_elem_t _blue); + + void reset(); + + bool operator==(const color_t& other) const; + bool operator!=(const color_t& other) const; +}; + +/** + * Contains formatting properties of a section of a string. This is used in + * the storage of rich-text strings. + */ +struct ORCUS_SPM_DLLPUBLIC format_run +{ + /** Position of the section where the formatting starts. */ + std::size_t pos; + /** Length of the section. 
*/ + std::size_t size; + /** Name of the font. */ + std::string_view font; + /** Size of the font. */ + double font_size; + /** Color of the section. */ + color_t color; + /** Whether or not the font is bold. */ + bool bold:1; + /** Whether or not the font is italic. */ + bool italic:1; + + format_run(); + + /** + * Reset the properties to unformatted state. + */ + void reset(); + + /** + * Query whether or not the section contains non-default format properties. + * + * @return @p true if it's formatted, otherwise @p false. + */ + bool formatted() const; +}; + +/** Collection of format properties of a string. */ +using format_runs_t = std::vector<format_run>; + +}} // namespace orcus::spreadsheet + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/export_interface.hpp b/include/orcus/spreadsheet/export_interface.hpp new file mode 100644 index 0000000..3c3104d --- /dev/null +++ b/include/orcus/spreadsheet/export_interface.hpp @@ -0,0 +1,60 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_EXPORT_INTERFACE_HPP +#define INCLUDED_ORCUS_SPREADSHEET_EXPORT_INTERFACE_HPP + +#include "types.hpp" +#include "../env.hpp" + +#include <ostream> + +namespace orcus { namespace spreadsheet { namespace iface { + +/** + * Interface for exporting sheet contents. + */ +class export_sheet +{ +public: + ORCUS_DLLPUBLIC virtual ~export_sheet() = 0; + + /** + * Write the content of a cell to an output stream. + * + * @param os output stream to write the cell content to. + * @param row 0-based row position of a cell. + * @param col 0-based column position of a cell. 
+ */ + virtual void write_string(std::ostream& os, orcus::spreadsheet::row_t row, orcus::spreadsheet::col_t col) const = 0; +}; + +/** + * Entry-point interface for exporting document contents. + */ +class export_factory +{ +public: + ORCUS_DLLPUBLIC virtual ~export_factory() = 0; + + /** + * Obtain an interface for exporting sheet content. + * + * @param sheet_name name of the sheet to export. + * + * @return pointer to an interface for exporting sheet content. + */ + virtual const export_sheet* get_sheet(std::string_view sheet_name) const = 0; +}; + +}}} + + + + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/factory.hpp b/include/orcus/spreadsheet/factory.hpp new file mode 100644 index 0000000..e1423fa --- /dev/null +++ b/include/orcus/spreadsheet/factory.hpp @@ -0,0 +1,143 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_IMPORT_FACTORY_HPP +#define INCLUDED_ORCUS_SPREADSHEET_IMPORT_FACTORY_HPP + +#include <orcus/spreadsheet/import_interface.hpp> +#include <orcus/spreadsheet/import_interface_styles.hpp> +#include <orcus/spreadsheet/export_interface.hpp> +#include <orcus/env.hpp> + +#include <memory> + +namespace orcus { + +class string_pool; + +namespace spreadsheet { + +class document; +class view; +class styles; + +struct ORCUS_SPM_DLLPUBLIC import_factory_config +{ + /** + * When the font cache is enabled, the import factory checks each incoming + * font entry against the pool of existing font entries and inserts it only + * when an equal entry doesn't already exist in the pool. + * + * @note It should not be enabled for a file format that already has + * font entries normalized, such as xlsx. 
+ */ + bool enable_font_cache = true; + + import_factory_config(); + import_factory_config(const import_factory_config& other); + ~import_factory_config(); + + import_factory_config& operator=(const import_factory_config& other); +}; + +/** + * Wraps @ref document and @ref view stores. This is to be used by the import + * filter to populate the document and view stores. + */ +class ORCUS_SPM_DLLPUBLIC import_factory : public iface::import_factory +{ + struct impl; + std::unique_ptr<impl> mp_impl; +public: + import_factory(document& doc); + import_factory(document& doc, view& view_store); + virtual ~import_factory(); + + virtual iface::import_global_settings* get_global_settings() override; + virtual iface::import_shared_strings* get_shared_strings() override; + virtual iface::import_styles* get_styles() override; + virtual iface::import_named_expression* get_named_expression() override; + virtual iface::import_reference_resolver* get_reference_resolver(formula_ref_context_t cxt) override; + virtual iface::import_pivot_cache_definition* create_pivot_cache_definition( + orcus::spreadsheet::pivot_cache_id_t cache_id) override; + virtual iface::import_pivot_cache_records* create_pivot_cache_records( + orcus::spreadsheet::pivot_cache_id_t cache_id) override; + virtual iface::import_sheet* append_sheet(sheet_t sheet_index, std::string_view name) override; + virtual iface::import_sheet* get_sheet(std::string_view name) override; + virtual iface::import_sheet* get_sheet(sheet_t sheet_index) override; + virtual void finalize() override; + + void set_config(const import_factory_config& config); + + void set_default_row_size(row_t row_size); + void set_default_column_size(col_t col_size); + + void set_character_set(character_set_t charset); + character_set_t get_character_set() const; + + /** + * When setting this flag to true, those formula cells with no cached + * results will be re-calculated upon loading. + * + * + * @param b value of this flag. 
+ */ + void set_recalc_formula_cells(bool b); + + void set_formula_error_policy(formula_error_policy_t policy); +}; + +/** + * Wraps @ref styles store. This is to be used by an import styles parser to + * populate the styles store. + */ +class ORCUS_SPM_DLLPUBLIC import_styles : public iface::import_styles +{ + struct impl; + std::unique_ptr<impl> mp_impl; +public: + import_styles(styles& styles_store, string_pool& sp); + import_styles(std::shared_ptr<import_factory_config> config, styles& styles_store, string_pool& sp); + virtual ~import_styles() override; + + virtual iface::import_font_style* start_font_style() override; + virtual iface::import_fill_style* start_fill_style() override; + virtual iface::import_border_style* start_border_style() override; + virtual iface::import_cell_protection* start_cell_protection() override; + virtual iface::import_number_format* start_number_format() override; + virtual iface::import_xf* start_xf(xf_category_t cat) override; + virtual iface::import_cell_style* start_cell_style() override; + + virtual void set_font_count(size_t n) override; + virtual void set_fill_count(size_t n) override; + virtual void set_border_count(size_t n) override; + virtual void set_number_format_count(size_t n) override; + virtual void set_xf_count(xf_category_t cat, size_t n) override; + virtual void set_cell_style_count(size_t n) override; +}; + +/** + * Wraps @ref document store and facilitates export of its content. + * + * @warning It currently provides very limited functionality especially when + * compared to that of the @ref import_factory. 
+ */ +class ORCUS_SPM_DLLPUBLIC export_factory : public iface::export_factory +{ + struct impl; + std::unique_ptr<impl> mp_impl; +public: + export_factory(const document& doc); + virtual ~export_factory(); + + virtual const iface::export_sheet* get_sheet(std::string_view sheet_name) const override; +}; + +}} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/import_interface.hpp b/include/orcus/spreadsheet/import_interface.hpp new file mode 100644 index 0000000..2ba80a7 --- /dev/null +++ b/include/orcus/spreadsheet/import_interface.hpp @@ -0,0 +1,1332 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef ORCUS_SPREADSHEET_IMPORT_INTERFACE_HPP +#define ORCUS_SPREADSHEET_IMPORT_INTERFACE_HPP + +#include <cstdlib> + +#include "types.hpp" +#include "../types.hpp" +#include "../env.hpp" + +// NB: This header must not depend on ixion, as it needs to be usable for +// those clients that provide their own formula engine. Other headers in +// the orcus::spreadsheet namespace may depend on ixion. + +namespace orcus { namespace spreadsheet { namespace iface { + +class import_styles; +class import_pivot_cache_definition; +class import_pivot_cache_records; +class import_sheet_view; + +/** + * Interface for importing raw string values shared in string cells. String + * values may be either with or without formatted segments. + * + * To insert an unformatted string, simply use either append() or add() + * method. The string will then be immediately pushed to the pool. 
+ * + * To insert a string with mixed formatted segments, you need to first use one + * or more of: + * + * @li set_segment_font() + * @li set_segment_bold() + * @li set_segment_italic() + * @li set_segment_font_name() + * @li set_segment_font_size() + * @li set_segment_font_color() + * + * to define the format attribute(s) of a string segment followed by a call to + * append_segment(). This may be repeated as many times as necessary. Then + * as the final step, call commit_segments() to insert the entire series of + * formatted segments to the pool as a single string entry. The following + * example demonstrates what the code may look like: + * + * @code{.cpp} + * import_shared_strings* iface = ...; + * + * // store a segment with specific font, size and boldness. + * iface->set_segment_font_name("FreeMono"); + * iface->set_segment_font_size(14); + * iface->set_segment_bold(true); + * iface->append_segment("a bold and big segment"); + * + * // store an unformatted segment. + * iface->append_segment(" followed by "); + * + * // store a segment with smaller, italic font. + * iface->set_segment_font_size(7); + * iface->set_segment_italic(true); + * iface->append_segment("a small and italic segment"); + * + * iface->commit_segments(); // commit the whole formatted string to the pool. + * @endcode + */ +class ORCUS_DLLPUBLIC import_shared_strings +{ +public: + virtual ~import_shared_strings(); + + /** + * Append a new string to the sequence of strings. Order of insertion + * determines the numerical ID value of an inserted string. Note that this + * method assumes that the caller knows the string being appended is not yet + * in the pool; it does not check on duplicated strings. + * + * @param s string to append to the pool. + * + * @return ID of the inserted string. 
+ */ + virtual size_t append(std::string_view s) = 0; + + /** + * Similar to the append() method, it adds a new string to the string pool; + * however, this method checks if the string being added is already in the + * pool before each insertion, to avoid duplicated strings. + * + * @param s string to add to the pool. + * + * @return ID of the inserted string. + */ + virtual size_t add(std::string_view s) = 0; + + /** + * Set the index of a font to apply to the current format attributes. Refer + * to the import_font_style interface on how to obtain a font index. Note + * that a single font index is associated with multiple font-related + * formatting attributes, such as font name, font color, boldness and + * italics. + * + * @param font_index positive integer representing the font to use. + */ + virtual void set_segment_font(size_t font_index) = 0; + + /** + * Set whether or not to make the current segment bold. + * + * @param b true if it's bold, false otherwise. + */ + virtual void set_segment_bold(bool b) = 0; + + /** + * Set whether or not to make the current segment italic. + * + * @param b true if it's italic, false otherwise. + */ + virtual void set_segment_italic(bool b) = 0; + + /** + * Set the name of a font to the current segment. + * + * @param s font name. + */ + virtual void set_segment_font_name(std::string_view s) = 0; + + /** + * Set a font size to the current segment. + * + * @param point font size in points. + */ + virtual void set_segment_font_size(double point) = 0; + + /** + * Set the color of a font in ARGB format to the current segment. + * + * @param alpha alpha component value (0-255). + * @param red red component value (0-255). + * @param green green component value (0-255). + * @param blue blue component value (0-255). + */ + virtual void set_segment_font_color(color_elem_t alpha, color_elem_t red, color_elem_t green, color_elem_t blue) = 0; + + /** + * Push the current string segment to the buffer. 
Any formatting attributes + * defined so far will be applied to this segment. + * + * @param s string value for the segment. + */ + virtual void append_segment(std::string_view s) = 0; + + /** + * Store the entire formatted string in the current buffer to the shared + * strings pool. The implementor may choose to unconditionally append the + * string to the pool, or choose to find an existing duplicate and reuse + * it instead. + * + * @return ID of the string just inserted, or the ID of an existing string + * with identical formatting. + */ + virtual size_t commit_segments() = 0; +}; + +/** + * Interface for importing sheet properties. Sheet properties include: + * + * @li column widths and row heights, + * @li hidden flags for columns and rows, and + * @li merged cell ranges. + * + * These properties are independent of the cell contents of a sheet. + */ +class ORCUS_DLLPUBLIC import_sheet_properties +{ +public: + virtual ~import_sheet_properties(); + + /** + * Set a column width to one or more columns. + * + * @param col 0-based position of the first column. + * @param col_span number of contiguous columns to apply the width to. + * @param width column width to apply. + * @param unit unit of measurement to use for the width value. + */ + virtual void set_column_width(col_t col, col_t col_span, double width, orcus::length_unit_t unit) = 0; + + /** + * Set a column hidden flag to one or more columns. + * + * @param col 0-based position of the first column. + * @param col_span number of contiguous columns to apply the flag to. + * @param hidden flag indicating whether or not the columns are hidden. + */ + virtual void set_column_hidden(col_t col, col_t col_span, bool hidden) = 0; + + /** + * Set a row height to specified row. + * + * @param row 0-based position of a row. + * @param height new row height value to set. + * @param unit unit of the new row height value. + * + * @todo Convert this to take a raw span. 
+ */ + virtual void set_row_height(row_t row, double height, orcus::length_unit_t unit) = 0; + + /** + * Set a row hidden flag to a specified row. + * + * @param row 0-based position of a row. + * @param hidden flag indicating whether or not the row is hidden. + * + * @todo Convert this to take a raw span. + */ + virtual void set_row_hidden(row_t row, bool hidden) = 0; + + /** + * Set a merged cell range. + * + * @param range structure containing the top-left and bottom-right + * positions of a merged cell range. + */ + virtual void set_merge_cell_range(const range_t& range) = 0; +}; + +/** + * Interface for importing named expressions or ranges. + * + * This interface has two different methods for defining named expressions: + * + * @li set_named_expression() and + * @li set_named_range(). + * + * Generally speaking, set_named_expression() can be used to define both named + * expression and named range. However, the implementor may choose to apply a + * different syntax rule to parse an expression passed to set_named_range(), + * depending on the formula grammar defined via @ref + * import_global_settings::set_default_formula_grammar(). For instance, the + * OpenDocument Spreadsheet format is known to use different syntax rules + * between named expressions and named ranges. + * + * A named range is a special case of a named expression where the expression + * consists of only one single cell range token. + * + * Here is a code example of how a named expression is defined: + * + * @code{.cpp} + * import_named_expression* iface = ...; + * + * // set the A1 on the first sheet as its origin (optional). + * src_address_t origin{0, 0, 0}; + * iface->set_base_position(origin); + * iface->set_named_expression("MyExpression", "SUM(A1:B10)+SUM(D1:D4)"); + * iface->commit(); + * @endcode + * + * Replace the above set_named_expression() call with set_named_range() if you + * wish to define a named range instead. 
+ */ +class ORCUS_DLLPUBLIC import_named_expression +{ +public: + virtual ~import_named_expression(); + + /** + * Specify an optional base position, or origin, from which to evaluate a + * named expression. If not specified, the implementor should use the + * top-left corner cell on the first sheet as its origin. + * + * @param pos cell position to be used as the origin. + */ + virtual void set_base_position(const src_address_t& pos) = 0; + + /** + * Set a named expression to the buffer. + * + * @param name name of the expression to be defined. + * @param expression expression to be associated with the name. + */ + virtual void set_named_expression(std::string_view name, std::string_view expression) = 0; + + /** + * Set a named range to the buffer. + * + * @param name name of the expression to be defined. + * @param range range to be associated with the name. + */ + virtual void set_named_range(std::string_view name, std::string_view range) = 0; + + /** + * Commit the named expression or range currently in the buffer to the + * document. + */ + virtual void commit() = 0; +}; + +/** + * Interface for importing data tables. + */ +class ORCUS_DLLPUBLIC import_data_table +{ +public: + virtual ~import_data_table(); + + /** + * Set the type of a data table. A data table can either: + * + * @li be a single-variable column-oriented, + * @li be a single-variable row-oriented, or + * @li use two variables that use both column and row. + * + * @param type type of a data table. + */ + virtual void set_type(data_table_type_t type) = 0; + + /** + * Set the range of a data table. + * + * @param range range of a data table. + */ + virtual void set_range(const range_t& range) = 0; + + /** + * Set the reference of the first input cell. + * + * @param ref reference of the first input cell. + * @param deleted whether or not this input cell has been deleted. 
+ */ + virtual void set_first_reference(std::string_view ref, bool deleted) = 0; + + /** + * Set the reference of the second input cell but only if the data table + * uses two variables. + * + * @note This method gets called only if the data table uses two variables. + * + * @param ref reference of the second input cell. + * @param deleted whether or not this input cell has been deleted. + */ + virtual void set_second_reference(std::string_view ref, bool deleted) = 0; + + /** + * Store the current data table data in the buffer to the backend sheet + * storage. + */ + virtual void commit() = 0; +}; + +/** + * Interface for importing auto filters. + * + * Importing a single auto filter would roughly follow the following flow: + * + * @code{.cpp} + * import_auto_filter* iface = ... ; + * + * range_t range; + * range.first.column = 0; + * range.first.row = 0; + * range.last.column = 3; + * range.last.row = 1000; + * iface->set_range(range); // Auto filter is applied for A1:D1001. + * + * // Column A is filtered for a value of "A". + * iface->set_column(0); + * iface->append_column_match_value("A"); + * iface->commit_column(); + * + * // Column D is filtered for values of 1 and 4. + * iface->set_column(3); + * iface->append_column_match_value("1"); + * iface->append_column_match_value("4"); + * iface->commit_column(); + * + * // Push the autofilter data in the current buffer to the sheet store. + * iface->commit(); + * @endcode + */ +class ORCUS_DLLPUBLIC import_auto_filter +{ +public: + virtual ~import_auto_filter(); + + /** + * Specify the range where the auto filter is applied. + * + * @param range structure containing the top-left and bottom-right + * positions of the auto filter range. + */ + virtual void set_range(const range_t& range) = 0; + + /** + * Specify the column position of a filter. The position is relative to + * the first column in the auto filter range. This method gets called at + * the beginning of each column filter data. 
The implementor may initialize + * the column filter data buffer when this method is called. + * + * @note This column position is relative to the first column in the + * autofilter range. + * + * @param col 0-based column position of a filter relative to the first + * column of the auto filter range. + */ + virtual void set_column(col_t col) = 0; + + /** + * Append a match value to the current column filter. A single column + * filter may have one or more match values. + * + * @param value match value to append to the current column filter. + */ + virtual void append_column_match_value(std::string_view value) = 0; + + /** + * Commit the current column filter data to the current auto filter buffer. + * The implementor may clear the current column filter buffer after this + * call. + */ + virtual void commit_column() = 0; + + /** + * Commit current auto filter data stored in the buffer to the sheet store. + */ + virtual void commit() = 0; +}; + +/** + * This is an optional interface to import conditional formatting. + * + * In general, a single conditional format consists of: + * + * @li a cell range the format is applied to, and + * @li one or more rule entries. + * + * Each rule entry consists of: + * + * @li a type of rule, + * @li zero or more rule properties, and + * @li zero or more conditions depending on the rule type. + * + * Lastly, each condition consists of: + * + * @li a formula, value, or string, + * @li an optional color. + * + * The flow of the import process varies depending on the type of the + * conditional formatting being imported. The following is an example of + * importing a conditional formatting that consists of a rule that applies a + * format when the cell value is greater than 2: + * + * @code{.cpp} + * import_conditional_format* iface = ... 
; + * + * iface->set_range("A2:A13"); + * iface->set_xf_id(14); // apply differential format (dxf) whose ID is 14 + * iface->set_type(conditional_format_t::condition); // rule entry type + * iface->set_operator(condition_operator_t::expression); + * iface->set_operator(condition_operator_t::greater); + * + * iface->set_formula("2"); + * iface->commit_condition(); + * + * iface->commit_entry(); + * + * iface->commit_format(); + * @endcode + * + * @todo Revise this API for simplification. + */ +class ORCUS_DLLPUBLIC import_conditional_format +{ +public: + virtual ~import_conditional_format(); + + /** + * Sets the color of the current condition. + * only valid for type == databar or type == colorscale. + */ + virtual void set_color(color_elem_t alpha, color_elem_t red, + color_elem_t green, color_elem_t blue) = 0; + + /** + * Sets the formula, value or string of the current condition. + */ + virtual void set_formula(std::string_view formula) = 0; + + /** + * Sets the type for the formula, value or string of the current condition. + * Only valid for type = iconset, databar or colorscale. + */ + virtual void set_condition_type(condition_type_t type) = 0; + + /** + * Only valid for type = date. + */ + virtual void set_date(condition_date_t date) = 0; + + /** + * commits the current condition to the current entry. + */ + virtual void commit_condition() = 0; + + /** + * Name of the icons to use in the current entry. + * only valid for type = iconset + */ + virtual void set_icon_name(std::string_view name) = 0; + + /** + * Use a gradient for the current entry. + * only valid for type == databar + */ + virtual void set_databar_gradient(bool gradient) = 0; + + /** + * Position of the 0 axis in the current entry. + * only valid for type == databar. + */ + virtual void set_databar_axis(databar_axis_t axis) = 0; + + /** + * Databar color for positive values. + * only valid for type == databar. 
+ */ + virtual void set_databar_color_positive(color_elem_t alpha, color_elem_t red, + color_elem_t green, color_elem_t blue) = 0; + + /** + * Databar color for negative values. + * only valid for type == databar. + */ + virtual void set_databar_color_negative(color_elem_t alpha, color_elem_t red, + color_elem_t green, color_elem_t blue) = 0; + + /** + * Sets the minimum length for a databar. + * only valid for type == databar. + */ + virtual void set_min_databar_length(double length) = 0; + + /** + * Sets the maximum length for a databar. + * only valid for type == databar. + */ + virtual void set_max_databar_length(double length) = 0; + + /** + * Don't show the value in the cell. + * only valid for type = databar, iconset, colorscale. + */ + virtual void set_show_value(bool show) = 0; + + /** + * Use the icons in reverse order. + * only valid for type == iconset. + */ + virtual void set_iconset_reverse(bool reverse) = 0; + + /** + * TODO: In OOXML the style is stored as dxf and in ODF as named style. + */ + virtual void set_xf_id(size_t xf) = 0; + + /** + * Sets the current operation used for the current entry. + * only valid for type == condition + */ + virtual void set_operator(condition_operator_t condition_type) = 0; + + virtual void set_type(conditional_format_t type) = 0; + + virtual void commit_entry() = 0; + + virtual void set_range(std::string_view range) = 0; + + virtual void set_range(row_t row_start, col_t col_start, + row_t row_end, col_t col_end) = 0; + + virtual void commit_format() = 0; +}; + +/** + * Interface for table. A table is a range of cells within a sheet that + * consists of one or more data columns with a header row that contains their + * labels. + */ +class ORCUS_DLLPUBLIC import_table +{ +public: + virtual ~import_table(); + + /** + * Get an optional interface for importing auto filter data stored as part + * of a table. 
+ * + * The implementor should initialize the internal state of the temporary + * auto filter object when this method is called. + * + * @return pointer to the auto filter interface object, or a @p nullptr if + * the implementor doesn't support it. + */ + virtual import_auto_filter* get_auto_filter(); + + /** + * Set an integral identifier unique to the table. + * + * @param id identifier associated with the table. + */ + virtual void set_identifier(size_t id) = 0; + + /** + * Set a 2-dimensional cell range associated with the table. + * + * @param range cell range associated with the table. + */ + virtual void set_range(const range_t& range) = 0; + + /** + * Set the number of totals rows. + * + * @param row_count number of totals rows. + */ + virtual void set_totals_row_count(size_t row_count) = 0; + + /** + * Set the internal name of the table. + * + * @param name name of the table. + */ + virtual void set_name(std::string_view name) = 0; + + /** + * Set the displayed name of the table. + * + * @param name displayed name of the table. + */ + virtual void set_display_name(std::string_view name) = 0; + + /** + * Set the number of columns the table contains. + * + * @param n number of columns in the table. + * + * @note This method gets called before the column data gets imported. The + * implementor can use this call to initialize the buffer for storing + * the column data. + */ + virtual void set_column_count(size_t n) = 0; + + /** + * Set an integral identifier for a column. + * + * @param id integral identifier for a column. + */ + virtual void set_column_identifier(size_t id) = 0; + + /** + * Set a name of a column. + * + * @param name name of a column. + */ + virtual void set_column_name(std::string_view name) = 0; + + /** + * Set the totals row label for a column. + * + * @param label row label for a column. + */ + virtual void set_column_totals_row_label(std::string_view label) = 0; + + /** + * Set the totals row function for a column. 
+ * + * @param func totals row function for a column. + */ + virtual void set_column_totals_row_function(totals_row_function_t func) = 0; + + /** + * Push and append the column data stored in the current column data buffer + * into the table buffer. + */ + virtual void commit_column() = 0; + + /** + * Set the name of a style to apply to the table. + * + * @param name name of a style to apply to the table. + */ + virtual void set_style_name(std::string_view name) = 0; + + /** + * Specify whether or not the first column in the table should have the + * style applied. + * + * @param b whether or not the first column in the table should have the + * style applied. + */ + virtual void set_style_show_first_column(bool b) = 0; + + /** + * Specify whether or not the last column in the table should have the style + * applied. + * + * @param b whether or not the last column in the table should have the + * style applied. + */ + virtual void set_style_show_last_column(bool b) = 0; + + /** + * Specify whether or not row stripe formatting is applied. + * + * @param b whether or not row stripe formatting is applied. + */ + virtual void set_style_show_row_stripes(bool b) = 0; + + /** + * Specify whether or not column stripe formatting is applied. + * + * @param b whether or not column stripe formatting is applied. + */ + virtual void set_style_show_column_stripes(bool b) = 0; + + /** + * Push the data stored in the table buffer into the document store. + */ + virtual void commit() = 0; +}; + +/** + * Interface for importing the properties of a single formula cell. A formula + * cell contains a formula expression that can be computed, and optionally a + * cached result of the last computation performed on the expression. + */ +class ORCUS_DLLPUBLIC import_formula +{ +public: + virtual ~import_formula(); + + /** + * Set the position of a cell. + * + * @param row row position. + * @param col column position. 
+ */ + virtual void set_position(row_t row, col_t col) = 0; + + /** + * Set formula string to a cell. + * + * @param grammar grammar to use to compile the formula string into + * tokens. + * @param formula formula expression to store. + */ + virtual void set_formula(formula_grammar_t grammar, std::string_view formula) = 0; + + /** + * Register the formula stored in a cell as a shared formula to be shared + * with other cells, if the cell contains a formula string. + * + * If a cell references a shared formula stored in another cell, only + * specify the index of that shared formula without specifying a formula + * string of its own. In that case, it is expected that another formula + * cell registers its formula string for that index. + * + * @param index shared formula index to register the formula with. + */ + virtual void set_shared_formula_index(size_t index) = 0; + + /** + * Set cached result of string type. + * + * @param value string result value. + */ + virtual void set_result_string(std::string_view value) = 0; + + /** + * Set cached result of numeric type. + * + * @param value numeric value to set as a cached result. + */ + virtual void set_result_value(double value) = 0; + + /** + * Set cached result of boolean type. + * + * @param value boolean value to set as a cached result. + */ + virtual void set_result_bool(bool value) = 0; + + /** + * Set empty value as a cached result. + */ + virtual void set_result_empty() = 0; + + /** + * Commit all the formula data to the specified cell. + */ + virtual void commit() = 0; +}; + +/** + * Interface for importing the properties of an array formula which occupies a + * range of cells. Cells that are part of an array formula share the same + * formula expression but may have different calculation results. + */ +class ORCUS_DLLPUBLIC import_array_formula +{ +public: + virtual ~import_array_formula(); + + /** + * Set the range of an array formula. + * + * @param range range of an array formula. 
+ */ + virtual void set_range(const range_t& range) = 0; + + /** + * Set the formula expression of an array formula. + * + * @param grammar grammar to use to compile the formula string into + * tokens. + * @param formula formula expression of an array formula. + */ + virtual void set_formula(formula_grammar_t grammar, std::string_view formula) = 0; + + /** + * Set a cached string result of a cell within the array formula range. + * + * @param row 0-based row position of a cell. + * @param col 0-based column position of a cell. + * @param value cached string value to set. + */ + virtual void set_result_string(row_t row, col_t col, std::string_view value) = 0; + + /** + * Set a cached numeric result of a cell within the array formula range. + * + * @param row 0-based row position of a cell. + * @param col 0-based column position of a cell. + * @param value cached numeric value to set. + */ + virtual void set_result_value(row_t row, col_t col, double value) = 0; + + /** + * Set a cached boolean result of a cell within the array formula range. + * + * @param row 0-based row position of a cell. + * @param col 0-based column position of a cell. + * @param value cached boolean value to set. + */ + virtual void set_result_bool(row_t row, col_t col, bool value) = 0; + + /** + * Set an empty value as a cached result to a cell within the array formula + * range. + * + * @param row 0-based row position of a cell. + * @param col 0-based column position of a cell. + */ + virtual void set_result_empty(row_t row, col_t col) = 0; + + /** + * Push the properties of an array formula currently stored in the buffer to + * the sheet store. + */ + virtual void commit() = 0; +}; + +/** + * Interface for importing the content and properties of a sheet. + */ +class ORCUS_DLLPUBLIC import_sheet +{ +public: + virtual ~import_sheet(); + + /** + * Get an optional interface for importing properties that are specific to a + * view of a sheet. 
+ * + * @return pointer to the interface for importing view properties, or a @p + * nullptr if the implementor doesn't support it. + */ + virtual import_sheet_view* get_sheet_view(); + + /** + * Get an optional interface for importing sheet properties. + * + * @return pointer to the interface for importing sheet properties, or a @p + * nullptr if the implementor doesn't support it. + */ + virtual import_sheet_properties* get_sheet_properties(); + + /** + * Get an optional interface for importing data tables. Note that the + * implementer may decide not to support this feature in which case this + * method should return a @p nullptr. + * + * The implementor should initialize the internal state of the temporary + * data table object when this method is called. + * + * @return pointer to the data table interface object, or a @p nullptr if + * the implementor doesn't support it. + */ + virtual import_data_table* get_data_table(); + + /** + * Get an optional interface for importing auto filter ranges. + * + * The implementor should initialize the internal state of the temporary + * auto filter object when this method is called. + * + * @return pointer to the auto filter interface object, or a @p nullptr if + * the implementor doesn't support it. + */ + virtual import_auto_filter* get_auto_filter(); + + /** + * Get an interface for importing tables. + * + * The implementor should initialize the internal state of the temporary + * table object when this method is called. + * + * @return pointer to the table interface object, or @p nullptr if the + * implementer doesn't support importing of tables. + */ + virtual import_table* get_table(); + + /** + * Get an optional interface for importing conditional formats. + * + * @return pointer to the conditional format interface object, or @p nullptr + * if the implementer doesn't support importing conditional + * formats. 
+ */ + virtual import_conditional_format* get_conditional_format(); + + /** + * Get an optional interface for importing sheet-local named expressions. + * + * @return pointer to the sheet-local named expression interface, or a @p + * nullptr if the implementor doesn't support it. + */ + virtual import_named_expression* get_named_expression(); + + /** + * Get an optional interface for importing array formulas. An array formula + * is a formula expression applied to a range of cells where each cell may + * have a different result value. + * + * @return pointer to the array formula import interface, or a @p nullptr if + * the implementor doesn't support it. + */ + virtual import_array_formula* get_array_formula(); + + /** + * Get an optional interface for importing formula cells. + * + * @return pointer to the formula interface object, or a @p nullptr if the + * implementer doesn't support importing of formula cells. + */ + virtual import_formula* get_formula(); + + /** + * Set raw string value to a cell and have the implementation + * auto-recognize its data type. + * + * @param row row ID + * @param col column ID + * @param s raw string value. + */ + virtual void set_auto(row_t row, col_t col, std::string_view s) = 0; + + /** + * Set string value to a cell. + * + * @param row row ID + * @param col column ID + * @param sindex 0-based string index in the shared string table. + */ + virtual void set_string(row_t row, col_t col, string_id_t sindex) = 0; + + /** + * Set numerical value to a cell. + * + * @param row row ID + * @param col column ID + * @param value value being assigned to the cell. + */ + virtual void set_value(row_t row, col_t col, double value) = 0; + + /** + * Set a boolean value to a cell. + * + * @param row row ID + * @param col col ID + * @param value boolean value being assigned to the cell + */ + virtual void set_bool(row_t row, col_t col, bool value) = 0; + + /** + * Set date and time value to a cell. 
+ * + * @param row row ID + * @param col column ID + * @param year 1-based value representing year + * @param month 1-based value representing month, varying from 1 through + * 12. + * @param day 1-based value representing day, varying from 1 through 31. + * @param hour the hour of a day, ranging from 0 through 23. + * @param minute the minute of an hour, ranging from 0 through 59. + * @param second the second of a minute, ranging from 0 through 59. + */ + virtual void set_date_time( + row_t row, col_t col, + int year, int month, int day, int hour, int minute, double second) = 0; + + /** + * Set cell format to specified cell. The cell format is referred to by + * the xf (cell format) index in the styles table. + * + * @note This method gets called after both set_column_format() and + * set_row_format(). + * + * @param row row ID + * @param col column ID + * @param xf_index 0-based xf (cell format) index + */ + virtual void set_format(row_t row, col_t col, size_t xf_index) = 0; + + /** + * Set cell format to specified cell range. The cell format is referred + * to by the xf (cell format) index in the styles table. + * + * @param row_start start row ID + * @param col_start start column ID + * @param row_end end row ID + * @param col_end end column ID + * @param xf_index 0-based xf (cell format) index + */ + virtual void set_format(row_t row_start, col_t col_start, + row_t row_end, col_t col_end, size_t xf_index) = 0; + + /** + * Set cell format to a specified column. The cell format is referred to by + * the xf (cell format) index in the styles table. + * + * @note This method gets called first before set_row_format() or + * set_format() variants. + * + * @param col column ID + * @param col_span number of contiguous columns to apply the format to. It + * must be at least one. + * @param xf_index 0-based xf (cell format) index + */ + virtual void set_column_format(col_t col, col_t col_span, std::size_t xf_index) = 0; + + /** + * Set cell format to a specified row. 
The cell format is referred to by + * the xf (cell format) index in the styles table. + * + * @note This method gets called after set_column_format() but before + * set_format(). + * + * @param row row ID + * @param xf_index 0-based xf (cell format) index + */ + virtual void set_row_format(row_t row, std::size_t xf_index) = 0; + + /** + * Duplicate the value of the source cell to one or more cells located + * immediately below it. + * + * @param src_row row ID of the source cell + * @param src_col column ID of the source cell + * @param range_size number of cells below the source cell to copy the + * source cell value to. It must be at least one. + */ + virtual void fill_down_cells(row_t src_row, col_t src_col, row_t range_size) = 0; + + /** + * Get the size of the sheet. + * + * @return structure containing the numbers of rows and columns of the + * sheet. + */ + virtual range_size_t get_sheet_size() const = 0; +}; + +/** + * Interface for specifying global settings that may affect how the + * implementor should process certain values and properties. + */ +class ORCUS_DLLPUBLIC import_global_settings +{ +public: + virtual ~import_global_settings(); + + /** + * Set the date that is to be represented by a value of 0. All date + * values should be represented relative to this date. This may affect, for + * instance, values imported via @ref import_sheet::set_date_time(). + * + * @param year 1-based value representing year + * @param month 1-based value representing month, varying from 1 through + * 12. + * @param day 1-based value representing day, varying from 1 through 31. + */ + virtual void set_origin_date(int year, int month, int day) = 0; + + /** + * Set the formula grammar to be used globally when parsing formulas if the + * grammar is not specified. This grammar should also be used when parsing + * range strings associated with shared formula ranges, array formula + * ranges, autofilter ranges etc. 
+ * + * Note that the import filter may specify what formula grammar to use + * locally when importing formula expressions for cells via @ref + * import_formula::set_formula(), in which case the implementor should honor + * that one instead. + * + * @param grammar default formula grammar to use globally unless otherwise + * specified. + */ + virtual void set_default_formula_grammar(formula_grammar_t grammar) = 0; + + /** + * Get current global formula grammar. The import filter may use this + * method to query the current global formula grammar. + * + * @return current default formula grammar. + */ + virtual formula_grammar_t get_default_formula_grammar() const = 0; + + /** + * Set the character set to use when parsing encoded string values. + * + * @param charset character set to use when parsing encoded string values. + */ + virtual void set_character_set(character_set_t charset) = 0; +}; + +/** + * This is an interface to allow the implementor to provide its own reference + * address parsers, for both single cell references and cell range references. + * The implementor may choose to provide a different parser depending on the + * type of formula_ref_context_t argument given to the @ref + * import_factory::get_reference_resolver() call. + */ +class ORCUS_DLLPUBLIC import_reference_resolver +{ +public: + virtual ~import_reference_resolver(); + + /** + * Resolve a textual representation of a single cell address. + * + * @param address single cell address string. + * + * @return structure containing the column and row positions of the + * address. + * + * @exception orcus::invalid_arg_error the string is not a valid + * single cell address. + */ + virtual src_address_t resolve_address(std::string_view address) = 0; + + /** + * Resolve a textual representation of a range address. Note that a + * string representing a valid single cell address should be considered a + * valid range address. + * + * @param range range address string. 
+ * + * @return structure containing the start and end positions of the range + * address. + * + * @exception invalid_arg_error the string is not a valid range address. + */ + virtual src_range_t resolve_range(std::string_view range) = 0; +}; + +/** + * This interface is the entry point for the import filter code to instantiate + * other, more specialized interfaces. The life cycles of any specialized + * interfaces returned from this interface shall be managed by the implementor + * of this interface. + * + * The implementer of this interface may wrap a backend document store that + * needs to be populated. + */ +class ORCUS_DLLPUBLIC import_factory +{ +public: + virtual ~import_factory(); + + /** + * Obtain an optional interface for global settings, which the import filter + * uses to specify global filter settings that may affect how certain values + * and properties should be processed. The implementor can use this + * interface to decide how to process relevant values and properties. + * + * @return pointer to the global settings interface, or a @p nullptr if the + * implementor doesn't support it. + */ + virtual import_global_settings* get_global_settings(); + + /** + * Obtain an optional interface for importing shared strings for string + * cells. Implementing this interface is required in order to import string + * cell values. + * + * @return pointer to the shared strings interface, or a @p nullptr if the + * implementor doesn't support it. + */ + virtual import_shared_strings* get_shared_strings(); + + /** + * Obtain an optional interface for importing global named expressions. + * + * Note that @ref import_sheet also provides the same interface, but its + * interface is for importing sheet-local named expressions. + * + * @return pointer to the global named expression interface, or a @p nullptr + * if the implementor doesn't support it. 
+ */ + virtual import_named_expression* get_named_expression(); + + /** + * Obtain an optional interface for importing styles used to add formatting + * properties to cell values. + * + * @return pointer to the styles interface, or a @p nullptr if the + * implementor doesn't support it. + */ + virtual import_styles* get_styles(); + + /** + * Obtain an optional interface for resolving cell and cell-range references + * from string values. + * + * @param cxt context in which the formula expression containing the + * references to be resolved occurs. + * + * @return pointer to the reference resolver interface, or a @p nullptr if + * the implementor doesn't support it. + */ + virtual import_reference_resolver* get_reference_resolver(formula_ref_context_t cxt); + + /** + * Obtain an optional interface for pivot cache definition import for a + * specified cache ID. In case a pivot cache already exists for the passed + * ID, the implementor should overwrite the existing cache with a brand-new + * cache instance. + * + * @param cache_id numeric ID associated with the pivot cache. + * + * @return pointer to the pivot cache interface, or a @p nullptr if the + * implementor doesn't support pivot cache import. + */ + virtual import_pivot_cache_definition* create_pivot_cache_definition( + pivot_cache_id_t cache_id); + + /** + * Obtain an optional interface for pivot cache records import for a + * specified cache ID. + * + * @param cache_id numeric ID associated with the pivot cache. + * + * @return pointer to the pivot cache records interface, or a @p nullptr if + * the implementor doesn't support pivot cache import. + */ + virtual import_pivot_cache_records* create_pivot_cache_records( + pivot_cache_id_t cache_id); + + /** + * Append a sheet with a specified sheet position index and name and return + * an interface for importing its content. The implementor can use a call + * to this method as a signal to create and append a new sheet instance to + * the document store. 
+ * + * @param sheet_index position index of the sheet to be appended. It is + * 0-based i.e. the first sheet to be appended will + * have an index value of 0. + * @param name sheet name. + * + * @return pointer to the sheet instance, or a @p nullptr if the implementor + * doesn't support it. Note, however, that if the implementor + * doesn't support this interface, no cell values will get imported. + */ + virtual import_sheet* append_sheet(sheet_t sheet_index, std::string_view name) = 0; + + /** + * Get a sheet instance by name. The import filter may use this method to + * get access to an existing sheet after it has been created. + * + * @param name sheet name. + * + * @return pointer to the sheet instance whose name matches the name + * passed to this method. It returns a @p nullptr if no sheet + * instance exists by the specified name. + */ + virtual import_sheet* get_sheet(std::string_view name) = 0; + + /** + * Retrieve a sheet instance by a specified numerical sheet index. + * + * @param sheet_index sheet index. + * + * @return pointer to the sheet instance, or a @p nullptr if no sheet + * instance exists at the specified sheet index. + */ + virtual import_sheet* get_sheet(sheet_t sheet_index) = 0; + + /** + * The import filter calls this method after completing its import, to give + * the implementor a chance to perform post-processing. + */ + virtual void finalize() = 0; +}; + +}}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/import_interface_pivot.hpp b/include/orcus/spreadsheet/import_interface_pivot.hpp new file mode 100644 index 0000000..275ed44 --- /dev/null +++ b/include/orcus/spreadsheet/import_interface_pivot.hpp @@ -0,0 +1,351 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_IMPORT_INTERFACE_PIVOT_HPP +#define INCLUDED_ORCUS_SPREADSHEET_IMPORT_INTERFACE_PIVOT_HPP + +#include <cstdlib> + +#include "types.hpp" +#include "../types.hpp" +#include "../env.hpp" + +// NB: This header must not depend on ixion, as it needs to be usable for +// those clients that provide their own formula engine. Other headers in +// the orcus::spreadsheet namespace may depend on ixion. + +namespace orcus { namespace spreadsheet { namespace iface { + +class import_pivot_cache_field_group; + +/** + * Interface for importing pivot cache definitions. + */ +class ORCUS_DLLPUBLIC import_pivot_cache_definition +{ +public: + virtual ~import_pivot_cache_definition(); + + /** + * Specify that the source data of this pivot cache is located on a local + * worksheet. + * + * @param ref range string specifying the source range. + * @param sheet_name name of the worksheet where the source data is located. + * + * @todo use the ref resolver to resolve the range. + */ + virtual void set_worksheet_source(std::string_view ref, std::string_view sheet_name) = 0; + + /** + * Specify that the source data of this pivot cache is associated with a + * table. + * + * @param table_name name of the table. + */ + virtual void set_worksheet_source(std::string_view table_name) = 0; + + /** + * Set the total number of fields present in this pivot cache. + * + * @param n total number of fields in this pivot cache. + */ + virtual void set_field_count(size_t n) = 0; + + /** + * Set the name of the field in the current field buffer. + * + * @param name field name. + */ + virtual void set_field_name(std::string_view name) = 0; + + /** + * Set the lowest value of the field in the current field buffer. + * + * @param v lowest value of the field. 
+ */ + virtual void set_field_min_value(double v) = 0; + + /** + * Set the highest value of the field in the current field buffer. + * + * @param v highest value of the field. + */ + virtual void set_field_max_value(double v) = 0; + + /** + * Set the lowest date value of the field in the current field buffer. + * + * @param dt lowest date value of the field. + */ + virtual void set_field_min_date(const date_time_t& dt) = 0; + + /** + * Set the highest date value of the field in the current field buffer. + * + * @param dt highest date value of the field. + */ + virtual void set_field_max_date(const date_time_t& dt) = 0; + + /** + * Mark the current field as a group field and initiate its import. + * + * The implementor should create an internal storage to prepare for the + * importing of field group data when this method gets called. + * + * @param base_index 0-based index of the field this group field uses as its + * base. + * @return interface for importing group field data, or a @p nullptr if the + * implementor doesn't support it. + */ + virtual import_pivot_cache_field_group* start_field_group(size_t base_index) = 0; + + /** + * Commit the field in the current field buffer to the pivot cache model. + */ + virtual void commit_field() = 0; + + /** + * Set a string value to the current field item buffer. + * + * @param value string value. + */ + virtual void set_field_item_string(std::string_view value) = 0; + + /** + * Set a numeric value to the current field item buffer. + * + * @param v numeric value. + */ + virtual void set_field_item_numeric(double v) = 0; + + /** + * Set a date-time value to the current field item buffer. + * + * @param dt date-time value. + */ + virtual void set_field_item_date_time(const date_time_t& dt) = 0; + + /** + * Set an error value to the current field item buffer. + * + * @param ev error value.
+ */ + virtual void set_field_item_error(error_value_t ev) = 0; + + /** + * Commit the field item in the current field item buffer to the current field + * model. + */ + virtual void commit_field_item() = 0; + + /** + * Commit the current pivot cache model to the document model. + */ + virtual void commit() = 0; +}; + +/** + * Interface for importing group field settings in a pivot cache definition. + */ +class ORCUS_DLLPUBLIC import_pivot_cache_field_group +{ +public: + virtual ~import_pivot_cache_field_group(); + + /** + * Establish a linkage between a base item and a group item. + * + * The index of the corresponding base item is inferred from the order of this + * method being called; the first call to this method implies a base item + * index of 0, the second call implies an index of 1, and so on. + * + * This method is called only for a non-range group field; a group field + * where parent-to-child item relationships are manually defined. + * + * @param group_item_index 0-based index for the group item. + */ + virtual void link_base_to_group_items(size_t group_item_index) = 0; + + /** + * Set an individual field item value that is of string type to the + * current internal buffer. + * + * This method can be called either for a range group field or a non-range + * one. + * + * @param value field item value. + */ + virtual void set_field_item_string(std::string_view value) = 0; + + /** + * Set an individual field item value that is of numeric type to the + * current internal buffer. + * + * This method can be called either for a range group field or a non-range + * one. + * + * @param v field item value. + */ + virtual void set_field_item_numeric(double v) = 0; + + /** + * Commit the current internal field item buffer to the group. + */ + virtual void commit_field_item() = 0; + + /** + * Set the range grouping type. + * + * The current group field implicitly becomes a range group field when + * this method is called. + * + * @param group_by type of range grouping.
+ */ + virtual void set_range_grouping_type(pivot_cache_group_by_t group_by) = 0; + + /** + * Set whether the current range group field has an automatic start + * position. + * + * The current group field implicitly becomes a range group field when + * this method is called. + * + * @param b whether or not the current range group field has an automatic + * start position. + */ + virtual void set_range_auto_start(bool b) = 0; + + /** + * Set whether the current range group field has an automatic end + * position. + * + * The current group field implicitly becomes a range group field when + * this method is called. + * + * @param b whether or not the current range group field has an automatic + * end position. + */ + virtual void set_range_auto_end(bool b) = 0; + + /** + * Set the start number of the current range group field. + * + * The current group field implicitly becomes a range group field when + * this method is called. + * + * @param v start number of the current range group field. + */ + virtual void set_range_start_number(double v) = 0; + + /** + * Set the end number of the current range group field. + * + * The current group field implicitly becomes a range group field when + * this method is called. + * + * @param v end number of the current range group field. + */ + virtual void set_range_end_number(double v) = 0; + + /** + * Set the start date of the current range group field. + * + * The current group field implicitly becomes a range group field when + * this method is called. + * + * @param dt start date of the current range group field. + */ + virtual void set_range_start_date(const date_time_t& dt) = 0; + + /** + * Set the end date of the current range group field. + * + * The current group field implicitly becomes a range group field when + * this method is called. + * + * @param dt end date of the current range group field. 
+ */ + virtual void set_range_end_date(const date_time_t& dt) = 0; + + /** + * Set the interval of the current range group field. If the current + * range is a date range, the value represents the number of days. + * + * @param v interval of the current range group field. + */ + virtual void set_range_interval(double v) = 0; + + /** + * Commit the current field group data to the parent field. + */ + virtual void commit() = 0; +}; + +/** + * Interface for importing pivot cache records. + */ +class ORCUS_DLLPUBLIC import_pivot_cache_records +{ +public: + virtual ~import_pivot_cache_records(); + + /** + * Set the number of records included in pivot cache records. + * + * @note This method gets called before the very first record gets imported. + * The implementor can use this call as an opportunity to initialize + * any internal buffers used to store the imported records. + * + * @param n number of records included in pivot cache records. + */ + virtual void set_record_count(size_t n) = 0; + + /** + * Append to the current record buffer a numeric value as a column value. + * + * @param v numeric value to append to the current record buffer as a column + * value. + */ + virtual void append_record_value_numeric(double v) = 0; + + /** + * Append to the current record buffer a character value as a column value. + * + * @param s character value to append to the current record buffer as a + * column value. + */ + virtual void append_record_value_character(std::string_view s) = 0; + + /** + * Append to the current record buffer a column value referenced by an index + * into the shared items table of a pivot cache field. The corresponding + * field in the pivot cache definition should provide the shared items table + * that this index references. + * + * @param index index into the shared items table of a pivot cache field. + */ + virtual void append_record_value_shared_item(size_t index) = 0; + + /** + * Commit the record in the current record buffer. 
+ * + * The implementor can clear the buffer afterward. + */ + virtual void commit_record() = 0; + + /** + * Commit the entire records set to the document store. + */ + virtual void commit() = 0; +}; + +}}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/import_interface_styles.hpp b/include/orcus/spreadsheet/import_interface_styles.hpp new file mode 100644 index 0000000..6ad94a8 --- /dev/null +++ b/include/orcus/spreadsheet/import_interface_styles.hpp @@ -0,0 +1,774 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include <cstdlib> + +#include "types.hpp" +#include "../types.hpp" +#include "../env.hpp" + +// NB: This header must not depend on ixion, as it needs to be usable for +// those clients that provide their own formula engine. Other headers in +// the orcus::spreadsheet namespace may depend on ixion. + +namespace orcus { namespace spreadsheet { namespace iface { + +class import_font_style; +class import_fill_style; +class import_border_style; +class import_cell_protection; +class import_number_format; +class import_xf; +class import_cell_style; + +/** + * Interface for importing styles. This one acts as an entry point and + * provides other interfaces for the style categories. + * + * The styles are to be stored in a <a + * href="https://en.wikipedia.org/wiki/Flyweight_pattern">flyweight</a> + * fashion where each style category maintains an array of stored style + * items, which are referenced by their indices. Each time a style + * item is pushed through the interface, it returns an index representing the + * item. 
The indices are to be assigned sequentially starting with 0 in each + * style category, and <em>the default style must get an index of 0</em>. + * Because of this, the import filter imports the default styles first before + * importing other non-default styles. + * + * The abbreviation @p xf stands for cell format, and is used throughout the + * styles API. Similarly, @p dxf stands for differential cell format, and + * stores partial format properties that are to be applied on top of the base + * format properties. + * + * @note The implementor of this interface @em must implement all interfaces + * for all the style categories that this interface returns. + */ +class ORCUS_DLLPUBLIC import_styles +{ +public: + virtual ~import_styles(); + + /** + * Signal the start of the import of font style attributes, and return a + * pointer to the interface instance for importing the attributes. + * + * @note Note that the import_styles implementer <i>must</i> return a + * non-null pointer. + * + * @return pointer to the interface instance for importing font style + * attributes. + */ + virtual import_font_style* start_font_style() = 0; + + /** + * Signal the start of the import of fill style attributes, and return a + * pointer to the interface instance for importing the attributes. + * + * @note Note that the import_styles implementer <i>must</i> return a + * non-null pointer. + * + * @return pointer to the interface instance for importing fill style + * attributes. + */ + virtual import_fill_style* start_fill_style() = 0; + + /** + * Signal the start of the import of border style attributes, and return a + * pointer to the interface instance for importing the attributes. + * + * @note Note that the import_styles implementer <i>must</i> return a + * non-null pointer. + * + * @return pointer to the interface instance for importing border style + * attributes.
+ */ + virtual import_border_style* start_border_style() = 0; + + /** + * Signal the start of the import of cell protection attributes, and return + * a pointer to the interface instance for importing the attributes. + * + * @note Note that the import_styles implementer <i>must</i> return a + * non-null pointer. + * + * @return pointer to the interface instance for importing cell protection + * attributes. + */ + virtual import_cell_protection* start_cell_protection() = 0; + + /** + * Signal the start of the import of number format attributes and return a + * pointer to the interface instance for importing the attributes. + * + * @note Note that the import_styles implementer <i>must</i> return a + * non-null pointer. + * + * @return pointer to the interface instance for importing number format + * attributes. + */ + virtual import_number_format* start_number_format() = 0; + + /** + * Signal the start of the import of cell format (xf) indices that each + * reference different format attributes in their respective pools, and + * return a pointer to the interface instance for importing the indices. + * + * @note Note that the import_styles implementer <i>must</i> return a + * non-null pointer. + * + * @return pointer to the interface instance for importing cell format (xf) + * indices. + */ + virtual import_xf* start_xf(xf_category_t cat) = 0; + + /** + * Signal the start of the import of named cell style information, and + * return a pointer to the interface instance for importing the information. + * + * @note Note that the import_styles implementer <i>must</i> return a + * non-null pointer. + * + * @return pointer to the interface instance for importing named cell style + * information. + */ + virtual import_cell_style* start_cell_style() = 0; + + /** + * Set the total number of font styles. This may be called before importing + * any of the font styles. This will give the implementer a chance to + * allocate storage. Note that it may not always be called. 
+ * + * @param n number of font styles. + */ + virtual void set_font_count(size_t n) = 0; + + /** + * Set the total number of fill styles. This may be called before importing + * any of the fill styles. This will give the implementer a chance to + * allocate storage. Note that it may not always be called. + * + * @param n number of fill styles. + */ + virtual void set_fill_count(size_t n) = 0; + + /** + * Set the total number of border styles. This may be called before + * importing any of the border styles. This will give the implementer a + * chance to allocate storage. Note that it may not always be called. + * + * @param n number of border styles. + */ + virtual void set_border_count(size_t n) = 0; + + /** + * Set the total number of number format styles. This may be called before + * importing any of the number format styles. This will give the implementer + * a chance to allocate storage. Note that it may not always be called. + * + * @param n number of number format styles. + */ + virtual void set_number_format_count(size_t n) = 0; + + /** + * Set the total number of cell format styles for a specified cell format + * category. This may be called before importing any of the cell format + * styles for the specified category. This will give the implementer a + * chance to allocate storage. Note that it may not always be called. + * + * @param cat cell format category. + * @param n number of cell format styles for the specified cell format + * category. + */ + virtual void set_xf_count(xf_category_t cat, size_t n) = 0; + + /** + * Set the total number of named cell styles. This may be called before + * importing any cell styles to give the implementer a chance to allocate + * storage. Note that it may not always be called. + * + * @param n number of named cell styles. + */ + virtual void set_cell_style_count(size_t n) = 0; +}; + +/** + * Interface for importing font style items.
The following font style + * properties store different values for western, asian and complex scripts: + * + * @li font name + * @li font size + * @li font weight (normal or bold) + * @li font style (normal or italic) + */ +class ORCUS_DLLPUBLIC import_font_style +{ +public: + virtual ~import_font_style(); + + /** + * Set the font weight to either normal or bold, for western script. + * + * @param b whether the font has normal (false) or bold weight (true). + */ + virtual void set_bold(bool b) = 0; + + /** + * Set the font weight to either normal or bold, for asian script. + * + * @param b whether the font has normal (false) or bold weight (true). + */ + virtual void set_bold_asian(bool b) = 0; + + /** + * Set the font weight to either normal or bold, for complex script. + * + * @param b whether the font has normal (false) or bold weight (true). + */ + virtual void set_bold_complex(bool b) = 0; + + /** + * Set the font style to either normal or italic, for western script. + * + * @param b whether the font has normal (false) or italic style (true). + */ + virtual void set_italic(bool b) = 0; + + /** + * Set the font style to either normal or italic, for asian script. + * + * @param b whether the font has normal (false) or italic style (true). + */ + virtual void set_italic_asian(bool b) = 0; + + /** + * Set the font style to either normal or italic, for complex script. + * + * @param b whether the font has normal (false) or italic style (true). + */ + virtual void set_italic_complex(bool b) = 0; + + /** + * Set the name of a font, for western script. + * + * @param s font name. + */ + virtual void set_name(std::string_view s) = 0; + + /** + * Set the name of a font, for asian script. + * + * @param s font name. + */ + virtual void set_name_asian(std::string_view s) = 0; + + /** + * Set the name of a font, for complex script. + * + * @param s font name. 
+ */ + virtual void set_name_complex(std::string_view s) = 0; + + /** + * Set the size of a font in points, for western script. + * + * @param point font size in points. + */ + virtual void set_size(double point) = 0; + + /** + * Set the size of a font in points, for asian script. + * + * @param point font size in points. + */ + virtual void set_size_asian(double point) = 0; + + /** + * Set the size of a font in points, for complex script. + * + * @param point font size in points. + */ + virtual void set_size_complex(double point) = 0; + + /** + * Set the underline type of a font. + * + * @param e underline type of a font. + */ + virtual void set_underline(underline_t e) = 0; + + /** + * Set the width of the underline of a font. + * + * @param e width of the underline of a font. + */ + virtual void set_underline_width(underline_width_t e) = 0; + + /** + * Set whether the underline of a font is continuous over the gaps, or skip + * the gaps. + * + * @param e whether the underline of a font is continuous over the gaps or + * skip the gaps. + */ + virtual void set_underline_mode(underline_mode_t e) = 0; + + /** + * Set whether the underline of a font consists of a single line, or a + * double line. + * + * @param e whether the underline of a font consists of a single line, or a + * double line. + * + * @todo Look into merging this with set_underline(). + */ + virtual void set_underline_type(underline_type_t e) = 0; + + /** + * Specify the color of an underline in ARGB format. + * + * @param alpha alpha component of the color. + * @param red red component of the color. + * @param green green component of the color. + * @param blue blue component of the color. + * + * @note If this value is not explicitly set, the font color should be used. + */ + virtual void set_underline_color(color_elem_t alpha, color_elem_t red, color_elem_t green, color_elem_t blue) = 0; + + /** + * Specify the color of font in ARGB format. + * + * @param alpha alpha component of the color. 
+ * @param red red component of the color. + * @param green green component of the color. + * @param blue blue component of the color. + */ + virtual void set_color(color_elem_t alpha, color_elem_t red, color_elem_t green, color_elem_t blue) = 0; + + /** + * Set the strikethrough style of a font. + * + * @param s strikethrough style of a font. + */ + virtual void set_strikethrough_style(strikethrough_style_t s) = 0; + + /** + * Set whether the strikethrough of a font consists of a single line or a + * double line. + * + * @param s whether the strikethrough of a font consists of a single line or + * a double line. + */ + virtual void set_strikethrough_type(strikethrough_type_t s) = 0; + + /** + * Set the width of the strikethrough of a font. + * + * @param s the width of the strikethrough of a font. + */ + virtual void set_strikethrough_width(strikethrough_width_t s) = 0; + + /** + * Set the text to use as a strikethrough. + * + * @param s text to use as a strikethrough. + */ + virtual void set_strikethrough_text(strikethrough_text_t s) = 0; + + /** + * Commit the font style in the current buffer. + * + * @return index of the committed font style, to be passed on to the + * import_xf::set_font() method as its argument. + */ + virtual std::size_t commit() = 0; +}; + +/** + * Interface for importing fill style items. + */ +class ORCUS_DLLPUBLIC import_fill_style +{ +public: + virtual ~import_fill_style(); + + /** + * Set the type of fill pattern. + * + * @param fp fill pattern type. + */ + virtual void set_pattern_type(fill_pattern_t fp) = 0; + + /** + * Set the foreground color of a fill. <i>Note that for a solid fill + * type, the foreground color will be used.</i> + * + * @param alpha alpha component ranging from 0 (fully transparent) to 255 + * (fully opaque). + * @param red red component ranging from 0 to 255. + * @param green green component ranging from 0 to 255. + * @param blue blue component ranging from 0 to 255. 
+ */ + virtual void set_fg_color(color_elem_t alpha, color_elem_t red, color_elem_t green, color_elem_t blue) = 0; + + /** + * Set the background color of a fill. <i>Note that this color will + * be ignored for a solid fill type.</i> + * + * @param alpha alpha component ranging from 0 (fully transparent) to 255 + * (fully opaque). + * @param red red component ranging from 0 to 255. + * @param green green component ranging from 0 to 255. + * @param blue blue component ranging from 0 to 255. + */ + virtual void set_bg_color(color_elem_t alpha, color_elem_t red, color_elem_t green, color_elem_t blue) = 0; + + /** + * Commit the fill style in the current buffer. + * + * @return index of the committed fill style, to be passed on to the + * import_xf::set_fill() method as its argument. + */ + virtual size_t commit() = 0; +}; + +/** + * Interface for importing border style items. + */ +class ORCUS_DLLPUBLIC import_border_style +{ +public: + virtual ~import_border_style(); + + /** + * Set the border style to a specified border position. + * + * @param dir position of a border to set the style to. + * @param style border style to set. + */ + virtual void set_style(border_direction_t dir, border_style_t style) = 0; + + /** + * Set the color of a border. + * + * @param dir position of a border to set the color to. + * @param alpha alpha element of the color. + * @param red red element of the color. + * @param green green element of the color. + * @param blue blue element of the color. + */ + virtual void set_color( + border_direction_t dir, color_elem_t alpha, color_elem_t red, color_elem_t green, color_elem_t blue) = 0; + + /** + * Set the width of a border. + * + * @param dir position of a border. + * @param width width of a border. + * @param unit unit of measurement to use in the border width. + */ + virtual void set_width(border_direction_t dir, double width, orcus::length_unit_t unit) = 0; + + /** + * Commit the border style in the current buffer. 
+ * + * @return index of the committed border style, to be passed on to the + * import_xf::set_border() method as its argument. + */ + virtual size_t commit() = 0; +}; + +/** + * Interface for importing cell protection items. + */ +class ORCUS_DLLPUBLIC import_cell_protection +{ +public: + virtual ~import_cell_protection(); + + /** + * Hide the entire cell content when the sheet is protected. + * + * @param b whether to hide the entire cell content when the sheet is + * protected. + */ + virtual void set_hidden(bool b) = 0; + + /** + * Lock the cell when the sheet is protected. + * + * @param b whether or not to lock the cell when the sheet is protected. + */ + virtual void set_locked(bool b) = 0; + + /** + * Specify whether or not to print the cell content when the sheet is + * protected. + * + * + * @param b whether or not to print the cell content when the sheet is + * protected. + */ + virtual void set_print_content(bool b) = 0; + + /** + * Hide the formula when the sheet is protected and the cell contains + * a formula. + * + * @param b whether or not to hide the formula when the sheet is protected + * and the cell contains a formula. + */ + virtual void set_formula_hidden(bool b) = 0; + + /** + * Commit the cell protection data in the current buffer. + * + * @return index of the committed cell protection data, to be passed on to + * the import_xf::set_protection() method as its argument. + */ + virtual std::size_t commit() = 0; +}; + +/** + * Interface for importing number format items. + */ +class ORCUS_DLLPUBLIC import_number_format +{ +public: + virtual ~import_number_format(); + + /** + * Set the integral identifier of a number format. + * + * @param id integral identifier of a number format. + * + * @note This is specific to xlsx format. In xlsx, this identifier gets + * used to reference number formats instead of the identifier returned + * by the commit() method.
+ * + * @todo Perhaps when this method is called, the commit() method of the + * corresponding item should return the value set in this method + * instead. + */ + virtual void set_identifier(std::size_t id) = 0; + + /** + * Set the number format code. + * + * @param s number format code. + */ + virtual void set_code(std::string_view s) = 0; + + /** + * Commit the number format item in the current buffer. + * + * @return index of the committed number format item, to be passed on to the + * import_xf::set_number_format() method as its argument. + * + * @todo Look into returning the identifier set through the set_identifier() + * method. + */ + virtual size_t commit() = 0; +}; + +/** + * This interface is used to import cell format records for direct cell + * formats, named cell style formats, and differential cell formats. + * + * The following cell format types: + * <ul> + * <li>font</li> + * <li>fill</li> + * <li>border</li> + * <li>protection</li> + * <li>number format</li> + * </ul> + * use indices to reference their records in their respective record pools. + * + * The horizontal and vertical alignments are specified directly. + */ +class ORCUS_DLLPUBLIC import_xf +{ +public: + virtual ~import_xf(); + + /** + * Set the index of the font record, as returned from the + * import_font_style::commit() method. + * + * @param index index of the font record to reference. + */ + virtual void set_font(size_t index) = 0; + + /** + * Set the index of the fill record, as returned from the + * import_fill_style::commit() method. + * + * @param index index of the fill record to reference. + */ + virtual void set_fill(size_t index) = 0; + + /** + * Set the index of the border record, as returned from the + * import_border_style::commit() method. + * + * @param index index of the border record to reference. + */ + virtual void set_border(size_t index) = 0; + + /** + * Set the index of the cell protection record, as returned from the + * import_cell_protection::commit() method. 
+ * + * @param index index of the cell protection record to reference. + */ + virtual void set_protection(size_t index) = 0; + + /** + * Set the index of the number format record, as returned from the + * import_number_format::commit() method. + * + * @param index index of the number format record to reference. + */ + virtual void set_number_format(size_t index) = 0; + + /** + * Set the index into the cell style record to specify a named cell style it + * uses as its base format in case the cell has an underlying style applied. + * This can be used for a direct cell format i.e. when the xf category is + * xf_category_t::cell or for a cell style format i.e. the xf category is + * xf_category_t::cell_style. In a cell style format, this can be used to + * reference a parent style. + * + * @param index index into the cell style record it uses as its basis. + */ + virtual void set_style_xf(size_t index) = 0; + + /** + * Set the flag indicating whether or not to apply the alignment attribute. + * + * @param b flag indicating whether or not to apply the alignment attribute. + * + * @note This is specific to Excel format. + */ + virtual void set_apply_alignment(bool b) = 0; + + /** + * Set the horizontal alignment of a style. + * + * @param align horizontal alignment of a style. + */ + virtual void set_horizontal_alignment(hor_alignment_t align) = 0; + + /** + * Set the vertical alignment of a style. + * + * @param align vertical alignment of a style. + */ + virtual void set_vertical_alignment(ver_alignment_t align) = 0; + + /** + * Specify whether or not to wrap text when the text spills over the cell + * region. + * + * @param b whether or not to wrap text when the text spills over the cell + * region. + */ + virtual void set_wrap_text(bool b) = 0; + + /** + * Specify whether or not to shrink the text within cell until it fits + * inside the cell. + * + * @param b whether or not to shrink the text. 
+ */ + virtual void set_shrink_to_fit(bool b) = 0; + + /** + * Commit the cell format in the current buffer to the storage. + * + * @return index of the cell format data in the storage. This index may be + * passed to the import_cell_style::set_xf() method. + */ + virtual size_t commit() = 0; +}; + +/** + * This interface is used to import named cell style records. + * + * @note The actual cell format data for named cell styles are imported + * through import_xf, and this interface references its index through + * the import_cell_style::set_xf() method. + * + */ +class ORCUS_DLLPUBLIC import_cell_style +{ +public: + virtual ~import_cell_style(); + + /** + * Set the name associated with the named cell style. + * + * @param s name of the named cell style. + */ + virtual void set_name(std::string_view s) = 0; + + /** + * Set the name associated with the named cell style intended for display + * purposes. + * + * @param s name to use for display purposes. + * + * @note Not all supported formats make use of this property. Also, the + * style may not always have this property even if the format supports + * it. ODF uses this property when the original name contains + * characters that cannot be used in internal symbols. + */ + virtual void set_display_name(std::string_view s) = 0; + + /** + * Set the index into the cell format record. The named cell style applies + * the format referenced by this index. + * + * @param index index into the cell format record. + */ + virtual void set_xf(size_t index) = 0; + + /** + * Set the index into the built-in cell style record. + * + * @note This is Excel-specific, and unclear whether it's useful outside of + * Excel's implementation. Built-in styles are not stored in file, and + * Excel likely has its own internal styles stored in the application + * itself. + * + * @param index index into the built-in cell style record. 
+ */ + virtual void set_builtin(size_t index) = 0; + + /** + * Set the name of the parent cell style it uses as its basis. + * + * @note ODF uses this but Excel does not use this value. + * + * @param s name of the parent cell style. + */ + virtual void set_parent_name(std::string_view s) = 0; + + /** + * Commit the cell style format in the current buffer to the storage. + * + * @note This method does @em not return an index. + */ + virtual void commit() = 0; +}; + +}}} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/import_interface_view.hpp b/include/orcus/spreadsheet/import_interface_view.hpp new file mode 100644 index 0000000..8e6b53e --- /dev/null +++ b/include/orcus/spreadsheet/import_interface_view.hpp @@ -0,0 +1,78 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef IMPORT_ORCUS_SPREADSHEET_IMPORT_INTERFACE_VIEW_HPP +#define IMPORT_ORCUS_SPREADSHEET_IMPORT_INTERFACE_VIEW_HPP + +#include <cstdlib> + +#include "view_types.hpp" +#include "../types.hpp" +#include "../env.hpp" + +namespace orcus { namespace spreadsheet { namespace iface { + +/** + * Interface for importing view properties. This interface may be obtained + * from the import_sheet interface. + */ +class ORCUS_DLLPUBLIC import_sheet_view +{ +public: + virtual ~import_sheet_view(); + + /** + * Set the current sheet as the active sheet. + */ + virtual void set_sheet_active() = 0; + + /** + * Set the information about split view in the current sheet. + * + * @param hor_split horizontal position of the split in 1/20th of a point, + * or 0 if none. "Horizontal" in this case indicates the + * column direction. + * @param ver_split vertical position of the split in 1/20th of a point, + * or 0 if none. 
"Vertical" in this case indicates the + * row direction. + * @param top_left_cell the top left visible cell in the bottom right + * pane. + * @param active_pane active pane in this sheet. + */ + virtual void set_split_pane( + double hor_split, double ver_split, const address_t& top_left_cell, + sheet_pane_t active_pane) = 0; + + /** + * Set the state of frozen view in the current sheet. + * + * @param visible_columns number of visible columns in the left pane. + * @param visible_rows number of visible rows in the top pane. + * @param top_left_cell the top left visible cell in the bottom right + * pane. + * @param active_pane active pane in this sheet. + */ + virtual void set_frozen_pane( + col_t visible_columns, row_t visible_rows, const address_t& top_left_cell, + sheet_pane_t active_pane) = 0; + + /** + * Set the selected cursor range in a specified sheet pane. + * + * @param pane sheet pane associated with the selection. The top-left + * pane is used for a non-split sheet view. + * @param range selected cursor range. The range will be 1 column by 1 + * row when the cursor is on a single cell only. + */ + virtual void set_selected_range(sheet_pane_t pane, range_t range) = 0; +}; + +}}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/pivot.hpp b/include/orcus/spreadsheet/pivot.hpp new file mode 100644 index 0000000..dee2559 --- /dev/null +++ b/include/orcus/spreadsheet/pivot.hpp @@ -0,0 +1,254 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_PIVOT_HPP +#define INCLUDED_ORCUS_SPREADSHEET_PIVOT_HPP + +#include "../env.hpp" +#include "../types.hpp" +#include "types.hpp" + +#include <memory> +#include <vector> +#include <limits> +#include <variant> +#include <optional> + +namespace ixion { + +struct abs_range_t; + +} + +namespace orcus { + +class string_pool; + +namespace spreadsheet { + +class document; + +using pivot_cache_indices_t = std::vector<size_t>; + +struct ORCUS_SPM_DLLPUBLIC pivot_cache_record_value_t +{ + using value_type = std::variant<bool, double, std::size_t, std::string_view, date_time_t>; + + enum class record_type + { + unknown = 0, + boolean, + date_time, + character, + numeric, + blank, + error, + shared_item_index + }; + + record_type type; + value_type value; + + pivot_cache_record_value_t(); + pivot_cache_record_value_t(std::string_view s); + pivot_cache_record_value_t(double v); + pivot_cache_record_value_t(size_t index); + + bool operator== (const pivot_cache_record_value_t& other) const; + bool operator!= (const pivot_cache_record_value_t& other) const; +}; + +using pivot_cache_record_t = std::vector<pivot_cache_record_value_t>; + +struct ORCUS_SPM_DLLPUBLIC pivot_cache_item_t +{ + using value_type = std::variant<bool, double, std::string_view, date_time_t, error_value_t>; + + enum class item_type + { + unknown = 0, boolean, date_time, character, numeric, blank, error + }; + + item_type type; + value_type value; + + pivot_cache_item_t(); + pivot_cache_item_t(std::string_view s); + pivot_cache_item_t(double numeric); + pivot_cache_item_t(bool boolean); + pivot_cache_item_t(const date_time_t& date_time); + pivot_cache_item_t(error_value_t error); + + pivot_cache_item_t(const pivot_cache_item_t& other); + pivot_cache_item_t(pivot_cache_item_t&& other); + + bool operator< (const pivot_cache_item_t& other) const; + bool operator== (const pivot_cache_item_t& other) const; + + pivot_cache_item_t& operator= (pivot_cache_item_t other); + + void 
swap(pivot_cache_item_t& other); +}; + +using pivot_cache_items_t = std::vector<pivot_cache_item_t>; + +/** + * Group data for a pivot cache field. + */ +struct ORCUS_SPM_DLLPUBLIC pivot_cache_group_data_t +{ + struct ORCUS_SPM_DLLPUBLIC range_grouping_type + { + pivot_cache_group_by_t group_by = pivot_cache_group_by_t::range; + + bool auto_start = true; + bool auto_end = true; + + double start = 0.0; + double end = 0.0; + double interval = 1.0; + + date_time_t start_date; + date_time_t end_date; + + range_grouping_type() = default; + range_grouping_type(const range_grouping_type& other) = default; + }; + + /** + * Mapping of base field member indices to the group field item indices. + */ + pivot_cache_indices_t base_to_group_indices; + + std::optional<range_grouping_type> range_grouping; + + /** + * Individual items comprising the group. + */ + pivot_cache_items_t items; + + /** 0-based index of the base field. */ + size_t base_field; + + pivot_cache_group_data_t(size_t _base_field); + pivot_cache_group_data_t(const pivot_cache_group_data_t& other); + pivot_cache_group_data_t(pivot_cache_group_data_t&& other); + + pivot_cache_group_data_t() = delete; +}; + +struct ORCUS_SPM_DLLPUBLIC pivot_cache_field_t +{ + /** + * Field name. It must be interned with the string pool belonging to the + * document. 
+ */ + std::string_view name; + + pivot_cache_items_t items; + + std::optional<double> min_value; + std::optional<double> max_value; + + std::optional<date_time_t> min_date; + std::optional<date_time_t> max_date; + + std::unique_ptr<pivot_cache_group_data_t> group_data; + + pivot_cache_field_t(); + pivot_cache_field_t(std::string_view _name); + pivot_cache_field_t(const pivot_cache_field_t& other); + pivot_cache_field_t(pivot_cache_field_t&& other); +}; + +class ORCUS_SPM_DLLPUBLIC pivot_cache +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + using fields_type = std::vector<pivot_cache_field_t>; + using records_type = std::vector<pivot_cache_record_t>; + + pivot_cache(pivot_cache_id_t cache_id, string_pool& sp); + ~pivot_cache(); + + /** + * Bulk-insert all the fields in one step. Note that this will replace any + * pre-existing fields if any. + * + * @param fields field instances to move into storage. + */ + void insert_fields(fields_type fields); + + void insert_records(records_type record); + + size_t get_field_count() const; + + /** + * Retrieve a field data by its index. + * + * @param index index of the field to retrieve. + * + * @return pointer to the field instance, or nullptr if the index is + * out-of-range. + */ + const pivot_cache_field_t* get_field(size_t index) const; + + pivot_cache_id_t get_id() const; + + const records_type& get_all_records() const; +}; + +class ORCUS_SPM_DLLPUBLIC pivot_collection +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + pivot_collection(document& doc); + ~pivot_collection(); + + /** + * Insert a new pivot cache associated with a worksheet source. + * + * @param sheet_name name of the sheet where the source data is. + * @param range range of the source data. Note that the sheet indices are + * not used. + * @param cache pivot cache instance to store. 
+ */ + void insert_worksheet_cache( + std::string_view sheet_name, const ixion::abs_range_t& range, std::unique_ptr<pivot_cache>&& cache); + + /** + * Insert a new pivot cache associated with a table name. + * + * @param table_name source table name. + * @param cache pivot cache instance to store. + */ + void insert_worksheet_cache(std::string_view table_name, std::unique_ptr<pivot_cache>&& cache); + + /** + * Count the number of pivot caches currently stored. + * + * @return number of pivot caches currently stored in the document. + */ + size_t get_cache_count() const; + + const pivot_cache* get_cache( + std::string_view sheet_name, const ixion::abs_range_t& range) const; + + pivot_cache* get_cache(pivot_cache_id_t cache_id); + + const pivot_cache* get_cache(pivot_cache_id_t cache_id) const; +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/shared_strings.hpp b/include/orcus/spreadsheet/shared_strings.hpp new file mode 100644 index 0000000..d447cb3 --- /dev/null +++ b/include/orcus/spreadsheet/shared_strings.hpp @@ -0,0 +1,77 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_SHARED_STRINGS_HPP +#define INCLUDED_ORCUS_SPREADSHEET_SHARED_STRINGS_HPP + +#include "document_types.hpp" + +#include <vector> +#include <memory> +#include <string> + +namespace ixion { class model_context; } + +namespace orcus { + +namespace spreadsheet { + +/** + * This class manages access to a pool of shared string instances for both + * unformatted strings and rich-text strings. 
The underlying string values + * themselves are stored externally in the `ixion::model_context` instance + * which this class references; this class itself only stores the format + * properties of the rich-text strings. + */ +class ORCUS_SPM_DLLPUBLIC shared_strings +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + shared_strings() = delete; + shared_strings(const shared_strings&) = delete; + shared_strings& operator=(const shared_strings&) = delete; + + shared_strings(ixion::model_context& cxt); + ~shared_strings(); + + /** + * Set the entire format runs of a string. + * + * @param sindex index of the string to associate the format runs with. + * @param runs format runs. + */ + void set_format_runs(std::size_t sindex, std::unique_ptr<format_runs_t> runs); + + /** + * Get the entire format runs of a string. + * + * @param index index of the string to get the format runs of. + * + * @return pointer to the format runs, or @p nullptr if no format runs exist + * for the specified string index. + */ + const format_runs_t* get_format_runs(std::size_t index) const; + + /** + * Get an underlying string value associated with an index. + * + * @param index index of a string value. + * + * @return pointer to a string value associated with the index, or @p + * nullptr in case of an invalid string index. + */ + const std::string* get_string(std::size_t index) const; + + void dump(std::ostream& os) const; +}; + +}} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/sheet.hpp b/include/orcus/spreadsheet/sheet.hpp new file mode 100644 index 0000000..2ea6392 --- /dev/null +++ b/include/orcus/spreadsheet/sheet.hpp @@ -0,0 +1,150 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_ODSTABLE_HPP +#define INCLUDED_ORCUS_SPREADSHEET_ODSTABLE_HPP + +#include "../env.hpp" +#include "types.hpp" + +#include <ostream> +#include <memory> + +#include <ixion/address.hpp> +#include <ixion/formula_tokens.hpp> +#include <ixion/formula_result.hpp> + +namespace orcus { + +struct date_time_t; + +namespace spreadsheet { + +class document; +struct auto_filter_t; + +namespace detail { + +struct sheet_impl; + +} + +/** + * This class represents a single sheet instance in the internal document + * model. + */ +class ORCUS_SPM_DLLPUBLIC sheet +{ + friend class document; + friend struct detail::sheet_impl; + + static const row_t max_row_limit; + static const col_t max_col_limit; + +public: + sheet(document& doc, sheet_t sheet_index); + ~sheet() noexcept; + + void set_auto(row_t row, col_t col, std::string_view s); + void set_string(row_t row, col_t col, string_id_t sindex); + void set_value(row_t row, col_t col, double value); + void set_bool(row_t row, col_t col, bool value); + void set_date_time(row_t row, col_t col, int year, int month, int day, int hour, int minute, double second); + void set_format(row_t row, col_t col, size_t index); + void set_format(row_t row_start, col_t col_start, row_t row_end, col_t col_end, size_t index); + void set_column_format(col_t col, col_t col_span, std::size_t index); + void set_row_format(row_t row, std::size_t index); + + void set_formula(row_t row, col_t col, const ixion::formula_tokens_store_ptr_t& tokens); + void set_formula(row_t row, col_t col, const ixion::formula_tokens_store_ptr_t& tokens, ixion::formula_result result); + void set_grouped_formula(const range_t& range, ixion::formula_tokens_t tokens); + void set_grouped_formula(const range_t& range, ixion::formula_tokens_t tokens, ixion::formula_result result); + + void set_col_width(col_t col, col_t col_span, col_width_t width); + + /** + * Get column width in twips. 
+ * + * @param col column index + * @param col_start pointer to a variable to store the index of the starting + * column of the range with the same width. Pass nullptr if + * the caller doesn't need this information. + * @param col_end pointer to a variable to store the index of the ending + * column plus one, of the range with the same width. Pass + * nullptr if the caller doesn't need this information. + * + * @return width of the specified column index (in twips). + */ + col_width_t get_col_width(col_t col, col_t* col_start, col_t* col_end) const; + + void set_col_hidden(col_t col, col_t col_span, bool hidden); + bool is_col_hidden(col_t col, col_t* col_start, col_t* col_end) const; + + void set_row_height(row_t row, row_height_t height); + row_height_t get_row_height(row_t row, row_t* row_start, row_t* row_end) const; + + void set_row_hidden(row_t row, bool hidden); + bool is_row_hidden(row_t row, row_t* row_start, row_t* row_end) const; + + void set_merge_cell_range(const range_t& range); + + void fill_down_cells(row_t src_row, col_t src_col, row_t range_size); + + /** + * Return the size of a merged cell range. + * + * @param row row position of the upper-left cell. + * @param col column position of the upper-left cell. + * + * @return merged cell range. + */ + range_t get_merge_cell_range(row_t row, col_t col) const; + + size_t get_string_identifier(row_t row, col_t col) const; + + auto_filter_t* get_auto_filter_data(); + const auto_filter_t* get_auto_filter_data() const; + void set_auto_filter_data(auto_filter_t* p); + + // Sheet dimension methods + + /** + * Return the smallest range that contains all non-empty cells in this + * sheet. The top-left corner of the returned range is always column 0 and + * row 0. + * + * @return smallest range that contains all non-empty cells. 
+ */ + ixion::abs_range_t get_data_range() const; + + sheet_t get_index() const; + + date_time_t get_date_time(row_t row, col_t col) const; + + void dump_flat(std::ostream& os) const; + void dump_check(std::ostream& os, std::string_view sheet_name) const; + void dump_html(std::ostream& os) const; + void dump_json(std::ostream& os) const; + void dump_csv(std::ostream& os) const; + + void dump_debug_state(const std::string& output_dir, std::string_view sheet_name) const; + + /** + * Get the cell format ID of specified cell. + */ + size_t get_cell_format(row_t row, col_t col) const; + +private: + void finalize_import(); + + std::unique_ptr<detail::sheet_impl> mp_impl; +}; + +}} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/styles.hpp b/include/orcus/spreadsheet/styles.hpp new file mode 100644 index 0000000..5458b1f --- /dev/null +++ b/include/orcus/spreadsheet/styles.hpp @@ -0,0 +1,268 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_STYLES_HPP +#define INCLUDED_ORCUS_SPREADSHEET_STYLES_HPP + +#include "../env.hpp" +#include "../measurement.hpp" +#include "document_types.hpp" + +#include <memory> +#include <string_view> +#include <optional> + +namespace orcus { namespace spreadsheet { + +class document; + +/** + * Font style record. 
+ */ +struct ORCUS_SPM_DLLPUBLIC font_t +{ + std::optional<std::string_view> name; + std::optional<std::string_view> name_asian; + std::optional<std::string_view> name_complex; + std::optional<double> size; + std::optional<double> size_asian; + std::optional<double> size_complex; + std::optional<bool> bold; + std::optional<bool> bold_asian; + std::optional<bool> bold_complex; + std::optional<bool> italic; + std::optional<bool> italic_asian; + std::optional<bool> italic_complex; + std::optional<underline_t> underline_style; + std::optional<underline_width_t> underline_width; + std::optional<underline_mode_t> underline_mode; + std::optional<underline_type_t> underline_type; + std::optional<color_t> underline_color; + std::optional<color_t> color; + std::optional<strikethrough_style_t> strikethrough_style; + std::optional<strikethrough_width_t> strikethrough_width; + std::optional<strikethrough_type_t> strikethrough_type; + std::optional<strikethrough_text_t> strikethrough_text; + + font_t(); + font_t(const font_t& other); + ~font_t(); + + font_t& operator=(const font_t& other); + + bool operator==(const font_t& other) const; + bool operator!=(const font_t& other) const; + + void reset(); + + struct ORCUS_SPM_DLLPUBLIC hash + { + std::size_t operator()(const font_t& v) const; + }; +}; + +/** + * Fill style record. + */ +struct ORCUS_SPM_DLLPUBLIC fill_t +{ + std::optional<fill_pattern_t> pattern_type; + std::optional<color_t> fg_color; + std::optional<color_t> bg_color; + + fill_t(); + void reset(); +}; + +/** + * Attributes for a single border. + */ +struct ORCUS_SPM_DLLPUBLIC border_attrs_t +{ + std::optional<border_style_t> style; + std::optional<color_t> border_color; + std::optional<length_t> border_width; + + border_attrs_t(); + void reset(); +}; + +/** + * Style record for the borders of a single cell. 
+ */ +struct ORCUS_SPM_DLLPUBLIC border_t +{ + border_attrs_t top; + border_attrs_t bottom; + border_attrs_t left; + border_attrs_t right; + border_attrs_t diagonal; + border_attrs_t diagonal_bl_tr; + border_attrs_t diagonal_tl_br; + + border_t(); + void reset(); +}; + +/** + * Style record for cell protection attributes. + */ +struct ORCUS_SPM_DLLPUBLIC protection_t +{ + std::optional<bool> locked; + std::optional<bool> hidden; + std::optional<bool> print_content; + std::optional<bool> formula_hidden; + + protection_t(); + void reset(); +}; + +/** + * Style record for a number format. + */ +struct ORCUS_SPM_DLLPUBLIC number_format_t +{ + std::optional<std::size_t> identifier; + std::optional<std::string_view> format_string; + + number_format_t(); + void reset(); + + bool operator== (const number_format_t& other) const noexcept; + bool operator!= (const number_format_t& other) const noexcept; +}; + +/** + * Format attributes for a single cell. It references the format entries via + * integer indices, with some exceptions. + */ +struct ORCUS_SPM_DLLPUBLIC cell_format_t +{ + /** ID of a font style record. */ + std::size_t font; + /** ID of a fill style record. */ + std::size_t fill; + /** ID of a border style record. */ + std::size_t border; + /** ID for a cell protection record. */ + std::size_t protection; + /** ID for a number format record. */ + std::size_t number_format; + /** ID for a parent named style. */ + std::size_t style_xf; + /** Horizontal alignment of a cell. */ + hor_alignment_t hor_align; + /** Vertical alignment of a cell. */ + ver_alignment_t ver_align; + /** Flag on whether or not wrap text is enabled. */ + std::optional<bool> wrap_text; + /** Flag on whether or not shrink to fit is enabled. 
*/ + std::optional<bool> shrink_to_fit; + bool apply_num_format:1; + bool apply_font:1; + bool apply_fill:1; + bool apply_border:1; + bool apply_alignment:1; + bool apply_protection:1; + + cell_format_t(); + void reset(); +}; + +/** + * Attributes of a named cell style. + * + * Refer to @ref orcus::spreadsheet::iface::import_cell_style for how the data + * members of this struct are used in practice. + */ +struct ORCUS_SPM_DLLPUBLIC cell_style_t +{ + std::string_view name; + std::string_view display_name; + std::size_t xf; + std::size_t builtin; + std::string_view parent_name; + + cell_style_t(); + void reset(); +}; + +ORCUS_SPM_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const color_t& c); + +/** + * Stores various styles records such that they can be referenced via integer + * indices. + */ +class ORCUS_SPM_DLLPUBLIC styles +{ + friend class document; + + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + styles(); + ~styles(); + + void reserve_font_store(size_t n); + std::size_t append_font(const font_t& font); + + void reserve_fill_store(size_t n); + std::size_t append_fill(const fill_t& fill); + + void reserve_border_store(size_t n); + std::size_t append_border(const border_t& border); + + std::size_t append_protection(const protection_t& protection); + + void reserve_number_format_store(size_t n); + std::size_t append_number_format(const number_format_t& nf); + + void reserve_cell_style_format_store(size_t n); + size_t append_cell_style_format(const cell_format_t& cf); + + void reserve_cell_format_store(size_t n); + size_t append_cell_format(const cell_format_t& cf); + + void reserve_diff_cell_format_store(size_t n); + size_t append_diff_cell_format(const cell_format_t& cf); + + void reserve_cell_style_store(size_t n); + void append_cell_style(const cell_style_t& cs); + + const font_t* get_font(size_t index) const; + const fill_t* get_fill(size_t index) const; + const border_t* get_border(size_t index) const; + const protection_t* 
get_protection(size_t index) const; + const number_format_t* get_number_format(size_t index) const; + const cell_format_t* get_cell_format(size_t index) const; + const cell_format_t* get_cell_style_format(size_t index) const; + const cell_format_t* get_dxf_format(size_t index) const; + const cell_style_t* get_cell_style(size_t index) const; + const cell_style_t* get_cell_style_by_xf(size_t xfid) const; + + size_t get_font_count() const; + size_t get_fill_count() const; + size_t get_border_count() const; + size_t get_protection_count() const; + size_t get_number_format_count() const; + size_t get_cell_formats_count() const; + size_t get_cell_style_formats_count() const; + size_t get_dxf_count() const; + size_t get_cell_styles_count() const; + + void clear(); + +private: + void finalize_import(); +}; + +}} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/types.hpp b/include/orcus/spreadsheet/types.hpp new file mode 100644 index 0000000..df7b27e --- /dev/null +++ b/include/orcus/spreadsheet/types.hpp @@ -0,0 +1,751 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef ORCUS_SPREADSHEET_TYPES_HPP +#define ORCUS_SPREADSHEET_TYPES_HPP + +#include "../env.hpp" + +#include <cstdlib> +#include <cstdint> +#include <iosfwd> +#include <initializer_list> +#include <string_view> +#include <vector> + +// NB: This header should only define primitive data types, enums and structs. + +namespace orcus { namespace spreadsheet { + +/** Row ID type. */ +using row_t = int32_t; +/** Column ID type. */ +using col_t = int32_t; +/** Sheet ID type. */ +using sheet_t = int32_t; +/** Individual color element type. */ +using color_elem_t = uint8_t; +/** Type for column width values. 
Column width values are stored in twips. */ +using col_width_t = uint16_t; +/** Type for row height values. Row height values are stored in twips. */ +using row_height_t = uint16_t; +/** Type for string IDs for string cells. */ +using string_id_t = uint32_t; +/** Pivot cache ID type. */ +using pivot_cache_id_t = uint32_t; + +/** + * Get the special column width value that represents the default column + * width. The value itself is not to be used as an actual width value. + * + * @return value that represents the default column width. + */ +ORCUS_DLLPUBLIC col_width_t get_default_column_width(); + +/** + * Get the special row height value that represents the default row height. + * The value itself is not to be used as an actual row height value. + * + * @return value that represents the default row height. + */ +ORCUS_DLLPUBLIC row_height_t get_default_row_height(); + +/** + * Type of error value in cells. + */ +enum class error_value_t +{ + /** + * Error type unknown, typically used as an initial error value or generic + * default value. + */ + unknown = 0, + /** Null reference error, displayed as `#NULL!`. */ + null, + /** Division-by-zero error, displayed as `#DIV/0!`. */ + div0, + /** Formula expression error, displayed as `#VALUE!`. */ + value, + /** Reference error, displayed as `#REF!`. */ + ref, + /** Invalid named-expression error, displayed as `#NAME?`. */ + name, + /** Invalid numeric value error, displayed as `#NUM!`. */ + num, + /** No value is available error, displayed as `#N/A`. */ + na +}; + +/** + * Type of border direction, used to reference the position of a border in a + * cell. + */ +enum class border_direction_t +{ + /** Unknown or uninitialized border direction value. */ + unknown = 0, + /** Top border of a cell. */ + top, + /** Bottom border of a cell. */ + bottom, + /** Left border of a cell. */ + left, + /** Right border of a cell. */ + right, + /** + * Cross-diagonal borders of a cell.
This is equivalent to both + * @p diagonal_bl_tr and @p diagonal_tl_br combined. + */ + diagonal, + /** Diagonal border of a cell that runs from bottom-left to top-right. */ + diagonal_bl_tr, + /** Diagonal border of a cell that runs from top-left to bottom-right. */ + diagonal_tl_br +}; + +/** + * Type of border style. + */ +enum class border_style_t +{ + unknown = 0, + none, + solid, + dash_dot, + dash_dot_dot, + dashed, + dotted, + double_border, + hair, + medium, + medium_dash_dot, + medium_dash_dot_dot, + medium_dashed, + slant_dash_dot, + thick, + thin, + double_thin, + fine_dashed +}; + +/** + * Type of fill pattern for cell background. + */ +enum class fill_pattern_t +{ + none = 0, + solid, + dark_down, + dark_gray, + dark_grid, + dark_horizontal, + dark_trellis, + dark_up, + dark_vertical, + gray_0625, + gray_125, + light_down, + light_gray, + light_grid, + light_horizontal, + light_trellis, + light_up, + light_vertical, + medium_gray +}; + +/** + * Strikethrough style as applied to a cell value. + * + * @note This is specific to ODS format. + */ +enum class strikethrough_style_t +{ + none = 0, + solid, + dash, + dot_dash, + dot_dot_dash, + dotted, + long_dash, + wave +}; + +/** + * Strikethrough type as applied to a cell value. + * + * @note This is specific to ODS format. + */ +enum class strikethrough_type_t +{ + unknown = 0, + none, + single_type, + double_type +}; + +/** + * Width of strikethrough applied to a cell value. + * + * @note This is specific to ODS format. + */ +enum class strikethrough_width_t +{ + unknown = 0, + width_auto, + thin, + medium, + thick, + bold +}; + +/** + * Text used for strike-through. + * + * @note This is specific to ODS format. + */ +enum class strikethrough_text_t +{ + unknown = 0, + /** `/` is used as the text. */ + slash, + /** `X` is used as the text. */ + cross +}; + +/** + * Type that specifies the grammar of a formula expression. Each grammar + * may exhibit a different set of syntax rules.
+ */ +enum class formula_grammar_t +{ + /** Grammar type is either unknown or unspecified. */ + unknown = 0, + /** Grammar used by the Excel 2003 XML (aka XML Spreadsheet) format. */ + xls_xml, + /** Grammar used by the Office Open XML spreadsheet format. */ + xlsx, + /** Grammar used by the OpenDocument Spreadsheet format. */ + ods, + /** Grammar used by the Gnumeric XML format. */ + gnumeric +}; + +/** + * Type of formula expression. + */ +enum class formula_t +{ + /** Formula expression type unknown, or generic default value. */ + unknown = 0, + /** Formula expression in an array of cells. */ + array, + /** Formula expression in a data table. */ + data_table, + /** Formula expression in a normal formula cell. */ + normal, + /** Formula expression in a shared formula cell. */ + shared +}; + +/** + * Formula reference context specifies the location where a formula + * expression is used. This is used mainly for those document formats that + * make use of multiple formula reference syntaxes, such as ODS. + */ +enum class formula_ref_context_t +{ + /** + * Default context, that is, the context that is NOT any of the other + * contexts specified below. + */ + global = 0, + + /** Base cell position of either a named range or expression. */ + named_expression_base, + + /** + * Named range is a special case of named expression where the expression + * consists of only one range token. + */ + named_range, +}; + +/** + * Type of policy on how to handle a formula cell with an erroneous expression + * that has been parsed unsuccessfully. + */ +enum class formula_error_policy_t +{ + unknown, + /** Loading of the document will be halted. */ + fail, + /** The error cell will be skipped. */ + skip +}; + +/** + * Underline type for a cell value. + */ +enum class underline_t +{ + /** Underline is absent. */ + none = 0, + /** Underline consists of a single line. */ + single_line, + /** + * Single line for accounting format. + * + * @note This is unique to xlsx format. 
+ */ + single_accounting, + /** Underline consists of a double line. */ + double_line, + /** + * Double line for accounting format. + * + * @note This is unique to xlsx format. + */ + double_accounting, + /** Underline is dotted. */ + dotted, + /** Underline is dashed. */ + dash, + /** Underline consists of repeated long dash segments. */ + long_dash, + /** Underline consists of repeated dot and dash segments. */ + dot_dash, + /** Underline consists of repeated dot, dot and dash segments. */ + dot_dot_dash, + /** Underline is waved. */ + wave +}; + +/** + * Underline width types, specific to ODF. When the enum value is either + * percent, positive_integer, or positive_length, the actual value should be + * given separately. + * + * @note The automatic enum value corresponds with the "auto" text value, + * which could not be used since it's a keyword in C++. + */ +enum class underline_width_t +{ + none = 0, + automatic, + bold, + dash, + medium, + thick, + thin, + percent, + positive_integer, + positive_length +}; + +/** + * Underline mode that determines whether an underline is applied to both + * words and spaces, or words only. + * + * @note This is specific to ODS format. + */ +enum class underline_mode_t +{ + /** Underline is applied to both words and spaces. */ + continuous = 0, + /** Underline is applied only to words. */ + skip_white_space +}; + +/** + * Whether a single line or a double line is used as an underline. + * + * @todo Perhaps we should merge this with underline_t. + */ +enum class underline_type_t +{ + none = 0, + /** A single line is used as an underline. */ + single_type, + /** A double line is used as an underline. */ + double_type +}; + +/** + * Type of horizontal alignment applied to a cell content. + */ +enum class hor_alignment_t +{ + unknown = 0, + left, + center, + right, + justified, + distributed, + filled +}; + +/** + * Type of vertical alignment applied to a cell content. 
+ */ +enum class ver_alignment_t +{ + unknown = 0, + top, + middle, + bottom, + justified, + distributed +}; + +/** + * Cell format categories. The abbreviation "xf" stands for "cell format" + * where the "x" is short for cell. + */ +enum class xf_category_t +{ + unknown, + /** Direct cell format, also often referenced as xf. */ + cell, + /** Cell format for named styles. */ + cell_style, + /** Incremental cell format, also referenced as dxf. */ + differential, +}; + +/** + * Type of data table. A data table can be either of a single-variable + * column, a single-variable row, or a double-variable type that uses both + * column and row input cells. + */ +enum class data_table_type_t +{ + column, + row, + both +}; + +/** + * Function type used in the totals row of a table. + */ +enum class totals_row_function_t +{ + none = 0, + sum, + minimum, + maximum, + average, + count, + count_numbers, + standard_deviation, + variance, + custom +}; + +/** + * Type of conditional format. + */ +enum class conditional_format_t +{ + unknown = 0, + condition, + date, + formula, + colorscale, + databar, + iconset +}; + +/** + * Operator type associated with a conditional format rule. + */ +enum class condition_operator_t +{ + unknown = 0, + equal, + less, + greater, + greater_equal, + less_equal, + not_equal, + between, + not_between, + duplicate, + unique, + top_n, + bottom_n, + above_average, + below_average, + above_equal_average, + below_equal_average, + contains_error, + contains_no_error, + begins_with, + ends_with, + contains, + contains_blanks, + not_contains, + expression +}; + +/** + * Type of a condition in a conditional format rule. This is applicable only + * when the type of a conditional format entry is either: + * + * @li @p colorscale, + * @li @p databar or + * @li @p iconset. 
+ */ +enum class condition_type_t +{ + unknown = 0, + value, + automatic, + max, + min, + formula, + percent, + percentile +}; + +/** + * Type of a date condition when the type of a conditional format entry is + * @p date. + */ +enum class condition_date_t +{ + unknown = 0, + today, + yesterday, + tomorrow, + last_7_days, + this_week, + next_week, + last_week, + this_month, + next_month, + last_month, + this_year, + next_year, + last_year, +}; + +/** + * Databar axis type, applicable only when the type of a conditional format + * entry is @p databar. + */ +enum class databar_axis_t +{ + none = 0, + middle, + automatic +}; + +/** + * Type of range grouping in a group field of a pivot table cache. + */ +enum class pivot_cache_group_by_t +{ + /** + * Type of range grouping is unknown. + * + * This is an implicit default value of this type. + */ + unknown = 0, + /** Grouping on "days" for date values. */ + days, + /** Grouping on "hours" for date values. */ + hours, + /** Grouping on "minutes" for date values. */ + minutes, + /** Grouping on "months" for date values. */ + months, + /** Grouping on "quarters" for date values. */ + quarters, + /** Grouping by numeric ranges for numeric values. */ + range, + /** Grouping on "seconds" for date values. */ + seconds, + /** Grouping on "years" for date values. */ + years +}; + +/** + * Stores a 2-dimensional cell address. + */ +struct address_t +{ + row_t row; + col_t column; +}; + +/** + * Stores the size of a range of a spreadsheet. + */ +struct range_size_t +{ + row_t rows; + col_t columns; +}; + +/** + * Stores a 2-dimensional cell range by storing the positions of the top-left + * and bottom-right corners of the range. + */ +struct range_t +{ + address_t first; + address_t last; +}; + +/** + * Stores 3-dimensional cell address. The 'src' abbreviation stands for + * sheet-row-column. + */ +struct src_address_t +{ + sheet_t sheet; + row_t row; + col_t column; +}; + +/** + * Stores 3-dimensional cell range address. 
The 'src' abbreviation stands for + * sheet-row-column. + */ +struct src_range_t +{ + src_address_t first; + src_address_t last; +}; + +/** + * Convert a 3-dimensional cell address to a 2-dimensional counterpart by + * dropping the sheet index. + */ +ORCUS_DLLPUBLIC address_t to_rc_address(const src_address_t& r); + +/** + * Convert a 3-dimensional cell range address to a 2-dimensional counterpart + * by dropping the sheet indices. + */ +ORCUS_DLLPUBLIC range_t to_rc_range(const src_range_t& r); + +ORCUS_DLLPUBLIC bool operator== (const address_t& left, const address_t& right); +ORCUS_DLLPUBLIC bool operator!= (const address_t& left, const address_t& right); + +ORCUS_DLLPUBLIC bool operator== (const src_address_t& left, const src_address_t& right); +ORCUS_DLLPUBLIC bool operator!= (const src_address_t& left, const src_address_t& right); + +ORCUS_DLLPUBLIC bool operator== (const range_t& left, const range_t& right); +ORCUS_DLLPUBLIC bool operator!= (const range_t& left, const range_t& right); + +ORCUS_DLLPUBLIC bool operator== (const src_range_t& left, const src_range_t& right); +ORCUS_DLLPUBLIC bool operator!= (const src_range_t& left, const src_range_t& right); + +ORCUS_DLLPUBLIC bool operator< (const range_t& left, const range_t& right); +ORCUS_DLLPUBLIC bool operator> (const range_t& left, const range_t& right); + +ORCUS_DLLPUBLIC range_t& operator+= (range_t& left, const address_t& right); +ORCUS_DLLPUBLIC range_t& operator-= (range_t& left, const address_t& right); + +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const address_t& v); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const src_address_t& v); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const range_t& v); + +/** + * Stores a color value in RGB format. + */ +struct color_rgb_t +{ + color_elem_t red; + color_elem_t green; + color_elem_t blue; +}; + +/** + * Convert a string representation of a totals row function name to its + * equivalent enum value. 
+ * + * @param s string value for totals row function name. + * + * @return enum value representing the totals row function. + */ +ORCUS_DLLPUBLIC totals_row_function_t to_totals_row_function_enum(std::string_view s); + +/** + * Convert a string representation of a pivot cache group-by type to its + * equivalent enum value. + * + * @param s string value for pivot cache group-by type. + * + * @return enum value representing the pivot cache group-by type. + */ +ORCUS_DLLPUBLIC pivot_cache_group_by_t to_pivot_cache_group_by_enum(std::string_view s); + +/** + * Convert a string representation of an error value to its equivalent enum + * value. + * + * @param s error value string. + * + * @return enum value representing the error value. + */ +ORCUS_DLLPUBLIC error_value_t to_error_value_enum(std::string_view s); + +/** + * Convert a string representation of an RGB value to an equivalent struct + * value. The string representation is expected to be a 6-digit hexadecimal + * value string that may or may not be prefixed with a '#'. + * + * @param s string representation of the RGB value. + * + * @return struct value representing an RGB value. + */ +ORCUS_DLLPUBLIC color_rgb_t to_color_rgb(std::string_view s); + +/** + * Convert a color name to an RGB value. It supports SVG 1.0 color keyword + * names minus those gray colors with 'grey' spelling variants. Note that + * the name must be all in lowercase. + * + * @param s color name. + * + * @return struct value representing an RGB value. + */ +ORCUS_DLLPUBLIC color_rgb_t to_color_rgb_from_name(std::string_view s); + +/** + * Convert a formula error policy name to its enum value equivalent. + * + * @param s policy name. + * + * @return enum value equivalent for the original error policy name. 
+ */ +ORCUS_DLLPUBLIC formula_error_policy_t to_formula_error_policy(std::string_view s); + +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, error_value_t ev); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, border_style_t border); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, formula_grammar_t grammar); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, underline_t uline); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, underline_width_t ulwidth); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, underline_mode_t ulmode); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, underline_type_t ultype); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, hor_alignment_t halign); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, ver_alignment_t valign); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const color_rgb_t& color); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const fill_pattern_t& fill); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const strikethrough_style_t& ss); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const strikethrough_type_t& st); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const strikethrough_width_t& sw); +ORCUS_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const strikethrough_text_t& st); + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/view.hpp b/include/orcus/spreadsheet/view.hpp new file mode 100644 index 0000000..7b5552f --- /dev/null +++ b/include/orcus/spreadsheet/view.hpp @@ -0,0 +1,65 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_SPREADSHEET_VIEW_HPP +#define INCLUDED_ORCUS_SPREADSHEET_VIEW_HPP + +#include "orcus/env.hpp" +#include "orcus/spreadsheet/types.hpp" +#include "orcus/spreadsheet/view_types.hpp" + +#include <memory> + +namespace orcus { namespace spreadsheet { + +class sheet_view; +class document; + +class ORCUS_SPM_DLLPUBLIC view +{ + struct impl; + std::unique_ptr<impl> mp_impl; +public: + view(document& doc); + ~view(); + + sheet_view* get_or_create_sheet_view(sheet_t sheet); + const sheet_view* get_sheet_view(sheet_t sheet) const; + + void set_active_sheet(sheet_t sheet); + sheet_t get_active_sheet() const; +}; + +class ORCUS_SPM_DLLPUBLIC sheet_view +{ + struct impl; + std::unique_ptr<impl> mp_impl; +public: + sheet_view(view& doc_view); + ~sheet_view(); + + const range_t& get_selection(sheet_pane_t pos) const; + + void set_selection(sheet_pane_t pos, const range_t& range); + + void set_active_pane(sheet_pane_t pos); + sheet_pane_t get_active_pane() const; + + void set_split_pane(double hor_split, double ver_split, const address_t& top_left_cell); + const split_pane_t& get_split_pane() const; + + void set_frozen_pane(col_t visible_cols, row_t visible_rows, const address_t& top_left_cell); + const frozen_pane_t& get_frozen_pane() const; + + view& get_document_view(); +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/spreadsheet/view_types.hpp b/include/orcus/spreadsheet/view_types.hpp new file mode 100644 index 0000000..ae6e728 --- /dev/null +++ b/include/orcus/spreadsheet/view_types.hpp @@ -0,0 +1,95 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef ORCUS_SPREADSHEET_VIEW_TYPES_HPP +#define ORCUS_SPREADSHEET_VIEW_TYPES_HPP + +#include "orcus/spreadsheet/types.hpp" + +namespace orcus { namespace spreadsheet { + +/** + * Sheet pane position in a split sheet view. When the sheet is split, it is + * split into four panes. + */ +enum class sheet_pane_t : uint8_t +{ + unspecified = 0, + /** Top-left pane. */ + top_left, + /** Top-right pane. */ + top_right, + /** Bottom-left pane. */ + bottom_left, + /** Bottom-right pane. */ + bottom_right +}; + +/** + * State of a split pane - whether it's frozen, split, or both. + */ +enum class pane_state_t : uint8_t +{ + /** The state of the pane is not specified. */ + unspecified = 0, + /** The pane is frozen. */ + frozen, + /** The pane is split. */ + split, + /** The pane is both frozen and split. */ + frozen_split +}; + +/** + * Store information about the state of a split sheet view. + */ +struct split_pane_t +{ + /** + * Horizontal distance to the vertical split bar in 1/20th of a point, or + * 0 if not horizontally split. + */ + double hor_split; + + /** + * Vertical distance to the horizontal split bar in 1/20th of a point, or + * 0 if not vertically split. + */ + double ver_split; + + /** + * Top-left visible cell of the bottom-right pane. This value is valid + * only when either the horizontal distance or the vertical distance is + * non-zero. + */ + address_t top_left_cell; +}; + +/** + * Store the state of a frozen sheet view. + */ +struct frozen_pane_t +{ + /** + * The number of visible columns in the top-left pane. + */ + col_t visible_columns; + /** + * The number of visible rows in the top-left pane. + */ + row_t visible_rows; + /** + * The position of the top-left cell in the bottom-right pane. 
+ */ + address_t top_left_cell; +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/stream.hpp b/include/orcus/stream.hpp new file mode 100644 index 0000000..dd094bb --- /dev/null +++ b/include/orcus/stream.hpp @@ -0,0 +1,188 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_STREAM_HPP +#define INCLUDED_ORCUS_STREAM_HPP + +#include "env.hpp" + +#include <memory> +#include <string> + +namespace orcus { + +/** + * Represents the content of a file. + * + * The file content is memory-mapped initially, but may later become in-memory + * if the non-utf-8 content gets converted to utf-8. + */ +class ORCUS_PSR_DLLPUBLIC file_content +{ + struct impl; + std::unique_ptr<impl> mp_impl; +public: + file_content(const file_content&) = delete; + file_content& operator= (const file_content&) = delete; + + file_content(); + file_content(file_content&& other); + file_content(std::string_view filepath); + ~file_content(); + + /** + * Obtain the memory address to the first character in the content buffer. + * + * @return pointer to the first character in the buffer. + */ + const char* data() const; + + /** + * Return the size of the content i.e. the number of characters in the + * content buffer. + * + * @return size of the content. + */ + size_t size() const; + + /** + * Query whether or not the content is empty. + * + * @return true if the content is empty, otherwise false. + */ + bool empty() const; + + /** + * Swap content with another instance. + * + * @param other another instance to swap content with. + */ + void swap(file_content& other); + + /** + * Load from a new file. This will invalidate the pointer returned from the + * data() method prior to the call. 
+ * + * @param filepath path of the file to load from. + */ + void load(std::string_view filepath); + + /** + * Convert a non-utf-8 stream to a utf-8 one if the source stream contains + * a byte order mark. If not, it does nothing. When the conversion + * happens, the converted content will be stored in-memory. + */ + void convert_to_utf8(); + + std::string_view str() const; +}; + +/** + * Represents the content of an in-memory buffer. Note that this class will + * NOT own the content of the source buffer but simply will reference it, + * except when the original buffer is a non-utf-8 stream and the caller + * chooses to convert it to utf-8 by calling its convert_to_utf8() method. + * + * @note This class is not copy-constructible, but is move-constructible. + */ +class ORCUS_PSR_DLLPUBLIC memory_content +{ + struct impl; + std::unique_ptr<impl> mp_impl; +public: + memory_content(const memory_content&) = delete; + memory_content& operator= (const memory_content&) = delete; + + memory_content(); + memory_content(std::string_view s); + memory_content(memory_content&& other); + ~memory_content(); + + const char* data() const; + size_t size() const; + bool empty() const; + + void swap(memory_content& other); + + /** + * Convert a non-utf-8 stream to a utf-8 one if the source stream contains + * a byte order mark. If not, it does nothing. When the conversion + * happens, the converted content will be owned by the object. + */ + void convert_to_utf8(); + + std::string_view str() const; +}; + +struct ORCUS_PSR_DLLPUBLIC line_with_offset +{ + /** content of the entire line. */ + std::string line; + /** 0-based line number. */ + std::size_t line_number; + /** 0-based offset within the line. 
*/ + std::size_t offset_on_line; + + line_with_offset(std::string _line, std::size_t _line_number, std::size_t _offset_on_line); + line_with_offset(const line_with_offset& other); + line_with_offset(line_with_offset&& other); + ~line_with_offset(); + + bool operator== (const line_with_offset& other) const; + bool operator!= (const line_with_offset& other) const; +}; + +/** + * Generate a sensible error output for parse error including the line where + * the error occurred and the offset of the error position on that line. + * + * @param strm entire character stream where the error occurred. + * @param offset offset of the error position within the stream. + * + * @return string formatted to be usable as an error message for stdout. + */ +ORCUS_PSR_DLLPUBLIC std::string create_parse_error_output(std::string_view strm, std::ptrdiff_t offset); + +/** + * Given a string consisting of multiple lines i.e. multiple line breaks, + * find the line that contains the specified offset position. + * + * @param strm string stream containing multiple lines to search. + * @param offset offset position. + * + * @return structure containing information about the line containing the + * offset position. + * + * @exception std::invalid_argument if the offset value equals or exceeds the + * length of the string stream being searched. + */ +ORCUS_PSR_DLLPUBLIC line_with_offset locate_line_with_offset(std::string_view strm, std::ptrdiff_t offset); + +/** + * Given two strings, locate the position of the first character that is + * different between the two strings. Note that if one of the strings is + * empty (or both of them are empty), it returns 0. + * + * @param left one of the strings to compare. + * @param right one of the strings to compare. + * + * @return position of the first character that is different between the two + * compared strings. 
+ */ +ORCUS_PSR_DLLPUBLIC size_t locate_first_different_char(std::string_view left, std::string_view right); + +/** + * Calculate the logical length of a UTF-8 encoded string. + * + * @param s string to calculate the logical length of. + * @return logical length of the UTF-8 encoded string. + */ +ORCUS_PSR_DLLPUBLIC std::size_t calc_logical_string_length(std::string_view s); + +} // namespace orcus + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/string_pool.hpp b/include/orcus/string_pool.hpp new file mode 100644 index 0000000..12419bc --- /dev/null +++ b/include/orcus/string_pool.hpp @@ -0,0 +1,99 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_STRING_POOL_HPP +#define INCLUDED_ORCUS_STRING_POOL_HPP + +#include "env.hpp" + +#include <string> +#include <memory> +#include <vector> + +namespace orcus { + +/** + * This class implements a shared string pool with the ability to merge with + * other pools. + * + * @note This class is not copy-constructible, but is move-constructible. + */ +class ORCUS_PSR_DLLPUBLIC string_pool +{ +public: + string_pool(const string_pool&) = delete; + string_pool& operator=(const string_pool&) = delete; + + string_pool(); + string_pool(string_pool&& other); + ~string_pool(); + + /** + * Intern a string. + * + * @param str string to intern. + * + * @return pair whose first value is the interned string, and the second + * value specifies whether it is a newly created instance (true) + * or a reuse of an existing instance (false). + */ + std::pair<std::string_view, bool> intern(std::string_view str); + + /** + * Return all interned strings. + * + * @return sequence of all interned strings. The sequence will be sorted. 
+ */ + std::vector<std::string_view> get_interned_strings() const; + + /** + * Dump pool's content to stdout. + * + * @todo This needs to be reworked to make it more generally usable. + */ + void dump() const; + + /** + * Clear pool's content. + */ + void clear(); + + /** + * Query the total number of strings stored in the pool. + * + * @return size_t total number of strings in the pool. + */ + size_t size() const; + + /** + * Swap the content with another string-pool instance. + * + * + * @param other string-pool instance to swap contents with. + */ + void swap(string_pool& other); + + /** + * Merge another string pool instance in. This will not invalidate any + * string references to the other pool. + * + * The other string pool instance will become empty when this call + * returns. + * + * @param other string pool instance to merge in. + */ + void merge(string_pool& other); + +private: + struct impl; + std::unique_ptr<impl> mp_impl; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/threaded_json_parser.hpp b/include/orcus/threaded_json_parser.hpp new file mode 100644 index 0000000..09bddfa --- /dev/null +++ b/include/orcus/threaded_json_parser.hpp @@ -0,0 +1,185 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_THREADED_JSON_PARSER_HPP +#define INCLUDED_ORCUS_THREADED_JSON_PARSER_HPP + +#include "json_parser_thread.hpp" +#include "json_parser_base.hpp" +#include "detail/thread.hpp" + +#include <algorithm> + +namespace orcus { + +template<typename _Handler> +class threaded_json_parser +{ +public: + + typedef _Handler handler_type; + + /** + * Constructor. + * + * @param p pointer to a string stream containing JSON string. 
+ * @param n size of the stream. + * @param hdl handler class instance. + * @param min_token_size minimum size of the internal token buffer. + */ + threaded_json_parser( + const char* p, size_t n, handler_type& hdl, size_t min_token_size); + + /** + * Constructor. + * + * @param p pointer to a string stream containing JSON string. + * @param n size of the stream. + * @param hdl handler class instance. + * @param min_token_size minimum size of the internal token buffer. + * @param max_token_size maximum size of the internal token buffer. + */ + threaded_json_parser( + const char* p, size_t n, handler_type& hdl, size_t min_token_size, + size_t max_token_size); + + /** + * Call this method to start parsing. + */ + void parse(); + + /** + * Get statistics on the parsing session. Call this only after the + * parsing has finished. + * + * @return structure containing statistics of the parsing session. + */ + json::parser_stats get_stats() const; + + void swap_string_pool(string_pool& pool); + +private: + void thread_parse(); + + void process_tokens(json::parse_tokens_t& tokens); + +private: + json::parser_thread m_parser_thread; + handler_type& m_handler; +}; + +template<typename _Handler> +threaded_json_parser<_Handler>::threaded_json_parser( + const char* p, size_t n, handler_type& hdl, size_t min_token_size) : + m_parser_thread(p, n, min_token_size), m_handler(hdl) {} + +template<typename _Handler> +threaded_json_parser<_Handler>::threaded_json_parser( + const char* p, size_t n, handler_type& hdl, size_t min_token_size, size_t max_token_size) : + m_parser_thread(p, n, min_token_size, max_token_size), m_handler(hdl) {} + +template<typename _Handler> +void threaded_json_parser<_Handler>::parse() +{ + std::thread t(&threaded_json_parser::thread_parse, this); + detail::thread::scoped_guard guard(std::move(t)); + + json::parse_tokens_t tokens; + + while (m_parser_thread.next_tokens(tokens)) + process_tokens(tokens); + + process_tokens(tokens); +} + +template<typename 
_Handler> +json::parser_stats threaded_json_parser<_Handler>::get_stats() const +{ + return m_parser_thread.get_stats(); +} + +template<typename _Handler> +void threaded_json_parser<_Handler>::swap_string_pool(string_pool& pool) +{ + m_parser_thread.swap_string_pool(pool); +} + +template<typename _Handler> +void threaded_json_parser<_Handler>::thread_parse() +{ + // Start parsing. + m_parser_thread.start(); +} + +/** + * Forward each token in the batch, in order, to the matching handler + * callback. A parse_error token is converted into a parse_error exception, + * and an unknown token type raises a general_error. + */ +template<typename _Handler> +void threaded_json_parser<_Handler>::process_tokens(json::parse_tokens_t& tokens) +{ + for (const json::parse_token& token : tokens) + { + switch (token.type) + { + case json::parse_token_t::begin_array: + m_handler.begin_array(); + break; + case json::parse_token_t::begin_object: + m_handler.begin_object(); + break; + case json::parse_token_t::begin_parse: + m_handler.begin_parse(); + break; + case json::parse_token_t::boolean_false: + m_handler.boolean_false(); + break; + case json::parse_token_t::boolean_true: + m_handler.boolean_true(); + break; + case json::parse_token_t::end_array: + m_handler.end_array(); + break; + case json::parse_token_t::end_object: + m_handler.end_object(); + break; + case json::parse_token_t::end_parse: + m_handler.end_parse(); + break; + case json::parse_token_t::null: + m_handler.null(); + break; + case json::parse_token_t::number: + m_handler.number(std::get<double>(token.value)); + break; + case json::parse_token_t::object_key: + { + auto key = std::get<std::string_view>(token.value); + m_handler.object_key(key.data(), key.size(), false); + break; + } + case json::parse_token_t::string: + { + auto str = std::get<std::string_view>(token.value); + m_handler.string(str.data(), str.size(), false); + break; + } + case json::parse_token_t::parse_error: + { + auto err = std::get<parse_error_value_t>(token.value); + throw parse_error(std::string{err.str}, err.offset); + } + case json::parse_token_t::unknown: + default: + throw general_error("unknown token type encountered."); + } + } +} + +} + 
+#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/threaded_sax_token_parser.hpp b/include/orcus/threaded_sax_token_parser.hpp new file mode 100644 index 0000000..aa9019f --- /dev/null +++ b/include/orcus/threaded_sax_token_parser.hpp @@ -0,0 +1,165 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_THREADED_SAX_TOKEN_PARSER_HPP +#define INCLUDED_ORCUS_THREADED_SAX_TOKEN_PARSER_HPP + +#include "tokens.hpp" +#include "xml_namespace.hpp" +#include "sax_token_parser_thread.hpp" +#include "sax_parser_base.hpp" +#include "exception.hpp" +#include "detail/thread.hpp" + +#include <thread> + +namespace orcus { + +class xmlns_context; +class string_pool; + +template<typename _Handler> +class threaded_sax_token_parser +{ +public: + + typedef _Handler handler_type; + + /** + * Constructor. + * + * @param p pointer to a string stream containing XML content. + * @param n size of the stream. + * @param tks XML token map instance. + * @param ns_cxt namespace context instance. + * @param hdl handler class instance. + * @param min_token_size minimum size of the internal token buffer. + */ + threaded_sax_token_parser( + const char* p, size_t n, const tokens& tks, xmlns_context& ns_cxt, + handler_type& hdl, size_t min_token_size); + + /** + * Constructor. + * + * @param p pointer to a string stream containing XML content. + * @param n size of the stream. + * @param tks XML token map instance. + * @param ns_cxt namespace context instance. + * @param hdl handler class instance. + * @param min_token_size minimum size of the internal token buffer. + * @param max_token_size maximum size of the internal token buffer. 
+ */ + threaded_sax_token_parser( + const char* p, size_t n, const tokens& tks, xmlns_context& ns_cxt, + handler_type& hdl, size_t min_token_size, size_t max_token_size); + + /** + * Call this method to start parsing. + */ + void parse(); + + void swap_string_pool(string_pool& pool); + +private: + void thread_parse(); + + void process_tokens(const sax::parse_tokens_t& tokens); + +private: + sax::parser_thread m_parser_thread; + handler_type& m_handler; +}; + +template<typename _Handler> +threaded_sax_token_parser<_Handler>::threaded_sax_token_parser( + const char* p, size_t n, const tokens& tks, xmlns_context& ns_cxt, + handler_type& hdl, size_t min_token_size) : + m_parser_thread(p, n, tks, ns_cxt, min_token_size), m_handler(hdl) {} + +template<typename _Handler> +threaded_sax_token_parser<_Handler>::threaded_sax_token_parser( + const char* p, size_t n, const tokens& tks, xmlns_context& ns_cxt, handler_type& hdl, + size_t min_token_size, size_t max_token_size) : + m_parser_thread(p, n, tks, ns_cxt, min_token_size, max_token_size), m_handler(hdl) {} + +template<typename _Handler> +void threaded_sax_token_parser<_Handler>::parse() +{ + std::thread t(&threaded_sax_token_parser::thread_parse, this); + detail::thread::scoped_guard guard(std::move(t)); + + sax::parse_tokens_t tokens; + + try + { + while (m_parser_thread.next_tokens(tokens)) + process_tokens(tokens); + + process_tokens(tokens); + } + catch (const std::exception&) + { + m_parser_thread.abort(); + throw; + } +} + +template<typename _Handler> +void threaded_sax_token_parser<_Handler>::swap_string_pool(string_pool& pool) +{ + m_parser_thread.swap_string_pool(pool); +} + +template<typename _Handler> +void threaded_sax_token_parser<_Handler>::thread_parse() +{ + // Start parsing. 
+ m_parser_thread.start(); +} + +template<typename _Handler> +void threaded_sax_token_parser<_Handler>::process_tokens(const sax::parse_tokens_t& tks) +{ + for (const sax::parse_token& t : tks) + { + switch (t.type) + { + case sax::parse_token_t::start_element: + { + const auto* elem = std::get<const xml_token_element_t*>(t.value); + m_handler.start_element(*elem); + break; + } + case sax::parse_token_t::end_element: + { + const auto* elem = std::get<const xml_token_element_t*>(t.value); + m_handler.end_element(*elem); + break; + } + case sax::parse_token_t::characters: + { + auto s = std::get<std::string_view>(t.value); + m_handler.characters(s, false); + break; + } + case sax::parse_token_t::parse_error: + { + auto v = std::get<parse_error_value_t>(t.value); + throw malformed_xml_error(std::string{v.str}, v.offset); + } + default: + throw general_error("unknown token type encountered."); + } + } +} + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/tokens.hpp b/include/orcus/tokens.hpp new file mode 100644 index 0000000..9edc877 --- /dev/null +++ b/include/orcus/tokens.hpp @@ -0,0 +1,74 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_TOKENS_HPP +#define INCLUDED_ORCUS_TOKENS_HPP + +#include "types.hpp" + +#include <algorithm> +#include <unordered_map> + +namespace orcus { + +/** + * XML token store that provides mapping between integral token identifiers and + * their original names. Instances of this class are typically used as global + * constants. + * + * @note The string values for the original token names should be static + * values whose values and memory addresses remain unchanged during the + * life cycle of the instance that references them. 
+ * + * @note This class is not copy-constructible. + */ +class ORCUS_PSR_DLLPUBLIC tokens +{ +public: + tokens() = delete; + tokens(const tokens&) = delete; + tokens(const char** token_names, size_t token_name_count); + ~tokens(); + + /** + * Check if a token returned from get_token() method is valid. + * + * @return true if valid, false otherwise. + */ + bool is_valid_token(xml_token_t token) const; + + /** + * Get token from a specified name. + * + * @param name textual token name + * + * @return token value representing the given textual token. + */ + xml_token_t get_token(std::string_view name) const; + + /** + * Get textual token name from a token value. + * + * @param token numeric token value + * + * @return textual token name, or empty string in case the given token is + * not valid. + */ + std::string_view get_token_name(xml_token_t token) const; + +private: + using token_map_type = std::unordered_map<std::string_view, xml_token_t>; + + token_map_type m_tokens; + const char** m_token_names; + size_t m_token_name_count; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/types.hpp b/include/orcus/types.hpp new file mode 100644 index 0000000..34c968a --- /dev/null +++ b/include/orcus/types.hpp @@ -0,0 +1,634 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_TYPES_HPP +#define INCLUDED_ORCUS_TYPES_HPP + +#include <cstdint> +#include <cstdlib> +#include <vector> +#include <string> +#include <unordered_set> +#include "env.hpp" + +namespace orcus { + +class xmlns_context; +class xmlns_repository; + +/** + * Integral type that represents a tokenized XML element name. 
+ */ +using xml_token_t = std::size_t; + +/** + * Type that represents a normalized XML namespace identifier. Internally it + * is a pointer value that points to a static char buffer that stores a + * namespace name. + */ +using xmlns_id_t = const char*; + +/** + * Parser token that represents the state of a parse error, used by + * threaded_json_parser and threaded_sax_token_parser when transferring + * parse status between threads. + */ +struct ORCUS_PSR_DLLPUBLIC parse_error_value_t +{ + /** error message associated with the parse error. */ + std::string_view str; + /** offset in stream where the error occurred. */ + std::ptrdiff_t offset; + + parse_error_value_t(); + parse_error_value_t(const parse_error_value_t& other); + parse_error_value_t(std::string_view _str, std::ptrdiff_t _offset); + + parse_error_value_t& operator=(const parse_error_value_t& other); + + bool operator==(const parse_error_value_t& other) const; + bool operator!=(const parse_error_value_t& other) const; +}; + +/** + * Represents a name with a normalized namespace in XML documents. This can + * be used either as an element name or as an attribute name. + */ +struct ORCUS_PSR_DLLPUBLIC xml_name_t +{ + enum to_string_type { use_alias, use_short_name }; + + xmlns_id_t ns; + std::string_view name; + + xml_name_t() noexcept; + xml_name_t(xmlns_id_t _ns, std::string_view _name); + xml_name_t(const xml_name_t& other); + + xml_name_t& operator= (const xml_name_t& other); + + bool operator== (const xml_name_t& other) const noexcept; + bool operator!= (const xml_name_t& other) const noexcept; + + /** + * Convert a namespace-name value pair to a string representation with the + * namespace value converted to either an alias or a unique "short name". + * Refer to @link xmlns_context::get_alias() get_alias() @endlink and + * @link xmlns_context::get_short_name() get_short_name() @endlink + * for the explanations of an alias and short name. 
+ * + * @param cxt namespace context object associated with the XML stream + * currently being parsed. + * @param type policy on how to convert a namespace identifier to a string + * representation. + * + * @return string representation of a namespace-name value pair. + */ + std::string to_string(const xmlns_context& cxt, to_string_type type) const; + + /** + * Convert a namespace-name value pair to a string representation with the + * namespace value converted to a unique "short name". Refer to @link + * xmlns_repository::get_short_name() get_short_name() @endlink for the + * explanations of a short name. + * + * @param repo namespace repository. + * + * @return string representation of a namespace-name value pair. + */ + std::string to_string(const xmlns_repository& repo) const; +}; + +/** + * Struct containing properties of a tokenized XML attribute. + */ +struct ORCUS_PSR_DLLPUBLIC xml_token_attr_t +{ + xmlns_id_t ns; + xml_token_t name; + std::string_view raw_name; + std::string_view value; + + /** + * Whether or not the attribute value is transient. A transient value is + * only guaranteed to be valid until the end of the start_element call, + * after which its validity is not guaranteed. A non-transient value is + * guaranteed to be valid during the life cycle of the xml stream it + * belongs to. + */ + bool transient; + + xml_token_attr_t(); + xml_token_attr_t(const xml_token_attr_t& other); + xml_token_attr_t( + xmlns_id_t _ns, xml_token_t _name, std::string_view _value, bool _transient); + xml_token_attr_t( + xmlns_id_t _ns, xml_token_t _name, std::string_view _raw_name, + std::string_view _value, bool _transient); + + xml_token_attr_t& operator=(const xml_token_attr_t& other); +}; + +using xml_token_attrs_t = std::vector<xml_token_attr_t>; + +/** + * Struct containing XML element properties passed to the handler of + * sax_token_parser via its @p start_element() and @p end_element() + * calls. 
+ * + * @see + * @li sax_token_handler::start_element + * @li sax_token_handler::end_element + */ +struct ORCUS_PSR_DLLPUBLIC xml_token_element_t +{ + xmlns_id_t ns; + xml_token_t name; + std::string_view raw_name; + xml_token_attrs_t attrs; + + xml_token_element_t& operator= (xml_token_element_t) = delete; + + xml_token_element_t(); + xml_token_element_t(xmlns_id_t _ns, xml_token_t _name, std::string_view _raw_name, std::vector<xml_token_attr_t>&& _attrs); + xml_token_element_t(const xml_token_element_t& other); + xml_token_element_t(xml_token_element_t&& other); +}; + +/** + * Character set types, generated from IANA character-sets specifications. + * + * @see https://www.iana.org/assignments/character-sets/character-sets.xhtml + */ +enum class character_set_t +{ + unspecified = 0, + adobe_standard_encoding, + adobe_symbol_encoding, + amiga_1251, + ansi_x3_110_1983, + asmo_449, + big5, + big5_hkscs, + bocu_1, + brf, + bs_4730, + bs_viewdata, + cesu_8, + cp50220, + cp51932, + csa_z243_4_1985_1, + csa_z243_4_1985_2, + csa_z243_4_1985_gr, + csn_369103, + dec_mcs, + din_66003, + dk_us, + ds_2089, + ebcdic_at_de, + ebcdic_at_de_a, + ebcdic_ca_fr, + ebcdic_dk_no, + ebcdic_dk_no_a, + ebcdic_es, + ebcdic_es_a, + ebcdic_es_s, + ebcdic_fi_se, + ebcdic_fi_se_a, + ebcdic_fr, + ebcdic_it, + ebcdic_pt, + ebcdic_uk, + ebcdic_us, + ecma_cyrillic, + es, + es2, + euc_jp, + euc_kr, + extended_unix_code_fixed_width_for_japanese, + gb18030, + gb2312, + gb_1988_80, + gb_2312_80, + gbk, + gost_19768_74, + greek7, + greek7_old, + greek_ccitt, + hp_desktop, + hp_legal, + hp_math8, + hp_pi_font, + hp_roman8, + hz_gb_2312, + ibm00858, + ibm00924, + ibm01140, + ibm01141, + ibm01142, + ibm01143, + ibm01144, + ibm01145, + ibm01146, + ibm01147, + ibm01148, + ibm01149, + ibm037, + ibm038, + ibm1026, + ibm1047, + ibm273, + ibm274, + ibm275, + ibm277, + ibm278, + ibm280, + ibm281, + ibm284, + ibm285, + ibm290, + ibm297, + ibm420, + ibm423, + ibm424, + ibm437, + ibm500, + ibm775, + ibm850, + 
ibm851, + ibm852, + ibm855, + ibm857, + ibm860, + ibm861, + ibm862, + ibm863, + ibm864, + ibm865, + ibm866, + ibm868, + ibm869, + ibm870, + ibm871, + ibm880, + ibm891, + ibm903, + ibm904, + ibm905, + ibm918, + ibm_symbols, + ibm_thai, + iec_p27_1, + inis, + inis_8, + inis_cyrillic, + invariant, + iso_10367_box, + iso_10646_j_1, + iso_10646_ucs_2, + iso_10646_ucs_4, + iso_10646_ucs_basic, + iso_10646_unicode_latin1, + iso_10646_utf_1, + iso_11548_1, + iso_2022_cn, + iso_2022_cn_ext, + iso_2022_jp, + iso_2022_jp_2, + iso_2022_kr, + iso_2033_1983, + iso_5427, + iso_5427_1981, + iso_5428_1980, + iso_646_basic_1983, + iso_646_irv_1983, + iso_6937_2_25, + iso_6937_2_add, + iso_8859_1, + iso_8859_10, + iso_8859_13, + iso_8859_14, + iso_8859_15, + iso_8859_16, + iso_8859_1_windows_3_0_latin_1, + iso_8859_1_windows_3_1_latin_1, + iso_8859_2, + iso_8859_2_windows_latin_2, + iso_8859_3, + iso_8859_4, + iso_8859_5, + iso_8859_6, + iso_8859_6_e, + iso_8859_6_i, + iso_8859_7, + iso_8859_8, + iso_8859_8_e, + iso_8859_8_i, + iso_8859_9, + iso_8859_9_windows_latin_5, + iso_8859_supp, + iso_ir_90, + iso_unicode_ibm_1261, + iso_unicode_ibm_1264, + iso_unicode_ibm_1265, + iso_unicode_ibm_1268, + iso_unicode_ibm_1276, + it, + jis_c6220_1969_jp, + jis_c6220_1969_ro, + jis_c6226_1978, + jis_c6226_1983, + jis_c6229_1984_a, + jis_c6229_1984_b, + jis_c6229_1984_b_add, + jis_c6229_1984_hand, + jis_c6229_1984_hand_add, + jis_c6229_1984_kana, + jis_encoding, + jis_x0201, + jis_x0212_1990, + jus_i_b1_002, + jus_i_b1_003_mac, + jus_i_b1_003_serb, + koi7_switched, + koi8_r, + koi8_u, + ks_c_5601_1987, + ksc5636, + kz_1048, + latin_greek, + latin_greek_1, + latin_lap, + macintosh, + microsoft_publishing, + mnem, + mnemonic, + msz_7795_3, + nats_dano, + nats_dano_add, + nats_sefi, + nats_sefi_add, + nc_nc00_10_81, + nf_z_62_010, + nf_z_62_010_1973, + ns_4551_1, + ns_4551_2, + osd_ebcdic_df03_irv, + osd_ebcdic_df04_1, + osd_ebcdic_df04_15, + pc8_danish_norwegian, + pc8_turkish, + pt, + pt2, + 
ptcp154, + scsu, + sen_850200_b, + sen_850200_c, + shift_jis, + t_101_g2, + t_61_7bit, + t_61_8bit, + tis_620, + tscii, + unicode_1_1, + unicode_1_1_utf_7, + unknown_8bit, + us_ascii, + us_dk, + utf_16, + utf_16be, + utf_16le, + utf_32, + utf_32be, + utf_32le, + utf_7, + utf_7_imap, + utf_8, + ventura_international, + ventura_math, + ventura_us, + videotex_suppl, + viqr, + viscii, + windows_1250, + windows_1251, + windows_1252, + windows_1253, + windows_1254, + windows_1255, + windows_1256, + windows_1257, + windows_1258, + windows_31j, + windows_874, +}; + +/** + * Struct holding XML declaration properties. + */ +struct ORCUS_PSR_DLLPUBLIC xml_declaration_t +{ + uint8_t version_major; + uint8_t version_minor; + character_set_t encoding; + bool standalone; + + xml_declaration_t(); + xml_declaration_t(uint8_t _version_major, uint8_t _version_minor, character_set_t _encoding, bool _standalone); + xml_declaration_t(const xml_declaration_t& other); + ~xml_declaration_t(); + + xml_declaration_t& operator= (const xml_declaration_t& other); + + bool operator== (const xml_declaration_t& other) const; + bool operator!= (const xml_declaration_t& other) const; +}; + +/** + * Unit of length, as used in length_t. + */ +enum class length_unit_t +{ + unknown = 0, + centimeter, + millimeter, + /** + * Special unit of length used by Excel, defined as the maximum digit width + * of font used as the "Normal" style font. + * + * @note Since it's not possible to determine the actual length using this + * unit, it is approximated by 1.9 millimeters. + */ + xlsx_column_digit, + inch, + point, + /** One twip is a twentieth of a point equal to 1/1440 of an inch. */ + twip, + pixel +}; + +/** + * Input formats that orcus can import. + */ +enum class format_t +{ + unknown = 0, + ods, + xlsx, + gnumeric, + xls_xml, + csv, + parquet +}; + +/** + * Formats supported by orcus as output formats. 
+ */ +enum class dump_format_t +{ + unknown = 0, + none, + check, + csv, + flat, + html, + json, + xml, + yaml, + debug_state +}; + +/** + * Holds a length value with unit of measurement. + */ +struct ORCUS_PSR_DLLPUBLIC length_t +{ + length_unit_t unit; + double value; + + length_t(); + length_t(length_unit_t _unit, double _value); + length_t(const length_t& other); + length_t& operator= (const length_t& other); + + std::string to_string() const; + + bool operator== (const length_t& other) const noexcept; + bool operator!= (const length_t& other) const noexcept; +}; + +/** + * Struct that holds a date or date-time value. + */ +struct ORCUS_PSR_DLLPUBLIC date_time_t +{ + int year; + int month; + int day; + int hour; + int minute; + double second; + + date_time_t(); + date_time_t(int _year, int _month, int _day); + date_time_t(int _year, int _month, int _day, int _hour, int _minute, double _second); + date_time_t(const date_time_t& other); + ~date_time_t(); + + date_time_t& operator= (date_time_t other); + + bool operator== (const date_time_t& other) const; + bool operator!= (const date_time_t& other) const; + bool operator< (const date_time_t& other) const; + + /** + * Convert the date-time value to an ISO-formatted string representation. + * + * @return ISO-formatted string representation of the date-time value. + */ + std::string to_string() const; + + /** + * Swap the value with another instance. + * + * @param other another instance to swap values with. + */ + void swap(date_time_t& other); + + /** + * Parse an ISO-formatted string representation of a date-time value, and + * convert it into a date_time_t value. A string representation allows + * either a date only or a date and time value, but it does not allow a time + * only value. 
+ * + * Here are some examples of ISO-formatted date and date-time values: + * + * @li <b>2013-04-09</b> (date only) + * @li <b>2013-04-09T21:34:09.55</b> (date and time) + * + * @param str string representation of a date-time value. + * @return converted date-time value consisting of a set of numeric values. + */ + static date_time_t from_chars(std::string_view str); +}; + +/** + * Parse a string that represents an output format type and convert it to a + * corresponding enum value. + * + * @param s string representing an output format type. + * + * @return enum value representing an output format type, or + * dump_format_t::unknown in case it cannot be + * determined. + */ +ORCUS_PSR_DLLPUBLIC dump_format_t to_dump_format_enum(std::string_view s); + +/** + * Parse a string that represents a character set and convert it to a + * corresponding enum value. + * + * @param s string representing a character set. + * + * @return enum value representing a character set, or + * character_set_t::unspecified in case it cannot be + * determined. + */ +ORCUS_PSR_DLLPUBLIC character_set_t to_character_set(std::string_view s); + +/** + * Get a list of available output format entries. Each entry consists of the + * name of a format and its enum value equivalent. + * + * @return list of available output format entries. + */ +ORCUS_PSR_DLLPUBLIC std::vector<std::pair<std::string_view, dump_format_t>> get_dump_format_entries(); + +ORCUS_PSR_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const length_t& v); +ORCUS_PSR_DLLPUBLIC std::ostream& operator<< (std::ostream& os, const date_time_t& v); +ORCUS_PSR_DLLPUBLIC std::ostream& operator<< (std::ostream& os, format_t v); + +/** + * Generic constant to be used to indicate that a valid index value is + * expected but not found. + */ +ORCUS_PSR_DLLPUBLIC extern const std::size_t INDEX_NOT_FOUND; + +/** + * Value associated with an unknown XML namespace. 
+ */ +ORCUS_PSR_DLLPUBLIC extern const xmlns_id_t XMLNS_UNKNOWN_ID; + +/** + * Value associated with an unknown XML token. + */ +ORCUS_PSR_DLLPUBLIC extern const xml_token_t XML_UNKNOWN_TOKEN; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/xml_namespace.hpp b/include/orcus/xml_namespace.hpp new file mode 100644 index 0000000..cf9b270 --- /dev/null +++ b/include/orcus/xml_namespace.hpp @@ -0,0 +1,195 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_XML_NAMESPACE_MANAGER_HPP +#define INCLUDED_ORCUS_XML_NAMESPACE_MANAGER_HPP + +#include "types.hpp" + +#include <ostream> +#include <memory> + +namespace orcus { + +class xmlns_context; +struct xmlns_repository_impl; +struct xmlns_context_impl; + +/** + * Central XML namespace repository that stores all namespaces that are used + * in the current session. + * + * @warning this class is not copyable, but is movable; however, the + * moved-from object will not be usable after the move. + */ +class ORCUS_PSR_DLLPUBLIC xmlns_repository +{ + friend class xmlns_context; + + struct impl; + std::unique_ptr<impl> mp_impl; + + xmlns_id_t intern(std::string_view uri); + + size_t get_index(xmlns_id_t ns_id) const; + +public: + xmlns_repository(const xmlns_repository&) = delete; + xmlns_repository& operator= (const xmlns_repository&) = delete; + + xmlns_repository(); + xmlns_repository(xmlns_repository&& other); + ~xmlns_repository(); + + xmlns_repository& operator= (xmlns_repository&&); + + /** + * Add a set of predefined namespace values to the repository. + * + * @param predefined_ns predefined set of namespace values. This is a + * null-terminated array of xmlns_id_t. 
This + * xmlns_repository instance will assume that the + * instances of these xmlns_id_t values will be + * available throughout its life cycle; caller needs + * to ensure that they won't get deleted before the + * corresponding xmlns_repository instance is + * deleted. + */ + void add_predefined_values(const xmlns_id_t* predefined_ns); + + /** + * Create a context object associated with this namespace repository. + * + * @warning Since this context object references values stored in the repo, + * make sure that it will not out-live the repository object + * itself. + * + * @return context object to use for a new XML stream. + */ + xmlns_context create_context(); + + /** + * Get XML namespace identifier from its numerical index. + * + * @param index numeric index of namespace. + * + * @return valid namespace identifier, or XMLNS_UNKNOWN_ID if not found. + */ + xmlns_id_t get_identifier(size_t index) const; + + /** + * See xmlns_context::get_short_name() for the explanation of this method, + * which works identically to it. + */ + std::string get_short_name(xmlns_id_t ns_id) const; +}; + +/** + * XML namespace context. A new context should be used for each xml stream + * since the namespace keys themselves are not interned. Don't hold an + * instance of this class any longer than the life cycle of the xml stream + * it is used in. + * + * An empty key value i.e. `""` is associated with a default namespace. + */ +class ORCUS_PSR_DLLPUBLIC xmlns_context +{ + friend class xmlns_repository; + + struct impl; + std::unique_ptr<impl> mp_impl; + + xmlns_context(xmlns_repository& repo); +public: + xmlns_context(); + xmlns_context(xmlns_context&&); + xmlns_context(const xmlns_context& r); + ~xmlns_context(); + + xmlns_context& operator= (const xmlns_context& r); + xmlns_context& operator= (xmlns_context&& r); + + /** + * Push a new namespace alias-value pair to the stack. + * + * @param alias namespace alias to push onto the stack. 
If the same alias + * is already present, this overwrites it until it gets popped + * off the stack. + * @param uri namespace name to associate with the alias. + * + * @return normalized namespace identifier for the namespace name. + */ + xmlns_id_t push(std::string_view alias, std::string_view uri); + + /** + * Pop a namespace alias from the stack. + * + * @param alias namespace alias to pop from the stack. + */ + void pop(std::string_view alias); + + /** + * Get the current namespace identifier for a specified namespace alias. + * + * @param alias namespace alias to get the current namespace identifier for. + * + * @return current namespace identifier associated with the alias. + */ + xmlns_id_t get(std::string_view alias) const; + + /** + * Get a unique index value associated with a specified identifier. An + * index value is guaranteed to be unique regardless of contexts. + * + * @param ns_id a namespace identifier to obtain index for. + * + * @return index value associated with the identifier. + */ + size_t get_index(xmlns_id_t ns_id) const; + + /** + * Get a 'short' name associated with a specified identifier. A short + * name is a string value conveniently short enough for display purposes, + * but still guaranteed to be unique to the identifier it is associated + * with. + * + * @note The xmlns_repository class has method of the same name, and that + * method works identically to this method. + * + * @param ns_id a namespace identifier to obtain short name for. + * + * @return short name for the specified identifier. + */ + std::string get_short_name(xmlns_id_t ns_id) const; + + /** + * Get an alias currently associated with a given namespace identifier. + * + * @param ns_id namespace identifier. + * + * @return alias name currently associated with the given namespace + * identifier, or an empty string if the given namespace is + * currently not associated with any aliases. 
+ */ + std::string_view get_alias(xmlns_id_t ns_id) const; + + std::vector<xmlns_id_t> get_all_namespaces() const; + + void dump(std::ostream& os) const; + + /** + * Dump the internal state for debugging in YAML format. + */ + void dump_state(std::ostream& os) const; + + void swap(xmlns_context& other) noexcept; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/xml_structure_tree.hpp b/include/orcus/xml_structure_tree.hpp new file mode 100644 index 0000000..423ede4 --- /dev/null +++ b/include/orcus/xml_structure_tree.hpp @@ -0,0 +1,198 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_XML_STRUCTURE_TREE_HPP +#define INCLUDED_ORCUS_XML_STRUCTURE_TREE_HPP + +#include "env.hpp" +#include "types.hpp" + +#include <ostream> +#include <memory> +#include <functional> + +namespace orcus { + +class xmlns_context; + +struct ORCUS_DLLPUBLIC xml_table_range_t +{ + std::vector<std::string> paths; + std::vector<std::string> row_groups; + + xml_table_range_t(); + ~xml_table_range_t(); +}; + +/** + * Tree representing the structure of elements in XML content. Recurring + * elements under the same parent are represented by a single element + * instance. This tree only includes elements; no attributes and content + * nodes appear in this tree. 
+ */ +class ORCUS_DLLPUBLIC xml_structure_tree +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + xml_structure_tree() = delete; + xml_structure_tree(const xml_structure_tree&) = delete; + xml_structure_tree& operator= (const xml_structure_tree&) = delete; + + struct ORCUS_DLLPUBLIC entity_name + { + xmlns_id_t ns; + std::string_view name; + + entity_name(); + entity_name(xmlns_id_t _ns, std::string_view _name); + + bool operator< (const entity_name& r) const; + bool operator== (const entity_name& r) const; + + struct ORCUS_DLLPUBLIC hash + { + size_t operator ()(const entity_name& val) const; + }; + }; + + typedef std::vector<entity_name> entity_names_type; + + struct ORCUS_DLLPUBLIC element + { + entity_name name; + bool repeat; + bool has_content; + + element(); + element(const entity_name& _name, bool _repeat, bool _has_content); + }; + + struct walker_impl; + + /** + * This class allows client to traverse the tree. + */ + class ORCUS_DLLPUBLIC walker + { + friend class xml_structure_tree; + + std::unique_ptr<walker_impl> mp_impl; + + walker(const xml_structure_tree::impl& parent_impl); + public: + walker() = delete; + walker(const walker& r); + ~walker(); + walker& operator= (const walker& r); + + /** + * Set current position to the root element, and return the root + * element. + * + * @return root element. + */ + element root(); + + /** + * Descend into a specified child element. + * + * @param name name of a child element. + * + * @return child element + * + * @throw general_error if no child elements exist for the specified + * name. + */ + element descend(const entity_name& name); + + /** + * Move up to the parent element. + */ + element ascend(); + + /** + * Move to the element specified by a path expression. The path + * expression may be generated by + * <code>xml_structure_tree::walker::get_path</code>. + * + * @param path a simple XPath like expression + * + * @return element pointed to by the path. 
+ */ + element move_to(const std::string& path); + + /** + * Get a list of names of all child elements at the current element + * position. The list of names is in order of appearance. + * + * @return list of child element names in order of appearance. + */ + entity_names_type get_children(); + + /** + * Get a list of names of all attributes that belong to current + * element. The list of names is in order of appearance. + * + * @return list of attribute names in order of appearance. + */ + entity_names_type get_attributes(); + + /** + * Get a numerical, 0-based index of given XML namespace. + * + * @param ns XML namespace ID. + * + * @return numeric, 0-based index of XML namespace if found, or + * <code>xml_structure_tree::walker::index_not_found</code> if + * the namespace is not found in this structure. + */ + size_t get_xmlns_index(xmlns_id_t ns) const; + + std::string get_xmlns_short_name(xmlns_id_t ns) const; + + /** + * Convert an entity name to its proper string representation. + * + * @param name entity name to convert to string. + * + * @return string representation of the entity name, including the + * namespace. + */ + std::string to_string(const entity_name& name) const; + + /** + * Get a XPath like ID for the element inside of the XML tree. 
+ * + */ + std::string get_path() const; + }; + + xml_structure_tree(xmlns_context& xmlns_cxt); + xml_structure_tree(xml_structure_tree&& other); + ~xml_structure_tree(); + + void parse(std::string_view s); + + void dump_compact(std::ostream& os) const; + + walker get_walker() const; + + using range_handler_type = std::function<void(xml_table_range_t&&)>; + + void process_ranges(range_handler_type rh) const; + + void swap(xml_structure_tree& other); +}; + +} + + + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/xml_writer.hpp b/include/orcus/xml_writer.hpp new file mode 100644 index 0000000..b55485c --- /dev/null +++ b/include/orcus/xml_writer.hpp @@ -0,0 +1,122 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_XML_WRITER_HPP +#define INCLUDED_ORCUS_XML_WRITER_HPP + +#include "orcus/types.hpp" + +#include <memory> + +namespace orcus { + +class xmlns_repository; + +/** + * This class lets you produce XML contents from scratch. It writes its + * content to any object supporting the std::ostream interface. 
+ */ +class ORCUS_PSR_DLLPUBLIC xml_writer +{ + struct impl; + std::unique_ptr<impl> mp_impl; + + void close_current_element(); + void pop_elements(); + +public: + class ORCUS_PSR_DLLPUBLIC scope + { + friend class xml_writer; + + struct impl; + std::unique_ptr<impl> mp_impl; + + scope(xml_writer* parent, const xml_name_t& name); + public: + scope(const scope&) = delete; + scope(scope&& other); + ~scope(); + + scope& operator= (scope&& other); + }; + + xml_writer(const xml_writer&) = delete; + xml_writer& operator= (const xml_writer&) = delete; + + xml_writer(xmlns_repository& ns_repo, std::ostream& os); + xml_writer(xml_writer&& other); + + xml_writer& operator= (xml_writer&& other); + + /** + * Destructor. Any remaining element(s) on the stack will get popped when + * the destructor is called. + */ + ~xml_writer(); + + /** + * Push a new element to the stack, and write an opening element to the + * output stream. It differs from the {@link push_element} method in that + * the new element will be automatically popped when the returned object + * goes out of scope. + * + * @param name name of the new element. + * + * @return scope object which automatically pops the element when it goes + * out of scope. + */ + scope push_element_scope(const xml_name_t& name); + + /** + * Push a new element to the stack, and write an opening element to the + * output stream. + * + * @param name name of the element. + */ + void push_element(const xml_name_t& name); + + /** + * Add a namespace definition for the next element to be pushed. + * + * @param alias alias for the namespace. + * @param value value of the namespace definition. + * + * @return ID for the namespace being added. + */ + xmlns_id_t add_namespace(std::string_view alias, std::string_view value); + + /** + * Add a new attribute for the next element to be pushed. + * + * @param name name of the attribute to be added. + * @param value value of the attribute to be added. 
+ */ + void add_attribute(const xml_name_t& name, std::string_view value); + + /** + * Add a content to the current element on the stack. The content will be + * properly encoded. + * + * @param content content to be added to the current element. + */ + void add_content(std::string_view content); + + /** + * Pop the current element from the stack, and write a closing element to + * the output stream. + * + * @return the name of the element being popped. + */ + xml_name_t pop_element(); +}; + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/yaml_document_tree.hpp b/include/orcus/yaml_document_tree.hpp new file mode 100644 index 0000000..d22a588 --- /dev/null +++ b/include/orcus/yaml_document_tree.hpp @@ -0,0 +1,109 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_YAML_DOCUMENT_TREE_HPP +#define INCLUDED_ORCUS_YAML_DOCUMENT_TREE_HPP + +#include "env.hpp" +#include "exception.hpp" + +#include <string> +#include <memory> +#include <vector> + +namespace orcus { + +namespace yaml { + +class document_tree; + +class ORCUS_DLLPUBLIC document_error : public general_error +{ +public: + document_error(const std::string& msg); + virtual ~document_error(); +}; + +enum class node_t : uint8_t +{ + unset, + string, + number, + map, + sequence, + boolean_true, + boolean_false, + null +}; + +struct yaml_value; + +class ORCUS_DLLPUBLIC const_node +{ + friend class ::orcus::yaml::document_tree; + + struct impl; + std::unique_ptr<impl> mp_impl; + + const_node(const yaml_value* yv); + +public: + const_node() = delete; + + const_node(const const_node& other); + const_node(const_node&& rhs); + ~const_node(); + + node_t type() const; + + size_t child_count() const; + + std::vector<const_node> keys() const; + + const_node key(size_t index) const; + + const_node child(size_t index) const; + + const_node child(const const_node& key) const; + + const_node parent() const; + + std::string_view string_value() const; + double numeric_value() const; + + const_node& operator=(const const_node& other); + + uintptr_t identity() const; +}; + +class ORCUS_DLLPUBLIC document_tree +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +public: + document_tree(); + document_tree(const document_tree&) = delete; + document_tree(document_tree&& other); + ~document_tree(); + + void load(std::string_view s); + + size_t get_document_count() const; + + const_node get_document_root(size_t index) const; + + std::string dump_yaml() const; + + std::string dump_json() const; +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/yaml_parser.hpp b/include/orcus/yaml_parser.hpp new file mode 100644 index 0000000..836a902 --- /dev/null +++ b/include/orcus/yaml_parser.hpp @@ -0,0 +1,691 @@ +/* -*- Mode: 
C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_YAML_PARSER_HPP +#define INCLUDED_ORCUS_YAML_PARSER_HPP + +#include "orcus/yaml_parser_base.hpp" +#include "orcus/parser_global.hpp" + +namespace orcus { + +/** + * Blank handler class for yaml_parser. One can sub-class this and overwrite + * callback functions one needs to handle. + */ +class yaml_handler +{ +public: + /** + * Called when the parser starts parsing a content. + */ + void begin_parse() {} + + /** + * Called when the parser finishes parsing an entire content. + */ + void end_parse() {} + + /** + * Called when a new document is encountered. + */ + void begin_document() {} + + /** + * Called when the parser has finished parsing a document. + */ + void end_document() {} + + /** + * Called when a sequence begins. + */ + void begin_sequence() {} + + /** + * Called when a sequence ends. + */ + void end_sequence() {} + + /** + * Called when a map begins. + */ + void begin_map() {} + + /** + * Called when the parser starts parsing a map key. + */ + void begin_map_key() {} + + /** + * Called when the parser finishes parsing a map key. + */ + void end_map_key() {} + + /** + * Called when the parser finishes parsing an entire map. + */ + void end_map() {} + + /** + * Called when a string value is encountered. + * + * @param value string value. + */ + void string(std::string_view value) + { + (void)value; + } + + /** + * Called when a numeric value is encountered. + * + * @param val numeric value. + */ + void number(double val) + { + (void)val; + } + + /** + * Called when a boolean 'true' keyword is encountered. + */ + void boolean_true() {} + + /** + * Called when a boolean 'false' keyword is encountered. 
+ */ + void boolean_false() {} + + /** + * Called when a 'null' keyword is encountered. + */ + void null() {} +}; + +/** + * Parser for YAML documents. + * + * The parser holds a reference to the handler passed to its constructor. + * Each private handler_*() wrapper first records the corresponding parse + * token via push_parse_token() and then forwards the event to the handler. + * + * @tparam HandlerT Handler type with member functions for event callbacks. + * Refer to yaml_handler. + * + * @warning This parser is still highly experimental. Use with caution. + */ +template<typename HandlerT> +class yaml_parser : public yaml::parser_base +{ +public: + typedef HandlerT handler_type; + + yaml_parser(std::string_view content, handler_type& hdl); + + void parse(); + +private: + size_t end_scope(); + void check_or_begin_document(); + void check_or_begin_map(); + void check_or_begin_sequence(); + void parse_value(const char* p, size_t len); + void push_value(const char* p, size_t len); + void parse_line(const char* p, size_t len); + void parse_map_key(const char* p, size_t len); + + void handler_begin_parse(); + void handler_end_parse(); + void handler_begin_document(); + void handler_end_document(); + void handler_begin_sequence(); + void handler_end_sequence(); + void handler_begin_map(); + void handler_end_map(); + void handler_begin_map_key(); + void handler_end_map_key(); + void handler_string(const char* p, size_t n); + void handler_number(double val); + void handler_boolean_true(); + void handler_boolean_false(); + void handler_null(); + +private: + handler_type& m_handler; +}; + +template<typename _Handler> +void yaml_parser<_Handler>::handler_begin_parse() +{ + push_parse_token(yaml::detail::parse_token_t::begin_parse); + m_handler.begin_parse(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_end_parse() +{ + push_parse_token(yaml::detail::parse_token_t::end_parse); + m_handler.end_parse(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_begin_document() +{ + push_parse_token(yaml::detail::parse_token_t::begin_document); + m_handler.begin_document(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_end_document() +{ + 
push_parse_token(yaml::detail::parse_token_t::end_document); + m_handler.end_document(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_begin_sequence() +{ + push_parse_token(yaml::detail::parse_token_t::begin_sequence); + m_handler.begin_sequence(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_end_sequence() +{ + push_parse_token(yaml::detail::parse_token_t::end_sequence); + m_handler.end_sequence(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_begin_map() +{ + push_parse_token(yaml::detail::parse_token_t::begin_map); + m_handler.begin_map(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_end_map() +{ + push_parse_token(yaml::detail::parse_token_t::end_map); + m_handler.end_map(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_begin_map_key() +{ + push_parse_token(yaml::detail::parse_token_t::begin_map_key); + m_handler.begin_map_key(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_end_map_key() +{ + push_parse_token(yaml::detail::parse_token_t::end_map_key); + m_handler.end_map_key(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_string(const char* p, size_t n) +{ + push_parse_token(yaml::detail::parse_token_t::string); + m_handler.string({p, n}); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_number(double val) +{ + push_parse_token(yaml::detail::parse_token_t::number); + m_handler.number(val); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_boolean_true() +{ + push_parse_token(yaml::detail::parse_token_t::boolean_true); + m_handler.boolean_true(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_boolean_false() +{ + push_parse_token(yaml::detail::parse_token_t::boolean_false); + m_handler.boolean_false(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::handler_null() +{ + 
push_parse_token(yaml::detail::parse_token_t::null); + m_handler.null(); +} + +template<typename _Handler> +yaml_parser<_Handler>::yaml_parser(std::string_view content, handler_type& hdl) : + yaml::parser_base(content), m_handler(hdl) {} + +template<typename _Handler> +void yaml_parser<_Handler>::parse() +{ + handler_begin_parse(); + + while (has_char()) + { + reset_on_new_line(); + + size_t indent = parse_indent(); + if (indent == parse_indent_end_of_stream) + break; + + if (indent == parse_indent_blank_line) + continue; + + size_t cur_scope = get_scope(); + + if (cur_scope <= indent) + { + if (in_literal_block()) + { + handle_line_in_literal(indent); + continue; + } + + if (has_line_buffer()) + { + // This line is part of multi-line string. Push the line to the + // buffer as-is. + handle_line_in_multi_line_string(); + continue; + } + } + + if (cur_scope == scope_empty) + { + if (indent > 0) + throw parse_error( + "first node of the document should not be indented.", offset()); + + push_scope(indent); + } + else if (indent > cur_scope) + { + push_scope(indent); + } + else if (indent < cur_scope) + { + // Current indent is less than the current scope level. + do + { + cur_scope = end_scope(); + if (cur_scope < indent) + throw parse_error("parse: invalid indent level.", offset()); + } + while (indent < cur_scope); + } + + // Parse the rest of the line. + std::string_view line = parse_to_end_of_line(); + line = trim(line); + + assert(!line.empty()); + parse_line(line.data(), line.size()); + } + + // End all remaining scopes. 
+ size_t cur_scope = get_scope(); + while (cur_scope != scope_empty) + cur_scope = end_scope(); + + if (get_doc_hash()) + handler_end_document(); + + handler_end_parse(); +} + +template<typename _Handler> +size_t yaml_parser<_Handler>::end_scope() +{ + switch (get_scope_type()) + { + case yaml::detail::scope_t::map: + { + if (get_last_parse_token() == yaml::detail::parse_token_t::end_map_key) + handler_null(); + + handler_end_map(); + break; + } + case yaml::detail::scope_t::sequence: + { + if (get_last_parse_token() == yaml::detail::parse_token_t::begin_sequence_element) + handler_null(); + + handler_end_sequence(); + break; + } + case yaml::detail::scope_t::multi_line_string: + { + std::string_view merged = merge_line_buffer(); + handler_string(merged.data(), merged.size()); + break; + } + default: + { + if (has_line_buffer()) + { + assert(get_line_buffer_count() == 1); + std::string_view line = pop_line_front(); + parse_value(line.data(), line.size()); + } + } + } + return pop_scope(); +} + +template<typename _Handler> +void yaml_parser<_Handler>::check_or_begin_document() +{ + if (!get_doc_hash()) + { + set_doc_hash(mp_char); + handler_begin_document(); + } +} + +template<typename _Handler> +void yaml_parser<_Handler>::check_or_begin_map() +{ + switch (get_scope_type()) + { + case yaml::detail::scope_t::unset: + { + check_or_begin_document(); + set_scope_type(yaml::detail::scope_t::map); + handler_begin_map(); + break; + } + case yaml::detail::scope_t::map: + { + if (get_last_parse_token() == yaml::detail::parse_token_t::end_map_key) + handler_null(); + break; + } + default: + ; + } +} + +template<typename _Handler> +void yaml_parser<_Handler>::check_or_begin_sequence() +{ + switch (get_scope_type()) + { + case yaml::detail::scope_t::unset: + { + check_or_begin_document(); + set_scope_type(yaml::detail::scope_t::sequence); + handler_begin_sequence(); + break; + } + case yaml::detail::scope_t::sequence: + { + if (get_last_parse_token() == 
yaml::detail::parse_token_t::begin_sequence_element) + handler_null(); + break; + } + default: + ; + } + + push_parse_token(yaml::detail::parse_token_t::begin_sequence_element); +} + +template<typename _Handler> +void yaml_parser<_Handler>::parse_value(const char* p, size_t len) +{ + check_or_begin_document(); + + const char* p0 = p; + const char* p_end = p + len; + double val; + p = parse_numeric(p, p_end, val); + if (p == p_end) + { + handler_number(val); + return; + } + + yaml::detail::keyword_t kw = parse_keyword(p0, len); + + if (kw != yaml::detail::keyword_t::unknown) + { + switch (kw) + { + case yaml::detail::keyword_t::null: + handler_null(); + break; + case yaml::detail::keyword_t::boolean_true: + handler_boolean_true(); + break; + case yaml::detail::keyword_t::boolean_false: + handler_boolean_false(); + break; + default: + ; + } + + return; + } + + // Failed to parse it as a number or a keyword. It must be a string. + handler_string(p0, len); +} + +template<typename _Handler> +void yaml_parser<_Handler>::push_value(const char* p, size_t len) +{ + check_or_begin_document(); + + if (has_line_buffer() && get_scope_type() == yaml::detail::scope_t::unset) + set_scope_type(yaml::detail::scope_t::multi_line_string); + + push_line_back(p, len); +} + +template<typename _Handler> +void yaml_parser<_Handler>::parse_line(const char* p, size_t len) +{ + const char* p_end = p + len; + const char* p0 = p; // Save the original head position. + + if (*p == '-') + { + ++p; + if (p == p_end) + { + // List item start. + check_or_begin_sequence(); + return; + } + + switch (*p) + { + case '-': + { + // start of a document + ++p; + if (p == p_end) + throw parse_error("parse_line: line ended with '--'.", offset_last_char_of_line()); + + if (*p != '-') + parse_error::throw_with( + "parse_line: '-' expected but '", *p, "' found.", + offset_last_char_of_line() - std::ptrdiff_t(p_end-p)); + + ++p; // Skip the '-'. 
+ set_doc_hash(p); + handler_begin_document(); + clear_scopes(); + + if (p != p_end) + { + skip_blanks(p, p_end-p); + + // Whatever comes after '---' is equivalent of first node. + assert(p != p_end); + push_scope(0); + parse_line(p, p_end-p); + } + return; + } + case ' ': + { + check_or_begin_sequence(); + + // list item start with inline first item content. + ++p; + if (p == p_end) + throw parse_error( + "parse_line: list item expected, but the line ended prematurely.", + offset_last_char_of_line() - std::ptrdiff_t(p_end-p)); + + skip_blanks(p, p_end-p); + + size_t scope_width = get_scope() + (p-p0); + push_scope(scope_width); + parse_line(p, p_end-p); + return; + } + default: + // It is none of the above. + p = p0; + } + + } + + if (get_scope_type() == yaml::detail::scope_t::sequence) + parse_error::throw_with( + "'-' was expected for a sequence element, but '", *p, "' was found.", + offset_last_char_of_line()-len+1); + + // If the line doesn't start with a "- ", it must be a dictionary key. + parse_map_key(p, len); +} + +template<typename _Handler> +void yaml_parser<_Handler>::parse_map_key(const char* p, size_t len) +{ + const char* p_end = p + len; + const char* p0 = p; // Save the original head position. + + switch (*p) + { + case '"': + { + std::string_view quoted_str = parse_double_quoted_string_value(p, len); + + if (p == p_end) + { + handler_string(quoted_str.data(), quoted_str.size()); + return; + } + + skip_blanks(p, p_end-p); + + if (*p != ':') + throw parse_error( + "parse_map_key: ':' is expected after the quoted string key.", + offset() - std::ptrdiff_t(p_end-p+1)); + + check_or_begin_map(); + handler_begin_map_key(); + handler_string(quoted_str.data(), quoted_str.size()); + handler_end_map_key(); + + ++p; // skip the ':'. + if (p == p_end) + return; + + // Skip all white spaces. 
+ skip_blanks(p, p_end-p); + } + break; + case '\'': + { + std::string_view quoted_str = parse_single_quoted_string_value(p, len); + + if (p == p_end) + { + handler_string(quoted_str.data(), quoted_str.size()); + return; + } + + skip_blanks(p, p_end-p); + + if (*p != ':') + throw parse_error( + "parse_map_key: ':' is expected after the quoted string key.", + offset() - std::ptrdiff_t(p_end-p+1)); + + check_or_begin_map(); + handler_begin_map_key(); + handler_string(quoted_str.data(), quoted_str.size()); + handler_end_map_key(); + + ++p; // skip the ':'. + if (p == p_end) + return; + + skip_blanks(p, p_end-p); + } + break; + default: + { + key_value kv = parse_key_value(p, p_end-p); + + if (kv.key.empty()) + { + // No map key found. + if (*p == '|') + { + start_literal_block(); + return; + } + + push_value(p, len); + return; + } + + check_or_begin_map(); + handler_begin_map_key(); + parse_value(kv.key.data(), kv.key.size()); + handler_end_map_key(); + + if (kv.value.empty()) + return; + + p = kv.value.data(); + } + } + + if (*p == '|') + { + start_literal_block(); + return; + } + + // inline map item. + if (*p == '-') + throw parse_error( + "parse_map_key: sequence entry is not allowed as an inline map item.", + offset() - std::ptrdiff_t(p_end-p+1)); + + size_t scope_width = get_scope() + (p-p0); + push_scope(scope_width); + parse_line(p, p_end-p); +} + +} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/yaml_parser_base.hpp b/include/orcus/yaml_parser_base.hpp new file mode 100644 index 0000000..13b4c91 --- /dev/null +++ b/include/orcus/yaml_parser_base.hpp @@ -0,0 +1,195 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#ifndef INCLUDED_ORCUS_YAML_PARSER_BASE_HPP +#define INCLUDED_ORCUS_YAML_PARSER_BASE_HPP + +#include "orcus/parser_base.hpp" + +#include <memory> +#include <cassert> + +namespace orcus { namespace yaml { + +namespace detail { + +enum class scope_t +{ + unset, + sequence, + map, + multi_line_string +}; + +enum class keyword_t +{ + unknown, + boolean_true, + boolean_false, + null +}; + +enum class parse_token_t +{ + unknown, + + // handler tokens (tokens associated with handler events) + + begin_parse, + end_parse, + begin_document, + end_document, + begin_sequence, + end_sequence, + begin_map, + end_map, + begin_map_key, + end_map_key, + string, + number, + boolean_true, + boolean_false, + null, + + // non-handler tokens + + begin_sequence_element +}; + +} + +class ORCUS_PSR_DLLPUBLIC parser_base : public ::orcus::parser_base +{ + struct impl; + std::unique_ptr<impl> mp_impl; + +protected: + + // The entire line is empty. + static const size_t parse_indent_blank_line; + + // End of stream has reached while parsing in the indent part of a line. + static const size_t parse_indent_end_of_stream; + + static const size_t scope_empty; + + struct key_value + { + std::string_view key; + std::string_view value; + }; + + parser_base() = delete; + parser_base(const parser_base&) = delete; + parser_base& operator=(const parser_base&) = delete; + + parser_base(std::string_view content); + ~parser_base(); + + void push_parse_token(detail::parse_token_t t); + + detail::parse_token_t get_last_parse_token() const; + + /** + * Get the offset position of the last character of the current line + * without comment or trailing whitespaces (if present). Call this only + * after the current line has been parsed to the end, that is, only after + * parse_to_end_of_line() has been called. + * + * @return offset position of the last character of the current line. + */ + size_t offset_last_char_of_line() const; + + /** + * Parse the prefix indent part of a line. 
+ * + * @return number of whitespace characters encountered. + */ + size_t parse_indent(); + + /** + * Once a non-whitespace character is reached, parse until the end of the + * line. + */ + std::string_view parse_to_end_of_line(); + + /** + * Upon encountering a '#', skip until either the line-feed or the + * end-of-stream is reached. + */ + void skip_comment(); + + void reset_on_new_line(); + + size_t get_scope() const; + + void push_scope(size_t scope_width); + + void clear_scopes(); + + detail::scope_t get_scope_type() const; + + void set_scope_type(detail::scope_t type); + + /** + * Pop the current scope and return the new scope width after the pop. + * + * @return new scope width after the pop. + */ + size_t pop_scope(); + + void push_line_back(const char* p, size_t n); + + std::string_view pop_line_front(); + + bool has_line_buffer() const; + + size_t get_line_buffer_count() const; + + std::string_view merge_line_buffer(); + + /** + * Get the hash value of current document, or nullptr if a document has + * not started. + * + * @return hash value of current document. + */ + const char* get_doc_hash() const; + + /** + * Set the hash value representing the current document. For now the + * memory address of the first character of the document is used as its + * hash value. + * + * @param hash hash value of a document. 
+ */ + void set_doc_hash(const char* hash); + + detail::keyword_t parse_keyword(const char* p, size_t len); + + key_value parse_key_value(const char* p, size_t len); + + std::string_view parse_single_quoted_string_value(const char*& p, size_t max_length); + + std::string_view parse_double_quoted_string_value(const char*& p, size_t max_length); + + void skip_blanks(const char*& p, size_t len); + + void start_literal_block(); + + bool in_literal_block() const; + + void handle_line_in_literal(size_t indent); + + void handle_line_in_multi_line_string(); +}; + +}} + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/zip_archive.hpp b/include/orcus/zip_archive.hpp new file mode 100644 index 0000000..afc6727 --- /dev/null +++ b/include/orcus/zip_archive.hpp @@ -0,0 +1,126 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_ZIP_ARCHIVE_HPP +#define INCLUDED_ORCUS_ZIP_ARCHIVE_HPP + +#include "env.hpp" +#include "exception.hpp" + +#include <string_view> +#include <vector> +#include <memory> +#include <ostream> + +namespace orcus { + +/** + * Structure containing file entry header attributes. 
+ */ +struct ORCUS_PSR_DLLPUBLIC zip_file_entry_header +{ + uint32_t header_signature = 0; + uint16_t required_version = 0; + uint16_t flag = 0; + uint16_t compression_method = 0; + uint16_t last_modified_time = 0; + uint16_t last_modified_date = 0; + uint32_t crc32 = 0; + uint32_t compressed_size = 0; + uint32_t uncompressed_size = 0; + + std::string filename; + std::vector<uint8_t> extra_field; + + zip_file_entry_header(); + zip_file_entry_header(const zip_file_entry_header& other); + zip_file_entry_header(zip_file_entry_header&& other); + ~zip_file_entry_header(); + + zip_file_entry_header& operator=(const zip_file_entry_header& other); + zip_file_entry_header& operator=(zip_file_entry_header&& other); +}; + +ORCUS_PSR_DLLPUBLIC std::ostream& operator<<(std::ostream& os, const zip_file_entry_header& header); + +class zip_archive_stream; + +/** + * Read-only access to the file entries of a zip archive that is read + * through a zip_archive_stream. Instances cannot be default-constructed, + * copied or copy-assigned. + */ +class ORCUS_PSR_DLLPUBLIC zip_archive +{ + class impl; + + std::unique_ptr<impl> mp_impl; + +public: + zip_archive() = delete; + zip_archive(const zip_archive&) = delete; + zip_archive& operator=(const zip_archive&) = delete; + + /** + * @param stream stream to read the archive from. NOTE(review): the + * ownership of the stream is not visible in this header; + * confirm against the implementation. + */ + zip_archive(zip_archive_stream* stream); + ~zip_archive(); + + /** + * Loading involves the parsing of the central directory of a zip archive + * (located toward the end of the stream) and building of file entry data + * which are stored in the central directory. + */ + void load(); + + /** + * Retrieve the header information for a file entry specified by index. + * + * @param index file entry index. + * + * @return header information for a file entry. + */ + zip_file_entry_header get_file_entry_header(std::size_t index) const; + + /** + * Retrieve the header information for a file entry specified by name. + * + * @param name file entry name. + * + * @return header information for a file entry. + */ + zip_file_entry_header get_file_entry_header(std::string_view name) const; + + /** + * Get file entry name from its index.
+ * + * @param index file entry index + * + * @return file entry name + */ + std::string_view get_file_entry_name(std::size_t index) const; + + /** + * Return the number of file entries stored in this zip archive. Note + * that a file entry may be a directory, so the number of files stored in + * the zip archive may not equal the number of file entries. + * + * @return number of file entries. + */ + size_t get_file_entry_count() const; + + /** + * Retrieve data stream of specified file entry. The retrieved data stream + * gets uncompressed if the original stream is compressed. + * + * @param entry_name file entry name. + * + * @return buffer containing the data stream for specified entry. + * + * @exception zip_error thrown when any problem is encountered during data + * stream retrieval. + */ + std::vector<unsigned char> read_file_entry(std::string_view entry_name) const; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/include/orcus/zip_archive_stream.hpp b/include/orcus/zip_archive_stream.hpp new file mode 100644 index 0000000..7a6bb02 --- /dev/null +++ b/include/orcus/zip_archive_stream.hpp @@ -0,0 +1,71 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef __ORCUS_ZIP_ARCHIVE_STREAM_HPP__ +#define __ORCUS_ZIP_ARCHIVE_STREAM_HPP__ + +#include "env.hpp" +#include <cstdlib> +#include <cstdio> +#include <cstdint> + +namespace orcus { + +/** + * Abstract stream interface declaring the size, tell, seek and read + * operations that concrete zip archive streams implement. + */ +class ORCUS_PSR_DLLPUBLIC zip_archive_stream +{ +public: + virtual ~zip_archive_stream(); + + virtual size_t size() const = 0; + virtual size_t tell() const = 0; + virtual void seek(size_t pos) = 0; + virtual void read(unsigned char* buffer, size_t length) const = 0; +}; + +/** + * Zip archive based on file descriptor.
The caller needs to provide the + * file path to the zip archive. + * + * All stream operations declared by zip_archive_stream are overridden here. + */ +class ORCUS_PSR_DLLPUBLIC zip_archive_stream_fd : public zip_archive_stream +{ + /** stdio stream handle; presumably opened from the file path passed to the constructor (implementation not visible in this header). */ + FILE* m_stream; + +public: + zip_archive_stream_fd() = delete; + + /** + * @param filepath file path to the zip archive. + */ + zip_archive_stream_fd(const char* filepath); + ~zip_archive_stream_fd() override; + + size_t size() const override; + size_t tell() const override; + void seek(size_t pos) override; + void read(unsigned char* buffer, size_t length) const override; +}; + +/** + * Zip archive whose content is already loaded into memory. + * + * NOTE(review): only raw pointers to the blob are kept as members, so the + * caller presumably has to keep the buffer alive while the stream is in + * use; confirm against the implementation. + */ +class ORCUS_PSR_DLLPUBLIC zip_archive_stream_blob : public zip_archive_stream +{ + const uint8_t* m_blob; + const uint8_t* m_cur; + std::size_t m_size; + +public: + zip_archive_stream_blob() = delete; + + /** + * @param blob pointer to the in-memory zip archive content. + * @param size size of the content (presumably in bytes; confirm against the implementation). + */ + zip_archive_stream_blob(const uint8_t* blob, std::size_t size); + ~zip_archive_stream_blob() override; + + size_t size() const override; + size_t tell() const override; + void seek(size_t pos) override; + void read(unsigned char* buffer, size_t length) const override; +}; + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |