diff options
Diffstat (limited to 'intl/icu/source/tools/toolutil/xmlparser.h')
-rw-r--r-- | intl/icu/source/tools/toolutil/xmlparser.h | 247 |
1 files changed, 247 insertions, 0 deletions
diff --git a/intl/icu/source/tools/toolutil/xmlparser.h b/intl/icu/source/tools/toolutil/xmlparser.h new file mode 100644 index 0000000000..75c8ed7e53 --- /dev/null +++ b/intl/icu/source/tools/toolutil/xmlparser.h @@ -0,0 +1,247 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2004-2005, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: xmlparser.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004jul21 +* created by: Andy Heninger +* +* Tiny XML parser using ICU and intended for use in ICU tests and in build tools. +* Not suitable for production use. Not supported. +* Not conformant. Not efficient. +* But very small. +*/ + +#ifndef __XMLPARSER_H__ +#define __XMLPARSER_H__ + +#include "unicode/uobject.h" +#include "unicode/unistr.h" +#include "unicode/regex.h" +#include "uvector.h" +#include "hash.h" + +#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION + +enum UXMLNodeType { + /** Node type string (text contents), stored as a UnicodeString. */ + UXML_NODE_TYPE_STRING, + /** Node type element, stored as a UXMLElement. */ + UXML_NODE_TYPE_ELEMENT, + UXML_NODE_TYPE_COUNT +}; + +U_NAMESPACE_BEGIN + +class UXMLParser; + +/** + * This class represents an element node in a parsed XML tree. + */ +class U_TOOLUTIL_API UXMLElement : public UObject { +public: + /** + * Destructor. + */ + virtual ~UXMLElement(); + + /** + * Get the tag name of this element. + */ + const UnicodeString &getTagName() const; + /** + * Get the text contents of the element. + * Append the contents of all text child nodes. + * @param recurse If TRUE, also recursively appends the contents of all + * text child nodes of element children. + * @return The text contents. + */ + UnicodeString getText(UBool recurse) const; + /** + * Get the number of attributes. + */ + int32_t countAttributes() const; + /** + * Get the i-th attribute. + * @param i Index of the attribute. + * @param name Output parameter, receives the attribute name. + * @param value Output parameter, receives the attribute value. + * @return A pointer to the attribute value (may be &value or a pointer to an + * internal string object), or NULL if i is out of bounds. + */ + const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const; + /** + * Get the value of the attribute with the given name. + * @param name Attribute name to be looked up. + * @return A pointer to the attribute value, or NULL if this element + * does not have this attribute. + */ + const UnicodeString *getAttribute(const UnicodeString &name) const; + /** + * Get the number of child nodes. + */ + int32_t countChildren() const; + /** + * Get the i-th child node. + * @param i Index of the child node. + * @param type The child node type. + * @return A pointer to the child node object, or NULL if i is out of bounds. + */ + const UObject *getChild(int32_t i, UXMLNodeType &type) const; + /** + * Get the next child element node, skipping non-element child nodes. + * @param i Enumeration index; initialize to 0 before getting the first child element. + * @return A pointer to the next child element, or NULL if there is none. + */ + const UXMLElement *nextChildElement(int32_t &i) const; + /** + * Get the immediate child element with the given name. + * If there are multiple child elements with this name, then return + * the first one. + * @param name Element name to be looked up. + * @return A pointer to the element node, or NULL if this element + * does not have this immediate child element. + */ + const UXMLElement *getChildElement(const UnicodeString &name) const; + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. + */ + virtual UClassID getDynamicClassID() const; + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. + */ + static UClassID U_EXPORT2 getStaticClassID(); + +private: + // prevent default construction etc. + UXMLElement(); + UXMLElement(const UXMLElement &other); + UXMLElement &operator=(const UXMLElement &other); + + void appendText(UnicodeString &text, UBool recurse) const; + + friend class UXMLParser; + + UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); + + const UXMLParser *fParser; + const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser) + UnicodeString fContent; // The text content of this node. All element content is + // concatenated even when there are intervening nested elements + // (which doesn't happen with most xml files we care about) + // Sections of content containing only white space are dropped, + // which gets rid the bogus white space content from + // elements which are primarily containers for nested elements. + UVector fAttNames; // A vector containing the names of this element's attributes + // The names are UnicodeString objects, owned by the UXMLParser. + UVector fAttValues; // A vector containing the attribute values for + // this element's attributes. The order is the same + // as that of the attribute name vector. + + UVector fChildren; // The child nodes of this element (a Vector) + + UXMLElement *fParent; // A pointer to the parent element of this element. +}; + +/** + * A simple XML parser; it is neither efficient nor conformant and only useful for + * restricted types of XML documents. + * + * The parse methods parse whole documents and return the parse trees via their + * root elements. + */ +class U_TOOLUTIL_API UXMLParser : public UObject { +public: + /** + * Create an XML parser. + */ + static UXMLParser *createParser(UErrorCode &errorCode); + /** + * Destructor. + */ + virtual ~UXMLParser(); + + /** + * Parse an XML document, create the entire document tree, and + * return a pointer to the root element of the parsed tree. + * The caller must delete the element. + */ + UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode); + /** + * Parse an XML file, create the entire document tree, and + * return a pointer to the root element of the parsed tree. + * The caller must delete the element. + */ + UXMLElement *parseFile(const char *filename, UErrorCode &errorCode); + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. + */ + virtual UClassID getDynamicClassID() const; + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. + */ + static UClassID U_EXPORT2 getStaticClassID(); + +private: + // prevent default construction etc. + UXMLParser(); + UXMLParser(const UXMLParser &other); + UXMLParser &operator=(const UXMLParser &other); + + // constructor + UXMLParser(UErrorCode &status); + + void parseMisc(UErrorCode &status); + UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); + void error(const char *message, UErrorCode &status); + UnicodeString scanContent(UErrorCode &status); + void replaceCharRefs(UnicodeString &s, UErrorCode &status); + + const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); +public: + // public for UXMLElement only + const UnicodeString *findName(const UnicodeString &s) const; +private: + + // There is one ICU regex matcher for each of the major XML syntax items + // that are recognized. + RegexMatcher mXMLDecl; + RegexMatcher mXMLComment; + RegexMatcher mXMLSP; + RegexMatcher mXMLDoctype; + RegexMatcher mXMLPI; + RegexMatcher mXMLElemStart; + RegexMatcher mXMLElemEnd; + RegexMatcher mXMLElemEmpty; + RegexMatcher mXMLCharData; + RegexMatcher mAttrValue; + RegexMatcher mAttrNormalizer; + RegexMatcher mNewLineNormalizer; + RegexMatcher mAmps; + + Hashtable fNames; // interned element/attribute name strings + UStack fElementStack; // Stack holds the parent elements when nested + // elements are being parsed. All items on this + // stack are of type UXMLElement. + int32_t fPos; // String index of the current scan position in + // xml source (in fSrc). + UnicodeString fOneLF; +}; + +U_NAMESPACE_END +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ + +#endif |