From 109be507377fe7f6e8819ac94041d3fdcdf6fd2f Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 28 Apr 2024 15:18:25 +0200 Subject: Adding upstream version 1.19.8. Signed-off-by: Daniel Baumann --- src/encoding/xml/xml.go | 2062 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2062 insertions(+) create mode 100644 src/encoding/xml/xml.go (limited to 'src/encoding/xml/xml.go') diff --git a/src/encoding/xml/xml.go b/src/encoding/xml/xml.go new file mode 100644 index 0000000..4a8c154 --- /dev/null +++ b/src/encoding/xml/xml.go @@ -0,0 +1,2062 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package xml implements a simple XML 1.0 parser that +// understands XML name spaces. +package xml + +// References: +// Annotated XML spec: https://www.xml.com/axml/testaxml.htm +// XML name spaces: https://www.w3.org/TR/REC-xml-names/ + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io" + "strconv" + "strings" + "unicode" + "unicode/utf8" +) + +// A SyntaxError represents a syntax error in the XML input stream. +type SyntaxError struct { + Msg string + Line int +} + +func (e *SyntaxError) Error() string { + return "XML syntax error on line " + strconv.Itoa(e.Line) + ": " + e.Msg +} + +// A Name represents an XML name (Local) annotated +// with a name space identifier (Space). +// In tokens returned by Decoder.Token, the Space identifier +// is given as a canonical URL, not the short prefix used +// in the document being parsed. +type Name struct { + Space, Local string +} + +// An Attr represents an attribute in an XML element (Name=Value). +type Attr struct { + Name Name + Value string +} + +// A Token is an interface holding one of the token types: +// StartElement, EndElement, CharData, Comment, ProcInst, or Directive. +type Token any + +// A StartElement represents an XML start element. +type StartElement struct { + Name Name + Attr []Attr +} + +// Copy creates a new copy of StartElement. +func (e StartElement) Copy() StartElement { + attrs := make([]Attr, len(e.Attr)) + copy(attrs, e.Attr) + e.Attr = attrs + return e +} + +// End returns the corresponding XML end element. +func (e StartElement) End() EndElement { + return EndElement{e.Name} +} + +// An EndElement represents an XML end element. +type EndElement struct { + Name Name +} + +// A CharData represents XML character data (raw text), +// in which XML escape sequences have been replaced by +// the characters they represent. +type CharData []byte + +func makeCopy(b []byte) []byte { + b1 := make([]byte, len(b)) + copy(b1, b) + return b1 +} + +// Copy creates a new copy of CharData. +func (c CharData) Copy() CharData { return CharData(makeCopy(c)) } + +// A Comment represents an XML comment of the form . +// The bytes do not include the comment markers. +type Comment []byte + +// Copy creates a new copy of Comment. +func (c Comment) Copy() Comment { return Comment(makeCopy(c)) } + +// A ProcInst represents an XML processing instruction of the form +type ProcInst struct { + Target string + Inst []byte +} + +// Copy creates a new copy of ProcInst. +func (p ProcInst) Copy() ProcInst { + p.Inst = makeCopy(p.Inst) + return p +} + +// A Directive represents an XML directive of the form . +// The bytes do not include the markers. +type Directive []byte + +// Copy creates a new copy of Directive. +func (d Directive) Copy() Directive { return Directive(makeCopy(d)) } + +// CopyToken returns a copy of a Token. +func CopyToken(t Token) Token { + switch v := t.(type) { + case CharData: + return v.Copy() + case Comment: + return v.Copy() + case Directive: + return v.Copy() + case ProcInst: + return v.Copy() + case StartElement: + return v.Copy() + } + return t +} + +// A TokenReader is anything that can decode a stream of XML tokens, including a +// Decoder. +// +// When Token encounters an error or end-of-file condition after successfully +// reading a token, it returns the token. It may return the (non-nil) error from +// the same call or return the error (and a nil token) from a subsequent call. +// An instance of this general case is that a TokenReader returning a non-nil +// token at the end of the token stream may return either io.EOF or a nil error. +// The next Read should return nil, io.EOF. +// +// Implementations of Token are discouraged from returning a nil token with a +// nil error. Callers should treat a return of nil, nil as indicating that +// nothing happened; in particular it does not indicate EOF. +type TokenReader interface { + Token() (Token, error) +} + +// A Decoder represents an XML parser reading a particular input stream. +// The parser assumes that its input is encoded in UTF-8. +type Decoder struct { + // Strict defaults to true, enforcing the requirements + // of the XML specification. + // If set to false, the parser allows input containing common + // mistakes: + // * If an element is missing an end tag, the parser invents + // end tags as necessary to keep the return values from Token + // properly balanced. + // * In attribute values and character data, unknown or malformed + // character entities (sequences beginning with &) are left alone. + // + // Setting: + // + // d.Strict = false + // d.AutoClose = xml.HTMLAutoClose + // d.Entity = xml.HTMLEntity + // + // creates a parser that can handle typical HTML. + // + // Strict mode does not enforce the requirements of the XML name spaces TR. + // In particular it does not reject name space tags using undefined prefixes. + // Such tags are recorded with the unknown prefix as the name space URL. + Strict bool + + // When Strict == false, AutoClose indicates a set of elements to + // consider closed immediately after they are opened, regardless + // of whether an end element is present. + AutoClose []string + + // Entity can be used to map non-standard entity names to string replacements. + // The parser behaves as if these standard mappings are present in the map, + // regardless of the actual map content: + // + // "lt": "<", + // "gt": ">", + // "amp": "&", + // "apos": "'", + // "quot": `"`, + Entity map[string]string + + // CharsetReader, if non-nil, defines a function to generate + // charset-conversion readers, converting from the provided + // non-UTF-8 charset into UTF-8. If CharsetReader is nil or + // returns an error, parsing stops with an error. One of the + // CharsetReader's result values must be non-nil. + CharsetReader func(charset string, input io.Reader) (io.Reader, error) + + // DefaultSpace sets the default name space used for unadorned tags, + // as if the entire XML stream were wrapped in an element containing + // the attribute xmlns="DefaultSpace". + DefaultSpace string + + r io.ByteReader + t TokenReader + buf bytes.Buffer + saved *bytes.Buffer + stk *stack + free *stack + needClose bool + toClose Name + nextToken Token + nextByte int + ns map[string]string + err error + line int + linestart int64 + offset int64 + unmarshalDepth int +} + +// NewDecoder creates a new XML parser reading from r. +// If r does not implement io.ByteReader, NewDecoder will +// do its own buffering. +func NewDecoder(r io.Reader) *Decoder { + d := &Decoder{ + ns: make(map[string]string), + nextByte: -1, + line: 1, + Strict: true, + } + d.switchToReader(r) + return d +} + +// NewTokenDecoder creates a new XML parser using an underlying token stream. +func NewTokenDecoder(t TokenReader) *Decoder { + // Is it already a Decoder? + if d, ok := t.(*Decoder); ok { + return d + } + d := &Decoder{ + ns: make(map[string]string), + t: t, + nextByte: -1, + line: 1, + Strict: true, + } + return d +} + +// Token returns the next XML token in the input stream. +// At the end of the input stream, Token returns nil, io.EOF. +// +// Slices of bytes in the returned token data refer to the +// parser's internal buffer and remain valid only until the next +// call to Token. To acquire a copy of the bytes, call CopyToken +// or the token's Copy method. +// +// Token expands self-closing elements such as
+// into separate start and end elements returned by successive calls. +// +// Token guarantees that the StartElement and EndElement +// tokens it returns are properly nested and matched: +// if Token encounters an unexpected end element +// or EOF before all expected end elements, +// it will return an error. +// +// Token implements XML name spaces as described by +// https://www.w3.org/TR/REC-xml-names/. Each of the +// Name structures contained in the Token has the Space +// set to the URL identifying its name space when known. +// If Token encounters an unrecognized name space prefix, +// it uses the prefix as the Space rather than report an error. +func (d *Decoder) Token() (Token, error) { + var t Token + var err error + if d.stk != nil && d.stk.kind == stkEOF { + return nil, io.EOF + } + if d.nextToken != nil { + t = d.nextToken + d.nextToken = nil + } else { + if t, err = d.rawToken(); t == nil && err != nil { + if err == io.EOF && d.stk != nil && d.stk.kind != stkEOF { + err = d.syntaxError("unexpected EOF") + } + return nil, err + } + // We still have a token to process, so clear any + // errors (e.g. EOF) and proceed. + err = nil + } + if !d.Strict { + if t1, ok := d.autoClose(t); ok { + d.nextToken = t + t = t1 + } + } + switch t1 := t.(type) { + case StartElement: + // In XML name spaces, the translations listed in the + // attributes apply to the element name and + // to the other attribute names, so process + // the translations first. + for _, a := range t1.Attr { + if a.Name.Space == xmlnsPrefix { + v, ok := d.ns[a.Name.Local] + d.pushNs(a.Name.Local, v, ok) + d.ns[a.Name.Local] = a.Value + } + if a.Name.Space == "" && a.Name.Local == xmlnsPrefix { + // Default space for untagged names + v, ok := d.ns[""] + d.pushNs("", v, ok) + d.ns[""] = a.Value + } + } + + d.translate(&t1.Name, true) + for i := range t1.Attr { + d.translate(&t1.Attr[i].Name, false) + } + d.pushElement(t1.Name) + t = t1 + + case EndElement: + d.translate(&t1.Name, true) + if !d.popElement(&t1) { + return nil, d.err + } + t = t1 + } + return t, err +} + +const ( + xmlURL = "http://www.w3.org/XML/1998/namespace" + xmlnsPrefix = "xmlns" + xmlPrefix = "xml" +) + +// Apply name space translation to name n. +// The default name space (for Space=="") +// applies only to element names, not to attribute names. +func (d *Decoder) translate(n *Name, isElementName bool) { + switch { + case n.Space == xmlnsPrefix: + return + case n.Space == "" && !isElementName: + return + case n.Space == xmlPrefix: + n.Space = xmlURL + case n.Space == "" && n.Local == xmlnsPrefix: + return + } + if v, ok := d.ns[n.Space]; ok { + n.Space = v + } else if n.Space == "" { + n.Space = d.DefaultSpace + } +} + +func (d *Decoder) switchToReader(r io.Reader) { + // Get efficient byte at a time reader. + // Assume that if reader has its own + // ReadByte, it's efficient enough. + // Otherwise, use bufio. + if rb, ok := r.(io.ByteReader); ok { + d.r = rb + } else { + d.r = bufio.NewReader(r) + } +} + +// Parsing state - stack holds old name space translations +// and the current set of open elements. The translations to pop when +// ending a given tag are *below* it on the stack, which is +// more work but forced on us by XML. +type stack struct { + next *stack + kind int + name Name + ok bool +} + +const ( + stkStart = iota + stkNs + stkEOF +) + +func (d *Decoder) push(kind int) *stack { + s := d.free + if s != nil { + d.free = s.next + } else { + s = new(stack) + } + s.next = d.stk + s.kind = kind + d.stk = s + return s +} + +func (d *Decoder) pop() *stack { + s := d.stk + if s != nil { + d.stk = s.next + s.next = d.free + d.free = s + } + return s +} + +// Record that after the current element is finished +// (that element is already pushed on the stack) +// Token should return EOF until popEOF is called. +func (d *Decoder) pushEOF() { + // Walk down stack to find Start. + // It might not be the top, because there might be stkNs + // entries above it. + start := d.stk + for start.kind != stkStart { + start = start.next + } + // The stkNs entries below a start are associated with that + // element too; skip over them. + for start.next != nil && start.next.kind == stkNs { + start = start.next + } + s := d.free + if s != nil { + d.free = s.next + } else { + s = new(stack) + } + s.kind = stkEOF + s.next = start.next + start.next = s +} + +// Undo a pushEOF. +// The element must have been finished, so the EOF should be at the top of the stack. +func (d *Decoder) popEOF() bool { + if d.stk == nil || d.stk.kind != stkEOF { + return false + } + d.pop() + return true +} + +// Record that we are starting an element with the given name. +func (d *Decoder) pushElement(name Name) { + s := d.push(stkStart) + s.name = name +} + +// Record that we are changing the value of ns[local]. +// The old value is url, ok. +func (d *Decoder) pushNs(local string, url string, ok bool) { + s := d.push(stkNs) + s.name.Local = local + s.name.Space = url + s.ok = ok +} + +// Creates a SyntaxError with the current line number. +func (d *Decoder) syntaxError(msg string) error { + return &SyntaxError{Msg: msg, Line: d.line} +} + +// Record that we are ending an element with the given name. +// The name must match the record at the top of the stack, +// which must be a pushElement record. +// After popping the element, apply any undo records from +// the stack to restore the name translations that existed +// before we saw this element. +func (d *Decoder) popElement(t *EndElement) bool { + s := d.pop() + name := t.Name + switch { + case s == nil || s.kind != stkStart: + d.err = d.syntaxError("unexpected end element ") + return false + case s.name.Local != name.Local: + if !d.Strict { + d.needClose = true + d.toClose = t.Name + t.Name = s.name + return true + } + d.err = d.syntaxError("element <" + s.name.Local + "> closed by ") + return false + case s.name.Space != name.Space: + d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space + + " closed by in space " + name.Space) + return false + } + + // Pop stack until a Start or EOF is on the top, undoing the + // translations that were associated with the element we just closed. + for d.stk != nil && d.stk.kind != stkStart && d.stk.kind != stkEOF { + s := d.pop() + if s.ok { + d.ns[s.name.Local] = s.name.Space + } else { + delete(d.ns, s.name.Local) + } + } + + return true +} + +// If the top element on the stack is autoclosing and +// t is not the end tag, invent the end tag. +func (d *Decoder) autoClose(t Token) (Token, bool) { + if d.stk == nil || d.stk.kind != stkStart { + return nil, false + } + for _, s := range d.AutoClose { + if strings.EqualFold(s, d.stk.name.Local) { + // This one should be auto closed if t doesn't close it. + et, ok := t.(EndElement) + if !ok || !strings.EqualFold(et.Name.Local, d.stk.name.Local) { + return EndElement{d.stk.name}, true + } + break + } + } + return nil, false +} + +var errRawToken = errors.New("xml: cannot use RawToken from UnmarshalXML method") + +// RawToken is like Token but does not verify that +// start and end elements match and does not translate +// name space prefixes to their corresponding URLs. +func (d *Decoder) RawToken() (Token, error) { + if d.unmarshalDepth > 0 { + return nil, errRawToken + } + return d.rawToken() +} + +func (d *Decoder) rawToken() (Token, error) { + if d.t != nil { + return d.t.Token() + } + if d.err != nil { + return nil, d.err + } + if d.needClose { + // The last element we read was self-closing and + // we returned just the StartElement half. + // Return the EndElement half now. + d.needClose = false + return EndElement{d.toClose}, nil + } + + b, ok := d.getc() + if !ok { + return nil, d.err + } + + if b != '<' { + // Text section. + d.ungetc(b) + data := d.text(-1, false) + if data == nil { + return nil, d.err + } + return CharData(data), nil + } + + if b, ok = d.mustgetc(); !ok { + return nil, d.err + } + switch b { + case '/': + // ' { + d.err = d.syntaxError("invalid characters between ") + return nil, d.err + } + return EndElement{name}, nil + + case '?': + // ' { + break + } + b0 = b + } + data := d.buf.Bytes() + data = data[0 : len(data)-2] // chop ?> + + if target == "xml" { + content := string(data) + ver := procInst("version", content) + if ver != "" && ver != "1.0" { + d.err = fmt.Errorf("xml: unsupported version %q; only version 1.0 is supported", ver) + return nil, d.err + } + enc := procInst("encoding", content) + if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") { + if d.CharsetReader == nil { + d.err = fmt.Errorf("xml: encoding %q declared but Decoder.CharsetReader is nil", enc) + return nil, d.err + } + newr, err := d.CharsetReader(enc, d.r.(io.Reader)) + if err != nil { + d.err = fmt.Errorf("xml: opening charset %q: %v", enc, err) + return nil, d.err + } + if newr == nil { + panic("CharsetReader returned a nil Reader for charset " + enc) + } + d.switchToReader(newr) + } + } + return ProcInst{target, data}, nil + + case '!': + // ' { + d.err = d.syntaxError( + `invalid sequence "--" not allowed in comments`) + return nil, d.err + } + break + } + b0, b1 = b1, b + } + data := d.buf.Bytes() + data = data[0 : len(data)-3] // chop --> + return Comment(data), nil + + case '[': // . + data := d.text(-1, true) + if data == nil { + return nil, d.err + } + return CharData(data), nil + } + + // Probably a directive: , , etc. + // We don't care, but accumulate for caller. Quoted angle + // brackets do not count for nesting. + d.buf.Reset() + d.buf.WriteByte(b) + inquote := uint8(0) + depth := 0 + for { + if b, ok = d.mustgetc(); !ok { + return nil, d.err + } + if inquote == 0 && b == '>' && depth == 0 { + break + } + HandleB: + d.buf.WriteByte(b) + switch { + case b == inquote: + inquote = 0 + + case inquote != 0: + // in quotes, no special action + + case b == '\'' || b == '"': + inquote = b + + case b == '>' && inquote == 0: + depth-- + + case b == '<' && inquote == 0: + // Look for