diff options
Diffstat (limited to '')
-rw-r--r-- | src/go/doc/comment/parse.go | 1262 |
1 files changed, 1262 insertions, 0 deletions
diff --git a/src/go/doc/comment/parse.go b/src/go/doc/comment/parse.go new file mode 100644 index 0000000..62a0f8f --- /dev/null +++ b/src/go/doc/comment/parse.go @@ -0,0 +1,1262 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package comment + +import ( + "sort" + "strings" + "unicode" + "unicode/utf8" +) + +// A Doc is a parsed Go doc comment. +type Doc struct { + // Content is the sequence of content blocks in the comment. + Content []Block + + // Links is the link definitions in the comment. + Links []*LinkDef +} + +// A LinkDef is a single link definition. +type LinkDef struct { + Text string // the link text + URL string // the link URL + Used bool // whether the comment uses the definition +} + +// A Block is block-level content in a doc comment, +// one of [*Code], [*Heading], [*List], or [*Paragraph]. +type Block interface { + block() +} + +// A Heading is a doc comment heading. +type Heading struct { + Text []Text // the heading text +} + +func (*Heading) block() {} + +// A List is a numbered or bullet list. +// Lists are always non-empty: len(Items) > 0. +// In a numbered list, every Items[i].Number is a non-empty string. +// In a bullet list, every Items[i].Number is an empty string. +type List struct { + // Items is the list items. + Items []*ListItem + + // ForceBlankBefore indicates that the list must be + // preceded by a blank line when reformatting the comment, + // overriding the usual conditions. See the BlankBefore method. + // + // The comment parser sets ForceBlankBefore for any list + // that is preceded by a blank line, to make sure + // the blank line is preserved when printing. + ForceBlankBefore bool + + // ForceBlankBetween indicates that list items must be + // separated by blank lines when reformatting the comment, + // overriding the usual conditions. See the BlankBetween method. + // + // The comment parser sets ForceBlankBetween for any list + // that has a blank line between any two of its items, to make sure + // the blank lines are preserved when printing. + ForceBlankBetween bool +} + +func (*List) block() {} + +// BlankBefore reports whether a reformatting of the comment +// should include a blank line before the list. +// The default rule is the same as for [BlankBetween]: +// if the list item content contains any blank lines +// (meaning at least one item has multiple paragraphs) +// then the list itself must be preceded by a blank line. +// A preceding blank line can be forced by setting [List].ForceBlankBefore. +func (l *List) BlankBefore() bool { + return l.ForceBlankBefore || l.BlankBetween() +} + +// BlankBetween reports whether a reformatting of the comment +// should include a blank line between each pair of list items. +// The default rule is that if the list item content contains any blank lines +// (meaning at least one item has multiple paragraphs) +// then list items must themselves be separated by blank lines. +// Blank line separators can be forced by setting [List].ForceBlankBetween. +func (l *List) BlankBetween() bool { + if l.ForceBlankBetween { + return true + } + for _, item := range l.Items { + if len(item.Content) != 1 { + // Unreachable for parsed comments today, + // since the only way to get multiple item.Content + // is multiple paragraphs, which must have been + // separated by a blank line. + return true + } + } + return false +} + +// A ListItem is a single item in a numbered or bullet list. +type ListItem struct { + // Number is a decimal string in a numbered list + // or an empty string in a bullet list. + Number string // "1", "2", ...; "" for bullet list + + // Content is the list content. + // Currently, restrictions in the parser and printer + // require every element of Content to be a *Paragraph. + Content []Block // Content of this item. +} + +// A Paragraph is a paragraph of text. +type Paragraph struct { + Text []Text +} + +func (*Paragraph) block() {} + +// A Code is a preformatted code block. +type Code struct { + // Text is the preformatted text, ending with a newline character. + // It may be multiple lines, each of which ends with a newline character. + // It is never empty, nor does it start or end with a blank line. + Text string +} + +func (*Code) block() {} + +// A Text is text-level content in a doc comment, +// one of [Plain], [Italic], [*Link], or [*DocLink]. +type Text interface { + text() +} + +// A Plain is a string rendered as plain text (not italicized). +type Plain string + +func (Plain) text() {} + +// An Italic is a string rendered as italicized text. +type Italic string + +func (Italic) text() {} + +// A Link is a link to a specific URL. +type Link struct { + Auto bool // is this an automatic (implicit) link of a literal URL? + Text []Text // text of link + URL string // target URL of link +} + +func (*Link) text() {} + +// A DocLink is a link to documentation for a Go package or symbol. +type DocLink struct { + Text []Text // text of link + + // ImportPath, Recv, and Name identify the Go package or symbol + // that is the link target. The potential combinations of + // non-empty fields are: + // - ImportPath: a link to another package + // - ImportPath, Name: a link to a const, func, type, or var in another package + // - ImportPath, Recv, Name: a link to a method in another package + // - Name: a link to a const, func, type, or var in this package + // - Recv, Name: a link to a method in this package + ImportPath string // import path + Recv string // receiver type, without any pointer star, for methods + Name string // const, func, type, var, or method name +} + +func (*DocLink) text() {} + +// A Parser is a doc comment parser. +// The fields in the struct can be filled in before calling Parse +// in order to customize the details of the parsing process. +type Parser struct { + // Words is a map of Go identifier words that + // should be italicized and potentially linked. + // If Words[w] is the empty string, then the word w + // is only italicized. Otherwise it is linked, using + // Words[w] as the link target. + // Words corresponds to the [go/doc.ToHTML] words parameter. + Words map[string]string + + // LookupPackage resolves a package name to an import path. + // + // If LookupPackage(name) returns ok == true, then [name] + // (or [name.Sym] or [name.Sym.Method]) + // is considered a documentation link to importPath's package docs. + // It is valid to return "", true, in which case name is considered + // to refer to the current package. + // + // If LookupPackage(name) returns ok == false, + // then [name] (or [name.Sym] or [name.Sym.Method]) + // will not be considered a documentation link, + // except in the case where name is the full (but single-element) import path + // of a package in the standard library, such as in [math] or [io.Reader]. + // LookupPackage is still called for such names, + // in order to permit references to imports of other packages + // with the same package names. + // + // Setting LookupPackage to nil is equivalent to setting it to + // a function that always returns "", false. + LookupPackage func(name string) (importPath string, ok bool) + + // LookupSym reports whether a symbol name or method name + // exists in the current package. + // + // If LookupSym("", "Name") returns true, then [Name] + // is considered a documentation link for a const, func, type, or var. + // + // Similarly, if LookupSym("Recv", "Name") returns true, + // then [Recv.Name] is considered a documentation link for + // type Recv's method Name. + // + // Setting LookupSym to nil is equivalent to setting it to a function + // that always returns false. + LookupSym func(recv, name string) (ok bool) +} + +// parseDoc is parsing state for a single doc comment. +type parseDoc struct { + *Parser + *Doc + links map[string]*LinkDef + lines []string + lookupSym func(recv, name string) bool +} + +// lookupPkg is called to look up the pkg in [pkg], [pkg.Name], and [pkg.Name.Recv]. +// If pkg has a slash, it is assumed to be the full import path and is returned with ok = true. +// +// Otherwise, pkg is probably a simple package name like "rand" (not "crypto/rand" or "math/rand"). +// d.LookupPackage provides a way for the caller to allow resolving such names with reference +// to the imports in the surrounding package. +// +// There is one collision between these two cases: single-element standard library names +// like "math" are full import paths but don't contain slashes. We let d.LookupPackage have +// the first chance to resolve it, in case there's a different package imported as math, +// and otherwise we refer to a built-in list of single-element standard library package names. +func (d *parseDoc) lookupPkg(pkg string) (importPath string, ok bool) { + if strings.Contains(pkg, "/") { // assume a full import path + if validImportPath(pkg) { + return pkg, true + } + return "", false + } + if d.LookupPackage != nil { + // Give LookupPackage a chance. + if path, ok := d.LookupPackage(pkg); ok { + return path, true + } + } + return DefaultLookupPackage(pkg) +} + +func isStdPkg(path string) bool { + // TODO(rsc): Use sort.Find once we don't have to worry about + // copying this code into older Go environments. + i := sort.Search(len(stdPkgs), func(i int) bool { return stdPkgs[i] >= path }) + return i < len(stdPkgs) && stdPkgs[i] == path +} + +// DefaultLookupPackage is the default package lookup +// function, used when [Parser].LookupPackage is nil. +// It recognizes names of the packages from the standard +// library with single-element import paths, such as math, +// which would otherwise be impossible to name. +// +// Note that the go/doc package provides a more sophisticated +// lookup based on the imports used in the current package. +func DefaultLookupPackage(name string) (importPath string, ok bool) { + if isStdPkg(name) { + return name, true + } + return "", false +} + +// Parse parses the doc comment text and returns the *Doc form. +// Comment markers (/* // and */) in the text must have already been removed. +func (p *Parser) Parse(text string) *Doc { + lines := unindent(strings.Split(text, "\n")) + d := &parseDoc{ + Parser: p, + Doc: new(Doc), + links: make(map[string]*LinkDef), + lines: lines, + lookupSym: func(recv, name string) bool { return false }, + } + if p.LookupSym != nil { + d.lookupSym = p.LookupSym + } + + // First pass: break into block structure and collect known links. + // The text is all recorded as Plain for now. + var prev span + for _, s := range parseSpans(lines) { + var b Block + switch s.kind { + default: + panic("go/doc/comment: internal error: unknown span kind") + case spanList: + b = d.list(lines[s.start:s.end], prev.end < s.start) + case spanCode: + b = d.code(lines[s.start:s.end]) + case spanOldHeading: + b = d.oldHeading(lines[s.start]) + case spanHeading: + b = d.heading(lines[s.start]) + case spanPara: + b = d.paragraph(lines[s.start:s.end]) + } + if b != nil { + d.Content = append(d.Content, b) + } + prev = s + } + + // Second pass: interpret all the Plain text now that we know the links. + for _, b := range d.Content { + switch b := b.(type) { + case *Paragraph: + b.Text = d.parseLinkedText(string(b.Text[0].(Plain))) + case *List: + for _, i := range b.Items { + for _, c := range i.Content { + p := c.(*Paragraph) + p.Text = d.parseLinkedText(string(p.Text[0].(Plain))) + } + } + } + } + + return d.Doc +} + +// A span represents a single span of comment lines (lines[start:end]) +// of an identified kind (code, heading, paragraph, and so on). +type span struct { + start int + end int + kind spanKind +} + +// A spanKind describes the kind of span. +type spanKind int + +const ( + _ spanKind = iota + spanCode + spanHeading + spanList + spanOldHeading + spanPara +) + +func parseSpans(lines []string) []span { + var spans []span + + // The loop may process a line twice: once as unindented + // and again forced indented. So the maximum expected + // number of iterations is 2*len(lines). The repeating logic + // can be subtle, though, and to protect against introduction + // of infinite loops in future changes, we watch to see that + // we are not looping too much. A panic is better than a + // quiet infinite loop. + watchdog := 2 * len(lines) + + i := 0 + forceIndent := 0 +Spans: + for { + // Skip blank lines. + for i < len(lines) && lines[i] == "" { + i++ + } + if i >= len(lines) { + break + } + if watchdog--; watchdog < 0 { + panic("go/doc/comment: internal error: not making progress") + } + + var kind spanKind + start := i + end := i + if i < forceIndent || indented(lines[i]) { + // Indented (or force indented). + // Ends before next unindented. (Blank lines are OK.) + // If this is an unindented list that we are heuristically treating as indented, + // then accept unindented list item lines up to the first blank lines. + // The heuristic is disabled at blank lines to contain its effect + // to non-gofmt'ed sections of the comment. + unindentedListOK := isList(lines[i]) && i < forceIndent + i++ + for i < len(lines) && (lines[i] == "" || i < forceIndent || indented(lines[i]) || (unindentedListOK && isList(lines[i]))) { + if lines[i] == "" { + unindentedListOK = false + } + i++ + } + + // Drop trailing blank lines. + end = i + for end > start && lines[end-1] == "" { + end-- + } + + // If indented lines are followed (without a blank line) + // by an unindented line ending in a brace, + // take that one line too. This fixes the common mistake + // of pasting in something like + // + // func main() { + // fmt.Println("hello, world") + // } + // + // and forgetting to indent it. + // The heuristic will never trigger on a gofmt'ed comment, + // because any gofmt'ed code block or list would be + // followed by a blank line or end of comment. + if end < len(lines) && strings.HasPrefix(lines[end], "}") { + end++ + } + + if isList(lines[start]) { + kind = spanList + } else { + kind = spanCode + } + } else { + // Unindented. Ends at next blank or indented line. + i++ + for i < len(lines) && lines[i] != "" && !indented(lines[i]) { + i++ + } + end = i + + // If unindented lines are followed (without a blank line) + // by an indented line that would start a code block, + // check whether the final unindented lines + // should be left for the indented section. + // This can happen for the common mistakes of + // unindented code or unindented lists. + // The heuristic will never trigger on a gofmt'ed comment, + // because any gofmt'ed code block would have a blank line + // preceding it after the unindented lines. + if i < len(lines) && lines[i] != "" && !isList(lines[i]) { + switch { + case isList(lines[i-1]): + // If the final unindented line looks like a list item, + // this may be the first indented line wrap of + // a mistakenly unindented list. + // Leave all the unindented list items. + forceIndent = end + end-- + for end > start && isList(lines[end-1]) { + end-- + } + + case strings.HasSuffix(lines[i-1], "{") || strings.HasSuffix(lines[i-1], `\`): + // If the final unindented line ended in { or \ + // it is probably the start of a misindented code block. + // Give the user a single line fix. + // Often that's enough; if not, the user can fix the others themselves. + forceIndent = end + end-- + } + + if start == end && forceIndent > start { + i = start + continue Spans + } + } + + // Span is either paragraph or heading. + if end-start == 1 && isHeading(lines[start]) { + kind = spanHeading + } else if end-start == 1 && isOldHeading(lines[start], lines, start) { + kind = spanOldHeading + } else { + kind = spanPara + } + } + + spans = append(spans, span{start, end, kind}) + i = end + } + + return spans +} + +// indented reports whether line is indented +// (starts with a leading space or tab). +func indented(line string) bool { + return line != "" && (line[0] == ' ' || line[0] == '\t') +} + +// unindent removes any common space/tab prefix +// from each line in lines, returning a copy of lines in which +// those prefixes have been trimmed from each line. +// It also replaces any lines containing only spaces with blank lines (empty strings). +func unindent(lines []string) []string { + // Trim leading and trailing blank lines. + for len(lines) > 0 && isBlank(lines[0]) { + lines = lines[1:] + } + for len(lines) > 0 && isBlank(lines[len(lines)-1]) { + lines = lines[:len(lines)-1] + } + if len(lines) == 0 { + return nil + } + + // Compute and remove common indentation. + prefix := leadingSpace(lines[0]) + for _, line := range lines[1:] { + if !isBlank(line) { + prefix = commonPrefix(prefix, leadingSpace(line)) + } + } + + out := make([]string, len(lines)) + for i, line := range lines { + line = strings.TrimPrefix(line, prefix) + if strings.TrimSpace(line) == "" { + line = "" + } + out[i] = line + } + for len(out) > 0 && out[0] == "" { + out = out[1:] + } + for len(out) > 0 && out[len(out)-1] == "" { + out = out[:len(out)-1] + } + return out +} + +// isBlank reports whether s is a blank line. +func isBlank(s string) bool { + return len(s) == 0 || (len(s) == 1 && s[0] == '\n') +} + +// commonPrefix returns the longest common prefix of a and b. +func commonPrefix(a, b string) string { + i := 0 + for i < len(a) && i < len(b) && a[i] == b[i] { + i++ + } + return a[0:i] +} + +// leadingSpace returns the longest prefix of s consisting of spaces and tabs. +func leadingSpace(s string) string { + i := 0 + for i < len(s) && (s[i] == ' ' || s[i] == '\t') { + i++ + } + return s[:i] +} + +// isOldHeading reports whether line is an old-style section heading. +// line is all[off]. +func isOldHeading(line string, all []string, off int) bool { + if off <= 0 || all[off-1] != "" || off+2 >= len(all) || all[off+1] != "" || leadingSpace(all[off+2]) != "" { + return false + } + + line = strings.TrimSpace(line) + + // a heading must start with an uppercase letter + r, _ := utf8.DecodeRuneInString(line) + if !unicode.IsLetter(r) || !unicode.IsUpper(r) { + return false + } + + // it must end in a letter or digit: + r, _ = utf8.DecodeLastRuneInString(line) + if !unicode.IsLetter(r) && !unicode.IsDigit(r) { + return false + } + + // exclude lines with illegal characters. we allow "()," + if strings.ContainsAny(line, ";:!?+*/=[]{}_^°&§~%#@<\">\\") { + return false + } + + // allow "'" for possessive "'s" only + for b := line; ; { + var ok bool + if _, b, ok = strings.Cut(b, "'"); !ok { + break + } + if b != "s" && !strings.HasPrefix(b, "s ") { + return false // ' not followed by s and then end-of-word + } + } + + // allow "." when followed by non-space + for b := line; ; { + var ok bool + if _, b, ok = strings.Cut(b, "."); !ok { + break + } + if b == "" || strings.HasPrefix(b, " ") { + return false // not followed by non-space + } + } + + return true +} + +// oldHeading returns the *Heading for the given old-style section heading line. +func (d *parseDoc) oldHeading(line string) Block { + return &Heading{Text: []Text{Plain(strings.TrimSpace(line))}} +} + +// isHeading reports whether line is a new-style section heading. +func isHeading(line string) bool { + return len(line) >= 2 && + line[0] == '#' && + (line[1] == ' ' || line[1] == '\t') && + strings.TrimSpace(line) != "#" +} + +// heading returns the *Heading for the given new-style section heading line. +func (d *parseDoc) heading(line string) Block { + return &Heading{Text: []Text{Plain(strings.TrimSpace(line[1:]))}} +} + +// code returns a code block built from the lines. +func (d *parseDoc) code(lines []string) *Code { + body := unindent(lines) + body = append(body, "") // to get final \n from Join + return &Code{Text: strings.Join(body, "\n")} +} + +// paragraph returns a paragraph block built from the lines. +// If the lines are link definitions, paragraph adds them to d and returns nil. +func (d *parseDoc) paragraph(lines []string) Block { + // Is this a block of known links? Handle. + var defs []*LinkDef + for _, line := range lines { + def, ok := parseLink(line) + if !ok { + goto NoDefs + } + defs = append(defs, def) + } + for _, def := range defs { + d.Links = append(d.Links, def) + if d.links[def.Text] == nil { + d.links[def.Text] = def + } + } + return nil +NoDefs: + + return &Paragraph{Text: []Text{Plain(strings.Join(lines, "\n"))}} +} + +// parseLink parses a single link definition line: +// +// [text]: url +// +// It returns the link definition and whether the line was well formed. +func parseLink(line string) (*LinkDef, bool) { + if line == "" || line[0] != '[' { + return nil, false + } + i := strings.Index(line, "]:") + if i < 0 || i+3 >= len(line) || (line[i+2] != ' ' && line[i+2] != '\t') { + return nil, false + } + + text := line[1:i] + url := strings.TrimSpace(line[i+3:]) + j := strings.Index(url, "://") + if j < 0 || !isScheme(url[:j]) { + return nil, false + } + + // Line has right form and has valid scheme://. + // That's good enough for us - we are not as picky + // about the characters beyond the :// as we are + // when extracting inline URLs from text. + return &LinkDef{Text: text, URL: url}, true +} + +// list returns a list built from the indented lines, +// using forceBlankBefore as the value of the List's ForceBlankBefore field. +func (d *parseDoc) list(lines []string, forceBlankBefore bool) *List { + num, _, _ := listMarker(lines[0]) + var ( + list *List = &List{ForceBlankBefore: forceBlankBefore} + item *ListItem + text []string + ) + flush := func() { + if item != nil { + if para := d.paragraph(text); para != nil { + item.Content = append(item.Content, para) + } + } + text = nil + } + + for _, line := range lines { + if n, after, ok := listMarker(line); ok && (n != "") == (num != "") { + // start new list item + flush() + + item = &ListItem{Number: n} + list.Items = append(list.Items, item) + line = after + } + line = strings.TrimSpace(line) + if line == "" { + list.ForceBlankBetween = true + flush() + continue + } + text = append(text, strings.TrimSpace(line)) + } + flush() + return list +} + +// listMarker parses the line as beginning with a list marker. +// If it can do that, it returns the numeric marker ("" for a bullet list), +// the rest of the line, and ok == true. +// Otherwise, it returns "", "", false. +func listMarker(line string) (num, rest string, ok bool) { + line = strings.TrimSpace(line) + if line == "" { + return "", "", false + } + + // Can we find a marker? + if r, n := utf8.DecodeRuneInString(line); r == '•' || r == '*' || r == '+' || r == '-' { + num, rest = "", line[n:] + } else if '0' <= line[0] && line[0] <= '9' { + n := 1 + for n < len(line) && '0' <= line[n] && line[n] <= '9' { + n++ + } + if n >= len(line) || (line[n] != '.' && line[n] != ')') { + return "", "", false + } + num, rest = line[:n], line[n+1:] + } else { + return "", "", false + } + + if !indented(rest) || strings.TrimSpace(rest) == "" { + return "", "", false + } + + return num, rest, true +} + +// isList reports whether the line is the first line of a list, +// meaning starts with a list marker after any indentation. +// (The caller is responsible for checking the line is indented, as appropriate.) +func isList(line string) bool { + _, _, ok := listMarker(line) + return ok +} + +// parseLinkedText parses text that is allowed to contain explicit links, +// such as [math.Sin] or [Go home page], into a slice of Text items. +// +// A “pkg” is only assumed to be a full import path if it starts with +// a domain name (a path element with a dot) or is one of the packages +// from the standard library (“[os]”, “[encoding/json]”, and so on). +// To avoid problems with maps, generics, and array types, doc links +// must be both preceded and followed by punctuation, spaces, tabs, +// or the start or end of a line. An example problem would be treating +// map[ast.Expr]TypeAndValue as containing a link. +func (d *parseDoc) parseLinkedText(text string) []Text { + var out []Text + wrote := 0 + flush := func(i int) { + if wrote < i { + out = d.parseText(out, text[wrote:i], true) + wrote = i + } + } + + start := -1 + var buf []byte + for i := 0; i < len(text); i++ { + c := text[i] + if c == '\n' || c == '\t' { + c = ' ' + } + switch c { + case '[': + start = i + case ']': + if start >= 0 { + if def, ok := d.links[string(buf)]; ok { + def.Used = true + flush(start) + out = append(out, &Link{ + Text: d.parseText(nil, text[start+1:i], false), + URL: def.URL, + }) + wrote = i + 1 + } else if link, ok := d.docLink(text[start+1:i], text[:start], text[i+1:]); ok { + flush(start) + link.Text = d.parseText(nil, text[start+1:i], false) + out = append(out, link) + wrote = i + 1 + } + } + start = -1 + buf = buf[:0] + } + if start >= 0 && i != start { + buf = append(buf, c) + } + } + + flush(len(text)) + return out +} + +// docLink parses text, which was found inside [ ] brackets, +// as a doc link if possible, returning the DocLink and ok == true +// or else nil, false. +// The before and after strings are the text before the [ and after the ] +// on the same line. Doc links must be preceded and followed by +// punctuation, spaces, tabs, or the start or end of a line. +func (d *parseDoc) docLink(text, before, after string) (link *DocLink, ok bool) { + if before != "" { + r, _ := utf8.DecodeLastRuneInString(before) + if !unicode.IsPunct(r) && r != ' ' && r != '\t' && r != '\n' { + return nil, false + } + } + if after != "" { + r, _ := utf8.DecodeRuneInString(after) + if !unicode.IsPunct(r) && r != ' ' && r != '\t' && r != '\n' { + return nil, false + } + } + text = strings.TrimPrefix(text, "*") + pkg, name, ok := splitDocName(text) + var recv string + if ok { + pkg, recv, _ = splitDocName(pkg) + } + if pkg != "" { + if pkg, ok = d.lookupPkg(pkg); !ok { + return nil, false + } + } else { + if ok = d.lookupSym(recv, name); !ok { + return nil, false + } + } + link = &DocLink{ + ImportPath: pkg, + Recv: recv, + Name: name, + } + return link, true +} + +// If text is of the form before.Name, where Name is a capitalized Go identifier, +// then splitDocName returns before, name, true. +// Otherwise it returns text, "", false. +func splitDocName(text string) (before, name string, foundDot bool) { + i := strings.LastIndex(text, ".") + name = text[i+1:] + if !isName(name) { + return text, "", false + } + if i >= 0 { + before = text[:i] + } + return before, name, true +} + +// parseText parses s as text and returns the result of appending +// those parsed Text elements to out. +// parseText does not handle explicit links like [math.Sin] or [Go home page]: +// those are handled by parseLinkedText. +// If autoLink is true, then parseText recognizes URLs and words from d.Words +// and converts those to links as appropriate. +func (d *parseDoc) parseText(out []Text, s string, autoLink bool) []Text { + var w strings.Builder + wrote := 0 + writeUntil := func(i int) { + w.WriteString(s[wrote:i]) + wrote = i + } + flush := func(i int) { + writeUntil(i) + if w.Len() > 0 { + out = append(out, Plain(w.String())) + w.Reset() + } + } + for i := 0; i < len(s); { + t := s[i:] + if autoLink { + if url, ok := autoURL(t); ok { + flush(i) + // Note: The old comment parser would look up the URL in words + // and replace the target with words[URL] if it was non-empty. + // That would allow creating links that display as one URL but + // when clicked go to a different URL. Not sure what the point + // of that is, so we're not doing that lookup here. + out = append(out, &Link{Auto: true, Text: []Text{Plain(url)}, URL: url}) + i += len(url) + wrote = i + continue + } + if id, ok := ident(t); ok { + url, italics := d.Words[id] + if !italics { + i += len(id) + continue + } + flush(i) + if url == "" { + out = append(out, Italic(id)) + } else { + out = append(out, &Link{Auto: true, Text: []Text{Italic(id)}, URL: url}) + } + i += len(id) + wrote = i + continue + } + } + switch { + case strings.HasPrefix(t, "``"): + if len(t) >= 3 && t[2] == '`' { + // Do not convert `` inside ```, in case people are mistakenly writing Markdown. + i += 3 + for i < len(t) && t[i] == '`' { + i++ + } + break + } + writeUntil(i) + w.WriteRune('“') + i += 2 + wrote = i + case strings.HasPrefix(t, "''"): + writeUntil(i) + w.WriteRune('”') + i += 2 + wrote = i + default: + i++ + } + } + flush(len(s)) + return out +} + +// autoURL checks whether s begins with a URL that should be hyperlinked. +// If so, it returns the URL, which is a prefix of s, and ok == true. +// Otherwise it returns "", false. +// The caller should skip over the first len(url) bytes of s +// before further processing. +func autoURL(s string) (url string, ok bool) { + // Find the ://. Fast path to pick off non-URL, + // since we call this at every position in the string. + // The shortest possible URL is ftp://x, 7 bytes. + var i int + switch { + case len(s) < 7: + return "", false + case s[3] == ':': + i = 3 + case s[4] == ':': + i = 4 + case s[5] == ':': + i = 5 + case s[6] == ':': + i = 6 + default: + return "", false + } + if i+3 > len(s) || s[i:i+3] != "://" { + return "", false + } + + // Check valid scheme. + if !isScheme(s[:i]) { + return "", false + } + + // Scan host part. Must have at least one byte, + // and must start and end in non-punctuation. + i += 3 + if i >= len(s) || !isHost(s[i]) || isPunct(s[i]) { + return "", false + } + i++ + end := i + for i < len(s) && isHost(s[i]) { + if !isPunct(s[i]) { + end = i + 1 + } + i++ + } + i = end + + // At this point we are definitely returning a URL (scheme://host). + // We just have to find the longest path we can add to it. + // Heuristics abound. + // We allow parens, braces, and brackets, + // but only if they match (#5043, #22285). + // We allow .,:;?! in the path but not at the end, + // to avoid end-of-sentence punctuation (#18139, #16565). + stk := []byte{} + end = i +Path: + for ; i < len(s); i++ { + if isPunct(s[i]) { + continue + } + if !isPath(s[i]) { + break + } + switch s[i] { + case '(': + stk = append(stk, ')') + case '{': + stk = append(stk, '}') + case '[': + stk = append(stk, ']') + case ')', '}', ']': + if len(stk) == 0 || stk[len(stk)-1] != s[i] { + break Path + } + stk = stk[:len(stk)-1] + } + if len(stk) == 0 { + end = i + 1 + } + } + + return s[:end], true +} + +// isScheme reports whether s is a recognized URL scheme. +// Note that if strings of new length (beyond 3-7) +// are added here, the fast path at the top of autoURL will need updating. +func isScheme(s string) bool { + switch s { + case "file", + "ftp", + "gopher", + "http", + "https", + "mailto", + "nntp": + return true + } + return false +} + +// isHost reports whether c is a byte that can appear in a URL host, +// like www.example.com or user@[::1]:8080 +func isHost(c byte) bool { + // mask is a 128-bit bitmap with 1s for allowed bytes, + // so that the byte c can be tested with a shift and an and. + // If c > 128, then 1<<c and 1<<(c-64) will both be zero, + // and this function will return false. + const mask = 0 | + (1<<26-1)<<'A' | + (1<<26-1)<<'a' | + (1<<10-1)<<'0' | + 1<<'_' | + 1<<'@' | + 1<<'-' | + 1<<'.' | + 1<<'[' | + 1<<']' | + 1<<':' + + return ((uint64(1)<<c)&(mask&(1<<64-1)) | + (uint64(1)<<(c-64))&(mask>>64)) != 0 +} + +// isPunct reports whether c is a punctuation byte that can appear +// inside a path but not at the end. +func isPunct(c byte) bool { + // mask is a 128-bit bitmap with 1s for allowed bytes, + // so that the byte c can be tested with a shift and an and. + // If c > 128, then 1<<c and 1<<(c-64) will both be zero, + // and this function will return false. + const mask = 0 | + 1<<'.' | + 1<<',' | + 1<<':' | + 1<<';' | + 1<<'?' | + 1<<'!' + + return ((uint64(1)<<c)&(mask&(1<<64-1)) | + (uint64(1)<<(c-64))&(mask>>64)) != 0 +} + +// isPath reports whether c is a (non-punctuation) path byte. +func isPath(c byte) bool { + // mask is a 128-bit bitmap with 1s for allowed bytes, + // so that the byte c can be tested with a shift and an and. + // If c > 128, then 1<<c and 1<<(c-64) will both be zero, + // and this function will return false. + const mask = 0 | + (1<<26-1)<<'A' | + (1<<26-1)<<'a' | + (1<<10-1)<<'0' | + 1<<'$' | + 1<<'\'' | + 1<<'(' | + 1<<')' | + 1<<'*' | + 1<<'+' | + 1<<'&' | + 1<<'#' | + 1<<'=' | + 1<<'@' | + 1<<'~' | + 1<<'_' | + 1<<'/' | + 1<<'-' | + 1<<'[' | + 1<<']' | + 1<<'{' | + 1<<'}' | + 1<<'%' + + return ((uint64(1)<<c)&(mask&(1<<64-1)) | + (uint64(1)<<(c-64))&(mask>>64)) != 0 +} + +// isName reports whether s is a capitalized Go identifier (like Name). +func isName(s string) bool { + t, ok := ident(s) + if !ok || t != s { + return false + } + r, _ := utf8.DecodeRuneInString(s) + return unicode.IsUpper(r) +} + +// ident checks whether s begins with a Go identifier. +// If so, it returns the identifier, which is a prefix of s, and ok == true. +// Otherwise it returns "", false. +// The caller should skip over the first len(id) bytes of s +// before further processing. +func ident(s string) (id string, ok bool) { + // Scan [\pL_][\pL_0-9]* + n := 0 + for n < len(s) { + if c := s[n]; c < utf8.RuneSelf { + if isIdentASCII(c) && (n > 0 || c < '0' || c > '9') { + n++ + continue + } + break + } + r, nr := utf8.DecodeRuneInString(s[n:]) + if unicode.IsLetter(r) { + n += nr + continue + } + break + } + return s[:n], n > 0 +} + +// isIdentASCII reports whether c is an ASCII identifier byte. +func isIdentASCII(c byte) bool { + // mask is a 128-bit bitmap with 1s for allowed bytes, + // so that the byte c can be tested with a shift and an and. + // If c > 128, then 1<<c and 1<<(c-64) will both be zero, + // and this function will return false. + const mask = 0 | + (1<<26-1)<<'A' | + (1<<26-1)<<'a' | + (1<<10-1)<<'0' | + 1<<'_' + + return ((uint64(1)<<c)&(mask&(1<<64-1)) | + (uint64(1)<<(c-64))&(mask>>64)) != 0 +} + +// validImportPath reports whether path is a valid import path. +// It is a lightly edited copy of golang.org/x/mod/module.CheckImportPath. +func validImportPath(path string) bool { + if !utf8.ValidString(path) { + return false + } + if path == "" { + return false + } + if path[0] == '-' { + return false + } + if strings.Contains(path, "//") { + return false + } + if path[len(path)-1] == '/' { + return false + } + elemStart := 0 + for i, r := range path { + if r == '/' { + if !validImportPathElem(path[elemStart:i]) { + return false + } + elemStart = i + 1 + } + } + return validImportPathElem(path[elemStart:]) +} + +func validImportPathElem(elem string) bool { + if elem == "" || elem[0] == '.' || elem[len(elem)-1] == '.' { + return false + } + for i := 0; i < len(elem); i++ { + if !importPathOK(elem[i]) { + return false + } + } + return true +} + +func importPathOK(c byte) bool { + // mask is a 128-bit bitmap with 1s for allowed bytes, + // so that the byte c can be tested with a shift and an and. + // If c > 128, then 1<<c and 1<<(c-64) will both be zero, + // and this function will return false. + const mask = 0 | + (1<<26-1)<<'A' | + (1<<26-1)<<'a' | + (1<<10-1)<<'0' | + 1<<'-' | + 1<<'.' | + 1<<'~' | + 1<<'_' | + 1<<'+' + + return ((uint64(1)<<c)&(mask&(1<<64-1)) | + (uint64(1)<<(c-64))&(mask>>64)) != 0 +} |