diff options
Diffstat (limited to '')
-rw-r--r-- | src/go/doc/comment.go | 513 |
1 files changed, 513 insertions, 0 deletions
diff --git a/src/go/doc/comment.go b/src/go/doc/comment.go new file mode 100644 index 0000000..92131a3 --- /dev/null +++ b/src/go/doc/comment.go @@ -0,0 +1,513 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Godoc comment extraction and comment -> HTML formatting. + +package doc + +import ( + "bytes" + "internal/lazyregexp" + "io" + "strings" + "text/template" // for HTMLEscape + "unicode" + "unicode/utf8" +) + +const ( + ldquo = "“" + rdquo = "”" + ulquo = "“" + urquo = "”" +) + +var ( + htmlQuoteReplacer = strings.NewReplacer(ulquo, ldquo, urquo, rdquo) + unicodeQuoteReplacer = strings.NewReplacer("``", ulquo, "''", urquo) +) + +// Escape comment text for HTML. If nice is set, +// also turn `` into “ and '' into ”. +func commentEscape(w io.Writer, text string, nice bool) { + if nice { + // In the first pass, we convert `` and '' into their unicode equivalents. + // This prevents them from being escaped in HTMLEscape. + text = convertQuotes(text) + var buf bytes.Buffer + template.HTMLEscape(&buf, []byte(text)) + // Now we convert the unicode quotes to their HTML escaped entities to maintain old behavior. + // We need to use a temp buffer to read the string back and do the conversion, + // otherwise HTMLEscape will escape & to & + htmlQuoteReplacer.WriteString(w, buf.String()) + return + } + template.HTMLEscape(w, []byte(text)) +} + +func convertQuotes(text string) string { + return unicodeQuoteReplacer.Replace(text) +} + +const ( + // Regexp for Go identifiers + identRx = `[\pL_][\pL_0-9]*` + + // Regexp for URLs + // Match parens, and check later for balance - see #5043, #22285 + // Match .,:;?! within path, but not at end - see #18139, #16565 + // This excludes some rare yet valid urls ending in common punctuation + // in order to allow sentences ending in URLs. + + // protocol (required) e.g. http + protoPart = `(https?|ftp|file|gopher|mailto|nntp)` + // host (required) e.g. www.example.com or [::1]:8080 + hostPart = `([a-zA-Z0-9_@\-.\[\]:]+)` + // path+query+fragment (optional) e.g. /path/index.html?q=foo#bar + pathPart = `([.,:;?!]*[a-zA-Z0-9$'()*+&#=@~_/\-\[\]%])*` + + urlRx = protoPart + `://` + hostPart + pathPart +) + +var matchRx = lazyregexp.New(`(` + urlRx + `)|(` + identRx + `)`) + +var ( + html_a = []byte(`<a href="`) + html_aq = []byte(`">`) + html_enda = []byte("</a>") + html_i = []byte("<i>") + html_endi = []byte("</i>") + html_p = []byte("<p>\n") + html_endp = []byte("</p>\n") + html_pre = []byte("<pre>") + html_endpre = []byte("</pre>\n") + html_h = []byte(`<h3 id="`) + html_hq = []byte(`">`) + html_endh = []byte("</h3>\n") +) + +// Emphasize and escape a line of text for HTML. URLs are converted into links; +// if the URL also appears in the words map, the link is taken from the map (if +// the corresponding map value is the empty string, the URL is not converted +// into a link). Go identifiers that appear in the words map are italicized; if +// the corresponding map value is not the empty string, it is considered a URL +// and the word is converted into a link. If nice is set, the remaining text's +// appearance is improved where it makes sense (e.g., `` is turned into “ +// and '' into ”). +func emphasize(w io.Writer, line string, words map[string]string, nice bool) { + for { + m := matchRx.FindStringSubmatchIndex(line) + if m == nil { + break + } + // m >= 6 (two parenthesized sub-regexps in matchRx, 1st one is urlRx) + + // write text before match + commentEscape(w, line[0:m[0]], nice) + + // adjust match for URLs + match := line[m[0]:m[1]] + if strings.Contains(match, "://") { + m0, m1 := m[0], m[1] + for _, s := range []string{"()", "{}", "[]"} { + open, close := s[:1], s[1:] // E.g., "(" and ")" + // require opening parentheses before closing parentheses (#22285) + if i := strings.Index(match, close); i >= 0 && i < strings.Index(match, open) { + m1 = m0 + i + match = line[m0:m1] + } + // require balanced pairs of parentheses (#5043) + for i := 0; strings.Count(match, open) != strings.Count(match, close) && i < 10; i++ { + m1 = strings.LastIndexAny(line[:m1], s) + match = line[m0:m1] + } + } + if m1 != m[1] { + // redo matching with shortened line for correct indices + m = matchRx.FindStringSubmatchIndex(line[:m[0]+len(match)]) + } + } + + // analyze match + url := "" + italics := false + if words != nil { + url, italics = words[match] + } + if m[2] >= 0 { + // match against first parenthesized sub-regexp; must be match against urlRx + if !italics { + // no alternative URL in words list, use match instead + url = match + } + italics = false // don't italicize URLs + } + + // write match + if len(url) > 0 { + w.Write(html_a) + template.HTMLEscape(w, []byte(url)) + w.Write(html_aq) + } + if italics { + w.Write(html_i) + } + commentEscape(w, match, nice) + if italics { + w.Write(html_endi) + } + if len(url) > 0 { + w.Write(html_enda) + } + + // advance + line = line[m[1]:] + } + commentEscape(w, line, nice) +} + +func indentLen(s string) int { + i := 0 + for i < len(s) && (s[i] == ' ' || s[i] == '\t') { + i++ + } + return i +} + +func isBlank(s string) bool { + return len(s) == 0 || (len(s) == 1 && s[0] == '\n') +} + +func commonPrefix(a, b string) string { + i := 0 + for i < len(a) && i < len(b) && a[i] == b[i] { + i++ + } + return a[0:i] +} + +func unindent(block []string) { + if len(block) == 0 { + return + } + + // compute maximum common white prefix + prefix := block[0][0:indentLen(block[0])] + for _, line := range block { + if !isBlank(line) { + prefix = commonPrefix(prefix, line[0:indentLen(line)]) + } + } + n := len(prefix) + + // remove + for i, line := range block { + if !isBlank(line) { + block[i] = line[n:] + } + } +} + +// heading returns the trimmed line if it passes as a section heading; +// otherwise it returns the empty string. +func heading(line string) string { + line = strings.TrimSpace(line) + if len(line) == 0 { + return "" + } + + // a heading must start with an uppercase letter + r, _ := utf8.DecodeRuneInString(line) + if !unicode.IsLetter(r) || !unicode.IsUpper(r) { + return "" + } + + // it must end in a letter or digit: + r, _ = utf8.DecodeLastRuneInString(line) + if !unicode.IsLetter(r) && !unicode.IsDigit(r) { + return "" + } + + // exclude lines with illegal characters. we allow "()," + if strings.ContainsAny(line, ";:!?+*/=[]{}_^°&§~%#@<\">\\") { + return "" + } + + // allow "'" for possessive "'s" only + for b := line; ; { + i := strings.IndexRune(b, '\'') + if i < 0 { + break + } + if i+1 >= len(b) || b[i+1] != 's' || (i+2 < len(b) && b[i+2] != ' ') { + return "" // not followed by "s " + } + b = b[i+2:] + } + + // allow "." when followed by non-space + for b := line; ; { + i := strings.IndexRune(b, '.') + if i < 0 { + break + } + if i+1 >= len(b) || b[i+1] == ' ' { + return "" // not followed by non-space + } + b = b[i+1:] + } + + return line +} + +type op int + +const ( + opPara op = iota + opHead + opPre +) + +type block struct { + op op + lines []string +} + +var nonAlphaNumRx = lazyregexp.New(`[^a-zA-Z0-9]`) + +func anchorID(line string) string { + // Add a "hdr-" prefix to avoid conflicting with IDs used for package symbols. + return "hdr-" + nonAlphaNumRx.ReplaceAllString(line, "_") +} + +// ToHTML converts comment text to formatted HTML. +// The comment was prepared by DocReader, +// so it is known not to have leading, trailing blank lines +// nor to have trailing spaces at the end of lines. +// The comment markers have already been removed. +// +// Each span of unindented non-blank lines is converted into +// a single paragraph. There is one exception to the rule: a span that +// consists of a single line, is followed by another paragraph span, +// begins with a capital letter, and contains no punctuation +// other than parentheses and commas is formatted as a heading. +// +// A span of indented lines is converted into a <pre> block, +// with the common indent prefix removed. +// +// URLs in the comment text are converted into links; if the URL also appears +// in the words map, the link is taken from the map (if the corresponding map +// value is the empty string, the URL is not converted into a link). +// +// A pair of (consecutive) backticks (`) is converted to a unicode left quote (“), and a pair of (consecutive) +// single quotes (') is converted to a unicode right quote (”). +// +// Go identifiers that appear in the words map are italicized; if the corresponding +// map value is not the empty string, it is considered a URL and the word is converted +// into a link. +func ToHTML(w io.Writer, text string, words map[string]string) { + for _, b := range blocks(text) { + switch b.op { + case opPara: + w.Write(html_p) + for _, line := range b.lines { + emphasize(w, line, words, true) + } + w.Write(html_endp) + case opHead: + w.Write(html_h) + id := "" + for _, line := range b.lines { + if id == "" { + id = anchorID(line) + w.Write([]byte(id)) + w.Write(html_hq) + } + commentEscape(w, line, true) + } + if id == "" { + w.Write(html_hq) + } + w.Write(html_endh) + case opPre: + w.Write(html_pre) + for _, line := range b.lines { + emphasize(w, line, nil, false) + } + w.Write(html_endpre) + } + } +} + +func blocks(text string) []block { + var ( + out []block + para []string + + lastWasBlank = false + lastWasHeading = false + ) + + close := func() { + if para != nil { + out = append(out, block{opPara, para}) + para = nil + } + } + + lines := strings.SplitAfter(text, "\n") + unindent(lines) + for i := 0; i < len(lines); { + line := lines[i] + if isBlank(line) { + // close paragraph + close() + i++ + lastWasBlank = true + continue + } + if indentLen(line) > 0 { + // close paragraph + close() + + // count indented or blank lines + j := i + 1 + for j < len(lines) && (isBlank(lines[j]) || indentLen(lines[j]) > 0) { + j++ + } + // but not trailing blank lines + for j > i && isBlank(lines[j-1]) { + j-- + } + pre := lines[i:j] + i = j + + unindent(pre) + + // put those lines in a pre block + out = append(out, block{opPre, pre}) + lastWasHeading = false + continue + } + + if lastWasBlank && !lastWasHeading && i+2 < len(lines) && + isBlank(lines[i+1]) && !isBlank(lines[i+2]) && indentLen(lines[i+2]) == 0 { + // current line is non-blank, surrounded by blank lines + // and the next non-blank line is not indented: this + // might be a heading. + if head := heading(line); head != "" { + close() + out = append(out, block{opHead, []string{head}}) + i += 2 + lastWasHeading = true + continue + } + } + + // open paragraph + lastWasBlank = false + lastWasHeading = false + para = append(para, lines[i]) + i++ + } + close() + + return out +} + +// ToText prepares comment text for presentation in textual output. +// It wraps paragraphs of text to width or fewer Unicode code points +// and then prefixes each line with the indent. In preformatted sections +// (such as program text), it prefixes each non-blank line with preIndent. +// +// A pair of (consecutive) backticks (`) is converted to a unicode left quote (“), and a pair of (consecutive) +// single quotes (') is converted to a unicode right quote (”). +func ToText(w io.Writer, text string, indent, preIndent string, width int) { + l := lineWrapper{ + out: w, + width: width, + indent: indent, + } + for _, b := range blocks(text) { + switch b.op { + case opPara: + // l.write will add leading newline if required + for _, line := range b.lines { + line = convertQuotes(line) + l.write(line) + } + l.flush() + case opHead: + w.Write(nl) + for _, line := range b.lines { + line = convertQuotes(line) + l.write(line + "\n") + } + l.flush() + case opPre: + w.Write(nl) + for _, line := range b.lines { + if isBlank(line) { + w.Write([]byte("\n")) + } else { + w.Write([]byte(preIndent)) + w.Write([]byte(line)) + } + } + } + } +} + +type lineWrapper struct { + out io.Writer + printed bool + width int + indent string + n int + pendSpace int +} + +var nl = []byte("\n") +var space = []byte(" ") +var prefix = []byte("// ") + +func (l *lineWrapper) write(text string) { + if l.n == 0 && l.printed { + l.out.Write(nl) // blank line before new paragraph + } + l.printed = true + + needsPrefix := false + isComment := strings.HasPrefix(text, "//") + for _, f := range strings.Fields(text) { + w := utf8.RuneCountInString(f) + // wrap if line is too long + if l.n > 0 && l.n+l.pendSpace+w > l.width { + l.out.Write(nl) + l.n = 0 + l.pendSpace = 0 + needsPrefix = isComment && !strings.HasPrefix(f, "//") + } + if l.n == 0 { + l.out.Write([]byte(l.indent)) + } + if needsPrefix { + l.out.Write(prefix) + needsPrefix = false + } + l.out.Write(space[:l.pendSpace]) + l.out.Write([]byte(f)) + l.n += l.pendSpace + w + l.pendSpace = 1 + } +} + +func (l *lineWrapper) flush() { + if l.n == 0 { + return + } + l.out.Write(nl) + l.pendSpace = 0 + l.n = 0 +} |