summaryrefslogtreecommitdiffstats
path: root/modules/markup/mdstripper
diff options
context:
space:
mode:
Diffstat (limited to 'modules/markup/mdstripper')
-rw-r--r--modules/markup/mdstripper/mdstripper.go199
-rw-r--r--modules/markup/mdstripper/mdstripper_test.go85
2 files changed, 284 insertions, 0 deletions
diff --git a/modules/markup/mdstripper/mdstripper.go b/modules/markup/mdstripper/mdstripper.go
new file mode 100644
index 00000000..2a69d952
--- /dev/null
+++ b/modules/markup/mdstripper/mdstripper.go
@@ -0,0 +1,199 @@
+// Copyright 2019 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package mdstripper
+
+import (
+ "bytes"
+ "io"
+ "net/url"
+ "strings"
+ "sync"
+
+ "code.gitea.io/gitea/modules/log"
+ "code.gitea.io/gitea/modules/markup/common"
+ "code.gitea.io/gitea/modules/setting"
+
+ "github.com/yuin/goldmark"
+ "github.com/yuin/goldmark/ast"
+ "github.com/yuin/goldmark/extension"
+ "github.com/yuin/goldmark/parser"
+ "github.com/yuin/goldmark/renderer"
+ "github.com/yuin/goldmark/renderer/html"
+ "github.com/yuin/goldmark/text"
+)
+
+var (
+ giteaHostInit sync.Once
+ giteaHost *url.URL
+)
+
+type stripRenderer struct {
+ localhost *url.URL
+ links []string
+ empty bool
+}
+
+func (r *stripRenderer) Render(w io.Writer, source []byte, doc ast.Node) error {
+ return ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
+ if !entering {
+ return ast.WalkContinue, nil
+ }
+ switch v := n.(type) {
+ case *ast.Text:
+ if !v.IsRaw() {
+ _, prevSibIsText := n.PreviousSibling().(*ast.Text)
+ coalesce := prevSibIsText
+ r.processString(
+ w,
+ v.Text(source),
+ coalesce)
+ if v.SoftLineBreak() {
+ r.doubleSpace(w)
+ }
+ }
+ return ast.WalkContinue, nil
+ case *ast.Link:
+ r.processLink(v.Destination)
+ return ast.WalkSkipChildren, nil
+ case *ast.AutoLink:
+ // This could be a reference to an issue or pull - if so convert it
+ r.processAutoLink(w, v.URL(source))
+ return ast.WalkSkipChildren, nil
+ }
+ return ast.WalkContinue, nil
+ })
+}
+
+func (r *stripRenderer) doubleSpace(w io.Writer) {
+ if !r.empty {
+ _, _ = w.Write([]byte{'\n'})
+ }
+}
+
+func (r *stripRenderer) processString(w io.Writer, text []byte, coalesce bool) {
+ // Always break-up words
+ if !coalesce {
+ r.doubleSpace(w)
+ }
+ _, _ = w.Write(text)
+ r.empty = false
+}
+
+// ProcessAutoLinks to detect and handle links to issues and pulls
+func (r *stripRenderer) processAutoLink(w io.Writer, link []byte) {
+ linkStr := string(link)
+ u, err := url.Parse(linkStr)
+ if err != nil {
+ // Process out of band
+ r.links = append(r.links, linkStr)
+ return
+ }
+
+ // Note: we're not attempting to match the URL scheme (http/https)
+ host := strings.ToLower(u.Host)
+ if host != "" && host != strings.ToLower(r.localhost.Host) {
+ // Process out of band
+ r.links = append(r.links, linkStr)
+ return
+ }
+
+ // We want: /user/repo/issues/3
+ parts := strings.Split(strings.TrimPrefix(u.EscapedPath(), r.localhost.EscapedPath()), "/")
+ if len(parts) != 5 || parts[0] != "" {
+ // Process out of band
+ r.links = append(r.links, linkStr)
+ return
+ }
+
+ var sep string
+ if parts[3] == "issues" {
+ sep = "#"
+ } else if parts[3] == "pulls" {
+ sep = "!"
+ } else {
+ // Process out of band
+ r.links = append(r.links, linkStr)
+ return
+ }
+
+ _, _ = w.Write([]byte(parts[1]))
+ _, _ = w.Write([]byte("/"))
+ _, _ = w.Write([]byte(parts[2]))
+ _, _ = w.Write([]byte(sep))
+ _, _ = w.Write([]byte(parts[4]))
+}
+
+func (r *stripRenderer) processLink(link []byte) {
+ // Links are processed out of band
+ r.links = append(r.links, string(link))
+}
+
+// GetLinks returns the list of link data collected while parsing
+func (r *stripRenderer) GetLinks() []string {
+ return r.links
+}
+
+// AddOptions adds given option to this renderer.
+func (r *stripRenderer) AddOptions(...renderer.Option) {
+ // no-op
+}
+
+// StripMarkdown parses markdown content by removing all markup and code blocks
+// in order to extract links and other references
+func StripMarkdown(rawBytes []byte) (string, []string) {
+ buf, links := StripMarkdownBytes(rawBytes)
+ return string(buf), links
+}
+
+var (
+ stripParser parser.Parser
+ once = sync.Once{}
+)
+
+// StripMarkdownBytes parses markdown content by removing all markup and code blocks
+// in order to extract links and other references
+func StripMarkdownBytes(rawBytes []byte) ([]byte, []string) {
+ once.Do(func() {
+ gdMarkdown := goldmark.New(
+ goldmark.WithExtensions(extension.Table,
+ extension.Strikethrough,
+ extension.TaskList,
+ extension.DefinitionList,
+ common.FootnoteExtension,
+ common.Linkify,
+ ),
+ goldmark.WithParserOptions(
+ parser.WithAttribute(),
+ parser.WithAutoHeadingID(),
+ ),
+ goldmark.WithRendererOptions(
+ html.WithUnsafe(),
+ ),
+ )
+ stripParser = gdMarkdown.Parser()
+ })
+ stripper := &stripRenderer{
+ localhost: getGiteaHost(),
+ links: make([]string, 0, 10),
+ empty: true,
+ }
+ reader := text.NewReader(rawBytes)
+ doc := stripParser.Parse(reader)
+ var buf bytes.Buffer
+ if err := stripper.Render(&buf, rawBytes, doc); err != nil {
+ log.Error("Unable to strip: %v", err)
+ }
+ return buf.Bytes(), stripper.GetLinks()
+}
+
+// getGiteaHostName returns a normalized string with the local host name, with no scheme or port information
+func getGiteaHost() *url.URL {
+ giteaHostInit.Do(func() {
+ var err error
+ if giteaHost, err = url.Parse(setting.AppURL); err != nil {
+ giteaHost = &url.URL{}
+ }
+ })
+ return giteaHost
+}
diff --git a/modules/markup/mdstripper/mdstripper_test.go b/modules/markup/mdstripper/mdstripper_test.go
new file mode 100644
index 00000000..ea34df0a
--- /dev/null
+++ b/modules/markup/mdstripper/mdstripper_test.go
@@ -0,0 +1,85 @@
+// Copyright 2019 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package mdstripper
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestMarkdownStripper(t *testing.T) {
+ type testItem struct {
+ markdown string
+ expectedText []string
+ expectedLinks []string
+ }
+
+ list := []testItem{
+ {
+ `
+## This is a title
+
+This is [one](link) to paradise.
+This **is emphasized**.
+This: should coalesce.
+
+` + "```" + `
+This is a code block.
+This should not appear in the output at all.
+` + "```" + `
+
+* Bullet 1
+* Bullet 2
+
+A HIDDEN ` + "`" + `GHOST` + "`" + ` IN THIS LINE.
+ `,
+ []string{
+ "This is a title",
+ "This is",
+ "to paradise.",
+ "This",
+ "is emphasized",
+ ".",
+ "This: should coalesce.",
+ "Bullet 1",
+ "Bullet 2",
+ "A HIDDEN",
+ "IN THIS LINE.",
+ },
+ []string{
+ "link",
+ },
+ },
+ {
+ "Simply closes: #29 yes",
+ []string{
+ "Simply closes: #29 yes",
+ },
+ []string{},
+ },
+ {
+ "Simply closes: !29 yes",
+ []string{
+ "Simply closes: !29 yes",
+ },
+ []string{},
+ },
+ }
+
+ for _, test := range list {
+ text, links := StripMarkdown([]byte(test.markdown))
+ rawlines := strings.Split(text, "\n")
+ lines := make([]string, 0, len(rawlines))
+ for _, line := range rawlines {
+ line := strings.TrimSpace(line)
+ if line != "" {
+ lines = append(lines, line)
+ }
+ }
+ assert.EqualValues(t, test.expectedText, lines)
+ assert.EqualValues(t, test.expectedLinks, links)
+ }
+}