aboutsummaryrefslogtreecommitdiffhomepage
path: root/resources/page
diff options
context:
space:
mode:
author Bjørn Erik Pedersen <[email protected]> 2024-09-10 08:54:03 +0200
committer Bjørn Erik Pedersen <[email protected]> 2024-09-10 11:03:47 +0200
commit 3d6baedaec306300f2c6f7ed471e774dca0f112a (patch)
tree 4a7b7f62c337aceb9983f8a0490b7e153a7b3d23 /resources/page
parent 84ee00bbc24328295237695a39e6e876ed186312 (diff)
download hugo-3d6baedaec306300f2c6f7ed471e774dca0f112a.tar.gz
hugo-3d6baedaec306300f2c6f7ed471e774dca0f112a.zip
Don't count HTML markup in auto summaries
This commit also fixes a bug where a `</picture>` end tag was wrongly used to detect the end of a paragraph. This should be very rare, though. Closes #12837
Diffstat (limited to 'resources/page')
-rw-r--r-- resources/page/page_markup.go 20
-rw-r--r-- resources/page/page_markup_test.go 57
2 files changed, 76 insertions, 1 deletions
diff --git a/resources/page/page_markup.go b/resources/page/page_markup.go
index ef4a56e3a..44980e8b0 100644
--- a/resources/page/page_markup.go
+++ b/resources/page/page_markup.go
@@ -161,6 +161,16 @@ func (s *HtmlSummary) resolveParagraphTagAndSetWrapper(mt media.Type) tagReStart
return ptag
}
+// Avoid counting words that are most likely HTML tokens.
+var (
+ isProbablyHTMLTag = regexp.MustCompile(`^<\/?[A-Za-z]+>?$`)
+ isProablyHTMLAttribute = regexp.MustCompile(`^[A-Za-z]+=["']`)
+)
+
+func isProbablyHTMLToken(s string) bool {
+ return s == ">" || isProbablyHTMLTag.MatchString(s) || isProablyHTMLAttribute.MatchString(s)
+}
+
// ExtractSummaryFromHTML extracts a summary from the given HTML content.
func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK bool) (result HtmlSummary) {
result.source = input
@@ -173,6 +183,14 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
var count int
countWord := func(word string) int {
+ word = strings.TrimSpace(word)
+ if len(word) == 0 {
+ return 0
+ }
+ if isProbablyHTMLToken(word) {
+ return 0
+ }
+
if isCJK {
word = tpl.StripHTML(word)
runeCount := utf8.RuneCountInString(word)
@@ -193,7 +211,7 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
for j := result.WrapperStart.High; j < high; {
s := input[j:]
- closingIndex := strings.Index(s, "</"+ptag.tagName)
+ closingIndex := strings.Index(s, "</"+ptag.tagName+">")
if closingIndex == -1 {
break
diff --git a/resources/page/page_markup_test.go b/resources/page/page_markup_test.go
index b7d363f8f..43eaae6f6 100644
--- a/resources/page/page_markup_test.go
+++ b/resources/page/page_markup_test.go
@@ -49,6 +49,46 @@ func TestExtractSummaryFromHTML(t *testing.T) {
}
}
+// See https://discourse.gohugo.io/t/automatic-summarys-summarylength-seems-broken-in-the-case-of-plainify/51466/4
+// Also issue 12837
+func TestExtractSummaryFromHTMLLotsOfHTMLInSummary(t *testing.T) {
+ c := qt.New(t)
+
+ input := `
+<p>
+<div>
+ <picture>
+ <img src="imgs/1.jpg" alt="1"/>
+ </picture>
+ <picture>
+ <img src="imgs/2.jpg" alt="2"/>
+ </picture>
+ <picture>
+ <img src="imgs/3.jpg" alt="3"/>
+ </picture>
+ <picture>
+ <img src="imgs/4.jpg" alt="4"/>
+ </picture>
+ <picture>
+ <img src="imgs/5.jpg" alt="5"/>
+ </picture>
+</div>
+</p>
+<p>
+This is a story about a cat.
+</p>
+<p>
+The cat was white and fluffy.
+</p>
+<p>
+And it liked milk.
+</p>
+`
+
+ summary := ExtractSummaryFromHTML(media.Builtin.MarkdownType, input, 10, false)
+ c.Assert(strings.HasSuffix(summary.Summary(), "<p>\nThis is a story about a cat.\n</p>\n<p>\nThe cat was white and fluffy.\n</p>"), qt.IsTrue)
+}
+
func TestExtractSummaryFromHTMLWithDivider(t *testing.T) {
c := qt.New(t)
@@ -114,6 +154,23 @@ func TestExpandDivider(t *testing.T) {
}
}
+func TestIsProbablyHTMLToken(t *testing.T) {
+ c := qt.New(t)
+
+ for i, test := range []struct {
+ input string
+ expect bool
+ }{
+ {"<p>", true},
+ {"<p", true},
+ {"width=\"32\"", true},
+ {"width='32'", true},
+ {"<p>Æøå", false},
+ } {
+ c.Assert(isProbablyHTMLToken(test.input), qt.Equals, test.expect, qt.Commentf("[%d] Test.expect %q", i, test.input))
+ }
+}
+
func BenchmarkSummaryFromHTML(b *testing.B) {
b.StopTimer()
input := "<p>First paragraph</p><p>Second paragraph</p>"