Don't count HTML markup in auto summaries

This commit also fixes a bug where a `</picture>` end tag was wrongly detected as the end of a paragraph, because the closing-tag search did not include the trailing `>`. This should be very rare, though. Closes #12837
author: Bjørn Erik Pedersen <[email protected]> 2024-09-10 08:54:03 +0200
committer: Bjørn Erik Pedersen <[email protected]> 2024-09-10 11:03:47 +0200
commit: 3d6baedaec306300f2c6f7ed471e774dca0f112a (patch)
tree: 4a7b7f62c337aceb9983f8a0490b7e153a7b3d23
parent: 84ee00bbc24328295237695a39e6e876ed186312 (diff)
download: hugo-3d6baedaec306300f2c6f7ed471e774dca0f112a.tar.gz
hugo-3d6baedaec306300f2c6f7ed471e774dca0f112a.zip
3 files changed, 77 insertions, 2 deletions
diff --git a/hugolib/page_test.go b/hugolib/page_test.go
index 66afd7d96..429ab2659 100644
--- a/hugolib/page_test.go
+++ b/hugolib/page_test.go
@@ -593,7 +593,7 @@ func TestPageSummary(t *testing.T) {
 		// Source is not Asciidoctor- or RST-compatible so don't test them
 		if ext != "ad" && ext != "rst" {
 			checkPageContent(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>\n\n<p>Additional text.</p>\n\n<p>Further text.</p>\n"), ext)
-			checkPageSummary(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"), ext)
+			checkPageSummary(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p><p>Additional text.</p>"), ext)
 		}
 		checkPageType(t, p, "page")
 	}
diff --git a/resources/page/page_markup.go b/resources/page/page_markup.go
index ef4a56e3a..44980e8b0 100644
--- a/resources/page/page_markup.go
+++ b/resources/page/page_markup.go
@@ -161,6 +161,16 @@ func (s *HtmlSummary) resolveParagraphTagAndSetWrapper(mt media.Type) tagReStart
 	return ptag
 }
 
+// Avoid counting words that are most likely HTML tokens.
+var (
+	isProbablyHTMLTag      = regexp.MustCompile(`^<\/?[A-Za-z]+>?$`)
+	isProablyHTMLAttribute = regexp.MustCompile(`^[A-Za-z]+=["']`)
+)
+
+func isProbablyHTMLToken(s string) bool {
+	return s == ">" || isProbablyHTMLTag.MatchString(s) || isProablyHTMLAttribute.MatchString(s)
+}
+
 // ExtractSummaryFromHTML extracts a summary from the given HTML content.
 func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK bool) (result HtmlSummary) {
 	result.source = input
@@ -173,6 +183,14 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
 	var count int
 
 	countWord := func(word string) int {
+		word = strings.TrimSpace(word)
+		if len(word) == 0 {
+			return 0
+		}
+		if isProbablyHTMLToken(word) {
+			return 0
+		}
+
 		if isCJK {
 			word = tpl.StripHTML(word)
 			runeCount := utf8.RuneCountInString(word)
@@ -193,7 +211,7 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
 
 	for j := result.WrapperStart.High; j < high; {
 		s := input[j:]
-		closingIndex := strings.Index(s, "</"+ptag.tagName)
+		closingIndex := strings.Index(s, "</"+ptag.tagName+">")
 
 		if closingIndex == -1 {
 			break
diff --git a/resources/page/page_markup_test.go b/resources/page/page_markup_test.go
index b7d363f8f..43eaae6f6 100644
--- a/resources/page/page_markup_test.go
+++ b/resources/page/page_markup_test.go
@@ -49,6 +49,46 @@ func TestExtractSummaryFromHTML(t *testing.T) {
 	}
 }
 
+// See https://discourse.gohugo.io/t/automatic-summarys-summarylength-seems-broken-in-the-case-of-plainify/51466/4
+// Also issue 12837
+func TestExtractSummaryFromHTMLLotsOfHTMLInSummary(t *testing.T) {
+	c := qt.New(t)
+
+	input := `
+<p>
+<div>
+  <picture>
+    <img src="imgs/1.jpg" alt="1"/>
+  </picture>
+  <picture>
+    <img src="imgs/2.jpg" alt="2"/>
+  </picture>
+  <picture>
+    <img src="imgs/3.jpg" alt="3"/>
+  </picture>
+  <picture>
+    <img src="imgs/4.jpg" alt="4"/>
+  </picture>
+  <picture>
+    <img src="imgs/5.jpg" alt="5"/>
+  </picture>
+</div>
+</p>
+<p>
+This is a story about a cat.
+</p>
+<p>
+The cat was white and fluffy.
+</p>
+<p>
+And it liked milk.
+</p>
+`
+
+	summary := ExtractSummaryFromHTML(media.Builtin.MarkdownType, input, 10, false)
+	c.Assert(strings.HasSuffix(summary.Summary(), "<p>\nThis is a story about a cat.\n</p>\n<p>\nThe cat was white and fluffy.\n</p>"), qt.IsTrue)
+}
+
 func TestExtractSummaryFromHTMLWithDivider(t *testing.T) {
 	c := qt.New(t)
 
@@ -114,6 +154,23 @@ func TestExpandDivider(t *testing.T) {
 	}
 }
 
+func TestIsProbablyHTMLToken(t *testing.T) {
+	c := qt.New(t)
+
+	for i, test := range []struct {
+		input  string
+		expect bool
+	}{
+		{"<p>", true},
+		{"<p", true},
+		{"width=\"32\"", true},
+		{"width='32'", true},
+		{"<p>Æøå", false},
+	} {
+		c.Assert(isProbablyHTMLToken(test.input), qt.Equals, test.expect, qt.Commentf("[%d] Test.expect %q", i, test.input))
+	}
+}
+
 func BenchmarkSummaryFromHTML(b *testing.B) {
 	b.StopTimer()
 	input := "<p>First paragraph</p><p>Second paragraph</p>"
author	Bjørn Erik Pedersen <[email protected]>	2024-09-10 08:54:03 +0200
committer	Bjørn Erik Pedersen <[email protected]>	2024-09-10 11:03:47 +0200
commit	3d6baedaec306300f2c6f7ed471e774dca0f112a (patch)
tree	4a7b7f62c337aceb9983f8a0490b7e153a7b3d23
parent	84ee00bbc24328295237695a39e6e876ed186312 (diff)
download	hugo-3d6baedaec306300f2c6f7ed471e774dca0f112a.tar.gz hugo-3d6baedaec306300f2c6f7ed471e774dca0f112a.zip