diff options
author | Bjørn Erik Pedersen <[email protected]> | 2024-09-10 08:54:03 +0200 |
---|---|---|
committer | Bjørn Erik Pedersen <[email protected]> | 2024-09-10 11:03:47 +0200 |
commit | 3d6baedaec306300f2c6f7ed471e774dca0f112a (patch) | |
tree | 4a7b7f62c337aceb9983f8a0490b7e153a7b3d23 /resources/page | |
parent | 84ee00bbc24328295237695a39e6e876ed186312 (diff) | |
download | hugo-3d6baedaec306300f2c6f7ed471e774dca0f112a.tar.gz hugo-3d6baedaec306300f2c6f7ed471e774dca0f112a.zip |
Don't count HTML markup in auto summaries
This commit also fixes a bug where a `</picture>` end tag was wrongly detected as the end of a paragraph. This should be very rare, though.
Closes #12837
Diffstat (limited to 'resources/page')
-rw-r--r-- | resources/page/page_markup.go | 20 | ||||
-rw-r--r-- | resources/page/page_markup_test.go | 57 |
2 files changed, 76 insertions, 1 deletions
diff --git a/resources/page/page_markup.go b/resources/page/page_markup.go index ef4a56e3a..44980e8b0 100644 --- a/resources/page/page_markup.go +++ b/resources/page/page_markup.go @@ -161,6 +161,16 @@ func (s *HtmlSummary) resolveParagraphTagAndSetWrapper(mt media.Type) tagReStart return ptag } +// Avoid counting words that are most likely HTML tokens. +var ( + isProbablyHTMLTag = regexp.MustCompile(`^<\/?[A-Za-z]+>?$`) + isProablyHTMLAttribute = regexp.MustCompile(`^[A-Za-z]+=["']`) +) + +func isProbablyHTMLToken(s string) bool { + return s == ">" || isProbablyHTMLTag.MatchString(s) || isProablyHTMLAttribute.MatchString(s) +} + // ExtractSummaryFromHTML extracts a summary from the given HTML content. func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK bool) (result HtmlSummary) { result.source = input @@ -173,6 +183,14 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo var count int countWord := func(word string) int { + word = strings.TrimSpace(word) + if len(word) == 0 { + return 0 + } + if isProbablyHTMLToken(word) { + return 0 + } + if isCJK { word = tpl.StripHTML(word) runeCount := utf8.RuneCountInString(word) @@ -193,7 +211,7 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo for j := result.WrapperStart.High; j < high; { s := input[j:] - closingIndex := strings.Index(s, "</"+ptag.tagName) + closingIndex := strings.Index(s, "</"+ptag.tagName+">") if closingIndex == -1 { break diff --git a/resources/page/page_markup_test.go b/resources/page/page_markup_test.go index b7d363f8f..43eaae6f6 100644 --- a/resources/page/page_markup_test.go +++ b/resources/page/page_markup_test.go @@ -49,6 +49,46 @@ func TestExtractSummaryFromHTML(t *testing.T) { } } +// See https://discourse.gohugo.io/t/automatic-summarys-summarylength-seems-broken-in-the-case-of-plainify/51466/4 +// Also issue 12837 +func TestExtractSummaryFromHTMLLotsOfHTMLInSummary(t *testing.T) { + c := qt.New(t) + + 
input := ` +<p> +<div> + <picture> + <img src="imgs/1.jpg" alt="1"/> + </picture> + <picture> + <img src="imgs/2.jpg" alt="2"/> + </picture> + <picture> + <img src="imgs/3.jpg" alt="3"/> + </picture> + <picture> + <img src="imgs/4.jpg" alt="4"/> + </picture> + <picture> + <img src="imgs/5.jpg" alt="5"/> + </picture> +</div> +</p> +<p> +This is a story about a cat. +</p> +<p> +The cat was white and fluffy. +</p> +<p> +And it liked milk. +</p> +` + + summary := ExtractSummaryFromHTML(media.Builtin.MarkdownType, input, 10, false) + c.Assert(strings.HasSuffix(summary.Summary(), "<p>\nThis is a story about a cat.\n</p>\n<p>\nThe cat was white and fluffy.\n</p>"), qt.IsTrue) +} + func TestExtractSummaryFromHTMLWithDivider(t *testing.T) { c := qt.New(t) @@ -114,6 +154,23 @@ func TestExpandDivider(t *testing.T) { } } +func TestIsProbablyHTMLToken(t *testing.T) { + c := qt.New(t) + + for i, test := range []struct { + input string + expect bool + }{ + {"<p>", true}, + {"<p", true}, + {"width=\"32\"", true}, + {"width='32'", true}, + {"<p>Æøå", false}, + } { + c.Assert(isProbablyHTMLToken(test.input), qt.Equals, test.expect, qt.Commentf("[%d] Test.expect %q", i, test.input)) + } +} + func BenchmarkSummaryFromHTML(b *testing.B) { b.StopTimer() input := "<p>First paragraph</p><p>Second paragraph</p>" |