Skip to content

Commit 3d6baed

Browse files
committed
Don't count HTML markup in auto summaries
This commit also fixes a bug where a `</picture>` end tag was wrongly used to detect the end of a paragraph. This should be very rare, though. Closes gohugoio#12837
1 parent 84ee00b commit 3d6baed

File tree

3 files changed

+77
-2
lines changed

3 files changed

+77
-2
lines changed

hugolib/page_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -593,7 +593,7 @@ func TestPageSummary(t *testing.T) {
593593
// Source is not Asciidoctor- or RST-compatible so don't test them
594594
if ext != "ad" && ext != "rst" {
595595
checkPageContent(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>\n\n<p>Additional text.</p>\n\n<p>Further text.</p>\n"), ext)
596-
checkPageSummary(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"), ext)
596+
checkPageSummary(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p><p>Additional text.</p>"), ext)
597597
}
598598
checkPageType(t, p, "page")
599599
}

resources/page/page_markup.go

+19-1
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,16 @@ func (s *HtmlSummary) resolveParagraphTagAndSetWrapper(mt media.Type) tagReStart
161161
return ptag
162162
}
163163

164+
// Heuristics for recognizing whitespace-separated chunks that are most likely
// HTML markup rather than prose, so that they are not counted as words.
var (
	// isProbablyHTMLTag matches a complete start or end tag, or the leading
	// part of a tag whose attributes follow as separate chunks, e.g. "<p>",
	// "</h2>", "<br/>" or "<img". Tag names must start with a letter and may
	// contain digits and hyphens (headings, custom elements).
	isProbablyHTMLTag = regexp.MustCompile(`^</?[A-Za-z][A-Za-z0-9-]*/?>?$`)

	// isProbablyHTMLAttribute matches the beginning of a quoted attribute,
	// e.g. `width="32"`, `alt='x'/>` or `data-id="3"`. Only the name, the
	// equals sign and the opening quote are checked.
	isProbablyHTMLAttribute = regexp.MustCompile(`^[A-Za-z][A-Za-z0-9_:.-]*=["']`)
)

// isProbablyHTMLToken reports whether s looks like a piece of HTML markup:
// a tag (or the start of one), a quoted attribute, or the lone ">" or "/>"
// that terminates a tag whose attributes were split off by whitespace.
// It is a heuristic; s is expected to be a single chunk without surrounding
// whitespace.
func isProbablyHTMLToken(s string) bool {
	return s == ">" || s == "/>" ||
		isProbablyHTMLTag.MatchString(s) ||
		isProbablyHTMLAttribute.MatchString(s)
}
173+
164174
// ExtractSummaryFromHTML extracts a summary from the given HTML content.
165175
func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK bool) (result HtmlSummary) {
166176
result.source = input
@@ -173,6 +183,14 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
173183
var count int
174184

175185
countWord := func(word string) int {
186+
word = strings.TrimSpace(word)
187+
if len(word) == 0 {
188+
return 0
189+
}
190+
if isProbablyHTMLToken(word) {
191+
return 0
192+
}
193+
176194
if isCJK {
177195
word = tpl.StripHTML(word)
178196
runeCount := utf8.RuneCountInString(word)
@@ -193,7 +211,7 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
193211

194212
for j := result.WrapperStart.High; j < high; {
195213
s := input[j:]
196-
closingIndex := strings.Index(s, "</"+ptag.tagName)
214+
closingIndex := strings.Index(s, "</"+ptag.tagName+">")
197215

198216
if closingIndex == -1 {
199217
break

resources/page/page_markup_test.go

+57
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,46 @@ func TestExtractSummaryFromHTML(t *testing.T) {
4949
}
5050
}
5151

52+
// TestExtractSummaryFromHTMLLotsOfHTMLInSummary feeds ExtractSummaryFromHTML an
// input whose first paragraph contains only markup (a div with five
// picture/img elements) followed by three short text paragraphs.
// See https://discourse.gohugo.io/t/automatic-summarys-summarylength-seems-broken-in-the-case-of-plainify/51466/4
53+
// Also issue 12837
54+
func TestExtractSummaryFromHTMLLotsOfHTMLInSummary(t *testing.T) {
55+
c := qt.New(t)
56+
57+
input := `
58+
<p>
59+
<div>
60+
<picture>
61+
<img src="imgs/1.jpg" alt="1"/>
62+
</picture>
63+
<picture>
64+
<img src="imgs/2.jpg" alt="2"/>
65+
</picture>
66+
<picture>
67+
<img src="imgs/3.jpg" alt="3"/>
68+
</picture>
69+
<picture>
70+
<img src="imgs/4.jpg" alt="4"/>
71+
</picture>
72+
<picture>
73+
<img src="imgs/5.jpg" alt="5"/>
74+
</picture>
75+
</div>
76+
</p>
77+
<p>
78+
This is a story about a cat.
79+
</p>
80+
<p>
81+
The cat was white and fluffy.
82+
</p>
83+
<p>
84+
And it liked milk.
85+
</p>
86+
`
87+
88+
// Ask for a 10-word summary. The assertion requires the summary to end with
// the first two text paragraphs, i.e. the markup-only paragraph must not
// have consumed the word count.
summary := ExtractSummaryFromHTML(media.Builtin.MarkdownType, input, 10, false)
89+
c.Assert(strings.HasSuffix(summary.Summary(), "<p>\nThis is a story about a cat.\n</p>\n<p>\nThe cat was white and fluffy.\n</p>"), qt.IsTrue)
90+
}
91+
5292
func TestExtractSummaryFromHTMLWithDivider(t *testing.T) {
5393
c := qt.New(t)
5494

@@ -114,6 +154,23 @@ func TestExpandDivider(t *testing.T) {
114154
}
115155
}
116156

157+
func TestIsProbablyHTMLToken(t *testing.T) {
158+
c := qt.New(t)
159+
160+
for i, test := range []struct {
161+
input string
162+
expect bool
163+
}{
164+
{"<p>", true},
165+
{"<p", true},
166+
{"width=\"32\"", true},
167+
{"width='32'", true},
168+
{"<p>Æøå", false},
169+
} {
170+
c.Assert(isProbablyHTMLToken(test.input), qt.Equals, test.expect, qt.Commentf("[%d] Test.expect %q", i, test.input))
171+
}
172+
}
173+
117174
func BenchmarkSummaryFromHTML(b *testing.B) {
118175
b.StopTimer()
119176
input := "<p>First paragraph</p><p>Second paragraph</p>"

0 commit comments

Comments
 (0)