about | summary | refs | log | tree | commit | diff | homepage
path: root/related
diff options
context:
space:
mode:
author Bjørn Erik Pedersen <[email protected]> 2023-02-23 15:20:31 +0100
committer Bjørn Erik Pedersen <[email protected]> 2023-02-23 17:06:22 +0100
commit e442a63bb7659d95aec2d48bf954cd9d61163559 (patch)
tree d8868929c74430e530f41ff8a359f451fd981729 /related
parent d5601e8391470be952ba48948c5a976884fea871 (diff)
download hugo-e442a63bb7659d95aec2d48bf954cd9d61163559.tar.gz
hugo-e442a63bb7659d95aec2d48bf954cd9d61163559.zip
related: Add config option cardinalityThreshold
Fixes #10744
Diffstat (limited to 'related')
-rw-r--r-- related/inverted_index.go 65
-rw-r--r-- related/inverted_index_test.go 35
2 files changed, 98 insertions, 2 deletions
diff --git a/related/inverted_index.go b/related/inverted_index.go
index eab97098a..967855133 100644
--- a/related/inverted_index.go
+++ b/related/inverted_index.go
@@ -135,9 +135,21 @@ type IndexConfig struct {
// This field's weight when doing multi-index searches. Higher is "better".
Weight int
+ // A percentage (0-100) used to remove common keywords from the index.
+ // As an example, setting this to 50 will remove all keywords that are
+ // used in more than 50% of the documents in the index.
+ CardinalityThreshold int
+
// Will lower case all string values in and queries to this index.
// May give more accurate results, but at a slight performance cost.
ToLower bool
+
+ // Counts the number of documents in the index.
+ numDocs int
+}
+
+func (cfg *IndexConfig) incrNumDocs() {
+ cfg.numDocs++
}
// Document is the interface an indexable document in Hugo must fulfill.
@@ -169,6 +181,9 @@ type InvertedIndex struct {
minWeight int
maxWeight int
+
+ // No modifications after this is set.
+ finalized bool
}
func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) {
@@ -202,8 +217,11 @@ func NewInvertedIndex(cfg Config) *InvertedIndex {
// Add documents to the inverted index.
// The value must support == and !=.
func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error {
+ if idx.finalized {
+ panic("index is finalized")
+ }
var err error
- for _, config := range idx.cfg.Indices {
+ for i, config := range idx.cfg.Indices {
if config.Weight == 0 {
// Disabled
continue
@@ -211,6 +229,7 @@ func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error {
setm := idx.index[config.Name]
for _, doc := range docs {
+ var added bool
var words []Keyword
words, err = doc.RelatedKeywords(config)
if err != nil {
@@ -218,22 +237,60 @@ func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error {
}
for _, keyword := range words {
+ added = true
setm[keyword] = append(setm[keyword], doc)
}
if config.Type == TypeFragments {
if fp, ok := doc.(FragmentProvider); ok {
for _, fragment := range fp.Fragments(ctx).Identifiers {
+ added = true
setm[FragmentKeyword(fragment)] = append(setm[FragmentKeyword(fragment)], doc)
}
}
}
+
+ if added {
+ c := &idx.cfg.Indices[i]
+ (*c).incrNumDocs()
+ }
}
}
return err
}
+func (idx *InvertedIndex) Finalize(ctx context.Context) error {
+ if idx.finalized {
+ return nil
+ }
+
+ for _, config := range idx.cfg.Indices {
+ if config.CardinalityThreshold == 0 {
+ continue
+ }
+ setm := idx.index[config.Name]
+ numDocs := config.numDocs
+ if numDocs == 0 {
+ continue
+ }
+
+ // Remove high cardinality terms.
+ for k, v := range setm {
+ percentageWithKeyword := int(math.Ceil(float64(len(v)) / float64(numDocs) * 100))
+ if percentageWithKeyword > config.CardinalityThreshold {
+ delete(setm, k)
+ }
+ }
+
+ }
+
+ idx.finalized = true
+
+ return nil
+
+}
+
// queryElement holds the index name and keywords that can be used to compose a
// search for related content.
type queryElement struct {
@@ -548,12 +605,16 @@ func DecodeConfig(m maps.Params) (Config, error) {
}
}
for i := range c.Indices {
- if c.Indices[i].Type == "" {
+ icfg := c.Indices[i]
+ if icfg.Type == "" {
c.Indices[i].Type = TypeBasic
}
if !validTypes[c.Indices[i].Type] {
return c, fmt.Errorf("invalid index type %q. Must be one of %v", c.Indices[i].Type, xmaps.Keys(validTypes))
}
+ if icfg.CardinalityThreshold < 0 || icfg.CardinalityThreshold > 100 {
+ return Config{}, errors.New("cardinalityThreshold threshold must be between 0 and 100")
+ }
}
return c, nil
diff --git a/related/inverted_index_test.go b/related/inverted_index_test.go
index d38a7f6eb..c7348e088 100644
--- a/related/inverted_index_test.go
+++ b/related/inverted_index_test.go
@@ -86,6 +86,41 @@ func (d *testDoc) PublishDate() time.Time {
return d.date
}
+func TestCardinalityThreshold(t *testing.T) {
+ c := qt.New(t)
+ config := Config{
+ Threshold: 90,
+ IncludeNewer: false,
+ Indices: IndexConfigs{
+ IndexConfig{Name: "tags", Weight: 50, CardinalityThreshold: 79},
+ IndexConfig{Name: "keywords", Weight: 65, CardinalityThreshold: 90},
+ },
+ }
+
+ idx := NewInvertedIndex(config)
+ hasKeyword := func(index, keyword string) bool {
+ _, found := idx.index[index][StringKeyword(keyword)]
+ return found
+ }
+
+ docs := []Document{
+ newTestDoc("tags", "a", "b", "c", "d"),
+ newTestDoc("tags", "b", "d", "g"),
+ newTestDoc("tags", "b", "d", "g"),
+ newTestDoc("tags", "b", "h").addKeywords("keywords", "a"),
+ newTestDoc("tags", "g", "h").addKeywords("keywords", "a", "b", "z"),
+ }
+
+ idx.Add(context.Background(), docs...)
+ c.Assert(idx.Finalize(context.Background()), qt.IsNil)
+ // Only tags=b should be removed.
+ c.Assert(hasKeyword("tags", "a"), qt.Equals, true)
+ c.Assert(hasKeyword("tags", "b"), qt.Equals, false)
+ c.Assert(hasKeyword("tags", "d"), qt.Equals, true)
+ c.Assert(hasKeyword("keywords", "b"), qt.Equals, true)
+
+}
+
func TestSearch(t *testing.T) {
config := Config{
Threshold: 90,