From e442a63bb7659d95aec2d48bf954cd9d61163559 Mon Sep 17 00:00:00 2001 From: Bjørn Erik Pedersen Date: Thu, 23 Feb 2023 15:20:31 +0100 Subject: related: Add config option cardinalityThreshold Fixes #10744 --- related/inverted_index.go | 65 ++++++++++++++++++++++++++++++++++++++++-- related/inverted_index_test.go | 35 +++++++++++++++++++++++ 2 files changed, 98 insertions(+), 2 deletions(-) (limited to 'related') diff --git a/related/inverted_index.go b/related/inverted_index.go index eab97098a..967855133 100644 --- a/related/inverted_index.go +++ b/related/inverted_index.go @@ -135,9 +135,21 @@ type IndexConfig struct { // This field's weight when doing multi-index searches. Higher is "better". Weight int + // A percentage (0-100) used to remove common keywords from the index. + // As an example, setting this to 50 will remove all keywords that are + // used in more than 50% of the documents in the index. + CardinalityThreshold int + // Will lower case all string values in and queries tothis index. // May get better accurate results, but at a slight performance cost. ToLower bool + + // Counts the number of documents in the index. + numDocs int +} + +func (cfg *IndexConfig) incrNumDocs() { + cfg.numDocs++ } // Document is the interface an indexable document in Hugo must fulfill. @@ -169,6 +181,9 @@ type InvertedIndex struct { minWeight int maxWeight int + + // No modifications after this is set. + finalized bool } func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) { @@ -202,8 +217,11 @@ func NewInvertedIndex(cfg Config) *InvertedIndex { // Add documents to the inverted index. // The value must support == and !=. func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error { + if idx.finalized { + panic("index is finalized") + } var err error - for _, config := range idx.cfg.Indices { + for i, config := range idx.cfg.Indices { if config.Weight == 0 { // Disabled continue @@ -211,6 +229,7 @@ func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error { setm := idx.index[config.Name] for _, doc := range docs { + var added bool var words []Keyword words, err = doc.RelatedKeywords(config) if err != nil { @@ -218,22 +237,60 @@ func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error { } for _, keyword := range words { + added = true setm[keyword] = append(setm[keyword], doc) } if config.Type == TypeFragments { if fp, ok := doc.(FragmentProvider); ok { for _, fragment := range fp.Fragments(ctx).Identifiers { + added = true setm[FragmentKeyword(fragment)] = append(setm[FragmentKeyword(fragment)], doc) } } } + + if added { + c := &idx.cfg.Indices[i] + (*c).incrNumDocs() + } } } return err } +func (idx *InvertedIndex) Finalize(ctx context.Context) error { + if idx.finalized { + return nil + } + + for _, config := range idx.cfg.Indices { + if config.CardinalityThreshold == 0 { + continue + } + setm := idx.index[config.Name] + numDocs := config.numDocs + if numDocs == 0 { + continue + } + + // Remove high cardinality terms. + for k, v := range setm { + percentageWithKeyword := int(math.Ceil(float64(len(v)) / float64(numDocs) * 100)) + if percentageWithKeyword > config.CardinalityThreshold { + delete(setm, k) + } + } + + } + + idx.finalized = true + + return nil + +} + // queryElement holds the index name and keywords that can be used to compose a // search for related content. type queryElement struct { @@ -548,12 +605,16 @@ func DecodeConfig(m maps.Params) (Config, error) { } } for i := range c.Indices { - if c.Indices[i].Type == "" { + icfg := c.Indices[i] + if icfg.Type == "" { c.Indices[i].Type = TypeBasic } if !validTypes[c.Indices[i].Type] { return c, fmt.Errorf("invalid index type %q. Must be one of %v", c.Indices[i].Type, xmaps.Keys(validTypes)) } + if icfg.CardinalityThreshold < 0 || icfg.CardinalityThreshold > 100 { + return Config{}, errors.New("cardinalityThreshold threshold must be between 0 and 100") + } } return c, nil diff --git a/related/inverted_index_test.go b/related/inverted_index_test.go index d38a7f6eb..c7348e088 100644 --- a/related/inverted_index_test.go +++ b/related/inverted_index_test.go @@ -86,6 +86,41 @@ func (d *testDoc) PublishDate() time.Time { return d.date } +func TestCardinalityThreshold(t *testing.T) { + c := qt.New(t) + config := Config{ + Threshold: 90, + IncludeNewer: false, + Indices: IndexConfigs{ + IndexConfig{Name: "tags", Weight: 50, CardinalityThreshold: 79}, + IndexConfig{Name: "keywords", Weight: 65, CardinalityThreshold: 90}, + }, + } + + idx := NewInvertedIndex(config) + hasKeyword := func(index, keyword string) bool { + _, found := idx.index[index][StringKeyword(keyword)] + return found + } + + docs := []Document{ + newTestDoc("tags", "a", "b", "c", "d"), + newTestDoc("tags", "b", "d", "g"), + newTestDoc("tags", "b", "d", "g"), + newTestDoc("tags", "b", "h").addKeywords("keywords", "a"), + newTestDoc("tags", "g", "h").addKeywords("keywords", "a", "b", "z"), + } + + idx.Add(context.Background(), docs...) + c.Assert(idx.Finalize(context.Background()), qt.IsNil) + // Only tags=b should be removed. + c.Assert(hasKeyword("tags", "a"), qt.Equals, true) + c.Assert(hasKeyword("tags", "b"), qt.Equals, false) + c.Assert(hasKeyword("tags", "d"), qt.Equals, true) + c.Assert(hasKeyword("keywords", "b"), qt.Equals, true) + +} + func TestSearch(t *testing.T) { config := Config{ Threshold: 90, -- cgit v1.2.3