diff options
author | Bjørn Erik Pedersen <[email protected]> | 2017-08-19 13:16:00 +0200 |
---|---|---|
committer | Bjørn Erik Pedersen <[email protected]> | 2017-09-06 00:20:02 +0200 |
commit | 3b4f17bbc9ff789faa581ac278ad109d1ac5b816 (patch) | |
tree | 7b706ad5fce15afa1825b6565bae09bc517cc687 /related | |
parent | 16c9127663951ace1a3901cf669c49cc72780ced (diff) | |
download | hugo-3b4f17bbc9ff789faa581ac278ad109d1ac5b816.tar.gz hugo-3b4f17bbc9ff789faa581ac278ad109d1ac5b816.zip |
hugolib: Implement "related content"
This closes #98, even if this commit does not do full content text search.
We may revisit that problem in the future, but that deserves its own issue.
Fixes #98
Diffstat (limited to 'related')
-rw-r--r-- | related/inverted_index.go | 450 | ||||
-rw-r--r-- | related/inverted_index_test.go | 276 |
2 files changed, 726 insertions, 0 deletions
diff --git a/related/inverted_index.go b/related/inverted_index.go new file mode 100644 index 000000000..f0d598d33 --- /dev/null +++ b/related/inverted_index.go @@ -0,0 +1,450 @@ +// Copyright 2017-present The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package related holds code to help finding related content. +package related + +import ( + "errors" + "fmt" + "math" + "sort" + "strings" + "time" + + "github.com/gohugoio/hugo/common/types" + "github.com/mitchellh/mapstructure" +) + +var ( + _ Keyword = (*StringKeyword)(nil) + zeroDate = time.Time{} + + // DefaultConfig is the default related config. + DefaultConfig = Config{ + Threshold: 80, + Indices: IndexConfigs{ + IndexConfig{Name: "keywords", Weight: 100}, + IndexConfig{Name: "date", Weight: 10}, + }, + } +) + +/* +Config is the top level configuration element used to configure how to retrieve +related content in Hugo. + +An example site config.toml: + + [related] + threshold = 1 + [[related.indices]] + name = "keywords" + weight = 200 + [[related.indices]] + name = "tags" + weight = 100 + [[related.indices]] + name = "date" + weight = 1 + pattern = "2006" +*/ +type Config struct { + // Only include matches >= threshold, a normalized rank between 0 and 100. + Threshold int + + // To get stable "See also" sections we, by default, exclude newer related pages. + IncludeNewer bool + + // Will lower case all string values and queries to the indices. + // May get better results, but at a slight performance cost. + ToLower bool + + Indices IndexConfigs +} + +func (c *Config) Add(index IndexConfig) { + if c.ToLower { + index.ToLower = true + } + c.Indices = append(c.Indices, index) +} + +// IndexConfigs holds a set of index configurations. +type IndexConfigs []IndexConfig + +// IndexConfig configures an index. +type IndexConfig struct { + // The index name. This directly maps to a field or Param name. + Name string + + // Contextual pattern used to convert the Param value into a string. + // Currently only used for dates. Can be used to, say, bump posts in the same + // time frame when searching for related documents. + // For dates it follows Go's time.Format patterns, i.e. + // "2006" for YYYY and "200601" for YYYYMM. + Pattern string + + // This field's weight when doing multi-index searches. Higher is "better". + Weight int + + // Will lower case all string values in and queries tothis index. + // May get better accurate results, but at a slight performance cost. + ToLower bool +} + +// Document is the interface an indexable document in Hugo must fulfill. +type Document interface { + // SearchKeywords returns a list of keywords for the given index config. + SearchKeywords(cfg IndexConfig) ([]Keyword, error) + + // When this document was or will be published. + PubDate() time.Time +} + +// InvertedIndex holds an inverted index, also sometimes named posting list, which +// lists, for every possible search term, the documents that contain that term. +type InvertedIndex struct { + cfg Config + index map[string]map[Keyword][]Document + + minWeight int + maxWeight int +} + +func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) { + for _, conf := range idx.cfg.Indices { + if conf.Name == name { + return conf, true + } + } + + return IndexConfig{}, false +} + +// NewInvertedIndex creates a new InvertedIndex. +// Documents to index must be added in Add. +func NewInvertedIndex(cfg Config) *InvertedIndex { + idx := &InvertedIndex{index: make(map[string]map[Keyword][]Document), cfg: cfg} + for _, conf := range cfg.Indices { + idx.index[conf.Name] = make(map[Keyword][]Document) + if conf.Weight < idx.minWeight { + // By default, the weight scale starts at 0, but we allow + // negative weights. + idx.minWeight = conf.Weight + } + if conf.Weight > idx.maxWeight { + idx.maxWeight = conf.Weight + } + } + return idx +} + +// Add documents to the inverted index. +// The value must support == and !=. +func (idx *InvertedIndex) Add(docs ...Document) error { + var err error + for _, config := range idx.cfg.Indices { + if config.Weight == 0 { + // Disabled + continue + } + setm := idx.index[config.Name] + + for _, doc := range docs { + var words []Keyword + words, err = doc.SearchKeywords(config) + if err != nil { + continue + } + + for _, keyword := range words { + setm[keyword] = append(setm[keyword], doc) + } + } + } + + return err + +} + +// queryElement holds the index name and keywords that can be used to compose a +// search for related content. +type queryElement struct { + Index string + Keywords []Keyword +} + +func newQueryElement(index string, keywords ...Keyword) queryElement { + return queryElement{Index: index, Keywords: keywords} +} + +type ranks []*rank + +type rank struct { + Doc Document + Weight int + Matches int +} + +func (r *rank) addWeight(w int) { + r.Weight += w + r.Matches++ +} + +func newRank(doc Document, weight int) *rank { + return &rank{Doc: doc, Weight: weight, Matches: 1} +} + +func (r ranks) Len() int { return len(r) } +func (r ranks) Swap(i, j int) { r[i], r[j] = r[j], r[i] } +func (r ranks) Less(i, j int) bool { + if r[i].Weight == r[j].Weight { + return r[i].Doc.PubDate().After(r[j].Doc.PubDate()) + } + return r[i].Weight > r[j].Weight +} + +// SearchDoc finds the documents matching any of the keywords in the given indices +// against the given document. +// The resulting document set will be sorted according to number of matches +// and the index weights, and any matches with a rank below the configured +// threshold (normalize to 0..100) will be removed. +// If an index name is provided, only that index will be queried. +func (idx *InvertedIndex) SearchDoc(doc Document, indices ...string) ([]Document, error) { + var q []queryElement + + var configs IndexConfigs + + if len(indices) == 0 { + configs = idx.cfg.Indices + } else { + configs = make(IndexConfigs, len(indices)) + for i, indexName := range indices { + cfg, found := idx.getIndexCfg(indexName) + if !found { + return nil, fmt.Errorf("index %q not found", indexName) + } + configs[i] = cfg + } + } + + for _, cfg := range configs { + keywords, err := doc.SearchKeywords(cfg) + if err != nil { + return nil, err + } + + q = append(q, newQueryElement(cfg.Name, keywords...)) + + } + + return idx.searchDate(doc.PubDate(), q...) +} + +func (cfg IndexConfig) ToKeywords(v interface{}) ([]Keyword, error) { + var ( + keywords []Keyword + toLower = cfg.ToLower + ) + switch vv := v.(type) { + case string: + if toLower { + vv = strings.ToLower(vv) + } + keywords = append(keywords, StringKeyword(vv)) + case []string: + if toLower { + for i := 0; i < len(vv); i++ { + vv[i] = strings.ToLower(vv[i]) + } + } + keywords = append(keywords, StringsToKeywords(vv...)...) + case time.Time: + layout := "2006" + if cfg.Pattern != "" { + layout = cfg.Pattern + } + keywords = append(keywords, StringKeyword(vv.Format(layout))) + case nil: + return keywords, nil + default: + return keywords, fmt.Errorf("indexing currently not supported for for index %q and type %T", cfg.Name, vv) + } + + return keywords, nil +} + +// SearchKeyValues finds the documents matching any of the keywords in the given indices. +// The resulting document set will be sorted according to number of matches +// and the index weights, and any matches with a rank below the configured +// threshold (normalize to 0..100) will be removed. +func (idx *InvertedIndex) SearchKeyValues(args ...types.KeyValues) ([]Document, error) { + q := make([]queryElement, len(args)) + + for i, arg := range args { + var keywords []Keyword + key := arg.KeyString() + if key == "" { + return nil, fmt.Errorf("index %q not valid", arg.Key) + } + conf, found := idx.getIndexCfg(key) + if !found { + return nil, fmt.Errorf("index %q not found", key) + } + + for _, val := range arg.Values { + k, err := conf.ToKeywords(val) + if err != nil { + return nil, err + } + keywords = append(keywords, k...) + } + + q[i] = newQueryElement(conf.Name, keywords...) + + } + + return idx.search(q...) +} + +func (idx *InvertedIndex) search(query ...queryElement) ([]Document, error) { + return idx.searchDate(zeroDate, query...) +} + +func (idx *InvertedIndex) searchDate(upperDate time.Time, query ...queryElement) ([]Document, error) { + matchm := make(map[Document]*rank, 200) + applyDateFilter := !idx.cfg.IncludeNewer && !upperDate.IsZero() + + for _, el := range query { + setm, found := idx.index[el.Index] + if !found { + return []Document{}, fmt.Errorf("index for %q not found", el.Index) + } + + config, found := idx.getIndexCfg(el.Index) + if !found { + return []Document{}, fmt.Errorf("index config for %q not found", el.Index) + } + + for _, kw := range el.Keywords { + if docs, found := setm[kw]; found { + for _, doc := range docs { + if applyDateFilter { + // Exclude newer than the limit given + if doc.PubDate().After(upperDate) { + continue + } + } + r, found := matchm[doc] + if !found { + matchm[doc] = newRank(doc, config.Weight) + } else { + r.addWeight(config.Weight) + } + } + } + } + } + + if len(matchm) == 0 { + return []Document{}, nil + } + + matches := make(ranks, 0, 100) + + for _, v := range matchm { + avgWeight := v.Weight / v.Matches + weight := norm(avgWeight, idx.minWeight, idx.maxWeight) + threshold := idx.cfg.Threshold / v.Matches + + if weight >= threshold { + matches = append(matches, v) + } + } + + sort.Stable(matches) + + result := make([]Document, len(matches)) + + for i, m := range matches { + result[i] = m.Doc + } + + return result, nil +} + +// normalizes num to a number between 0 and 100. +func norm(num, min, max int) int { + if min > max { + panic("min > max") + } + return int(math.Floor((float64(num-min) / float64(max-min) * 100) + 0.5)) +} + +// DecodeConfig decodes a slice of map into Config. +func DecodeConfig(in interface{}) (Config, error) { + if in == nil { + return Config{}, errors.New("no related config provided") + } + + m, ok := in.(map[string]interface{}) + if !ok { + return Config{}, fmt.Errorf("expected map[string]interface {} got %T", in) + } + + if len(m) == 0 { + return Config{}, errors.New("empty related config provided") + } + + var c Config + + if err := mapstructure.WeakDecode(m, &c); err != nil { + return c, err + } + + if c.Threshold < 0 || c.Threshold > 100 { + return Config{}, errors.New("related threshold must be between 0 and 100") + } + + if c.ToLower { + for i, _ := range c.Indices { + c.Indices[i].ToLower = true + } + } + + return c, nil +} + +// StringKeyword is a string search keyword. +type StringKeyword string + +func (s StringKeyword) String() string { + return string(s) +} + +// Keyword is the interface a keyword in the search index must implement. +type Keyword interface { + String() string +} + +// StringsToKeywords converts the given slice of strings to a slice of Keyword. +func StringsToKeywords(s ...string) []Keyword { + kw := make([]Keyword, len(s)) + + for i := 0; i < len(s); i++ { + kw[i] = StringKeyword(s[i]) + } + + return kw +} diff --git a/related/inverted_index_test.go b/related/inverted_index_test.go new file mode 100644 index 000000000..781a969fb --- /dev/null +++ b/related/inverted_index_test.go @@ -0,0 +1,276 @@ +// Copyright 2017-present The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package related + +import ( + "fmt" + "math/rand" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +type testDoc struct { + keywords map[string][]Keyword + date time.Time +} + +func (k *testDoc) String() string { + s := "\n" + for k, v := range k.keywords { + s += k + ":\t\t" + for _, vv := range v { + s += " " + vv.String() + } + s += "\n" + } + return s +} + +func newTestDoc(name string, keywords ...string) *testDoc { + km := make(map[string][]Keyword) + + time.Sleep(1 * time.Millisecond) + kw := &testDoc{keywords: km, date: time.Now()} + + kw.addKeywords(name, keywords...) + return kw +} + +func (t *testDoc) addKeywords(name string, keywords ...string) *testDoc { + keywordm := createTestKeywords(name, keywords...) + + for k, v := range keywordm { + keywords := make([]Keyword, len(v)) + for i := 0; i < len(v); i++ { + keywords[i] = StringKeyword(v[i]) + } + t.keywords[k] = keywords + } + return t +} + +func createTestKeywords(name string, keywords ...string) map[string][]string { + return map[string][]string{ + name: keywords, + } +} + +func (k *testDoc) SearchKeywords(cfg IndexConfig) ([]Keyword, error) { + return k.keywords[cfg.Name], nil +} + +func (k *testDoc) PubDate() time.Time { + return k.date +} + +func TestSearch(t *testing.T) { + + config := Config{ + Threshold: 90, + IncludeNewer: false, + Indices: IndexConfigs{ + IndexConfig{Name: "tags", Weight: 50}, + IndexConfig{Name: "keywords", Weight: 65}, + }, + } + + idx := NewInvertedIndex(config) + //idx.debug = true + + docs := []Document{ + newTestDoc("tags", "a", "b", "c", "d"), + newTestDoc("tags", "b", "d", "g"), + newTestDoc("tags", "b", "h").addKeywords("keywords", "a"), + newTestDoc("tags", "g", "h").addKeywords("keywords", "a", "b"), + } + + idx.Add(docs...) + + t.Run("count", func(t *testing.T) { + assert := require.New(t) + assert.Len(idx.index, 2) + set1, found := idx.index["tags"] + assert.True(found) + // 6 tags + assert.Len(set1, 6) + + set2, found := idx.index["keywords"] + assert.True(found) + assert.Len(set2, 2) + + }) + + t.Run("search-tags", func(t *testing.T) { + assert := require.New(t) + m, err := idx.search(newQueryElement("tags", StringsToKeywords("a", "b", "d", "z")...)) + assert.NoError(err) + assert.Len(m, 2) + assert.Equal(docs[0], m[0]) + assert.Equal(docs[1], m[1]) + }) + + t.Run("search-tags-and-keywords", func(t *testing.T) { + assert := require.New(t) + m, err := idx.search( + newQueryElement("tags", StringsToKeywords("a", "b", "z")...), + newQueryElement("keywords", StringsToKeywords("a", "b")...)) + assert.NoError(err) + assert.Len(m, 3) + assert.Equal(docs[3], m[0]) + assert.Equal(docs[2], m[1]) + assert.Equal(docs[0], m[2]) + }) + + t.Run("searchdoc-all", func(t *testing.T) { + assert := require.New(t) + doc := newTestDoc("tags", "a").addKeywords("keywords", "a") + m, err := idx.SearchDoc(doc) + assert.NoError(err) + assert.Len(m, 2) + assert.Equal(docs[3], m[0]) + assert.Equal(docs[2], m[1]) + }) + + t.Run("searchdoc-tags", func(t *testing.T) { + assert := require.New(t) + doc := newTestDoc("tags", "a", "b", "d", "z").addKeywords("keywords", "a", "b") + m, err := idx.SearchDoc(doc, "tags") + assert.NoError(err) + assert.Len(m, 2) + assert.Equal(docs[0], m[0]) + assert.Equal(docs[1], m[1]) + }) + + t.Run("searchdoc-keywords-date", func(t *testing.T) { + assert := require.New(t) + doc := newTestDoc("tags", "a", "b", "d", "z").addKeywords("keywords", "a", "b") + // This will get a date newer than the others. + newDoc := newTestDoc("keywords", "a", "b") + idx.Add(newDoc) + + m, err := idx.SearchDoc(doc, "keywords") + assert.NoError(err) + assert.Len(m, 2) + assert.Equal(docs[3], m[0]) + }) + +} + +func BenchmarkRelatedNewIndex(b *testing.B) { + + pages := make([]*testDoc, 100) + numkeywords := 30 + allKeywords := make([]string, numkeywords) + for i := 0; i < numkeywords; i++ { + allKeywords[i] = fmt.Sprintf("keyword%d", i+1) + } + + for i := 0; i < len(pages); i++ { + start := rand.Intn(len(allKeywords)) + end := start + 3 + if end >= len(allKeywords) { + end = start + 1 + } + + kw := newTestDoc("tags", allKeywords[start:end]...) + if i%5 == 0 { + start := rand.Intn(len(allKeywords)) + end := start + 3 + if end >= len(allKeywords) { + end = start + 1 + } + kw.addKeywords("keywords", allKeywords[start:end]...) + } + + pages[i] = kw + } + + cfg := Config{ + Threshold: 50, + Indices: IndexConfigs{ + IndexConfig{Name: "tags", Weight: 100}, + IndexConfig{Name: "keywords", Weight: 200}, + }, + } + + b.Run("singles", func(b *testing.B) { + for i := 0; i < b.N; i++ { + idx := NewInvertedIndex(cfg) + for _, doc := range pages { + idx.Add(doc) + } + } + }) + + b.Run("all", func(b *testing.B) { + for i := 0; i < b.N; i++ { + idx := NewInvertedIndex(cfg) + docs := make([]Document, len(pages)) + for i := 0; i < len(pages); i++ { + docs[i] = pages[i] + } + idx.Add(docs...) + } + }) + +} + +func BenchmarkRelatedMatchesIn(b *testing.B) { + + q1 := newQueryElement("tags", StringsToKeywords("keyword2", "keyword5", "keyword32", "asdf")...) + q2 := newQueryElement("keywords", StringsToKeywords("keyword3", "keyword4")...) + + docs := make([]*testDoc, 1000) + numkeywords := 20 + allKeywords := make([]string, numkeywords) + for i := 0; i < numkeywords; i++ { + allKeywords[i] = fmt.Sprintf("keyword%d", i+1) + } + + cfg := Config{ + Threshold: 20, + Indices: IndexConfigs{ + IndexConfig{Name: "tags", Weight: 100}, + IndexConfig{Name: "keywords", Weight: 200}, + }, + } + + idx := NewInvertedIndex(cfg) + + for i := 0; i < len(docs); i++ { + start := rand.Intn(len(allKeywords)) + end := start + 3 + if end >= len(allKeywords) { + end = start + 1 + } + + index := "tags" + if i%5 == 0 { + index = "keywords" + } + + idx.Add(newTestDoc(index, allKeywords[start:end]...)) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + if i%10 == 0 { + idx.search(q2) + } else { + idx.search(q1) + } + } +} |