Skip to content

Commit

Permalink
Add inverted index
Browse files Browse the repository at this point in the history
See #98
  • Loading branch information
bep committed Aug 29, 2017
1 parent 8127e37 commit a68aa43
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 0 deletions.
78 changes: 78 additions & 0 deletions index/inverted_index.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright 2017-present The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package index

// Document is the interface an indexable document in Hugo must fulfill.
//
// InvertedIndex.MachesIn uses Document values as map keys, so the concrete
// type behind the interface must be comparable (support == and !=).
type Document interface {
// SearchKeywords maps a keyword set (tags, categories etc.) to a list of terms
// in that keyword set.
SearchKeywords() map[string][]string
}

// InvertedIndex holds an inverted index, also sometimes named posting list, which
// lists, for every possible search term, the documents that contain that term.
//
// The outer map is keyed by keyword set name (the keys returned by
// Document.SearchKeywords) and the inner map by keyword; each value is the
// list of documents that were added under that keyword.
type InvertedIndex map[string]map[string][]Document

// NewInvertedIndex returns an empty, ready-to-use InvertedIndex. Populate it
// by calling Add once per document.
func NewInvertedIndex() InvertedIndex {
	return InvertedIndex{}
}

// Add indexes doc under every keyword of every keyword set reported by
// doc.SearchKeywords. A keyword set is registered in the index even when its
// keyword list is empty, and a keyword listed more than once yields one
// posting per occurrence.
// The value must support == and !=.
// The returned error is always nil.
func (index InvertedIndex) Add(doc Document) (err error) {
	for name, terms := range doc.SearchKeywords() {
		postings, known := index[name]
		if !known {
			// First document in this keyword set: create its keyword map.
			postings = make(map[string][]Document)
			index[name] = postings
		}
		for _, term := range terms {
			postings[term] = append(postings[term], doc)
		}
	}

	return nil
}

// MachesIn finds the documents matching any of the keywords in the given
// keywords set.
//
// Each matching document is returned exactly once, ordered by first match:
// keywords are visited in argument order and, for each keyword, documents in
// the order they were added. The result is never nil; an unknown set or no
// matches yield an empty slice.
//
// Documents are used as map keys for de-duplication, so their dynamic type
// must be comparable; a non-comparable type causes a runtime panic.
func (index InvertedIndex) MachesIn(set string, keywords ...string) []Document {
	// TODO(bep) ranking

	setm, found := index[set]
	if !found {
		return []Document{}
	}

	seen := make(map[Document]struct{})
	matches := []Document{}

	for _, keyword := range keywords {
		// A keyword missing from the set yields a nil slice, so the inner
		// loop simply does not run.
		for _, doc := range setm[keyword] {
			if _, dup := seen[doc]; dup {
				continue
			}
			seen[doc] = struct{}{}
			matches = append(matches, doc)
		}
	}

	return matches
}
118 changes: 118 additions & 0 deletions index/inverted_index_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// Copyright 2017-present The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package index

import (
"fmt"
"math/rand"
"testing"

"github.com/stretchr/testify/require"
)

// tstKeywords is a minimal Document implementation used by the tests and
// benchmarks in this file.
type tstKeywords struct {
	name     string              // label shown in String output
	keywords map[string][]string // keyword set name -> terms in that set
}

// newTstKeywords builds a test document whose keywords all live in the
// keyword set "set1".
func newTstKeywords(name string, keywords []string) *tstKeywords {
	return &tstKeywords{
		name:     name,
		keywords: map[string][]string{"set1": keywords},
	}
}

// SearchKeywords returns the document's keyword sets, satisfying Document.
func (k *tstKeywords) SearchKeywords() map[string][]string {
	return k.keywords
}

// String renders the document as its name followed by its quoted keywords.
func (k *tstKeywords) String() string {
	label, sets := k.name, k.keywords
	return fmt.Sprintf("%s: %q", label, sets)
}

// TestNewInvertedIndex indexes two documents that share one keyword and
// checks the resulting index layout plus a single-keyword lookup.
func TestNewInvertedIndex(t *testing.T) {
	assert := require.New(t)

	d1 := newTstKeywords("d1", []string{"a", "b"})
	d2 := newTstKeywords("d2", []string{"b", "c"})

	idx := NewInvertedIndex()
	for _, doc := range []*tstKeywords{d1, d2} {
		assert.NoError(idx.Add(doc))
	}

	// Both documents only use the keyword set "set1".
	assert.Len(idx, 1)
	postings, ok := idx["set1"]
	assert.True(ok)
	// 3 keywords
	assert.Len(postings, 3)

	// Only d1 carries the keyword "a".
	hits := idx.MachesIn("set1", "a")
	assert.Len(hits, 1)
	assert.Equal(d1, hits[0])
}

// BenchmarkNewInvertedIndex measures building a fresh index from 100
// pre-generated documents. Each document carries a window of consecutive
// keywords (three, or one when the window would reach the end of the pool)
// taken from a pool of 50 keywords. Document generation is excluded from the
// timed region.
func BenchmarkNewInvertedIndex(b *testing.B) {
	docs := make([]*tstKeywords, 100)
	numkeywords := 50
	allKeywords := make([]string, numkeywords)
	for i := range allKeywords {
		allKeywords[i] = fmt.Sprintf("keyword%d", i+1)
	}

	for i := range docs {
		start := rand.Intn(len(allKeywords))
		end := start + 3
		if end >= len(allKeywords) {
			// Too close to the end of the pool for a window of three.
			end = start + 1
		}
		docs[i] = newTstKeywords(fmt.Sprintf("d%d", i), allKeywords[start:end])
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		idx := NewInvertedIndex()
		for _, doc := range docs {
			// Fail loudly instead of benchmarking a broken index.
			if err := idx.Add(doc); err != nil {
				b.Fatal(err)
			}
		}
	}
}

func BenchmarkMatchesIn(b *testing.B) {

docs := make([]*tstKeywords, 100)
numkeywords := 50
allKeywords := make([]string, numkeywords)
for i := 0; i < numkeywords; i++ {
allKeywords[i] = fmt.Sprintf("keyword%d", i+1)
}

idx := NewInvertedIndex()

for i := 0; i < len(docs); i++ {
start := rand.Intn(len(allKeywords))
end := start + 3
if end >= len(allKeywords) {
end = start + 1
}

idx.Add(newTstKeywords(fmt.Sprintf("d%d", i), allKeywords[start:end]))
}

b.ResetTimer()
for i := 0; i < b.N; i++ {
idx.MachesIn("set1", "keyword2", "keyword5", "keyword32", "asdf")
}
}

0 comments on commit a68aa43

Please sign in to comment.