Skip to content

Commit

Permalink
Add Thesaurus API and Synonym Index Handling in Search (#268)
Browse files Browse the repository at this point in the history
- Add Thesaurus API to find equivalent terms for a given term.
- Enable Synonym Document objects with Synonym Field objects to provide
  Synonym Definitions for creating the thesaurus in the search index.
- Add a synonym section to handle synonym document processing; persist
  the synonym index in segments (separating it from the inverted and
  vector indexes), and manage the synonym index merging during segment
  merges.
- Add command line tooling to access the thesaurus by parsing the
segment file.
- Update zap.md to reflect the index file format for thesaurus support.

---------

Co-authored-by: Abhinav Dangeti <[email protected]>
  • Loading branch information
CascadingRadium and abhinavdangeti authored Dec 19, 2024
1 parent e1dde3e commit 82553cd
Show file tree
Hide file tree
Showing 17 changed files with 2,182 additions and 56 deletions.
1 change: 1 addition & 0 deletions build.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64
docValueOffset: 0, // docValueOffsets identified automatically by the section
fieldFSTs: make(map[uint16]*vellum.FST),
vecIndexCache: newVectorIndexCache(),
synIndexCache: newSynonymIndexCache(),
// following fields gets populated by loadFieldsNew
fieldsMap: make(map[string]uint16),
dictLocs: make([]uint64, 0),
Expand Down
140 changes: 140 additions & 0 deletions cmd/zap/cmd/synonym.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// Copyright (c) 2024 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
"bytes"
"encoding/binary"
"fmt"

"github.com/RoaringBitmap/roaring/roaring64"
"github.com/blevesearch/vellum"
"github.com/spf13/cobra"
)

// thesaurusCmd prints the contents of a single named thesaurus stored in a
// zap segment file: the raw vellum FST bytes, the synonym-term-ID -> term
// mapping, and each term's encoded {termID|docNum} synonym postings.
var thesaurusCmd = &cobra.Command{
	Use:   "thesaurus [path] [name]",
	Short: "thesaurus prints the thesaurus with the specified name",
	Long:  `The thesaurus command lets you print the thesaurus with the specified name.`,
	RunE: func(cmd *cobra.Command, args []string) error {
		pos := segment.FieldsIndexOffset()
		if pos == 0 {
			// this is the case only for older file formats
			return fmt.Errorf("file format not supported")
		}
		if len(args) < 2 {
			return fmt.Errorf("must specify thesaurus name")
		}

		// Resolve the on-disk start address of the named thesaurus.
		pos, err := segment.ThesaurusAddr(args[1])
		if err != nil {
			return fmt.Errorf("error determining address: %v", err)
		}
		fmt.Printf("thesaurus with name %s starts at %d (%x)\n", args[1], pos, pos)

		data := segment.Data()

		// Layout: uvarint FST length, FST bytes, uvarint synonym count,
		// then (uvarint synID, uvarint termLen, term bytes) per synonym.
		vellumLen, read := binary.Uvarint(data[pos : pos+binary.MaxVarintLen64])
		pos += uint64(read)
		fmt.Printf("vellum length: %d\n", vellumLen)

		fstBytes := data[pos : pos+vellumLen]
		pos += vellumLen
		fst, err := vellum.Load(fstBytes)
		if err != nil {
			return fmt.Errorf("thesaurus name %s vellum err: %v", args[1], err)
		}
		fmt.Printf("raw vellum data:\n % x\n", fstBytes)

		numSyns, n := binary.Uvarint(data[pos : pos+binary.MaxVarintLen64])
		pos += uint64(n)
		if numSyns == 0 {
			return fmt.Errorf("no synonyms found")
		}
		synTermMap := make(map[uint32][]byte, numSyns)
		for i := 0; i < int(numSyns); i++ {
			synID, n := binary.Uvarint(data[pos : pos+binary.MaxVarintLen64])
			pos += uint64(n)
			termLen, n := binary.Uvarint(data[pos : pos+binary.MaxVarintLen64])
			pos += uint64(n)
			if termLen == 0 {
				return fmt.Errorf("term length is 0")
			}
			term := data[pos : pos+uint64(termLen)]
			pos += uint64(termLen)
			synTermMap[uint32(synID)] = term
		}

		fmt.Printf("termID to term mapping:\n")
		fmt.Printf(" termID\tterm\n")
		for k, v := range synTermMap {
			fmt.Printf(" %d\t%s\n", k, string(v))
		}

		// Walk the FST: each value is the offset of that term's synonyms
		// postings list (a serialized roaring64 bitmap).
		fmt.Printf("thesaurus (term -> [{termID|docNum},...]):\n")
		var totalTerms int
		itr, err := fst.Iterator(nil, nil)
		for err == nil {
			var sl *roaring64.Bitmap
			currTerm, currVal := itr.Current()
			sl, err = readSynonymsList(currVal, data)
			if err != nil {
				return err
			}
			sitr := sl.Iterator()
			printStr := fmt.Sprintf(" %s -> [", currTerm)
			// Join entries with commas between them. The previous code
			// appended a trailing comma and stripped the last character
			// unconditionally, which chopped the "[" whenever the
			// postings list was empty.
			first := true
			for sitr.HasNext() {
				encodedVal := sitr.Next()
				tID, docNum := decodeSynonym(encodedVal)
				if !first {
					printStr += ","
				}
				printStr += fmt.Sprintf("{%d|%d}", tID, docNum)
				first = false
			}
			printStr += "]"
			fmt.Printf("%s\n", printStr)
			totalTerms++
			err = itr.Next()
		}
		fmt.Printf("Total terms in thesaurus : %d\n", totalTerms)
		if err != nil && err != vellum.ErrIteratorDone {
			return fmt.Errorf("error iterating thesaurus: %v", err)
		}
		return nil
	},
}

// readSynonymsList decodes the synonyms postings list stored at
// postingsOffset within data. The on-disk layout is a uvarint byte length
// followed by that many bytes of serialized roaring64 bitmap.
func readSynonymsList(postingsOffset uint64, data []byte) (*roaring64.Bitmap, error) {
	postingsLen, read := binary.Uvarint(data[postingsOffset : postingsOffset+binary.MaxVarintLen64])
	start := postingsOffset + uint64(read)

	bm := roaring64.NewBitmap()
	if _, err := bm.ReadFrom(bytes.NewReader(data[start : start+postingsLen])); err != nil {
		return nil, fmt.Errorf("error loading roaring bitmap: %v", err)
	}
	return bm, nil
}

// decodeSynonym splits a packed 64-bit synonym code into its two halves:
// the synonym (term) ID stored in the upper 32 bits and the document
// number stored in the lower 32 bits.
func decodeSynonym(synonymCode uint64) (synonymID uint32, docID uint32) {
	const lowMask = (1 << 32) - 1
	synonymID = uint32(synonymCode >> 32)
	docID = uint32(synonymCode & lowMask)
	return synonymID, docID
}

// init registers the thesaurus command with the zap root command so it is
// reachable from the command line.
func init() {
	RootCmd.AddCommand(thesaurusCmd)
}
154 changes: 154 additions & 0 deletions doc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,157 @@ func (s *stubField) NumPlainTextBytes() uint64 {
func (s *stubField) Compose(field string, length int, freq index.TokenFrequencies) {

}

// -----------------------------------------------------------------------------
// stubSynonymField is a minimal synonym-field stub used by the tests to
// feed synonym definitions into the indexing path.
type stubSynonymField struct {
	name     string   // field name
	analyzer string   // analyzer identifier handed to analyzeStubTerm
	input    []string // optional left-hand-side terms of the definition
	synonyms []string // synonym terms

	// synonymMap is populated by Analyze: term -> its synonyms.
	synonymMap map[string][]string
}

// Name returns the field's name.
func (s *stubSynonymField) Name() string {
	return s.name
}

// Value returns nil; the stub carries no raw value bytes.
func (s *stubSynonymField) Value() []byte {
	return nil
}

// ArrayPositions returns nil; the stub is never part of an array.
func (s *stubSynonymField) ArrayPositions() []uint64 {
	return nil
}

// EncodedFieldType returns 0; the encoded type is unused by these tests.
func (s *stubSynonymField) EncodedFieldType() byte {
	return 0
}

// Analyze runs the stub analyzer over the field's input and synonym terms
// and builds the term -> synonyms map consumed by IterateSynonyms.
func (s *stubSynonymField) Analyze() {
	analyze := func(terms []string) []string {
		if len(terms) == 0 {
			return nil
		}
		out := make([]string, len(terms))
		for i := range terms {
			out[i] = analyzeStubTerm(terms[i], s.analyzer)
		}
		return out
	}
	s.synonymMap = processSynonymData(analyze(s.input), analyze(s.synonyms))
}

// Options returns zero-valued indexing options; unused by these tests.
func (s *stubSynonymField) Options() index.FieldIndexingOptions {
	return 0
}

// AnalyzedLength returns 0; token statistics are unused by these tests.
func (s *stubSynonymField) AnalyzedLength() int {
	return 0
}

// AnalyzedTokenFrequencies returns nil; unused by these tests.
func (s *stubSynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies {
	return nil
}

// NumPlainTextBytes returns 0; unused by these tests.
func (s *stubSynonymField) NumPlainTextBytes() uint64 {
	return 0
}

// IterateSynonyms invokes visitor for every analyzed term together with
// that term's synonyms. Analyze must have been called first to populate
// the map; iteration order follows Go map order (randomized).
// Receiver renamed from sf to s for consistency with every other method
// of this type.
func (s *stubSynonymField) IterateSynonyms(visitor func(term string, synonyms []string)) {
	for term, synonyms := range s.synonymMap {
		visitor(term, synonyms)
	}
}

// processSynonymData builds the term -> synonyms mapping for a synonym
// definition. With a non-empty input list, every input term maps to its
// own copy of the full synonyms list (one-way definition). With no input,
// the definition is symmetric: every synonym maps to all the others, in
// their original order.
func processSynonymData(input []string, synonyms []string) map[string][]string {
	if len(input) > 0 {
		rv := make(map[string][]string, len(input))
		for _, term := range input {
			// Each entry gets its own copy so callers can't alias slices.
			cloned := make([]string, len(synonyms))
			copy(cloned, synonyms)
			rv[term] = cloned
		}
		return rv
	}

	rv := make(map[string][]string, len(synonyms))
	for i, syn := range synonyms {
		others := make([]string, 0, len(synonyms)-1)
		others = append(others, synonyms[:i]...)
		others = append(others, synonyms[i+1:]...)
		rv[syn] = others
	}
	return rv
}

// analyzeStubTerm is the test stand-in for a real analyzer: it simply
// lower-cases the term. The analyzer argument exists only for signature
// compatibility and is ignored.
func analyzeStubTerm(term string, analyzer string) string {
	return strings.ToLower(term)
}

// newStubSynonymField constructs a stub synonym field with the given
// name, analyzer, input terms, and synonyms. Analyze must be called
// before IterateSynonyms to populate the synonym map.
func newStubSynonymField(name string, analyzer string, input []string, synonyms []string) index.SynonymField {
	return &stubSynonymField{
		name:     name,
		analyzer: analyzer,
		input:    input,
		synonyms: synonyms,
	}
}

// -----------------------------------------------------------------------------
// -----------------------------------------------------------------------------
// stubSynonymDocument is a minimal synonym-document stub wrapping an id
// and a list of fields for the tests.
type stubSynonymDocument struct {
	id     string        // document identifier
	fields []index.Field // document fields, including synonym fields
}

// ID returns the document identifier.
func (s *stubSynonymDocument) ID() string {
	return s.id
}

// Size returns 0; memory accounting is unused by these tests.
func (s *stubSynonymDocument) Size() int {
	return 0
}

// VisitFields invokes visitor on every field of the document.
func (s *stubSynonymDocument) VisitFields(visitor index.FieldVisitor) {
	for _, f := range s.fields {
		visitor(f)
	}
}

// HasComposite reports whether the document has composite fields; the
// stub never does.
func (s *stubSynonymDocument) HasComposite() bool {
	return false
}

// VisitComposite is a no-op; the stub has no composite fields.
func (s *stubSynonymDocument) VisitComposite(visitor index.CompositeFieldVisitor) {
}

// NumPlainTextBytes returns 0; unused by these tests.
func (s *stubSynonymDocument) NumPlainTextBytes() uint64 {
	return 0
}

// StoredFieldsBytes returns 0; unused by these tests.
func (s *stubSynonymDocument) StoredFieldsBytes() uint64 {
	return 0
}

// AddIDField appends the synthesized _id stored field for this document.
func (s *stubSynonymDocument) AddIDField() {
	s.fields = append(s.fields, newStubFieldSplitString("_id", nil, s.id, true, false, false))
}

// VisitSynonymFields invokes visitor for each field that implements
// index.SynonymField, skipping all other field types.
func (s *stubSynonymDocument) VisitSynonymFields(visitor index.SynonymFieldVisitor) {
	for _, f := range s.fields {
		if sf, ok := f.(index.SynonymField); ok {
			visitor(sf)
		}
	}
}

// newStubSynonymDocument builds a stub synonym document holding the given
// id and a single synonym field.
func newStubSynonymDocument(id string, synonymField index.SynonymField) index.SynonymDocument {
	return &stubSynonymDocument{
		id:     id,
		fields: []index.Field{synonymField},
	}
}
4 changes: 3 additions & 1 deletion faiss_vector_posting.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ var emptyVecPostingsIterator = &VecPostingsIterator{}
var emptyVecPostingsList = &VecPostingsList{}

func (vpl *VecPostingsList) Iterator(prealloc segment.VecPostingsIterator) segment.VecPostingsIterator {

if vpl.postings == nil {
return emptyVecPostingsIterator
}
// tbd: do we check the cardinality of postings and scores?
var preallocPI *VecPostingsIterator
pi, ok := prealloc.(*VecPostingsIterator)
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ go 1.21

require (
github.com/RoaringBitmap/roaring v1.9.3
github.com/blevesearch/bleve_index_api v1.1.13
github.com/blevesearch/bleve_index_api v1.2.0
github.com/blevesearch/go-faiss v1.0.24
github.com/blevesearch/mmap-go v1.0.4
github.com/blevesearch/scorch_segment_api/v2 v2.2.16
github.com/blevesearch/vellum v1.0.11
github.com/blevesearch/scorch_segment_api/v2 v2.3.0
github.com/blevesearch/vellum v1.1.0
github.com/golang/snappy v0.0.4
github.com/spf13/cobra v1.7.0
)
Expand Down
12 changes: 6 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.1.13 h1:+nrA6oRJr85aCPyqaeZtsruObwKojutfonHJin/BP48=
github.com/blevesearch/bleve_index_api v1.1.13/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/bleve_index_api v1.2.0 h1:/DXMMWBwx/UmGKM1xDhTwDoJI5yQrG6rqRWPFcOgUVo=
github.com/blevesearch/bleve_index_api v1.2.0/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI=
github.com/blevesearch/go-faiss v1.0.24/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16/go.mod h1:VF5oHVbIFTu+znY1v30GjSpT5+9YFs9dV2hjvuh34F0=
github.com/blevesearch/vellum v1.0.11 h1:SJI97toEFTtA9WsDZxkyGTaBWFdWl1n2LEDCXLCq/AU=
github.com/blevesearch/vellum v1.0.11/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/blevesearch/scorch_segment_api/v2 v2.3.0 h1:vxCjbXAkkEBSb4AB3Iqgr/EJcPyYRsiGxpcvsS8E1Dw=
github.com/blevesearch/scorch_segment_api/v2 v2.3.0/go.mod h1:5y+TgXYSx+xJGaCwSlvy9G/UJBIY5wzvIkhvhBm2ATc=
github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
Expand Down
Loading

0 comments on commit 82553cd

Please sign in to comment.