Skip to content

Commit

Permalink
support 3gram wildcard
Browse files Browse the repository at this point in the history
Signed-off-by: gesong.samuel <[email protected]>
  • Loading branch information
gesong.samuel committed Feb 14, 2025
1 parent 38e4b33 commit dc88af1
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 130 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ public WildcardFieldMapper build(BuilderContext context) {

}

public static final int NGRAM_SIZE = 3;
public static final String CONTENT_TYPE = "wildcard";
public static final TypeParser PARSER = new TypeParser((n, c) -> new WildcardFieldMapper.Builder(n, c.getIndexAnalyzers()));

Expand Down Expand Up @@ -230,97 +231,49 @@ protected void parseCreateField(ParseContext context) throws IOException {
/**
* Tokenizer to emit tokens to support wildcard first-phase matching.
* <p>
* Will emit all substrings of length 1,2, and 3, with 0-valued anchors for the prefix/suffix.
* Will emit only substrings of length 3 (NGRAM_SIZE), using 0-valued anchors to pad the prefix/suffix.
* <p>
* For example, given the string "lucene", output the following terms:
* <p>
* [0, 'l']
* [0, 0, 'l']
* [0, 'l', 'u']
* ['l']
* ['l', 'u']
* ['l', 'u', 'c']
* ['u']
* ['u','c']
* ['u','c','e']
* ['c']
* ['c', 'e']
* ['c', 'e', 'n']
* ['e']
* ['e', 'n']
* ['e', 'n', 'e']
* ['n']
* ['n', 'e']
* ['n', 'e', 0]
* ['e']
* ['e', 0]
* ['e', 0, 0]
* <p>
* Visible for testing.
*/
static final class WildcardFieldTokenizer extends Tokenizer {
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
private final char[] buffer = new char[3]; // Ring buffer for up to 3 chars
private int offset = 0; // Position in the buffer
private int length = 2; // First token is anchor + first char
private final char[] buffer = new char[NGRAM_SIZE]; // Ring buffer for up to 3 chars
private int offset = NGRAM_SIZE - 1; // next position in buffer to store next input char

@Override
public void reset() throws IOException {
super.reset();
buffer[0] = 0;
int firstChar = input.read();
if (firstChar != -1) {
buffer[1] = (char) firstChar;
int secondChar = input.read();
if (secondChar != -1) {
buffer[2] = (char) secondChar;
} else {
buffer[2] = 0;
}
} else {
buffer[1] = 0;
for (int i = 0; i < NGRAM_SIZE - 1; i++) {
buffer[i] = 0;
}

}

@Override
public boolean incrementToken() throws IOException {
charTermAttribute.setLength(length);
int numZeroes = 0;
for (int i = 0; i < length; i++) {
char curChar = buffer[(i + offset) % 3];
if (curChar == 0) {
numZeroes++;
}
charTermAttribute.buffer()[i] = buffer[(i + offset) % 3];
}
if (numZeroes == 2) {
// Two zeroes usually means we're done.
if (length == 3 && charTermAttribute.buffer()[1] != 0) {
// The only case where we're not done is if the input has exactly 1 character, so the buffer
// contains 0, char, 0. In that case, we return char now, then return char, 0 on the next iteration
charTermAttribute.buffer()[0] = charTermAttribute.buffer()[1];
charTermAttribute.buffer()[1] = 0;
charTermAttribute.setLength(1);
length = 2;
offset = 1;
return true;
}
return false;
}
if (length == 3) {
// Read the next character, overwriting the current offset
int nextChar = input.read();
if (nextChar != -1) {
buffer[offset] = (char) nextChar;
} else {
// End of input. Pad with extra 0 to trigger the logic above.
buffer[offset] = 0;
}
offset = (offset + 1) % 3;
length = 1;
} else {
length = length + 1;
charTermAttribute.setLength(NGRAM_SIZE);
int c = input.read();
c = c == -1 ? 0 : c;

buffer[offset++ % NGRAM_SIZE] = (char) c;
boolean has_next = false;
for (int i = 0; i < NGRAM_SIZE; i++) {
char curChar = buffer[(offset + i) % NGRAM_SIZE];
charTermAttribute.buffer()[i] = curChar;
has_next |= curChar != 0;
}
return true;

return has_next;
}
}

Expand Down Expand Up @@ -479,8 +432,8 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo
Query approximation;
if (requiredNGrams.isEmpty()) {
// This only happens when all characters are wildcard characters (* or ?),
// or it's the empty string.
if (value.length() == 0 || value.contains("?")) {
// or it contains only runs of non-wildcard characters shorter than NGRAM_SIZE (which is 3).
if (findNonWildcardSequence(value, 0) != value.length() || value.length() == 0 || value.contains("?")) {
approximation = this.existsQuery(context);
} else {
return existsQuery(context);
Expand All @@ -502,39 +455,48 @@ static Set<String> getRequiredNGrams(String value, boolean regexpMode) {
int pos = 0;
String rawSequence = null;
String currentSequence = null;
char[] buffer = new char[NGRAM_SIZE];
if (!value.startsWith("?") && !value.startsWith("*")) {
// Can add prefix term
rawSequence = getNonWildcardSequence(value, 0);
currentSequence = performEscape(rawSequence, regexpMode);
if (currentSequence.length() == 1) {
terms.add(new String(new char[] { 0, currentSequence.charAt(0) }));
} else {
terms.add(new String(new char[] { 0, currentSequence.charAt(0), currentSequence.charAt(1) }));

// Zero the buffer first so that buffer[0] (and any other slot left unwritten below) acts as the prefix anchor.
Arrays.fill(buffer, (char) 0);
int startIdx = Math.max(NGRAM_SIZE - currentSequence.length(), 1);
for (int j = 0; j < currentSequence.length() && j < NGRAM_SIZE - 1; j++) {
buffer[startIdx + j] = currentSequence.charAt(j);
}

terms.add(new String(buffer));
} else {
pos = findNonWildcardSequence(value, pos);
rawSequence = getNonWildcardSequence(value, pos);
}
while (pos < value.length()) {
boolean isEndOfValue = pos + rawSequence.length() == value.length();
currentSequence = performEscape(rawSequence, regexpMode);
if (!currentSequence.isEmpty() && currentSequence.length() < 3 && !isEndOfValue && pos > 0) {
// If this is a prefix or suffix of length < 3, then we already have a longer token including the anchor.
terms.add(currentSequence);
} else {
for (int i = 0; i < currentSequence.length() - 2; i++) {
terms.add(currentSequence.substring(i, i + 3));
}

for (int i = 0; i < currentSequence.length() - NGRAM_SIZE + 1; i++) {
terms.add(currentSequence.substring(i, i + 3));
}
if (isEndOfValue) {
// This is the end of the input. We can attach a suffix anchor.
if (currentSequence.length() == 1) {
terms.add(new String(new char[] { currentSequence.charAt(0), 0 }));
} else {
char a = currentSequence.charAt(currentSequence.length() - 2);
char b = currentSequence.charAt(currentSequence.length() - 1);
terms.add(new String(new char[] { a, b, 0 }));
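// Special case: the sequence starts at position 0, ends the value, and has length NGRAM_SIZE - 2,
// so also emit it anchored on both sides: [0, x..., 0] with (NGRAM_SIZE - 2) characters in between.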
Arrays.fill(buffer, (char) 0);
if (pos == 0 && currentSequence.length() == NGRAM_SIZE - 2) {
for (int i = 0; i < currentSequence.length(); i++) {
buffer[i + 1] = currentSequence.charAt(i);
}
terms.add(new String(buffer));
Arrays.fill(buffer, (char) 0);
}
int rightStartIdx = NGRAM_SIZE - currentSequence.length() - 2;
rightStartIdx = rightStartIdx < 0 ? NGRAM_SIZE - 2 : rightStartIdx;
for (int j = 0; j < currentSequence.length() && j < NGRAM_SIZE - 1; j++) {
buffer[rightStartIdx - j] = currentSequence.charAt(currentSequence.length() - j - 1);
}
terms.add(new String(buffer));
}
pos = findNonWildcardSequence(value, pos + rawSequence.length());
rawSequence = getNonWildcardSequence(value, pos);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,22 +82,11 @@ public void testTokenizer() throws IOException {
List.of(
WildcardFieldTypeTests.prefixAnchored("p"),
WildcardFieldTypeTests.prefixAnchored("pi"),
"p",
"pi",
"pic",
"i",
"ic",
"ick",
"c",
"ck",
"ckl",
"k",
"kl",
"kle",
"l",
"le",
WildcardFieldTypeTests.suffixAnchored("le"),
"e",
WildcardFieldTypeTests.suffixAnchored("e")
),
terms
Expand All @@ -111,7 +100,14 @@ public void testTokenizer() throws IOException {
terms.add(charTermAttribute.toString());
}
}
assertEquals(List.of(WildcardFieldTypeTests.prefixAnchored("a"), "a", WildcardFieldTypeTests.suffixAnchored("a")), terms);
assertEquals(
List.of(
WildcardFieldTypeTests.prefixAnchored("a"),
WildcardFieldTypeTests.suffixAnchored((char) 0 + "a"),
WildcardFieldTypeTests.suffixAnchored("a")
),
terms
);
}

public void testEnableDocValues() throws IOException {
Expand Down Expand Up @@ -188,13 +184,8 @@ public void testNormalizer() throws IOException {
List.of(
WildcardFieldTypeTests.prefixAnchored("a"),
WildcardFieldTypeTests.prefixAnchored("ab"),
"a",
"ab",
"abc",
"b",
"bc",
WildcardFieldTypeTests.suffixAnchored("bc"),
"c",
WildcardFieldTypeTests.suffixAnchored("c")
),
terms
Expand Down Expand Up @@ -242,13 +233,8 @@ public void testNullValue() throws IOException {
List.of(
WildcardFieldTypeTests.prefixAnchored("u"),
WildcardFieldTypeTests.prefixAnchored("ur"),
"u",
"ur",
"uri",
"r",
"ri",
WildcardFieldTypeTests.suffixAnchored("ri"),
"i",
WildcardFieldTypeTests.suffixAnchored("i")
),
terms
Expand Down Expand Up @@ -281,16 +267,9 @@ public void testDefaults() throws Exception {
List.of(
WildcardFieldTypeTests.prefixAnchored("1"),
WildcardFieldTypeTests.prefixAnchored("12"),
"1",
"12",
"123",
"2",
"23",
"234",
"3",
"34",
WildcardFieldTypeTests.suffixAnchored("34"),
"4",
WildcardFieldTypeTests.suffixAnchored("4")
),
terms
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,19 @@
public class WildcardFieldTypeTests extends FieldTypeTestCase {

static String prefixAnchored(String val) {
return (char) 0 + val;
String ret = (char) 0 + val;
if (ret.length() < WildcardFieldMapper.NGRAM_SIZE) {
ret = prefixAnchored(ret);
}
return ret;
}

static String suffixAnchored(String val) {
return val + (char) 0;
String ret = val + (char) 0;
if (ret.length() < WildcardFieldMapper.NGRAM_SIZE) {
ret = suffixAnchored(ret);
}
return ret;
}

public void testTermQuery() {
Expand Down Expand Up @@ -104,13 +112,14 @@ public void testEscapedWildcardQuery() {
ft.wildcardQuery("\\**\\*", null, null)
);

assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\*"), ft.wildcardQuery("\\*", null, null));

expectedTerms.remove(suffixAnchored("*"));
expectedTerms.add(prefixAnchored("*" + (char) 0));
builder = new BooleanQuery.Builder();
for (String term : expectedTerms) {
builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER);
}
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\*"), ft.wildcardQuery("\\*", null, null));
builder = new BooleanQuery.Builder();
builder.add(new TermQuery(new Term("field", prefixAnchored("*"))), BooleanClause.Occur.FILTER);
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\**"), ft.wildcardQuery("\\**", null, null));
}

Expand All @@ -119,7 +128,6 @@ public void testMultipleWildcardsInQuery() {
MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field");
Set<String> expectedTerms = new HashSet<>();
expectedTerms.add(prefixAnchored("a"));
expectedTerms.add("cd");
expectedTerms.add("efg");
expectedTerms.add(suffixAnchored("h"));
BooleanQuery.Builder builder = new BooleanQuery.Builder();
Expand Down Expand Up @@ -153,27 +161,27 @@ public void testRegexpQuery() {
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apple_foo"));
assertFalse(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apply_foo"));

pattern = "ab(zz|cd|ef.*)(hi|jk)";
pattern = "abc(zzz|def|ghi.*)(jkl|mno)";
builder = new BooleanQuery.Builder();
builder.add(new TermQuery(new Term("field", "ab")), BooleanClause.Occur.FILTER);
builder.add(new TermQuery(new Term("field", "abc")), BooleanClause.Occur.FILTER);
builder.add(
new BooleanQuery.Builder().add(new TermQuery(new Term("field", "zz")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "cd")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "ef")), BooleanClause.Occur.SHOULD)
new BooleanQuery.Builder().add(new TermQuery(new Term("field", "zzz")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "def")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "ghi")), BooleanClause.Occur.SHOULD)
.build(),
BooleanClause.Occur.FILTER
);
builder.add(
new BooleanQuery.Builder().add(new TermQuery(new Term("field", "hi")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "jk")), BooleanClause.Occur.SHOULD)
new BooleanQuery.Builder().add(new TermQuery(new Term("field", "jkl")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "mno")), BooleanClause.Occur.SHOULD)
.build(),
BooleanClause.Occur.FILTER
);
actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null);
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "/" + pattern + "/"), actual);
actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual;
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdjk"));
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abefqwertyhi"));
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdefmno"));
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcghiqwertyjkl"));
}

public void testWildcardMatchAll() {
Expand Down

0 comments on commit dc88af1

Please sign in to comment.