Skip to content

Commit

Permalink
add a WhitespaceAndQuoteTokenizerFactory as the default tokenizer, which …
Browse files Browse the repository at this point in the history
…seems to fix #34 for reasons I do not understand
  • Loading branch information
nolanlawson committed Nov 1, 2013
1 parent 4658949 commit 8a397ec
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 1 deletion.
2 changes: 1 addition & 1 deletion examples/example_config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
For details, read the discussion here: http://github.com/healthonnet/hon-lucene-synonyms/issues/26
-->
<lst name="tokenizer">
<str name="class">solr.WhitespaceTokenizerFactory</str>
<str name="class">solr.WhitespaceAndQuoteTokenizerFactory</str>
</lst>
<!-- The ShingleFilterFactory outputs synonyms of multiple token lengths (e.g. unigrams, bigrams, trigrams, etc.).
The default here is to assume you don't have any synonyms longer than 4 tokens.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package org.apache.solr.search;

import java.io.Reader;
import java.util.Map;

import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

/**
 * Tokenizer factory modelled on the WhitespaceTokenizerFactory. The tokenizer it produces splits
 * on the double-quote character ({@code '"'}) in addition to whitespace. According to the original
 * author, this works well for most of the project's synonym-related use cases.
 *
 * @author nolan
 */
public class WhitespaceAndQuoteTokenizerFactory extends TokenizerFactory {

    /**
     * Builds the factory from its configuration arguments.
     *
     * @param args configuration arguments, handed to the superclass constructor
     * @throws IllegalArgumentException if {@code args} is not empty once the superclass
     *         constructor and {@code assureMatchVersion()} have run
     */
    public WhitespaceAndQuoteTokenizerFactory(Map<String, String> args) {
        super(args);
        assureMatchVersion();
        final boolean hasLeftoverArgs = !args.isEmpty();
        if (hasLeftoverArgs) {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }
    }

    /**
     * Creates a tokenizer over {@code input} using this factory's {@code luceneMatchVersion}.
     *
     * @param factory attribute factory passed through to the tokenizer
     * @param input   reader supplying the text to tokenize
     * @return a new tokenizer that breaks tokens at whitespace and double quotes
     */
    @Override
    public WhitespaceAndQuoteTokenizer create(AttributeFactory factory, Reader input) {
        return new WhitespaceAndQuoteTokenizer(luceneMatchVersion, factory, input);
    }

    /**
     * Character tokenizer whose tokens consist of every character that is neither whitespace
     * nor a double quote.
     */
    private static class WhitespaceAndQuoteTokenizer extends CharTokenizer {

        /** Character that acts as a token separator in addition to whitespace. */
        private static final char QUOTE = '"';

        public WhitespaceAndQuoteTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
            super(matchVersion, factory, in);
        }

        public WhitespaceAndQuoteTokenizer(Version matchVersion, Reader in) {
            super(matchVersion, in);
        }

        /**
         * Decides whether a code point belongs inside a token.
         *
         * @param c the code point to test
         * @return {@code false} for a double quote or for anything
         *         {@link Character#isWhitespace(int)} accepts, {@code true} otherwise. Treating
         *         the quote as a separator is what sets this tokenizer apart from a plain
         *         whitespace tokenizer.
         */
        @Override
        protected boolean isTokenChar(int c) {
            if (c == QUOTE) {
                return false;
            }
            return !Character.isWhitespace(c);
        }
    }
}

0 comments on commit 8a397ec

Please sign in to comment.