Add a WhitespaceAndQuoteTokenizerFactory as the default tokenizer, which …

…seems to fix #34 for reasons I do not understand
healthonnet · Nov 1, 2013 · 8a397ec · 8a397ec
1 parent 4658949
commit 8a397ec
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 1 deletion.
diff --git a/examples/example_config.xml b/examples/example_config.xml
@@ -17,7 +17,7 @@
            For details, read the discussion here: http://github.com/healthonnet/hon-lucene-synonyms/issues/26 
         -->
       <lst name="tokenizer">
-        <str name="class">solr.WhitespaceTokenizerFactory</str>
+        <str name="class">solr.WhitespaceAndQuoteTokenizerFactory</str>
       </lst>
       <!-- The ShingleFilterFactory outputs synonyms of multiple token lengths (e.g. unigrams, bigrams, trigrams, etc.).
            The default here is to assume you don't have any synonyms longer than 4 tokens.

diff --git a/src/main/java/org/apache/solr/search/WhitespaceAndQuoteTokenizerFactory.java b/src/main/java/org/apache/solr/search/WhitespaceAndQuoteTokenizerFactory.java
@@ -0,0 +1,52 @@
+package org.apache.solr.search;
+
+import java.io.Reader;
+import java.util.Map;
+
+import org.apache.lucene.analysis.util.CharTokenizer;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+
+/**
+ * Variant of WhitespaceTokenizerFactory whose tokenizers split on the double-quote character ('"')
+ * as well as on whitespace; reported by its author to suit most synonym-related use cases.
+ * @author nolan
+ */
+public class WhitespaceAndQuoteTokenizerFactory extends TokenizerFactory {
+
+    /** Rejects, with an IllegalArgumentException, any entry left in {@code args} after super(args). */
+    public WhitespaceAndQuoteTokenizerFactory(Map<String, String> args) {
+        super(args);
+        assureMatchVersion();
+        if (args.isEmpty()) {
+            return;
+        }
+        throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+
+    /** Builds a tokenizer over {@code input} from this factory's luceneMatchVersion and {@code factory}. */
+    @Override
+    public WhitespaceAndQuoteTokenizer create(AttributeFactory factory, Reader input) {
+        return new WhitespaceAndQuoteTokenizer(this.luceneMatchVersion, factory, input);
+    }
+
+    /** CharTokenizer that treats whitespace and the double-quote character as token delimiters. */
+    private static class WhitespaceAndQuoteTokenizer extends CharTokenizer {
+
+        public WhitespaceAndQuoteTokenizer(Version version, AttributeFactory factory, Reader reader) {
+            super(version, factory, reader);
+        }
+
+        public WhitespaceAndQuoteTokenizer(Version version, Reader reader) {
+            super(version, reader);
+        }
+
+        /** Returns false for '"' and for {@link Character#isWhitespace(int)} characters, true otherwise. */
+        @Override
+        protected boolean isTokenChar(int c) {
+            final boolean delimiter = c == '"' || Character.isWhitespace(c);
+            return !delimiter;
+        }
+    }
+}