simplify build by loading words directly from a plain text file instead of a binary (it's even faster and the plain text file isn't larger)
danielnaber · Sep 15, 2023 · 11117fe · 11117fe
1 parent ea77049
commit 11117fe
Show file tree

Hide file tree

Showing 9 changed files with 45 additions and 217 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,8 +1,10 @@
 jWordSplitter Change Log
 ========================
 
-### 2021-xx-xx (4.6-SNAPSHOT)
-* ...
+### 2023-xx-xx (4.6-SNAPSHOT)
+* extended the dictionary
+* load compound parts from a plain text file in the JAR, not a binary file
+  (it's even faster, about the same size and makes development easier)
 
 ### 2021-06-15 (4.5)
 * extended the dictionary

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@ jWordSplitter 4.6-SNAPSHOT
 ==========================
 
 Copyright 2004-2007 Sven Abels  
-Copyright 2007-2021 Daniel Naber  
+Copyright 2007-2023 Daniel Naber  
 Source code licensed under Apache License, Version 2.0 (see below)
 
 This Java library can split German compound words into smaller parts.
@@ -37,24 +37,16 @@ List<String> parts = splitter.splitWord("Versuchsreihe");
 System.out.println(parts);    // prints: [Versuchs, reihe]
 ```
 
-#### Usage from Command Line
+#### Usage from command line
 
 To split a list of words (one word per line), use this command:
 
     java -jar jwordsplitter-x.y.jar <filename>
 
-#### Data Import and Export
+#### Data location
 
-To export the German dictionary from the JAR file, use this command:
-
-    java -cp jwordsplitter-x.y.jar de.danielnaber.jwordsplitter.converter.ExportDict /de/danielnaber/jwordsplitter/wordsGerman.ser
-
-To serialize a text dictionary (one word per line) to a binary format
-so it can be used by jWordSplitter, use this command:
-
-    java -cp jwordsplitter-x.y.jar de.danielnaber.jwordsplitter.converter.SerializeDict <textDict> <output>
-
-The binary format used is simply the standard Java object serialization.
+To access the German dictionary from the JAR file, unzip the JAR. The dictionary is at
+`de/danielnaber/jwordsplitter/wordsGerman.txt`.
 
 #### Notes about the algorithm
 
@@ -78,8 +70,7 @@ The binary format used is simply the standard Java object serialization.
 
 #### Building
 
-Use `build.sh` to create the JAR. It will build the internal binary dictionary
-from the plain text files in `resources` and then run the required mvn commands.
+Use `build.sh` to create the dictionary from the text files in `resources`.
 
 #### Changelog
 
@@ -90,7 +81,7 @@ moving to git), check it out from SVN at https://sourceforge.net/p/jwordsplitter
 #### License
 
 The source code part of this project is licensed under [Apache License, Version 2.0](https://github.com/danielnaber/jwordsplitter/blob/master/LICENSE.txt).
-The integrated dictionary (`wordsGerman.ser`) is a subset of
-[Morphy](http://morphy.wolfganglezius.de/) with additions from
+The integrated dictionary (`wordsGerman.txt`) is a subset of
+[Morphy](https://danielnaber.de/morphologie/) with additions from
 [LanguageTool](https://languagetool.org) and licensed under
 [Creative Commons Attribution-Share Alike 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
diff --git a/build.sh b/build.sh
@@ -1,24 +1,21 @@
 #!/bin/sh
-# Build the binary dictionary, then build the artifact
+# Build the dictionary
 
 RESOURCES=src/main/resources/de/danielnaber/jwordsplitter
-BIN_FILE=$RESOURCES/wordsGerman.ser
-ALL_WORDS=$RESOURCES/all-words.txt
+TXT_FILE=$RESOURCES/wordsGerman.txt
 
-echo "Removing $BIN_FILE"
-rm $BIN_FILE
-echo "Removing $RESOURCES/all-words.txt"
-rm $ALL_WORDS
+echo "Removing $TXT_FILE"
+rm $TXT_FILE
 
-mvn clean package -DskipTests
-grep -v -f $RESOURCES/removals.txt $RESOURCES/languagetool-dict.txt | cat - $RESOURCES/additions.txt $RESOURCES/germanPrefixes.txt | grep -v "^#" >$ALL_WORDS
-java -cp target/jwordsplitter-*-SNAPSHOT.jar de.danielnaber.jwordsplitter.converter.SerializeDict $ALL_WORDS $BIN_FILE
+echo "# DO NOT EDIT -- File generated by build.sh" >$TXT_FILE
+echo -n "# " >>$TXT_FILE
+date >>$TXT_FILE
+grep -v -f $RESOURCES/removals.txt $RESOURCES/languagetool-dict.txt | cat - $RESOURCES/additions.txt $RESOURCES/germanPrefixes.txt | grep -v "^#" >>$TXT_FILE
 
-mvn clean package
-echo "Writing new file to: $BIN_FILE"
+#mvn clean package
+echo "Writing new file to: $TXT_FILE"
 echo -n "Result: "
-ls -l $BIN_FILE
+ls -l $TXT_FILE
 
 echo -n "Total lines: "
-wc -l $ALL_WORDS
-rm $ALL_WORDS
+wc -l $TXT_FILE
diff --git a/pom.xml b/pom.xml
@@ -118,7 +118,7 @@
         <version>2.4</version>
         <configuration>
           <excludes>
-            <!-- the data is in the binary dict, including the text would only make the JAR bigger: -->
+            <!-- the data is in the 'wordsGerman.txt' dict, including the text would only make the JAR bigger: -->
             <exclude>**/de/danielnaber/jwordsplitter/additions.txt</exclude>
             <exclude>**/de/danielnaber/jwordsplitter/removals.txt</exclude>
             <exclude>**/de/danielnaber/jwordsplitter/removals.README</exclude>

diff --git a/src/main/java/de/danielnaber/jwordsplitter/EmbeddedGermanDictionary.java b/src/main/java/de/danielnaber/jwordsplitter/EmbeddedGermanDictionary.java
@@ -15,10 +15,8 @@
  */
 package de.danielnaber.jwordsplitter;
 
-import de.danielnaber.jwordsplitter.tools.FastObjectSaver;
-
-import java.io.IOException;
-import java.util.Collections;
+import java.io.*;
+import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -28,7 +26,7 @@
  */
 public final class EmbeddedGermanDictionary {
 
-  private static final String SERIALIZED_DICT = "/de/danielnaber/jwordsplitter/wordsGerman.ser";   // dict inside the JAR
+  private static final String DICT = "/de/danielnaber/jwordsplitter/wordsGerman.txt";   // dict inside the JAR
 
   private static Set<String> words;
 
@@ -37,10 +35,22 @@ private EmbeddedGermanDictionary() {
 
   public static synchronized Set<String> getWords() {
     if (words == null) {
-      try {
-        words = Collections.unmodifiableSet((HashSet<String>)FastObjectSaver.load(SERIALIZED_DICT));
+      words = new HashSet<>();
+      //long t = System.currentTimeMillis();
+      try (InputStream is = new BufferedInputStream(EmbeddedGermanDictionary.class.getResourceAsStream(DICT));
+           InputStreamReader isr = new InputStreamReader(is, StandardCharsets.UTF_8);
+           BufferedReader br = new BufferedReader(isr)
+      ) {
+        String line;
+        while ((line = br.readLine()) != null) {
+          if (!line.startsWith("#")) {
+            words.add(line.trim().toLowerCase());
+          }
+        }
+        //long t2 = System.currentTimeMillis();
+        //System.out.println("Loading time: " + (t2-t) + "ms");
       } catch (IOException e) {
-        throw new RuntimeException("Could not load " + SERIALIZED_DICT, e);
+        throw new RuntimeException("Could not load " + DICT, e);
       }
     }
     return words;

diff --git a/src/main/java/de/danielnaber/jwordsplitter/converter/ExportDict.java b/src/main/java/de/danielnaber/jwordsplitter/converter/ExportDict.java
diff --git a/src/main/java/de/danielnaber/jwordsplitter/converter/SerializeDict.java b/src/main/java/de/danielnaber/jwordsplitter/converter/SerializeDict.java
diff --git a/src/main/java/de/danielnaber/jwordsplitter/tools/FastObjectSaver.java b/src/main/java/de/danielnaber/jwordsplitter/tools/FastObjectSaver.java
diff --git a/src/test/java/de/danielnaber/jwordsplitter/GermanInterfixDisambiguatorTest.java b/src/test/java/de/danielnaber/jwordsplitter/GermanInterfixDisambiguatorTest.java
@@ -15,7 +15,6 @@
  */
 package de.danielnaber.jwordsplitter;
 
-import de.danielnaber.jwordsplitter.tools.FastObjectSaver;
 import de.danielnaber.jwordsplitter.tools.FileTools;
 import org.junit.Test;
 
@@ -41,10 +40,9 @@ public void testSmallDict() throws IOException {
         }
     }
 
-    @SuppressWarnings("unchecked")
     @Test
-    public void testFullDict() throws IOException {
-        HashSet<String> compoundParts = (HashSet<String>) FastObjectSaver.load("/de/danielnaber/jwordsplitter/wordsGerman.ser");
+    public void testFullDict() {
+        HashSet<String> compoundParts = (HashSet<String>) EmbeddedGermanDictionary.getWords();
         GermanInterfixDisambiguator disambiguator = new GermanInterfixDisambiguator(compoundParts);
         assertSplit("Verkehr samt", "Verkehrs, amt", disambiguator);
         assertSplit("Sauerstoff flaschen störung s verhalten", "Sauerstoff, flaschen, störungs, verhalten", disambiguator);