forked from huaban/jieba-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 3ec44e7
Showing
13 changed files
with
617,140 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
小清新 3 | ||
百搭 3 | ||
显瘦 3 | ||
又拍云 3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- jieba-analysis: Java port of the jieba Chinese word segmenter. -->
  <groupId>com.huaban</groupId>
  <artifactId>jieba-analysis</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>结巴分词工具(jieba for java)</name>
  <url>http://maven.apache.org</url>
  <inceptionYear>2013</inceptionYear>
  <licenses>
    <license>
      <name>The Apache Software License, Version 2.0</name>
      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
      <distribution>repo</distribution>
    </license>
  </licenses>

  <properties>
    <!-- Sources (and the bundled dictionary files) are UTF-8; required for the Chinese literals. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- Test-only dependency; the library itself has no runtime dependencies. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.8</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Compile for Java 7 source/target. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
55 changes: 55 additions & 0 deletions
55
src/main/java/com/huaban/analysis/jieba/CharacterUtil.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
package com.huaban.analysis.jieba; | ||
|
||
import java.util.regex.Pattern; | ||
|
||
/**
 * Character classification and normalization helpers for the segmenter.
 *
 * <p>All checks are simple code-point range tests; no locale-sensitive APIs are used.
 */
public class CharacterUtil {
    /** Matches ASCII tokens (decimal numbers like "3.14" or alphanumeric runs) that bypass dictionary lookup. */
    public static Pattern reSkip = Pattern.compile("(\\d+\\.\\d+|[a-zA-Z0-9]+)");
    /** Punctuation allowed to join alphanumeric tokens (e.g. "C++", "C#", ".NET", "a_b"). */
    private static final char[] connectors = new char[] {'+', '#', '&', '.', '_'};

    /** @return true if {@code ch} is a CJK Unified Ideograph in U+4E00..U+9FA5. */
    public static boolean isChineseLetter(char ch) {
        return ch >= 0x4E00 && ch <= 0x9FA5;
    }

    /** @return true if {@code ch} is an ASCII letter (A-Z or a-z). */
    public static boolean isEnglishLetter(char ch) {
        return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
    }

    /** @return true if {@code ch} is an ASCII digit (0-9). */
    public static boolean isDigit(char ch) {
        return ch >= '0' && ch <= '9';
    }

    /** @return true if {@code ch} is one of the token-connector characters. */
    public static boolean isConnector(char ch) {
        for (char connector : connectors) {
            if (ch == connector) {
                return true;
            }
        }
        return false;
    }

    /** @return true if {@code ch} may be part of a segmentable token (CJK, ASCII letter/digit, or connector). */
    public static boolean ccFind(char ch) {
        return isChineseLetter(ch) || isEnglishLetter(ch) || isDigit(ch) || isConnector(ch);
    }

    /**
     * 全角-&gt;半角,大写-&gt;小写 (full-width to half-width, upper-case ASCII to lower-case).
     *
     * <p>Note: a full-width character is only width-normalized in a single pass, so a
     * full-width upper-case letter becomes a half-width UPPER-case letter (original behavior).
     *
     * @param input character to normalize
     * @return the normalized character, or {@code input} unchanged if no rule applies
     */
    public static char regularize(char input) {
        if (input == 12288) {
            // U+3000 ideographic (full-width) space -> ASCII space.
            return 32;
        } else if (input > 65280 && input < 65375) {
            // Full-width ASCII block U+FF01..U+FF5E maps to U+0021..U+007E by a fixed offset.
            return (char) (input - 65248);
        } else if (input >= 'A' && input <= 'Z') {
            // Lower-case without mutating the parameter.
            return (char) (input + 32);
        }
        return input;
    }
}
207 changes: 207 additions & 0 deletions
207
src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
package com.huaban.analysis.jieba; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
import com.huaban.analysis.jieba.viterbi.FinalSeg; | ||
|
||
/**
 * Main segmentation entry point: builds a DAG of dictionary matches over a sentence,
 * picks the best path by accumulated frequency, and falls back to an HMM (FinalSeg)
 * for runs of characters not covered by the dictionary.
 */
public class JiebaSegmenter {
    // Shared, lazily-initialized singletons; dictionary and HMM model are loaded once per JVM.
    private static WordDictionary wordDict = WordDictionary.getInstance();
    private static FinalSeg finalSeg = FinalSeg.getInstance();

    /**
     * SEARCH emits each best-path token once; INDEX additionally emits all 2- and 3-gram
     * sub-words found in the dictionary (for search-engine style indexing).
     */
    public static enum SegMode {
        INDEX, SEARCH
    }

    /**
     * Builds the word DAG: maps each start index i to the list of end indices j such that
     * sentence[i..j] is a dictionary word.
     *
     * @param sentence text to analyze (expected to be a token run of ccFind characters)
     * @return map from start index to candidate end indices (every i maps at least to i itself)
     */
    private Map<Integer, List<Integer>> createDAG(String sentence) {
        Map<Integer, List<Integer>> dag = new HashMap<Integer, List<Integer>>();
        TrieNode trie = wordDict.getTrie();
        int N = sentence.length();
        // i = current word start, j = current position of the trie walk from i.
        int i = 0, j = 0;
        TrieNode p = trie;
        while (i < N) {
            char ch = sentence.charAt(j);
            if (p.childs.containsKey(ch)) {
                p = p.childs.get(ch);
                // A ' ' child appears to mark "a word ends at this node" — presumably inserted
                // when the dictionary is loaded; TODO confirm against WordDictionary.
                if (p.childs.containsKey(' ')) {
                    if (!dag.containsKey(i)) {
                        List<Integer> value = new ArrayList<Integer>();
                        dag.put(i, value);
                        value.add(j);
                    } else
                        dag.get(i).add(j);
                }
                j += 1;
                if (j >= N) {
                    // Ran off the end: restart the walk from the next start position.
                    i += 1;
                    j = i;
                    p = trie;
                }
            } else {
                // Mismatch: no dictionary word continues here; restart from the next start.
                p = trie;
                i += 1;
                j = i;
            }
        }
        // Guarantee every position has at least the single-character edge i -> i.
        for (i = 0; i < N; ++i) {
            if (!dag.containsKey(i)) {
                List<Integer> value = new ArrayList<Integer>();
                value.add(i);
                dag.put(i, value);
            }
        }
        return dag;
    }

    /**
     * Dynamic program over the DAG, right to left: route.get(i) holds the best end index
     * (key) and accumulated score (freq) for the suffix starting at i.
     * The score combines wordDict.getFreq of each word with the suffix score — presumably
     * log-probabilities, so addition means multiplying probabilities; TODO confirm.
     */
    private Map<Integer, Pair<Integer>> calc(String sentence, Map<Integer, List<Integer>> dag) {
        int N = sentence.length();
        HashMap<Integer, Pair<Integer>> route = new HashMap<Integer, Pair<Integer>>();
        // Base case: the empty suffix after the last character scores 0.
        route.put(N, new Pair<Integer>(0, 0.0));
        for (int i = N - 1; i > -1; i--) {
            Pair<Integer> candidate = null;
            for (Integer x : dag.get(i)) {
                double freq =
                        wordDict.getFreq(sentence.substring(i, x + 1)) + route.get(x + 1).freq;
                if (null == candidate) {
                    candidate = new Pair<Integer>(x, freq);
                } else if (candidate.freq < freq) {
                    // Found a better split: keep the higher-scoring end index.
                    candidate.freq = freq;
                    candidate.key = x;
                }
            }
            route.put(i, candidate);
        }
        return route;
    }

    /**
     * Segments a full paragraph. Characters are normalized via regularize; maximal runs of
     * "token" characters (ccFind) are buffered and segmented; every other character is
     * emitted as its own single-character token.
     *
     * @param paragraph input text
     * @param mode SEARCH for plain tokens, INDEX to also emit dictionary 2-/3-gram sub-words
     * @return tokens with [startOffset, endOffset) positions into the normalized stream
     */
    public List<SegToken> process(String paragraph, SegMode mode) {
        List<SegToken> tokens = new ArrayList<SegToken>();
        StringBuilder sb = new StringBuilder();
        // Running start offset of the next emitted token. NOTE(review): offsets are advanced
        // via "offset += token.length()" inside the SegToken constructor call — evaluation
        // order matters; do not reorder these expressions.
        int offset = 0;
        for (int i = 0; i < paragraph.length(); ++i) {
            char ch = CharacterUtil.regularize(paragraph.charAt(i));
            if (CharacterUtil.ccFind(ch))
                sb.append(ch);
            else {
                if (sb.length() > 0) {
                    // Flush the buffered token run through the segmenter.
                    if (mode == SegMode.SEARCH) {
                        for (String token : sentenceProcess(sb.toString())) {
                            tokens.add(new SegToken(token, offset, offset += token.length()));
                        }
                    } else {
                        // INDEX mode: before each token, emit its in-dictionary 2-grams and 3-grams.
                        for (String token : sentenceProcess(sb.toString())) {
                            if (token.length() > 2) {
                                String gram2 = "";
                                int j = 0;
                                for (; j < token.length() - 1; ++j) {
                                    gram2 = token.substring(j, j + 2);
                                    if (wordDict.containsFreq(gram2))
                                        tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                                }
                            }
                            if (token.length() > 3) {
                                String gram3 = "";
                                int j = 0;
                                for (; j < token.length() - 2; ++j) {
                                    gram3 = token.substring(j, j + 3);
                                    if (wordDict.containsFreq(gram3))
                                        tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                                }
                            }
                            tokens.add(new SegToken(token, offset, offset += token.length()));
                        }
                    }
                    sb = new StringBuilder();
                    offset = i;
                }
                // Non-token character becomes a one-character token of its own.
                tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
            }
        }
        // Flush any trailing buffered run (same logic as the in-loop flush above).
        if (sb.length() > 0)
            if (mode == SegMode.SEARCH) {
                for (String token : sentenceProcess(sb.toString())) {
                    tokens.add(new SegToken(token, offset, offset += token.length()));
                }
            } else {
                for (String token : sentenceProcess(sb.toString())) {
                    if (token.length() > 2) {
                        String gram2 = "";
                        int j = 0;
                        for (; j < token.length() - 1; ++j) {
                            gram2 = token.substring(j, j + 2);
                            if (wordDict.containsFreq(gram2))
                                tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                        }
                    }
                    if (token.length() > 3) {
                        String gram3 = "";
                        int j = 0;
                        for (; j < token.length() - 2; ++j) {
                            gram3 = token.substring(j, j + 3);
                            if (wordDict.containsFreq(gram3))
                                tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                        }
                    }
                    tokens.add(new SegToken(token, offset, offset += token.length()));
                }
            }

        return tokens;
    }

    /**
     * Segments one buffered sentence (token run): walks the best DAG route; consecutive
     * single characters not forming a dictionary word are collected in {@code buf} and
     * resolved either character-by-character or via the HMM (FinalSeg).
     *
     * @param sentence run of token characters
     * @return word list in original order
     */
    public List<String> sentenceProcess(String sentence) {
        List<String> tokens = new ArrayList<String>();
        int N = sentence.length();
        Map<Integer, List<Integer>> dag = createDAG(sentence);
        Map<Integer, Pair<Integer>> route = calc(sentence, dag);

        int x = 0;
        int y = 0;
        // buf accumulates consecutive 1-character pieces between multi-character words.
        String buf = "";
        while (x < N) {
            y = route.get(x).key + 1;
            String lWord = sentence.substring(x, y);
            if (y - x == 1)
                buf += lWord;
            else {
                if (buf.length() > 0) {
                    if (buf.length() == 1) {
                        tokens.add(buf);
                        buf = "";
                    } else {
                        // NOTE(review): if buf IS a dictionary word it is split into single
                        // characters, and only unknown runs go to the HMM — this mirrors the
                        // upstream Python __cut_DAG logic (FREQ check), counter-intuitive as it reads.
                        if (wordDict.containsFreq(buf)) {
                            for (int i = 0; i < buf.length(); ++i) {
                                tokens.add(buf.substring(i, i + 1));
                            }
                        } else {
                            finalSeg.cut(buf, tokens);
                        }
                        buf = "";
                    }
                }
                tokens.add(lWord);
            }
            x = y;
        }
        // Flush a trailing single-character run, same policy as above.
        if (buf.length() > 0) {
            if (buf.length() == 1) {
                tokens.add(buf);
                buf = "";
            } else {
                if (wordDict.containsFreq(buf)) {
                    for (int i = 0; i < buf.length(); ++i) {
                        tokens.add(buf.substring(i, i + 1));
                    }
                } else {
                    finalSeg.cut(buf, tokens);
                }
                buf = "";
            }

        }
        return tokens;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
package com.huaban.analysis.jieba; | ||
|
||
/**
 * Mutable (key, frequency) holder used by the segmenter's route calculation:
 * {@code key} is the chosen end index (or similar), {@code freq} its accumulated score.
 *
 * @param <K> type of the key stored alongside the score
 */
public class Pair<K> {
    public K key;
    public Double freq = 0.0;

    /**
     * @param key  value to associate with the score
     * @param freq initial score
     */
    public Pair(K key, double freq) {
        this.freq = freq;
        this.key = key;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Candidate [key=");
        text.append(key).append(", freq=").append(freq).append(']');
        return text.toString();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package com.huaban.analysis.jieba; | ||
|
||
/**
 * One segmentation result: the token text together with its
 * [startOffset, endOffset) character positions in the processed input.
 */
public class SegToken {
    /** Surface form of the token. */
    public String token;

    /** Inclusive start index in the input text. */
    public int startOffset;

    /** Exclusive end index in the input text. */
    public int endOffset;

    /**
     * @param token       surface form
     * @param startOffset inclusive start index
     * @param endOffset   exclusive end index
     */
    public SegToken(String token, int startOffset, int endOffset) {
        this.endOffset = endOffset;
        this.startOffset = startOffset;
        this.token = token;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append('[').append(token).append(", ").append(startOffset).append(", ").append(endOffset).append(']');
        return text.toString();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package com.huaban.analysis.jieba; | ||
|
||
import java.util.HashMap; | ||
|
||
/**
 * One node of the dictionary trie: the character on the edge leading to this
 * node plus the outgoing edges keyed by next character.
 */
public class TrieNode {
    /** Character stored at this node; '\u0000' for the root. */
    public char key;

    /** Child nodes, keyed by the next character. */
    public HashMap<Character, TrieNode> childs = new HashMap<Character, TrieNode>();

    /** Creates a root node whose key is '\u0000'. */
    public TrieNode() {
        this((char) 0);
    }

    /**
     * Creates a node for the given edge character.
     *
     * @param key character leading to this node
     */
    public TrieNode(char key) {
        this.key = key;
    }
}
Oops, something went wrong.