forked from huaban/jieba-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 3ec44e7
Showing
13 changed files
with
617,140 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
小清新 3 | ||
百搭 3 | ||
显瘦 3 | ||
又拍云 3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- jieba-analysis: Java port of the jieba Chinese word segmenter. -->
  <groupId>com.huaban</groupId>
  <artifactId>jieba-analysis</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>结巴分词工具(jieba for java)</name>
  <url>http://maven.apache.org</url>
  <inceptionYear>2013</inceptionYear>
  <licenses>
    <license>
      <name>The Apache Software License, Version 2.0</name>
      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
      <distribution>repo</distribution>
    </license>
  </licenses>

  <properties>
    <!-- Sources (and the bundled dictionary files) are UTF-8; required for the Chinese literals. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- Test-only dependency; the library itself has no runtime dependencies. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.8</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Compile for Java 7 source/target. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
55 changes: 55 additions & 0 deletions
55
src/main/java/com/huaban/analysis/jieba/CharacterUtil.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
package com.huaban.analysis.jieba; | ||
|
||
import java.util.regex.Pattern; | ||
|
||
/**
 * Character classification and normalization helpers for the segmenter.
 *
 * <p>All checks are simple code-point range tests; no locale-sensitive APIs are used.
 */
public class CharacterUtil {
    /** Matches ASCII tokens (decimal numbers like "3.14" or alphanumeric runs) that bypass dictionary lookup. */
    public static Pattern reSkip = Pattern.compile("(\\d+\\.\\d+|[a-zA-Z0-9]+)");
    /** Punctuation allowed to join alphanumeric tokens (e.g. "C++", "C#", ".NET", "a_b"). */
    private static final char[] connectors = new char[] {'+', '#', '&', '.', '_'};

    /** @return true if {@code ch} is a CJK Unified Ideograph in U+4E00..U+9FA5. */
    public static boolean isChineseLetter(char ch) {
        return ch >= 0x4E00 && ch <= 0x9FA5;
    }

    /** @return true if {@code ch} is an ASCII letter (A-Z or a-z). */
    public static boolean isEnglishLetter(char ch) {
        return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
    }

    /** @return true if {@code ch} is an ASCII digit (0-9). */
    public static boolean isDigit(char ch) {
        return ch >= '0' && ch <= '9';
    }

    /** @return true if {@code ch} is one of the token-connector characters. */
    public static boolean isConnector(char ch) {
        for (char connector : connectors) {
            if (ch == connector) {
                return true;
            }
        }
        return false;
    }

    /** @return true if {@code ch} may be part of a segmentable token (CJK, ASCII letter/digit, or connector). */
    public static boolean ccFind(char ch) {
        return isChineseLetter(ch) || isEnglishLetter(ch) || isDigit(ch) || isConnector(ch);
    }

    /**
     * 全角-&gt;半角,大写-&gt;小写 (full-width to half-width, upper-case ASCII to lower-case).
     *
     * <p>Note: a full-width character is only width-normalized in a single pass, so a
     * full-width upper-case letter becomes a half-width UPPER-case letter (original behavior).
     *
     * @param input character to normalize
     * @return the normalized character, or {@code input} unchanged if no rule applies
     */
    public static char regularize(char input) {
        if (input == 12288) {
            // U+3000 ideographic (full-width) space -> ASCII space.
            return 32;
        } else if (input > 65280 && input < 65375) {
            // Full-width ASCII block U+FF01..U+FF5E maps to U+0021..U+007E by a fixed offset.
            return (char) (input - 65248);
        } else if (input >= 'A' && input <= 'Z') {
            // Lower-case without mutating the parameter.
            return (char) (input + 32);
        }
        return input;
    }
}
207 changes: 207 additions & 0 deletions
207
src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
package com.huaban.analysis.jieba; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
import com.huaban.analysis.jieba.viterbi.FinalSeg; | ||
|
||
/**
 * Main segmentation entry point: builds a DAG of dictionary matches over a sentence,
 * picks the best path by accumulated frequency, and falls back to an HMM (FinalSeg)
 * for runs of characters not covered by the dictionary.
 */
public class JiebaSegmenter {
    // Shared, lazily-initialized singletons; dictionary and HMM model are loaded once per JVM.
    private static WordDictionary wordDict = WordDictionary.getInstance();
    private static FinalSeg finalSeg = FinalSeg.getInstance();

    /**
     * SEARCH emits each best-path token once; INDEX additionally emits all 2- and 3-gram
     * sub-words found in the dictionary (for search-engine style indexing).
     */
    public static enum SegMode {
        INDEX, SEARCH
    }

    /**
     * Builds the word DAG: maps each start index i to the list of end indices j such that
     * sentence[i..j] is a dictionary word.
     *
     * @param sentence text to analyze (expected to be a token run of ccFind characters)
     * @return map from start index to candidate end indices (every i maps at least to i itself)
     */
    private Map<Integer, List<Integer>> createDAG(String sentence) {
        Map<Integer, List<Integer>> dag = new HashMap<Integer, List<Integer>>();
        TrieNode trie = wordDict.getTrie();
        int N = sentence.length();
        // i = current word start, j = current position of the trie walk from i.
        int i = 0, j = 0;
        TrieNode p = trie;
        while (i < N) {
            char ch = sentence.charAt(j);
            if (p.childs.containsKey(ch)) {
                p = p.childs.get(ch);
                // A ' ' child appears to mark "a word ends at this node" — presumably inserted
                // when the dictionary is loaded; TODO confirm against WordDictionary.
                if (p.childs.containsKey(' ')) {
                    if (!dag.containsKey(i)) {
                        List<Integer> value = new ArrayList<Integer>();
                        dag.put(i, value);
                        value.add(j);
                    } else
                        dag.get(i).add(j);
                }
                j += 1;
                if (j >= N) {
                    // Ran off the end: restart the walk from the next start position.
                    i += 1;
                    j = i;
                    p = trie;
                }
            } else {
                // Mismatch: no dictionary word continues here; restart from the next start.
                p = trie;
                i += 1;
                j = i;
            }
        }
        // Guarantee every position has at least the single-character edge i -> i.
        for (i = 0; i < N; ++i) {
            if (!dag.containsKey(i)) {
                List<Integer> value = new ArrayList<Integer>();
                value.add(i);
                dag.put(i, value);
            }
        }
        return dag;
    }

    /**
     * Dynamic program over the DAG, right to left: route.get(i) holds the best end index
     * (key) and accumulated score (freq) for the suffix starting at i.
     * The score combines wordDict.getFreq of each word with the suffix score — presumably
     * log-probabilities, so addition means multiplying probabilities; TODO confirm.
     */
    private Map<Integer, Pair<Integer>> calc(String sentence, Map<Integer, List<Integer>> dag) {
        int N = sentence.length();
        HashMap<Integer, Pair<Integer>> route = new HashMap<Integer, Pair<Integer>>();
        // Base case: the empty suffix after the last character scores 0.
        route.put(N, new Pair<Integer>(0, 0.0));
        for (int i = N - 1; i > -1; i--) {
            Pair<Integer> candidate = null;
            for (Integer x : dag.get(i)) {
                double freq =
                        wordDict.getFreq(sentence.substring(i, x + 1)) + route.get(x + 1).freq;
                if (null == candidate) {
                    candidate = new Pair<Integer>(x, freq);
                } else if (candidate.freq < freq) {
                    // Found a better split: keep the higher-scoring end index.
                    candidate.freq = freq;
                    candidate.key = x;
                }
            }
            route.put(i, candidate);
        }
        return route;
    }

    /**
     * Segments a full paragraph. Characters are normalized via regularize; maximal runs of
     * "token" characters (ccFind) are buffered and segmented; every other character is
     * emitted as its own single-character token.
     *
     * @param paragraph input text
     * @param mode SEARCH for plain tokens, INDEX to also emit dictionary 2-/3-gram sub-words
     * @return tokens with [startOffset, endOffset) positions into the normalized stream
     */
    public List<SegToken> process(String paragraph, SegMode mode) {
        List<SegToken> tokens = new ArrayList<SegToken>();
        StringBuilder sb = new StringBuilder();
        // Running start offset of the next emitted token. NOTE(review): offsets are advanced
        // via "offset += token.length()" inside the SegToken constructor call — evaluation
        // order matters; do not reorder these expressions.
        int offset = 0;
        for (int i = 0; i < paragraph.length(); ++i) {
            char ch = CharacterUtil.regularize(paragraph.charAt(i));
            if (CharacterUtil.ccFind(ch))
                sb.append(ch);
            else {
                if (sb.length() > 0) {
                    // Flush the buffered token run through the segmenter.
                    if (mode == SegMode.SEARCH) {
                        for (String token : sentenceProcess(sb.toString())) {
                            tokens.add(new SegToken(token, offset, offset += token.length()));
                        }
                    } else {
                        // INDEX mode: before each token, emit its in-dictionary 2-grams and 3-grams.
                        for (String token : sentenceProcess(sb.toString())) {
                            if (token.length() > 2) {
                                String gram2 = "";
                                int j = 0;
                                for (; j < token.length() - 1; ++j) {
                                    gram2 = token.substring(j, j + 2);
                                    if (wordDict.containsFreq(gram2))
                                        tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                                }
                            }
                            if (token.length() > 3) {
                                String gram3 = "";
                                int j = 0;
                                for (; j < token.length() - 2; ++j) {
                                    gram3 = token.substring(j, j + 3);
                                    if (wordDict.containsFreq(gram3))
                                        tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                                }
                            }
                            tokens.add(new SegToken(token, offset, offset += token.length()));
                        }
                    }
                    sb = new StringBuilder();
                    offset = i;
                }
                // Non-token character becomes a one-character token of its own.
                tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
            }
        }
        // Flush any trailing buffered run (same logic as the in-loop flush above).
        if (sb.length() > 0)
            if (mode == SegMode.SEARCH) {
                for (String token : sentenceProcess(sb.toString())) {
                    tokens.add(new SegToken(token, offset, offset += token.length()));
                }
            } else {
                for (String token : sentenceProcess(sb.toString())) {
                    if (token.length() > 2) {
                        String gram2 = "";
                        int j = 0;
                        for (; j < token.length() - 1; ++j) {
                            gram2 = token.substring(j, j + 2);
                            if (wordDict.containsFreq(gram2))
                                tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                        }
                    }
                    if (token.length() > 3) {
                        String gram3 = "";
                        int j = 0;
                        for (; j < token.length() - 2; ++j) {
                            gram3 = token.substring(j, j + 3);
                            if (wordDict.containsFreq(gram3))
                                tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                        }
                    }
                    tokens.add(new SegToken(token, offset, offset += token.length()));
                }
            }

        return tokens;
    }

    /**
     * Segments one buffered sentence (token run): walks the best DAG route; consecutive
     * single characters not forming a dictionary word are collected in {@code buf} and
     * resolved either character-by-character or via the HMM (FinalSeg).
     *
     * @param sentence run of token characters
     * @return word list in original order
     */
    public List<String> sentenceProcess(String sentence) {
        List<String> tokens = new ArrayList<String>();
        int N = sentence.length();
        Map<Integer, List<Integer>> dag = createDAG(sentence);
        Map<Integer, Pair<Integer>> route = calc(sentence, dag);

        int x = 0;
        int y = 0;
        // buf accumulates consecutive 1-character pieces between multi-character words.
        String buf = "";
        while (x < N) {
            y = route.get(x).key + 1;
            String lWord = sentence.substring(x, y);
            if (y - x == 1)
                buf += lWord;
            else {
                if (buf.length() > 0) {
                    if (buf.length() == 1) {
                        tokens.add(buf);
                        buf = "";
                    } else {
                        // NOTE(review): if buf IS a dictionary word it is split into single
                        // characters, and only unknown runs go to the HMM — this mirrors the
                        // upstream Python __cut_DAG logic (FREQ check), counter-intuitive as it reads.
                        if (wordDict.containsFreq(buf)) {
                            for (int i = 0; i < buf.length(); ++i) {
                                tokens.add(buf.substring(i, i + 1));
                            }
                        } else {
                            finalSeg.cut(buf, tokens);
                        }
                        buf = "";
                    }
                }
                tokens.add(lWord);
            }
            x = y;
        }
        // Flush a trailing single-character run, same policy as above.
        if (buf.length() > 0) {
            if (buf.length() == 1) {
                tokens.add(buf);
                buf = "";
            } else {
                if (wordDict.containsFreq(buf)) {
                    for (int i = 0; i < buf.length(); ++i) {
                        tokens.add(buf.substring(i, i + 1));
                    }
                } else {
                    finalSeg.cut(buf, tokens);
                }
                buf = "";
            }

        }
        return tokens;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
package com.huaban.analysis.jieba; | ||
|
||
/**
 * Mutable (key, frequency) holder used by the segmenter's route calculation:
 * {@code key} is the chosen end index (or similar), {@code freq} its accumulated score.
 *
 * @param <K> type of the key stored alongside the score
 */
public class Pair<K> {
    public K key;
    public Double freq = 0.0;

    /**
     * @param key  value to associate with the score
     * @param freq initial score
     */
    public Pair(K key, double freq) {
        this.freq = freq;
        this.key = key;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Candidate [key=");
        text.append(key).append(", freq=").append(freq).append(']');
        return text.toString();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package com.huaban.analysis.jieba; | ||
|
||
/**
 * One segmentation result: the token text together with its
 * [startOffset, endOffset) character positions in the processed input.
 */
public class SegToken {
    /** Surface form of the token. */
    public String token;

    /** Inclusive start index in the input text. */
    public int startOffset;

    /** Exclusive end index in the input text. */
    public int endOffset;

    /**
     * @param token       surface form
     * @param startOffset inclusive start index
     * @param endOffset   exclusive end index
     */
    public SegToken(String token, int startOffset, int endOffset) {
        this.endOffset = endOffset;
        this.startOffset = startOffset;
        this.token = token;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append('[').append(token).append(", ").append(startOffset).append(", ").append(endOffset).append(']');
        return text.toString();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package com.huaban.analysis.jieba; | ||
|
||
import java.util.HashMap; | ||
|
||
/**
 * One node of the dictionary trie: the character on the edge leading to this
 * node plus the outgoing edges keyed by next character.
 */
public class TrieNode {
    /** Character stored at this node; '\u0000' for the root. */
    public char key;

    /** Child nodes, keyed by the next character. */
    public HashMap<Character, TrieNode> childs = new HashMap<Character, TrieNode>();

    /** Creates a root node whose key is '\u0000'. */
    public TrieNode() {
        this((char) 0);
    }

    /**
     * Creates a node for the given edge character.
     *
     * @param key character leading to this node
     */
    public TrieNode(char key) {
        this.key = key;
    }
}
Oops, something went wrong.