
Commit

initial commit.
piaolingxue committed Aug 5, 2013
0 parents commit 3ec44e7
Showing 13 changed files with 617,140 additions and 0 deletions.
232,030 changes: 232,030 additions & 0 deletions conf/jieba/sougou.dict

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions conf/jieba/user.dict
@@ -0,0 +1,4 @@
小清新 3
百搭 3
显瘦 3
又拍云 3
48 changes: 48 additions & 0 deletions pom.xml
@@ -0,0 +1,48 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.huaban</groupId>
    <artifactId>jieba-analysis</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>结巴分词工具(jieba for java)</name>
    <url>http://maven.apache.org</url>
    <inceptionYear>2013</inceptionYear>
    <licenses>
        <license>
            <name>The Apache Software License, Version 2.0</name>
            <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
            <distribution>repo</distribution>
        </license>
    </licenses>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.8</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
55 changes: 55 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/CharacterUtil.java
@@ -0,0 +1,55 @@
package com.huaban.analysis.jieba;

import java.util.regex.Pattern;

public class CharacterUtil {
    // Matches decimal numbers and ASCII alphanumeric runs.
    public static Pattern reSkip = Pattern.compile("(\\d+\\.\\d+|[a-zA-Z0-9]+)");
    // Characters allowed to join alphanumeric tokens, e.g. "C++", "C#", "foo_bar".
    private static final char[] connectors = new char[] {'+', '#', '&', '.', '_'};

    public static boolean isChineseLetter(char ch) {
        // CJK Unified Ideographs: U+4E00 .. U+9FA5
        if (ch >= 0x4E00 && ch <= 0x9FA5) return true;
        return false;
    }

    public static boolean isEnglishLetter(char ch) {
        // 'A'..'Z' or 'a'..'z'
        if ((ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A))
            return true;
        return false;
    }

    public static boolean isDigit(char ch) {
        // '0'..'9'
        if (ch >= 0x0030 && ch <= 0x0039) return true;
        return false;
    }

    public static boolean isConnector(char ch) {
        for (char connector : connectors)
            if (ch == connector) return true;
        return false;
    }

    /** True if the character can belong to a segment: Chinese, English letter, digit or connector. */
    public static boolean ccFind(char ch) {
        if (isChineseLetter(ch)) return true;
        if (isEnglishLetter(ch)) return true;
        if (isDigit(ch)) return true;
        if (isConnector(ch)) return true;
        return false;
    }

    /**
     * Normalize a character: full-width to half-width, upper case to lower case.
     * @param input the character to normalize
     * @return the normalized character
     */
    public static char regularize(char input) {
        if (input == 12288) {
            // full-width space (U+3000) -> ASCII space
            return 32;
        } else if (input > 65280 && input < 65375) {
            // full-width forms (U+FF01 .. U+FF5E) -> their ASCII equivalents
            return (char) (input - 65248);
        } else if (input >= 'A' && input <= 'Z') {
            // upper case -> lower case
            return (char) (input + 32);
        }
        return input;
    }

}
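
For reference, a minimal sketch (not part of this commit) of what these helpers do with a few sample characters; CharacterUtilDemo is a hypothetical class added only for illustration:

import com.huaban.analysis.jieba.CharacterUtil;

public class CharacterUtilDemo {
    public static void main(String[] args) {
        char half = CharacterUtil.regularize('\uFF21');     // full-width 'Ａ' -> half-width 'A'
        char lower = CharacterUtil.regularize(half);        // 'A' -> 'a'
        System.out.println(half + " " + lower);             // prints "A a"
        System.out.println(CharacterUtil.ccFind('中'));      // true: CJK ideograph
        System.out.println(CharacterUtil.ccFind('#'));      // true: '#' is a connector
        System.out.println(CharacterUtil.ccFind('，'));      // false: punctuation ends a segment
    }
}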
207 changes: 207 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
@@ -0,0 +1,207 @@
package com.huaban.analysis.jieba;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.huaban.analysis.jieba.viterbi.FinalSeg;

public class JiebaSegmenter {
    private static WordDictionary wordDict = WordDictionary.getInstance();
    private static FinalSeg finalSeg = FinalSeg.getInstance();

    public static enum SegMode {
        INDEX, SEARCH
    }

    /**
     * Build the word DAG for a sentence: for each start index i, the list of end
     * indices j such that sentence[i..j] is a dictionary word (a ' ' child in the
     * trie marks the end of a complete word).
     */
    private Map<Integer, List<Integer>> createDAG(String sentence) {
        Map<Integer, List<Integer>> dag = new HashMap<Integer, List<Integer>>();
        TrieNode trie = wordDict.getTrie();
        int N = sentence.length();
        int i = 0, j = 0;
        TrieNode p = trie;
        while (i < N) {
            char ch = sentence.charAt(j);
            if (p.childs.containsKey(ch)) {
                p = p.childs.get(ch);
                if (p.childs.containsKey(' ')) {
                    // sentence[i..j] is a complete dictionary word
                    if (!dag.containsKey(i)) {
                        List<Integer> value = new ArrayList<Integer>();
                        dag.put(i, value);
                        value.add(j);
                    } else
                        dag.get(i).add(j);
                }
                j += 1;
                if (j >= N) {
                    i += 1;
                    j = i;
                    p = trie;
                }
            } else {
                // no longer a prefix of any dictionary word: restart from the next position
                p = trie;
                i += 1;
                j = i;
            }
        }
        // every position can at least be segmented as a single character
        for (i = 0; i < N; ++i) {
            if (!dag.containsKey(i)) {
                List<Integer> value = new ArrayList<Integer>();
                value.add(i);
                dag.put(i, value);
            }
        }
        return dag;
    }

    /**
     * Dynamic programming over the DAG, from right to left: route[i] holds the end
     * index and accumulated frequency score of the best segmentation of sentence[i..N).
     */
    private Map<Integer, Pair<Integer>> calc(String sentence, Map<Integer, List<Integer>> dag) {
        int N = sentence.length();
        HashMap<Integer, Pair<Integer>> route = new HashMap<Integer, Pair<Integer>>();
        route.put(N, new Pair<Integer>(0, 0.0));
        for (int i = N - 1; i > -1; i--) {
            Pair<Integer> candidate = null;
            for (Integer x : dag.get(i)) {
                double freq =
                        wordDict.getFreq(sentence.substring(i, x + 1)) + route.get(x + 1).freq;
                if (null == candidate) {
                    candidate = new Pair<Integer>(x, freq);
                } else if (candidate.freq < freq) {
                    candidate.freq = freq;
                    candidate.key = x;
                }
            }
            route.put(i, candidate);
        }
        return route;
    }

    public List<SegToken> process(String paragraph, SegMode mode) {
        List<SegToken> tokens = new ArrayList<SegToken>();
        StringBuilder sb = new StringBuilder();
        int offset = 0;
        for (int i = 0; i < paragraph.length(); ++i) {
            char ch = CharacterUtil.regularize(paragraph.charAt(i));
            if (CharacterUtil.ccFind(ch))
                sb.append(ch);
            else {
                // a character that cannot belong to a word ends the current block
                if (sb.length() > 0) {
                    // process
                    if (mode == SegMode.SEARCH) {
                        // SEARCH mode: one token per segmented word
                        for (String token : sentenceProcess(sb.toString())) {
                            tokens.add(new SegToken(token, offset, offset += token.length()));
                        }
                    } else {
                        // INDEX mode: also emit 2- and 3-character sub-words found in the dictionary
                        for (String token : sentenceProcess(sb.toString())) {
                            if (token.length() > 2) {
                                String gram2 = "";
                                int j = 0;
                                for (; j < token.length() - 1; ++j) {
                                    gram2 = token.substring(j, j + 2);
                                    if (wordDict.containsFreq(gram2))
                                        tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                                }
                            }
                            if (token.length() > 3) {
                                String gram3 = "";
                                int j = 0;
                                for (; j < token.length() - 2; ++j) {
                                    gram3 = token.substring(j, j + 3);
                                    if (wordDict.containsFreq(gram3))
                                        tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                                }
                            }
                            tokens.add(new SegToken(token, offset, offset += token.length()));
                        }
                    }
                    sb = new StringBuilder();
                    offset = i;
                }
                tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
            }
        }
        // flush the trailing block, if any
        if (sb.length() > 0)
            if (mode == SegMode.SEARCH) {
                for (String token : sentenceProcess(sb.toString())) {
                    tokens.add(new SegToken(token, offset, offset += token.length()));
                }
            } else {
                for (String token : sentenceProcess(sb.toString())) {
                    if (token.length() > 2) {
                        String gram2 = "";
                        int j = 0;
                        for (; j < token.length() - 1; ++j) {
                            gram2 = token.substring(j, j + 2);
                            if (wordDict.containsFreq(gram2))
                                tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                        }
                    }
                    if (token.length() > 3) {
                        String gram3 = "";
                        int j = 0;
                        for (; j < token.length() - 2; ++j) {
                            gram3 = token.substring(j, j + 3);
                            if (wordDict.containsFreq(gram3))
                                tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                        }
                    }
                    tokens.add(new SegToken(token, offset, offset += token.length()));
                }
            }

        return tokens;
    }

    public List<String> sentenceProcess(String sentence) {
        List<String> tokens = new ArrayList<String>();
        int N = sentence.length();
        Map<Integer, List<Integer>> dag = createDAG(sentence);
        Map<Integer, Pair<Integer>> route = calc(sentence, dag);

        int x = 0;
        int y = 0;
        String buf = "";
        while (x < N) {
            y = route.get(x).key + 1;
            String lWord = sentence.substring(x, y);
            if (y - x == 1)
                // buffer single characters so consecutive unknown characters
                // can be re-segmented by the HMM below
                buf += lWord;
            else {
                if (buf.length() > 0) {
                    if (buf.length() == 1) {
                        tokens.add(buf);
                        buf = "";
                    } else {
                        if (wordDict.containsFreq(buf)) {
                            for (int i = 0; i < buf.length(); ++i) {
                                tokens.add(buf.substring(i, i + 1));
                            }
                        } else {
                            // unknown word: let the HMM (Viterbi) segmenter decide
                            finalSeg.cut(buf, tokens);
                        }
                        buf = "";
                    }
                }
                tokens.add(lWord);
            }
            x = y;
        }
        // flush any buffered characters at the end of the sentence
        if (buf.length() > 0) {
            if (buf.length() == 1) {
                tokens.add(buf);
                buf = "";
            } else {
                if (wordDict.containsFreq(buf)) {
                    for (int i = 0; i < buf.length(); ++i) {
                        tokens.add(buf.substring(i, i + 1));
                    }
                } else {
                    finalSeg.cut(buf, tokens);
                }
                buf = "";
            }
        }
        return tokens;
    }
}
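
A minimal usage sketch, not part of this commit. It assumes WordDictionary.getInstance() can find the bundled dictionaries (conf/jieba/sougou.dict and user.dict) on its default path; the loading logic lives in WordDictionary, which is among the 13 changed files but not shown in this excerpt. The pipeline is: process() normalizes characters and splits on non-word characters, then sentenceProcess() builds the DAG (createDAG), picks the best route (calc), and falls back to the HMM (FinalSeg) for runs of unknown characters.

import java.util.List;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

public class SegmenterDemo {
    public static void main(String[] args) {
        JiebaSegmenter segmenter = new JiebaSegmenter();
        String sentence = "这是一个伸手不见五指的黑夜。";
        // SEARCH mode: one SegToken per segmented word, with character offsets.
        List<SegToken> search = segmenter.process(sentence, SegMode.SEARCH);
        System.out.println(search);
        // INDEX mode: additionally emits 2- and 3-character sub-words that are
        // themselves dictionary entries, which suits building a search index.
        List<SegToken> index = segmenter.process(sentence, SegMode.INDEX);
        System.out.println(index);
    }
}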
17 changes: 17 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/Pair.java
@@ -0,0 +1,17 @@
package com.huaban.analysis.jieba;

/** A (key, frequency) pair used to record the best segmentation candidate for a position. */
public class Pair<K> {
    public K key;
    public Double freq = 0.0;

    public Pair(K key, double freq) {
        this.key = key;
        this.freq = freq;
    }

    @Override
    public String toString() {
        return "Pair [key=" + key + ", freq=" + freq + "]";
    }

}
21 changes: 21 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/SegToken.java
@@ -0,0 +1,21 @@
package com.huaban.analysis.jieba;

public class SegToken {
    public String token;

    public int startOffset;

    public int endOffset;

    public SegToken(String token, int startOffset, int endOffset) {
        this.token = token;
        this.startOffset = startOffset;
        this.endOffset = endOffset;
    }

    @Override
    public String toString() {
        return "[" + token + ", " + startOffset + ", " + endOffset + "]";
    }

}
15 changes: 15 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/TrieNode.java
@@ -0,0 +1,15 @@
package com.huaban.analysis.jieba;

import java.util.HashMap;

public class TrieNode {
    public char key = (char) 0;

    // child nodes keyed by character; a child with key ' ' marks the end of a dictionary word
    public HashMap<Character, TrieNode> childs = new HashMap<Character, TrieNode>();

    public TrieNode() {}

    public TrieNode(char key) {
        this.key = key;
    }
}
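
WordDictionary is not shown in this excerpt, so the following is only a guess at how the trie could be populated so that createDAG's childs.containsKey(' ') check finds complete words; the insert helper and the TrieInsertSketch class are hypothetical and exist only to illustrate the end-of-word convention:

import com.huaban.analysis.jieba.TrieNode;

public class TrieInsertSketch {
    static void insert(TrieNode root, String word) {
        TrieNode node = root;
        for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            TrieNode child = node.childs.get(ch);
            if (child == null) {
                child = new TrieNode(ch);
                node.childs.put(ch, child);
            }
            node = child;
        }
        // a ' ' child marks that the path from the root to this node spells a dictionary word
        node.childs.put(' ', new TrieNode(' '));
    }
}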