Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

增加了词性 #4

Merged
merged 2 commits into from
Mar 26, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@
<version>4.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.3.1</version>
</dependency>
</dependencies>

<build>
Expand Down Expand Up @@ -77,7 +82,11 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.7</version>
<version>2.9.1</version>
<configuration>
<encoding>UTF-8</encoding>
<docencoding>UTF-8</docencoding>
</configuration>
<executions>
<execution>
<id>attach-javadocs</id>
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/com/huaban/analysis/jieba/CharacterUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ public static boolean ccFind(char ch) {
}

/**
* 全角->半角,大写->小写
* @param input
* @return
* 全角 to 半角,大写 to 小写
* @param input 输入字符
* @return 转换后的字符
*/
public static char regularize(char input){
if (input == 12288) {
Expand Down
65 changes: 32 additions & 33 deletions src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package com.huaban.analysis.jieba;

import com.huaban.analysis.jieba.viterbi.FinalSeg;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.huaban.analysis.jieba.viterbi.FinalSeg;

public class JiebaSegmenter {
private static WordDictionary wordDict = WordDictionary.getInstance();
private static FinalSeg finalSeg = FinalSeg.getInstance();
Expand Down Expand Up @@ -88,26 +88,26 @@ public List<SegToken> process(String paragraph, SegMode mode) {
if (sb.length() > 0) {
// process
if (mode == SegMode.SEARCH) {
for (String token : sentenceProcess(sb.toString())) {
tokens.add(new SegToken(token, offset, offset += token.length()));
for (Word word : sentenceProcess(sb.toString())) {
tokens.add(new SegToken(word, offset, offset += word.length()));
}
} else {
for (String token : sentenceProcess(sb.toString())) {
for (Word token : sentenceProcess(sb.toString())) {
if (token.length() > 2) {
String gram2 = "";
Word gram2;
int j = 0;
for (; j < token.length() - 1; ++j) {
gram2 = token.substring(j, j + 2);
if (wordDict.containsFreq(gram2))
gram2 = token.subSequence(j, j + 2);
if (wordDict.containsWord(gram2.getToken()))
tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
}
}
if (token.length() > 3) {
String gram3 = "";
Word gram3;
int j = 0;
for (; j < token.length() - 2; ++j) {
gram3 = token.substring(j, j + 3);
if (wordDict.containsFreq(gram3))
gram3 = token.subSequence(j, j + 3);
if (wordDict.containsWord(gram3.getToken()))
tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
}
}
Expand All @@ -117,31 +117,34 @@ public List<SegToken> process(String paragraph, SegMode mode) {
sb = new StringBuilder();
offset = i;
}
tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
if (wordDict.containsWord(paragraph.substring(i, i + 1)))
tokens.add(new SegToken(wordDict.getWord(paragraph.substring(i, i + 1)), offset, ++offset));
else
tokens.add(new SegToken(Word.createWord(paragraph.substring(i, i + 1)), offset, ++offset));
}
}
if (sb.length() > 0)
if (mode == SegMode.SEARCH) {
for (String token : sentenceProcess(sb.toString())) {
for (Word token : sentenceProcess(sb.toString())) {
tokens.add(new SegToken(token, offset, offset += token.length()));
}
} else {
for (String token : sentenceProcess(sb.toString())) {
for (Word token : sentenceProcess(sb.toString())) {
if (token.length() > 2) {
String gram2 = "";
Word gram2;
int j = 0;
for (; j < token.length() - 1; ++j) {
gram2 = token.substring(j, j + 2);
if (wordDict.containsFreq(gram2))
gram2 = token.subSequence(j, j + 2);
if (wordDict.containsWord(gram2.getToken()))
tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
}
}
if (token.length() > 3) {
String gram3 = "";
Word gram3;
int j = 0;
for (; j < token.length() - 2; ++j) {
gram3 = token.substring(j, j + 3);
if (wordDict.containsFreq(gram3))
gram3 = token.subSequence(j, j + 3);
if (wordDict.containsWord(gram3.getToken()))
tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
}
}
Expand All @@ -152,8 +155,8 @@ public List<SegToken> process(String paragraph, SegMode mode) {
return tokens;
}

public List<String> sentenceProcess(String sentence) {
List<String> tokens = new ArrayList<String>();
public List<Word> sentenceProcess(String sentence) {
List<Word> tokens = new ArrayList<Word>();
int N = sentence.length();
Map<Integer, List<Integer>> dag = createDAG(sentence);
Map<Integer, Pair<Integer>> route = calc(sentence, dag);
Expand All @@ -169,32 +172,28 @@ public List<String> sentenceProcess(String sentence) {
else {
if (buf.length() > 0) {
if (buf.length() == 1) {
tokens.add(buf);
tokens.add(Word.createWord(buf));
buf = "";
} else {
if (wordDict.containsFreq(buf)) {
for (int i = 0; i < buf.length(); ++i) {
tokens.add(buf.substring(i, i + 1));
}
if (wordDict.containsWord(buf)) {
tokens.add(wordDict.getWord(buf));
} else {
finalSeg.cut(buf, tokens);
}
buf = "";
}
}
tokens.add(lWord);
tokens.add(Word.createWord(lWord));
}
x = y;
}
if (buf.length() > 0) {
if (buf.length() == 1) {
tokens.add(buf);
tokens.add(Word.createWord(buf));
buf = "";
} else {
if (wordDict.containsFreq(buf)) {
for (int i = 0; i < buf.length(); ++i) {
tokens.add(buf.substring(i, i + 1));
}
if (wordDict.containsWord(buf)) {
tokens.add(wordDict.getWord(buf));
} else {
finalSeg.cut(buf, tokens);
}
Expand Down
18 changes: 12 additions & 6 deletions src/main/java/com/huaban/analysis/jieba/SegToken.java
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
package com.huaban.analysis.jieba;

import org.apache.commons.lang3.StringUtils;

public class SegToken {
public String token;
public Word word;

public int startOffset;

public int endOffset;

public SegToken(String token, int startOffset, int endOffset) {
this.token = token;
this.startOffset = startOffset;
this.endOffset = endOffset;

public SegToken(Word word, int startOffset, int endOffset) {
this.word = word;
this.startOffset = startOffset;
this.endOffset = endOffset;
}

@Override
public String toString() {
return "[" + token + ", " + startOffset + ", " + endOffset + "]";
if (StringUtils.isBlank(this.word.getTokenType()))
return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + "]";
else
return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + ", " + this.word.getTokenType() + "]";
}

}
122 changes: 122 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/Word.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package com.huaban.analysis.jieba;

/**
 * A token together with its frequency and a type tag (presumably the part-of-speech
 * tag introduced by this change; confirm against the dictionary format).
 *
 * <p>Instances are obtained through the {@code createWord} factories. Whenever the
 * dictionary already contains the requested token, the factories return the
 * dictionary's entry instead of building a new object.
 *
 * <p>The class implements {@link CharSequence} by delegating to the token text.
 *
 * <p>Created by linkerlin on 3/21/14.
 */
public class Word implements CharSequence {
    private String token;
    private Double freq;
    private String tokenType;

    // NOTE(review): resolved once, when this class is initialised. If WordDictionary
    // itself calls Word.createWord while it is still being constructed, this field
    // may not be set yet at that point; verify the initialisation order.
    private static final WordDictionary wordDict = WordDictionary.getInstance();

    private Word(String token, Double freq, String tokenType) {
        this.token = token;
        this.freq = freq;
        this.tokenType = tokenType;
    }

    /**
     * Returns the dictionary entry for {@code token} when one exists, otherwise a new
     * word built from the given values.
     *
     * @param token     the token text
     * @param freq      frequency used only when the token is not in the dictionary
     * @param tokenType type tag used only when the token is not in the dictionary
     * @return the dictionary's word for {@code token}, or a newly created one
     */
    public static Word createWord(String token, Double freq, String tokenType) {
        if (wordDict.containsWord(token)) {
            // Known tokens always resolve to the dictionary entry; freq and
            // tokenType passed by the caller are deliberately ignored here.
            return wordDict.getWord(token);
        }
        return new Word(token, freq, tokenType);
    }

    /**
     * Same as {@link #createWord(String, Double, String)} with an empty type tag.
     *
     * @param token the token text
     * @param freq  frequency used only when the token is not in the dictionary
     * @return the dictionary's word for {@code token}, or a newly created one
     */
    public static Word createWord(String token, Double freq) {
        return createWord(token, freq, "");
    }

    /**
     * Same as {@link #createWord(String, Double, String)} with a frequency of
     * {@code 0.0} and an empty type tag.
     *
     * @param token the token text
     * @return the dictionary's word for {@code token}, or a newly created one
     */
    public static Word createWord(String token) {
        return createWord(token, 0.0, "");
    }

    /**
     * @return the token text
     */
    public String getToken() {
        return token;
    }

    /**
     * Replaces the token text.
     *
     * <p>NOTE(review): the factories may hand out the dictionary's own entry, so a
     * change made here is presumably visible to every holder of that entry; confirm
     * before mutating.
     *
     * @param token the new token text
     */
    public void setToken(String token) {
        this.token = token;
    }

    /**
     * @return the frequency stored for this word
     */
    public Double getFreq() {
        return freq;
    }

    /**
     * @param freq the new frequency
     */
    public void setFreq(Double freq) {
        this.freq = freq;
    }

    /**
     * @return the type tag; the empty string when none was supplied at creation
     */
    public String getTokenType() {
        return tokenType;
    }

    /**
     * @param tokenType the new type tag
     */
    public void setTokenType(String tokenType) {
        this.tokenType = tokenType;
    }

    /**
     * Returns the length of the token text in 16-bit {@code char}s.
     *
     * @return the number of {@code char}s in the token
     */
    @Override
    public int length() {
        return token.length();
    }

    /**
     * Returns the {@code char} at the given position of the token text.
     *
     * @param index zero-based index of the {@code char} to return
     * @return the {@code char} at {@code index}
     * @throws IndexOutOfBoundsException if {@code index} is negative or not less than
     *                                   {@code length()}
     */
    @Override
    public char charAt(int index) {
        return token.charAt(index);
    }

    /**
     * Returns the word for the part of the token text between {@code start}
     * (inclusive) and {@code end} (exclusive).
     *
     * <p>The result comes from {@link #createWord(String, Double, String)}: if the
     * dictionary contains the sub-token, its entry is returned; otherwise a new word
     * is created that carries this word's frequency and type tag.
     *
     * @param start the start index, inclusive
     * @param end   the end index, exclusive
     * @return the word for the requested sub-token
     * @throws IndexOutOfBoundsException if {@code start} or {@code end} is negative,
     *                                   if {@code end} is greater than {@code length()},
     *                                   or if {@code start} is greater than {@code end}
     */
    @Override
    public Word subSequence(int start, int end) {
        String part = token.subSequence(start, end).toString();
        return createWord(part, freq, tokenType);
    }

    /**
     * Returns the token text, as the {@link CharSequence} contract requires.
     *
     * <p>Without this override the inherited {@code Object.toString()} would yield an
     * identity string instead of the characters of the sequence.
     *
     * @return the token text
     */
    @Override
    public String toString() {
        return token;
    }
}
Loading