Added orthography tokenizer for Finnish
Hugo-ter-Doest committed Mar 17, 2018
1 parent 34d72e3 commit ceeb885
Showing 4 changed files with 32 additions and 5 deletions.
README.md (4 additions, 0 deletions)
@@ -78,6 +78,10 @@ console.log(tokenizer.tokenize("flea-dog"));
tokenizer = new natural.WordPunctTokenizer();
console.log(tokenizer.tokenize("my dog hasn't any fleas."));
// [ 'my', 'dog', 'hasn', '\'', 't', 'any', 'fleas', '.' ]

tokenizer = new natural.OrthographyTokenizer({language: "fi"});
console.log(tokenizer.tokenize("Mikä sinun nimesi on?"));
// [ 'Mikä', 'sinun', 'nimesi', 'on' ]
```
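
For comparison (an editorial note, not part of the committed README): the generic WordTokenizer, whose pattern `[^A-Za-zА-Яа-я0-9_]+` appears in the regexp_tokenizer.js diff below, treats Finnish letters such as ä as token boundaries, which is the gap the new tokenizer closes. A minimal sketch, assuming the package is installed as `natural`:

```js
var natural = require('natural');

// The generic WordTokenizer splits on anything outside A-Za-z, Cyrillic,
// digits and "_", so the "ä" in "Mikä" acts as a separator.
var word = new natural.WordTokenizer();
console.log(word.tokenize("Mikä sinun nimesi on?"));
// expected, given that pattern: [ 'Mik', 'sinun', 'nimesi', 'on' ]

// The new OrthographyTokenizer keeps the Finnish letters inside the tokens.
var ortho = new natural.OrthographyTokenizer({language: "fi"});
console.log(ortho.tokenize("Mikä sinun nimesi on?"));
// [ 'Mikä', 'sinun', 'nimesi', 'on' ]
```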

## String Distance
lib/natural/index.js (1 addition, 2 deletions)
@@ -48,6 +48,7 @@ exports.AggressiveTokenizerNo = require('./tokenizers/aggressive_tokenizer_no');
exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
exports.CaseTokenizer = require('./tokenizers/tokenizer_case');
exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
exports.OrthographyTokenizer = require('./tokenizers/regexp_tokenizer').OrthographyTokenizer;
exports.WordTokenizer = require('./tokenizers/regexp_tokenizer').WordTokenizer;
exports.WordPunctTokenizer = require('./tokenizers/regexp_tokenizer').WordPunctTokenizer;
exports.TreebankWordTokenizer = require('./tokenizers/treebank_word_tokenizer');
@@ -81,5 +82,3 @@ exports.transliterate_ja = require('./transliterators/ja');
exports.BrillPOSTagger = require('./brill_pos_tagger/lib/Brill_POS_Tagger');
exports.Lexicon = require('./brill_pos_tagger/lib/Lexicon');
exports.RuleSet = require('./brill_pos_tagger/lib/RuleSet');


lib/natural/tokenizers/regexp_tokenizer.js (5 additions, 3 deletions)
@@ -32,7 +32,7 @@ var RegexpTokenizer = function(options) {

// Match and split on GAPS not the actual WORDS
this._gaps = options.gaps;

if (this._gaps === undefined) {
this._gaps = true;
}
@@ -69,6 +69,8 @@ var OrthographyTokenizer = function(options) {
}
};

util.inherits(OrthographyTokenizer, RegexpTokenizer);

exports.OrthographyTokenizer = OrthographyTokenizer;

/***
@@ -77,7 +79,7 @@ exports.OrthographyTokenizer = OrthographyTokenizer;
*
* >>> WordTokenizer().tokenize("She said 'hello'.")
* ['She', 'said', 'hello']
*
*
*/
var WordTokenizer = function(options) {
this._pattern = /[^A-Za-zА-Яа-я0-9_]+/;
@@ -93,7 +95,7 @@ exports.WordTokenizer = WordTokenizer;
*
* >>> WordPunctTokenizer().tokenize("She said 'hello'.")
* ["She","said","'","hello","'","."]
*
*
*/
var WordPunctTokenizer = function(options) {
this._pattern = new RegExp(/(\w+|[а-я0-9_]+|\.|\!|\'|\"")/i);
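
The hunk above cuts off before the language-specific pattern, so only the tail of the OrthographyTokenizer constructor and the util.inherits call are visible here. Purely as an illustration of the approach, here is a sketch of a language-keyed pattern table built on the exported RegexpTokenizer; the name finnishAwarePatterns, the exact character class, and the fallback behaviour are assumptions, not the committed code:

```js
var util = require('util');
var RegexpTokenizer = require('natural').RegexpTokenizer;

// Hypothetical table of gap patterns per language code. For Finnish the
// word-character class must also cover Å/å, Ä/ä and Ö/ö.
var finnishAwarePatterns = {
  fi: /[^A-Za-zÅåÄäÖö0-9_]+/
};

// Same constructor convention as the other tokenizers in this file:
// set this._pattern first, then delegate to RegexpTokenizer.
function OrthographyTokenizerSketch(options) {
  options = options || {};
  this._pattern = finnishAwarePatterns[options.language] || /[^A-Za-zА-Яа-я0-9_]+/;
  RegexpTokenizer.call(this, options);
}
util.inherits(OrthographyTokenizerSketch, RegexpTokenizer);

// new OrthographyTokenizerSketch({language: "fi"}).tokenize("Mikä sinun nimesi on?")
// should then yield [ 'Mikä', 'sinun', 'nimesi', 'on' ], as in the README example.
```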
spec/orthography_tokenizer_spec.js (22 additions, 0 deletions)
@@ -0,0 +1,22 @@



var OrthographyTokenizer = require('../lib/natural/tokenizers/regexp_tokenizer').OrthographyTokenizer;
console.log(OrthographyTokenizer);

var sentencesInFinnish = [
["Mikä sinun nimesi on?", [ 'Mikä', 'sinun', 'nimesi', 'on' ]],
["Hyvää kiitos, entä sinulle?", [ 'Hyvää', 'kiitos', 'entä', 'sinulle' ]],
["Tämä herrasmies maksaa kaiken", [ 'Tämä', 'herrasmies', 'maksaa', 'kaiken' ]]
];

describe("The orthography tokenizer tokenizes sentences in Finnish", function() {
var tokenizer = new OrthographyTokenizer({language: "fi"});
console.log(tokenizer);
sentencesInFinnish.forEach(function(sentencePlusResult) {
it("It should correctly tokenize the following sentence: " + sentencePlusResult[0], function() {
//console.log(tokenizer.tokenize(sentencePlusResult[0]));
expect(tokenizer.tokenize(sentencePlusResult[0])).toEqual(sentencePlusResult[1]);
});
});
});
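
A quick standalone check that mirrors the first spec case, for anyone who wants to exercise the tokenizer without the Jasmine harness (this assumes it is run from the repository root, so the relative path resolves):

```js
var OrthographyTokenizer =
  require('./lib/natural/tokenizers/regexp_tokenizer').OrthographyTokenizer;

var tokenizer = new OrthographyTokenizer({language: "fi"});
var tokens = tokenizer.tokenize("Mikä sinun nimesi on?");

// Mirrors expect(...).toEqual([ 'Mikä', 'sinun', 'nimesi', 'on' ]) above.
console.assert(
  JSON.stringify(tokens) === JSON.stringify(['Mikä', 'sinun', 'nimesi', 'on']),
  'unexpected tokens: ' + JSON.stringify(tokens)
);
console.log(tokens);
```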
