From 34d72e3acf4f61f88264068ba78b28a006d2b7af Mon Sep 17 00:00:00 2001 From: David Rosson Date: Sat, 31 Dec 2016 20:23:51 -0800 Subject: [PATCH] Add tokenizer with language-specific RegExp matchers --- .../tokenizers/orthography_matchers.js | 33 +++++++++++++++++++ lib/natural/tokenizers/regexp_tokenizer.js | 18 ++++++++++ 2 files changed, 51 insertions(+) create mode 100644 lib/natural/tokenizers/orthography_matchers.js diff --git a/lib/natural/tokenizers/orthography_matchers.js b/lib/natural/tokenizers/orthography_matchers.js new file mode 100644 index 000000000..50ae062d2 --- /dev/null +++ b/lib/natural/tokenizers/orthography_matchers.js @@ -0,0 +1,33 @@ +/* +Copyright (c) 2011, Chris Umbel + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +/*** + * RegExp definitions for tokenizing text in a specific language based + * on its alphabet. Each language is keyed by the two-letter code per + * ISO 639-1, and defines a RegExp that excludes alphabetic characters. 
+ */
+var matchers = {
+  fi: /[^A-Za-zÅåÄäÖö]/
+};
+
+module.exports = matchers;
diff --git a/lib/natural/tokenizers/regexp_tokenizer.js b/lib/natural/tokenizers/regexp_tokenizer.js
index ae9ddac2a..12109d379 100644
--- a/lib/natural/tokenizers/regexp_tokenizer.js
+++ b/lib/natural/tokenizers/regexp_tokenizer.js
@@ -53,6 +53,36 @@ RegexpTokenizer.prototype.tokenize = function(s) {
 
 exports.RegexpTokenizer = RegexpTokenizer;
 
+var orthographyMatchers = require('./orthography_matchers');
+
+/***
+ * A tokenizer configured from a per-language alphabet matcher. When a matcher
+ * is registered for options.language it becomes this._pattern; otherwise
+ * (unknown or missing language) construction is delegated to WordTokenizer.
+ * @param {Object} [options] Passed through to the underlying constructor.
+ * @param {string} [options.language] ISO 639-1 for the language, e.g. 'en'
+ */
+var OrthographyTokenizer = function(options) {
+  var language = options ? options.language : undefined;
+  // Own-property check so that names inherited from Object.prototype
+  // (e.g. 'constructor') are never mistaken for a registered matcher.
+  var hasMatcher = Object.prototype.hasOwnProperty.call(orthographyMatchers, language);
+
+  if (!hasMatcher) {
+    WordTokenizer.call(this, options);
+  } else {
+    this._pattern = orthographyMatchers[language];
+    RegexpTokenizer.call(this, options);
+  }
+};
+
+// Link the prototype chain; without it instances have no tokenize() method.
+OrthographyTokenizer.prototype = Object.create(RegexpTokenizer.prototype, {
+  constructor: { value: OrthographyTokenizer, writable: true, configurable: true }
+});
+
+exports.OrthographyTokenizer = OrthographyTokenizer;
+
 /***
  * A tokenizer that divides a text into sequences of alphabetic and
  * non-alphabetic characters. E.g.: