Improve Emoticons
Change-Id: I0d72781b41381aa2c86e41287b8f824af4af95d1
Akron committed Mar 27, 2022
1 parent f94b9ce commit b98e4cf
Showing 8 changed files with 75 additions and 32 deletions.
3 changes: 3 additions & 0 deletions Changes
@@ -1,3 +1,6 @@
0.1.5 2022-03-28
  - Improve emoticon list.

0.1.4 2022-03-27
- Improved handling of ellipsis.
  - Make algorithm more robust to never fail.
28 changes: 27 additions & 1 deletion matrix_test.go
@@ -376,7 +376,7 @@ Innstetten!`
assert.Equal("Nun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])

// Check paranthesis at the end of sentences.
// Check parentheses at the end of the sentence
w.Reset()
assert.True(mat.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
sentences = strings.Split(w.String(), "\n\n")
@@ -1083,6 +1083,32 @@ func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
*/
}

func TestMatrixEmoticons(t *testing.T) {
assert := assert.New(t)

if mat == nil {
mat = LoadMatrixFile("testdata/tokenizer.matok")
}

assert.NotNil(mat)

b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
var tokens []string

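// Tokenize a whitespace-separated run of emoticons and check that
// each one comes out as a single token.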
tokens = ttokenize(mat, w, ":-* ;) :)) :*( ^___^ T__T ^^; -_-;;; -_-^")
assert.Equal(tokens[0], ":-*")
assert.Equal(tokens[1], ";)")
assert.Equal(tokens[2], ":))")
assert.Equal(tokens[3], ":*(")
assert.Equal(tokens[4], "^___^")
assert.Equal(tokens[5], "T__T")
assert.Equal(tokens[6], "^^;")
assert.Equal(tokens[7], "-_-;;;")
assert.Equal(tokens[8], "-_-^")
assert.Equal(len(tokens), 9)
}

func TestMatrixFullTokenizerXML(t *testing.T) {
assert := assert.New(t)

28 changes: 0 additions & 28 deletions src/emoji.xfst

This file was deleted.

42 changes: 42 additions & 0 deletions src/emoticons.xfst
@@ -0,0 +1,42 @@
! Partially by Park, Barash, Fink & Cha (2013)

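! Vertical (eastern-style) emoticons that are read without tilting
! the head, e.g. ^_^, T_T or ಠ_ಠ.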
define verticalemoticon [
[ "ಠ" "_" "ಠ"]|
[ "T" ["_"|"."|"-"]+ "T"] |
[ "♥" ["_"|"."|"-"]+ "♥" ] |
[ "@" ["_"|"."|"-"]* "@" ] |
[ "*" ["_"|"."|"-"]+ "*" ] |
[ "x" ["_"|"."|"-"]+ "x" ] |
[ "X" ["_"|"."|"-"]+ "X" ] |
[ "-" ["_"|"."]+ "-" ] |
[ "." ["_"]+ "." ] |
[ "^" ["_"|"."|"-"]* "^" ] |
[ ">" ["_"|"."|"-"]* "<" ] |
[ ["o"|"O"] ["_"|"."|"-"]+ ["o"|"O"] ]
];

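! The emoticon grammar proper: hearts (<3, </3), vertical emoticons
! with optional decoration, western-style faces, :shortcode: emoticons,
! and a few ambiguous cases commented inline.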
read regex [
["<" ("/") "3"+] |
verticalemoticon (";"+|"^") |
["(" verticalemoticon ")"] |

! May be end of brackets as well, like
! Author (2018):
[ [")"|"("] ["'"|"-"|"o"]* [":"|"="|"x"] ] |
! May be end of xml, like
! <b class="emp">=</b>
[ ["<"*|">"*] ["B"|"X"|"8"|":"|";"|"="|"x"] ["'"|"-"|"o"]* ["/"|"<"|"C"|"["|")"|"("|"D"|"P"|"d"|"p"|"3"|">"|"o"|"O"|"*"]] |
[ ["D"|">"] ("'") ":"] |

! May be end of square bracket
! Author [2018]:
["]" ":"] |
[(">") [";"|":"] ["-"|"*"]* [ ")" | "(" | %] | %[ ]+ ] |
[(">") [";"|":"] ["-"]* ["*"|"P"|"p"|"o"|"O"|"D"]] |
["x" "("] |
["^" (".") "^"] |
[%\ ["{" "o" "}"|"o"|"m"] "/"] |
[":" ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"|"-"]+ ":"] |
[">" "_" "<"] |
["*" "<" ":" "-" ")"]
];
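
For illustration, a minimal Go sketch of how these emoticon rules surface through the API exercised in the test above. The import path github.com/KorAP/datok and the error handling are assumptions; LoadMatrixFile and Transduce are the calls visible in the diff, and the input string is arbitrary.

package main

import (
	"bytes"
	"fmt"
	"strings"

	datok "github.com/KorAP/datok" // assumed module path
)

func main() {
	// Load the precompiled matrix tokenizer, as the tests do.
	mat := datok.LoadMatrixFile("testdata/tokenizer.matok")
	if mat == nil {
		panic("could not load matrix tokenizer")
	}

	// Transduce writes one token per line; a blank line ends a sentence.
	w := bytes.NewBuffer(make([]byte, 0, 2048))
	mat.Transduce(strings.NewReader("Schönes Wochenende :-) Bis bald ^_^"), w)

	for _, token := range strings.Split(strings.TrimSpace(w.String()), "\n") {
		fmt.Println(token)
	}
}

The newline boundaries come from the @-> ... NLout rules in tokenizer.xfst, which now route emoticons through the Emoticons definition instead of the removed Emoji one.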
6 changes: 3 additions & 3 deletions src/tokenizer.xfst
@@ -119,8 +119,8 @@ define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
! 20:00 Uhr, 00:12:25,34 Minuten
define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];

source emoji.xfst
define Emoji;
source emoticons.xfst
define Emoticons;

! acronyms: U.S.A., I.B.M., etc.
! use a post-filter to remove dots
@@ -215,7 +215,7 @@ define Token [
Email @-> ... NLout,
File @-> ... NLout,
Domain @-> ... NLout,
Emoji @-> ... NLout
Emoticons @-> ... NLout
];

echo - Introduce Sentence splitter
Binary file modified testdata/tokenizer.datok
Binary file not shown.
Binary file modified testdata/tokenizer.fst
Binary file not shown.
Binary file modified testdata/tokenizer.matok
Binary file not shown.

0 comments on commit b98e4cf
