Skip to content

Commit

Permalink
feat(crafters): add SlidingWindowSegmenter for nlp
Browse files Browse the repository at this point in the history
  • Loading branch information
Frederic Haase authored and Frederic Haase committed Jun 1, 2020
1 parent a89564c commit c6ef697
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 4 deletions.
6 changes: 4 additions & 2 deletions docs/chapters/all_exec.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ This version of Jina includes 80 Executors.
- `BaseCrafter`
- `BaseSegmenter`
- `JiebaSegmenter`
- `SlidingWindowSegmenter`
- `Sentencizer`
- `ImageReader`
- `BaseChunkCrafter`
Expand Down Expand Up @@ -153,7 +154,7 @@ This version of Jina includes 80 Executors.
| `ImageResizer` | `jina.executors.crafters.image` |
| `ImageTorchEncoder` | `jina.executors.encoders.frameworks` |
| `IncrementalPCAEncoder` | `jina.executors.encoders` |
| `JiebaSegmenter` | `jina.executors.crafters` |
| `JiebaSegmenter` | `jina.executors.crafters.nlp` |
| `KerasImageEncoder` | `jina.executors.encoders.frameworks` |
| `LeveldbIndexer` | `jina.executors.indexers.keyvalue.leveldb` |
| `MaxRanker` | `jina.executors.rankers.bi_match` |
Expand All @@ -164,8 +165,9 @@ This version of Jina includes 80 Executors.
| `OnnxImageEncoder` | `jina.executors.encoders.frameworks` |
| `PipelineEncoder` | `jina.executors.encoders` |
| `RandomImageCropper` | `jina.executors.crafters.image` |
| `Sentencizer` | `jina.executors.crafters` |
| `Sentencizer` | `jina.executors.crafters.nlp` |
| `SlidingWindowImageCropper` | `jina.executors.crafters.image` |
| `SlidingWindowSegmenter` | `jina.executors.crafters.nlp` |
| `SptagIndexer` | `jina.executors.indexers.vector.nmslib` |
| `TextPaddlehubEncoder` | `jina.executors.encoders.frameworks` |
| `TfIdfRanker` | `jina.executors.rankers.bi_match` |
Expand Down
74 changes: 73 additions & 1 deletion jina/executors/crafters/nlp/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
__license__ = "Apache-2.0"

import re
from typing import List, Dict
from collections import deque
from itertools import islice
from typing import Dict, List

from .. import BaseSegmenter

Expand Down Expand Up @@ -97,3 +99,73 @@ def craft(self, buffer: bytes, doc_id: int, *args, **kwargs) -> List[Dict]:
dict(text=word, offset=idx, weight=1.0))

return chunks


class SlidingWindowSegmenter(BaseSegmenter):
    """
    :class:`SlidingWindowSegmenter` splits the text on the doc-level into overlapping substrings on the chunk-level.

    The text is split into substrings of length ``window_size`` if possible; the last substring
    may be shorter when the text ends before the window is full.
    The degree of overlapping can be configured through the ``step_size`` parameter.
    The substrings that are shorter than the ``min_substring_len`` will be discarded.
    """

    def __init__(self,
                 window_size: int = 300,
                 step_size: int = 150,
                 min_substring_len: int = 1,
                 *args, **kwargs):
        """
        :param window_size: the maximal number of characters in each substring
        :param step_size: the number of characters between the starts of two consecutive substrings
        :param min_substring_len: substrings shorter than this value are dropped from the result

        Questionable settings are only reported as warnings; no exception is raised.
        """
        super().__init__(*args, **kwargs)
        self.window_size = window_size
        self.step_size = step_size
        self.min_substring_len = min_substring_len
        if self.min_substring_len > self.window_size:
            self.logger.warning(
                'the min_substring_len (={}) should be smaller to the window_size (={})'.format(
                    self.min_substring_len, self.window_size))
        if self.window_size <= 0:
            self.logger.warning(
                'the window_size (={}) should be larger than zero'.format(
                    self.window_size))
        if self.step_size > self.window_size:
            # the arguments follow the order of the placeholders: step_size first, window_size second
            self.logger.warning(
                'the step_size (={}) should not be larger than the window_size (={})'.format(
                    self.step_size, self.window_size))

    def craft(self, buffer: bytes, doc_id: int, *args, **kwargs) -> List[Dict]:
        """
        Split the text into overlapping chunks

        :param buffer: the raw text in the `bytes` format, decoded as UTF-8
        :param doc_id: the doc id
        :return: a list of chunk dicts, each with the keys ``doc_id``, ``text``, ``offset``
            (the index of the window), ``weight`` and ``length`` (the number of windows produced
            before filtering by ``min_substring_len``). An empty buffer or a non-positive
            ``window_size`` gives an empty list.
        """
        text = buffer.decode('utf-8')
        chunks = list(self._sliding_window(text, self.window_size, self.step_size))

        # ``offset`` stays the index of the window, so dropping short windows leaves gaps in it
        return [
            dict(doc_id=doc_id,
                 text=s,
                 offset=idx,
                 weight=1.0,
                 length=len(chunks))
            for idx, s in enumerate(chunks)
            if self.min_substring_len <= len(s)
        ]

    @staticmethod
    def _sliding_window(text: str, size: int, step: int):
        """
        Yield the substrings of ``text`` covered by a window of ``size`` characters moved by ``step``.

        :param text: the text to split
        :param size: the window size in characters; nothing is yielded if it is not positive
        :param step: the distance between two window starts; values below one are treated as one
        :return: a generator of substrings; the last one ends at the end of ``text``
        """
        if size <= 0:
            return
        # a non-positive step would never advance the window
        step = max(step, 1)
        for start in range(0, len(text), step):
            yield text[start:start + size]
            if start + size >= len(text):
                # this window already reaches the end of the text
                return
12 changes: 11 additions & 1 deletion tests/executors/crafters/nlp/split.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest

from jina.executors.crafters.nlp.split import Sentencizer, JiebaSegmenter
from jina.executors.crafters.nlp.split import Sentencizer, JiebaSegmenter, SlidingWindowSegmenter
from tests import JinaTestCase


Expand Down Expand Up @@ -55,6 +55,16 @@ def test_jieba_crafter(self):
crafted_chunk_list = jieba_crafter.craft(buffer, 0)
self.assertEqual(len(crafted_chunk_list), 14)

def test_sliding_window_segmenter(self):
    """Check the number, the order and the content of the chunks of a short ASCII text."""
    window_size = 20
    step_size = 10
    sliding_window_segmenter = SlidingWindowSegmenter(
        window_size=window_size, step_size=step_size)
    buffer = b'It is a sunny day!!!! When Andy comes back, we are going to the zoo.'
    text = buffer.decode('utf-8')
    crafted_chunk_list = sliding_window_segmenter.craft(buffer, 0)
    # 68 characters: the windows start at 0, 10, ..., 50 and the last one reaches the end of the text
    self.assertEqual(len(crafted_chunk_list), 6)
    for idx, chunk in enumerate(crafted_chunk_list):
        start = idx * step_size
        self.assertEqual(chunk['offset'], idx)
        self.assertEqual(chunk['text'], text[start:start + window_size])


# Run the test cases of this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit c6ef697

Please sign in to comment.