From 2fa5f4587135286a24af193ea7091a4e4ed07cc1 Mon Sep 17 00:00:00 2001 From: fhaase2 <44052928+fhaase2@users.noreply.github.com> Date: Wed, 3 Jun 2020 16:52:02 +0200 Subject: [PATCH] feat(crafters): update SlidingWindowSegmenter for nlp --- jina/executors/crafters/nlp/split.py | 14 +++++--------- tests/executors/crafters/nlp/split.py | 7 +++---- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/jina/executors/crafters/nlp/split.py b/jina/executors/crafters/nlp/split.py index 2782cf5666e8b..b285c3c75933a 100644 --- a/jina/executors/crafters/nlp/split.py +++ b/jina/executors/crafters/nlp/split.py @@ -131,10 +131,10 @@ def __init__(self, 'the step_size (={}) should not be larger than the window_size (={})'.format( self.window_size, self.step_size)) - def craft(self, buffer: bytes, doc_id: int, *args, **kwargs) -> List[Dict]: + def craft(self, text: str, doc_id: int, *args, **kwargs) -> List[Dict]: """ Split the text into overlapping chunks - :param buffer: the raw text in the `bytes` format + :param text: the raw text in string format :param doc_id: the doc id :return: a list of chunk dicts """ @@ -155,17 +155,13 @@ def sliding_window(iterable, size, step): d.extend(next(i, None) for _ in range(step-1)) - text = buffer.decode('utf-8') - chunks = ["".join(filter(None, list(chunk))) for chunk in sliding_window( - text, self.window_size, self.step_size)] - + chunks = [''.join(filter(None, list(chunk))) for chunk in + sliding_window(text, self.window_size, self.step_size)] results = [] for idx, s in enumerate(chunks): if self.min_substring_len <= len(s): results.append(dict( - doc_id=doc_id, text=s, offset=idx, - weight=1.0, - length=len(chunks))) + weight=1.0)) return results diff --git a/tests/executors/crafters/nlp/split.py b/tests/executors/crafters/nlp/split.py index 260f83ae8ed64..ecbb269b628b4 100644 --- a/tests/executors/crafters/nlp/split.py +++ b/tests/executors/crafters/nlp/split.py @@ -60,10 +60,9 @@ def test_sliding_window_segmenter(self): step_size = 10 sliding_window_segmenter = SlidingWindowSegmenter( window_size=window_size, step_size=step_size) - buffer = b'It is a sunny day!!!! When Andy comes back, we are going to the zoo.' - crafted_chunk_list = sliding_window_segmenter.craft(buffer, 0) - print(len(buffer) // step_size) - self.assertEqual(len(crafted_chunk_list), len(buffer) // step_size) + text = 'It is a sunny day!!!! When Andy comes back, we are going to the zoo.' + crafted_chunk_list = sliding_window_segmenter.craft(text, 0) + self.assertEqual(len(crafted_chunk_list), len(text) // step_size) if __name__ == '__main__':