Skip to content

Commit

Permalink
feat(crafters): update SlidingWindowSegmenter for nlp
Browse files: browse the repository at this point in the history
  • Loading branch information
fhaase2 committed Jun 3, 2020
1 parent d8b7f20 commit 2fa5f45
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 13 deletions.
14 changes: 5 additions & 9 deletions jina/executors/crafters/nlp/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,10 @@ def __init__(self,
'the step_size (={}) should not be larger than the window_size (={})'.format(
self.window_size, self.step_size))

def craft(self, buffer: bytes, doc_id: int, *args, **kwargs) -> List[Dict]:
def craft(self, text: str, doc_id: int, *args, **kwargs) -> List[Dict]:
"""
Split the text into overlapping chunks
:param buffer: the raw text in the `bytes` format
:param text: the raw text in string format
:param doc_id: the doc id
:return: a list of chunk dicts
"""
Expand All @@ -155,17 +155,13 @@ def sliding_window(iterable, size, step):
d.extend(next(i, None)
for _ in range(step-1))

text = buffer.decode('utf-8')
chunks = ["".join(filter(None, list(chunk))) for chunk in sliding_window(
text, self.window_size, self.step_size)]

chunks = [''.join(filter(None, list(chunk))) for chunk in
sliding_window(text, self.window_size, self.step_size)]
results = []
for idx, s in enumerate(chunks):
if self.min_substring_len <= len(s):
results.append(dict(
doc_id=doc_id,
text=s,
offset=idx,
weight=1.0,
length=len(chunks)))
weight=1.0))
return results
7 changes: 3 additions & 4 deletions tests/executors/crafters/nlp/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,9 @@ def test_sliding_window_segmenter(self):
step_size = 10
sliding_window_segmenter = SlidingWindowSegmenter(
window_size=window_size, step_size=step_size)
buffer = b'It is a sunny day!!!! When Andy comes back, we are going to the zoo.'
crafted_chunk_list = sliding_window_segmenter.craft(buffer, 0)
print(len(buffer) // step_size)
self.assertEqual(len(crafted_chunk_list), len(buffer) // step_size)
text = 'It is a sunny day!!!! When Andy comes back, we are going to the zoo.'
crafted_chunk_list = sliding_window_segmenter.craft(text, 0)
self.assertEqual(len(crafted_chunk_list), len(text) // step_size)


if __name__ == '__main__':
Expand Down

0 comments on commit 2fa5f45

Please sign in to comment.