From 46f3d1945a45aade4110ea2a395837cb558c4365 Mon Sep 17 00:00:00 2001 From: markus583 Date: Mon, 9 Sep 2024 12:37:16 +0200 Subject: [PATCH] fix splitting of short seqs into chars --- setup.py | 2 +- wtpsplit/__init__.py | 2 +- wtpsplit/utils/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 0f9fd5e8..269a7f0f 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="wtpsplit", - version="2.0.7", + version="2.0.8", packages=find_packages(), description="Universal Robust, Efficient and Adaptable Sentence Segmentation", author="Markus Frohmann, Igor Sterner, Benjamin Minixhofer", diff --git a/wtpsplit/__init__.py b/wtpsplit/__init__.py index b1f39e7d..a98c6dfc 100644 --- a/wtpsplit/__init__.py +++ b/wtpsplit/__init__.py @@ -18,7 +18,7 @@ from wtpsplit.extract import BertCharORTWrapper, PyTorchWrapper, extract from wtpsplit.utils import Constants, indices_to_sentences, sigmoid, token_to_char_probs -__version__ = "2.0.7" +__version__ = "2.0.8" warnings.simplefilter("default", DeprecationWarning) # show by default warnings.simplefilter("ignore", category=FutureWarning) # for tranformers diff --git a/wtpsplit/utils/__init__.py b/wtpsplit/utils/__init__.py index 509e2dfc..7d690a56 100644 --- a/wtpsplit/utils/__init__.py +++ b/wtpsplit/utils/__init__.py @@ -449,7 +449,7 @@ def get_token_spans(tokenizer, offsets_mapping, tokens): def token_to_char_probs(text, tokens, token_logits, tokenizer, offsets_mapping): """Map from token probabalities to character probabilities""" - char_probs = np.full((len(text), token_logits.shape[1]), np.min(token_logits)) # Initialize with very low numbers + char_probs = np.full((len(text), token_logits.shape[1]), -np.inf) # Initialize with very low numbers valid_indices, valid_offsets = get_token_spans(tokenizer, offsets_mapping, tokens)