From 46f3d1945a45aade4110ea2a395837cb558c4365 Mon Sep 17 00:00:00 2001
From: markus583 <markus.frohmann@gmail.com>
Date: Mon, 9 Sep 2024 12:37:16 +0200
Subject: [PATCH] Fix short sequences being split into individual characters

---
 setup.py                   | 2 +-
 wtpsplit/__init__.py       | 2 +-
 wtpsplit/utils/__init__.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 0f9fd5e8..269a7f0f 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="wtpsplit",
-    version="2.0.7",
+    version="2.0.8",
     packages=find_packages(),
     description="Universal Robust, Efficient and Adaptable Sentence Segmentation",
     author="Markus Frohmann, Igor Sterner, Benjamin Minixhofer",
diff --git a/wtpsplit/__init__.py b/wtpsplit/__init__.py
index b1f39e7d..a98c6dfc 100644
--- a/wtpsplit/__init__.py
+++ b/wtpsplit/__init__.py
@@ -18,7 +18,7 @@
 from wtpsplit.extract import BertCharORTWrapper, PyTorchWrapper, extract
 from wtpsplit.utils import Constants, indices_to_sentences, sigmoid, token_to_char_probs
 
-__version__ = "2.0.7"
+__version__ = "2.0.8"
 
 warnings.simplefilter("default", DeprecationWarning)  # show by default
 warnings.simplefilter("ignore", category=FutureWarning)  # for tranformers
diff --git a/wtpsplit/utils/__init__.py b/wtpsplit/utils/__init__.py
index 509e2dfc..7d690a56 100644
--- a/wtpsplit/utils/__init__.py
+++ b/wtpsplit/utils/__init__.py
@@ -449,7 +449,7 @@ def get_token_spans(tokenizer, offsets_mapping, tokens):
 
 def token_to_char_probs(text, tokens, token_logits, tokenizer, offsets_mapping):
     """Map from token probabalities to character probabilities"""
-    char_probs = np.full((len(text), token_logits.shape[1]), np.min(token_logits))  # Initialize with very low numbers
+    char_probs = np.full((len(text), token_logits.shape[1]), -np.inf)  # Initialize every entry to -inf (lowest possible value)
 
     valid_indices, valid_offsets = get_token_spans(tokenizer, offsets_mapping, tokens)