fixed string index out of range in embeddings.py #1135

Merged 4 commits on Oct 4, 2019
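For readers without a PR description: the crash fixed here occurs when a token's text is empty, because the pooled embedding update indexes the first character unconditionally. A minimal sketch of the failure mode and of the guard this PR adds, using a hypothetical Token stand-in rather than flair's own class:

    # Hypothetical stand-in for a flair Token whose text ended up empty.
    class Token:
        def __init__(self, text: str):
            self.text = text

    token = Token("")
    only_capitalized = False

    # Before this PR: indexing the first character of an empty string raises
    # IndexError: string index out of range.
    try:
        if token.text[0].isupper() or not only_capitalized:
            pass
    except IndexError as err:
        print(err)  # string index out of range

    # After this PR: the token text is checked before it is indexed,
    # so an empty token is simply skipped.
    if token.text:
        if token.text[0].isupper() or not only_capitalized:
            print("token is processed")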
12 changes: 12 additions & 0 deletions flair/data.py
@@ -1,5 +1,6 @@
 from abc import abstractmethod
 from typing import List, Dict, Union, Callable
+import re
 
 import torch, flair
 import logging
@@ -519,6 +520,7 @@ def __init__(
 
         # if text is passed, instantiate sentence with tokens (words)
         if text is not None:
+            text = self._restore_windows_1252_characters(text)
             [self.add_token(token) for token in tokenizer(text)]
 
         # log a warning if the dataset is empty
@@ -849,6 +851,16 @@ def get_language_code(self) -> str:
 
         return self.language_code
 
+    def _restore_windows_1252_characters(self, text: str) -> str:
+        def to_windows_1252(match):
+            try:
+                return bytes([ord(match.group(0))]).decode("windows-1252")
+            except UnicodeDecodeError:
+                # No character at the corresponding code point: remove it
+                return ""
+
+        return re.sub(r"[\u0080-\u0099]", to_windows_1252, text)
+
 
 class Image(DataPoint):
     def __init__(self, data=None, imageURL=None):
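A standalone sketch of what the new helper does, outside the Sentence class (the function name and sample strings below are illustrative, not taken from the PR): code points U+0080 through U+0099, which Unicode treats as C1 control characters, are reinterpreted as single Windows-1252 bytes, so mis-decoded characters such as curly quotes are recovered, while code points that Windows-1252 leaves undefined are dropped.

    import re

    def restore_windows_1252_characters(text: str) -> str:
        # Reinterpret each code point in U+0080..U+0099 as a Windows-1252 byte.
        def to_windows_1252(match):
            try:
                return bytes([ord(match.group(0))]).decode("windows-1252")
            except UnicodeDecodeError:
                # No character at the corresponding code point: remove it.
                return ""

        return re.sub(r"[\u0080-\u0099]", to_windows_1252, text)

    # U+0093 and U+0094 are C1 controls in Unicode but curly quotes in Windows-1252.
    print(restore_windows_1252_characters("\u0093quoted\u0094"))   # “quoted”
    # U+0081 has no Windows-1252 character, so it is silently removed.
    print(repr(restore_windows_1252_characters("a\u0081b")))       # 'ab'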
2 changes: 1 addition & 1 deletion flair/datasets.py
@@ -1620,7 +1620,7 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False):
                             if f"{dataset}/{label}" in m.name
                         ],
                     )
-                    with open(f"{data_path}/{dataset}.txt", "at") as f_p:
+                    with open(f"{data_path}/{dataset}.txt", "at", encoding="utf-8") as f_p:
                         current_path = data_path / "original" / dataset / label
                         for file_name in current_path.iterdir():
                             if file_name.is_file():
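A short aside on the datasets.py change (the file name below is illustrative): without an explicit encoding argument, open() falls back to locale.getpreferredencoding(), which on many Windows installations is cp1252 and cannot encode all of the text being appended; pinning encoding="utf-8" makes the write behave the same on every platform.

    import locale

    # The implicit default encoding varies by machine.
    print(locale.getpreferredencoding(False))  # e.g. "cp1252" on many Windows setups

    text = "dataset line with a character outside cp1252: \u4e2d"

    # With an explicit UTF-8 encoding the append succeeds everywhere; relying on
    # a cp1252 locale default would raise UnicodeEncodeError for the line above.
    with open("dataset_example.txt", "at", encoding="utf-8") as f_p:
        f_p.write(text + "\n")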
26 changes: 14 additions & 12 deletions flair/embeddings.py
@@ -1904,19 +1904,21 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
                 local_embedding = token._embeddings[self.context_embeddings.name]
                 local_embedding = local_embedding.to(flair.device)
 
-                if token.text[0].isupper() or not self.only_capitalized:
+                # check token.text is empty or not
+                if token.text:
+                    if token.text[0].isupper() or not self.only_capitalized:
 
-                    if token.text not in self.word_embeddings:
-                        self.word_embeddings[token.text] = local_embedding
-                        self.word_count[token.text] = 1
-                    else:
-                        aggregated_embedding = self.aggregate_op(
-                            self.word_embeddings[token.text], local_embedding
-                        )
-                        if self.pooling == "fade":
-                            aggregated_embedding /= 2
-                        self.word_embeddings[token.text] = aggregated_embedding
-                        self.word_count[token.text] += 1
+                        if token.text not in self.word_embeddings:
+                            self.word_embeddings[token.text] = local_embedding
+                            self.word_count[token.text] = 1
+                        else:
+                            aggregated_embedding = self.aggregate_op(
+                                self.word_embeddings[token.text], local_embedding
+                            )
+                            if self.pooling == "fade":
+                                aggregated_embedding /= 2
+                            self.word_embeddings[token.text] = aggregated_embedding
+                            self.word_count[token.text] += 1
 
         # add embeddings after updating
         for sentence in sentences:
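A condensed, self-contained sketch of the guarded update logic in embeddings.py, with plain floats and dictionaries standing in for flair's tensors and configurable aggregation op (all names below are illustrative): the only behavioural change in the PR is that a token with empty text is now skipped before its first character is inspected.

    from typing import Dict

    def update_pooled_embedding(
        word_embeddings: Dict[str, float],
        word_count: Dict[str, int],
        text: str,
        local_embedding: float,
        only_capitalized: bool = False,
        pooling: str = "fade",
    ) -> None:
        # Skip empty token text; previously text[0] raised IndexError here.
        if not text:
            return
        if text[0].isupper() or not only_capitalized:
            if text not in word_embeddings:
                # First occurrence: store the embedding as-is.
                word_embeddings[text] = local_embedding
                word_count[text] = 1
            else:
                # Later occurrences: aggregate with the running value
                # (addition stands in for flair's aggregate_op).
                aggregated = word_embeddings[text] + local_embedding
                if pooling == "fade":
                    aggregated /= 2
                word_embeddings[text] = aggregated
                word_count[text] += 1

    embeddings: Dict[str, float] = {}
    counts: Dict[str, int] = {}
    update_pooled_embedding(embeddings, counts, "Berlin", 1.0)
    update_pooled_embedding(embeddings, counts, "", 0.5)        # skipped, no crash
    update_pooled_embedding(embeddings, counts, "Berlin", 3.0)
    print(embeddings, counts)  # {'Berlin': 2.0} {'Berlin': 2}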