Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-3080: add functionality for using proxies #3082

Merged
merged 2 commits into from
Feb 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions flair/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from transformers import set_seed as hf_set_seed

# global variable: cache_root
from .file_utils import set_proxies

cache_root = Path(os.getenv("FLAIR_CACHE_ROOT", Path(Path.home(), ".flair")))

# global variable: device
Expand Down Expand Up @@ -64,4 +66,5 @@ def set_seed(seed: int):
"trainers",
"visual",
"datasets",
"set_proxies",
]
19 changes: 0 additions & 19 deletions flair/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def __init__(self, add_unk=True):
self.add_item("<unk>")

def remove_item(self, item: str):

bytes_item = item.encode("utf-8")
if bytes_item in self.item2idx:
self.idx2item.remove(bytes_item)
Expand Down Expand Up @@ -342,7 +341,6 @@ def has_metadata(self, key: str) -> bool:
return key in self._metadata

def add_label(self, typename: str, value: str, score: float = 1.0):

if typename not in self.annotation_layers:
self.annotation_layers[typename] = [Label(self, value, score)]
else:
Expand Down Expand Up @@ -382,7 +380,6 @@ def unlabeled_identifier(self):
raise NotImplementedError

def _printout_labels(self, main_label=None, add_score: bool = True):

all_labels = []
keys = [main_label] if main_label is not None else self.annotation_layers.keys()
if add_score:
Expand Down Expand Up @@ -779,7 +776,6 @@ def get_token(self, token_id: int) -> Optional[Token]:
return None

def _add_token(self, token: Union[Token, str]):

if isinstance(token, Token):
assert token.sentence is None

Expand Down Expand Up @@ -813,7 +809,6 @@ def embedding(self):
return self.get_embedding()

def to(self, device: str, pin_memory: bool = False):

# move sentence embeddings to device
super().to(device=device, pin_memory=pin_memory)

Expand All @@ -822,7 +817,6 @@ def to(self, device: str, pin_memory: bool = False):
token.to(device, pin_memory)

def clear_embeddings(self, embedding_names: List[str] = None):

super().clear_embeddings(embedding_names)

# clear token embeddings
Expand Down Expand Up @@ -862,7 +856,6 @@ def __str__(self):
return self.to_tagged_string()

def to_tagged_string(self, main_label=None) -> str:

already_printed = [self]

output = super().__str__()
Expand All @@ -886,7 +879,6 @@ def text(self):
return self.to_original_text()

def to_tokenized_string(self) -> str:

if self.tokenized is None:
self.tokenized = " ".join([t.text for t in self.tokens])

Expand Down Expand Up @@ -919,7 +911,6 @@ def infer_space_after(self):
last_token.whitespace_after = 0

if last_token is not None:

if token.text in [".", ":", ",", ";", ")", "n't", "!", "?"]:
last_token.whitespace_after = 0

Expand Down Expand Up @@ -1091,7 +1082,6 @@ def set_context_for_sentences(cls, sentences: List["Sentence"]) -> None:
previous_sentence = sentence

def get_labels(self, label_type: str = None):

# if no label is specified, return all labels
if label_type is None:
return sorted(self.labels)
Expand All @@ -1104,7 +1094,6 @@ def get_labels(self, label_type: str = None):
return []

def remove_labels(self, typename: str):

# labels also need to be deleted at all tokens
for token in self:
token.remove_labels(typename)
Expand Down Expand Up @@ -1248,7 +1237,6 @@ def downsample(
downsample_dev=True,
downsample_test=True,
):

if downsample_train and self._train is not None:
self._train = self._downsample_to_proportion(self._train, percentage)

Expand Down Expand Up @@ -1282,7 +1270,6 @@ def filter_long_sentences(self, max_charlength: int):

@staticmethod
def _filter_long_sentences(dataset, max_charlength: int) -> Dataset:

# find out empty sentence indices
empty_sentence_indices = []
non_empty_sentence_indices = []
Expand All @@ -1300,7 +1287,6 @@ def _filter_long_sentences(dataset, max_charlength: int) -> Dataset:

@staticmethod
def _filter_empty_sentences(dataset) -> Dataset:

# find out empty sentence indices
empty_sentence_indices = []
non_empty_sentence_indices = []
Expand Down Expand Up @@ -1353,7 +1339,6 @@ def _get_all_tokens(self) -> List[str]:

@staticmethod
def _downsample_to_proportion(dataset: Dataset, proportion: float):

sampled_size: int = round(_len_dataset(dataset) * proportion)
splits = randomly_split_into_two_datasets(dataset, sampled_size)
return splits[0]
Expand Down Expand Up @@ -1455,7 +1440,6 @@ def make_label_dictionary(self, label_type: str, min_count: int = -1, add_unk: b
label_value_counter: typing.Counter[str] = Counter()
all_sentence_labels: List[str] = []
for sentence in Tqdm.tqdm(_iter_dataset(data)):

# count all label types per sentence
sentence_label_type_counter.update(sentence.annotation_layers.keys())

Expand Down Expand Up @@ -1522,7 +1506,6 @@ def get_all_sentences(self) -> ConcatDataset:

@deprecated(version="0.8", reason="Use 'make_label_dictionary' instead.")
def make_tag_dictionary(self, tag_type: str) -> Dictionary:

# Make the tag dictionary
tag_dictionary: Dictionary = Dictionary(add_unk=False)
tag_dictionary.add_item("O")
Expand All @@ -1542,7 +1525,6 @@ def __init__(
name: str = "multicorpus",
**corpusargs,
):

self.corpora: List[Corpus] = corpora

ids = task_ids if task_ids else [f"Task_{i}" for i in range(len(corpora))]
Expand Down Expand Up @@ -1706,7 +1688,6 @@ def get_spans_from_bio(bioes_tags: List[str], bioes_scores=None) -> List[typing.
current_span: List[int] = []
current_span_scores: List[float] = []
for idx, bioes_tag in enumerate(bioes_tags):

# non-set tags are OUT tags
if bioes_tag == "" or bioes_tag == "O" or bioes_tag == "_":
bioes_tag = "O-"
Expand Down
1 change: 0 additions & 1 deletion flair/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def __init__(
timeout=0,
worker_init_fn=None,
):

# in certain cases, multi-CPU data loading makes no sense and slows
# everything down. For this reason, we detect if a dataset is in-memory:
# if so, num_workers is set to 0 for faster processing
Expand Down
2 changes: 0 additions & 2 deletions flair/datasets/biomedical.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,6 @@ def bioc_to_internal(bioc_file: Path):
document_text += " " + text

for annotation in passage.xpath(".//annotation"):

entity_types = [
i.text.replace(" ", "_")
for i in annotation.xpath("./infon")
Expand Down Expand Up @@ -2642,7 +2641,6 @@ def parse_dataset(cls, corpus_folder: Path, fix_annotation=True):
file for file in os.listdir(str(corpus_folder)) if file.endswith(".txt") and not file.startswith("README")
]
for text_file in input_files:

with open(os.path.join(str(corpus_folder), text_file), encoding="utf8") as text_reader:
document_text = text_reader.read()
if not document_text:
Expand Down
Loading