diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e2500fb68..dc6436ff9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,9 @@ +Version 0.2.4 (coming soon...) +------------------------------ + +* `@lukehsiao`_: Organize documentation for lf_helpers by modality. + (`#85 <https://github.com/HazyResearch/fonduer/pull/85>`_) + Version 0.2.3 ------------- diff --git a/docs/user/lf_helpers.rst b/docs/user/lf_helpers.rst index aa6e61d53..ea120e9c6 100644 --- a/docs/user/lf_helpers.rst +++ b/docs/user/lf_helpers.rst @@ -3,11 +3,39 @@ Labeling Function Helpers This page shows descriptions of the helper functions included with Fonduer_ which can be used to label candidates based on textual, structural, tabular, -and visual information. +and visual information. We group the lf_helpers based on the modality of +information that they leverage. ---- -.. automodule:: fonduer.supervision.lf_helpers +General Labeling Function Helpers +--------------------------------- + +.. automodule:: fonduer.supervision.lf_helpers.utils + :members: + +Textual Labeling Function Helpers +--------------------------------- + +.. automodule:: fonduer.supervision.lf_helpers.textual + :members: + +Structural Labeling Function Helpers +------------------------------------ + +.. automodule:: fonduer.supervision.lf_helpers.structural + :members: + +Tabular Labeling Function Helpers +--------------------------------- + +.. automodule:: fonduer.supervision.lf_helpers.tabular + :members: + +Visual Labeling Function Helpers +--------------------------------- + +.. automodule:: fonduer.supervision.lf_helpers.visual :members: .. 
_Fonduer: https://github.com/HazyResearch/fonduer diff --git a/fonduer/_version.py b/fonduer/_version.py index d31c31eae..788da1fb3 100644 --- a/fonduer/_version.py +++ b/fonduer/_version.py @@ -1 +1 @@ -__version__ = "0.2.3" +__version__ = "0.2.4" diff --git a/fonduer/supervision/lf_helpers/__init__.py b/fonduer/supervision/lf_helpers/__init__.py index 349f31178..f18651089 100644 --- a/fonduer/supervision/lf_helpers/__init__.py +++ b/fonduer/supervision/lf_helpers/__init__.py @@ -1,5 +1,3 @@ -import logging - from fonduer.supervision.lf_helpers.structural import ( common_ancestor, get_ancestor_class_names, @@ -13,90 +11,46 @@ lowest_common_ancestor_depth, ) from fonduer.supervision.lf_helpers.tabular import ( - same_document, - same_table, - same_row, - same_col, - is_tabular_aligned, - same_cell, - same_sentence, + get_aligned_ngrams, + get_cell_ngrams, + get_col_ngrams, + get_head_ngrams, get_max_col_num, get_min_col_num, - get_sentence_ngrams, - get_neighbor_sentence_ngrams, - get_cell_ngrams, get_neighbor_cell_ngrams, + get_neighbor_sentence_ngrams, get_row_ngrams, - get_col_ngrams, - get_aligned_ngrams, - get_head_ngrams, + get_sentence_ngrams, + is_tabular_aligned, + same_cell, + same_col, + same_document, + same_row, + same_sentence, + same_table, ) from fonduer.supervision.lf_helpers.textual import ( get_between_ngrams, get_left_ngrams, get_right_ngrams, ) +from fonduer.supervision.lf_helpers.utils import get_matches, is_superset, overlap from fonduer.supervision.lf_helpers.visual import ( + get_aligned_lemmas, + get_horz_ngrams, get_page, + get_page_horz_percentile, + get_page_vert_percentile, + get_vert_ngrams, + get_visual_aligned_lemmas, is_horz_aligned, is_vert_aligned, + is_vert_aligned_center, is_vert_aligned_left, is_vert_aligned_right, - is_vert_aligned_center, same_page, - get_horz_ngrams, - get_vert_ngrams, - get_page_vert_percentile, - get_page_horz_percentile, - get_visual_aligned_lemmas, - get_aligned_lemmas, ) - -def is_superset(a, b): - 
"""Check if a is a superset of b. - - This is typically used to check if ALL of a list of sentences is in the ngrams returned by an lf_helper. - - :param a: A collection of items - :param b: A collection of items - :rtype: boolean - """ - return set(a).issuperset(b) - - -def overlap(a, b): - """Check if a overlaps b. - - This is typically used to check if ANY of a list of sentences is in the ngrams returned by an lf_helper. - - :param a: A collection of items - :param b: A collection of items - :rtype: boolean - """ - return not set(a).isdisjoint(b) - - -def get_matches(lf, candidate_set, match_values=[1, -1]): - """Return a list of candidates that are matched by a particular LF. - - A simple helper function to see how many matches (non-zero by default) an LF gets. - - :param lf: The labeling function to apply to the candidate_set - :param candidate_set: The set of candidates to evaluate - :param match_values: An option list of the values to consider as matched. [1, -1] by default. - :rtype: a list of candidates - """ - logger = logging.getLogger(__name__) - matches = [] - for c in candidate_set: - label = lf(c) - if label in match_values: - matches.append(c) - logger.info(("%s matches") % len(matches)) - return matches - - __all__ = [ "common_ancestor", "get_aligned_lemmas", diff --git a/fonduer/supervision/lf_helpers/utils.py b/fonduer/supervision/lf_helpers/utils.py new file mode 100644 index 000000000..111d22eab --- /dev/null +++ b/fonduer/supervision/lf_helpers/utils.py @@ -0,0 +1,49 @@ +import logging + + +def is_superset(a, b): + """Check if a is a superset of b. + + This is typically used to check if ALL of a list of sentences is in the + ngrams returned by an lf_helper. + + :param a: A collection of items + :param b: A collection of items + :rtype: boolean + """ + return set(a).issuperset(b) + + +def overlap(a, b): + """Check if a overlaps b. + + This is typically used to check if ANY of a list of sentences is in the + ngrams returned by an lf_helper. 
+ + :param a: A collection of items + :param b: A collection of items + :rtype: boolean + """ + return not set(a).isdisjoint(b) + + +def get_matches(lf, candidate_set, match_values=[1, -1]): + """Return a list of candidates that are matched by a particular LF. + + A simple helper function to see how many matches (non-zero by default) an + LF gets. + + :param lf: The labeling function to apply to the candidate_set + :param candidate_set: The set of candidates to evaluate + :param match_values: An optional list of the values to consider as matched. + [1, -1] by default. + :rtype: a list of candidates + """ + logger = logging.getLogger(__name__) + matches = [] + for c in candidate_set: + label = lf(c) + if label in match_values: + matches.append(c) + logger.info(("%s matches") % len(matches)) + return matches