diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index c4587f14f9..d67d27a16a 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -88,6 +88,7 @@ Modules: summarization/commons summarization/graph summarization/keywords + summarization/mz_entropy summarization/pagerank_weighted summarization/summariser summarization/syntactic_unit
diff --git a/docs/src/summarization/graph.rst b/docs/src/summarization/graph.rst index 909b15cf5e..eb3588077d 100644 --- a/docs/src/summarization/graph.rst +++ b/docs/src/summarization/graph.rst @@ -1,8 +1,8 @@ -:mod:`summarization.graph` -- TextRank graph -========================================================= +:mod:`summarization.graph` -- Graph +=================================== .. automodule:: gensim.summarization.graph - :synopsis: TextRank graph + :synopsis: Graph :members: :inherited-members: :undoc-members:
diff --git a/docs/src/summarization/mz_entropy.rst b/docs/src/summarization/mz_entropy.rst new file mode 100644 index 0000000000..31222ca6ab --- /dev/null +++ b/docs/src/summarization/mz_entropy.rst @@ -0,0 +1,9 @@ +:mod:`summarization.mz_entropy` -- Keywords for the Montemurro and Zanette entropy algorithm +============================================================================================ + +.. automodule:: gensim.summarization.mz_entropy + :synopsis: Keywords for the Montemurro and Zanette entropy algorithm + :members: + :inherited-members: + :undoc-members: + :show-inheritance:
diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index f0cf22a6e8..cc15b4665b 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -363,7 +363,7 @@ def preprocess_documents(docs): Returns ------- - list of (list of str) + list of list of str Processed documents split by whitespace. Examples
diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index bd4d70911a..ec484949cf 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -3,20 +3,75 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module contains a function to compute rank scores for documents in +a corpus and the helper class `BM25` used in these calculations. The original algorithm +is described in [1]_; see also the Wikipedia page [2]_. + + +.. [1] Robertson, Stephen; Zaragoza, Hugo (2009). The Probabilistic Relevance Framework: BM25 and Beyond, + http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf +.. [2] Okapi BM25 on Wikipedia, https://en.wikipedia.org/wiki/Okapi_BM25 + + + +Examples +-------- +>>> from gensim.summarization.bm25 import get_bm25_weights +>>> corpus = [ +... ["black", "cat", "white", "cat"], +... ["cat", "outer", "space"], +... ["wag", "dog"] +... ] +>>> result = get_bm25_weights(corpus) + + +Data: +----- +.. data:: PARAM_K1 - Free smoothing parameter for BM25. +.. data:: PARAM_B - Free smoothing parameter for BM25. +.. data:: EPSILON - Constant used for negative idf values in the corpus. + +""" + + import math from six import iteritems from six.moves import xrange -# BM25 parameters. PARAM_K1 = 1.5 PARAM_B = 0.75 EPSILON = 0.25 class BM25(object): + """Implementation of the Best Matching 25 ranking function. + + Attributes + ---------- + corpus_size : int + Size of the corpus (number of documents). + avgdl : float + Average length of a document in `corpus`. + corpus : list of list of str + Corpus of documents. + f : list of dict + Term frequencies for each document in `corpus`: words as keys, frequencies as values. + df : dict + Document frequencies for the whole `corpus`: words as keys, the number of documents containing each word as values. + idf : dict + Inverse document frequencies for the whole `corpus`: words as keys, idf values as values.
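+
+    Examples
+    --------
+    A possible usage sketch for scoring a query against the corpus; the query below is illustrative,
+    and `average_idf` is computed the same way as in :func:`get_bm25_weights`:
+
+    >>> from gensim.summarization.bm25 import BM25
+    >>> corpus = [
+    ...     ["black", "cat", "white", "cat"],
+    ...     ["cat", "outer", "space"],
+    ...     ["wag", "dog"]
+    ... ]
+    >>> bm25 = BM25(corpus)
+    >>> average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
+    >>> scores = bm25.get_scores(["cat", "dog"], average_idf)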
+ + """ def __init__(self, corpus): + """ + Parameters + ---------- + corpus : list of list of str + Given corpus. + + """ self.corpus_size = len(corpus) self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size self.corpus = corpus @@ -26,6 +81,7 @@ def __init__(self, corpus): self.initialize() def initialize(self): + """Calculates term frequencies for each document and document frequencies for the corpus, then computes inverse document frequencies.""" for document in self.corpus: frequencies = {} for word in document: @@ -43,6 +99,23 @@ def __init__(self, corpus): self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5) def get_score(self, document, index, average_idf): + """Computes the BM25 score of `document` against the corpus document selected by `index`. + + Parameters + ---------- + document : list of str + Document to be scored. + index : int + Index of the corpus document scored against `document`. + average_idf : float + Average idf in corpus. + + Returns + ------- + float + BM25 score. + + """ score = 0 for word in document: if word not in self.f[index]: @@ -53,6 +126,22 @@ def get_score(self, document, index, average_idf): return score def get_scores(self, document, average_idf): + """Computes and returns the BM25 scores of `document` in relation to + every item in the corpus. + + Parameters + ---------- + document : list of str + Document to be scored. + average_idf : float + Average idf in corpus. + + Returns + ------- + list of float + BM25 scores. + + """ scores = [] for index in xrange(self.corpus_size): score = self.get_score(document, index, average_idf) @@ -61,6 +150,30 @@ def get_scores(self, document, average_idf): def get_bm25_weights(corpus): + """Returns BM25 scores (weights) for the documents in `corpus`. + Each document is scored against every document in the given corpus. + + Parameters + ---------- + corpus : list of list of str + Corpus of documents. + + Returns + ------- + list of list of float + BM25 scores. + + Examples + -------- + >>> from gensim.summarization.bm25 import get_bm25_weights + >>> corpus = [ + ... ["black", "cat", "white", "cat"], + ... ["cat", "outer", "space"], + ... ["wag", "dog"] + ... ] + >>> result = get_bm25_weights(corpus) + + """ bm25 = BM25(corpus) average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py index 1c467098f9..f1a2264e46 100644 --- a/gensim/summarization/commons.py +++ b/gensim/summarization/commons.py @@ -3,10 +3,45 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module provides functions for creating a graph from a sequence of values and for removing unreachable nodes. + + +Examples +-------- + +Create a simple graph and add edges, then take a look at its nodes. + +>>> gg = build_graph(['Felidae', 'Lion', 'Tiger', 'Wolf']) +>>> gg.add_edge(("Felidae", "Lion")) +>>> gg.add_edge(("Felidae", "Tiger")) +>>> sorted(gg.nodes()) +['Felidae', 'Lion', 'Tiger', 'Wolf'] + +Remove nodes with no edges.
+ +>>> remove_unreachable_nodes(gg) +>>> sorted(gg.nodes()) +['Felidae', 'Lion', 'Tiger'] + +""" + from gensim.summarization.graph import Graph def build_graph(sequence): + """Creates and returns undirected graph with given sequence of values. + + Parameters + ---------- + sequence : list of hashable + Sequence of values. + + Returns + ------- + :class:`~gensim.summarization.graph.Graph` + Created graph. + + """ graph = Graph() for item in sequence: if not graph.has_node(item): @@ -15,6 +50,15 @@ def build_graph(sequence): def remove_unreachable_nodes(graph): + """Removes unreachable nodes (nodes with no edges), inplace. + + Parameters + ---------- + graph : :class:`~gensim.summarization.graph.Graph` + Given graph. + + """ + for node in graph.nodes(): if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: graph.del_node(node) diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index c35a59a25d..79cd1a160f 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -3,141 +3,203 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module contains abstract class IGraph represents graphs interface and +class Graph (based on IGraph) which implements undirected graph. + +Examples +-------- + +Create simple graph with 4 nodes. + +>>> g = Graph() +>>> g.add_node('Felidae') +>>> g.add_node('Lion') +>>> g.add_node('Tiger') +>>> g.add_node('Wolf') +>>> sorted(g.nodes()) +['Felidae', 'Lion', 'Tiger', 'Wolf'] + +Add some edges and check neighbours. + +>>> g.add_edge(("Felidae", "Lion")) +>>> g.add_edge(("Felidae", "Tiger")) +>>> g.neighbors("Felidae") +['Lion', 'Tiger'] + +One node has no neighbours. + +>>> g.neighbors("Wolf") +[] + +""" + from abc import ABCMeta, abstractmethod class IGraph(object): - """ Represents the interface or contract that the graph for TextRank + """Represents the interface or contract that the graph for TextRank should implement. """ __metaclass__ = ABCMeta @abstractmethod def nodes(self): - """ - Return node list. + """Returns all nodes of graph. + + Returns + ------- + list of hashable + Nodes of graph. - @rtype: list - @return: Node list. """ pass @abstractmethod def edges(self): - """ - Return all edges in the graph. + """Returns all edges of graph. + + Returns + ------- + list of (hashable, hashable) + Edges of graph. - @rtype: list - @return: List of all edges in the graph. """ pass @abstractmethod def neighbors(self, node): - """ - Return all nodes that are directly accessible from given node. + """Return all nodes that are directly accessible from given node. + + Parameters + ---------- + node : hashable + Given node identifier. - @type node: node - @param node: Node identifier + Returns + ------- + list of hashable + Nodes directly accessible from given `node`. - @rtype: list - @return: List of nodes directly accessible from given node. """ pass @abstractmethod def has_node(self, node): - """ - Return whether the requested node exists. + """Returns whether the requested node exists. + + Parameters + ---------- + node : hashable + Given node identifier. - @type node: node - @param node: Node identifier + Returns + ------- + bool + True if `node` exists, False otherwise. - @rtype: boolean - @return: Truth-value for node existence. """ pass @abstractmethod def add_node(self, node, attrs=None): - """ - Add given node to the graph. + """Adds given node to the graph. 
- @attention: While nodes can be of any type, it's strongly recommended - to use only numbers and single-line strings as node identifiers if you - intend to use write(). + Note + ---- + While nodes can be of any type, it's strongly recommended to use only numbers and single-line strings + as node identifiers if you intend to use write(). - @type node: node - @param node: Node identifier. + Parameters + ---------- + node : hashable + Given node + attrs : list, optional + Node attributes specified as (attribute, value) - @type attrs: list - @param attrs: List of node attributes specified as (attribute, value) - tuples. """ pass @abstractmethod def add_edge(self, edge, wt=1, label='', attrs=None): - """ - Add an edge to the graph connecting two nodes. - - An edge, here, is a pair of nodes like C{(n, m)}. - - @type edge: tuple - @param edge: Edge. - - @type wt: number - @param wt: Edge weight. + """Adds an edge to the graph connecting two nodes. An edge, here, + is a tuple of two nodes. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. + wt : float, optional + Weight of new edge. + label : str, optional + Edge label. + attrs : list, optional + Node attributes specified as (attribute, value) - @type label: string - @param label: Edge label. - - @type attrs: list - @param attrs: List of node attributes specified as (attribute, value) - tuples. """ pass @abstractmethod def has_edge(self, edge): - """ - Return whether an edge exists. + """Returns whether an edge exists. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. - @type edge: tuple - @param edge: Edge. + Returns + ------- + bool + True if `edge` exists, False otherwise. - @rtype: boolean - @return: Truth-value for edge existence. """ pass @abstractmethod def edge_weight(self, edge): - """ - Get the weight of an edge. + """Returns weigth of given edge. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. - @type edge: edge - @param edge: One edge. + Returns + ------- + float + Edge weight. - @rtype: number - @return: Edge weight. """ pass @abstractmethod def del_node(self, node): - """ - Remove a node from the graph. + """Removes node and its edges from the graph. + + Parameters + ---------- + node : hashable + Node to delete. - @type node: node - @param node: Node identifier. """ pass class Graph(IGraph): """ - Implementation of an undirected graph, based on Pygraph + Implementation of an undirected graph, based on IGraph. + + Attributes + ---------- + Graph.WEIGHT_ATTRIBUTE_NAME : str + Name of weight attribute in graph. + Graph.DEFAULT_WEIGHT : float + Weight set by default. + Graph.LABEL_ATTRIBUTE_NAME : str + Default name of attribute. Not used. + Graph.DEFAULT_LABEL : str + Label set by default. Not used. + """ WEIGHT_ATTRIBUTE_NAME = "weight" @@ -147,6 +209,8 @@ class Graph(IGraph): DEFAULT_LABEL = "" def __init__(self): + """Initializes object.""" + # Metadata about edges # Mapping: Edge -> Dict mapping, lablel-> str, wt->num self.edge_properties = {} @@ -160,19 +224,90 @@ def __init__(self): self.node_neighbors = {} def has_edge(self, edge): + """Returns whether an edge exists. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. + + Returns + ------- + bool + True if `edge` exists, False otherwise. + + """ u, v = edge return (u, v) in self.edge_properties and (v, u) in self.edge_properties def edge_weight(self, edge): + """Returns weight of given edge. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. 
+ + Returns + ------- + float + Edge weight. + + """ return self.get_edge_properties(edge).setdefault(self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) def neighbors(self, node): + """Returns all nodes that are directly accessible from given node. + + Parameters + ---------- + node : hashable + Given node identifier. + + Returns + ------- + list of hashable + Nodes directly accessible from given `node`. + + """ return self.node_neighbors[node] def has_node(self, node): + """Returns whether the requested node exists. + + Parameters + ---------- + node : hashable + Given node. + + Returns + ------- + bool + True if `node` exists, False otherwise. + + """ return node in self.node_neighbors def add_edge(self, edge, wt=1, label='', attrs=None): + """Adds an edge to the graph connecting two nodes. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. + wt : float, optional + Weight of new edge. + label : str, optional + Edge label. + attrs : list, optional + Node attributes specified as (attribute, value). + + Raises + ------ + ValueError + If `edge` already exists in graph. + + """ if attrs is None: attrs = [] u, v = edge @@ -187,6 +322,27 @@ def add_edge(self, edge, wt=1, label='', attrs=None): raise ValueError("Edge (%s, %s) already in graph" % (u, v)) def add_node(self, node, attrs=None): + """Adds given node to the graph. + + Note + ---- + While nodes can be of any type, it's strongly recommended + to use only numbers and single-line strings as node identifiers if you + intend to use write(). + + Parameters + ---------- + node : hashable + Given node. + attrs : list of (hashable, hashable), optional + Node attributes specified as (attribute, value) + + Raises + ------ + ValueError + If `node` already exists in graph. + + """ if attrs is None: attrs = [] if node not in self.node_neighbors: @@ -196,44 +352,138 @@ def add_node(self, node, attrs=None): raise ValueError("Node %s already in graph" % node) def nodes(self): + """Returns all nodes of the graph. + + Returns + ------- + list of hashable + Nodes of graph. + + """ return list(self.node_neighbors.keys()) def edges(self): + """Returns all edges of the graph. + + Returns + ------- + list of (hashable, hashable) + Edges of graph. + + """ return [a for a in self.edge_properties.keys()] def del_node(self, node): + """Removes given node and its edges from the graph. + + Parameters + ---------- + node : hashable + Given node. + + """ for each in list(self.neighbors(node)): if each != node: self.del_edge((each, node)) del self.node_neighbors[node] del self.node_attr[node] - # Helper methods def get_edge_properties(self, edge): + """Returns properties of given given edge. If edge doesn't exist + empty dictionary will be returned. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. + + Returns + ------- + dict + Properties of graph. + + """ return self.edge_properties.setdefault(edge, {}) def add_edge_attributes(self, edge, attrs): + """Adds attributes `attrs` to given edge, order of nodes in edge doesn't matter. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. + attrs : list + Provided attributes to add. + + """ for attr in attrs: self.add_edge_attribute(edge, attr) def add_edge_attribute(self, edge, attr): + """Adds attribute `attr` to given edge, order of nodes in edge doesn't matter. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. + + attr : object + Provided attribute to add. 
+ + """ self.edge_attr[edge] = self.edge_attributes(edge) + [attr] if edge[0] != edge[1]: self.edge_attr[(edge[1], edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr] def edge_attributes(self, edge): + """Returns attributes of given edge. + + Note + ---- + In case of non existing edge returns empty list. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. + + Returns + ------- + list + Attributes of given edge. + + """ try: return self.edge_attr[edge] except KeyError: return [] def set_edge_properties(self, edge, **properties): + """Adds `properties` to given edge, order of nodes in edge doesn't matter. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. + + properties : dict + Properties to add. + + """ self.edge_properties.setdefault(edge, {}).update(properties) if edge[0] != edge[1]: self.edge_properties.setdefault((edge[1], edge[0]), {}).update(properties) def del_edge(self, edge): + """Removes given edges from the graph. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. + + """ u, v = edge self.node_neighbors[u].remove(v) self.del_edge_labeling((u, v)) @@ -242,6 +492,14 @@ def del_edge(self, edge): self.del_edge_labeling((v, u)) def del_edge_labeling(self, edge): + """Removes attributes and properties of given edge. + + Parameters + ---------- + edge : (hashable, hashable) + Given edge. + + """ keys = [edge, edge[::-1]] for key in keys: diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 1630c9389d..4074088a04 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -3,6 +3,35 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module contains functions to find keywords of the text and building graph on tokens from text. + +Examples +-------- +Extract keywords from text + +>>> from gensim.summarization import keywords +>>> text='''Challenges in natural language processing frequently involve +... speech recognition, natural language understanding, natural language +... generation (frequently from formal, machine-readable logical forms), +... connecting language and machine perception, dialog systems, or some +... combination thereof.''' +>>> keywords(text).split('\\n') +[u'natural language', u'machine', u'frequently'] + + +Notes +----- +Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters +for `INCLUDING_FILTER` and `EXCLUDING_FILTER` + +Data: +----- +.. data:: WINDOW_SIZE - Size of window, number of consecutive tokens in processing. +.. data:: INCLUDING_FILTER - Including part of speech filters. +.. data:: EXCLUDING_FILTER - Excluding part of speech filters. + +""" + from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank from gensim.summarization.textcleaner import clean_text_by_word as _clean_text_by_word from gensim.summarization.textcleaner import tokenize_by_word as _tokenize_by_word @@ -17,20 +46,43 @@ WINDOW_SIZE = 2 -""" -Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters -Example: filter for nouns and adjectives: -INCLUDING_FILTER = ['NN', 'JJ'] -""" INCLUDING_FILTER = ['NN', 'JJ'] EXCLUDING_FILTER = [] def _get_pos_filters(): + """Get default including and excluding filters as frozen sets. + + Returns + ------- + (frozenset of str, frozenset of str) + Including and excluding filters. 
+ + """ return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER) def _get_words_for_graph(tokens, pos_filter=None): + """Filters given dictionary of tokens using provided part of speech filters. + + Parameters + ---------- + tokens : dict + Original units (words) as keys and processed units (tokens) as values. + pos_filter : iterable + Part of speech filters, optional. If `None` - using :func:`_get_pos_filters`. + + Returns + ------- + list of str + Filtered tokens. + + Raises + ------ + ValueError + If include and exclude filters ar not empty at the same time. + + """ if pos_filter is None: include_filters, exclude_filters = _get_pos_filters() else: @@ -49,10 +101,37 @@ def _get_words_for_graph(tokens, pos_filter=None): def _get_first_window(split_text): + """Get first :const:`~gensim.parsing.keywords.WINDOW_SIZE` tokens from given `split_text`. + + Parameters + ---------- + split_text : list of str + Splitted text. + + Returns + ------- + list of str + First :const:`~gensim.parsing.keywords.WINDOW_SIZE` tokens. + + """ return split_text[:WINDOW_SIZE] def _set_graph_edge(graph, tokens, word_a, word_b): + """Sets an edge between nodes named word_a and word_b if they exists in `tokens` and `graph`, inplace. + + Parameters + ---------- + graph : :class:~gensim.summarization.graph.Graph + Given graph. + tokens : dict + Original units (words) as keys and processed units (tokens) as values. + word_a : str + First word, name of first node. + word_b : str + Second word, name of second node. + + """ if word_a in tokens and word_b in tokens: lemma_a = tokens[word_a].token lemma_b = tokens[word_b].token @@ -63,12 +142,38 @@ def _set_graph_edge(graph, tokens, word_a, word_b): def _process_first_window(graph, tokens, split_text): + """Sets an edges between nodes taken from first :const:`~gensim.parsing.keywords.WINDOW_SIZE` + words of `split_text` if they exist in `tokens` and `graph`, inplace. + + Parameters + ---------- + graph : :class:~gensim.summarization.graph.Graph + Given graph. + tokens : dict + Original units (words) as keys and processed units (tokens) as values. + split_text : list of str + Splitted text. + + """ first_window = _get_first_window(split_text) for word_a, word_b in _combinations(first_window, 2): _set_graph_edge(graph, tokens, word_a, word_b) def _init_queue(split_text): + """Initialize queue by first words from `split_text`. + + Parameters + ---------- + split_text : list of str + Splitted text. + + Returns + ------- + Queue + Initialized queue. + + """ queue = _Queue() first_window = _get_first_window(split_text) for word in first_window[1:]: @@ -77,17 +182,56 @@ def _init_queue(split_text): def _process_word(graph, tokens, queue, word): + """Sets edge between `word` and each element in queue in `graph` if such nodes + exist in `tokens` and `graph`. + + Parameters + ---------- + graph : :class:`~gensim.summarization.graph.Graph` + Given graph. + tokens : dict + Original units (words) as keys and processed units (tokens) as values. + queue : Queue + Given queue. + word : str + Word, possible `node` in graph and item in `tokens`. + + """ for word_to_compare in _queue_iterator(queue): _set_graph_edge(graph, tokens, word, word_to_compare) def _update_queue(queue, word): + """Updates given `queue` (removes last item and puts `word`). + + Parameters + ---------- + queue : Queue + Given queue. + word : str + Word to be added to queue. 
+ + """ queue.get() queue.put(word) assert queue.qsize() == (WINDOW_SIZE - 1) def _process_text(graph, tokens, split_text): + """Process `split_text` by updating given `graph` with new eges between nodes + if they exists in `tokens` and `graph`. + Words are taken from `split_text` with window size :const:`~gensim.parsing.keywords.WINDOW_SIZE`. + + Parameters + ---------- + graph : :class:`~gensim.summarization.graph.Graph` + Given graph. + tokens : dict + Original units (words) as keys and processed units (tokens) as values. + split_text : list of str + Splitted text. + + """ queue = _init_queue(split_text) for i in xrange(WINDOW_SIZE, len(split_text)): word = split_text[i] @@ -96,6 +240,19 @@ def _process_text(graph, tokens, split_text): def _queue_iterator(queue): + """Represents iterator of the given queue. + + Parameters + ---------- + queue : Queue + Given queue. + + Yields + ------ + str + Current item of queue. + + """ iterations = queue.qsize() for _ in xrange(iterations): var = queue.get() @@ -104,20 +261,63 @@ def _queue_iterator(queue): def _set_graph_edges(graph, tokens, split_text): + """Updates given `graph` by setting eges between nodes if they exists in `tokens` and `graph`. + Words are taken from `split_text` with window size :const:`~gensim.parsing.keywords.WINDOW_SIZE`. + + Parameters + ---------- + graph : :class:~gensim.summarization.graph.Graph + Given graph. + tokens : dict + Original units (words) as keys and processed units (tokens) as values. + split_text : list of str + Splitted text. + + """ _process_first_window(graph, tokens, split_text) _process_text(graph, tokens, split_text) def _extract_tokens(lemmas, scores, ratio, words): - lemmas.sort(key=lambda s: scores[s], reverse=True) + """Extracts tokens from provided lemmas. Most scored lemmas are used if `words` not provided. + + Parameters + ---------- + lemmas : list of str + Given lemmas. + scores : dict + Dictionary with lemmas and its scores. + ratio : float + Proportion of lemmas used for final result. + words : int + Number of used words. If no "words" option is selected, the number of + sentences is reduced by the provided ratio, else, the ratio is ignored. + + Returns + ------- + list of (float, str) + Scores and corresponded lemmas. - # If no "words" option is selected, the number of sentences is - # reduced by the provided ratio, else, the ratio is ignored. + """ + lemmas.sort(key=lambda s: scores[s], reverse=True) length = len(lemmas) * ratio if words is None else words return [(scores[lemmas[i]], lemmas[i],) for i in range(int(length))] def _lemmas_to_words(tokens): + """Get words and lemmas from given tokens. Produces "reversed" `tokens`. + + Parameters + ---------- + tokens : dict + Original units (words) as keys and processed units (tokens) as values. + + Returns + ------- + dict + Lemmas as keys and lists corresponding words as values. + + """ lemma_to_word = {} for word, unit in iteritems(tokens): lemma = unit.token @@ -129,11 +329,22 @@ def _lemmas_to_words(tokens): def _get_keywords_with_score(extracted_lemmas, lemma_to_word): + """Get words of `extracted_lemmas` and its scores, words contains in `lemma_to_word`. + + Parameters + ---------- + extracted_lemmas : list of (float, str) + Given lemmas with scores + lemma_to_word : dict + Lemmas and corresponding words. + + Returns + ------- + dict + Keywords as keys and its scores as values. 
+ """ - :param extracted_lemmas:list of tuples - :param lemma_to_word: dict of {lemma:list of words} - :return: dict of {keyword:score} - """ + keywords = {} for score, lemma in extracted_lemmas: keyword_list = lemma_to_word[lemma] @@ -143,15 +354,37 @@ def _get_keywords_with_score(extracted_lemmas, lemma_to_word): def _strip_word(word): + """Get cleaned `word`. + + Parameters + ---------- + word : str + Given word. + + Returns + ------- + str + Cleaned word. + """ stripped_word_list = list(_tokenize_by_word(word)) return stripped_word_list[0] if stripped_word_list else "" def _get_combined_keywords(_keywords, split_text): - """ - :param keywords:dict of keywords:scores - :param split_text: list of strings - :return: combined_keywords:list + """Get most scored words (`_keywords`) contained in `split_text` and it's combinations. + + Parameters + ---------- + _keywords : dict + Keywords as keys and its scores as values. + split_text : list of str + Splitted text. + + Returns + ------- + list of str + Keywords and/or its combinations. + """ result = [] _keywords = _keywords.copy() @@ -175,6 +408,21 @@ def _get_combined_keywords(_keywords, split_text): def _get_average_score(concept, _keywords): + """Get average score of words in `concept`. + + Parameters + ---------- + concept : str + Input text. + _keywords : dict + Keywords as keys and its scores as values. + + Returns + ------- + float + Average score. + + """ word_list = concept.split() word_counter = 0 total = 0 @@ -185,9 +433,29 @@ def _get_average_score(concept, _keywords): def _format_results(_keywords, combined_keywords, split, scores): - """ - :param keywords:dict of keywords:scores - :param combined_keywords:list of word/s + """Formats, sorts and returns `combined_keywords` in desired format. + + Parameters + ---------- + _keywords : dict + Keywords as keys and its scores as values. + combined_keywords : list of str + Most ranked words and/or its combinations. + split : bool + Split result if True or return string otherwise, optional. + scores : bool + Whether return `combined_keywords` with scores, optional. If True + `split` is ignored. + + Returns + ------- + result: list of (str, float) + If `scores`, keywords with scores **OR** + result: list of str + If `split`, keywords only **OR** + result: str + Keywords, joined by endl. + """ combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True) if scores: @@ -199,6 +467,39 @@ def _format_results(_keywords, combined_keywords, split, scores): def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True): + """Get most ranked words of provided text and/or its combinations. + + Parameters + ---------- + + text : str + Input text. + ratio : float, optional + If no "words" option is selected, the number of sentences is reduced by the provided ratio, + else, the ratio is ignored. + words : int, optional + Number of returned words. + split : bool, optional + Whether split keywords if True. + scores : bool, optional + Whether score of keyword. + pos_filter : tuple, optional + Part of speech filters. + lemmatize : bool, optional + If True - lemmatize words. + deacc : bool, optional + If True - remove accentuation. + + Returns + ------- + result: list of (str, float) + If `scores`, keywords with scores **OR** + result: list of str + If `split`, keywords only **OR** + result: str + Keywords, joined by endl. 
+ + """ # Gets a dict of word -> lemma text = to_unicode(text) tokens = _clean_text_by_word(text, deacc=deacc) @@ -233,6 +534,19 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= def get_graph(text): + """Creates and returns a graph from the given text. The text is cleaned and tokenized before the graph is built. + + Parameters + ---------- + text : str + Given text. + + Returns + ------- + :class:`~gensim.summarization.graph.Graph` + Created graph. + + """ tokens = _clean_text_by_word(text) split_text = list(_tokenize_by_word(text))
diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py index b9c5c02f33..11437f5c86 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -11,52 +11,47 @@ def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0): - """Extract keywords from text using the Montemurro and Zanette entropy - algorithm. [1]_ + """Extract keywords from text using the Montemurro and Zanette entropy algorithm [1]_. Parameters ---------- text: str - document to summarize + Document for summarization. blocksize: int, optional - size of blocks to use in analysis, default is 1024 + Size of blocks to use in analysis. scores: bool, optional - Whether to return score with keywords, default is False + Whether to return scores with keywords. split: bool, optional - Whether to return results as list, default is False + Whether to return results as a list. weighted: bool, optional - Whether to weight scores by word frequency. Default is True. - False can useful for shorter texts, and allows automatic thresholding + Whether to weight scores by word frequency. + False can be useful for shorter texts, and allows automatic thresholding. threshold: float or 'auto', optional - minimum score for returned keywords, default 0.0 - 'auto' calculates the threshold as n_blocks / (n_blocks + 1.0) + 1.0e-8 - Use 'auto' with weighted=False) + Minimum score for returned keywords, 'auto' calculates the threshold as n_blocks / (n_blocks + 1.0) + 1e-8, + use 'auto' with `weighted=False`. Returns ------- results: str - newline separated keywords if `split` == False OR + newline separated keywords if `split` == False **OR** results: list(str) - list of keywords if `scores` == False OR + list of keywords if `scores` == False **OR** results: list(tuple(str, float)) list of (keyword, score) tuples if `scores` == True Results are returned in descending order of score regardless of the format. - Notes - ----- + Note + ---- This algorithm looks for keywords that contribute to the structure of the - text on scales of blocksize words of larger. It is suitable for extracting + text on scales of `blocksize` words or larger. It is suitable for extracting keywords representing the major themes of long texts. References ---------- - [1] Marcello A Montemurro, Damian Zanette, - "Towards the quantification of the semantic information encoded in - written language" - Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153 - DOI: 10.1142/S0219525910002530 - https://arxiv.org/abs/0907.1558 + .. [1] Marcello A Montemurro, Damian Zanette, "Towards the quantification of the semantic information encoded in + written language". Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153, + DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558
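+
+    Examples
+    --------
+    A possible call using the automatic thresholding mentioned above; the file name below is
+    illustrative, any sufficiently long plain-text document will do:
+
+    >>> from gensim.summarization.mz_entropy import mz_keywords
+    >>> document = open('some_long_text.txt').read()  # hypothetical input file
+    >>> keywords = mz_keywords(document, scores=True, weighted=False, threshold='auto')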
+ """ text = to_unicode(text)
diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index f5a24635a1..df1352367c 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -2,6 +2,36 @@ # -*- coding: utf-8 -*- # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +"""This module calculates PageRank [1]_ based on a word graph. + + +.. [1] https://en.wikipedia.org/wiki/PageRank + +Examples +-------- + +Calculate PageRank for words + +>>> from gensim.summarization.keywords import get_graph +>>> from gensim.summarization.pagerank_weighted import pagerank_weighted +>>> graph = get_graph("The road to hell is paved with good intentions.") +>>> # result will look like {'good': 0.70432858653171504, 'hell': 0.051128871128006126, ...} +>>> result = pagerank_weighted(graph) + +Build adjacency matrix from graph + +>>> from gensim.summarization.pagerank_weighted import build_adjacency_matrix +>>> build_adjacency_matrix(graph).todense() +matrix([[ 0., 0., 0., 0., 0.], + [ 0., 0., 1., 0., 0.], + [ 0., 1., 0., 0., 0.], + [ 0., 0., 0., 0., 0.], + [ 0., 0., 0., 0., 0.]]) + +""" + + import numpy from numpy import empty as empty_matrix from scipy.linalg import eig @@ -9,15 +39,23 @@ from scipy.sparse.linalg import eigs from six.moves import xrange -try: - from numpy import VisibleDeprecationWarning - import warnings - warnings.filterwarnings("ignore", category=VisibleDeprecationWarning) -except ImportError: - pass - def pagerank_weighted(graph, damping=0.85): + """Get a dictionary of `graph` nodes and their ranks. + + Parameters + ---------- + graph : :class:`~gensim.summarization.graph.Graph` + Given graph. + damping : float, optional + Damping parameter. + + Returns + ------- + dict + Nodes of `graph` as keys, their ranks as values. + + """ adjacency_matrix = build_adjacency_matrix(graph) probability_matrix = build_probability_matrix(graph) @@ -30,6 +68,19 @@ def pagerank_weighted(graph, damping=0.85): def build_adjacency_matrix(graph): + """Get matrix representation of given `graph`. + + Parameters + ---------- + graph : :class:`~gensim.summarization.graph.Graph` + Given graph. + + Returns + ------- + :class:`scipy.sparse.csr_matrix`, shape = [n, n] + Adjacency matrix of given `graph`, where n is the number of nodes. + + """ row = [] col = [] data = [] @@ -50,6 +101,20 @@ def build_adjacency_matrix(graph): def build_probability_matrix(graph): + """Get square matrix of shape (n, n), where n is the number of nodes of the + given `graph`. + + Parameters + ---------- + graph : :class:`~gensim.summarization.graph.Graph` + Given graph. + + Returns + ------- + numpy.ndarray, shape = [n, n] + Matrix of transition probabilities, where n is the number of nodes of `graph`. + + """ dimension = len(graph.nodes()) matrix = empty_matrix((dimension, dimension)) @@ -60,6 +125,19 @@ def build_probability_matrix(graph): def principal_eigenvector(a): + """Get the principal eigenvector of square matrix `a`. + + Parameters + ---------- + a : numpy.ndarray, shape = [n, n] + Given matrix. + + Returns + ------- + numpy.ndarray, shape = [n, ] + Principal eigenvector of matrix `a`. + + """ # Note that we prefer to use `eigs` even for dense matrix # because we need only one eigenvector. See #441, #438 for discussion. @@ -74,6 +152,22 @@ def principal_eigenvector(a): def process_results(graph, vec): + """Get `graph` nodes and the corresponding absolute values of the provided eigenvector.
+ This function is helper for :func:`~gensim.summarization.pagerank_weighted.pagerank_weighted` + + Parameters + ---------- + graph : :class:`~gensim.summarization.graph.Graph` + Given graph. + vec : numpy.ndarray, shape = [n, ] + Given eigenvector, n is number of nodes of `graph`. + + Returns + ------- + dict + Graph nodes as keys, corresponding elements of eigenvector as values. + + """ scores = {} for i, node in enumerate(graph.nodes()): scores[node] = abs(vec[i]) diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 1b1251b1d7..afeb359b15 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -3,7 +3,55 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module provides functions for summarizing texts. Summarizing is based on +ranks of text sentences using a variation of the TextRank algorithm [1]_. + +.. [1] Federico Barrios, Federico LĀ“opez, Luis Argerich, Rosita Wachenchauzer (2016). + Variations of the Similarity Function of TextRank for Automated Summarization, + https://arxiv.org/abs/1602.03606 + + +Data +---- + +.. data:: INPUT_MIN_LENGTH - Minimal number of sentences in text +.. data:: WEIGHT_THRESHOLD - Minimal weight of edge between graph nodes. Smaller weights set to zero. + +Example +------- + +>>> from gensim.summarization.summarizer import summarize +>>> text = '''Rice Pudding - Poem by Alan Alexander Milne +... What is the matter with Mary Jane? +... She's crying with all her might and main, +... And she won't eat her dinner - rice pudding again - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... I've promised her dolls and a daisy-chain, +... And a book about animals - all in vain - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... She's perfectly well, and she hasn't a pain; +... But, look at her, now she's beginning again! - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... I've promised her sweets and a ride in the train, +... And I've begged her to stop for a bit and explain - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... She's perfectly well and she hasn't a pain, +... And it's lovely rice pudding for dinner again! +... What is the matter with Mary Jane?''' +>>> print(summarize(text)) +And she won't eat her dinner - rice pudding again - +I've promised her dolls and a daisy-chain, +I've promised her sweets and a ride in the train, +And it's lovely rice pudding for dinner again! + +""" + import logging +from gensim.utils import deprecated from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank from gensim.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences from gensim.summarization.commons import build_graph as _build_graph @@ -22,6 +70,15 @@ def _set_graph_edge_weights(graph): + """Sets weights using BM25 algorithm. Leaves small weights as zeroes. If all weights are fairly small, + forces all weights to 1, inplace. + + Parameters + ---------- + graph : :class:`~gensim.summarization.graph.Graph` + Given graph. + + """ documents = graph.nodes() weights = _bm25_weights(documents) @@ -48,6 +105,14 @@ def _set_graph_edge_weights(graph): def _create_valid_graph(graph): + """Sets all weights of edges for different edges as 1, inplace. + + Parameters + ---------- + graph : :class:`~gensim.summarization.graph.Graph` + Given graph. 
+ + """ nodes = graph.nodes() for i in xrange(len(nodes)): @@ -63,11 +128,45 @@ def _create_valid_graph(graph): graph.add_edge(edge, 1) +@deprecated("Function will be removed in 4.0.0") def _get_doc_length(doc): + """Get length of (tokenized) document. + + Parameters + ---------- + doc : list of (int, int) + Given document in bag-of-words format. + + Returns + ------- + int + Length of document. + + """ return sum([item[1] for item in doc]) +@deprecated("Function will be removed in 4.0.0") def _get_similarity(doc1, doc2, vec1, vec2): + """Returns similarity of two documents. + + Parameters + ---------- + doc1 : list of (int, int) + First document in bag-of-words format. + doc2 : list of (int, int) + Second document in bag-of-words format. + vec1 : array + Vector representation of the first document. + vec2 : array + Vector representation of the second document. + + Returns + ------- + float + Similarity of the two documents. + + """ numerator = vec1.dot(vec2.transpose()).toarray()[0][0] length_1 = _get_doc_length(doc1) length_2 = _get_doc_length(doc2) @@ -78,20 +177,63 @@ def _get_similarity(doc1, doc2, vec1, vec2): def _build_corpus(sentences): + """Construct corpus from provided sentences. + + Parameters + ---------- + sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Given sentences. + + Returns + ------- + list of list of (int, int) + Corpus built from sentences. + + """ split_tokens = [sentence.token.split() for sentence in sentences] dictionary = Dictionary(split_tokens) return [dictionary.doc2bow(token) for token in split_tokens] def _get_important_sentences(sentences, corpus, important_docs): + """Get most important sentences. + + Parameters + ---------- + sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Given sentences. + corpus : list of list of (int, int) + Provided corpus. + important_docs : list of list of (int, int) + Most important documents of the corpus. + + Returns + ------- + list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Most important sentences. + + """ hashable_corpus = _build_hasheable_corpus(corpus) sentences_by_corpus = dict(zip(hashable_corpus, sentences)) return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs] def _get_sentences_with_word_count(sentences, word_count): - """ Given a list of sentences, returns a list of sentences with a - total word count similar to the word count provided.""" + """Get a list of sentences whose total word count is close to the specified `word_count`. + + Parameters + ---------- + sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Given sentences. + word_count : int or None + Number of returned words. If None, all of the most important sentences will be returned. + + Returns + ------- + list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Most important sentences. + + """ length = 0 selected_sentences = [] @@ -111,6 +253,25 @@ def _get_sentences_with_word_count(sentences, word_count): def _extract_important_sentences(sentences, corpus, important_docs, word_count): + """Get most important sentences of the `corpus`. + + Parameters + ---------- + sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Given sentences. + corpus : list of list of (int, int) + Provided corpus. + important_docs : list of list of (int, int) + Most important documents of the corpus. + word_count : int or None + Number of words in the output. If None, all of the most important sentences will be returned.
+ + Returns + ------- + list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Most important sentences. + + """ important_sentences = _get_important_sentences(sentences, corpus, important_docs) # If no "word_count" option is provided, the number of sentences is @@ -121,29 +282,69 @@ def _extract_important_sentences(sentences, corpus, important_docs, word_count): def _format_results(extracted_sentences, split): + """Returns `extracted_sentences` in desired format. + + Parameters + ---------- + extracted_sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit + Given sentences. + split : bool + If True sentences will be returned as list. Otherwise sentences will be merged and returned as string. + + Returns + ------- + list of str + If `split` **OR** + str + Formatted result. + + """ if split: return [sentence.text for sentence in extracted_sentences] return "\n".join([sentence.text for sentence in extracted_sentences]) def _build_hasheable_corpus(corpus): + """Hashes and get `corpus`. + + Parameters + ---------- + corpus : list of list of (int, int) + Given corpus. + + Returns + ------- + list of list of (int, int) + Hashable corpus. + + """ return [tuple(doc) for doc in corpus] def summarize_corpus(corpus, ratio=0.2): + """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_. + Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer` + + Note + ---- + The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary + to make sense. + + + Parameters + ---------- + corpus : list of list of (int, int) + Given corpus. + ratio : float, optional + Number between 0 and 1 that determines the proportion of the number of + sentences of the original text to be chosen for the summary, optional. + + Returns + ------- + list of str + Most important documents of given `corpus` sorted by the document score, highest first. + """ - Returns a list of the most important documents of a corpus using a - variation of the TextRank algorithm. - The input must have at least INPUT_MIN_LENGTH (%d) documents for the - summary to make sense. - - The length of the output can be specified using the ratio parameter, - which determines how many documents will be chosen for the summary - (defaults at 20%% of the number of documents of the corpus). - - The most important documents are returned as a list sorted by the - document score, highest first. - """ % INPUT_MIN_LENGTH hashable_corpus = _build_hasheable_corpus(corpus) # If the corpus is empty, the function ends. @@ -173,29 +374,39 @@ def summarize_corpus(corpus, ratio=0.2): def summarize(text, ratio=0.2, word_count=None, split=False): - """ - Returns a summarized version of the given text using a variation of - the TextRank algorithm (see https://arxiv.org/abs/1602.03606). + """Get a summarized version of the given text. The output summary will consist of the most representative sentences and will be returned as a string, divided by newlines. - If the split parameter is set to True, a list of sentences will be - returned instead. - - The input should be a string, and must be longer than - INPUT_MIN_LENGTH sentences for the summary to make sense. The text - will be split into sentences using the split_sentences method in the - summarization.texcleaner module. - Note that newlines divide sentences. 
- - The length of the output can be specified using the ratio and - word_count parameters: - - ratio should be a number between 0 and 1 that determines the - percentage of the number of sentences of the original text to be - chosen for the summary (defaults at 0.2). - word_count determines how many words will the output contain. + + Note + ---- + The input should be a string, and must be longer than :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` + sentences for the summary to make sense. + The text will be split into sentences using the split_sentences method in the :mod:`gensim.summarization.texcleaner` + module. Note that newlines divide sentences. + + + Parameters + ---------- + text : str + Given text. + ratio : float, optional + Number between 0 and 1 that determines the proportion of the number of + sentences of the original text to be chosen for the summary. + word_count : int or None, optional + Determines how many words will the output contain. If both parameters are provided, the ratio will be ignored. + split : bool, optional + If True, list of sentences will be returned. Otherwise joined + strings will bwe returned. + + Returns + ------- + list of str + If `split` **OR** + str + Most representative sentences of given the text. """ # Gets a list of processed sentences. diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py index 89842e1122..335ee6a212 100644 --- a/gensim/summarization/syntactic_unit.py +++ b/gensim/summarization/syntactic_unit.py @@ -3,13 +3,46 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module contains implementation of SyntacticUnit class. It generally used while text cleaning. +:class:`~gensim.summarization.syntactic_unit.SyntacticUnit` represents printable version of provided text. + +""" + class SyntacticUnit(object): + """SyntacticUnit class. + + Attributes + ---------- + text : str + Input text. + token : str + Tokenized text. + tag : str + Tag of unit, optional. + index : int + Index of sytactic unit in corpus, optional. + score : float + Score of synctatic unit, optional. + + """ def __init__(self, text, token=None, tag=None): + """ + + Parameters + ---------- + text : str + Input text. + token : str + Tokenized text, optional. + tag : str + Tag of unit, optional. + + """ self.text = text self.token = token - self.tag = tag[:2] if tag else None # Just first two letters of tag + self.tag = tag[:2] if tag else None # Just first two letters of tag self.index = -1 self.score = -1 diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index 4829d9f892..5af6bef257 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -3,6 +3,23 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module contains functions and processors used for processing text, +extracting sentences from text, working with acronyms and abbreviations. + +Data +---- + +.. data:: SEPARATOR - Special separator used in abbreviations. +.. data:: RE_SENTENCE - Pattern to split text to sentences. +.. data:: AB_SENIOR - Pattern for detecting abbreviations (example: Sgt. Pepper). +.. data:: AB_ACRONYM - Pattern for detecting acronyms. +.. data:: AB_ACRONYM_LETTERS - Pattern for detecting acronyms (example: P.S. I love you). +.. data:: UNDO_AB_SENIOR - Pattern like AB_SENIOR but with SEPARATOR between abbreviation and next word. +.. 
data:: UNDO_AB_ACRONYM - Pattern like AB_ACRONYM but with SEPARATOR between abbreviation and next word. + +""" + + from gensim.summarization.syntactic_unit import SyntacticUnit from gensim.parsing.preprocessing import preprocess_documents from gensim.utils import tokenize @@ -10,7 +27,7 @@ import re import logging -logger = logging.getLogger('summa.preprocessing.cleaner') +logger = logging.getLogger('summarizer.preprocessing.cleaner') try: from pattern.en import tag @@ -31,19 +48,96 @@ def split_sentences(text): + """Split and get list of sentences from given text. It preserves abbreviations set in + :const:`~gensim.summarization.textcleaner.AB_SENIOR` and :const:`~gensim.summarization.textcleaner.AB_ACRONYM`. + + Parameters + ---------- + text : str + Input text. + + Returns + ------- + list of str + Sentences of given text. + + Example + ------- + >>> from gensim.summarization.textcleaner import split_sentences + >>> text = '''Beautiful is better than ugly. + ... Explicit is better than implicit. Simple is better than complex.''' + >>> split_sentences(text) + ['Beautiful is better than ugly.', + 'Explicit is better than implicit.', + 'Simple is better than complex.'] + + """ processed = replace_abbreviations(text) return [undo_replacement(sentence) for sentence in get_sentences(processed)] def replace_abbreviations(text): + """Replace blank space to '@' separator after abbreviation and next word. + + Parameters + ---------- + text : str + Input sentence. + + Returns + ------- + str + Sentence with changed separator. + + Example + ------- + >>> replace_abbreviations("God bless you, please, Mrs. Robinson") + God bless you, please, Mrs.@Robinson + + """ return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM]) def undo_replacement(sentence): + """Replace `@` separator back to blank space after each abbreviation. + + Parameters + ---------- + sentence : str + Input sentence. + + Returns + ------- + str + Sentence with changed separator. + + Example + ------- + >>> undo_replacement("God bless you, please, Mrs.@Robinson") + God bless you, please, Mrs. Robinson + + """ return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) def replace_with_separator(text, separator, regexs): + """Get text with replaced separator if provided regular expressions were matched. + + Parameters + ---------- + text : str + Input text. + separator : str + The separator between words to be replaced. + regexs : list of `_sre.SRE_Pattern` + Regular expressions used in processing text. + + Returns + ------- + str + Text with replaced separators. + + """ replacement = r"\1" + separator + r"\2" result = text for regex in regexs: @@ -52,11 +146,51 @@ def replace_with_separator(text, separator, regexs): def get_sentences(text): + """Sentence generator from provided text. Sentence pattern set + in :const:`~gensim.summarization.textcleaner.RE_SENTENCE`. + + Parameters + ---------- + text : str + Input text. + + Yields + ------ + str + Single sentence extracted from text. + + Example + ------- + >>> text = "Does this text contains two sentences? Yes, it does." + >>> for sentence in get_sentences(text): + >>> print(sentence) + Does this text contains two sentences? + Yes, it does. + + """ for match in RE_SENTENCE.finditer(text): yield match.group() def merge_syntactic_units(original_units, filtered_units, tags=None): + """Process given sentences and its filtered (tokenized) copies into + :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`. 
Also adds tags if they are provided to produced units. + + Parameters + ---------- + original_units : list + List of original sentences. + filtered_units : list + List of tokenized sentences. + tags : list of str, optional + List of strings used as tags for each unit. None as deafault. + + Returns + ------- + list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit + List of syntactic units (sentences). + + """ units = [] for i in xrange(len(original_units)): if filtered_units[i] == '': @@ -74,12 +208,38 @@ def merge_syntactic_units(original_units, filtered_units, tags=None): def join_words(words, separator=" "): + """Concatenates `words` with `separator` between elements. + + Parameters + ---------- + words : list of str + Given words. + separator : str, optional + The separator between elements. + + Returns + ------- + str + String of merged words with separator between elements. + + """ return separator.join(words) def clean_text_by_sentences(text): - """ Tokenizes a given text into sentences, applying filters and lemmatizing them. - Returns a SyntacticUnit list. """ + """Tokenize a given text into sentences, applying filters and lemmatize them. + + Parameters + ---------- + text : str + Given text. + + Returns + ------- + list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Sentences of the given text. + + """ original_sentences = split_sentences(text) filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)] @@ -87,8 +247,29 @@ def clean_text_by_sentences(text): def clean_text_by_word(text, deacc=True): - """ Tokenizes a given text into words, applying filters and lemmatizing them. - Returns a dict of word -> syntacticUnit. """ + """Tokenize a given text into words, applying filters and lemmatize them. + + Parameters + ---------- + text : str + Given text. + deacc : bool, optional + Remove accentuation if True. + + Returns + ------- + dict + Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values. + + Example + ------- + >>> from gensim.summarization.textcleaner import clean_text_by_word + >>> clean_text_by_word("God helps those who help themselves") + {'god': Original unit: 'god' *-*-*-* Processed unit: 'god', + 'help': Original unit: 'help' *-*-*-* Processed unit: 'help', + 'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'} + + """ text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc)) filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)] @@ -101,5 +282,30 @@ def clean_text_by_word(text, deacc=True): def tokenize_by_word(text): + """Tokenize input text. Before tokenizing transforms text to lower case and removes accentuation and acronyms set + :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS`. + + Parameters + ---------- + text : str + Given text. + + Returns + ------- + generator + Generator that yields sequence words of the given text. + + Example + ------- + >>> from gensim.summarization.textcleaner import tokenize_by_word + >>> g = tokenize_by_word('Veni. Vedi. Vici.') + >>> print(next(g)) + veni + >>> print(next(g)) + vedi + >>> print(next(g)) + vici + + """ text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) return tokenize(text_without_acronyms, to_lower=True, deacc=True)
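The modules documented above are typically driven through the two top-level entry points, `summarize` and `keywords`. A minimal end-to-end sketch, assuming `document.txt` is a plain-text file with at least INPUT_MIN_LENGTH sentences; the file name and parameter values are illustrative:

from gensim.summarization import keywords
from gensim.summarization.summarizer import summarize

with open('document.txt') as fin:  # hypothetical input file
    text = fin.read()

# Most representative sentences, joined by newlines (ratio controls summary length).
print(summarize(text, ratio=0.2))

# Top 5 keywords, returned as a list because split=True.
print(keywords(text, words=5, split=True))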