Fix segment-wiki script (#1694)
* Fix script docstring (format description), remove pruning through tokenization, more descriptive field names, stdout support (as the default option)

* Add link to fresh en wiki

* Add link to fresh en wiki, examples section, filter redirect

* strip -> lstrip

* Add approximate processing time & ignore empty articles

* extend filtering section

* reduce threshold

* parametrize minimal article length, reduce to 200
menshikh-iv authored Nov 6, 2017
1 parent c583b28 commit 64f9a92
Showing 1 changed file with 83 additions and 36 deletions.
119 changes: 83 additions & 36 deletions gensim/scripts/segment_wiki.py
@@ -5,12 +5,37 @@
# Copyright (C) 2016 RaRe Technologies

"""
Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump and extract sections of pages from it
and save to json-line format.
If you have the `pattern` package installed, this module will use a fancy
lemmatization to get a lemma of each token (instead of plain alphabetic
tokenizer). The package is available at https://github.com/clips/pattern .
Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump (typical filename
is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2),
extract titles, section names and section content, and save them in json-line format
with 3 fields ::

    'title' (str) - title of the article,
    'section_titles' (list) - list of section titles,
    'section_texts' (list) - list of section contents.
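
For example, a single output line (one article per json line; the values below are illustrative) looks like ::

    {"title": "Anarchism", "section_titles": ["Introduction", "Etymology and terminology"], "section_texts": ["Anarchism is a political philosophy ...", "The word anarchism is composed from ..."]}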
The latest English Wikipedia dump is available
`here <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2>`_. Approximate
processing time is 2.5 hours (i7-6700HQ, SSD).
Examples
--------
Convert wiki to json-lines format:
`python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 | gzip > enwiki-latest.json.gz`
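or write the output to a file and raise the minimum article length (flags `-o` and `-m` from the argparse section at the bottom of this script; the threshold value here is only illustrative):
`python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz -m 400`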
Read the json-lines dump:
>>> import json
>>> from smart_open import smart_open
>>>
>>> # iterate over the json-lines file we just created
>>> for line in smart_open('enwiki-latest.json.gz'):
>>>     # decode JSON into a Python object
>>>     article = json.loads(line)
>>>
>>>     # each article has "title", "section_titles" and "section_texts" fields
>>>     print("Article title: %s" % article['title'])
>>>     for section_title, section_text in zip(article['section_titles'], article['section_texts']):
>>>         print("Section title: %s" % section_title)
>>>         print("Section text: %s" % section_text)
"""

@@ -22,15 +47,14 @@
import sys
from xml.etree import cElementTree

from gensim.corpora.wikicorpus import ARTICLE_MIN_WORDS, IGNORED_NAMESPACES, WikiCorpus, \
filter_wiki, get_namespace, tokenize, utils
from gensim.corpora.wikicorpus import IGNORED_NAMESPACES, WikiCorpus, filter_wiki, get_namespace, utils
from smart_open import smart_open


logger = logging.getLogger(__name__)


def segment_all_articles(file_path):
def segment_all_articles(file_path, min_article_character=200):
"""Extract article titles and sections from a MediaWiki bz2 database dump.
Parameters
@@ -39,27 +63,30 @@ def segment_all_articles(file_path):
Path to MediaWiki dump, typical filename is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
or <LANG>wiki-latest-pages-articles.xml.bz2.
min_article_character : int, optional
Minimal number of characters in an article (section titles and surrounding whitespace are not counted); shorter articles are skipped.
Yields
------
(str, list of (str, str))
Structure contains (title, [(section_heading, section_content), ...]).
"""
with smart_open(file_path, 'rb') as xml_fileobj:
wiki_sections_corpus = WikiSectionsCorpus(xml_fileobj)
wiki_sections_corpus = _WikiSectionsCorpus(xml_fileobj, min_article_character=min_article_character)
wiki_sections_corpus.metadata = True
wiki_sections_text = wiki_sections_corpus.get_texts_with_sections()
for article_title, article_sections in wiki_sections_text:
yield article_title, article_sections
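
A minimal usage sketch for this generator (the dump path is illustrative):

    from gensim.scripts.segment_wiki import segment_all_articles

    for title, sections in segment_all_articles('enwiki-latest-pages-articles.xml.bz2', min_article_character=200):
        for section_title, section_text in sections:
            print(title, section_title, len(section_text))
        break  # stop after the first article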


def segment_and_print_all_articles(file_path, output_file):
def segment_and_write_all_articles(file_path, output_file, min_article_character=200):
"""Write article title and sections to output_file,
output_file is json-line file with 3 fields::
'tl' - title of article,
'st' - list of titles of sections,
'sc' - list of content from sections.
'title' - title of article,
'section_titles' - list of titles of sections,
'section_texts' - list of content from sections.
Parameters
----------
@@ -68,18 +95,28 @@ def segment_and_print_all_articles(file_path, output_file):
or <LANG>wiki-latest-pages-articles.xml.bz2.
output_file : str
Path to output file.
Path to the output file in json-lines format, or None to write to stdout.
min_article_character : int, optional
Minimal number of characters in an article (section titles and surrounding whitespace are not counted); shorter articles are skipped.
"""
with smart_open(output_file, 'w') as outfile:
for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path)):
output_data = {"tl": article_title, "st": [], "sc": []}
if output_file is None:
    outfile = getattr(sys.stdout, 'buffer', sys.stdout)  # write bytes to stdout on Python 3
else:
    outfile = smart_open(output_file, 'wb')

try:
for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path, min_article_character)):
output_data = {"title": article_title, "section_titles": [], "section_texts": []}
for section_heading, section_content in article_sections:
output_data["st"].append(section_heading)
output_data["sc"].append(section_content)
output_data["section_titles"].append(section_heading)
output_data["section_texts"].append(section_content)
if (idx + 1) % 100000 == 0:
logger.info("Processed #%d articles", idx + 1)
outfile.write((json.dumps(output_data) + "\n").encode('utf-8'))
finally:
    if output_file is not None:  # do not close stdout
        outfile.close()
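
A minimal sketch of calling this function directly (paths are illustrative; pass None as output_file to stream to stdout):

    from gensim.scripts.segment_wiki import segment_and_write_all_articles

    segment_and_write_all_articles('enwiki-latest-pages-articles.xml.bz2', 'enwiki-latest.json.gz', min_article_character=200)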


def extract_page_xmls(f):
@@ -160,25 +197,28 @@ def segment(page_xml):
return title, sections


class WikiSectionsCorpus(WikiCorpus):
class _WikiSectionsCorpus(WikiCorpus):
"""Treat a wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus.
The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk.
"""
def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filter_namespaces=('0',)):
def __init__(self, fileobj, min_article_character=200, processes=None,
lemmatize=utils.has_pattern(), filter_namespaces=('0',)):
"""
Parameters
----------
fileobj : file
File descriptor of MediaWiki dump.
processes : int
min_article_character : int, optional
Minimal number of characters in an article (section titles and surrounding whitespace are not counted); shorter articles are skipped.
processes : int, optional
Number of processes, max(1, multiprocessing.cpu_count() - 1) if None.
lemmatize : bool
lemmatize : bool, optional
If `pattern` package is installed, use fancier shallow parsing to get token lemmas.
Otherwise, use simple regexp tokenization.
filter_namespaces : tuple of int
filter_namespaces : tuple of str, optional
Enumeration of namespaces that will be ignored.
"""
@@ -189,6 +229,7 @@ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filte
processes = max(1, multiprocessing.cpu_count() - 1)
self.processes = processes
self.lemmatize = lemmatize
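# articles whose stripped text is shorter than this are skipped by get_texts_with_sections()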
self.min_article_character = min_article_character

def get_texts_with_sections(self):
"""Iterate over the dump, returning titles and text versions of all sections of articles.
@@ -217,16 +258,15 @@ def get_texts_with_sections(self):
# is dumb and would load the entire input into RAM at once...
for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
for article_title, sections in pool.imap(segment, group): # chunksize=10):
# article redirects and short stubs are pruned here
num_total_tokens = 0
for section_title, section_content in sections:
if self.lemmatize:
num_total_tokens += len(utils.lemmatize(section_content))
else:
num_total_tokens += len(tokenize(section_content))
if num_total_tokens < ARTICLE_MIN_WORDS or \
any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
# non-articles, redirects and very short articles are pruned here
if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): # filter non-articles
continue
if not sections or sections[0][1].lstrip().lower().startswith("#redirect"): # filter redirect
continue
if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character:
# filter very short articles (trash)
continue

articles += 1
yield (article_title, sections)
pool.terminate()
@@ -239,8 +279,15 @@

parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__'])
parser.add_argument('-f', '--file', help='Path to MediaWiki database dump', required=True)
parser.add_argument('-o', '--output', help='Path to output file', required=True)
parser.add_argument('-o', '--output', help='Path to output file (stdout if not specified)')
parser.add_argument(
    '-m', '--min-article-character',
    help="Minimal number of characters in an article (section titles and surrounding whitespace are not counted); "
         "articles with fewer characters than this value are filtered out "
         "(they will not appear in the output file), default: %(default)s",
    type=int,
    default=200
)
args = parser.parse_args()
segment_and_print_all_articles(args.file, args.output)
segment_and_write_all_articles(args.file, args.output, args.min_article_character)

logger.info("finished running %s", sys.argv[0])
