Fix segment-wiki script (#1694)
* Fix script docstring (format description), remove pruning through tokenization, more descriptive field names, stdout support (as the default option)

* Add link to fresh en wiki

* Add link to fresh en wiki, examples section, filter redirect

* strip -> lstrip

* Add approximate processing time & ignore empty articles

* extend filtering section

* reduce threshold

* parametrize minimal article length, reduce to 200
menshikh-iv authored Nov 6, 2017
1 parent c583b28 commit 64f9a92
Showing 1 changed file with 83 additions and 36 deletions.
119 changes: 83 additions & 36 deletions gensim/scripts/segment_wiki.py
@@ -5,12 +5,37 @@
# Copyright (C) 2016 RaRe Technologies

"""
Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump and extract sections of pages from it
and save to json-line format.
If you have the `pattern` package installed, this module will use a fancy
lemmatization to get a lemma of each token (instead of plain alphabetic
tokenizer). The package is available at https://github.com/clips/pattern .
Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump (typical filename
is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2),
extract titles, section names and section content, and save them in json-line format
with 3 fields ::

    'title' (str) - title of the article,
    'section_titles' (list) - list of section titles,
    'section_texts' (list) - list of section contents.
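
For example, a single output line (one article per json line; the values below are illustrative) looks like ::

    {"title": "Anarchism", "section_titles": ["Introduction", "Etymology and terminology"], "section_texts": ["Anarchism is a political philosophy ...", "The word anarchism is composed from ..."]}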
The latest English Wikipedia dump is available
`here <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2>`_. Approximate
processing time is 2.5 hours (i7-6700HQ, SSD).
Examples
--------
Convert wiki to json-lines format:
`python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 | gzip > enwiki-latest.json.gz`
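or write the output to a file and raise the minimum article length (flags `-o` and `-m` from the argparse section at the bottom of this script; the threshold value here is only illustrative):
`python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz -m 400`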
Read the json-lines dump:
>>> import json
>>> from smart_open import smart_open
>>>
>>> # iterate over the json-lines file we just created
>>> for line in smart_open('enwiki-latest.json.gz'):
>>>     # decode JSON into a Python object
>>>     article = json.loads(line)
>>>
>>>     # each article has "title", "section_titles" and "section_texts" fields
>>>     print("Article title: %s" % article['title'])
>>>     for section_title, section_text in zip(article['section_titles'], article['section_texts']):
>>>         print("Section title: %s" % section_title)
>>>         print("Section text: %s" % section_text)
"""

@@ -22,15 +47,14 @@
import sys
from xml.etree import cElementTree

from gensim.corpora.wikicorpus import ARTICLE_MIN_WORDS, IGNORED_NAMESPACES, WikiCorpus, \
filter_wiki, get_namespace, tokenize, utils
from gensim.corpora.wikicorpus import IGNORED_NAMESPACES, WikiCorpus, filter_wiki, get_namespace, utils
from smart_open import smart_open


logger = logging.getLogger(__name__)


def segment_all_articles(file_path):
def segment_all_articles(file_path, min_article_character=200):
"""Extract article titles and sections from a MediaWiki bz2 database dump.
Parameters
@@ -39,27 +63,30 @@ def segment_all_articles(file_path):
Path to MediaWiki dump, typical filename is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
or <LANG>wiki-latest-pages-articles.xml.bz2.
min_article_character : int, optional
Minimal number of characters in an article (section titles and surrounding whitespace are not counted); shorter articles are skipped.
Yields
------
(str, list of (str, str))
Structure contains (title, [(section_heading, section_content), ...]).
"""
with smart_open(file_path, 'rb') as xml_fileobj:
wiki_sections_corpus = WikiSectionsCorpus(xml_fileobj)
wiki_sections_corpus = _WikiSectionsCorpus(xml_fileobj, min_article_character=min_article_character)
wiki_sections_corpus.metadata = True
wiki_sections_text = wiki_sections_corpus.get_texts_with_sections()
for article_title, article_sections in wiki_sections_text:
yield article_title, article_sections
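
A minimal usage sketch for this generator (the dump path is illustrative):

    from gensim.scripts.segment_wiki import segment_all_articles

    for title, sections in segment_all_articles('enwiki-latest-pages-articles.xml.bz2', min_article_character=200):
        for section_title, section_text in sections:
            print(title, section_title, len(section_text))
        break  # stop after the first article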


def segment_and_print_all_articles(file_path, output_file):
def segment_and_write_all_articles(file_path, output_file, min_article_character=200):
"""Write article title and sections to output_file,
output_file is json-line file with 3 fields::
'tl' - title of article,
'st' - list of titles of sections,
'sc' - list of content from sections.
'title' - title of article,
'section_titles' - list of titles of sections,
'section_texts' - list of content from sections.
Parameters
----------
@@ -68,18 +95,28 @@ def segment_and_print_all_articles(file_path, output_file):
or <LANG>wiki-latest-pages-articles.xml.bz2.
output_file : str
Path to output file.
Path to the output file in json-lines format, or None to write to stdout.
min_article_character : int, optional
Minimal number of characters in an article (section titles and surrounding whitespace are not counted); shorter articles are skipped.
"""
with smart_open(output_file, 'w') as outfile:
for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path)):
output_data = {"tl": article_title, "st": [], "sc": []}
if output_file is None:
    outfile = getattr(sys.stdout, 'buffer', sys.stdout)  # write bytes to stdout on Python 3
else:
    outfile = smart_open(output_file, 'wb')

try:
for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path, min_article_character)):
output_data = {"title": article_title, "section_titles": [], "section_texts": []}
for section_heading, section_content in article_sections:
output_data["st"].append(section_heading)
output_data["sc"].append(section_content)
output_data["section_titles"].append(section_heading)
output_data["section_texts"].append(section_content)
if (idx + 1) % 100000 == 0:
logger.info("Processed #%d articles", idx + 1)
outfile.write((json.dumps(output_data) + "\n").encode('utf-8'))
finally:
    if output_file is not None:  # do not close stdout
        outfile.close()
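
A minimal sketch of calling this function directly (paths are illustrative; pass None as output_file to stream to stdout):

    from gensim.scripts.segment_wiki import segment_and_write_all_articles

    segment_and_write_all_articles('enwiki-latest-pages-articles.xml.bz2', 'enwiki-latest.json.gz', min_article_character=200)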


def extract_page_xmls(f):
@@ -160,25 +197,28 @@ def segment(page_xml):
return title, sections


class WikiSectionsCorpus(WikiCorpus):
class _WikiSectionsCorpus(WikiCorpus):
"""Treat a wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus.
The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk.
"""
def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filter_namespaces=('0',)):
def __init__(self, fileobj, min_article_character=200, processes=None,
lemmatize=utils.has_pattern(), filter_namespaces=('0',)):
"""
Parameters
----------
fileobj : file
File descriptor of MediaWiki dump.
processes : int
min_article_character : int, optional
Minimal number of characters in an article (section titles and surrounding whitespace are not counted); shorter articles are skipped.
processes : int, optional
Number of processes, max(1, multiprocessing.cpu_count() - 1) if None.
lemmatize : bool
lemmatize : bool, optional
If `pattern` package is installed, use fancier shallow parsing to get token lemmas.
Otherwise, use simple regexp tokenization.
filter_namespaces : tuple of int
filter_namespaces : tuple of str, optional
Enumeration of namespaces that will be ignored.
"""
@@ -189,6 +229,7 @@ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filte
processes = max(1, multiprocessing.cpu_count() - 1)
self.processes = processes
self.lemmatize = lemmatize
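# articles whose stripped text is shorter than this are skipped by get_texts_with_sections()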
self.min_article_character = min_article_character

def get_texts_with_sections(self):
"""Iterate over the dump, returning titles and text versions of all sections of articles.
@@ -217,16 +258,15 @@ def get_texts_with_sections(self):
# is dumb and would load the entire input into RAM at once...
for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
for article_title, sections in pool.imap(segment, group): # chunksize=10):
# article redirects and short stubs are pruned here
num_total_tokens = 0
for section_title, section_content in sections:
if self.lemmatize:
num_total_tokens += len(utils.lemmatize(section_content))
else:
num_total_tokens += len(tokenize(section_content))
if num_total_tokens < ARTICLE_MIN_WORDS or \
any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
# non-articles, redirects and very short articles are pruned here
if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): # filter non-articles
continue
if not sections or sections[0][1].lstrip().lower().startswith("#redirect"): # filter redirect
continue
if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character:
# filter very short articles (trash)
continue

articles += 1
yield (article_title, sections)
pool.terminate()
@@ -239,8 +279,15 @@

parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__'])
parser.add_argument('-f', '--file', help='Path to MediaWiki database dump', required=True)
parser.add_argument('-o', '--output', help='Path to output file', required=True)
parser.add_argument('-o', '--output', help='Path to output file (stdout if not specified)')
parser.add_argument(
    '-m', '--min-article-character',
    help="Minimal number of characters in an article (section titles and surrounding whitespace are not counted); "
         "articles with fewer characters than this value are filtered out "
         "(they will not appear in the output file), default: %(default)s",
    type=int,
    default=200
)
args = parser.parse_args()
segment_and_print_all_articles(args.file, args.output)
segment_and_write_all_articles(args.file, args.output, args.min_article_character)

logger.info("finished running %s", sys.argv[0])
