repairing parentid for destroy rev, fixes #1448 #1455

Merged
8 changes: 3 additions & 5 deletions docs/admin/maintenance.rst
@@ -35,9 +35,10 @@ The processes below check for and optionally fix the following issues:

* size does not match size of the revision's data in bytes
* sha1 hash does not match the hash of the revision's data
* revision numbers for an item's revisions should form an unbroken sequence starting at 1
* parent id should not be present for revision number 1 of a given item
* parent id for each revision should be the data id for the previous revision number for that item
* every revision should have a revision number
* an item should not have repeated revision numbers

To check for invalid metadata, run the following command::

@@ -47,10 +48,7 @@ To view detailed list of invalid items::

moin maint-validate-metadata --all-backends --verbose

To fix issues, take your wiki offline and add ``--fix`` option to any of the above commands.
To fix issues, add the ``--fix`` option to any of the above commands.

To operate on only a selection of backends, replace the ``--all-backends`` option with ``--backends``
followed by a comma-separated list of backends to process.

If the ``--fix`` option finds anything to fix, you must rebuild the index
with the newly created metadata, see :doc:`index`.
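
A minimal sketch of the same check-and-fix cycle driven from Python, mirroring how this PR's CLI tests call the command through the ``run``/``assert_p_succcess`` helpers (the flags come from the docs above; everything else is illustrative)::

    from moin.cli._tests import run, assert_p_succcess

    # check all backends, then fix and re-validate
    validate = run(['moin', 'maint-validate-metadata', '--all-backends', '--verbose'])
    assert_p_succcess(validate)
    fix = run(['moin', 'maint-validate-metadata', '--all-backends', '--fix'])
    assert_p_succcess(fix)
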
1 change: 1 addition & 0 deletions requirements.d/development.txt
@@ -1,4 +1,5 @@
tox
psutil
pytest
# we use lxml.etree for xpath-based testing
lxml
9 changes: 9 additions & 0 deletions src/moin/_tests/__init__.py
@@ -11,6 +11,7 @@
import socket
from io import BytesIO
from pathlib import Path
import psutil
from typing import Tuple

from flask import g as flaskg
@@ -104,3 +105,11 @@ def get_dirs(subdir: str) -> Tuple[Path, Path]:
if not artifacts_dir.exists():
artifacts_dir.mkdir(parents=True)
return moin_dir, artifacts_dir


def get_open_wiki_files():
    """Return the current process's open file handles whose path contains 'wiki', printing each one."""
    proc = psutil.Process()
    files = [f for f in proc.open_files() if 'wiki' in f.path]
    for file in files:
        print(f'open wiki {file}')
    return files
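
A possible use of the new ``get_open_wiki_files`` helper is asserting that a test leaves no wiki storage files open; the test below is hypothetical::

    from moin._tests import get_open_wiki_files

    def test_no_wiki_files_left_open():
        # the helper prints any offending paths before the assertion fails
        assert get_open_wiki_files() == []
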
5 changes: 4 additions & 1 deletion src/moin/cli/_tests/__init__.py
@@ -20,16 +20,19 @@
logging = log.getLogger(__name__)


def run(cmd: List[str], log=None, wait: bool = True, timeout: int = None) \
def run(cmd: List[str], log=None, wait: bool = True, timeout: int = None, env=None) \
-> Union[subprocess.CompletedProcess, subprocess.Popen]:
"""run a shell command, redirecting output to log
:param cmd: list of strings containing command arguments
:param log: open file handle to log file (binary mode) or None in which case output will be captured
:param wait: if True return after process is complete, otherwise return immediately after start
:param timeout: timeout setting in seconds, can only be used when wait is True
:param env: dictionary of environment variables to add to current env for subprocess
:return: CompletedProcess object if wait else Popen object"""
subprocess_environ = copy(os.environ)
subprocess_environ['PYTHONIOENCODING'] = 'cp1252' # simulate windows terminal to ferret out encoding issues
if env:
subprocess_environ.update(env)
logging.info(f'running {cmd}')
if stdout := log:
stderr = subprocess.STDOUT
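
The new ``env`` argument is merged into the copied ``os.environ`` before the subprocess starts; a small illustrative call (command and values are arbitrary, the variable name matches the crawl fixture below)::

    from moin.cli._tests import run

    p = run(['moin', 'index-dump', '--no-truncate'], timeout=60,
            env={'MOIN_SCRAPY_CRAWL_CSV': '/tmp/crawl.csv'})
    assert p.returncode == 0
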
36 changes: 27 additions & 9 deletions src/moin/cli/_tests/conftest.py
@@ -103,6 +103,16 @@ def get_crawl_server_log_path():
return artifact_base_dir / 'server-crawl.log'


def get_crawl_log_path():
_, artifact_base_dir = get_dirs('')
return artifact_base_dir / 'crawl.log'


def get_crawl_csv_path():
_, artifact_base_dir = get_dirs('')
return artifact_base_dir / 'crawl.csv'


@pytest.fixture(scope="package")
def server(welcome, load_help, artifact_dir):
run(['moin', 'index-build'])
@@ -140,7 +150,7 @@ def server(welcome, load_help, artifact_dir):
if not started:
logging.error('server not started. server.log:')
try:
with open(server_log.name) as f:
with open(get_crawl_server_log_path()) as f:
logging.error(f.read())
except IOError as e:
logging.error(f'{repr(e)} when trying to open server log')
@@ -150,8 +160,11 @@
@pytest.fixture(scope="package")
def do_crawl(request, artifact_dir):
moin_dir, artifact_base_dir = get_dirs('')
(artifact_base_dir / 'crawl.log').touch() # insure github workflow will have a file to archive
(artifact_base_dir / 'crawl.csv').touch()
# initialize output files
with open(get_crawl_log_path(), 'w'):
pass
with open(get_crawl_csv_path(), 'w'):
pass
server_started = True
crawl_success = True
if settings.SITE_HOST == '127.0.0.1:9080':
@@ -163,13 +176,17 @@ def do_crawl(request, artifact_dir):
os.chdir(moin_dir / 'src' / 'moin' / 'cli' / '_tests' / 'scrapy')
try:
com = ['scrapy', 'crawl', '-a', f'url={settings.CRAWL_START}', 'ref_checker']
with open(artifact_dir / 'crawl.log', 'wb') as crawl_log:
p = run(com, crawl_log, timeout=600)
with open(get_crawl_log_path(), 'wb') as crawl_log:
try:
p = run(com, crawl_log, timeout=600, env={'MOIN_SCRAPY_CRAWL_CSV': str(get_crawl_csv_path())})
except subprocess.TimeoutExpired as e:
crawl_log.write(f'\n{repr(e)}\n'.encode())
raise
if p.returncode != 0:
crawl_success = False
if not crawl_success:
logging.error('crawl failed. crawl.log:')
with open('crawl.log') as f:
with open(get_crawl_log_path()) as f:
logging.error(f.read())
finally:
os.chdir(artifact_dir)
@@ -184,15 +201,16 @@ def crawl_results(request, artifact_dir) -> List[CrawlResult]:
crawl_success = request.getfixturevalue('do_crawl')
if crawl_success:
try:
with open(artifact_base_dir / 'crawl.csv') as f:
logging.info(f'reading {get_crawl_csv_path()}')
with open(get_crawl_csv_path()) as f:
in_csv = csv.DictReader(f)
return [CrawlResult(**r) for r in in_csv]
return [CrawlResult(**r) for r in in_csv], crawl_success
except Exception as e:
crawl_success = False
logging.error(f'exception reading crawl.csv {repr(e)}')
if not crawl_success:
logging.error('crawl failed')
return []
return [], crawl_success


@pytest.fixture(scope="package")
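
Because ``crawl_results`` now returns a ``(results, success)`` pair instead of a bare list, consuming tests unpack it first; a hypothetical example::

    def test_crawl_succeeded(crawl_results):
        results, crawl_success = crawl_results
        assert crawl_success
        assert len(results) > 0
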
24 changes: 24 additions & 0 deletions src/moin/cli/_tests/data/MyPage-vblank.meta
@@ -0,0 +1,24 @@
{
"action": "SAVE",
"address": "127.0.0.1",
"comment": "",
"contenttype": "text/x.moin.wiki;charset=utf-8",
"dataid": "d6af14cd8edd4df6a992c5ac52dd78bf",
"externallinks": [],
"itemid": "b35958ca34f047b0924ba38ed652ce15",
"itemlinks": [],
"itemtransclusions": [],
"itemtype": "default",
"mtime": 1680488272,
"name": [
"MyPage"
],
"name_old": [],
"namespace": "",
"revid": "484e73725601407e9f9ab0bcaa151fb6",
"sha1": "487076f6c9eb3ce9cb18fd9800a62b35383a34ee",
"size": 16,
"summary": "",
"tags": [],
"wikiname": "MyMoinMoin"
}
24 changes: 24 additions & 0 deletions src/moin/cli/_tests/data/MyPage-vblank2.meta
@@ -0,0 +1,24 @@
{
"action": "SAVE",
"address": "127.0.0.1",
"comment": "",
"contenttype": "text/x.moin.wiki;charset=utf-8",
"dataid": "d6af14cd8edd4df6a992c5ac52dd78bf",
"externallinks": [],
"itemid": "b35958ca34f047b0924ba38ed652ce15",
"itemlinks": [],
"itemtransclusions": [],
"itemtype": "default",
"mtime": 1680488273,
"name": [
"MyPage"
],
"name_old": [],
"namespace": "",
"revid": "a8a8233bc8264216915ad3137ee6c20f",
"sha1": "487076f6c9eb3ce9cb18fd9800a62b35383a34ee",
"size": 16,
"summary": "",
"tags": [],
"wikiname": "MyMoinMoin"
}
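
Both fixture files above intentionally omit ``rev_number`` (and ``parentid``), which is exactly what the fix path has to reconstruct; a quick illustrative check with the stdlib, run from the repository root::

    import json

    with open('src/moin/cli/_tests/data/MyPage-vblank.meta') as f:
        meta = json.load(f)
    assert 'rev_number' not in meta
    assert 'parentid' not in meta
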
43 changes: 32 additions & 11 deletions src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py
@@ -10,6 +10,8 @@

import csv
from dataclasses import fields, astuple
import os
from traceback import print_exc

import scrapy
from scrapy import signals
@@ -23,6 +25,7 @@
except ImportError:
from moin.cli._tests import default_settings as settings

from moin.cli._tests.conftest import get_crawl_csv_path
from moin.utils.iri import Iri
from moin import log

@@ -55,17 +58,26 @@ def from_crawler(cls, crawler, *args, **kwargs):
return spider

def spider_closed(self):
_, artifact_base_dir = get_dirs('')
for k, c in self.crawler.stats.get_stats().items(): # bubble up spider exceptions into test failures
if k.startswith('spider_exceptions'):
self.results.append(CrawlResult(response_exc=f'crawler stats: {k} = {c}'))
with open(artifact_base_dir / 'crawl.csv', 'w') as fh:
out_csv = csv.writer(fh, lineterminator='\n')
out_csv.writerow([f.name for f in fields(CrawlResult)])
for result in self.results:
out_csv.writerow(astuple(result))

def parse(self, response, **kwargs):
logging.info('entering spider_closed')
try:
_, artifact_base_dir = get_dirs('')
for k, c in self.crawler.stats.get_stats().items(): # bubble up spider exceptions into test failures
if k.startswith('spider_exceptions'):
logging.error(f'spider_exception: {c}')
self.results.append(CrawlResult(response_exc=f'crawler stats: {k} = {c}'))
crawl_csv_path = os.environ.get('MOIN_SCRAPY_CRAWL_CSV', get_crawl_csv_path())
logging.info(f'writing {len(self.results)} to {crawl_csv_path}')
with open(crawl_csv_path, 'w') as fh:
out_csv = csv.writer(fh, lineterminator='\n')
out_csv.writerow([f.name for f in fields(CrawlResult)])
for result in self.results:
out_csv.writerow(astuple(result))
except Exception as e: # noqa
logging.error(f'exception in spider_closed {repr(e)}')
print_exc()
raise

def _parse(self, response, **kwargs):
"""Main method that parses downloaded pages.

requests yielded from this method are added to the crawl queue"""
@@ -136,6 +148,15 @@ def parse(self, response, **kwargs):
request.meta['my_data'] = new_result
yield request

def parse(self, response, **kwargs):
"""called by scrapy framework"""
try:
yield from self._parse(response, **kwargs)
except Exception as e: # noqa
logging.error(f'parse exception : {repr(e)}')
print_exc()
raise

def errback(self, failure):
"""called when request comes back with anything other than a 200 OK response"""
if failure.value.__class__ is IgnoreRequest: # ignore urls disallowed by robots.txt
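
The CSV written by ``spider_closed`` can be read back with the same ``csv.DictReader`` pattern the ``crawl_results`` fixture uses; a short sketch that honours the ``MOIN_SCRAPY_CRAWL_CSV`` override::

    import csv
    import os

    from moin.cli._tests.conftest import get_crawl_csv_path

    crawl_csv_path = os.environ.get('MOIN_SCRAPY_CRAWL_CSV', get_crawl_csv_path())
    with open(crawl_csv_path) as f:
        rows = list(csv.DictReader(f))
    print(f'{len(rows)} crawl results read from {crawl_csv_path}')
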
72 changes: 57 additions & 15 deletions src/moin/cli/_tests/test_modify_item.py
@@ -9,7 +9,8 @@
from pathlib import Path

from moin._tests import get_dirs
from moin.cli._tests import run, assert_p_succcess, read_index_dump_latest_revs
from moin.cli._tests import run, assert_p_succcess, read_index_dump_latest_revs, read_index_dump
from moin.constants.keys import REVID, PARENTID, SIZE, REV_NUMBER, NAMES


def validate_meta(expected, actual, message):
@@ -172,35 +173,76 @@ def test_validate_metadata(index_create2):
assert_p_succcess(item_put)
validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v'])
assert_p_succcess(validate)
outlines = validate.stdout.splitlines()
outlines = validate.stdout.decode().splitlines()
assert 4 == len(outlines)
rev_id1 = b'7ed018d7ceda49409e18b8efb914f5ff' # Corrupt.meta
rev_id2 = b'0a2f1b476b6c42be80908b3b799df3fd' # Corrupt2.meta
rev_id3 = b'39c8fe8da0a048c0b7839bf8aa02cd04' # Corrupt3.meta
rev_id1 = '7ed018d7ceda49409e18b8efb914f5ff' # Corrupt.meta
rev_id2 = '0a2f1b476b6c42be80908b3b799df3fd' # Corrupt2.meta
rev_id3 = '39c8fe8da0a048c0b7839bf8aa02cd04' # Corrupt3.meta
rev_id4 = '484e73725601407e9f9ab0bcaa151fb6' # MyPage-v1.meta
rev_id5 = 'b0b07c407c3143aabc4d34aac1b1d303' # MyPage-v2.meta
outlines_by_rev_id = {}
for outline in outlines:
words = iter(outline.split())
for word in words:
if word == b'rev_id:':
if word == 'rev_id:':
outlines_by_rev_id[next(words)] = outline
break
assert {rev_id1, rev_id2, rev_id3} == set(outlines_by_rev_id.keys())
assert b'size_error name: Home item: cbd6fc46f88740acbc1dca90bb1eb8f3 rev_number: 1 rev_id: 7ed018d7ceda49409e18b8efb914f5ff '\
b'meta_size: 8 real_size: 11' == outlines_by_rev_id[rev_id1]
assert b'sha1_error name: Page2 item: 9999989aca5e45cc8683432f986a0e50 rev_number: 1 rev_id: 0a2f1b476b6c42be80908b3b799df3fd '\
b'meta_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197a0000 '\
b'real_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197afbfe' == outlines_by_rev_id[rev_id2]
assert b'parentid_error name: Page3 item: 3c7e36466726441faf6d7d266ac224e2 rev_number: 2 rev_id: 39c8fe8da0a048c0b7839bf8aa02cd04 '\
b'meta_parentid: 002e5210cc884010b0dd75a1c337032d correct_parentid: None meta_revision_number: 2 correct_revision_number: 1' \
assert 'size_error name: Home item: cbd6fc46f88740acbc1dca90bb1eb8f3 rev_number: 1 rev_id: 7ed018d7ceda49409e18b8efb914f5ff '\
'meta_size: 8 real_size: 11' == outlines_by_rev_id[rev_id1]
assert 'sha1_error name: Page2 item: 9999989aca5e45cc8683432f986a0e50 rev_number: 1 rev_id: 0a2f1b476b6c42be80908b3b799df3fd '\
'meta_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197a0000 '\
'real_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197afbfe' == outlines_by_rev_id[rev_id2]
assert 'parentid_error name: Page3 item: 3c7e36466726441faf6d7d266ac224e2 rev_number: 2 rev_id: 39c8fe8da0a048c0b7839bf8aa02cd04 '\
'meta_parentid: 002e5210cc884010b0dd75a1c337032d correct_parentid: None meta_revision_number: 2' \
== outlines_by_rev_id[rev_id3]
assert b'3 items with invalid metadata found' == outlines[3]
assert '3 items with invalid metadata found' == outlines[3]
validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-f'])
assert_p_succcess(validate)
outlines = validate.stdout.splitlines()
assert 2 == len(outlines)
assert 1 == len(outlines)
assert b'3 items with invalid metadata found and fixed' == outlines[0]
validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v'])
assert_p_succcess(validate)
outlines = validate.stdout.splitlines()
assert 1 == len(outlines)
assert b'0 items with invalid metadata found' == outlines[0]
# validate index is updated
index_dump = run(['moin', 'index-dump', '--no-truncate'])
metas = {m[REVID]: m for m in read_index_dump(index_dump.stdout.decode())}
assert {rev_id1, rev_id2, rev_id3, rev_id4, rev_id5} == set(metas.keys())
assert 11 == metas[rev_id1][SIZE]
assert PARENTID not in metas[rev_id3]
# create a repeated revision_number
item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-v2.meta', '-d', data_dir / 'MyPage-v2.data'])
assert_p_succcess(item_put)
validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v', '-f'])
assert_p_succcess(validate)
outlines = validate.stdout.decode().splitlines()
assert '1 items with invalid metadata found and fixed' == outlines[-1]
assert 3 == len(outlines)
outlines_by_error = {}
for outline in outlines[0:2]:
words = outline.split()
outlines_by_error[words[0]] = outline
assert {'parentid_error', 'revision_number_error'} == set(outlines_by_error.keys())
index_dump = run(['moin', 'index-dump', '--no-truncate'])
rev_numbers = {m[REV_NUMBER]: m for m in read_index_dump(index_dump.stdout.decode()) if m[NAMES] == 'MyPage'}
assert {1, 2, 3} == set(rev_numbers.keys())
assert rev_numbers[1][REVID] == rev_id4
assert rev_numbers[2][REVID] == rev_id5


def test_validate_metadata_missing_rev_num(index_create2):
moin_dir, _ = get_dirs('')
data_dir = moin_dir / 'src' / 'moin' / 'cli' / '_tests' / 'data'
item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank.meta', '-d', data_dir / 'MyPage-v1.data', '-o'])
assert_p_succcess(item_put)
item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank2.meta', '-d', data_dir / 'MyPage-v1.data', '-o'])
assert_p_succcess(item_put)
validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v', '-f'])
assert_p_succcess(validate)
index_dump = run(['moin', 'index-dump', '--no-truncate'])
print(index_dump.stdout.decode())
rev_numbers = {m[REV_NUMBER]: m for m in read_index_dump(index_dump.stdout.decode()) if m[NAMES] == 'MyPage'}
assert {1, 2} == set(rev_numbers.keys())