diff --git a/docs/admin/maintenance.rst b/docs/admin/maintenance.rst
index c9b75bf94..0cf28bd2e 100644
--- a/docs/admin/maintenance.rst
+++ b/docs/admin/maintenance.rst
@@ -35,9 +35,10 @@ The processes below check for and optionally fix the following issues:
 * size does not match size of the revision's data in bytes
 * sha1 hash does not match has of the revision's data
-* revision numbers for an item's revision should make an unbroken sequence starting at 1
 * parent id should not be present for revision number 1 of a given item
 * parent id for each revision should be the data id for the previous revision number for that item
+* every revision should have a revision number
+* an item should not have repeated revision numbers
 
 To check for invalid metadata, run the following command::
 
   moin maint-validate-metadata --all-backends
@@ -47,10 +48,7 @@ To view detailed list of invalid items::
 
   moin maint-validate-metadata --all-backends --verbose
 
-To fix issues, take your wiki offline and add ``--fix`` option to any of the above commands.
+To fix issues, add the ``--fix`` option to any of the above commands.
 
 To operate on only a selection of backends, replace ``--all--backends`` option with ``--backends``
 followed by comma separated list of backends to process
-
-If the ``--fix`` finds anything to fix, you must rebuild the index
-with the newly created metadata, see :doc:`index`
diff --git a/requirements.d/development.txt b/requirements.d/development.txt
index b18d39e3e..12af8100d 100644
--- a/requirements.d/development.txt
+++ b/requirements.d/development.txt
@@ -1,4 +1,5 @@
 tox
+psutil
 pytest
 # we use lxml.etree for xpath-based testing
 lxml
diff --git a/src/moin/_tests/__init__.py b/src/moin/_tests/__init__.py
index 8b2cebf5e..2bcffe7ef 100644
--- a/src/moin/_tests/__init__.py
+++ b/src/moin/_tests/__init__.py
@@ -11,6 +11,7 @@
 import socket
 from io import BytesIO
 from pathlib import Path
+import psutil
 from typing import Tuple
 
 from flask import g as flaskg
@@ -104,3 +105,11 @@ def get_dirs(subdir: str) -> Tuple[Path, Path]:
     if not artifacts_dir.exists():
         artifacts_dir.mkdir(parents=True)
     return moin_dir, artifacts_dir
+
+
+def get_open_wiki_files():
+    proc = psutil.Process()
+    files = [f for f in proc.open_files() if 'wiki' in f.path]
+    for file in files:
+        print(f'open wiki {file}')
+    return files
diff --git a/src/moin/cli/_tests/__init__.py b/src/moin/cli/_tests/__init__.py
index 68e2cb98f..735c2dd45 100644
--- a/src/moin/cli/_tests/__init__.py
+++ b/src/moin/cli/_tests/__init__.py
@@ -20,16 +20,19 @@
 logging = log.getLogger(__name__)
 
 
-def run(cmd: List[str], log=None, wait: bool = True, timeout: int = None) \
+def run(cmd: List[str], log=None, wait: bool = True, timeout: int = None, env=None) \
         -> Union[subprocess.CompletedProcess, subprocess.Popen]:
     """run a shell command, redirecting output to log
 
     :param cmd: list of strings containing command arguments
     :param log: open file handle to log file (binary mode) or None in which case output will be captured
    :param wait: if True return after process is complete, otherwise return immediately after start
     :param timeout: timeout setting in seconds, can only be used when wait is True
+    :param env: dictionary of environment variables to add to the current environment for the subprocess
     :return: CompletedProcess object if wait else Popen object"""
     subprocess_environ = copy(os.environ)
     subprocess_environ['PYTHONIOENCODING'] = 'cp1252'  # simulate windows terminal to ferret out encoding issues
+    if env:
+        subprocess_environ.update(env)
     logging.info(f'running {cmd}')
     if stdout := log:
         stderr = subprocess.STDOUT
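Not part of the patch: a minimal sketch of how a caller might use the new ``env`` parameter of run(). The command and the pytest ``tmp_path`` fixture are illustrative assumptions; MOIN_SCRAPY_CRAWL_CSV mirrors the variable the do_crawl fixture sets below.

# sketch only: run() copies os.environ, forces PYTHONIOENCODING=cp1252,
# then layers the extra variables passed via env on top of that copy
from moin.cli._tests import run, assert_p_succcess

def test_crawl_writes_csv_to_custom_path(tmp_path):
    crawl_csv = tmp_path / 'crawl.csv'
    p = run(['scrapy', 'crawl', 'ref_checker'],
            env={'MOIN_SCRAPY_CRAWL_CSV': str(crawl_csv)})
    assert_p_succcess(p)
    assert crawl_csv.exists()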
diff --git a/src/moin/cli/_tests/conftest.py b/src/moin/cli/_tests/conftest.py
index d7eb9d702..8d276f873 100644
--- a/src/moin/cli/_tests/conftest.py
+++ b/src/moin/cli/_tests/conftest.py
@@ -103,6 +103,16 @@ def get_crawl_server_log_path():
     return artifact_base_dir / 'server-crawl.log'
 
 
+def get_crawl_log_path():
+    _, artifact_base_dir = get_dirs('')
+    return artifact_base_dir / 'crawl.log'
+
+
+def get_crawl_csv_path():
+    _, artifact_base_dir = get_dirs('')
+    return artifact_base_dir / 'crawl.csv'
+
+
 @pytest.fixture(scope="package")
 def server(welcome, load_help, artifact_dir):
     run(['moin', 'index-build'])
@@ -140,7 +150,7 @@ def server(welcome, load_help, artifact_dir):
     if not started:
         logging.error('server not started. server.log:')
         try:
-            with open(server_log.name) as f:
+            with open(get_crawl_server_log_path()) as f:
                 logging.error(f.read())
         except IOError as e:
             logging.error(f'{repr(e)} when trying to open server log')
@@ -150,8 +160,11 @@ def server(welcome, load_help, artifact_dir):
 @pytest.fixture(scope="package")
 def do_crawl(request, artifact_dir):
     moin_dir, artifact_base_dir = get_dirs('')
-    (artifact_base_dir / 'crawl.log').touch()  # insure github workflow will have a file to archive
-    (artifact_base_dir / 'crawl.csv').touch()
+    # initialize output files
+    with open(get_crawl_log_path(), 'w'):
+        pass
+    with open(get_crawl_csv_path(), 'w'):
+        pass
     server_started = True
     crawl_success = True
     if settings.SITE_HOST == '127.0.0.1:9080':
@@ -163,13 +176,17 @@ def do_crawl(request, artifact_dir):
        os.chdir(moin_dir / 'src' / 'moin' / 'cli' / '_tests' / 'scrapy')
        try:
            com = ['scrapy', 'crawl', '-a', f'url={settings.CRAWL_START}', 'ref_checker']
-            with open(artifact_dir / 'crawl.log', 'wb') as crawl_log:
-                p = run(com, crawl_log, timeout=600)
+            with open(get_crawl_log_path(), 'wb') as crawl_log:
+                try:
+                    p = run(com, crawl_log, timeout=600, env={'MOIN_SCRAPY_CRAWL_CSV': str(get_crawl_csv_path())})
+                except subprocess.TimeoutExpired as e:
+                    crawl_log.write(f'\n{repr(e)}\n'.encode())
+                    raise
            if p.returncode != 0:
                crawl_success = False
            if not crawl_success:
                logging.error('crawl failed. crawl.log:')
-                with open('crawl.log') as f:
+                with open(get_crawl_log_path()) as f:
                    logging.error(f.read())
        finally:
            os.chdir(artifact_dir)
@@ -184,15 +201,16 @@ def crawl_results(request, artifact_dir) -> List[CrawlResult]:
     crawl_success = request.getfixturevalue('do_crawl')
     if crawl_success:
         try:
-            with open(artifact_base_dir / 'crawl.csv') as f:
+            logging.info(f'reading {get_crawl_csv_path()}')
+            with open(get_crawl_csv_path()) as f:
                 in_csv = csv.DictReader(f)
-                return [CrawlResult(**r) for r in in_csv]
+                return [CrawlResult(**r) for r in in_csv], crawl_success
         except Exception as e:
             crawl_success = False
             logging.error(f'exception reading crawl.csv {repr(e)}')
     if not crawl_success:
         logging.error('crawl failed')
-    return []
+    return [], crawl_success
 
 
 @pytest.fixture(scope="package")
diff --git a/src/moin/cli/_tests/data/MyPage-vblank.meta b/src/moin/cli/_tests/data/MyPage-vblank.meta
new file mode 100644
index 000000000..6e0899828
--- /dev/null
+++ b/src/moin/cli/_tests/data/MyPage-vblank.meta
@@ -0,0 +1,24 @@
+{
+    "action": "SAVE",
+    "address": "127.0.0.1",
+    "comment": "",
+    "contenttype": "text/x.moin.wiki;charset=utf-8",
+    "dataid": "d6af14cd8edd4df6a992c5ac52dd78bf",
+    "externallinks": [],
+    "itemid": "b35958ca34f047b0924ba38ed652ce15",
+    "itemlinks": [],
+    "itemtransclusions": [],
+    "itemtype": "default",
+    "mtime": 1680488272,
+    "name": [
+        "MyPage"
+    ],
+    "name_old": [],
+    "namespace": "",
+    "revid": "484e73725601407e9f9ab0bcaa151fb6",
+    "sha1": "487076f6c9eb3ce9cb18fd9800a62b35383a34ee",
+    "size": 16,
+    "summary": "",
+    "tags": [],
+    "wikiname": "MyMoinMoin"
+}
diff --git a/src/moin/cli/_tests/data/MyPage-vblank2.meta b/src/moin/cli/_tests/data/MyPage-vblank2.meta
new file mode 100644
index 000000000..487338df5
--- /dev/null
+++ b/src/moin/cli/_tests/data/MyPage-vblank2.meta
@@ -0,0 +1,24 @@
+{
+    "action": "SAVE",
+    "address": "127.0.0.1",
+    "comment": "",
+    "contenttype": "text/x.moin.wiki;charset=utf-8",
+    "dataid": "d6af14cd8edd4df6a992c5ac52dd78bf",
+    "externallinks": [],
+    "itemid": "b35958ca34f047b0924ba38ed652ce15",
+    "itemlinks": [],
+    "itemtransclusions": [],
+    "itemtype": "default",
+    "mtime": 1680488273,
+    "name": [
+        "MyPage"
+    ],
+    "name_old": [],
+    "namespace": "",
+    "revid": "a8a8233bc8264216915ad3137ee6c20f",
+    "sha1": "487076f6c9eb3ce9cb18fd9800a62b35383a34ee",
+    "size": 16,
+    "summary": "",
+    "tags": [],
+    "wikiname": "MyMoinMoin"
+}
diff --git a/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py b/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py
index f45e2f78a..cf661fe09 100644
--- a/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py
+++ b/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py
@@ -10,6 +10,8 @@
 
 import csv
 from dataclasses import fields, astuple
+import os
+from traceback import print_exc
 
 import scrapy
 from scrapy import signals
@@ -23,6 +25,7 @@
 except ImportError:
     from moin.cli._tests import default_settings as settings
 
 
+from moin.cli._tests.conftest import get_crawl_csv_path
 from moin.utils.iri import Iri
 from moin import log
@@ -55,17 +58,26 @@ def from_crawler(cls, crawler, *args, **kwargs):
         return spider
 
     def spider_closed(self):
-        _, artifact_base_dir = get_dirs('')
-        for k, c in self.crawler.stats.get_stats().items():  # bubble up spider exceptions into test failures
-            if k.startswith('spider_exceptions'):
-                self.results.append(CrawlResult(response_exc=f'crawler stats: {k} = {c}'))
-        with open(artifact_base_dir / 'crawl.csv', 'w') as fh:
-            out_csv = csv.writer(fh, lineterminator='\n')
-            out_csv.writerow([f.name for f in fields(CrawlResult)])
-            for result in self.results:
-                out_csv.writerow(astuple(result))
-
-    def parse(self, response, **kwargs):
+        logging.info('entering spider_closed')
+        try:
+            _, artifact_base_dir = get_dirs('')
+            for k, c in self.crawler.stats.get_stats().items():  # bubble up spider exceptions into test failures
+                if k.startswith('spider_exceptions'):
+                    logging.error(f'spider_exception: {c}')
+                    self.results.append(CrawlResult(response_exc=f'crawler stats: {k} = {c}'))
+            crawl_csv_path = os.environ.get('MOIN_SCRAPY_CRAWL_CSV', get_crawl_csv_path())
+            logging.info(f'writing {len(self.results)} to {crawl_csv_path}')
+            with open(crawl_csv_path, 'w') as fh:
+                out_csv = csv.writer(fh, lineterminator='\n')
+                out_csv.writerow([f.name for f in fields(CrawlResult)])
+                for result in self.results:
+                    out_csv.writerow(astuple(result))
+        except Exception as e:  # noqa
+            logging.error(f'exception in spider_closed {repr(e)}')
+            print_exc()
+            raise
+
+    def _parse(self, response, **kwargs):
         """Main method that parses downloaded pages.
 
         requests yielded from this method are added to the crawl queue"""
@@ -136,6 +148,15 @@ def parse(self, response, **kwargs):
         request.meta['my_data'] = new_result
         yield request
 
+    def parse(self, response, **kwargs):
+        """called by scrapy framework"""
+        try:
+            yield from self._parse(response, **kwargs)
+        except Exception as e:  # noqa
+            logging.error(f'parse exception : {repr(e)}')
+            print_exc()
+            raise
+
     def errback(self, failure):
         """called when request comes back with anything other than a 200 OK response"""
         if failure.value.__class__ is IgnoreRequest:  # ignore urls disallowed by robots.txt
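Not part of the patch: what distinguishes the two MyPage-vblank*.meta fixtures above is that neither carries a rev_number key, so ``moin maint-validate-metadata --fix`` is expected to backfill sequential numbers ordered by mtime (1680488272 becomes revision 1, 1680488273 becomes revision 2). A small helper, for illustration only and not used by the tests:

# illustrative check: confirm a .meta fixture omits the rev_number key
import json
from pathlib import Path

def rev_number_missing(meta_path: Path) -> bool:
    with open(meta_path) as f:
        return 'rev_number' not in json.load(f)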
diff --git a/src/moin/cli/_tests/test_modify_item.py b/src/moin/cli/_tests/test_modify_item.py
index 6e763ea9d..2d0f22329 100644
--- a/src/moin/cli/_tests/test_modify_item.py
+++ b/src/moin/cli/_tests/test_modify_item.py
@@ -9,7 +9,8 @@
 from pathlib import Path
 
 from moin._tests import get_dirs
-from moin.cli._tests import run, assert_p_succcess, read_index_dump_latest_revs
+from moin.cli._tests import run, assert_p_succcess, read_index_dump_latest_revs, read_index_dump
+from moin.constants.keys import REVID, PARENTID, SIZE, REV_NUMBER, NAMES
 
 
 def validate_meta(expected, actual, message):
@@ -172,35 +173,76 @@ def test_validate_metadata(index_create2):
     assert_p_succcess(item_put)
     validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v'])
     assert_p_succcess(validate)
-    outlines = validate.stdout.splitlines()
+    outlines = validate.stdout.decode().splitlines()
     assert 4 == len(outlines)
-    rev_id1 = b'7ed018d7ceda49409e18b8efb914f5ff'  # Corrupt.meta
-    rev_id2 = b'0a2f1b476b6c42be80908b3b799df3fd'  # Corrupt2.meta
-    rev_id3 = b'39c8fe8da0a048c0b7839bf8aa02cd04'  # Corrupt3.meta
+    rev_id1 = '7ed018d7ceda49409e18b8efb914f5ff'  # Corrupt.meta
+    rev_id2 = '0a2f1b476b6c42be80908b3b799df3fd'  # Corrupt2.meta
+    rev_id3 = '39c8fe8da0a048c0b7839bf8aa02cd04'  # Corrupt3.meta
+    rev_id4 = '484e73725601407e9f9ab0bcaa151fb6'  # MyPage-v1.meta
+    rev_id5 = 'b0b07c407c3143aabc4d34aac1b1d303'  # MyPage-v2.meta
     outlines_by_rev_id = {}
     for outline in outlines:
         words = iter(outline.split())
         for word in words:
-            if word == b'rev_id:':
+            if word == 'rev_id:':
                 outlines_by_rev_id[next(words)] = outline
                 break
     assert {rev_id1, rev_id2, rev_id3} == set(outlines_by_rev_id.keys())
-    assert b'size_error name: Home item: cbd6fc46f88740acbc1dca90bb1eb8f3 rev_number: 1 rev_id: 7ed018d7ceda49409e18b8efb914f5ff '\
-        b'meta_size: 8 real_size: 11' == outlines_by_rev_id[rev_id1]
-    assert b'sha1_error name: Page2 item: 9999989aca5e45cc8683432f986a0e50 rev_number: 1 rev_id: 0a2f1b476b6c42be80908b3b799df3fd '\
-        b'meta_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197a0000 '\
-        b'real_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197afbfe' == outlines_by_rev_id[rev_id2]
-    assert b'parentid_error name: Page3 item: 3c7e36466726441faf6d7d266ac224e2 rev_number: 2 rev_id: 39c8fe8da0a048c0b7839bf8aa02cd04 '\
-        b'meta_parentid: 002e5210cc884010b0dd75a1c337032d correct_parentid: None meta_revision_number: 2 correct_revision_number: 1' \
+    assert 'size_error name: Home item: cbd6fc46f88740acbc1dca90bb1eb8f3 rev_number: 1 rev_id: 7ed018d7ceda49409e18b8efb914f5ff '\
+        'meta_size: 8 real_size: 11' == outlines_by_rev_id[rev_id1]
+    assert 'sha1_error name: Page2 item: 9999989aca5e45cc8683432f986a0e50 rev_number: 1 rev_id: 0a2f1b476b6c42be80908b3b799df3fd '\
+        'meta_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197a0000 '\
+        'real_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197afbfe' == outlines_by_rev_id[rev_id2]
+    assert 'parentid_error name: Page3 item: 3c7e36466726441faf6d7d266ac224e2 rev_number: 2 rev_id: 39c8fe8da0a048c0b7839bf8aa02cd04 '\
+        'meta_parentid: 002e5210cc884010b0dd75a1c337032d correct_parentid: None meta_revision_number: 2' \
         == outlines_by_rev_id[rev_id3]
-    assert b'3 items with invalid metadata found' == outlines[3]
+    assert '3 items with invalid metadata found' == outlines[3]
     validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-f'])
     assert_p_succcess(validate)
     outlines = validate.stdout.splitlines()
-    assert 2 == len(outlines)
+    assert 1 == len(outlines)
     assert b'3 items with invalid metadata found and fixed' == outlines[0]
     validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v'])
     assert_p_succcess(validate)
     outlines = validate.stdout.splitlines()
     assert 1 == len(outlines)
     assert b'0 items with invalid metadata found' == outlines[0]
+    # validate index is updated
+    index_dump = run(['moin', 'index-dump', '--no-truncate'])
+    metas = {m[REVID]: m for m in read_index_dump(index_dump.stdout.decode())}
+    assert {rev_id1, rev_id2, rev_id3, rev_id4, rev_id5} == set(metas.keys())
+    assert 11 == metas[rev_id1][SIZE]
+    assert PARENTID not in metas[rev_id3]
+    # create a repeated revision_number
+    item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-v2.meta', '-d', data_dir / 'MyPage-v2.data'])
+    assert_p_succcess(item_put)
+    validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v', '-f'])
+    assert_p_succcess(validate)
+    outlines = validate.stdout.decode().splitlines()
+    assert '1 items with invalid metadata found and fixed' == outlines[-1]
+    assert 3 == len(outlines)
+    outlines_by_error = {}
+    for outline in outlines[0:2]:
+        words = outline.split()
+        outlines_by_error[words[0]] = outline
+    assert {'parentid_error', 'revision_number_error'} == set(outlines_by_error.keys())
+    index_dump = run(['moin', 'index-dump', '--no-truncate'])
+    rev_numbers = {m[REV_NUMBER]: m for m in read_index_dump(index_dump.stdout.decode()) if m[NAMES] == 'MyPage'}
+    assert {1, 2, 3} == set(rev_numbers.keys())
+    assert rev_numbers[1][REVID] == rev_id4
+    assert rev_numbers[2][REVID] == rev_id5
+
+
+def test_validate_metadata_missing_rev_num(index_create2):
+    moin_dir, _ = get_dirs('')
+    data_dir = moin_dir / 'src' / 'moin' / 'cli' / '_tests' / 'data'
+    item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank.meta', '-d', data_dir / 'MyPage-v1.data', '-o'])
+    assert_p_succcess(item_put)
+    item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank2.meta', '-d', data_dir / 'MyPage-v1.data', '-o'])
+    assert_p_succcess(item_put)
+    validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v', '-f'])
+    assert_p_succcess(validate)
+    index_dump = run(['moin', 'index-dump', '--no-truncate'])
+    print(index_dump.stdout.decode())
+    rev_numbers = {m[REV_NUMBER]: m for m in read_index_dump(index_dump.stdout.decode()) if m[NAMES] == 'MyPage'}
+    assert {1, 2} == set(rev_numbers.keys())
diff --git a/src/moin/cli/_tests/test_scrapy_crawl.py b/src/moin/cli/_tests/test_scrapy_crawl.py
index 713a62443..adb45d8fe 100644
--- a/src/moin/cli/_tests/test_scrapy_crawl.py
+++ b/src/moin/cli/_tests/test_scrapy_crawl.py
@@ -13,6 +13,7 @@ except ImportError:
     from moin.cli._tests import default_settings as settings
 
 from moin.cli._tests.scrapy.moincrawler.items import CrawlResultMatch
+from moin.cli._tests.conftest import get_crawl_log_path
 from moin.utils.iri import Iri
 
 from moin import log
@@ -51,14 +52,19 @@ def is_expected_404(self, r):
         return self._matches_one_of(r, self.EXPECTED_404)
 
     def test_home_page(self, crawl_results):
-        assert len(crawl_results) > 0
-        r = crawl_results[0]
+        assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}'
+        for line in open(get_crawl_log_path(), 'rb'):
+            if b'crawl.csv' in line:
+                logging.info(f'{line} from {get_crawl_log_path()}')
+        assert len(crawl_results[0]) > 0
+        r = crawl_results[0][0]
         expected = CrawlResultMatch(url='/Home')
         assert expected.match(r), f'unexpected redirect for / {r}'
 
     def test_200(self, crawl_results):
+        assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}'
         failures = []
-        for r in [r for r in crawl_results if r.url
+        for r in [r for r in crawl_results[0] if r.url
                   and r.url.authority == settings.SITE_HOST and not self.is_known_issue(r)]:
             if 'Discussion' in r.url.path:
                 expected = {200, 404}
@@ -73,8 +79,9 @@ def test_200(self, crawl_results):
 
     @pytest.mark.xfail(reason='issue #1414 - remaining bad links in help')
     def test_expected_failures(self, crawl_results):
+        assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}'
         failures = []
-        for r in [r for r in crawl_results if self.is_known_issue(r)]:
+        for r in [r for r in crawl_results[0] if self.is_known_issue(r)]:
             if r.response_code != 200:
                 logging.info(f'known issue {r}')
                 failures.append(r)
@@ -85,12 +92,13 @@ def test_known_issues_exist(self, crawl_results):
         """enable this test to check for KNOWN_ISSUES which can be removed
         after removing, be sure to confirm by crawling a host with non-blank SITE_WIKI_ROOT
         as some issues only exist when moin is running behind apache"""
+        assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}'
         fixed = []
         for m in self.KNOWN_ISSUES:
             seen = False
             my_fixed = []
             my_not_fixed = []
-            for r in [r for r in crawl_results if m.match(r)]:
+            for r in [r for r in crawl_results[0] if m.match(r)]:
                 seen = True
                 if r.response_code == 200:
                     my_fixed.append(r)
@@ -106,8 +114,9 @@ def test_known_issues_exist(self, crawl_results):
         assert len(fixed) == 0
 
     def test_valid_request(self, crawl_results):
+        assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}'
         failures = []
-        for r in [r for r in crawl_results if not self.is_known_issue(r)]:
+        for r in [r for r in crawl_results[0] if not self.is_known_issue(r)]:
             if not r.response_code:
                 logging.error(f'no response code for {r}')
                 failures.append(r)
diff --git a/src/moin/cli/_util.py b/src/moin/cli/_util.py
index 5ea1bac83..805b3159e 100644
--- a/src/moin/cli/_util.py
+++ b/src/moin/cli/_util.py
@@ -9,6 +9,9 @@
 from flask import current_app as app
 
 from moin.storage.backends.stores import Backend
+from moin import log
+
+logging = log.getLogger(__name__)
 
 
 def get_backends(backends: Optional[str], all_backends: bool) -> Set[Backend]:
@@ -27,4 +30,5 @@ def get_backends(backends: Optional[str], all_backends: bool) -> Set[Backend]:
         print("Given Backends: %r" % backends)
         print("Configured Backends: %r" % existing_backends)
     else:
+        logging.warning('no backends specified')
         return set()
diff --git a/src/moin/cli/maint/modify_item.py b/src/moin/cli/maint/modify_item.py
index 9be7cb2f2..f919d8734 100644
--- a/src/moin/cli/maint/modify_item.py
+++ b/src/moin/cli/maint/modify_item.py
@@ -8,7 +8,7 @@
 """
 
 from collections import defaultdict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import json
 import io
 import os
@@ -229,7 +229,15 @@ def _fix_if_bad(bad, meta, data, bad_revids, fix, backend):
     if bad:
         bad_revids.add(meta[REVID])
         if fix:
-            backend.store(meta, data)
+            try:
+                item = app.storage.get_item(itemid=meta[ITEMID])
+                rev = item.get_revision(meta[REVID])
+                dict(rev.meta)  # force load to validate rev is in index
+            except KeyError:
+                logging.warning(f'bad revision not found in index {get_rev_str(meta)}')
+                backend.store(meta, data)
+            else:
+                item.store_revision(meta, data, overwrite=True, trusted=True)
 
 
 @dataclass
@@ -239,10 +247,11 @@ class RevData:
     rev_number: int
     mtime: int
     parent_id: str = None
-    is_bad: bool = False
+    issues: list[str] = field(default_factory=list)
 
 
 def ValidateMetadata(backends=None, all_backends=False, verbose=False, fix=False):
+    flaskg.add_lineno_attr = False
     backends = get_backends(backends, all_backends)
    bad_revids = set()
    for backend in backends:
@@ -255,31 +264,39 @@ def ValidateMetadata(backends=None, all_backends=False, verbose=False, fix=False):
                for issue in issues:
                    print(issue)
            _fix_if_bad(bad, meta, data, bad_revids, fix, backend)
+        # fix bad parentid references and repeated or missing revision numbers
        for item_id, rev_datum in revs.items():
            rev_datum.sort(key=lambda r: (r.rev_number, r.mtime))
-            rev_number = 0
            prev_rev_data = None
            for rev_data in rev_datum:
-                rev_number += 1
-                if rev_data.rev_number != rev_number:
-                    rev_data.rev_number = rev_number
-                    rev_data.is_bad = True
                if prev_rev_data is None:
                    if rev_data.parent_id:
-                        rev_data.is_bad = True
+                        rev_data.issues.append('parentid_error')
                        rev_data.parent_id = None
-                elif rev_data.parent_id != prev_rev_data.rev_id:
-                    rev_data.parent_id = prev_rev_data.rev_id
-                    rev_data.is_bad = True
+                    if rev_data.rev_number == -1:
+                        rev_data.issues.append('revision_number_error')
+                        rev_data.rev_number = 1
+                else:  # prev_rev_data is not None
+                    if rev_data.parent_id != prev_rev_data.rev_id:
+                        rev_data.parent_id = prev_rev_data.rev_id
+                        rev_data.issues.append('parentid_error')
+                    if rev_data.rev_number <= prev_rev_data.rev_number:
+                        rev_data.rev_number = prev_rev_data.rev_number + 1
+                        rev_data.issues.append('revision_number_error')
                prev_rev_data = rev_data
-            for rev_data in [r for r in rev_datum if r.is_bad]:
+            for rev_data in [r for r in rev_datum if r.issues]:
                bad = True
                meta, data = backend.retrieve(rev_data.rev_id)
                rev_str = get_rev_str(meta)
                if verbose:
-                    print(f'parentid_error {rev_str} meta_parentid: {meta.get(PARENTID)} '
-                          f'correct_parentid: {rev_data.parent_id} meta_revision_number: {meta.get(REV_NUMBER)} '
-                          f'correct_revision_number: {rev_data.rev_number}')
+                    for issue in rev_data.issues:
+                        if issue == 'parentid_error':
+                            print(f'{issue} {rev_str} meta_parentid: {meta.get(PARENTID)} '
+                                  f'correct_parentid: {rev_data.parent_id} '
+                                  f'meta_revision_number: {meta.get(REV_NUMBER)}')
+                        else:  # issue == 'revision_number_error'
+                            print(f'{issue} {rev_str} meta_revision_number: {meta.get(REV_NUMBER)} '
+                                  f'correct_revision_number: {rev_data.rev_number}')
                if rev_data.parent_id:
                    meta[PARENTID] = rev_data.parent_id
                else:
@@ -290,9 +307,6 @@ def ValidateMetadata(backends=None, all_backends=False, verbose=False, fix=False):
                meta[REV_NUMBER] = rev_data.rev_number
            _fix_if_bad(bad, meta, data, bad_revids, fix, backend)
    print(f'{len(bad_revids)} items with invalid metadata found{" and fixed" if fix else ""}')
-    if fix and len(bad_revids):
-        print('item metadata has been updated, you will need to run moin index-destroy; moin index-create; moin '
-              'index-build to update the index')
    return bad_revids
 
 
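Not part of the patch: a standalone sketch of the repair walk that ValidateMetadata now performs per item. Revisions are sorted by (rev_number, mtime); the first revision loses any parent id and is assigned number 1 if it had none; each later revision must point at its predecessor and carry a strictly increasing number. The class and function names below are simplified stand-ins for RevData and the inline loop.

# simplified stand-in for the RevData walk in ValidateMetadata (sketch only)
from dataclasses import dataclass, field

@dataclass
class Rev:
    rev_id: str
    rev_number: int          # -1 means "missing", as in RevData
    mtime: int
    parent_id: str = None
    issues: list = field(default_factory=list)

def repair(revs):
    revs.sort(key=lambda r: (r.rev_number, r.mtime))
    prev = None
    for rev in revs:
        if prev is None:
            if rev.parent_id:
                rev.issues.append('parentid_error')
                rev.parent_id = None
            if rev.rev_number == -1:
                rev.issues.append('revision_number_error')
                rev.rev_number = 1
        else:
            if rev.parent_id != prev.rev_id:
                rev.parent_id = prev.rev_id
                rev.issues.append('parentid_error')
            if rev.rev_number <= prev.rev_number:
                rev.rev_number = prev.rev_number + 1
                rev.issues.append('revision_number_error')
        prev = rev
    return [r for r in revs if r.issues]

# a duplicated revision number: the later duplicate (by mtime) is renumbered
flagged = repair([Rev('a1', 1, 100),
                  Rev('b2', 2, 200, parent_id='a1'),
                  Rev('c3', 2, 300, parent_id='b2')])
assert [r.rev_id for r in flagged] == ['c3']
assert flagged[0].rev_number == 3 and flagged[0].issues == ['revision_number_error']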
diff --git a/src/moin/conftest.py b/src/moin/conftest.py
index bf15783e5..b2c3576bd 100644
--- a/src/moin/conftest.py
+++ b/src/moin/conftest.py
@@ -22,7 +22,7 @@
 import moin.log
 import moin
 from moin.app import create_app_ext, destroy_app, before_wiki, teardown_wiki
-from moin._tests import wikiconfig
+from moin._tests import wikiconfig, get_open_wiki_files
 from moin.storage import create_simple_mapping
 
 
@@ -70,7 +70,12 @@ def app_ctx(cfg):
     teardown_wiki('')
     ctx.pop()
 
-    destroy_app(app)
+    try:
+        # simulate ERROR PermissionError:
+        # [WinError 32] The process cannot access the file because it is being used by another process
+        assert [] == get_open_wiki_files()
+    finally:
+        destroy_app(app)
 
 
 @pytest.fixture(autouse=True)
diff --git a/src/moin/storage/middleware/_tests/test_indexing.py b/src/moin/storage/middleware/_tests/test_indexing.py
index 2fe92029f..38b060c8c 100644
--- a/src/moin/storage/middleware/_tests/test_indexing.py
+++ b/src/moin/storage/middleware/_tests/test_indexing.py
@@ -16,7 +16,8 @@
 
 from moin.constants.keys import (NAME, NAME_EXACT, SIZE, ITEMID, REVID, DATAID,
                                  HASH_ALGORITHM, CONTENT, COMMENT,
                                  LATEST_REVS, ALL_REVS, NAMESPACE, NAMERE, NAMEPREFIX,
-                                 CONTENTTYPE, ITEMTYPE, ITEMLINKS, REV_NUMBER)
+                                 CONTENTTYPE, ITEMTYPE, ITEMLINKS, REV_NUMBER,
+                                 PARENTID, MTIME)
 
 from moin.constants.namespaces import NAMESPACE_USERS
@@ -82,29 +83,43 @@ def test_overwrite_revision(self):
         assert len(revids) == 1  # we still have the revision, cleared
         assert revid in revids  # it is still same revid
 
-    def test_destroy_revision(self):
+    def _store_three_revs(self, acl=None):
         item_name = 'foo'
         item = self.imw[item_name]
-        rev = item.store_revision(dict(name=[item_name, ], mtime=1),
+        rev = item.store_revision(dict(name=[item_name, ], mtime=1, acl=acl),
                                   BytesIO(b'bar'), trusted=True, return_rev=True)
         revid0 = rev.revid
-        rev = item.store_revision(dict(name=[item_name, ], mtime=2),
+        rev = item.store_revision(dict(name=[item_name, ], mtime=2, parentid=revid0, acl=acl),
                                   BytesIO(b'baz'), trusted=True, return_rev=True)
         revid1 = rev.revid
-        rev = item.store_revision(dict(name=[item_name, ], mtime=3),
+        rev = item.store_revision(dict(name=[item_name, ], mtime=3, parentid=revid1, acl=acl),
                                   BytesIO(b'...'), trusted=True, return_rev=True)
         revid2 = rev.revid
         print("revids:", revid0, revid1, revid2)
+        return item, item_name, revid0, revid1, revid2
+
+    def test_destroy_revision(self):
+        item, item_name, revid0, revid1, revid2 = self._store_three_revs()
+        query = Term(NAME_EXACT, item_name)
+        metas = {m[REVID]: m for m in flaskg.storage.search_meta(query, idx_name=ALL_REVS)}
+        rev1_mtime = metas[revid1][MTIME]
         # destroy a non-current revision:
         item.destroy_revision(revid0)
         # check if the revision was destroyed:
-        item = self.imw[item_name]
-        query = Term(NAME_EXACT, item_name)
-        metas = flaskg.storage.search_meta(query, idx_name=ALL_REVS)
-        revids = [meta[REVID] for meta in metas]
+        metas = {m[REVID]: m for m in flaskg.storage.search_meta(query, idx_name=ALL_REVS)}
+        revids = list(metas.keys())
         print("after destroy revid0", revids)
         assert sorted(revids) == sorted([revid1, revid2])
+        # validate parent id of remaining revision is updated
+        assert PARENTID not in metas[revid1]
+        # validate mtime not updated
+        assert rev1_mtime == metas[revid1][MTIME]
+        # validate revid2 is still the current one
+        metas = {m[REVID]: m for m in flaskg.storage.search_meta(query)}
+        assert 1 == len(metas)
+        assert revid2 in metas
         # destroy a current revision:
+        item = self.imw[item_name]
         item.destroy_revision(revid2)
         # check if the revision was destroyed:
         item = self.imw[item_name]
@@ -123,6 +138,14 @@ def test_destroy_revision(self):
         print("after destroy revid1", revids)
         assert sorted(revids) == sorted([])
 
+    def test_destroy_middle_revision(self):
+        item, item_name, revid0, revid1, revid2 = self._store_three_revs()
+        # destroy the middle revision:
+        item.destroy_revision(revid1)
+        with item.get_revision(revid2) as rev:
+            # validate that the parentid of remaining rev was updated
+            assert rev.meta[PARENTID] == revid0
+
     def test_destroy_item(self):
         revids = []
         item_name = 'foo'
diff --git a/src/moin/storage/middleware/_tests/test_protecting.py b/src/moin/storage/middleware/_tests/test_protecting.py
index 122dc0080..951da6e24 100644
--- a/src/moin/storage/middleware/_tests/test_protecting.py
+++ b/src/moin/storage/middleware/_tests/test_protecting.py
@@ -9,6 +9,8 @@
 
 import pytest
 
+from moin.constants.keys import PARENTID
+
 from ..protecting import ProtectingMiddleware, AccessDenied
 
 from .test_indexing import TestIndexingMiddleware
@@ -113,6 +115,14 @@ def test_destroy_revision(self):
         with pytest.raises(AccessDenied):
             item.destroy_revision(revid_protected)
 
+    def test_destroy_middle_revision(self):
+        item, item_name, revid0, revid1, revid2 = self._store_three_revs('joe:read,write,destroy')
+        # destroy the middle revision:
+        item.destroy_revision(revid1)
+        with item.get_revision(revid2) as rev:
+            # validate that the parentid of remaining rev was updated
+            assert rev.meta[PARENTID] == revid0
+
     def test_destroy_item(self):
         revid_unprotected, revid_protected = self.make_items('joe:destroy', 'boss:destroy')
         # now testing:
diff --git a/src/moin/storage/middleware/indexing.py b/src/moin/storage/middleware/indexing.py
index 567387d40..3b08f85a7 100644
--- a/src/moin/storage/middleware/indexing.py
+++ b/src/moin/storage/middleware/indexing.py
@@ -48,6 +48,7 @@
 usually it is even just the small and thus quick latest-revs index.
 """
 
+import gc
 import os
 import sys
 import shutil
@@ -554,14 +555,19 @@ def move_index(self):
         index_dir, index_dir_tmp = params[0], params_tmp[0]
         os.rename(index_dir_tmp, index_dir)
 
-    def index_revision(self, meta, content, backend_name, async_=True):
+    def index_revision(self, meta, content, backend_name, async_=True, force_latest=True):
         """
         Index a single revision, add it to all-revs and latest-revs index.
 
         :param meta: metadata dict
         :param content: preprocessed (filtered) indexable content
         :param async_: if True, use the AsyncWriter, otherwise use normal writer
+        :param force_latest: True - unconditionally store this rev in LATEST_REVS
+                             False - store in LATEST_REVS only if this rev has the most recent MTIME
+                             for the item; this also overrides the async_ parameter to False
         """
+        if not force_latest:
+            async_ = False  # must wait for storage in ALL_REVS before check for latest
         doc = backend_to_index(meta, content, self.schemas[ALL_REVS], self.wikiname, backend_name)
         if async_:
             writer = AsyncWriter(self.ix[ALL_REVS])
@@ -569,13 +575,21 @@ def index_revision(self, meta, content, backend_name, async_=True):
             writer = self.ix[ALL_REVS].writer()
         with writer as writer:
             writer.update_document(**doc)  # update, because store_revision() may give us an existing revid
-        doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname, backend_name)
-        if async_:
-            writer = AsyncWriter(self.ix[LATEST_REVS])
+        if force_latest:
+            is_latest = True
         else:
-            writer = self.ix[LATEST_REVS].writer()
-        with writer as writer:
-            writer.update_document(**doc)
+            with self.ix[ALL_REVS].searcher() as searcher:
+                is_latest = (searcher.search(Term(ITEMID, doc[ITEMID]),
+                                             sortedby=FieldFacet(MTIME, reverse=True),
+                                             limit=1)[0][REVID] == doc[REVID])
+        if is_latest:
+            doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname, backend_name)
+            if async_:
+                writer = AsyncWriter(self.ix[LATEST_REVS])
+            else:
+                writer = self.ix[LATEST_REVS].writer()
+            with writer as writer:
+                writer.update_document(**doc)
 
     def remove_revision(self, revid, async_=True):
         """
@@ -1240,7 +1254,8 @@ def store_revision(self, meta, data, overwrite=False,
         data.seek(0)  # rewind file
         backend_name, revid = backend.store(meta, data)
         meta[REVID] = revid
-        self.indexer.index_revision(meta, content, backend_name)
+        self.indexer.index_revision(meta, content, backend_name, force_latest=not overwrite)
+        gc.collect()  # triggers close of index files from is_latest search
         if not overwrite:
             self._current = get_indexer(self.indexer._document, revid=revid)
         if return_rev:
@@ -1264,6 +1279,17 @@ def destroy_revision(self, revid):
             refcount = len(list(searcher.document_numbers(**query)))
         self.backend.remove(rev.backend_name, revid, destroy_data=refcount == 1)
         self.indexer.remove_revision(revid)
+        my_parent = rev.meta.get(PARENTID)
+        with flaskg.storage.indexer.ix[ALL_REVS].searcher() as searcher:
+            for hit in searcher.search(Term(PARENTID, revid)):
+                doc = hit.fields()
+                with Revision(self, doc[REVID], doc=doc) as child_rev:
+                    child_meta = dict(child_rev.meta)
+                    if my_parent:
+                        child_meta[PARENTID] = my_parent
+                    else:
+                        del child_meta[PARENTID]
+                    self.store_revision(child_meta, child_rev.data, overwrite=True, trusted=True)
 
     def destroy_all_revisions(self):
         """
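Not part of the patch: a sketch of the idea behind force_latest=False. After the revision is written to ALL_REVS synchronously, a whoosh search sorted by MTIME descending decides whether this revid is the item's newest and therefore belongs in LATEST_REVS; the wrapper function name below is illustrative only.

# sketch of the LATEST_REVS decision used when a revision is overwritten
from whoosh.query import Term
from whoosh.sorting import FieldFacet

from moin.constants.keys import ITEMID, MTIME, REVID

def is_latest_rev(all_revs_index, itemid, revid):  # illustrative wrapper
    with all_revs_index.searcher() as searcher:
        newest = searcher.search(Term(ITEMID, itemid),
                                 sortedby=FieldFacet(MTIME, reverse=True),
                                 limit=1)[0][REVID]
    return newest == revid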
diff --git a/src/moin/storage/middleware/validation.py b/src/moin/storage/middleware/validation.py
index 2f6230077..c08d92d82 100644
--- a/src/moin/storage/middleware/validation.py
+++ b/src/moin/storage/middleware/validation.py
@@ -87,7 +87,7 @@ def userid_validator(element, state):
     if userid is None:
         # unknown user is acceptable
         return True
-    return uuid_validator(element, state)
+    return element.raw is Unset or uuid_validator(element, state)
 
 
 def name_validator(element, state):
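Not part of the patch: the validation.py change accepts a userid whose raw value was never supplied (flatland's Unset sentinel) and only runs the uuid check when a raw value is present. A rough pure-Python analogue of that guard, with stand-in names:

# pure-Python analogue of the added guard (sketch only; the real code tests
# flatland's element.raw against its Unset sentinel, not this stand-in)
_UNSET = object()  # stand-in for flatland's Unset

def accept_userid(raw=_UNSET, uuid_ok=lambda value: len(value) == 32):
    # a never-supplied value passes; a supplied value must look like a uuid
    return raw is _UNSET or uuid_ok(raw)

assert accept_userid()                      # raw never set -> valid
assert accept_userid('0' * 32)              # 32 hex chars, uuid-shaped -> valid
assert not accept_userid('not-a-uuid')      # supplied but malformed -> invalid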