From efcf06ab63bb71bd01bb9cbe9cc0d199c8007d62 Mon Sep 17 00:00:00 2001 From: bylsmad <25931535+bylsmad@users.noreply.github.com> Date: Sun, 21 May 2023 06:54:13 -0700 Subject: [PATCH 1/8] repairing parentid for destroy rev, fixes #1448 also updating maint-validate-metadata, final fix for #1402 * updating index when fixing items * not replacing revision numbers for gaps in sequence --- docs/admin/maintenance.rst | 1 - src/moin/cli/_tests/test_modify_item.py | 39 ++++++++++++------- src/moin/cli/_util.py | 4 ++ src/moin/cli/maint/modify_item.py | 22 +++++------ .../middleware/_tests/test_indexing.py | 29 ++++++++++---- .../middleware/_tests/test_protecting.py | 10 +++++ src/moin/storage/middleware/indexing.py | 11 ++++++ 7 files changed, 81 insertions(+), 35 deletions(-) diff --git a/docs/admin/maintenance.rst b/docs/admin/maintenance.rst index c9b75bf94..d0d03b927 100644 --- a/docs/admin/maintenance.rst +++ b/docs/admin/maintenance.rst @@ -35,7 +35,6 @@ The processes below check for and optionally fix the following issues: * size does not match size of the revision's data in bytes * sha1 hash does not match has of the revision's data -* revision numbers for an item's revision should make an unbroken sequence starting at 1 * parent id should not be present for revision number 1 of a given item * parent id for each revision should be the data id for the previous revision number for that item diff --git a/src/moin/cli/_tests/test_modify_item.py b/src/moin/cli/_tests/test_modify_item.py index 6e763ea9d..bc83d6eb2 100644 --- a/src/moin/cli/_tests/test_modify_item.py +++ b/src/moin/cli/_tests/test_modify_item.py @@ -9,7 +9,8 @@ from pathlib import Path from moin._tests import get_dirs -from moin.cli._tests import run, assert_p_succcess, read_index_dump_latest_revs +from moin.cli._tests import run, assert_p_succcess, read_index_dump_latest_revs, read_index_dump +from moin.constants.keys import REVID, PARENTID, SIZE def validate_meta(expected, actual, message): @@ -172,35 
+173,43 @@ def test_validate_metadata(index_create2): assert_p_succcess(item_put) validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v']) assert_p_succcess(validate) - outlines = validate.stdout.splitlines() + outlines = validate.stdout.decode().splitlines() assert 4 == len(outlines) - rev_id1 = b'7ed018d7ceda49409e18b8efb914f5ff' # Corrupt.meta - rev_id2 = b'0a2f1b476b6c42be80908b3b799df3fd' # Corrupt2.meta - rev_id3 = b'39c8fe8da0a048c0b7839bf8aa02cd04' # Corrupt3.meta + rev_id1 = '7ed018d7ceda49409e18b8efb914f5ff' # Corrupt.meta + rev_id2 = '0a2f1b476b6c42be80908b3b799df3fd' # Corrupt2.meta + rev_id3 = '39c8fe8da0a048c0b7839bf8aa02cd04' # Corrupt3.meta + rev_id4 = '484e73725601407e9f9ab0bcaa151fb6' # MyPage-v1.meta + rev_id5 = 'b0b07c407c3143aabc4d34aac1b1d303' # MyPage-v2.meta outlines_by_rev_id = {} for outline in outlines: words = iter(outline.split()) for word in words: - if word == b'rev_id:': + if word == 'rev_id:': outlines_by_rev_id[next(words)] = outline break assert {rev_id1, rev_id2, rev_id3} == set(outlines_by_rev_id.keys()) - assert b'size_error name: Home item: cbd6fc46f88740acbc1dca90bb1eb8f3 rev_number: 1 rev_id: 7ed018d7ceda49409e18b8efb914f5ff '\ - b'meta_size: 8 real_size: 11' == outlines_by_rev_id[rev_id1] - assert b'sha1_error name: Page2 item: 9999989aca5e45cc8683432f986a0e50 rev_number: 1 rev_id: 0a2f1b476b6c42be80908b3b799df3fd '\ - b'meta_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197a0000 '\ - b'real_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197afbfe' == outlines_by_rev_id[rev_id2] - assert b'parentid_error name: Page3 item: 3c7e36466726441faf6d7d266ac224e2 rev_number: 2 rev_id: 39c8fe8da0a048c0b7839bf8aa02cd04 '\ - b'meta_parentid: 002e5210cc884010b0dd75a1c337032d correct_parentid: None meta_revision_number: 2 correct_revision_number: 1' \ + assert 'size_error name: Home item: cbd6fc46f88740acbc1dca90bb1eb8f3 rev_number: 1 rev_id: 7ed018d7ceda49409e18b8efb914f5ff '\ + 'meta_size: 8 real_size: 11' == 
outlines_by_rev_id[rev_id1] + assert 'sha1_error name: Page2 item: 9999989aca5e45cc8683432f986a0e50 rev_number: 1 rev_id: 0a2f1b476b6c42be80908b3b799df3fd '\ + 'meta_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197a0000 '\ + 'real_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197afbfe' == outlines_by_rev_id[rev_id2] + assert 'parentid_error name: Page3 item: 3c7e36466726441faf6d7d266ac224e2 rev_number: 2 rev_id: 39c8fe8da0a048c0b7839bf8aa02cd04 '\ + 'meta_parentid: 002e5210cc884010b0dd75a1c337032d correct_parentid: None meta_revision_number: 2' \ == outlines_by_rev_id[rev_id3] - assert b'3 items with invalid metadata found' == outlines[3] + assert '3 items with invalid metadata found' == outlines[3] validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-f']) assert_p_succcess(validate) outlines = validate.stdout.splitlines() - assert 2 == len(outlines) + assert 1 == len(outlines) assert b'3 items with invalid metadata found and fixed' == outlines[0] validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v']) assert_p_succcess(validate) outlines = validate.stdout.splitlines() assert 1 == len(outlines) assert b'0 items with invalid metadata found' == outlines[0] + # validate index is updated + index_dump = run(['moin', 'index-dump', '--no-truncate']) + metas = {m[REVID]: m for m in read_index_dump(index_dump.stdout.decode())} + assert {rev_id1, rev_id2, rev_id3, rev_id4, rev_id5} == set(metas.keys()) + assert 11 == metas[rev_id1][SIZE] + assert PARENTID not in metas[rev_id3] diff --git a/src/moin/cli/_util.py b/src/moin/cli/_util.py index 5ea1bac83..805b3159e 100644 --- a/src/moin/cli/_util.py +++ b/src/moin/cli/_util.py @@ -9,6 +9,9 @@ from flask import current_app as app from moin.storage.backends.stores import Backend +from moin import log + +logging = log.getLogger(__name__) def get_backends(backends: Optional[str], all_backends: bool) -> Set[Backend]: @@ -27,4 +30,5 @@ def get_backends(backends: Optional[str], all_backends: bool) -> Set[Backend]: 
print("Given Backends: %r" % backends) print("Configured Backends: %r" % existing_backends) else: + logging.warning('no backends specified') return set() diff --git a/src/moin/cli/maint/modify_item.py b/src/moin/cli/maint/modify_item.py index 9be7cb2f2..49290ab74 100644 --- a/src/moin/cli/maint/modify_item.py +++ b/src/moin/cli/maint/modify_item.py @@ -23,6 +23,7 @@ from moin.cli._util import get_backends from moin.storage.middleware.serialization import get_rev_str, correcting_rev_iter from moin.constants.keys import CURRENT, ITEMID, DATAID, NAMESPACE, WIKINAME, REVID, PARENTID, REV_NUMBER, MTIME +from moin.storage.error import NoSuchItemError from moin.utils.interwiki import split_fqname from moin.items import Item @@ -229,7 +230,12 @@ def _fix_if_bad(bad, meta, data, bad_revids, fix, backend): if bad: bad_revids.add(meta[REVID]) if fix: - backend.store(meta, data) + try: + item = app.storage.existing_item(revid=meta[REVID]) + except NoSuchItemError: + backend.store(meta, data) + else: + item.store_revision(meta, data, overwrite=True) @dataclass @@ -243,6 +249,7 @@ class RevData: def ValidateMetadata(backends=None, all_backends=False, verbose=False, fix=False): + flaskg.add_lineno_attr = False backends = get_backends(backends, all_backends) bad_revids = set() for backend in backends: @@ -255,15 +262,11 @@ def ValidateMetadata(backends=None, all_backends=False, verbose=False, fix=False for issue in issues: print(issue) _fix_if_bad(bad, meta, data, bad_revids, fix, backend) + # fix bad parentid references for item_id, rev_datum in revs.items(): rev_datum.sort(key=lambda r: (r.rev_number, r.mtime)) - rev_number = 0 prev_rev_data = None for rev_data in rev_datum: - rev_number += 1 - if rev_data.rev_number != rev_number: - rev_data.rev_number = rev_number - rev_data.is_bad = True if prev_rev_data is None: if rev_data.parent_id: rev_data.is_bad = True @@ -278,8 +281,7 @@ def ValidateMetadata(backends=None, all_backends=False, verbose=False, fix=False rev_str = 
get_rev_str(meta) if verbose: print(f'parentid_error {rev_str} meta_parentid: {meta.get(PARENTID)} ' - f'correct_parentid: {rev_data.parent_id} meta_revision_number: {meta.get(REV_NUMBER)} ' - f'correct_revision_number: {rev_data.rev_number}') + f'correct_parentid: {rev_data.parent_id} meta_revision_number: {meta.get(REV_NUMBER)}') if rev_data.parent_id: meta[PARENTID] = rev_data.parent_id else: @@ -287,12 +289,8 @@ def ValidateMetadata(backends=None, all_backends=False, verbose=False, fix=False del meta[PARENTID] except KeyError: pass - meta[REV_NUMBER] = rev_data.rev_number _fix_if_bad(bad, meta, data, bad_revids, fix, backend) print(f'{len(bad_revids)} items with invalid metadata found{" and fixed" if fix else ""}') - if fix and len(bad_revids): - print('item metadata has been updated, you will need to run moin index-destroy; moin index-create; moin ' - 'index-build to update the index') return bad_revids diff --git a/src/moin/storage/middleware/_tests/test_indexing.py b/src/moin/storage/middleware/_tests/test_indexing.py index 2fe92029f..e4939db39 100644 --- a/src/moin/storage/middleware/_tests/test_indexing.py +++ b/src/moin/storage/middleware/_tests/test_indexing.py @@ -16,7 +16,8 @@ from moin.constants.keys import (NAME, NAME_EXACT, SIZE, ITEMID, REVID, DATAID, HASH_ALGORITHM, CONTENT, COMMENT, LATEST_REVS, ALL_REVS, NAMESPACE, NAMERE, NAMEPREFIX, - CONTENTTYPE, ITEMTYPE, ITEMLINKS, REV_NUMBER) + CONTENTTYPE, ITEMTYPE, ITEMLINKS, REV_NUMBER, + PARENTID) from moin.constants.namespaces import NAMESPACE_USERS @@ -82,28 +83,34 @@ def test_overwrite_revision(self): assert len(revids) == 1 # we still have the revision, cleared assert revid in revids # it is still same revid - def test_destroy_revision(self): + def _store_three_revs(self, acl=None): item_name = 'foo' item = self.imw[item_name] - rev = item.store_revision(dict(name=[item_name, ], mtime=1), + rev = item.store_revision(dict(name=[item_name, ], mtime=1, acl=acl), BytesIO(b'bar'), trusted=True, 
return_rev=True) revid0 = rev.revid - rev = item.store_revision(dict(name=[item_name, ], mtime=2), + rev = item.store_revision(dict(name=[item_name, ], mtime=2, parentid=revid0, acl=acl), BytesIO(b'baz'), trusted=True, return_rev=True) revid1 = rev.revid - rev = item.store_revision(dict(name=[item_name, ], mtime=3), + rev = item.store_revision(dict(name=[item_name, ], mtime=3, parentid=revid1, acl=acl), BytesIO(b'...'), trusted=True, return_rev=True) revid2 = rev.revid print("revids:", revid0, revid1, revid2) + return item, item_name, revid0, revid1, revid2 + + def test_destroy_revision(self): + item, item_name, revid0, revid1, revid2 = self._store_three_revs() # destroy a non-current revision: item.destroy_revision(revid0) # check if the revision was destroyed: item = self.imw[item_name] query = Term(NAME_EXACT, item_name) - metas = flaskg.storage.search_meta(query, idx_name=ALL_REVS) - revids = [meta[REVID] for meta in metas] + metas = {m[REVID]: m for m in flaskg.storage.search_meta(query, idx_name=ALL_REVS)} + revids = list(metas.keys()) print("after destroy revid0", revids) assert sorted(revids) == sorted([revid1, revid2]) + # validate parent id of remaining revision is updated + assert PARENTID not in metas[revid1] # destroy a current revision: item.destroy_revision(revid2) # check if the revision was destroyed: @@ -123,6 +130,14 @@ def test_destroy_revision(self): print("after destroy revid1", revids) assert sorted(revids) == sorted([]) + def test_destroy_middle_revision(self): + item, item_name, revid0, revid1, revid2 = self._store_three_revs() + # destroy the middle revision: + item.destroy_revision(revid1) + with item.get_revision(revid2) as rev: + # validate that the parentid of remaining rev was updated + assert rev.meta[PARENTID] == revid0 + def test_destroy_item(self): revids = [] item_name = 'foo' diff --git a/src/moin/storage/middleware/_tests/test_protecting.py b/src/moin/storage/middleware/_tests/test_protecting.py index 122dc0080..951da6e24 
100644 --- a/src/moin/storage/middleware/_tests/test_protecting.py +++ b/src/moin/storage/middleware/_tests/test_protecting.py @@ -9,6 +9,8 @@ import pytest +from moin.constants.keys import PARENTID + from ..protecting import ProtectingMiddleware, AccessDenied from .test_indexing import TestIndexingMiddleware @@ -113,6 +115,14 @@ def test_destroy_revision(self): with pytest.raises(AccessDenied): item.destroy_revision(revid_protected) + def test_destroy_middle_revision(self): + item, item_name, revid0, revid1, revid2 = self._store_three_revs('joe:read,write,destroy') + # destroy the middle revision: + item.destroy_revision(revid1) + with item.get_revision(revid2) as rev: + # validate that the parentid of remaining rev was updated + assert rev.meta[PARENTID] == revid0 + def test_destroy_item(self): revid_unprotected, revid_protected = self.make_items('joe:destroy', 'boss:destroy') # now testing: diff --git a/src/moin/storage/middleware/indexing.py b/src/moin/storage/middleware/indexing.py index 567387d40..a0d36f769 100644 --- a/src/moin/storage/middleware/indexing.py +++ b/src/moin/storage/middleware/indexing.py @@ -1264,6 +1264,17 @@ def destroy_revision(self, revid): refcount = len(list(searcher.document_numbers(**query))) self.backend.remove(rev.backend_name, revid, destroy_data=refcount == 1) self.indexer.remove_revision(revid) + my_parent = rev.meta.get(PARENTID) + with flaskg.storage.indexer.ix[ALL_REVS].searcher() as searcher: + for hit in searcher.search(Term(PARENTID, revid)): + doc = hit.fields() + with Revision(self, doc[REVID], doc=doc) as child_rev: + child_meta = dict(child_rev.meta) + if my_parent: + child_meta[PARENTID] = my_parent + else: + del child_meta[PARENTID] + self.store_revision(child_meta, child_rev.data, overwrite=True) def destroy_all_revisions(self): """ From f7ecf943be74f96da9ca54e9742f860f7c8f51bd Mon Sep 17 00:00:00 2001 From: bylsmad <25931535+bylsmad@users.noreply.github.com> Date: Tue, 23 May 2023 06:45:17 -0700 Subject: [PATCH 2/8] 
fixing repeated rev_number, fixes #1448 also correcting index update to search in ALL_REVS existing_item only searches LATEST_REVS --- docs/admin/maintenance.rst | 7 ++-- src/moin/cli/_tests/data/MyPage-vblank.meta | 24 ++++++++++++ src/moin/cli/_tests/test_modify_item.py | 35 ++++++++++++++++- src/moin/cli/maint/modify_item.py | 42 ++++++++++++++------- 4 files changed, 90 insertions(+), 18 deletions(-) create mode 100644 src/moin/cli/_tests/data/MyPage-vblank.meta diff --git a/docs/admin/maintenance.rst b/docs/admin/maintenance.rst index d0d03b927..0cf28bd2e 100644 --- a/docs/admin/maintenance.rst +++ b/docs/admin/maintenance.rst @@ -37,6 +37,8 @@ The processes below check for and optionally fix the following issues: * sha1 hash does not match has of the revision's data * parent id should not be present for revision number 1 of a given item * parent id for each revision should be the data id for the previous revision number for that item +* every revision should have a revision number +* an item should not have repeated revision numbers To check for invalid metadata, run the following command:: @@ -46,10 +48,7 @@ To view detailed list of invalid items:: moin maint-validate-metadata --all-backends --verbose -To fix issues, take your wiki offline and add ``--fix`` option to any of the above commands. +To fix issues, add ``--fix`` option to any of the above commands. 
To operate on only a selection of backends, replace ``--all--backends`` option with ``--backends`` followed by comma separated list of backends to process - -If the ``--fix`` finds anything to fix, you must rebuild the index -with the newly created metadata, see :doc:`index` diff --git a/src/moin/cli/_tests/data/MyPage-vblank.meta b/src/moin/cli/_tests/data/MyPage-vblank.meta new file mode 100644 index 000000000..6e0899828 --- /dev/null +++ b/src/moin/cli/_tests/data/MyPage-vblank.meta @@ -0,0 +1,24 @@ +{ + "action": "SAVE", + "address": "127.0.0.1", + "comment": "", + "contenttype": "text/x.moin.wiki;charset=utf-8", + "dataid": "d6af14cd8edd4df6a992c5ac52dd78bf", + "externallinks": [], + "itemid": "b35958ca34f047b0924ba38ed652ce15", + "itemlinks": [], + "itemtransclusions": [], + "itemtype": "default", + "mtime": 1680488272, + "name": [ + "MyPage" + ], + "name_old": [], + "namespace": "", + "revid": "484e73725601407e9f9ab0bcaa151fb6", + "sha1": "487076f6c9eb3ce9cb18fd9800a62b35383a34ee", + "size": 16, + "summary": "", + "tags": [], + "wikiname": "MyMoinMoin" +} diff --git a/src/moin/cli/_tests/test_modify_item.py b/src/moin/cli/_tests/test_modify_item.py index bc83d6eb2..ab2006be3 100644 --- a/src/moin/cli/_tests/test_modify_item.py +++ b/src/moin/cli/_tests/test_modify_item.py @@ -10,7 +10,7 @@ from moin._tests import get_dirs from moin.cli._tests import run, assert_p_succcess, read_index_dump_latest_revs, read_index_dump -from moin.constants.keys import REVID, PARENTID, SIZE +from moin.constants.keys import REVID, PARENTID, SIZE, REV_NUMBER, NAMES def validate_meta(expected, actual, message): @@ -213,3 +213,36 @@ def test_validate_metadata(index_create2): assert {rev_id1, rev_id2, rev_id3, rev_id4, rev_id5} == set(metas.keys()) assert 11 == metas[rev_id1][SIZE] assert PARENTID not in metas[rev_id3] + # create a repeated revision_number + item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-v2.meta', '-d', data_dir / 'MyPage-v2.data']) + 
assert_p_succcess(item_put) + validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v', '-f']) + assert_p_succcess(validate) + outlines = validate.stdout.decode().splitlines() + assert '1 items with invalid metadata found and fixed' == outlines[-1] + assert 3 == len(outlines) + outlines_by_error = {} + for outline in outlines[0:2]: + words = outline.split() + outlines_by_error[words[0]] = outline + assert {'parentid_error', 'revision_number_error'} == set(outlines_by_error.keys()) + index_dump = run(['moin', 'index-dump', '--no-truncate']) + rev_numbers = {m[REV_NUMBER]: m for m in read_index_dump(index_dump.stdout.decode()) if m[NAMES] == 'MyPage'} + assert {1, 2, 3} == set(rev_numbers.keys()) + assert rev_numbers[1][REVID] == rev_id4 + assert rev_numbers[2][REVID] == rev_id5 + + +def test_validate_metadata_missing_rev_num(index_create2): + moin_dir, _ = get_dirs('') + data_dir = moin_dir / 'src' / 'moin' / 'cli' / '_tests' / 'data' + item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank.meta', '-d', data_dir / 'MyPage-v1.data']) + assert_p_succcess(item_put) + item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank.meta', '-d', data_dir / 'MyPage-v1.data']) + assert_p_succcess(item_put) + validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v', '-f']) + assert_p_succcess(validate) + index_dump = run(['moin', 'index-dump', '--no-truncate']) + print(index_dump.stdout.decode()) + rev_numbers = {m[REV_NUMBER]: m for m in read_index_dump(index_dump.stdout.decode()) if m[NAMES] == 'MyPage'} + assert {1, 2} == set(rev_numbers.keys()) diff --git a/src/moin/cli/maint/modify_item.py b/src/moin/cli/maint/modify_item.py index 49290ab74..abffaccc7 100644 --- a/src/moin/cli/maint/modify_item.py +++ b/src/moin/cli/maint/modify_item.py @@ -8,7 +8,7 @@ """ from collections import defaultdict -from dataclasses import dataclass +from dataclasses import dataclass, field import json import io import os @@ -23,7 +23,6 @@ from 
moin.cli._util import get_backends from moin.storage.middleware.serialization import get_rev_str, correcting_rev_iter from moin.constants.keys import CURRENT, ITEMID, DATAID, NAMESPACE, WIKINAME, REVID, PARENTID, REV_NUMBER, MTIME -from moin.storage.error import NoSuchItemError from moin.utils.interwiki import split_fqname from moin.items import Item @@ -231,8 +230,11 @@ def _fix_if_bad(bad, meta, data, bad_revids, fix, backend): bad_revids.add(meta[REVID]) if fix: try: - item = app.storage.existing_item(revid=meta[REVID]) - except NoSuchItemError: + item = app.storage.get_item(itemid=meta[ITEMID]) + rev = item.get_revision(meta[REVID]) + dict(rev.meta) # force load to validate rev is in index + except KeyError: + logging.warning(f'bad revision not found in index {get_rev_str(meta)}') backend.store(meta, data) else: item.store_revision(meta, data, overwrite=True) @@ -245,7 +247,7 @@ class RevData: rev_number: int mtime: int parent_id: str = None - is_bad: bool = False + issues: list[str] = field(default_factory=list) def ValidateMetadata(backends=None, all_backends=False, verbose=False, fix=False): @@ -262,26 +264,39 @@ def ValidateMetadata(backends=None, all_backends=False, verbose=False, fix=False for issue in issues: print(issue) _fix_if_bad(bad, meta, data, bad_revids, fix, backend) - # fix bad parentid references + # fix bad parentid references and repeated or missing revision numbers for item_id, rev_datum in revs.items(): rev_datum.sort(key=lambda r: (r.rev_number, r.mtime)) prev_rev_data = None for rev_data in rev_datum: if prev_rev_data is None: if rev_data.parent_id: - rev_data.is_bad = True + rev_data.issues.append('parentid_error') rev_data.parent_id = None - elif rev_data.parent_id != prev_rev_data.rev_id: - rev_data.parent_id = prev_rev_data.rev_id - rev_data.is_bad = True + if rev_data.rev_number == -1: + rev_data.issues.append('revision_number_error') + rev_data.rev_number = 1 + else: # prev_rev_data is not None + if rev_data.parent_id != 
prev_rev_data.rev_id: + rev_data.parent_id = prev_rev_data.rev_id + rev_data.issues.append('parentid_error') + if rev_data.rev_number <= prev_rev_data.rev_number: + rev_data.rev_number = prev_rev_data.rev_number + 1 + rev_data.issues.append('revision_number_error') prev_rev_data = rev_data - for rev_data in [r for r in rev_datum if r.is_bad]: + for rev_data in [r for r in rev_datum if r.issues]: bad = True meta, data = backend.retrieve(rev_data.rev_id) rev_str = get_rev_str(meta) if verbose: - print(f'parentid_error {rev_str} meta_parentid: {meta.get(PARENTID)} ' - f'correct_parentid: {rev_data.parent_id} meta_revision_number: {meta.get(REV_NUMBER)}') + for issue in rev_data.issues: + if issue == 'parentid_error': + print(f'{issue} {rev_str} meta_parentid: {meta.get(PARENTID)} ' + f'correct_parentid: {rev_data.parent_id} ' + f'meta_revision_number: {meta.get(REV_NUMBER)}') + else: # issue == 'revision_number_error' + print(f'{issue} {rev_str} meta_revision_number: {meta.get(REV_NUMBER)} ' + f'correct_revision_number: {rev_data.rev_number}') if rev_data.parent_id: meta[PARENTID] = rev_data.parent_id else: @@ -289,6 +304,7 @@ def ValidateMetadata(backends=None, all_backends=False, verbose=False, fix=False del meta[PARENTID] except KeyError: pass + meta[REV_NUMBER] = rev_data.rev_number _fix_if_bad(bad, meta, data, bad_revids, fix, backend) print(f'{len(bad_revids)} items with invalid metadata found{" and fixed" if fix else ""}') return bad_revids From ee698f2f627a7c8ce216385658cc93780ee56d28 Mon Sep 17 00:00:00 2001 From: bylsmad <25931535+bylsmad@users.noreply.github.com> Date: Mon, 29 May 2023 11:13:08 -0700 Subject: [PATCH 3/8] destroy_rev no longer updates MTIME, fixes #1448 previous code when destroying a middle revision the update to PARENTID was making the next revision into the latest applying same fix ValidateMetadata --- requirements.d/development.txt | 1 + src/moin/_tests/__init__.py | 9 +++++ src/moin/cli/maint/modify_item.py | 2 +- src/moin/conftest.py | 
9 +++-- .../middleware/_tests/test_indexing.py | 14 ++++++-- src/moin/storage/middleware/indexing.py | 33 ++++++++++++++----- src/moin/storage/middleware/validation.py | 2 +- 7 files changed, 54 insertions(+), 16 deletions(-) diff --git a/requirements.d/development.txt b/requirements.d/development.txt index b18d39e3e..12af8100d 100644 --- a/requirements.d/development.txt +++ b/requirements.d/development.txt @@ -1,4 +1,5 @@ tox +psutil pytest # we use lxml.etree for xpath-based testing lxml diff --git a/src/moin/_tests/__init__.py b/src/moin/_tests/__init__.py index 8b2cebf5e..2bcffe7ef 100644 --- a/src/moin/_tests/__init__.py +++ b/src/moin/_tests/__init__.py @@ -11,6 +11,7 @@ import socket from io import BytesIO from pathlib import Path +import psutil from typing import Tuple from flask import g as flaskg @@ -104,3 +105,11 @@ def get_dirs(subdir: str) -> Tuple[Path, Path]: if not artifacts_dir.exists(): artifacts_dir.mkdir(parents=True) return moin_dir, artifacts_dir + + +def get_open_wiki_files(): + proc = psutil.Process() + files = [f for f in proc.open_files() if 'wiki' in f.path] + for file in files: + print(f'open wiki {file}') + return files diff --git a/src/moin/cli/maint/modify_item.py b/src/moin/cli/maint/modify_item.py index abffaccc7..f919d8734 100644 --- a/src/moin/cli/maint/modify_item.py +++ b/src/moin/cli/maint/modify_item.py @@ -237,7 +237,7 @@ def _fix_if_bad(bad, meta, data, bad_revids, fix, backend): logging.warning(f'bad revision not found in index {get_rev_str(meta)}') backend.store(meta, data) else: - item.store_revision(meta, data, overwrite=True) + item.store_revision(meta, data, overwrite=True, trusted=True) @dataclass diff --git a/src/moin/conftest.py b/src/moin/conftest.py index bf15783e5..b2c3576bd 100644 --- a/src/moin/conftest.py +++ b/src/moin/conftest.py @@ -22,7 +22,7 @@ import moin.log import moin from moin.app import create_app_ext, destroy_app, before_wiki, teardown_wiki -from moin._tests import wikiconfig +from moin._tests 
import wikiconfig, get_open_wiki_files from moin.storage import create_simple_mapping @@ -70,7 +70,12 @@ def app_ctx(cfg): teardown_wiki('') ctx.pop() - destroy_app(app) + try: + # simulate ERROR PermissionError: + # [WinError 32] The process cannot access the file because it is being used by another process + assert [] == get_open_wiki_files() + finally: + destroy_app(app) @pytest.fixture(autouse=True) diff --git a/src/moin/storage/middleware/_tests/test_indexing.py b/src/moin/storage/middleware/_tests/test_indexing.py index e4939db39..38b060c8c 100644 --- a/src/moin/storage/middleware/_tests/test_indexing.py +++ b/src/moin/storage/middleware/_tests/test_indexing.py @@ -17,7 +17,7 @@ HASH_ALGORITHM, CONTENT, COMMENT, LATEST_REVS, ALL_REVS, NAMESPACE, NAMERE, NAMEPREFIX, CONTENTTYPE, ITEMTYPE, ITEMLINKS, REV_NUMBER, - PARENTID) + PARENTID, MTIME) from moin.constants.namespaces import NAMESPACE_USERS @@ -100,18 +100,26 @@ def _store_three_revs(self, acl=None): def test_destroy_revision(self): item, item_name, revid0, revid1, revid2 = self._store_three_revs() + query = Term(NAME_EXACT, item_name) + metas = {m[REVID]: m for m in flaskg.storage.search_meta(query, idx_name=ALL_REVS)} + rev1_mtime = metas[revid1][MTIME] # destroy a non-current revision: item.destroy_revision(revid0) # check if the revision was destroyed: - item = self.imw[item_name] - query = Term(NAME_EXACT, item_name) metas = {m[REVID]: m for m in flaskg.storage.search_meta(query, idx_name=ALL_REVS)} revids = list(metas.keys()) print("after destroy revid0", revids) assert sorted(revids) == sorted([revid1, revid2]) # validate parent id of remaining revision is updated assert PARENTID not in metas[revid1] + # validate mtime not updated + assert rev1_mtime == metas[revid1][MTIME] + # validate revid2 is still the current one + metas = {m[REVID]: m for m in flaskg.storage.search_meta(query)} + assert 1 == len(metas) + assert revid2 in metas # destroy a current revision: + item = self.imw[item_name] 
item.destroy_revision(revid2) # check if the revision was destroyed: item = self.imw[item_name] diff --git a/src/moin/storage/middleware/indexing.py b/src/moin/storage/middleware/indexing.py index a0d36f769..51e8b76d7 100644 --- a/src/moin/storage/middleware/indexing.py +++ b/src/moin/storage/middleware/indexing.py @@ -48,6 +48,7 @@ usually it is even just the small and thus quick latest-revs index. """ +import gc import os import sys import shutil @@ -554,14 +555,19 @@ def move_index(self): index_dir, index_dir_tmp = params[0], params_tmp[0] os.rename(index_dir_tmp, index_dir) - def index_revision(self, meta, content, backend_name, async_=True): + def index_revision(self, meta, content, backend_name, async_=True, force_latest=True): """ Index a single revision, add it to all-revs and latest-revs index. :param meta: metadata dict :param content: preprocessed (filtered) indexable content :param async_: if True, use the AsyncWriter, otherwise use normal writer + :param force_latest: True - unconditionally store this rev in LATEST_REVS + False - store in LATEST_REVS if this rev MTIME is most recent + overrides async_ parameter to False """ + if force_latest: + async_ = False # must wait for storage in ALL_REVS before check for latest doc = backend_to_index(meta, content, self.schemas[ALL_REVS], self.wikiname, backend_name) if async_: writer = AsyncWriter(self.ix[ALL_REVS]) @@ -569,13 +575,21 @@ def index_revision(self, meta, content, backend_name, async_=True): writer = self.ix[ALL_REVS].writer() with writer as writer: writer.update_document(**doc) # update, because store_revision() may give us an existing revid - doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname, backend_name) - if async_: - writer = AsyncWriter(self.ix[LATEST_REVS]) + if force_latest: + is_latest = True else: - writer = self.ix[LATEST_REVS].writer() - with writer as writer: - writer.update_document(**doc) + with self.ix[ALL_REVS].searcher() as searcher: + is_latest = 
(searcher.search(Term(ITEMID, doc[ITEMID]), + sortedby=FieldFacet(MTIME, reverse=True), + limit=1)[0][REVID] == doc[REVID]) + if is_latest: + doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname, backend_name) + if async_: + writer = AsyncWriter(self.ix[LATEST_REVS]) + else: + writer = self.ix[LATEST_REVS].writer() + with writer as writer: + writer.update_document(**doc) def remove_revision(self, revid, async_=True): """ @@ -1240,7 +1254,8 @@ def store_revision(self, meta, data, overwrite=False, data.seek(0) # rewind file backend_name, revid = backend.store(meta, data) meta[REVID] = revid - self.indexer.index_revision(meta, content, backend_name) + self.indexer.index_revision(meta, content, backend_name, force_latest=not overwrite) + gc.collect() # triggers close of index files from is_latest search if not overwrite: self._current = get_indexer(self.indexer._document, revid=revid) if return_rev: @@ -1274,7 +1289,7 @@ def destroy_revision(self, revid): child_meta[PARENTID] = my_parent else: del child_meta[PARENTID] - self.store_revision(child_meta, child_rev.data, overwrite=True) + self.store_revision(child_meta, child_rev.data, overwrite=True, trusted=True) def destroy_all_revisions(self): """ diff --git a/src/moin/storage/middleware/validation.py b/src/moin/storage/middleware/validation.py index 2f6230077..c08d92d82 100644 --- a/src/moin/storage/middleware/validation.py +++ b/src/moin/storage/middleware/validation.py @@ -87,7 +87,7 @@ def userid_validator(element, state): if userid is None: # unknown user is acceptable return True - return uuid_validator(element, state) + return element.raw is Unset or uuid_validator(element, state) def name_validator(element, state): From e62b3b3cd998dbd12450f045119ae320dd79bcd3 Mon Sep 17 00:00:00 2001 From: bylsmad <25931535+bylsmad@users.noreply.github.com> Date: Mon, 29 May 2023 16:07:34 -0700 Subject: [PATCH 4/8] fixing logging for scrapy crawl.log --- src/moin/cli/_tests/conftest.py | 8 ++++++-- 1 
file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/moin/cli/_tests/conftest.py b/src/moin/cli/_tests/conftest.py index d7eb9d702..8eb98d40e 100644 --- a/src/moin/cli/_tests/conftest.py +++ b/src/moin/cli/_tests/conftest.py @@ -163,8 +163,12 @@ def do_crawl(request, artifact_dir): os.chdir(moin_dir / 'src' / 'moin' / 'cli' / '_tests' / 'scrapy') try: com = ['scrapy', 'crawl', '-a', f'url={settings.CRAWL_START}', 'ref_checker'] - with open(artifact_dir / 'crawl.log', 'wb') as crawl_log: - p = run(com, crawl_log, timeout=600) + with open(artifact_base_dir / 'crawl.log', 'wb') as crawl_log: + try: + p = run(com, crawl_log, timeout=600) + except subprocess.TimeoutExpired as e: + crawl_log.write(f'\n{repr(e)}\n'.encode()) + raise if p.returncode != 0: crawl_success = False if not crawl_success: From 6888aaacc7d8c9082c0fd13ef7a103f06bec2578 Mon Sep 17 00:00:00 2001 From: bylsmad <25931535+bylsmad@users.noreply.github.com> Date: Thu, 1 Jun 2023 21:49:02 -0700 Subject: [PATCH 5/8] improving scrapy error reporting fixing couple more bad file paths --- src/moin/cli/_tests/conftest.py | 22 +++++++---- .../scrapy/moincrawler/spiders/ref_checker.py | 39 +++++++++++++------ src/moin/cli/_tests/test_scrapy_crawl.py | 18 ++++++--- 3 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/moin/cli/_tests/conftest.py b/src/moin/cli/_tests/conftest.py index 8eb98d40e..2ee78d251 100644 --- a/src/moin/cli/_tests/conftest.py +++ b/src/moin/cli/_tests/conftest.py @@ -103,6 +103,11 @@ def get_crawl_server_log_path(): return artifact_base_dir / 'server-crawl.log' +def get_crawl_log_path(): + _, artifact_base_dir = get_dirs('') + return artifact_base_dir / 'crawl.log' + + @pytest.fixture(scope="package") def server(welcome, load_help, artifact_dir): run(['moin', 'index-build']) @@ -140,7 +145,7 @@ def server(welcome, load_help, artifact_dir): if not started: logging.error('server not started. 
server.log:') try: - with open(server_log.name) as f: + with open(get_crawl_server_log_path()) as f: logging.error(f.read()) except IOError as e: logging.error(f'{repr(e)} when trying to open server log') @@ -150,8 +155,11 @@ def server(welcome, load_help, artifact_dir): @pytest.fixture(scope="package") def do_crawl(request, artifact_dir): moin_dir, artifact_base_dir = get_dirs('') - (artifact_base_dir / 'crawl.log').touch() # insure github workflow will have a file to archive - (artifact_base_dir / 'crawl.csv').touch() + # initialize output files + with open(get_crawl_log_path(), 'w'): + pass + with open(artifact_base_dir / 'crawl.csv', 'w'): + pass server_started = True crawl_success = True if settings.SITE_HOST == '127.0.0.1:9080': @@ -163,7 +171,7 @@ def do_crawl(request, artifact_dir): os.chdir(moin_dir / 'src' / 'moin' / 'cli' / '_tests' / 'scrapy') try: com = ['scrapy', 'crawl', '-a', f'url={settings.CRAWL_START}', 'ref_checker'] - with open(artifact_base_dir / 'crawl.log', 'wb') as crawl_log: + with open(get_crawl_log_path(), 'wb') as crawl_log: try: p = run(com, crawl_log, timeout=600) except subprocess.TimeoutExpired as e: @@ -173,7 +181,7 @@ def do_crawl(request, artifact_dir): crawl_success = False if not crawl_success: logging.error('crawl failed. 
crawl.log:') - with open('crawl.log') as f: + with open(get_crawl_log_path()) as f: logging.error(f.read()) finally: os.chdir(artifact_dir) @@ -190,13 +198,13 @@ def crawl_results(request, artifact_dir) -> List[CrawlResult]: try: with open(artifact_base_dir / 'crawl.csv') as f: in_csv = csv.DictReader(f) - return [CrawlResult(**r) for r in in_csv] + return [CrawlResult(**r) for r in in_csv], crawl_success except Exception as e: crawl_success = False logging.error(f'exception reading crawl.csv {repr(e)}') if not crawl_success: logging.error('crawl failed') - return [] + return [], crawl_success @pytest.fixture(scope="package") diff --git a/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py b/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py index f45e2f78a..d835d55de 100644 --- a/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py +++ b/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py @@ -10,6 +10,7 @@ import csv from dataclasses import fields, astuple +from traceback import print_exc import scrapy from scrapy import signals @@ -55,17 +56,24 @@ def from_crawler(cls, crawler, *args, **kwargs): return spider def spider_closed(self): - _, artifact_base_dir = get_dirs('') - for k, c in self.crawler.stats.get_stats().items(): # bubble up spider exceptions into test failures - if k.startswith('spider_exceptions'): - self.results.append(CrawlResult(response_exc=f'crawler stats: {k} = {c}')) - with open(artifact_base_dir / 'crawl.csv', 'w') as fh: - out_csv = csv.writer(fh, lineterminator='\n') - out_csv.writerow([f.name for f in fields(CrawlResult)]) - for result in self.results: - out_csv.writerow(astuple(result)) - - def parse(self, response, **kwargs): + logging.info('entering spider_closed') + try: + _, artifact_base_dir = get_dirs('') + for k, c in self.crawler.stats.get_stats().items(): # bubble up spider exceptions into test failures + if k.startswith('spider_exceptions'): + logging.error(f'spider_exception: {c}') + 
self.results.append(CrawlResult(response_exc=f'crawler stats: {k} = {c}')) + with open(artifact_base_dir / 'crawl.csv', 'w') as fh: + out_csv = csv.writer(fh, lineterminator='\n') + out_csv.writerow([f.name for f in fields(CrawlResult)]) + for result in self.results: + out_csv.writerow(astuple(result)) + except Exception as e: # noqa + logging.error(f'exception in spider_closed {repr(e)}') + print_exc() + raise + + def _parse(self, response, **kwargs): """Main method that parses downloaded pages. requests yielded from this method are added to the crawl queue""" @@ -136,6 +144,15 @@ def parse(self, response, **kwargs): request.meta['my_data'] = new_result yield request + def parse(self, response, **kwargs): + """called by scrapy framework""" + try: + yield from self._parse(response, **kwargs) + except Exception as e: # noqa + logging.error(f'parse exception : {repr(e)}') + print_exc() + raise + def errback(self, failure): """called when request comes back with anything other than a 200 OK response""" if failure.value.__class__ is IgnoreRequest: # ignore urls disallowed by robots.txt diff --git a/src/moin/cli/_tests/test_scrapy_crawl.py b/src/moin/cli/_tests/test_scrapy_crawl.py index 713a62443..1837b34c9 100644 --- a/src/moin/cli/_tests/test_scrapy_crawl.py +++ b/src/moin/cli/_tests/test_scrapy_crawl.py @@ -13,6 +13,7 @@ except ImportError: from moin.cli._tests import default_settings as settings from moin.cli._tests.scrapy.moincrawler.items import CrawlResultMatch +from moin.cli._tests.conftest import get_crawl_log_path from moin.utils.iri import Iri from moin import log @@ -51,14 +52,16 @@ def is_expected_404(self, r): return self._matches_one_of(r, self.EXPECTED_404) def test_home_page(self, crawl_results): - assert len(crawl_results) > 0 - r = crawl_results[0] + assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}' + assert len(crawl_results[0]) > 0 + r = crawl_results[0][0] expected = CrawlResultMatch(url='/Home') assert expected.match(r), 
f'unexpected redirect for / {r}' def test_200(self, crawl_results): + assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}' failures = [] - for r in [r for r in crawl_results if r.url + for r in [r for r in crawl_results[0] if r.url and r.url.authority == settings.SITE_HOST and not self.is_known_issue(r)]: if 'Discussion' in r.url.path: expected = {200, 404} @@ -73,8 +76,9 @@ def test_200(self, crawl_results): @pytest.mark.xfail(reason='issue #1414 - remaining bad links in help') def test_expected_failures(self, crawl_results): + assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}' failures = [] - for r in [r for r in crawl_results if self.is_known_issue(r)]: + for r in [r for r in crawl_results[0] if self.is_known_issue(r)]: if r.response_code != 200: logging.info(f'known issue {r}') failures.append(r) @@ -85,12 +89,13 @@ def test_known_issues_exist(self, crawl_results): """enable this test to check for KNOWN_ISSUES which can be removed after removing, be sure to confirm by crawling a host with non-blank SITE_WIKI_ROOT as some issues only exist when moin is running behind apache""" + assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}' fixed = [] for m in self.KNOWN_ISSUES: seen = False my_fixed = [] my_not_fixed = [] - for r in [r for r in crawl_results if m.match(r)]: + for r in [r for r in crawl_results[0] if m.match(r)]: seen = True if r.response_code == 200: my_fixed.append(r) @@ -106,8 +111,9 @@ def test_known_issues_exist(self, crawl_results): assert len(fixed) == 0 def test_valid_request(self, crawl_results): + assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}' failures = [] - for r in [r for r in crawl_results if not self.is_known_issue(r)]: + for r in [r for r in crawl_results[0] if not self.is_known_issue(r)]: if not r.response_code: logging.error(f'no response code for {r}') failures.append(r) From f3e83fb87ad07f0c9195352daafc3131bc22a668 Mon Sep 17 00:00:00 2001 From: bylsmad 
<25931535+bylsmad@users.noreply.github.com> Date: Fri, 2 Jun 2023 05:45:05 -0700 Subject: [PATCH 6/8] fix for logic for _async in store_rev, #1448 --- src/moin/storage/middleware/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/moin/storage/middleware/indexing.py b/src/moin/storage/middleware/indexing.py index 51e8b76d7..3b08f85a7 100644 --- a/src/moin/storage/middleware/indexing.py +++ b/src/moin/storage/middleware/indexing.py @@ -566,7 +566,7 @@ def index_revision(self, meta, content, backend_name, async_=True, force_latest= False - store in LATEST_REVS if this rev MTIME is most recent overrides async_ parameter to False """ - if force_latest: + if not force_latest: async_ = False # must wait for storage in ALL_REVS before check for latest doc = backend_to_index(meta, content, self.schemas[ALL_REVS], self.wikiname, backend_name) if async_: From 6842b33bfd2f663ba53b7cf864cb3cd13e57ed42 Mon Sep 17 00:00:00 2001 From: bylsmad <25931535+bylsmad@users.noreply.github.com> Date: Fri, 2 Jun 2023 11:40:34 -0700 Subject: [PATCH 7/8] fixing race condition in test, #1448 error was caused by AsyncWriter which is not used when overwrite is True --- src/moin/cli/_tests/data/MyPage-vblank2.meta | 24 ++++++++++++++++++++ src/moin/cli/_tests/test_modify_item.py | 4 ++-- 2 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 src/moin/cli/_tests/data/MyPage-vblank2.meta diff --git a/src/moin/cli/_tests/data/MyPage-vblank2.meta b/src/moin/cli/_tests/data/MyPage-vblank2.meta new file mode 100644 index 000000000..487338df5 --- /dev/null +++ b/src/moin/cli/_tests/data/MyPage-vblank2.meta @@ -0,0 +1,24 @@ +{ + "action": "SAVE", + "address": "127.0.0.1", + "comment": "", + "contenttype": "text/x.moin.wiki;charset=utf-8", + "dataid": "d6af14cd8edd4df6a992c5ac52dd78bf", + "externallinks": [], + "itemid": "b35958ca34f047b0924ba38ed652ce15", + "itemlinks": [], + "itemtransclusions": [], + "itemtype": "default", + "mtime": 1680488273, + "name": [
+ "MyPage" + ], + "name_old": [], + "namespace": "", + "revid": "a8a8233bc8264216915ad3137ee6c20f", + "sha1": "487076f6c9eb3ce9cb18fd9800a62b35383a34ee", + "size": 16, + "summary": "", + "tags": [], + "wikiname": "MyMoinMoin" +} diff --git a/src/moin/cli/_tests/test_modify_item.py b/src/moin/cli/_tests/test_modify_item.py index ab2006be3..2d0f22329 100644 --- a/src/moin/cli/_tests/test_modify_item.py +++ b/src/moin/cli/_tests/test_modify_item.py @@ -236,9 +236,9 @@ def test_validate_metadata(index_create2): def test_validate_metadata_missing_rev_num(index_create2): moin_dir, _ = get_dirs('') data_dir = moin_dir / 'src' / 'moin' / 'cli' / '_tests' / 'data' - item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank.meta', '-d', data_dir / 'MyPage-v1.data']) + item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank.meta', '-d', data_dir / 'MyPage-v1.data', '-o']) assert_p_succcess(item_put) - item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank.meta', '-d', data_dir / 'MyPage-v1.data']) + item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank2.meta', '-d', data_dir / 'MyPage-v1.data', '-o']) assert_p_succcess(item_put) validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v', '-f']) assert_p_succcess(validate) From 7fe23f95f03d9f20536ab0ea273ee8000c2ff097 Mon Sep 17 00:00:00 2001 From: bylsmad <25931535+bylsmad@users.noreply.github.com> Date: Fri, 2 Jun 2023 15:00:09 -0700 Subject: [PATCH 8/8] fix for scrapy test running under tox --- src/moin/cli/_tests/__init__.py | 5 ++++- src/moin/cli/_tests/conftest.py | 12 +++++++++--- .../_tests/scrapy/moincrawler/spiders/ref_checker.py | 6 +++++- src/moin/cli/_tests/test_scrapy_crawl.py | 3 +++ 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/moin/cli/_tests/__init__.py b/src/moin/cli/_tests/__init__.py index 68e2cb98f..735c2dd45 100644 --- a/src/moin/cli/_tests/__init__.py +++ b/src/moin/cli/_tests/__init__.py @@ -20,16 +20,19 @@ logging = 
log.getLogger(__name__) -def run(cmd: List[str], log=None, wait: bool = True, timeout: int = None) \ +def run(cmd: List[str], log=None, wait: bool = True, timeout: int = None, env=None) \ -> Union[subprocess.CompletedProcess, subprocess.Popen]: """run a shell command, redirecting output to log :param cmd: list of strings containing command arguments :param log: open file handle to log file (binary mode) or None in which case output will be captured :param wait: if True return after process is complete, otherwise return immediately after start :param timeout: timeout setting in seconds, can only be used when wait is True + :param env: dictionary of environment variables to add to current env for subprocess :return: CompletedProcess object if wait else Popen object""" subprocess_environ = copy(os.environ) subprocess_environ['PYTHONIOENCODING'] = 'cp1252' # simulate windows terminal to ferret out encoding issues + if env: + subprocess_environ.update(env) logging.info(f'running {cmd}') if stdout := log: stderr = subprocess.STDOUT diff --git a/src/moin/cli/_tests/conftest.py b/src/moin/cli/_tests/conftest.py index 2ee78d251..8d276f873 100644 --- a/src/moin/cli/_tests/conftest.py +++ b/src/moin/cli/_tests/conftest.py @@ -108,6 +108,11 @@ def get_crawl_log_path(): return artifact_base_dir / 'crawl.log' +def get_crawl_csv_path(): + _, artifact_base_dir = get_dirs('') + return artifact_base_dir / 'crawl.csv' + + @pytest.fixture(scope="package") def server(welcome, load_help, artifact_dir): run(['moin', 'index-build']) @@ -158,7 +163,7 @@ def do_crawl(request, artifact_dir): # initialize output files with open(get_crawl_log_path(), 'w'): pass - with open(artifact_base_dir / 'crawl.csv', 'w'): + with open(get_crawl_csv_path(), 'w'): pass server_started = True crawl_success = True @@ -173,7 +178,7 @@ def do_crawl(request, artifact_dir): com = ['scrapy', 'crawl', '-a', f'url={settings.CRAWL_START}', 'ref_checker'] with open(get_crawl_log_path(), 'wb') as crawl_log: try: - p = 
run(com, crawl_log, timeout=600) + p = run(com, crawl_log, timeout=600, env={'MOIN_SCRAPY_CRAWL_CSV': str(get_crawl_csv_path())}) except subprocess.TimeoutExpired as e: crawl_log.write(f'\n{repr(e)}\n'.encode()) raise @@ -196,7 +201,8 @@ def crawl_results(request, artifact_dir) -> List[CrawlResult]: crawl_success = request.getfixturevalue('do_crawl') if crawl_success: try: - with open(artifact_base_dir / 'crawl.csv') as f: + logging.info(f'reading {get_crawl_csv_path()}') + with open(get_crawl_csv_path()) as f: in_csv = csv.DictReader(f) return [CrawlResult(**r) for r in in_csv], crawl_success except Exception as e: diff --git a/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py b/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py index d835d55de..cf661fe09 100644 --- a/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py +++ b/src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py @@ -10,6 +10,7 @@ import csv from dataclasses import fields, astuple +import os from traceback import print_exc import scrapy @@ -24,6 +25,7 @@ except ImportError: from moin.cli._tests import default_settings as settings +from moin.cli._tests.conftest import get_crawl_csv_path from moin.utils.iri import Iri from moin import log @@ -63,7 +65,9 @@ def spider_closed(self): if k.startswith('spider_exceptions'): logging.error(f'spider_exception: {c}') self.results.append(CrawlResult(response_exc=f'crawler stats: {k} = {c}')) - with open(artifact_base_dir / 'crawl.csv', 'w') as fh: + crawl_csv_path = os.environ.get('MOIN_SCRAPY_CRAWL_CSV', get_crawl_csv_path()) + logging.info(f'writing {len(self.results)} to {crawl_csv_path}') + with open(crawl_csv_path, 'w') as fh: out_csv = csv.writer(fh, lineterminator='\n') out_csv.writerow([f.name for f in fields(CrawlResult)]) for result in self.results: diff --git a/src/moin/cli/_tests/test_scrapy_crawl.py b/src/moin/cli/_tests/test_scrapy_crawl.py index 1837b34c9..adb45d8fe 100644 --- 
a/src/moin/cli/_tests/test_scrapy_crawl.py +++ b/src/moin/cli/_tests/test_scrapy_crawl.py @@ -53,6 +53,9 @@ def is_expected_404(self, r): def test_home_page(self, crawl_results): assert crawl_results[1], f'crawl failed, check {get_crawl_log_path()}' + for line in open(get_crawl_log_path(), 'rb'): + if b'crawl.csv' in line: + logging.info(f'{line} from {get_crawl_log_path()}') assert len(crawl_results[0]) > 0 r = crawl_results[0][0] expected = CrawlResultMatch(url='/Home')