repairing parentid for destroy rev, fixes #1448 #1455

Merged
8 changes: 3 additions & 5 deletions docs/admin/maintenance.rst
@@ -35,9 +35,10 @@ The processes below check for and optionally fix the following issues:

* size does not match size of the revision's data in bytes
* sha1 hash does not match the hash of the revision's data
* revision numbers for an item's revisions should form an unbroken sequence starting at 1
* parent id should not be present for revision number 1 of a given item
* parent id for each revision should be the data id for the previous revision number for that item
* every revision should have a revision number
* an item should not have repeated revision numbers

To check for invalid metadata, run the following command::

@@ -47,10 +48,7 @@ To view detailed list of invalid items::

moin maint-validate-metadata --all-backends --verbose

To fix issues, take your wiki offline and add ``--fix`` option to any of the above commands.
To fix issues, add the ``--fix`` option to any of the above commands.

To operate on only a selection of backends, replace the ``--all-backends`` option with ``--backends``
followed by a comma-separated list of backends to process.

If the ``--fix`` option finds anything to fix, you must rebuild the index
with the newly created metadata, see :doc:`index`.
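
A minimal sketch of the same check-and-fix cycle driven from Python, mirroring how this PR's CLI tests call the command through the ``run``/``assert_p_succcess`` helpers (the flags come from the docs above; everything else is illustrative)::

    from moin.cli._tests import run, assert_p_succcess

    # check all backends, then fix and re-validate
    validate = run(['moin', 'maint-validate-metadata', '--all-backends', '--verbose'])
    assert_p_succcess(validate)
    fix = run(['moin', 'maint-validate-metadata', '--all-backends', '--fix'])
    assert_p_succcess(fix)
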
1 change: 1 addition & 0 deletions requirements.d/development.txt
@@ -1,4 +1,5 @@
tox
psutil
pytest
# we use lxml.etree for xpath-based testing
lxml
9 changes: 9 additions & 0 deletions src/moin/_tests/__init__.py
@@ -11,6 +11,7 @@
import socket
from io import BytesIO
from pathlib import Path
import psutil
from typing import Tuple

from flask import g as flaskg
@@ -104,3 +105,11 @@ def get_dirs(subdir: str) -> Tuple[Path, Path]:
if not artifacts_dir.exists():
artifacts_dir.mkdir(parents=True)
return moin_dir, artifacts_dir


def get_open_wiki_files():
    """Return the current process's open file handles whose path contains 'wiki', printing each one."""
    proc = psutil.Process()
    files = [f for f in proc.open_files() if 'wiki' in f.path]
    for file in files:
        print(f'open wiki {file}')
    return files
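
A possible use of the new ``get_open_wiki_files`` helper is asserting that a test leaves no wiki storage files open; the test below is hypothetical::

    from moin._tests import get_open_wiki_files

    def test_no_wiki_files_left_open():
        # the helper prints any offending paths before the assertion fails
        assert get_open_wiki_files() == []
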
5 changes: 4 additions & 1 deletion src/moin/cli/_tests/__init__.py
@@ -20,16 +20,19 @@
logging = log.getLogger(__name__)


def run(cmd: List[str], log=None, wait: bool = True, timeout: int = None) \
def run(cmd: List[str], log=None, wait: bool = True, timeout: int = None, env=None) \
-> Union[subprocess.CompletedProcess, subprocess.Popen]:
"""run a shell command, redirecting output to log
:param cmd: list of strings containing command arguments
:param log: open file handle to log file (binary mode) or None in which case output will be captured
:param wait: if True return after process is complete, otherwise return immediately after start
:param timeout: timeout setting in seconds, can only be used when wait is True
:param env: dictionary of environment variables to add to current env for subprocess
:return: CompletedProcess object if wait else Popen object"""
subprocess_environ = copy(os.environ)
subprocess_environ['PYTHONIOENCODING'] = 'cp1252' # simulate windows terminal to ferret out encoding issues
if env:
subprocess_environ.update(env)
logging.info(f'running {cmd}')
if stdout := log:
stderr = subprocess.STDOUT
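
The new ``env`` argument is merged into the copied ``os.environ`` before the subprocess starts; a small illustrative call (command and values are arbitrary, the variable name matches the crawl fixture below)::

    from moin.cli._tests import run

    p = run(['moin', 'index-dump', '--no-truncate'], timeout=60,
            env={'MOIN_SCRAPY_CRAWL_CSV': '/tmp/crawl.csv'})
    assert p.returncode == 0
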
36 changes: 27 additions & 9 deletions src/moin/cli/_tests/conftest.py
@@ -103,6 +103,16 @@ def get_crawl_server_log_path():
return artifact_base_dir / 'server-crawl.log'


def get_crawl_log_path():
_, artifact_base_dir = get_dirs('')
return artifact_base_dir / 'crawl.log'


def get_crawl_csv_path():
_, artifact_base_dir = get_dirs('')
return artifact_base_dir / 'crawl.csv'


@pytest.fixture(scope="package")
def server(welcome, load_help, artifact_dir):
run(['moin', 'index-build'])
@@ -140,7 +150,7 @@ def server(welcome, load_help, artifact_dir):
if not started:
logging.error('server not started. server.log:')
try:
with open(server_log.name) as f:
with open(get_crawl_server_log_path()) as f:
logging.error(f.read())
except IOError as e:
logging.error(f'{repr(e)} when trying to open server log')
@@ -150,8 +160,11 @@
@pytest.fixture(scope="package")
def do_crawl(request, artifact_dir):
moin_dir, artifact_base_dir = get_dirs('')
(artifact_base_dir / 'crawl.log').touch() # insure github workflow will have a file to archive
(artifact_base_dir / 'crawl.csv').touch()
# initialize output files
with open(get_crawl_log_path(), 'w'):
pass
with open(get_crawl_csv_path(), 'w'):
pass
server_started = True
crawl_success = True
if settings.SITE_HOST == '127.0.0.1:9080':
@@ -163,13 +176,17 @@ def do_crawl(request, artifact_dir):
os.chdir(moin_dir / 'src' / 'moin' / 'cli' / '_tests' / 'scrapy')
try:
com = ['scrapy', 'crawl', '-a', f'url={settings.CRAWL_START}', 'ref_checker']
with open(artifact_dir / 'crawl.log', 'wb') as crawl_log:
p = run(com, crawl_log, timeout=600)
with open(get_crawl_log_path(), 'wb') as crawl_log:
try:
p = run(com, crawl_log, timeout=600, env={'MOIN_SCRAPY_CRAWL_CSV': str(get_crawl_csv_path())})
except subprocess.TimeoutExpired as e:
crawl_log.write(f'\n{repr(e)}\n'.encode())
raise
if p.returncode != 0:
crawl_success = False
if not crawl_success:
logging.error('crawl failed. crawl.log:')
with open('crawl.log') as f:
with open(get_crawl_log_path()) as f:
logging.error(f.read())
finally:
os.chdir(artifact_dir)
@@ -184,15 +201,16 @@ def crawl_results(request, artifact_dir) -> List[CrawlResult]:
crawl_success = request.getfixturevalue('do_crawl')
if crawl_success:
try:
with open(artifact_base_dir / 'crawl.csv') as f:
logging.info(f'reading {get_crawl_csv_path()}')
with open(get_crawl_csv_path()) as f:
in_csv = csv.DictReader(f)
return [CrawlResult(**r) for r in in_csv]
return [CrawlResult(**r) for r in in_csv], crawl_success
except Exception as e:
crawl_success = False
logging.error(f'exception reading crawl.csv {repr(e)}')
if not crawl_success:
logging.error('crawl failed')
return []
return [], crawl_success


@pytest.fixture(scope="package")
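
Because ``crawl_results`` now returns a ``(results, success)`` pair instead of a bare list, consuming tests unpack it first; a hypothetical example::

    def test_crawl_succeeded(crawl_results):
        results, crawl_success = crawl_results
        assert crawl_success
        assert len(results) > 0
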
24 changes: 24 additions & 0 deletions src/moin/cli/_tests/data/MyPage-vblank.meta
@@ -0,0 +1,24 @@
{
"action": "SAVE",
"address": "127.0.0.1",
"comment": "",
"contenttype": "text/x.moin.wiki;charset=utf-8",
"dataid": "d6af14cd8edd4df6a992c5ac52dd78bf",
"externallinks": [],
"itemid": "b35958ca34f047b0924ba38ed652ce15",
"itemlinks": [],
"itemtransclusions": [],
"itemtype": "default",
"mtime": 1680488272,
"name": [
"MyPage"
],
"name_old": [],
"namespace": "",
"revid": "484e73725601407e9f9ab0bcaa151fb6",
"sha1": "487076f6c9eb3ce9cb18fd9800a62b35383a34ee",
"size": 16,
"summary": "",
"tags": [],
"wikiname": "MyMoinMoin"
}
24 changes: 24 additions & 0 deletions src/moin/cli/_tests/data/MyPage-vblank2.meta
@@ -0,0 +1,24 @@
{
"action": "SAVE",
"address": "127.0.0.1",
"comment": "",
"contenttype": "text/x.moin.wiki;charset=utf-8",
"dataid": "d6af14cd8edd4df6a992c5ac52dd78bf",
"externallinks": [],
"itemid": "b35958ca34f047b0924ba38ed652ce15",
"itemlinks": [],
"itemtransclusions": [],
"itemtype": "default",
"mtime": 1680488273,
"name": [
"MyPage"
],
"name_old": [],
"namespace": "",
"revid": "a8a8233bc8264216915ad3137ee6c20f",
"sha1": "487076f6c9eb3ce9cb18fd9800a62b35383a34ee",
"size": 16,
"summary": "",
"tags": [],
"wikiname": "MyMoinMoin"
}
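
Both fixture files above intentionally omit ``rev_number`` (and ``parentid``), which is exactly what the fix path has to reconstruct; a quick illustrative check with the stdlib, run from the repository root::

    import json

    with open('src/moin/cli/_tests/data/MyPage-vblank.meta') as f:
        meta = json.load(f)
    assert 'rev_number' not in meta
    assert 'parentid' not in meta
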
43 changes: 32 additions & 11 deletions src/moin/cli/_tests/scrapy/moincrawler/spiders/ref_checker.py
@@ -10,6 +10,8 @@

import csv
from dataclasses import fields, astuple
import os
from traceback import print_exc

import scrapy
from scrapy import signals
@@ -23,6 +25,7 @@
except ImportError:
from moin.cli._tests import default_settings as settings

from moin.cli._tests.conftest import get_crawl_csv_path
from moin.utils.iri import Iri
from moin import log

@@ -55,17 +58,26 @@ def from_crawler(cls, crawler, *args, **kwargs):
return spider

def spider_closed(self):
_, artifact_base_dir = get_dirs('')
for k, c in self.crawler.stats.get_stats().items(): # bubble up spider exceptions into test failures
if k.startswith('spider_exceptions'):
self.results.append(CrawlResult(response_exc=f'crawler stats: {k} = {c}'))
with open(artifact_base_dir / 'crawl.csv', 'w') as fh:
out_csv = csv.writer(fh, lineterminator='\n')
out_csv.writerow([f.name for f in fields(CrawlResult)])
for result in self.results:
out_csv.writerow(astuple(result))

def parse(self, response, **kwargs):
logging.info('entering spider_closed')
try:
_, artifact_base_dir = get_dirs('')
for k, c in self.crawler.stats.get_stats().items(): # bubble up spider exceptions into test failures
if k.startswith('spider_exceptions'):
logging.error(f'spider_exception: {c}')
self.results.append(CrawlResult(response_exc=f'crawler stats: {k} = {c}'))
crawl_csv_path = os.environ.get('MOIN_SCRAPY_CRAWL_CSV', get_crawl_csv_path())
logging.info(f'writing {len(self.results)} to {crawl_csv_path}')
with open(crawl_csv_path, 'w') as fh:
out_csv = csv.writer(fh, lineterminator='\n')
out_csv.writerow([f.name for f in fields(CrawlResult)])
for result in self.results:
out_csv.writerow(astuple(result))
except Exception as e: # noqa
logging.error(f'exception in spider_closed {repr(e)}')
print_exc()
raise

def _parse(self, response, **kwargs):
"""Main method that parses downloaded pages.

requests yielded from this method are added to the crawl queue"""
@@ -136,6 +148,15 @@ def parse(self, response, **kwargs):
request.meta['my_data'] = new_result
yield request

def parse(self, response, **kwargs):
"""called by scrapy framework"""
try:
yield from self._parse(response, **kwargs)
except Exception as e: # noqa
logging.error(f'parse exception : {repr(e)}')
print_exc()
raise

def errback(self, failure):
"""called when request comes back with anything other than a 200 OK response"""
if failure.value.__class__ is IgnoreRequest: # ignore urls disallowed by robots.txt
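
The CSV written by ``spider_closed`` can be read back with the same ``csv.DictReader`` pattern the ``crawl_results`` fixture uses; a short sketch that honours the ``MOIN_SCRAPY_CRAWL_CSV`` override::

    import csv
    import os

    from moin.cli._tests.conftest import get_crawl_csv_path

    crawl_csv_path = os.environ.get('MOIN_SCRAPY_CRAWL_CSV', get_crawl_csv_path())
    with open(crawl_csv_path) as f:
        rows = list(csv.DictReader(f))
    print(f'{len(rows)} crawl results read from {crawl_csv_path}')
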
72 changes: 57 additions & 15 deletions src/moin/cli/_tests/test_modify_item.py
@@ -9,7 +9,8 @@
from pathlib import Path

from moin._tests import get_dirs
from moin.cli._tests import run, assert_p_succcess, read_index_dump_latest_revs
from moin.cli._tests import run, assert_p_succcess, read_index_dump_latest_revs, read_index_dump
from moin.constants.keys import REVID, PARENTID, SIZE, REV_NUMBER, NAMES


def validate_meta(expected, actual, message):
@@ -172,35 +173,76 @@ def test_validate_metadata(index_create2):
assert_p_succcess(item_put)
validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v'])
assert_p_succcess(validate)
outlines = validate.stdout.splitlines()
outlines = validate.stdout.decode().splitlines()
assert 4 == len(outlines)
rev_id1 = b'7ed018d7ceda49409e18b8efb914f5ff' # Corrupt.meta
rev_id2 = b'0a2f1b476b6c42be80908b3b799df3fd' # Corrupt2.meta
rev_id3 = b'39c8fe8da0a048c0b7839bf8aa02cd04' # Corrupt3.meta
rev_id1 = '7ed018d7ceda49409e18b8efb914f5ff' # Corrupt.meta
rev_id2 = '0a2f1b476b6c42be80908b3b799df3fd' # Corrupt2.meta
rev_id3 = '39c8fe8da0a048c0b7839bf8aa02cd04' # Corrupt3.meta
rev_id4 = '484e73725601407e9f9ab0bcaa151fb6' # MyPage-v1.meta
rev_id5 = 'b0b07c407c3143aabc4d34aac1b1d303' # MyPage-v2.meta
outlines_by_rev_id = {}
for outline in outlines:
words = iter(outline.split())
for word in words:
if word == b'rev_id:':
if word == 'rev_id:':
outlines_by_rev_id[next(words)] = outline
break
assert {rev_id1, rev_id2, rev_id3} == set(outlines_by_rev_id.keys())
assert b'size_error name: Home item: cbd6fc46f88740acbc1dca90bb1eb8f3 rev_number: 1 rev_id: 7ed018d7ceda49409e18b8efb914f5ff '\
b'meta_size: 8 real_size: 11' == outlines_by_rev_id[rev_id1]
assert b'sha1_error name: Page2 item: 9999989aca5e45cc8683432f986a0e50 rev_number: 1 rev_id: 0a2f1b476b6c42be80908b3b799df3fd '\
b'meta_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197a0000 '\
b'real_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197afbfe' == outlines_by_rev_id[rev_id2]
assert b'parentid_error name: Page3 item: 3c7e36466726441faf6d7d266ac224e2 rev_number: 2 rev_id: 39c8fe8da0a048c0b7839bf8aa02cd04 '\
b'meta_parentid: 002e5210cc884010b0dd75a1c337032d correct_parentid: None meta_revision_number: 2 correct_revision_number: 1' \
assert 'size_error name: Home item: cbd6fc46f88740acbc1dca90bb1eb8f3 rev_number: 1 rev_id: 7ed018d7ceda49409e18b8efb914f5ff '\
'meta_size: 8 real_size: 11' == outlines_by_rev_id[rev_id1]
assert 'sha1_error name: Page2 item: 9999989aca5e45cc8683432f986a0e50 rev_number: 1 rev_id: 0a2f1b476b6c42be80908b3b799df3fd '\
'meta_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197a0000 '\
'real_sha1: 25ff6d28976a9e0feb97710a0c4b08ae197afbfe' == outlines_by_rev_id[rev_id2]
assert 'parentid_error name: Page3 item: 3c7e36466726441faf6d7d266ac224e2 rev_number: 2 rev_id: 39c8fe8da0a048c0b7839bf8aa02cd04 '\
'meta_parentid: 002e5210cc884010b0dd75a1c337032d correct_parentid: None meta_revision_number: 2' \
== outlines_by_rev_id[rev_id3]
assert b'3 items with invalid metadata found' == outlines[3]
assert '3 items with invalid metadata found' == outlines[3]
validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-f'])
assert_p_succcess(validate)
outlines = validate.stdout.splitlines()
assert 2 == len(outlines)
assert 1 == len(outlines)
assert b'3 items with invalid metadata found and fixed' == outlines[0]
validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v'])
assert_p_succcess(validate)
outlines = validate.stdout.splitlines()
assert 1 == len(outlines)
assert b'0 items with invalid metadata found' == outlines[0]
# validate index is updated
index_dump = run(['moin', 'index-dump', '--no-truncate'])
metas = {m[REVID]: m for m in read_index_dump(index_dump.stdout.decode())}
assert {rev_id1, rev_id2, rev_id3, rev_id4, rev_id5} == set(metas.keys())
assert 11 == metas[rev_id1][SIZE]
assert PARENTID not in metas[rev_id3]
# create a repeated revision_number
item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-v2.meta', '-d', data_dir / 'MyPage-v2.data'])
assert_p_succcess(item_put)
validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v', '-f'])
assert_p_succcess(validate)
outlines = validate.stdout.decode().splitlines()
assert '1 items with invalid metadata found and fixed' == outlines[-1]
assert 3 == len(outlines)
outlines_by_error = {}
for outline in outlines[0:2]:
words = outline.split()
outlines_by_error[words[0]] = outline
assert {'parentid_error', 'revision_number_error'} == set(outlines_by_error.keys())
index_dump = run(['moin', 'index-dump', '--no-truncate'])
rev_numbers = {m[REV_NUMBER]: m for m in read_index_dump(index_dump.stdout.decode()) if m[NAMES] == 'MyPage'}
assert {1, 2, 3} == set(rev_numbers.keys())
assert rev_numbers[1][REVID] == rev_id4
assert rev_numbers[2][REVID] == rev_id5


def test_validate_metadata_missing_rev_num(index_create2):
moin_dir, _ = get_dirs('')
data_dir = moin_dir / 'src' / 'moin' / 'cli' / '_tests' / 'data'
item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank.meta', '-d', data_dir / 'MyPage-v1.data', '-o'])
assert_p_succcess(item_put)
item_put = run(['moin', 'item-put', '-m', data_dir / 'MyPage-vblank2.meta', '-d', data_dir / 'MyPage-v1.data', '-o'])
assert_p_succcess(item_put)
validate = run(['moin', 'maint-validate-metadata', '-b', 'default', '-v', '-f'])
assert_p_succcess(validate)
index_dump = run(['moin', 'index-dump', '--no-truncate'])
print(index_dump.stdout.decode())
rev_numbers = {m[REV_NUMBER]: m for m in read_index_dump(index_dump.stdout.decode()) if m[NAMES] == 'MyPage'}
assert {1, 2} == set(rev_numbers.keys())