-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #423 from Wikidata/rotten-urls
Rotten URLs for catalog providers
- Loading branch information
Showing 3 changed files with 93 additions and 56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,13 +5,14 @@ | |
|
||
__author__ = 'Marco Fossati' | ||
__email__ = '[email protected]' | ||
__version__ = '1.0' | ||
__version__ = '2.0' | ||
__license__ = 'GPL-3.0' | ||
__copyright__ = 'Copyleft 2018, Hjfocs' | ||
__copyright__ = 'Copyleft 2021, Hjfocs' | ||
|
||
import logging | ||
import re | ||
from functools import lru_cache | ||
from typing import Optional | ||
from urllib.parse import unquote, urlsplit | ||
|
||
import regex | ||
|
@@ -100,7 +101,13 @@ def validate(url): | |
|
||
|
||
@lru_cache() | ||
def resolve(url): | ||
def resolve(url: str) -> Optional[str]: | ||
"""Try to resolve an URL via a set of strategies. | ||
:param url: an URL | ||
:return: the resolved URL (may differ from the given one), or ``None`` | ||
if the resolution attempt failed | ||
""" | ||
# Don't show warnings in case of unverified HTTPS requests | ||
disable_warnings(InsecureRequestWarning) | ||
# Some Web sites return 4xx just because of a non-browser user agent header | ||
|
@@ -122,42 +129,32 @@ def resolve(url): | |
response = get(url, headers=browser_ua, stream=True, verify=False) | ||
except Exception as unexpected_error: | ||
LOGGER.warning( | ||
'Dropping URL that led to an unexpected error: <%s> - Reason: %s', | ||
url, | ||
unexpected_error, | ||
'Unexpected error: <%s> - Reason: %s', url, unexpected_error, | ||
) | ||
return None | ||
except requests.exceptions.Timeout as timeout: | ||
LOGGER.info( | ||
'Dropping URL that led to a request timeout: <%s> - Reason: %s', | ||
url, | ||
timeout, | ||
'Request timeout: <%s> - Reason: %s', url, timeout, | ||
) | ||
return None | ||
except requests.exceptions.TooManyRedirects as too_many_redirects: | ||
LOGGER.info( | ||
'Dropping URL because of too many redirects: <%s> - %s', | ||
url, | ||
too_many_redirects, | ||
'Too many redirects: <%s> - %s', url, too_many_redirects, | ||
) | ||
return None | ||
except requests.exceptions.ConnectionError as connection_error: | ||
LOGGER.info( | ||
'Dropping URL that led to an aborted connection: <%s> - Reason: %s', | ||
url, | ||
connection_error, | ||
'Aborted connection: <%s> - Reason: %s', url, connection_error, | ||
) | ||
return None | ||
except Exception as unexpected_error: | ||
LOGGER.warning( | ||
'Dropping URL that led to an unexpected error: <%s> - Reason: %s', | ||
url, | ||
unexpected_error, | ||
'Unexpected error: <%s> - Reason: %s', url, unexpected_error, | ||
) | ||
return None | ||
if not response.ok: | ||
LOGGER.info( | ||
"Dropping dead URL that returned HTTP status '%s' (%d): <%s>", | ||
"HTTP status '%s' (%d): <%s>", | ||
response.reason, | ||
response.status_code, | ||
url, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,15 +5,15 @@ | |
|
||
__author__ = 'Marco Fossati' | ||
__email__ = '[email protected]' | ||
__version__ = '1.0' | ||
__version__ = '2.0' | ||
__license__ = 'GPL-3.0' | ||
__copyright__ = 'Copyleft 2018, Hjfocs' | ||
__copyright__ = 'Copyleft 2021, Hjfocs' | ||
|
||
import click | ||
|
||
from soweego.importer.importer import check_links_cli, import_cli | ||
from soweego.importer.importer import check_urls_cli, import_cli | ||
|
||
CLI_COMMANDS = {'import': import_cli, 'check_urls': check_links_cli} | ||
CLI_COMMANDS = {'import': import_cli, 'check_urls': check_urls_cli} | ||
|
||
|
||
@click.group(name='importer', commands=CLI_COMMANDS) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,11 +3,11 @@ | |
|
||
"""Download, extract, and import a supported catalog.""" | ||
|
||
__author__ = 'Massimo Frasson' | ||
__email__ = '[email protected]' | ||
__version__ = '1.0' | ||
__author__ = 'Massimo Frasson, Marco Fossati' | ||
__email__ = '[email protected], [email protected]' | ||
__version__ = '2.0' | ||
__license__ = 'GPL-3.0' | ||
__copyright__ = 'Copyleft 2018, MaxFrax96' | ||
__copyright__ = 'Copyleft 2018-2021, MaxFrax96, Hjfocs' | ||
|
||
import datetime | ||
import logging | ||
|
@@ -33,6 +33,7 @@ | |
keys.IMDB: IMDbDumpExtractor, | ||
keys.MUSICBRAINZ: MusicBrainzDumpExtractor, | ||
} | ||
ROTTEN_URLS_FNAME = '{catalog}_{entity}_rotten_urls.txt' | ||
|
||
|
||
@click.command() | ||
|
@@ -48,7 +49,6 @@ | |
), | ||
) | ||
@click.option( | ||
'-d', | ||
'--dir-io', | ||
type=click.Path(file_okay=False), | ||
default=constants.SHARED_FOLDER, | ||
|
@@ -70,47 +70,87 @@ def _resolve_url(res): | |
@click.argument( | ||
'catalog', type=click.Choice(target_database.supported_targets()) | ||
) | ||
def check_links_cli(catalog: str): | ||
"""Check for rotten URLs of an imported catalog.""" | ||
for entity_type in target_database.supported_entities_for_target(catalog): | ||
@click.option( | ||
'-d', '--drop', is_flag=True, help=f'Drop rotten URLs from the DB.', | ||
) | ||
@click.option( | ||
'--dir-io', | ||
type=click.Path(file_okay=False), | ||
default=constants.SHARED_FOLDER, | ||
help=f'Input/output directory, default: {constants.SHARED_FOLDER}.', | ||
) | ||
def check_urls_cli(catalog, drop, dir_io): | ||
"""Check for rotten URLs of an imported catalog. | ||
For every catalog entity, dump a text file with rotten URLs, one per line. | ||
Use '-d' to drop rotten URLs from the DB on the fly. | ||
""" | ||
for entity in target_database.supported_entities_for_target(catalog): | ||
out_path = os.path.join( | ||
dir_io, ROTTEN_URLS_FNAME.format(catalog=catalog, entity=entity) | ||
) | ||
|
||
LOGGER.info("Validating %s %s links...", catalog, entity_type) | ||
entity = target_database.get_link_entity(catalog, entity_type) | ||
if not entity: | ||
LOGGER.info('Starting check of %s %s URLs ...', catalog, entity) | ||
link_entity = target_database.get_link_entity(catalog, entity) | ||
if not link_entity: | ||
LOGGER.info( | ||
"%s %s does not have a links table. Skipping...", | ||
'%s %s does not have a links table. Skipping ...', | ||
catalog, | ||
entity_type, | ||
entity, | ||
) | ||
continue | ||
|
||
session = DBManager.connect_to_db() | ||
total = session.query(entity).count() | ||
removed = 0 | ||
query_session = DBManager.connect_to_db() | ||
total = query_session.query(link_entity).count() | ||
|
||
with Pool() as pool: | ||
# Validate each link | ||
rotten = 0 | ||
if drop: | ||
removed = 0 | ||
|
||
# Parallel operation | ||
with Pool() as pool, open(out_path, 'w', buffering=1) as fout: | ||
# Try to resolve every URL | ||
for resolved, res_entity in tqdm( | ||
pool.imap_unordered(_resolve_url, session.query(entity)), | ||
pool.imap_unordered( | ||
_resolve_url, query_session.query(link_entity) | ||
), | ||
total=total, | ||
): | ||
if not resolved: | ||
session_delete = DBManager.connect_to_db() | ||
# if not valid delete | ||
session_delete.delete(res_entity) | ||
try: | ||
session_delete.commit() | ||
removed += 1 | ||
except: | ||
session.rollback() | ||
raise | ||
finally: | ||
session_delete.close() | ||
|
||
session.close() | ||
# Dump | ||
fout.write(res_entity.url + '\n') | ||
rotten += 1 | ||
|
||
# Drop from DB | ||
if drop: | ||
delete_session = DBManager.connect_to_db() | ||
delete_session.delete(res_entity) | ||
try: | ||
delete_session.commit() | ||
removed += 1 | ||
except: | ||
delete_session.rollback() | ||
raise | ||
finally: | ||
delete_session.close() | ||
query_session.close() | ||
|
||
LOGGER.info( | ||
"Removed %s/%s from %s %s", removed, total, catalog, entity_type | ||
"Total %s %s rotten URLs dumped to '%s': %d / %d", | ||
catalog, | ||
entity, | ||
out_path, | ||
rotten, | ||
total, | ||
) | ||
if drop: | ||
LOGGER.info( | ||
'Total %s %s rotten URLs dropped from the DB: %d / %d', | ||
catalog, | ||
entity, | ||
rotten, | ||
removed, | ||
) | ||
|
||
|
||
class Importer: | ||
|