Skip to content

Commit

Permalink
Merge branch 'master' of github.com:Wikidata/soweego into values-for-catalogs
Browse files (browse the repository at this point in the history)
  • Loading branch information
marfox committed Aug 23, 2021
2 parents 2a8811a + 9ed8b4e commit 20d0da6
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 26 deletions.
28 changes: 9 additions & 19 deletions soweego/commons/url_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@

__author__ = 'Marco Fossati'
__email__ = '[email protected]'
__version__ = '1.0'
__version__ = '2.0'
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2018, Hjfocs'
__copyright__ = 'Copyleft 2021, Hjfocs'

import logging
import re
from functools import lru_cache
from typing import Union
from typing import Optional
from urllib.parse import unquote, urlsplit

import regex
Expand Down Expand Up @@ -101,7 +101,7 @@ def validate(url):


@lru_cache()
def resolve(url: str) -> Union[str, None]:
def resolve(url: str) -> Optional[str]:
"""Try to resolve an URL via a set of strategies.
:param url: an URL
Expand Down Expand Up @@ -129,37 +129,27 @@ def resolve(url: str) -> Union[str, None]:
response = get(url, headers=browser_ua, stream=True, verify=False)
except Exception as unexpected_error:
LOGGER.warning(
'Unexpected error: <%s> - Reason: %s',
url,
unexpected_error,
'Unexpected error: <%s> - Reason: %s', url, unexpected_error,
)
return None
except requests.exceptions.Timeout as timeout:
LOGGER.info(
'Request timeout: <%s> - Reason: %s',
url,
timeout,
'Request timeout: <%s> - Reason: %s', url, timeout,
)
return None
except requests.exceptions.TooManyRedirects as too_many_redirects:
LOGGER.info(
'Too many redirects: <%s> - %s',
url,
too_many_redirects,
'Too many redirects: <%s> - %s', url, too_many_redirects,
)
return None
except requests.exceptions.ConnectionError as connection_error:
LOGGER.info(
'Aborted connection: <%s> - Reason: %s',
url,
connection_error,
'Aborted connection: <%s> - Reason: %s', url, connection_error,
)
return None
except Exception as unexpected_error:
LOGGER.warning(
'Unexpected error: <%s> - Reason: %s',
url,
unexpected_error,
'Unexpected error: <%s> - Reason: %s', url, unexpected_error,
)
return None
if not response.ok:
Expand Down
20 changes: 13 additions & 7 deletions soweego/importer/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
}
ROTTEN_URLS_FNAME = '{catalog}_{entity}_rotten_urls.txt'


@click.command()
@click.argument(
'catalog', type=click.Choice(target_database.supported_targets())
Expand Down Expand Up @@ -70,10 +71,7 @@ def _resolve_url(res):
'catalog', type=click.Choice(target_database.supported_targets())
)
@click.option(
'-d',
'--drop',
is_flag=True,
help=f'Drop rotten URLs from the DB.',
'-d', '--drop', is_flag=True, help=f'Drop rotten URLs from the DB.',
)
@click.option(
'--dir-io',
Expand All @@ -97,7 +95,8 @@ def check_urls_cli(catalog, drop, dir_io):
if not link_entity:
LOGGER.info(
'%s %s does not have a links table. Skipping ...',
catalog, entity
catalog,
entity,
)
continue

Expand Down Expand Up @@ -138,12 +137,19 @@ def check_urls_cli(catalog, drop, dir_io):

LOGGER.info(
"Total %s %s rotten URLs dumped to '%s': %d / %d",
catalog, entity, out_path, rotten, total
catalog,
entity,
out_path,
rotten,
total,
)
if drop:
LOGGER.info(
'Total %s %s rotten URLs dropped from the DB: %d / %d',
catalog, entity, rotten, removed
catalog,
entity,
rotten,
removed,
)


Expand Down

0 comments on commit 20d0da6

Please sign in to comment.