Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not parse dates as prices. Sort imports. #19

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
45 changes: 40 additions & 5 deletions price_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# -*- coding: utf-8 -*-

import re
import string
from typing import Callable, Optional, Pattern, List, Tuple
from datetime import datetime
from decimal import Decimal, InvalidOperation
from typing import Callable, List, Optional, Pattern, Tuple

import attr

from ._currencies import (CURRENCY_CODES, CURRENCY_NATIONAL_SYMBOLS,
CURRENCY_SYMBOLS)

Expand Down Expand Up @@ -73,7 +76,7 @@ def or_regex(symbols: List[str]) -> Pattern:

# unique currency symbols
'$', '€', '£', 'zł', 'Zł', 'Kč', '₽', '¥', '¥',
'฿', 'դր.', 'դր', '₦', '₴', '₱', '৳', '₭', '₪', '﷼', '៛', '₩', '₫', '₡',
'฿', 'դր.', 'դր', '₦', '₴', '₱', '৳', '₭', '₪', '﷼', '៛', '₩', '₫', '₡',
'টকা', 'ƒ', '₲', '؋', '₮', 'नेरू', '₨',
'₶', '₾', '֏', 'ރ', '৲', '૱', '௹', '₠', '₢', '₣', '₤', '₧', '₯',
'₰', '₳', '₷', '₸', '₹', '₺', '₼', '₾', '₿', 'ℳ',
Expand All @@ -86,7 +89,7 @@ def or_regex(symbols: List[str]) -> Pattern:

# other common symbols, which we consider unambiguous
'EUR', 'euro', 'eur', 'CHF', 'DKK', 'Rp', 'lei',
'руб.', 'руб', 'грн.', 'грн', 'дин.', 'Dinara', 'динар', 'лв.', 'лв',
'руб.', 'руб', 'грн.', 'грн', 'дин.', 'Dinara', 'динар', 'лв.', 'лв',
'р.', 'тңг', 'тңг.', 'ман.',
]

Expand Down Expand Up @@ -143,8 +146,8 @@ def extract_currency_symbol(price: Optional[str],
if price and '$' in price:
methods.insert(0, (_search_dollar_code, price))

for meth, attr in methods:
m = meth(attr) if attr else None
for meth, attrib in methods:
m = meth(attrib) if attrib else None
if m:
return m.group(0)

Expand Down Expand Up @@ -184,6 +187,12 @@ def extract_price_text(price: str) -> Optional[str]:
>>> extract_price_text("50")
'50'
"""

if date_format(price):
return None

price = strip_date(price)

if price.count('€') == 1:
m = re.search(r"""
[\d\s.,]*?\d # number, probably with thousand separators
Expand Down Expand Up @@ -292,3 +301,29 @@ def parse_number(num: str,
return Decimal(num)
except InvalidOperation:
return None


def date_format(price) -> Optional[datetime]:
    """Parse ``price`` as a date when the whole value is nothing but a date.

    The value is tried against each supported layout in turn:
    ``DD.MM.YYYY``, ``<full month name>, YYYY``,
    ``<abbreviated month name>, YYYY`` and ``YYYY-MM-DD``.

    :param price: candidate text; anything that is not a ``str`` is
        treated as "not a date".
    :return: the parsed ``datetime`` for the first layout that matches,
        or ``None`` when no layout matches.
    """
    if not isinstance(price, str):
        # Numbers, None, bytes, ... can never be a textual date.
        return None

    # strptime() rejects leading/trailing characters, so a date padded
    # with whitespace would otherwise be missed and parsed as a price.
    candidate = price.strip()

    # NOTE: %B / %b match month names of the active locale.
    for fmt in ('%d.%m.%Y', '%B, %Y', '%b, %Y', '%Y-%m-%d'):
        try:
            return datetime.strptime(candidate, fmt)
        except ValueError:
            # Layout does not fit; try the next one.
            continue
    return None


def strip_date(text: str) -> str:
    """Return ``text`` with embedded dates removed.

    Whitespace runs are first collapsed to single spaces. Every fragment
    that looks like a date is then validated with ``date_format`` and,
    only if it really parses as a date, removed from the text. Fragments
    that merely resemble a date (e.g. ``1-08-19``) are left untouched.

    :param text: raw price text that may contain dates.
    :return: the whitespace-normalized text without the recognised dates.
    """
    # normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    all_date_regexp = [
        # numeric dates such as 2019-08-19
        r'\d{1,4}-\d{1,2}-\d{2,4}',
        # "<month>, <year>": the name may be up to 9 letters long
        # ("September"), and the date may open the text instead of
        # following a space.
        r'(?:^| )\S{3,9},\s\d{4}',
    ]

    text_processed = text
    for regexp in all_date_regexp:
        # Candidates are searched in the normalized input, while removals
        # accumulate in text_processed.
        for match in re.finditer(regexp, text):
            fragment = match.group(0)
            # The fragment may carry the leading space matched above;
            # validate only the date itself.
            if date_format(fragment.strip()):
                text_processed = text_processed.replace(fragment, '')

    return text_processed
60 changes: 52 additions & 8 deletions tests/test_price_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
we've found in the wild; PRICE_PARSING_EXAMPLES_NEW is a list of tests for
new features. New tests should probably go into these two lists.
"""
from typing import Optional, Union
from datetime import datetime
from decimal import Decimal
from typing import Optional, Union

import pytest

from price_parser import Price
from price_parser.parser import date_format, strip_date


class Example(Price):
Expand Down Expand Up @@ -63,6 +65,22 @@ def __eq__(self, other):
'GBP', '29.1583', 29.1583),
Example(None, '1.11000000000000009770',
None, '1.11000000000000009770', Decimal('1.11000000000000009770')),

# dates
Example(None, 'July, 2004',
None, None, None),
Example(None, '15.08.2017',
None, None, None),
Example(None, '0€ until May, 2005, 35€ afterwards',
'€', '0', 0),
Example(None, '2019-08-19: 22 USD',
'USD', '22', 22),
Example(None, '2105 EUR at July, 2004',
'EUR', '2105', 2105),
Example(None, '$10 EUR during March, 2016',
'$', '10', 10),
Example(None, '$10 EUR at March, 2016 or 2019-08-19',
'$', '10', 10),
]


Expand Down Expand Up @@ -1939,13 +1957,6 @@ def __eq__(self, other):
Example('Купить', 'Печная труба',
None, None, None),

# dates
Example(None, 'July, 2004',
None, None, None),

Example(None, '15.08.2017',
None, None, None),

# other incorrectly extracted prices
Example('8.5', '25-09',
None, None, None),
Expand Down Expand Up @@ -2018,3 +2029,36 @@ def test_price_decimal_separator(price_raw, decimal_separator, expected_result):
decimal_separator=decimal_separator
)
assert parsed.amount == expected_result


@pytest.mark.parametrize(
    "price, result",
    [
        # one case per layout accepted by date_format()
        ('10.04.2004', datetime(2004, 4, 10, 0, 0)),
        ('July, 2004', datetime(2004, 7, 1, 0, 0)),
        ('Jul, 2004', datetime(2004, 7, 1, 0, 0)),
        ('2019-08-19', datetime(2019, 8, 19, 0, 0)),
        # plain numbers are prices, not dates
        ('200', None),
        ('2004', None),
        # non-string inputs must be rejected without raising
        (2004, None),
        (10.2014, None),
        (None, None),
    ]
)
def test_date_format(price, result):
    """date_format() returns the parsed datetime, or None for non-dates."""
    assert date_format(price) == result


# Pairs of (input text, text expected after recognised dates are removed).
STRIP_DATE_CASES = [
    ('0€ until May, 2005, 35€ afterwards', '0€ until, 35€ afterwards'),
    ('2019-08-19: 22 USD', ': 22 USD'),
    ('105 EUR at July, 2004', '105 EUR at'),
    ('$10 EUR during March, 2016', '$10 EUR during'),
    ('$10 EUR during March, 2016 -- March, 2020', '$10 EUR during --'),
    ('$10', '$10'),
    ('sample text', 'sample text'),
    ('$10 - 1-08-19', '$10 - 1-08-19'),
]


@pytest.mark.parametrize("price, result", STRIP_DATE_CASES)
def test_strip_date(price, result):
    """strip_date() removes valid dates and leaves everything else as is."""
    stripped = strip_date(price)
    assert stripped == result