Skip to content

Commit 782885e

Browse files
authored
Fix issue 289, add function is_binary, add explicit support py 3.12 (#306)
Release 3.2.0
1 parent 1b0fb5c commit 782885e

File tree

13 files changed

+155
-14
lines changed

13 files changed

+155
-14
lines changed

.github/workflows/cd.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ permissions:
1212

1313
jobs:
1414
pre_flight_check:
15+
name: Preflight Checks
1516
uses: ./.github/workflows/ci.yml
1617

1718
universal-wheel:
@@ -127,7 +128,7 @@ jobs:
127128
id-token: write
128129
contents: write
129130
with:
130-
subject-base64: ${{ needs.checksum.outputs.hashes }}
131+
base64-subjects: ${{ needs.checksum.outputs.hashes }}
131132
upload-assets: true
132133

133134
deploy:

.github/workflows/ci.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ jobs:
174174
strategy:
175175
fail-fast: false
176176
matrix:
177-
python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]
177+
python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ] # , "3.12-dev"
178178
os: [ ubuntu-latest, macos-latest, windows-latest ]
179179
env:
180180
PYTHONIOENCODING: utf8 # only needed for Windows (console IO output encoding)

CHANGELOG.md

+10-1
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,19 @@
22
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
44

5-
## [3.1.1.dev0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-05-??)
5+
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
66

77
### Changed
88
- Typehint for function `from_path` no longer enforces `PathLike` as its first argument
9+
- Minor improvement over the global detection reliability
10+
11+
### Added
12+
- Introduce function `is_binary`, which relies on the main detection capabilities and is optimized to detect binaries
13+
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp`, allowing deeper control over the detection (default True)
14+
- Explicit support for Python 3.12
15+
16+
### Fixed
17+
- Edge case detection failure where a file would contain a 'very-long' camel-cased word (Issue #289)
918

1019
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
1120

charset_normalizer/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
"""
2222
import logging
2323

24-
from .api import from_bytes, from_fp, from_path
24+
from .api import from_bytes, from_fp, from_path, is_binary
2525
from .legacy import detect
2626
from .models import CharsetMatch, CharsetMatches
2727
from .utils import set_logging_handler
@@ -31,6 +31,7 @@
3131
"from_fp",
3232
"from_path",
3333
"from_bytes",
34+
"is_binary",
3435
"detect",
3536
"CharsetMatch",
3637
"CharsetMatches",

charset_normalizer/api.py

+74-2
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232

3333
def from_bytes(
34-
sequences: bytes,
34+
sequences: Union[bytes, bytearray],
3535
steps: int = 5,
3636
chunk_size: int = 512,
3737
threshold: float = 0.2,
@@ -40,6 +40,7 @@ def from_bytes(
4040
preemptive_behaviour: bool = True,
4141
explain: bool = False,
4242
language_threshold: float = 0.1,
43+
enable_fallback: bool = True,
4344
) -> CharsetMatches:
4445
"""
4546
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
@@ -361,7 +362,8 @@ def from_bytes(
361362
)
362363
# Preparing those fallbacks in case we got nothing.
363364
if (
364-
encoding_iana in ["ascii", "utf_8", specified_encoding]
365+
enable_fallback
366+
and encoding_iana in ["ascii", "utf_8", specified_encoding]
365367
and not lazy_str_hard_failure
366368
):
367369
fallback_entry = CharsetMatch(
@@ -507,6 +509,7 @@ def from_fp(
507509
preemptive_behaviour: bool = True,
508510
explain: bool = False,
509511
language_threshold: float = 0.1,
512+
enable_fallback: bool = True,
510513
) -> CharsetMatches:
511514
"""
512515
Same thing as the function from_bytes but using a file pointer that is already ready.
@@ -522,6 +525,7 @@ def from_fp(
522525
preemptive_behaviour,
523526
explain,
524527
language_threshold,
528+
enable_fallback,
525529
)
526530

527531

@@ -535,6 +539,7 @@ def from_path(
535539
preemptive_behaviour: bool = True,
536540
explain: bool = False,
537541
language_threshold: float = 0.1,
542+
enable_fallback: bool = True,
538543
) -> CharsetMatches:
539544
"""
540545
Same thing as the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
@@ -551,4 +556,71 @@ def from_path(
551556
preemptive_behaviour,
552557
explain,
553558
language_threshold,
559+
enable_fallback,
554560
)
561+
562+
563+
def is_binary(
564+
fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes], # type: ignore[type-arg]
565+
steps: int = 5,
566+
chunk_size: int = 512,
567+
threshold: float = 0.20,
568+
cp_isolation: Optional[List[str]] = None,
569+
cp_exclusion: Optional[List[str]] = None,
570+
preemptive_behaviour: bool = True,
571+
explain: bool = False,
572+
language_threshold: float = 0.1,
573+
enable_fallback: bool = False,
574+
) -> bool:
575+
"""
576+
Detect if the given input (file, bytes, or path) points to a binary file, a.k.a. not a string.
577+
Based on the same main heuristic algorithms and default kwargs, with the sole exception that fallback matches
578+
are disabled, to be stricter around content that is ASCII-compatible but unlikely to be a string.
579+
"""
580+
if isinstance(fp_or_path_or_payload, (str, PathLike)):
581+
guesses = from_path(
582+
fp_or_path_or_payload,
583+
steps=steps,
584+
chunk_size=chunk_size,
585+
threshold=threshold,
586+
cp_isolation=cp_isolation,
587+
cp_exclusion=cp_exclusion,
588+
preemptive_behaviour=preemptive_behaviour,
589+
explain=explain,
590+
language_threshold=language_threshold,
591+
enable_fallback=enable_fallback,
592+
)
593+
elif isinstance(
594+
fp_or_path_or_payload,
595+
(
596+
bytes,
597+
bytearray,
598+
),
599+
):
600+
guesses = from_bytes(
601+
fp_or_path_or_payload,
602+
steps=steps,
603+
chunk_size=chunk_size,
604+
threshold=threshold,
605+
cp_isolation=cp_isolation,
606+
cp_exclusion=cp_exclusion,
607+
preemptive_behaviour=preemptive_behaviour,
608+
explain=explain,
609+
language_threshold=language_threshold,
610+
enable_fallback=enable_fallback,
611+
)
612+
else:
613+
guesses = from_fp(
614+
fp_or_path_or_payload,
615+
steps=steps,
616+
chunk_size=chunk_size,
617+
threshold=threshold,
618+
cp_isolation=cp_isolation,
619+
cp_exclusion=cp_exclusion,
620+
preemptive_behaviour=preemptive_behaviour,
621+
explain=explain,
622+
language_threshold=language_threshold,
623+
enable_fallback=enable_fallback,
624+
)
625+
626+
return not guesses

charset_normalizer/md.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -294,14 +294,25 @@ def feed(self, character: str) -> None:
294294
if buffer_length >= 4:
295295
if self._buffer_accent_count / buffer_length > 0.34:
296296
self._is_current_word_bad = True
297-
# Word/Buffer ending with a upper case accentuated letter are so rare,
297+
# Word/Buffer ending with an upper case accentuated letter are so rare,
298298
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
299299
if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
300300
self._foreign_long_count += 1
301301
self._is_current_word_bad = True
302302
if buffer_length >= 24 and self._foreign_long_watch:
303-
self._foreign_long_count += 1
304-
self._is_current_word_bad = True
303+
camel_case_dst = [
304+
i
305+
for c, i in zip(self._buffer, range(0, buffer_length))
306+
if c.isupper()
307+
]
308+
probable_camel_cased: bool = False
309+
310+
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
311+
probable_camel_cased = True
312+
313+
if not probable_camel_cased:
314+
self._foreign_long_count += 1
315+
self._is_current_word_bad = True
305316

306317
if self._is_current_word_bad:
307318
self._bad_word_count += 1

charset_normalizer/utils.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,12 +120,12 @@ def is_emoticon(character: str) -> bool:
120120

121121
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
122122
def is_separator(character: str) -> bool:
123-
if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
123+
if character.isspace() or character in {"|", "+", "<", ">"}:
124124
return True
125125

126126
character_category: str = unicodedata.category(character)
127127

128-
return "Z" in character_category
128+
return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
129129

130130

131131
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)

charset_normalizer/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
Expose version
33
"""
44

5-
__version__ = "3.1.1.dev0"
5+
__version__ = "3.2.0"
66
VERSION = __version__.split(".")

docs/api.rst

+1-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Those functions are publicly exposed and are protected through our BC guarantee.
1313
.. autofunction:: from_bytes
1414
.. autofunction:: from_fp
1515
.. autofunction:: from_path
16+
.. autofunction:: is_binary
1617

1718
.. autoclass:: charset_normalizer.models.CharsetMatches
1819
:inherited-members:
@@ -100,5 +101,3 @@ Some reusable functions used across the project. We do not guarantee the BC in t
100101

101102

102103
.. class:: os.PathLike
103-
104-
Used as a generic way to accept AnyStr for paths.

docs/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ Features
5151
- Transpose any encoded content to Unicode the best we can.
5252
- Detect spoken language in text.
5353
- Ship with a great CLI.
54+
- Also, detect binaries.
5455

5556
Start Guide
5657
-----------

docs/user/miscellaneous.rst

+18
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,21 @@ On `DEBUG` only one entry will be observed and that is about the detection resul
4444

4545
Then regarding the others log entries, they will be pushed as `Level 5`. Commonly known as TRACE level, but we do
4646
not register it globally.
47+
48+
49+
Detect binaries
50+
---------------
51+
52+
This package offers a neat way to detect files that can be considered 'binaries',
53+
meaning that they are not likely to be text files.
54+
55+
::
56+
57+
from charset_normalizer import is_binary
58+
59+
# It can receive a path, bytes, or even a file pointer.
60+
result = is_binary("./my-file.ext")
61+
62+
# This should print 'True' or 'False'
63+
print(result)
64+

setup.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ classifiers =
2424
Programming Language :: Python :: 3.9
2525
Programming Language :: Python :: 3.10
2626
Programming Language :: Python :: 3.11
27+
Programming Language :: Python :: 3.12
2728
Programming Language :: Python :: Implementation :: PyPy
2829
Topic :: Text Processing :: Linguistic
2930
Topic :: Utilities

tests/test_isbinary.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import pytest
2+
import typing
3+
from io import BytesIO
4+
from base64 import b64decode
5+
from charset_normalizer import is_binary
6+
from os import path, pardir
7+
8+
DIR_PATH = path.join(
9+
path.dirname(path.realpath(__file__)),
10+
pardir
11+
)
12+
13+
14+
@pytest.mark.parametrize(
15+
"raw, expected",
16+
[
17+
(b'\x00\x5f\x2f\xff'*50, True),
18+
(b64decode("R0lGODlhAQABAAAAACw="), True),
19+
(BytesIO(b64decode("R0lGODlhAQABAAAAACw=")), True),
20+
('sample-polish.txt', False),
21+
('sample-arabic.txt', False)
22+
]
23+
)
24+
def test_isbinary(raw: typing.Union[bytes, typing.BinaryIO, str], expected: bool) -> None:
25+
if isinstance(raw, str):
26+
raw = DIR_PATH + "/data/{}".format(raw)
27+
28+
assert is_binary(raw) is expected

0 commit comments

Comments
 (0)