Skip to content

Commit

Permalink
Merge pull request #533 from jawah/preemptive-patch
Browse files Browse the repository at this point in the history
fix html file is not reported as UTF8 after conversion
  • Loading branch information
Ousret authored Sep 28, 2024
2 parents 39b6f5c + bf920e1 commit 957bd6a
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 13 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-09-??)

### Added
- Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.

### Fixed
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything other than a CharsetMatch.
- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407)
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)

## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)

Expand Down
9 changes: 8 additions & 1 deletion charset_normalizer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,13 @@ def from_bytes(
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences, encoding_iana, threshold, False, [], decoded_payload
sequences,
encoding_iana,
threshold,
False,
[],
decoded_payload,
preemptive_declaration=specified_encoding,
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
Expand Down Expand Up @@ -433,6 +439,7 @@ def from_bytes(
bom_or_sig_available,
cd_ratios_merged,
decoded_payload,
preemptive_declaration=specified_encoding,
)
)

Expand Down
36 changes: 30 additions & 6 deletions charset_normalizer/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-i",
"--no-preemptive",
action="store_true",
default=False,
dest="no_preemptive",
help="Disable looking at a charset declaration to hint the detector.",
)
parser.add_argument(
"-t",
"--threshold",
Expand All @@ -133,31 +141,47 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
args = parser.parse_args(argv)

if args.replace is True and args.normalize is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1

if args.force is True and args.replace is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1

if args.threshold < 0.0 or args.threshold > 1.0:
if args.files:
for my_file in args.files:
my_file.close()
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1

x_ = []

for my_file in args.files:
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
matches = from_fp(
my_file,
threshold=args.threshold,
explain=args.verbose,
preemptive_behaviour=args.no_preemptive is False,
)

best_guess = matches.best()

if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else "",
(
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else ""
),
),
file=sys.stderr,
)
Expand Down Expand Up @@ -258,8 +282,8 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))

with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
fp.write(str(best_guess))
with open(x_[0].unicode_path, "wb") as fp:
fp.write(best_guess.output())
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
Expand Down
25 changes: 23 additions & 2 deletions charset_normalizer/models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

from .constant import TOO_BIG_SEQUENCE
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range


Expand All @@ -16,6 +17,7 @@ def __init__(
has_sig_or_bom: bool,
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
preemptive_declaration: Optional[str] = None,
):
self._payload: bytes = payload

Expand All @@ -33,6 +35,8 @@ def __init__(

self._string: Optional[str] = decoded_payload

self._preemptive_declaration: Optional[str] = preemptive_declaration

def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
if isinstance(other, str):
Expand Down Expand Up @@ -208,7 +212,24 @@ def output(self, encoding: str = "utf_8") -> bytes:
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
self._output_payload = str(self).encode(encoding, "replace")
decoded_string = str(self)
if (
self._preemptive_declaration is not None
and self._preemptive_declaration.lower()
not in ["utf-8", "utf8", "utf_8"]
):
patched_header = sub(
RE_POSSIBLE_ENCODING_INDICATION,
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
m.groups()[0], iana_name(self._output_encoding) # type: ignore[arg-type]
),
decoded_string[:8192],
1,
)

decoded_string = patched_header + decoded_string[8192:]

self._output_payload = decoded_string.encode(encoding, "replace")

return self._output_payload # type: ignore

Expand Down
5 changes: 1 addition & 4 deletions docs/community/featured.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,7 @@ your level or opinions.
Niquests
--------

Started as a simple though..

.. image:: https://i.imgflip.com/7xet0f.jpg
:width: 200
Started as a simple thought.. IE 11 has built-in HTTP/2 support while Requests 2.32 does not!

Most of our programs that interact with HTTP server are built with ``requests`` and
we aren't likely to switch without a substantial effort.
Expand Down
30 changes: 30 additions & 0 deletions tests/test_edge_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,33 @@ def test_unicode_edge_case():

assert best_guess is not None, "Payload should have given something, detection failure"
assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected"


def test_issue_gh520():
    """Regression test for GH-520.

    A mostly-ASCII path containing two non-ASCII bytes must still report
    "Basic Latin" among the alphabets of the best match.
    """
    raw_bytes = b"/includes/webform.compon\xd2\xaants.inc/"

    top_match = from_bytes(raw_bytes).best()

    assert top_match is not None, "Payload should have given something, detection failure"
    assert "Basic Latin" in top_match.alphabets


def test_issue_gh509():
    """Regression test for GH-509.

    A payload made of two common ASCII punctuation characters must be
    detected as plain "ascii".
    """
    raw_bytes = b");"

    top_match = from_bytes(raw_bytes).best()

    assert top_match is not None, "Payload should have given something, detection failure"
    assert "ascii" == top_match.encoding


def test_issue_gh498():
    """Regression test for GH-498.

    This payload used to be mistaken for utf-16-le; the best match must
    report "Cyrillic" among its alphabets.
    """
    raw_bytes = b'\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx'

    top_match = from_bytes(raw_bytes).best()

    assert top_match is not None, "Payload should have given something, detection failure"
    assert "Cyrillic" in top_match.alphabets
40 changes: 40 additions & 0 deletions tests/test_preemptive_detection.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from charset_normalizer.utils import any_specified_encoding
from charset_normalizer import CharsetMatch


@pytest.mark.parametrize(
Expand All @@ -24,3 +25,42 @@ def test_detect_most_common_body_encoding(payload, expected_encoding):
)

assert specified_encoding == expected_encoding, "Unable to determine properly encoding from given body"


@pytest.mark.parametrize(
    "payload, expected_outcome",
    [
        (b'<?xml version="1.0" encoding="EUC-JP"?>', b'<?xml version="1.0" encoding="utf_8"?>'),
        (b'<html><head><meta charset="utf-8"></head></html>', b'<html><head><meta charset="utf-8"></head></html>'),
        (b'<html><head><meta charset="utf-57"></head></html>', b'<html><head><meta charset="utf-57"></head></html>'),
        (b'# coding: utf-8', b'# coding: utf-8'),
        (b'<?xml version="1.0" encoding="UTF-8"?>', b'<?xml version="1.0" encoding="UTF-8"?>'),
        (b'<?xml version="1.0" encoding="US-ASCII"?>', b'<?xml version="1.0" encoding="utf_8"?>'),
        (b'<?xml version="1.0" encoding="JohaB"?>', b'<?xml version="1.0" encoding="utf_8"?>'),
        (b'<html><head><meta charset=WINDOWS-1252></head></html>', b'<html><head><meta charset=utf_8></head></html>'),
        (b'<html><head><meta charset="WINDOWS-1256"></head></html>', b'<html><head><meta charset="utf_8"></head></html>'),
    ]
)
def test_preemptive_mark_replacement(payload, expected_outcome):
    """
    Check that the bytes produced by ``CharsetMatch.output()`` match the
    expected outcome for each payload carrying a charset declaration.

    The declaration found in the payload (if any) is passed to the match as
    its preemptive declaration; when none is found, "utf-8" is used as the
    match encoding.
    """
    declared = any_specified_encoding(payload)

    # Fall back to UTF-8 when the payload does not carry a usable declaration.
    if declared is None:
        match_encoding = "utf-8"
    else:
        match_encoding = declared

    match = CharsetMatch(
        payload,
        match_encoding,
        0.,
        False,
        [],
        preemptive_declaration=declared,
    )

    produced = match.output()

    assert produced == expected_outcome

0 comments on commit 957bd6a

Please sign in to comment.