From cf2cdf29ba29a93852166d71f33e4ac71767fc65 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Thu, 24 Jul 2025 16:56:02 +0200 Subject: [PATCH] Update `html` from 3.13.5 --- Lib/html/__init__.py | 2 +- Lib/html/entities.py | 9 ++- Lib/html/parser.py | 29 +++++-- Lib/test/test_htmlparser.py | 148 ++++++++++++++++++++++++++++++------ 4 files changed, 154 insertions(+), 34 deletions(-) diff --git a/Lib/html/__init__.py b/Lib/html/__init__.py index da0a0a3ce70..1543460ca33 100644 --- a/Lib/html/__init__.py +++ b/Lib/html/__init__.py @@ -25,7 +25,7 @@ def escape(s, quote=True): return s -# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references +# see https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state _invalid_charrefs = { 0x00: '\ufffd', # REPLACEMENT CHARACTER diff --git a/Lib/html/entities.py b/Lib/html/entities.py index dc508631ac4..eb6dc121905 100644 --- a/Lib/html/entities.py +++ b/Lib/html/entities.py @@ -3,8 +3,7 @@ __all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs'] -# maps the HTML entity name to the Unicode code point -# from https://html.spec.whatwg.org/multipage/named-characters.html +# maps HTML4 entity name to the Unicode code point name2codepoint = { 'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1 @@ -261,7 +260,11 @@ } -# maps the HTML5 named character references to the equivalent Unicode character(s) +# HTML5 named character references +# Generated by Tools/build/parse_html5_entities.py +# from https://html.spec.whatwg.org/entities.json and +# https://html.spec.whatwg.org/multipage/named-characters.html. +# Map HTML5 named character references to the equivalent Unicode character(s). html5 = { 'Aacute': '\xc1', 'aacute': '\xe1', diff --git a/Lib/html/parser.py b/Lib/html/parser.py index bef0f4fe4bf..1e30956fe24 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -12,6 +12,7 @@ import _markupbase from html import unescape +from html.entities import html5 as html5_entities __all__ = ['HTMLParser'] @@ -23,6 +24,7 @@ entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') +attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?') starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') @@ -57,6 +59,22 @@ # and the tag name, so maybe this should be fixed endtagfind = re.compile(r'\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') +# Character reference processing logic specific to attribute values +# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state +def _replace_attr_charref(match): + ref = match.group(0) + # Numeric / hex char refs must always be unescaped + if ref.startswith(''): + return unescape(ref) + # Named character / entity references must only be unescaped + # if they are an exact match, and they are not followed by an equals sign + if not ref.endswith('=') and ref[1:] in html5_entities: + return unescape(ref) + # Otherwise do not unescape + return ref + +def _unescape_attrvalue(s): + return attr_charref.sub(_replace_attr_charref, s) class HTMLParser(_markupbase.ParserBase): @@ -89,6 +107,7 @@ def __init__(self, *, convert_charrefs=True): If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters. """ + super().__init__() self.convert_charrefs = convert_charrefs self.reset() @@ -98,7 +117,7 @@ def reset(self): self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None - _markupbase.ParserBase.reset(self) + super().reset() def feed(self, data): r"""Feed data to the parser. @@ -241,7 +260,7 @@ def goahead(self, end): else: assert 0, "interesting.search() lied" # end while - if end and i < n and not self.cdata_elem: + if end and i < n: if self.convert_charrefs and not self.cdata_elem: self.handle_data(unescape(rawdata[i:n])) else: @@ -259,7 +278,7 @@ def parse_html_declaration(self, i): if rawdata[i:i+4] == ' ¬-an-entity-ref; +
+ ''""" + s = f'