gh-135661: Fix parsing start and end tags in HTMLParser according to … · python/cpython@0243f97 · GitHub | Latest TMZ Celebrity News & Gossip | Watch TMZ Live
Skip to content

Commit 0243f97

Browse files
gh-135661: Fix parsing start and end tags in HTMLParser according to the HTML5 standard (GH-135930)
* Whitespaces no longer accepted between `</` and the tag name.
  E.g. `</ script>` does not end the script section.
* Vertical tabulation (`\v`) and non-ASCII whitespaces no longer recognized as whitespaces.
  The only whitespaces are `\t\n\r\f `.
* Null character (U+0000) no longer ends the tag name.
* Attributes and slashes after the tag name in end tags are now ignored, instead of terminating after the first `>` in quoted attribute value.
  E.g. `</script/foo=">"/>`.
* Multiple slashes and whitespaces between the last attribute and closing `>` are now ignored in both start and end tags.
  E.g. `<a foo=bar/ //>`.
* Multiple `=` between attribute name and value are no longer collapsed.
  E.g. `<a foo==bar>` produces attribute "foo" with value "=bar".
* Whitespaces between the `=` separator and attribute name or value are no longer ignored.
  E.g. `<a foo =bar>` produces two attributes "foo" and "=bar", both with value None; `<a foo= bar>` produces two attributes: "foo" with value "" and "bar" with value None.
* Fix Sphinx errors.
* Apply suggestions from code review
  Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
* Address review comments.
* Move to Security.
---------
Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
1 parent 938a5d7 commit 0243f97

File tree

3 files changed

+194
-129
lines changed

3 files changed

+194
-129
lines changed

Lib/html/parser.py

Lines changed: 69 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -31,15 +31,43 @@
3131
piclose = re.compile('>')
3232
commentclose = re.compile(r'--\s*>')
3333
# Note:
34-
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
35-
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
34+
# 1) if you change tagfind/attrfind remember to update locatetagend too;
35+
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
3636
# explode, so don't do it.
37-
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
38-
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
39-
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
40-
attrfind_tolerant = re.compile(
41-
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
42-
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
37+
# see the HTML5 specs section "13.2.5.6 Tag open state",
38+
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
39+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
40+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
41+
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
42+
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
43+
attrfind_tolerant = re.compile(r"""
44+
(
45+
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
46+
)
47+
(= # value indicator
48+
('[^']*' # LITA-enclosed value
49+
|"[^"]*" # LIT-enclosed value
50+
|(?!['"])[^>\t\n\r\f ]* # bare value
51+
)
52+
)?
53+
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
54+
""", re.VERBOSE)
55+
locatetagend = re.compile(r"""
56+
[a-zA-Z][^\t\n\r\f />]* # tag name
57+
[\t\n\r\f /]* # optional whitespace before attribute name
58+
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
59+
(?:= # value indicator
60+
(?:'[^']*' # LITA-enclosed value
61+
|"[^"]*" # LIT-enclosed value
62+
|(?!['"])[^>\t\n\r\f ]* # bare value
63+
)
64+
)?
65+
[\t\n\r\f /]* # possibly followed by a space
66+
)*
67+
>?
68+
""", re.VERBOSE)
69+
# The following variables are not used, but are temporarily left for
70+
# backward compatibility.
4371
locatestarttagend_tolerant = re.compile(r"""
4472
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4573
(?:[\s/]* # optional whitespace before attribute name
@@ -56,8 +84,6 @@
5684
\s* # trailing whitespace
5785
""", re.VERBOSE)
5886
endendtag = re.compile('>')
59-
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
60-
# </ and the tag name, so maybe this should be fixed
6187
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
6288

6389
# Character reference processing logic specific to attribute values
@@ -141,7 +167,8 @@ def get_starttag_text(self):
141167

142168
def set_cdata_mode(self, elem):
143169
self.cdata_elem = elem.lower()
144-
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
170+
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
171+
re.IGNORECASE|re.ASCII)
145172

146173
def clear_cdata_mode(self):
147174
self.interesting = interesting_normal
@@ -166,7 +193,7 @@ def goahead(self, end):
166193
# & near the end and see if it's followed by a space or ;.
167194
amppos = rawdata.rfind('&', max(i, n-34))
168195
if (amppos >= 0 and
169-
not re.compile(r'[\s;]').search(rawdata, amppos)):
196+
not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
170197
break # wait till we get all the text
171198
j = n
172199
else:
@@ -310,7 +337,7 @@ def parse_html_declaration(self, i):
310337
return self.parse_bogus_comment(i)
311338

312339
# Internal -- parse bogus comment, return length or -1 if not terminated
313-
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
340+
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
314341
def parse_bogus_comment(self, i, report=1):
315342
rawdata = self.rawdata
316343
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
@@ -336,6 +363,8 @@ def parse_pi(self, i):
336363

337364
# Internal -- handle starttag, return end or -1 if not terminated
338365
def parse_starttag(self, i):
366+
# See the HTML5 specs section "13.2.5.8 Tag name state"
367+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
339368
self.__starttag_text = None
340369
endpos = self.check_for_whole_start_tag(i)
341370
if endpos < 0:
@@ -381,76 +410,42 @@ def parse_starttag(self, i):
381410
# or -1 if incomplete.
382411
def check_for_whole_start_tag(self, i):
383412
rawdata = self.rawdata
384-
m = locatestarttagend_tolerant.match(rawdata, i)
385-
if m:
386-
j = m.end()
387-
next = rawdata[j:j+1]
388-
if next == ">":
389-
return j + 1
390-
if next == "/":
391-
if rawdata.startswith("/>", j):
392-
return j + 2
393-
if rawdata.startswith("/", j):
394-
# buffer boundary
395-
return -1
396-
# else bogus input
397-
if j > i:
398-
return j
399-
else:
400-
return i + 1
401-
if next == "":
402-
# end of input
403-
return -1
404-
if next in ("abcdefghijklmnopqrstuvwxyz=/"
405-
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
406-
# end of input in or before attribute value, or we have the
407-
# '/' from a '/>' ending
408-
return -1
409-
if j > i:
410-
return j
411-
else:
412-
return i + 1
413-
raise AssertionError("we should not get here!")
413+
match = locatetagend.match(rawdata, i+1)
414+
assert match
415+
j = match.end()
416+
if rawdata[j-1] != ">":
417+
return -1
418+
return j
414419

415420
# Internal -- parse endtag, return end or -1 if incomplete
416421
def parse_endtag(self, i):
422+
# See the HTML5 specs section "13.2.5.7 End tag open state"
423+
# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
417424
rawdata = self.rawdata
418425
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
419-
match = endendtag.search(rawdata, i+1) # >
420-
if not match:
426+
if rawdata.find('>', i+2) < 0: # fast check
421427
return -1
422-
gtpos = match.end()
423-
match = endtagfind.match(rawdata, i) # </ + tag + >
424-
if not match:
425-
if self.cdata_elem is not None:
426-
self.handle_data(rawdata[i:gtpos])
427-
return gtpos
428-
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
429-
namematch = tagfind_tolerant.match(rawdata, i+2)
430-
if not namematch:
431-
# w3.org/TR/html5/tokenization.html#end-tag-open-state
432-
if rawdata[i:i+3] == '</>':
433-
return i+3
434-
else:
435-
return self.parse_bogus_comment(i)
436-
tagname = namematch.group(1).lower()
437-
# consume and ignore other stuff between the name and the >
438-
# Note: this is not 100% correct, since we might have things like
439-
# </tag attr=">">, but looking for > after the name should cover
440-
# most of the cases and is much simpler
441-
gtpos = rawdata.find('>', namematch.end())
442-
self.handle_endtag(tagname)
443-
return gtpos+1
428+
if not endtagopen.match(rawdata, i): # </ + letter
429+
if rawdata[i+2:i+3] == '>': # </> is ignored
430+
# "missing-end-tag-name" parser error
431+
return i+3
432+
else:
433+
return self.parse_bogus_comment(i)
444434

445-
elem = match.group(1).lower() # script or style
446-
if self.cdata_elem is not None:
447-
if elem != self.cdata_elem:
448-
self.handle_data(rawdata[i:gtpos])
449-
return gtpos
435+
match = locatetagend.match(rawdata, i+2)
436+
assert match
437+
j = match.end()
438+
if rawdata[j-1] != ">":
439+
return -1
450440

451-
self.handle_endtag(elem)
441+
# find the name: "13.2.5.8 Tag name state"
442+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
443+
match = tagfind_tolerant.match(rawdata, i+2)
444+
assert match
445+
tag = match.group(1).lower()
446+
self.handle_endtag(tag)
452447
self.clear_cdata_mode()
453-
return gtpos
448+
return j
454449

455450
# Overridable -- finish processing of start+end tag: <tag.../>
456451
def handle_startendtag(self, tag, attrs):

0 commit comments

Comments
 (0)

TMZ Celebrity News – Breaking Stories, Videos & Gossip

Looking for the latest TMZ celebrity news? You've come to the right place. From shocking Hollywood scandals to exclusive videos, TMZ delivers it all in real time.

Whether it’s a red carpet slip-up, a viral paparazzi moment, or a legal drama involving your favorite stars, TMZ news is always first to break the story. Stay in the loop with daily updates, insider tips, and jaw-dropping photos.

🎥 Watch TMZ Live

TMZ Live brings you daily celebrity news and interviews straight from the TMZ newsroom. Don’t miss a beat—watch now and see what’s trending in Hollywood.