Skip to content

Commit 5970fd9

Browse files
committed
fix: revert libxml2 regression with HTML4 recovery
Fixes #2461
1 parent 49b8663 commit 5970fd9

File tree

2 files changed: 65 additions (+65) and 0 deletions (-0).
Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,45 @@
1+
From ddc5f3d22644e0f6fbcc20541c86825757ffee62 Mon Sep 17 00:00:00 2001
2+
From: Mike Dalessio <[email protected]>
3+
Date: Mon, 21 Feb 2022 18:27:45 -0500
4+
Subject: [PATCH] Revert "Different approach to fix quadratic behavior in HTML
5+
push parser"
6+
7+
This reverts commit 798bdf13f6964a650b9a0b7b4b3a769f6f1d509a.
8+
---
9+
HTMLparser.c | 14 +-------------
10+
1 file changed, 1 insertion(+), 13 deletions(-)
11+
12+
diff --git a/HTMLparser.c b/HTMLparser.c
13+
index eba2d7c..c0b8119 100644
14+
--- a/HTMLparser.c
15+
+++ b/HTMLparser.c
16+
@@ -3960,25 +3960,13 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
17+
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
18+
"htmlParseStartTag: invalid element name\n",
19+
NULL, NULL);
20+
- /*
21+
- * The recovery code is disabled for now as it can result in
22+
- * quadratic behavior with the push parser. htmlParseStartTag
23+
- * must consume all content up to the final '>' in order to avoid
24+
- * rescanning for this terminator.
25+
- *
26+
- * For a proper fix in line with HTML5, htmlParseStartTag and
27+
- * htmlParseElement should only be called when there's an ASCII
28+
- * alpha character following the initial '<'. Otherwise, the '<'
29+
- * should be emitted as text (unless followed by '!', '/' or '?').
30+
- */
31+
-#if 0
32+
/* if recover preserve text on classic misconstructs */
33+
if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
34+
(CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
35+
htmlParseCharDataInternal(ctxt, '<');
36+
return(-1);
37+
}
38+
-#endif
39+
+
40+
41+
/* Dump the bogus tag like browsers do */
42+
while ((CUR != 0) && (CUR != '>') &&
43+
--
44+
2.31.0
45+

test/html4/test_document.rb

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -783,6 +783,26 @@ def test_leaking_dtd_nodes_after_internal_subset_removal
783 783
assert(html_strict.strict?)
784 784
end
785 785

786+
describe "ill-formed < character" do
787+
let(:input) { %{<html><body><div>this < that</div><div>second element</div></body></html>} }
788+
789+
it "skips to the next start tag" do
790+
# see https://github.com/sparklemotion/nokogiri/issues/2461 for why we're testing this edge case
791+
if Nokogiri.uses_libxml?(">= 2.9.13")
792+
skip_unless_libxml2_patch("0010-Revert-Different-approach-to-fix-quadratic-behavior.patch")
793+
end
794+
795+
doc = Nokogiri::HTML4.parse(input)
796+
body = doc.at_xpath("//body")
797+
798+
expected_error_snippet = Nokogiri.uses_libxml? ? "invalid element name" : "Missing start element name"
799+
assert_includes(doc.errors.first.to_s, expected_error_snippet)
800+
801+
assert_equal("this < that", body.children.first.text, body.to_html)
802+
assert_equal(["div", "div"], body.children.map(&:name), body.to_html)
803+
end
804+
end
805+
786 806
describe "read memory" do
787 807
let(:input) { "<html><body><div" }
788 808

0 commit comments

Comments
 (0)