fix: revert libxml2 regression with HTML4 recovery

flavorjones · flavorjones · commit 5970fd95c830 · 2022-02-21T23:22:42.000-05:00
Fixes #2461
diff --git a/patches/libxml2/0010-Revert-Different-approach-to-fix-quadratic-behavior.patch b/patches/libxml2/0010-Revert-Different-approach-to-fix-quadratic-behavior.patch
@@ -0,0 +1,45 @@
+From ddc5f3d22644e0f6fbcc20541c86825757ffee62 Mon Sep 17 00:00:00 2001
+From: Mike Dalessio <mike.dalessio@gmail.com>
+Date: Mon, 21 Feb 2022 18:27:45 -0500
+Subject: [PATCH] Revert "Different approach to fix quadratic behavior in HTML
+ push parser"
+
+This reverts commit 798bdf13f6964a650b9a0b7b4b3a769f6f1d509a.
+---
+ HTMLparser.c | 14 +-------------
+ 1 file changed, 1 insertion(+), 13 deletions(-)
+
+diff --git a/HTMLparser.c b/HTMLparser.c
+index eba2d7c..c0b8119 100644
+--- a/HTMLparser.c
++++ b/HTMLparser.c
+@@ -3960,25 +3960,13 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
+ 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
+ 	             "htmlParseStartTag: invalid element name\n",
+ 		     NULL, NULL);
+-        /*
+-         * The recovery code is disabled for now as it can result in
+-         * quadratic behavior with the push parser. htmlParseStartTag
+-         * must consume all content up to the final '>' in order to avoid
+-         * rescanning for this terminator.
+-         *
+-         * For a proper fix in line with HTML5, htmlParseStartTag and
+-         * htmlParseElement should only be called when there's an ASCII
+-         * alpha character following the initial '<'. Otherwise, the '<'
+-         * should be emitted as text (unless followed by '!', '/' or '?').
+-         */
+-#if 0
+ 	/* if recover preserve text on classic misconstructs */
+ 	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
+ 	    (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
+ 	    htmlParseCharDataInternal(ctxt, '<');
+ 	    return(-1);
+ 	}
+-#endif
++
+ 
+ 	/* Dump the bogus tag like browsers do */
+ 	while ((CUR != 0) && (CUR != '>') &&
+-- 
+2.31.0
+
diff --git a/test/html4/test_document.rb b/test/html4/test_document.rb
@@ -783,6 +783,26 @@ def test_leaking_dtd_nodes_after_internal_subset_removal
             assert(html_strict.strict?)
           end
 
+          describe "ill-formed < character" do
+            let(:input) { %{<html><body><div>this < that</div><div>second element</div></body></html>} }
+
+            it "skips to the next start tag" do
+              # see https://github.com/sparklemotion/nokogiri/issues/2461 for why we're testing this edge case
+              if Nokogiri.uses_libxml?(">= 2.9.13")
+                skip_unless_libxml2_patch("0010-Revert-Different-approach-to-fix-quadratic-behavior.patch")
+              end
+
+              doc = Nokogiri::HTML4.parse(input)
+              body = doc.at_xpath("//body")
+
+              expected_error_snippet = Nokogiri.uses_libxml? ? "invalid element name" : "Missing start element name"
+              assert_includes(doc.errors.first.to_s, expected_error_snippet)
+
+              assert_equal("this < that", body.children.first.text, body.to_html)
+              assert_equal(["div", "div"], body.children.map(&:name), body.to_html)
+            end
+          end
+
           describe "read memory" do
             let(:input) { "<html><body><div" }