Browse Source

LibXML: Read code points when parsing names

Gingeh 5 months ago
parent
commit
453e034801
2 changed files with 26 additions and 6 deletions
  1. 6 0
      Tests/LibXML/TestParser.cpp
  2. 20 6
      Userland/Libraries/LibXML/Parser/Parser.cpp

+ 6 - 0
Tests/LibXML/TestParser.cpp

@@ -41,3 +41,9 @@ TEST_CASE(predefined_character_reference)
     auto const& content = node.children[0]->content.get<XML::Node::Text>();
     EXPECT_EQ(content.builder.string_view(), "Well hello &, <, >, ', and \"!");
 }
+
+TEST_CASE(unicode_name) // Regression test: an XML name made of multi-byte (non-ASCII) code points.
+{
+    XML::Parser parser("<div 中文=\"\"></div>"sv); // The attribute name "中文" is non-ASCII; its value is empty.
+    TRY_OR_FAIL(parser.parse()); // Only parse success is checked (presumably TRY_OR_FAIL fails the test on error); the result is discarded.
+}

+ 20 - 6
Userland/Libraries/LibXML/Parser/Parser.cpp

@@ -545,16 +545,30 @@ ErrorOr<Name, ParseError> Parser::parse_name()
     auto rule = enter_rule();
 
     // Name ::= NameStartChar (NameChar)*
-    auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv));
+
+    // FIXME: This is a hacky workaround to read code points instead of bytes.
+    // Replace this once we have a unicode-aware lexer.
+    auto start = m_lexer.tell();
+    StringView remaining = m_lexer.input().substring_view(start);
+    Utf8View view { remaining };
+    auto code_points = view.begin();
+    if (code_points.done() || !s_name_start_characters.contains(*code_points)) {
+        if (m_options.treat_errors_as_fatal)
+            return parse_error(m_lexer.current_position(), Expectation { "a NameStartChar"sv });
+    }
+
+    m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
+    ++code_points;
+
     auto accept = accept_rule();
 
-    auto rest = m_lexer.consume_while(s_name_characters);
-    StringBuilder builder;
-    builder.append(start);
-    builder.append(rest);
+    while (!code_points.done() && s_name_characters.contains(*code_points)) {
+        m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
+        ++code_points;
+    }
 
     rollback.disarm();
-    return builder.to_byte_string();
+    return remaining.substring_view(0, m_lexer.tell() - start);
 }
 
 // 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl