Skip to content

Commit e2546e6

Browse files
committed
parse pi: improve invalid case detection
1 parent 73661ef commit e2546e6

File tree

2 files changed

+53
-17
lines changed

2 files changed

+53
-17
lines changed

lib/rexml/parsers/baseparser.rb

+20-15
Original file line number | Diff line number | Diff line change
@@ -124,11 +124,10 @@ class BaseParser
124124
}
125125

126126
module Private
127-
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
128127
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
129128
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
130129
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
131-
NAME_PATTERN = /\s*#{NAME}/um
130+
NAME_PATTERN = /#{NAME}/um
132131
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
133132
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
134133
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
@@ -242,7 +241,7 @@ def pull_event
242241
if @document_status == nil
243242
start_position = @source.position
244243
if @source.match("<?", true)
245-
return process_instruction(start_position)
244+
return process_instruction
246245
elsif @source.match("<!", true)
247246
if @source.match("--", true)
248247
md = @source.match(/(.*?)-->/um, true)
@@ -442,7 +441,7 @@ def pull_event
442441
raise REXML::ParseException.new( "Declarations can only occur "+
443442
"in the doctype declaration.", @source)
444443
elsif @source.match("?", true)
445-
return process_instruction(start_position)
444+
return process_instruction
446445
else
447446
# Get the next tag
448447
md = @source.match(Private::TAG_PATTERN, true)
@@ -588,14 +587,14 @@ def need_source_encoding_update?(xml_declaration_encoding)
588587
def parse_name(base_error_message)
589588
md = @source.match(Private::NAME_PATTERN, true)
590589
unless md
591-
if @source.match(/\s*\S/um)
590+
if @source.match(/\S/um)
592591
message = "#{base_error_message}: invalid name"
593592
else
594593
message = "#{base_error_message}: name is missing"
595594
end
596595
raise REXML::ParseException.new(message, @source)
597596
end
598-
md[1]
597+
md[0]
599598
end
600599

601600
def parse_id(base_error_message,
@@ -664,18 +663,24 @@ def parse_id_invalid_details(accept_external_id:,
664663
end
665664
end
666665

667-
def process_instruction(start_position)
668-
match_data = @source.match(Private::INSTRUCTION_END, true)
669-
unless match_data
670-
message = "Invalid processing instruction node"
671-
@source.position = start_position
672-
raise REXML::ParseException.new(message, @source)
666+
def process_instruction
667+
name = parse_name("Malformed XML: Invalid processing instruction node")
668+
if @source.match(/\s+/um, true)
669+
match_data = @source.match(/(.*?)\?>/um, true)
670+
unless match_data
671+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
672+
end
673+
content = match_data[1]
674+
else
675+
content = nil
676+
unless @source.match("?>", true)
677+
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
678+
end
673679
end
674-
if match_data[1] == "xml"
680+
if name == "xml"
675681
if @document_status
676682
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
677683
end
678-
content = match_data[2]
679684
version = VERSION.match(content)
680685
version = version[1] unless version.nil?
681686
encoding = ENCODING.match(content)
@@ -690,7 +695,7 @@ def process_instruction(start_position)
690695
standalone = standalone[1] unless standalone.nil?
691696
return [ :xmldecl, version, encoding, standalone ]
692697
end
693-
[:processing_instruction, match_data[1], match_data[2]]
698+
[:processing_instruction, name, content]
694699
end
695700

696701
def parse_attributes(prefixes, curr_ns)

test/parse/test_processing_instruction.rb

+33-2
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,37 @@ def test_no_name
1717
parse("<??>")
1818
end
1919
assert_equal(<<-DETAIL.chomp, exception.to_s)
20-
Invalid processing instruction node
20+
Malformed XML: Invalid processing instruction node: invalid name
2121
Line: 1
2222
Position: 4
2323
Last 80 unconsumed characters:
24-
<??>
24+
?>
25+
DETAIL
26+
end
27+
28+
def test_unclosed_content
29+
exception = assert_raise(REXML::ParseException) do
30+
parse("<?name content")
31+
end
32+
assert_equal(<<-DETAIL.chomp, exception.to_s)
33+
Malformed XML: Unclosed processing instruction
34+
Line: 1
35+
Position: 14
36+
Last 80 unconsumed characters:
37+
content
38+
DETAIL
39+
end
40+
41+
def test_unclosed_no_content
42+
exception = assert_raise(REXML::ParseException) do
43+
parse("<?name")
44+
end
45+
assert_equal(<<-DETAIL.chomp, exception.to_s)
46+
Malformed XML: Unclosed processing instruction
47+
Line: 1
48+
Position: 6
49+
Last 80 unconsumed characters:
50+
2551
DETAIL
2652
end
2753

@@ -79,6 +105,11 @@ def test_after_root
79105
assert_equal("abc", events[:processing_instruction])
80106
end
81107

108+
def test_content_question
109+
document = REXML::Document.new("<a><?name con?tent?></a>")
110+
assert_equal("con?tent", document.root.children.first.content)
111+
end
112+
82113
def test_linear_performance_gt
83114
seq = [10000, 50000, 100000, 150000, 200000]
84115
assert_linear_performance(seq, rehearsal: 10) do |n|

0 commit comments

Comments
 (0)