Skip to content

Commit ef8b22d

Browse files
authored
Merge pull request #219 from irisTa56/fix_zero_width_joiner_between_surrogate_pairs
2 parents 6fc4b7a + 8ce1309 commit ef8b22d

File tree

2 files changed: 44 additions (+44) and 17 deletions (-17).

lib/poison/parser.ex

Lines changed: 25 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -536,8 +536,8 @@ defmodule Poison.Parser do
536536
# http://www.ietf.org/rfc/rfc2781.txt
537537
# http://perldoc.perl.org/Encode/Unicode.html#Surrogate-Pairs
538538
# http://mathiasbynens.be/notes/javascript-encoding#surrogate-pairs
539-
defguardp is_surrogate(cp) when cp in 0xD800..0xDFFF
540-
defguardp is_surrogate_pair(hi, lo) when hi in 0xD800..0xDBFF and lo in 0xDC00..0xDFFF
539+
defguardp is_hi_surrogate(cp) when cp in 0xD800..0xDBFF
540+
defguardp is_lo_surrogate(cp) when cp in 0xDC00..0xDFFF
541541

542542
defmacrop get_codepoint(seq, skip) do
543543
quote bind_quoted: [seq: seq, skip: skip] do
@@ -552,28 +552,36 @@ defmodule Poison.Parser do
552552

553553
@compile {:inline, string_escape_unicode: 5}
554554

555-
defp string_escape_unicode(<<"\\u", seq2::binary-size(4), rest::bits>>, data, skip, acc, seq1) do
556-
hi = get_codepoint(seq1, skip)
557-
lo = get_codepoint(seq2, skip + 6)
555+
defp string_escape_unicode(rest, data, skip, acc, seq1) do
556+
cp1 = get_codepoint(seq1, skip)
558557

559558
cond do
560-
is_surrogate_pair(hi, lo) ->
561-
codepoint = 0x10000 + ((hi &&& 0x03FF) <<< 10) + (lo &&& 0x03FF)
562-
string_continue(rest, data, skip + 11, true, 0, [acc, codepoint])
563-
564-
is_surrogate(hi) ->
565-
raise ParseError, skip: skip, value: "\\u#{seq1}\\u#{seq2}"
559+
is_hi_surrogate(cp1) -> string_escape_surrogate_pair(rest, data, skip, acc, seq1, cp1)
560+
is_lo_surrogate(cp1) -> raise ParseError, skip: skip, value: "\\u#{seq1}"
561+
true -> string_continue(rest, data, skip + 5, true, 0, [acc, cp1])
562+
end
563+
end
566564

567-
is_surrogate(lo) ->
568-
raise ParseError, skip: skip + 6, value: "\\u#{seq2}"
565+
@compile {:inline, string_escape_surrogate_pair: 6}
569566

570-
true ->
571-
string_continue(rest, data, skip + 11, true, 0, [acc, hi, lo])
567+
defp string_escape_surrogate_pair(
568+
<<"\\u", seq2::binary-size(4), rest::bits>>,
569+
data,
570+
skip,
571+
acc,
572+
seq1,
573+
hi
574+
) do
575+
with lo when is_lo_surrogate(lo) <- get_codepoint(seq2, skip + 6) do
576+
codepoint = 0x10000 + ((hi &&& 0x03FF) <<< 10) + (lo &&& 0x03FF)
577+
string_continue(rest, data, skip + 11, true, 0, [acc, codepoint])
578+
else
579+
_ -> raise ParseError, skip: skip, value: "\\u#{seq1}\\u#{seq2}"
572580
end
573581
end
574582

575-
defp string_escape_unicode(rest, data, skip, acc, seq1) do
576-
string_continue(rest, data, skip + 5, true, 0, [acc, get_codepoint(seq1, skip)])
583+
defp string_escape_surrogate_pair(_rest, _data, skip, _acc, seq1, _hi) do
584+
raise ParseError, skip: skip, value: "\\u#{seq1}"
577585
end
578586

579587
## Whitespace

test/poison/parser_test.exs

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -126,13 +126,32 @@ defmodule Poison.ParserTest do
126126
parse!(~s("\\uxxxx"))
127127
end
128128

129+
assert_raise ParseError,
130+
~s(cannot parse value at position 2: "\\\\uD800\\\\uDBFF"),
131+
fn ->
132+
parse!(~s("\\uD800\\uDBFF"))
133+
end
134+
135+
assert_raise ParseError,
136+
~s(cannot parse value at position 2: "\\\\uD800"),
137+
fn ->
138+
parse!(~s("\\uD800"))
139+
end
140+
141+
assert_raise ParseError,
142+
~s(cannot parse value at position 2: "\\\\uDC00"),
143+
fn ->
144+
parse!(~s("\\uDC00"))
145+
end
146+
129147
assert parse!(~s("\\"\\\\\\/\\b\\f\\n\\r\\t")) == ~s("\\/\b\f\n\r\t)
130148
assert parse!(~s("\\u2603")) == "☃"
131149
assert parse!(~s("\\u2028\\u2029")) == "\u2028\u2029"
132150
assert parse!(~s("\\uD834\\uDD1E")) == "𝄞"
133151
assert parse!(~s("\\uD834\\uDD1E")) == "𝄞"
134152
assert parse!(~s("\\uD799\\uD799")) == "힙힙"
135153
assert parse!(~s("✔︎")) == "✔︎"
154+
assert parse!(~s("\\uD83D\\uDC68\\u200D\\uD83D\\uDC76")) == "👨‍👶"
136155
end
137156

138157
property "strings" do

0 commit comments

Comments
 (0)