Skip to content

Commit 172371f

Browse files
authored
Add support to optionally allow surrogate pair entities (#165) (#174)
1 parent 37232fc commit 172371f

File tree

7 files changed

+262
-62
lines changed

7 files changed

+262
-62
lines changed

release-notes/CREDITS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,8 @@ Tim Martin (@Orbisman)
8989

9090
* Contributed fix for #67: Wrong line for XML event location in elements following DTD
9191
(6.6.0)
92+
93+
Kamil Gołębiewski (@Magmaruss)
94+
95+
* Contributed #165: Add support to optionally allow surrogate pair entities
96+
(6.6.0)

release-notes/VERSION

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ Project: woodstox
99
#67: Wrong line for XML event location in elements following DTD
1010
(reported by @m-g-sonar)
1111
(fix contributed by Tim M)
12+
#165: Add support to optionally allow surrogate pair entities
13+
(contributed by Kamil G)
1214
#176: Fix parser when not replacing entities and treating char references
1315
as entities
1416
(contributed by Guillaume N)

src/main/java/com/ctc/wstx/api/ReaderConfig.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,11 @@ public final class ReaderConfig
139139

140140
final static int PROP_MAX_DTD_DEPTH = 69;
141141

142+
/**
143+
* @since 6.6
144+
*/
145+
final static int PROP_ALLOW_SURROGATE_PAIR_ENTITIES = 70;
146+
142147
/*
143148
////////////////////////////////////////////////
144149
// Limits for numeric properties
@@ -361,6 +366,8 @@ public final class ReaderConfig
361366
PROP_UNDECLARED_ENTITY_RESOLVER);
362367
sProperties.put(WstxInputProperties.P_BASE_URL,
363368
PROP_BASE_URL);
369+
sProperties.put(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES,
370+
PROP_ALLOW_SURROGATE_PAIR_ENTITIES);
364371
sProperties.put(WstxInputProperties.P_INPUT_PARSING_MODE,
365372
PROP_INPUT_PARSING_MODE);
366373
}
@@ -419,6 +426,13 @@ public final class ReaderConfig
419426
*/
420427
protected URL mBaseURL;
421428

429+
/**
430+
* Whether to allow surrogate pairs written as character entities (2 code points combined into one target character).
431+
*
432+
* @since 6.6
433+
*/
434+
protected boolean mAllowSurrogatePairEntities = false;
435+
422436
/**
423437
* Parsing mode can be changed from the default xml compliant
424438
* behavior to one of alternate modes (fragment processing,
@@ -583,6 +597,7 @@ public ReaderConfig createNonShared(SymbolTable sym)
583597
rc.mMaxEntityDepth = mMaxEntityDepth;
584598
rc.mMaxEntityCount = mMaxEntityCount;
585599
rc.mMaxDtdDepth = mMaxDtdDepth;
600+
rc.mAllowSurrogatePairEntities = mAllowSurrogatePairEntities;
586601
if (mSpecialProperties != null) {
587602
int len = mSpecialProperties.length;
588603
Object[] specProps = new Object[len];
@@ -792,6 +807,10 @@ public XMLResolver getUndeclaredEntityResolver() {
792807

793808
public URL getBaseURL() { return mBaseURL; }
794809

810+
public boolean allowsSurrogatePairEntities() {
811+
return mAllowSurrogatePairEntities;
812+
}
813+
795814
public WstxInputProperties.ParsingMode getInputParsingMode() {
796815
return mParsingMode;
797816
}
@@ -1074,6 +1093,10 @@ public void setUndeclaredEntityResolver(XMLResolver r) {
10741093
}
10751094

10761095
public void setBaseURL(URL baseURL) { mBaseURL = baseURL; }
1096+
1097+
public void doAllowSurrogatePairEntities(boolean state) {
1098+
mAllowSurrogatePairEntities = state;
1099+
}
10771100

10781101
public void setInputParsingMode(WstxInputProperties.ParsingMode mode) {
10791102
mParsingMode = mode;
@@ -1533,6 +1556,8 @@ public Object getProperty(int id)
15331556
return getUndeclaredEntityResolver();
15341557
case PROP_BASE_URL:
15351558
return getBaseURL();
1559+
case PROP_ALLOW_SURROGATE_PAIR_ENTITIES:
1560+
return allowsSurrogatePairEntities();
15361561
case PROP_INPUT_PARSING_MODE:
15371562
return getInputParsingMode();
15381563

@@ -1757,6 +1782,10 @@ public boolean setProperty(String propName, int id, Object value)
17571782
setBaseURL(u);
17581783
}
17591784
break;
1785+
1786+
case PROP_ALLOW_SURROGATE_PAIR_ENTITIES:
1787+
doAllowSurrogatePairEntities(ArgUtil.convertToBoolean(propName, value));
1788+
break;
17601789

17611790
case PROP_INPUT_PARSING_MODE:
17621791
setInputParsingMode((WstxInputProperties.ParsingMode) value);

src/main/java/com/ctc/wstx/api/WstxInputProperties.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,15 @@ public final class WstxInputProperties
300300
* DTD subset).
301301
*/
302302
public final static String P_BASE_URL = "com.ctc.wstx.baseURL";
303+
304+
/**
305+
* Property of type {@link java.lang.Boolean} that allows parsing of
306+
* high Unicode characters written as surrogate pairs (2 code points).
307+
* Defaults to Boolean.FALSE, because it is not standard behavior.
308+
*
309+
* @since 6.6
310+
*/
311+
public final static String P_ALLOW_SURROGATE_PAIR_ENTITIES = "com.ctc.wstx.allowSurrogatePairEntities";
303312

304313
// // // Alternate parsing modes
305314

src/main/java/com/ctc/wstx/sr/StreamScanner.java

Lines changed: 60 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1183,59 +1183,62 @@ protected int resolveSimpleEntity(boolean checkStd)
11831183
char[] buf = mInputBuffer;
11841184
int ptr = mInputPtr;
11851185
char c = buf[ptr++];
1186+
final boolean allowSurrogatePairs = mConfig.allowsSurrogatePairEntities();
11861187

11871188
// Numeric reference?
11881189
if (c == '#') {
1189-
c = buf[ptr++];
11901190
int value = 0;
1191+
int pairValue = 0;
11911192
int inputLen = mInputEnd;
1192-
if (c == 'x') { // hex
1193-
while (ptr < inputLen) {
1193+
1194+
mInputPtr = ptr;
1195+
value = resolveCharEnt(null, false);
1196+
ptr = mInputPtr;
1197+
c = buf[ptr - 1];
1198+
1199+
// If resolving of surrogate pair entities is enabled and the current entity
1200+
// is in the range of high surrogate values, try to find its surrogate pair
1201+
if (allowSurrogatePairs && value >= 0xD800 && value <= 0xDBFF) {
1202+
if (c == ';' && ptr + 1 < inputLen) {
11941203
c = buf[ptr++];
1195-
if (c == ';') {
1196-
break;
1197-
}
1198-
value = value << 4;
1199-
if (c <= '9' && c >= '0') {
1200-
value += (c - '0');
1201-
} else if (c >= 'a' && c <= 'f') {
1202-
value += (10 + (c - 'a'));
1203-
} else if (c >= 'A' && c <= 'F') {
1204-
value += (10 + (c - 'A'));
1205-
} else {
1206-
mInputPtr = ptr; // so error points to correct char
1207-
throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
1208-
}
1209-
/* Need to check for overflow; easiest to do right as
1210-
* it happens...
1211-
*/
1212-
if (value > MAX_UNICODE_CHAR) {
1213-
reportUnicodeOverflow();
1214-
}
1215-
}
1216-
} else { // numeric (decimal)
1217-
while (c != ';') {
1218-
if (c <= '9' && c >= '0') {
1219-
value = (value * 10) + (c - '0');
1220-
// Overflow?
1221-
if (value > MAX_UNICODE_CHAR) {
1222-
reportUnicodeOverflow();
1204+
if (c == '&' && ptr + 1 < inputLen) {
1205+
c = buf[ptr++];
1206+
if (c == '#' && ptr + 1 < inputLen) {
1207+
try {
1208+
mInputPtr = ptr;
1209+
pairValue = resolveCharEnt(null, false);
1210+
ptr = mInputPtr;
1211+
c = buf[ptr -1];
1212+
} catch (WstxUnexpectedCharException wuce) {
1213+
reportNoSurrogatePair(value);
1214+
}
1215+
} else {
1216+
reportNoSurrogatePair(value);
12231217
}
12241218
} else {
1225-
mInputPtr = ptr; // so error points to correct char
1226-
throwUnexpectedChar(c, "; expected a decimal number.");
1219+
reportNoSurrogatePair(value);
12271220
}
1228-
if (ptr >= inputLen) {
1229-
break;
1230-
}
1231-
c = buf[ptr++];
1221+
} else {
1222+
reportNoSurrogatePair(value);
12321223
}
12331224
}
1225+
12341226
// We get here either if we got it all, OR if we ran out of
12351227
// input in current buffer.
12361228
if (c == ';') { // got the full thing
12371229
mInputPtr = ptr;
1238-
validateChar(value);
1230+
1231+
if (allowSurrogatePairs && pairValue > 0) {
1232+
// [woodstox-core#165]
1233+
// If pair value is not in range of low surrogate values, then throw an error
1234+
if (pairValue < 0xDC00 || pairValue > 0xDFFF) {
1235+
reportInvalidSurrogatePair(value, pairValue);
1236+
}
1237+
value = 0x10000 + (value - 0xD800) * 0x400 + (pairValue - 0xDC00);
1238+
} else {
1239+
validateChar(value);
1240+
}
1241+
12391242
return value;
12401243
}
12411244

@@ -1352,7 +1355,7 @@ protected int resolveCharOnlyEntity(boolean checkStd)
13521355
// A char reference?
13531356
if (c == '#') { // yup
13541357
++mInputPtr;
1355-
return resolveCharEnt(null);
1358+
return resolveCharEnt(null, true);
13561359
}
13571360

13581361
// nope... except may be a pre-def?
@@ -1518,7 +1521,7 @@ protected int fullyResolveEntity(boolean allowExt)
15181521
// Do we have a (numeric) character entity reference?
15191522
if (c == '#') { // numeric
15201523
final StringBuffer originalSurface = new StringBuffer("#");
1521-
int ch = resolveCharEnt(originalSurface);
1524+
int ch = resolveCharEnt(originalSurface, true);
15221525
if (mCfgTreatCharRefsAsEntities) {
15231526
final char[] originalChars = new char[originalSurface.length()];
15241527
originalSurface.getChars(0, originalSurface.length(), originalChars, 0);
@@ -2314,7 +2317,7 @@ protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs,
23142317
///////////////////////////////////////////////////////////////////////
23152318
*/
23162319

2317-
private int resolveCharEnt(StringBuffer originalCharacters)
2320+
private int resolveCharEnt(StringBuffer originalCharacters, boolean validateChar)
23182321
throws XMLStreamException
23192322
{
23202323
int value = 0;
@@ -2369,7 +2372,9 @@ private int resolveCharEnt(StringBuffer originalCharacters)
23692372
}
23702373
}
23712374
}
2372-
validateChar(value);
2375+
if (validateChar) {
2376+
validateChar(value);
2377+
}
23732378
return value;
23742379
}
23752380

@@ -2455,7 +2460,19 @@ private void reportUnicodeOverflow()
24552460
private void reportIllegalChar(int value)
24562461
throws XMLStreamException
24572462
{
2458-
throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null);
2463+
throwParseError("Illegal character entity: expansion character (code 0x{0})", Integer.toHexString(value), null);
2464+
}
2465+
2466+
private void reportNoSurrogatePair(int highSurrogate)
2467+
throws XMLStreamException
2468+
{
2469+
throwParseError("Cannot find surrogate pair: high surrogate character (code 0x{0})", Integer.toHexString(highSurrogate), null);
2470+
}
2471+
2472+
private void reportInvalidSurrogatePair(int firstSurrogate, int secondSurrogate)
2473+
throws XMLStreamException
2474+
{
2475+
throwParseError("Invalid surrogate pair: first surrogate character (code 0x{0}), second surrogate character (code 0x{1})", Integer.toHexString(firstSurrogate), Integer.toHexString(secondSurrogate));
24592476
}
24602477

24612478
protected void verifyLimit(String type, long maxValue, long currentValue)

src/test/java/org/codehaus/stax/test/BaseStaxTest.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import javax.xml.stream.*;
99
import javax.xml.stream.events.XMLEvent;
1010

11+
import com.ctc.wstx.api.WstxInputProperties;
12+
1113
/* Latest updates:
1214
*
1315
* - 07-Sep-2007, TSa: Updating based on latest understanding of
@@ -275,6 +277,14 @@ protected static boolean setSupportExternalEntities(XMLInputFactory f, boolean s
275277
return false;
276278
}
277279
}
280+
281+
protected static void setResolveEntitySurrogatePairs(XMLInputFactory f, boolean state)
282+
throws XMLStreamException
283+
{
284+
Boolean b = state ? Boolean.TRUE : Boolean.FALSE;
285+
f.setProperty(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES, b);
286+
assertEquals(b, f.getProperty(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES));
287+
}
278288

279289
protected static void setResolver(XMLInputFactory f, XMLResolver resolver)
280290
throws XMLStreamException

0 commit comments

Comments
 (0)