Skip to content

Commit a38dcec

Browse files
committed
Avoid matching surrogate halves when lone surrogates are to be matched
Fixes #28.
1 parent 216a51a commit a38dcec

File tree

3 files changed

+118
-58
lines changed

3 files changed

+118
-58
lines changed

README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -160,13 +160,13 @@ regenerate()
160160
.addRange(0x000000, 0x10FFFF) // add all Unicode code points
161161
.removeRange('A', 'z') // remove all symbols from `A` to `z`
162162
.toString();
163-
// → '[\\0-@\\{-\\uD7FF\\uDC00-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF]'
163+
// → '[\\0-@\\{-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]'
164164

165165
regenerate()
166166
.addRange(0x000000, 0x10FFFF) // add all Unicode code points
167167
.removeRange(0x0041, 0x007A) // remove all code points from U+0041 to U+007A
168168
.toString();
169-
// → '[\\0-@\\{-\\uD7FF\\uDC00-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF]'
169+
// → '[\\0-@\\{-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]'
170170
```
171171

172172
### `regenerate.prototype.intersection(codePoints)`
@@ -295,7 +295,7 @@ regenerate(codePoints).toString();
295295

296296
## Support
297297

298-
Regenerate supports at least Chrome 27+, Firefox 3+, Safari 4+, Opera 10+, IE 6+, Node.js v0.10.0+, Narwhal 0.3.2+, RingoJS 0.8+, PhantomJS 1.9.0+, and Rhino 1.7RC4+.
298+
Regenerate supports at least Chrome 27+, Firefox 3+, Safari 4+, Opera 10+, IE 6+, Node.js v0.10.0+, io.js v1.0.0+, Narwhal 0.3.2+, RingoJS 0.8+, PhantomJS 1.9.0+, and Rhino 1.7RC4+.
299299

300300
## Unit tests & code coverage
301301

regenerate.js

+104-44
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,7 @@
626626
var splitAtBMP = function(data) {
627627
// Iterate over the data per `(start, end)` pair.
628628
var loneHighSurrogates = [];
629+
var loneLowSurrogates = [];
629630
var bmp = [];
630631
var astral = [];
631632
var index = 0;
@@ -635,51 +636,105 @@
635636
while (index < length) {
636637
start = data[index];
637638
end = data[index + 1] - 1; // Note: the `- 1` makes `end` inclusive.
638-
if (start <= 0xFFFF && end <= 0xFFFF) {
639-
// Both `start` and `end` are within the BMP range.
640-
if (start >= HIGH_SURROGATE_MIN && start <= HIGH_SURROGATE_MAX) {
641-
// `start` lies in the high surrogates range.
642-
if (end <= HIGH_SURROGATE_MAX) {
643-
loneHighSurrogates.push(start, end + 1);
644-
} else {
645-
loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1);
646-
bmp.push(HIGH_SURROGATE_MAX + 1, end + 1);
647-
}
648-
} else if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) {
639+
640+
if (start < HIGH_SURROGATE_MIN) {
641+
642+
// starts before high surr & ends before high surr e.g. (0, 0x10)
643+
if (end < HIGH_SURROGATE_MIN) {
644+
bmp.push(start, end + 1);
645+
}
646+
647+
// starts before high surr & ends in high surr range e.g. (0, 0xD855)
648+
if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) {
649649
bmp.push(start, HIGH_SURROGATE_MIN);
650650
loneHighSurrogates.push(HIGH_SURROGATE_MIN, end + 1);
651-
} else if (start < HIGH_SURROGATE_MIN && end > HIGH_SURROGATE_MAX) {
652-
bmp.push(start, HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1, end + 1);
651+
}
652+
653+
// starts before high surr & ends in low surr range e.g. (0, 0xDCFF)
654+
if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) {
655+
bmp.push(start, HIGH_SURROGATE_MIN);
653656
loneHighSurrogates.push(HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1);
654-
} else {
655-
bmp.push(start, end + 1);
657+
loneLowSurrogates.push(LOW_SURROGATE_MIN, end + 1);
656658
}
657-
}
658-
else if (start <= 0xFFFF && end > 0xFFFF) {
659-
// `start` is in the BMP range, but `end` lies within the astral range.
660-
if (start >= HIGH_SURROGATE_MIN && start <= HIGH_SURROGATE_MAX) {
661-
// `start` lies in the high surrogates range. Since `end` is astral,
662-
// we can just add all high surrogates starting from `start` to
663-
// `loneHighSurrogates`, any other BMP code points to `bmp`, and the
664-
// remaining symbols to `astral`.
665-
loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1);
666-
bmp.push(HIGH_SURROGATE_MAX + 1, 0xFFFF + 1);
667-
} else if (start < HIGH_SURROGATE_MIN) {
668-
bmp.push(start, HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1, 0xFFFF + 1);
659+
660+
// starts before high surr & ends after low surr e.g. (0, 0x10FFFF)
661+
if (end > LOW_SURROGATE_MAX) {
662+
bmp.push(start, HIGH_SURROGATE_MIN);
669663
loneHighSurrogates.push(HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1);
670-
} else { // `start > HIGH_SURROGATE_MAX` holds true.
664+
loneLowSurrogates.push(LOW_SURROGATE_MIN, LOW_SURROGATE_MAX + 1);
665+
if (end <= 0xFFFF) {
666+
bmp.push(LOW_SURROGATE_MAX + 1, end + 1);
667+
} else {
668+
bmp.push(LOW_SURROGATE_MAX + 1, 0xFFFF + 1);
669+
astral.push(0xFFFF + 1, end + 1);
670+
}
671+
}
672+
673+
} else if (start >= HIGH_SURROGATE_MIN && start <= HIGH_SURROGATE_MAX) {
674+
675+
// starts in high surr range & ends in high surr range e.g. (0xD855, 0xD866)
676+
if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) {
677+
loneHighSurrogates.push(start, end + 1);
678+
}
679+
680+
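// starts in high surr range & ends in low surr range e.g. (0xD855, 0xDCFF)
// NOTE(review): the lone-high-surrogate range pushed below is closed at `LOW_SURROGATE_MAX + 1`, so it also covers low surrogates; it presumably should stop at `HIGH_SURROGATE_MAX + 1` — confirm.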
681+
if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) {
682+
loneHighSurrogates.push(start, LOW_SURROGATE_MAX + 1);
683+
loneLowSurrogates.push(LOW_SURROGATE_MIN, end + 1);
684+
}
685+
686+
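// starts in high surr range & ends after low surr e.g. (0xD855, 0x10FFFF)
// NOTE(review): the lone-high-surrogate range pushed below is closed at `LOW_SURROGATE_MAX + 1`, so it also covers low surrogates; it presumably should stop at `HIGH_SURROGATE_MAX + 1` — confirm.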
687+
if (end > LOW_SURROGATE_MAX) {
688+
loneHighSurrogates.push(start, LOW_SURROGATE_MAX + 1);
689+
loneLowSurrogates.push(LOW_SURROGATE_MIN, LOW_SURROGATE_MAX + 1);
690+
if (end <= 0xFFFF) {
691+
bmp.push(LOW_SURROGATE_MAX + 1, end + 1);
692+
} else {
693+
bmp.push(LOW_SURROGATE_MAX + 1, 0xFFFF + 1);
694+
astral.push(0xFFFF + 1, end + 1);
695+
}
696+
}
697+
698+
} else if (start >= LOW_SURROGATE_MIN && start <= LOW_SURROGATE_MAX) {
699+
700+
// starts in low surr range & ends in low surr range e.g. (0xDCFF, 0xDDFF)
701+
if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) {
702+
loneLowSurrogates.push(start, end + 1);
703+
}
704+
705+
// starts in low surr range & ends after low surr, in the BMP or astral range e.g. (0xDCFF, 0xFFFF) or (0xDCFF, 0x10FFFF)
706+
if (end > LOW_SURROGATE_MAX) {
707+
loneLowSurrogates.push(start, LOW_SURROGATE_MAX + 1);
708+
if (end <= 0xFFFF) {
709+
bmp.push(LOW_SURROGATE_MAX + 1, end + 1);
710+
} else {
711+
bmp.push(LOW_SURROGATE_MAX + 1, 0xFFFF + 1);
712+
astral.push(0xFFFF + 1, end + 1);
713+
}
714+
}
715+
716+
} else if (start > LOW_SURROGATE_MAX && start <= 0xFFFF) {
717+
718+
// starts after low surr in BMP & ends after low surr e.g. (0xFFAA, 0x10FFFF)
719+
if (end <= 0xFFFF) {
720+
bmp.push(start, end + 1);
721+
} else {
671722
bmp.push(start, 0xFFFF + 1);
723+
astral.push(0xFFFF + 1, end + 1);
672724
}
673-
astral.push(0xFFFF + 1, end + 1);
674-
}
675-
else {
725+
726+
} else {
727+
676728
// Both `start` and `end` are in the astral range.
677729
astral.push(start, end + 1);
730+
678731
}
732+
679733
index += 2;
680734
}
681735
return {
682736
'loneHighSurrogates': loneHighSurrogates,
737+
'loneLowSurrogates': loneLowSurrogates,
683738
'bmp': bmp,
684739
'astral': astral
685740
};
@@ -898,19 +953,15 @@
898953

899954
var parts = splitAtBMP(data);
900955
var loneHighSurrogates = parts.loneHighSurrogates;
956+
var loneLowSurrogates = parts.loneLowSurrogates;
901957
var bmp = parts.bmp;
902958
var astral = parts.astral;
903959
var hasAstral = !dataIsEmpty(parts.astral);
904-
var hasLoneSurrogates = !dataIsEmpty(loneHighSurrogates);
960+
var hasLoneHighSurrogates = !dataIsEmpty(loneHighSurrogates);
961+
var hasLoneLowSurrogates = !dataIsEmpty(loneLowSurrogates);
905962

906963
var surrogateMappings = surrogateSet(astral);
907964

908-
// If we’re not dealing with any astral symbols, there’s no need to move
909-
// individual code points that are high surrogates to the end of the regex.
910-
if (!hasAstral && hasLoneSurrogates) {
911-
bmp = dataAddData(bmp, loneHighSurrogates);
912-
}
913-
914965
if (!dataIsEmpty(bmp)) {
915966
// The data set contains BMP code points that are not high surrogates
916967
// needed for astral code points in the set.
@@ -921,11 +972,20 @@
921972
// based on their surrogate pairs.
922973
result.push(createSurrogateCharacterClasses(surrogateMappings));
923974
}
924-
if (hasAstral && hasLoneSurrogates) {
925-
// The data set contains lone high surrogates; append these. Lone high
926-
// surrogates must go at the end of the regex if astral symbols are to be
927-
// matched as well.
928-
result.push(createBMPCharacterClasses(loneHighSurrogates));
975+
// https://gist.github.com/mathiasbynens/bbe7f870208abcfec860
976+
if (hasLoneHighSurrogates) {
977+
result.push(
978+
createBMPCharacterClasses(loneHighSurrogates) +
979+
// Make sure the high surrogates aren’t part of a surrogate pair.
980+
'(?![\\uDC00-\\uDFFF])'
981+
);
982+
}
983+
if (hasLoneLowSurrogates) {
984+
result.push(
985+
// Make sure the low surrogates aren’t part of a surrogate pair.
986+
'(?:[^\\uD800-\\uDBFF]|^)' +
987+
createBMPCharacterClasses(loneLowSurrogates)
988+
);
929989
}
930990
return result.join('|');
931991
};
@@ -1066,7 +1126,7 @@
10661126
return regenerate;
10671127
});
10681128
} else if (freeExports && !freeExports.nodeType) {
1069-
if (freeModule) { // in Node.js or RingoJS v0.8.0+
1129+
if (freeModule) { // in Node.js, io.js, or RingoJS v0.8.0+
10701130
freeModule.exports = regenerate;
10711131
} else { // in Narwhal or RingoJS v0.7.0-
10721132
freeExports.regenerate = regenerate;

tests/tests.js

+11-11
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)