|
// Splits a flat data array of `(start, end)` pairs (`end` exclusive) into four
// flat arrays of the same shape, one per category of code points:
//   - `loneHighSurrogates`: code points in HIGH_SURROGATE_MIN..HIGH_SURROGATE_MAX
//   - `loneLowSurrogates`:  code points in LOW_SURROGATE_MIN..LOW_SURROGATE_MAX
//   - `bmp`:                all other code points up to and including 0xFFFF
//   - `astral`:             code points above 0xFFFF
// A pair that straddles several categories is cut at each boundary, so every
// code point of the input ends up in exactly one of the four output arrays.
// Returns an object holding the four arrays under those property names.
var splitAtBMP = function(data) {
	var loneHighSurrogates = [];
	var loneLowSurrogates = [];
	var bmp = [];
	var astral = [];
	var index = 0;
	var length = data.length;
	var start;
	var end;

	// Adds the inclusive range `from..to`, which must begin above the low
	// surrogates, to `bmp`, spilling anything above 0xFFFF into `astral`.
	var pushAboveSurrogates = function(from, to) {
		if (to <= 0xFFFF) {
			bmp.push(from, to + 1);
		} else {
			bmp.push(from, 0xFFFF + 1);
			astral.push(0xFFFF + 1, to + 1);
		}
	};

	// Iterate over the data per `(start, end)` pair.
	while (index < length) {
		start = data[index];
		end = data[index + 1] - 1; // Note: the `- 1` makes `end` inclusive.

		if (start < HIGH_SURROGATE_MIN) {

			if (end < HIGH_SURROGATE_MIN) {
				// Starts and ends before the high surrogates, e.g. (0, 0x10).
				bmp.push(start, end + 1);
			} else if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) {
				// Ends within the high surrogates, e.g. (0, 0xD855).
				bmp.push(start, HIGH_SURROGATE_MIN);
				loneHighSurrogates.push(HIGH_SURROGATE_MIN, end + 1);
			} else if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) {
				// Ends within the low surrogates, e.g. (0, 0xDCFF).
				bmp.push(start, HIGH_SURROGATE_MIN);
				loneHighSurrogates.push(HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1);
				loneLowSurrogates.push(LOW_SURROGATE_MIN, end + 1);
			} else if (end > LOW_SURROGATE_MAX) {
				// Ends after the low surrogates, e.g. (0, 0x10FFFF).
				bmp.push(start, HIGH_SURROGATE_MIN);
				loneHighSurrogates.push(HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1);
				loneLowSurrogates.push(LOW_SURROGATE_MIN, LOW_SURROGATE_MAX + 1);
				pushAboveSurrogates(LOW_SURROGATE_MAX + 1, end);
			}

		} else if (start >= HIGH_SURROGATE_MIN && start <= HIGH_SURROGATE_MAX) {

			if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) {
				// Starts and ends within the high surrogates,
				// e.g. (0xD855, 0xD866).
				loneHighSurrogates.push(start, end + 1);
			} else if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) {
				// Ends within the low surrogates, e.g. (0xD855, 0xDCFF).
				// The high-surrogate part must stop at HIGH_SURROGATE_MAX;
				// stopping at LOW_SURROGATE_MAX would file every low surrogate
				// under both categories.
				loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1);
				loneLowSurrogates.push(LOW_SURROGATE_MIN, end + 1);
			} else if (end > LOW_SURROGATE_MAX) {
				// Ends after the low surrogates, e.g. (0xD855, 0x10FFFF).
				// Same upper bound as above for the high-surrogate part.
				loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1);
				loneLowSurrogates.push(LOW_SURROGATE_MIN, LOW_SURROGATE_MAX + 1);
				pushAboveSurrogates(LOW_SURROGATE_MAX + 1, end);
			}

		} else if (start >= LOW_SURROGATE_MIN && start <= LOW_SURROGATE_MAX) {

			if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) {
				// Starts and ends within the low surrogates,
				// e.g. (0xDCFF, 0xDDFF).
				loneLowSurrogates.push(start, end + 1);
			} else if (end > LOW_SURROGATE_MAX) {
				// Ends after the low surrogates, in the BMP or beyond,
				// e.g. (0xDCFF, 0xFFFF) or (0xDCFF, 0x10FFFF).
				loneLowSurrogates.push(start, LOW_SURROGATE_MAX + 1);
				pushAboveSurrogates(LOW_SURROGATE_MAX + 1, end);
			}

		} else if (start > LOW_SURROGATE_MAX && start <= 0xFFFF) {

			// Starts in the BMP after the low surrogates; ends in the BMP or
			// in the astral range, e.g. (0xFFAA, 0x10FFFF).
			pushAboveSurrogates(start, end);

		} else {

			// Both `start` and `end` are in the astral range.
			astral.push(start, end + 1);

		}

		index += 2;
	}
	return {
		'loneHighSurrogates': loneHighSurrogates,
		'loneLowSurrogates': loneLowSurrogates,
		'bmp': bmp,
		'astral': astral
	};
};
|
|
898 | 953 |
|
899 | 954 | var parts = splitAtBMP(data);
|
900 | 955 | var loneHighSurrogates = parts.loneHighSurrogates;
|
| 956 | + var loneLowSurrogates = parts.loneLowSurrogates; |
901 | 957 | var bmp = parts.bmp;
|
902 | 958 | var astral = parts.astral;
|
903 | 959 | var hasAstral = !dataIsEmpty(parts.astral);
|
904 |
| - var hasLoneSurrogates = !dataIsEmpty(loneHighSurrogates); |
| 960 | + var hasLoneHighSurrogates = !dataIsEmpty(loneHighSurrogates); |
| 961 | + var hasLoneLowSurrogates = !dataIsEmpty(loneLowSurrogates); |
905 | 962 |
|
906 | 963 | var surrogateMappings = surrogateSet(astral);
|
907 | 964 |
|
908 |
| - // If we’re not dealing with any astral symbols, there’s no need to move |
909 |
| - // individual code points that are high surrogates to the end of the regex. |
910 |
| - if (!hasAstral && hasLoneSurrogates) { |
911 |
| - bmp = dataAddData(bmp, loneHighSurrogates); |
912 |
| - } |
913 |
| - |
914 | 965 | if (!dataIsEmpty(bmp)) {
|
915 | 966 | // The data set contains BMP code points that are not high surrogates
|
916 | 967 | // needed for astral code points in the set.
|
|
921 | 972 | // based on their surrogate pairs.
|
922 | 973 | result.push(createSurrogateCharacterClasses(surrogateMappings));
|
923 | 974 | }
|
924 |
| - if (hasAstral && hasLoneSurrogates) { |
925 |
| - // The data set contains lone high surrogates; append these. Lone high |
926 |
| - // surrogates must go at the end of the regex if astral symbols are to be |
927 |
| - // matched as well. |
928 |
| - result.push(createBMPCharacterClasses(loneHighSurrogates)); |
| 975 | + // https://gist.github.com/mathiasbynens/bbe7f870208abcfec860 |
| 976 | + if (hasLoneHighSurrogates) { |
| 977 | + result.push( |
| 978 | + createBMPCharacterClasses(loneHighSurrogates) + |
| 979 | + // Make sure the high surrogates aren’t part of a surrogate pair. |
| 980 | + '(?![\\uDC00-\\uDFFF])' |
| 981 | + ); |
| 982 | + } |
| 983 | + if (hasLoneLowSurrogates) { |
| 984 | + result.push( |
| 985 | + // Make sure the low surrogates aren’t part of a surrogate pair. |
| 986 | + '(?:[^\\uD800-\\uDBFF]|^)' + |
| 987 | + createBMPCharacterClasses(loneLowSurrogates) |
| 988 | + ); |
929 | 989 | }
|
930 | 990 | return result.join('|');
|
931 | 991 | };
|
|
1066 | 1126 | return regenerate;
|
1067 | 1127 | });
|
1068 | 1128 | } else if (freeExports && !freeExports.nodeType) {
|
1069 |
| - if (freeModule) { // in Node.js or RingoJS v0.8.0+ |
| 1129 | + if (freeModule) { // in Node.js, io.js, or RingoJS v0.8.0+ |
1070 | 1130 | freeModule.exports = regenerate;
|
1071 | 1131 | } else { // in Narwhal or RingoJS v0.7.0-
|
1072 | 1132 | freeExports.regenerate = regenerate;
|
|
0 commit comments