Skip to content
This repository was archived by the owner on Sep 11, 2024. It is now read-only.

Commit 7b68e82

Browse files
committed
Don't consider textual characters to be emoji
We were using emojibase-regex to match emoji within messages. However, the docs (https://emojibase.dev/docs/regex/) state that this regex matches both emoji and text presentation characters. This is not what we want, and it results in false positives for characters like '↔', which are rendered as text by default and only turn into an emoji when paired with the emoji variation selector (U+FE0F). Unfortunately, none of the other regexes provided by Emojibase do what we want either (milesj/emojibase#174). In the meantime, browser support for the RGI_Emoji Unicode property of strings (available in regular expressions that use the `v` flag) has made it feasible to write an emoji regex by hand, so that's what I've done.
1 parent 99b2485 commit 7b68e82

File tree

6 files changed

+80
-12
lines changed

6 files changed

+80
-12
lines changed

.eslintrc.js

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,11 @@ module.exports = {
7070
name: "matrix-react-sdk/",
7171
message: "Please use matrix-react-sdk/src/index instead",
7272
},
73+
{
74+
name: "emojibase-regex",
75+
message:
76+
"This regex doesn't actually test for emoji. See the docs at https://emojibase.dev/docs/regex/ and prefer our own EMOJI_REGEX from HtmlUtils.",
77+
},
7378
],
7479
patterns: [
7580
{
@@ -138,6 +143,11 @@ module.exports = {
138143
],
139144
message: "Please use matrix-js-sdk/src/matrix instead",
140145
},
146+
{
147+
group: ["emojibase-regex/emoji*"],
148+
message:
149+
"This regex doesn't actually test for emoji. See the docs at https://emojibase.dev/docs/regex/ and prefer our own EMOJI_REGEX from HtmlUtils.",
150+
},
141151
],
142152
},
143153
],

src/HtmlUtils.tsx

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ limitations under the License.
2020
import React, { LegacyRef, ReactNode } from "react";
2121
import sanitizeHtml from "sanitize-html";
2222
import classNames from "classnames";
23-
import EMOJIBASE_REGEX from "emojibase-regex";
2423
import katex from "katex";
2524
import { decode } from "html-entities";
2625
import { IContent } from "matrix-js-sdk/src/matrix";
@@ -46,10 +45,17 @@ const SURROGATE_PAIR_PATTERN = /([\ud800-\udbff])([\udc00-\udfff])/;
4645
const SYMBOL_PATTERN = /([\u2100-\u2bff])/;
4746

4847
// Regex pattern for non-emoji characters that can appear in an "all-emoji" message
49-
// (Zero-Width Joiner, Zero-Width Space, Emoji presentation character, other whitespace)
50-
const EMOJI_SEPARATOR_REGEX = /[\u200D\u200B\s]|\uFE0F/g;
48+
// (Zero-Width Space, other whitespace)
49+
const EMOJI_SEPARATOR_REGEX = /[\u200B\s]/g;
5150

52-
const BIGEMOJI_REGEX = new RegExp(`^(${EMOJIBASE_REGEX.source})+$`, "i");
51+
// Regex for emoji. This includes any RGI_Emoji sequence followed by an optional
52+
// emoji presentation VS (U+FE0F), but not those sequences that are followed by
53+
// a text presentation VS (U+FE0E). Technically this produces false negatives
54+
// for emoji followed by U+FE0E when the emoji doesn't have a text variant, but
55+
// in practice this doesn't matter.
56+
export const EMOJI_REGEX = /\p{RGI_Emoji}(?!\uFE0E)(?:(?<!\uFE0F)\uFE0F)?/v;
57+
58+
const BIGEMOJI_REGEX = new RegExp(`^(${EMOJI_REGEX.source})+$`, "iv");
5359

5460
/*
5561
* Return true if the given string contains emoji
@@ -266,7 +272,7 @@ export function formatEmojis(message: string | undefined, isHtmlMessage?: boolea
266272

267273
const splitter = new GraphemeSplitter();
268274
for (const char of splitter.iterateGraphemes(message)) {
269-
if (EMOJIBASE_REGEX.test(char)) {
275+
if (EMOJI_REGEX.test(char)) {
270276
if (text) {
271277
result.push(text);
272278
text = "";

src/components/views/rooms/SendMessageComposer.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ limitations under the License.
1515
*/
1616

1717
import React, { createRef, KeyboardEvent, SyntheticEvent } from "react";
18-
import EMOJI_REGEX from "emojibase-regex";
1918
import {
2019
IContent,
2120
MatrixEvent,
@@ -70,6 +69,7 @@ import { doMaybeLocalRoomAction } from "../../../utils/local-room";
7069
import { Caret } from "../../../editor/caret";
7170
import { IDiff } from "../../../editor/diff";
7271
import { getBlobSafeMimeType } from "../../../utils/blobs";
72+
import { EMOJI_REGEX } from "../../../HtmlUtils";
7373

7474
/**
7575
* Build the mentions information based on the editor model (and any related events):

src/editor/parts.ts

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,11 @@ See the License for the specific language governing permissions and
1515
limitations under the License.
1616
*/
1717

18-
import EMOJIBASE_REGEX from "emojibase-regex";
1918
import { MatrixClient, RoomMember, Room } from "matrix-js-sdk/src/matrix";
2019
import GraphemeSplitter from "graphemer";
2120

2221
import AutocompleteWrapperModel, { GetAutocompleterComponent, UpdateCallback, UpdateQuery } from "./autocomplete";
23-
import { unicodeToShortcode } from "../HtmlUtils";
22+
import { EMOJI_REGEX, unicodeToShortcode } from "../HtmlUtils";
2423
import * as Avatar from "../Avatar";
2524
import defaultDispatcher from "../dispatcher/dispatcher";
2625
import { Action } from "../dispatcher/actions";
@@ -198,7 +197,7 @@ abstract class BasePart {
198197

199198
abstract class PlainBasePart extends BasePart {
200199
protected acceptsInsertion(chr: string, offset: number, inputType: string): boolean {
201-
if (chr === "\n" || EMOJIBASE_REGEX.test(chr)) {
200+
if (chr === "\n" || EMOJI_REGEX.test(chr)) {
202201
return false;
203202
}
204203
// when not pasting or dropping text, reject characters that should start a pill candidate
@@ -376,7 +375,7 @@ class NewlinePart extends BasePart implements IBasePart {
376375

377376
export class EmojiPart extends BasePart implements IBasePart {
378377
protected acceptsInsertion(chr: string, offset: number): boolean {
379-
return EMOJIBASE_REGEX.test(chr);
378+
return EMOJI_REGEX.test(chr);
380379
}
381380

382381
protected acceptsRemoval(position: number, chr: string): boolean {
@@ -574,7 +573,7 @@ export class PartCreator {
574573
case "\n":
575574
return new NewlinePart();
576575
default:
577-
if (EMOJIBASE_REGEX.test(getFirstGrapheme(input))) {
576+
if (EMOJI_REGEX.test(getFirstGrapheme(input))) {
578577
return new EmojiPart();
579578
}
580579
return new PlainPart();
@@ -652,7 +651,7 @@ export class PartCreator {
652651

653652
const splitter = new GraphemeSplitter();
654653
for (const char of splitter.iterateGraphemes(text)) {
655-
if (EMOJIBASE_REGEX.test(char)) {
654+
if (EMOJI_REGEX.test(char)) {
656655
if (plainText) {
657656
parts.push(this.plain(plainText));
658657
plainText = "";

test/HtmlUtils-test.tsx

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,12 @@ describe("bodyToHtml", () => {
107107
expect(html).toMatchInlineSnapshot(`"<span class="mx_EventTile_searchHighlight">test</span> foo &lt;b&gt;bar"`);
108108
});
109109

110+
it("generates big emoji for emoji made of multiple characters", () => {
111+
const { asFragment } = render(bodyToHtml({ body: "👨‍👩‍👧‍👦 ↔️ 🇮🇸", msgtype: "m.text" }, [], {}) as ReactElement);
112+
113+
expect(asFragment()).toMatchSnapshot();
114+
});
115+
110116
it("should generate big emoji for an emoji-only reply to a message", () => {
111117
const { asFragment } = render(
112118
bodyToHtml(
@@ -132,6 +138,12 @@ describe("bodyToHtml", () => {
132138
expect(asFragment()).toMatchSnapshot();
133139
});
134140

141+
it("does not mistake characters in text presentation mode for emoji", () => {
142+
const { asFragment } = render(bodyToHtml({ body: "↔", msgtype: "m.text" }, [], {}) as ReactElement);
143+
144+
expect(asFragment()).toMatchSnapshot();
145+
});
146+
135147
describe("feature_latex_maths", () => {
136148
beforeEach(() => {
137149
jest.spyOn(SettingsStore, "getValue").mockImplementation((feature) => feature === "feature_latex_maths");

test/__snapshots__/HtmlUtils-test.tsx.snap

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
// Jest Snapshot v1, https://goo.gl/fbAQLP
22

3+
exports[`bodyToHtml does not mistake characters in text presentation mode for emoji 1`] = `
4+
<DocumentFragment>
5+
<span
6+
class="mx_EventTile_body"
7+
dir="auto"
8+
>
9+
10+
</span>
11+
</DocumentFragment>
12+
`;
13+
314
exports[`bodyToHtml feature_latex_maths should not mangle code blocks 1`] = `"<p>hello</p><pre><code>$\\xi$</code></pre><p>world</p>"`;
415

516
exports[`bodyToHtml feature_latex_maths should not mangle divs 1`] = `"<p>hello</p><div>world</div>"`;
@@ -8,6 +19,36 @@ exports[`bodyToHtml feature_latex_maths should render block katex 1`] = `"<p>hel
819

920
exports[`bodyToHtml feature_latex_maths should render inline katex 1`] = `"hello <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>ξ</mi></mrow><annotation encoding="application/x-tex">\\xi</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8889em;vertical-align:-0.1944em;"></span><span class="mord mathnormal" style="margin-right:0.04601em;">ξ</span></span></span></span> world"`;
1021

22+
exports[`bodyToHtml generates big emoji for emoji made of multiple characters 1`] = `
23+
<DocumentFragment>
24+
<span
25+
class="mx_EventTile_body mx_EventTile_bigEmoji"
26+
dir="auto"
27+
>
28+
<span
29+
class="mx_Emoji"
30+
title=":man-woman-girl-boy:"
31+
>
32+
👨‍👩‍👧‍👦
33+
</span>
34+
35+
<span
36+
class="mx_Emoji"
37+
title=":left_right_arrow:"
38+
>
39+
↔️
40+
</span>
41+
42+
<span
43+
class="mx_Emoji"
44+
title=":flag-is:"
45+
>
46+
🇮🇸
47+
</span>
48+
</span>
49+
</DocumentFragment>
50+
`;
51+
1152
exports[`bodyToHtml should generate big emoji for an emoji-only reply to a message 1`] = `
1253
<DocumentFragment>
1354
<span

0 commit comments

Comments
 (0)