Skip to content

Commit e8777e1

Browse files
authored
Fix: Use lazy encoding for utf-8 encoded string comparison (#2299)
1 parent c6c85b6 commit e8777e1

File tree

3 files changed

+304
-11
lines changed

3 files changed

+304
-11
lines changed

dev/src/order.ts

+50-7
Original file line numberDiff line numberDiff line change
@@ -248,19 +248,62 @@ function compareVectors(left: ApiMapValue, right: ApiMapValue): number {
248248
return compareArrays(leftArray, rightArray);
249249
}
250250

251-
function stringToUtf8Bytes(str: string): Uint8Array {
252-
return new TextEncoder().encode(str);
253-
}
254-
255251
/*!
256252
* Compare strings in UTF-8 encoded byte order
257253
* @private
258254
* @internal
259255
*/
260256
export function compareUtf8Strings(left: string, right: string): number {
261-
const leftBytes = stringToUtf8Bytes(left);
262-
const rightBytes = stringToUtf8Bytes(right);
263-
return compareBlobs(Buffer.from(leftBytes), Buffer.from(rightBytes));
257+
let i = 0;
258+
while (i < left.length && i < right.length) {
259+
const leftCodePoint = left.codePointAt(i)!;
260+
const rightCodePoint = right.codePointAt(i)!;
261+
262+
if (leftCodePoint !== rightCodePoint) {
263+
if (leftCodePoint < 128 && rightCodePoint < 128) {
264+
// ASCII comparison
265+
return primitiveComparator(leftCodePoint, rightCodePoint);
266+
} else {
267+
// Lazy instantiate TextEncoder
268+
const encoder = new TextEncoder();
269+
270+
// UTF-8 encode the character at index i for byte comparison.
271+
const leftBytes = encoder.encode(getUtf8SafeSubstring(left, i));
272+
const rightBytes = encoder.encode(getUtf8SafeSubstring(right, i));
273+
const comp = compareBlobs(
274+
Buffer.from(leftBytes),
275+
Buffer.from(rightBytes)
276+
);
277+
if (comp !== 0) {
278+
return comp;
279+
} else {
280+
// EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte
281+
// representations are identical. This can happen with malformed input
282+
// (invalid surrogate pairs). The backend also actively prevents invalid
283+
// surrogates as INVALID_ARGUMENT errors, so we almost never receive
284+
// invalid strings from backend.
285+
// Fallback to code point comparison for graceful handling.
286+
return primitiveComparator(leftCodePoint, rightCodePoint);
287+
}
288+
}
289+
}
290+
// Increment by 2 for surrogate pairs, 1 otherwise
291+
i += leftCodePoint > 0xffff ? 2 : 1;
292+
}
293+
294+
// Compare lengths if all characters are equal
295+
return primitiveComparator(left.length, right.length);
296+
}
297+
298+
function getUtf8SafeSubstring(str: string, index: number): string {
299+
const firstCodePoint = str.codePointAt(index)!;
300+
if (firstCodePoint > 0xffff) {
301+
// It's a surrogate pair, return the whole pair
302+
return str.substring(index, index + 2);
303+
} else {
304+
// It's a single code point, return it
305+
return str.substring(index, index + 1);
306+
}
264307
}
265308

266309
/*!

dev/system-test/firestore.ts

+38-4
Original file line numberDiff line numberDiff line change
@@ -4086,6 +4086,20 @@ describe('Query class', () => {
40864086
});
40874087

40884088
describe('sort unicode strings', () => {
4089+
const expectedDocs = [
4090+
'b',
4091+
'a',
4092+
'h',
4093+
'i',
4094+
'c',
4095+
'f',
4096+
'e',
4097+
'd',
4098+
'g',
4099+
'k',
4100+
'j',
4101+
];
4102+
40894103
it('snapshot listener sorts unicode strings same as server', async () => {
40904104
const collection = await testCollectionWithDocs({
40914105
a: {value: 'Łukasiewicz'},
@@ -4095,10 +4109,13 @@ describe('Query class', () => {
40954109
e: {value: 'P'},
40964110
f: {value: '︒'},
40974111
g: {value: '🐵'},
4112+
h: {value: '你好'},
4113+
i: {value: '你顥'},
4114+
j: {value: '😁'},
4115+
k: {value: '😀'},
40984116
});
40994117

41004118
const query = collection.orderBy('value');
4101-
const expectedDocs = ['b', 'a', 'c', 'f', 'e', 'd', 'g'];
41024119

41034120
const getSnapshot = await query.get();
41044121
expect(getSnapshot.docs.map(d => d.id)).to.deep.equal(expectedDocs);
@@ -4123,10 +4140,13 @@ describe('Query class', () => {
41234140
e: {value: ['P']},
41244141
f: {value: ['︒']},
41254142
g: {value: ['🐵']},
4143+
h: {value: ['你好']},
4144+
i: {value: ['你顥']},
4145+
j: {value: ['😁']},
4146+
k: {value: ['😀']},
41264147
});
41274148

41284149
const query = collection.orderBy('value');
4129-
const expectedDocs = ['b', 'a', 'c', 'f', 'e', 'd', 'g'];
41304150

41314151
const getSnapshot = await query.get();
41324152
expect(getSnapshot.docs.map(d => d.id)).to.deep.equal(expectedDocs);
@@ -4151,10 +4171,13 @@ describe('Query class', () => {
41514171
e: {value: {foo: 'P'}},
41524172
f: {value: {foo: '︒'}},
41534173
g: {value: {foo: '🐵'}},
4174+
h: {value: {foo: '你好'}},
4175+
i: {value: {foo: '你顥'}},
4176+
j: {value: {foo: '😁'}},
4177+
k: {value: {foo: '😀'}},
41544178
});
41554179

41564180
const query = collection.orderBy('value');
4157-
const expectedDocs = ['b', 'a', 'c', 'f', 'e', 'd', 'g'];
41584181

41594182
const getSnapshot = await query.get();
41604183
expect(getSnapshot.docs.map(d => d.id)).to.deep.equal(expectedDocs);
@@ -4179,10 +4202,13 @@ describe('Query class', () => {
41794202
e: {value: {: true}},
41804203
f: {value: {'︒': true}},
41814204
g: {value: {'🐵': true}},
4205+
h: {value: {你好: true}},
4206+
i: {value: {你顥: true}},
4207+
j: {value: {'😁': true}},
4208+
k: {value: {'😀': true}},
41824209
});
41834210

41844211
const query = collection.orderBy('value');
4185-
const expectedDocs = ['b', 'a', 'c', 'f', 'e', 'd', 'g'];
41864212

41874213
const getSnapshot = await query.get();
41884214
expect(getSnapshot.docs.map(d => d.id)).to.deep.equal(expectedDocs);
@@ -4207,17 +4233,25 @@ describe('Query class', () => {
42074233
: {value: true},
42084234
'︒': {value: true},
42094235
'🐵': {value: true},
4236+
你好: {value: true},
4237+
你顥: {value: true},
4238+
'😁': {value: true},
4239+
'😀': {value: true},
42104240
});
42114241

42124242
const query = collection.orderBy(FieldPath.documentId());
42134243
const expectedDocs = [
42144244
'Sierpiński',
42154245
'Łukasiewicz',
4246+
'你好',
4247+
'你顥',
42164248
'岩澤',
42174249
'︒',
42184250
'P',
42194251
'🄟',
42204252
'🐵',
4253+
'😀',
4254+
'😁',
42214255
];
42224256

42234257
const getSnapshot = await query.get();

dev/test/order.ts

+216
Original file line numberDiff line numberDiff line change
@@ -284,3 +284,219 @@ describe('Order', () => {
284284
}
285285
});
286286
});
287+
288+
class StringPair {
289+
constructor(
290+
readonly s1: string,
291+
readonly s2: string
292+
) {}
293+
}
294+
295+
class StringPairGenerator {
296+
constructor(private stringGenerator: StringGenerator) {}
297+
298+
next(): StringPair {
299+
const prefix = this.stringGenerator.next();
300+
const s1 = prefix + this.stringGenerator.next();
301+
const s2 = prefix + this.stringGenerator.next();
302+
return new StringPair(s1, s2);
303+
}
304+
}
305+
306+
class StringGenerator {
307+
private static readonly DEFAULT_SURROGATE_PAIR_PROBABILITY = 0.33;
308+
private static readonly DEFAULT_MAX_LENGTH = 20;
309+
310+
private readonly rnd: Random;
311+
private readonly surrogatePairProbability: number;
312+
private readonly maxLength: number;
313+
314+
constructor(seed: number);
315+
constructor(rnd: Random, surrogatePairProbability: number, maxLength: number);
316+
constructor(
317+
seedOrRnd: number | Random,
318+
surrogatePairProbability?: number,
319+
maxLength?: number
320+
) {
321+
if (typeof seedOrRnd === 'number') {
322+
this.rnd = new Random(seedOrRnd);
323+
this.surrogatePairProbability =
324+
StringGenerator.DEFAULT_SURROGATE_PAIR_PROBABILITY;
325+
this.maxLength = StringGenerator.DEFAULT_MAX_LENGTH;
326+
} else {
327+
this.rnd = seedOrRnd;
328+
this.surrogatePairProbability = StringGenerator.validateProbability(
329+
surrogatePairProbability!
330+
);
331+
this.maxLength = StringGenerator.validateLength(maxLength!);
332+
}
333+
}
334+
335+
private static validateProbability(probability: number): number {
336+
if (!Number.isFinite(probability)) {
337+
throw new Error(
338+
`invalid surrogate pair probability: ${probability} (must be between 0.0 and 1.0, inclusive)`
339+
);
340+
} else if (probability < 0.0) {
341+
throw new Error(
342+
`invalid surrogate pair probability: ${probability} (must be greater than or equal to zero)`
343+
);
344+
} else if (probability > 1.0) {
345+
throw new Error(
346+
`invalid surrogate pair probability: ${probability} (must be less than or equal to 1)`
347+
);
348+
}
349+
return probability;
350+
}
351+
352+
private static validateLength(length: number): number {
353+
if (length < 0) {
354+
throw new Error(
355+
`invalid maximum string length: ${length} (must be greater than or equal to zero)`
356+
);
357+
}
358+
return length;
359+
}
360+
361+
next(): string {
362+
const length = this.rnd.nextInt(this.maxLength + 1);
363+
const sb = new StringBuilder();
364+
while (sb.length() < length) {
365+
const codePoint = this.nextCodePoint();
366+
sb.appendCodePoint(codePoint);
367+
}
368+
return sb.toString();
369+
}
370+
371+
private isNextSurrogatePair(): boolean {
372+
return StringGenerator.nextBoolean(this.rnd, this.surrogatePairProbability);
373+
}
374+
375+
private static nextBoolean(rnd: Random, probability: number): boolean {
376+
if (probability === 0.0) {
377+
return false;
378+
} else if (probability === 1.0) {
379+
return true;
380+
} else {
381+
return rnd.nextFloat() < probability;
382+
}
383+
}
384+
385+
private nextCodePoint(): number {
386+
if (this.isNextSurrogatePair()) {
387+
return this.nextSurrogateCodePoint();
388+
} else {
389+
return this.nextNonSurrogateCodePoint();
390+
}
391+
}
392+
393+
private nextSurrogateCodePoint(): number {
394+
const highSurrogateMin = 0xd800;
395+
const highSurrogateMax = 0xdbff;
396+
const lowSurrogateMin = 0xdc00;
397+
const lowSurrogateMax = 0xdfff;
398+
399+
const highSurrogate = this.nextCodePointRange(
400+
highSurrogateMin,
401+
highSurrogateMax
402+
);
403+
const lowSurrogate = this.nextCodePointRange(
404+
lowSurrogateMin,
405+
lowSurrogateMax
406+
);
407+
408+
return (highSurrogate - 0xd800) * 0x400 + (lowSurrogate - 0xdc00) + 0x10000;
409+
}
410+
411+
private nextNonSurrogateCodePoint(): number {
412+
let codePoint;
413+
do {
414+
codePoint = this.nextCodePointRange(0, 0xffff); // BMP range
415+
} while (codePoint >= 0xd800 && codePoint <= 0xdfff); // Exclude surrogate range
416+
417+
return codePoint;
418+
}
419+
420+
private nextCodePointRange(min: number, max: number): number {
421+
const rangeSize = max - min + 1;
422+
const offset = this.rnd.nextInt(rangeSize);
423+
return min + offset;
424+
}
425+
}
426+
427+
class Random {
428+
private seed: number;
429+
430+
constructor(seed: number) {
431+
this.seed = seed;
432+
}
433+
434+
nextInt(max: number): number {
435+
this.seed = (this.seed * 9301 + 49297) % 233280;
436+
const rnd = this.seed / 233280;
437+
return Math.floor(rnd * max);
438+
}
439+
440+
nextFloat(): number {
441+
this.seed = (this.seed * 9301 + 49297) % 233280;
442+
return this.seed / 233280;
443+
}
444+
}
445+
446+
class StringBuilder {
447+
private buffer: string[] = [];
448+
449+
append(str: string): StringBuilder {
450+
this.buffer.push(str);
451+
return this;
452+
}
453+
454+
appendCodePoint(codePoint: number): StringBuilder {
455+
this.buffer.push(String.fromCodePoint(codePoint));
456+
return this;
457+
}
458+
459+
toString(): string {
460+
return this.buffer.join('');
461+
}
462+
463+
length(): number {
464+
return this.buffer.join('').length;
465+
}
466+
}
467+
468+
describe('CompareUtf8Strings', () => {
469+
it('compareUtf8Strings should return correct results', () => {
470+
const errors = [];
471+
const seed = Math.floor(Math.random() * Number.MAX_SAFE_INTEGER);
472+
let passCount = 0;
473+
const stringGenerator = new StringGenerator(new Random(seed), 0.33, 20);
474+
const stringPairGenerator = new StringPairGenerator(stringGenerator);
475+
476+
for (let i = 0; i < 1000000 && errors.length < 10; i++) {
477+
const {s1, s2} = stringPairGenerator.next();
478+
479+
const actual = order.compareUtf8Strings(s1, s2);
480+
const expected = Buffer.from(s1, 'utf8').compare(Buffer.from(s2, 'utf8'));
481+
482+
if (actual === expected) {
483+
passCount++;
484+
} else {
485+
errors.push(
486+
`compareUtf8Strings(s1="${s1}", s2="${s2}") returned ${actual}, ` +
487+
`but expected ${expected} (i=${i}, s1.length=${s1.length}, s2.length=${s2.length})`
488+
);
489+
}
490+
}
491+
492+
if (errors.length > 0) {
493+
console.error(
494+
`${errors.length} test cases failed, ${passCount} test cases passed, seed=${seed};`
495+
);
496+
errors.forEach((error, index) =>
497+
console.error(`errors[${index}]: ${error}`)
498+
);
499+
throw new Error('Test failed');
500+
}
501+
}).timeout(20000);
502+
});

0 commit comments

Comments
 (0)