Skip to content

Commit 9d863f5

Browse files
authored
Merge pull request #17331 from calixteman/lang_marker
Remove language codes from text strings.
2 parents 4b1de15 + eb5f610 commit 9d863f5

File tree

2 files changed

+47
-2
lines changed

2 files changed

+47
-2
lines changed

src/shared/util.js

+21-2
Original file line numberDiff line numberDiff line change
@@ -905,12 +905,21 @@ const PDFStringTranslateTable = [
905905
];
906906

907907
function stringToPDFString(str) {
908+
// See section 7.9.2.2 Text String Type.
909+
// The string can contain some language codes bracketed with 0x1b (ESC),
910+
// so we must remove them.
908911
if (str[0] >= "\xEF") {
909912
let encoding;
910913
if (str[0] === "\xFE" && str[1] === "\xFF") {
911914
encoding = "utf-16be";
915+
if (str.length % 2 === 1) {
916+
str = str.slice(0, -1);
917+
}
912918
} else if (str[0] === "\xFF" && str[1] === "\xFE") {
913919
encoding = "utf-16le";
920+
if (str.length % 2 === 1) {
921+
str = str.slice(0, -1);
922+
}
914923
} else if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") {
915924
encoding = "utf-8";
916925
}
@@ -919,7 +928,11 @@ function stringToPDFString(str) {
919928
try {
920929
const decoder = new TextDecoder(encoding, { fatal: true });
921930
const buffer = stringToBytes(str);
922-
return decoder.decode(buffer);
931+
const decoded = decoder.decode(buffer);
932+
if (!decoded.includes("\x1b")) {
933+
return decoded;
934+
}
935+
return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
923936
} catch (ex) {
924937
warn(`stringToPDFString: "${ex}".`);
925938
}
@@ -928,7 +941,13 @@ function stringToPDFString(str) {
928941
// ISO Latin 1
929942
const strBuf = [];
930943
for (let i = 0, ii = str.length; i < ii; i++) {
931-
const code = PDFStringTranslateTable[str.charCodeAt(i)];
944+
const charCode = str.charCodeAt(i);
945+
if (charCode === 0x1b) {
946+
// eslint-disable-next-line no-empty
947+
while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
948+
continue;
949+
}
950+
const code = PDFStringTranslateTable[charCode];
932951
strBuf.push(code ? String.fromCharCode(code) : str.charAt(i));
933952
}
934953
return strBuf.join("");

test/unit/util_spec.js

+26
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,21 @@ describe("util", function () {
9999
expect(stringToPDFString(str)).toEqual("string");
100100
});
101101

102+
it("handles incomplete UTF-16 big-endian strings", function () {
103+
const str = "\xFE\xFF\x00\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00";
104+
expect(stringToPDFString(str)).toEqual("strin");
105+
});
106+
102107
it("handles UTF-16 little-endian strings", function () {
103108
const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67\x00";
104109
expect(stringToPDFString(str)).toEqual("string");
105110
});
106111

112+
it("handles incomplete UTF-16 little-endian strings", function () {
113+
const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67";
114+
expect(stringToPDFString(str)).toEqual("strin");
115+
});
116+
107117
it("handles UTF-8 strings", function () {
108118
const simpleStr = "\xEF\xBB\xBF\x73\x74\x72\x69\x6E\x67";
109119
expect(stringToPDFString(simpleStr)).toEqual("string");
@@ -134,6 +144,22 @@ describe("util", function () {
134144
const str4 = "\xEF\xBB\xBF";
135145
expect(stringToPDFString(str4)).toEqual("");
136146
});
147+
148+
it("handles strings with language code", function () {
149+
// ISO Latin 1
150+
const str1 = "hello \x1benUS\x1bworld";
151+
expect(stringToPDFString(str1)).toEqual("hello world");
152+
153+
// UTF-16BE
154+
const str2 =
155+
"\xFE\xFF\x00h\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d";
156+
expect(stringToPDFString(str2)).toEqual("hello world");
157+
158+
// UTF-16LE
159+
const str3 =
160+
"\xFF\xFEh\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d\x00";
161+
expect(stringToPDFString(str3)).toEqual("hello world");
162+
});
137163
});
138164

139165
describe("ReadableStream", function () {

0 commit comments

Comments
 (0)