Skip to content

Commit 8bf52f0

Browse files
committed
fix(csv-parse): rtrim encoding support (fix #349)
1 parent 737ac66 commit 8bf52f0

File tree

11 files changed

+414
-153
lines changed

11 files changed

+414
-153
lines changed

packages/csv-parse/dist/cjs/index.cjs

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,16 @@ class ResizeableBuffer{
115115
}
116116
}
117117

118+
// white space characters
119+
// https://en.wikipedia.org/wiki/Whitespace_character
120+
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
121+
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
122+
const np = 12;
123+
const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
124+
const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
125+
const space = 32;
126+
const tab = 9;
127+
118128
const init_state = function(options){
119129
return {
120130
bomSkipped: false,
@@ -148,7 +158,14 @@ const init_state = function(options){
148158
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
149159
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
150160
wasQuoting: false,
151-
wasRowDelimiter: false
161+
wasRowDelimiter: false,
162+
timchars: [
163+
Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding),
164+
Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding),
165+
Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding),
166+
Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding),
167+
Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding),
168+
]
152169
};
153170
};
154171

@@ -571,15 +588,9 @@ const isRecordEmpty = function(record){
571588
return record.every((field) => field == null || field.toString && field.toString().trim() === '');
572589
};
573590

574-
// white space characters
575-
// https://en.wikipedia.org/wiki/Whitespace_character
576-
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
577-
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
578-
const tab = 9;
579-
const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal
580-
const np = 12;
581-
const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal
582-
const space = 32;
591+
const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
592+
const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
593+
583594
const boms = {
584595
// Note, the following are equals:
585596
// Buffer.from("\ufeff")
@@ -724,7 +735,7 @@ const transform = function(original_options = {}) {
724735
if(this.state.commenting === false && this.__isQuote(buf, pos)){
725736
if(this.state.quoting === true){
726737
const nextChr = buf[pos+quote.length];
727-
const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr);
738+
const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length);
728739
const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr);
729740
const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr);
730741
const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length);
@@ -834,30 +845,34 @@ const transform = function(original_options = {}) {
834845
}
835846
if(this.state.commenting === false){
836847
if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){
837-
const err = this.__error(
848+
return this.__error(
838849
new CsvError('CSV_MAX_RECORD_SIZE', [
839850
'Max Record Size:',
840851
'record exceed the maximum number of tolerated bytes',
841852
`of ${max_record_size}`,
842853
`at line ${this.info.lines}`,
843854
], this.options, this.__infoField())
844855
);
845-
if(err !== undefined) return err;
846856
}
847857
}
848-
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr);
858+
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos);
849859
// rtrim in non-quoting mode is handled in __onField
850860
const rappend = rtrim === false || this.state.wasQuoting === false;
851861
if(lappend === true && rappend === true){
852862
this.state.field.append(chr);
853-
}else if(rtrim === true && !this.__isCharTrimable(chr)){
863+
}else if(rtrim === true && !this.__isCharTrimable(buf, pos)){
854864
return this.__error(
855865
new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [
856866
'Invalid Closing Quote:',
857867
'found non trimable byte after quote',
858868
`at line ${this.info.lines}`,
859869
], this.options, this.__infoField())
860870
);
871+
}else {
872+
if(lappend === false){
873+
pos += this.__isCharTrimable(buf, pos) - 1;
874+
}
875+
continue;
861876
}
862877
}
863878
if(end === true){
@@ -1114,8 +1129,19 @@ const transform = function(original_options = {}) {
11141129
return [undefined, field];
11151130
},
11161131
// Helper to test if a character is a space or a line delimiter
1117-
__isCharTrimable: function(chr){
1118-
return chr === space || chr === tab || chr === cr || chr === nl || chr === np;
1132+
__isCharTrimable: function(buf, pos){
1133+
const isTrim = (buf, pos) => {
1134+
const {timchars} = this.state;
1135+
loop1: for(let i = 0; i < timchars.length; i++){
1136+
const timchar = timchars[i];
1137+
for(let j = 0; j < timchar.length; j++){
1138+
if(timchar[j] !== buf[pos+j]) continue loop1;
1139+
}
1140+
return timchar.length;
1141+
}
1142+
return 0;
1143+
};
1144+
return isTrim(buf, pos);
11191145
},
11201146
// Keep it in case we implement the `cast_int` option
11211147
// __isInt(value){

packages/csv-parse/dist/cjs/sync.cjs

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,16 @@ class ResizeableBuffer{
113113
}
114114
}
115115

116+
// white space characters
117+
// https://en.wikipedia.org/wiki/Whitespace_character
118+
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
119+
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
120+
const np = 12;
121+
const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
122+
const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
123+
const space = 32;
124+
const tab = 9;
125+
116126
const init_state = function(options){
117127
return {
118128
bomSkipped: false,
@@ -146,7 +156,14 @@ const init_state = function(options){
146156
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
147157
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
148158
wasQuoting: false,
149-
wasRowDelimiter: false
159+
wasRowDelimiter: false,
160+
timchars: [
161+
Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding),
162+
Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding),
163+
Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding),
164+
Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding),
165+
Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding),
166+
]
150167
};
151168
};
152169

@@ -569,15 +586,9 @@ const isRecordEmpty = function(record){
569586
return record.every((field) => field == null || field.toString && field.toString().trim() === '');
570587
};
571588

572-
// white space characters
573-
// https://en.wikipedia.org/wiki/Whitespace_character
574-
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
575-
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
576-
const tab = 9;
577-
const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal
578-
const np = 12;
579-
const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal
580-
const space = 32;
589+
const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
590+
const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
591+
581592
const boms = {
582593
// Note, the following are equals:
583594
// Buffer.from("\ufeff")
@@ -722,7 +733,7 @@ const transform = function(original_options = {}) {
722733
if(this.state.commenting === false && this.__isQuote(buf, pos)){
723734
if(this.state.quoting === true){
724735
const nextChr = buf[pos+quote.length];
725-
const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr);
736+
const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length);
726737
const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr);
727738
const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr);
728739
const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length);
@@ -832,30 +843,34 @@ const transform = function(original_options = {}) {
832843
}
833844
if(this.state.commenting === false){
834845
if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){
835-
const err = this.__error(
846+
return this.__error(
836847
new CsvError('CSV_MAX_RECORD_SIZE', [
837848
'Max Record Size:',
838849
'record exceed the maximum number of tolerated bytes',
839850
`of ${max_record_size}`,
840851
`at line ${this.info.lines}`,
841852
], this.options, this.__infoField())
842853
);
843-
if(err !== undefined) return err;
844854
}
845855
}
846-
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr);
856+
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos);
847857
// rtrim in non-quoting mode is handled in __onField
848858
const rappend = rtrim === false || this.state.wasQuoting === false;
849859
if(lappend === true && rappend === true){
850860
this.state.field.append(chr);
851-
}else if(rtrim === true && !this.__isCharTrimable(chr)){
861+
}else if(rtrim === true && !this.__isCharTrimable(buf, pos)){
852862
return this.__error(
853863
new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [
854864
'Invalid Closing Quote:',
855865
'found non trimable byte after quote',
856866
`at line ${this.info.lines}`,
857867
], this.options, this.__infoField())
858868
);
869+
}else {
870+
if(lappend === false){
871+
pos += this.__isCharTrimable(buf, pos) - 1;
872+
}
873+
continue;
859874
}
860875
}
861876
if(end === true){
@@ -1112,8 +1127,19 @@ const transform = function(original_options = {}) {
11121127
return [undefined, field];
11131128
},
11141129
// Helper to test if a character is a space or a line delimiter
1115-
__isCharTrimable: function(chr){
1116-
return chr === space || chr === tab || chr === cr || chr === nl || chr === np;
1130+
__isCharTrimable: function(buf, pos){
1131+
const isTrim = (buf, pos) => {
1132+
const {timchars} = this.state;
1133+
loop1: for(let i = 0; i < timchars.length; i++){
1134+
const timchar = timchars[i];
1135+
for(let j = 0; j < timchar.length; j++){
1136+
if(timchar[j] !== buf[pos+j]) continue loop1;
1137+
}
1138+
return timchar.length;
1139+
}
1140+
return 0;
1141+
};
1142+
return isTrim(buf, pos);
11171143
},
11181144
// Keep it in case we implement the `cast_int` option
11191145
// __isInt(value){

packages/csv-parse/dist/esm/index.js

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5171,6 +5171,16 @@ class ResizeableBuffer{
51715171
}
51725172
}
51735173

5174+
// white space characters
5175+
// https://en.wikipedia.org/wiki/Whitespace_character
5176+
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
5177+
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
5178+
const np = 12;
5179+
const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
5180+
const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
5181+
const space = 32;
5182+
const tab = 9;
5183+
51745184
const init_state = function(options){
51755185
return {
51765186
bomSkipped: false,
@@ -5204,7 +5214,14 @@ const init_state = function(options){
52045214
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
52055215
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
52065216
wasQuoting: false,
5207-
wasRowDelimiter: false
5217+
wasRowDelimiter: false,
5218+
timchars: [
5219+
Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding),
5220+
Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding),
5221+
Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding),
5222+
Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding),
5223+
Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding),
5224+
]
52085225
};
52095226
};
52105227

@@ -5627,15 +5644,9 @@ const isRecordEmpty = function(record){
56275644
return record.every((field) => field == null || field.toString && field.toString().trim() === '');
56285645
};
56295646

5630-
// white space characters
5631-
// https://en.wikipedia.org/wiki/Whitespace_character
5632-
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
5633-
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
5634-
const tab = 9;
5635-
const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal
5636-
const np = 12;
5637-
const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal
5638-
const space = 32;
5647+
const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
5648+
const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
5649+
56395650
const boms = {
56405651
// Note, the following are equals:
56415652
// Buffer.from("\ufeff")
@@ -5780,7 +5791,7 @@ const transform = function(original_options = {}) {
57805791
if(this.state.commenting === false && this.__isQuote(buf, pos)){
57815792
if(this.state.quoting === true){
57825793
const nextChr = buf[pos+quote.length];
5783-
const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr);
5794+
const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length);
57845795
const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr);
57855796
const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr);
57865797
const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length);
@@ -5890,30 +5901,34 @@ const transform = function(original_options = {}) {
58905901
}
58915902
if(this.state.commenting === false){
58925903
if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){
5893-
const err = this.__error(
5904+
return this.__error(
58945905
new CsvError('CSV_MAX_RECORD_SIZE', [
58955906
'Max Record Size:',
58965907
'record exceed the maximum number of tolerated bytes',
58975908
`of ${max_record_size}`,
58985909
`at line ${this.info.lines}`,
58995910
], this.options, this.__infoField())
59005911
);
5901-
if(err !== undefined) return err;
59025912
}
59035913
}
5904-
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr);
5914+
const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos);
59055915
// rtrim in non-quoting mode is handled in __onField
59065916
const rappend = rtrim === false || this.state.wasQuoting === false;
59075917
if(lappend === true && rappend === true){
59085918
this.state.field.append(chr);
5909-
}else if(rtrim === true && !this.__isCharTrimable(chr)){
5919+
}else if(rtrim === true && !this.__isCharTrimable(buf, pos)){
59105920
return this.__error(
59115921
new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [
59125922
'Invalid Closing Quote:',
59135923
'found non trimable byte after quote',
59145924
`at line ${this.info.lines}`,
59155925
], this.options, this.__infoField())
59165926
);
5927+
}else {
5928+
if(lappend === false){
5929+
pos += this.__isCharTrimable(buf, pos) - 1;
5930+
}
5931+
continue;
59175932
}
59185933
}
59195934
if(end === true){
@@ -6170,8 +6185,19 @@ const transform = function(original_options = {}) {
61706185
return [undefined, field];
61716186
},
61726187
// Helper to test if a character is a space or a line delimiter
6173-
__isCharTrimable: function(chr){
6174-
return chr === space || chr === tab || chr === cr || chr === nl || chr === np;
6188+
__isCharTrimable: function(buf, pos){
6189+
const isTrim = (buf, pos) => {
6190+
const {timchars} = this.state;
6191+
loop1: for(let i = 0; i < timchars.length; i++){
6192+
const timchar = timchars[i];
6193+
for(let j = 0; j < timchar.length; j++){
6194+
if(timchar[j] !== buf[pos+j]) continue loop1;
6195+
}
6196+
return timchar.length;
6197+
}
6198+
return 0;
6199+
};
6200+
return isTrim(buf, pos);
61756201
},
61766202
// Keep it in case we implement the `cast_int` option
61776203
// __isInt(value){

0 commit comments

Comments
 (0)