Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions packages/csv-parse/lib/api/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -747,18 +747,19 @@ const transform = function (original_options = {}) {
},
// Helper to test if a character is trimable
__isCharTrimable: function (buf, pos) {
const isTrim = (buf, pos) => {
const { timchars } = this.state;
loop1: for (let i = 0; i < timchars.length; i++) {
const timchar = timchars[i];
for (let j = 0; j < timchar.length; j++) {
if (timchar[j] !== buf[pos + j]) continue loop1;
}
return timchar.length;
const { timchars, timcharFirstBytes } = this.state;
// Fast bail-out: non-whitespace bytes (the common case) are rejected
// without scanning the full timchar list.
const first = buf[pos];
if (first === undefined || timcharFirstBytes[first] === 0) return 0;
loop1: for (let i = 0; i < timchars.length; i++) {
const timchar = timchars[i];
for (let j = 0; j < timchar.length; j++) {
if (timchar[j] !== buf[pos + j]) continue loop1;
}
return 0;
};
return isTrim(buf, pos);
return timchar.length;
}
return 0;
},
__isDelimiter: function (buf, pos, chr) {
const { delimiter, ignore_last_delimiters } = this.options;
Expand Down
74 changes: 57 additions & 17 deletions packages/csv-parse/lib/api/init_state.js
Original file line number Diff line number Diff line change
@@ -1,16 +1,59 @@
import ResizeableBuffer from "../utils/ResizeableBuffer.js";

// white space characters
// https://en.wikipedia.org/wiki/Whitespace_character
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
const np = 12;
const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal
const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
const space = 32;
const tab = 9;

const init_state = function (options) {
// ECMAScript WhiteSpace + LineTerminator codepoints, encoded under
// `options.encoding`. Aligns trimming with `String.prototype.trim()`.
// https://tc39.es/ecma262/#sec-white-space
// https://tc39.es/ecma262/#sec-line-terminators
//
// Codepoints unrepresentable in the target encoding are dropped: Node's
// Buffer substitutes them with `?` (0x3F), and including those would cause
// literal `?` bytes in the input to be trimmed under `latin1`/`ascii`.
const timchars = [
// Basic Latin
0x0020, // [Space](https://www.fileformat.info/info/unicode/char/0020/index.htm)
0x0009, // [CHARACTER TABULATION (HT)](https://www.fileformat.info/info/unicode/char/0009/index.htm)
0x000a, // [LINE FEED (LF)](https://www.fileformat.info/info/unicode/char/000a/index.htm)
0x000d, // [CARRIAGE RETURN (CR)](https://www.fileformat.info/info/unicode/char/000d/index.htm)
0x000c, // [FORM FEED (FF)](https://www.fileformat.info/info/unicode/char/000c/index.htm)
0x000b, // [LINE TABULATION (VT)](https://www.fileformat.info/info/unicode/char/000b/index.htm)
// Latin-1 Supplement
0x00a0, // [NO-BREAK SPACE (NBSP)](https://www.fileformat.info/info/unicode/char/00a0/index.htm)
// Ogham
0x1680, // [OGHAM SPACE MARK](https://www.fileformat.info/info/unicode/char/1680/index.htm)
// General Punctuation
0x2000, // [EN QUAD](https://www.fileformat.info/info/unicode/char/2000/index.htm)
0x2001, // [EM QUAD](https://www.fileformat.info/info/unicode/char/2001/index.htm)
0x2002, // [EN SPACE](https://www.fileformat.info/info/unicode/char/2002/index.htm)
0x2003, // [EM SPACE](https://www.fileformat.info/info/unicode/char/2003/index.htm)
0x2004, // [THREE-PER-EM SPACE](https://www.fileformat.info/info/unicode/char/2004/index.htm)
0x2005, // [FOUR-PER-EM SPACE](https://www.fileformat.info/info/unicode/char/2005/index.htm)
0x2006, // [SIX-PER-EM SPACE](https://www.fileformat.info/info/unicode/char/2006/index.htm)
0x2007, // [FIGURE SPACE](https://www.fileformat.info/info/unicode/char/2007/index.htm)
0x2008, // [PUNCTUATION SPACE](https://www.fileformat.info/info/unicode/char/2008/index.htm)
0x2009, // [THIN SPACE](https://www.fileformat.info/info/unicode/char/2009/index.htm)
0x200a, // [HAIR SPACE](https://www.fileformat.info/info/unicode/char/200a/index.htm)
0x2028, // [LINE SEPARATOR](https://www.fileformat.info/info/unicode/char/2028/index.htm)
0x2029, // [PARAGRAPH SEPARATOR](https://www.fileformat.info/info/unicode/char/2029/index.htm)
0x202f, // [NARROW NO-BREAK SPACE (NNBSP)](https://www.fileformat.info/info/unicode/char/202f/index.htm)
0x205f, // [MEDIUM MATHEMATICAL SPACE (MMSP)](https://www.fileformat.info/info/unicode/char/205f/index.htm)
0x3000, // [IDEOGRAPHIC SPACE](https://www.fileformat.info/info/unicode/char/3000/index.htm)
0xfeff, // [ZERO WIDTH NO-BREAK SPACE (BOM)](https://www.fileformat.info/info/unicode/char/feff/index.htm)
].reduce((acc, codepoint) => {
const encoded = Buffer.from(
String.fromCharCode(codepoint),
options.encoding,
);
if (codepoint !== 0x3f && encoded.length === 1 && encoded[0] === 0x3f) {
return acc;
}
acc.push(encoded);
return acc;
}, []);
// First-byte lookup table for `__isCharTrimable`. Non-whitespace bytes
// (the common case) bail out in O(1) without scanning every timchar.
const timcharFirstBytes = new Uint8Array(256);
for (const t of timchars) timcharFirstBytes[t[0]] = 1;
return {
bomSkipped: false,
bufBytesStart: 0,
Expand All @@ -37,6 +80,8 @@ const init_state = function (options) {
...options.delimiter.map((delimiter) => delimiter.length),
// Skip if the remaining buffer can be escape sequence
options.quote !== null ? options.quote.length : 0,
// Skip if the remaining buffer can be a multi-byte trim character
...timchars.map((t) => t.length),
),
previousBuf: undefined,
quoting: false,
Expand All @@ -55,13 +100,8 @@ const init_state = function (options) {
],
wasQuoting: false,
wasRowDelimiter: false,
timchars: [
Buffer.from(Buffer.from([cr], "utf8").toString(), options.encoding),
Buffer.from(Buffer.from([nl], "utf8").toString(), options.encoding),
Buffer.from(Buffer.from([np], "utf8").toString(), options.encoding),
Buffer.from(Buffer.from([space], "utf8").toString(), options.encoding),
Buffer.from(Buffer.from([tab], "utf8").toString(), options.encoding),
],
timchars: timchars,
timcharFirstBytes: timcharFirstBytes,
};
};

Expand Down
90 changes: 90 additions & 0 deletions packages/csv-parse/test/option.trim.ts
Original file line number Diff line number Diff line change
Expand Up @@ -295,4 +295,94 @@ describe("Option `trim`", function () {
);
});
});

describe("unicode whitespace", function () {
const ws = (codepoint: number): string => String.fromCharCode(codepoint);

it("trim ideographic space U+3000", function (next) {
const sp = ws(0x3000);
parse(`${sp}a${sp},${sp}b${sp}`, { trim: true }, (err, records) => {
if (err) return next(err);
records.should.eql([["a", "b"]]);
next();
});
});

it("trim vertical tab U+000B", function (next) {
const sp = ws(0x000b);
parse(
`${sp}a${sp},${sp}b${sp}`,
{ trim: true, record_delimiter: "|" },
(err, records) => {
if (err) return next(err);
records.should.eql([["a", "b"]]);
next();
},
);
});

it("trim no-break space U+00A0", function (next) {
const sp = ws(0x00a0);
parse(`${sp}a${sp},${sp}b${sp}`, { trim: true }, (err, records) => {
if (err) return next(err);
records.should.eql([["a", "b"]]);
next();
});
});

it("trim mixed ECMAScript whitespace at field boundaries", function (next) {
// U+2028 and U+2029 are excluded because they act as record delimiters.
const codepoints = [
0x00a0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006,
0x2007, 0x2008, 0x2009, 0x200a, 0x202f, 0x205f, 0x3000, 0xfeff,
];
const surround = codepoints.map(ws).join("");
parse(
`${surround}field-1${surround},${surround}field-2${surround}`,
{ trim: true },
(err, records) => {
if (err) return next(err);
records.should.eql([["field-1", "field-2"]]);
next();
},
);
});

it("does not trim '?' under latin1 encoding", function (next) {
parse(
Buffer.from("?a?,?b?", "latin1"),
{ encoding: "latin1", trim: true },
(err, records) => {
if (err) return next(err);
records.should.eql([["?a?", "?b?"]]);
next();
},
);
});

it("trim multi-byte whitespace split across writes", function (next) {
const records: string[] = [];
const parser = parse({ trim: true });
parser.on("readable", () => {
let d;
while ((d = parser.read())) {
records.push(d);
}
});
parser.on("end", () => {
records.should.eql([["a", "b"]]);
next();
});
const sp = Buffer.from(ws(0x3000), "utf8");
parser.write(sp.subarray(0, 1));
parser.write(sp.subarray(1));
parser.write("a");
parser.write(sp.subarray(0, 2));
parser.write(sp.subarray(2));
parser.write(",");
parser.write("b");
parser.write(sp);
parser.end();
});
});
});