import { normalize_columns_array } from "./normalize_columns_array.js";
import { init_state } from "./init_state.js";
import { normalize_options } from "./normalize_options.js";
import { CsvError } from "./CsvError.js";

/**
 * Tell whether every field of a record is blank.
 *
 * A field is blank when it is `null`/`undefined` or when its `toString()`
 * value only contains whitespace.
 *
 * @param {Array} record - The list of fields of the record.
 * @returns {boolean} `true` when all the fields are blank.
 */
const isRecordEmpty = function (record) {
  return record.every(
    (field) =>
      field == null || (field.toString && field.toString().trim() === ""),
  );
};

const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal

// Byte order marks recognised at the start of the input, keyed by the
// encoding they announce.
const boms = {
  // Note, the following are equal:
  // Buffer.from("\ufeff")
  // Buffer.from([239, 187, 191])
  // Buffer.from('EFBBBF', 'hex')
  utf8: Buffer.from([239, 187, 191]),
  // Note, the following are equal:
  // Buffer.from("\ufeff", 'utf16le')
  // Buffer.from([255, 254])
  utf16le: Buffer.from([255, 254]),
};

/**
 * Create a parser object holding the normalized options, the parsing state,
 * the running `info` counters and the `parse` routine.
 *
 * @param {object} [original_options={}] - User options, normalized with
 *   `normalize_options`; the original object is kept to re-normalize the
 *   options once a BOM reveals the encoding.
 * @returns {object} The parser with the `info`, `original_options`,
 *   `options` and `state` properties, the `parse` method and its
 *   `__`-prefixed helpers.
 */
const transform = function (original_options = {}) {
  const info = {
    bytes: 0,
    comment_lines: 0,
    empty_lines: 0,
    invalid_field_length: 0,
    lines: 1,
    records: 0,
  };
  const options = normalize_options(original_options);
  return {
    info: info,
    original_options: original_options,
    options: options,
    state: init_state(options),
    /**
     * Tell whether the bytes remaining after position `i` are too few to
     * safely look ahead for an escape, a quote or a record delimiter.
     *
     * @param {number} i - Current position in the buffer.
     * @param {number} bufLen - Length of the buffer.
     * @param {boolean} end - Whether no more data will be provided.
     * @returns {boolean} `true` when parsing must wait for more data.
     */
    __needMoreData: function (i, bufLen, end) {
      if (end) return false;
      const { encoding, escape, quote } = this.options;
      const { quoting, needMoreDataSize, recordDelimiterMaxLength } =
        this.state;
      const numOfCharLeft = bufLen - i - 1;
      const requiredLength = Math.max(
        needMoreDataSize,
        // Skip if the remaining buffer is smaller than the record delimiter.
        // If "record_delimiter" is yet to be discovered:
        // 1. It is equal to `[]` and "recordDelimiterMaxLength" equals `0`
        // 2. We set the length to windows line ending in the current encoding
        // Note, that encoding is known from user or bom discovery at that point
        recordDelimiterMaxLength === 0
          ? Buffer.from("\r\n", encoding).length
          : recordDelimiterMaxLength,
        // Skip if remaining buffer can be an escaped quote
        quoting ? (escape === null ? 0 : escape.length) + quote.length : 0,
        // Skip if remaining buffer can be record delimiter following the closing quote
        quoting ? quote.length + recordDelimiterMaxLength : 0,
      );
      return numOfCharLeft < requiredLength;
    },
    /**
     * Central parser implementation.
     *
     * Consumes `nextBuf` (prefixed with the bytes left over from the previous
     * call), emits the records it completes through `push` and stores the
     * unconsumed bytes in `state.previousBuf` unless `end` is `true`.
     *
     * @param {Buffer|undefined} nextBuf - New chunk of data, if any.
     * @param {boolean} end - Whether this is the last call.
     * @param {Function} push - Called with each emitted record.
     * @param {Function} close - Called when parsing stops early or when
     *   there is nothing to parse.
     * @returns {Error|undefined} An error when parsing failed.
     */
    parse: function (nextBuf, end, push, close) {
      const {
        bom,
        comment_no_infix,
        from_line,
        ltrim,
        max_record_size,
        raw,
        relax_quotes,
        rtrim,
        skip_empty_lines,
        to,
        to_line,
      } = this.options;
      // Those options depend on the encoding and are re-read after a BOM
      // discovery.
      let { comment, encoding, escape, quote, record_delimiter } =
        this.options;
      const { bomSkipped, previousBuf, rawBuffer, escapeIsQuote } = this.state;
      let buf;
      if (previousBuf === undefined) {
        if (nextBuf === undefined) {
          // Handle empty string
          close();
          return;
        }
        buf = nextBuf;
      } else if (nextBuf === undefined) {
        buf = previousBuf;
      } else {
        buf = Buffer.concat([previousBuf, nextBuf]);
      }
      // Handle UTF BOM
      if (bomSkipped === false) {
        if (bom === false) {
          this.state.bomSkipped = true;
        } else if (buf.length < 3) {
          // Not enough data
          if (end === false) {
            // Wait for more data
            this.state.previousBuf = buf;
            return;
          }
        } else {
          for (const [bomEncoding, bomBytes] of Object.entries(boms)) {
            if (bomBytes.compare(buf, 0, bomBytes.length) === 0) {
              // Skip BOM
              const bomLength = bomBytes.length;
              this.state.bufBytesStart += bomLength;
              buf = buf.slice(bomLength);
              // Renormalize original options with the new encoding
              this.options = normalize_options({
                ...this.original_options,
                encoding: bomEncoding,
              });
              // Options will re-evaluate the Buffer with the new encoding
              ({ comment, encoding, escape, quote } = this.options);
              break;
            }
          }
          this.state.bomSkipped = true;
        }
      }
      const bufLen = buf.length;
      let pos;
      for (pos = 0; pos < bufLen; pos++) {
        // Ensure we get enough space to look ahead
        // There should be a way to move this out of the loop
        if (this.__needMoreData(pos, bufLen, end)) {
          break;
        }
        if (this.state.wasRowDelimiter === true) {
          this.info.lines++;
          this.state.wasRowDelimiter = false;
        }
        if (to_line !== -1 && this.info.lines > to_line) {
          this.state.stop = true;
          close();
          return;
        }
        // Auto discovery of record_delimiter, unix, mac and windows supported
        if (this.state.quoting === false && record_delimiter.length === 0) {
          const record_delimiterCount = this.__autoDiscoverRecordDelimiter(
            buf,
            pos,
          );
          if (record_delimiterCount) {
            record_delimiter = this.options.record_delimiter;
          }
        }
        const chr = buf[pos];
        if (raw === true) {
          rawBuffer.append(chr);
        }
        if (
          (chr === cr || chr === nl) &&
          this.state.wasRowDelimiter === false
        ) {
          this.state.wasRowDelimiter = true;
        }
        // Previous char was a valid escape char
        // treat the current char as a regular char
        if (this.state.escaping === true) {
          this.state.escaping = false;
        } else {
          // Escape is only active inside quoted fields
          // We are quoting, the char is an escape chr and there is a chr to escape
          if (
            escape !== null &&
            this.state.quoting === true &&
            this.__isEscape(buf, pos, chr) &&
            pos + escape.length < bufLen
          ) {
            if (escapeIsQuote) {
              if (this.__isQuote(buf, pos + escape.length)) {
                this.state.escaping = true;
                pos += escape.length - 1;
                continue;
              }
            } else {
              this.state.escaping = true;
              pos += escape.length - 1;
              continue;
            }
          }
          // Not currently escaping and chr is a quote
          // TODO: need to compare bytes instead of single char
          if (this.state.commenting === false && this.__isQuote(buf, pos)) {
            if (this.state.quoting === true) {
              const nextChr = buf[pos + quote.length];
              const isNextChrTrimable =
                rtrim && this.__isCharTrimable(buf, pos + quote.length);
              const isNextChrComment =
                comment !== null &&
                this.__compareBytes(comment, buf, pos + quote.length, nextChr);
              const isNextChrDelimiter = this.__isDelimiter(
                buf,
                pos + quote.length,
                nextChr,
              );
              const isNextChrRecordDelimiter =
                record_delimiter.length === 0
                  ? this.__autoDiscoverRecordDelimiter(buf, pos + quote.length)
                  : this.__isRecordDelimiter(nextChr, buf, pos + quote.length);
              // Escape a quote
              // Treat next char as a regular character
              if (
                escape !== null &&
                this.__isEscape(buf, pos, chr) &&
                this.__isQuote(buf, pos + escape.length)
              ) {
                pos += escape.length - 1;
              } else if (
                !nextChr ||
                isNextChrDelimiter ||
                isNextChrRecordDelimiter ||
                isNextChrComment ||
                isNextChrTrimable
              ) {
                // Closing quote
                this.state.quoting = false;
                this.state.wasQuoting = true;
                pos += quote.length - 1;
                continue;
              } else if (relax_quotes === false) {
                const err = this.__error(
                  new CsvError(
                    "CSV_INVALID_CLOSING_QUOTE",
                    [
                      "Invalid Closing Quote:",
                      `got "${String.fromCharCode(nextChr)}"`,
                      `at line ${this.info.lines}`,
                      "instead of delimiter, record delimiter, trimable character",
                      "(if activated) or comment",
                    ],
                    this.options,
                    this.__infoField(),
                  ),
                );
                if (err !== undefined) return err;
              } else {
                // In relax_quotes mode, the opening quote becomes part of
                // the field value
                this.state.quoting = false;
                this.state.wasQuoting = true;
                this.state.field.prepend(quote);
                pos += quote.length - 1;
              }
            } else {
              if (this.state.field.length !== 0) {
                // In relax_quotes mode, treat opening quote preceded by chrs as regular
                if (relax_quotes === false) {
                  const fieldInfo = this.__infoField();
                  // Name of the BOM the field content is equal to, if any,
                  // to hint at a BOM which was not skipped
                  const bomName = Object.keys(boms)
                    .map((b) =>
                      boms[b].equals(this.state.field.toString()) ? b : false,
                    )
                    .filter(Boolean)[0];
                  const err = this.__error(
                    new CsvError(
                      "INVALID_OPENING_QUOTE",
                      [
                        "Invalid Opening Quote:",
                        `a quote is found on field ${JSON.stringify(fieldInfo.column)} at line ${fieldInfo.lines}, value is ${JSON.stringify(this.state.field.toString(encoding))}`,
                        bomName ? `(${bomName} bom)` : undefined,
                      ],
                      this.options,
                      fieldInfo,
                      {
                        field: this.state.field,
                      },
                    ),
                  );
                  if (err !== undefined) return err;
                }
              } else {
                // Opening quote
                this.state.quoting = true;
                pos += quote.length - 1;
                continue;
              }
            }
          }
          if (this.state.quoting === false) {
            const recordDelimiterLength = this.__isRecordDelimiter(
              chr,
              buf,
              pos,
            );
            if (recordDelimiterLength !== 0) {
              // Do not emit comments which take a full line
              const skipCommentLine =
                this.state.commenting &&
                this.state.wasQuoting === false &&
                this.state.record.length === 0 &&
                this.state.field.length === 0;
              if (skipCommentLine) {
                this.info.comment_lines++;
                // Skip full comment line
              } else {
                // Activate records emission if above from_line
                if (
                  this.state.enabled === false &&
                  this.info.lines +
                    (this.state.wasRowDelimiter === true ? 1 : 0) >=
                    from_line
                ) {
                  this.state.enabled = true;
                  this.__resetField();
                  this.__resetRecord();
                  pos += recordDelimiterLength - 1;
                  continue;
                }
                // Skip if line is empty and skip_empty_lines activated
                if (
                  skip_empty_lines === true &&
                  this.state.wasQuoting === false &&
                  this.state.record.length === 0 &&
                  this.state.field.length === 0
                ) {
                  this.info.empty_lines++;
                  pos += recordDelimiterLength - 1;
                  continue;
                }
                this.info.bytes = this.state.bufBytesStart + pos;
                const errField = this.__onField();
                if (errField !== undefined) return errField;
                this.info.bytes =
                  this.state.bufBytesStart + pos + recordDelimiterLength;
                const errRecord = this.__onRecord(push);
                if (errRecord !== undefined) return errRecord;
                if (to !== -1 && this.info.records >= to) {
                  this.state.stop = true;
                  close();
                  return;
                }
              }
              this.state.commenting = false;
              pos += recordDelimiterLength - 1;
              continue;
            }
            if (this.state.commenting) {
              continue;
            }
            if (
              comment !== null &&
              (comment_no_infix === false ||
                (this.state.record.length === 0 &&
                  this.state.field.length === 0))
            ) {
              const commentCount = this.__compareBytes(comment, buf, pos, chr);
              if (commentCount !== 0) {
                this.state.commenting = true;
                continue;
              }
            }
            const delimiterLength = this.__isDelimiter(buf, pos, chr);
            if (delimiterLength !== 0) {
              this.info.bytes = this.state.bufBytesStart + pos;
              const errField = this.__onField();
              if (errField !== undefined) return errField;
              pos += delimiterLength - 1;
              continue;
            }
          }
        }
        if (this.state.commenting === false) {
          if (
            max_record_size !== 0 &&
            this.state.record_length + this.state.field.length > max_record_size
          ) {
            return this.__error(
              new CsvError(
                "CSV_MAX_RECORD_SIZE",
                [
                  "Max Record Size:",
                  "record exceed the maximum number of tolerated bytes",
                  `of ${max_record_size}`,
                  `at line ${this.info.lines}`,
                ],
                this.options,
                this.__infoField(),
              ),
            );
          }
        }
        const lappend =
          ltrim === false ||
          this.state.quoting === true ||
          this.state.field.length !== 0 ||
          !this.__isCharTrimable(buf, pos);
        // rtrim in non quoting is handled in __onField
        const rappend = rtrim === false || this.state.wasQuoting === false;
        if (lappend === true && rappend === true) {
          this.state.field.append(chr);
        } else if (rtrim === true && !this.__isCharTrimable(buf, pos)) {
          return this.__error(
            new CsvError(
              "CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE",
              [
                "Invalid Closing Quote:",
                "found non trimable byte after quote",
                `at line ${this.info.lines}`,
              ],
              this.options,
              this.__infoField(),
            ),
          );
        } else {
          if (lappend === false) {
            // Jump over the remaining bytes of the trimmed sequence
            pos += this.__isCharTrimable(buf, pos) - 1;
          }
          continue;
        }
      }
      if (end === true) {
        // Ensure we are not ending in a quoting state
        if (this.state.quoting === true) {
          const err = this.__error(
            new CsvError(
              "CSV_QUOTE_NOT_CLOSED",
              [
                "Quote Not Closed:",
                `the parsing is finished with an opening quote at line ${this.info.lines}`,
              ],
              this.options,
              this.__infoField(),
            ),
          );
          if (err !== undefined) return err;
        } else {
          // Skip last line if it has no characters
          if (
            this.state.wasQuoting === true ||
            this.state.record.length !== 0 ||
            this.state.field.length !== 0
          ) {
            this.info.bytes = this.state.bufBytesStart + pos;
            const errField = this.__onField();
            if (errField !== undefined) return errField;
            const errRecord = this.__onRecord(push);
            if (errRecord !== undefined) return errRecord;
          } else if (this.state.wasRowDelimiter === true) {
            this.info.empty_lines++;
          } else if (this.state.commenting === true) {
            this.info.comment_lines++;
          }
        }
      } else {
        // Keep the unconsumed bytes for the next call
        this.state.bufBytesStart += pos;
        this.state.previousBuf = buf.slice(pos);
      }
      if (this.state.wasRowDelimiter === true) {
        this.info.lines++;
        this.state.wasRowDelimiter = false;
      }
    },
    /**
     * Finalize the record under construction: discover the columns from the
     * first line, validate the number of fields, then emit the record.
     *
     * @param {Function} push - Called with the emitted record.
     * @returns {Error|undefined} An error when the record is invalid or when
     *   the `on_record` handler threw.
     */
    __onRecord: function (push) {
      const {
        columns,
        group_columns_by_name,
        encoding,
        info,
        from,
        relax_column_count,
        relax_column_count_less,
        relax_column_count_more,
        raw,
        skip_records_with_empty_values,
      } = this.options;
      const { enabled, record } = this.state;
      if (enabled === false) {
        return this.__resetRecord();
      }
      // Convert the first line into column names
      const recordLength = record.length;
      if (columns === true) {
        if (skip_records_with_empty_values === true && isRecordEmpty(record)) {
          this.__resetRecord();
          return;
        }
        return this.__firstLineToColumns(record);
      }
      if (columns === false && this.info.records === 0) {
        this.state.expectedRecordLength = recordLength;
      }
      if (recordLength !== this.state.expectedRecordLength) {
        const err =
          columns === false
            ? new CsvError(
                "CSV_RECORD_INCONSISTENT_FIELDS_LENGTH",
                [
                  "Invalid Record Length:",
                  `expect ${this.state.expectedRecordLength},`,
                  `got ${recordLength} on line ${this.info.lines}`,
                ],
                this.options,
                this.__infoField(),
                {
                  record: record,
                },
              )
            : new CsvError(
                "CSV_RECORD_INCONSISTENT_COLUMNS",
                [
                  "Invalid Record Length:",
                  `columns length is ${columns.length},`, // rename columns
                  `got ${recordLength} on line ${this.info.lines}`,
                ],
                this.options,
                this.__infoField(),
                {
                  record: record,
                },
              );
        if (
          relax_column_count === true ||
          (relax_column_count_less === true &&
            recordLength < this.state.expectedRecordLength) ||
          (relax_column_count_more === true &&
            recordLength > this.state.expectedRecordLength)
        ) {
          // Tolerated, the error is exposed in the record info
          this.info.invalid_field_length++;
          this.state.error = err;
        } else {
          // The returned error is undefined with skip_records_with_error
          const finalErr = this.__error(err);
          if (finalErr) return finalErr;
        }
      }
      if (skip_records_with_empty_values === true && isRecordEmpty(record)) {
        this.__resetRecord();
        return;
      }
      if (this.state.recordHasError === true) {
        this.__resetRecord();
        this.state.recordHasError = false;
        return;
      }
      this.info.records++;
      if (from === 1 || this.info.records >= from) {
        const { objname } = this.options;
        // With columns, records are objects; without, they are arrays
        let value = record;
        if (columns !== false) {
          const obj = {};
          // Transform record array to an object
          for (let i = 0, l = record.length; i < l; i++) {
            if (columns[i] === undefined || columns[i].disabled) continue;
            const { name } = columns[i];
            // Turn duplicate columns into an array
            if (group_columns_by_name === true && obj[name] !== undefined) {
              if (Array.isArray(obj[name])) {
                obj[name] = obj[name].concat(record[i]);
              } else {
                obj[name] = [obj[name], record[i]];
              }
            } else {
              obj[name] = record[i];
            }
          }
          value = obj;
        }
        // With raw or info, the record is wrapped with its extra properties
        const emitted =
          raw === true || info === true
            ? Object.assign(
                { record: value },
                raw === true
                  ? { raw: this.state.rawBuffer.toString(encoding) }
                  : {},
                info === true ? { info: this.__infoRecord() } : {},
              )
            : value;
        // With objname, a [key, record] tuple is emitted
        const err = this.__push(
          objname === undefined ? emitted : [value[objname], emitted],
          push,
        );
        if (err) {
          return err;
        }
      }
      this.__resetRecord();
    },
    /**
     * Use the given record as the column definitions, optionally passing it
     * through the `state.firstLineToHeaders` function first.
     *
     * @param {Array} record - The first record.
     * @returns {Error|undefined} An error when the headers are not an array
     *   or when their computation or normalization threw.
     */
    __firstLineToColumns: function (record) {
      const { firstLineToHeaders } = this.state;
      try {
        const headers =
          firstLineToHeaders === undefined
            ? record
            : firstLineToHeaders.call(null, record);
        if (!Array.isArray(headers)) {
          return this.__error(
            new CsvError(
              "CSV_INVALID_COLUMN_MAPPING",
              [
                "Invalid Column Mapping:",
                "expect an array from column function,",
                `got ${JSON.stringify(headers)}`,
              ],
              this.options,
              this.__infoField(),
              {
                headers: headers,
              },
            ),
          );
        }
        const normalizedHeaders = normalize_columns_array(headers);
        this.state.expectedRecordLength = normalizedHeaders.length;
        this.options.columns = normalizedHeaders;
        this.__resetRecord();
        return;
      } catch (err) {
        return err;
      }
    },
    /**
     * Clear the record under construction, its tolerated error and, with the
     * `raw` option, the raw buffer.
     */
    __resetRecord: function () {
      if (this.options.raw === true) {
        this.state.rawBuffer.reset();
      }
      this.state.error = undefined;
      this.state.record = [];
      this.state.record_length = 0;
    },
    /**
     * Finalize the field under construction: decode, right trim and cast it,
     * then append it to the current record.
     *
     * @returns {Error|undefined} An error when the cast failed.
     */
    __onField: function () {
      const { cast, encoding, rtrim, max_record_size } = this.options;
      const { enabled, wasQuoting } = this.state;
      // Short circuit for the from_line options
      if (enabled === false) {
        return this.__resetField();
      }
      let field = this.state.field.toString(encoding);
      if (rtrim === true && wasQuoting === false) {
        // `trimEnd` is the standard name of the legacy `trimRight` alias
        field = field.trimEnd();
      }
      if (cast === true) {
        const [err, f] = this.__cast(field);
        if (err !== undefined) return err;
        field = f;
      }
      this.state.record.push(field);
      // Increment record length if record size must not exceed a limit
      if (max_record_size !== 0 && typeof field === "string") {
        this.state.record_length += field.length;
      }
      this.__resetField();
    },
    /**
     * Clear the field under construction and its quoting flag.
     */
    __resetField: function () {
      this.state.field.reset();
      this.state.wasQuoting = false;
    },
    /**
     * Emit a record, first passing it through the `on_record` option when
     * defined; records turned into `null` or `undefined` are not emitted.
     *
     * @param {*} record - The record to emit.
     * @param {Function} push - Called with the emitted record.
     * @returns {Error|undefined} The error thrown by `on_record`, if any.
     */
    __push: function (record, push) {
      const { on_record } = this.options;
      if (on_record !== undefined) {
        const info = this.__infoRecord();
        try {
          record = on_record.call(null, record, info);
        } catch (err) {
          return err;
        }
        if (record === undefined || record === null) {
          return;
        }
      }
      push(record);
    },
    /**
     * Cast a field value.
     *
     * @param {string} field - The decoded field.
     * @returns {Array} A tuple with the error and the casted value.
     */
    __cast: function (field) {
      const { columns, relax_column_count } = this.options;
      const isColumns = Array.isArray(columns);
      // Don't lose time calling cast
      // because the final record is an object
      // and this field can't be associated to a key present in columns
      if (
        isColumns === true &&
        relax_column_count &&
        this.options.columns.length <= this.state.record.length
      ) {
        return [undefined, undefined];
      }
      if (this.state.castField !== null) {
        try {
          const info = this.__infoField();
          return [undefined, this.state.castField.call(null, field, info)];
        } catch (err) {
          return [err];
        }
      }
      if (this.__isFloat(field)) {
        return [undefined, parseFloat(field)];
      } else if (this.options.cast_date !== false) {
        const info = this.__infoField();
        return [undefined, this.options.cast_date.call(null, field, info)];
      }
      return [undefined, field];
    },
    /**
     * Helper to test if the bytes at a position are one of the trimmable
     * sequences listed in `state.timchars`.
     *
     * @param {Buffer} buf - The buffer being parsed.
     * @param {number} pos - Position to test.
     * @returns {number} Length of the matched sequence, `0` when none match.
     */
    __isCharTrimable: function (buf, pos) {
      const { timchars } = this.state;
      loop1: for (let i = 0; i < timchars.length; i++) {
        const timchar = timchars[i];
        for (let j = 0; j < timchar.length; j++) {
          if (timchar[j] !== buf[pos + j]) continue loop1;
        }
        return timchar.length;
      }
      return 0;
    },
    // Keep it in case we implement the `cast_int` option
    // __isInt(value){
    //   // return Number.isInteger(parseInt(value))
    //   // return !isNaN( parseInt( obj ) );
    //   return /^(\-|\+)?[1-9][0-9]*$/.test(value)
    // }
    /**
     * Tell whether a value is entirely a float representation.
     *
     * @param {string} value - The value to test.
     * @returns {boolean} `true` when the value is a float.
     */
    __isFloat: function (value) {
      return value - parseFloat(value) + 1 >= 0; // Borrowed from jquery
    },
    /**
     * Compare a sequence of bytes with the content of a buffer at a position.
     *
     * @param {Buffer} sourceBuf - The expected bytes.
     * @param {Buffer} targetBuf - The buffer being parsed.
     * @param {number} targetPos - Position in `targetBuf`.
     * @param {number} firstByte - The byte of `targetBuf` at `targetPos`.
     * @returns {number} Length of `sourceBuf` on match, `0` otherwise.
     */
    __compareBytes: function (sourceBuf, targetBuf, targetPos, firstByte) {
      if (sourceBuf[0] !== firstByte) return 0;
      const sourceLength = sourceBuf.length;
      for (let i = 1; i < sourceLength; i++) {
        if (sourceBuf[i] !== targetBuf[targetPos + i]) return 0;
      }
      return sourceLength;
    },
    /**
     * Test if a field delimiter starts at a position.
     *
     * @param {Buffer} buf - The buffer being parsed.
     * @param {number} pos - Position to test.
     * @param {number} chr - The byte of `buf` at `pos`.
     * @returns {number} Length of the matched delimiter, `0` when none match
     *   or when `ignore_last_delimiters` applies to the current field.
     */
    __isDelimiter: function (buf, pos, chr) {
      const { delimiter, ignore_last_delimiters } = this.options;
      if (
        ignore_last_delimiters === true &&
        this.state.record.length === this.options.columns.length - 1
      ) {
        return 0;
      } else if (
        typeof ignore_last_delimiters === "number" &&
        this.state.record.length === ignore_last_delimiters - 1
      ) {
        return 0;
      }
      loop1: for (let i = 0; i < delimiter.length; i++) {
        const del = delimiter[i];
        if (del[0] === chr) {
          for (let j = 1; j < del.length; j++) {
            if (del[j] !== buf[pos + j]) continue loop1;
          }
          return del.length;
        }
      }
      return 0;
    },
    /**
     * Test if a record delimiter starts at a position.
     *
     * @param {number} chr - The byte of `buf` at `pos`.
     * @param {Buffer} buf - The buffer being parsed.
     * @param {number} pos - Position to test.
     * @returns {number} Length of the matched delimiter, `0` when none match.
     */
    __isRecordDelimiter: function (chr, buf, pos) {
      const { record_delimiter } = this.options;
      const recordDelimiterLength = record_delimiter.length;
      loop1: for (let i = 0; i < recordDelimiterLength; i++) {
        const rd = record_delimiter[i];
        const rdLength = rd.length;
        if (rd[0] !== chr) {
          continue;
        }
        for (let j = 1; j < rdLength; j++) {
          if (rd[j] !== buf[pos + j]) {
            continue loop1;
          }
        }
        return rd.length;
      }
      return 0;
    },
    /**
     * Test if the escape sequence starts at a position.
     *
     * @param {Buffer} buf - The buffer being parsed.
     * @param {number} pos - Position to test.
     * @param {number} chr - The byte of `buf` at `pos`.
     * @returns {boolean} `true` on match.
     */
    __isEscape: function (buf, pos, chr) {
      const { escape } = this.options;
      if (escape === null) return false;
      if (escape[0] !== chr) return false;
      // The first byte is matched against `chr`, the following ones against
      // the buffer
      const l = escape.length;
      for (let i = 1; i < l; i++) {
        if (escape[i] !== buf[pos + i]) {
          return false;
        }
      }
      return true;
    },
    /**
     * Test if the quote sequence starts at a position.
     *
     * @param {Buffer} buf - The buffer being parsed.
     * @param {number} pos - Position to test.
     * @returns {boolean} `true` on match.
     */
    __isQuote: function (buf, pos) {
      const { quote } = this.options;
      if (quote === null) return false;
      const l = quote.length;
      for (let i = 0; i < l; i++) {
        if (quote[i] !== buf[pos + i]) {
          return false;
        }
      }
      return true;
    },
    /**
     * Test if a windows, unix or mac os 9 line ending starts at a position
     * and, on match, register it as the record delimiter.
     *
     * @param {Buffer} buf - The buffer being parsed.
     * @param {number} pos - Position to test.
     * @returns {number} Length of the discovered delimiter, `0` when none
     *   match.
     */
    __autoDiscoverRecordDelimiter: function (buf, pos) {
      const { encoding } = this.options;
      // Note, we don't need to cache this information in state,
      // It is only called on the first line until we find out a suitable
      // record delimiter.
      const rds = [
        // Important, the windows line ending must be before mac os 9
        Buffer.from("\r\n", encoding),
        Buffer.from("\n", encoding),
        Buffer.from("\r", encoding),
      ];
      loop: for (let i = 0; i < rds.length; i++) {
        const l = rds[i].length;
        for (let j = 0; j < l; j++) {
          if (rds[i][j] !== buf[pos + j]) {
            continue loop;
          }
        }
        this.options.record_delimiter.push(rds[i]);
        this.state.recordDelimiterMaxLength = rds[i].length;
        return rds[i].length;
      }
      return 0;
    },
    /**
     * Handle an error according to the `skip_records_with_error` option.
     *
     * When the option is active, the current record is flagged as invalid
     * and the `on_skip` option, if defined, is called with the error and,
     * with the `raw` option, the raw record.
     *
     * @param {Error|string} msg - The error or its message.
     * @returns {Error|undefined} The error, `undefined` when it is skipped.
     */
    __error: function (msg) {
      const { encoding, raw, skip_records_with_error } = this.options;
      const err = typeof msg === "string" ? new Error(msg) : msg;
      if (skip_records_with_error) {
        this.state.recordHasError = true;
        if (this.options.on_skip !== undefined) {
          this.options.on_skip(
            err,
            raw ? this.state.rawBuffer.toString(encoding) : undefined,
          );
        }
        return undefined;
      } else {
        return err;
      }
    },
    /**
     * @returns {object} A copy of the `info` counters with the `columns`
     *   option.
     */
    __infoDataSet: function () {
      return {
        ...this.info,
        columns: this.options.columns,
      };
    },
    /**
     * @returns {object} The data set info extended with the properties of
     *   the record under construction.
     */
    __infoRecord: function () {
      const { columns, raw, encoding } = this.options;
      return {
        ...this.__infoDataSet(),
        error: this.state.error,
        header: columns === true,
        index: this.state.record.length,
        raw: raw ? this.state.rawBuffer.toString(encoding) : undefined,
      };
    },
    /**
     * @returns {object} The record info extended with the properties of the
     *   field under construction; `column` is the column name when columns
     *   are defined (`null` past the last column), the field index otherwise.
     */
    __infoField: function () {
      const { columns } = this.options;
      const isColumns = Array.isArray(columns);
      return {
        ...this.__infoRecord(),
        column:
          isColumns === true
            ? columns.length > this.state.record.length
              ? columns[this.state.record.length].name
              : null
            : this.state.record.length,
        quoting: this.state.wasQuoting,
      };
    },
  };
};

export { transform, CsvError };