v/vlib/encoding/csv/reader.v

// Copyright (c) 2019-2022 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module csv

// Once interfaces are further along, the idea is to have something similar to
// Go's io.Reader & bufio.Reader rather than reading the whole file into a string;
// this Reader would then satisfy that interface. It is designed to be easily adapted.
// CommentIsDelimiterError is the error returned by read_record when the
// Reader's `comment` byte is the same as its `delimiter` byte.
struct CommentIsDelimiterError {
	Error
}

// msg returns the fixed, human-readable description of this error.
fn (e CommentIsDelimiterError) msg() string {
	return 'encoding.csv: comment cannot be the same as delimiter'
}

// InvalidDelimiterError is the error returned by read_record when the
// configured delimiter is rejected by valid_delim, or when a record's leading
// quoted field is not followed by the delimiter.
struct InvalidDelimiterError {
	Error
}

// msg returns the fixed, human-readable description of this error.
fn (e InvalidDelimiterError) msg() string {
	return 'encoding.csv: invalid delimiter'
}

// EndOfFileError is the error returned by read_line once the read position
// has reached the end of the data, i.e. there are no more lines to read.
struct EndOfFileError {
	Error
}

// msg returns the fixed, human-readable description of this error.
fn (e EndOfFileError) msg() string {
	return 'encoding.csv: end of file'
}

// InvalidLineEndingError is the error returned by read_line when the very
// first line of the data contains neither a `\n` nor a `\r` terminator.
struct InvalidLineEndingError {
	Error
}

// msg returns the fixed, human-readable description of this error.
fn (e InvalidLineEndingError) msg() string {
	return 'encoding.csv: could not find any valid line endings'
}

// Reader parses CSV records, one at a time, out of a string held in memory.
// Create one with new_reader; `delimiter` and `comment` may be changed
// afterwards because they are public and mutable.
struct Reader {
	// not used yet
	// has_header        bool
	// headings          []string
	// data is the complete CSV text being parsed.
	data string
pub mut:
	// delimiter is the byte that separates fields within a record.
	delimiter         byte
	// comment marks a comment line: a line whose first byte equals it is
	// skipped by read_record (unless it falls inside a multi-line quoted field).
	comment           byte
	// is_mac_pre_osx_le is set by read_line when the first line has no `\n`
	// but does contain `\r`; from then on `\r` is used as the line terminator.
	is_mac_pre_osx_le bool
	// row_pos is the index into `data` at which the next line starts.
	row_pos           int
}

// new_reader creates a Reader over `data`, the CSV text to parse.
// The returned Reader starts with `,` as its field delimiter and `#` as its
// comment marker.
pub fn new_reader(data string) &Reader {
	reader := &Reader{
		data: data
		delimiter: `,`
		comment: `#`
	}
	return reader
}

// read returns the next record (row) of the CSV data as an array holding the
// value of each column. Any error produced while reading the record is
// handed back to the caller unchanged.
pub fn (mut r Reader) read() ?[]string {
	record := r.read_record() or { return err }
	return record
}

// Once we have multi-dimensional arrays:
// pub fn (mut r Reader) read_all() ?[][]string {
// 	mut records := []string{}
// 	for {
// 		record := r.read_record() or {
// 			if err.error == err_eof.error {
// 				return records
// 			} else {
// 				return err
// 			}
// 		}
// 		records << record
// 	}
// 	return records
// }
// read_line returns the next raw line of `r.data`, starting at `r.row_pos`,
// with its line terminator removed, and advances `r.row_pos` to the start of
// the following line (never beyond `r.data.len`).
//
// Lines are terminated by `\n`; a `\r` directly before the `\n` (Windows line
// endings) is stripped as well. If the first line contains no `\n` but does
// contain `\r`, the reader switches to `\r`-terminated lines (pre OS X Mac) for
// the rest of the data. A final line without any terminator is returned in
// full.
//
// Errors:
// - EndOfFileError when all data has already been consumed.
// - InvalidLineEndingError when the first line has neither `\n` nor `\r`.
fn (mut r Reader) read_line() ?string {
	// nothing left to read; `>=` also guards against a row_pos that was moved
	// past the end through the public field
	if r.row_pos >= r.data.len {
		return IError(&EndOfFileError{})
	}
	le := if r.is_mac_pre_osx_le { '\r' } else { '\n' }
	mut i := r.data.index_after(le, r.row_pos)
	if i == -1 {
		if r.row_pos == 0 {
			// check for pre osx mac line endings
			i = r.data.index_after('\r', r.row_pos)
			if i != -1 {
				r.is_mac_pre_osx_le = true
			} else {
				// no valid line endings found
				return IError(&InvalidLineEndingError{})
			}
		} else {
			// No line ending on the last line: the slice end is exclusive, so it
			// has to be data.len (not data.len - 1) to keep the final byte.
			i = r.data.len
		}
	}
	mut line := r.data[r.row_pos..i]
	// skip the terminator when there is one, otherwise stop exactly at the end
	r.row_pos = if i < r.data.len { i + 1 } else { r.data.len }
	// normalize win line endings (remove extra \r)
	if !r.is_mac_pre_osx_le && (line.len >= 1 && line[line.len - 1] == `\r`) {
		line = line[..line.len - 1]
	}
	return line
}

// read_record parses the next record from the data and returns its fields.
//
// Empty lines and lines whose first byte is `r.comment` are skipped. A field
// may be wrapped in double quotes; inside a quoted field `""` stands for one
// literal `"`, and the field may span several lines (the lines are joined with
// `\n`). Unquoted fields run up to the next delimiter or the end of the line.
//
// Errors:
// - CommentIsDelimiterError when `r.delimiter == r.comment`.
// - InvalidDelimiterError when `r.delimiter` is rejected by valid_delim, or
//   when a quoted field is followed by something other than the delimiter
//   before any field or delimiter has been seen in this record.
// - any error returned by read_line (e.g. the end-of-file error).
fn (mut r Reader) read_record() ?[]string {
	if r.delimiter == r.comment {
		return IError(&CommentIsDelimiterError{})
	}
	if !valid_delim(r.delimiter) {
		return IError(&InvalidDelimiterError{})
	}
	// need_read: another physical line must be fetched before parsing continues
	mut need_read := true
	// keep_raw: we are inside a quoted field that has not been closed yet, so
	// following lines (even empty or comment ones) belong to that field
	mut keep_raw := false
	// line holds the not-yet-consumed remainder of the current record
	mut line := ''
	mut fields := []string{}
	// i is the position of the last delimiter found in an unquoted field,
	// or -1 while none has been found
	mut i := -1
	for {
		if need_read {
			l := r.read_line() ?
			if l.len <= 0 {
				// empty line: skipped, unless it is part of an open quoted field
				if keep_raw {
					line += '\n'
				}
				continue
			} else if l[0] == r.comment {
				// comment line: skipped, unless it is part of an open quoted field,
				// in which case it is kept verbatim
				if keep_raw {
					line += '\n' + l
				}
				continue
			} else {
				if keep_raw {
					line += '\n'
				}
				line += l
			}
			need_read = false
			keep_raw = false
		}
		if line.len == 0 || line[0] != `"` { // not quoted
			j := line.index(r.delimiter.ascii_str()) or {
				// no further delimiter: the rest of the line is the last field
				fields << line[..line.len]
				break
			}
			i = j
			fields << line[..i]
			line = line[i + 1..]
			continue
		} else { // quoted
			mut need_more := true
			mut has_double_quotes := false
			// j becomes the index of the last character before the closing quote
			mut j := 0
			// start scanning right after the opening quote
			mut n := 1
			for n < line.len {
				if line[n] == `"` {
					if n == line.len - 1 || line[n + 1] != `"` {
						// a quote that is not doubled closes the field
						need_more = false
						j = n - 1
						break
					} else {
						// `""` is an escaped quote; step over both characters
						has_double_quotes = true
						n++
					}
				}
				n++
			}
			if need_more {
				// no closing quote on this line: the field continues on the next one
				need_read = true
				keep_raw = true
				continue
			}
			// drop the opening quote; from here on line[..j] is the field content
			// and line[j] is the closing quote
			line = line[1..]
			if j + 1 == line.len {
				// the closing quote ends the line: this is the last field
				fields << if has_double_quotes { line[..j].replace('""', '"') } else { line[..j] }
				break
			}
			next := line[j + 1]
			if next == r.delimiter {
				fields << if has_double_quotes { line[..j].replace('""', '"') } else { line[..j] }
				if j + 2 == line.len {
					// the delimiter was the last character: leave an empty remainder so
					// that the next iteration emits a trailing empty field
					line = ''
				} else {
					line = line[j + 2..]
				}
				continue
			}
		}
		// Only reached when a closing quote is followed by a character other than
		// the delimiter. NOTE(review): when the error below is not returned, the
		// loop carries on with `line` already stripped of its opening quote.
		if i <= -1 && fields.len == 0 {
			return IError(&InvalidDelimiterError{})
		}
	}
	return fields
}

// valid_delim reports whether `b` may be used as a field delimiter.
// The NUL byte, the double quote, carriage return and line feed are rejected;
// every other byte is accepted.
fn valid_delim(b byte) bool {
	if b == 0 || b == `"` {
		return false
	}
	if b == `\r` || b == `\n` {
		return false
	}
	return true
}
all: update copyright year 2022-01-04 10:21:08 +01:00			`// Copyright (c) 2019-2022 Alexander Medvednikov. All rights reserved.`
encoding.csv module 2019-08-14 08:45:56 +02:00			`// Use of this source code is governed by an MIT license`
			`// that can be found in the LICENSE file.`
encoding.csv: add write support 2019-08-17 14:51:20 +02:00			`module csv`

encoding.csv module 2019-08-14 08:45:56 +02:00			`// Once interfaces are further along the idea would be to have something similar to`
			`// go's io.reader & bufio.reader rather than reading the whole file into string, this`
			`// would then satisfy that interface. I designed it this way to be easily adapted.`
docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 10:54:10 +01:00			`struct CommentIsDelimiterError {`
			`Error`
add custom errors to encoding lib (#9513) 2021-03-30 14:27:26 +02:00			`}`

docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 10:54:10 +01:00			`fn (err CommentIsDelimiterError) msg() string {`
			`return 'encoding.csv: comment cannot be the same as delimiter'`
add custom errors to encoding lib (#9513) 2021-03-30 14:27:26 +02:00			`}`

docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 10:54:10 +01:00			`struct InvalidDelimiterError {`
			`Error`
add custom errors to encoding lib (#9513) 2021-03-30 14:27:26 +02:00			`}`

docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 10:54:10 +01:00			`fn (err InvalidDelimiterError) msg() string {`
			`return 'encoding.csv: invalid delimiter'`
			`}`

			`struct EndOfFileError {`
			`Error`
			`}`

			`fn (err EndOfFileError) msg() string {`
			`return 'encoding.csv: end of file'`
			`}`

			`struct InvalidLineEndingError {`
			`Error`
			`}`

			`fn (err InvalidLineEndingError) msg() string {`
			`return 'encoding.csv: could not find any valid line endings'`
add custom errors to encoding lib (#9513) 2021-03-30 14:27:26 +02:00			`}`
encoding.csv module 2019-08-14 08:45:56 +02:00
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`struct Reader {`
encoding.csv module 2019-08-14 08:45:56 +02:00			`// not used yet`
			`// has_header bool`
			`// headings []string`
csv: fix error of read() (#9193) 2021-03-08 15:57:02 +01:00			`data string`
access modifiers: update tests/examples 2019-12-13 18:09:11 +01:00			`pub mut:`
encoding.csv module 2019-08-14 08:45:56 +02:00			`delimiter byte`
			`comment byte`
			`is_mac_pre_osx_le bool`
			`row_pos int`
			`}`

encoding/csv: improve Reader docs (#6828) 2020-11-14 18:49:36 +01:00			`// new_reader initializes a Reader with string data to parse`
fix bugs breaking tests 2019-09-03 13:57:04 +02:00			`pub fn new_reader(data string) &Reader {`
encoding.csv module 2019-08-14 08:45:56 +02:00			`return &Reader{`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			delimiter: `,`
			comment: `#`
encoding.csv module 2019-08-14 08:45:56 +02:00			`data: data`
			`}`
			`}`

encoding/csv: improve Reader docs (#6828) 2020-11-14 18:49:36 +01:00			`// read reads a row from the CSV data.`
			`// If successful, the result holds an array of each column's data.`
parser: check `(mut f Foo)` syntax 2020-05-17 13:51:18 +02:00			`pub fn (mut r Reader) read() ?[]string {`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`l := r.read_record() ?`
encoding.csv module 2019-08-14 08:45:56 +02:00			`return l`
			`}`

			`// Once we have multi dimensional array`
parser: check `(mut f Foo)` syntax 2020-05-17 13:51:18 +02:00			`// pub fn (mut r Reader) read_all() ?[][]string {`
all: update`import ()` and `[]array` 2020-04-26 13:49:31 +02:00			`// mut records := []string{}`
encoding.csv module 2019-08-14 08:45:56 +02:00			`// for {`
			`// record := r.read_record() or {`
all: update repo to use the new error handling syntax (#8950) 2021-02-28 21:20:21 +01:00			`// if err.error == err_eof.error {`
encoding.csv module 2019-08-14 08:45:56 +02:00			`// return records`
			`// } else {`
all: update repo to use the new error handling syntax (#8950) 2021-02-28 21:20:21 +01:00			`// return err`
encoding.csv module 2019-08-14 08:45:56 +02:00			`// }`
			`// }`
			`// records << record`
			`// }`
			`// return records`
			`// }`
parser: check `(mut f Foo)` syntax 2020-05-17 13:51:18 +02:00			`fn (mut r Reader) read_line() ?string {`
encoding.csv module 2019-08-14 08:45:56 +02:00			`// last record`
			`if r.row_pos == r.data.len {`
docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 10:54:10 +01:00			`return IError(&EndOfFileError{})`
encoding.csv module 2019-08-14 08:45:56 +02:00			`}`
			`le := if r.is_mac_pre_osx_le { '\r' } else { '\n' }`
			`mut i := r.data.index_after(le, r.row_pos)`
			`if i == -1 {`
			`if r.row_pos == 0 {`
			`// check for pre osx mac line endings`
			`i = r.data.index_after('\r', r.row_pos)`
			`if i != -1 {`
			`r.is_mac_pre_osx_le = true`
			`} else {`
			`// no valid line endings found`
docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 10:54:10 +01:00			`return IError(&InvalidLineEndingError{})`
encoding.csv module 2019-08-14 08:45:56 +02:00			`}`
csv: handle missing line ending 2020-04-21 00:02:55 +02:00			`} else {`
			`// No line ending on file`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`i = r.data.len - 1`
encoding.csv module 2019-08-14 08:45:56 +02:00			`}`
			`}`
compiler/vlib: replace substr/left/right with `[start..end]` everywhere 2019-10-27 08:03:15 +01:00			`mut line := r.data[r.row_pos..i]`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`r.row_pos = i + 1`
encoding.csv module 2019-08-14 08:45:56 +02:00			`// normalize win line endings (remove extra \r)`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			if !r.is_mac_pre_osx_le && (line.len >= 1 && line[line.len - 1] == `\r`) {
			`line = line[..line.len - 1]`
encoding.csv module 2019-08-14 08:45:56 +02:00			`}`
			`return line`
			`}`

parser: check `(mut f Foo)` syntax 2020-05-17 13:51:18 +02:00			`fn (mut r Reader) read_record() ?[]string {`
encoding.csv module 2019-08-14 08:45:56 +02:00			`if r.delimiter == r.comment {`
docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 10:54:10 +01:00			`return IError(&CommentIsDelimiterError{})`
encoding.csv module 2019-08-14 08:45:56 +02:00			`}`
encoding.csv: add write support 2019-08-17 14:51:20 +02:00			`if !valid_delim(r.delimiter) {`
docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 10:54:10 +01:00			`return IError(&InvalidDelimiterError{})`
encoding.csv: add write support 2019-08-17 14:51:20 +02:00			`}`
csv: fix field multiple lines error 2020-05-10 14:19:26 +02:00			`mut need_read := true`
			`mut keep_raw := false`
encoding.csv module 2019-08-14 08:45:56 +02:00			`mut line := ''`
all: update`import ()` and `[]array` 2020-04-26 13:49:31 +02:00			`mut fields := []string{}`
encoding.csv module 2019-08-14 08:45:56 +02:00			`mut i := -1`
			`for {`
csv: fix field multiple lines error 2020-05-10 14:19:26 +02:00			`if need_read {`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`l := r.read_line() ?`
csv: fix field multiple lines error 2020-05-10 14:19:26 +02:00			`if l.len <= 0 {`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`if keep_raw {`
			`line += '\n'`
			`}`
csv: fix field multiple lines error 2020-05-10 14:19:26 +02:00			`continue`
			`} else if l[0] == r.comment {`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`if keep_raw {`
			`line += '\n' + l`
			`}`
csv: fix field multiple lines error 2020-05-10 14:19:26 +02:00			`continue`
			`} else {`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`if keep_raw {`
			`line += '\n'`
			`}`
csv: fix field multiple lines error 2020-05-10 14:19:26 +02:00			`line += l`
			`}`
			`need_read = false`
			`keep_raw = false`
			`}`
csv: fix parse error of last empty field on unquoted line (#10083) 2021-05-13 16:51:07 +02:00			if line.len == 0 \|\| line[0] != `"` { // not quoted
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`j := line.index(r.delimiter.ascii_str()) or {`
encoding.csv module 2019-08-14 08:45:56 +02:00			`// last`
csv: fix missing last column 2020-04-20 21:49:05 +02:00			`fields << line[..line.len]`
encoding.csv module 2019-08-14 08:45:56 +02:00			`break`
			`}`
cgen: fix returning optional consts; fix csv test 2020-04-08 17:21:36 +02:00			`i = j`
compiler/vlib: replace substr/left/right with `[start..end]` everywhere 2019-10-27 08:03:15 +01:00			`fields << line[..i]`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`line = line[i + 1..]`
encoding.csv module 2019-08-14 08:45:56 +02:00			`continue`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`} else { // quoted`
csv: fix csv fields with double quotes (#10399) 2021-06-10 18:24:20 +02:00			`mut need_more := true`
			`mut has_double_quotes := false`
			`mut j := 0`
			`mut n := 1`
			`for n < line.len {`
			if line[n] == `"` {
			if n == line.len - 1 \|\| line[n + 1] != `"` {
			`need_more = false`
			`j = n - 1`
			`break`
			`} else {`
			`has_double_quotes = true`
			`n++`
			`}`
			`}`
			`n++`
			`}`
			`if need_more {`
csv: fix field multiple lines error 2020-05-10 14:19:26 +02:00			`need_read = true`
			`keep_raw = true`
			`continue`
csv: fix last-field-empty error 2020-04-29 16:50:02 +02:00			`}`
csv: fix field multiple lines error 2020-05-10 14:19:26 +02:00			`line = line[1..]`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`if j + 1 == line.len {`
csv: fix last-field-empty error 2020-04-29 16:50:02 +02:00			`// last record`
csv: fix csv fields with double quotes (#10399) 2021-06-10 18:24:20 +02:00			`fields << if has_double_quotes { line[..j].replace('""', '"') } else { line[..j] }`
csv: fix last-field-empty error 2020-04-29 16:50:02 +02:00			`break`
			`}`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			`next := line[j + 1]`
csv: fix last-field-empty error 2020-04-29 16:50:02 +02:00			`if next == r.delimiter {`
csv: fix csv fields with double quotes (#10399) 2021-06-10 18:24:20 +02:00			`fields << if has_double_quotes { line[..j].replace('""', '"') } else { line[..j] }`
csv: fix error of read() (#9193) 2021-03-08 15:57:02 +01:00			`if j + 2 == line.len {`
csv: fix parse error of last empty field on unquoted line (#10083) 2021-05-13 16:51:07 +02:00			`line = ''`
			`} else {`
			`line = line[j + 2..]`
csv: fix error of read() (#9193) 2021-03-08 15:57:02 +01:00			`}`
csv: fix last-field-empty error 2020-04-29 16:50:02 +02:00			`continue`
encoding.csv module 2019-08-14 08:45:56 +02:00			`}`
			`}`
			`if i <= -1 && fields.len == 0 {`
docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 10:54:10 +01:00			`return IError(&InvalidDelimiterError{})`
encoding.csv module 2019-08-14 08:45:56 +02:00			`}`
			`}`
			`return fields`
			`}`
encoding.csv: add write support 2019-08-17 14:51:20 +02:00
			`fn valid_delim(b byte) bool {`
all: byte.str() => byte.ascii_str() 2021-01-05 19:14:35 +01:00			return b != 0 && b != `"` && b != `\r` && b != `\n`
encoding.csv: add write support 2019-08-17 14:51:20 +02:00			`}`