v/vlib/encoding/csv/reader.v

216 lines
4.6 KiB
V
Raw Permalink Normal View History

2022-01-04 10:21:08 +01:00
// Copyright (c) 2019-2022 Alexander Medvednikov. All rights reserved.
2019-08-14 08:45:56 +02:00
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
2019-08-17 14:51:20 +02:00
module csv
2019-08-14 08:45:56 +02:00
// Once interfaces are further along, the idea is to have something similar to
// Go's io.Reader & bufio.Reader rather than reading the whole file into a string;
// this would then satisfy that interface. I designed it this way to be easily adapted.
// CommentIsDelimiterError is the error reported when a Reader's comment
// byte and delimiter byte are the same value.
struct CommentIsDelimiterError {
	Error
}

// msg returns the fixed, human-readable description of this error.
fn (e CommentIsDelimiterError) msg() string {
	return 'encoding.csv: comment cannot be the same as delimiter'
}
// InvalidDelimiterError is the error reported when the configured delimiter
// is rejected by valid_delim, or when a leading quoted field is not followed
// by the delimiter.
struct InvalidDelimiterError {
	Error
}

// msg returns the fixed, human-readable description of this error.
fn (e InvalidDelimiterError) msg() string {
	return 'encoding.csv: invalid delimiter'
}
// EndOfFileError is the error reported by read_line once the whole input
// has been consumed.
struct EndOfFileError {
	Error
}

// msg returns the fixed, human-readable description of this error.
fn (e EndOfFileError) msg() string {
	return 'encoding.csv: end of file'
}
// InvalidLineEndingError is the error reported by read_line when the input
// contains neither a `\n` nor a `\r` line terminator.
struct InvalidLineEndingError {
	Error
}

// msg returns the fixed, human-readable description of this error.
fn (e InvalidLineEndingError) msg() string {
	return 'encoding.csv: could not find any valid line endings'
}
2019-08-14 08:45:56 +02:00
2022-06-21 07:31:47 +02:00
// Reader parses CSV text that is held entirely in memory and hands out
// one record per call to `read`.
pub struct Reader {
	// not used yet
	// has_header bool
	// headings []string
	// the complete CSV input; assigned by new_reader
	data string
pub mut:
	// byte that separates the fields of a record
	delimiter u8
	// a line whose first byte equals this value is treated as a comment
	comment u8
	// set by read_line when only bare `\r` line terminators are found
	is_mac_pre_osx_le bool
	// byte offset into `data` where the next line starts
	row_pos int
}
// ReaderConfig carries the optional settings accepted by new_reader.
[params]
pub struct ReaderConfig {
	// field separator; a comma unless overridden
	delimiter u8 = `,`
	// comment marker; a hash sign unless overridden
	comment u8 = `#`
}
// new_reader builds a Reader over the CSV text in `data`.
// `config` may override the delimiter and comment bytes; when a field is
// left out, the default declared on ReaderConfig is used.
pub fn new_reader(data string, config ReaderConfig) &Reader {
	reader := &Reader{
		data: data
		delimiter: config.delimiter
		comment: config.comment
	}
	return reader
}
// read parses the next row of the CSV data.
// On success the result is an array with one string per column; any error
// raised while parsing the row is handed on to the caller unchanged.
pub fn (mut r Reader) read() ?[]string {
	return r.read_record()
}
// Once we have multi-dimensional arrays:
2020-05-17 13:51:18 +02:00
// pub fn (mut r Reader) read_all() ?[][]string {
2020-04-26 13:49:31 +02:00
// mut records := []string{}
2019-08-14 08:45:56 +02:00
// for {
// record := r.read_record() or {
// if err.error == err_eof.error {
2019-08-14 08:45:56 +02:00
// return records
// } else {
// return err
2019-08-14 08:45:56 +02:00
// }
// }
// records << record
// }
// return records
// }
2020-05-17 13:51:18 +02:00
// read_line returns the next physical line of `r.data`, starting at
// `r.row_pos`, with its line terminator removed, and moves `r.row_pos`
// to the start of the following line.
//
// Lines are split on `\n`. If the very first lookup finds no `\n` but does
// find a `\r`, the reader switches to bare-`\r` (pre-OSX Mac) mode for the
// rest of the input. Outside that mode a trailing `\r` is stripped, so
// `\r\n` terminated input yields clean lines.
//
// Errors:
// - EndOfFileError when there is no input left to read.
// - InvalidLineEndingError when the input contains no `\n` and no `\r`.
fn (mut r Reader) read_line() ?string {
	// Nothing left to read. `>=` rather than `==` because `row_pos` is
	// publicly mutable and may have been moved past the end of the data.
	if r.row_pos >= r.data.len {
		return IError(&EndOfFileError{})
	}
	le := if r.is_mac_pre_osx_le { '\r' } else { '\n' }
	mut i := r.data.index_after(le, r.row_pos)
	if i == -1 {
		if r.row_pos == 0 {
			// check for pre osx mac line endings
			i = r.data.index_after('\r', r.row_pos)
			if i != -1 {
				r.is_mac_pre_osx_le = true
			} else {
				// no valid line endings found
				return IError(&InvalidLineEndingError{})
			}
		} else {
			// The last line has no terminator: take everything up to the end
			// of the data. The slice below is end-exclusive, so using
			// `r.data.len - 1` here would drop the final character.
			i = r.data.len
		}
	}
	mut line := r.data[r.row_pos..i]
	// Skip the terminator when there was one; otherwise stop exactly at the
	// end of the data so the next call reports end of file.
	r.row_pos = if i < r.data.len { i + 1 } else { r.data.len }
	// normalize win line endings (remove extra \r)
	if !r.is_mac_pre_osx_le && (line.len >= 1 && line[line.len - 1] == `\r`) {
		line = line[..line.len - 1]
	}
	return line
}
2020-05-17 13:51:18 +02:00
// read_record parses one logical CSV record and returns its fields.
//
// Physical lines are pulled from read_line. Blank lines and lines whose first
// byte is the comment byte are skipped between records. A field that starts
// with `"` is a quoted field: it may contain the delimiter, doubled quotes
// (`""`, unescaped to a single `"`), and line breaks. While a quoted field is
// still open, further lines (blank and comment lines included) are appended
// verbatim, joined by `\n`.
//
// Errors:
// - CommentIsDelimiterError when the delimiter equals the comment byte.
// - InvalidDelimiterError when the delimiter is rejected by valid_delim, or
//   when the first field is quoted and its closing quote is not followed by
//   the delimiter.
// - any error returned by read_line (for example end of file).
fn (mut r Reader) read_record() ?[]string {
	if r.delimiter == r.comment {
		return IError(&CommentIsDelimiterError{})
	}
	if !valid_delim(r.delimiter) {
		return IError(&InvalidDelimiterError{})
	}
	sep := r.delimiter.ascii_str()
	mut fetch_line := true // a new physical line is needed
	mut inside_quotes := false // the pending text ends inside an open quoted field
	mut buf := '' // unparsed remainder of the current record
	mut record := []string{}
	mut last_sep := -1 // position of the most recent delimiter seen in unquoted text
	for {
		if fetch_line {
			raw := r.read_line()?
			if raw.len <= 0 || raw[0] == r.comment {
				// Blank and comment lines are dropped, except inside an open
				// quoted field, where they are part of the field's content.
				if inside_quotes {
					buf += '\n' + raw
				}
				continue
			}
			if inside_quotes {
				buf += '\n'
			}
			buf += raw
			fetch_line = false
			inside_quotes = false
		}
		// Unquoted field: runs up to the next delimiter, or to the end of the text.
		if buf.len == 0 || buf[0] != `"` {
			pos := buf.index(sep) or {
				// no delimiter left: this is the final field of the record
				record << buf
				break
			}
			last_sep = pos
			record << buf[..pos]
			buf = buf[pos + 1..]
			continue
		}
		// Quoted field: look for the closing quote, stepping over `""` pairs.
		mut closed := false
		mut has_escaped_quotes := false
		mut content_len := 0 // length of the field content between the quotes
		mut k := 1
		for k < buf.len {
			if buf[k] != `"` {
				k++
				continue
			}
			if k + 1 < buf.len && buf[k + 1] == `"` {
				// doubled quote: an escaped `"` inside the field
				has_escaped_quotes = true
				k += 2
				continue
			}
			closed = true
			content_len = k - 1
			break
		}
		if !closed {
			// the field continues on the next physical line
			fetch_line = true
			inside_quotes = true
			continue
		}
		// Drop the opening quote; the closing quote now sits at index `content_len`.
		buf = buf[1..]
		ends_record := content_len + 1 == buf.len
		if ends_record || buf[content_len + 1] == r.delimiter {
			content := buf[..content_len]
			record << if has_escaped_quotes { content.replace('""', '"') } else { content }
			if ends_record {
				break
			}
			buf = if content_len + 2 == buf.len { '' } else { buf[content_len + 2..] }
			continue
		}
		// The closing quote is followed by something other than the delimiter.
		if last_sep <= -1 && record.len == 0 {
			return IError(&InvalidDelimiterError{})
		}
	}
	return record
}
2019-08-17 14:51:20 +02:00
2022-04-15 17:25:45 +02:00
// valid_delim reports whether `b` may be used as a field delimiter.
// The NUL byte, the double quote, and the `\r` and `\n` bytes are rejected;
// every other byte is accepted.
fn valid_delim(b u8) bool {
	is_reserved := b == 0 || b == `"` || b == `\r` || b == `\n`
	return !is_reserved
}