regex: add OR error, if surrounded by char classes, and a test (#12278)
parent
e45cd02029
commit
47a2301139
|
@ -28,6 +28,7 @@ simple token, is a single character.
|
||||||
`abc` OR `ebc`. Instead it is evaluated like `ab`, followed by `c OR e`,
|
`abc` OR `ebc`. Instead it is evaluated like `ab`, followed by `c OR e`,
|
||||||
followed by `bc`, because the **token is the base element**,
|
followed by `bc`, because the **token is the base element**,
|
||||||
not the sequence of symbols.
|
not the sequence of symbols.
|
||||||
|
Note: **Two char classes with an `OR` in the middle is a syntax error.**
|
||||||
|
|
||||||
- The **match operation stops at the end of the string**. It does *NOT* stop
|
- The **match operation stops at the end of the string**. It does *NOT* stop
|
||||||
at new line characters.
|
at new line characters.
|
||||||
|
@ -155,6 +156,7 @@ match too, finally test the token `c`.
|
||||||
|
|
||||||
NB: **unlike in PCRE, the OR operation works at token level!**
|
NB: **unlike in PCRE, the OR operation works at token level!**
|
||||||
It doesn't work at concatenation level!
|
It doesn't work at concatenation level!
|
||||||
|
NB2: **Two char classes with an `OR` in the middle is a syntax error.**
|
||||||
|
|
||||||
That also means, that a query string like `abc|bde` is not equal to
|
That also means, that a query string like `abc|bde` is not equal to
|
||||||
`(abc)|(bde)`, but instead to `ab(c|b)de`.
|
`(abc)|(bde)`, but instead to `ab(c|b)de`.
|
||||||
|
@ -474,21 +476,21 @@ the behavior of the parser itself.
|
||||||
```v ignore
|
```v ignore
|
||||||
// example of flag settings
|
// example of flag settings
|
||||||
mut re := regex.new()
|
mut re := regex.new()
|
||||||
re.flag = regex.F_BIN
|
re.flag = regex.f_bin
|
||||||
```
|
```
|
||||||
|
|
||||||
- `F_BIN`: parse a string as bytes, utf-8 management disabled.
|
- `f_bin`: parse a string as bytes, utf-8 management disabled.
|
||||||
|
|
||||||
- `F_EFM`: exit on the first char matches in the query, used by the
|
- `f_efm`: exit on the first char matches in the query, used by the
|
||||||
find function.
|
find function.
|
||||||
|
|
||||||
- `F_MS`: matches only if the index of the start match is 0,
|
- `f_ms`: matches only if the index of the start match is 0,
|
||||||
same as `^` at the start of the query string.
|
same as `^` at the start of the query string.
|
||||||
|
|
||||||
- `F_ME`: matches only if the end index of the match is the last char
|
- `f_me`: matches only if the end index of the match is the last char
|
||||||
of the input string, same as `$` end of query string.
|
of the input string, same as `$` end of query string.
|
||||||
|
|
||||||
- `F_NL`: stop the matching if a new line char `\n` or `\r` is found
|
- `f_nl`: stop the matching if a new line char `\n` or `\r` is found
|
||||||
|
|
||||||
## Functions
|
## Functions
|
||||||
|
|
||||||
|
|
|
@ -40,6 +40,7 @@ pub const (
|
||||||
err_groups_max_nested = -8 // max number of nested group reached
|
err_groups_max_nested = -8 // max number of nested group reached
|
||||||
err_group_not_balanced = -9 // group not balanced
|
err_group_not_balanced = -9 // group not balanced
|
||||||
err_group_qm_notation = -10 // group invalid notation
|
err_group_qm_notation = -10 // group invalid notation
|
||||||
|
err_invalid_or_with_cc = -11 // invalid OR between two consecutive char classes
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
@ -196,6 +197,7 @@ pub fn (re RE) get_parse_error_string(err int) string {
|
||||||
regex.err_groups_max_nested { return 'err_groups_max_nested' }
|
regex.err_groups_max_nested { return 'err_groups_max_nested' }
|
||||||
regex.err_group_not_balanced { return 'err_group_not_balanced' }
|
regex.err_group_not_balanced { return 'err_group_not_balanced' }
|
||||||
regex.err_group_qm_notation { return 'err_group_qm_notation' }
|
regex.err_group_qm_notation { return 'err_group_qm_notation' }
|
||||||
|
regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
|
||||||
else { return 'err_unknown' }
|
else { return 'err_unknown' }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -252,6 +254,8 @@ mut:
|
||||||
// dot_char token variables
|
// dot_char token variables
|
||||||
dot_check_pc int = -1 // pc of the next token to check
|
dot_check_pc int = -1 // pc of the next token to check
|
||||||
last_dot_flag bool // if true indicate that is the last dot_char in the regex
|
last_dot_flag bool // if true indicate that is the last dot_char in the regex
|
||||||
|
// debug fields
|
||||||
|
source_index int
|
||||||
}
|
}
|
||||||
|
|
||||||
[inline]
|
[inline]
|
||||||
|
@ -1028,11 +1032,11 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
||||||
|
|
||||||
// OR branch
|
// OR branch
|
||||||
if char_len == 1 && pc > 0 && byte(char_tmp) == `|` {
|
if char_len == 1 && pc > 0 && byte(char_tmp) == `|` {
|
||||||
// two consecutive ist_dot_char are an error
|
|
||||||
if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
|
if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
|
||||||
return regex.err_syntax_error, i
|
return regex.err_syntax_error, i
|
||||||
}
|
}
|
||||||
re.prog[pc].ist = u32(0) | regex.ist_or_branch
|
re.prog[pc].ist = u32(0) | regex.ist_or_branch
|
||||||
|
re.prog[pc].source_index = i
|
||||||
pc = pc + 1
|
pc = pc + 1
|
||||||
i = i + char_len
|
i = i + char_len
|
||||||
continue
|
continue
|
||||||
|
@ -1252,12 +1256,20 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
||||||
pc1 = 0
|
pc1 = 0
|
||||||
for pc1 < pc - 2 {
|
for pc1 < pc - 2 {
|
||||||
// println("Here $pc1 ${pc-2}")
|
// println("Here $pc1 ${pc-2}")
|
||||||
|
// println("source index: ${pc1 + 1} => ${re.prog[pc1+1].source_index}")
|
||||||
|
if re.prog[pc1 + 1].ist == regex.ist_or_branch {
|
||||||
// two consecutive OR are a syntax error
|
// two consecutive OR are a syntax error
|
||||||
if re.prog[pc1 + 1].ist == regex.ist_or_branch
|
if re.prog[pc1 + 2].ist == regex.ist_or_branch {
|
||||||
&& re.prog[pc1 + 2].ist == regex.ist_or_branch {
|
|
||||||
return regex.err_syntax_error, i
|
return regex.err_syntax_error, i
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// check for []|[] errors
|
||||||
|
if re.prog[pc1].ist == regex.ist_char_class_pos
|
||||||
|
&& re.prog[pc1 + 2].ist == regex.ist_char_class_pos {
|
||||||
|
return regex.err_invalid_or_with_cc, re.prog[pc1 + 1].source_index
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// manage a|b chains like a|(b)|c|d...
|
// manage a|b chains like a|(b)|c|d...
|
||||||
// standard solution
|
// standard solution
|
||||||
if re.prog[pc1].ist != regex.ist_or_branch && re.prog[pc1 + 1].ist == regex.ist_or_branch
|
if re.prog[pc1].ist != regex.ist_or_branch && re.prog[pc1 + 1].ist == regex.ist_or_branch
|
||||||
|
@ -1280,7 +1292,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
||||||
|
|
||||||
pc2++
|
pc2++
|
||||||
}
|
}
|
||||||
// special case query of few chars, teh true can't go on the first instruction
|
// special case query of few chars, the true can't go on the first instruction
|
||||||
if re.prog[pc1 + 1].rep_max == pc1 {
|
if re.prog[pc1 + 1].rep_max == pc1 {
|
||||||
re.prog[pc1 + 1].rep_max = 3
|
re.prog[pc1 + 1].rep_max = 3
|
||||||
}
|
}
|
||||||
|
|
|
@ -389,9 +389,8 @@ fn test_regex(){
|
||||||
}
|
}
|
||||||
|
|
||||||
if start != to.s || end != to.e {
|
if start != to.s || end != to.e {
|
||||||
//println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
|
println("#$c [$to.src] q[$to.q] res[$tmp_str] base:[${to.s},${to.e}] $start, $end")
|
||||||
eprintln("ERROR!")
|
eprintln("ERROR!")
|
||||||
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
|
|
||||||
assert false
|
assert false
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -430,8 +429,8 @@ fn test_regex(){
|
||||||
for ln:=0; ln < re.groups.len; ln++ {
|
for ln:=0; ln < re.groups.len; ln++ {
|
||||||
if re.groups[ln] != to.cg[ln] {
|
if re.groups[ln] != to.cg[ln] {
|
||||||
eprintln("Capture group doesn't match:")
|
eprintln("Capture group doesn't match:")
|
||||||
eprintln("true ground: [${to.cg}]")
|
eprintln("true ground: ${to.cg}")
|
||||||
eprintln("elaborated : [${re.groups}]")
|
eprintln("elaborated : ${re.groups}")
|
||||||
assert false
|
assert false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -551,7 +550,6 @@ fn test_regex(){
|
||||||
if start != to.s || end != to.e {
|
if start != to.s || end != to.e {
|
||||||
eprintln("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
|
eprintln("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
|
||||||
eprintln("ERROR!")
|
eprintln("ERROR!")
|
||||||
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
|
|
||||||
assert false
|
assert false
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -706,3 +704,19 @@ fn test_groups_in_find(){
|
||||||
assert re.groups == test_obj.res
|
assert re.groups == test_obj.res
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const(
|
||||||
|
err_query_list = [
|
||||||
|
r'([a]|[b])*'
|
||||||
|
]
|
||||||
|
)
|
||||||
|
fn test_errors(){
|
||||||
|
mut count := 0
|
||||||
|
for query in err_query_list {
|
||||||
|
_, err, _ := regex.regex_base(query)
|
||||||
|
if err != regex.compile_ok {
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert count == err_query_list.len
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue