regex fixes

pull/3564/head
penguindark 2020-01-25 19:12:23 +01:00 committed by Alexander Medvednikov
parent 222fc4b04f
commit 15a63b5bcb
2 changed files with 159 additions and 52 deletions

View File

@ -159,6 +159,91 @@ for gi < re.groups.len {
**note:** *to show the `group id number` in the result of `get_query()`, the `debug` flag of the RE object must be `1` or `2`*
### Groups Continuous saving
In some situations it is useful to save the groups continuously. This is possible by initializing the saving array field of the `RE` struct: `group_csave`.
This feature allows data to be collected in a continuous way: every time a group is captured, a new record is stored.
In the example we pass a text followed by a list of integers that we want to collect.
To achieve this we can use the continuous saving of the groups, which saves each captured group in an array that we set with: `re.group_csave = [-1].repeat(3*20+1)`.
The array will be filled with the following logic, where `n` is the zero-based index of the saved record:
`re.group_csave[0]` number of total saved records
`re.group_csave[1+n*3]` id of the saved group
`re.group_csave[1+n*3+1]` start index in the source string of the saved group
`re.group_csave[1+n*3+2]` end index in the source string of the saved group
The regex keeps saving until the match finishes or until the array has no more space. If the space runs out no error is raised; further records are simply not saved.
```v
// example2 shows the continuous saving of groups: it compiles `query`, matches it
// against `text`, then prints the matched range, the content of `re.groups` and
// every record stored in `re.group_csave`.
fn example2() {
// NOTE(review): test_regex() is not defined in this example - presumably a leftover call, confirm
test_regex()
text := "tst: 01,23,45 ,56, 78"
query:= r".*:(\s*\d+[\s,]*)+"
mut re := regex.new_regex()
//re.debug = 2
// group_csave is read back below as: slot 0 = number of records, then 3 ints per record (id, start, end)
// NOTE(review): the save check in match_base is `(group_csave_index + 3) < group_csave.len`,
// which looks like it stops one record early for an array of this size - confirm the real capacity
re.group_csave = [-1].repeat(3*20+1) // we expect max 20 records
re_err, err_pos := re.compile(query)
if re_err == regex.COMPILE_OK {
q_str := re.get_query()
println("Query: $q_str")
// a negative start is treated as an error code and translated to a message
start, end := re.match_string(text)
if start < 0 {
println("ERROR : ${re.get_parse_error_string(start)}, $start")
} else {
println("found in [$start, $end] => [${text[start..end]}]")
}
// groups capture
// re.groups is walked two slots at a time: [gi] is the start index, [gi+1] the end index,
// and gi/2 is printed as the group number; entries with a negative start are skipped
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
}
gi += 2
}
// continuous saving
// here gi is reused as the record index: group_csave[0] holds how many records were saved
gi = 0
println("num: ${re.group_csave[0]}")
for gi < re.group_csave[0] {
// each record takes three consecutive slots after slot 0
id := re.group_csave[1+gi*3]
st := re.group_csave[1+gi*3+1]
en := re.group_csave[1+gi*3+2]
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
gi++
}
} else {
// compile failed: print the query and a caret placed after err_pos dashes
println("query: $query")
lc := "-".repeat(err_pos)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
}
}
```
The output will be:
```
Query: .*:(\s*\d+[\s,]*)+
found in [0, 21] => [tst: 01,23,45 ,56, 78]
0 19,21 :[78]
num: 5
cg id: 0 [4, 8] => [ 01,]
cg id: 0 [8, 11] => [23,]
cg id: 0 [11, 15] => [45 ,]
cg id: 0 [15, 19] => [56, ]
cg id: 0 [19, 21] => [78]
```
## Flags
It is possible to set some flags in the regex parser that change the behavior of the parser itself.

View File

@ -266,23 +266,20 @@ fn (tok mut Token) reset() {
* *
******************************************************************************/ ******************************************************************************/
pub const ( pub const (
//F_FND = 0x00000001 // check until the end of the input string, it act like a "find first match", not efficient!! F_NL = 0x00000001 // end the match when find a new line symbol
//F_PM = 0x00000004 // partial match: if the source text finish and the match is positive until then return true F_MS = 0x00000002 // match true only if the match is at the start of the string
F_ME = 0x00000004 // match true only if the match is at the end of the string
F_NL = 0x00000002 // end the match when find a new line symbol F_EFM = 0x00000100 // exit on first token matched, used by search
F_MS = 0x00000008 // match true only if the match is at the start of the string F_BIN = 0x00000200 // work only on bytes, ignore utf-8
F_ME = 0x00000010 // match true only if the match is at the end of the string
F_EFM = 0x01000000 // exit on first token matched, used by search
F_BIN = 0x02000000 // work only on bytes, ignore utf-8
) )
struct StateDotObj{ struct StateDotObj{
mut: mut:
i int = -1 // char index in the input buffer i int = -1 // char index in the input buffer
pc int = -1 // program counter saved pc int = -1 // program counter saved
mi int = -1 // match_index saved mi int = -1 // match_index saved
group_stack_index int = -1 // group index stack pointer saved group_stack_index int = -1 // continuous save on capturing groups
} }
pub pub
@ -305,6 +302,9 @@ pub mut:
group_max_nested int = 3 // max nested group group_max_nested int = 3 // max nested group
group_max int = 8 // max allowed number of different groups group_max int = 8 // max allowed number of different groups
group_csave []int = []int // groups continuous save array
group_csave_index int= -1 // groups continuous save index
// flags // flags
flag int = 0 // flag for optional parameters flag int = 0 // flag for optional parameters
@ -328,6 +328,12 @@ fn (re mut RE) reset(){
re.groups = [-1].repeat(re.group_count*2) re.groups = [-1].repeat(re.group_count*2)
re.state_stack_index = -1 re.state_stack_index = -1
// reset group_csave
if re.group_csave.len > 0 {
re.group_csave_index = 1
re.group_csave[0] = 0 // reset the capture count
}
} }
/****************************************************************************** /******************************************************************************
@ -734,8 +740,6 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
} }
} }
// not a {} quantifier, exit // not a {} quantifier, exit
return ERR_SYNTAX_ERROR, i, 0, false return ERR_SYNTAX_ERROR, i, 0, false
} }
@ -997,7 +1001,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
// Post processing // Post processing
//****************************************** //******************************************
// count IST_DOT_CHAR to set the size of the state stack // count IST_DOT_CHAR to set the size of the state stack
mut pc1 := 0 mut pc1 := 0
mut tmp_count := 0 mut tmp_count := 0
@ -1054,7 +1057,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
pc1++ pc1++
} }
//****************************************** //******************************************
// DEBUG PRINT REGEX GENERATED CODE // DEBUG PRINT REGEX GENERATED CODE
//****************************************** //******************************************
@ -1075,14 +1077,15 @@ pub fn (re RE) get_code() string {
mut stop_flag := false mut stop_flag := false
for pc1 <= re.prog.len { for pc1 <= re.prog.len {
tk := re.prog[pc1]
res.write("PC:${pc1:3d}") res.write("PC:${pc1:3d}")
res.write(" ist: ") res.write(" ist: ")
res.write("${re.prog[pc1].ist:8x}".replace(" ","0") ) res.write("${tk.ist:8x}".replace(" ","0") )
res.write(" ") res.write(" ")
ist :=re.prog[pc1].ist ist :=tk.ist
if ist == IST_BSLS_CHAR { if ist == IST_BSLS_CHAR {
res.write("[\\${re.prog[pc1].ch:1c}] BSLS") res.write("[\\${tk.ch:1c}] BSLS")
} else if ist == IST_PROG_END { } else if ist == IST_PROG_END {
res.write("PROG_END") res.write("PROG_END")
stop_flag = true stop_flag = true
@ -1095,22 +1098,22 @@ pub fn (re RE) get_code() string {
} else if ist == IST_DOT_CHAR { } else if ist == IST_DOT_CHAR {
res.write(". DOT_CHAR") res.write(". DOT_CHAR")
} else if ist == IST_GROUP_START { } else if ist == IST_GROUP_START {
res.write("( GROUP_START #:${re.prog[pc1].group_id}") res.write("( GROUP_START #:${tk.group_id}")
} else if ist == IST_GROUP_END { } else if ist == IST_GROUP_END {
res.write(") GROUP_END #:${re.prog[pc1].group_id}") res.write(") GROUP_END #:${tk.group_id}")
} else if ist == IST_SIMPLE_CHAR { } else if ist == IST_SIMPLE_CHAR {
res.write("[${re.prog[pc1].ch:1c}] query_ch") res.write("[${tk.ch:1c}] query_ch")
} }
if re.prog[pc1].rep_max == MAX_QUANTIFIER { if tk.rep_max == MAX_QUANTIFIER {
res.write(" {${re.prog[pc1].rep_min:3d},MAX}") res.write(" {${tk.rep_min:3d},MAX}")
}else{ }else{
if ist == IST_OR_BRANCH { if ist == IST_OR_BRANCH {
res.write(" if false go: ${re.prog[pc1].rep_min:3d} if true go: ${re.prog[pc1].rep_max:3d}") res.write(" if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}")
} else { } else {
res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}") res.write(" {${tk.rep_min:3d},${tk.rep_max:3d}}")
} }
if re.prog[pc1].greedy == true { if tk.greedy == true {
res.write("?") res.write("?")
} }
} }
@ -1123,11 +1126,9 @@ pub fn (re RE) get_code() string {
res.write("========================================\n") res.write("========================================\n")
return res.str() return res.str()
} }
// get_query return a string with a reconstruction of the query starting from the regex program code // get_query return a string with a reconstruction of the query starting from the regex program code
pub fn (re RE) get_query() string { pub fn (re RE) get_query() string {
mut res := strings.new_builder(re.query.len*2) mut res := strings.new_builder(re.query.len*2)
@ -1137,14 +1138,15 @@ pub fn (re RE) get_query() string {
mut i := 0 mut i := 0
for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{ for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{
ch := re.prog[i].ist tk := &re.prog[i]
ch := tk.ist
// GROUP start // GROUP start
if ch == IST_GROUP_START { if ch == IST_GROUP_START {
if re.debug == 0 { if re.debug == 0 {
res.write("(") res.write("(")
} else { } else {
res.write("#${re.prog[i].group_id}(") res.write("#${tk.group_id}(")
} }
i++ i++
continue continue
@ -1159,7 +1161,7 @@ pub fn (re RE) get_query() string {
if ch == IST_OR_BRANCH { if ch == IST_OR_BRANCH {
res.write("|") res.write("|")
if re.debug > 0 { if re.debug > 0 {
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}") res.write("{${tk.rep_min},${tk.rep_max}}")
} }
i++ i++
continue continue
@ -1177,7 +1179,7 @@ pub fn (re RE) get_query() string {
// bsls char // bsls char
if ch == IST_BSLS_CHAR { if ch == IST_BSLS_CHAR {
res.write("\\${re.prog[i].ch:1c}") res.write("\\${tk.ch:1c}")
} }
// IST_DOT_CHAR // IST_DOT_CHAR
@ -1190,29 +1192,28 @@ pub fn (re RE) get_query() string {
if byte(ch) in BSLS_ESCAPE_LIST { if byte(ch) in BSLS_ESCAPE_LIST {
res.write("\\") res.write("\\")
} }
res.write("${re.prog[i].ch:c}") res.write("${tk.ch:c}")
} }
// quantifier // quantifier
if !(re.prog[i].rep_min == 1 && re.prog[i].rep_max == 1) { if !(tk.rep_min == 1 && tk.rep_max == 1) {
if re.prog[i].rep_min == 0 && re.prog[i].rep_max == 1 { if tk.rep_min == 0 && tk.rep_max == 1 {
res.write("?") res.write("?")
} else if re.prog[i].rep_min == 1 && re.prog[i].rep_max == MAX_QUANTIFIER { } else if tk.rep_min == 1 && tk.rep_max == MAX_QUANTIFIER {
res.write("+") res.write("+")
} else if re.prog[i].rep_min == 0 && re.prog[i].rep_max == MAX_QUANTIFIER { } else if tk.rep_min == 0 && tk.rep_max == MAX_QUANTIFIER {
res.write("*") res.write("*")
} else { } else {
if re.prog[i].rep_max == MAX_QUANTIFIER { if tk.rep_max == MAX_QUANTIFIER {
res.write("{${re.prog[i].rep_min},MAX}") res.write("{${tk.rep_min},MAX}")
} else { } else {
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}") res.write("{${tk.rep_min},${tk.rep_max}}")
} }
if re.prog[i].greedy == true { if tk.greedy == true {
res.write("?") res.write("?")
} }
} }
} }
i++ i++
} }
if (re.flag & F_ME) != 0 { if (re.flag & F_ME) != 0 {
@ -1411,6 +1412,20 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
re.groups[g_index] = 0 re.groups[g_index] = 0
} }
re.groups[g_index+1] = i re.groups[g_index+1] = i
// continuous save, save until we have space
if re.group_csave_index > 0 {
// check if we have space to save the record
if (re.group_csave_index + 3) < re.group_csave.len {
// incrment counter
re.group_csave[0]++
// save the record
re.group_csave[re.group_csave_index++] = g_index // group id
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
}
}
} }
group_index-- group_index--
@ -1543,6 +1558,19 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
} }
re.groups[g_index+1] = i re.groups[g_index+1] = i
//C.printf("GROUP %d END [%d, %d]\n", re.prog[pc].group_id, re.groups[g_index], re.groups[g_index+1]) //C.printf("GROUP %d END [%d, %d]\n", re.prog[pc].group_id, re.groups[g_index], re.groups[g_index+1])
// continuous save, save until we have space
if re.group_csave_index > 0 {
// check if we have space to save the record
if (re.group_csave_index + 3) < re.group_csave.len {
// incrment counter
re.group_csave[0]++
// save the record
re.group_csave[re.group_csave_index++] = g_index // group id
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
}
}
} }
re.prog[pc].group_rep++ // increase repetitions re.prog[pc].group_rep++ // increase repetitions
@ -1796,8 +1824,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
if rep < re.prog[tmp_pc].rep_min { if rep < re.prog[tmp_pc].rep_min {
//C.printf("ist_quant_pg UNDER RANGE\n") //C.printf("ist_quant_pg UNDER RANGE\n")
pc = re.prog[tmp_pc].goto_pc pc = re.prog[tmp_pc].goto_pc
//group_index--
m_state = .ist_next m_state = .ist_next
continue continue
} }
@ -1841,12 +1867,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
m_state = .ist_next // go to next ist m_state = .ist_next // go to next ist
continue continue
} }
// match failed
else if rep == 0 && re.prog[pc].rep_min > 0 {
//C.printf("ist_quant_n NO MATCH\n")
// dummy
}
// match + or * // match + or *
else if rep >= re.prog[pc].rep_min { else if rep >= re.prog[pc].rep_min {
//C.printf("ist_quant_n MATCH RANGE\n") //C.printf("ist_quant_n MATCH RANGE\n")
@ -1902,7 +1922,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
m_state = .ist_next m_state = .ist_next
continue continue
} }
m_state = .ist_load m_state = .ist_load
continue continue
} }
@ -1981,6 +2000,9 @@ pub fn (re mut RE) match_string(in_txt string) (int,int) {
return NO_MATCH_FOUND, 0 return NO_MATCH_FOUND, 0
} }
if (re.flag & F_ME) != 0 && end < in_txt.len { if (re.flag & F_ME) != 0 && end < in_txt.len {
if in_txt[end] in NEW_LINE_LIST {
return start, end
}
return NO_MATCH_FOUND, 0 return NO_MATCH_FOUND, 0
} }
return start, end return start, end
@ -2002,7 +2024,7 @@ pub fn (re mut RE) find(in_txt string) (int,int) {
for i < in_txt.len { for i < in_txt.len {
// test only the first part of the query string // test only the first part of the query string
re.flag &= F_EFM // set to exit on the first token match re.flag |= F_EFM // set to exit on the first token match
mut tmp_end := i+re.query.len mut tmp_end := i+re.query.len
if tmp_end > in_txt.len { tmp_end = in_txt.len } if tmp_end > in_txt.len { tmp_end = in_txt.len }
tmp_txt := string{ str: in_txt.str+i, len: tmp_end-i } tmp_txt := string{ str: in_txt.str+i, len: tmp_end-i }