regex fixes

pull/3564/head
penguindark 2020-01-25 19:12:23 +01:00 committed by Alexander Medvednikov
parent 222fc4b04f
commit 15a63b5bcb
2 changed files with 159 additions and 52 deletions

View File

@ -159,6 +159,91 @@ for gi < re.groups.len {
**note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`*
### Groups Continuous saving
In particular situations it is useful to have a continuous save of the groups; this is possible by initializing the saving array field in the `RE` struct: `group_csave`.
This feature allows you to collect data in a continuous way.
In the example we pass a text followed by a list of integers that we want to collect.
To achieve this we can use the continuous saving of groups, which saves each captured group in an array that we set with: `re.group_csave = [-1].repeat(3*20+1)`.
The array will be filled with the following logic:
`re.group_csave[0]` number of total saved records
`re.group_csave[1+n*3]` id of the saved group
`re.group_csave[1+n*3+1]` start index in the source string of the saved group
`re.group_csave[1+n*3+2]` end index in the source string of the saved group
The regex keeps saving until it finishes or finds that the array has no space left. If the space runs out no error is raised; further records are simply not saved.
```v
// example2 compiles a query with one repeated capturing group, matches it
// against a sample text and prints both the regular group captures and the
// records collected by the continuous group saving (`group_csave`).
fn example2() {
test_regex()
text := "tst: 01,23,45 ,56, 78"
query:= r".*:(\s*\d+[\s,]*)+"
mut re := regex.new_regex()
//re.debug = 2
// enable the continuous saving by giving `group_csave` a non-empty buffer:
// slot 0 is the record counter, then 3 slots per record (id, start, end)
re.group_csave = [-1].repeat(3*20+1) // we expect max 20 records
re_err, err_pos := re.compile(query)
if re_err == regex.COMPILE_OK {
// print the query as reconstructed from the compiled program
q_str := re.get_query()
println("Query: $q_str")
start, end := re.match_string(text)
// a negative start is treated as an error code and printed as such
if start < 0 {
println("ERROR : ${re.get_parse_error_string(start)}, $start")
} else {
println("found in [$start, $end] => [${text[start..end]}]")
}
// groups capture: `re.groups` holds (start, end) pairs, two slots per group,
// so gi/2 is the group number; pairs with a negative start are skipped
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
}
gi += 2
}
// continuous saving: group_csave[0] is the number of saved records,
// record n occupies the three slots starting at index 1+n*3
gi = 0
println("num: ${re.group_csave[0]}")
for gi < re.group_csave[0] {
id := re.group_csave[1+gi*3]
st := re.group_csave[1+gi*3+1]
en := re.group_csave[1+gi*3+2]
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
gi++
}
} else {
// compile failed: print the query with a marker drawn err_pos dashes in,
// followed by the textual description of the error
println("query: $query")
lc := "-".repeat(err_pos)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
}
}
```
The output will be:
```
Query: .*:(\s*\d+[\s,]*)+
found in [0, 21] => [tst: 01,23,45 ,56, 78]
0 19,21 :[78]
num: 5
cg id: 0 [4, 8] => [ 01,]
cg id: 0 [8, 11] => [23,]
cg id: 0 [11, 15] => [45 ,]
cg id: 0 [15, 19] => [56, ]
cg id: 0 [19, 21] => [78]
```
## Flags
It is possible to set some flags in the regex parser that change the behavior of the parser itself.

View File

@ -266,23 +266,20 @@ fn (tok mut Token) reset() {
*
******************************************************************************/
pub const (
//F_FND = 0x00000001 // check until the end of the input string, it act like a "find first match", not efficient!!
//F_PM = 0x00000004 // partial match: if the source text finish and the match is positive until then return true
F_NL = 0x00000001 // end the match when find a new line symbol
F_MS = 0x00000002 // match true only if the match is at the start of the string
F_ME = 0x00000004 // match true only if the match is at the end of the string
F_NL = 0x00000002 // end the match when find a new line symbol
F_MS = 0x00000008 // match true only if the match is at the start of the string
F_ME = 0x00000010 // match true only if the match is at the end of the string
F_EFM = 0x01000000 // exit on first token matched, used by search
F_BIN = 0x02000000 // work only on bytes, ignore utf-8
F_EFM = 0x00000100 // exit on first token matched, used by search
F_BIN = 0x00000200 // work only on bytes, ignore utf-8
)
struct StateDotObj{
mut:
i int = -1 // char index in the input buffer
pc int = -1 // program counter saved
mi int = -1 // match_index saved
group_stack_index int = -1 // group index stack pointer saved
pc int = -1 // program counter saved
mi int = -1 // match_index saved
group_stack_index int = -1 // continuous save on capturing groups
}
pub
@ -305,6 +302,9 @@ pub mut:
group_max_nested int = 3 // max nested group
group_max int = 8 // max allowed number of different groups
group_csave []int = []int // groups continuous save array
group_csave_index int= -1 // groups continuous save index
// flags
flag int = 0 // flag for optional parameters
@ -328,6 +328,12 @@ fn (re mut RE) reset(){
re.groups = [-1].repeat(re.group_count*2)
re.state_stack_index = -1
// reset group_csave
if re.group_csave.len > 0 {
re.group_csave_index = 1
re.group_csave[0] = 0 // reset the capture count
}
}
/******************************************************************************
@ -734,8 +740,6 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
}
}
// not a {} quantifier, exit
return ERR_SYNTAX_ERROR, i, 0, false
}
@ -997,7 +1001,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
// Post processing
//******************************************
// count IST_DOT_CHAR to set the size of the state stack
mut pc1 := 0
mut tmp_count := 0
@ -1054,7 +1057,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
pc1++
}
//******************************************
// DEBUG PRINT REGEX GENERATED CODE
//******************************************
@ -1075,14 +1077,15 @@ pub fn (re RE) get_code() string {
mut stop_flag := false
for pc1 <= re.prog.len {
tk := re.prog[pc1]
res.write("PC:${pc1:3d}")
res.write(" ist: ")
res.write("${re.prog[pc1].ist:8x}".replace(" ","0") )
res.write("${tk.ist:8x}".replace(" ","0") )
res.write(" ")
ist :=re.prog[pc1].ist
ist :=tk.ist
if ist == IST_BSLS_CHAR {
res.write("[\\${re.prog[pc1].ch:1c}] BSLS")
res.write("[\\${tk.ch:1c}] BSLS")
} else if ist == IST_PROG_END {
res.write("PROG_END")
stop_flag = true
@ -1095,22 +1098,22 @@ pub fn (re RE) get_code() string {
} else if ist == IST_DOT_CHAR {
res.write(". DOT_CHAR")
} else if ist == IST_GROUP_START {
res.write("( GROUP_START #:${re.prog[pc1].group_id}")
res.write("( GROUP_START #:${tk.group_id}")
} else if ist == IST_GROUP_END {
res.write(") GROUP_END #:${re.prog[pc1].group_id}")
res.write(") GROUP_END #:${tk.group_id}")
} else if ist == IST_SIMPLE_CHAR {
res.write("[${re.prog[pc1].ch:1c}] query_ch")
res.write("[${tk.ch:1c}] query_ch")
}
if re.prog[pc1].rep_max == MAX_QUANTIFIER {
res.write(" {${re.prog[pc1].rep_min:3d},MAX}")
if tk.rep_max == MAX_QUANTIFIER {
res.write(" {${tk.rep_min:3d},MAX}")
}else{
if ist == IST_OR_BRANCH {
res.write(" if false go: ${re.prog[pc1].rep_min:3d} if true go: ${re.prog[pc1].rep_max:3d}")
res.write(" if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}")
} else {
res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}")
res.write(" {${tk.rep_min:3d},${tk.rep_max:3d}}")
}
if re.prog[pc1].greedy == true {
if tk.greedy == true {
res.write("?")
}
}
@ -1123,11 +1126,9 @@ pub fn (re RE) get_code() string {
res.write("========================================\n")
return res.str()
}
// get_query return a string with a reconstruction of the query starting from the regex program code
pub fn (re RE) get_query() string {
mut res := strings.new_builder(re.query.len*2)
@ -1137,14 +1138,15 @@ pub fn (re RE) get_query() string {
mut i := 0
for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{
ch := re.prog[i].ist
tk := &re.prog[i]
ch := tk.ist
// GROUP start
if ch == IST_GROUP_START {
if re.debug == 0 {
res.write("(")
} else {
res.write("#${re.prog[i].group_id}(")
res.write("#${tk.group_id}(")
}
i++
continue
@ -1159,7 +1161,7 @@ pub fn (re RE) get_query() string {
if ch == IST_OR_BRANCH {
res.write("|")
if re.debug > 0 {
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
res.write("{${tk.rep_min},${tk.rep_max}}")
}
i++
continue
@ -1177,7 +1179,7 @@ pub fn (re RE) get_query() string {
// bsls char
if ch == IST_BSLS_CHAR {
res.write("\\${re.prog[i].ch:1c}")
res.write("\\${tk.ch:1c}")
}
// IST_DOT_CHAR
@ -1190,29 +1192,28 @@ pub fn (re RE) get_query() string {
if byte(ch) in BSLS_ESCAPE_LIST {
res.write("\\")
}
res.write("${re.prog[i].ch:c}")
res.write("${tk.ch:c}")
}
// quantifier
if !(re.prog[i].rep_min == 1 && re.prog[i].rep_max == 1) {
if re.prog[i].rep_min == 0 && re.prog[i].rep_max == 1 {
if !(tk.rep_min == 1 && tk.rep_max == 1) {
if tk.rep_min == 0 && tk.rep_max == 1 {
res.write("?")
} else if re.prog[i].rep_min == 1 && re.prog[i].rep_max == MAX_QUANTIFIER {
} else if tk.rep_min == 1 && tk.rep_max == MAX_QUANTIFIER {
res.write("+")
} else if re.prog[i].rep_min == 0 && re.prog[i].rep_max == MAX_QUANTIFIER {
} else if tk.rep_min == 0 && tk.rep_max == MAX_QUANTIFIER {
res.write("*")
} else {
if re.prog[i].rep_max == MAX_QUANTIFIER {
res.write("{${re.prog[i].rep_min},MAX}")
if tk.rep_max == MAX_QUANTIFIER {
res.write("{${tk.rep_min},MAX}")
} else {
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
res.write("{${tk.rep_min},${tk.rep_max}}")
}
if re.prog[i].greedy == true {
if tk.greedy == true {
res.write("?")
}
}
}
i++
}
if (re.flag & F_ME) != 0 {
@ -1411,6 +1412,20 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
re.groups[g_index] = 0
}
re.groups[g_index+1] = i
// continuous save, save until we have space
if re.group_csave_index > 0 {
// check if we have space to save the record
if (re.group_csave_index + 3) < re.group_csave.len {
// increment counter
re.group_csave[0]++
// save the record
re.group_csave[re.group_csave_index++] = g_index // group id
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
}
}
}
group_index--
@ -1543,6 +1558,19 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
}
re.groups[g_index+1] = i
//C.printf("GROUP %d END [%d, %d]\n", re.prog[pc].group_id, re.groups[g_index], re.groups[g_index+1])
// continuous save, save until we have space
if re.group_csave_index > 0 {
// check if we have space to save the record
if (re.group_csave_index + 3) < re.group_csave.len {
// increment counter
re.group_csave[0]++
// save the record
re.group_csave[re.group_csave_index++] = g_index // group id
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
}
}
}
re.prog[pc].group_rep++ // increase repetitions
@ -1796,8 +1824,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
if rep < re.prog[tmp_pc].rep_min {
//C.printf("ist_quant_pg UNDER RANGE\n")
pc = re.prog[tmp_pc].goto_pc
//group_index--
m_state = .ist_next
continue
}
@ -1841,12 +1867,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
m_state = .ist_next // go to next ist
continue
}
// match failed
else if rep == 0 && re.prog[pc].rep_min > 0 {
//C.printf("ist_quant_n NO MATCH\n")
// dummy
}
// match + or *
else if rep >= re.prog[pc].rep_min {
//C.printf("ist_quant_n MATCH RANGE\n")
@ -1902,7 +1922,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
m_state = .ist_next
continue
}
m_state = .ist_load
continue
}
@ -1981,6 +2000,9 @@ pub fn (re mut RE) match_string(in_txt string) (int,int) {
return NO_MATCH_FOUND, 0
}
if (re.flag & F_ME) != 0 && end < in_txt.len {
if in_txt[end] in NEW_LINE_LIST {
return start, end
}
return NO_MATCH_FOUND, 0
}
return start, end
@ -2002,7 +2024,7 @@ pub fn (re mut RE) find(in_txt string) (int,int) {
for i < in_txt.len {
// test only the first part of the query string
re.flag &= F_EFM // set to exit on the first token match
re.flag |= F_EFM // set to exit on the first token match
mut tmp_end := i+re.query.len
if tmp_end > in_txt.len { tmp_end = in_txt.len }
tmp_txt := string{ str: in_txt.str+i, len: tmp_end-i }