regex fixes
parent
222fc4b04f
commit
15a63b5bcb
|
@ -159,6 +159,91 @@ for gi < re.groups.len {
|
||||||
|
|
||||||
**note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`*
|
**note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`*
|
||||||
|
|
||||||
|
### Groups Continuous saving
|
||||||
|
|
||||||
|
In particular situations it is useful to have a continuous save of the groups; this is possible by initializing the saving array field in the `RE` struct: `group_csave`.
|
||||||
|
|
||||||
|
This feature allows collecting data in a continuous way.
|
||||||
|
|
||||||
|
In the example we pass a text followed by a list of integers that we want to collect.
|
||||||
|
To achieve this task we can use the continuous saving of the groups, which saves each captured group in an array that we set with: `re.group_csave = [-1].repeat(3*20+1)`.
|
||||||
|
|
||||||
|
The array will be filled with the following logic:
|
||||||
|
|
||||||
|
`re.group_csave[0]` number of total saved records
|
||||||
|
|
||||||
|
`re.group_csave[1+n*3]` id of the saved group
|
||||||
|
`re.group_csave[1+n*3+1]` start index in the source string of the saved group
|
||||||
|
`re.group_csave[1+n*3+2]` end index in the source string of the saved group
|
||||||
|
|
||||||
|
The regex saves records until it finishes or finds that the array has no more space. If the space runs out no error is raised; further records will simply not be saved.
|
||||||
|
|
||||||
|
```v
|
||||||
|
fn example2() {
|
||||||
|
test_regex()
|
||||||
|
|
||||||
|
text := "tst: 01,23,45 ,56, 78"
|
||||||
|
query:= r".*:(\s*\d+[\s,]*)+"
|
||||||
|
|
||||||
|
mut re := regex.new_regex()
|
||||||
|
//re.debug = 2
|
||||||
|
re.group_csave = [-1].repeat(3*20+1) // we expect max 20 records
|
||||||
|
|
||||||
|
re_err, err_pos := re.compile(query)
|
||||||
|
if re_err == regex.COMPILE_OK {
|
||||||
|
q_str := re.get_query()
|
||||||
|
println("Query: $q_str")
|
||||||
|
|
||||||
|
start, end := re.match_string(text)
|
||||||
|
if start < 0 {
|
||||||
|
println("ERROR : ${re.get_parse_error_string(start)}, $start")
|
||||||
|
} else {
|
||||||
|
println("found in [$start, $end] => [${text[start..end]}]")
|
||||||
|
}
|
||||||
|
|
||||||
|
// groups capture
|
||||||
|
mut gi := 0
|
||||||
|
for gi < re.groups.len {
|
||||||
|
if re.groups[gi] >= 0 {
|
||||||
|
println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
|
||||||
|
}
|
||||||
|
gi += 2
|
||||||
|
}
|
||||||
|
|
||||||
|
// continuous saving
|
||||||
|
gi = 0
|
||||||
|
println("num: ${re.group_csave[0]}")
|
||||||
|
for gi < re.group_csave[0] {
|
||||||
|
id := re.group_csave[1+gi*3]
|
||||||
|
st := re.group_csave[1+gi*3+1]
|
||||||
|
en := re.group_csave[1+gi*3+2]
|
||||||
|
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
|
||||||
|
gi++
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
println("query: $query")
|
||||||
|
lc := "-".repeat(err_pos)
|
||||||
|
println("err : $lc^")
|
||||||
|
err_str := re.get_parse_error_string(re_err)
|
||||||
|
println("ERROR: $err_str")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The output will be:
|
||||||
|
|
||||||
|
```
|
||||||
|
Query: .*:(\s*\d+[\s,]*)+
|
||||||
|
found in [0, 21] => [tst: 01,23,45 ,56, 78]
|
||||||
|
0 19,21 :[78]
|
||||||
|
num: 5
|
||||||
|
cg id: 0 [4, 8] => [ 01,]
|
||||||
|
cg id: 0 [8, 11] => [23,]
|
||||||
|
cg id: 0 [11, 15] => [45 ,]
|
||||||
|
cg id: 0 [15, 19] => [56, ]
|
||||||
|
cg id: 0 [19, 21] => [78]
|
||||||
|
```
|
||||||
|
|
||||||
## Flags
|
## Flags
|
||||||
|
|
||||||
It is possible to set some flags in the regex parser that change the behavior of the parser itself.
|
It is possible to set some flags in the regex parser that change the behavior of the parser itself.
|
||||||
|
|
|
@ -266,23 +266,20 @@ fn (tok mut Token) reset() {
|
||||||
*
|
*
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
pub const (
|
pub const (
|
||||||
//F_FND = 0x00000001 // check until the end of the input string, it act like a "find first match", not efficient!!
|
F_NL = 0x00000001 // end the match when find a new line symbol
|
||||||
//F_PM = 0x00000004 // partial match: if the source text finish and the match is positive until then return true
|
F_MS = 0x00000002 // match true only if the match is at the start of the string
|
||||||
|
F_ME = 0x00000004 // match true only if the match is at the end of the string
|
||||||
|
|
||||||
F_NL = 0x00000002 // end the match when find a new line symbol
|
F_EFM = 0x00000100 // exit on first token matched, used by search
|
||||||
F_MS = 0x00000008 // match true only if the match is at the start of the string
|
F_BIN = 0x00000200 // work only on bytes, ignore utf-8
|
||||||
F_ME = 0x00000010 // match true only if the match is at the end of the string
|
|
||||||
|
|
||||||
F_EFM = 0x01000000 // exit on first token matched, used by search
|
|
||||||
F_BIN = 0x02000000 // work only on bytes, ignore utf-8
|
|
||||||
)
|
)
|
||||||
|
|
||||||
struct StateDotObj{
|
struct StateDotObj{
|
||||||
mut:
|
mut:
|
||||||
i int = -1 // char index in the input buffer
|
i int = -1 // char index in the input buffer
|
||||||
pc int = -1 // program counter saved
|
pc int = -1 // program counter saved
|
||||||
mi int = -1 // match_index saved
|
mi int = -1 // match_index saved
|
||||||
group_stack_index int = -1 // group index stack pointer saved
|
group_stack_index int = -1 // continuous save on capturing groups
|
||||||
}
|
}
|
||||||
|
|
||||||
pub
|
pub
|
||||||
|
@ -305,6 +302,9 @@ pub mut:
|
||||||
group_max_nested int = 3 // max nested group
|
group_max_nested int = 3 // max nested group
|
||||||
group_max int = 8 // max allowed number of different groups
|
group_max int = 8 // max allowed number of different groups
|
||||||
|
|
||||||
|
group_csave []int = []int // groups continuous save array
|
||||||
|
group_csave_index int= -1 // groups continuous save index
|
||||||
|
|
||||||
// flags
|
// flags
|
||||||
flag int = 0 // flag for optional parameters
|
flag int = 0 // flag for optional parameters
|
||||||
|
|
||||||
|
@ -328,6 +328,12 @@ fn (re mut RE) reset(){
|
||||||
re.groups = [-1].repeat(re.group_count*2)
|
re.groups = [-1].repeat(re.group_count*2)
|
||||||
|
|
||||||
re.state_stack_index = -1
|
re.state_stack_index = -1
|
||||||
|
|
||||||
|
// reset group_csave
|
||||||
|
if re.group_csave.len > 0 {
|
||||||
|
re.group_csave_index = 1
|
||||||
|
re.group_csave[0] = 0 // reset the capture count
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/******************************************************************************
|
/******************************************************************************
|
||||||
|
@ -734,8 +740,6 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// not a {} quantifier, exit
|
// not a {} quantifier, exit
|
||||||
return ERR_SYNTAX_ERROR, i, 0, false
|
return ERR_SYNTAX_ERROR, i, 0, false
|
||||||
}
|
}
|
||||||
|
@ -997,7 +1001,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
|
||||||
// Post processing
|
// Post processing
|
||||||
//******************************************
|
//******************************************
|
||||||
|
|
||||||
|
|
||||||
// count IST_DOT_CHAR to set the size of the state stack
|
// count IST_DOT_CHAR to set the size of the state stack
|
||||||
mut pc1 := 0
|
mut pc1 := 0
|
||||||
mut tmp_count := 0
|
mut tmp_count := 0
|
||||||
|
@ -1054,7 +1057,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
|
||||||
pc1++
|
pc1++
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//******************************************
|
//******************************************
|
||||||
// DEBUG PRINT REGEX GENERATED CODE
|
// DEBUG PRINT REGEX GENERATED CODE
|
||||||
//******************************************
|
//******************************************
|
||||||
|
@ -1075,14 +1077,15 @@ pub fn (re RE) get_code() string {
|
||||||
mut stop_flag := false
|
mut stop_flag := false
|
||||||
|
|
||||||
for pc1 <= re.prog.len {
|
for pc1 <= re.prog.len {
|
||||||
|
tk := re.prog[pc1]
|
||||||
res.write("PC:${pc1:3d}")
|
res.write("PC:${pc1:3d}")
|
||||||
|
|
||||||
res.write(" ist: ")
|
res.write(" ist: ")
|
||||||
res.write("${re.prog[pc1].ist:8x}".replace(" ","0") )
|
res.write("${tk.ist:8x}".replace(" ","0") )
|
||||||
res.write(" ")
|
res.write(" ")
|
||||||
ist :=re.prog[pc1].ist
|
ist :=tk.ist
|
||||||
if ist == IST_BSLS_CHAR {
|
if ist == IST_BSLS_CHAR {
|
||||||
res.write("[\\${re.prog[pc1].ch:1c}] BSLS")
|
res.write("[\\${tk.ch:1c}] BSLS")
|
||||||
} else if ist == IST_PROG_END {
|
} else if ist == IST_PROG_END {
|
||||||
res.write("PROG_END")
|
res.write("PROG_END")
|
||||||
stop_flag = true
|
stop_flag = true
|
||||||
|
@ -1095,22 +1098,22 @@ pub fn (re RE) get_code() string {
|
||||||
} else if ist == IST_DOT_CHAR {
|
} else if ist == IST_DOT_CHAR {
|
||||||
res.write(". DOT_CHAR")
|
res.write(". DOT_CHAR")
|
||||||
} else if ist == IST_GROUP_START {
|
} else if ist == IST_GROUP_START {
|
||||||
res.write("( GROUP_START #:${re.prog[pc1].group_id}")
|
res.write("( GROUP_START #:${tk.group_id}")
|
||||||
} else if ist == IST_GROUP_END {
|
} else if ist == IST_GROUP_END {
|
||||||
res.write(") GROUP_END #:${re.prog[pc1].group_id}")
|
res.write(") GROUP_END #:${tk.group_id}")
|
||||||
} else if ist == IST_SIMPLE_CHAR {
|
} else if ist == IST_SIMPLE_CHAR {
|
||||||
res.write("[${re.prog[pc1].ch:1c}] query_ch")
|
res.write("[${tk.ch:1c}] query_ch")
|
||||||
}
|
}
|
||||||
|
|
||||||
if re.prog[pc1].rep_max == MAX_QUANTIFIER {
|
if tk.rep_max == MAX_QUANTIFIER {
|
||||||
res.write(" {${re.prog[pc1].rep_min:3d},MAX}")
|
res.write(" {${tk.rep_min:3d},MAX}")
|
||||||
}else{
|
}else{
|
||||||
if ist == IST_OR_BRANCH {
|
if ist == IST_OR_BRANCH {
|
||||||
res.write(" if false go: ${re.prog[pc1].rep_min:3d} if true go: ${re.prog[pc1].rep_max:3d}")
|
res.write(" if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}")
|
||||||
} else {
|
} else {
|
||||||
res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}")
|
res.write(" {${tk.rep_min:3d},${tk.rep_max:3d}}")
|
||||||
}
|
}
|
||||||
if re.prog[pc1].greedy == true {
|
if tk.greedy == true {
|
||||||
res.write("?")
|
res.write("?")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1123,11 +1126,9 @@ pub fn (re RE) get_code() string {
|
||||||
|
|
||||||
res.write("========================================\n")
|
res.write("========================================\n")
|
||||||
return res.str()
|
return res.str()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// get_query return a string with a reconstruction of the query starting from the regex program code
|
// get_query return a string with a reconstruction of the query starting from the regex program code
|
||||||
|
|
||||||
pub fn (re RE) get_query() string {
|
pub fn (re RE) get_query() string {
|
||||||
mut res := strings.new_builder(re.query.len*2)
|
mut res := strings.new_builder(re.query.len*2)
|
||||||
|
|
||||||
|
@ -1137,14 +1138,15 @@ pub fn (re RE) get_query() string {
|
||||||
|
|
||||||
mut i := 0
|
mut i := 0
|
||||||
for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{
|
for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{
|
||||||
ch := re.prog[i].ist
|
tk := &re.prog[i]
|
||||||
|
ch := tk.ist
|
||||||
|
|
||||||
// GROUP start
|
// GROUP start
|
||||||
if ch == IST_GROUP_START {
|
if ch == IST_GROUP_START {
|
||||||
if re.debug == 0 {
|
if re.debug == 0 {
|
||||||
res.write("(")
|
res.write("(")
|
||||||
} else {
|
} else {
|
||||||
res.write("#${re.prog[i].group_id}(")
|
res.write("#${tk.group_id}(")
|
||||||
}
|
}
|
||||||
i++
|
i++
|
||||||
continue
|
continue
|
||||||
|
@ -1159,7 +1161,7 @@ pub fn (re RE) get_query() string {
|
||||||
if ch == IST_OR_BRANCH {
|
if ch == IST_OR_BRANCH {
|
||||||
res.write("|")
|
res.write("|")
|
||||||
if re.debug > 0 {
|
if re.debug > 0 {
|
||||||
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
|
res.write("{${tk.rep_min},${tk.rep_max}}")
|
||||||
}
|
}
|
||||||
i++
|
i++
|
||||||
continue
|
continue
|
||||||
|
@ -1177,7 +1179,7 @@ pub fn (re RE) get_query() string {
|
||||||
|
|
||||||
// bsls char
|
// bsls char
|
||||||
if ch == IST_BSLS_CHAR {
|
if ch == IST_BSLS_CHAR {
|
||||||
res.write("\\${re.prog[i].ch:1c}")
|
res.write("\\${tk.ch:1c}")
|
||||||
}
|
}
|
||||||
|
|
||||||
// IST_DOT_CHAR
|
// IST_DOT_CHAR
|
||||||
|
@ -1190,29 +1192,28 @@ pub fn (re RE) get_query() string {
|
||||||
if byte(ch) in BSLS_ESCAPE_LIST {
|
if byte(ch) in BSLS_ESCAPE_LIST {
|
||||||
res.write("\\")
|
res.write("\\")
|
||||||
}
|
}
|
||||||
res.write("${re.prog[i].ch:c}")
|
res.write("${tk.ch:c}")
|
||||||
}
|
}
|
||||||
|
|
||||||
// quantifier
|
// quantifier
|
||||||
if !(re.prog[i].rep_min == 1 && re.prog[i].rep_max == 1) {
|
if !(tk.rep_min == 1 && tk.rep_max == 1) {
|
||||||
if re.prog[i].rep_min == 0 && re.prog[i].rep_max == 1 {
|
if tk.rep_min == 0 && tk.rep_max == 1 {
|
||||||
res.write("?")
|
res.write("?")
|
||||||
} else if re.prog[i].rep_min == 1 && re.prog[i].rep_max == MAX_QUANTIFIER {
|
} else if tk.rep_min == 1 && tk.rep_max == MAX_QUANTIFIER {
|
||||||
res.write("+")
|
res.write("+")
|
||||||
} else if re.prog[i].rep_min == 0 && re.prog[i].rep_max == MAX_QUANTIFIER {
|
} else if tk.rep_min == 0 && tk.rep_max == MAX_QUANTIFIER {
|
||||||
res.write("*")
|
res.write("*")
|
||||||
} else {
|
} else {
|
||||||
if re.prog[i].rep_max == MAX_QUANTIFIER {
|
if tk.rep_max == MAX_QUANTIFIER {
|
||||||
res.write("{${re.prog[i].rep_min},MAX}")
|
res.write("{${tk.rep_min},MAX}")
|
||||||
} else {
|
} else {
|
||||||
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
|
res.write("{${tk.rep_min},${tk.rep_max}}")
|
||||||
}
|
}
|
||||||
if re.prog[i].greedy == true {
|
if tk.greedy == true {
|
||||||
res.write("?")
|
res.write("?")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
if (re.flag & F_ME) != 0 {
|
if (re.flag & F_ME) != 0 {
|
||||||
|
@ -1411,6 +1412,20 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
re.groups[g_index] = 0
|
re.groups[g_index] = 0
|
||||||
}
|
}
|
||||||
re.groups[g_index+1] = i
|
re.groups[g_index+1] = i
|
||||||
|
|
||||||
|
// continuous save, save until we have space
|
||||||
|
if re.group_csave_index > 0 {
|
||||||
|
// check if we have space to save the record
|
||||||
|
if (re.group_csave_index + 3) < re.group_csave.len {
|
||||||
|
// incrment counter
|
||||||
|
re.group_csave[0]++
|
||||||
|
// save the record
|
||||||
|
re.group_csave[re.group_csave_index++] = g_index // group id
|
||||||
|
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
|
||||||
|
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
group_index--
|
group_index--
|
||||||
|
@ -1543,6 +1558,19 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
}
|
}
|
||||||
re.groups[g_index+1] = i
|
re.groups[g_index+1] = i
|
||||||
//C.printf("GROUP %d END [%d, %d]\n", re.prog[pc].group_id, re.groups[g_index], re.groups[g_index+1])
|
//C.printf("GROUP %d END [%d, %d]\n", re.prog[pc].group_id, re.groups[g_index], re.groups[g_index+1])
|
||||||
|
|
||||||
|
// continuous save, save until we have space
|
||||||
|
if re.group_csave_index > 0 {
|
||||||
|
// check if we have space to save the record
|
||||||
|
if (re.group_csave_index + 3) < re.group_csave.len {
|
||||||
|
// incrment counter
|
||||||
|
re.group_csave[0]++
|
||||||
|
// save the record
|
||||||
|
re.group_csave[re.group_csave_index++] = g_index // group id
|
||||||
|
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
|
||||||
|
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
re.prog[pc].group_rep++ // increase repetitions
|
re.prog[pc].group_rep++ // increase repetitions
|
||||||
|
@ -1796,8 +1824,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
if rep < re.prog[tmp_pc].rep_min {
|
if rep < re.prog[tmp_pc].rep_min {
|
||||||
//C.printf("ist_quant_pg UNDER RANGE\n")
|
//C.printf("ist_quant_pg UNDER RANGE\n")
|
||||||
pc = re.prog[tmp_pc].goto_pc
|
pc = re.prog[tmp_pc].goto_pc
|
||||||
//group_index--
|
|
||||||
|
|
||||||
m_state = .ist_next
|
m_state = .ist_next
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -1841,12 +1867,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
m_state = .ist_next // go to next ist
|
m_state = .ist_next // go to next ist
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// match failed
|
|
||||||
else if rep == 0 && re.prog[pc].rep_min > 0 {
|
|
||||||
//C.printf("ist_quant_n NO MATCH\n")
|
|
||||||
// dummy
|
|
||||||
}
|
|
||||||
// match + or *
|
// match + or *
|
||||||
else if rep >= re.prog[pc].rep_min {
|
else if rep >= re.prog[pc].rep_min {
|
||||||
//C.printf("ist_quant_n MATCH RANGE\n")
|
//C.printf("ist_quant_n MATCH RANGE\n")
|
||||||
|
@ -1902,7 +1922,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
m_state = .ist_next
|
m_state = .ist_next
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
m_state = .ist_load
|
m_state = .ist_load
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -1981,6 +2000,9 @@ pub fn (re mut RE) match_string(in_txt string) (int,int) {
|
||||||
return NO_MATCH_FOUND, 0
|
return NO_MATCH_FOUND, 0
|
||||||
}
|
}
|
||||||
if (re.flag & F_ME) != 0 && end < in_txt.len {
|
if (re.flag & F_ME) != 0 && end < in_txt.len {
|
||||||
|
if in_txt[end] in NEW_LINE_LIST {
|
||||||
|
return start, end
|
||||||
|
}
|
||||||
return NO_MATCH_FOUND, 0
|
return NO_MATCH_FOUND, 0
|
||||||
}
|
}
|
||||||
return start, end
|
return start, end
|
||||||
|
@ -2002,7 +2024,7 @@ pub fn (re mut RE) find(in_txt string) (int,int) {
|
||||||
for i < in_txt.len {
|
for i < in_txt.len {
|
||||||
|
|
||||||
// test only the first part of the query string
|
// test only the first part of the query string
|
||||||
re.flag &= F_EFM // set to exit on the first token match
|
re.flag |= F_EFM // set to exit on the first token match
|
||||||
mut tmp_end := i+re.query.len
|
mut tmp_end := i+re.query.len
|
||||||
if tmp_end > in_txt.len { tmp_end = in_txt.len }
|
if tmp_end > in_txt.len { tmp_end = in_txt.len }
|
||||||
tmp_txt := string{ str: in_txt.str+i, len: tmp_end-i }
|
tmp_txt := string{ str: in_txt.str+i, len: tmp_end-i }
|
||||||
|
|
Loading…
Reference in New Issue