regex: refactoring of continuous capturing groups (#7310)

pull/7324/head
penguindark 2020-12-14 14:02:13 +01:00 committed by GitHub
parent 89ef316db3
commit 4f986ccac4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 52 additions and 35 deletions

View File

@ -237,7 +237,7 @@ This feature allow to collect data in a continuous way.
In the example we pass a text followed by an integer list that we want to collect. In the example we pass a text followed by an integer list that we want to collect.
To achieve this task we can use the continuous saving of the group To achieve this task we can use the continuous saving of the group
that save each captured group in a array that we set with: `re.group_csave = [-1].repeat(3*20+1)`. enabling the right flag: `re.group_csave_flag = true`.
The array will be filled with the following logic: The array will be filled with the following logic:
@ -250,14 +250,14 @@ The array will be filled with the following logic:
The regex saves records until it finishes or finds that the array has no space left. The regex saves records until it finishes or finds that the array has no space left.
If the space ends no error is raised, further records will not be saved. If the space ends no error is raised, further records will not be saved.
```v oksyntax ```v ignore
fn example2() { fn example2() {
test_regex() test_regex()
text := 'tst: 01,23,45 ,56, 78' text := 'tst: 01,23,45 ,56, 78'
query := r'.*:(\s*\d+[\s,]*)+' query := r'.*:(\s*\d+[\s,]*)+'
mut re := new() or { panic(err) } mut re := new() or { panic(err) }
// re.debug = 2 // re.debug = 2
re.group_csave = [-1].repeat(3 * 20 + 1) // we expect max 20 records re.group_csave_flag = true // enable continuous capture
re.compile_opt(query) or { re.compile_opt(query) or {
println(err) println(err)
return return
@ -330,7 +330,7 @@ fn main() {
query := r'(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+' query := r'(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+'
mut re := new() mut re := new()
re.debug = 2 re.debug = 2
// must provide an array of the right size if want the continuos saving of the groups // must provide an array of the right size if want the continuous saving of the groups
re.group_csave = [-1].repeat(3 * 20 + 1) re.group_csave = [-1].repeat(3 * 20 + 1)
re.compile_opt(query) or { re.compile_opt(query) or {
println(err) println(err)

View File

@ -314,8 +314,8 @@ pub mut:
group_max_nested int = 3 // max nested group group_max_nested int = 3 // max nested group
group_max int = 8 // max allowed number of different groups group_max int = 8 // max allowed number of different groups
group_csave []int = []int{} // groups continuous save array group_csave_flag bool // flag to enable continuous saving
group_csave_index int = -1 // groups continuous save index group_csave []int = []int{} // groups continuous save list
group_map map[string]int // groups names map group_map map[string]int // groups names map
@ -344,10 +344,7 @@ fn (mut re RE) reset(){
re.state_stack_index = -1 re.state_stack_index = -1
// reset group_csave // reset group_csave
if re.group_csave.len > 0 { re.group_csave = []int{}
re.group_csave_index = 1
re.group_csave[0] = 0 // reset the capture count
}
} }
// reset for search mode fail // reset for search mode fail
@ -1482,6 +1479,45 @@ pub fn (re RE) get_query() string {
/* /*
Groups saving utilities
*/
// group_continuous_save records the capture held at `g_index` into `re.group_csave`
// when `re.group_csave_flag` is set; with the flag off it does nothing.
// Layout of `re.group_csave`: element 0 counts the saved records, and every
// record after it is the triple (group id, start, end).
[inline]
fn (mut re RE) group_continuous_save(g_index int) {
	// nothing to record unless continuous capture was requested
	if !re.group_csave_flag {
		return
	}
	// slot 0 is the record counter: create it on first use
	if re.group_csave.len == 0 {
		re.group_csave << 0
	}
	group_id := g_index >> 1
	span_start := re.groups[g_index]
	span_end := re.groups[g_index + 1]
	n := re.group_csave.len
	// same group id and same start as the last saved record: the found group
	// only grew, so update its end instead of appending a duplicate
	extends_last := n >= 4 && group_id == re.group_csave[n - 3] && span_start == re.group_csave[n - 2]
	if extends_last {
		re.group_csave[n - 1] = span_end
		return
	}
	// otherwise this is a new record: bump the counter, then append the triple
	re.group_csave[0]++
	re.group_csave << group_id // group id
	re.group_csave << span_start // start
	re.group_csave << span_end // end
}
/*
Matching Matching
*/ */
@ -1684,17 +1720,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
} }
// continuous save, save until we have space // continuous save, save until we have space
if re.group_csave_index > 0 { re.group_continuous_save(g_index)
// check if we have space to save the record
if (re.group_csave_index + 3) < re.group_csave.len {
// incrment counter
re.group_csave[0]++
// save the record
re.group_csave[re.group_csave_index++] = g_index >> 1 // group id
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
}
}
} }
@ -1879,17 +1905,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println("GROUP ${re.prog[pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}]") //println("GROUP ${re.prog[pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}]")
// continuous save, save until we have space // continuous save, save until we have space
if re.group_csave_index > 0 { re.group_continuous_save(g_index)
// check if we have space to save the record
if (re.group_csave_index + 3) < re.group_csave.len {
// incrment counter
re.group_csave[0]++
// save the record
re.group_csave[re.group_csave_index++] = g_index >> 1 // group id
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
}
}
} }
re.prog[pc].group_rep++ // increase repetitions re.prog[pc].group_rep++ // increase repetitions

View File

@ -230,7 +230,8 @@ fn test_regex(){
} }
if to.cgn.len > 0 { if to.cgn.len > 0 {
re.group_csave = [-1].repeat(3*20+1) re.group_csave_flag = true
//re.group_csave = [-1].repeat(3*20+1)
if debug { println("continuous save")} if debug { println("continuous save")}
} else { } else {
if debug { println("NO continuous save")} if debug { println("NO continuous save")}