regex: add a find_all_str function (#7517)

pull/7530/head
penguindark 2020-12-24 06:27:46 +01:00 committed by GitHub
parent 36dcace0a7
commit 2824e07baa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 248 additions and 198 deletions

View File

@ -473,7 +473,7 @@ pub fn new() RE
```
#### **Custom initialization**
For some particular need it is possible initialize a fully customized regex:
For some particular needs it is possible to initialize a fully customized regex manually:
```v ignore
pattern = r"ab(.*)(ac)"
// init custom regex
@ -484,6 +484,8 @@ re.cc = []CharClass{len: pattern.len} // can not be more char class the th
re.group_csave_flag = false // set to true to enable continuous group saving if needed
re.group_max_nested = 128 // set the maximum number of nested groups to 128
re.group_max = pattern.len>>1 // we can't have more groups than half of the pattern length
re.group_stack = []int{len: re.group_max, init: -1}
re.group_data = []int{len: re.group_max, init: -1}
```
### Compiling
@ -494,22 +496,14 @@ After an initializer is used, the regex expression must be compiled with:
pub fn (re mut RE) compile_opt(in_txt string) ?
```
### Operative Functions
### Matching Functions
These are the operative functions
These are the matching functions
```v ignore
// match_string try to match the input string, return start and end index if found else start is -1
pub fn (re mut RE) match_string(in_txt string) (int,int)
// find try to find the first match in the input string, return start and end index if found else start is -1
pub fn (re mut RE) find(in_txt string) (int,int)
// find_all find all the "non overlapping" occurrences of the matching pattern, return a list of start end indexes
pub fn (re mut RE) find_all(in_txt string) []int
// replace return a string where the matches are replaced with the replace string, only non overlapped matches are used
pub fn (re mut RE) replace(in_txt string, repl string) string
```
## Find and Replace
@ -519,13 +513,19 @@ There are the following find and replace functions:
#### Find functions
```v ignore
// find try to find the first match in the input string, return start and end index if found else start is -1
// find try to find the first match in the input string
// return start and end index if found else start is -1
pub fn (re mut RE) find(in_txt string) (int,int)
// find_all find all the "non overlapping" occurrences of the matching pattern
// return a list of start end indexes like: [3,4,6,8]
// the matches are [3,4] and [6,8]
pub fn (re mut RE) find_all(in_txt string) []int
// find_all_str find all the "non overlapping" occurrences of the matching pattern
// return a list of strings
// the result is like ["first match","second match"]
pub fn (mut re RE) find_all_str(in_txt string) []string
```
#### Replace functions
@ -543,10 +543,12 @@ The`replace_by_fn` use a custom replace function making possible customizations.
The custom function must be of the type:
```v ignore
// re RE struct
// in_txt all the text passed to the regex expression
// the match is: in_txt[start..end]
fn (re RE, in_txt string, start int, end int) string
// type of function used for custom replace
// in_txt source text
// start index of the start of the match in in_txt
// end index of the end of the match in in_txt
// --- the match is in in_txt[start..end] ---
fn (re RE, in_txt string, start int, end int) string
```
The following example will clarify the use:

View File

@ -1554,14 +1554,14 @@ fn state_str(s Match_state) string {
struct StateObj {
pub mut:
group_index int = -1 // group id used to know how many groups are open
match_flag bool
match_index int = -1
first_match int = -1 //index of the first match
pc int = -1 // program counter
i int = -1 // source string index
char_len int
last_dot_pc int = -1 // last dot chat pc
group_index int = -1 // group id used to know how many groups are open
match_flag bool // indicate if we are in a match condition
match_index int = -1 // index of the last match
first_match int = -1 // index of the first match
pc int = -1 // program counter
i int = -1 // source string index
char_len int // last char legth
last_dot_pc int = -1 // last dot chat pc
}
[direct_array_access]
@ -1579,13 +1579,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
mut ist := rune(0) // actual instruction
mut l_ist := rune(0) // last matched instruction
//mut state_list := []StateObj{}
//mut group_stack := []int{len: re.group_max, init: -1}
//mut group_data := []int{len: re.group_max, init: -1}
//mut group_index := -1 // group id used to know how many groups are open
mut step_count := 0 // stats for debug
mut dbg_line := 0 // count debug line printed
@ -1900,7 +1893,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
start_i := re.group_stack[state.group_index]
//re.group_stack[state.group_index]=-1
// save group results
g_index := re.prog[state.pc].group_id*2
@ -1960,8 +1952,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// check next token to be false
mut next_check_flag := false
//if re.prog[state.pc].rep >= re.prog[state.pc].rep_min &&
// if we are done with max go on dot char are dedicated case!!
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max
{
@ -2415,113 +2405,3 @@ pub fn (mut re RE) match_string(in_txt string) (int,int) {
}
return start, end
}
//
// Finders
//
// find try to find the first match in the input string
// in_txt: the text to search in
// returns: the start and end index of the match, or `no_match_found, 0`
// when match_base reports no start or an empty span
[direct_array_access]
pub fn (mut re RE) find(in_txt string) (int,int) {
// remember the caller's flags so they can be restored after the search
old_flag := re.flag
re.flag |= f_src // enable search mode
// the length handed to match_base is one more than the string length
start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
//print("Find [$start,$end] '${in_txt[start..end]}'")
// clamp the end index so it never points past the end of in_txt
if end > in_txt.len {
end = in_txt.len
}
re.flag = old_flag
// only a non empty span starting at a valid index counts as a match
if start >= 0 && end > start {
return start, end
}
return no_match_found, 0
}
// find all the non overlapping occurrences of the match pattern
// in_txt: the text to search in
// returns: a flat list of indexes [start0, end0, start1, end1, ...] relative to in_txt
[direct_array_access]
pub fn (mut re RE) find_all(in_txt string) []int {
mut i := 0 // offset in in_txt where the next search starts
mut res := []int{}
mut ls := -1 // absolute start index of the last stored match
for i < in_txt.len {
// s and e are relative to the slice in_txt[i..]
s,e := re.find(in_txt[i..])
if s >= 0 && e > s && i+s > ls {
//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
// store the absolute indexes and restart the search right after the match
res << i+s
res << i+e
ls = i+s
i = i+e
continue
} else {
// no usable match from this offset, retry one position further
i++
}
}
return res
}
// replace return a string where the matches are replaced with the replace string
// in_txt: the source text
// repl: the string inserted in place of every match
// returns: the rebuilt string, or in_txt itself when find_all reports no match
pub fn (mut re RE) replace(in_txt string, repl string) string {
// pos holds the index pairs [start0, end0, start1, end1, ...] of the matches
pos := re.find_all(in_txt)
if pos.len > 0 {
mut res := ""
mut i := 0
mut s1 := 0 // start of the text not yet copied into res
mut e1 := in_txt.len
for i < pos.len {
// copy the text before the match, then the replacement
e1 = pos[i]
res += in_txt[s1..e1] + repl
// continue copying from the end of the match
s1 = pos[i+1]
i += 2
}
// append the text that follows the last match
res += in_txt[s1..]
return res
}
return in_txt
}
// FnReplace is the type of the callback that replace_by_fn calls for every match;
// the match is in_txt[start..end] and the returned string takes its place
pub type FnReplace = fn (re RE, in_txt string, start int, end int) string
// replace_by_fn return a string where the matches are replaced with the string from the repl_fn callback function
// in_txt: the source text
// repl_fn: callback invoked for every match with the absolute start and end indexes
// returns: in_txt with every match substituted by the callback result
pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
mut i := 0 // offset in in_txt where the next search starts
mut res := ""
mut ls := -1 // absolute start index of the last replaced match
mut s1 := 0 // start of the text not yet copied into res
//mut e1 := in_txt.len
for i < in_txt.len {
// s and e are relative to the slice in_txt[i..]
s,e := re.find(in_txt[i..])
if s >= 0 && e > s && i+s > ls {
//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
start := i + s
end := i + e
// update groups index displacement: every entry of re.groups is shifted by i
mut gi := 0
for gi < re.groups.len {
re.groups[gi] += i
gi++
}
repl := repl_fn(re, in_txt, start, end)
// copy the text before the match, then the callback result
res += in_txt[s1..start] + repl
s1 = end
ls = i + s
i = i + e
continue
} else {
// no usable match from this offset, retry one position further
i++
}
}
// append the text that follows the last match
res += in_txt[s1..]
return res
}

View File

@ -17,9 +17,19 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
}
// new_regex create a RE of small size, usually sufficient for ordinary use
[deprecated]
pub fn new() RE {
return impl_new_regex_by_size(1)
// init regex
mut re := regex.RE{}
re.prog = []Token {len: max_code_len + 1} // max program length, can not be longer then the pattern
re.cc = []CharClass{len: max_code_len} // can not be more char class the the length of the pattern
re.group_csave_flag = false // enable continuos group saving
re.group_max_nested = 128 // set max 128 group nested
re.group_max = max_code_len >> 1 // we can't have more groups than the half of the pattern legth
re.group_stack = []int{len: re.group_max, init: -1}
re.group_data = []int{len: re.group_max, init: -1}
return re
}
// new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated

View File

@ -144,29 +144,6 @@ match_test_suite = [
]
)
// TestItemFa is one test case of the find_all check in test_regex
struct TestItemFa {
src string // text passed to find_all
q string // regex query compiled with regex_opt
r []int // expected result of find_all
}
// match_test_suite_fa lists the cases iterated by the find_all check in test_regex:
// source text, query and the expected list of start/end indexes
const (
match_test_suite_fa = [
// find_all tests
TestItemFa{
"oggi pippo è andato a casa di pluto ed ha trovato pippo",
r"p[iplut]+o",
[5, 10, 31, 36, 51, 56]
},
TestItemFa{
"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
r"(pi?(ba)+o)",
[5, 10, 31, 39, 54, 65]
},
]
)
struct TestItemRe {
src string
q string
@ -174,7 +151,7 @@ struct TestItemRe {
r string
}
const (
match_test_suite_re = [
match_test_suite_replace = [
// replace tests
TestItemRe{
"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
@ -241,12 +218,53 @@ cgroups_test_suite = [
]
)
// Test_find_all is one test case shared by the find_all and find_all_str checks in test_regex
struct Test_find_all {
src string // text to search in
q string // regex query compiled with regex_opt
res []int // expected find_all result: [0,4,5,6...]
res_str []string // expected find_all_str result: ['find0','find1'...]
}
// find_all_test_suite lists the cases iterated by the find_all / find_all_str check in test_regex;
// every case gives the source text, the query, the expected start/end index pairs
// and the expected matched strings
const (
find_all_test_suite = [
Test_find_all{
"abcd 1234 efgh 1234 ghkl1234 ab34546df",
r"\d+",
[5, 9, 15, 19, 24, 28, 31, 36],
['1234', '1234', '1234', '34546']
},
Test_find_all{
"abcd 1234 efgh 1234 ghkl1234 ab34546df",
r"\a+",
[0, 4, 10, 14, 20, 24, 29, 31, 36, 38],
['abcd', 'efgh', 'ghkl', 'ab', 'df']
},
Test_find_all{
"oggi pippo è andato a casa di pluto ed ha trovato pippo",
r"p[iplut]+o",
[5, 10, 31, 36, 51, 56],
['pippo', 'pluto', 'pippo']
},
Test_find_all{
"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
r"(pi?(ba)+o)",
[5, 10, 31, 39, 54, 65],
['pibao', 'pbababao', 'pibabababao']
},
Test_find_all{
"Today is a good day and tomorrow will be for sure.",
r"[Tt]o\w+",
[0, 5, 24, 32],
['Today', 'tomorrow']
}
]
)
const (
debug = false // true for debug println
)
fn test_regex(){
// check capturing groups
for c,to in cgroups_test_suite {
// debug print
@ -275,8 +293,8 @@ fn test_regex(){
if start != to.s || end != to.e {
//println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!")
C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
eprintln("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false
continue
}
@ -284,7 +302,7 @@ fn test_regex(){
// check cgroups
if to.cgn.len > 0 {
if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
println("Capturing group len error! found: ${re.group_csave[0]} true ground: ${to.cg[0]}")
eprintln("Capturing group len error! found: ${re.group_csave[0]} true ground: ${to.cg[0]}")
assert false
continue
}
@ -293,7 +311,7 @@ fn test_regex(){
mut ln := re.group_csave[0]*3
for ln > 0 {
if re.group_csave[ln] != to.cg[ln] {
println("Capturing group failed on $ln item!")
eprintln("Capturing group failed on $ln item!")
assert false
}
ln--
@ -302,7 +320,7 @@ fn test_regex(){
// check named captured groups
for k in to.cgn.keys() {
if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
println("Named capturing group error! [$k]")
eprintln("Named capturing group error! [$k]")
assert false
continue
}
@ -314,9 +332,9 @@ fn test_regex(){
}
for ln:=0; ln < re.groups.len; ln++ {
if re.groups[ln] != to.cg[ln] {
println("Capture group doesn't match:")
println("true ground: [${to.cg}]")
println("elaborated : [${re.groups}]")
eprintln("Capture group doesn't match:")
eprintln("true ground: [${to.cg}]")
eprintln("elaborated : [${re.groups}]")
assert false
}
}
@ -324,9 +342,9 @@ fn test_regex(){
}
// check find_all
for c,to in match_test_suite_fa{
for c,to in find_all_test_suite {
// debug print
if debug { println("#$c [$to.src] q[$to.q] $to.r") }
if debug { println("#$c [$to.src] q[$to.q] ($to.res, $to.res_str)") }
mut re := regex.regex_opt(to.q) or {
eprintln('err: $err')
@ -334,25 +352,24 @@ fn test_regex(){
continue
}
re.reset()
res := re.find_all(to.src)
if res.len != to.r.len {
println("ERROR: find_all, array of different size.")
if res != to.res {
eprintln('err: find_all !!')
if debug { println("#$c exp: $to.res calculated: $res") }
assert false
continue
}
for c1,i in res {
if i != to.r[c1] {
println("ERROR: find_all, different indexes.")
assert false
continue
}
res_str := re.find_all_str(to.src)
if res_str != to.res_str {
eprintln('err: find_all_str !!')
if debug { println("#$c exp: $to.res_str calculated: $res_str") }
assert false
}
}
// check replace
for c,to in match_test_suite_re{
for c,to in match_test_suite_replace{
// debug print
if debug { println("#$c [$to.src] q[$to.q] $to.r") }
@ -364,7 +381,7 @@ fn test_regex(){
res := re.replace(to.src,to.rep)
if res != to.r {
println("ERROR: replace.")
eprintln("ERROR: replace.")
assert false
continue
}
@ -383,12 +400,12 @@ fn test_regex(){
continue
}
// q_str := re.get_query()
// println("Query: $q_str")
// eprintln("Query: $q_str")
start,end := re.find(to.src)
if start != to.s || end != to.e {
err_str := re.get_parse_error_string(start)
println("ERROR : $err_str start: ${start} end: ${end}")
eprintln("ERROR : $err_str start: ${start} end: ${end}")
assert false
} else {
//tmp_str := text[start..end]
@ -416,8 +433,8 @@ fn test_regex(){
}
if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!")
eprintln("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
eprintln("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false
continue
@ -427,7 +444,7 @@ fn test_regex(){
tmp_str1 := to.src.clone()
start1, end1 := re.match_string(tmp_str1)
if start1 != start || end1 != end {
println("two run ERROR!!")
eprintln("two run ERROR!!")
assert false
continue
}

View File

@ -117,6 +117,7 @@ pub fn (re RE) get_group_list() []Re_group {
mut res := []Re_group{len: re.groups.len >> 1}
mut gi := 0
//println("len: ${re.groups.len} groups: ${re.groups}")
for gi < re.groups.len {
if re.groups[gi] >= 0 {
txt_st := re.groups[gi]
@ -136,3 +137,143 @@ pub fn (re RE) get_group_list() []Re_group {
return res
}
/******************************************************************************
*
* Finders
*
******************************************************************************/
// find searches in_txt for the first occurrence of the pattern.
// It returns the start and end index of the match (the match is in_txt[start..end]),
// or `no_match_found, 0` when there is no match or the matched span is empty.
[direct_array_access]
pub fn (mut re RE) find(in_txt string) (int, int) {
	// switch the engine to search mode for this call only
	saved_flag := re.flag
	re.flag |= f_src
	first, last := re.match_base(in_txt.str, in_txt.len + 1)
	re.flag = saved_flag

	// match_base was given a length one past the string, so clamp the end index
	stop := if last > in_txt.len { in_txt.len } else { last }
	if first < 0 || stop <= first {
		return no_match_found, 0
	}
	return first, stop
}
// find_all collects every non overlapping occurrence of the pattern in in_txt.
// The result is a flat list of index pairs: [start0, end0, start1, end1, ...],
// all of them relative to in_txt.
[direct_array_access]
pub fn (mut re RE) find_all(in_txt string) []int {
	mut pairs := []int{}
	mut pos := 0 // offset in in_txt where the next search starts
	mut last_start := -1 // absolute start of the last stored match
	for pos < in_txt.len {
		// find works on the slice, so its indexes are relative to pos
		rel_s, rel_e := re.find(in_txt[pos..])
		abs_s := pos + rel_s
		abs_e := pos + rel_e
		if rel_s < 0 || rel_e <= rel_s || abs_s <= last_start {
			// nothing usable from this offset, retry one position further
			pos++
			continue
		}
		pairs << abs_s
		pairs << abs_e
		last_start = abs_s
		pos = abs_e
	}
	return pairs
}
// find_all_str collects the text of every non overlapping occurrence of the pattern in in_txt.
// It scans the string with the same stepping rule as find_all, but stores
// the matched substrings instead of their indexes.
[direct_array_access]
pub fn (mut re RE) find_all_str(in_txt string) []string {
	mut found := []string{}
	mut pos := 0 // offset in in_txt where the next search starts
	mut last_start := -1 // absolute start of the last stored match
	for pos < in_txt.len {
		// find works on the slice, so its indexes are relative to pos
		rel_s, rel_e := re.find(in_txt[pos..])
		abs_s := pos + rel_s
		abs_e := pos + rel_e
		if rel_s < 0 || rel_e <= rel_s || abs_s <= last_start {
			// nothing usable from this offset, retry one position further
			pos++
			continue
		}
		found << in_txt[abs_s..abs_e]
		last_start = abs_s
		pos = abs_e
	}
	return found
}
/******************************************************************************
*
* Replacers
*
******************************************************************************/
// replace returns a copy of in_txt where every non overlapping match of the
// pattern is substituted with repl. When find_all reports no match,
// in_txt is returned as it is.
pub fn (mut re RE) replace(in_txt string, repl string) string {
	// bounds is the flat list [start0, end0, start1, end1, ...] of the matches
	bounds := re.find_all(in_txt)
	if bounds.len == 0 {
		return in_txt
	}
	mut out := ""
	mut copied_to := 0 // first index of in_txt not yet written to out
	for k := 0; k < bounds.len; k += 2 {
		// text between the previous match and this one, then the replacement
		out += in_txt[copied_to..bounds[k]] + repl
		copied_to = bounds[k + 1]
	}
	// tail of the text after the last match
	out += in_txt[copied_to..]
	return out
}
// FnReplace is the type of the callback used by replace_by_fn for custom replacements
// re      the RE struct handed over by replace_by_fn
// in_txt  source text
// start   index of the start of the match in in_txt
// end     index of the end of the match in in_txt
// the match is in in_txt[start..end]
// the returned string is inserted in place of the match
pub type FnReplace = fn (re RE, in_txt string, start int, end int) string
// replace_by_fn return a string where the matches are replaced with the string from the repl_fn callback function
// in_txt: the source text
// repl_fn: callback called for every non overlapping match with the absolute
//          start and end index of the match, its result replaces in_txt[start..end]
// returns: in_txt with every match substituted by the callback result
pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
	mut i := 0 // offset in in_txt where the next search starts
	mut res := ""
	mut ls := -1 // absolute start of the last replaced match
	mut s1 := 0 // first index of in_txt not yet copied into res
	for i < in_txt.len {
		// find works on the slice, so s and e are relative to i
		s, e := re.find(in_txt[i..])
		if s >= 0 && e > s && i + s > ls {
			start := i + s
			end := i + e
			// The group indexes are relative to in_txt[i..] as well: shift them
			// so the callback can use them directly on in_txt.
			// Negative entries are left untouched: get_group_list only treats
			// entries >= 0 as captured groups, so shifting a negative marker
			// would turn a group without a capture into a bogus one.
			for gi in 0 .. re.groups.len {
				if re.groups[gi] >= 0 {
					re.groups[gi] += i
				}
			}
			repl := repl_fn(re, in_txt, start, end)
			// text before the match, then the callback result
			res += in_txt[s1..start] + repl
			s1 = end
			ls = start
			i = end
			continue
		}
		// no usable match from this offset, retry one position further
		i++
	}
	// tail of the text after the last match
	res += in_txt[s1..]
	return res
}