regex: lots of fixes (#7380)

pull/7385/head
penguindark 2020-12-18 05:57:31 +01:00 committed by GitHub
parent 05e15bdd59
commit a6baffcb8c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 635 additions and 485 deletions

View File

@ -54,13 +54,13 @@ fn convert_html_rgb_n(in_col string) u32 {
println("start: $start, end: $end")
mut res := u32(0)
if start >= 0 {
red_s, red_e := re.get_group("red")
red_s, red_e := re.get_group_bounds_by_name("red")
r := ("0x" + in_col[red_s..red_e]).int() << col_mul
green_s, green_e := re.get_group("green")
green_s, green_e := re.get_group_bounds_by_name("green")
g := ("0x" + in_col[green_s..green_e]).int() << col_mul
blue_s, blue_e := re.get_group("blue")
blue_s, blue_e := re.get_group_bounds_by_name("blue")
b := ("0x" + in_col[blue_s..blue_e]).int() << col_mul
println("r: $r g: $g b: $b")

View File

@ -1,4 +1,4 @@
# V RegEx (Regular expression) 0.9h
# V RegEx (Regular expression) 1.0 alpha
[TOC]
@ -226,7 +226,18 @@ fn convert_html_rgb(in_col string) u32 {
}
```
Other utility functions are `get_group_by_id` and `get_group_bounds_by_id`,
which directly return the string of a group, or its bounds, using its `id`:
```v ignore
txt := "my used string...."
for g_index := 0; g_index < re.group_count ; g_index++ {
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}")
}
```
More helper functions are listed in the **Groups query functions** section.
### Groups Continuous saving
@ -251,43 +262,35 @@ The regex save until finish or found that the array have no space.
If the space runs out, no error is raised; further records will simply not be saved.
```v ignore
fn example2() {
test_regex()
text := 'tst: 01,23,45 ,56, 78'
query := r'.*:(\s*\d+[\s,]*)+'
mut re := new() or { panic(err) }
// re.debug = 2
re.group_csave_flag = true // enable continuous capture
re.compile_opt(query) or {
println(err)
return
}
q_str := re.get_query()
println('Query: $q_str')
start, end := re.match_string(text)
if start < 0 {
println('ERROR : ${re.get_parse_error_string(start)}, $start')
import regex
fn main(){
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
mut re := regex.regex_opt(query) or { panic(err) }
//println(re.get_code()) // uncomment to see the print of the regex execution code
re.debug=2 // enable maximum log
println("String: ${txt}")
println("Query : ${re.get_query()}")
re.debug=0 // disable log
re.group_csave_flag = true
start, end := re.match_string(txt)
if start >= 0 {
println("Match ($start, $end) => [${txt[start..end]}]")
} else {
println('found in [$start, $end] => [${text[start..end]}]')
println("No Match")
}
// groups capture
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println('${gi / 2} ${re.groups[gi]},${re.groups[gi + 1]} :[${text[re.groups[gi]..re.groups[gi +
1]]}]')
if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{
println("cg: $re.group_csave")
mut cs_i := 1
for cs_i < re.group_csave[0]*3 {
g_id := re.group_csave[cs_i]
st := re.group_csave[cs_i+1]
en := re.group_csave[cs_i+2]
println("cg[$g_id] $st $en:[${txt[st..en]}]")
cs_i += 3
}
gi += 2
}
// continuous saving
gi = 0
println('num: ${re.group_csave[0]}')
for gi < re.group_csave[0] {
id := re.group_csave[1 + gi * 3]
st := re.group_csave[1 + gi * 3 + 1]
en := re.group_csave[1 + gi * 3 + 2]
println('cg id: $id [$st, $en] => [${text[st..en]}]')
gi++
}
}
```
@ -295,15 +298,18 @@ fn example2() {
The output will be:
```
Query: .*:(\s*\d+[\s,]*)+
found in [0, 21] => [tst: 01,23,45 ,56, 78]
0 19,21 :[78]
num: 5
cg id: 0 [4, 8] => [ 01,]
cg id: 0 [8, 11] => [23,]
cg id: 0 [11, 15] => [45 ,]
cg id: 0 [15, 19] => [56, ]
cg id: 0 [19, 21] => [78]
String: http://www.ciao.mondo/hello/pippo12_/pera.html
Query : #0(?P<format>https?)|{8,14}#0(?P<format>ftps?)://#1(?P<token>[\w_]+.)+
Match (0, 46) => [http://www.ciao.mondo/hello/pippo12_/pera.html]
cg: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
cg[0] 0 4:[http]
cg[1] 7 11:[www.]
cg[1] 11 16:[ciao.]
cg[1] 16 22:[mondo/]
cg[1] 22 28:[hello/]
cg[1] 28 37:[pippo12_/]
cg[1] 37 42:[pera.]
cg[1] 42 46:[html]
```
### Named capturing groups
@ -323,60 +329,26 @@ example:
```v ignore
import regex
fn main(){
test_regex()
text := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
query := r'(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+'
mut re := new()
re.debug = 2
// must provide an array of the right size if want the continuous saving of the groups
re.group_csave = [-1].repeat(3 * 20 + 1)
re.compile_opt(query) or {
println(err)
return
}
q_str := re.get_query()
println('O.Query: $query')
println('Query : $q_str')
re.debug = 0
start, end := re.match_string(text)
if start < 0 {
err_str := re.get_parse_error_string(start)
println('ERROR : $err_str, $start')
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
mut re := regex.regex_opt(query) or { panic(err) }
//println(re.get_code()) // uncomment to see the print of the regex execution code
re.debug=2 // enable maximum log
println("String: ${txt}")
println("Query : ${re.get_query()}")
re.debug=0 // disable log
start, end := re.match_string(txt)
if start >= 0 {
println("Match ($start, $end) => [${txt[start..end]}]")
} else {
text1 := text[start..end]
println('found in [$start, $end] => [$text1]')
}
// groups
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println('${gi / 2} ${re.groups[gi]},${re.groups[gi + 1]} :[${text[re.groups[gi]..re.groups[gi +
1]]}]')
}
gi += 2
}
// continuous saving
gi = 0
println('num of group item saved: ${re.group_csave[0]}')
for gi < re.group_csave[0] {
id := re.group_csave[1 + gi * 3]
st := re.group_csave[1 + gi * 3 + 1]
en := re.group_csave[1 + gi * 3 + 2]
println('cg id: $id [$st, $en] => [${text[st..en]}]')
gi++
}
println('raw array: ${re.group_csave[0..gi * 3 + 2 - 1]}')
// named capturing groups
println('named capturing groups:')
for g_name in re.group_map.keys() {
s, e := re.get_group(g_name)
if s >= 0 && e > s {
println("'$g_name':[$s, $e] => '${text[s..e]}'")
} else {
println("Group [$g_name] doesn't exist.")
println("No Match")
}
for name in re.group_map.keys() {
println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}")
}
}
```
@ -384,28 +356,15 @@ fn main() {
Output:
```
O.Query: (?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+
Query : #0(?P<format>https?)|{8,14}(?:ftps?)://#1(?P<token>[\w_]+.)+
found in [0, 46] => [http://www.ciao.mondo/hello/pippo12_/pera.html]
0 0,4 :[http]
1 42,46 :[html]
num of group item saved: 8
cg id: 0 [0, 4] => [http]
cg id: 1 [7, 11] => [www.]
cg id: 1 [11, 16] => [ciao.]
cg id: 1 [16, 22] => [mondo/]
cg id: 1 [22, 28] => [hello/]
cg id: 1 [28, 37] => [pippo12_/]
cg id: 1 [37, 42] => [pera.]
cg id: 1 [42, 46] => [html]
raw array: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
named capturing groups:
'format':[0, 4] => 'http'
'token':[42, 46] => 'html'
String: http://www.ciao.mondo/hello/pippo12_/pera.html
Query : #0(?P<format>https?)|{8,14}#0(?P<format>ftps?)://#1(?P<token>[\w_]+.)+
Match (0, 46) => [http://www.ciao.mondo/hello/pippo12_/pera.html]
group:'format' => [http] bounds: (0, 4)
group:'token' => [html] bounds: (42, 46)
```
In order to simplify the use of the named groups, it is possible to use the names map in the `re`
struct using the function `re.get_group`.
struct using the function `re.get_group_by_name`.
Here is a more complex example of use:
@ -420,11 +379,11 @@ fn convert_html_rgb_n(in_col string) u32 {
println('start: $start, end: $end')
mut res := u32(0)
if start >= 0 {
red_s, red_e := re.get_group('red')
red_s, red_e := re.get_group_by_name('red')
r := ('0x' + in_col[red_s..red_e]).int() << col_mul
green_s, green_e := re.get_group('green')
green_s, green_e := re.get_group_by_name('green')
g := ('0x' + in_col[green_s..green_e]).int() << col_mul
blue_s, blue_e := re.get_group('blue')
blue_s, blue_e := re.get_group_by_name('blue')
b := ('0x' + in_col[blue_s..blue_e]).int() << col_mul
println('r: $r g: $g b: $b')
res = u32(r) << 16 | u32(g) << 8 | u32(b)
@ -433,7 +392,45 @@ fn convert_html_rgb_n(in_col string) u32 {
}
```
Other utility functions are `get_group_by_name` and `get_group_bounds_by_name`,
which directly return the string of a group, or its bounds, using its `name`:
```v ignore
txt := "my used string...."
for name in re.group_map.keys() {
println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}")
}
```
### Groups query functions
These functions are helpers to query the captured groups:
```v ignore
// get_group_bounds_by_name get a group boundaries by its name
pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int)
// get_group_by_name get a group string by its name
pub fn (re RE) get_group_by_name(group_name string) string
// get_group_bounds_by_id get a group boundaries by its id
pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int)
// get_group_by_id get a group string by its id
pub fn (re RE) get_group_by_id(in_txt string, group_id int) string
struct Re_group {
pub:
start int = -1
end int = -1
}
// get_group_list return a list of Re_group for the found groups
pub fn (re RE) get_group_list() []Re_group
```
## Flags
@ -501,6 +498,48 @@ pub fn (re mut RE) find_all(in_txt string) []int
pub fn (re mut RE) replace(in_txt string, repl string) string
```
## Find and Replace
For complex find and replace operations, the function `replace_by_fn` is available.
`replace_by_fn` uses a custom replace function, making customized replacements possible.
**The custom function is called for every non-overlapping match.**
The custom function must be of the type:
```v ignore
fn (re RE, in_txt string, start int, end int) string
```
The following example will clarify the use:
```v ignore
import regex
// my_repl is a customized replace function to be passed to `replace_by_fn`;
// it will be called on each non-overlapping match.
// `in_txt` is forwarded to `get_group_by_id` to extract each group's string;
// `start` and `end` are part of the required signature but are not used here.
// It returns the strings of groups 0, 1 and 2, each delimited by `*`.
fn my_repl(re regex.RE, in_txt string, start int, end int) string {
g0 := re.get_group_by_id(in_txt, 0)
g1 := re.get_group_by_id(in_txt, 1)
g2 := re.get_group_by_id(in_txt, 2)
return "*$g0*$g1*$g2*"
}
// main runs the find-and-replace example: it compiles the query, replaces
// every match in `txt` using `my_repl`, and prints the resulting string.
fn main(){
txt := "today [John] is gone to his house with (Jack) and [Marie]."
// the query declares three capture groups (ids 0, 1, 2), which my_repl reads back
query := r"(.)(\A\w+)(.)"
// a failure in regex_opt aborts the program through panic
mut re := regex.regex_opt(query) or { panic(err) }
// my_repl supplies the replacement text
result := re.replace_by_fn(txt, my_repl)
println(result)
}
```
Output:
```
today *[*John*]* is gone to his house with *(*Jack*)* and *[*Marie*]*.
```
## Debugging
This module has a few small utilities to help with writing regular expressions.
@ -527,11 +566,20 @@ The result will be something like this:
```
========================================
v RegEx compiler v 0.9c output:
PC: 0 ist: 7fffffff [a] query_ch { 1, 1}
PC: 1 ist: 7fffffff [b] query_ch { 1,MAX}
PC: 2 ist: 88000000 PROG_END { 0, 0}
v RegEx compiler v 1.0 alpha output:
PC: 0 ist: 92000000 ( GROUP_START #:0 { 1, 1}
PC: 1 ist: 98000000 . DOT_CHAR nx chk: 4 { 1, 1}
PC: 2 ist: 94000000 ) GROUP_END #:0 { 1, 1}
PC: 3 ist: 92000000 ( GROUP_START #:1 { 1, 1}
PC: 4 ist: 90000000 [\A] BSLS { 1, 1}
PC: 5 ist: 90000000 [\w] BSLS { 1,MAX}
PC: 6 ist: 94000000 ) GROUP_END #:1 { 1, 1}
PC: 7 ist: 92000000 ( GROUP_START #:2 { 1, 1}
PC: 8 ist: 98000000 . DOT_CHAR nx chk: -1 last! { 1, 1}
PC: 9 ist: 94000000 ) GROUP_END #:2 { 1, 1}
PC: 10 ist: 88000000 PROG_END { 0, 0}
========================================
```
`PC`:`int` is the program counter or step of execution, each single step is a token.
@ -625,54 +673,29 @@ re.log_func = custom_print
Here is a simple example that performs some basic string matching:
```v oksyntax
struct TestObj {
source string // source string to parse
query string // regex query string
s int // expected match start index
e int // expected match end index
}
const (
tests = [
TestObj{'this is a good.', r'this (\w+) a', 0, 9},
TestObj{'this,these,those. over', r'(th[eio]se?[,. ])+', 0, 17},
TestObj{'test1@post.pip.com, pera', r'[\w]+@([\w]+\.)+\w+', 0, 18},
TestObj{'cpapaz ole. pippo,', r'.*c.+ole.*pi', 0, 14},
TestObj{'adce aabe', r'(a(ab)+)|(a(dc)+)e', 0, 4},
]
)
fn example() {
for c, tst in tests {
mut re := regex.new()
re.compile_opt(tst.query) or {
println(err)
continue
}
// print the query parsed with the groups ids
re.debug = 1 // set debug on at minimum level
println('#${c:2d} query parsed: $re.get_query()')
re.debug = 0
// do the match
start, end := re.match_string(tst.source)
if start >= 0 && end > start {
println('#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]')
}
// print the groups
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println('group ${gi / 2:2d} :[${tst.source[re.groups[gi]..re.groups[gi + 1]]}]')
}
gi += 2
}
println('')
}
}
```v ignore
import regex
fn main(){
example()
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
mut re := regex.regex_opt(query) or { panic(err) }
start, end := re.match_string(txt)
if start >= 0 {
println("Match ($start, $end) => [${txt[start..end]}]")
for g_index := 0; g_index < re.group_count ; g_index++ {
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}")
}
for name in re.group_map.keys() {
println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}")
}
} else {
println("No Match")
}
}
```

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,7 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
if re_err != compile_ok {
mut err_msg := strings.new_builder(300)
err_msg.write("query: $pattern\n")
err_msg.write("\nquery: $pattern\n")
line := "-".repeat(err_pos)
err_msg.write("err : ${line}^\n")
err_str := re.get_parse_error_string(re_err)

View File

@ -21,6 +21,10 @@ match_test_suite = [
TestItem{"b",r"b|a",0,1},
TestItem{"c",r"b|a",-1,0},
// test base
TestItem{"[ciao]",r"(.)ciao(.)",0,6},
TestItem{"[ciao] da me",r"(.)ciao(.)",0,6},
// positive
TestItem{"this is a good.",r"this",0,4},
TestItem{"this is a good.",r"good",10,14},
@ -193,7 +197,8 @@ cgroups_test_suite = [
TestItemCGroup{
"http://www.ciao.mondo/hello/pippo12_/pera.html",
r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
[8, 0, 0, 4, 1, 7, 12, 1, 11, 17, 1, 16, 23, 1, 22, 29, 1, 28, 38, 1, 37, 43, 1, 42, 46],
[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
//[8, 0, 0, 4, 1, 7, 10, 1, 11, 15, 1, 16, 21, 1, 22, 27, 1, 28, 36, 1, 37, 41, 1, 42, 46],
{'format':int(0),'token':1}
},
TestItemCGroup{