regex: lots of fixes (#7380)

2020-12-18 05:57:31 +01:00 · 2020-12-18 05:57:31 +01:00 · a6baffcb8c
parent 05e15bdd59
commit a6baffcb8c
5 changed files with 635 additions and 485 deletions
--- a/examples/regex_example.v
+++ b/examples/regex_example.v
@ -54,13 +54,13 @@ fn convert_html_rgb_n(in_col string) u32 {
 	println("start: $start, end: $end")
 	mut res := u32(0)
 	if start >= 0 {
-		red_s, red_e := re.get_group("red")
+		red_s, red_e := re.get_group_bounds_by_name("red")
 		r := ("0x" + in_col[red_s..red_e]).int() << col_mul
-		green_s, green_e := re.get_group("green")
+		green_s, green_e := re.get_group_bounds_by_name("green")
 		g := ("0x" + in_col[green_s..green_e]).int() << col_mul
-		blue_s, blue_e := re.get_group("blue")
+		blue_s, blue_e := re.get_group_bounds_by_name("blue")
 		b := ("0x" + in_col[blue_s..blue_e]).int() << col_mul
 		println("r: $r g: $g b: $b")
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@ -1,4 +1,4 @@
-# V RegEx (Regular expression) 0.9h
+# V RegEx (Regular expression) 1.0 alpha
 [TOC]
@ -226,7 +226,18 @@ fn convert_html_rgb(in_col string) u32 {
 }
 ```
 Other utility functions are `get_group_by_id` and `get_group_bounds_by_id`,
 which directly return the string or the bounds of a group using its `id`:
 ```v ignore
 txt := "my used string...."
 for g_index := 0; g_index < re.group_count ; g_index++ {
 	println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
    	bounds: ${re.get_group_bounds_by_id(g_index)}") 
 }
 ```
 More helper functions are listed in the **Groups query functions** section.
 ### Groups Continuous saving
@ -251,59 +262,54 @@ The regex save until finish or found that the array have no space.
 If the space runs out, no error is raised; further records will simply not be saved.
 ```v ignore
-fn example2() {
+import regex
-	test_regex()
+fn main(){
-	text := 'tst: 01,23,45 ,56, 78'
+    txt   := "http://www.ciao.mondo/hello/pippo12_/pera.html"
-	query := r'.*:(\s*\d+[\s,]*)+'
+    query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
-	mut re := new() or { panic(err) }
+
-	// re.debug = 2
+    mut re := regex.regex_opt(query) or { panic(err) }
-	re.group_csave_flag = true  // enable continuous capture
+    //println(re.get_code())   // uncomment to see the print of the regex execution code
-	re.compile_opt(query) or {
+    re.debug=2  // enable maximum log
-		println(err)
+    println("String: ${txt}")
-		return
+    println("Query : ${re.get_query()}")
-	}
+    re.debug=0  // disable log
-	q_str := re.get_query()
+    re.group_csave_flag = true
-	println('Query: $q_str')
+    start, end := re.match_string(txt)
-	start, end := re.match_string(text)
+    if start >= 0 {
-	if start < 0 {
+        println("Match ($start, $end) => [${txt[start..end]}]")
-		println('ERROR : ${re.get_parse_error_string(start)}, $start')
+    } else {
-	} else {
+        println("No Match")
-		println('found in [$start, $end] => [${text[start..end]}]')
+    }
-	}
+
-	// groups capture
+    if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{
-	mut gi := 0
+        println("cg: $re.group_csave")
-	for gi < re.groups.len {
+        mut cs_i := 1
-		if re.groups[gi] >= 0 {
+        for cs_i < re.group_csave[0]*3 {
-			println('${gi / 2} ${re.groups[gi]},${re.groups[gi + 1]} :[${text[re.groups[gi]..re.groups[gi +
+            g_id := re.group_csave[cs_i]
-				1]]}]')
+            st   := re.group_csave[cs_i+1]
-		}
+            en   := re.group_csave[cs_i+2]
-		gi += 2
+            println("cg[$g_id] $st $en:[${txt[st..en]}]")
-	}
+            cs_i += 3
-	// continuous saving
+        }
-	gi = 0
+    }
 	println('num: ${re.group_csave[0]}')
 	for gi < re.group_csave[0] {
 		id := re.group_csave[1 + gi * 3]
 		st := re.group_csave[1 + gi * 3 + 1]
 		en := re.group_csave[1 + gi * 3 + 2]
 		println('cg id: $id [$st, $en] => [${text[st..en]}]')
 		gi++
 	}
 }
 ```
 The output will be:
 ```
-Query: .*:(\s*\d+[\s,]*)+
+String: http://www.ciao.mondo/hello/pippo12_/pera.html
-found in [0, 21] => [tst: 01,23,45 ,56, 78]
+Query : #0(?P<format>https?)|{8,14}#0(?P<format>ftps?)://#1(?P<token>[\w_]+.)+
-0 19,21 :[78]
+Match (0, 46) => [http://www.ciao.mondo/hello/pippo12_/pera.html]
-num: 5
+cg: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
-cg id: 0 [4, 8] => [ 01,]
+cg[0] 0 4:[http]
-cg id: 0 [8, 11] => [23,]
+cg[1] 7 11:[www.]
-cg id: 0 [11, 15] => [45 ,]
+cg[1] 11 16:[ciao.]
-cg id: 0 [15, 19] => [56, ]
+cg[1] 16 22:[mondo/]
-cg id: 0 [19, 21] => [78]
+cg[1] 22 28:[hello/]
 cg[1] 28 37:[pippo12_/]
 cg[1] 37 42:[pera.]
 cg[1] 42 46:[html]
 ```
 ### Named capturing groups
@ -323,89 +329,42 @@ example:
 ```v ignore
 import regex
 fn main(){
    txt   := "http://www.ciao.mondo/hello/pippo12_/pera.html"
    query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
-fn main() {
+    mut re := regex.regex_opt(query) or { panic(err) }
-	test_regex()
+    //println(re.get_code())   // uncomment to see the print of the regex execution code
-	text := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
+    re.debug=2  // enable maximum log
-	query := r'(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+'
+    println("String: ${txt}")
-	mut re := new()
+    println("Query : ${re.get_query()}")
-	re.debug = 2
+    re.debug=0  // disable log
-	// must provide an array of the right size if want the continuous saving of the groups
+    start, end := re.match_string(txt)
-	re.group_csave = [-1].repeat(3 * 20 + 1)
+    if start >= 0 {
-	re.compile_opt(query) or {
+        println("Match ($start, $end) => [${txt[start..end]}]")
-		println(err)
+    } else {
-		return
+        println("No Match")
-	}
+    }
-	q_str := re.get_query()
+
-	println('O.Query: $query')
+    for name in re.group_map.keys() {
-	println('Query  : $q_str')
+        println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
-	re.debug = 0
+        bounds: ${re.get_group_bounds_by_name(name)}")
-	start, end := re.match_string(text)
+    }
 	if start < 0 {
 		err_str := re.get_parse_error_string(start)
 		println('ERROR : $err_str, $start')
 	} else {
 		text1 := text[start..end]
 		println('found in [$start, $end] => [$text1]')
 	}
 	// groups
 	mut gi := 0
 	for gi < re.groups.len {
 		if re.groups[gi] >= 0 {
 			println('${gi / 2} ${re.groups[gi]},${re.groups[gi + 1]} :[${text[re.groups[gi]..re.groups[gi +
 				1]]}]')
 		}
 		gi += 2
 	}
 	// continuous saving
 	gi = 0
 	println('num of group item saved: ${re.group_csave[0]}')
 	for gi < re.group_csave[0] {
 		id := re.group_csave[1 + gi * 3]
 		st := re.group_csave[1 + gi * 3 + 1]
 		en := re.group_csave[1 + gi * 3 + 2]
 		println('cg id: $id [$st, $en] => [${text[st..en]}]')
 		gi++
 	}
 	println('raw array: ${re.group_csave[0..gi * 3 + 2 - 1]}')
 	// named capturing groups
 	println('named capturing groups:')
 	for g_name in re.group_map.keys() {
 		s, e := re.get_group(g_name)
 		if s >= 0 && e > s {
 			println("'$g_name':[$s, $e] => '${text[s..e]}'")
 		} else {
 			println("Group [$g_name] doesn't exist.")
 		}
 	}
 }
 ```
 Output:
 ```
-O.Query: (?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+
+String: http://www.ciao.mondo/hello/pippo12_/pera.html
-Query  : #0(?P<format>https?)|{8,14}(?:ftps?)://#1(?P<token>[\w_]+.)+
+Query : #0(?P<format>https?)|{8,14}#0(?P<format>ftps?)://#1(?P<token>[\w_]+.)+
-found in [0, 46] => [http://www.ciao.mondo/hello/pippo12_/pera.html]
+Match (0, 46) => [http://www.ciao.mondo/hello/pippo12_/pera.html]
-0 0,4 :[http]
+group:'format' 	=> [http] bounds: (0, 4)
-1 42,46 :[html]
+group:'token' 	=> [html] bounds: (42, 46)
 num of group item saved: 8
 cg id: 0 [0, 4] => [http]
 cg id: 1 [7, 11] => [www.]
 cg id: 1 [11, 16] => [ciao.]
 cg id: 1 [16, 22] => [mondo/]
 cg id: 1 [22, 28] => [hello/]
 cg id: 1 [28, 37] => [pippo12_/]
 cg id: 1 [37, 42] => [pera.]
 cg id: 1 [42, 46] => [html]
 raw array: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
 named capturing groups:
 'format':[0, 4] => 'http'
 'token':[42, 46] => 'html'
 ```
 In order to simplify the use of the named groups, it is possible to use the names map in the `re`
-struct using the function `re.get_group`.
+struct using the function `re.get_group_by_name`.
 Here a more complex example of use:
@ -420,11 +379,11 @@ fn convert_html_rgb_n(in_col string) u32 {
 	println('start: $start, end: $end')
 	mut res := u32(0)
 	if start >= 0 {
-		red_s, red_e := re.get_group('red')
+		red_s, red_e := re.get_group_by_name('red')
 		r := ('0x' + in_col[red_s..red_e]).int() << col_mul
-		green_s, green_e := re.get_group('green')
+		green_s, green_e := re.get_group_by_name('green')
 		g := ('0x' + in_col[green_s..green_e]).int() << col_mul
-		blue_s, blue_e := re.get_group('blue')
+		blue_s, blue_e := re.get_group_by_name('blue')
 		b := ('0x' + in_col[blue_s..blue_e]).int() << col_mul
 		println('r: $r g: $g b: $b')
 		res = u32(r) << 16 | u32(g) << 8 | u32(b)
@ -433,7 +392,45 @@ fn convert_html_rgb_n(in_col string) u32 {
 }
 ```
 Other utility functions are `get_group_by_name` and `get_group_bounds_by_name`,
 which directly return the string or the bounds of a group using its `name`:
 ```v ignore
 txt := "my used string...."
 for name in re.group_map.keys() {
 	println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
    bounds: ${re.get_group_bounds_by_name(name)}")
 }
 ```
 ### Groups query functions
 These functions are helpers to query the captured groups
 ```v ignore
 // get_group_bounds_by_name get a group boundaries by its name
 pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) 
 // get_group_by_name get a group string by its name
 pub fn (re RE) get_group_by_name(group_name string) string
 // get_group_bounds_by_id get a group boundaries by its id
 pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int)
 // get_group_by_id get a group string by its id
 pub fn (re RE) get_group_by_id(in_txt string, group_id int) string
 struct Re_group {
 pub:
 	start int = -1
 	end   int = -1
 }
 // get_group_list return a list of Re_group for the found groups
 pub fn (re RE) get_group_list() []Re_group
 ```
 ## Flags
@ -501,6 +498,48 @@ pub fn (re mut RE) find_all(in_txt string) []int
 pub fn (re mut RE) replace(in_txt string, repl string) string
 ```
 ## Find and Replace
 For complex find and replace operations, the function `replace_by_fn` is available.
 `replace_by_fn` uses a custom replace function, which makes customizations possible.
 **The custom function is called for every non-overlapping match.**
 The custom function must be of the type:
 ```v ignore
 fn (re RE, in_txt string, start int, end int) string
 ```
 The following example will clarify the use:
 ```v ignore
 import regex
 // my_repl is a customized replace function.
 // It is called on each non-overlapping match, and the string it returns
 // is the replacement text for that match.
 // Here it returns the text of the three captured groups, each delimited by `*`.
 // `start` and `end` are part of the required signature but are not used in this example.
 fn my_repl(re regex.RE, in_txt string, start int, end int) string {
    // fetch the text captured by groups 0, 1 and 2 of the query
    g0 := re.get_group_by_id(in_txt, 0)
    g1 := re.get_group_by_id(in_txt, 1)
    g2 := re.get_group_by_id(in_txt, 2)
    return "*$g0*$g1*$g2*"    
 }
 // main compiles the query, rewrites every match found in `txt`
 // through `my_repl` and prints the resulting string.
 fn main(){
    txt   := "today [John] is gone to his house with (Jack) and [Marie]."
    // the query has three capturing groups: one char, `\A\w+`, one char
    query := r"(.)(\A\w+)(.)"
    // stop with a panic if the query can not be compiled
    mut re := regex.regex_opt(query) or { panic(err) }
    // my_repl builds the replacement text for each match
    result := re.replace_by_fn(txt, my_repl)
    println(result)
 }
 ```
 Output:
 ```
 today *[*John*]* is gone to his house with *(*Jack*)* and *[*Marie*]*.
 ```
 ## Debugging
 This module has a few small utilities to help with the writing of regular expressions.
@ -527,11 +566,20 @@ The result will be something like this:
 ```
 ========================================
-v RegEx compiler v 0.9c output:
+v RegEx compiler v 1.0 alpha output:
-PC:  0 ist: 7fffffff [a]      query_ch {  1,  1}
+PC:  0 ist: 92000000 (        GROUP_START #:0 {  1,  1}
-PC:  1 ist: 7fffffff [b]      query_ch {  1,MAX}
+PC:  1 ist: 98000000 .        DOT_CHAR nx chk: 4 {  1,  1}
-PC:  2 ist: 88000000 PROG_END {  0,  0}
+PC:  2 ist: 94000000 )        GROUP_END   #:0 {  1,  1}
 PC:  3 ist: 92000000 (        GROUP_START #:1 {  1,  1}
 PC:  4 ist: 90000000 [\A]     BSLS {  1,  1}
 PC:  5 ist: 90000000 [\w]     BSLS {  1,MAX}
 PC:  6 ist: 94000000 )        GROUP_END   #:1 {  1,  1}
 PC:  7 ist: 92000000 (        GROUP_START #:2 {  1,  1}
 PC:  8 ist: 98000000 .        DOT_CHAR nx chk: -1 last! {  1,  1}
 PC:  9 ist: 94000000 )        GROUP_END   #:2 {  1,  1}
 PC: 10 ist: 88000000 PROG_END {  0,  0}
 ========================================
 ```
 `PC`:`int` is the program counter or step of execution, each single step is a token.
@ -625,54 +673,29 @@ re.log_func = custom_print
 Here is a simple example that performs some basic string matching
-```v oksyntax
+```v ignore
-struct TestObj {
+import regex
 	source string // source string to parse
 	query  string // regex query string
 	s      int // expected match start index
 	e      int // expected match end index
 }
-const (
+fn main(){
-	tests = [
+    txt   := "http://www.ciao.mondo/hello/pippo12_/pera.html"
-		TestObj{'this is a good.', r'this (\w+) a', 0, 9},
+    query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
 		TestObj{'this,these,those. over', r'(th[eio]se?[,. ])+', 0, 17},
 		TestObj{'test1@post.pip.com, pera', r'[\w]+@([\w]+\.)+\w+', 0, 18},
 		TestObj{'cpapaz ole. pippo,', r'.*c.+ole.*pi', 0, 14},
 		TestObj{'adce aabe', r'(a(ab)+)|(a(dc)+)e', 0, 4},
 	]
 )
-fn example() {
+    mut re := regex.regex_opt(query) or { panic(err) }
-	for c, tst in tests {
+   
-		mut re := regex.new()
+    start, end := re.match_string(txt)
-		re.compile_opt(tst.query) or {
+    if start >= 0 {
-			println(err)
+        println("Match ($start, $end) => [${txt[start..end]}]")
-			continue
+        for g_index := 0; g_index < re.group_count ; g_index++ {
-		}
+            println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
-		// print the query parsed with the groups ids
+            bounds: ${re.get_group_bounds_by_id(g_index)}")  
-		re.debug = 1 // set debug on at minimum level
+        }
-		println('#${c:2d} query parsed: $re.get_query()')
+        for name in re.group_map.keys() {
-		re.debug = 0
+            println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
-		// do the match
+            bounds: ${re.get_group_bounds_by_name(name)}")
-		start, end := re.match_string(tst.source)
+        }
-		if start >= 0 && end > start {
+    } else {
-			println('#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]')
+        println("No Match")
-		}
+    }
 		// print the groups
 		mut gi := 0
 		for gi < re.groups.len {
 			if re.groups[gi] >= 0 {
 				println('group ${gi / 2:2d} :[${tst.source[re.groups[gi]..re.groups[gi + 1]]}]')
 			}
 			gi += 2
 		}
 		println('')
 	}
 }
 fn main() {
 	example()
 }
 ```
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
--- a/vlib/regex/regex_opt.v
+++ b/vlib/regex/regex_opt.v
@ -7,7 +7,7 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
 	if re_err != compile_ok {
 		mut err_msg := strings.new_builder(300)
-		err_msg.write("query: $pattern\n")
+		err_msg.write("\nquery: $pattern\n")
 		line := "-".repeat(err_pos)
 		err_msg.write("err  : ${line}^\n")
 		err_str := re.get_parse_error_string(re_err)
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@ -21,6 +21,10 @@ match_test_suite = [
 	TestItem{"b",r"b|a",0,1},
 	TestItem{"c",r"b|a",-1,0},
 	// test base
 	TestItem{"[ciao]",r"(.)ciao(.)",0,6},
 	TestItem{"[ciao] da me",r"(.)ciao(.)",0,6},
 	// positive
 	TestItem{"this is a good.",r"this",0,4},
 	TestItem{"this is a good.",r"good",10,14},
@ -193,7 +197,8 @@ cgroups_test_suite = [
 	TestItemCGroup{
 		"http://www.ciao.mondo/hello/pippo12_/pera.html",
 		r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
-		[8, 0, 0, 4, 1, 7, 12, 1, 11, 17, 1, 16, 23, 1, 22, 29, 1, 28, 38, 1, 37, 43, 1, 42, 46],
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
 		//[8, 0, 0, 4, 1, 7, 10, 1, 11, 15, 1, 16, 21, 1, 22, 27, 1, 28, 36, 1, 37, 41, 1, 42, 46],		
 		{'format':int(0),'token':1}
 	},
 	TestItemCGroup{