regex: named capturing groups, small fixes
parent 9ac0c54eb0
commit 5a2534122e
@@ -55,7 +55,7 @@ A meta-char is specified by a backslash before a char like `\w` in this case the
 
 A meta-char can match different type of chars.
 
-* `\w` match an alphanumeric char `[a-zA-Z0-9]`
+* `\w` match an alphanumeric char `[a-zA-Z0-9_]`
 * `\W` match a non alphanumeric char
 * `\d` match a digit `[0-9]`
 * `\D` match a non digit
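The practical effect of adding `_` to the `\w` class can be checked with a few lines. This is an illustrative sketch only (not part of the commit), reusing the `regex.regex()` / `match_string()` calls that appear in the test suite further below:

```v
import regex

fn main() {
	// with `_` now part of \w, the trailing underscore is included in the match
	mut re, re_err, _ := regex.regex(r"\w+")
	if re_err == regex.COMPILE_OK {
		start, end := re.match_string("pippo12_")
		println("[$start, $end]") // expected [0, 8] instead of the previous [0, 7]
	}
}
```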
@@ -244,6 +244,114 @@ cg id: 0 [15, 19] => [56, ]
 cg id: 0 [19, 21] => [78]
 ```
 
+### Named capturing groups
+
+This regex module partially supports the question mark `?` PCRE syntax for groups.
+
+`(?:abcd)` **non capturing group**: the content of the group will not be saved
+
+`(?P<mygroup>abcdef)` **named group:** the group content is saved and labeled as `mygroup`
+
+The group labels are saved in the `group_map` of the `RE` struct. This is a map from `string` to `int`, where the value is the index in the `group_csave` list of indexes.
+
+Have a look at the example below for their use.
+
+Example:
+
+```v
+fn main() {
+	test_regex()
+
+	text := "http://www.ciao.mondo/hello/pippo12_/pera.html"
+	query := r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+"
+
+	mut re := new_regex()
+	re.debug = 2
+
+	// you must provide an array of the right size if you want continuous saving of the groups
+	re.group_csave = [-1].repeat(3*20+1)
+
+	re_err, err_pos := re.compile(query)
+	if re_err == COMPILE_OK {
+		q_str := re.get_query()
+		println("O.Query: $query")
+		println("Query : $q_str")
+
+		re.debug = 0
+		start, end := re.match_string(text)
+		if start < 0 {
+			err_str := re.get_parse_error_string(start)
+			println("ERROR : $err_str, $start")
+		} else {
+			text1 := text[start..end]
+			println("found in [$start, $end] => [$text1]")
+		}
+
+		// groups
+		mut gi := 0
+		for gi < re.groups.len {
+			if re.groups[gi] >= 0 {
+				println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
+			}
+			gi += 2
+		}
+		// continuous saving
+		gi = 0
+		println("num of group item saved: ${re.group_csave[0]}")
+		for gi < re.group_csave[0] {
+			id := re.group_csave[1+gi*3]
+			st := re.group_csave[1+gi*3+1]
+			en := re.group_csave[1+gi*3+2]
+			println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
+			gi++
+		}
+		println("raw array: ${re.group_csave[0..gi*3+2-1]}")
+
+		// named capturing groups
+		println("named capturing groups:")
+		for g_name in re.group_map.keys() {
+			s, e := re.get_group(g_name)
+			if s >= 0 && e > s {
+				println("'${g_name}':[$s, $e] => '${text[s..e]}'")
+			} else {
+				println("Group [${g_name}] doesn't exist.")
+			}
+		}
+
+	} else {
+		println("query: $query")
+		lc := "-".repeat(err_pos)
+		println("err : $lc^")
+		err_str := re.get_parse_error_string(re_err)
+		println("ERROR: $err_str")
+	}
+
+}
+```
+
+Output:
+
+```
+O.Query: (?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+
+Query : #0(?P<format>https?)|{8,14}(?:ftps?)://#1(?P<token>[\w_]+.)+
+found in [0, 46] => [http://www.ciao.mondo/hello/pippo12_/pera.html]
+0 0,4 :[http]
+1 42,46 :[html]
+num of group item saved: 8
+cg id: 0 [0, 4] => [http]
+cg id: 1 [7, 11] => [www.]
+cg id: 1 [11, 16] => [ciao.]
+cg id: 1 [16, 22] => [mondo/]
+cg id: 1 [22, 28] => [hello/]
+cg id: 1 [28, 37] => [pippo12_/]
+cg id: 1 [37, 42] => [pera.]
+cg id: 1 [42, 46] => [html]
+raw array: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
+named capturing groups:
+'format':[0, 4] => 'http'
+'token':[42, 46] => 'html'
+```
+
 ## Flags
 
 It is possible to set some flags in the regex parser that change the behavior of the parser itself.
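For reference on the `raw array` line of the output above: the first element is the number of saved records, followed by one `(group_id, start, end)` triple per record. For instance `[8, 0, 0, 4, 1, 7, 11, ...]` reads as 8 records, the first being group `0` over `[0, 4)` (`http`) and the second group `1` over `[7, 11)` (`www.`).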
@@ -1,6 +1,6 @@
 /**********************************************************************
 *
-* regex 0.9c
+* regex 0.9d
 *
 * Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
 * Use of this source code is governed by an MIT license
@@ -9,8 +9,9 @@
 * This file contains regex module
 *
 * Know limitation:
-* - max 8 stacked groups
 * - find is implemented in a trivial way
+* - not fully compliant with PCRE
+* - not compliant with POSIX ERE
 *
 *
 **********************************************************************/
@@ -18,7 +19,7 @@ module regex
 import strings
 
 pub const(
-	V_REGEX_VERSION = "0.9c" // regex module version
+	V_REGEX_VERSION = "0.9d" // regex module version
 
 	MAX_CODE_LEN = 256 // default small base code len for the regex programs
 	MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
@@ -41,6 +42,7 @@ pub const(
 	ERR_GROUPS_OVERFLOW = -7 // max number of groups reached
 	ERR_GROUPS_MAX_NESTED = -8 // max number of nested group reached
 	ERR_GROUP_NOT_BALANCED = -9 // group not balanced
+	ERR_GROUP_QM_NOTATION = -10 // group invalid notation
 )
 
 const(
@@ -133,6 +135,7 @@ fn is_alnum(in_char byte) bool {
 	if tmp >= 0x00 && tmp <= 25 { return true }
 	tmp = in_char - `0`
 	if tmp >= 0x00 && tmp <= 9 { return true }
+	if tmp == `_` { return true }
 	return false
 }
 
@@ -193,9 +196,10 @@ pub fn (re RE) get_parse_error_string(err int) string {
 		ERR_INTERNAL_ERROR { return "ERR_INTERNAL_ERROR" }
 		ERR_CC_ALLOC_OVERFLOW { return "ERR_CC_ALLOC_OVERFLOW" }
 		ERR_SYNTAX_ERROR { return "ERR_SYNTAX_ERROR" }
-		ERR_GROUPS_OVERFLOW { return "ERR_GROUPS_OVERFLOW"}
-		ERR_GROUPS_MAX_NESTED { return "ERR_GROUPS_MAX_NESTED"}
-		ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED"}
+		ERR_GROUPS_OVERFLOW { return "ERR_GROUPS_OVERFLOW" }
+		ERR_GROUPS_MAX_NESTED { return "ERR_GROUPS_MAX_NESTED" }
+		ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED" }
+		ERR_GROUP_QM_NOTATION { return "ERR_GROUP_QM_NOTATION" }
 		else { return "ERR_UNKNOWN" }
 	}
 }
@@ -272,6 +276,9 @@ pub const (
 
 	F_EFM = 0x00000100 // exit on first token matched, used by search
 	F_BIN = 0x00000200 // work only on bytes, ignore utf-8
+
+	// behaviour modifier flags
+	//F_OR = 0x00010000 // the OR work with concatenation like PCRE
 )
 
 struct StateDotObj{
@@ -305,6 +312,8 @@ pub mut:
 	group_csave []int = []int // groups continuous save array
 	group_csave_index int= -1 // groups continuous save index
 
+	group_map map[string]int // groups names map
+
 	// flags
 	flag int = 0 // flag for optional parameters
 
@@ -336,6 +345,16 @@ fn (re mut RE) reset(){
 	}
 }
 
+pub fn (re RE) get_group(group_name string) (int, int) {
+	if group_name in re.group_map {
+		tmp_index := re.group_map[group_name]-1
+		start := re.groups[tmp_index*2]
+		end := re.groups[tmp_index*2+1]
+		return start,end
+	}
+	return -1, -1
+}
+
 /******************************************************************************
 *
 * Backslashes chars
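A short usage sketch for the new `get_group()` (illustrative only, mirroring the calls used in the README example above). Note that `group_map` stores `group_id + 1` because a V map lookup returns `0` for a missing key, so `get_group` subtracts 1 before indexing the `groups` array:

```v
import regex

fn main() {
	txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
	mut re, re_err, _ := regex.regex(r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+")
	if re_err == regex.COMPILE_OK {
		start, end := re.match_string(txt)
		println("match: [$start, $end]")
		// resolve a named group to its (start, end) span in the source text
		s, e := re.get_group("format")
		if s >= 0 && e > s {
			println("format => ${txt[s..e]}") // expected: http
		}
	}
}
```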
@@ -631,7 +650,7 @@ enum Quant_parse_state {
 	finish
 }
 
-// parse_quantifier return (min, max, str_len) of a {min,max}? quantifier starting after the { char
+// parse_quantifier returns (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
 fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 	mut status := Quant_parse_state.start
 	mut i := in_i
@@ -748,6 +767,104 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 	return ERR_SYNTAX_ERROR, i, 0, false
 }
 
+//
+// Groups
+//
+enum Group_parse_state {
+	start,
+	q_mark,    // (?
+	q_mark1,   // (?:|P checking
+	p_status,  // (?P
+	p_start,   // (?P<
+	p_end,     // (?P<...>
+	p_in_name, // (?P<...
+	finish
+}
+
+// parse_groups parses a group for the ? (question mark) syntax; if found, it returns (error, capture_flag, name_of_the_group, next_index)
+fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
+	mut status := Group_parse_state.start
+	mut i := in_i
+	mut name := ''
+
+	for i < in_txt.len && status != .finish {
+
+		// get our char
+		char_tmp,char_len := re.get_char(in_txt,i)
+		ch := byte(char_tmp)
+
+		// start
+		if status == .start && ch == `(` {
+			status = .q_mark
+			i += char_len
+			continue
+		}
+
+		// check for question marks
+		if status == .q_mark && ch == `?` {
+			status = .q_mark1
+			i += char_len
+			continue
+		}
+
+		// non capturing group
+		if status == .q_mark1 && ch == `:` {
+			i += char_len
+			return 0, false, name, i
+		}
+
+		// enter in P section
+		if status == .q_mark1 && ch == `P` {
+			status = .p_status
+			i += char_len
+			continue
+		}
+
+		// not a valid q mark found
+		if status == .q_mark1 {
+			//println("NO VALID Q MARK")
+			return -2 , true, name, i
+		}
+
+		if status == .p_status && ch == `<` {
+			status = .p_start
+			i += char_len
+			continue
+		}
+
+		if status == .p_start && ch != `>` {
+			status = .p_in_name
+			name += "${ch:1c}" // TODO: manage utf8 chars
+			i += char_len
+			continue
+		}
+
+		// collect name
+		if status == .p_in_name && ch != `>` && is_alnum(ch) {
+			name += "${ch:1c}" // TODO: manage utf8 chars
+			i += char_len
+			continue
+		}
+
+		// end name
+		if status == .p_in_name && ch == `>` {
+			i += char_len
+			return 0, true, name, i
+		}
+
+		// error on name group
+		if status == .p_in_name {
+			return -2 , true, name, i
+		}
+
+		// normal group, nothing to do, exit
+		return 0 , true, name, i
+	}
+	/* UNREACHABLE */
+	//println("ERROR!! NOT MEANT TO BE HERE!!1")
+	return -2 , true, name, i
+}
+
 //
 // main compiler
 //
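For a quick sense of the contract: called on `"(?P<name>abc)"` at index 0, `parse_groups` walks `(`, `?`, `P`, `<`, collects `name`, stops at `>` and returns `(0, true, "name", 9)`; on `"(?:abc)"` it returns `(0, false, "", 3)`; on a plain `"(abc)"` it falls through to the final return and yields `(0, true, "", 1)`, leaving the normal group handling to the compiler.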
@@ -795,7 +912,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
 			if group_count > re.group_max {
 				return ERR_GROUPS_OVERFLOW,i+1
 			}
 
 			group_stack_index++
-
 			// check max nested groups allowed
@@ -803,17 +919,50 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
 				return ERR_GROUPS_MAX_NESTED,i+1
 			}
 
-			group_count++
+			tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i)
 
+			// manage question mark format error
+			if tmp_res < -1 {
+				return ERR_GROUP_QM_NOTATION,next_i
+			}
+
+			//println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
+			i = next_i
+
+			if cgroup_flag == true {
+				group_count++
+			}
+
+			// calculate the group id
+			// if it is a named group, recycle the group id
+			// NOTE: **** the group index is +1 because map return 0 when not found!! ****
+			mut group_id := group_count
+			if cgroup_name.len > 0 {
+				//println("GROUP NAME: ${cgroup_name}")
+				if cgroup_name in re.group_map{
+					group_id = re.group_map[cgroup_name]-1
+					group_count--
+				} else {
+					re.group_map[cgroup_name] = group_id+1
+				}
+			}
+
 			group_stack_txt_index[group_stack_index] = i
 			group_stack[group_stack_index] = pc
 
 			re.prog[pc].ist = u32(0) | IST_GROUP_START
-			re.prog[pc].group_id = group_count
 			re.prog[pc].rep_min = 1
 			re.prog[pc].rep_max = 1
+
+			// set the group id
+			if cgroup_flag == false {
+				//println("NO CAPTURE GROUP")
+				re.prog[pc].group_id = -1
+			} else {
+				re.prog[pc].group_id = group_id
+			}
+
 			pc = pc + 1
-			i = i + char_len
 			continue
 
 		}
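One consequence of the recycling above: if the same name appears in more than one group, as in `(?P<format>https?)|(?P<format>ftps?)`, the second occurrence reuses the first group's id (and `group_count` is decremented), so both alternatives store their capture into the same slot; the test suite below relies on this by expecting `{'format':0,'token':1}` for such patterns.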
@@ -1099,6 +1248,16 @@ pub fn (re RE) get_code() string {
 			res.write(". DOT_CHAR")
 		} else if ist == IST_GROUP_START {
 			res.write("( GROUP_START #:${tk.group_id}")
+			if tk.group_id == -1 {
+				res.write(" ?:")
+			} else {
+				for x in re.group_map.keys() {
+					if re.group_map[x] == (tk.group_id+1) {
+						res.write(" ?P<${x}>")
+						break
+					}
+				}
+			}
 		} else if ist == IST_GROUP_END {
 			res.write(") GROUP_END #:${tk.group_id}")
 		} else if ist == IST_SIMPLE_CHAR {
@@ -1146,8 +1305,20 @@ pub fn (re RE) get_query() string {
 			if re.debug == 0 {
 				res.write("(")
 			} else {
-				res.write("#${tk.group_id}(")
+				if tk.group_id == -1 {
+					res.write("(?:") // non capturing group
+				} else {
+					res.write("#${tk.group_id}(")
+				}
 			}
+
+			for x in re.group_map.keys() {
+				if re.group_map[x] == (tk.group_id+1) {
+					res.write("?P<${x}>")
+					break
+				}
+			}
+
 			i++
 			continue
 		}
@@ -1400,7 +1571,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					re.prog[tmp_pc].group_rep
 				)
 				*/
-				if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min{
+				if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
 					start_i := group_stack[group_index]
 					group_stack[group_index]=-1
 
@@ -1420,7 +1591,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 						// incrment counter
 						re.group_csave[0]++
 						// save the record
-						re.group_csave[re.group_csave_index++] = g_index // group id
+						re.group_csave[re.group_csave_index++] = g_index >> 1 // group id
 						re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
 						re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
 					}
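The `>> 1` works because `re.groups` stores one `(start, end)` pair per group, so `g_index` always points at the start slot of a pair and equals `group_id * 2`; halving it recovers the group id that goes into the continuous-save records, which are laid out as `[count, id, start, end, id, start, end, ...]`.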
@@ -1545,7 +1716,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				// restore txt index stack and save the group data
 
 				//C.printf("g.id: %d group_index: %d\n", re.prog[pc].group_id, group_index)
-				if group_index >= 0 {
+				if group_index >= 0 && re.prog[pc].group_id >= 0 {
 					start_i := group_stack[group_index]
 					//group_stack[group_index]=-1
 
@@ -1566,7 +1737,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					// incrment counter
 					re.group_csave[0]++
 					// save the record
-					re.group_csave[re.group_csave_index++] = g_index // group id
+					re.group_csave[re.group_csave_index++] = g_index >> 1 // group id
 					re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
 					re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
 				}
@@ -1709,7 +1880,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				if re.prog[pc].ch == ch
 				{
 					state.match_flag = true
-					l_ist = u32(IST_SIMPLE_CHAR)
+					l_ist = IST_SIMPLE_CHAR
 
 					if first_match < 0 {
 						first_match = i
|
@ -7,9 +7,9 @@ import regex
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
struct TestItem {
|
struct TestItem {
|
||||||
src string
|
src string
|
||||||
q string
|
q string
|
||||||
s int = 0
|
s int = 0
|
||||||
e int = 0
|
e int = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
const(
|
const(
|
||||||
|
@@ -72,6 +72,7 @@ match_test_suite = [
 	TestItem{" pippo pera",r"\s(.*)pe(.*)",0,11},
 	TestItem{" abb",r"\s(.*)",0,4},
 
+
 	// negative
 	TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
 	TestItem{"this is a good.",r"thes",-1,0},
@@ -81,7 +82,6 @@ match_test_suite = [
 	TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0},
 	TestItem{"cpapaz ole. pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0},
 
-
 	// check unicode
 	TestItem{"this is a Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r".*a [Ⅰ-Ⅵ ]+",0,34},
 	TestItem{"123Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r"[Ⅰ-Ⅴ\s]+",3,23},
@@ -90,15 +90,14 @@ match_test_suite = [
 
 struct TestItemFa {
 	src string
 	q string
 	r []int
 }
 
-
 const (
 match_test_suite_fa = [
-
 	// find_all tests
+
 	TestItemFa{
 		"oggi pippo è andato a casa di pluto ed ha trovato pippo",
 		r"p[iplut]+o",
@@ -115,16 +114,13 @@
 
 struct TestItemRe {
 	src string
 	q string
 	rep string
 	r string
 }
-
 const (
 match_test_suite_re = [
-
 	// replace tests
-
 	TestItemRe{
 		"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
 		r"(pi?(ba)+o)",
@@ -140,7 +136,88 @@ match_test_suite_re = [
 ]
 )
 
+struct TestItemCGroup {
+	src string
+	q string
+	s int = 0
+	e int = 0
+	cg []int
+	cgn map[string]int
+}
+const (
+cgroups_test_suite = [
+	TestItemCGroup{
+		"http://www.ciao.mondo/hello/pippo12_/pera.html",
+		r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+",0,46,
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		{'format':0,'token':1}
+	},
+	TestItemCGroup{
+		"http://www.ciao.mondo/hello/pippo12_/pera.html",
+		r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		{'format':0,'token':1}
+	},
+	TestItemCGroup{
+		"http://www.ciao.mondo/hello/pippo12_/pera.html",
+		r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+.)+",0,46,
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		{'format':0}
+	},
+]
+)
+
 fn test_regex(){
+	// check capturing groups
+	for c,to in cgroups_test_suite {
+		// debug print
+		//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)")
+
+		mut re, re_err, err_pos := regex.regex(to.q)
+		re.group_csave = [-1].repeat(3*20+1)
+
+		if re_err == regex.COMPILE_OK {
+			start, end := re.match_string(to.src)
+
+			mut tmp_str := ""
+			if start >= 0 && end > start{
+				tmp_str = to.src[start..end]
+			}
+
+			if start != to.s || end != to.e {
+				println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
+				println("ERROR!")
+				//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
+				assert false
+				break
+			}
+
+			// check cgroups
+			if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
+				println("Capturing group len error!")
+				assert false
+			}
+
+			// check captured groups
+			mut ln := re.group_csave[0]*3
+			for ln > 0 {
+				if re.group_csave[ln] != to.cg[ln] {
+					assert false
+				}
+				ln--
+			}
+
+			// check named captured groups
+			for k in to.cgn.keys() {
+				if to.cgn[k] != (re.group_map[k]-1) { // -1 because the map returns 0 when the key is not found: group ids start from 0 but are stored as id+1
+					println("Named capturing group error! [$k]")
+					assert false
+				}
+			}
+
+		}
+	}
+
 	// check find_all
 	for c,to in match_test_suite_fa{
 		// debug print