regex fixes

2020-01-25 19:12:23 +01:00 · 2020-01-25 19:12:23 +01:00 · 15a63b5bcb
parent 222fc4b04f
commit 15a63b5bcb
2 changed files with 159 additions and 52 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@ -159,6 +159,91 @@ for gi < re.groups.len {
 **note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`*
 ### Groups Continuous saving
 In particular situations it is useful have a continuous save of the groups, this is possible initializing the saving array field in `RE` struct: `group_csave`.
 This feature allow to collect data in a  continuous way.
 In the example we pass a text followed by a integer list that we want collect. 
 To achieve this task we can use the continuous saving of the group that save each captured group in a array that we set with: `re.group_csave = [-1].repeat(3*20+1)`.
 The array will be filled with the following logic:
 `re.group_csave[0]` number of total saved records
 `re.group_csave[1+n*3]` id of the saved group
 `re.group_csave[1+n*3]` start index in the source string of the saved group
 `re.group_csave[1+n*3]` end index in the source string of the saved group
 The regex save until finish or found that the array have no space. If the space ends no error is raised, further records will not be saved.
 ```v
 fn example2() {
 	test_regex()
 	text := "tst: 01,23,45 ,56, 78"
 	query:= r".*:(\s*\d+[\s,]*)+"
 	mut re := regex.new_regex()
 	//re.debug = 2
 	re.group_csave = [-1].repeat(3*20+1)  // we expect max 20 records
 	re_err, err_pos := re.compile(query)
 	if re_err == regex.COMPILE_OK {
 		q_str := re.get_query()
 		println("Query: $q_str")
 		start, end := re.match_string(text)
 		if start < 0 {
 			println("ERROR : ${re.get_parse_error_string(start)}, $start")
 		} else {
 			println("found in [$start, $end] => [${text[start..end]}]")
 		}
 		// groups capture
 		mut gi := 0
 		for gi < re.groups.len {
 			if re.groups[gi] >= 0 {
 				println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
 			}
 			gi += 2
 		}
 		// continuous saving
 		gi = 0
 		println("num: ${re.group_csave[0]}")
 		for gi < re.group_csave[0] {
 			id := re.group_csave[1+gi*3]
 			st := re.group_csave[1+gi*3+1]
 			en := re.group_csave[1+gi*3+2]
 			println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
 			gi++
 		}
 	} else {
 		println("query: $query")
 		lc := "-".repeat(err_pos)
 		println("err  : $lc^")
 		err_str := re.get_parse_error_string(re_err)
 		println("ERROR: $err_str")	
 	}
 }
 ```
 The output will be:
 ```
 Query: .*:(\s*\d+[\s,]*)+
 found in [0, 21] => [tst: 01,23,45 ,56, 78]
 0 19,21 :[78]
 num: 5
 cg id: 0 [4, 8] => [ 01,]
 cg id: 0 [8, 11] => [23,]
 cg id: 0 [11, 15] => [45 ,]
 cg id: 0 [15, 19] => [56, ]
 cg id: 0 [19, 21] => [78] 
 ```
 ## Flags
 It is possible to set some flags in the regex parser that change the behavior of the parser itself.
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -266,23 +266,20 @@ fn (tok mut Token) reset() {
 *
 ******************************************************************************/
 pub const (
-	//F_FND = 0x00000001  // check until the end of the input string, it act like a "find first match", not efficient!!
+	F_NL  = 0x00000001  // end the match when find a new line symbol
-	//F_PM  = 0x00000004  // partial match: if the source text finish and the match is positive until then return true
+	F_MS  = 0x00000002  // match true only if the match is at the start of the string
 	F_ME  = 0x00000004  // match true only if the match is at the end of the string 
-	F_NL  = 0x00000002  // end the match when find a new line symbol
+	F_EFM = 0x00000100  // exit on first token matched, used by search
-	F_MS  = 0x00000008  // match true only if the match is at the start of the string
+	F_BIN = 0x00000200  // work only on bytes, ignore utf-8
 	F_ME  = 0x00000010  // match true only if the match is at the end of the string 
 	F_EFM = 0x01000000  // exit on first token matched, used by search
 	F_BIN = 0x02000000  // work only on bytes, ignore utf-8
 )
 struct StateDotObj{
 mut:
 	i  int                = -1  // char index in the input buffer
-	pc int                = -1   // program counter saved
+	pc int                = -1  // program counter saved
-	mi int                = -1   // match_index saved
+	mi int                = -1  // match_index saved
-	group_stack_index int = -1  // group index stack pointer saved
+	group_stack_index int = -1  // continuous save on capturing groups
 }
 pub
@ -305,6 +302,9 @@ pub mut:
 	group_max_nested int = 3   // max nested group
 	group_max int        = 8   // max allowed number of different groups
 	group_csave []int    = []int  // groups continuous save array
 	group_csave_index int= -1     // groups continuous save index
 	// flags
 	flag int             = 0   // flag for optional parameters
@ -328,6 +328,12 @@ fn (re mut RE) reset(){
 	re.groups = [-1].repeat(re.group_count*2)
 	re.state_stack_index = -1
 	// reset group_csave
 	if re.group_csave.len > 0 {
 		re.group_csave_index = 1
 		re.group_csave[0] = 0     // reset the capture count
 	}
 }
 /******************************************************************************
@ -734,8 +740,6 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 			}
 		}
 		// not  a {} quantifier, exit
 		return ERR_SYNTAX_ERROR, i, 0, false
 	}
@ -997,7 +1001,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
 	// Post processing
 	//******************************************
 	// count IST_DOT_CHAR to set the size of the state stack
 	mut pc1 := 0
 	mut tmp_count := 0
@ -1054,7 +1057,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
 		pc1++
 	}
 	//******************************************
 	// DEBUG PRINT REGEX GENERATED CODE
 	//******************************************
@ -1075,14 +1077,15 @@ pub fn (re RE) get_code() string {
 		mut stop_flag := false
 		for pc1 <= re.prog.len {
 			tk := re.prog[pc1]
 			res.write("PC:${pc1:3d}")
 		    res.write(" ist: ")
-		    res.write("${re.prog[pc1].ist:8x}".replace(" ","0") )
+		    res.write("${tk.ist:8x}".replace(" ","0") )
 		    res.write(" ")
-			ist :=re.prog[pc1].ist
+			ist :=tk.ist
 			if ist == IST_BSLS_CHAR {
-				res.write("[\\${re.prog[pc1].ch:1c}]     BSLS")
+				res.write("[\\${tk.ch:1c}]     BSLS")
 			} else if ist == IST_PROG_END {
 				res.write("PROG_END")
 				stop_flag = true
@ -1095,22 +1098,22 @@ pub fn (re RE) get_code() string {
 			} else if ist == IST_DOT_CHAR {
 				res.write(".        DOT_CHAR")
 			} else if ist == IST_GROUP_START {
-				res.write("(        GROUP_START #:${re.prog[pc1].group_id}")
+				res.write("(        GROUP_START #:${tk.group_id}")
 			} else if ist == IST_GROUP_END {
-				res.write(")        GROUP_END   #:${re.prog[pc1].group_id}")
+				res.write(")        GROUP_END   #:${tk.group_id}")
 			} else if ist == IST_SIMPLE_CHAR {
-				res.write("[${re.prog[pc1].ch:1c}]      query_ch")
+				res.write("[${tk.ch:1c}]      query_ch")
 			}
-			if re.prog[pc1].rep_max == MAX_QUANTIFIER {
+			if tk.rep_max == MAX_QUANTIFIER {
-				res.write(" {${re.prog[pc1].rep_min:3d},MAX}")
+				res.write(" {${tk.rep_min:3d},MAX}")
 			}else{
 				if ist == IST_OR_BRANCH {
-					res.write(" if false go: ${re.prog[pc1].rep_min:3d} if true go: ${re.prog[pc1].rep_max:3d}")
+					res.write(" if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}")
 				} else {
-					res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}")
+					res.write(" {${tk.rep_min:3d},${tk.rep_max:3d}}")
 				}
-				if re.prog[pc1].greedy == true {
+				if tk.greedy == true {
 					res.write("?")
 				}
 			}
@ -1123,11 +1126,9 @@ pub fn (re RE) get_code() string {
 		res.write("========================================\n")
 		return res.str()
 }
 // get_query return a string with a reconstruction of the query starting from the regex program code
 pub fn (re RE) get_query() string {
 	mut res := strings.new_builder(re.query.len*2)
@ -1137,14 +1138,15 @@ pub fn (re RE) get_query() string {
 	mut i := 0
 	for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{
-		ch := re.prog[i].ist
+		tk := &re.prog[i]
 		ch := tk.ist
 		// GROUP start
 		if ch == IST_GROUP_START {
 			if re.debug == 0 {
 				res.write("(")
 			} else {
-				res.write("#${re.prog[i].group_id}(")
+				res.write("#${tk.group_id}(")
 			}
 			i++
 			continue
@ -1159,7 +1161,7 @@ pub fn (re RE) get_query() string {
 		if ch == IST_OR_BRANCH {
 			res.write("|")
 			if re.debug > 0 {
-				res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
+				res.write("{${tk.rep_min},${tk.rep_max}}")
 			}
 			i++
 			continue
@ -1177,7 +1179,7 @@ pub fn (re RE) get_query() string {
 		// bsls char
 		if ch == IST_BSLS_CHAR {
-			res.write("\\${re.prog[i].ch:1c}")
+			res.write("\\${tk.ch:1c}")
 		}
 		// IST_DOT_CHAR
@ -1190,29 +1192,28 @@ pub fn (re RE) get_query() string {
 			if byte(ch) in BSLS_ESCAPE_LIST {
 				res.write("\\")
 			}
-			res.write("${re.prog[i].ch:c}")
+			res.write("${tk.ch:c}")
 		}
 		// quantifier
-		if !(re.prog[i].rep_min == 1 && re.prog[i].rep_max == 1) {
+		if !(tk.rep_min == 1 && tk.rep_max == 1) {
-			if re.prog[i].rep_min == 0 && re.prog[i].rep_max == 1 {
+			if tk.rep_min == 0 && tk.rep_max == 1 {
 				res.write("?")
-			} else if re.prog[i].rep_min == 1 && re.prog[i].rep_max == MAX_QUANTIFIER {
+			} else if tk.rep_min == 1 && tk.rep_max == MAX_QUANTIFIER {
 				res.write("+")
-			} else if re.prog[i].rep_min == 0 && re.prog[i].rep_max == MAX_QUANTIFIER {
+			} else if tk.rep_min == 0 && tk.rep_max == MAX_QUANTIFIER {
 				res.write("*")
 			} else {
-				if re.prog[i].rep_max == MAX_QUANTIFIER {
+				if tk.rep_max == MAX_QUANTIFIER {
-					res.write("{${re.prog[i].rep_min},MAX}")
+					res.write("{${tk.rep_min},MAX}")
 				} else {
-					res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
+					res.write("{${tk.rep_min},${tk.rep_max}}")
 				}
-				if re.prog[i].greedy == true {
+				if tk.greedy == true {
 					res.write("?")
 				}
 			}
 		}
 		i++
 	}
 	if (re.flag & F_ME) != 0 {
@ -1411,6 +1412,20 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 							re.groups[g_index] = 0
 						}
 						re.groups[g_index+1] = i
 						// continuous save, save until we have space
 						if re.group_csave_index > 0 {
 							// check if we have space to save the record
 							if (re.group_csave_index + 3) < re.group_csave.len {
 								// incrment counter
 								re.group_csave[0]++
 								// save the record  
 								re.group_csave[re.group_csave_index++] = g_index               // group id
 								re.group_csave[re.group_csave_index++] = re.groups[g_index]    // start
 								re.group_csave[re.group_csave_index++] = re.groups[g_index+1]  // end
 							}
 						}
 					}
 					group_index--
@ -1543,6 +1558,19 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 						}
 						re.groups[g_index+1] = i
 						//C.printf("GROUP %d END [%d, %d]\n", re.prog[pc].group_id, re.groups[g_index], re.groups[g_index+1])
 						// continuous save, save until we have space
 						if re.group_csave_index > 0 {
 							// check if we have space to save the record
 							if (re.group_csave_index + 3) < re.group_csave.len {
 								// incrment counter
 								re.group_csave[0]++
 								// save the record  
 								re.group_csave[re.group_csave_index++] = g_index               // group id
 								re.group_csave[re.group_csave_index++] = re.groups[g_index]    // start
 								re.group_csave[re.group_csave_index++] = re.groups[g_index+1]  // end
 							}
 						}
 					}
 					re.prog[pc].group_rep++ // increase repetitions
@ -1796,8 +1824,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 			if rep < re.prog[tmp_pc].rep_min {
 				//C.printf("ist_quant_pg UNDER RANGE\n")
 				pc = re.prog[tmp_pc].goto_pc 
 				//group_index--
 				m_state = .ist_next
 				continue
 			}
@ -1841,12 +1867,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				m_state = .ist_next // go to next ist
 				continue
 			}
 			// match failed
 			else if rep == 0 && re.prog[pc].rep_min > 0 {
 				//C.printf("ist_quant_n NO MATCH\n")
 				// dummy
 			}
 			// match + or *
 			else if rep >= re.prog[pc].rep_min {
 				//C.printf("ist_quant_n MATCH RANGE\n")
@ -1902,7 +1922,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					m_state = .ist_next
 					continue
 				}
 				m_state = .ist_load
 				continue
 			}
@ -1981,6 +2000,9 @@ pub fn (re mut RE) match_string(in_txt string) (int,int) {
 			return NO_MATCH_FOUND, 0
 		}
 		if (re.flag & F_ME) != 0 && end < in_txt.len {
 			if in_txt[end] in NEW_LINE_LIST {
 				return start, end
 			}
 			return NO_MATCH_FOUND, 0
 		}
 		return start, end
@ -2002,7 +2024,7 @@ pub fn (re mut RE) find(in_txt string) (int,int) {
 	for i < in_txt.len {
 		// test only the first part of the query string
-		re.flag &= F_EFM // set to exit on the first token match
+		re.flag |= F_EFM // set to exit on the first token match
 		mut tmp_end := i+re.query.len
 		if tmp_end > in_txt.len { tmp_end = in_txt.len }
 		tmp_txt := string{ str: in_txt.str+i, len: tmp_end-i }