From 5a2534122e4261062e37f6cd7c277271a09e6b29 Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Tue, 28 Jan 2020 20:34:11 +0100 Subject: [PATCH] regex: named capturing groups, small fixes --- vlib/regex/README.md | 110 ++++++++++++++++++++- vlib/regex/regex.v | 205 ++++++++++++++++++++++++++++++++++++---- vlib/regex/regex_test.v | 103 +++++++++++++++++--- 3 files changed, 387 insertions(+), 31 deletions(-) diff --git a/vlib/regex/README.md b/vlib/regex/README.md index 7d09db7a48..ab3b5f7457 100644 --- a/vlib/regex/README.md +++ b/vlib/regex/README.md @@ -55,7 +55,7 @@ A meta-char is specified by a backslash before a char like `\w` in this case the A meta-char can match different type of chars. -* `\w` match an alphanumeric char `[a-zA-Z0-9]` +* `\w` match an alphanumeric char `[a-zA-Z0-9_]` * `\W` match a non alphanumeric char * `\d` match a digit `[0-9]` * `\D` match a non digit @@ -244,6 +244,114 @@ cg id: 0 [15, 19] => [56, ] cg id: 0 [19, 21] => [78] ``` +### Named capturing groups + +This regex module partially supports the question mark `?` PCRE syntax for groups. + +`(?:abcd)` **non capturing group**: the content of the group will not be saved + +`(?P<mygroup>abcdef)` **named group:** the group content is saved and labeled as `mygroup` + +The label of the groups is saved in the `group_map` of the `RE` struct, this is a map from `string` to `int` where the value is the index in `group_csave` list of index. + +Have a look at the example for the use of them. 
+ +example: + +```v +fn main() { + test_regex() + + text := "http://www.ciao.mondo/hello/pippo12_/pera.html" + query:= r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+" + + mut re := new_regex() + re.debug = 2 + + // must provide an array of the right size if want the continuous saving of the groups + re.group_csave = [-1].repeat(3*20+1) + + re_err, err_pos := re.compile(query) + if re_err == COMPILE_OK { + q_str := re.get_query() + println("O.Query: $query") + println("Query : $q_str") + + re.debug = 0 + start, end := re.match_string(text) + if start < 0 { + err_str := re.get_parse_error_string(start) + println("ERROR : $err_str, $start") + } else { + text1 := text[start..end] + println("found in [$start, $end] => [$text1]") + } + + // groups + mut gi := 0 + for gi < re.groups.len { + if re.groups[gi] >= 0 { + println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]") + } + gi += 2 + } + // continuous saving + gi = 0 + println("num of group item saved: ${re.group_csave[0]}") + for gi < re.group_csave[0] { + id := re.group_csave[1+gi*3] + st := re.group_csave[1+gi*3+1] + en := re.group_csave[1+gi*3+2] + println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]") + gi++ + } + println("raw array: ${re.group_csave[0..gi*3+2-1]}") + + // named capturing groups + println("named capturing groups:") + for g_name in re.group_map.keys() { + s,e := re.get_group(g_name) + if s >= 0 && e > s { + println("'${g_name}':[$s, $e] => '${text[s..e]}'") + } else { + println("Group [${g_name}] doesn't exist.") + } + } + + } else { + println("query: $query") + lc := "-".repeat(err_pos) + println("err : $lc^") + err_str := re.get_parse_error_string(re_err) + println("ERROR: $err_str") + } + +} +``` + +Output: + +``` +O.Query: (?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+ +Query : #0(?P<format>https?)|{8,14}(?:ftps?)://#1(?P<token>[\w_]+.)+ +found in [0, 46] => [http://www.ciao.mondo/hello/pippo12_/pera.html] +0 0,4 :[http] +1 42,46 :[html] +num of group item saved: 8 +cg id: 0 [0, 4] 
=> [http] +cg id: 1 [7, 11] => [www.] +cg id: 1 [11, 16] => [ciao.] +cg id: 1 [16, 22] => [mondo/] +cg id: 1 [22, 28] => [hello/] +cg id: 1 [28, 37] => [pippo12_/] +cg id: 1 [37, 42] => [pera.] +cg id: 1 [42, 46] => [html] +raw array: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46] +named capturing groups: +'format':[0, 4] => 'http' +'token':[42, 46] => 'html' +``` + ## Flags It is possible to set some flags in the regex parser that change the behavior of the parser itself. diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index e40f1a4e27..4502c3d4bd 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -1,6 +1,6 @@ /********************************************************************** * -* regex 0.9c +* regex 0.9d * * Copyright (c) 2019-2020 Dario Deledda. All rights reserved. * Use of this source code is governed by an MIT license @@ -9,8 +9,9 @@ * This file contains regex module * * Know limitation: -* - max 8 stacked groups * - find is implemented in a trivial way +* - not full compliant PCRE +* - not compliant POSIX ERE * * **********************************************************************/ @@ -18,7 +19,7 @@ module regex import strings pub const( - V_REGEX_VERSION = "0.9c" // regex module version + V_REGEX_VERSION = "0.9d" // regex module version MAX_CODE_LEN = 256 // default small base code len for the regex programs MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30 @@ -41,6 +42,7 @@ pub const( ERR_GROUPS_OVERFLOW = -7 // max number of groups reached ERR_GROUPS_MAX_NESTED = -8 // max number of nested group reached ERR_GROUP_NOT_BALANCED = -9 // group not balanced + ERR_GROUP_QM_NOTATION = -10 // group invalid notation ) const( @@ -133,6 +135,7 @@ fn is_alnum(in_char byte) bool { if tmp >= 0x00 && tmp <= 25 { return true } tmp = in_char - `0` if tmp >= 0x00 && tmp <= 9 { return true } + if tmp == `_` { return true } return false } @@ -193,9 +196,10 @@ pub fn (re RE) 
get_parse_error_string(err int) string { ERR_INTERNAL_ERROR { return "ERR_INTERNAL_ERROR" } ERR_CC_ALLOC_OVERFLOW { return "ERR_CC_ALLOC_OVERFLOW" } ERR_SYNTAX_ERROR { return "ERR_SYNTAX_ERROR" } - ERR_GROUPS_OVERFLOW { return "ERR_GROUPS_OVERFLOW"} - ERR_GROUPS_MAX_NESTED { return "ERR_GROUPS_MAX_NESTED"} - ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED"} + ERR_GROUPS_OVERFLOW { return "ERR_GROUPS_OVERFLOW" } + ERR_GROUPS_MAX_NESTED { return "ERR_GROUPS_MAX_NESTED" } + ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED" } + ERR_GROUP_QM_NOTATION { return "ERR_GROUP_QM_NOTATION" } else { return "ERR_UNKNOWN" } } } @@ -272,6 +276,9 @@ pub const ( F_EFM = 0x00000100 // exit on first token matched, used by search F_BIN = 0x00000200 // work only on bytes, ignore utf-8 + + // behaviour modifier flags + //F_OR = 0x00010000 // the OR work with concatenation like PCRE ) struct StateDotObj{ @@ -305,6 +312,8 @@ pub mut: group_csave []int = []int // groups continuous save array group_csave_index int= -1 // groups continuous save index + group_map map[string]int // groups names map + // flags flag int = 0 // flag for optional parameters @@ -336,6 +345,16 @@ fn (re mut RE) reset(){ } } +pub fn (re RE) get_group(group_name string) (int, int) { + if group_name in re.group_map { + tmp_index := re.group_map[group_name]-1 + start := re.groups[tmp_index*2] + end := re.groups[tmp_index*2+1] + return start,end + } + return -1, -1 +} + /****************************************************************************** * * Backslashes chars @@ -631,7 +650,7 @@ enum Quant_parse_state { finish } -// parse_quantifier return (min, max, str_len) of a {min,max}? quantifier starting after the { char +// parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? 
quantifier starting after the { char fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) { mut status := Quant_parse_state.start mut i := in_i @@ -748,6 +767,104 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) { return ERR_SYNTAX_ERROR, i, 0, false } +// +// Groups +// +enum Group_parse_state { + start, + q_mark, // (? + q_mark1, // (?:|P checking + p_status, // (?P + p_start, // (?P< + p_end, // (?P<...> + p_in_name, // (?P<... + finish +} + +// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index) +fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) { + mut status := Group_parse_state.start + mut i := in_i + mut name := '' + + for i < in_txt.len && status != .finish { + + // get our char + char_tmp,char_len := re.get_char(in_txt,i) + ch := byte(char_tmp) + + // start + if status == .start && ch == `(` { + status = .q_mark + i += char_len + continue + } + + // check for question marks + if status == .q_mark && ch == `?` { + status = .q_mark1 + i += char_len + continue + } + + // non capturing group + if status == .q_mark1 && ch == `:` { + i += char_len + return 0, false, name, i + } + + // enter in P section + if status == .q_mark1 && ch == `P` { + status = .p_status + i += char_len + continue + } + + // not a valid q mark found + if status == .q_mark1 { + //println("NO VALID Q MARK") + return -2 , true, name, i + } + + if status == .p_status && ch == `<` { + status = .p_start + i += char_len + continue + } + + if status == .p_start && ch != `>` { + status = .p_in_name + name += "${ch:1c}" // TODO: manage utf8 chars + i += char_len + continue + } + + // colect name + if status == .p_in_name && ch != `>` && is_alnum(ch) { + name += "${ch:1c}" // TODO: manage utf8 chars + i += char_len + continue + } + + // end name + if status == .p_in_name && ch == `>` { + i += char_len + return 0, true, name, i + } + + // error on 
name group + if status == .p_in_name { + return -2 , true, name, i + } + + // normal group, nothig to do, exit + return 0 , true, name, i + } + /* UNREACHABLE */ + //println("ERROR!! NOT MEANT TO BE HERE!!1") + return -2 , true, name, i +} + // // main compiler // @@ -795,7 +912,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) { if group_count > re.group_max { return ERR_GROUPS_OVERFLOW,i+1 } - group_stack_index++ // check max nested groups allowed @@ -803,17 +919,50 @@ pub fn (re mut RE) compile(in_txt string) (int,int) { return ERR_GROUPS_MAX_NESTED,i+1 } - group_count++ + tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i) + + // manage question mark format error + if tmp_res < -1 { + return ERR_GROUP_QM_NOTATION,next_i + } + + //println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]") + i = next_i + + if cgroup_flag == true { + group_count++ + } + + // calculate the group id + // if it is a named group, recycle the group id + // NOTE: **** the group index is +1 because map return 0 when not found!! **** + mut group_id := group_count + if cgroup_name.len > 0 { + //println("GROUP NAME: ${cgroup_name}") + if cgroup_name in re.group_map{ + group_id = re.group_map[cgroup_name]-1 + group_count-- + } else { + re.group_map[cgroup_name] = group_id+1 + } + } group_stack_txt_index[group_stack_index] = i group_stack[group_stack_index] = pc re.prog[pc].ist = u32(0) | IST_GROUP_START - re.prog[pc].group_id = group_count re.prog[pc].rep_min = 1 re.prog[pc].rep_max = 1 + + // set the group id + if cgroup_flag == false { + //println("NO CAPTURE GROUP") + re.prog[pc].group_id = -1 + } else { + re.prog[pc].group_id = group_id + } + pc = pc + 1 - i = i + char_len continue } @@ -1099,6 +1248,16 @@ pub fn (re RE) get_code() string { res.write(". 
DOT_CHAR") } else if ist == IST_GROUP_START { res.write("( GROUP_START #:${tk.group_id}") + if tk.group_id == -1 { + res.write(" ?:") + } else { + for x in re.group_map.keys() { + if re.group_map[x] == (tk.group_id+1) { + res.write(" ?P<${x}>") + break + } + } + } } else if ist == IST_GROUP_END { res.write(") GROUP_END #:${tk.group_id}") } else if ist == IST_SIMPLE_CHAR { @@ -1146,8 +1305,20 @@ pub fn (re RE) get_query() string { if re.debug == 0 { res.write("(") } else { - res.write("#${tk.group_id}(") + if tk.group_id == -1 { + res.write("(?:") // non capturing group + } else { + res.write("#${tk.group_id}(") + } } + + for x in re.group_map.keys() { + if re.group_map[x] == (tk.group_id+1) { + res.write("?P<${x}>") + break + } + } + i++ continue } @@ -1400,7 +1571,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { re.prog[tmp_pc].group_rep ) */ - if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min{ + if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{ start_i := group_stack[group_index] group_stack[group_index]=-1 @@ -1420,7 +1591,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // incrment counter re.group_csave[0]++ // save the record - re.group_csave[re.group_csave_index++] = g_index // group id + re.group_csave[re.group_csave_index++] = g_index >> 1 // group id re.group_csave[re.group_csave_index++] = re.groups[g_index] // start re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end } @@ -1545,7 +1716,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // restore txt index stack and save the group data //C.printf("g.id: %d group_index: %d\n", re.prog[pc].group_id, group_index) - if group_index >= 0 { + if group_index >= 0 && re.prog[pc].group_id >= 0 { start_i := group_stack[group_index] //group_stack[group_index]=-1 @@ -1566,7 +1737,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // 
incrment counter re.group_csave[0]++ // save the record - re.group_csave[re.group_csave_index++] = g_index // group id + re.group_csave[re.group_csave_index++] = g_index >> 1 // group id re.group_csave[re.group_csave_index++] = re.groups[g_index] // start re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end } @@ -1709,7 +1880,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { if re.prog[pc].ch == ch { state.match_flag = true - l_ist = u32(IST_SIMPLE_CHAR) + l_ist = IST_SIMPLE_CHAR if first_match < 0 { first_match = i diff --git a/vlib/regex/regex_test.v b/vlib/regex/regex_test.v index f421e6b3d7..2a0d37af6e 100644 --- a/vlib/regex/regex_test.v +++ b/vlib/regex/regex_test.v @@ -7,9 +7,9 @@ import regex ******************************************************************************/ struct TestItem { src string - q string - s int = 0 - e int = 0 + q string + s int = 0 + e int = 0 } const( @@ -72,6 +72,7 @@ match_test_suite = [ TestItem{" pippo pera",r"\s(.*)pe(.*)",0,11}, TestItem{" abb",r"\s(.*)",0,4}, + // negative TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0}, TestItem{"this is a good.",r"thes",-1,0}, @@ -81,7 +82,6 @@ match_test_suite = [ TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0}, TestItem{"cpapaz ole. 
pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0}, - // check unicode TestItem{"this is a Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r".*a [Ⅰ-Ⅵ ]+",0,34}, TestItem{"123Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r"[Ⅰ-Ⅴ\s]+",3,23}, @@ -90,15 +90,14 @@ match_test_suite = [ struct TestItemFa { src string - q string - r []int + q string + r []int } + const ( match_test_suite_fa = [ - // find_all tests - TestItemFa{ "oggi pippo è andato a casa di pluto ed ha trovato pippo", r"p[iplut]+o", @@ -115,16 +114,13 @@ match_test_suite_fa = [ struct TestItemRe { src string - q string + q string rep string - r string + r string } - const ( match_test_suite_re = [ - // replace tests - TestItemRe{ "oggi pibao è andato a casa di pbababao ed ha trovato pibabababao", r"(pi?(ba)+o)", @@ -140,7 +136,88 @@ match_test_suite_re = [ ] ) +struct TestItemCGroup { + src string + q string + s int = 0 + e int = 0 + cg []int + cgn map[string]int +} +const ( +cgroups_test_suite = [ + TestItemCGroup{ + "http://www.ciao.mondo/hello/pippo12_/pera.html", + r"(?Phttps?)|(?:ftps?)://(?P[\w_]+.)+",0,46, + [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46], + {'format':0,'token':1} + }, + TestItemCGroup{ + "http://www.ciao.mondo/hello/pippo12_/pera.html", + r"(?Phttps?)|(?Pftps?)://(?P[\w_]+.)+",0,46, + [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46], + {'format':0,'token':1} + }, + TestItemCGroup{ + "http://www.ciao.mondo/hello/pippo12_/pera.html", + r"(?Phttps?)|(?Pftps?)://([\w_]+.)+",0,46, + [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46], + {'format':0} + }, +] +) + fn test_regex(){ + // check capturing groups + for c,to in cgroups_test_suite { + // debug print + //println("#$c [$to.src] q[$to.q] ($to.s, $to.e)") + + mut re, re_err, err_pos := regex.regex(to.q) + re.group_csave = [-1].repeat(3*20+1) + + if re_err == regex.COMPILE_OK { + start, end := re.match_string(to.src) + + mut tmp_str := "" + if start >= 0 && end > start{ + tmp_str = 
to.src[start..end] + } + + if start != to.s || end != to.e { + println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") + println("ERROR!") + //C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) + assert false + break + } + + // check cgroups + if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] { + println("Capturing group len error!") + assert false + } + + // check captured groups + mut ln := re.group_csave[0]*3 + for ln > 0 { + if re.group_csave[ln] != to.cg[ln] { + assert false + } + ln-- + } + + // check named captured groups + for k in to.cgn.keys() { + if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1 + println("Named capturing group error! [$k]") + assert false + } + } + + } + } + // check find_all for c,to in match_test_suite_fa{ // debug print