regex: named capturing groups, small fixes

pull/3584/head^2
penguindark 2020-01-28 20:34:11 +01:00 committed by Alexander Medvednikov
parent 9ac0c54eb0
commit 5a2534122e
3 changed files with 387 additions and 31 deletions

View File

@ -55,7 +55,7 @@ A meta-char is specified by a backslash before a char like `\w` in this case the
A meta-char can match different type of chars. A meta-char can match different type of chars.
* `\w` match an alphanumeric char `[a-zA-Z0-9]` * `\w` match an alphanumeric char `[a-zA-Z0-9_]`
* `\W` match a non alphanumeric char * `\W` match a non alphanumeric char
* `\d` match a digit `[0-9]` * `\d` match a digit `[0-9]`
* `\D` match a non digit * `\D` match a non digit
@ -244,6 +244,114 @@ cg id: 0 [15, 19] => [56, ]
cg id: 0 [19, 21] => [78] cg id: 0 [19, 21] => [78]
``` ```
### Named capturing groups
This regex module partially supports the PCRE question mark `?` syntax for groups.
`(?:abcd)` **non capturing group**: the content of the group will not be saved
`(?P<mygroup>abcdef)` **named group:** the group content is saved and labeled as `mygroup`
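The names of the groups are saved in the `group_map` field of the `RE` struct: this is a map from `string` to `int` where each value is the id of the group plus one (a value of `0` means that the name was not found). Use `get_group` to retrieve the start and end index of a named group.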
Have a look at the example below to see how to use them.
example:
```v
// Example: compile a query with named and non capturing groups, match it
// against a text and print the plain groups, the continuously saved groups
// and the named groups.
fn main() {
	text := "http://www.ciao.mondo/hello/pippo12_/pera.html"
	query := r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+"

	mut re := new_regex()
	re.debug = 2

	// an array of the right size must be provided to enable the continuous saving of the groups:
	// slot 0 is the number of saved records, each record takes 3 ints (group id, start, end)
	re.group_csave = [-1].repeat(3*20+1)

	re_err, err_pos := re.compile(query)
	if re_err == COMPILE_OK {
		q_str := re.get_query()
		println("O.Query: $query")
		println("Query : $q_str")

		re.debug = 0
		start, end := re.match_string(text)
		if start < 0 {
			// a negative start is an error code
			err_str := re.get_parse_error_string(start)
			println("ERROR : $err_str, $start")
		} else {
			text1 := text[start..end]
			println("found in [$start, $end] => [$text1]")
		}

		// groups: re.groups holds a (start, end) pair for each group id
		mut gi := 0
		for gi < re.groups.len {
			if re.groups[gi] >= 0 {
				println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
			}
			gi += 2
		}

		// continuous saving: walk the (id, start, end) records
		gi = 0
		println("num of group item saved: ${re.group_csave[0]}")
		for gi < re.group_csave[0] {
			id := re.group_csave[1+gi*3]
			st := re.group_csave[1+gi*3+1]
			en := re.group_csave[1+gi*3+2]
			println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
			gi++
		}
		// the used part of the array: the counter plus gi records of 3 ints
		println("raw array: ${re.group_csave[0..gi*3+2-1]}")

		// named capturing groups
		println("named capturing groups:")
		for g_name in re.group_map.keys() {
			s,e := re.get_group(g_name)
			if s >= 0 && e > s {
				println("'${g_name}':[$s, $e] => '${text[s..e]}'")
			} else {
				println("Group [${g_name}] doesn't exist.")
			}
		}
	} else {
		// compile error: show the position of the error in the query
		println("query: $query")
		lc := "-".repeat(err_pos)
		println("err : $lc^")
		err_str := re.get_parse_error_string(re_err)
		println("ERROR: $err_str")
	}
}
```
Output:
```
O.Query: (?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+
Query : #0(?P<format>https?)|{8,14}(?:ftps?)://#1(?P<token>[\w_]+.)+
found in [0, 46] => [http://www.ciao.mondo/hello/pippo12_/pera.html]
0 0,4 :[http]
1 42,46 :[html]
num of group item saved: 8
cg id: 0 [0, 4] => [http]
cg id: 1 [7, 11] => [www.]
cg id: 1 [11, 16] => [ciao.]
cg id: 1 [16, 22] => [mondo/]
cg id: 1 [22, 28] => [hello/]
cg id: 1 [28, 37] => [pippo12_/]
cg id: 1 [37, 42] => [pera.]
cg id: 1 [42, 46] => [html]
raw array: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
named capturing groups:
'format':[0, 4] => 'http'
'token':[42, 46] => 'html'
```
## Flags ## Flags
It is possible to set some flags in the regex parser that change the behavior of the parser itself. It is possible to set some flags in the regex parser that change the behavior of the parser itself.

View File

@ -1,6 +1,6 @@
/********************************************************************** /**********************************************************************
* *
* regex 0.9c * regex 0.9d
* *
* Copyright (c) 2019-2020 Dario Deledda. All rights reserved. * Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
* Use of this source code is governed by an MIT license * Use of this source code is governed by an MIT license
@ -9,8 +9,9 @@
* This file contains regex module * This file contains regex module
* *
* Know limitation: * Know limitation:
* - max 8 stacked groups
* - find is implemented in a trivial way * - find is implemented in a trivial way
* - not full compliant PCRE
* - not compliant POSIX ERE
* *
* *
**********************************************************************/ **********************************************************************/
@ -18,7 +19,7 @@ module regex
import strings import strings
pub const( pub const(
V_REGEX_VERSION = "0.9c" // regex module version V_REGEX_VERSION = "0.9d" // regex module version
MAX_CODE_LEN = 256 // default small base code len for the regex programs MAX_CODE_LEN = 256 // default small base code len for the regex programs
MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30 MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
@ -41,6 +42,7 @@ pub const(
ERR_GROUPS_OVERFLOW = -7 // max number of groups reached ERR_GROUPS_OVERFLOW = -7 // max number of groups reached
ERR_GROUPS_MAX_NESTED = -8 // max number of nested group reached ERR_GROUPS_MAX_NESTED = -8 // max number of nested group reached
ERR_GROUP_NOT_BALANCED = -9 // group not balanced ERR_GROUP_NOT_BALANCED = -9 // group not balanced
ERR_GROUP_QM_NOTATION = -10 // group invalid notation
) )
const( const(
@ -133,6 +135,7 @@ fn is_alnum(in_char byte) bool {
if tmp >= 0x00 && tmp <= 25 { return true } if tmp >= 0x00 && tmp <= 25 { return true }
tmp = in_char - `0` tmp = in_char - `0`
if tmp >= 0x00 && tmp <= 9 { return true } if tmp >= 0x00 && tmp <= 9 { return true }
if tmp == `_` { return true }
return false return false
} }
@ -193,9 +196,10 @@ pub fn (re RE) get_parse_error_string(err int) string {
ERR_INTERNAL_ERROR { return "ERR_INTERNAL_ERROR" } ERR_INTERNAL_ERROR { return "ERR_INTERNAL_ERROR" }
ERR_CC_ALLOC_OVERFLOW { return "ERR_CC_ALLOC_OVERFLOW" } ERR_CC_ALLOC_OVERFLOW { return "ERR_CC_ALLOC_OVERFLOW" }
ERR_SYNTAX_ERROR { return "ERR_SYNTAX_ERROR" } ERR_SYNTAX_ERROR { return "ERR_SYNTAX_ERROR" }
ERR_GROUPS_OVERFLOW { return "ERR_GROUPS_OVERFLOW"} ERR_GROUPS_OVERFLOW { return "ERR_GROUPS_OVERFLOW" }
ERR_GROUPS_MAX_NESTED { return "ERR_GROUPS_MAX_NESTED"} ERR_GROUPS_MAX_NESTED { return "ERR_GROUPS_MAX_NESTED" }
ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED"} ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED" }
ERR_GROUP_QM_NOTATION { return "ERR_GROUP_QM_NOTATION" }
else { return "ERR_UNKNOWN" } else { return "ERR_UNKNOWN" }
} }
} }
@ -272,6 +276,9 @@ pub const (
F_EFM = 0x00000100 // exit on first token matched, used by search F_EFM = 0x00000100 // exit on first token matched, used by search
F_BIN = 0x00000200 // work only on bytes, ignore utf-8 F_BIN = 0x00000200 // work only on bytes, ignore utf-8
// behaviour modifier flags
//F_OR = 0x00010000 // the OR work with concatenation like PCRE
) )
struct StateDotObj{ struct StateDotObj{
@ -305,6 +312,8 @@ pub mut:
group_csave []int = []int // groups continuous save array group_csave []int = []int // groups continuous save array
group_csave_index int= -1 // groups continuous save index group_csave_index int= -1 // groups continuous save index
group_map map[string]int // groups names map
// flags // flags
flag int = 0 // flag for optional parameters flag int = 0 // flag for optional parameters
@ -336,6 +345,16 @@ fn (re mut RE) reset(){
} }
} }
// get_group returns the (start, end) pair stored in `re.groups` for the group
// labeled `group_name`.
// It returns (-1, -1) when `group_name` is not a key of `re.group_map`.
pub fn (re RE) get_group(group_name string) (int, int) {
	if group_name in re.group_map {
		// group_map stores the group id +1, each group owns two consecutive slots in re.groups
		base := (re.group_map[group_name] - 1) * 2
		return re.groups[base], re.groups[base + 1]
	}
	return -1, -1
}
/****************************************************************************** /******************************************************************************
* *
* Backslashes chars * Backslashes chars
@ -631,7 +650,7 @@ enum Quant_parse_state {
finish finish
} }
// parse_quantifier return (min, max, str_len) of a {min,max}? quantifier starting after the { char // parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) { fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
mut status := Quant_parse_state.start mut status := Quant_parse_state.start
mut i := in_i mut i := in_i
@ -748,6 +767,104 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
return ERR_SYNTAX_ERROR, i, 0, false return ERR_SYNTAX_ERROR, i, 0, false
} }
//
// Groups
//
// Group_parse_state lists the states of the scanner used by parse_groups
// to recognize the `(`, `(?:` and `(?P<name>` group openings.
enum Group_parse_state {
	start,     // nothing consumed yet, a `(` is expected
	q_mark,    // `(` consumed, checking if a `?` follows
	q_mark1,   // `(?` consumed, a `:` or a `P` is expected
	p_status,  // `(?P` consumed, a `<` is expected
	p_start,   // `(?P<` consumed, the first char of the name is expected
	p_end,     // `(?P<...>` NOTE(review): never assigned by parse_groups as visible here
	p_in_name, // `(?P<...` collecting the chars of the name until `>`
	finish     // only tested by the parse_groups loop condition, never assigned there
}
// parse_groups parses the opening of a group, starting from the `(` at index in_i,
// looking for the ? (question mark) syntax: `(?:` or `(?P<name>`.
// It returns (error, capture_flag, name_of_the_group, next_index) where:
// - error is 0 on success or ERR_GROUP_QM_NOTATION when the notation is not valid
// - capture_flag is false only for a non capturing group `(?:`
// - name_of_the_group is the label of a `(?P<name>` group, '' otherwise
// - next_index is the index of the first char of the group content,
//   on error it is the index where the scan stopped
fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
	mut status := Group_parse_state.start
	mut i := in_i
	mut name := ''

	for i < in_txt.len {
		// get our char
		char_tmp, char_len := re.get_char(in_txt, i)
		ch := byte(char_tmp)

		// start
		if status == .start && ch == `(` {
			status = .q_mark
			i += char_len
			continue
		}

		// check for question mark
		if status == .q_mark && ch == `?` {
			status = .q_mark1
			i += char_len
			continue
		}

		// `(?` found, only `:` and `P` are supported after it
		if status == .q_mark1 {
			// non capturing group
			if ch == `:` {
				return 0, false, name, i + char_len
			}
			// enter in P section
			if ch == `P` {
				status = .p_status
				i += char_len
				continue
			}
			// not a valid q mark notation
			return ERR_GROUP_QM_NOTATION, true, name, i
		}

		// `(?P` found, it must be followed by `<`
		if status == .p_status {
			if ch == `<` {
				status = .p_start
				i += char_len
				continue
			}
			return ERR_GROUP_QM_NOTATION, true, name, i
		}

		// collect the name, the first char follows the same rule of the others
		if status == .p_start || status == .p_in_name {
			if is_alnum(ch) || ch == `_` {
				status = .p_in_name
				name += "${ch:1c}" // TODO: manage utf8 chars
				i += char_len
				continue
			}
			// end of a not empty name
			if ch == `>` && status == .p_in_name {
				return 0, true, name, i + char_len
			}
			// empty name `(?P<>` or not valid char in the name
			return ERR_GROUP_QM_NOTATION, true, name, i
		}

		// normal group, nothing to do, exit
		return 0, true, name, i
	}

	// the text ended before the group opening was complete
	return ERR_GROUP_QM_NOTATION, true, name, i
}
// //
// main compiler // main compiler
// //
@ -795,7 +912,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
if group_count > re.group_max { if group_count > re.group_max {
return ERR_GROUPS_OVERFLOW,i+1 return ERR_GROUPS_OVERFLOW,i+1
} }
group_stack_index++ group_stack_index++
// check max nested groups allowed // check max nested groups allowed
@ -803,17 +919,50 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
return ERR_GROUPS_MAX_NESTED,i+1 return ERR_GROUPS_MAX_NESTED,i+1
} }
tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i)
// manage question mark format error
if tmp_res < -1 {
return ERR_GROUP_QM_NOTATION,next_i
}
//println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
i = next_i
if cgroup_flag == true {
group_count++ group_count++
}
// calculate the group id
// if it is a named group, recycle the group id
// NOTE: **** the group index is +1 because map return 0 when not found!! ****
mut group_id := group_count
if cgroup_name.len > 0 {
//println("GROUP NAME: ${cgroup_name}")
if cgroup_name in re.group_map{
group_id = re.group_map[cgroup_name]-1
group_count--
} else {
re.group_map[cgroup_name] = group_id+1
}
}
group_stack_txt_index[group_stack_index] = i group_stack_txt_index[group_stack_index] = i
group_stack[group_stack_index] = pc group_stack[group_stack_index] = pc
re.prog[pc].ist = u32(0) | IST_GROUP_START re.prog[pc].ist = u32(0) | IST_GROUP_START
re.prog[pc].group_id = group_count
re.prog[pc].rep_min = 1 re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1 re.prog[pc].rep_max = 1
// set the group id
if cgroup_flag == false {
//println("NO CAPTURE GROUP")
re.prog[pc].group_id = -1
} else {
re.prog[pc].group_id = group_id
}
pc = pc + 1 pc = pc + 1
i = i + char_len
continue continue
} }
@ -1099,6 +1248,16 @@ pub fn (re RE) get_code() string {
res.write(". DOT_CHAR") res.write(". DOT_CHAR")
} else if ist == IST_GROUP_START { } else if ist == IST_GROUP_START {
res.write("( GROUP_START #:${tk.group_id}") res.write("( GROUP_START #:${tk.group_id}")
if tk.group_id == -1 {
res.write(" ?:")
} else {
for x in re.group_map.keys() {
if re.group_map[x] == (tk.group_id+1) {
res.write(" ?P<${x}>")
break
}
}
}
} else if ist == IST_GROUP_END { } else if ist == IST_GROUP_END {
res.write(") GROUP_END #:${tk.group_id}") res.write(") GROUP_END #:${tk.group_id}")
} else if ist == IST_SIMPLE_CHAR { } else if ist == IST_SIMPLE_CHAR {
@ -1145,9 +1304,21 @@ pub fn (re RE) get_query() string {
if ch == IST_GROUP_START { if ch == IST_GROUP_START {
if re.debug == 0 { if re.debug == 0 {
res.write("(") res.write("(")
} else {
if tk.group_id == -1 {
res.write("(?:") // non capturing group
} else { } else {
res.write("#${tk.group_id}(") res.write("#${tk.group_id}(")
} }
}
for x in re.group_map.keys() {
if re.group_map[x] == (tk.group_id+1) {
res.write("?P<${x}>")
break
}
}
i++ i++
continue continue
} }
@ -1400,7 +1571,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
re.prog[tmp_pc].group_rep re.prog[tmp_pc].group_rep
) )
*/ */
if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min{ if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
start_i := group_stack[group_index] start_i := group_stack[group_index]
group_stack[group_index]=-1 group_stack[group_index]=-1
@ -1420,7 +1591,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// incrment counter // incrment counter
re.group_csave[0]++ re.group_csave[0]++
// save the record // save the record
re.group_csave[re.group_csave_index++] = g_index // group id re.group_csave[re.group_csave_index++] = g_index >> 1 // group id
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
} }
@ -1545,7 +1716,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// restore txt index stack and save the group data // restore txt index stack and save the group data
//C.printf("g.id: %d group_index: %d\n", re.prog[pc].group_id, group_index) //C.printf("g.id: %d group_index: %d\n", re.prog[pc].group_id, group_index)
if group_index >= 0 { if group_index >= 0 && re.prog[pc].group_id >= 0 {
start_i := group_stack[group_index] start_i := group_stack[group_index]
//group_stack[group_index]=-1 //group_stack[group_index]=-1
@ -1566,7 +1737,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// incrment counter // incrment counter
re.group_csave[0]++ re.group_csave[0]++
// save the record // save the record
re.group_csave[re.group_csave_index++] = g_index // group id re.group_csave[re.group_csave_index++] = g_index >> 1 // group id
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
} }
@ -1709,7 +1880,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
if re.prog[pc].ch == ch if re.prog[pc].ch == ch
{ {
state.match_flag = true state.match_flag = true
l_ist = u32(IST_SIMPLE_CHAR) l_ist = IST_SIMPLE_CHAR
if first_match < 0 { if first_match < 0 {
first_match = i first_match = i

View File

@ -72,6 +72,7 @@ match_test_suite = [
TestItem{" pippo pera",r"\s(.*)pe(.*)",0,11}, TestItem{" pippo pera",r"\s(.*)pe(.*)",0,11},
TestItem{" abb",r"\s(.*)",0,4}, TestItem{" abb",r"\s(.*)",0,4},
// negative // negative
TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0}, TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
TestItem{"this is a good.",r"thes",-1,0}, TestItem{"this is a good.",r"thes",-1,0},
@ -81,7 +82,6 @@ match_test_suite = [
TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0}, TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0},
TestItem{"cpapaz ole. pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0}, TestItem{"cpapaz ole. pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0},
// check unicode // check unicode
TestItem{"this is a test",r".*a [-Ⅵ ]+",0,34}, TestItem{"this is a test",r".*a [-Ⅵ ]+",0,34},
TestItem{"123 test",r"[-\s]+",3,23}, TestItem{"123 test",r"[-\s]+",3,23},
@ -94,11 +94,10 @@ struct TestItemFa {
r []int r []int
} }
const ( const (
match_test_suite_fa = [ match_test_suite_fa = [
// find_all tests // find_all tests
TestItemFa{ TestItemFa{
"oggi pippo è andato a casa di pluto ed ha trovato pippo", "oggi pippo è andato a casa di pluto ed ha trovato pippo",
r"p[iplut]+o", r"p[iplut]+o",
@ -119,12 +118,9 @@ struct TestItemRe {
rep string rep string
r string r string
} }
const ( const (
match_test_suite_re = [ match_test_suite_re = [
// replace tests // replace tests
TestItemRe{ TestItemRe{
"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao", "oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
r"(pi?(ba)+o)", r"(pi?(ba)+o)",
@ -140,7 +136,88 @@ match_test_suite_re = [
] ]
) )
// TestItemCGroup is a test case for the capturing groups checks of test_regex
struct TestItemCGroup {
	src string          // text passed to match_string
	q string            // regex query to compile
	s int = 0           // expected start index of the match
	e int = 0           // expected end index of the match
	cg []int            // expected group_csave content: record count followed by (group id, start, end) triples
	cgn map[string]int  // expected named groups: name => group id (compared with group_map value -1)
}
const (
// test cases for the capturing groups, all of them use the same source text
// and expect the same continuously saved records
cgroups_test_suite = [
	// named groups mixed with a non capturing group
	TestItemCGroup{
		"http://www.ciao.mondo/hello/pippo12_/pera.html",
		r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+",0,46,
		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
		{'format':0,'token':1}
	},
	// the same name `format` used by two groups
	TestItemCGroup{
		"http://www.ciao.mondo/hello/pippo12_/pera.html",
		r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
		{'format':0,'token':1}
	},
	// named groups followed by a capturing group without a name
	TestItemCGroup{
		"http://www.ciao.mondo/hello/pippo12_/pera.html",
		r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+.)+",0,46,
		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
		{'format':0}
	},
]
)
fn test_regex(){ fn test_regex(){
// check capturing groups
for c,to in cgroups_test_suite {
// debug print
//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)")
mut re, re_err, err_pos := regex.regex(to.q)
re.group_csave = [-1].repeat(3*20+1)
if re_err == regex.COMPILE_OK {
start, end := re.match_string(to.src)
mut tmp_str := ""
if start >= 0 && end > start{
tmp_str = to.src[start..end]
}
if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false
break
}
// check cgroups
if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
println("Capturing group len error!")
assert false
}
// check captured groups
mut ln := re.group_csave[0]*3
for ln > 0 {
if re.group_csave[ln] != to.cg[ln] {
assert false
}
ln--
}
// check named captured groups
for k in to.cgn.keys() {
if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
println("Named capturing group error! [$k]")
assert false
}
}
}
}
// check find_all // check find_all
for c,to in match_test_suite_fa{ for c,to in match_test_suite_fa{
// debug print // debug print