regex: named capturing groups, small fixes

pull/3584/head^2
penguindark 2020-01-28 20:34:11 +01:00 committed by Alexander Medvednikov
parent 9ac0c54eb0
commit 5a2534122e
3 changed files with 387 additions and 31 deletions

View File

@ -55,7 +55,7 @@ A meta-char is specified by a backslash before a char like `\w` in this case the
A meta-char can match different type of chars.
* `\w` match an alphanumeric char `[a-zA-Z0-9]`
* `\w` match an alphanumeric char `[a-zA-Z0-9_]`
* `\W` match a non alphanumeric char
* `\d` match a digit `[0-9]`
* `\D` match a non digit
@ -244,6 +244,114 @@ cg id: 0 [15, 19] => [56, ]
cg id: 0 [19, 21] => [78]
```
### Named capturing groups
This regex module support partially the question mark `?` PCRE syntax for groups.
`(?:abcd)` **non capturing group**: the content of the group will not be saved
`(?P<mygroup>abcdef)` **named group:** the group content is saved and labeled as `mygroup`
The label of the groups is saved in the `group_map` of the `RE` struct, this is a map from `string` to `int` where the value is the index in `group_csave` list of index.
Have a look at the example for the use of them.
example:
```v
fn main() {
test_regex()
text := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query:= r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+"
mut re := new_regex()
re.debug = 2
// must provide an array of the right size if want the continuos saving of the groups
re.group_csave = [-1].repeat(3*20+1)
re_err, err_pos := re.compile(query)
if re_err == COMPILE_OK {
q_str := re.get_query()
println("O.Query: $query")
println("Query : $q_str")
re.debug = 0
start, end := re.match_string(text)
if start < 0 {
err_str := re.get_parse_error_string(start)
println("ERROR : $err_str, $start")
} else {
text1 := text[start..end]
println("found in [$start, $end] => [$text1]")
}
// groups
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
}
gi += 2
}
// continuous saving
gi = 0
println("num of group item saved: ${re.group_csave[0]}")
for gi < re.group_csave[0] {
id := re.group_csave[1+gi*3]
st := re.group_csave[1+gi*3+1]
en := re.group_csave[1+gi*3+2]
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
gi++
}
println("raw array: ${re.group_csave[0..gi*3+2-1]}")
// named capturing groups
println("named capturing groups:")
for g_name in re.group_map.keys() {
s,e := re.get_group(g_name)
if s >= 0 && e > s {
println("'${g_name}':[$s, $e] => '${text[s..e]}'")
} else {
println("Group [${g_name}] doesn't exist.")
}
}
} else {
println("query: $query")
lc := "-".repeat(err_pos)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
}
}
```
Output:
```
O.Query: (?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+
Query : #0(?P<format>https?)|{8,14}(?:ftps?)://#1(?P<token>[\w_]+.)+
found in [0, 46] => [http://www.ciao.mondo/hello/pippo12_/pera.html]
0 0,4 :[http]
1 42,46 :[html]
num of group item saved: 8
cg id: 0 [0, 4] => [http]
cg id: 1 [7, 11] => [www.]
cg id: 1 [11, 16] => [ciao.]
cg id: 1 [16, 22] => [mondo/]
cg id: 1 [22, 28] => [hello/]
cg id: 1 [28, 37] => [pippo12_/]
cg id: 1 [37, 42] => [pera.]
cg id: 1 [42, 46] => [html]
raw array: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
named capturing groups:
'format':[0, 4] => 'http'
'token':[42, 46] => 'html'
```
## Flags
It is possible to set some flags in the regex parser that change the behavior of the parser itself.

View File

@ -1,6 +1,6 @@
/**********************************************************************
*
* regex 0.9c
* regex 0.9d
*
* Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
* Use of this source code is governed by an MIT license
@ -9,8 +9,9 @@
* This file contains regex module
*
* Know limitation:
* - max 8 stacked groups
* - find is implemented in a trivial way
* - not full compliant PCRE
* - not compliant POSIX ERE
*
*
**********************************************************************/
@ -18,7 +19,7 @@ module regex
import strings
pub const(
V_REGEX_VERSION = "0.9c" // regex module version
V_REGEX_VERSION = "0.9d" // regex module version
MAX_CODE_LEN = 256 // default small base code len for the regex programs
MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
@ -41,6 +42,7 @@ pub const(
ERR_GROUPS_OVERFLOW = -7 // max number of groups reached
ERR_GROUPS_MAX_NESTED = -8 // max number of nested group reached
ERR_GROUP_NOT_BALANCED = -9 // group not balanced
ERR_GROUP_QM_NOTATION = -10 // group invalid notation
)
const(
@ -133,6 +135,7 @@ fn is_alnum(in_char byte) bool {
if tmp >= 0x00 && tmp <= 25 { return true }
tmp = in_char - `0`
if tmp >= 0x00 && tmp <= 9 { return true }
if tmp == `_` { return true }
return false
}
@ -193,9 +196,10 @@ pub fn (re RE) get_parse_error_string(err int) string {
ERR_INTERNAL_ERROR { return "ERR_INTERNAL_ERROR" }
ERR_CC_ALLOC_OVERFLOW { return "ERR_CC_ALLOC_OVERFLOW" }
ERR_SYNTAX_ERROR { return "ERR_SYNTAX_ERROR" }
ERR_GROUPS_OVERFLOW { return "ERR_GROUPS_OVERFLOW"}
ERR_GROUPS_MAX_NESTED { return "ERR_GROUPS_MAX_NESTED"}
ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED"}
ERR_GROUPS_OVERFLOW { return "ERR_GROUPS_OVERFLOW" }
ERR_GROUPS_MAX_NESTED { return "ERR_GROUPS_MAX_NESTED" }
ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED" }
ERR_GROUP_QM_NOTATION { return "ERR_GROUP_QM_NOTATION" }
else { return "ERR_UNKNOWN" }
}
}
@ -272,6 +276,9 @@ pub const (
F_EFM = 0x00000100 // exit on first token matched, used by search
F_BIN = 0x00000200 // work only on bytes, ignore utf-8
// behaviour modifier flags
//F_OR = 0x00010000 // the OR work with concatenation like PCRE
)
struct StateDotObj{
@ -305,6 +312,8 @@ pub mut:
group_csave []int = []int // groups continuous save array
group_csave_index int= -1 // groups continuous save index
group_map map[string]int // groups names map
// flags
flag int = 0 // flag for optional parameters
@ -336,6 +345,16 @@ fn (re mut RE) reset(){
}
}
pub fn (re RE) get_group(group_name string) (int, int) {
if group_name in re.group_map {
tmp_index := re.group_map[group_name]-1
start := re.groups[tmp_index*2]
end := re.groups[tmp_index*2+1]
return start,end
}
return -1, -1
}
/******************************************************************************
*
* Backslashes chars
@ -631,7 +650,7 @@ enum Quant_parse_state {
finish
}
// parse_quantifier return (min, max, str_len) of a {min,max}? quantifier starting after the { char
// parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
mut status := Quant_parse_state.start
mut i := in_i
@ -748,6 +767,104 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
return ERR_SYNTAX_ERROR, i, 0, false
}
//
// Groups
//
enum Group_parse_state {
start,
q_mark, // (?
q_mark1, // (?:|P checking
p_status, // (?P
p_start, // (?P<
p_end, // (?P<...>
p_in_name, // (?P<...
finish
}
// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index)
fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
mut status := Group_parse_state.start
mut i := in_i
mut name := ''
for i < in_txt.len && status != .finish {
// get our char
char_tmp,char_len := re.get_char(in_txt,i)
ch := byte(char_tmp)
// start
if status == .start && ch == `(` {
status = .q_mark
i += char_len
continue
}
// check for question marks
if status == .q_mark && ch == `?` {
status = .q_mark1
i += char_len
continue
}
// non capturing group
if status == .q_mark1 && ch == `:` {
i += char_len
return 0, false, name, i
}
// enter in P section
if status == .q_mark1 && ch == `P` {
status = .p_status
i += char_len
continue
}
// not a valid q mark found
if status == .q_mark1 {
//println("NO VALID Q MARK")
return -2 , true, name, i
}
if status == .p_status && ch == `<` {
status = .p_start
i += char_len
continue
}
if status == .p_start && ch != `>` {
status = .p_in_name
name += "${ch:1c}" // TODO: manage utf8 chars
i += char_len
continue
}
// colect name
if status == .p_in_name && ch != `>` && is_alnum(ch) {
name += "${ch:1c}" // TODO: manage utf8 chars
i += char_len
continue
}
// end name
if status == .p_in_name && ch == `>` {
i += char_len
return 0, true, name, i
}
// error on name group
if status == .p_in_name {
return -2 , true, name, i
}
// normal group, nothig to do, exit
return 0 , true, name, i
}
/* UNREACHABLE */
//println("ERROR!! NOT MEANT TO BE HERE!!1")
return -2 , true, name, i
}
//
// main compiler
//
@ -795,7 +912,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
if group_count > re.group_max {
return ERR_GROUPS_OVERFLOW,i+1
}
group_stack_index++
// check max nested groups allowed
@ -803,17 +919,50 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
return ERR_GROUPS_MAX_NESTED,i+1
}
tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i)
// manage question mark format error
if tmp_res < -1 {
return ERR_GROUP_QM_NOTATION,next_i
}
//println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
i = next_i
if cgroup_flag == true {
group_count++
}
// calculate the group id
// if it is a named group, recycle the group id
// NOTE: **** the group index is +1 because map return 0 when not found!! ****
mut group_id := group_count
if cgroup_name.len > 0 {
//println("GROUP NAME: ${cgroup_name}")
if cgroup_name in re.group_map{
group_id = re.group_map[cgroup_name]-1
group_count--
} else {
re.group_map[cgroup_name] = group_id+1
}
}
group_stack_txt_index[group_stack_index] = i
group_stack[group_stack_index] = pc
re.prog[pc].ist = u32(0) | IST_GROUP_START
re.prog[pc].group_id = group_count
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
// set the group id
if cgroup_flag == false {
//println("NO CAPTURE GROUP")
re.prog[pc].group_id = -1
} else {
re.prog[pc].group_id = group_id
}
pc = pc + 1
i = i + char_len
continue
}
@ -1099,6 +1248,16 @@ pub fn (re RE) get_code() string {
res.write(". DOT_CHAR")
} else if ist == IST_GROUP_START {
res.write("( GROUP_START #:${tk.group_id}")
if tk.group_id == -1 {
res.write(" ?:")
} else {
for x in re.group_map.keys() {
if re.group_map[x] == (tk.group_id+1) {
res.write(" ?P<${x}>")
break
}
}
}
} else if ist == IST_GROUP_END {
res.write(") GROUP_END #:${tk.group_id}")
} else if ist == IST_SIMPLE_CHAR {
@ -1145,9 +1304,21 @@ pub fn (re RE) get_query() string {
if ch == IST_GROUP_START {
if re.debug == 0 {
res.write("(")
} else {
if tk.group_id == -1 {
res.write("(?:") // non capturing group
} else {
res.write("#${tk.group_id}(")
}
}
for x in re.group_map.keys() {
if re.group_map[x] == (tk.group_id+1) {
res.write("?P<${x}>")
break
}
}
i++
continue
}
@ -1400,7 +1571,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
re.prog[tmp_pc].group_rep
)
*/
if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min{
if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
start_i := group_stack[group_index]
group_stack[group_index]=-1
@ -1420,7 +1591,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// incrment counter
re.group_csave[0]++
// save the record
re.group_csave[re.group_csave_index++] = g_index // group id
re.group_csave[re.group_csave_index++] = g_index >> 1 // group id
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
}
@ -1545,7 +1716,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// restore txt index stack and save the group data
//C.printf("g.id: %d group_index: %d\n", re.prog[pc].group_id, group_index)
if group_index >= 0 {
if group_index >= 0 && re.prog[pc].group_id >= 0 {
start_i := group_stack[group_index]
//group_stack[group_index]=-1
@ -1566,7 +1737,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// incrment counter
re.group_csave[0]++
// save the record
re.group_csave[re.group_csave_index++] = g_index // group id
re.group_csave[re.group_csave_index++] = g_index >> 1 // group id
re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
}
@ -1709,7 +1880,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
if re.prog[pc].ch == ch
{
state.match_flag = true
l_ist = u32(IST_SIMPLE_CHAR)
l_ist = IST_SIMPLE_CHAR
if first_match < 0 {
first_match = i

View File

@ -72,6 +72,7 @@ match_test_suite = [
TestItem{" pippo pera",r"\s(.*)pe(.*)",0,11},
TestItem{" abb",r"\s(.*)",0,4},
// negative
TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
TestItem{"this is a good.",r"thes",-1,0},
@ -81,7 +82,6 @@ match_test_suite = [
TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0},
TestItem{"cpapaz ole. pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0},
// check unicode
TestItem{"this is a test",r".*a [-Ⅵ ]+",0,34},
TestItem{"123 test",r"[-\s]+",3,23},
@ -94,11 +94,10 @@ struct TestItemFa {
r []int
}
const (
match_test_suite_fa = [
// find_all tests
TestItemFa{
"oggi pippo è andato a casa di pluto ed ha trovato pippo",
r"p[iplut]+o",
@ -119,12 +118,9 @@ struct TestItemRe {
rep string
r string
}
const (
match_test_suite_re = [
// replace tests
TestItemRe{
"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
r"(pi?(ba)+o)",
@ -140,7 +136,88 @@ match_test_suite_re = [
]
)
struct TestItemCGroup {
src string
q string
s int = 0
e int = 0
cg []int
cgn map[string]int
}
const (
cgroups_test_suite = [
TestItemCGroup{
"http://www.ciao.mondo/hello/pippo12_/pera.html",
r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+",0,46,
[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
{'format':0,'token':1}
},
TestItemCGroup{
"http://www.ciao.mondo/hello/pippo12_/pera.html",
r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
{'format':0,'token':1}
},
TestItemCGroup{
"http://www.ciao.mondo/hello/pippo12_/pera.html",
r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+.)+",0,46,
[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
{'format':0}
},
]
)
fn test_regex(){
// check capturing groups
for c,to in cgroups_test_suite {
// debug print
//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)")
mut re, re_err, err_pos := regex.regex(to.q)
re.group_csave = [-1].repeat(3*20+1)
if re_err == regex.COMPILE_OK {
start, end := re.match_string(to.src)
mut tmp_str := ""
if start >= 0 && end > start{
tmp_str = to.src[start..end]
}
if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false
break
}
// check cgroups
if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
println("Capturing group len error!")
assert false
}
// check captured groups
mut ln := re.group_csave[0]*3
for ln > 0 {
if re.group_csave[ln] != to.cg[ln] {
assert false
}
ln--
}
// check named captured groups
for k in to.cgn.keys() {
if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
println("Named capturing group error! [$k]")
assert false
}
}
}
}
// check find_all
for c,to in match_test_suite_fa{
// debug print