regex: named capturing groups, small fixes
parent 9ac0c54eb0
commit 5a2534122e
@@ -55,7 +55,7 @@ A meta-char is specified by a backslash before a char like `\w` in this case the
 
 A meta-char can match different type of chars.
 
-* `\w` match an alphanumeric char `[a-zA-Z0-9]`
+* `\w` match an alphanumeric char `[a-zA-Z0-9_]`
 * `\W` match a non alphanumeric char
 * `\d` match a digit `[0-9]`
 * `\D` match a non digit
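The practical effect of adding `_` to the `\w` class can be checked with a few lines. This is an illustrative sketch only (not part of the commit), reusing the `regex.regex()` / `match_string()` calls that appear in the test suite further below:

```v
import regex

fn main() {
	// with `_` now part of \w, the trailing underscore is included in the match
	mut re, re_err, _ := regex.regex(r"\w+")
	if re_err == regex.COMPILE_OK {
		start, end := re.match_string("pippo12_")
		println("[$start, $end]") // expected [0, 8] instead of the previous [0, 7]
	}
}
```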
@@ -244,6 +244,114 @@ cg id: 0 [15, 19] => [56, ]
 cg id: 0 [19, 21] => [78]
 ```
 
+### Named capturing groups
+
+This regex module partially supports the question mark `?` PCRE syntax for groups.
+
+`(?:abcd)` **non capturing group**: the content of the group will not be saved
+
+`(?P<mygroup>abcdef)` **named group:** the group content is saved and labeled as `mygroup`
+
+The group labels are saved in the `group_map` of the `RE` struct. This is a map from `string` to `int`, where the value is the index in the `group_csave` list of indexes.
+
+Have a look at the example below for their use.
+
+Example:
+
+```v
+fn main() {
+	test_regex()
+
+	text := "http://www.ciao.mondo/hello/pippo12_/pera.html"
+	query := r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+"
+
+	mut re := new_regex()
+	re.debug = 2
+
+	// you must provide an array of the right size if you want continuous saving of the groups
+	re.group_csave = [-1].repeat(3*20+1)
+
+	re_err, err_pos := re.compile(query)
+	if re_err == COMPILE_OK {
+		q_str := re.get_query()
+		println("O.Query: $query")
+		println("Query : $q_str")
+
+		re.debug = 0
+		start, end := re.match_string(text)
+		if start < 0 {
+			err_str := re.get_parse_error_string(start)
+			println("ERROR : $err_str, $start")
+		} else {
+			text1 := text[start..end]
+			println("found in [$start, $end] => [$text1]")
+		}
+
+		// groups
+		mut gi := 0
+		for gi < re.groups.len {
+			if re.groups[gi] >= 0 {
+				println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
+			}
+			gi += 2
+		}
+		// continuous saving
+		gi = 0
+		println("num of group item saved: ${re.group_csave[0]}")
+		for gi < re.group_csave[0] {
+			id := re.group_csave[1+gi*3]
+			st := re.group_csave[1+gi*3+1]
+			en := re.group_csave[1+gi*3+2]
+			println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
+			gi++
+		}
+		println("raw array: ${re.group_csave[0..gi*3+2-1]}")
+
+		// named capturing groups
+		println("named capturing groups:")
+		for g_name in re.group_map.keys() {
+			s, e := re.get_group(g_name)
+			if s >= 0 && e > s {
+				println("'${g_name}':[$s, $e] => '${text[s..e]}'")
+			} else {
+				println("Group [${g_name}] doesn't exist.")
+			}
+		}
+
+	} else {
+		println("query: $query")
+		lc := "-".repeat(err_pos)
+		println("err : $lc^")
+		err_str := re.get_parse_error_string(re_err)
+		println("ERROR: $err_str")
+	}
+
+}
+```
+
+Output:
+
+```
+O.Query: (?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+
+Query : #0(?P<format>https?)|{8,14}(?:ftps?)://#1(?P<token>[\w_]+.)+
+found in [0, 46] => [http://www.ciao.mondo/hello/pippo12_/pera.html]
+0 0,4 :[http]
+1 42,46 :[html]
+num of group item saved: 8
+cg id: 0 [0, 4] => [http]
+cg id: 1 [7, 11] => [www.]
+cg id: 1 [11, 16] => [ciao.]
+cg id: 1 [16, 22] => [mondo/]
+cg id: 1 [22, 28] => [hello/]
+cg id: 1 [28, 37] => [pippo12_/]
+cg id: 1 [37, 42] => [pera.]
+cg id: 1 [42, 46] => [html]
+raw array: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
+named capturing groups:
+'format':[0, 4] => 'http'
+'token':[42, 46] => 'html'
+```
+
 ## Flags
 
 It is possible to set some flags in the regex parser that change the behavior of the parser itself.
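For reference on the `raw array` line of the output above: the first element is the number of saved records, followed by one `(group_id, start, end)` triple per record. For instance `[8, 0, 0, 4, 1, 7, 11, ...]` reads as 8 records, the first being group `0` over `[0, 4)` (`http`) and the second group `1` over `[7, 11)` (`www.`).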
@@ -1,6 +1,6 @@
 /**********************************************************************
 *
-* regex 0.9c
+* regex 0.9d
 *
 * Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
 * Use of this source code is governed by an MIT license
@@ -9,8 +9,9 @@
 * This file contains regex module
 *
 * Know limitation:
-* - max 8 stacked groups
 * - find is implemented in a trivial way
+* - not fully compliant with PCRE
+* - not compliant with POSIX ERE
 *
 *
 **********************************************************************/
@@ -18,7 +19,7 @@ module regex
 import strings
 
 pub const(
-	V_REGEX_VERSION = "0.9c" // regex module version
+	V_REGEX_VERSION = "0.9d" // regex module version
 
 	MAX_CODE_LEN = 256 // default small base code len for the regex programs
 	MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
@@ -41,6 +42,7 @@ pub const(
 	ERR_GROUPS_OVERFLOW = -7 // max number of groups reached
 	ERR_GROUPS_MAX_NESTED = -8 // max number of nested group reached
 	ERR_GROUP_NOT_BALANCED = -9 // group not balanced
+	ERR_GROUP_QM_NOTATION = -10 // group invalid notation
 )
 
 const(
@@ -133,6 +135,7 @@ fn is_alnum(in_char byte) bool {
 	if tmp >= 0x00 && tmp <= 25 { return true }
 	tmp = in_char - `0`
 	if tmp >= 0x00 && tmp <= 9 { return true }
+	if tmp == `_` { return true }
 	return false
 }
 
@@ -193,9 +196,10 @@ pub fn (re RE) get_parse_error_string(err int) string {
 		ERR_INTERNAL_ERROR { return "ERR_INTERNAL_ERROR" }
 		ERR_CC_ALLOC_OVERFLOW { return "ERR_CC_ALLOC_OVERFLOW" }
 		ERR_SYNTAX_ERROR { return "ERR_SYNTAX_ERROR" }
-		ERR_GROUPS_OVERFLOW { return "ERR_GROUPS_OVERFLOW"}
-		ERR_GROUPS_MAX_NESTED { return "ERR_GROUPS_MAX_NESTED"}
-		ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED"}
+		ERR_GROUPS_OVERFLOW { return "ERR_GROUPS_OVERFLOW" }
+		ERR_GROUPS_MAX_NESTED { return "ERR_GROUPS_MAX_NESTED" }
+		ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED" }
+		ERR_GROUP_QM_NOTATION { return "ERR_GROUP_QM_NOTATION" }
 		else { return "ERR_UNKNOWN" }
 	}
 }
@@ -272,6 +276,9 @@ pub const (
 
 	F_EFM = 0x00000100 // exit on first token matched, used by search
 	F_BIN = 0x00000200 // work only on bytes, ignore utf-8
+
+	// behaviour modifier flags
+	//F_OR = 0x00010000 // the OR work with concatenation like PCRE
 )
 
 struct StateDotObj{
@@ -305,6 +312,8 @@ pub mut:
 	group_csave []int = []int // groups continuous save array
 	group_csave_index int= -1 // groups continuous save index
 
+	group_map map[string]int // groups names map
+
 	// flags
 	flag int = 0 // flag for optional parameters
 
@@ -336,6 +345,16 @@ fn (re mut RE) reset(){
 	}
 }
 
+pub fn (re RE) get_group(group_name string) (int, int) {
+	if group_name in re.group_map {
+		tmp_index := re.group_map[group_name]-1
+		start := re.groups[tmp_index*2]
+		end := re.groups[tmp_index*2+1]
+		return start,end
+	}
+	return -1, -1
+}
+
 /******************************************************************************
 *
 * Backslashes chars
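A short usage sketch for the new `get_group()` (illustrative only, mirroring the calls used in the README example above). Note that `group_map` stores `group_id + 1` because a V map lookup returns `0` for a missing key, so `get_group` subtracts 1 before indexing the `groups` array:

```v
import regex

fn main() {
	txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
	mut re, re_err, _ := regex.regex(r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+")
	if re_err == regex.COMPILE_OK {
		start, end := re.match_string(txt)
		println("match: [$start, $end]")
		// resolve a named group to its (start, end) span in the source text
		s, e := re.get_group("format")
		if s >= 0 && e > s {
			println("format => ${txt[s..e]}") // expected: http
		}
	}
}
```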
@@ -631,7 +650,7 @@ enum Quant_parse_state {
 	finish
 }
 
-// parse_quantifier return (min, max, str_len) of a {min,max}? quantifier starting after the { char
+// parse_quantifier returns (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
 fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 	mut status := Quant_parse_state.start
 	mut i := in_i
@@ -748,6 +767,104 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 	return ERR_SYNTAX_ERROR, i, 0, false
 }
 
+//
+// Groups
+//
+enum Group_parse_state {
+	start,
+	q_mark,    // (?
+	q_mark1,   // (?:|P checking
+	p_status,  // (?P
+	p_start,   // (?P<
+	p_end,     // (?P<...>
+	p_in_name, // (?P<...
+	finish
+}
+
+// parse_groups parses a group for the ? (question mark) syntax; if found, it returns (error, capture_flag, name_of_the_group, next_index)
+fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
+	mut status := Group_parse_state.start
+	mut i := in_i
+	mut name := ''
+
+	for i < in_txt.len && status != .finish {
+
+		// get our char
+		char_tmp,char_len := re.get_char(in_txt,i)
+		ch := byte(char_tmp)
+
+		// start
+		if status == .start && ch == `(` {
+			status = .q_mark
+			i += char_len
+			continue
+		}
+
+		// check for question marks
+		if status == .q_mark && ch == `?` {
+			status = .q_mark1
+			i += char_len
+			continue
+		}
+
+		// non capturing group
+		if status == .q_mark1 && ch == `:` {
+			i += char_len
+			return 0, false, name, i
+		}
+
+		// enter in P section
+		if status == .q_mark1 && ch == `P` {
+			status = .p_status
+			i += char_len
+			continue
+		}
+
+		// not a valid q mark found
+		if status == .q_mark1 {
+			//println("NO VALID Q MARK")
+			return -2 , true, name, i
+		}
+
+		if status == .p_status && ch == `<` {
+			status = .p_start
+			i += char_len
+			continue
+		}
+
+		if status == .p_start && ch != `>` {
+			status = .p_in_name
+			name += "${ch:1c}" // TODO: manage utf8 chars
+			i += char_len
+			continue
+		}
+
+		// collect name
+		if status == .p_in_name && ch != `>` && is_alnum(ch) {
+			name += "${ch:1c}" // TODO: manage utf8 chars
+			i += char_len
+			continue
+		}
+
+		// end name
+		if status == .p_in_name && ch == `>` {
+			i += char_len
+			return 0, true, name, i
+		}
+
+		// error on name group
+		if status == .p_in_name {
+			return -2 , true, name, i
+		}
+
+		// normal group, nothing to do, exit
+		return 0 , true, name, i
+	}
+	/* UNREACHABLE */
+	//println("ERROR!! NOT MEANT TO BE HERE!!1")
+	return -2 , true, name, i
+}
+
 //
 // main compiler
 //
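For a quick sense of the contract: called on `"(?P<name>abc)"` at index 0, `parse_groups` walks `(`, `?`, `P`, `<`, collects `name`, stops at `>` and returns `(0, true, "name", 9)`; on `"(?:abc)"` it returns `(0, false, "", 3)`; on a plain `"(abc)"` it falls through to the final return and yields `(0, true, "", 1)`, leaving the normal group handling to the compiler.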
@@ -795,7 +912,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
 			if group_count > re.group_max {
 				return ERR_GROUPS_OVERFLOW,i+1
 			}
 
 			group_stack_index++
-
 			// check max nested groups allowed
@@ -803,17 +919,50 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
 				return ERR_GROUPS_MAX_NESTED,i+1
 			}
 
-			group_count++
+			tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i)
 
+			// manage question mark format error
+			if tmp_res < -1 {
+				return ERR_GROUP_QM_NOTATION,next_i
+			}
+
+			//println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
+			i = next_i
+
+			if cgroup_flag == true {
+				group_count++
+			}
+
+			// calculate the group id
+			// if it is a named group, recycle the group id
+			// NOTE: **** the group index is +1 because map return 0 when not found!! ****
+			mut group_id := group_count
+			if cgroup_name.len > 0 {
+				//println("GROUP NAME: ${cgroup_name}")
+				if cgroup_name in re.group_map{
+					group_id = re.group_map[cgroup_name]-1
+					group_count--
+				} else {
+					re.group_map[cgroup_name] = group_id+1
+				}
+			}
+
 			group_stack_txt_index[group_stack_index] = i
 			group_stack[group_stack_index] = pc
 
 			re.prog[pc].ist = u32(0) | IST_GROUP_START
-			re.prog[pc].group_id = group_count
 			re.prog[pc].rep_min = 1
 			re.prog[pc].rep_max = 1
+
+			// set the group id
+			if cgroup_flag == false {
+				//println("NO CAPTURE GROUP")
+				re.prog[pc].group_id = -1
+			} else {
+				re.prog[pc].group_id = group_id
+			}
+
 			pc = pc + 1
-			i = i + char_len
 			continue
 
 		}
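One consequence of the recycling above: if the same name appears in more than one group, as in `(?P<format>https?)|(?P<format>ftps?)`, the second occurrence reuses the first group's id (and `group_count` is decremented), so both alternatives store their capture into the same slot; the test suite below relies on this by expecting `{'format':0,'token':1}` for such patterns.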
@@ -1099,6 +1248,16 @@ pub fn (re RE) get_code() string {
 			res.write(". DOT_CHAR")
 		} else if ist == IST_GROUP_START {
 			res.write("( GROUP_START #:${tk.group_id}")
+			if tk.group_id == -1 {
+				res.write(" ?:")
+			} else {
+				for x in re.group_map.keys() {
+					if re.group_map[x] == (tk.group_id+1) {
+						res.write(" ?P<${x}>")
+						break
+					}
+				}
+			}
 		} else if ist == IST_GROUP_END {
 			res.write(") GROUP_END #:${tk.group_id}")
 		} else if ist == IST_SIMPLE_CHAR {
@@ -1146,8 +1305,20 @@ pub fn (re RE) get_query() string {
 			if re.debug == 0 {
 				res.write("(")
 			} else {
-				res.write("#${tk.group_id}(")
+				if tk.group_id == -1 {
+					res.write("(?:") // non capturing group
+				} else {
+					res.write("#${tk.group_id}(")
+				}
 			}
+
+			for x in re.group_map.keys() {
+				if re.group_map[x] == (tk.group_id+1) {
+					res.write("?P<${x}>")
+					break
+				}
+			}
+
 			i++
 			continue
 		}
@@ -1400,7 +1571,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					re.prog[tmp_pc].group_rep
 				)
 				*/
-				if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min{
+				if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
 					start_i := group_stack[group_index]
 					group_stack[group_index]=-1
 
@@ -1420,7 +1591,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 						// incrment counter
 						re.group_csave[0]++
 						// save the record
-						re.group_csave[re.group_csave_index++] = g_index // group id
+						re.group_csave[re.group_csave_index++] = g_index >> 1 // group id
 						re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
 						re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
 					}
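The `>> 1` works because `re.groups` stores one `(start, end)` pair per group, so `g_index` always points at the start slot of a pair and equals `group_id * 2`; halving it recovers the group id that goes into the continuous-save records, which are laid out as `[count, id, start, end, id, start, end, ...]`.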
@@ -1545,7 +1716,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				// restore txt index stack and save the group data
 
 				//C.printf("g.id: %d group_index: %d\n", re.prog[pc].group_id, group_index)
-				if group_index >= 0 {
+				if group_index >= 0 && re.prog[pc].group_id >= 0 {
 					start_i := group_stack[group_index]
 					//group_stack[group_index]=-1
 
@@ -1566,7 +1737,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					// incrment counter
 					re.group_csave[0]++
 					// save the record
-					re.group_csave[re.group_csave_index++] = g_index // group id
+					re.group_csave[re.group_csave_index++] = g_index >> 1 // group id
 					re.group_csave[re.group_csave_index++] = re.groups[g_index] // start
 					re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end
 				}
@@ -1709,7 +1880,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				if re.prog[pc].ch == ch
 				{
 					state.match_flag = true
-					l_ist = u32(IST_SIMPLE_CHAR)
+					l_ist = IST_SIMPLE_CHAR
 
 					if first_match < 0 {
 						first_match = i
|
@ -7,9 +7,9 @@ import regex
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
struct TestItem {
|
struct TestItem {
|
||||||
src string
|
src string
|
||||||
q string
|
q string
|
||||||
s int = 0
|
s int = 0
|
||||||
e int = 0
|
e int = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
const(
|
const(
|
||||||
|
@@ -72,6 +72,7 @@ match_test_suite = [
 	TestItem{" pippo pera",r"\s(.*)pe(.*)",0,11},
 	TestItem{" abb",r"\s(.*)",0,4},
 
+
 	// negative
 	TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
 	TestItem{"this is a good.",r"thes",-1,0},
@@ -81,7 +82,6 @@ match_test_suite = [
 	TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0},
 	TestItem{"cpapaz ole. pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0},
 
-
 	// check unicode
 	TestItem{"this is a Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r".*a [Ⅰ-Ⅵ ]+",0,34},
 	TestItem{"123Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r"[Ⅰ-Ⅴ\s]+",3,23},
@@ -90,15 +90,14 @@ match_test_suite = [
 
 struct TestItemFa {
 	src string
 	q string
 	r []int
 }
 
-
 const (
 match_test_suite_fa = [
-
 	// find_all tests
+
 	TestItemFa{
 		"oggi pippo è andato a casa di pluto ed ha trovato pippo",
 		r"p[iplut]+o",
@@ -115,16 +114,13 @@
 
 struct TestItemRe {
 	src string
 	q string
 	rep string
 	r string
 }
-
 const (
 match_test_suite_re = [
-
 	// replace tests
-
 	TestItemRe{
 		"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
 		r"(pi?(ba)+o)",
@@ -140,7 +136,88 @@ match_test_suite_re = [
 ]
 )
 
+struct TestItemCGroup {
+	src string
+	q string
+	s int = 0
+	e int = 0
+	cg []int
+	cgn map[string]int
+}
+const (
+cgroups_test_suite = [
+	TestItemCGroup{
+		"http://www.ciao.mondo/hello/pippo12_/pera.html",
+		r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+",0,46,
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		{'format':0,'token':1}
+	},
+	TestItemCGroup{
+		"http://www.ciao.mondo/hello/pippo12_/pera.html",
+		r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		{'format':0,'token':1}
+	},
+	TestItemCGroup{
+		"http://www.ciao.mondo/hello/pippo12_/pera.html",
+		r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+.)+",0,46,
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		{'format':0}
+	},
+]
+)
+
 fn test_regex(){
+	// check capturing groups
+	for c,to in cgroups_test_suite {
+		// debug print
+		//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)")
+
+		mut re, re_err, err_pos := regex.regex(to.q)
+		re.group_csave = [-1].repeat(3*20+1)
+
+		if re_err == regex.COMPILE_OK {
+			start, end := re.match_string(to.src)
+
+			mut tmp_str := ""
+			if start >= 0 && end > start{
+				tmp_str = to.src[start..end]
+			}
+
+			if start != to.s || end != to.e {
+				println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
+				println("ERROR!")
+				//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
+				assert false
+				break
+			}
+
+			// check cgroups
+			if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
+				println("Capturing group len error!")
+				assert false
+			}
+
+			// check captured groups
+			mut ln := re.group_csave[0]*3
+			for ln > 0 {
+				if re.group_csave[ln] != to.cg[ln] {
+					assert false
+				}
+				ln--
+			}
+
+			// check named captured groups
+			for k in to.cgn.keys() {
+				if to.cgn[k] != (re.group_map[k]-1) { // -1 because the map returns 0 when the key is not found: group ids start from 0 but are stored as id+1
+					println("Named capturing group error! [$k]")
+					assert false
+				}
+			}
+
+		}
+	}
+
 	// check find_all
 	for c,to in match_test_suite_fa{
 		// debug print