regex: new options returning interface to the regex module (#6062)

pull/6093/head
Maciej Obarski 2020-08-08 08:04:12 +02:00 committed by GitHub
parent 664c26ab4b
commit 2a4ef2acbd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 268 additions and 274 deletions

View File

@ -1,4 +1,4 @@
# V RegEx (Regular expression) 0.9d # V RegEx (Regular expression) 0.9g
[TOC] [TOC]
@ -137,7 +137,7 @@ The "capture groups" are store as couple of index in the field `groups` that is
```v ```v
text := "cpaz cpapaz cpapapaz" text := "cpaz cpapaz cpapapaz"
query:= r"(c(pa)+z ?)+" query:= r"(c(pa)+z ?)+"
re, _, _ := regex.regex(query) mut re := regex.regex_opt(query) or { panic(err) }
println(re.get_query()) println(re.get_query())
// #0(c#1(pa)+z ?)+ // #0 and #1 are the ids of the groups, are shown if re.debug is 1 or 2 // #0(c#1(pa)+z ?)+ // #0 and #1 are the ids of the groups, are shown if re.debug is 1 or 2
@ -155,8 +155,6 @@ for gi < re.groups.len {
// groups captured // groups captured
// 0 :[cpapapaz] // 0 :[cpapapaz]
// 1 :[pa] // 1 :[pa]
``` ```
**note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`* **note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`*
@ -187,48 +185,41 @@ fn example2() {
text := "tst: 01,23,45 ,56, 78" text := "tst: 01,23,45 ,56, 78"
query:= r".*:(\s*\d+[\s,]*)+" query:= r".*:(\s*\d+[\s,]*)+"
mut re := regex.new_regex() mut re := new() or { panic(err) }
//re.debug = 2 //re.debug = 2
re.group_csave = [-1].repeat(3*20+1) // we expect max 20 records re.group_csave = [-1].repeat(3*20+1) // we expect max 20 records
re_err, err_pos := re.compile(query) re.compile_opt(query) or { println(err) return }
if re_err == regex.COMPILE_OK {
q_str := re.get_query()
println("Query: $q_str")
start, end := re.match_string(text)
if start < 0 {
println("ERROR : ${re.get_parse_error_string(start)}, $start")
} else {
println("found in [$start, $end] => [${text[start..end]}]")
}
// groups capture q_str := re.get_query()
mut gi := 0 println("Query: $q_str")
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
}
gi += 2
}
// continuous saving start, end := re.match_string(text)
gi = 0 if start < 0 {
println("num: ${re.group_csave[0]}") println("ERROR : ${re.get_parse_error_string(start)}, $start")
for gi < re.group_csave[0] { } else {
id := re.group_csave[1+gi*3] println("found in [$start, $end] => [${text[start..end]}]")
st := re.group_csave[1+gi*3+1] }
en := re.group_csave[1+gi*3+2]
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]") // groups capture
gi++ mut gi := 0
} for gi < re.groups.len {
} else { if re.groups[gi] >= 0 {
println("query: $query") println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
lc := "-".repeat(err_pos) }
println("err : $lc^") gi += 2
err_str := re.get_parse_error_string(re_err) }
println("ERROR: $err_str")
} // continuous saving
gi = 0
println("num: ${re.group_csave[0]}")
for gi < re.group_csave[0] {
id := re.group_csave[1+gi*3]
st := re.group_csave[1+gi*3+1]
en := re.group_csave[1+gi*3+2]
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
gi++
}
} }
``` ```
@ -261,73 +252,65 @@ Have a look at the example for the use of them.
example: example:
```v ```v
import regex
fn main() { fn main() {
test_regex() test_regex()
text := "http://www.ciao.mondo/hello/pippo12_/pera.html" text := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query:= r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+" query:= r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+"
mut re := new_regex() mut re := new()
re.debug = 2 re.debug = 2
// must provide an array of the right size if you want the continuous saving of the groups // must provide an array of the right size if you want the continuous saving of the groups
re.group_csave = [-1].repeat(3*20+1) re.group_csave = [-1].repeat(3*20+1)
re_err, err_pos := re.compile(query) re.compile_opt(query) or { println(err) return }
if re_err == COMPILE_OK {
q_str := re.get_query()
println("O.Query: $query")
println("Query : $q_str")
re.debug = 0
start, end := re.match_string(text)
if start < 0 {
err_str := re.get_parse_error_string(start)
println("ERROR : $err_str, $start")
} else {
text1 := text[start..end]
println("found in [$start, $end] => [$text1]")
}
// groups q_str := re.get_query()
mut gi := 0 println("O.Query: $query")
for gi < re.groups.len { println("Query : $q_str")
if re.groups[gi] >= 0 {
println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]") re.debug = 0
} start, end := re.match_string(text)
gi += 2 if start < 0 {
} err_str := re.get_parse_error_string(start)
// continuous saving println("ERROR : $err_str, $start")
gi = 0 } else {
println("num of group item saved: ${re.group_csave[0]}") text1 := text[start..end]
for gi < re.group_csave[0] { println("found in [$start, $end] => [$text1]")
id := re.group_csave[1+gi*3] }
st := re.group_csave[1+gi*3+1]
en := re.group_csave[1+gi*3+2]
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
gi++
}
println("raw array: ${re.group_csave[0..gi*3+2-1]}")
// named capturing groups // groups
println("named capturing groups:") mut gi := 0
for g_name in re.group_map.keys() { for gi < re.groups.len {
s,e := re.get_group(g_name) if re.groups[gi] >= 0 {
if s >= 0 && e > s { println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
println("'${g_name}':[$s, $e] => '${text[s..e]}'") }
} else { gi += 2
println("Group [${g_name}] doesn't exist.") }
} // continuous saving
} gi = 0
println("num of group item saved: ${re.group_csave[0]}")
} else { for gi < re.group_csave[0] {
println("query: $query") id := re.group_csave[1+gi*3]
lc := "-".repeat(err_pos) st := re.group_csave[1+gi*3+1]
println("err : $lc^") en := re.group_csave[1+gi*3+2]
err_str := re.get_parse_error_string(re_err) println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
println("ERROR: $err_str") gi++
} }
println("raw array: ${re.group_csave[0..gi*3+2-1]}")
// named capturing groups
println("named capturing groups:")
for g_name in re.group_map.keys() {
s,e := re.get_group(g_name)
if s >= 0 && e > s {
println("'${g_name}':[$s, $e] => '${text[s..e]}'")
} else {
println("Group [${g_name}] doesn't exist.")
}
}
} }
``` ```
@ -360,7 +343,7 @@ It is possible to set some flags in the regex parser that change the behavior of
```v ```v
// example of flag settings // example of flag settings
mut re := regex.new_regex() mut re := regex.new()
re.flag = regex.F_BIN re.flag = regex.F_BIN
``` ```
@ -382,22 +365,22 @@ These functions are helper that create the `RE` struct, a `RE` struct can be cre
```v ```v
// regex create a regex object from the query string and compile it // regex create a regex object from the query string and compile it
pub fn regex(in_query string) (RE,int,int) pub fn regex_opt(in_query string) ?RE
``` ```
#### **Base initializer** #### **Base initializer**
```v ```v
// new_regex create a REgex of small size, usually sufficient for ordinary use // new_regex create a REgex of small size, usually sufficient for ordinary use
pub fn new_regex() RE pub fn new() RE
// new_regex_by_size create a REgex of large size, mult specify the scale factor of the memory that will be allocated // new_regex_by_size create a REgex of large size, mult specify the scale factor of the memory that will be allocated
pub fn new_regex_by_size(mult int) RE pub fn new_by_size(mult int) RE
``` ```
After a base initializer is used, the regex expression must be compiled with: After a base initializer is used, the regex expression must be compiled with:
```v ```v
// compile return (return code, index) where index is the index of the error in the query string if return code is an error code // compile_opt compiles the REgex, returning an error if the compilation fails
pub fn (re mut RE) compile(in_txt string) (int,int) pub fn (re mut RE) compile_opt(in_txt string) ?
``` ```
### Operative Functions ### Operative Functions
@ -428,20 +411,9 @@ the following example code show how to visualize the syntax errors in the compil
```v ```v
query:= r"ciao da ab[ab-]" // there is an error, a range not closed!! query:= r"ciao da ab[ab-]" // there is an error, a range not closed!!
mut re := new_regex() mut re := new()
// re_err ==> is the return value, if < 0 it is an error re.compile_opt(query) or { println(err) }
// re_pos ==> if re_err < 0, re_pos is the error index in the query string
re_err, err_pos := re.compile(query)
// print the error if one happen
if re_err != COMPILE_OK {
println("query: $query")
lc := "-".repeat(err_pos)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err) // get the error string
println("ERROR: $err_str")
}
// output!! // output!!
@ -543,7 +515,7 @@ fn custom_print(txt string) {
println("my log: $txt") println("my log: $txt")
} }
mut re := new_regex() mut re := new()
re.log_func = custom_print // every debug output from now will call this function re.log_func = custom_print // every debug output from now will call this function
``` ```
@ -571,38 +543,29 @@ tests = [
fn example() { fn example() {
for c,tst in tests { for c,tst in tests {
mut re := regex.new_regex() mut re := regex.new()
re_err, err_pos := re.compile(tst.query) re.compile_opt(tst.query) or { println(err) continue }
if re_err == regex.COMPILE_OK {
// print the query parsed with the groups ids // print the query parsed with the groups ids
re.debug = 1 // set debug on at minimum level re.debug = 1 // set debug on at minimum level
println("#${c:2d} query parsed: ${re.get_query()}") println("#${c:2d} query parsed: ${re.get_query()}")
re.debug = 0 re.debug = 0
// do the match // do the match
start, end := re.match_string(tst.source) start, end := re.match_string(tst.source)
if start >= 0 && end > start { if start >= 0 && end > start {
println("#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]") println("#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]")
} }
// print the groups // print the groups
mut gi := 0 mut gi := 0
for gi < re.groups.len { for gi < re.groups.len {
if re.groups[gi] >= 0 { if re.groups[gi] >= 0 {
println("group ${gi/2:2d} :[${tst.source[re.groups[gi]..re.groups[gi+1]]}]") println("group ${gi/2:2d} :[${tst.source[re.groups[gi]..re.groups[gi+1]]}]")
} }
gi += 2 gi += 2
} }
println("") println("")
} else {
// print the compile error
println("query: $tst.query")
lc := "-".repeat(err_pos-1)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
}
} }
} }

View File

@ -1,6 +1,6 @@
/* /*
regex 0.9e regex 0.9g
Copyright (c) 2019-2020 Dario Deledda. All rights reserved. Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license Use of this source code is governed by an MIT license
@ -19,7 +19,7 @@ module regex
import strings import strings
pub const( pub const(
v_regex_version = "0.9e" // regex module version v_regex_version = "0.9g" // regex module version
max_code_len = 256 // default small base code len for the regex programs max_code_len = 256 // default small base code len for the regex programs
max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30 max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
@ -912,7 +912,12 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
// main compiler // main compiler
// //
// compile return (return code, index) where index is the index of the error in the query string if return code is an error code // compile return (return code, index) where index is the index of the error in the query string if return code is an error code
[deprecated]
pub fn (mut re RE) compile(in_txt string) (int,int) { pub fn (mut re RE) compile(in_txt string) (int,int) {
return re.impl_compile(in_txt)
}
fn (mut re RE) impl_compile(in_txt string) (int,int) {
mut i := 0 // input string index mut i := 0 // input string index
mut pc := 0 // program counter mut pc := 0 // program counter
mut tmp_code := u32(0) mut tmp_code := u32(0)
@ -2187,6 +2192,7 @@ Public functions
// //
// regex create a regex object from the query string // regex create a regex object from the query string
[deprecated]
pub fn regex(in_query string) (RE,int,int){ pub fn regex(in_query string) (RE,int,int){
mut re := RE{} mut re := RE{}
re.prog = [Token{}].repeat(in_query.len+1) re.prog = [Token{}].repeat(in_query.len+1)
@ -2198,12 +2204,17 @@ pub fn regex(in_query string) (RE,int,int){
} }
// new_regex create a RE of small size, usually sufficient for ordinary use // new_regex create a RE of small size, usually sufficient for ordinary use
[deprecated]
pub fn new_regex() RE { pub fn new_regex() RE {
return new_regex_by_size(1) return impl_new_regex_by_size(1)
} }
// new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated // new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated
[deprecated]
pub fn new_regex_by_size(mult int) RE { pub fn new_regex_by_size(mult int) RE {
return impl_new_regex_by_size(mult)
}
fn impl_new_regex_by_size(mult int) RE {
mut re := RE{} mut re := RE{}
re.prog = [Token{}].repeat(max_code_len*mult) // max program length, default 256 istructions re.prog = [Token{}].repeat(max_code_len*mult) // max program length, default 256 istructions
re.cc = [CharClass{}].repeat(max_code_len*mult) // char class list re.cc = [CharClass{}].repeat(max_code_len*mult) // char class list

View File

@ -0,0 +1,34 @@
module regex
import strings
// compile_opt compiles the RE `pattern` string into the receiver `re`.
// When the compiler reports a code other than `compile_ok`, an error is
// returned whose message has three lines: the query, a marker made of
// `pos` dashes terminated by `^`, and the text given by
// `get_parse_error_string` for the code. The error carries that same code.
pub fn (mut re RE) compile_opt(pattern string) ? {
	code, pos := re.impl_compile(pattern)
	if code != compile_ok {
		// 300 is the initial size handed to the string builder
		mut report := strings.new_builder(300)
		report.writeln("query: $pattern")
		// `pos` is the error index reported by the compiler
		marker := "-".repeat(pos)
		report.writeln("err : ${marker}^")
		reason := re.get_parse_error_string(code)
		report.writeln("ERROR: $reason")
		return error_with_code(report.str(), code)
	}
}
// new creates a RE of small size, usually sufficient for ordinary use.
// It delegates to impl_new_regex_by_size with a scale factor of 1.
pub fn new() RE {
return impl_new_regex_by_size(1)
}
// new_by_size creates a RE of large size; `mult` specifies the scale factor
// of the memory that will be allocated.
// It delegates to impl_new_regex_by_size, passing `mult` through unchanged.
pub fn new_by_size(mult int) RE {
return impl_new_regex_by_size(mult)
}
// regex_opt builds a new RE object with `new()` and compiles `pattern` into it.
// A compilation failure reported by `compile_opt` is propagated to the caller;
// otherwise the compiled RE is returned.
pub fn regex_opt(pattern string) ?RE {
	mut compiled := new()
	// `?` forwards the compile error, if any, to the caller
	compiled.compile_opt(pattern)?
	return compiled
}

View File

@ -175,48 +175,52 @@ fn test_regex(){
// debug print // debug print
//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)") //println("#$c [$to.src] q[$to.q] ($to.s, $to.e)")
mut re, re_err, _ := regex.regex(to.q) mut re := regex.regex_opt(to.q) or {
eprintln('err: $err')
assert false
continue
}
re.group_csave = [-1].repeat(3*20+1) re.group_csave = [-1].repeat(3*20+1)
if re_err == regex.compile_ok { start, end := re.match_string(to.src)
start, end := re.match_string(to.src)
mut tmp_str := "" mut tmp_str := ""
if start >= 0 && end > start{ if start >= 0 && end > start{
tmp_str = to.src[start..end] tmp_str = to.src[start..end]
} }
if start != to.s || end != to.e { if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!") println("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) //C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false assert false
break continue
} }
// check cgroups // check cgroups
if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] { if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
println("Capturing group len error!") println("Capturing group len error!")
assert false
continue
}
// check captured groups
mut ln := re.group_csave[0]*3
for ln > 0 {
if re.group_csave[ln] != to.cg[ln] {
assert false assert false
} }
ln--
}
// check captured groups // check named captured groups
mut ln := re.group_csave[0]*3 for k in to.cgn.keys() {
for ln > 0 { if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
if re.group_csave[ln] != to.cg[ln] { println("Named capturing group error! [$k]")
assert false assert false
} continue
ln--
} }
// check named captured groups
for k in to.cgn.keys() {
if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
println("Named capturing group error! [$k]")
assert false
}
}
} }
} }
@ -225,29 +229,27 @@ fn test_regex(){
// debug print // debug print
//println("#$c [$to.src] q[$to.q] $to.r") //println("#$c [$to.src] q[$to.q] $to.r")
mut re, re_err, err_pos := regex.regex(to.q) mut re := regex.regex_opt(to.q) or {
if re_err == regex.compile_ok { eprintln('err: $err')
res := re.find_all(to.src)
if res.len != to.r.len {
println("ERROR: find_all, array of different size.")
assert false
}
for c1,i in res {
if i != to.r[c1] {
println("ERROR: find_all, different indexes.")
assert false
}
}
} else {
println("query: $to.q")
lc := "-".repeat(err_pos-1)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
assert false assert false
continue
} }
res := re.find_all(to.src)
if res.len != to.r.len {
println("ERROR: find_all, array of different size.")
assert false
continue
}
for c1,i in res {
if i != to.r[c1] {
println("ERROR: find_all, different indexes.")
assert false
continue
}
}
} }
// check replace // check replace
@ -255,97 +257,81 @@ fn test_regex(){
// debug print // debug print
//println("#$c [$to.src] q[$to.q] $to.r") //println("#$c [$to.src] q[$to.q] $to.r")
mut re, re_err, err_pos := regex.regex(to.q) mut re := regex.regex_opt(to.q) or {
if re_err == regex.compile_ok { eprintln('err: $err')
res := re.replace(to.src,to.rep)
if res != to.r {
println("ERROR: replace.")
assert false
}
} else {
println("query: $to.q")
lc := "-".repeat(err_pos-1)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
assert false assert false
continue
} }
res := re.replace(to.src,to.rep)
if res != to.r {
println("ERROR: replace.")
assert false
continue
}
} }
// check match and find // check match and find
for c,to in match_test_suite { for c,to in match_test_suite {
// debug print // debug print
//println("#$c [$to.src] q[$to.q] $to.s") println("#$c [$to.src] q[$to.q] $to.s $to.e")
// test the find // test the find
if to.s > 0 { if to.s > 0 {
mut re, re_err, err_pos := regex.regex(to.q) mut re := regex.regex_opt(to.q) or {
if re_err == regex.compile_ok { eprintln('err: $err')
//q_str := re.get_query()
//println("Query: $q_str")
start,end := re.find(to.src)
if start != to.s || end != to.e {
err_str := re.get_parse_error_string(start)
println("ERROR : $err_str")
assert false
} else {
//tmp_str := text[start..end]
//println("found in [$start, $end] => [$tmp_str]")
assert true
}
} else {
println("query: $to.q")
lc := "-".repeat(err_pos-1)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
assert false assert false
continue
}
// q_str := re.get_query()
// println("Query: $q_str")
start,end := re.find(to.src)
if start != to.s || end != to.e {
err_str := re.get_parse_error_string(start)
println("ERROR : $err_str")
assert false
} else {
//tmp_str := text[start..end]
//println("found in [$start, $end] => [$tmp_str]")
assert true
} }
continue continue
} }
// test the match // test the match
mut re := regex.new_regex() mut re := regex.new()
//re.debug = true //re.debug = true
re_err,err_pos := re.compile(to.q) re.compile_opt(to.q) or {
if re_err == regex.compile_ok { eprintln('err: $err')
//println("#$c [$to.src] q[$to.q]")
start, end := re.match_string(to.src)
mut tmp_str := ""
if start >= 0 && end > start{
tmp_str = to.src[start..end]
}
if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false
break
}
// rerun to test consistency
tmp_str1 := to.src.clone()
start1, end1 := re.match_string(tmp_str1)
if start1 != start || end1 != end {
println("two run ERROR!!")
assert false
break
}
} else {
println("query: $to.q")
lc := "-".repeat(err_pos-1)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
assert false assert false
break continue
} }
//println("#$c [$to.src] q[$to.q]")
start, end := re.match_string(to.src)
mut tmp_str := ""
if start >= 0 && end > start{
tmp_str = to.src[start..end]
}
if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false
continue
}
// rerun to test consistency
tmp_str1 := to.src.clone()
start1, end1 := re.match_string(tmp_str1)
if start1 != start || end1 != end {
println("two run ERROR!!")
assert false
continue
}
} }
} }