From 25fabac059675a1a9348694977d45f8fa58f9f19 Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Thu, 16 Jan 2020 00:39:33 +0100 Subject: [PATCH] regex 0.9c --- vlib/regex/README.md | 411 ++++++++++++++++++++++++++++++++++++++++ vlib/regex/regex.v | 196 ++++++++++++------- vlib/regex/regex_test.v | 1 + 3 files changed, 542 insertions(+), 66 deletions(-) create mode 100644 vlib/regex/README.md diff --git a/vlib/regex/README.md b/vlib/regex/README.md new file mode 100644 index 0000000000..0343aed4d3 --- /dev/null +++ b/vlib/regex/README.md @@ -0,0 +1,411 @@ +# V RegEx (Regular expression) 0.9c + +[TOC] + +## introduction + +Write here the introduction + +## Basic assumption + +In this release, during the writing of the code some assumptions were made and are valid for all the features. + +1. The matching stops at the end of the string, not at the newline chars +2. The basic elements of this regex engine are the tokens; in a query string a simple char is a token. The token is the atomic unit of this regex engine. + +## Match positional limiter + +The module supports the following features: + +- `$` `^` delimiter + + + +`^` (Caret.) Matches the start of the string + +`$` (Dollar.) Matches the end of the string + +## Tokens + +The tokens are the atomic units used by this regex engine and can be one of the following: + +### Simple char + +this token is a simple single character like `a`. + +### Char class (cc) + +The cc matches all the chars specified inside it, it is delimited by square brackets `[ ]` + +the sequence of chars in the class is evaluated with an OR operation. + +For example the following cc `[abc]` matches any char that is either `a` or `b` or `c` but doesn't match `C` or `z`. + +Inside a cc it is possible to specify a "range" of chars, for example `[ad-f]` is equivalent to writing `[adef]`. + +A cc can have different ranges in the same class, like `[a-zA-Z0-9]` that matches all the lowercase, uppercase and numeric chars. 
+ +It is possible to negate the cc using the caret char at the start of the cc like: `[^abc]` that matches every char that is not `a` or `b` or `c`. + +A cc can contain meta-chars like: `[a-z\d]` that matches all the lowercase latin chars `a-z` and all the digits `\d`. + +It is possible to mix all the properties of the char class together. + +### Meta-chars + +A meta-char is specified by a back slash before a char like `\w` in this case the meta-char is `w`. + +A meta-char can match different types of chars. + +* `\w` match an alphanumeric char `[a-zA-Z0-9]` +* `\W` match a non alphanumeric char +* `\d` match a digit `[0-9]` +* `\D` match a non digit +* `\s` match a space char, one of `[' ','\t','\n','\r','\v','\f']` +* `\S` match a non space char +* `\a` match only a lowercase char `[a-z]` +* `\A` match only an uppercase char `[A-Z]` + +### Quantifier + +Each token can have a quantifier that specifies how many times the char can or must be matched. + +**Short quantifier** + +- `?` match 0 or 1 time, `a?b` match both `ab` or `b` +- `+` match at minimum 1 time, `a+` match both `aaa` or `a` +- `*` match 0 or more time, `a*b` match both `aaab` or `ab` or `b` + +**Long quantifier** + +- `{x}` match exactly x time, `a{2}` match `aa` but doesn't match `aaa` or `a` +- `{min,}` match at minimum min time, `a{2,}` match `aaa` or `aa` but doesn't match `a` +- `{,max}` match at least 1 and maximum max time, `a{,2}` match `a` and `aa` but doesn't match `aaa` +- `{min,max}` match from min times to max times, `a{2,3}` match `aa` and `aaa` but doesn't match `a` or `aaaa` + +a long quantifier may have a `greedy` flag that is the `?` char after the brackets, `{2,4}?` means to match at the minimum possible tokens thus 2. + +### dot char + +the dot is a particular meta char that matches "any char", it is simpler to explain it with an example: + +suppose we have `abccc ddeef` as string to parse with regex, the following table shows the query strings and the result of parsing the source string. 
+ +| query string | result | +| ------------ | ------ | +| `.*c` | `abc` | +| `.*dd` | `abccc dd` | +| `ab.*e` | `abccc dde` | +| `ab.{3} .*e` | `abccc dde` | + +the dot char matches any char until the next token match is satisfied. + +### OR token + +the token `|` is a logic OR operation between two consecutive tokens, `a|b` match a char that is `a` or `b`. + +The or token can work in a "chained way": `a|(b)|cd ` test first `a` if the char is not `a` then test the group `(b)` and if the group doesn't match test the token `c`. + +**note: The OR works at token level! It doesn't work at concatenation level!** + +A query string like `abc|bde` is not equal to `(abc)|(bde)`!! + +The OR works only on `c|b` not at char concatenation level. + + + +### Groups + +Groups are a method to create complex patterns with repetition of blocks of tokens. + +The groups are delimited by round brackets `( )`, groups can be nested and can have a quantifier as all the tokens. + +`c(pa)+z` match `cpapaz` or `cpaz` or `cpapapaz` . + +`(c(pa)+z ?)+` match `cpaz cpapaz cpapapaz` or `cpapaz` + +Let's analyze this last case, first we have the group 0 that is the outermost pair of round brackets `(...)+`, this group has a quantifier that says to match its content at least one time `+`. + +After we have a simple char token `c` and a second group that is the number 1 `(pa)+`, this group tries to match the sequence `pa` at least one time as specified by the `+` quantifier. + +After we have another simple token `z` and another simple token ` ?` that is the space char (ascii code 32) with the `?` quantifier that says to capture this char 0 or 1 time + +This explains why the `(c(pa)+z ?)+` query string can match `cpaz cpapaz cpapapaz` . + +In this implementation the groups are capturing groups, which means that the last result for each group can be retrieved from the `RE` struct. 
+ +The captured groups are stored as couples of indexes in the field `groups`, which is an `[]int`: a start index and an end index for each captured group + +**example:** + +```v +text := "cpaz cpapaz cpapapaz" +query:= r"(c(pa)+z ?)+" +re, _, _ := regex.regex(query) + +println(re.get_query()) +// #0(c#1(pa)+z ?)+ // #0 and #1 are the ids of the groups, are shown if re.debug is 1 or 2 + +start, end := re.match_string(text) +// [start=0, end=20] match => [cpaz cpapaz cpapapaz] + +mut gi := 0 +for gi < re.groups.len { + if re.groups[gi] >= 0 { + println("${gi/2} :[${text[re.groups[gi]..re.groups[gi+1]]}]") + } + gi += 2 +} +// groups captured +// 0 :[cpapapaz] +// 1 :[pa] + + +``` + +**note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`* + +## Flags + +It is possible to set some flags in the regex parser that change the behavior of the parser itself. + +```v +// example of flag settings +mut re := regex.new_regex() +re.flag = regex.F_BIN + +``` + +- `F_BIN`: parse a string as bytes, utf-8 management disabled. 
+ +- `F_EFM`: exit on the first char match in the query, used by the find function +- `F_MS`: match only if the index of the start match is 0, same as `^` at the start of query string +- `F_ME`: match only if the end index of the match is the last char of the input string, same as `$` at the end of query string +- `F_NL`: stop the matching if a new line char `\n` or `\r` is found + +## Functions + +### Initializer + +These functions are helpers that create the `RE` struct; the struct can also be created manually if you need it + +**Simplified initializer** + +```v +// regex create a regex object from the query string and compile it +pub fn regex(in_query string) (RE,int,int) +``` + +**Base initializer** + +```v +// new_regex create a REgex of small size, usually sufficient for ordinary use +pub fn new_regex() RE + +// new_regex_by_size create a REgex of large size, mult specify the scale factor of the memory that will be allocated +pub fn new_regex_by_size(mult int) RE +``` +After using a base initializer, the regex expression must be compiled with: +```v +// compile return (return code, index) where index is the index of the error in the query string if return code is an error code +pub fn (re mut RE) compile(in_txt string) (int,int) +``` + +### Functions + +These are the operative functions + +```v +// match_string try to match the input string, return start and end index if found else start is -1 +pub fn (re mut RE) match_string(in_txt string) (int,int) + +// find try to find the first match in the input string, return start and end index if found else start is -1 +pub fn (re mut RE) find(in_txt string) (int,int) + +// find all the non overlapping occurrences of the match pattern, return a list of start end indexes +pub fn (re mut RE) find_all(in_txt string) []int + +// replace return a string where the matches are replaced with the replace string, only non overlapped match are used +pub fn (re mut RE) replace(in_txt string, repl string) string +``` + +## Debugging + +This module 
has a few small utilities to help the writing of regex expressions. + +**Syntax errors highlight** + +the following example code shows how to visualize the syntax errors in the compiling phase: + +```v +query:= r"ciao da ab[ab-]" // there is an error, a range not closed +mut re := new_regex() + +// re_err ==> is the return value, if < 0 it is an error +// err_pos ==> if re_err < 0, err_pos is the error index in the query string +re_err, err_pos := re.compile(query) + +// print the error if one happens +if re_err != COMPILE_OK { + println("query: $query") + lc := "-".repeat(err_pos) + println("err : $lc^") + err_str := re.get_parse_error_string(re_err) // get the error string + println("ERROR: $err_str") +} + +// output!! + +//query: ciao da ab[ab-] +//err : ----------^ +//ERROR: ERR_SYNTAX_ERROR + +``` + +**Compiled code** + +It is possible to view the compiled code by calling the function `get_code()`; the result will be something like this: + +``` +======================================== +v RegEx compiler v 0.9c output: +PC: 0 ist: 7fffffff [a] query_ch { 1, 1} +PC: 1 ist: 7fffffff [b] query_ch { 1,MAX} +PC: 2 ist: 88000000 PROG_END { 0, 0} +======================================== +``` + +`PC`:`int` is the program counter or step of execution, each single step is a token + +`ist`:`hex` is the token instruction id + +`[a]` is the char used by the token + +`query_ch` is the type of token + +`{m,n}` is the quantifier, the greedy flag `?` will be shown if present in the token + +**Log debug** + +The log debugger allows printing the status of the regex parser while the parser is running. + +It is possible to have two different levels of debug: 1 is normal while 2 is verbose. 
+ +here is an example: + +*normal* + +list only the token instructions with the values + +``` +// re.debug = 1 // log level normal +flags: 00000000 +# 2 s: ist_load PC: 0=>7fffffff i,ch,len:[ 0,'a',1] f.m:[ -1, -1] query_ch: [a]{1,1}:0 (#-1) +# 5 s: ist_load PC: 1=>7fffffff i,ch,len:[ 1,'b',1] f.m:[ 0, 0] query_ch: [b]{2,3}:0? (#-1) +# 7 s: ist_load PC: 1=>7fffffff i,ch,len:[ 2,'b',1] f.m:[ 0, 1] query_ch: [b]{2,3}:1? (#-1) +# 10 PROG_END +``` + +*verbose* + +list all the instructions and states of the parser + +``` +flags: 00000000 +# 0 s: start PC: NA +# 1 s: ist_next PC: NA +# 2 s: ist_load PC: 0=>7fffffff i,ch,len:[ 0,'a',1] f.m:[ -1, -1] query_ch: [a]{1,1}:0 (#-1) +# 3 s: ist_quant_p PC: 0=>7fffffff i,ch,len:[ 1,'b',1] f.m:[ 0, 0] query_ch: [a]{1,1}:1 (#-1) +# 4 s: ist_next PC: NA +# 5 s: ist_load PC: 1=>7fffffff i,ch,len:[ 1,'b',1] f.m:[ 0, 0] query_ch: [b]{2,3}:0? (#-1) +# 6 s: ist_quant_p PC: 1=>7fffffff i,ch,len:[ 2,'b',1] f.m:[ 0, 1] query_ch: [b]{2,3}:1? (#-1) +# 7 s: ist_load PC: 1=>7fffffff i,ch,len:[ 2,'b',1] f.m:[ 0, 1] query_ch: [b]{2,3}:1? (#-1) +# 8 s: ist_quant_p PC: 1=>7fffffff i,ch,len:[ 3,'b',1] f.m:[ 0, 2] query_ch: [b]{2,3}:2? 
(#-1) +# 9 s: ist_next PC: NA +# 10 PROG_END +# 11 PROG_END +``` + +the columns have the following meaning: + +`# 2` number of actual steps from the start of parsing + +`s: ist_next` state of the present step + +`PC: 1` program counter of the step + +`=>7fffffff ` hex code of the instruction + +`i,ch,len:[ 0,'a',1]` `i` index in the source string, `ch` the char parsed, `len` the length in bytes of the char parsed + +`f.m:[ 0, 1]` `f` index of the first match in the source string, `m` index that is actual matching + +`query_ch: [b]` token in use and its char + +`{2,3}:1?` quantifier `{min,max}`, `:1` is the actual counter of repetition, `?` is the greedy flag if present + +## Example code + +Here is some simple code that performs some basic matching of strings + +```v +struct TestObj { + source string // source string to parse + query string // regex query string + s int // expected match start index + e int // expected match end index +} +const ( +tests = [ + TestObj{"this is a good.",r"this (\w+) a",0,9}, + TestObj{"this,these,those. over",r"(th[eio]se?[,. ])+",0,17}, + TestObj{"test1@post.pip.com, pera",r"[\w]+@([\w]+\.)+\w+",0,18}, + TestObj{"cpapaz ole. 
pippo,",r".*c.+ole.*pi",0,14}, + TestObj{"adce aabe",r"(a(ab)+)|(a(dc)+)e",0,4}, +] +) + +fn example() { + for c,tst in tests { + mut re := regex.new_regex() + re_err, err_pos := re.compile(tst.query) + if re_err == regex.COMPILE_OK { + + // print the query parsed with the groups ids + re.debug = 1 // set debug on at minimum level + println("#${c:2d} query parsed: ${re.get_query()}") + re.debug = 0 + + // do the match + start, end := re.match_string(tst.source) + if start >= 0 && end > start { + println("#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]") + } + + // print the groups + mut gi := 0 + for gi < re.groups.len { + if re.groups[gi] >= 0 { + println("group ${gi/2:2d} :[${tst.source[re.groups[gi]..re.groups[gi+1]]}]") + } + gi += 2 + } + println("") + } else { + // print the compile error + println("query: $tst.query") + lc := "-".repeat(err_pos-1) + println("err : $lc^") + err_str := re.get_parse_error_string(re_err) + println("ERROR: $err_str") + } + } +} + +fn main() { + example() +} +``` + +more example code is available in the test code for the `regex` module `vlib\regex\regex_test.v`. + diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index 6a58215448..cfc93070b9 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -1,6 +1,6 @@ /********************************************************************** * -* regex 0.9b +* regex 0.9c * * Copyright (c) 2019 Dario Deledda. All rights reserved. 
* Use of this source code is governed by an MIT license @@ -18,7 +18,7 @@ module regex import strings pub const( - V_REGEX_VERSION = "0.9b" // regex module version + V_REGEX_VERSION = "0.9c" // regex module version MAX_CODE_LEN = 256 // default small base code len for the regex programs MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30 @@ -47,7 +47,6 @@ const( //************************************* // regex program instructions //************************************* - SIMPLE_CHAR_MASK = u32(0x80000000) // single char mask IST_SIMPLE_CHAR = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char // char class 11 0100 AA xxxxxxxx @@ -88,9 +87,11 @@ fn utf8util_char_len(b byte) int { // get_char get a char from position i and return an u32 with the unicode code [inline] -fn get_char(in_txt string, i int) (u32,int) { +fn (re RE) get_char(in_txt string, i int) (u32,int) { // ascii 8 bit - if in_txt.str[i] & 0x80 == 0 { + if (re.flag & F_BIN) !=0 || + in_txt.str[i] & 0x80 == 0 + { return u32(in_txt.str[i]), 1 } // unicode char @@ -106,9 +107,11 @@ fn get_char(in_txt string, i int) (u32,int) { // get_charb get a char from position i and return an u32 with the unicode code [inline] -fn get_charb(in_txt byteptr, i int) (u32,int) { +fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) { // ascii 8 bit - if in_txt[i] & 0x80 == 0 { + if (re.flag & F_BIN) !=0 || + in_txt[i] & 0x80 == 0 + { return u32(in_txt[i]), 1 } // unicode char @@ -215,8 +218,7 @@ fn utf8_str(ch u32) string { // simple_log default log function fn simple_log(txt string) { - C.fprintf(C.stdout, "%s",txt.str) - C.fflush(stdout) + print(txt) } /****************************************************************************** @@ -228,9 +230,14 @@ struct Token{ mut: ist u32 = u32(0) + // char + ch u32 = u32(0)// char of the token if any + ch_len byte = byte(0) // char len + // Quantifiers / branch - rep_min int = 0 // used also for jump next in the OR branch 
[no match] pc jump - rep_max int = 0 // used also for jump next in the OR branch [ match] pc jump + rep_min int = 0 // used also for jump next in the OR branch [no match] pc jump + rep_max int = 0 // used also for jump next in the OR branch [ match] pc jump + greedy bool = false // greedy quantifier flag // Char class cc_index int = -1 @@ -240,15 +247,14 @@ mut: // validator function pointer and control char validator fn (byte) bool - v_ch u32 = u32(0) // debug, helper for recreate the query string // groups variables - group_rep int = 0 // repetition of the group - group_id int = -1 // id of the group - goto_pc int = -1 // jump to this PC if is needed + group_rep int = 0 // repetition of the group + group_id int = -1 // id of the group + goto_pc int = -1 // jump to this PC if is needed // OR flag for the token - next_is_or bool = false // true if the next token is an OR + next_is_or bool = false // true if the next token is an OR } fn (tok mut Token) reset() { @@ -262,13 +268,14 @@ fn (tok mut Token) reset() { ******************************************************************************/ pub const ( //F_FND = 0x00000001 // check until the end of the input string, it act like a "find first match", not efficient!! 
- //F_NL = 0x00000002 // end the match when find a new line symbol //F_PM = 0x00000004 // partial match: if the source text finish and the match is positive until then return true + F_NL = 0x00000002 // end the match when find a new line symbol F_MS = 0x00000008 // match true only if the match is at the start of the string F_ME = 0x00000010 // match true only if the match is at the end of the string F_EFM = 0x01000000 // exit on first token matched, used by search + F_BIN = 0x02000000 // work only on bytes, ignore utf-8 ) struct StateDotObj{ @@ -364,7 +371,7 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){ for i < in_txt.len { // get our char - char_tmp,char_len := get_char(in_txt,i) + char_tmp,char_len := re.get_char(in_txt,i) ch := byte(char_tmp) if status == .start && ch == `\\` { @@ -512,7 +519,7 @@ fn (re mut RE) parse_char_class(in_txt string, in_i int) (int, int, u32) { } // get our char - char_tmp,char_len := get_char(in_txt,i) + char_tmp,char_len := re.get_char(in_txt,i) ch := byte(char_tmp) //C.printf("CC #%3d ch: %c\n",i,ch) @@ -614,11 +621,13 @@ enum Quant_parse_state { min_parse, comma_checked, max_parse, + greedy, + gredy_parse, finish } -// parse_quantifier return (min, max, str_len) of a {min,max} quantifier starting after the { char -fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int) { +// parse_quantifier return (min, max, str_len) of a {min,max}? 
quantifier starting after the { char +fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) { mut status := Quant_parse_state.start mut i := in_i @@ -634,7 +643,7 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int) { // exit on no compatible char with {} quantifier if utf8util_char_len(ch) != 1 { - return ERR_SYNTAX_ERROR,i,0 + return ERR_SYNTAX_ERROR,i,0,false } // min parsing skip if comma present @@ -670,13 +679,17 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int) { // single value {4} if status == .min_parse && ch == `}` { q_max = q_min - return q_min, q_max, i-in_i+2 + + status = .greedy + continue } // end without max if status == .comma_checked && ch == `}` { q_max = MAX_QUANTIFIER - return q_min, q_max, i-in_i+2 + + status = .greedy + continue } // start max parsing @@ -696,17 +709,40 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int) { continue } - // end the parsing + // finished the quantifier if status == .max_parse && ch == `}` { - return q_min, q_max, i-in_i+2 + status = .greedy + continue } - + + // check if greedy flag char ? 
is present + if status == .greedy { + if i+1 < in_txt.len { + i++ + status = .gredy_parse + continue + } + return q_min, q_max, i-in_i+2, false + } + + // check the greedy flag + if status == .gredy_parse { + if ch == `?` { + return q_min, q_max, i-in_i+2, true + } else { + i-- + return q_min, q_max, i-in_i+2, false + } + } + + + // not a {} quantifier, exit - return ERR_SYNTAX_ERROR,i,0 + return ERR_SYNTAX_ERROR, i, 0, false } // not a conform {} quantifier - return ERR_SYNTAX_ERROR,i,0 + return ERR_SYNTAX_ERROR, i, 0, false } // @@ -733,7 +769,7 @@ pub fn (re mut RE) compile(in_txt string) (int,int) { mut char_len := 0 //C.printf("i: %3d ch: %c\n", i, in_txt.str[i]) - char_tmp,char_len = get_char(in_txt,i) + char_tmp,char_len = re.get_char(in_txt,i) // // check special cases: $ ^ @@ -848,13 +884,14 @@ pub fn (re mut RE) compile(in_txt string) (int,int) { } `{` { - min,max,tmp := re.parse_quantifier(in_txt, i+1) + min, max, tmp, greedy := re.parse_quantifier(in_txt, i+1) // it is a quantifier if min >= 0 { - //C.printf("{%d,%d}\n str:[%s]\n",min,max,in_txt[i..i+tmp]) + //C.printf("{%d,%d}\n str:[%s] greedy: %d\n", min, max, in_txt[i..i+tmp], greedy) i = i + tmp re.prog[pc-1].rep_min = min re.prog[pc-1].rep_max = max + re.prog[pc-1].greedy = greedy continue } else { @@ -879,7 +916,7 @@ pub fn (re mut RE) compile(in_txt string) (int,int) { } } - // IST_CHAR_CLASS + // IST_CHAR_CLASS_* if char_len==1 && pc >= 0{ if byte(char_tmp) == `[` { cc_index,tmp,cc_type := re.parse_char_class(in_txt, i+1) @@ -912,14 +949,14 @@ pub fn (re mut RE) compile(in_txt string) (int,int) { re.prog[pc].rep_min = 1 re.prog[pc].rep_max = 1 re.prog[pc].validator = BSLS_VALIDATOR_ARRAY[bsls_index].validator - re.prog[pc].v_ch = BSLS_VALIDATOR_ARRAY[bsls_index].ch + re.prog[pc].ch = BSLS_VALIDATOR_ARRAY[bsls_index].ch pc = pc + 1 continue } // this is an escape char, skip the bsls and continue as a normal char else if bsls_index == NO_MATCH_FOUND { i += char_len - char_tmp,char_len = 
get_char(in_txt,i) + char_tmp,char_len = re.get_char(in_txt,i) // continue as simple char } // if not an escape or a bsls char then it is an error (at least for now!) @@ -930,8 +967,9 @@ pub fn (re mut RE) compile(in_txt string) (int,int) { } // IST_SIMPLE_CHAR - tmp_code = (tmp_code | char_tmp) & IST_SIMPLE_CHAR - re.prog[pc].ist = tmp_code + re.prog[pc].ist = IST_SIMPLE_CHAR + re.prog[pc].ch = char_tmp + re.prog[pc].ch_len = char_len re.prog[pc].rep_min = 1 re.prog[pc].rep_max = 1 //C.printf("char: %c\n",char_tmp) @@ -1044,7 +1082,7 @@ pub fn (re RE) get_code() string { res.write(" ") ist :=re.prog[pc1].ist if ist == IST_BSLS_CHAR { - res.write("[\\${re.prog[pc1].v_ch:1c}] BSLS") + res.write("[\\${re.prog[pc1].ch:1c}] BSLS") } else if ist == IST_PROG_END { res.write("PROG_END") stop_flag = true @@ -1060,8 +1098,8 @@ pub fn (re RE) get_code() string { res.write("( GROUP_START #:${re.prog[pc1].group_id}") } else if ist == IST_GROUP_END { res.write(") GROUP_END #:${re.prog[pc1].group_id}") - } else if ist & SIMPLE_CHAR_MASK == 0 { - res.write("[${ist & IST_SIMPLE_CHAR:1c}] query_ch") + } else if ist == IST_SIMPLE_CHAR { + res.write("[${re.prog[pc1].ch:1c}] query_ch") } if re.prog[pc1].rep_max == MAX_QUANTIFIER { @@ -1072,6 +1110,9 @@ pub fn (re RE) get_code() string { } else { res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}") } + if re.prog[pc1].greedy == true { + res.write("?") + } } res.write("\n") if stop_flag { @@ -1136,7 +1177,7 @@ pub fn (re RE) get_query() string { // bsls char if ch == IST_BSLS_CHAR { - res.write("\\${re.prog[i].v_ch:1c}") + res.write("\\${re.prog[i].ch:1c}") } // IST_DOT_CHAR @@ -1145,11 +1186,11 @@ pub fn (re RE) get_query() string { } // char alone - if ch & SIMPLE_CHAR_MASK == 0 { + if ch == IST_SIMPLE_CHAR { if byte(ch) in BSLS_ESCAPE_LIST { res.write("\\") } - res.write("${re.prog[i].ist:c}") + res.write("${re.prog[i].ch:c}") } // quantifier @@ -1166,6 +1207,9 @@ pub fn (re RE) get_query() string { } else { 
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}") } + if re.prog[i].greedy == true { + res.write("?") + } } } @@ -1187,10 +1231,11 @@ enum match_state{ start = 0, stop, end, + new_line, - ist_load, // load and execute istruction - ist_next, // go to next istruction - ist_next_ks, // go to next istruction without clenaning the state + ist_load, // load and execute instruction + ist_next, // go to next instruction + ist_next_ks, // go to next instruction without clenaning the state ist_quant_p, // match positive ,quantifier check ist_quant_n, // match negative, quantifier check ist_quant_pg, // match positive ,group quantifier check @@ -1202,6 +1247,7 @@ fn state_str(s match_state) string { .start { return "start" } .stop { return "stop" } .end { return "end" } + .new_line { return "new line" } .ist_load { return "ist_load" } .ist_next { return "ist_next" } @@ -1277,7 +1323,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { re.log_func(buf2.str()) }else{ - // print only the exe istruction + // print only the exe instruction if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 { @@ -1287,23 +1333,17 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { else if ist == 0 || m_state in [.start,.ist_next,.stop] { buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n") }else{ - ch, char_len = get_charb(in_txt,i) + ch, char_len = re.get_charb(in_txt,i) buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${pc:3d}=>") buf2.write("${ist:8x}".replace(" ","0")) buf2.write(" i,ch,len:[${i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${first_match:3d},${state.match_index:3d}] ") - if ist & SIMPLE_CHAR_MASK == 0 { - if char_len < 4 { - tmp_c := ist & IST_SIMPLE_CHAR - buf2.write("query_ch: [${tmp_c:1c}]") - } else { - tmp_c := ist | IST_SIMPLE_CHAR - buf2.write("query_ch: [${tmp_c:1c}]") - } + if ist == IST_SIMPLE_CHAR { + buf2.write("query_ch: [${re.prog[pc].ch:1c}]") } else { if ist == 
IST_BSLS_CHAR { - buf2.write("BSLS [\\${re.prog[pc].v_ch:1c}]") + buf2.write("BSLS [\\${re.prog[pc].ch:1c}]") } else if ist == IST_PROG_END { buf2.write("PROG_END") } else if ist == IST_OR_BRANCH { @@ -1327,6 +1367,9 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { } else { buf2.write("{${re.prog[pc].rep_min},${re.prog[pc].rep_max}}:${re.prog[pc].rep}") } + if re.prog[pc].greedy == true { + buf2.write("?") + } buf2.write(" (#${group_index})\n") } re.log_func(buf2.str()) @@ -1338,7 +1381,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { //****************************************** // we're out of text, manage it - if i >= in_txt_len { + if i >= in_txt_len || m_state == .new_line { // manage groups if group_index >= 0 && state.match_index >= 0 { @@ -1376,7 +1419,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // manage IST_DOT_CHAR if re.state_stack_index >= 0 { //C.printf("DOT CHAR text end management!\n") - // if DOT CHAR is not the last istruction and we are still going, then no match!! + // if DOT CHAR is not the last instruction and we are still going, then no match!! 
if pc < re.prog.len && re.prog[pc+1].ist != IST_PROG_END { return NO_MATCH_FOUND,0 } @@ -1395,7 +1438,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { continue } - // ist_next, next istruction reseting its state + // ist_next, next instruction reseting its state if m_state == .ist_next { pc = pc + 1 re.prog[pc].reset() @@ -1408,7 +1451,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { continue } - // ist_next_ks, next istruction keeping its state + // ist_next_ks, next instruction keeping its state if m_state == .ist_next_ks { pc = pc + 1 // check if we are in the program bounds @@ -1421,7 +1464,13 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { } // load the char - ch, char_len = get_charb(in_txt,i) + ch, char_len = re.get_charb(in_txt,i) + + // check new line if flag F_NL enabled + if (re.flag & F_NL) != 0 && char_len == 1 && byte(ch) in NEW_LINE_LIST { + m_state = .new_line + continue + } // check if stop if m_state == .stop { @@ -1547,7 +1596,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { m_state = .ist_next continue } - // IST_DOT_CHAR is the last istruction, get all + // IST_DOT_CHAR is the last instruction, get all else { //C.printf("We are the last one!\n") pc-- @@ -1613,12 +1662,11 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { } // simple char IST - else if ist & IST_SIMPLE_CHAR != 0 { + else if ist == IST_SIMPLE_CHAR { //C.printf("IST_SIMPLE_CHAR\n") state.match_flag = false - if (char_len<4 && ist == ch) || - (char_len == 4 && (ist | SIMPLE_CHAR_MASK) == ch ) + if re.prog[pc].ch == ch { state.match_flag = true @@ -1749,6 +1797,15 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { } else if rep >= re.prog[tmp_pc].rep_min { //C.printf("ist_quant_pg IN RANGE group_index:%d\n", group_index) + + // check greedy flag, if true exit on minimum + if re.prog[tmp_pc].greedy == true { + 
re.prog[tmp_pc].group_rep = 0 // clear the repetitions + group_index-- + m_state = .ist_next + continue + } + pc = re.prog[tmp_pc].goto_pc - 1 group_index-- m_state = .ist_next @@ -1832,6 +1889,13 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // range ok, continue loop else if rep >= re.prog[pc].rep_min && rep < re.prog[pc].rep_max { //C.printf("ist_quant_p IN RANGE\n") + + // check greedy flag, if true exit on minimum + if re.prog[pc].greedy == true { + m_state = .ist_next + continue + } + m_state = .ist_load continue } diff --git a/vlib/regex/regex_test.v b/vlib/regex/regex_test.v index e29ca428c5..9248a1106d 100644 --- a/vlib/regex/regex_test.v +++ b/vlib/regex/regex_test.v @@ -65,6 +65,7 @@ match_test_suite = [ TestItem{"cpapaz ole. pippo,",r".*c.+ole.*pi",0,14}, TestItem{"cpapaz ole. pipipo,",r".*c.+ole.*p([ip])+o",0,18}, TestItem{"cpapaz ole. pipipo",r"^.*c.+ol?e.*p([ip])+o$",0,18}, + TestItem{"abbb",r"ab{2,3}?",0,3}, // negative TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},