From f81654e3a7b3aa0703920ed36ce762d4fa6c2aaf Mon Sep 17 00:00:00 2001
From: Delyan Angelov
Date: Mon, 20 Dec 2021 14:15:51 +0200
Subject: [PATCH] builtin: add `s.match_glob(wildcard_pattern)`

---
 vlib/builtin/string.v                 | 110 ++++++++++++++++++++++++++
 vlib/builtin/string_match_glob_test.v | 117 ++++++++++++++++++++++++++
 2 files changed, 227 insertions(+)
 create mode 100644 vlib/builtin/string_match_glob_test.v

diff --git a/vlib/builtin/string.v b/vlib/builtin/string.v
index af1008d032..d3d4c60a9b 100644
--- a/vlib/builtin/string.v
+++ b/vlib/builtin/string.v
@@ -1753,3 +1753,113 @@ pub fn (s string) strip_margin_custom(del byte) string {
 		return ret.vstring_with_len(count)
 	}
 }
+
+// match_glob matches the string, with a Unix shell-style wildcard pattern.
+// NB: wildcard patterns are NOT the same as regular expressions.
+// They are much simpler, and do not allow backtracking, captures, etc.
+// The special characters used in shell-style wildcards are:
+// `*` - matches any sequence of characters, including the empty one
+// `?` - matches any single character
+// `[seq]` - matches any of the characters in the sequence
+// `[^seq]` - matches any character that is NOT in the sequence
+// A set always stands for exactly one character of the string. It is closed
+// by the first `]` after its start; a set that is never closed extends to
+// the end of the pattern.
+// Any other character in `pattern`, is matched 1:1 to the corresponding
+// character in `name`, including / and \.
+// You can wrap the meta-characters in brackets too, i.e. `[?]` matches `?`
+// in the string, and `[*]` matches `*` in the string.
+// NB: both strings are compared byte by byte, so `?` and the sets match
+// a single byte, not a whole multi-byte UTF-8 character.
+// Example: assert 'ABCD'.match_glob('AB*')
+// Example: assert 'ABCD'.match_glob('*D')
+// Example: assert 'ABCD'.match_glob('*B*')
+// Example: assert !'ABCD'.match_glob('AB')
+// Example: assert 'ABCD'.match_glob('A[BX]CD')
+// Example: assert !'ABCD'.match_glob('A[^BX]CD')
+[direct_array_access]
+pub fn (name string) match_glob(pattern string) bool {
+	// Initial port based on https://research.swtch.com/glob.go
+	// See also https://research.swtch.com/glob
+	// px and nx are the current positions in `pattern` and in `name`.
+	// next_px and next_nx are the positions to restart from after a mismatch:
+	// the last seen `*`, retried one character further into `name`.
+	// next_nx stays 0 until a `*` has been seen.
+	mut px := 0
+	mut nx := 0
+	mut next_px := 0
+	mut next_nx := 0
+	plen := pattern.len
+	nlen := name.len
+	for px < plen || nx < nlen {
+		if px < plen {
+			c := pattern[px]
+			match c {
+				`?` {
+					// single-character wildcard
+					if nx < nlen {
+						px++
+						nx++
+						continue
+					}
+				}
+				`*` {
+					// zero-or-more-character wildcard
+					// Try to match at nx.
+					// If that doesn't work out, restart at nx+1 next.
+					next_px = px
+					next_nx = nx + 1
+					px++
+					continue
+				}
+				`[` {
+					// a character set, that has to consume one character of `name`
+					if nx < nlen {
+						wanted_c := name[nx]
+						mut set_idx := px + 1
+						mut is_inverted := false
+						if set_idx < plen && pattern[set_idx] == `^` {
+							is_inverted = true
+							set_idx++
+						}
+						// Always scan to the end of the set, so that set_idx
+						// stops on the closing `]`, or at the end of the pattern.
+						mut is_in_set := false
+						for set_idx < plen && pattern[set_idx] != `]` {
+							if pattern[set_idx] == wanted_c {
+								is_in_set = true
+							}
+							set_idx++
+						}
+						if is_in_set != is_inverted {
+							// the set accepts wanted_c; step over the closing `]` too
+							px = if set_idx < plen { set_idx + 1 } else { plen }
+							nx++
+							continue
+						}
+					}
+					// Either `name` is exhausted, or the set rejects the current character.
+					// That is an ordinary mismatch, so fall through to the restart
+					// logic below, instead of consuming input or failing right away.
+				}
+				else {
+					// an ordinary character
+					if nx < nlen && name[nx] == c {
+						px++
+						nx++
+						continue
+					}
+				}
+			}
+		}
+		if 0 < next_nx && next_nx <= nlen {
+			// A mismatch, try restarting:
+			px = next_px
+			nx = next_nx
+			continue
+		}
+		return false
+	}
+	// Matched all of `pattern` to all of `name`
+	return true
+}
diff --git a/vlib/builtin/string_match_glob_test.v b/vlib/builtin/string_match_glob_test.v
new file mode 100644
index 0000000000..b08424f109
--- /dev/null
+++ b/vlib/builtin/string_match_glob_test.v
@@ -0,0 +1,117 @@
+import time
+
+fn test_match_glob_on_empty_string() {
+	assert ''.match_glob('')
+	assert !''.match_glob('x')
+}
+
+fn test_match_glob_on_x() {
+	assert !'x'.match_glob('')
+	assert 'x'.match_glob('x')
+	assert 'xxx'.match_glob('*x')
+	assert 'xxx'.match_glob('x*')
+}
+
+fn test_match_glob_on_abc() {
+	assert !'abc'.match_glob('')
+	assert 'abc'.match_glob('*')
+	//
+	assert !'abc'.match_glob('ab')
+	assert 'abc'.match_glob('abc')
+	assert 'abc'.match_glob('abc*')
+	//
+	assert 'abc'.match_glob('*c')
+	assert !'abc'.match_glob('*b')
+	assert 'abc'.match_glob('*bc')
+	assert 'abc'.match_glob('*abc')
+	//
+	assert 'abc'.match_glob('a*')
+	assert !'abc'.match_glob('b*')
+	assert 'abc'.match_glob('a*c')
+	//
+	assert 'abc'.match_glob('ab?')
+	assert 'abc'.match_glob('a??')
+	assert 'abc'.match_glob('???')
+	assert !'abc'.match_glob('??')
+	assert !'abc'.match_glob('?')
+}
+
+fn test_match_glob_on_a() {
+	assert 'a'.match_glob('a')
+	assert 'a'.match_glob('?')
+	assert !'a'.match_glob('??')
+	assert 'a'.match_glob('*')
+	assert 'a'.match_glob('a*')
+	assert 'a'.match_glob('*a')
+}
+
+fn test_match_glob_with_any_charset_patterns() {
+	assert 'axbxcxdxe'.match_glob('*c[xyz]d*')
+	assert 'axbxcxdxe'.match_glob('*c[yxz]d*')
+	assert 'axbxcxdxe'.match_glob('*c[zyx]d*')
+	//
+	assert 'axbxcxdxe'.match_glob('*dx[QeW]')
+	assert 'axbxcxdxe'.match_glob('*dx[QeW]*')
+	//
+	assert !'axbxcxdxe'.match_glob('*bx[QcW]')
+	assert 'axbxcxdxe'.match_glob('*bx[QcW]*')
+	//
+	assert !'axbxcxdxe'.match_glob('*zx[QeW]')
+	assert !'axbxcxdxe'.match_glob('*zx[QeW]*')
+}
+
+fn test_match_glob_with_none_of_charset_patterns() {
+	assert 'axbxcxdxe'.match_glob('*c[^XYZ]d*')
+	assert !'axbxcxdxe'.match_glob('*c[^xYZ]d*')
+	assert !'axbxcxdxe'.match_glob('*c[^YxZ]d*')
+	assert !'axbxcxdxe'.match_glob('*c[^YZx]d*')
+}
+
+fn test_match_glob_with_escaped_metachars() {
+	assert 'axbx?cxdxe'.match_glob('*x[?]c*')
+	assert !'axbxXcxdxe'.match_glob('*x[?]c*')
+	assert 'zaxbx*cxdxez'.match_glob('*x[Q*W]c*')
+	assert 'zaxbx*cxdxez'.match_glob('*x[QW*]c*')
+	assert 'zaxbx*cxdxez'.match_glob('*bx[*QW]c*')
+	assert 'zaxbW*cxdxez'.match_glob('*W[*nmk]c*')
+	assert 'zaxbW*cxdxez'.match_glob('*W[n*mk]c*')
+	assert 'zaxbW*cxdxez'.match_glob('*W[nm*k]c*')
+	assert 'zaxbW*cxdxez'.match_glob('*W[nmk*]c*')
+}
+
+fn test_match_glob_with_rejecting_charsets() {
+	// a set that rejects the current character is a mismatch,
+	// it must not consume that character and continue inside the set:
+	assert !'axyz]'.match_glob('[xyz]')
+	assert !'a'.match_glob('[xyz]')
+	// a rejected `[^seq]` set has to allow restarting after a previous `*`:
+	assert 'cxcad'.match_glob('*c[^x]d')
+	assert !'cxcxd'.match_glob('*c[^x]d')
+	// a set needs a character to match, even when it is not closed:
+	assert !''.match_glob('[')
+	assert !'a'.match_glob('a[')
+	assert 'ab'.match_glob('a[b')
+}
+
+fn test_match_glob_with_complex_patterns() {
+	assert 'axbxcxdxe'.match_glob('*xdx*')
+	assert !'axbxcxdxe'.match_glob('*xzx*')
+	assert 'axbxcxdxe'.match_glob('a*b*c*d*e*')
+	assert 'axbxcxdxexxx'.match_glob('a*b*c*d*e*')
+	assert 'abxbbxdbxebxczzx'.match_glob('a*b?c*x')
+	assert !'abxbbxdbxebxczzy'.match_glob('a*b?c*x')
+}
+
+fn test_match_glob_search_is_linear() {
+	// NB: these are pathological cases, when matches are performed
+	// using the exponential recursive approach, that can take many
+	// seconds, even minutes, but take usually only microseconds,
+	// using the linear approach from https://research.swtch.com/glob
+	// that does not backtrack.
+	long_a := 'a'.repeat(500)
+	sw := time.new_stopwatch()
+	assert !long_a.match_glob('a*a*a*a*b')
+	assert sw.elapsed().milliseconds() < 10
+	assert !long_a.match_glob('a*a*a*a*a*a*a*a*a*b')
+	assert sw.elapsed().milliseconds() < 10
+}