diff --git a/Cargo.toml b/Cargo.toml index 17f77a9..c5820bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,8 @@ [package] name = "lua-patterns" -version = "0.2.0" +version = "0.3.0" authors = ["steve donovan "] description = "Binding to Lua String Patterns" -build = "build.rs" license = "MIT" repository = "https://github.com/stevedonovan/lua-patterns" documentation = "https://docs.rs/lua-patterns" @@ -12,7 +11,5 @@ keywords = ["string","matching","lua"] categories = ["parsing","api-bindings"] -[build-dependencies] -gcc="0.3" diff --git a/build.rs b/build.rs deleted file mode 100644 index 2290b75..0000000 --- a/build.rs +++ /dev/null @@ -1,5 +0,0 @@ -extern crate gcc; - -fn main() { - gcc::compile_library("liblua-str.a", &["src/lua-str.c"]); -} diff --git a/examples/errors.rs b/examples/errors.rs index ee361ac..1fa52ea 100644 --- a/examples/errors.rs +++ b/examples/errors.rs @@ -1,4 +1,5 @@ extern crate lua_patterns; +use lua_patterns::errors::PatternError; fn main() { let bad = [ @@ -8,12 +9,12 @@ fn main() { ("bonzo (dog (cat)","unfinished capture"), ("frodo %f[%A","malformed pattern (missing ']')"), ("frodo (1) (2(3)%2)%1","invalid capture index %2"), - ]; - - fn error(s: &str) -> lua_patterns::PatternError { - lua_patterns::PatternError(s.into()) + ]; + + fn error(s: &str) -> PatternError { + PatternError(s.into()) } - + for p in bad.iter() { let res = lua_patterns::LuaPattern::new_try(p.0); if let Err(e) = res { @@ -21,6 +22,6 @@ fn main() { } else { println!("'{}' was fine",p.0); } - } - + } + } diff --git a/examples/iter.rs b/examples/iter.rs index 77d0662..80196e4 100644 --- a/examples/iter.rs +++ b/examples/iter.rs @@ -8,7 +8,6 @@ fn main() { //~ m.matches("hello"); //~ println!("ok"); - ///* let mut m = lp::LuaPattern::new("(%a+)"); let mut iter = m.gmatch("one two three"); assert_eq!(iter.next(), Some("one")); diff --git a/src/errors.rs b/src/errors.rs new file mode 100644 index 0000000..c15a544 --- /dev/null +++ b/src/errors.rs @@ -0,0 +1,19 @@ +use std::fmt; +use std::error::Error; + +/// Error type returned by _try methods +#[derive(Debug,PartialEq)] +pub struct PatternError(pub String); + +impl fmt::Display for PatternError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f,"{}",self.0) + } +} + +impl Error for PatternError { + fn description(&self) -> &str { + &self.0 + } +} + diff --git a/src/lib.rs b/src/lib.rs index b6de483..06ba3c7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,70 +41,26 @@ //! } //! ``` -use std::ptr; use std::ops; -use std::os::raw::{c_int,c_char,c_uint}; -use std::ffi::CStr; -#[repr(C)] -struct LuaMatch { - start: c_int, - end: c_int -} +pub mod errors; +use errors::*; +mod luapat; +use luapat::*; -static LUA_MAXCAPTURES: usize = 32; - -use std::fmt; -use std::error::Error; - -/// Error type returned by _try methods -#[derive(Debug,PartialEq)] -pub struct PatternError(String); - -impl fmt::Display for PatternError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f,"{}",self.0) - } -} - -impl Error for PatternError { - fn description(&self) -> &str { - &self.0 - } -} - -#[link(name = "lua-str", kind="static")] -extern { - fn str_match ( - s: *const u8, ls: c_uint, p: *const u8, lp: c_uint, - err_msg: *mut *mut c_char, - mm: *mut LuaMatch - ) -> c_int; - - fn str_check ( - p: *const u8, lp: c_uint - ) -> *const c_char; -} /// Represents a Lua string pattern and the results of a match pub struct LuaPattern<'a> { patt: &'a [u8], - matches: Vec, + matches: [LuaMatch; LUA_MAXCAPTURES], n_match: usize } impl <'a> LuaPattern<'a> { /// Maybe create a new Lua pattern from a slice of bytes pub fn from_bytes_try (bytes: &'a [u8]) -> Result,PatternError> { - let mut matches: Vec = Vec::with_capacity(LUA_MAXCAPTURES); - unsafe { - let res = str_check(bytes.as_ptr(),bytes.len() as c_uint); - if ! res.is_null() { - let sres = CStr::from_ptr(res).to_str().unwrap().to_string(); - return Err(PatternError(sres)); - } - } - unsafe {matches.set_len(LUA_MAXCAPTURES);} + str_check(bytes)?; + let matches = [LuaMatch{start: 0, end: 0}; LUA_MAXCAPTURES]; Ok(LuaPattern{patt: bytes, matches: matches, n_match: 0}) } @@ -133,20 +89,7 @@ impl <'a> LuaPattern<'a> { /// assert_eq!(&bytes[m.range()], &[0xFE,0xEE,0xEE,0xED]); /// ``` pub fn matches_bytes(&mut self, s: &[u8]) -> bool { - let c_ptr: *mut c_char = ptr::null_mut(); - let pvoid = Box::into_raw(Box::new(c_ptr)); - let err_msg : *mut *mut c_char = pvoid; - - unsafe { - self.n_match = str_match(s.as_ptr(),s.len() as c_uint, - self.patt.as_ptr(),self.patt.len() as c_uint, - err_msg, self.matches.as_mut_ptr()) as usize; - let ep = *err_msg; - if ! ep.is_null() { - panic!(format!("REPORT AS BUG: lua-pattern {:?}",CStr::from_ptr(ep))); - } - } - + self.n_match = str_match(s,self.patt,&mut self.matches).expect("Should not fail - report as bug"); self.n_match > 0 } @@ -236,7 +179,7 @@ impl <'a> LuaPattern<'a> { /// assert_eq!(cc.get(1), "hello"); /// } /// ``` - pub fn match_captures<'b,'c>(&'c mut self, text: &'b str) -> Captures<'a,'b,'c> { + pub fn match_captures<'b,'c>(&'c self, text: &'b str) -> Captures<'a,'b,'c> { Captures {m: self, text: text} } @@ -457,19 +400,33 @@ pub fn generate_gsub_patterns(repl: &str) -> Vec { res } -pub fn subst(patt: &mut LuaPattern, text: &str, repl: &Vec) -> String { - let mut res = String::new(); - let captures = patt.match_captures(text); - for r in repl { - match *r { - Subst::Text(ref s) => res.push_str(&s), - Subst::Capture(i) => res.push_str(captures.get(i)) +pub struct Substitute { + repl: Vec +} + +impl Substitute { + pub fn new(repl: &str) -> Substitute { + Substitute{ + repl: generate_gsub_patterns(repl) } } - res + + pub fn subst(&self, patt: &LuaPattern, text: &str) -> String { + let mut res = String::new(); + let captures = patt.match_captures(text); + for r in &self.repl { + match *r { + Subst::Text(ref s) => res.push_str(&s), + Subst::Capture(i) => res.push_str(captures.get(i)) + } + } + res + } + } + /// Low-overhead convenient access to string match captures // note: there are three borrows going on here. // The lifetime 'a is for the _pattern_, the lifetime 'b is diff --git a/src/lua-str.c b/src/lua-str.c deleted file mode 100644 index 42d3b6d..0000000 --- a/src/lua-str.c +++ /dev/null @@ -1,501 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -typedef struct LuaMatch { - int start; - int end; -} LuaMatch; - - -/* macro to `unsign' a character */ -#define uchar(c) ((unsigned char)(c)) - - -/* -** {====================================================== -** PATTERN MATCHING -** ======================================================= -*/ - -#define LUA_MAXCAPTURES 32 - -/* maximum recursion depth for 'match' */ -#define MAXCCALLS 200 - -#define CAP_UNFINISHED (-1) -#define CAP_POSITION (-2) - -typedef struct MatchState { - int matchdepth; /* control for recursive depth (to avoid C stack overflow) */ - const char *src_init; /* init of source string */ - const char *src_end; /* end ('\0') of source string */ - const char *p_end; /* end ('\0') of pattern */ - int level; /* total number of captures (finished or unfinished) */ - struct { - const char *init; - ptrdiff_t len; - } capture[LUA_MAXCAPTURES]; - jmp_buf jump_buf; - char msg_buff[256]; -} MatchState; - -/* recursive function */ -static const char *match (MatchState *ms, const char *s, const char *p); - -#define L_ESC '%' -#define SPECIALS "^$*+?.([%-" - -static int throw_error(MatchState *ms, const char *fmt,...) { - va_list ap; - va_start(ap,fmt); - vsnprintf(ms->msg_buff,sizeof(ms->msg_buff),fmt,ap); - va_end(ap); - longjmp(ms->jump_buf,1); - return 0; -} - -static int check_capture (MatchState *ms, int l) { - l -= '1'; - if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) - return throw_error(ms,"invalid capture index %%%d", l + 1); - return l; -} - -static int capture_to_close (MatchState *ms) { - int level = ms->level; - for (level--; level>=0; level--) - if (ms->capture[level].len == CAP_UNFINISHED) return level; - return throw_error(ms,"invalid pattern capture"); -} - - -static const char *classend (MatchState *ms, const char *p) { - - switch (*p++) { - case L_ESC: { - if (p == ms->p_end) - throw_error(ms,"malformed pattern (ends with '%')"); - return p+1; - } - case '[': { - if (*p == '^') p++; - do { /* look for a `]' */ - if (p == ms->p_end) - throw_error(ms,"malformed pattern (missing ']')"); - if (*(p++) == L_ESC && p < ms->p_end) - p++; /* skip escapes (e.g. `%]') */ - } while (*p != ']'); - return p+1; - } - default: { - return p; - } - } -} - - -static int match_class (int c, int cl) { - int res; - switch (tolower(cl)) { - case 'a' : res = isalpha(c); break; - case 'c' : res = iscntrl(c); break; - case 'd' : res = isdigit(c); break; - case 'g' : res = isgraph(c); break; - case 'l' : res = islower(c); break; - case 'p' : res = ispunct(c); break; - case 's' : res = isspace(c); break; - case 'u' : res = isupper(c); break; - case 'w' : res = isalnum(c); break; - case 'x' : res = isxdigit(c); break; - case 'z' : res = (c == 0); break; /* deprecated option */ - default: return (cl == c); - } - return (islower(cl) ? res : !res); -} - - -static int matchbracketclass (int c, const char *p, const char *ec) { - int sig = 1; - if (*(p+1) == '^') { - sig = 0; - p++; /* skip the `^' */ - } - while (++p < ec) { - if (*p == L_ESC) { - p++; - if (match_class(c, uchar(*p))) - return sig; - } - else if ((*(p+1) == '-') && (p+2 < ec)) { - p+=2; - if (uchar(*(p-2)) <= c && c <= uchar(*p)) - return sig; - } - else if (uchar(*p) == c) return sig; - } - return !sig; -} - - -static int singlematch (MatchState *ms, const char *s, const char *p, - const char *ep) { - if (s >= ms->src_end) - return 0; - else { - int c = uchar(*s); - switch (*p) { - case '.': return 1; /* matches any char */ - case L_ESC: return match_class(c, uchar(*(p+1))); - case '[': return matchbracketclass(c, p, ep-1); - default: return (uchar(*p) == c); - } - } -} - - -static const char *matchbalance (MatchState *ms, const char *s, - const char *p) { - if (p >= ms->p_end - 1) - throw_error(ms,"malformed pattern " - "(missing arguments to '%b')"); - if (*s != *p) return NULL; - else { - int b = *p; - int e = *(p+1); - int cont = 1; - while (++s < ms->src_end) { - if (*s == e) { - if (--cont == 0) return s+1; - } - else if (*s == b) cont++; - } - } - return NULL; /* string ends out of balance */ -} - - -static const char *max_expand (MatchState *ms, const char *s, - const char *p, const char *ep) { - ptrdiff_t i = 0; /* counts maximum expand for item */ - while (singlematch(ms, s + i, p, ep)) - i++; - /* keeps trying to match with the maximum repetitions */ - while (i>=0) { - const char *res = match(ms, (s+i), ep+1); - if (res) return res; - i--; /* else didn't match; reduce 1 repetition to try again */ - } - return NULL; -} - - -static const char *min_expand (MatchState *ms, const char *s, - const char *p, const char *ep) { - for (;;) { - const char *res = match(ms, s, ep+1); - if (res != NULL) - return res; - else if (singlematch(ms, s, p, ep)) - s++; /* try with one more repetition */ - else return NULL; - } -} - - -static const char *start_capture (MatchState *ms, const char *s, - const char *p, int what) { - const char *res; - int level = ms->level; - if (level >= LUA_MAXCAPTURES) throw_error(ms,"too many captures"); - ms->capture[level].init = s; - ms->capture[level].len = what; - ms->level = level+1; - if ((res=match(ms, s, p)) == NULL) /* match failed? */ - ms->level--; /* undo capture */ - return res; -} - - -static const char *end_capture (MatchState *ms, const char *s, - const char *p) { - int l = capture_to_close(ms); - const char *res; - ms->capture[l].len = s - ms->capture[l].init; /* close capture */ - if ((res = match(ms, s, p)) == NULL) /* match failed? */ - ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ - return res; -} - - -static const char *match_capture (MatchState *ms, const char *s, int l) { - size_t len; - l = check_capture(ms, l); - len = ms->capture[l].len; - if ((size_t)(ms->src_end-s) >= len && - memcmp(ms->capture[l].init, s, len) == 0) - return s+len; - else return NULL; -} - -static const char *match (MatchState *ms, const char *s, const char *p) { - if (ms->matchdepth-- == 0) - throw_error(ms,"pattern too complex"); - init: /* using goto's to optimize tail recursion */ - if (p != ms->p_end) { /* end of pattern? */ - switch (*p) { - case '(': { /* start capture */ - if (*(p + 1) == ')') /* position capture? */ - s = start_capture(ms, s, p + 2, CAP_POSITION); - else - s = start_capture(ms, s, p + 1, CAP_UNFINISHED); - break; - } - case ')': { /* end capture */ - s = end_capture(ms, s, p + 1); - break; - } - case '$': { - if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ - goto dflt; /* no; go to default */ - s = (s == ms->src_end) ? s : NULL; /* check end of string */ - break; - } - case L_ESC: { /* escaped sequences not in the format class[*+?-]? */ - switch (*(p + 1)) { - case 'b': { /* balanced string? */ - s = matchbalance(ms, s, p + 2); - if (s != NULL) { - p += 4; goto init; /* return match(ms, s, p + 4); */ - } /* else fail (s == NULL) */ - break; - } - case 'f': { /* frontier? */ - const char *ep; char previous; - p += 2; - if (*p != '[') - throw_error(ms,"missing '[' after '%%f' in pattern"); - ep = classend(ms, p); /* points to what is next */ - previous = (s == ms->src_init) ? '\0' : *(s - 1); - if (!matchbracketclass(uchar(previous), p, ep - 1) && - matchbracketclass(uchar(*s), p, ep - 1)) { - p = ep; goto init; /* return match(ms, s, ep); */ - } - s = NULL; /* match failed */ - break; - } - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - case '8': case '9': { /* capture results (%0-%9)? */ - s = match_capture(ms, s, uchar(*(p + 1))); - if (s != NULL) { - p += 2; goto init; /* return match(ms, s, p + 2) */ - } - break; - } - default: goto dflt; - } - break; - } - default: dflt: { /* pattern class plus optional suffix */ - const char *ep = classend(ms, p); /* points to optional suffix */ - /* does not match at least once? */ - if (!singlematch(ms, s, p, ep)) { - if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */ - p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ - } - else /* '+' or no suffix */ - s = NULL; /* fail */ - } - else { /* matched once */ - switch (*ep) { /* handle optional suffix */ - case '?': { /* optional */ - const char *res; - if ((res = match(ms, s + 1, ep + 1)) != NULL) - s = res; - else { - p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */ - } - break; - } - case '+': /* 1 or more repetitions */ - s++; /* 1 match already done */ - /* go through */ - case '*': /* 0 or more repetitions */ - s = max_expand(ms, s, p, ep); - break; - case '-': /* 0 or more repetitions (minimum) */ - s = min_expand(ms, s, p, ep); - break; - default: /* no suffix */ - s++; p = ep; goto init; /* return match(ms, s + 1, ep); */ - } - } - break; - } - } - } - ms->matchdepth++; - return s; -} - - - -static void push_onecapture (MatchState *ms, int i, const char *s, - const char *e, LuaMatch *mm) { - if (i >= ms->level) { - if (i == 0) { /* ms->level == 0, too */ - mm->start = 0; - mm->end = e - s; - } else - throw_error(ms,"invalid capture index"); - } - else { - ptrdiff_t l = ms->capture[i].len; - if (l == CAP_UNFINISHED) throw_error(ms,"unfinished capture"); - if (l == CAP_POSITION) { - mm[i].start = ms->capture[i].init - ms->src_init + 1; - mm[i].end = mm[i].start; - } else { - mm[i].start = ms->capture[i].init - ms->src_init; - mm[i].end = mm[i].start + l; - } - } -} - - -static int push_captures (MatchState *ms, const char *s, const char *e, LuaMatch *mm) { - int i; - int nlevels = (ms->level == 0 && s) ? 1 : ms->level; - for (i = 0; i < nlevels; i++) - push_onecapture(ms, i, s, e, mm); - return nlevels; /* number of strings pushed */ -} - -int str_match (const char *s, unsigned int ls, const char *p, unsigned int lp, char **err_msg, LuaMatch *mm) { - const char *s1 = s; - MatchState ms; - int anchor = (*p == '^'); - if (anchor) { - p++; lp--; /* skip anchor character */ - } - - memset(ms.msg_buff,0,sizeof(ms.msg_buff)); - - if (setjmp(ms.jump_buf) != 0) { - if (err_msg != NULL) { - *err_msg = strdup(ms.msg_buff); - } - return 0; - } - - ms.matchdepth = MAXCCALLS; - ms.src_init = s; - ms.src_end = s + ls; - ms.p_end = p + lp; - do { - const char *res; - ms.level = 0; - if ((res=match(&ms, s1, p)) != NULL) { - mm[0].start = s1 - s; /* start */ - mm[0].end = res - s; /* end */ - return push_captures(&ms, NULL, 0, mm+1) + 1; - } - } while (s1++ < ms.src_end && !anchor); - - return 0; -} - -static void str_match_check(MatchState *ms, const char *p) { - char ch; - int level_stack[LUA_MAXCAPTURES]; - int stack_idx = 0; - int current_level = 0; - while (p < ms->p_end && (ch=*p++)) { - switch (ch) { - case L_ESC: { - switch ((ch=*p++)) { - case 'b': { - p++; - if (p >= ms->p_end) throw_error(ms,"malformed pattern " - "(missing arguments to '%b')"); - } break; - case 'f': { - if (*p != '[') throw_error(ms,"missing '[' after '%%f' in pattern"); - --p; // so we see [...] - } break; - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - case '8': case '9': { - int l = uchar(ch) - '1'; // - if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) - throw_error(ms,"invalid capture index %%%d", l + 1); - --p; - } break; - } - } break; - case '[': { - do { /* look for a `]' */ - if (p == ms->p_end) - throw_error(ms,"malformed pattern (missing ']')"); - if (*(p++) == L_ESC && p < ms->p_end) - p++; /* skip escapes (e.g. `%]') */ - } while (*p != ']'); - } break; - case '(': { - if (*p != ')') { /* not a position capture */ - level_stack[stack_idx++] = ms->level; - ms->capture[ms->level].len = CAP_UNFINISHED; - ms->level ++; /* level counts total number of captures */ - if (ms->level >= LUA_MAXCAPTURES) throw_error(ms,"too many captures"); - } else { - ++p; - } - } break; - case ')': { - if (stack_idx == 0) - throw_error(ms, "no open capture"); - ms->capture[level_stack[--stack_idx]].len = CAP_POSITION; - } break; - default: { - - } - } - } - if (stack_idx > 0) { - throw_error(ms,"unfinished capture"); - } -} - -const char *str_check (const char *p, unsigned int lp) { - MatchState ms; - int anchor = (*p == '^'); - if (anchor) { - p++; /* skip anchor character */ - } - - memset(ms.msg_buff,0,sizeof(ms.msg_buff)); - - if (setjmp(ms.jump_buf) != 0) { - return strdup(ms.msg_buff); - } - - ms.level = 0; - ms.matchdepth = MAXCCALLS; - ms.p_end = p + lp; - - if ( *(ms.p_end-1) == '%') { - throw_error(&ms,"malformed pattern (ends with '%')"); - } - - str_match_check(&ms,p); - - return NULL; -} - diff --git a/src/luapat.rs b/src/luapat.rs new file mode 100644 index 0000000..1627de1 --- /dev/null +++ b/src/luapat.rs @@ -0,0 +1,625 @@ +// translation of Lua 5.2 string pattern code + +use errors::*; +use std::ptr::null; + +pub const LUA_MAXCAPTURES: usize = 32; +/* maximum recursion depth for 'match' */ +const MAXCCALLS: usize = 200; + +const L_ESC: u8 = b'%'; + +fn add(p: CPtr, count: usize) -> CPtr { + unsafe {p.offset(count as isize)} +} + +fn sub(p: CPtr, count: usize) -> CPtr { + unsafe {p.offset(-(count as isize))} +} + +fn next(p: CPtr) -> CPtr { + add(p, 1) +} + +fn at(p: CPtr) -> u8 { + unsafe { *p } +} + +fn diff(p1: CPtr, p2: CPtr) -> usize { + let d = (p1 as isize).wrapping_sub(p2 as isize); + d as usize +} + +#[derive(Copy,Clone,Debug)] +pub struct LuaMatch { + pub start: usize, + pub end: usize, +} + +#[derive(Copy,Clone)] +enum CapLen { + Len(usize), + Unfinished, + Position, +} + +impl CapLen { + fn is_unfinished(&self) -> bool { + match *self { + CapLen::Unfinished => true, + _ => false + } + } + + fn size(&self) -> Result { + match *self { + CapLen::Len(size) => Ok(size), + _ => error("capture was unfinished or positional") + } + } + +} + +type CPtr = *const u8; + +#[derive(Copy,Clone)] +struct Capture { + init: CPtr, + len: CapLen, +} + +impl Capture { + fn is_unfinished(&self) -> bool { + self.len.is_unfinished() + } +} + +use std::result; + +type Result = result::Result; + +fn error(msg: &str) -> Result { + Err(PatternError(msg.into())) +} + +struct MatchState { + matchdepth: usize, /* control for recursive depth (to avoid stack overflow) */ + src_init: CPtr, /* init of source string */ + src_end: CPtr, /* end ('\0') of source string */ + p_end: CPtr, /* end ('\0') of pattern */ + level: usize, /* total number of captures (finished or unfinished) */ + capture: [Capture; LUA_MAXCAPTURES], +} + +impl MatchState { + fn new(s: CPtr, se: CPtr, pe: CPtr) -> MatchState { + MatchState { + matchdepth: MAXCCALLS, + src_init: s, + src_end: se, + p_end: pe, + level: 0, + capture: [Capture{init: null(), len: CapLen::Len(0) }; LUA_MAXCAPTURES], + } + } + + fn check_capture(&self, l: usize) -> Result { + let l = l as i8 - b'1' as i8; + if l < 0 || l as usize >= self.level || self.capture[l as usize].is_unfinished() { + return error(&format!("invalid capture index %{}", l + 1)); + } + Ok(l as usize) + } + + fn capture_to_close(&self) -> Result { + let mut level = (self.level - 1) as isize; + while level >= 0 { + if self.capture[level as usize].is_unfinished() { + return Ok(level as usize); + } + level -= 1; + } + error("invalid pattern capture") + } + + fn classend (&self, p: CPtr) -> Result { + let ch = at(p); + let mut next_p = next(p); + Ok(match ch { + L_ESC => { + if next_p == self.p_end { + return error("malformed pattern (ends with '%')"); + } + next(next_p) + }, + b'[' => { + if at(next_p) == b'^' { + next_p = next(next_p); + } + while at(next_p) != b']' { + if next_p == self.p_end { + return error("malformed pattern (missing ']')"); + } + let ch = at(next_p); + next_p = next(next_p); + if ch == L_ESC && p < self.p_end { + next_p = next(next_p); /* skip escapes (e.g. `%]') */ + } + } + next(next_p) + }, + _ => next_p + }) + } + +} + +fn match_class (ch: u8, class: u8) -> bool { + let res = match class.to_ascii_lowercase() { + b'a' => ch.is_ascii_alphabetic(), + b'c' => ch.is_ascii_control(), + b'd' => ch.is_ascii_digit(), + b'g' => ch.is_ascii_graphic(), + b'l' => ch.is_ascii_lowercase(), + b'p' => ch.is_ascii_punctuation(), + b's' => ch.is_ascii_whitespace(), + b'u' => ch.is_ascii_uppercase(), + b'w' => ch.is_ascii_alphanumeric(), + b'x' => ch.is_ascii_hexdigit(), + lc => return lc == ch, + }; + if class.is_ascii_lowercase() { res } else {! res} +} + + +fn matchbracketclass (c: u8, p: CPtr, ec: CPtr) -> bool { + let mut p = p; + // [^ inverts match + let sig = if at(next(p)) == b'^' { + p = next(p); + false + } else { + true + }; + p = next(p); + while p < ec { + if at(p) == L_ESC { // e.g %s + p = next(p); + if match_class(c, at(p)) { + return sig; + } + } else + // e.g a-z + if at(next(p)) == b'-' && add(p,2) < ec { + let lastc = at(p); + p = add(p,2); + if lastc <= c && c <= at(p) { + return sig; + } + } else + if at(p) == c { + return sig; + } + p = next(p); + } + return ! sig; +} + +impl MatchState { + + fn singlematch (&self, s: CPtr, p: CPtr, ep: CPtr) -> bool { + if s >= self.src_end { + return false; + } + let c = at(s); + let pc = at(p); + match pc { + b'.' => true, /* matches any char */ + L_ESC => match_class(c, at(next(p))), + b'[' => matchbracketclass(c, p, sub(ep,1)), + _ => c == pc + } + } + + fn matchbalance (&self, s: CPtr, p: CPtr) -> Result { + if p >= sub(self.p_end,1) { + return error("malformed pattern (missing arguments to '%b')"); + } + if at(s) != at(p) { + return Ok(null()); + } + // e.g. %b() + let b = at(p); + let e = at(next(p)); + let mut cont = 1; + let mut s = next(s); + while s < self.src_end { + let ch = at(s); + if ch == e { + cont -= 1; + if cont == 0 { + return Ok(next(s)); + } + } else + if ch == b { + cont += 1; + } + s = next(s); + } + Ok(null()) /* string ends out of balance */ + } + + fn max_expand(&mut self, s: CPtr, p: CPtr, ep: CPtr) -> Result { + let mut i = 0isize; /* counts maximum expand for item */ + while self.singlematch(add(s,i as usize),p,ep) { + i += 1; + } + /* keeps trying to match with the maximum repetitions */ + while i >= 0 { + let res = self.patt_match(add(s,i as usize),next(ep))?; + if ! res.is_null() { + return Ok(res); + } + i -= 1; /* else didn't match; reduce 1 repetition to try again */ + } + Ok(null()) + } + + fn min_expand(&mut self, s: CPtr, p: CPtr, ep: CPtr) -> Result { + let mut s = s; + loop { + let res = self.patt_match(s,next(ep))?; + if ! res.is_null() { + return Ok(res); + } else + if self.singlematch(s, p, ep) { + s = next(s); + } else { + return Ok(null()); + } + } + } + + fn start_capture(&mut self, s: CPtr, p: CPtr, what: CapLen) -> Result { + let level = self.level; + if level >= LUA_MAXCAPTURES { + return error("too many captures"); + } + self.capture[level].init = s; + self.capture[level].len = what; + self.level = level + 1; + let res = self.patt_match(s, p)?; + if res.is_null() { /* match failed? */ + self.level -= 1; /* undo capture */ + } + Ok(res) + } + + fn end_capture(&mut self, s: CPtr, p: CPtr) -> Result { + let l = self.capture_to_close()?; + self.capture[l].len = CapLen::Len(diff(s,self.capture[l].init)); /* close capture */ + let res = self.patt_match(s, p)?; + if res.is_null() { /* match failed? */ + self.capture[l].len = CapLen::Unfinished; + } + Ok(res) + } + + fn match_capture(&mut self, s: CPtr, l: usize) -> Result { + let l = self.check_capture(l)?; + let len = self.capture[l].len.size()?; + if diff(self.src_end, s) >= len { + unsafe {s.copy_to_nonoverlapping(self.capture[l].init as *mut u8, len);} + return Ok(add(s,len)); + } + Ok(null()) + } + + + fn patt_match(&mut self, s: CPtr, p: CPtr) -> Result { + let mut s = s; + let mut p = p; + self.matchdepth -= 1; + if self.matchdepth == 0 { + return error("pattern too complex"); + } + + if p == self.p_end { /* end of pattern? */ + self.matchdepth += 1; + return Ok(s); + } + match at(p) { + b'(' => { /* start capture */ + if at(next(p)) == b')' { /* position capture? */ + s = self.start_capture(s, add(p,2), CapLen::Position)?; + } else { + s = self.start_capture(s, next(p), CapLen::Unfinished)?; + } + }, + b')' => { /* end capture */ + s = self.end_capture(s, next(p))?; + }, + b'$' => { + if next(p) != self.p_end { /* is the `$' the last char in pattern? */ + /* no; go to default */ + return self.patt_default_match(s, p); + } + s = if s == self.src_end {s} else {null()}; /* check end of string */ + } + L_ESC => { /* escaped sequences not in the format class[*+?-]? */ + match at(next(p)) { + b'b' => { /* balanced string? */ + s = self.matchbalance(s, add(p,2))?; + if ! s.is_null() { + // e.g, after %b() + return self.patt_match(s, add(p,4)); + } + }, + b'f' => { /* frontier? */ + p = add(p,2); + if at(p) != b'[' { + return error("missing '[' after '%f' in pattern"); + } + let ep = self.classend(p)?; /* points to what is next */ + let previous = if s == self.src_init {b'\0'} else {at(sub(s,1))}; + let epl = sub(ep,1); + if ! matchbracketclass(previous,p,epl) + && matchbracketclass(at(s),p,epl) { + return self.patt_match(s, ep); + } + s = null(); /* match failed */ + }, + b'0'...b'9' => { /* capture results (%0-%9)? */ + s = self.match_capture(s,at(next(p)) as usize)?; + if ! s.is_null() { + return self.patt_match(s, add(p,2)); + } + }, + _ => return self.patt_default_match(s, p) + } + + }, + _ => return self.patt_default_match(s, p) + + } + self.matchdepth += 1; + Ok(s) + } + + fn patt_default_match(&mut self, s: CPtr, p: CPtr) -> Result { + let mut s = s; + /* pattern class plus optional suffix */ + let ep = self.classend(p)?; /* points to optional suffix */ + /* does not match at least once? */ + if ! self.singlematch(s, p, ep) { + let epc = at(ep); + if epc == b'*' || epc == b'?' || epc == b'-' { /* accept empty? */ + return self.patt_match(s, next(ep)); + } else { /* '+' or no suffix */ + s = null(); /* fail */ + } + } else { /* matched once */ + match at(ep) { /* handle optional suffix */ + b'?' => { + let res = self.patt_match(next(s),next(ep))?; + if ! res.is_null() { + s = res; + } else { + return self.patt_match(s, next(ep)); + } + }, + b'+' => { /* 1 or more repetitions */ + s = next(s); + s = self.max_expand(s, p, ep)?; + }, + b'*' => { /* 0 or more repetitions */ + s = self.max_expand(s, p, ep)?; + }, + b'-' => { /* 0 or more repetitions (minimum) */ + s = self.min_expand(s, p, ep)? ; + }, + _ => { /* no suffix */ + return self.patt_match(next(s),ep); + } + } + } + self.matchdepth += 1; + Ok(s) + } + + fn push_onecapture(&mut self, i: usize, s: CPtr, e: CPtr, mm: &mut [LuaMatch]) -> Result<()> { + if i >= self.level { + if i == 0 { /* ms->level == 0, too */ + mm[0].start = 0; + mm[0].end = diff(e,s); + Ok(()) + } else { + return error("invalid capture index"); + } + } else { + let init = self.capture[i].init; + match self.capture[i].len { + CapLen::Unfinished => error("unfinished capture"), + CapLen::Position => { + mm[i].start = diff(init,next(self.src_init)); + mm[i].end = mm[i].start; + Ok(()) + }, + CapLen::Len(l) => { + mm[i].start = diff(init,self.src_init); + mm[i].end = mm[i].start + l; + Ok(()) + } + } + } + + } + + fn push_captures(&mut self, s: CPtr, e: CPtr, mm: &mut [LuaMatch]) -> Result { + let nlevels = if self.level == 0 && ! s.is_null() {1} else {self.level}; + for i in 0..nlevels { + self.push_onecapture(i, s, e, mm)?; + } + Ok(nlevels) /* number of strings pushed */ + } + + pub fn str_match_check(&mut self, p: CPtr) -> Result<()> { + let mut level_stack = [0; LUA_MAXCAPTURES]; + let mut stack_idx = 0; + let mut p = p; + while p < self.p_end { + let ch = at(p); + p = next(p); + match ch { + L_ESC => { + //p = next(p); + let c = at(p); + match c { + b'b' => { + p = next(p); + if p >= self.p_end { + return error("malformed pattern (missing arguments to '%b')"); + } + }, + b'f' => { + p = next(p); + if at(p) != b'[' { + return error("missing '[' after '%f' in pattern"); + } + p = sub(p,1); // so we see [...] + }, + b'0' ... b'9' => { + let l = (c as i8) - (b'1' as i8); + println!("level {}", self.level); + if l < 0 || l as usize >= self.level || self.capture[l as usize].is_unfinished() { + return error(&format!("invalid capture index %{}", l + 1)); + } + p = sub(p,1); + }, + _ => {} + } + }, + b'[' => { + while at(p) != b']' { + if p == self.p_end { + return error("malformed pattern (missing ']')"); + } + if at(p) == L_ESC && p < self.p_end { + p = next(p); + } + p = next(p); + } + }, + b'(' => { + if at(p) != b')' { // not a position capture + level_stack[stack_idx] = self.level; + stack_idx += 1; + self.capture[self.level].len = CapLen::Unfinished; + self.level += 1; + if self.level >= LUA_MAXCAPTURES { + return error("too many captures"); + } + } else { + p = next(p); + } + }, + b')' => { + if stack_idx == 0 { + return error("no open capture"); + } + stack_idx -= 1; + self.capture[level_stack[stack_idx]].len = CapLen::Position; + }, + _ => {} + } + } + if stack_idx > 0 { + return error("unfinished capture"); + } + Ok(()) + } +} + +pub fn str_match(s: &[u8], p: &[u8], mm: &mut [LuaMatch]) -> Result { + let mut lp = p.len(); + let mut p = p.as_ptr(); + let ls = s.len(); + let s = s.as_ptr(); + let mut s1 = s; + let anchor = at(p) == b'^'; + if anchor { + p = next(p); + lp -= 1; /* skip anchor character */ + } + + let mut ms = MatchState::new(s,add(s,ls),add(p,lp)); + loop { + let res = ms.patt_match(s1, p)?; + if ! res.is_null() { + mm[0].start = diff(s1,s); /* start */ + mm[0].end = diff(res,s); /* end */ + return Ok(ms.push_captures(null(),null(),&mut mm[1..])? + 1); + } + s1 = next(s1); + if ! (s1 < ms.src_end && ! anchor) { + break; + } + } + Ok(0) +} + +pub fn str_check(p: &[u8]) -> Result<()> { + let mut lp = p.len(); + let mut p = p.as_ptr(); + let anchor = at(p) == b'^'; + if anchor { + p = next(p); + lp -= 1; /* skip anchor character */ + } + let mut ms = MatchState::new(null(),null(),add(p,lp)); + if at(sub(ms.p_end,1)) == b'%' { + return error("malformed pattern (ends with '%')"); + } + ms.str_match_check(p)?; + Ok(()) +} + +/* +fn check(s: &[u8], p: &[u8]) { + if let Err(e) = str_check(p) { + println!("check error {}",e); + return; + } + + let mut matches = [LuaMatch{start: 0, end: 0}; 10]; + match str_match(s, p, &mut matches) { + Ok(n) => { + println!("ok {} matches", n); + for i in 0..n { + println!("match {:?} {:?}", + matches[i], + String::from_utf8(s[matches[i].start .. matches[i].end].to_vec()) + ); + } + }, + Err(e) => { + println!("error: {}", e) + } + } +} + + + +fn main() { + let mut args = std::env::args().skip(1); + let pat = args.next().unwrap(); + let s = args.next().unwrap(); + check(s.as_bytes(), pat.as_bytes()); + + //~ check(b"hello",b"%a"); + //~ check(b"0hello",b"%a+"); + //~ check(b"hello",b"%l(%a)"); + //check(b"hello",b"he(l+)"); + //check(b"k {and {so}}",b"k%s+(%b{})"); +} + */