commit 816fd0db2ce5828ea7c5cfdb75c15b100b289b59 Author: steve donovan Date: Mon Apr 17 13:16:52 2017 +0200 initial commit diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..ab6544f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "lua-patterns" +version = "0.1.0" +authors = ["steve donovan "] +links = "foo" +build = "build.rs" + +[dependencies] +libc="0.2.0" + +[build-dependencies] +gcc="0.3" + + diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..2290b75 --- /dev/null +++ b/build.rs @@ -0,0 +1,5 @@ +extern crate gcc; + +fn main() { + gcc::compile_library("liblua-str.a", &["src/lua-str.c"]); +} diff --git a/examples/iter.rs b/examples/iter.rs new file mode 100644 index 0000000..acf6126 --- /dev/null +++ b/examples/iter.rs @@ -0,0 +1,36 @@ +extern crate lua_patterns as lp; + + + +fn main() { + let mut m = lp::LuaPattern::new("(%a+)"); + let mut iter = m.gmatch("one two three"); + assert_eq!(iter.next(), Some("one")); + assert_eq!(iter.next(), Some("two")); + assert_eq!(iter.next(), Some("three")); + assert_eq!(iter.next(), None); + + let mut m = lp::LuaPattern::new("(%a+)"); + let split: Vec<_> = m.gmatch("dog cat leopard wolf").collect(); + assert_eq!(split,&["dog","cat","leopard","wolf"]); + + let mut m = lp::LuaPattern::new("(%S+)%s*=%s*(.+)"); + let cc = m.captures(" hello= bonzo dog"); + assert_eq!(cc[0], "hello= bonzo dog"); + assert_eq!(cc[1],"hello"); + assert_eq!(cc[2],"bonzo dog"); + + let captures = m.match_captures(" frodo = baggins"); + for s in captures { + println!("{:?}",s); + } + + + let mut m = lp::LuaPattern::new("%$(%S+)"); + let res = m.gsub("hello $dolly you're so $fine", + |cc| cc.get(1).to_uppercase() + ); + assert_eq!(res,"hello DOLLY you're so FINE"); + + +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..b91d65d --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,261 @@ +extern crate libc; +use libc::{size_t,c_int}; +use std::ptr; +use std::ops; + +#[repr(C)] +#[derive(PartialEq,Eq,Debug)] +struct LuaMatch { + start: c_int, + end: c_int +} + +static LUA_MAXCAPTURES: usize = 32; + +// int str_match (const char *s, size_t ls, const char *p, size_t lp, char **err_msg, LuaMatch *mm) +#[link(name = "lua-str", kind="static")] +extern { + fn str_match (s: *const u8, ls: size_t, p: *const u8, lp: size_t, + //err_msg: *mut *const u8, + err_msg: *const u8, + mm: *mut LuaMatch) -> c_int; +} + +pub struct LuaPattern<'a> { + patt: &'a [u8], + matches: Vec, + n_match: usize +} + +impl <'a> LuaPattern<'a> { + pub fn new(patt: &'a str) -> LuaPattern<'a> { + LuaPattern::from_bytes(patt.as_bytes()) + } + + pub fn from_bytes (bytes: &'a [u8]) -> LuaPattern<'a> { + let mut matches: Vec = Vec::with_capacity(LUA_MAXCAPTURES); + unsafe {matches.set_len(LUA_MAXCAPTURES);} + LuaPattern{patt: bytes, matches: matches, n_match: 0} + } + + pub fn matches_bytes(&mut self, s: &[u8]) -> bool { + let err_msg: *const u8 = ptr::null(); + + unsafe { + self.n_match = str_match(s.as_ptr(),s.len() as size_t, + self.patt.as_ptr(),self.patt.len() as size_t, + err_msg, self.matches.as_mut_ptr()) as usize; + } + self.n_match > 0 + } + + pub fn matches(&mut self, text: &str) -> bool { + self.matches_bytes(text.as_bytes()) + } + + + pub fn captures<'b>(&mut self, text: &'b str) -> Vec<&'b str> { + let mut res = Vec::new(); + self.capture_into(text, &mut res); + res + } + + pub fn match_captures<'b>(&'a mut self, text: &'b str) -> Captures<'a,'b> { + self.matches(text); + Captures {m: self, text: text} + } + + pub fn capture_into<'b>(&mut self, text: &'b str, vec: &mut Vec<&'b str>) -> bool { + self.matches(text); + vec.clear(); + for i in 0..self.n_match { + vec.push(&text[self.bounds(i)]); + } + self.n_match > 0 + } + + pub fn range(&self) -> ops::Range { + self.bounds(0) + } + + pub fn bounds(&self, i: usize) -> ops::Range { + ops::Range{ + start: self.matches[i].start as usize, + end: self.matches[i].end as usize + } + } + + pub fn first_match<'b>(&mut self, text: &'b str) -> Option<&'b str> { + self.matches(text); + if self.n_match > 0 { + Some(&text[self.bounds(if self.n_match > 1 {1} else {0})]) + } else { + None + } + } + + pub fn gmatch<'b>(&'a mut self, text: &'b str) -> GMatch<'a,'b> { + GMatch{m: self, text: text} + } + + pub fn gsub (&mut self, text: &str, lookup: F) -> String + where F: Fn(Captures)-> String { + let mut slice = text; + let mut res = String::new(); + while self.matches(slice) { + // full range of match + let all = self.bounds(0); + let captures = Captures{m: self, text: slice}; + let repl = lookup(captures); + // append everything up to match + res.push_str(&slice[0..all.start]); + res.push_str(&repl); + slice = &slice[all.end..]; + } + res.push_str(slice); + res + } + +} + +pub struct Captures<'a,'b> { + m: &'a LuaPattern<'a>, + text: &'b str +} + +impl <'a,'b> Captures<'a,'b> { + pub fn get(&self, i: usize) -> &'b str { + &self.text[self.m.bounds(i)] + } + + pub fn num_matches(&self) -> usize { + self.m.n_match + } +} + +pub struct CaptureIter<'a,'b> { + cc: Captures<'a,'b>, + idx: usize, + top: usize +} + +impl <'a,'b>Iterator for CaptureIter<'a,'b> { + type Item = &'b str; + + fn next(&mut self) -> Option { + if self.idx < self.top { + let res = self.cc.get(self.idx); + self.idx += 1; + Some(res) + } else { + None + } + } +} + +impl <'a,'b> IntoIterator for Captures<'a,'b> { + type Item = &'b str; + type IntoIter = CaptureIter<'a,'b>; + + fn into_iter(self) -> Self::IntoIter { + CaptureIter{idx: 0, top: self.num_matches(),cc: self} + } +} + + +pub struct GMatch<'a,'b> { + m: &'a mut LuaPattern<'a>, + text: &'b str +} + +impl <'a,'b>Iterator for GMatch<'a,'b> { + type Item = &'b str; + + fn next(&mut self) -> Option { + if ! self.m.matches(self.text) { + None + } else { + let first = if self.m.n_match > 1 {1} else {0}; + let slice = &self.text[self.m.bounds(first)]; + self.text = &self.text[self.m.range().end..]; + Some(slice) + } + } + +} + +#[cfg(test)] +mod tests { + use super::*; + + + #[test] + fn captures_and_matching() { + let mut m = LuaPattern::new("(one).+"); + assert_eq!(m.captures(" one two"),&["one two","one"]); + let empty: &[&str] = &[]; + assert_eq!(m.captures("four"),empty); + + assert_eq!(m.matches("one dog"),true); + assert_eq!(m.matches("dog one "),true); + assert_eq!(m.matches("dog one"),false); + + let text = "one dog"; + let mut m = LuaPattern::new("^(%a+)"); + assert_eq!(m.matches(text),true); + assert_eq!(&text[m.bounds(1)], "one"); + assert_eq!(m.matches(" one dog"),false); + + // captures without allocation + let captures = m.match_captures(text); + assert_eq!(captures.get(0),"one"); + assert_eq!(captures.get(1),"one"); + + let mut m = LuaPattern::new("(%S+)%s*=%s*(.+)"); + + // captures as Vec + let cc = m.captures(" hello= bonzo dog"); + assert_eq!(cc[0], "hello= bonzo dog"); + assert_eq!(cc[1],"hello"); + assert_eq!(cc[2],"bonzo dog"); + + // captures as iterator + let mut iter = m.match_captures(" frodo = baggins").into_iter(); + assert_eq!(iter.next(), Some("frodo = baggins")); + assert_eq!(iter.next(), Some("frodo")); + assert_eq!(iter.next(), Some("baggins")); + assert_eq!(iter.next(), None); + + + + } + + #[test] + fn gmatch() { + let mut m = LuaPattern::new("%a+"); + let mut iter = m.gmatch("one two three"); + assert_eq!(iter.next(), Some("one")); + assert_eq!(iter.next(), Some("two")); + assert_eq!(iter.next(), Some("three")); + assert_eq!(iter.next(), None); + + let mut m = LuaPattern::new("(%a+)"); + let mut iter = m.gmatch("one two three"); + assert_eq!(iter.next(), Some("one")); + assert_eq!(iter.next(), Some("two")); + assert_eq!(iter.next(), Some("three")); + assert_eq!(iter.next(), None); + + } + + #[test] + fn gsub() { + let mut m = LuaPattern::new("%$(%S+)"); + let res = m.gsub("hello $dolly you're so $fine!", + |cc| cc.get(1).to_uppercase() + ); + assert_eq!(res,"hello DOLLY you're so FINE!"); + + + } +} diff --git a/src/lua-str.c b/src/lua-str.c new file mode 100644 index 0000000..34c3786 --- /dev/null +++ b/src/lua-str.c @@ -0,0 +1,415 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct LuaMatch { + int start; + int end; +} LuaMatch; + + +// not thread safe (yet) +static jmp_buf s_jmp_buf; +static char s_msg_buff[256]; + + +/* macro to `unsign' a character */ +#define uchar(c) ((unsigned char)(c)) + + +/* +** {====================================================== +** PATTERN MATCHING +** ======================================================= +*/ + +#define LUA_MAXCAPTURES 32 + +/* maximum recursion depth for 'match' */ +#define MAXCCALLS 200 + +#define CAP_UNFINISHED (-1) +#define CAP_POSITION (-2) + +typedef struct MatchState { + int matchdepth; /* control for recursive depth (to avoid C stack overflow) */ + const char *src_init; /* init of source string */ + const char *src_end; /* end ('\0') of source string */ + const char *p_end; /* end ('\0') of pattern */ + int level; /* total number of captures (finished or unfinished) */ + struct { + const char *init; + ptrdiff_t len; + } capture[LUA_MAXCAPTURES]; +} MatchState; + +/* recursive function */ +static const char *match (MatchState *ms, const char *s, const char *p); + +#define L_ESC '%' +#define SPECIALS "^$*+?.([%-" + +// error handling, hm?? NB + +static int throw_error(const char *fmt,...) { + va_list ap; + va_start(ap,fmt); + vsnprintf(s_msg_buff,sizeof(s_msg_buff),fmt,ap); + va_end(ap); + longjmp(s_jmp_buf,1); + return 0; +} + +static int check_capture (MatchState *ms, int l) { + l -= '1'; + if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) + return throw_error("invalid capture index %%%d", l + 1); + return l; +} + +static int capture_to_close (MatchState *ms) { + int level = ms->level; + for (level--; level>=0; level--) + if (ms->capture[level].len == CAP_UNFINISHED) return level; + return throw_error("invalid pattern capture"); +} + + +static const char *classend (MatchState *ms, const char *p) { + switch (*p++) { + case L_ESC: { + if (p == ms->p_end) + throw_error("malformed pattern (ends with '%')"); + return p+1; + } + case '[': { + if (*p == '^') p++; + do { /* look for a `]' */ + if (p == ms->p_end) + throw_error("malformed pattern (missing ']')"); + if (*(p++) == L_ESC && p < ms->p_end) + p++; /* skip escapes (e.g. `%]') */ + } while (*p != ']'); + return p+1; + } + default: { + return p; + } + } +} + + +static int match_class (int c, int cl) { + int res; + switch (tolower(cl)) { + case 'a' : res = isalpha(c); break; + case 'c' : res = iscntrl(c); break; + case 'd' : res = isdigit(c); break; + case 'g' : res = isgraph(c); break; + case 'l' : res = islower(c); break; + case 'p' : res = ispunct(c); break; + case 's' : res = isspace(c); break; + case 'u' : res = isupper(c); break; + case 'w' : res = isalnum(c); break; + case 'x' : res = isxdigit(c); break; + case 'z' : res = (c == 0); break; /* deprecated option */ + default: return (cl == c); + } + return (islower(cl) ? res : !res); +} + + +static int matchbracketclass (int c, const char *p, const char *ec) { + int sig = 1; + if (*(p+1) == '^') { + sig = 0; + p++; /* skip the `^' */ + } + while (++p < ec) { + if (*p == L_ESC) { + p++; + if (match_class(c, uchar(*p))) + return sig; + } + else if ((*(p+1) == '-') && (p+2 < ec)) { + p+=2; + if (uchar(*(p-2)) <= c && c <= uchar(*p)) + return sig; + } + else if (uchar(*p) == c) return sig; + } + return !sig; +} + + +static int singlematch (MatchState *ms, const char *s, const char *p, + const char *ep) { + if (s >= ms->src_end) + return 0; + else { + int c = uchar(*s); + switch (*p) { + case '.': return 1; /* matches any char */ + case L_ESC: return match_class(c, uchar(*(p+1))); + case '[': return matchbracketclass(c, p, ep-1); + default: return (uchar(*p) == c); + } + } +} + + +static const char *matchbalance (MatchState *ms, const char *s, + const char *p) { + if (p >= ms->p_end - 1) + throw_error("malformed pattern " + "(missing arguments to '%b')"); + if (*s != *p) return NULL; + else { + int b = *p; + int e = *(p+1); + int cont = 1; + while (++s < ms->src_end) { + if (*s == e) { + if (--cont == 0) return s+1; + } + else if (*s == b) cont++; + } + } + return NULL; /* string ends out of balance */ +} + + +static const char *max_expand (MatchState *ms, const char *s, + const char *p, const char *ep) { + ptrdiff_t i = 0; /* counts maximum expand for item */ + while (singlematch(ms, s + i, p, ep)) + i++; + /* keeps trying to match with the maximum repetitions */ + while (i>=0) { + const char *res = match(ms, (s+i), ep+1); + if (res) return res; + i--; /* else didn't match; reduce 1 repetition to try again */ + } + return NULL; +} + + +static const char *min_expand (MatchState *ms, const char *s, + const char *p, const char *ep) { + for (;;) { + const char *res = match(ms, s, ep+1); + if (res != NULL) + return res; + else if (singlematch(ms, s, p, ep)) + s++; /* try with one more repetition */ + else return NULL; + } +} + + +static const char *start_capture (MatchState *ms, const char *s, + const char *p, int what) { + const char *res; + int level = ms->level; + if (level >= LUA_MAXCAPTURES) throw_error("too many captures"); + ms->capture[level].init = s; + ms->capture[level].len = what; + ms->level = level+1; + if ((res=match(ms, s, p)) == NULL) /* match failed? */ + ms->level--; /* undo capture */ + return res; +} + + +static const char *end_capture (MatchState *ms, const char *s, + const char *p) { + int l = capture_to_close(ms); + const char *res; + ms->capture[l].len = s - ms->capture[l].init; /* close capture */ + if ((res = match(ms, s, p)) == NULL) /* match failed? */ + ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ + return res; +} + + +static const char *match_capture (MatchState *ms, const char *s, int l) { + size_t len; + l = check_capture(ms, l); + len = ms->capture[l].len; + if ((size_t)(ms->src_end-s) >= len && + memcmp(ms->capture[l].init, s, len) == 0) + return s+len; + else return NULL; +} + +static const char *match (MatchState *ms, const char *s, const char *p) { + if (ms->matchdepth-- == 0) + throw_error("pattern too complex"); + init: /* using goto's to optimize tail recursion */ + if (p != ms->p_end) { /* end of pattern? */ + switch (*p) { + case '(': { /* start capture */ + if (*(p + 1) == ')') /* position capture? */ + s = start_capture(ms, s, p + 2, CAP_POSITION); + else + s = start_capture(ms, s, p + 1, CAP_UNFINISHED); + break; + } + case ')': { /* end capture */ + s = end_capture(ms, s, p + 1); + break; + } + case '$': { + if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ + goto dflt; /* no; go to default */ + s = (s == ms->src_end) ? s : NULL; /* check end of string */ + break; + } + case L_ESC: { /* escaped sequences not in the format class[*+?-]? */ + switch (*(p + 1)) { + case 'b': { /* balanced string? */ + s = matchbalance(ms, s, p + 2); + if (s != NULL) { + p += 4; goto init; /* return match(ms, s, p + 4); */ + } /* else fail (s == NULL) */ + break; + } + case 'f': { /* frontier? */ + const char *ep; char previous; + p += 2; + if (*p != '[') + throw_error("missing '[' after '%f' in pattern"); + ep = classend(ms, p); /* points to what is next */ + previous = (s == ms->src_init) ? '\0' : *(s - 1); + if (!matchbracketclass(uchar(previous), p, ep - 1) && + matchbracketclass(uchar(*s), p, ep - 1)) { + p = ep; goto init; /* return match(ms, s, ep); */ + } + s = NULL; /* match failed */ + break; + } + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + case '8': case '9': { /* capture results (%0-%9)? */ + s = match_capture(ms, s, uchar(*(p + 1))); + if (s != NULL) { + p += 2; goto init; /* return match(ms, s, p + 2) */ + } + break; + } + default: goto dflt; + } + break; + } + default: dflt: { /* pattern class plus optional suffix */ + const char *ep = classend(ms, p); /* points to optional suffix */ + /* does not match at least once? */ + if (!singlematch(ms, s, p, ep)) { + if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */ + p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ + } + else /* '+' or no suffix */ + s = NULL; /* fail */ + } + else { /* matched once */ + switch (*ep) { /* handle optional suffix */ + case '?': { /* optional */ + const char *res; + if ((res = match(ms, s + 1, ep + 1)) != NULL) + s = res; + else { + p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */ + } + break; + } + case '+': /* 1 or more repetitions */ + s++; /* 1 match already done */ + /* go through */ + case '*': /* 0 or more repetitions */ + s = max_expand(ms, s, p, ep); + break; + case '-': /* 0 or more repetitions (minimum) */ + s = min_expand(ms, s, p, ep); + break; + default: /* no suffix */ + s++; p = ep; goto init; /* return match(ms, s + 1, ep); */ + } + } + break; + } + } + } + ms->matchdepth++; + return s; +} + + + +static void push_onecapture (MatchState *ms, int i, const char *s, + const char *e, LuaMatch *mm) { + if (i >= ms->level) { + if (i == 0) { /* ms->level == 0, too */ + mm->start = 0; + mm->end = e - s ; + //lua_pushlstring(ms->L, s, e - s); /* add whole match */ + } else + throw_error("invalid capture index"); + } + else { + ptrdiff_t l = ms->capture[i].len; + if (l == CAP_UNFINISHED) throw_error("unfinished capture"); + if (l == CAP_POSITION) { + mm[i].start = ms->capture[i].init - ms->src_init + 1; + mm[i].end = mm[i].start; + } else { + mm[i].start = ms->capture[i].init - ms->src_init; + mm[i].end = mm[i].start + l; + } + } +} + + +static int push_captures (MatchState *ms, const char *s, const char *e, LuaMatch *mm) { + int i; + int nlevels = (ms->level == 0 && s) ? 1 : ms->level; + for (i = 0; i < nlevels; i++) + push_onecapture(ms, i, s, e, mm); + return nlevels; /* number of strings pushed */ +} + + +int str_match (const char *s, size_t ls, const char *p, size_t lp, char **err_msg, LuaMatch *mm) { + const char *s1 = s; + MatchState ms; + int anchor = (*p == '^'); + if (anchor) { + p++; lp--; /* skip anchor character */ + } + ms.matchdepth = MAXCCALLS; + ms.src_init = s; + ms.src_end = s + ls; + ms.p_end = p + lp; + do { + const char *res; + ms.level = 0; + if ((res=match(&ms, s1, p)) != NULL) { + mm[0].start = s1 - s; /* start */ + mm[0].end = res - s; /* end */ + return push_captures(&ms, NULL, 0, mm+1) + 1; + } + } while (s1++ < ms.src_end && !anchor); + + if (setjmp(s_jmp_buf) != 0) { + if (err_msg != NULL) *err_msg = s_msg_buff; + } else { + if (err_msg != NULL) *err_msg = NULL; + } + return 0; +} +