diff --git a/luatokens.lua b/luatokens.lua
index 226a81a..7bf9f68 100644
--- a/luatokens.lua
+++ b/luatokens.lua
@@ -9,12 +9,15 @@ local collect_fallback = parser.collect_fallback
 
 -- "dummies"
 -- see http://www.lua.org/source/5.3/llex.h.html#RESERVED
+-- keywords
 local TK_AND, TK_BREAK,
     TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
     TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
     TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
+    -- operators
     TK_IDIV, TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_SHL, TK_SHR,
+    -- misc
     TK_DBCOLON, TK_EOS,
     TK_FLT, TK_INT, TK_NAME, TK_STRING =
     {}, {},
@@ -26,7 +29,32 @@ local TK_AND, TK_BREAK,
     {}, {},
     {}, {}, {}, {}
 
-local defs = {}
+local keywords = {
+    ["and"] = TK_AND,
+    ["break"] = TK_BREAK,
+    ["do"] = TK_DO,
+    ["else"] = TK_ELSE,
+    ["elseif"] = TK_ELSEIF,
+    ["end"] = TK_END,
+    ["false"] = TK_FALSE,
+    ["for"] = TK_FOR,
+    ["function"] = TK_FUNCTION,
+    ["goto"] = TK_GOTO,
+    ["if"] = TK_IF,
+    ["in"] = TK_IN,
+    ["local"] = TK_LOCAL,
+    ["nil"] = TK_NIL,
+    ["not"] = TK_NOT,
+    ["or"] = TK_OR,
+    ["repeat"] = TK_REPEAT,
+    ["return"] = TK_RETURN,
+    ["then"] = TK_THEN,
+    ["true"] = TK_TRUE,
+    ["until"] = TK_UNTIL,
+    ["while"] = TK_WHILE,
+}
+
+local defs = selfify({})
 
 defs.base = {
     [" "] = "whitespace",
@@ -280,9 +308,46 @@ do local tstring = selfify({})
     end
 end
 
-do local tlongstring = selfify({})
+do local tlongstring = {}
     defs.longstring = tlongstring
-    -- TODO
+    do local tllongstring_proper = selfify({[""] = "self", ["]"] = function(state, token) state.longstring_close = 0 return "maybe_end" end})
+        tllongstring_proper[1] = collect_fallback
+
+        do local tllmaybe_end = selfify({defs = defs}, "maybe_end")
+            tllongstring_proper.maybe_end = tllmaybe_end
+            tllmaybe_end["="] = function(state, token)
+                state.longstring_close = state.longstring_close + 1
+                return "maybe_end"
+            end
+            tllmaybe_end["]"] = function(state, token)
+                if state.longstring_close == state.longstring_count then
+                    state.longstring_close = nil
+                    state.longstring_count = nil
+                    local pos = #state
+                    state[pos+1] = TK_STRING
+                    state[pos+2] = table.concat(state[COLLECT])
+                    state[COLLECT] = nil
+                    return "defs"
+                else
+                    collect_fallback(state, "]")
+                    collect_fallback(state, ("="):rep(state.longstring_close))
+                    state.longstring_close = 0
+                    return "maybe_end"
+                end
+            end
+            tllmaybe_end[-1] = function(state, token, rule)
+                if not rule then
+                    collect_fallback(state, "]")
+                    collect_fallback(state, ("="):rep(state.longstring_close))
+                    state.longstring_close = nil
+                end
+            end
+        end
+
+        tlongstring.longstring_proper = tllongstring_proper
+        mknewline(tlongstring, 1, tllongstring_proper)
+        setmetatable(tlongstring, {__index=tllongstring_proper})
+    end
 end
 
 defs["'"] = "string_open"
@@ -297,6 +362,10 @@ defs.maybe_longstring = setmetatable({
             state.longstring_count = state.longstring_count + 1
             return "self"
         end,
+        ["["] = function(state, token)
+            state[COLLECT] = {coalesce=63} -- TODO tweak this for CPU/memory tradeoff?
+            return "longstring"
+        end,
         longstring = defs.longstring
     }),
     longstring_open = function(state, token)
@@ -304,6 +373,8 @@ defs.maybe_longstring = setmetatable({
             state.longstring_count = state.longstring_count or 0 + 1
             return "longstring_count"
         elseif token == "[" then
+            state.longstring_count = 0
+            state[COLLECT] = {coalesce=63} -- TODO tweak this for CPU/memory tradeoff?
             return "longstring"
         end
     end,
@@ -319,12 +390,38 @@ defs.maybe_longstring = setmetatable({
 --defs["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=defs})}, {__index=defs})
 mknewline(defs, 1)
 
+defs.whitespace = "self"
+defs.hexdigit = "alpha"
+defs["_"] = "alpha"
+defs.in_alpha = setmetatable(selfify({digit = "in_alpha", hexdigit = "in_alpha", alpha = "in_alpha", _ = "in_alpha", [parser.EOZ] = "self"}, "in_alpha"), {__index=defs})
+function defs.alpha(state, token)
+    state[COLLECT] = {coalesce=15} -- TODO tweak this for CPU/memory tradeoff?
+    collect_fallback(state, token)
+    return "in_alpha"
+end
+defs.in_alpha[-1] = function(state, token, rule)
+    if rule == "alpha" or rule == "digit" or rule == "hexdigit" or token == "_" then
+        collect_fallback(state, token)
+    else
+        local key = table.concat(state[COLLECT])
+        state[COLLECT] = nil
+        local keyword = keywords[key]
+        if keyword then
+            state[#state+1] = keyword
+        else
+            local pos = #state
+            state[pos+1] = TK_NAME
+            state[pos+2] = key
+        end
+    end
+end
+
 setmetatable(defs, {__index=defs.base})
 
 function defs.string_open(state, token)
     if not state.in_string then
         state[#state+1] = TK_STRING
-        state[COLLECT] = {coalesce=50} -- TODO tweak this for CPU/memory tradeoff?
+        state[COLLECT] = {coalesce=63} -- TODO tweak this for CPU/memory tradeoff?
         state.in_string = token
         return "string"
     end
diff --git a/parser.lua b/parser.lua
index bfa7dd3..4f4e166 100644
--- a/parser.lua
+++ b/parser.lua
@@ -24,6 +24,8 @@ local DATA = {}
 local GEN = {}
 -- key for DATA OFFSET
 local OFFDATA = {}
+-- key for End of Stream
+local EOZ = {}
 
 local optimize_lookups = {}
 for i=0, 255 do
@@ -39,6 +41,9 @@ local function get_next_common(state, in_pos, token)
     if state[STATE] then
         local st = state[STATE]
         local rule = st[token]
+        if not rule and token == EOZ then
+            return in_pos, state
+        end
         do -- pre-hooks
             local pos = -1
             local hook = st[pos]
@@ -83,7 +88,9 @@ local function get_next_common(state, in_pos, token)
 end
 
 local function get_next_table(state, in_pos)
-    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling
+    if state[DATA] == nil or #state[DATA] == 0 then
+        return get_next_common(state, in_pos, EOZ)
+    end
     in_pos = in_pos + 1
     local token = state[DATA][in_pos - state[OFFDATA]]
     if token == nil then
@@ -95,7 +102,13 @@ local function get_next_table(state, in_pos)
 end
 
 local function get_next_string(state, in_pos)
-    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling
+    if state[DATA] == nil or #state[DATA] == 0 then
+        if state[STATE] == nil then
+            return in_pos, state
+        else
+            return get_next_common(state, in_pos, EOZ)
+        end
+    end
     in_pos = in_pos + 1
     local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])]
     if token == nil then
@@ -142,6 +155,7 @@ local COLLECT = {}
 return {
     STATE = STATE,
     COLLECT = COLLECT,
+    EOZ = EOZ,
     stream = stream,
     parse = parse,
     -- common utility function
@@ -154,7 +168,7 @@ return {
         if not rule then
             local t = state[COLLECT]
             t[#t+1] = token
-            if t.coalesce and #t > t.coalesce then
+            if t.coalesce and #t >= t.coalesce then
                 t[1] = table.concat(t)
                 for i=2, #t do t[i] = nil end
             end
diff --git a/test.lua b/test.lua
index 8672903..ef0a586 100644
--- a/test.lua
+++ b/test.lua
@@ -24,6 +24,14 @@ local function case()
     return caseno
 end
 
+do -- basic check
+    local case = case()
+    local defs = {}
+    local count = 0
+    local state, err = parser.parse(defs, function() assert(count == 0, "should be called only once"); count = count + 1 return nil end)
+    assert(state)
+end -- basic check
+
 do -- trim left spaces
     local defs = {}
     defs.self = defs
@@ -82,6 +90,7 @@ do -- lua tokens
     else
         assert(state[1] == luatokens.tokens.TK_STRING)
         assert(state[2] == "hello world")
+        assert(state.line == 1 or not state.line)
     end
 end -- lua tokens
 
@@ -101,6 +110,7 @@ do -- more lua tokens
     else
        assert(state[1] == luatokens.tokens.TK_STRING)
        assert(state[2] == "\7\8\12\10\13\9\11\92\34\39\65\65\10")
+       assert(state.line == 2)
     end
 end -- lua tokens
 
@@ -119,6 +129,7 @@ do -- even more lua tokens
     else
         assert(state[1] == luatokens.tokens.TK_STRING)
         assert(state[2] == "A")
+        assert(state.line == 1 or not state.line)
     end
 end -- lua tokens
 
@@ -157,6 +168,7 @@ do -- even more lua tokens
         assert(table.remove(state, 1) == "\252\132\128\128\128\128")
         assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
         assert(table.remove(state, 1) == "\253\191\191\191\191\191")
+        assert(state.line == 1 or not state.line)
     end
 end -- lua tokens
 
@@ -176,6 +188,7 @@ do -- simple lua tokens
         assert(table.remove(state, 1) == "[")
         assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
         assert(table.remove(state, 1) == "")
+        assert(state.line == 1 or not state.line)
     end
 end -- lua tokens
 
@@ -194,8 +207,9 @@ do -- simple long string
     else
         assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
         assert(table.remove(state, 1) == "")
+        assert(state.line == 1 or not state.line)
     end
-end -- lua tokens
+end -- long string
 
 do -- long string with depth 1
     local luatokens = require "luatokens"
@@ -212,8 +226,9 @@ do -- long string with depth 1
     else
         assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
         assert(table.remove(state, 1) == "")
+        assert(state.line == 1 or not state.line)
     end
-end -- lua tokens
+end -- long string
 
 do -- long string with "nested" long string
     local luatokens = require "luatokens"
@@ -230,5 +245,86 @@ do -- long string with "nested" long string
     else
         assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
         assert(table.remove(state, 1) == "[[]]")
+        assert(state.line == 1 or not state.line)
     end
-end -- lua tokens
+end -- long string
+
+do -- long string edge cases
+    local luatokens = require "luatokens"
+    local tokens = luatokens.defs
+    local state, err, etoken, estate = parser.parse(tokens, "[==[]=]==][==[]]==]")
+    local case = case()
+    if not state then
+        print(case, "---- IN TOKENS ----")
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+        print(case, "---- OUT TOKENS ----")
+    else
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "]=")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "]")
+        assert(state.line == 1 or not state.line)
+    end
+end -- long string
+
+do -- keywords
+    local luatokens = require "luatokens"
+    local tokens = luatokens.defs
+    local state, err, etoken, estate = parser.parse(tokens, [[
+    and break do else elseif end
+    false for function goto if in
+    local nil not or repeat return
+    then true until while]])
+    local case = case()
+    if not state then
+        print(case, "---- IN TOKENS ----")
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+        print(case, "---- OUT TOKENS ----")
+    else
+        assert(table.remove(state, 1) == luatokens.tokens.TK_AND)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_BREAK)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_DO)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_ELSE)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_ELSEIF)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_END)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_FALSE)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_FOR)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_FUNCTION)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_GOTO)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_IF)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_IN)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_LOCAL)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_NIL)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_NOT)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_OR)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_REPEAT)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_RETURN)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_THEN)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_TRUE)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_UNTIL)
+        assert(table.remove(state, 1) == luatokens.tokens.TK_WHILE)
+        assert(state.line == 4)
+    end
+end -- keywords
+
+do -- FUCK
+    local luatokens = require "luatokens"
+    local luatokens_file = io.open("./luatokens.lua", "r"):read((_VERSION == "5.1" or _VERSION == "5.2") and "*a" or "a")
+    local tokens = luatokens.defs
+    local state, err, etoken, estate = parser.parse(tokens, luatokens_file)
+    local case = case()
+    if not state then
+        print(case, "---- IN TOKENS ----")
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+        print(case, "---- OUT TOKENS ----")
+    end
+end -- FUCK
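
A usage sketch, not part of the patch: with the keyword and name handling
added above, tokenizing follows the same pattern as the test cases. This
assumes luatokens exports `defs` (the state-machine definition table) and
`tokens` (the TK_* dummies), as test.lua does; the tokenizer checks token
syntax only, not grammar:

    local parser = require "parser"
    local luatokens = require "luatokens"

    -- parse() drives the state machine over the string; on success the
    -- emitted tokens sit in the array part of the returned state table.
    -- Keywords come out as the TK_* dummy tables, while TK_NAME and
    -- TK_STRING are followed by their text payload.
    local state, err, etoken = parser.parse(luatokens.defs, "local foo")
    if state then
        assert(table.remove(state, 1) == luatokens.tokens.TK_LOCAL)
        assert(table.remove(state, 1) == luatokens.tokens.TK_NAME)
        assert(table.remove(state, 1) == "foo")
    else
        print("tokenization failed:", err, etoken)
    end

The trailing TK_NAME relies on the new end-of-stream handling in parser.lua:
when the input runs out, get_next_table/get_next_string now feed an EOZ token
through get_next_common, so the in_alpha[-1] hook still fires (its rule is
"self" rather than nil) and flushes the collected identifier or keyword.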