--[[
    luatokens.lua - pure-Lua Lua tokenizer
    Copyright (C) 2019  Soni L.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
--]]

--[[
    This software is based on Lua 5.1 and Lua 5.3

    Lua 5.1 license:

/******************************************************************************
* Copyright (C) 1994-2012 Lua.org, PUC-Rio.  All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/

    Lua 5.3 license:

/******************************************************************************
* Copyright (C) 1994-2018 Lua.org, PUC-Rio.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
--]]

-- Module overview
-- ---------------
-- This file builds `defs`, a graph of nested "state" tables keyed by input
-- character (or by a character class name taken from `defs.base`).  A value in
-- a state table is one of:
--   * a string  - the name of another key in the same table to look up next,
--   * a function(state, token[, rule]) - returns the name of the next state,
--   * a table   - a nested state,
--   * false     - an explicitly rejected input.
-- Produced tokens are appended to the array part of `state`.
--
-- NOTE(review): the numeric keys ([1], [2], [-1]) and the keys
-- parser.FALLBACK / parser.EOZ / parser.STATE / parser.COLLECT are consumed by
-- the external "parser" module, which is not part of this file.  The comments
-- below describe only what the functions stored under those keys do; when and
-- in which order the parser calls them must be confirmed in that module.

-- we need some stuff from here
local parser = require "parser"
local selfify = parser.selfify
local EOF = parser.EOF
local COLLECT = parser.COLLECT
local collect_fallback = parser.collect_fallback

-- "dummies"
-- see http://www.lua.org/source/5.3/llex.h.html#RESERVED
-- Each token kind is a unique empty table, compared by identity.
-- keywords
local TK_AND, TK_BREAK,
    TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
    TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
    TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
    -- operators
    TK_IDIV, TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE,
    TK_SHL, TK_SHR,
    -- misc
    TK_DBCOLON, TK_EOS,
    -- values/constants
    TK_FLT, TK_INT, TK_NAME, TK_STRING =
    {}, {},
    {}, {}, {}, {}, {}, {}, {},
    {}, {}, {}, {}, {}, {}, {}, {},
    {}, {}, {}, {}, {},
    {}, {}, {}, {}, {}, {}, {},
    {}, {},
    {}, {},
    {}, {}, {}, {}

-- Maps reserved-word spelling to its token object.
local keywords = {
    ["and"] = TK_AND,
    ["break"] = TK_BREAK,
    ["do"] = TK_DO,
    ["else"] = TK_ELSE,
    ["elseif"] = TK_ELSEIF,
    ["end"] = TK_END,
    ["false"] = TK_FALSE,
    ["for"] = TK_FOR,
    ["function"] = TK_FUNCTION,
    ["goto"] = TK_GOTO,
    ["if"] = TK_IF,
    ["in"] = TK_IN,
    ["local"] = TK_LOCAL,
    ["nil"] = TK_NIL,
    ["not"] = TK_NOT,
    ["or"] = TK_OR,
    ["repeat"] = TK_REPEAT,
    ["return"] = TK_RETURN,
    ["then"] = TK_THEN,
    ["true"] = TK_TRUE,
    ["until"] = TK_UNTIL,
    ["while"] = TK_WHILE,
}

-- Inverse of `keywords`: token object -> spelling.
local reverse_keywords = {}
for k,v in pairs(keywords) do
    reverse_keywords[v] = k
end

-- Root state table.
local defs = selfify({})

-- Character -> character-class name.  Note that the letters a-f/A-F are
-- classified as "hexdigit", not "alpha"; states that want them as letters
-- remap "hexdigit" themselves (see `defs.hexdigit = "alpha"` below).
defs.base = {
    [" "] = "whitespace",
    ["\n"] = "newline",
    ["\r"] = "newline",
    ["\v"] = "whitespace",
    ["\t"] = "whitespace",
    ["\f"] = "whitespace",
    ["0"] = "digit",
    ["1"] = "digit",
    ["2"] = "digit",
    ["3"] = "digit",
    ["4"] = "digit",
    ["5"] = "digit",
    ["6"] = "digit",
    ["7"] = "digit",
    ["8"] = "digit",
    ["9"] = "digit",
    ["a"] = "hexdigit",
    ["b"] = "hexdigit",
    ["c"] = "hexdigit",
    ["d"] = "hexdigit",
    ["e"] = "hexdigit",
    ["f"] = "hexdigit",
    ["A"] = "hexdigit",
    ["B"] = "hexdigit",
    ["C"] = "hexdigit",
    ["D"] = "hexdigit",
    ["E"] = "hexdigit",
    ["F"] = "hexdigit",
    ["g"] = "alpha",
    ["h"] = "alpha",
    ["i"] = "alpha",
    ["j"] = "alpha",
    ["k"] = "alpha",
    ["l"] = "alpha",
    ["m"] = "alpha",
    ["n"] = "alpha",
    ["o"] = "alpha",
    ["p"] = "alpha",
    ["q"] = "alpha",
    ["r"] = "alpha",
    ["s"] = "alpha",
    ["t"] = "alpha",
    ["u"] = "alpha",
    ["v"] = "alpha",
    ["w"] = "alpha",
    ["x"] = "alpha",
    ["y"] = "alpha",
    ["z"] = "alpha",
    ["G"] = "alpha",
    ["H"] = "alpha",
    ["I"] = "alpha",
    ["J"] = "alpha",
    ["K"] = "alpha",
    ["L"] = "alpha",
    ["M"] = "alpha",
    ["N"] = "alpha",
    ["O"] = "alpha",
    ["P"] = "alpha",
    ["Q"] = "alpha",
    ["R"] = "alpha",
    ["S"] = "alpha",
    ["T"] = "alpha",
    ["U"] = "alpha",
    ["V"] = "alpha",
    ["W"] = "alpha",
    ["X"] = "alpha",
    ["Y"] = "alpha",
    ["Z"] = "alpha",
}

-- Hook body: advances state.line by one (a missing value counts as line 1).
local function countline(state, token, rule)
    state.line = (state.line or 1) + 1
end

-- Installs "\n" and "\r" entries on `t` and returns `t`.
--   t        : state table to modify in place
--   hookn    : key under which `countline` is stored in each new entry
--   fallback : table used as __index for the new entries (defaults to `t`)
-- Each entry also contains a nested entry for the *other* newline character
-- that does not carry `countline`, so a "\r\n" or "\n\r" pair only reaches
-- `countline` through the first character's entry.
local function mknewline(t, hookn, fallback)
    fallback = fallback or t
    t["\n"] = setmetatable({[hookn] = countline, ["\r"] = setmetatable({}, {__index=fallback})}, {__index=fallback})
    t["\r"] = setmetatable({[hookn] = countline, ["\n"] = setmetatable({}, {__index=fallback})}, {__index=fallback})
    return t
end

-- ---------------------------------------------------------------------------
-- Quoted strings ('...' and "...")
-- ---------------------------------------------------------------------------
do local tstring = selfify({})
    defs.string = tstring
    tstring.defs = defs

    -- State entered after a backslash inside a quoted string.
    do local tsescapes = setmetatable(mknewline({
            ["'"] = "insertraw",
            ['"'] = "insertraw",
            ['\\'] = "insertraw",
            ["a"] = "insertmap",
            ["b"] = "insertmap",
            ["f"] = "insertmap",
            ["n"] = "insertmap",
            ["r"] = "insertmap",
            ["t"] = "insertmap",
            ["v"] = "insertmap",
            ["z"] = "skipwhitespace",
            ["u"] = "unicode",
            ["x"] = "hex",
            --["\n"] = setmetatable({[1] = countline, ["\r"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
            --["\r"] = setmetatable({[1] = countline, ["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
            -- A backslash followed by either newline character collects "\n".
            [1] = function(state, token, rule) if token == "\r" or token == "\n" then collect_fallback(state, "\n") end end,
        }, 1, tstring), {__index = defs.base})
        defs.string.escapes = tsescapes
        tsescapes.string = defs.string

        -- Escapes that stand for themselves: \' \" \\
        function tsescapes.insertraw(state, token)
            collect_fallback(state, token)
            return "string"
        end

        do
            local map = { ["a"] = "\a", ["b"] = "\b", ["f"] = "\f", ["n"] = "\n", ["r"] = "\r", ["t"] = "\t", ["v"] = "\v" }
            -- Single-letter escapes: collects the control character for `token`.
            function tsescapes.insertmap(state, token)
                collect_fallback(state, map[token])
                return "string"
            end
        end

        -- Decimal escape (\ddd, up to three digits).
        -- Accumulates the value in state.in_digit and the digit count in
        -- state.c.  While fewer than three digits have been read it returns
        -- "digitc" to keep reading.  Otherwise it collects the resulting byte
        -- (and, when `token` was not a digit, `token` itself) and returns
        -- "string".  Returns nil when the value exceeds 255.
        function tsescapes.digit(state, token)
            -- position in this string modulo 10 is the digit's value ("0" is 10th)
            local digit = string.find("1234567890", token, 1, true)
            local num = state.in_digit
            if digit then
                num = (num or 0) * 10 + digit % 10
                state.c = (state.c or 0) + 1
                if state.c < 3 then
                    state.in_digit = num
                    return "digitc"
                end
            end
            if num > 255 then
                return nil
            end
            collect_fallback(state, string.char(num))
            state.in_digit = nil
            state.c = nil
            if not digit then
                collect_fallback(state, token)
            end
            return "string"
        end
        -- Continuation state for decimal escapes; anything it does not define
        -- is looked up in `tstring`, and unmatched input goes to `digit` above.
        tsescapes.digitc = setmetatable(selfify({[parser.FALLBACK] = tsescapes.digit, string = tstring}, "digitc"), {__index=tstring})
        -- Flushes the pending decimal escape when called with rule == nil.
        -- NOTE(review): what rule == nil means is defined by the parser module.
        tsescapes.digitc[1]=function(state, token, rule)
            if rule == nil then
                collect_fallback(state, string.char(state.in_digit))
                state.in_digit = nil
                state.c = nil
            end
        end

        -- Hex escape (\xXX): exactly two hex digits.  Decimal digits are
        -- routed to the same handler through `digit = "hexdigit"`.
        tsescapes.hex = setmetatable(selfify({string = defs.string, digit = "hexdigit"}), {__index=defs.base})
        function tsescapes.hex.hexdigit(state, token)
            -- position modulo 16 is the hex digit's value, for either letter case
            local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
            assert(digit, "this should never be called for non-hex-digits")
            local num = state.in_hex
            if num then
                -- second digit: emit the byte
                num = num * 16 + digit % 16
                collect_fallback(state, string.char(num))
                state.in_hex = nil
                return "string"
            else
                -- first digit: remember it
                state.in_hex = digit % 16
                return "self"
            end
        end

        -- Unicode escape (\u{XXX}).
        do local tseunicode = {}
            tseunicode["{"] = "hex"
            do local tseuhex = setmetatable(selfify({digit = "hexdigit", string=tstring}), {__index=defs.base})
                tseunicode.hex = tseuhex
                -- Accumulates the code point in state.in_hex.  Returns nil
                -- (no next state) once the value exceeds 2^31-1.
                function tseuhex.hexdigit(state, token)
                    local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
                    assert(digit, "this should never be called for non-hex-digits")
                    state.in_hex = (state.in_hex or 0) * 16 + digit % 16
                    if state.in_hex <= 2147483647 then
                        return "self"
                    end
                end
                -- Closing brace: encodes the accumulated value as a UTF-8
                -- style byte sequence and collects it.
                tseuhex["}"] = function(state, token)
                    local num = state.in_hex
                    state.in_hex = nil
                    if num == nil then
                        -- "\u{}" without any hex digit.  Reject it the same way
                        -- an out-of-range decimal escape is rejected, instead
                        -- of failing on a nil comparison below.
                        return nil
                    end
                    if num < 128 then
                        -- single-byte (ASCII) case
                        collect_fallback(state, string.char(num))
                        return "string"
                    end
                    local bytes = ""
                    -- peel off 6-bit groups as continuation bytes (10xxxxxx)
                    while num > 63 do
                        local v = num % 64
                        bytes = string.char(128 + v) .. bytes -- yeah ik, not the most efficient
                        num = (num - v) / 64
                    end
                    -- the lead byte loses one payload bit per continuation
                    -- byte; if the remainder does not fit, emit one more
                    if num >= 2^6/(2^#bytes) then
                        local v = num % 64
                        bytes = string.char(128 + v) .. bytes
                        num = (num - v) / 64
                    end
                    do
                        -- lead byte: one high marker bit per continuation byte,
                        -- on top of the 128 added below, plus remaining payload
                        local v = 0
                        for i=1,#bytes do
                            v = v + 128 / 2^i
                        end
                        v = v + num
                        assert(v < 126)
                        bytes = string.char(128 + v) .. bytes
                    end
                    collect_fallback(state, bytes)
                    return "string"
                end
            end
            tsescapes.unicode = tseunicode
        end

        -- \z escape: stays in this state on whitespace and newlines, anything
        -- else falls back to "string".
        do local tseskipwhitespace = selfify(mknewline({
                string = defs.string,
                whitespace = "self",
                [parser.FALLBACK] = "string",
                [1] = collect_fallback,
            }, 2))
            --tseskipwhitespace["\n"] = setmetatable({[2] = countline, ["\r"] = setmetatable({}, {__index=tseskipwhitespace})}, {__index=tseskipwhitespace})
            --tseskipwhitespace["\r"] = setmetatable({[2] = countline, ["\n"] = setmetatable({}, {__index=tseskipwhitespace})}, {__index=tseskipwhitespace})
            local tbase = defs.base
            local tbasemap = {whitespace = "whitespace"}
            -- Only the "whitespace" class is taken from defs.base; every other
            -- key is resolved through `tstring`.
            setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end})
            tsescapes.skipwhitespace = tseskipwhitespace
        end
    end

    tstring['\\'] = "escapes"
    tstring['"'] = "close"
    tstring["'"] = "close"

    -- raw newlines are rejected inside quoted strings
    tstring['\n'] = false
    tstring['\r'] = false

    tstring[parser.FALLBACK] = "self"

    tstring[1] = collect_fallback

    -- Quote character inside a string: if it matches the opening quote
    -- (state.in_string) the collected text is appended to `state` and control
    -- returns to `defs`; otherwise the quote is ordinary string content.
    function tstring.close(state, token)
        if state.in_string == token then
            state.in_string = nil
            state[#state+1] = table.concat(state[COLLECT])
            state[COLLECT] = nil
            return "defs"
        else
            collect_fallback(state, token)
            return "self"
        end
    end
end

-- ---------------------------------------------------------------------------
-- Long strings ([[...]], [=[...]=], ...)
-- ---------------------------------------------------------------------------
do local tlongstring = {}
    defs.longstring = tlongstring
    -- Body of the long string.  A "]" may start the closing bracket.
    do local tllongstring_proper = selfify({[parser.FALLBACK] = "self", ["]"] = function(state, token)
            state.longstring_close = 0
            return "maybe_end"
        end})
        tllongstring_proper[1] = false -- placeholder for newline handling
        tllongstring_proper[2] = collect_fallback

        -- After a "]": count "=" signs and compare with the opening level.
        do local tllmaybe_end = selfify({defs = defs}, "maybe_end")
            tllongstring_proper.maybe_end = tllmaybe_end
            tllmaybe_end.longstring_proper = tllongstring_proper
            tllmaybe_end["="] = function(state, token)
                state.longstring_close = state.longstring_close + 1
                return "maybe_end"
            end
            tllmaybe_end["]"] = function(state, token)
                if state.longstring_close == state.longstring_count then
                    -- matching closing bracket: emit TK_STRING and its text
                    state.longstring_close = nil
                    state.longstring_count = nil
                    local pos = #state
                    state[pos+1] = TK_STRING
                    state[pos+2] = table.concat(state[COLLECT])
                    state[COLLECT] = nil
                    return "defs"
                else
                    -- level mismatch: the previous "]" and "=" run were
                    -- content; this "]" may itself start a closing bracket
                    collect_fallback(state, "]")
                    collect_fallback(state, ("="):rep(state.longstring_close))
                    state.longstring_close = 0
                    return "maybe_end"
                end
            end
            tllmaybe_end[parser.FALLBACK] = "longstring_proper"
            tllmaybe_end[1] = collect_fallback
            -- With no rule, the pending "]" and "=" run are put back as content.
            tllmaybe_end[-1] = function(state, token, rule)
                if not rule then
                    collect_fallback(state, "]")
                    collect_fallback(state, ("="):rep(state.longstring_close))
                    state.longstring_close = nil
                end
            end
        end

        tlongstring.longstring_proper = tllongstring_proper
        -- Newline entries live on `tlongstring`, the state entered right after
        -- the opening bracket, and fall back to the body state.
        mknewline(tlongstring, 1, tllongstring_proper)
        setmetatable(tlongstring, {__index=tllongstring_proper})
    end
end

defs["'"] = "string_open"
defs['"'] = "string_open"
defs["["] = "maybe_longstring"
-- After a "[": either a long-string opener ("[" or "=") or a plain "[" token.
defs.maybe_longstring = setmetatable({
    defs = defs,
    ['['] = "longstring_open",
    ['='] = "longstring_open",
    -- Counting further "=" signs of the opening bracket.
    longstring_count = selfify({
        ["="] = function(state, token)
            state.longstring_count = state.longstring_count + 1
            return "self"
        end,
        ["["] = function(state, token)
            state[COLLECT] = {coalesce=63} -- TODO tweak this for CPU/memory tradeoff?
            return "longstring"
        end,
        longstring = defs.longstring
    }),
    longstring_open = function(state, token)
        if token == "=" then
            -- First "=" of the opener, so the level is exactly 1.  The
            -- previous expression `state.longstring_count or 0 + 1` parsed as
            -- `state.longstring_count or 1` and kept a stale count if one was
            -- still set.
            state.longstring_count = 1
            return "longstring_count"
        elseif token == "[" then
            state.longstring_count = 0
            state[COLLECT] = {coalesce=63} -- TODO tweak this for CPU/memory tradeoff?
            return "longstring"
        end
    end,
    -- Not a long-string opener: emit the "[" as an ordinary token.
    [-1] = function(state, token, rule)
        if rule ~= "longstring_open" then
            state[#state+1] = "["
        end
    end
}, {__index=defs})

-- these are needed for proper line counts
--defs["\n"] = setmetatable({["\r"] = setmetatable({}, {__index=defs})}, {__index=defs})
--defs["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=defs})}, {__index=defs})
mknewline(defs, 1)

-- ---------------------------------------------------------------------------
-- Comments
-- ---------------------------------------------------------------------------
-- thankfully comments are easy
defs["-"] = "maybe_comment"
do local tmaybe_comment = setmetatable({["-"] = "comment"}, {__index=defs})
    defs.maybe_comment = tmaybe_comment
    tmaybe_comment[parser.EOZ] = "self" -- defs
    -- A lone "-" (not followed by another "-") is emitted as a token.
    tmaybe_comment[-1] = function(state, token, rule)
        if rule ~= "comment" then
            state[#state+1] = "-"
        end
    end
    do local tmcomment = {comment_proper = selfify({})}
        tmaybe_comment.comment = tmcomment
        tmcomment[parser.FALLBACK] = "comment_proper"
        tmcomment["["] = "maybe_longcomment"
        -- a newline ends a line comment: newline entries fall back to `defs`
        mknewline(tmcomment, 1, defs)
        mknewline(tmcomment.comment_proper, 1, defs)
        tmcomment.comment_proper[parser.FALLBACK] = "self"

        -- Long comments mirror the long-string states above, minus collecting.
        do local tllongcomment_proper = selfify({[parser.FALLBACK] = "self", ["]"] = function(state, token)
                state.longcomment_close = 0
                return "maybe_end"
            end})
            tmcomment.longcomment = tllongcomment_proper
            do local tllmaybe_end = selfify({defs = defs}, "maybe_end")
                tllongcomment_proper.maybe_end = tllmaybe_end
                tllmaybe_end.longcomment_proper = tllongcomment_proper
                tllmaybe_end["="] = function(state, token)
                    state.longcomment_close = state.longcomment_close + 1
                    return "maybe_end"
                end
                tllmaybe_end["]"] = function(state, token)
                    if state.longcomment_close == state.longcomment_count then
                        state.longcomment_close = nil
                        state.longcomment_count = nil
                        return "defs"
                    else
                        state.longcomment_close = 0
                        return "maybe_end"
                    end
                end
                tllmaybe_end[parser.FALLBACK] = "longcomment_proper"
                tllmaybe_end[-1] = function(state, token, rule)
                    if not rule then
                        state.longcomment_close = nil
                    end
                end
            end

            mknewline(tllongcomment_proper, 1, tllongcomment_proper)
        end

        -- After "--[": long-comment opener, or else an ordinary line comment
        -- (everything not defined here is looked up in `tmcomment`).
        tmcomment.maybe_longcomment = setmetatable({
            comment = tmcomment,
            ['['] = "longcomment_open",
            ['='] = "longcomment_open",
            longcomment_count = setmetatable(selfify({
                ["="] = function(state, token)
                    state.longcomment_count = state.longcomment_count + 1
                    return "longcomment_count"
                end,
                ["["] = "longcomment",
                longcomment = tmcomment.longcomment,
            }, "longcomment_count"), {__index=tmcomment}),
            longcomment_open = function(state, token)
                if token == "=" then
                    -- First "=" of the opener, so the level is exactly 1.  The
                    -- previous `state.longcomment_count or 0 + 1` parsed as
                    -- `state.longcomment_count or 1`.  An aborted opener
                    -- (e.g. "--[==x", which falls back to a line comment)
                    -- leaves a count behind, and reusing it made a later
                    -- long comment of a different level never terminate.
                    state.longcomment_count = 1
                    return "longcomment_count"
                elseif token == "[" then
                    state.longcomment_count = 0
                    return "longcomment"
                end
            end,
        }, {__index=tmcomment})
    end
end

local STATE = parser.STATE

-- ---------------------------------------------------------------------------
-- Two-character operators
-- ---------------------------------------------------------------------------
-- `defs.multitokens(first, second1, result1, second2, result2, ...)` installs
-- a state at defs[first] that emits `resultN` when `first` is followed by
-- `secondN`, and emits `first` itself otherwise.
defs.multitokens = setmetatable({
    [parser.EOZ] = "self",
    [-1] = function(state, token, rule)
        -- not a known second character: the first one is a token on its own
        if not state[STATE].multitoken[token] then
            state[#state+1] = state[STATE].first
        end
    end,
    second = function(state, token)
        state[#state+1] = state[STATE].multitoken[token]
        return "self" -- actually goes into defs
    end
}, {
    __index=defs,
    __call=function(t, first, ...)
        -- registers each (second, result) pair on the new state table
        local function helper(t, second, result, ...)
            if not second then return end
            t[second] = "second"
            t.multitoken[second] = result
            return helper(t, ...)
        end
        defs[first] = setmetatable({
            first = first,
            multitoken = {}
        }, {__index=t})
        return helper(defs[first], ...)
    end
})

defs.multitokens("=", "=", TK_EQ)
defs.multitokens("/", "/", TK_IDIV)
defs.multitokens("<", "<", TK_SHL, "=", TK_LE)
defs.multitokens(">", ">", TK_SHR, "=", TK_GE)
defs.multitokens("~", "=", TK_NE)
defs.multitokens(":", ":", TK_DBCOLON)

-- ".", "..", "..." and numbers that start with a dot.
defs["."] = setmetatable({
    [-1] = function(state, token, rule)
        if token ~= "." then
            if rule ~= "digit" then
                state[#state+1] = "."
            end
        end
    end,
    digit = function(state, token, rule)
        state[#state+1] = TK_FLT
        state[COLLECT] = {".", coalesce=31}
        return "in_decimal"
    end,
    ["."] = setmetatable({
        [-1] = function(state, token, rule)
            if token ~= "." then
                state[#state+1] = TK_CONCAT
            end
        end,
        ["."] = function(state, token)
            state[#state+1] = TK_DOTS
            return "self" -- actually goes into defs
        end,
    }, {__index=defs})
}, {__index=defs})

-- ---------------------------------------------------------------------------
-- Numbers
-- ---------------------------------------------------------------------------
-- First digit of a number: start collecting.  A leading "0" goes to `in_zero`
-- so that a following x/X can switch to hexadecimal.
function defs.digit(state, token)
    state[COLLECT] = {token, coalesce=31}
    if token == "0" then
        return "in_zero"
    else
        return "in_integer"
    end
end

defs.in_integer = setmetatable(selfify({
    hexdigit = "alpha",
    alpha = false,
    ['e'] = "exp",
    ['E'] = "exp",
    [parser.EOZ] = "self", -- defs
    exp = function(state, token)
        collect_fallback(state, token)
        return "in_exp"
    end,
    ['.'] = function(state, token)
        collect_fallback(state, token)
        return "in_decimal"
    end,
    digit = function(state, token)
        collect_fallback(state, token)
        return "in_digit"
    end,
    -- When the number is not being continued, emit the state's `numtype`
    -- followed by the numeric value of the collected text.
    [-1] = function(state, token, rule)
        -- TODO figure out best order for these checks
        if rule == "digit" or token == "." or rule == "hexdigit" or rule == "into_hex" or rule == "exp" then return end
        state[#state+1] = state[STATE].numtype
        state[#state+1] = tonumber(table.concat(state[COLLECT])) -- TODO maybe not the best option
        state[COLLECT] = nil
    end,
    numtype = TK_INT
}, "in_digit"), {__index=defs})

defs.in_zero = setmetatable({
    ['x'] = "into_hex",
    ['X'] = "into_hex",
    into_hex = function(state, token)
        collect_fallback(state, token)
        return "in_hex"
    end,
}, {__index=defs.in_integer})

-- After the decimal point: a second "." is rejected and the result is a float.
defs.in_decimal = setmetatable(selfify({
    ['.'] = false,
    numtype = TK_FLT
}, "in_digit"), {__index=defs.in_integer})

-- Exponent digits: a second exponent marker is rejected.
defs.in_expnum = setmetatable(selfify({
    exp = false,
}, "in_digit"), {__index=defs.in_decimal})

-- After the exponent sign: at least one digit is required.
defs.in_subexp = setmetatable({
    in_expnum = defs.in_expnum,
    digit = function(state, token)
        collect_fallback(state, token)
        return "in_expnum"
    end,
}, {__index=defs.base})

-- Right after the exponent marker: optional sign, then digits.
defs.in_exp = setmetatable({
    in_subexp = defs.in_subexp,
    ["+"] = "sign",
    ["-"] = "sign",
    sign = function(state, token)
        collect_fallback(state, token)
        return "in_subexp"
    end,
}, {__index=defs.in_subexp})

-- Hexadecimal digits: e/E are digits here, p/P is the exponent marker.
defs.in_hex = setmetatable(selfify({
    in_decimal = "in_hex_fraction",
    hexdigit = 'digit',
    ['e'] = 'hexdigit',
    ['E'] = 'hexdigit',
    ['p'] = 'exp',
    ['P'] = 'exp',
}, "in_digit"), {__index=defs.in_integer})

defs.in_hex_fraction = setmetatable(selfify({
    ['.'] = false,
    numtype = TK_FLT
}, "in_digit"), {__index=defs.in_hex})

-- ---------------------------------------------------------------------------
-- Single-character tokens, whitespace and identifiers
-- ---------------------------------------------------------------------------
-- Emits the character itself as the token.
function defs.simpletoken(state, token)
    state[#state+1] = token
    return "self"
end

for token in string.gmatch("+*%^#&|(){}];,", ".") do
    defs[token] = "simpletoken"
end

defs.whitespace = "self"
-- outside of numbers, a-f/A-F and "_" behave like any other letter
defs.hexdigit = "alpha"
defs["_"] = "alpha"
defs.in_alpha = setmetatable(selfify({digit = "in_alpha", hexdigit = "in_alpha", alpha = "in_alpha", _ = "in_alpha", [parser.EOZ] = "self"}, "in_alpha"), {__index=defs})
-- First character of a name or keyword: start collecting.
function defs.alpha(state, token)
    state[COLLECT] = {coalesce=15} -- TODO tweak this for CPU/memory tradeoff?
    collect_fallback(state, token)
    return "in_alpha"
end
-- While the word continues, keep collecting; once it ends, emit either the
-- keyword token or TK_NAME followed by the identifier text.
defs.in_alpha[-1] = function(state, token, rule)
    if rule == "alpha" or rule == "digit" or rule == "hexdigit" or token == "_" then
        collect_fallback(state, token)
    else
        local key = table.concat(state[COLLECT])
        state[COLLECT] = nil
        local keyword = keywords[key]
        if keyword then
            state[#state+1] = keyword
        else
            local pos = #state
            state[pos+1] = TK_NAME
            state[pos+2] = key
        end
    end
end

-- anything not defined directly on `defs` is classified through `defs.base`
setmetatable(defs, {__index=defs.base})

-- Opening quote: emits TK_STRING, starts collecting and remembers which quote
-- character must close the string.
-- Raises an error if a string is already open (state.in_string is set).
function defs.string_open(state, token)
    if not state.in_string then
        state[#state+1] = TK_STRING
        state[COLLECT] = {coalesce=63} -- TODO tweak this for CPU/memory tradeoff?
        state.in_string = token
        return "string"
    end
    -- This used to be `assert("this shouldn't happen")`, which can never fail
    -- because a non-empty string is truthy.
    error("this shouldn't happen")
end

-- ---------------------------------------------------------------------------
-- Exports
-- ---------------------------------------------------------------------------
local tokens = {
    TK_AND = TK_AND,
    TK_BREAK = TK_BREAK,
    TK_DO = TK_DO,
    TK_ELSE = TK_ELSE,
    TK_ELSEIF = TK_ELSEIF,
    TK_END = TK_END,
    TK_FALSE = TK_FALSE,
    TK_FOR = TK_FOR,
    TK_FUNCTION = TK_FUNCTION,
    TK_GOTO = TK_GOTO,
    TK_IF = TK_IF,
    TK_IN = TK_IN,
    TK_LOCAL = TK_LOCAL,
    TK_NIL = TK_NIL,
    TK_NOT = TK_NOT,
    TK_OR = TK_OR,
    TK_REPEAT = TK_REPEAT,
    TK_RETURN = TK_RETURN,
    TK_THEN = TK_THEN,
    TK_TRUE = TK_TRUE,
    TK_UNTIL = TK_UNTIL,
    TK_WHILE = TK_WHILE,
    TK_IDIV = TK_IDIV,
    TK_CONCAT = TK_CONCAT,
    TK_DOTS = TK_DOTS,
    TK_EQ = TK_EQ,
    TK_GE = TK_GE,
    TK_LE = TK_LE,
    TK_NE = TK_NE,
    TK_SHL = TK_SHL,
    TK_SHR = TK_SHR,
    TK_DBCOLON = TK_DBCOLON,
    TK_EOS = TK_EOS,
    TK_FLT = TK_FLT,
    TK_INT = TK_INT,
    TK_NAME = TK_NAME,
    TK_STRING = TK_STRING
}

-- `TK` holds the same token objects keyed without the "TK_" prefix.  Each
-- token object also gets a metatable so tostring() yields its full name.
local TK = {}
for k,v in pairs(tokens) do
    setmetatable(v, {__name=k, __tostring=function(self) return getmetatable(self).__name end})
    TK[k:sub(4)] = v
end

return {
    defs = defs,
    tokens = tokens,
    TK = TK,
    reverse_keywords = reverse_keywords,
    -- token object -> source spelling (empty for value-carrying tokens)
    reverse_tokens = {
        [TK_IDIV] = "//",
        [TK_CONCAT] = "..",
        [TK_DOTS] = "...",
        [TK_EQ] = "==",
        [TK_GE] = ">=",
        [TK_LE] = "<=",
        [TK_NE] = "~=",
        [TK_SHL] = "<<",
        [TK_SHR] = ">>",
        [TK_DBCOLON] = "::",
        [TK_EOS] = "",
        [TK_FLT] = "",
        [TK_INT] = "",
        [TK_NAME] = "",
        [TK_STRING] = ""
    },
}