Attempted Lua tokenizer; didn't work

Publishing anyway because someone might be able to learn from my failure
SoniEx2 2019-04-03 17:08:29 -03:00
parent d03d77d28b
commit 5a4b41bd47
3 changed files with 304 additions and 11 deletions

luatokens.lua (new file, 225 lines)

@@ -0,0 +1,225 @@
-- Lua tokens
-- we need some stuff from here
local parser = require "parser"
local selfify = parser.selfify
-- "dummies"
local TK_STRING = {}
local tokens = {}
tokens.base = {
[" "] = "whitespace",
["\n"] = "newline",
["\r"] = "newline",
["\v"] = "whitespace",
["\t"] = "whitespace",
["\f"] = "whitespace",
["0"] = "digit",
["1"] = "digit",
["2"] = "digit",
["3"] = "digit",
["4"] = "digit",
["5"] = "digit",
["6"] = "digit",
["7"] = "digit",
["8"] = "digit",
["9"] = "digit",
["a"] = "hexdigit",
["b"] = "hexdigit",
["c"] = "hexdigit",
["d"] = "hexdigit",
["e"] = "hexdigit",
["f"] = "hexdigit",
["A"] = "hexdigit",
["B"] = "hexdigit",
["C"] = "hexdigit",
["D"] = "hexdigit",
["E"] = "hexdigit",
["F"] = "hexdigit",
["g"] = "alpha",
["h"] = "alpha",
["i"] = "alpha",
["j"] = "alpha",
["k"] = "alpha",
["l"] = "alpha",
["m"] = "alpha",
["n"] = "alpha",
["o"] = "alpha",
["p"] = "alpha",
["q"] = "alpha",
["r"] = "alpha",
["s"] = "alpha",
["t"] = "alpha",
["u"] = "alpha",
["v"] = "alpha",
["w"] = "alpha",
["x"] = "alpha",
["y"] = "alpha",
["z"] = "alpha",
["G"] = "alpha",
["H"] = "alpha",
["I"] = "alpha",
["J"] = "alpha",
["K"] = "alpha",
["L"] = "alpha",
["M"] = "alpha",
["N"] = "alpha",
["O"] = "alpha",
["P"] = "alpha",
["Q"] = "alpha",
["R"] = "alpha",
["S"] = "alpha",
["T"] = "alpha",
["U"] = "alpha",
["V"] = "alpha",
["W"] = "alpha",
["X"] = "alpha",
["Y"] = "alpha",
["Z"] = "alpha",
}
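-- Hooks: numeric entries ([1], [2], ...) in a state table are not transitions;
-- the parser (see the parser.lua changes below) calls each of them with
-- (state, token, rule) for every token handled while that state is active.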
local function linecount(state, token, rule)
if token == "\n" or token == "\r" then
state.line = (state.line or 1) + 1
end
end
do local tstring = selfify({})
tokens.string = tstring
tstring.tokens = tokens
do local tsescapes = setmetatable({
["'"] = "insertraw",
['"'] = "insertraw",
['\\'] = "insertraw",
["a"] = "insertmap",
["b"] = "insertmap",
["f"] = "insertmap",
["n"] = "insertmap",
["r"] = "insertmap",
["t"] = "insertmap",
["v"] = "insertmap",
["z"] = "skipwhitespace",
["u"] = "unicode",
["x"] = "hex",
["\n"] = setmetatable({["\r"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
[1] = linecount,
[2] = print
}, {__index = tokens.base})
tokens.string.escapes = tsescapes
tsescapes.string = tokens.string
function tsescapes.insertraw(state, token)
state[#state+1] = token
return "string"
end
do
local map = { ["a"] = "\a", ["b"] = "\b", ["f"] = "\f", ["n"] = "\n", ["r"] = "\r", ["t"] = "\t", ["v"] = "\v" }
function tsescapes.insertmap(state, token)
state[#state+1] = map[token]
return "string"
end
end
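-- Decimal escapes (\d, \dd, \ddd): accumulate up to three digits, then emit the
-- byte they name; values above 255 yield nil, i.e. an unexpected-token error.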
function tsescapes.digit(state, token)
local digit = string.find("1234567890", token, 1, true)
local num = state.in_digit
if digit then
num = (num or 0) * 10 + digit % 10
state.c = (state.c or 0) + 1
if state.c < 3 then
state.in_digit = num
return "digitc"
end
end
if num > 255 then
return nil
end
state[#state+1] = string.char(num)
state.in_digit = nil
state.c = nil
return "string"
end
tsescapes.digitc = setmetatable(selfify({[""] = tsescapes.digit, digitc = "self", string = tstring}), {__index=tstring})
tsescapes.hex = setmetatable(selfify({string = tokens.string}), {__index=tokens.base})
function tsescapes.hex.hexdigit(state, token)
local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
assert(digit, "this should never be called for non-hex-digits")
local num = state.in_hex
if num then
num = num * 16 + digit % 16
state[#state+1] = string.char(num)
state.in_hex = nil
return "string"
else
state.in_hex = digit % 16
return "self"
end
end
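-- \z skips a run of whitespace (newlines included), then drops back into the
-- string state, re-inserting the first non-whitespace character it sees.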
do local tseskipwhitespace = selfify({
string = tokens.string,
whitespace = "self",
[""] = "string",
[1] = parser.insert_fallback,
[2] = linecount,
})
local tbase = tokens.base
local tbasemap = {whitespace = "whitespace", newline = "whitespace"}
setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end})
tsescapes.skipwhitespace = tseskipwhitespace
end
end
tstring['\\'] = "escapes"
tstring['"'] = "close"
tstring["'"] = "close"
tstring['\n'] = false
tstring['\r'] = false
tstring[""] = "self"
tstring[1] = parser.insert_fallback
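-- close is reached for both quote characters; only the quote that opened the
-- string (state.in_string) actually closes it, the other one stays literal.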
function tstring.close(state, token)
if state.in_string == token then
local i = state.string_start
state.in_string = nil
state.string_start = nil
state[i+1] = table.concat(state, '', i+1)
for j=i+2, #state do
state[j]=nil
end
return "tokens"
else
state[#state+1] = token
return "self"
end
end
end
tokens["'"] = "string_open"
tokens['"'] = "string_open"
setmetatable(tokens, {__index=tokens.base}) -- fall back to the character classes for untokenized characters
function tokens.string_open(state, token)
if not state.in_string then
state[#state+1] = TK_STRING
state.in_string = token
state.string_start = #state
return "string"
end
assert(false, "this shouldn't happen")
end
return {
tokens = tokens,
TK_STRING = TK_STRING,
}
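How this is meant to be driven: parser.parse (from parser.lua, changed below) walks the input one character at a time through these tables. A minimal sketch of the intended usage, mirroring the test file; the result described in the comment is the goal, not something this commit actually achieves:

local parser = require "parser"
local luatokens = require "luatokens"

-- feed a small Lua string literal through the token tables
local state, err, etoken = parser.parse(luatokens.tokens, [["hello world"]])
if not state then
    print("tokenizer error:", err, etoken)
else
    -- goal: state[1] == luatokens.TK_STRING and state[2] == "hello world"
    for i, v in ipairs(state) do
        print(i, v)
    end
end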

parser.lua

@@ -31,27 +31,34 @@ local type, tostring
local function get_next_common(state, in_pos, token)
-- note: must preserve "token" - do not call recursively with a different token
local transition
-if state[STATE] ~= nil then
-transition = state[STATE][token]
-if not transition then
-transition = state[STATE][""]
+if state[STATE] then
+local st = state[STATE]
+local rule = st[token]
+transition = rule
+if transition == nil then
+transition = st[""]
end
local recheck = true
while recheck do
recheck = false
local tytrans = type(transition)
if tytrans == "string" then
-transition = state[STATE][transition]
+transition = st[transition]
recheck = true
elseif tytrans == "function" then
transition = transition(state, token)
recheck = true
end
end
-state[STATE] = transition -- may be nil
+for i, hook in ipairs(st) do
+if hook then -- allow overriding/disabling hooks
+hook(state, token, rule)
+end
+end
+state[STATE] = transition -- may be nil or false
end
--- must NOT use elseif here - the above may set state to nil!
-if state[STATE] == nil then
+-- must NOT use elseif here - the above may set state to nil or false!
+if not state[STATE] then
-- unexpected token. stream consumer may attempt to recover,
-- but we do this mostly to differentiate it from "end of stream" condition.
return in_pos - 1, nil, "unexpected token", token, state
@@ -112,8 +119,21 @@ local function parse(defs, data)
end
end
-- utility function that's quite common
local function selfify(t)
t.self = t
return t
end
-- common hook
local function insert_fallback(state, token, rule)
if not rule then
state[#state+1] = token
end
end
return {
STATE = STATE,
stream = stream,
parse = parse,
selfify = selfify,
insert_fallback = insert_fallback,
}
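For what it's worth: selfify just lets a state table refer to itself under the name "self", and insert_fallback is a hook that copies into the state any token no explicit rule claimed. A toy definition table using both, purely illustrative and not part of the commit:

local parser = require "parser"

-- stay in the same state forever and collect every character that had no rule
local collect_all = parser.selfify({
    [""] = "self",                 -- fallback transition: accept any token
    [1] = parser.insert_fallback,  -- hook: keep tokens that matched no named rule
})

local state = parser.parse(collect_all, "abc")
print(table.concat(state))         -- expected to print: abc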

(test file)

@@ -18,6 +18,12 @@
local parser = require "parser"
local caseno = 0
local function case()
caseno = caseno + 1
return caseno
end
do -- trim left spaces
local defs = {}
defs.self = defs
@@ -52,7 +58,49 @@ do -- trim left spaces
end
for k,v in ipairs({"hello", " hello", "\t \v \n\r hallo", "I really like this parser thingy if it can be called that"}) do
local state, err = parser.parse(defs, v)
-if not state then print(err) end
-print(table.concat(state))
+if not state then
+print(case(), err)
+else
+print(case(), table.concat(state))
+end
end
-end
+end -- trim left spaces
do -- lua tokens
local luatokens = require "luatokens"
local tokens = luatokens.tokens
local state, err, etoken, estate = parser.parse(tokens, [["hello world"]])
local case = case()
print(case, "---- IN TOKENS ----")
if not state then
print(case, err, etoken)
for i,v in pairs(estate) do
print(case, i, v)
end
else
for i,v in ipairs(state) do
print(case, i, v)
end
end
print(case, "---- OUT TOKENS ----")
end -- lua tokens
do -- more lua tokens
local luatokens = require "luatokens"
local tokens = luatokens.tokens
local state, err, etoken, estate = parser.parse(tokens, [["\a\b\f\n\r\t\v\\\"\'\z \x41\65\
"]])
local case = case()
print(case, "---- IN TOKENS ----")
if not state then
print(case, err, etoken)
for i,v in pairs(estate) do
print(case, i, v)
end
else
for i,v in ipairs(state) do
print(case, i, v)
end
end
print(case, "---- OUT TOKENS ----")
end -- more lua tokens
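For reference, the escape-heavy literal in the last case should decode, by Lua's own rules, to the seven control characters, a backslash, both quotes, then "AA" (\x41 and \65 both name "A", and \z swallows the space after it), and a final newline from the backslash at the end of the line. So the tokenizer is eventually expected to produce the same bytes as:

-- the string value the last test case should eventually yield
local expected = "\a\b\f\n\r\t\v\\\"\'" .. "AA" .. "\n"
print(#expected) --> 13 bytes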