Attempted Lua tokenizer; didn't work.
Publishing anyway because someone might be able to learn from my failure.
parent d03d77d28b
commit 5a4b41bd47

luatokens.lua (new file)
@@ -0,0 +1,225 @@
-- Lua tokens

-- we need some stuff from here
local parser = require "parser"
local selfify = parser.selfify

-- "dummies"
local TK_STRING = {}

local tokens = {}

-- character-class map, used as an __index fallback by the state tables below
tokens.base = {
    [" "] = "whitespace", ["\t"] = "whitespace", ["\v"] = "whitespace", ["\f"] = "whitespace",
    ["\n"] = "newline", ["\r"] = "newline",
}
for c in ("0123456789"):gmatch(".") do tokens.base[c] = "digit" end
for c in ("abcdefABCDEF"):gmatch(".") do tokens.base[c] = "hexdigit" end -- kept distinct from "alpha"
for c in ("ghijklmnopqrstuvwxyzGHIJKLMNOPQRSTUVWXYZ"):gmatch(".") do tokens.base[c] = "alpha" end
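
A quick sanity check of how the table above classifies characters (a sketch, not part of the commit):

-- example: tokens.base lookups
assert(tokens.base["7"] == "digit")
assert(tokens.base["e"] == "hexdigit") -- a-f/A-F stay distinct from plain "alpha"
assert(tokens.base["g"] == "alpha")
assert(tokens.base["\n"] == "newline")
-- end example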

local function linecount(state, token, rule)
    if token == "\n" or token == "\r" then
        state.line = (state.line or 1) + 1
    end
end
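
linecount is meant to be installed as a positional hook (the [1]/[2] array slots in the state tables below); the parser calls every such hook with (state, token, rule). Standalone, its effect is just this (sketch):

-- example: the counter bumps only on newline tokens
local state = {}
linecount(state, "\n")
linecount(state, "x")
assert(state.line == 2) -- counting starts at line 1
-- end example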

do local tstring = selfify({})
    tokens.string = tstring
    tstring.tokens = tokens
    do local tsescapes = setmetatable({
        ["'"] = "insertraw",
        ['"'] = "insertraw",
        ['\\'] = "insertraw",
        ["a"] = "insertmap",
        ["b"] = "insertmap",
        ["f"] = "insertmap",
        ["n"] = "insertmap",
        ["r"] = "insertmap",
        ["t"] = "insertmap",
        ["v"] = "insertmap",
        ["z"] = "skipwhitespace",
        ["u"] = "unicode",
        ["x"] = "hex",
        -- "\<newline>": consume the line break, allowing a \r after \n (and vice versa)
        ["\n"] = setmetatable({["\r"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
        ["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
        -- array slots are hooks; the parser calls them for every token seen in this state
        [1] = linecount,
        [2] = print, -- debug hook
    }, {__index = tokens.base})
    tokens.string.escapes = tsescapes
    tsescapes.string = tokens.string

    function tsescapes.insertraw(state, token)
        state[#state+1] = token
        return "string"
    end

    do
        local map = { ["a"] = "\a", ["b"] = "\b", ["f"] = "\f", ["n"] = "\n", ["r"] = "\r", ["t"] = "\t", ["v"] = "\v" }
        function tsescapes.insertmap(state, token)
            state[#state+1] = map[token]
            return "string"
        end
    end

    -- decimal escapes: \d, \dd or \ddd; the value must fit in a byte
    function tsescapes.digit(state, token)
        local digit = string.find("1234567890", token, 1, true)
        local num = state.in_digit
        if digit then
            num = (num or 0) * 10 + digit % 10
            state.c = (state.c or 0) + 1
            if state.c < 3 then
                state.in_digit = num
                return "digitc"
            end
        end
        if num > 255 then
            return nil -- out of range; the parser reports "unexpected token"
        end
        state[#state+1] = string.char(num)
        state.in_digit = nil
        state.c = nil
        return "string"
    end
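
The string.find trick above encodes each digit's value in its position: "0" sits at position 10, so the position mod 10 recovers the value. Standalone (sketch):

-- example: position-mod-10 digit decoding
assert(string.find("1234567890", "7", 1, true) % 10 == 7)
assert(string.find("1234567890", "0", 1, true) % 10 == 0)
-- "\65" accumulates 6*10 + 5 = 65, and string.char(65) == "A"
-- end example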
    tsescapes.digitc = setmetatable(selfify({[""] = tsescapes.digit, digitc = "self", string = tstring}), {__index=tstring})

    tsescapes.hex = setmetatable(selfify({string = tokens.string}), {__index=tokens.base})
    function tsescapes.hex.hexdigit(state, token)
        local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
        assert(digit, "this should never be called for non-hex-digits")
        local num = state.in_hex
        if num then
            -- second digit: combine and emit
            num = num * 16 + digit % 16
            state[#state+1] = string.char(num)
            state.in_hex = nil
            return "string"
        else
            -- first digit: stash it and wait for the second
            state.in_hex = digit % 16
            return "self"
        end
    end
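
The 32-character haystack plays the same trick in base 16: lowercase digits sit 16 positions after their uppercase twins, so both cases decode to the same value mod 16 (sketch):

-- example: position-mod-16 hex decoding
assert(string.find("123456789ABCDEF0123456789abcdef0", "A", 1, true) % 16 == 10)
assert(string.find("123456789ABCDEF0123456789abcdef0", "a", 1, true) % 16 == 10)
-- "\x41" accumulates 4*16 + 1 = 65 across two calls, i.e. string.char(65) == "A"
-- end example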

    do local tseskipwhitespace = selfify({
        string = tokens.string,
        whitespace = "self",
        [""] = "string",
        [1] = parser.insert_fallback,
        [2] = linecount,
    })
    local tbase = tokens.base
    local tbasemap = {whitespace = "whitespace", newline = "whitespace"}
    -- treat newlines like any other whitespace here; everything else defers to the string state
    setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end})
    tsescapes.skipwhitespace = tseskipwhitespace
    end
    end

    tstring['\\'] = "escapes"

    tstring['"'] = "close"
    tstring["'"] = "close"

    tstring['\n'] = false -- unescaped line breaks are not valid in short strings
    tstring['\r'] = false

    tstring[""] = "self"

    tstring[1] = parser.insert_fallback -- hook: keep any character no rule matched

    function tstring.close(state, token)
        if state.in_string == token then
            -- found the matching quote: collapse the scratch characters into one value
            local i = state.string_start
            state.in_string = nil
            state.string_start = nil
            state[i+1] = table.concat(state, '', i+1)
            for j=i+2, #state do
                state[j]=nil
            end
            return "tokens"
        else
            -- the other quote character is just a regular character inside this string
            state[#state+1] = token
            return "self"
        end
    end
end
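
For a short string that makes it all the way through, the intended final shape of the state array is a TK_STRING marker followed by the concatenated contents (the commit message notes this never quite worked in practice):

-- example: intended result of parsing [["hi"]]
--   state[1] = TK_STRING  (pushed by tokens.string_open below)
--   state[2] = "hi"       (table.concat of the scratch characters)
--   state.in_string and state.string_start are cleared again
-- end example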

tokens["'"] = "string_open"
tokens['"'] = "string_open"

-- assumption: the original said {__index=whitespace}, but no `whitespace` table is
-- ever defined (the global is nil); tokens.base is the closest sensible fallback
setmetatable(tokens, {__index=tokens.base})

function tokens.string_open(state, token)
    if not state.in_string then
        state[#state+1] = TK_STRING
        state.in_string = token
        state.string_start = #state
        return "string"
    end
    assert(false, "this shouldn't happen")
end

return {
    tokens = tokens,
    TK_STRING = TK_STRING,
}

parser.lua (36 lines changed)
@@ -31,27 +31,34 @@ local type, tostring
 local function get_next_common(state, in_pos, token)
     -- note: must preserve "token" - do not call recursively with a different token
     local transition
-    if state[STATE] ~= nil then
-        transition = state[STATE][token]
-        if not transition then
-            transition = state[STATE][""]
+    if state[STATE] then
+        local st = state[STATE]
+        local rule = st[token]
+        transition = rule
+        if transition == nil then
+            transition = st[""]
         end
         local recheck = true
         while recheck do
             recheck = false
             local tytrans = type(transition)
             if tytrans == "string" then
-                transition = state[STATE][transition]
+                transition = st[transition]
                 recheck = true
             elseif tytrans == "function" then
                 transition = transition(state, token)
                 recheck = true
             end
         end
-        state[STATE] = transition -- may be nil
+        for i, hook in ipairs(st) do
+            if hook then -- allow overriding/disabling hooks
+                hook(state, token, rule)
+            end
+        end
+        state[STATE] = transition -- may be nil or false
     end
-    -- must NOT use elseif here - the above may set state to nil!
-    if state[STATE] == nil then
+    -- must NOT use elseif here - the above may set state to nil or false!
+    if not state[STATE] then
         -- unexpected token. stream consumer may attempt to recover,
         -- but we do this mostly to differentiate it from "end of stream" condition.
         return in_pos - 1, nil, "unexpected token", token, state
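
The loop above resolves transitions by type: a string is re-looked-up in the current state table, a function is called with (state, token), and only a table (or nil/false) is final. A standalone sketch of that resolution, with made-up names (not part of the commit):

-- example: transition resolution
local st = {}
st.self = st
st["a"] = "on_a"                  -- string: re-looked-up in st
st.on_a = function(state, token)  -- function: called, result resolved again
    return "self"
end

local transition = st["a"]
while type(transition) == "string" or type(transition) == "function" do
    if type(transition) == "string" then
        transition = st[transition]
    else
        transition = transition({}, "a")
    end
end
assert(transition == st) -- "a" -> "on_a" -> function -> "self" -> st
-- end example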

@@ -112,8 +119,21 @@ local function parse(defs, data)
     end
 end

+-- utility function that's quite common
+local function selfify(t)
+    t.self = t
+    return t
+end
+
+-- common hook
+local function insert_fallback(state, token, rule)
+    if not rule then
+        state[#state+1] = token
+    end
+end
+
 return {
     STATE = STATE,
     stream = stream,
     parse = parse,
+    selfify = selfify,
+    insert_fallback = insert_fallback, -- luatokens.lua calls parser.insert_fallback; without this export it gets nil
 }
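
Standalone sketches of the two helpers added above (as if run with parser.lua's locals in scope):

-- example: selfify lets a state table name itself in transitions
local t = selfify({})
assert(t.self == t) -- so a rule can return "self" to stay in this state

-- example: insert_fallback keeps only the tokens no rule matched
local state = {}
insert_fallback(state, "x", nil)     -- rule is nil: token kept
insert_fallback(state, '"', "close") -- rule matched: token dropped
assert(state[1] == "x" and #state == 1)
-- end example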

test.lua (54 lines changed)
@@ -18,6 +18,12 @@

 local parser = require "parser"

+local caseno = 0
+local function case()
+    caseno = caseno + 1
+    return caseno
+end
+
 do -- trim left spaces
     local defs = {}
     defs.self = defs

@@ -52,7 +58,49 @@ do -- trim left spaces
     end
     for k,v in ipairs({"hello", " hello", "\t \v \n\r hallo", "I really like this parser thingy if it can be called that"}) do
         local state, err = parser.parse(defs, v)
-        if not state then print(err) end
-        print(table.concat(state))
+        if not state then
+            print(case(), err)
+        else
+            print(case(), table.concat(state))
+        end
     end
     end
 end -- trim left spaces
+
+do -- lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.tokens
+    local state, err, etoken, estate = parser.parse(tokens, [["hello world"]])
+    local case = case()
+    print(case, "---- IN TOKENS ----")
+    if not state then
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+    else
+        for i,v in ipairs(state) do
+            print(case, i, v)
+        end
+    end
+    print(case, "---- OUT TOKENS ----")
+end -- lua tokens
+
+do -- more lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.tokens
+    local state, err, etoken, estate = parser.parse(tokens, [["\a\b\f\n\r\t\v\\\"\'\z \x41\65\
+"]])
+    local case = case()
+    print(case, "---- IN TOKENS ----")
+    if not state then
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+    else
+        for i,v in ipairs(state) do
+            print(case, i, v)
+        end
+    end
+    print(case, "---- OUT TOKENS ----")
+end -- more lua tokens