Fix \u

parent 1f1f6c0732
commit 0118cdcb80

luatokens.lua | 117 changed lines
@@ -1,4 +1,4 @@
--- Lua tokens
+-- Lua defs
 
 -- we need some stuff from here
 local parser = require "parser"
@@ -8,11 +8,27 @@ local COLLECT = parser.COLLECT
 local collect_fallback = parser.collect_fallback
 
 -- "dummies"
-local TK_STRING = {}
+-- see http://www.lua.org/source/5.3/llex.h.html#RESERVED
+local TK_AND, TK_BREAK,
+    TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
+    TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
+    TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
+    TK_IDIV, TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE,
+    TK_SHL, TK_SHR,
+    TK_DBCOLON, TK_EOS,
+    TK_FLT, TK_INT, TK_NAME, TK_STRING =
+    {}, {},
+    {}, {}, {}, {}, {}, {}, {},
+    {}, {}, {}, {}, {}, {}, {}, {},
+    {}, {}, {}, {}, {},
+    {}, {}, {}, {}, {}, {}, {},
+    {}, {},
+    {}, {},
+    {}, {}, {}, {}
 
-local tokens = {}
+local defs = {}
 
-tokens.base = {
+defs.base = {
     [" "] = "whitespace",
     ["\n"] = "newline",
     ["\r"] = "newline",
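
Note: every TK_* value is an empty table used as a unique sentinel. Token kinds compare by identity, never by content, so two distinct {} values can never collide. A minimal illustration (standalone Lua, names local to the example):

    local TK_NAME, TK_STRING = {}, {}
    local tok = TK_STRING
    assert(tok == TK_STRING)  -- same sentinel
    assert(tok ~= TK_NAME)    -- different sentinel
    assert(tok ~= {})         -- a fresh table never matches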
@@ -84,14 +100,15 @@ tokens.base = {
 }
 
 local function linecount(state, token, rule)
+    -- TODO fix
    if token == "\n" or token == "\r" then
        state.line = (state.line or 1) + 1
    end
 end
 
 do local tstring = selfify({})
-    tokens.string = tstring
-    tstring.tokens = tokens
+    defs.string = tstring
+    tstring.defs = defs
     do local tsescapes = setmetatable({
         ["'"] = "insertraw",
         ['"'] = "insertraw",
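
Note: the TODO presumably concerns CRLF: as written, a single "\r\n" sequence bumps state.line twice. A common fix is to count a CR/LF (or LF/CR) pair once; a sketch, not the author's fix, and the prev field is hypothetical:

    local function linecount(state, token, rule)
        if token == "\n" or token == "\r" then
            if state.prev and state.prev ~= token then
                state.prev = nil  -- second half of a CR/LF pair: already counted
            else
                state.prev = token
                state.line = (state.line or 1) + 1
            end
        else
            state.prev = nil
        end
    end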
@@ -110,9 +127,9 @@ do local tstring = selfify({})
         ["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
         [1] = linecount,
         [2] = function(state, token, rule) if token == "\r" or token == "\n" then collect_fallback(state, "\n") end end,
-    }, {__index = tokens.base})
-    tokens.string.escapes = tsescapes
-    tsescapes.string = tokens.string
+    }, {__index = defs.base})
+    defs.string.escapes = tsescapes
+    tsescapes.string = defs.string
 
     function tsescapes.insertraw(state, token)
         collect_fallback(state, token)
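
Note on the table shape: string keys map an input token to the next state (or to a named rule), while the numeric slots [1], [2], ... appear to be per-transition hooks that parser.lua runs for side effects such as line counting and character collection. Under that assumption, a minimal state table looks like:

    -- hypothetical mini-state built the same way
    local base = { [" "] = "whitespace", ["\n"] = "newline" }
    local skipspaces = setmetatable({
        whitespace = "self",                  -- named rule: stay in this state
        [1] = function(state, token) end,     -- side-effect hook, runs on every transition
    }, {__index = base})                      -- unlisted tokens fall back to base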
@@ -158,7 +175,7 @@ do local tstring = selfify({})
            end
        end
 
-    tsescapes.hex = setmetatable(selfify({string = tokens.string, digit = "hexdigit"}), {__index=tokens.base})
+    tsescapes.hex = setmetatable(selfify({string = defs.string, digit = "hexdigit"}), {__index=defs.base})
     function tsescapes.hex.hexdigit(state, token)
         local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
         assert(digit, "this should never be called for non-hex-digits")
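
Note: the lookup string packs each digit's value into its position: string.find returns an index from 1 to 31, and index % 16 is the digit's numeric value, with "0" placed at position 16 so that it maps to 0. A quick check:

    local map = "123456789ABCDEF0123456789abcdef0"
    assert(string.find(map, "0", 1, true) % 16 == 0)
    assert(string.find(map, "9", 1, true) % 16 == 9)
    assert(string.find(map, "A", 1, true) % 16 == 10)
    assert(string.find(map, "f", 1, true) % 16 == 15)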
@@ -174,14 +191,60 @@ do local tstring = selfify({})
            end
        end
 
+        do local tseunicode = {}
+            tseunicode["{"] = "hex"
+            do local tseuhex = setmetatable(selfify({digit = "hexdigit", string=tstring}), {__index=defs.base})
+                tseunicode.hex = tseuhex
+                function tseuhex.hexdigit(state, token)
+                    local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
+                    assert(digit, "this should never be called for non-hex-digits")
+                    state.in_hex = (state.in_hex or 0) * 16 + digit % 16
+                    if state.in_hex <= 2147483647 then
+                        return "self"
+                    end
+                end
+                tseuhex["}"] = function(state, token)
+                    local num = state.in_hex
+                    state.in_hex = nil
+                    if num < 128 then
+                        collect_fallback(state, string.char(num))
+                        return "string"
+                    end
+                    local bytes = ""
+                    while num > 63 do
+                        local v = num % 64
+                        bytes = string.char(128 + v) .. bytes -- yeah ik, not the most efficient
+                        num = (num - v) / 64
+                    end
+                    if num >= 2^6/(2^#bytes) then
+                        local v = num % 64
+                        bytes = string.char(128 + v) .. bytes
+                        num = (num - v) / 64
+                    end
+                    do
+                        local v = 0
+                        for i=1,#bytes do
+                            v = v + 128 / 2^i
+                        end
+                        v = v + num
+                        assert(v < 126)
+                        bytes = string.char(128 + v) .. bytes
+                    end
+                    collect_fallback(state, bytes)
+                    return "string"
+                end
+            end
+            tsescapes.unicode = tseunicode
+        end
+
         do local tseskipwhitespace = selfify({
-            string = tokens.string,
+            string = defs.string,
             whitespace = "self",
             [""] = "string",
             [1] = collect_fallback,
             [2] = linecount,
         })
-        local tbase = tokens.base
+        local tbase = defs.base
         local tbasemap = {whitespace = "whitespace", newline = "whitespace"}
         setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end})
         tsescapes.skipwhitespace = tseskipwhitespace
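
Note: the "}" handler emits extended UTF-8 (up to 6 bytes, code points through 0x7FFFFFFF, hence the 2147483647 guard above): it peels 6-bit continuation bytes off the low end, adds one more if the remainder does not fit in the lead byte, then builds the lead byte's length header. The same scheme as a standalone function, a sketch for illustration rather than the module's API:

    local function utf8_encode(num)
        if num < 128 then return string.char(num) end
        local bytes = ""
        repeat
            bytes = string.char(128 + num % 64) .. bytes  -- 10xxxxxx continuation byte
            num = math.floor(num / 64)
        until num < 2^(6 - #bytes)           -- remainder fits in the lead byte
        local header = 256 - 2^(7 - #bytes)  -- 0xC0, 0xE0, 0xF0, ... length header
        return string.char(header + num) .. bytes
    end

    assert(utf8_encode(0x41) == "A")
    assert(utf8_encode(0x7FF) == "\223\191")
    assert(utf8_encode(0x10000) == "\240\144\128\128")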
@@ -205,7 +268,7 @@ do local tstring = selfify({})
            state.in_string = nil
            state[#state+1] = table.concat(state[COLLECT])
            state[COLLECT] = nil
-           return "tokens"
+           return "defs"
        else
            collect_fallback(state, token)
            return "self"
@@ -213,13 +276,18 @@ do local tstring = selfify({})
    end
 end
 
-tokens["'"] = "string_open"
-tokens['"'] = "string_open"
-tokens[1] = linecount
+do local tlongstring = {}
+    -- TODO
+end
+
+defs["'"] = "string_open"
+defs['"'] = "string_open"
+defs["["] = "maybe_longstring"
+defs[1] = linecount
 
-setmetatable(tokens, {__index=whitespace})
+setmetatable(defs, {__index=whitespace})
 
-function tokens.string_open(state, token)
+function defs.string_open(state, token)
    if not state.in_string then
        state[#state+1] = TK_STRING
        state[COLLECT] = {}
@@ -230,6 +298,15 @@ function tokens.string_open(state, token)
 end
 
 return {
-    tokens = tokens,
-    TK_STRING = TK_STRING,
+    defs = defs,
+    tokens = {
+        TK_AND = TK_AND, TK_BREAK = TK_BREAK,
+        TK_DO = TK_DO, TK_ELSE = TK_ELSE, TK_ELSEIF = TK_ELSEIF, TK_END = TK_END, TK_FALSE = TK_FALSE, TK_FOR = TK_FOR, TK_FUNCTION = TK_FUNCTION,
+        TK_GOTO = TK_GOTO, TK_IF = TK_IF, TK_IN = TK_IN, TK_LOCAL = TK_LOCAL, TK_NIL = TK_NIL, TK_NOT = TK_NOT, TK_OR = TK_OR, TK_REPEAT = TK_REPEAT,
+        TK_RETURN = TK_RETURN, TK_THEN = TK_THEN, TK_TRUE = TK_TRUE, TK_UNTIL = TK_UNTIL, TK_WHILE = TK_WHILE,
+        TK_IDIV = TK_IDIV, TK_CONCAT = TK_CONCAT, TK_DOTS = TK_DOTS, TK_EQ = TK_EQ, TK_GE = TK_GE, TK_LE = TK_LE, TK_NE = TK_NE,
+        TK_SHL = TK_SHL, TK_SHR = TK_SHR,
+        TK_DBCOLON = TK_DBCOLON, TK_EOS = TK_EOS,
+        TK_FLT = TK_FLT, TK_INT = TK_INT, TK_NAME = TK_NAME, TK_STRING = TK_STRING
+    },
 }
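
Note: after this change the module's entry state machine is exported as defs and the token sentinels live under tokens, so callers look like the tests below:

    local parser = require "parser"
    local luatokens = require "luatokens"

    local state = parser.parse(luatokens.defs, [["hello"]])
    -- on success, state is a flat array: sentinel, value, sentinel, value, ...
    assert(state[1] == luatokens.tokens.TK_STRING)
    assert(state[2] == "hello")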
parser.lua

@@ -72,7 +72,7 @@ local function get_next_common(state, in_pos, token)
 end
 
 local function get_next_table(state, in_pos)
-    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end
+    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling
     in_pos = in_pos + 1
     local token = state[DATA][in_pos - state[OFFDATA]]
     if token == nil then
@@ -84,10 +84,10 @@ local function get_next_table(state, in_pos)
 end
 
 local function get_next_string(state, in_pos)
-    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end
+    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling
     in_pos = in_pos + 1
-    local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])] or ""
-    if token == "" then
+    local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])]
+    if token == nil then
        state[OFFDATA] = in_pos - 1
        state[DATA] = state[GEN]()
        return get_next_string(state, state[OFFDATA])
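
Note: the get_next_string change stops conflating two cases. Previously a byte with no optimize_lookups entry was mapped to "", the same value used as the refill signal; now only nil (string.byte returns no value past the end of the current chunk) triggers fetching the next chunk from state[GEN]. That frees "" to act as a real token, compare the [""] = "string" transition in luatokens.lua. The nil behaviour in isolation:

    -- string.byte yields nothing past the end of the string,
    -- so the table index expression evaluates to nil, not "".
    local lookups = {}
    assert(string.byte("ab", 3, 3) == nil)
    assert(lookups[string.byte("ab", 3, 3)] == nil)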
test.lua | 91 changed lines

@@ -56,57 +56,106 @@ do -- trim left spaces
        end
        return "self"
    end
-   for k,v in ipairs({"hello", " hello", "\t \v \n\r hallo", "I really like this parser thingy if it can be called that"}) do
+   for k,v in ipairs({"hello", " hello", "\t \v \n\r hello"}) do
        local state, err = parser.parse(defs, v)
+       local case = case()
        if not state then
-           print(case(), err)
+           print(case, err)
        else
-           print(case(), table.concat(state))
+           print(case, table.concat(state))
+           assert(table.concat(state) == "hello")
        end
    end
 end -- trim left spaces
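
Note: case is not defined in this diff; from the call pattern it is presumably a counter that hands out test-case numbers. Capturing it once per iteration (local case = case()) keeps every print of one case tagged with the same number, where the old print(case(), ...) advanced the counter on each call. A plausible definition, hypothetical:

    local case do
        local n = 0
        case = function() n = n + 1 return n end
    end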
 
 do -- lua tokens
    local luatokens = require "luatokens"
-   local tokens = luatokens.tokens
+   local tokens = luatokens.defs
    local state, err, etoken, estate = parser.parse(tokens, [["hello world"]])
    local case = case()
-   print(case, "---- IN TOKENS ----")
    if not state then
+       print(case, "---- IN TOKENS ----")
        print(case, err, etoken)
        for i,v in pairs(estate) do
            print(case, i, v)
        end
+       print(case, "---- OUT TOKENS ----")
    else
        for i,v in ipairs(state) do
            print(case, i, v)
        end
+       assert(state[1] == luatokens.tokens.TK_STRING)
+       assert(state[2] == "hello world")
    end
-   print(case, "---- OUT TOKENS ----")
 end -- lua tokens
 
 do -- more lua tokens
    local luatokens = require "luatokens"
-   local tokens = luatokens.tokens
+   local tokens = luatokens.defs
    local state, err, etoken, estate = parser.parse(tokens, [["\a\b\f\n\r\t\v\\\"\'\z \x41\65\
 "]])
    local case = case()
-   print(case, "---- IN TOKENS ----")
    if not state then
+       print(case, "---- IN TOKENS ----")
        print(case, err, etoken)
        for i,v in pairs(estate) do
            print(case, i, v)
        end
+       print(case, "---- OUT TOKENS ----")
    else
        for i,v in ipairs(state) do
            print(case, i, v)
+           if v == luatokens.TK_STRING then
+               in_string = true
+           elseif in_string then
+               print(case, v:gsub(".", function(v) return "\\"..string.byte(v) end))
+               in_string = false
+           end
        end
+       assert(state[1] == luatokens.tokens.TK_STRING)
+       assert(state[2] == "\7\8\12\10\13\9\11\92\34\39\65\65\10")
    end
 end -- lua tokens
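
Note: the expected bytes line up escape for escape: \a \b \f \n \r \t \v are 7 8 12 10 13 9 11, \\ is 92, \" is 34, \' is 39, \z swallows the following whitespace, \x41 and \65 are both "A" (65), and the trailing backslash-newline is a literal newline (10), written here as \n so the check stays on one line. Stock Lua agrees with the tokenizer:

    local s = "\a\b\f\n\r\t\v\\\"\'\z \x41\65\n"
    assert(s == "\7\8\12\10\13\9\11\92\34\39\65\65\10")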
+
+do -- even more lua tokens
+   local luatokens = require "luatokens"
+   local tokens = luatokens.defs
+   local state, err, etoken, estate = parser.parse(tokens, [["\u{000000000000000000000000000000000000000000000000000000000000041}"]])
+   local case = case()
+   if not state then
+       print(case, "---- IN TOKENS ----")
+       print(case, err, etoken)
+       for i,v in pairs(estate) do
+           print(case, i, v)
+       end
+       print(case, "---- OUT TOKENS ----")
+   else
+       assert(state[1] == luatokens.tokens.TK_STRING)
+       assert(state[2] == "A")
+   end
+end -- lua tokens
+
+do -- even more lua tokens
+   local luatokens = require "luatokens"
+   local tokens = luatokens.defs
+   local state, err, etoken, estate = parser.parse(tokens, [["\u{7F}""\u{80}""\u{7FF}""\u{800}""\u{FFFF}""\u{10000}""\u{1FFFFF}""\u{200000}""\u{3FFFFFF}""\u{4000000}""\u{7FFFFFFF}"]])
+   local case = case()
+   if not state then
+       print(case, "---- IN TOKENS ----")
+       print(case, err, etoken)
+       for i,v in pairs(estate) do
+           print(case, i, v)
+       end
+       print(case, "---- OUT TOKENS ----")
+   else
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\127")
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\194\128")
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\223\191")
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\224\160\128")
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\239\191\191")
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\240\144\128\128")
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\247\191\191\191")
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\248\136\128\128\128")
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\251\191\191\191\191")
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\252\132\128\128\128\128")
+       assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+       assert(table.remove(state, 1) == "\253\191\191\191\191\191")
+   end
+   print(case, "---- OUT TOKENS ----")
+end -- lua tokens
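
Note: these are exactly the boundary code points of the 1- to 6-byte ranges of the extended encoding: 0x7F, 0x80-0x7FF, 0x800-0xFFFF, 0x10000-0x1FFFFF, 0x200000-0x3FFFFFF, 0x4000000-0x7FFFFFFF. A Lua 5.3 interpreter produces the same bytes, since its own \u escape uses the same extended scheme:

    assert("\u{7F}"       == "\127")
    assert("\u{80}"       == "\194\128")
    assert("\u{7FF}"      == "\223\191")
    assert("\u{800}"      == "\224\160\128")
    assert("\u{10000}"    == "\240\144\128\128")
    assert("\u{7FFFFFFF}" == "\253\191\191\191\191\191")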