Fix \u
parent 1f1f6c0732
commit 0118cdcb80

luatokens.lua | 117
--- a/luatokens.lua
+++ b/luatokens.lua
@@ -1,4 +1,4 @@
--- Lua tokens
+-- Lua defs
 
 -- we need some stuff from here
 local parser = require "parser"
@@ -8,11 +8,27 @@ local COLLECT = parser.COLLECT
 local collect_fallback = parser.collect_fallback
 
 -- "dummies"
-local TK_STRING = {}
+-- see http://www.lua.org/source/5.3/llex.h.html#RESERVED
+local TK_AND, TK_BREAK,
+TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
+TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
+TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
+TK_IDIV, TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE,
+TK_SHL, TK_SHR,
+TK_DBCOLON, TK_EOS,
+TK_FLT, TK_INT, TK_NAME, TK_STRING =
+{}, {},
+{}, {}, {}, {}, {}, {}, {},
+{}, {}, {}, {}, {}, {}, {}, {},
+{}, {}, {}, {}, {},
+{}, {}, {}, {}, {}, {}, {},
+{}, {},
+{}, {},
+{}, {}, {}, {}
 
-local tokens = {}
+local defs = {}
 
-tokens.base = {
+defs.base = {
 [" "] = "whitespace",
 ["\n"] = "newline",
 ["\r"] = "newline",
@@ -84,14 +100,15 @@ tokens.base = {
 }
 
 local function linecount(state, token, rule)
+-- TODO fix
 if token == "\n" or token == "\r" then
 state.line = (state.line or 1) + 1
 end
 end
 
 do local tstring = selfify({})
-tokens.string = tstring
-tstring.tokens = tokens
+defs.string = tstring
+tstring.defs = defs
 do local tsescapes = setmetatable({
 ["'"] = "insertraw",
 ['"'] = "insertraw",
@@ -110,9 +127,9 @@ do local tstring = selfify({})
 ["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
 [1] = linecount,
 [2] = function(state, token, rule) if token == "\r" or token == "\n" then collect_fallback(state, "\n") end end,
-}, {__index = tokens.base})
-tokens.string.escapes = tsescapes
-tsescapes.string = tokens.string
+}, {__index = defs.base})
+defs.string.escapes = tsescapes
+tsescapes.string = defs.string
 
 function tsescapes.insertraw(state, token)
 collect_fallback(state, token)
@@ -158,7 +175,7 @@ do local tstring = selfify({})
 end
 end
 
-tsescapes.hex = setmetatable(selfify({string = tokens.string, digit = "hexdigit"}), {__index=tokens.base})
+tsescapes.hex = setmetatable(selfify({string = defs.string, digit = "hexdigit"}), {__index=defs.base})
 function tsescapes.hex.hexdigit(state, token)
 local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
 assert(digit, "this should never be called for non-hex-digits")
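(Aside, not part of the commit: the lookup string in hexdigit maps a hex digit to its value through its position. string.find with plain matching returns the 1-based position of the first occurrence, and position % 16 is the digit's value, because "0" first appears at position 16. A quick check:)

    local map = "123456789ABCDEF0123456789abcdef0"
    for _, c in ipairs({"0", "9", "A", "f"}) do
        print(c, string.find(map, c, 1, true) % 16)   --> 0 0, 9 9, A 10, f 15
    end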
@@ -174,14 +191,60 @@ do local tstring = selfify({})
 end
 end
 
+do local tseunicode = {}
+tseunicode["{"] = "hex"
+do local tseuhex = setmetatable(selfify({digit = "hexdigit", string=tstring}), {__index=defs.base})
+tseunicode.hex = tseuhex
+function tseuhex.hexdigit(state, token)
+local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
+assert(digit, "this should never be called for non-hex-digits")
+state.in_hex = (state.in_hex or 0) * 16 + digit % 16
+if state.in_hex <= 2147483647 then
+return "self"
+end
+end
+tseuhex["}"] = function(state, token)
+local num = state.in_hex
+state.in_hex = nil
+if num < 128 then
+collect_fallback(state, string.char(num))
+return "string"
+end
+local bytes = ""
+while num > 63 do
+local v = num % 64
+bytes = string.char(128 + v) .. bytes -- yeah ik, not the most efficient
+num = (num - v) / 64
+end
+if num >= 2^6/(2^#bytes) then
+local v = num % 64
+bytes = string.char(128 + v) .. bytes
+num = (num - v) / 64
+end
+do
+local v = 0
+for i=1,#bytes do
+v = v + 128 / 2^i
+end
+v = v + num
+assert(v < 126)
+bytes = string.char(128 + v) .. bytes
+end
+collect_fallback(state, bytes)
+return "string"
+end
+end
+tsescapes.unicode = tseunicode
+end
+
 do local tseskipwhitespace = selfify({
-string = tokens.string,
+string = defs.string,
 whitespace = "self",
 [""] = "string",
 [1] = collect_fallback,
 [2] = linecount,
 })
-local tbase = tokens.base
+local tbase = defs.base
 local tbasemap = {whitespace = "whitespace", newline = "whitespace"}
 setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end})
 tsescapes.skipwhitespace = tseskipwhitespace
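(Aside: this added block appears to be what the commit title refers to. The "}" handler converts the accumulated code point in state.in_hex into UTF-8-style bytes by peeling off 6-bit continuation bytes and then deriving the lead byte from the 128/2^i marker bits. Worked example for \u{7FF}, with a cross-check that assumes Lua 5.3 so utf8.char is available:)

    -- \u{7FF}: num = 0x7FF = 2047
    --   continuation loop: v = 2047 % 64 = 63 -> byte 128+63 = 191, num becomes 31
    --   31 < 2^6/2^1 = 32, so no further continuation byte is emitted
    --   lead byte: v = 128/2 + 31 = 95 -> byte 128+95 = 223
    --   result is "\223\191", which test.lua below asserts for "\u{7FF}"
    print(utf8.char(0x7FF) == "\223\191")            --> true
    print(utf8.char(0x10000) == "\240\144\128\128")  --> true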
@@ -205,7 +268,7 @@ do local tstring = selfify({})
 state.in_string = nil
 state[#state+1] = table.concat(state[COLLECT])
 state[COLLECT] = nil
-return "tokens"
+return "defs"
 else
 collect_fallback(state, token)
 return "self"
@@ -213,13 +276,18 @@ do local tstring = selfify({})
 end
 end
 
-tokens["'"] = "string_open"
-tokens['"'] = "string_open"
-tokens[1] = linecount
+do local tlongstring = {}
+-- TODO
+end
 
-setmetatable(tokens, {__index=whitespace})
+defs["'"] = "string_open"
+defs['"'] = "string_open"
+defs["["] = "maybe_longstring"
+defs[1] = linecount
+
+setmetatable(defs, {__index=whitespace})
 
-function tokens.string_open(state, token)
+function defs.string_open(state, token)
 if not state.in_string then
 state[#state+1] = TK_STRING
 state[COLLECT] = {}
@@ -230,6 +298,15 @@ function tokens.string_open(state, token)
 end
 
 return {
-tokens = tokens,
-TK_STRING = TK_STRING,
+defs = defs,
+tokens = {
+TK_AND = TK_AND, TK_BREAK = TK_BREAK,
+TK_DO = TK_DO, TK_ELSE = TK_ELSE, TK_ELSEIF = TK_ELSEIF, TK_END = TK_END, TK_FALSE = TK_FALSE, TK_FOR = TK_FOR, TK_FUNCTION = TK_FUNCTION,
+TK_GOTO = TK_GOTO, TK_IF = TK_IF, TK_IN = TK_IN, TK_LOCAL = TK_LOCAL, TK_NIL = TK_NIL, TK_NOT = TK_NOT, TK_OR = TK_OR, TK_REPEAT = TK_REPEAT,
+TK_RETURN = TK_RETURN, TK_THEN = TK_THEN, TK_TRUE = TK_TRUE, TK_UNTIL = TK_UNTIL, TK_WHILE = TK_WHILE,
+TK_IDIV = TK_IDIV, TK_CONCAT = TK_CONCAT, TK_DOTS = TK_DOTS, TK_EQ = TK_EQ, TK_GE = TK_GE, TK_LE = TK_LE, TK_NE = TK_NE,
+TK_SHL = TK_SHL, TK_SHR = TK_SHR,
+TK_DBCOLON = TK_DBCOLON, TK_EOS = TK_EOS,
+TK_FLT = TK_FLT, TK_INT = TK_INT, TK_NAME = TK_NAME, TK_STRING = TK_STRING
+},
 }
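(Aside: a minimal usage sketch of the new module shape, mirroring what test.lua does below. The transition tables are now exported as luatokens.defs and the token-type sentinels live under luatokens.tokens; the input string "hi" here is just an example:)

    local parser    = require "parser"
    local luatokens = require "luatokens"

    local state = assert(parser.parse(luatokens.defs, [["hi"]]))
    assert(state[1] == luatokens.tokens.TK_STRING)  -- token type, a sentinel table
    assert(state[2] == "hi")                        -- the collected string value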
@@ -72,7 +72,7 @@ local function get_next_common(state, in_pos, token)
 end
 
 local function get_next_table(state, in_pos)
-if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end
+if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling
 in_pos = in_pos + 1
 local token = state[DATA][in_pos - state[OFFDATA]]
 if token == nil then
@@ -84,10 +84,10 @@ local function get_next_table(state, in_pos)
 end
 
 local function get_next_string(state, in_pos)
-if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end
+if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling
 in_pos = in_pos + 1
-local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])] or ""
-if token == "" then
+local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])]
+if token == nil then
 state[OFFDATA] = in_pos - 1
 state[DATA] = state[GEN]()
 return get_next_string(state, state[OFFDATA])
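(Aside: the point of dropping the `or ""` fallback is that the byte-to-token lookup can now signal "ran past the end of the current chunk" with nil, which can never collide with a real token. Minimal sketch, assuming optimize_lookups maps byte values to one-character strings, which is not shown in this diff:)

    local optimize_lookups = {}
    for i = 0, 255 do optimize_lookups[i] = string.char(i) end  -- assumed shape

    local chunk, pos = "ab", 3   -- pos is one past the end of the chunk
    local token = optimize_lookups[string.byte(chunk, pos, pos)]
    assert(token == nil)  -- the old code turned this into "", an ambiguous sentinel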
test.lua | 89
--- a/test.lua
+++ b/test.lua
@@ -56,57 +56,106 @@ do -- trim left spaces
 end
 return "self"
 end
-for k,v in ipairs({"hello", " hello", "\t \v \n\r hallo", "I really like this parser thingy if it can be called that"}) do
+for k,v in ipairs({"hello", " hello", "\t \v \n\r hello"}) do
 local state, err = parser.parse(defs, v)
+local case = case()
 if not state then
-print(case(), err)
+print(case, err)
 else
-print(case(), table.concat(state))
+assert(table.concat(state) == "hello")
 end
 end
 end -- trim left spaces
 
 do -- lua tokens
 local luatokens = require "luatokens"
-local tokens = luatokens.tokens
+local tokens = luatokens.defs
 local state, err, etoken, estate = parser.parse(tokens, [["hello world"]])
 local case = case()
-print(case, "---- IN TOKENS ----")
 if not state then
+print(case, "---- IN TOKENS ----")
 print(case, err, etoken)
 for i,v in pairs(estate) do
 print(case, i, v)
 end
-else
-for i,v in ipairs(state) do
-print(case, i, v)
-end
-end
 print(case, "---- OUT TOKENS ----")
+else
+assert(state[1] == luatokens.tokens.TK_STRING)
+assert(state[2] == "hello world")
+end
 end -- lua tokens
 
 do -- more lua tokens
 local luatokens = require "luatokens"
-local tokens = luatokens.tokens
+local tokens = luatokens.defs
 local state, err, etoken, estate = parser.parse(tokens, [["\a\b\f\n\r\t\v\\\"\'\z \x41\65\
 "]])
 local case = case()
-print(case, "---- IN TOKENS ----")
 if not state then
+print(case, "---- IN TOKENS ----")
 print(case, err, etoken)
 for i,v in pairs(estate) do
 print(case, i, v)
 end
+print(case, "---- OUT TOKENS ----")
 else
-for i,v in ipairs(state) do
-print(case, i, v)
-if v == luatokens.TK_STRING then
-in_string = true
-elseif in_string then
-print(case, v:gsub(".", function(v) return "\\"..string.byte(v) end))
-in_string = false
-end
-end
-end
-print(case, "---- OUT TOKENS ----")
+assert(state[1] == luatokens.tokens.TK_STRING)
+assert(state[2] == "\7\8\12\10\13\9\11\92\34\39\65\65\10")
+end
 end -- lua tokens
+
+do -- even more lua tokens
+local luatokens = require "luatokens"
+local tokens = luatokens.defs
+local state, err, etoken, estate = parser.parse(tokens, [["\u{000000000000000000000000000000000000000000000000000000000000041}"]])
+local case = case()
+if not state then
+print(case, "---- IN TOKENS ----")
+print(case, err, etoken)
+for i,v in pairs(estate) do
+print(case, i, v)
+end
+print(case, "---- OUT TOKENS ----")
+else
+assert(state[1] == luatokens.tokens.TK_STRING)
+assert(state[2] == "A")
+end
+end -- lua tokens
+
+do -- even more lua tokens
+local luatokens = require "luatokens"
+local tokens = luatokens.defs
+local state, err, etoken, estate = parser.parse(tokens, [["\u{7F}""\u{80}""\u{7FF}""\u{800}""\u{FFFF}""\u{10000}""\u{1FFFFF}""\u{200000}""\u{3FFFFFF}""\u{4000000}""\u{7FFFFFFF}"]])
+local case = case()
+if not state then
+print(case, "---- IN TOKENS ----")
+print(case, err, etoken)
+for i,v in pairs(estate) do
+print(case, i, v)
+end
+print(case, "---- OUT TOKENS ----")
+else
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\127")
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\194\128")
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\223\191")
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\224\160\128")
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\239\191\191")
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\240\144\128\128")
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\247\191\191\191")
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\248\136\128\128\128")
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\251\191\191\191\191")
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\252\132\128\128\128\128")
+assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+assert(table.remove(state, 1) == "\253\191\191\191\191\191")
+end
+end -- lua tokens
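(Aside: the expected value in the "more lua tokens" case follows directly from Lua's escape rules. \a=7, \b=8, \f=12, \n=10, \r=13, \t=9, \v=11, \\=92, \"=34, \'=39, \z skips the following whitespace, \x41 and \65 are both 65 ("A"), and a backslash before a literal newline keeps the newline (10), hence:)

    assert("\7\8\12\10\13\9\11\92\34\39\65\65\10"
        == string.char(7, 8, 12, 10, 13, 9, 11, 92, 34, 39, 65, 65, 10))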