Fix \u

parent 1f1f6c0732
commit 0118cdcb80

3 changed files with 171 additions and 45 deletions

luatokens.lua | 117

@@ -1,4 +1,4 @@
--- Lua tokens
+-- Lua defs
 
 -- we need some stuff from here
 local parser = require "parser"

@@ -8,11 +8,27 @@ local COLLECT = parser.COLLECT
 local collect_fallback = parser.collect_fallback
 
 -- "dummies"
-local TK_STRING = {}
+-- see http://www.lua.org/source/5.3/llex.h.html#RESERVED
+local TK_AND, TK_BREAK,
+    TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
+    TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
+    TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
+    TK_IDIV, TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE,
+    TK_SHL, TK_SHR,
+    TK_DBCOLON, TK_EOS,
+    TK_FLT, TK_INT, TK_NAME, TK_STRING =
+    {}, {},
+    {}, {}, {}, {}, {}, {}, {},
+    {}, {}, {}, {}, {}, {}, {}, {},
+    {}, {}, {}, {}, {},
+    {}, {}, {}, {}, {}, {}, {},
+    {}, {},
+    {}, {},
+    {}, {}, {}, {}
 
-local tokens = {}
+local defs = {}
 
-tokens.base = {
+defs.base = {
     [" "] = "whitespace",
     ["\n"] = "newline",
     ["\r"] = "newline",

@@ -84,14 +100,15 @@ tokens.base = {
 }
 
 local function linecount(state, token, rule)
+    -- TODO fix
     if token == "\n" or token == "\r" then
         state.line = (state.line or 1) + 1
     end
 end
 
 do local tstring = selfify({})
-    tokens.string = tstring
-    tstring.tokens = tokens
+    defs.string = tstring
+    tstring.defs = defs
     do local tsescapes = setmetatable({
             ["'"] = "insertraw",
             ['"'] = "insertraw",

@@ -110,9 +127,9 @@ do local tstring = selfify({})
             ["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
             [1] = linecount,
             [2] = function(state, token, rule) if token == "\r" or token == "\n" then collect_fallback(state, "\n") end end,
-        }, {__index = tokens.base})
-        tokens.string.escapes = tsescapes
-        tsescapes.string = tokens.string
+        }, {__index = defs.base})
+        defs.string.escapes = tsescapes
+        tsescapes.string = defs.string
 
         function tsescapes.insertraw(state, token)
             collect_fallback(state, token)

@@ -158,7 +175,7 @@ do local tstring = selfify({})
             end
         end
 
-        tsescapes.hex = setmetatable(selfify({string = tokens.string, digit = "hexdigit"}), {__index=tokens.base})
+        tsescapes.hex = setmetatable(selfify({string = defs.string, digit = "hexdigit"}), {__index=defs.base})
         function tsescapes.hex.hexdigit(state, token)
             local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
             assert(digit, "this should never be called for non-hex-digits")

@@ -174,14 +191,60 @@ do local tstring = selfify({})
             end
         end
 
+        do local tseunicode = {}
+            tseunicode["{"] = "hex"
+            do local tseuhex = setmetatable(selfify({digit = "hexdigit", string=tstring}), {__index=defs.base})
+                tseunicode.hex = tseuhex
+                function tseuhex.hexdigit(state, token)
+                    local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
+                    assert(digit, "this should never be called for non-hex-digits")
+                    state.in_hex = (state.in_hex or 0) * 16 + digit % 16
+                    if state.in_hex <= 2147483647 then
+                        return "self"
+                    end
+                end
+                tseuhex["}"] = function(state, token)
+                    local num = state.in_hex
+                    state.in_hex = nil
+                    if num < 128 then
+                        collect_fallback(state, string.char(num))
+                        return "string"
+                    end
+                    local bytes = ""
+                    while num > 63 do
+                        local v = num % 64
+                        bytes = string.char(128 + v) .. bytes -- yeah ik, not the most efficient
+                        num = (num - v) / 64
+                    end
+                    if num >= 2^6/(2^#bytes) then
+                        local v = num % 64
+                        bytes = string.char(128 + v) .. bytes
+                        num = (num - v) / 64
+                    end
+                    do
+                        local v = 0
+                        for i=1,#bytes do
+                            v = v + 128 / 2^i
+                        end
+                        v = v + num
+                        assert(v < 126)
+                        bytes = string.char(128 + v) .. bytes
+                    end
+                    collect_fallback(state, bytes)
+                    return "string"
+                end
+            end
+            tsescapes.unicode = tseunicode
+        end
+
         do local tseskipwhitespace = selfify({
-                string = tokens.string,
+                string = defs.string,
                 whitespace = "self",
                 [""] = "string",
                 [1] = collect_fallback,
                 [2] = linecount,
             })
-            local tbase = tokens.base
+            local tbase = defs.base
             local tbasemap = {whitespace = "whitespace", newline = "whitespace"}
             setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end})
             tsescapes.skipwhitespace =  tseskipwhitespace
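
The `\u{XXXX}` handler added above encodes the accumulated codepoint as original-style (pre-RFC 3629) UTF-8: up to six bytes, codepoints through 0x7FFFFFFF, the same range Lua 5.3's own \u escape accepts (hence the 2147483647 guard). A minimal standalone sketch of the same encoding loop, free of the state-machine plumbing; `utf8_encode` is a hypothetical name, and the spot checks reuse expected byte strings from test.lua below:

    local function utf8_encode(num)
        if num < 128 then return string.char(num) end  -- plain ASCII
        local bytes = ""
        while num > 63 do                     -- peel off 6-bit continuation bytes
            local v = num % 64
            bytes = string.char(128 + v) .. bytes
            num = (num - v) / 64
        end
        if num >= 2^6/(2^#bytes) then         -- remainder too big for the lead byte?
            local v = num % 64
            bytes = string.char(128 + v) .. bytes
            num = (num - v) / 64
        end
        local v = 0                           -- build the lead byte's length marker
        for i = 1, #bytes do
            v = v + 128 / 2^i
        end
        return string.char(128 + v + num) .. bytes
    end

    assert(utf8_encode(0x41) == "A")
    assert(utf8_encode(0x800) == "\224\160\128")
    assert(utf8_encode(0x7FFFFFFF) == "\253\191\191\191\191\191")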

@@ -205,7 +268,7 @@ do local tstring = selfify({})
             state.in_string = nil
             state[#state+1] = table.concat(state[COLLECT])
             state[COLLECT] = nil
-            return "tokens"
+            return "defs"
         else
             collect_fallback(state, token)
             return "self"

@@ -213,13 +276,18 @@ do local tstring = selfify({})
     end
 end
 
-tokens["'"] = "string_open"
-tokens['"'] = "string_open"
-tokens[1] = linecount
+do local tlongstring = {}
+    -- TODO
+end
 
-setmetatable(tokens, {__index=whitespace})
+defs["'"] = "string_open"
+defs['"'] = "string_open"
+defs["["] = "maybe_longstring"
+defs[1] = linecount
+
+setmetatable(defs, {__index=whitespace})
 
-function tokens.string_open(state, token)
+function defs.string_open(state, token)
     if not state.in_string then
         state[#state+1] = TK_STRING
         state[COLLECT] = {}

@@ -230,6 +298,15 @@ function tokens.string_open(state, token)
 end
 
 return {
-    tokens = tokens,
-    TK_STRING = TK_STRING,
+    defs = defs,
+    tokens = {
+        TK_AND = TK_AND, TK_BREAK = TK_BREAK,
+        TK_DO = TK_DO, TK_ELSE = TK_ELSE, TK_ELSEIF = TK_ELSEIF, TK_END = TK_END, TK_FALSE = TK_FALSE, TK_FOR = TK_FOR, TK_FUNCTION = TK_FUNCTION,
+        TK_GOTO = TK_GOTO, TK_IF = TK_IF, TK_IN = TK_IN, TK_LOCAL = TK_LOCAL, TK_NIL = TK_NIL, TK_NOT = TK_NOT, TK_OR = TK_OR, TK_REPEAT = TK_REPEAT,
+        TK_RETURN = TK_RETURN, TK_THEN = TK_THEN, TK_TRUE = TK_TRUE, TK_UNTIL = TK_UNTIL, TK_WHILE = TK_WHILE,
+        TK_IDIV = TK_IDIV, TK_CONCAT = TK_CONCAT, TK_DOTS = TK_DOTS, TK_EQ = TK_EQ, TK_GE = TK_GE, TK_LE = TK_LE, TK_NE = TK_NE,
+        TK_SHL = TK_SHL, TK_SHR = TK_SHR,
+        TK_DBCOLON = TK_DBCOLON, TK_EOS = TK_EOS,
+        TK_FLT = TK_FLT, TK_INT = TK_INT, TK_NAME = TK_NAME, TK_STRING = TK_STRING
+    },
 }
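
The module's public shape changes here: the transition table is now exported as defs, and the TK_* sentinels move into a tokens subtable. A hypothetical consumer, mirroring the access pattern the updated test.lua uses below:

    local parser    = require "parser"    -- this repo's parser module
    local luatokens = require "luatokens"
    local state = parser.parse(luatokens.defs, [["hi"]])
    -- the token sentinels now live one level down:
    assert(state[1] == luatokens.tokens.TK_STRING)
    assert(state[2] == "hi")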

parser.lua | 8

@@ -72,7 +72,7 @@ local function get_next_common(state, in_pos, token)
 end
 
 local function get_next_table(state, in_pos)
-    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end
+    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling
     in_pos = in_pos + 1
     local token = state[DATA][in_pos - state[OFFDATA]]
     if token == nil then

@@ -84,10 +84,10 @@ local function get_next_table(state, in_pos)
 end
 
 local function get_next_string(state, in_pos)
-    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end
+    if state[DATA] == nil or #state[DATA] == 0 then return in_pos, state end -- TODO end-of-stream handling
     in_pos = in_pos + 1
-    local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])] or ""
-    if token == "" then
+    local token = optimize_lookups[string.byte(state[DATA], in_pos - state[OFFDATA], in_pos - state[OFFDATA])]
+    if token == nil then
         state[OFFDATA] = in_pos - 1
         state[DATA] = state[GEN]()
         return get_next_string(state, state[OFFDATA])
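
Besides the TODO notes, the end-of-chunk sentinel changes from the empty string to nil: string.byte returns no value once in_pos runs past the current chunk, and nil cannot collide with a real transition key, whereas "" is itself a meaningful key in these tables (tseskipwhitespace above maps [""] = "string"). A tiny illustration with a hypothetical table:

    local transitions = { [""] = "string" }  -- "" is a legitimate transition key
    print(transitions[""])                   --> string
    print(transitions[nil])                  --> nil; reading a nil key is allowed
                                             --  in Lua, so nil is unambiguous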

test.lua | 91

@@ -56,57 +56,106 @@ do -- trim left spaces
         end
         return "self"
     end
-    for k,v in ipairs({"hello", "    hello", "\t \v \n\r hallo", "I really like this parser thingy if it can be called that"}) do
+    for k,v in ipairs({"hello", "    hello", "\t \v \n\r hello"}) do
         local state, err = parser.parse(defs, v)
+        local case = case()
         if not state then
-            print(case(), err)
+            print(case, err)
         else
-            print(case(), table.concat(state))
+            assert(table.concat(state) == "hello")
         end
     end
 end -- trim left spaces
 
 do -- lua tokens
     local luatokens = require "luatokens"
-    local tokens = luatokens.tokens
+    local tokens = luatokens.defs
     local state, err, etoken, estate = parser.parse(tokens, [["hello world"]])
     local case = case()
+    print(case, "---- IN  TOKENS ----")
     if not state then
-        print(case, "---- IN  TOKENS ----")
         print(case, err, etoken)
         for i,v in pairs(estate) do
             print(case, i, v)
         end
-        print(case, "---- OUT TOKENS ----")
     else
         for i,v in ipairs(state) do
             print(case, i, v)
         end
+        assert(state[1] == luatokens.tokens.TK_STRING)
+        assert(state[2] == "hello world")
     end
+    print(case, "---- OUT TOKENS ----")
 end -- lua tokens
 
 do -- more lua tokens
     local luatokens = require "luatokens"
-    local tokens = luatokens.tokens
+    local tokens = luatokens.defs
     local state, err, etoken, estate = parser.parse(tokens, [["\a\b\f\n\r\t\v\\\"\'\z        \x41\65\
 "]])
     local case = case()
+    print(case, "---- IN  TOKENS ----")
     if not state then
-        print(case, "---- IN  TOKENS ----")
         print(case, err, etoken)
         for i,v in pairs(estate) do
             print(case, i, v)
         end
-        print(case, "---- OUT TOKENS ----")
     else
         for i,v in ipairs(state) do
             print(case, i, v)
+            if v == luatokens.TK_STRING then
+                in_string = true
+            elseif in_string then
+                print(case, v:gsub(".", function(v) return "\\"..string.byte(v) end))
+                in_string = false
+            end
         end
+        assert(state[1] == luatokens.tokens.TK_STRING)
+        assert(state[2] == "\7\8\12\10\13\9\11\92\34\39\65\65\10")
     end
+    print(case, "---- OUT TOKENS ----")
 end -- lua tokens
+
+do -- even more lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.defs
+    local state, err, etoken, estate = parser.parse(tokens, [["\u{000000000000000000000000000000000000000000000000000000000000041}"]])
+    local case = case()
+    if not state then
+        print(case, "---- IN  TOKENS ----")
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+        print(case, "---- OUT TOKENS ----")
+    else
+        assert(state[1] == luatokens.tokens.TK_STRING)
+        assert(state[2] == "A")
+    end
+end -- lua tokens
+
+do -- even more lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.defs
+    local state, err, etoken, estate = parser.parse(tokens, [["\u{7F}""\u{80}""\u{7FF}""\u{800}""\u{FFFF}""\u{10000}""\u{1FFFFF}""\u{200000}""\u{3FFFFFF}""\u{4000000}""\u{7FFFFFFF}"]])
+    local case = case()
+    if not state then
+        print(case, "---- IN  TOKENS ----")
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+        print(case, "---- OUT TOKENS ----")
+    else
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\127")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\194\128")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\223\191")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\224\160\128")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\239\191\191")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\240\144\128\128")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\247\191\191\191")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\248\136\128\128\128")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\251\191\191\191\191")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\252\132\128\128\128\128")
+        assert(table.remove(state, 1) == luatokens.tokens.TK_STRING)
+        assert(table.remove(state, 1) == "\253\191\191\191\191\191")
+    end
+    print(case, "---- OUT TOKENS ----")
+end -- lua tokens
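
These cases probe both sides of every sequence-length boundary of original UTF-8: 0x7F/0x80 (one byte to two), 0x7FF/0x800 (two to three), 0xFFFF/0x10000 (three to four), 0x1FFFFF/0x200000 (four to five), and 0x3FFFFFF/0x4000000 (five to six). The expected byte strings can be cross-checked against the reference lexer, since Lua 5.3's own \u escape accepts codepoints up to 0x7FFFFFFF:

    -- run under Lua 5.3; each literal is encoded by the reference lexer itself
    assert("\u{7F}"       == "\127")
    assert("\u{80}"       == "\194\128")
    assert("\u{7FF}"      == "\223\191")
    assert("\u{800}"      == "\224\160\128")
    assert("\u{10000}"    == "\240\144\128\128")
    assert("\u{7FFFFFFF}" == "\253\191\191\191\191\191")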