Attempted Lua tokenizer didn't work
Publishing anyway because someone might be able to learn from my failure
parent d03d77d28b
commit 5a4b41bd47

3 changed files with 304 additions and 11 deletions

luatokens.lua (new file, 225 lines)
@@ -0,0 +1,225 @@
-- Lua tokens

-- we need some stuff from here
local parser = require "parser"
local selfify = parser.selfify

-- "dummies"
local TK_STRING = {}

local tokens = {}

tokens.base = {
    [" "] = "whitespace",
    ["\n"] = "newline",
    ["\r"] = "newline",
    ["\v"] = "whitespace",
    ["\t"] = "whitespace",
    ["\f"] = "whitespace",
    ["0"] = "digit",
    ["1"] = "digit",
    ["2"] = "digit",
    ["3"] = "digit",
    ["4"] = "digit",
    ["5"] = "digit",
    ["6"] = "digit",
    ["7"] = "digit",
    ["8"] = "digit",
    ["9"] = "digit",
    ["a"] = "hexdigit",
    ["b"] = "hexdigit",
    ["c"] = "hexdigit",
    ["d"] = "hexdigit",
    ["e"] = "hexdigit",
    ["f"] = "hexdigit",
    ["A"] = "hexdigit",
    ["B"] = "hexdigit",
    ["C"] = "hexdigit",
    ["D"] = "hexdigit",
    ["E"] = "hexdigit",
    ["F"] = "hexdigit",
    ["g"] = "alpha",
    ["h"] = "alpha",
    ["i"] = "alpha",
    ["j"] = "alpha",
    ["k"] = "alpha",
    ["l"] = "alpha",
    ["m"] = "alpha",
    ["n"] = "alpha",
    ["o"] = "alpha",
    ["p"] = "alpha",
    ["q"] = "alpha",
    ["r"] = "alpha",
    ["s"] = "alpha",
    ["t"] = "alpha",
    ["u"] = "alpha",
    ["v"] = "alpha",
    ["w"] = "alpha",
    ["x"] = "alpha",
    ["y"] = "alpha",
    ["z"] = "alpha",
    ["G"] = "alpha",
    ["H"] = "alpha",
    ["I"] = "alpha",
    ["J"] = "alpha",
    ["K"] = "alpha",
    ["L"] = "alpha",
    ["M"] = "alpha",
    ["N"] = "alpha",
    ["O"] = "alpha",
    ["P"] = "alpha",
    ["Q"] = "alpha",
    ["R"] = "alpha",
    ["S"] = "alpha",
    ["T"] = "alpha",
    ["U"] = "alpha",
    ["V"] = "alpha",
    ["W"] = "alpha",
    ["X"] = "alpha",
    ["Y"] = "alpha",
    ["Z"] = "alpha",
}
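-- tokens.base maps single characters to a class name ("whitespace",
-- "newline", "digit", "hexdigit", "alpha"); several of the state tables
-- below chain to it via __index, so a character without an explicit rule
-- falls back to a rule named after its class.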

local function linecount(state, token, rule)
    if token == "\n" or token == "\r" then
        state.line = (state.line or 1) + 1
    end
end

do local tstring = selfify({})
    tokens.string = tstring
    tstring.tokens = tokens
    do local tsescapes = setmetatable({
            ["'"] = "insertraw",
            ['"'] = "insertraw",
            ['\\'] = "insertraw",
            ["a"] = "insertmap",
            ["b"] = "insertmap",
            ["f"] = "insertmap",
            ["n"] = "insertmap",
            ["r"] = "insertmap",
            ["t"] = "insertmap",
            ["v"] = "insertmap",
            ["z"] = "skipwhitespace",
            ["u"] = "unicode",
            ["x"] = "hex",
            ["\n"] = setmetatable({["\r"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
            ["\r"] = setmetatable({["\n"] = setmetatable({}, {__index=tstring})}, {__index=tstring}),
            [1] = linecount,
            [2] = print
        }, {__index = tokens.base})
        tokens.string.escapes = tsescapes
        tsescapes.string = tokens.string

        function tsescapes.insertraw(state, token)
            state[#state+1] = token
            return "string"
        end

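        -- numeric slots ([1], [2], ...) in a state table like tsescapes above
        -- are hooks: parser.lua runs each of them for every token consumed in
        -- that state. Here [1] counts lines and [2] = print looks like a
        -- leftover debug trace.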
        do
            local map = { ["a"] = "\a", ["b"] = "\b", ["f"] = "\f", ["n"] = "\n", ["r"] = "\r", ["t"] = "\t", ["v"] = "\v" }
            function tsescapes.insertmap(state, token)
                state[#state+1] = map[token]
                return "string"
            end
        end

        function tsescapes.digit(state, token)
            local digit = string.find("1234567890", token, 1, true)
            local num = state.in_digit
            if digit then
                num = (num or 0) * 10 + digit % 10
                state.c = (state.c or 0) + 1
                if state.c < 3 then
                    state.in_digit = num
                    return "digitc"
                end
            end
            if num > 255 then
                return nil
            end
            state[#state+1] = string.char(num)
            state.in_digit = nil
            state.c = nil
            return "string"
        end
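        -- decimal escapes (\d, \dd, \ddd): digit() above accumulates up to
        -- three digits in state.in_digit, looping through the "digitc" state
        -- below; values above 255 return nil, rejecting the input.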
        tsescapes.digitc = setmetatable(selfify({[""] = tsescapes.digit, digitc = "self", string = tstring}), {__index=tstring})

        tsescapes.hex = setmetatable(selfify({string = tokens.string}), {__index=tokens.base})
        function tsescapes.hex.hexdigit(state, token)
            local digit = string.find("123456789ABCDEF0123456789abcdef0", token, 1, true)
            assert(digit, "this should never be called for non-hex-digits")
            local num = state.in_hex
            if num then
                num = num * 16 + digit % 16
                state[#state+1] = string.char(num)
                state.in_hex = nil
                return "string"
            else
                state.in_hex = digit % 16
                return "self"
            end
        end
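        -- the lookup string above is laid out so that (position % 16) is the
        -- digit's value: "A" sits at index 10, "0" at 16 (16 % 16 == 0), and
        -- the lowercase run starts at 17, putting "a" at 26 (26 % 16 == 10).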

        do local tseskipwhitespace = selfify({
                string = tokens.string,
                whitespace = "self",
                [""] = "string",
                [1] = parser.insert_fallback,
                [2] = linecount,
            })
            local tbase = tokens.base
            local tbasemap = {whitespace = "whitespace", newline = "whitespace"}
            setmetatable(tseskipwhitespace, {__index = function(t, k) return tbasemap[tbase[k]] or tstring[k] end})
            tsescapes.skipwhitespace = tseskipwhitespace
        end
    end

    tstring['\\'] = "escapes"

    tstring['"'] = "close"
    tstring["'"] = "close"

    tstring['\n'] = false
    tstring['\r'] = false

    tstring[""] = "self"

    tstring[1] = parser.insert_fallback

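    -- "" is the default rule: any other character stays in the string state,
    -- and the [1] hook is meant to collect it into the state array. \n and
    -- \r are explicitly false because an unescaped line break rejects the
    -- string, as in standard Lua.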
    function tstring.close(state, token)
        if state.in_string == token then
            local i = state.string_start
            state.in_string = nil
            state.string_start = nil
            state[i+1] = table.concat(state, '', i+1)
            for j=i+2, #state do
                state[j]=nil
            end
            return "tokens"
        else
            state[#state+1] = token
            return "self"
        end
    end
end

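-- top level: either quote switches into string_open below; everything else
-- falls through to the metatable set next.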
tokens["'"] = "string_open"
tokens['"'] = "string_open"

setmetatable(tokens, {__index=whitespace})

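-- note: "whitespace" is a global that is never defined in this file, so the
-- __index above is effectively nil; tokens.base was presumably intended, and
-- even then tokens defines no "whitespace" rule, so bare whitespace between
-- tokens would still be rejected.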
function tokens.string_open(state, token)
    if not state.in_string then
        state[#state+1] = TK_STRING
        state.in_string = token
        state.string_start = #state
        return "string"
    end
    assert("this shouldn't happen")
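    -- note: assert("this shouldn't happen") never fails, since any non-nil,
    -- non-false first argument passes; assert(false, "this shouldn't happen")
    -- was presumably intended.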
end

return {
    tokens = tokens,
    TK_STRING = TK_STRING,
}

parser.lua (36 changed lines)
@@ -31,27 +31,34 @@ local type, tostring
 local function get_next_common(state, in_pos, token)
     -- note: must preserve "token" - do not call recursively with a different token
     local transition
-    if state[STATE] ~= nil then
-        transition = state[STATE][token]
-        if not transition then
-            transition = state[STATE][""]
+    if state[STATE] then
+        local st = state[STATE]
+        local rule = st[token]
+        transition = rule
+        if transition == nil then
+            transition = st[""]
         end
         local recheck = true
         while recheck do
             recheck = false
             local tytrans = type(transition)
             if tytrans == "string" then
-                transition = state[STATE][transition]
+                transition = st[transition]
                 recheck = true
             elseif tytrans == "function" then
                 transition = transition(state, token)
                 recheck = true
             end
         end
-        state[STATE] = transition -- may be nil
+        for i, hook in ipairs(st) do
+            if hook then -- allow overriding/disabling hooks
+                hook(state, token, rule)
+            end
+        end
+        state[STATE] = transition -- may be nil or false
     end
-    -- must NOT use elseif here - the above may set state to nil!
-    if state[STATE] == nil then
+    -- must NOT use elseif here - the above may set state to nil or false!
+    if not state[STATE] then
         -- unexpected token. stream consumer may attempt to recover,
         -- but we do this mostly to differentiate it from "end of stream" condition.
         return in_pos - 1, nil, "unexpected token", token, state
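The loop above means a rule can take four shapes: a table is the next state; a string names another rule in the same state (an alias, looked up again); a function is called with (state, token) and its return value is resolved again; nil or false rejects the token. Numeric slots ([1], [2], ...) are hooks run on every consumed token. A minimal sketch exercising all of these, using only parse and selfify as exported below (the rule name "letter" and the field "count" are hypothetical, and this assumes parse feeds characters one at a time, as the tests below do):

    local parser = require "parser"

    local defs = parser.selfify({})          -- defs.self = defs, as in the tests
    defs["a"] = "letter"                     -- string rule: an alias, resolved again
    function defs.letter(state, token)       -- function rule: computed transition
        state[#state+1] = token
        return "self"                        -- the return value is resolved again
    end
    defs[""] = false                         -- default rule: reject other characters
    defs[1] = function(state, token, rule)   -- hook: runs for every token consumed
        state.count = (state.count or 0) + 1
    end

    local state, err = parser.parse(defs, "aaa")
    -- expected: state[1..3] == "a", "a", "a" and state.count == 3; any other
    -- character would instead take the "unexpected token" error path above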
@@ -112,8 +119,21 @@ local function parse(defs, data)
     end
 end
 
+-- utility function that's quite common
+local function selfify(t)
+    t.self = t
+    return t
+end
+-- common hook
+local function insert_fallback(state, token, rule)
+    if not rule then
+        state[#state+1] = token
+    end
+end
+
 return {
     STATE = STATE,
     stream = stream,
     parse = parse,
+    selfify = selfify,
 }
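One thing a reader can verify across the two files: luatokens.lua reads parser.insert_fallback, but the hunk above exports only selfify. Every [1] = parser.insert_fallback slot therefore holds nil, ipairs finds no hooks to run, and string contents are silently dropped. If that reading is right, the intended export list would have been:

    return {
        STATE = STATE,
        stream = stream,
        parse = parse,
        selfify = selfify,
        insert_fallback = insert_fallback, -- referenced by luatokens.lua, missing above
    }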

test.lua (54 changed lines)
@@ -18,6 +18,12 @@
 
 local parser = require "parser"
 
+local caseno = 0
+local function case()
+    caseno = caseno + 1
+    return caseno
+end
+
 do -- trim left spaces
     local defs = {}
     defs.self = defs
@@ -52,7 +58,49 @@ do -- trim left spaces
     end
     for k,v in ipairs({"hello", "    hello", "\t \v \n\r hallo", "I really like this parser thingy if it can be called that"}) do
         local state, err = parser.parse(defs, v)
-        if not state then print(err) end
-        print(table.concat(state))
+        if not state then
+            print(case(), err)
+        else
+            print(case(), table.concat(state))
+        end
     end
-end
+end -- trim left spaces
+
+do -- lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.tokens
+    local state, err, etoken, estate = parser.parse(tokens, [["hello world"]])
+    local case = case()
+    print(case, "---- IN  TOKENS ----")
+    if not state then
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+    else
+        for i,v in ipairs(state) do
+            print(case, i, v)
+        end
+    end
+    print(case, "---- OUT TOKENS ----")
+end -- lua tokens
+
+do -- more lua tokens
+    local luatokens = require "luatokens"
+    local tokens = luatokens.tokens
+    local state, err, etoken, estate = parser.parse(tokens, [["\a\b\f\n\r\t\v\\\"\'\z        \x41\65\
+"]])
+    local case = case()
+    print(case, "---- IN  TOKENS ----")
+    if not state then
+        print(case, err, etoken)
+        for i,v in pairs(estate) do
+            print(case, i, v)
+        end
+    else
+        for i,v in ipairs(state) do
+            print(case, i, v)
+        end
+    end
+    print(case, "---- OUT TOKENS ----")
+end -- lua tokens
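For the second tokens case: the [[ ]] long bracket hands the tokenizer the raw backslash sequences unexpanded. Correct Lua unescaping would produce a single TK_STRING whose value is BEL, BS, FF, LF, CR, TAB, VT, a backslash, a double quote and a single quote, then "AA" (\x41 and \65 both decode to "A", \z swallowing the run of spaces) and a trailing newline from the backslash-newline escape; comparing that against the actual OUT TOKENS output is the quickest way to see where the escape machinery goes wrong.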