From bcd3d1ca29053dc44def83cae31b794028c2bbbc Mon Sep 17 00:00:00 2001 From: David Guillen Fandos Date: Sat, 11 Dec 2021 11:30:03 +0100 Subject: [PATCH] [aarch64] Adding new aarch64 dynarec! This is based on the MIPS dynarec (more or less) with some ARM borrowings. Seems to be quite fast (under my testing fixed results: faster than ARM on A1 but not a lot faster than the interpreter on Android Snapdragon 845) but still some optimizations are missing at the moment. Seems to pass my testing suite and compatibility wise is very similar to arm. --- Makefile | 14 + Makefile.common | 19 +- arm/arm64_codegen.h | 297 +++++++ arm/arm64_emit.h | 1879 +++++++++++++++++++++++++++++++++++++++++++ arm/arm64_stub.S | 705 ++++++++++++++++ cpu_threaded.c | 4 +- jni/Android.mk | 4 + tests/Makefile | 12 + tests/arm64gen.S | 208 +++++ tests/arm64gen.c | 223 +++++ 10 files changed, 3354 insertions(+), 11 deletions(-) create mode 100644 arm/arm64_codegen.h create mode 100644 arm/arm64_emit.h create mode 100644 arm/arm64_stub.S create mode 100644 tests/Makefile create mode 100644 tests/arm64gen.S create mode 100644 tests/arm64gen.c diff --git a/Makefile b/Makefile index a0ed865..6b183b8 100644 --- a/Makefile +++ b/Makefile @@ -349,6 +349,17 @@ else ifeq ($(platform), wii) CFLAGS += -DGEKKO -DHW_RVL -mrvl -mcpu=750 -meabi -mhard-float -DMSB_FIRST -D__ppc__ STATIC_LINKING = 1 +# aarch64 (armv8) +else ifeq ($(platform), arm64) + TARGET := $(TARGET_NAME)_libretro.so + SHARED := -shared -Wl,--version-script=link.T + fpic := -fPIC + CFLAGS += -fomit-frame-pointer -ffast-math + LDFLAGS += -Wl,--no-undefined + HAVE_DYNAREC := 1 + MMAP_JIT_CACHE = 1 + CPU_ARCH := arm64 + # ARM else ifneq (,$(findstring armv,$(platform))) TARGET := $(TARGET_NAME)_libretro.so @@ -488,6 +499,7 @@ CFLAGS += -DMMAP_JIT_CACHE endif # Add -DTRACE_INSTRUCTIONS to trace instruction execution +# Can add -DTRACE_REGISTERS to additionally print register values ifeq ($(DEBUG), 1) OPTIMIZE := -O0 -g else @@ -502,6 +514,8 @@ endif ifeq ($(CPU_ARCH), arm) DEFINES += -DARM_ARCH +else ifeq ($(CPU_ARCH), arm64) + DEFINES += -DARM64_ARCH else ifeq ($(CPU_ARCH), mips) DEFINES += -DMIPS_ARCH else ifeq ($(CPU_ARCH), x86_32) diff --git a/Makefile.common b/Makefile.common index bc4cdd3..d3e1493 100644 --- a/Makefile.common +++ b/Makefile.common @@ -31,16 +31,15 @@ SOURCES_C += $(CORE_DIR)/cpu_threaded.c endif ifeq ($(HAVE_DYNAREC), 1) - -ifeq ($(CPU_ARCH), x86_32) -SOURCES_ASM += $(CORE_DIR)/x86/x86_stub.S -endif -ifeq ($(CPU_ARCH), arm) -SOURCES_ASM += $(CORE_DIR)/arm/arm_stub.S -endif -ifeq ($(CPU_ARCH), mips) -SOURCES_ASM += $(CORE_DIR)/mips/mips_stub.S -endif + ifeq ($(CPU_ARCH), x86_32) + SOURCES_ASM += $(CORE_DIR)/x86/x86_stub.S + else ifeq ($(CPU_ARCH), arm) + SOURCES_ASM += $(CORE_DIR)/arm/arm_stub.S + else ifeq ($(CPU_ARCH), arm64) + SOURCES_ASM += $(CORE_DIR)/arm/arm64_stub.S + else ifeq ($(CPU_ARCH), mips) + SOURCES_ASM += $(CORE_DIR)/mips/mips_stub.S + endif endif ifeq ($(CPU_ARCH), arm) diff --git a/arm/arm64_codegen.h b/arm/arm64_codegen.h new file mode 100644 index 0000000..81a151b --- /dev/null +++ b/arm/arm64_codegen.h @@ -0,0 +1,297 @@ +/* gameplaySP + * + * Copyright (C) 2021 David Guillen Fandos + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +typedef enum +{ + aa64_opcode_logic = 0x0A, + aa64_opcode_addsub = 0x0B, + aa64_opcode_adr = 0x10, + aa64_opcode_addsubi = 0x11, + aa64_opcode_movi = 0x12, + aa64_opcode_bfm = 0x13, + aa64_opcode_b = 0x14, + aa64_opcode_b2 = 0x15, + aa64_opcode_tbz = 0x16, + aa64_opcode_tbnz = 0x17, + aa64_opcode_memi = 0x19, + aa64_opcode_misc = 0x1A, + aa64_opcode_mul4 = 0x1B, + +} aa64_opcode; + +typedef enum +{ + ccode_eq = 0x0, /* Equal Z == 1 */ + ccode_ne = 0x1, /* Not Equal Z == 0 */ + ccode_hs = 0x2, /* Carry Set C == 1 */ + ccode_lo = 0x3, /* Carry Clear C == 0 */ + ccode_mi = 0x4, /* Minus/Neg N == 1 */ + ccode_pl = 0x5, /* Plus/Pos N == 0 */ + ccode_vs = 0x6, /* Overflow V == 1 */ + ccode_vc = 0x7, /* !Overflow V == 0 */ + ccode_hi = 0x8, /* UGreatThan C && !Z */ + ccode_ls = 0x9, /* ULessEqual !C || Z */ + ccode_ge = 0xA, /* SGreatEqual N == V */ + ccode_lt = 0xB, /* SLessThan N != V */ + ccode_gt = 0xC, /* SLessThan !Z&N==V */ + ccode_le = 0xD, /* SLessEqual Z|(N!=V) */ + ccode_al = 0xE, /* Always */ + ccode_nv = 0xF, /* Never */ +} aa64_condcode; + + + +#define aa64_br_offset(label) \ + (((uintptr_t)(label) - (uintptr_t)(translation_ptr)) >> 2) \ + +#define aa64_br_offset_from(label, from) \ + (((uintptr_t)(label) - (uintptr_t)(from)) >> 2) \ + +#define aa64_emit_inst(opcode, ope, rd, rs, extra) \ +{ \ + *((u32 *)translation_ptr) = (aa64_opcode_##opcode << 24) | ((ope) << 29) | \ + ((rs) << 5) | (rd) | (extra); \ + translation_ptr += 4; \ +} + +#define aa64_emit_ldr(rv, rb, offset) \ + aa64_emit_inst(memi, 5, rv, rb, (1 << 22) | ((offset) << 10)) \ + +#define aa64_emit_str(rv, rb, offset) \ + aa64_emit_inst(memi, 5, rv, rb, (0 << 22) | ((offset) << 10)) \ + +#define aa64_emit_addshift(rd, rs, rm, st, sa) \ + aa64_emit_inst(addsub, 0, rd, rs, ((rm) << 16) | ((st)<<22) | ((sa)<<10)) \ + +#define aa64_emit_add_lsl(rd, rs, rm, sa) \ + aa64_emit_addshift(rd, rs, rm, 0, sa) \ + +#define aa64_emit_addi(rd, rs, imm) \ + aa64_emit_inst(addsubi, 0, rd, rs, (imm) << 10) \ + +#define aa64_emit_addi12(rd, rs, imm) \ + aa64_emit_inst(addsubi, 0, rd, rs, ((imm) << 10) | (1 << 22)) \ + +#define aa64_emit_addis(rd, rs, imm) \ + aa64_emit_inst(addsubi, 1, rd, rs, (imm) << 10) \ + +#define aa64_emit_subi(rd, rs, imm) \ + aa64_emit_inst(addsubi, 2, rd, rs, (imm) << 10) \ + +#define aa64_emit_subi12(rd, rs, imm) \ + aa64_emit_inst(addsubi, 2, rd, rs, ((imm) << 10) | (1 << 22)) \ + +#define aa64_emit_subis(rd, rs, imm) \ + aa64_emit_inst(addsubi, 3, rd, rs, (imm) << 10) \ + +/* rd = ra + rn * rm */ +#define aa64_emit_madd(rd, ra, rn, rm) \ + aa64_emit_inst(mul4, 0, rd, rn, ((ra) << 10) | ((rm) << 16)) \ + +/* rd = ra - rn * rm */ +#define aa64_emit_msub(rd, ra, rn, rm) \ + aa64_emit_inst(mul4, 0, rd, rn, ((ra) << 10) | ((rm) << 16) | 0x8000) \ + +#define aa64_emit_smaddl(rd, ra, rn, rm) \ + aa64_emit_inst(mul4, 4, rd, rn, ((ra) << 10) | ((rm) << 16) | 0x200000) \ + +#define aa64_emit_umaddl(rd, ra, rn, rm) \ + aa64_emit_inst(mul4, 4, rd, rn, ((ra) << 10) | ((rm) << 16) | 0xA00000) \ + +#define aa64_emit_mul(rd, rn, rm) \ + aa64_emit_madd(rd, 
31, rn, rm) \ + +// MovZ, clears the highest bits and sets the lower ones +#define aa64_emit_movlo(rd, imm) \ + aa64_emit_inst(movi, 2, rd, 0, (((imm) & 0xffff) << 5) | (4 << 21)) \ + +// MovZ, clears the lowest bits and sets the higher ones +#define aa64_emit_movhiz(rd, imm) \ + aa64_emit_inst(movi, 2, rd, 0, (((imm) & 0xffff) << 5) | (5 << 21)) \ + +// MovK, keeps the other (lower) bits +#define aa64_emit_movhi(rd, imm) \ + aa64_emit_inst(movi, 3, rd, 0, (((imm) & 0xffff) << 5) | (5 << 21)) \ + +// MovN, moves the inverted immediate (for negative numbers) +#define aa64_emit_movne(rd, imm) \ + aa64_emit_inst(movi, 0, rd, 0, (((imm) & 0xffff) << 5) | (4 << 21)) \ + +#define aa64_emit_branch(offset) \ + aa64_emit_inst(b, 0, 0, 0, (((u32)(offset))) & 0x3ffffff) \ + +#define aa64_emit_branch_patch(ptr, offset) \ + *(ptr) = (((*(ptr)) & 0xfc000000) | (((u32)(offset)) & 0x3ffffff)) \ + +#define aa64_emit_brcond(cond, offset) \ + aa64_emit_inst(b, 2, cond, 0, ((((u32)(offset))) & 0x7ffff) << 5) \ + +#define aa64_emit_brcond_patch(ptr, offset) \ + *(ptr) = (((*(ptr)) & 0xff00001f) | (((((u32)(offset))) & 0x7ffff) << 5)) \ + +#define aa64_emit_brlink(offset) \ + aa64_emit_inst(b, 4, 0, 0, (((u32)(offset))) & 0x3ffffff) \ + +#define aa64_emit_extr(rd, rs, rm, amount) \ + aa64_emit_inst(bfm, 0, rd, rs, (1 << 23) | ((amount) << 10) | ((rm) << 16)) \ + +#define aa64_emit_ror(rd, rs, amount) \ + aa64_emit_extr(rd, rs, rs, amount) \ + +#define aa64_emit_lsr(rd, rs, amount) \ + aa64_emit_inst(bfm, 2, rd, rs, (31 << 10) | ((amount) << 16)) \ + +#define aa64_emit_lsl(rd, rs, amount) \ + aa64_emit_inst(bfm, 2, rd, rs, ((31-(amount)) << 10) | (((32-(amount)) & 31) << 16)) + +#define aa64_emit_asr(rd, rs, amount) \ + aa64_emit_inst(bfm, 0, rd, rs, (31 << 10) | ((amount) << 16)) \ + +#define aa64_emit_lsr64(rd, rs, amount) \ + aa64_emit_inst(bfm, 6, rd, rs, (1 << 22) | (63 << 10) | ((amount) << 16)) \ + +#define aa64_emit_eori(rd, rs, immr, imms) \ + aa64_emit_inst(movi, 2, rd, rs, ((imms) << 10) | ((immr) << 16)) \ + +#define aa64_emit_orri(rd, rs, immr, imms) \ + aa64_emit_inst(movi, 1, rd, rs, ((imms) << 10) | ((immr) << 16)) \ + +#define aa64_emit_andi(rd, rs, immr, imms) \ + aa64_emit_inst(movi, 0, rd, rs, ((imms) << 10) | ((immr) << 16)) \ + +#define aa64_emit_andi64(rd, rs, immr, imms) \ + aa64_emit_inst(movi, 4, rd, rs, (1 << 22) | ((imms) << 10) | ((immr) << 16)) + +#define aa64_emit_mov(rd, rs) \ + aa64_emit_orr(rd, 31, rs) \ + +#define aa64_emit_orr(rd, rs, rm) \ + aa64_emit_inst(logic, 1, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_orn(rd, rs, rm) \ + aa64_emit_inst(logic, 1, rd, rs, ((rm) << 16) | (1 << 21)) \ + +#define aa64_emit_and(rd, rs, rm) \ + aa64_emit_inst(logic, 0, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_ands(rd, rs, rm) \ + aa64_emit_inst(logic, 3, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_tst(rs, rm) \ + aa64_emit_ands(31, rs, rm) \ + +#define aa64_emit_cmpi(rs, imm) \ + aa64_emit_subis(31, rs, imm) \ + +#define aa64_emit_xor(rd, rs, rm) \ + aa64_emit_inst(logic, 2, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_bic(rd, rs, rm) \ + aa64_emit_inst(logic, 0, rd, rs, ((rm) << 16) | (1 << 21)) \ + +#define aa64_emit_add(rd, rs, rm) \ + aa64_emit_inst(addsub, 0, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_sub(rd, rs, rm) \ + aa64_emit_inst(addsub, 2, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_adc(rd, rs, rm) \ + aa64_emit_inst(misc, 0, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_sbc(rd, rs, rm) \ + aa64_emit_inst(misc, 2, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_adds(rd, rs, 
rm) \ + aa64_emit_inst(addsub, 1, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_subs(rd, rs, rm) \ + aa64_emit_inst(addsub, 3, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_adcs(rd, rs, rm) \ + aa64_emit_inst(misc, 1, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_sbcs(rd, rs, rm) \ + aa64_emit_inst(misc, 3, rd, rs, ((rm) << 16)) \ + +#define aa64_emit_adr(rd, offset) \ + aa64_emit_inst(adr, (offset) & 3, rd, 0, ((offset) >> 2) & 0x7ffff) \ + +#define aa64_emit_tbz(rd, bitn, offset) \ + aa64_emit_inst(tbz, 1, rd, 0, ((((u32)(offset)) & 0x3fff) << 5) | ((bitn) << 19)) + +#define aa64_emit_tbnz(rd, bitn, offset) \ + aa64_emit_inst(tbnz, 1, rd, 0, ((((u32)(offset)) & 0x3fff) << 5) | ((bitn) << 19)) + +#define aa64_emit_cbz(rd, offset) \ + aa64_emit_inst(b, 1, rd, 0, ((((u32)offset) & 0x7ffff)) << 5) \ + +#define aa64_emit_cbnz(rd, offset) \ + aa64_emit_inst(b2, 1, rd, 0, ((((u32)offset) & 0x7ffff)) << 5) \ + +/* Misc Operations: Cond-select, Cond-Compare, ADC/SBC, CLZ/O, REV ... */ +#define aa64_emit_csel(rd, rtrue, rfalse, cond) \ + aa64_emit_inst(misc, 0, rd, rtrue, (1<<23)|((rfalse) << 16)|((cond) << 12)) \ + +#define aa64_emit_csinc(rd, rs, rm, cond) \ + aa64_emit_inst(misc, 0, rd, rs, 0x800400 | ((rm) << 16) | ((cond) << 12)) \ + +#define aa64_emit_csinv(rd, rs, rm, cond) \ + aa64_emit_inst(misc, 2, rd, rs, 0x800000 | ((rm) << 16) | ((cond) << 12)) \ + +#define aa64_emit_csneg(rd, rs, rm, cond) \ + aa64_emit_inst(misc, 2, rd, rs, 0x800400 | ((rm) << 16) | ((cond) << 12)) \ + +#define aa64_emit_ubfm(rd, rs, imms, immr) \ + aa64_emit_inst(bfm, 2, rd, rs, ((imms) << 10) | ((immr) << 16)) \ + +#define aa64_emit_ubfx(rd, rs, pos, size) \ + aa64_emit_ubfm(rd, rs, pos + size - 1, pos) \ + +#define aa64_emit_cset(rd, cond) \ + aa64_emit_csinc(rd, 31, 31, ((cond) ^ 1)) \ + +#define aa64_emit_csetm(rd, cond) \ + aa64_emit_csinv(rd, 31, 31, ((cond) ^ 1)) \ + +#define aa64_emit_ccmpi(rn, immv, flags, cond) \ + aa64_emit_inst(misc, 3, rn, flags, 0x400800 | ((immv)<<16) | ((cond)<<12)) \ + +#define aa64_emit_rorv(rd, rs, ra) \ + aa64_emit_inst(misc, 0, rd, rs, ((ra) << 16) | 0xC02C00) \ + +#define aa64_emit_lslv(rd, rs, ra) \ + aa64_emit_inst(misc, 0, rd, rs, ((ra) << 16) | 0xC02000) \ + +#define aa64_emit_lsrv(rd, rs, ra) \ + aa64_emit_inst(misc, 0, rd, rs, ((ra) << 16) | 0xC02400) \ + +#define aa64_emit_asrv(rd, rs, ra) \ + aa64_emit_inst(misc, 0, rd, rs, ((ra) << 16) | 0xC02800) \ + +#define aa64_emit_orr_shift64(rd, rs, rm, st, sa) \ + aa64_emit_inst(logic, 5, rd, rs, ((rm) << 16) | ((st)<<22) | ((sa)<<10)) \ + +#define aa64_emit_merge_regs(rd, rhi, rlo) \ + aa64_emit_orr_shift64(rd, rlo, rhi, 0, 32) \ + +#define aa64_emit_sdiv(rd, rs, rm) \ + aa64_emit_inst(misc, 0, rd, rs, ((rm) << 16) | 0xC00C00) \ + + + diff --git a/arm/arm64_emit.h b/arm/arm64_emit.h new file mode 100644 index 0000000..4f372c8 --- /dev/null +++ b/arm/arm64_emit.h @@ -0,0 +1,1879 @@ +/* gameplaySP + * + * Copyright (C) 2021 David Guillen Fandos + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef ARM64_EMIT_H +#define ARM64_EMIT_H + +#include "arm64_codegen.h" + +/* This is a fork of the MIPS dynarec, since A64 has 32 regs as well and + does not map great to the armv4 instruction set. Also flexible operand + is fairly limited and cannot map to armv4 well. + All flags are kept in registers and loaded/restored as needed. */ + +u32 a64_update_gba(u32 pc); + +// Although these are defined as a function, don't call them as +// such (jump to it instead) +void a64_indirect_branch_arm(u32 address); +void a64_indirect_branch_thumb(u32 address); +void a64_indirect_branch_dual(u32 address); + +u32 execute_read_cpsr(); +u32 execute_read_spsr(); +void execute_swi(u32 pc); +void a64_cheat_hook(void); + +u32 execute_spsr_restore(u32 address); +void execute_store_cpsr(u32 new_cpsr, u32 store_mask); +void execute_store_spsr(u32 new_spsr, u32 store_mask); + +void execute_aligned_store32(u32 addr, u32 data); +u32 execute_aligned_load32(u32 addr); + +typedef enum +{ + arm64_reg_x0, // arg0 + arm64_reg_x1, // arg1 + arm64_reg_x2, // arg2 + arm64_reg_x3, // temporary reg + arm64_reg_x4, // temporary reg + arm64_reg_x5, // temporary reg + arm64_reg_x6, // ARM reg 0 (temporary) + arm64_reg_x7, // ARM reg 1 (temporary) + arm64_reg_x8, // ARM reg 2 (temporary) + arm64_reg_x9, // ARM reg 3 (temporary) + arm64_reg_x10, // ARM reg 4 (temporary) + arm64_reg_x11, // ARM reg 5 (temporary) + arm64_reg_x12, // ARM reg 6 (temporary) + arm64_reg_x13, // ARM reg 7 (temporary) + arm64_reg_x14, // ARM reg 8 (temporary) + arm64_reg_x15, // ARM reg 9 (temporary) + arm64_reg_x16, // ARM reg 10 (temporary) + arm64_reg_x17, // ARM reg 11 (temporary) + arm64_reg_x18, + arm64_reg_x19, // save0 (mem-scratch) (saved) + arm64_reg_x20, // base pointer (saved) + arm64_reg_x21, // cycle counter (saved) + arm64_reg_x22, // C-flag (contains 0 or 1, carry bit) + arm64_reg_x23, // V-flag (contains 0 or 1, overflow bit) + arm64_reg_x24, // Z-flag (contains 0 or 1, zero bit) + arm64_reg_x25, // N-flag (contains 0 or 1, sign bit) + arm64_reg_x26, // ARM reg 12 (saved) + arm64_reg_x27, // ARM reg 13 (saved) + arm64_reg_x28, // ARM reg 14 (saved) + arm64_reg_x29, // ARM reg 15 (block start ~ PC) (saved) + arm64_reg_lr, + arm64_reg_sp, +} arm64_reg_number; + + +#define reg_save0 arm64_reg_x19 +#define reg_base arm64_reg_x20 +#define reg_cycles arm64_reg_x21 +#define reg_res arm64_reg_x0 +#define reg_a0 arm64_reg_x0 +#define reg_a1 arm64_reg_x1 +#define reg_a2 arm64_reg_x2 +#define reg_temp arm64_reg_x3 +#define reg_temp2 arm64_reg_x4 +#define reg_pc arm64_reg_x29 +#define reg_c_cache arm64_reg_x22 +#define reg_v_cache arm64_reg_x23 +#define reg_z_cache arm64_reg_x24 +#define reg_n_cache arm64_reg_x25 + +#define reg_r0 arm64_reg_x6 +#define reg_r1 arm64_reg_x7 +#define reg_r2 arm64_reg_x8 +#define reg_r3 arm64_reg_x9 +#define reg_r4 arm64_reg_x10 +#define reg_r5 arm64_reg_x11 +#define reg_r6 arm64_reg_x12 +#define reg_r7 arm64_reg_x13 +#define reg_r8 arm64_reg_x14 +#define reg_r9 arm64_reg_x15 +#define reg_r10 arm64_reg_x16 +#define reg_r11 arm64_reg_x17 +#define reg_r12 arm64_reg_x26 +#define reg_r13 arm64_reg_x27 +#define reg_r14 arm64_reg_x28 + +#define reg_zero arm64_reg_sp // Careful it's also SP + +// Writing to r15 goes straight to a0, to be chained with other ops + +u32 arm_to_a64_reg[] = +{ + reg_r0, + 
reg_r1, + reg_r2, + reg_r3, + reg_r4, + reg_r5, + reg_r6, + reg_r7, + reg_r8, + reg_r9, + reg_r10, + reg_r11, + reg_r12, + reg_r13, + reg_r14, + reg_a0, + reg_a1, + reg_a2 +}; + +#define arm_reg_a0 15 +#define arm_reg_a1 16 +#define arm_reg_a2 17 + +#define generate_save_reg(regnum) \ + aa64_emit_str(arm_to_a64_reg[regnum], reg_base, regnum) \ + +#define generate_restore_reg(regnum) \ + aa64_emit_ldr(arm_to_a64_reg[regnum], reg_base, regnum) \ + +#define emit_save_regs() \ +{ \ + unsigned i; \ + for (i = 0; i < 15; i++) { \ + generate_save_reg(i); \ + } \ +} + +#define emit_restore_regs() \ +{ \ + unsigned i; \ + for (i = 0; i < 15; i++) { \ + generate_restore_reg(i); \ + } \ +} + +#define generate_load_reg(ireg, reg_index) \ + aa64_emit_mov(ireg, arm_to_a64_reg[reg_index]) \ + +#define generate_load_imm(ireg, imm) \ + if ((s32)(imm) < 0 && (s32)(imm) >= -65536) { \ + /* immediate like 0xffffxxxx */ \ + aa64_emit_movne(ireg, (~(imm))); \ + } else if (((imm) & 0xffff) == 0) { \ + /* immediate like 0xxxxx0000 */ \ + aa64_emit_movhiz(ireg, (imm)); \ + } else { \ + aa64_emit_movlo(ireg, imm); \ + if ((imm) >= (1 << 16)) { \ + aa64_emit_movhi(ireg, ((imm) >> 16)); \ + } \ + } + +#define generate_load_pc_2inst(ireg, new_pc) \ +{ \ + aa64_emit_movlo(ireg, new_pc); \ + aa64_emit_movhi(ireg, ((new_pc) >> 16)); \ +} + +#define generate_load_pc(ireg, new_pc) \ +{ \ + s32 pc_delta = (new_pc) - (stored_pc); \ + if (pc_delta >= 0) { \ + if (pc_delta < 4096) { \ + aa64_emit_addi(ireg, reg_pc, pc_delta); \ + } else { \ + generate_load_imm(ireg, new_pc); \ + } \ + } else { \ + if (pc_delta >= -4096) { \ + aa64_emit_subi(ireg, reg_pc, -pc_delta); \ + } else { \ + generate_load_imm(ireg, new_pc); \ + } \ + } \ +} \ + +#define generate_store_reg(ireg, reg_index) \ + aa64_emit_mov(arm_to_a64_reg[reg_index], ireg) \ + +/* Logical immediates are weird in aarch64, load imm to register */ +#define generate_logical_imm(optype, ireg_dest, ireg_src, imm) \ + generate_load_imm(reg_temp, imm); \ + aa64_emit_##optype(ireg_dest, ireg_src, reg_temp); \ + +/* TODO Use addi12 if the immediate is <24 bits ? 
*/ +#define generate_alu_imm(imm_type, reg_type, ireg_dest, ireg_src, imm) \ + if((u32)(imm) < 4096) \ + { \ + aa64_emit_##imm_type(ireg_dest, ireg_src, imm); \ + } \ + else \ + { \ + generate_load_imm(reg_temp, imm); \ + aa64_emit_##reg_type(ireg_dest, ireg_src, reg_temp); \ + } \ + +#define generate_mov(ireg_dest, ireg_src) \ + aa64_emit_mov(arm_to_a64_reg[ireg_dest], arm_to_a64_reg[ireg_src]) \ + +#define generate_function_call(function_location) \ + aa64_emit_brlink(aa64_br_offset(function_location)); \ + +#define generate_cycle_update() \ + if(cycle_count != 0) \ + { \ + unsigned hicycle = cycle_count >> 12; \ + if (hicycle) { \ + aa64_emit_subi12(reg_cycles, reg_cycles, hicycle); \ + } \ + aa64_emit_subi(reg_cycles, reg_cycles, (cycle_count & 0xfff)); \ + cycle_count = 0; \ + } \ + +/* Patches ARM-mode conditional branches */ +#define generate_branch_patch_conditional(dest, label) \ + aa64_emit_brcond_patch(((u32*)dest), aa64_br_offset_from(label, dest)) + +#define emit_branch_filler(writeback_location) \ + (writeback_location) = translation_ptr; \ + aa64_emit_branch(0); \ + +#define generate_branch_patch_unconditional(dest, target) \ + aa64_emit_branch_patch((u32*)dest, aa64_br_offset_from(target, dest)) \ + +#define generate_branch_no_cycle_update(writeback_location, new_pc) \ + if(pc == idle_loop_target_pc) \ + { \ + generate_load_pc(reg_a0, new_pc); \ + generate_function_call(a64_update_gba); \ + emit_branch_filler(writeback_location); \ + } \ + else \ + { \ + aa64_emit_tbnz(reg_cycles, 31, 2); \ + emit_branch_filler(writeback_location); \ + generate_load_pc_2inst(reg_a0, new_pc); \ + generate_function_call(a64_update_gba); \ + aa64_emit_branch(-4); \ + } \ + +#define generate_branch_cycle_update(writeback_location, new_pc) \ + generate_cycle_update(); \ + generate_branch_no_cycle_update(writeback_location, new_pc) \ + +// a0 holds the destination + +#define generate_indirect_branch_cycle_update(type) \ + generate_cycle_update() \ + generate_indirect_branch_no_cycle_update(type) \ + +#define generate_indirect_branch_no_cycle_update(type) \ + aa64_emit_branch(aa64_br_offset(a64_indirect_branch_##type)); \ + +#define block_prologue_size 0 +#define generate_block_prologue() \ + generate_load_imm(reg_pc, stored_pc) \ + +#define check_generate_n_flag \ + (flag_status & 0x08) \ + +#define check_generate_z_flag \ + (flag_status & 0x04) \ + +#define check_generate_c_flag \ + (flag_status & 0x02) \ + +#define check_generate_v_flag \ + (flag_status & 0x01) \ + +#define generate_load_reg_pc(ireg, reg_index, pc_offset) \ + if(reg_index == REG_PC) \ + { \ + generate_load_pc(ireg, (pc + pc_offset)); \ + } \ + else \ + { \ + generate_load_reg(ireg, reg_index); \ + } \ + +#define check_load_reg_pc(arm_reg, reg_index, pc_offset) \ + if(reg_index == REG_PC) \ + { \ + reg_index = arm_reg; \ + generate_load_pc(arm_to_a64_reg[arm_reg], (pc + pc_offset)); \ + } \ + +#define check_store_reg_pc_no_flags(reg_index) \ + if(reg_index == REG_PC) \ + { \ + generate_indirect_branch_arm(); \ + } \ + +#define check_store_reg_pc_flags(reg_index) \ + if(reg_index == REG_PC) \ + { \ + generate_function_call(execute_spsr_restore); \ + generate_indirect_branch_dual(); \ + } \ + +#define generate_shift_imm_lsl_no_flags(arm_reg, _rm, _shift) \ + check_load_reg_pc(arm_reg, _rm, 8); \ + if(_shift != 0) \ + { \ + aa64_emit_lsl(arm_to_a64_reg[arm_reg], arm_to_a64_reg[_rm], _shift); \ + _rm = arm_reg; \ + } \ + +#define generate_shift_imm_lsr_no_flags(arm_reg, _rm, _shift) \ + if(_shift != 0) \ + { \ + 
check_load_reg_pc(arm_reg, _rm, 8); \ + aa64_emit_lsr(arm_to_a64_reg[arm_reg], arm_to_a64_reg[_rm], _shift); \ + } \ + else \ + { \ + aa64_emit_movlo(arm_to_a64_reg[arm_reg], 0); \ + } \ + _rm = arm_reg \ + +#define generate_shift_imm_asr_no_flags(arm_reg, _rm, _shift) \ + check_load_reg_pc(arm_reg, _rm, 8); \ + aa64_emit_asr(arm_to_a64_reg[arm_reg], arm_to_a64_reg[_rm], \ + _shift ? _shift : 31); \ + _rm = arm_reg \ + +#define generate_shift_imm_ror_no_flags(arm_reg, _rm, _shift) \ + check_load_reg_pc(arm_reg, _rm, 8); \ + if(_shift != 0) \ + { \ + aa64_emit_ror(arm_to_a64_reg[arm_reg], arm_to_a64_reg[_rm], _shift); \ + } \ + else \ + { /* Special case: RRX (no carry update) */ \ + aa64_emit_extr(arm_to_a64_reg[arm_reg], \ + reg_c_cache, arm_to_a64_reg[_rm], 1); \ + } \ + _rm = arm_reg \ + +#define generate_shift_imm_lsl_flags(arm_reg, _rm, _shift) \ + check_load_reg_pc(arm_reg, _rm, 8); \ + if(_shift != 0) \ + { \ + aa64_emit_ubfx(reg_c_cache, arm_to_a64_reg[_rm], (32 - _shift), 1); \ + aa64_emit_lsl(arm_to_a64_reg[arm_reg], arm_to_a64_reg[_rm], _shift); \ + _rm = arm_reg; \ + } \ + +#define generate_shift_imm_lsr_flags(arm_reg, _rm, _shift) \ + check_load_reg_pc(arm_reg, _rm, 8); \ + if(_shift != 0) \ + { \ + aa64_emit_ubfx(reg_c_cache, arm_to_a64_reg[_rm], (_shift - 1), 1); \ + aa64_emit_lsr(arm_to_a64_reg[arm_reg], arm_to_a64_reg[_rm], _shift); \ + } \ + else \ + { \ + aa64_emit_lsr(reg_c_cache, arm_to_a64_reg[_rm], 31); \ + aa64_emit_movlo(arm_to_a64_reg[arm_reg], 0); \ + } \ + _rm = arm_reg \ + +#define generate_shift_imm_asr_flags(arm_reg, _rm, _shift) \ + check_load_reg_pc(arm_reg, _rm, 8); \ + if(_shift != 0) \ + { \ + aa64_emit_ubfx(reg_c_cache, arm_to_a64_reg[_rm], (_shift - 1), 1); \ + aa64_emit_asr(arm_to_a64_reg[arm_reg], arm_to_a64_reg[_rm], _shift); \ + } \ + else \ + { \ + aa64_emit_lsr(reg_c_cache, arm_to_a64_reg[_rm], 31); \ + aa64_emit_asr(arm_to_a64_reg[arm_reg], arm_to_a64_reg[_rm], 31); \ + } \ + _rm = arm_reg \ + +#define generate_shift_imm_ror_flags(arm_reg, _rm, _shift) \ + check_load_reg_pc(arm_reg, _rm, 8); \ + if(_shift != 0) \ + { \ + aa64_emit_ubfx(reg_c_cache, arm_to_a64_reg[_rm], (_shift - 1), 1); \ + aa64_emit_ror(arm_to_a64_reg[arm_reg], arm_to_a64_reg[_rm], _shift); \ + } \ + else \ + { /* Special case: RRX (carry update) */ \ + aa64_emit_extr(reg_temp, reg_c_cache, arm_to_a64_reg[_rm], 1); \ + aa64_emit_ubfx(reg_c_cache, arm_to_a64_reg[_rm], 0, 1); \ + aa64_emit_mov(arm_to_a64_reg[arm_reg], reg_temp); \ + } \ + _rm = arm_reg \ + +#define generate_shift_reg_lsl_no_flags(_rm, _rs) \ + aa64_emit_cmpi(arm_to_a64_reg[_rs], 32); \ + aa64_emit_lslv(reg_temp, arm_to_a64_reg[_rm], arm_to_a64_reg[_rs]); \ + aa64_emit_csel(reg_a0, reg_zero, reg_temp, ccode_hs); \ + +#define generate_shift_reg_lsr_no_flags(_rm, _rs) \ + aa64_emit_cmpi(arm_to_a64_reg[_rs], 32); \ + aa64_emit_lsrv(reg_temp, arm_to_a64_reg[_rm], arm_to_a64_reg[_rs]); \ + aa64_emit_csel(reg_a0, reg_zero, reg_temp, ccode_hs); \ + +#define generate_shift_reg_asr_no_flags(_rm, _rs) \ + aa64_emit_cmpi(arm_to_a64_reg[_rs], 31); \ + aa64_emit_asrv(reg_a0, arm_to_a64_reg[_rm], arm_to_a64_reg[_rs]); \ + aa64_emit_asr(reg_temp, arm_to_a64_reg[_rm], 31); \ + aa64_emit_csel(reg_a0, reg_a0, reg_temp, ccode_lo); \ + +#define generate_shift_reg_ror_no_flags(_rm, _rs) \ + aa64_emit_rorv(reg_a0, arm_to_a64_reg[_rm], arm_to_a64_reg[_rs]) \ + +#define generate_shift_reg_lsl_flags(_rm, _rs) \ +{ \ + u32 shift_reg = _rs; \ + check_load_reg_pc(arm_reg_a1, shift_reg, 8); \ + generate_load_reg_pc(reg_a0, _rm, 12); \ + /* 
Only load the result on zero, no shift */ \ + aa64_emit_cbz(arm_to_a64_reg[shift_reg], 8); \ + aa64_emit_subi(reg_temp, arm_to_a64_reg[shift_reg], 1); \ + aa64_emit_lslv(reg_a0, reg_a0, reg_temp); \ + aa64_emit_lsr(reg_c_cache, reg_a0, 31); \ + aa64_emit_cmpi(arm_to_a64_reg[shift_reg], 33); \ + aa64_emit_lsl(reg_a0, reg_a0, 1); \ + /* Result and flag to be zero if shift is > 32 */ \ + aa64_emit_csel(reg_c_cache, reg_zero, reg_c_cache, ccode_hs); \ + aa64_emit_csel(reg_a0, reg_zero, reg_a0, ccode_hs); \ +} \ + +#define generate_shift_reg_lsr_flags(_rm, _rs) \ +{ \ + u32 shift_reg = _rs; \ + check_load_reg_pc(arm_reg_a1, shift_reg, 8); \ + generate_load_reg_pc(reg_a0, _rm, 12); \ + /* Only load the result on zero, no shift */ \ + aa64_emit_cbz(arm_to_a64_reg[shift_reg], 8); \ + aa64_emit_subi(reg_temp, arm_to_a64_reg[shift_reg], 1); \ + aa64_emit_lsrv(reg_a0, reg_a0, reg_temp); \ + aa64_emit_andi(reg_c_cache, reg_a0, 0, 0); /* imm=1 */ \ + aa64_emit_cmpi(arm_to_a64_reg[shift_reg], 33); \ + aa64_emit_lsr(reg_a0, reg_a0, 1); \ + /* Result and flag to be zero if shift is > 32 */ \ + aa64_emit_csel(reg_c_cache, reg_zero, reg_c_cache, ccode_hs); \ + aa64_emit_csel(reg_a0, reg_zero, reg_a0, ccode_hs); \ +} \ + +#define generate_shift_reg_asr_flags(_rm, _rs) \ + generate_load_reg_pc(reg_a1, _rs, 8); \ + generate_load_reg_pc(reg_a0, _rm, 12); \ + /* Only load the result on zero, no shift */ \ + aa64_emit_cbz(reg_a1, 8); \ + /* Cap shift at 32, since it's equivalent */ \ + aa64_emit_movlo(reg_temp, 32); \ + aa64_emit_cmpi(reg_a1, 32); \ + aa64_emit_csel(reg_a1, reg_a1, reg_temp, ccode_ls); \ + aa64_emit_subi(reg_temp, reg_a1, 1); \ + aa64_emit_asrv(reg_a0, reg_a0, reg_temp); \ + aa64_emit_andi(reg_c_cache, reg_a0, 0, 0); /* imm=1 */ \ + aa64_emit_asr(reg_a0, reg_a0, 1); \ + +#define generate_shift_reg_ror_flags(_rm, _rs) \ + aa64_emit_cbz(arm_to_a64_reg[_rs], 4); \ + aa64_emit_subi(reg_temp, arm_to_a64_reg[_rs], 1); \ + aa64_emit_lsrv(reg_temp, arm_to_a64_reg[_rm], reg_temp); \ + aa64_emit_andi(reg_c_cache, reg_temp, 0, 0); /* imm=1 */ \ + aa64_emit_rorv(reg_a0, arm_to_a64_reg[_rm], arm_to_a64_reg[_rs]) \ + +#define generate_shift_imm(arm_reg, name, flags_op) \ + u32 shift = (opcode >> 7) & 0x1F; \ + generate_shift_imm_##name##_##flags_op(arm_reg, rm, shift) \ + +#define generate_shift_reg(arm_reg, name, flags_op) \ + u32 rs = ((opcode >> 8) & 0x0F); \ + generate_shift_reg_##name##_##flags_op(rm, rs); \ + rm = arm_reg \ + +// Made functions due to the macro expansion getting too large. 
+// Returns a new rm if it redirects it (which will happen on most of these +// cases) + +#define generate_load_rm_sh(flags_op) \ +{ \ + switch((opcode >> 4) & 0x07) \ + { \ + /* LSL imm */ \ + case 0x0: \ + { \ + generate_shift_imm(arm_reg_a0, lsl, flags_op); \ + break; \ + } \ + \ + /* LSL reg */ \ + case 0x1: \ + { \ + generate_shift_reg(arm_reg_a0, lsl, flags_op); \ + break; \ + } \ + \ + /* LSR imm */ \ + case 0x2: \ + { \ + generate_shift_imm(arm_reg_a0, lsr, flags_op); \ + break; \ + } \ + \ + /* LSR reg */ \ + case 0x3: \ + { \ + generate_shift_reg(arm_reg_a0, lsr, flags_op); \ + break; \ + } \ + \ + /* ASR imm */ \ + case 0x4: \ + { \ + generate_shift_imm(arm_reg_a0, asr, flags_op); \ + break; \ + } \ + \ + /* ASR reg */ \ + case 0x5: \ + { \ + generate_shift_reg(arm_reg_a0, asr, flags_op); \ + break; \ + } \ + \ + /* ROR imm */ \ + case 0x6: \ + { \ + generate_shift_imm(arm_reg_a0, ror, flags_op); \ + break; \ + } \ + \ + /* ROR reg */ \ + case 0x7: \ + { \ + generate_shift_reg(arm_reg_a0, ror, flags_op); \ + break; \ + } \ + } \ +} \ + +#define generate_block_extra_vars() \ + u32 stored_pc = pc; \ + +#define generate_block_extra_vars_arm() \ + generate_block_extra_vars(); \ + +#define generate_load_offset_sh() \ + { \ + switch((opcode >> 5) & 0x03) \ + { \ + /* LSL imm */ \ + case 0x0: \ + { \ + generate_shift_imm(arm_reg_a1, lsl, no_flags); \ + break; \ + } \ + \ + /* LSR imm */ \ + case 0x1: \ + { \ + generate_shift_imm(arm_reg_a1, lsr, no_flags); \ + break; \ + } \ + \ + /* ASR imm */ \ + case 0x2: \ + { \ + generate_shift_imm(arm_reg_a1, asr, no_flags); \ + break; \ + } \ + \ + /* ROR imm */ \ + case 0x3: \ + { \ + generate_shift_imm(arm_reg_a1, ror, no_flags); \ + break; \ + } \ + } \ + } \ + +#define generate_indirect_branch_arm() \ +{ \ + if(condition == 0x0E) \ + { \ + generate_indirect_branch_cycle_update(arm); \ + } \ + else \ + { \ + generate_indirect_branch_no_cycle_update(arm); \ + } \ +} \ + +#define generate_indirect_branch_dual() \ +{ \ + if(condition == 0x0E) \ + { \ + generate_indirect_branch_cycle_update(dual); \ + } \ + else \ + { \ + generate_indirect_branch_no_cycle_update(dual); \ + } \ +} \ + +#define generate_block_extra_vars_thumb() \ + generate_block_extra_vars() \ + +// It should be okay to still generate result flags, spsr will overwrite them. +// This is pretty infrequent (returning from interrupt handlers, et al) so +// probably not worth optimizing for. 
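+// The helper below runs when the SPSR is copied back into the CPSR (mode-
+// changing returns): it switches the CPU mode and, if an enabled interrupt
+// is already pending, stashes the return address and redirects execution to
+// the IRQ vector (0x18). Bit 0 of the returned address is set when the
+// resulting CPSR selects Thumb state, which the dual indirect branch uses to
+// pick the decoding mode.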
+ +u32 execute_spsr_restore_body(u32 address) +{ + set_cpu_mode(cpu_modes[reg[REG_CPSR] & 0x1F]); + if((io_registers[REG_IE] & io_registers[REG_IF]) && + io_registers[REG_IME] && ((reg[REG_CPSR] & 0x80) == 0)) + { + reg_mode[MODE_IRQ][6] = address + 4; + spsr[MODE_IRQ] = reg[REG_CPSR]; + reg[REG_CPSR] = 0xD2; + address = 0x00000018; + set_cpu_mode(MODE_IRQ); + } + + if(reg[REG_CPSR] & 0x20) + address |= 0x01; + + return address; +} + +/* Generate the opposite condition to skip the block */ +#define generate_condition_eq() \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbz(reg_z_cache, 0); \ + +#define generate_condition_ne() \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbnz(reg_z_cache, 0); \ + +#define generate_condition_cs() \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbz(reg_c_cache, 0); \ + +#define generate_condition_cc() \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbnz(reg_c_cache, 0); \ + +#define generate_condition_mi() \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbz(reg_n_cache, 0); \ + +#define generate_condition_pl() \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbnz(reg_n_cache, 0); \ + +#define generate_condition_vs() \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbz(reg_v_cache, 0); \ + +#define generate_condition_vc() \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbnz(reg_v_cache, 0); \ + +#define generate_condition_hi() \ + aa64_emit_eori(reg_temp, reg_c_cache, 0, 0); /* imm=1 */ \ + aa64_emit_orr(reg_temp, reg_temp, reg_z_cache); \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbnz(reg_temp, 0); \ + +#define generate_condition_ls() \ + aa64_emit_eori(reg_temp, reg_c_cache, 0, 0); /* imm=1 */ \ + aa64_emit_orr(reg_temp, reg_temp, reg_z_cache); \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbz(reg_temp, 0); \ + +#define generate_condition_ge() \ + aa64_emit_sub(reg_temp, reg_n_cache, reg_v_cache); \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbnz(reg_temp, 0); \ + +#define generate_condition_lt() \ + aa64_emit_sub(reg_temp, reg_n_cache, reg_v_cache); \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbz(reg_temp, 0); \ + +#define generate_condition_gt() \ + aa64_emit_xor(reg_temp, reg_n_cache, reg_v_cache); \ + aa64_emit_orr(reg_temp, reg_temp, reg_z_cache); \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbnz(reg_temp, 0); \ + +#define generate_condition_le() \ + aa64_emit_xor(reg_temp, reg_n_cache, reg_v_cache); \ + aa64_emit_orr(reg_temp, reg_temp, reg_z_cache); \ + (backpatch_address) = translation_ptr; \ + aa64_emit_cbz(reg_temp, 0); \ + +#define generate_condition() \ + switch(condition) \ + { \ + case 0x0: \ + generate_condition_eq(); \ + break; \ + \ + case 0x1: \ + generate_condition_ne(); \ + break; \ + \ + case 0x2: \ + generate_condition_cs(); \ + break; \ + \ + case 0x3: \ + generate_condition_cc(); \ + break; \ + \ + case 0x4: \ + generate_condition_mi(); \ + break; \ + \ + case 0x5: \ + generate_condition_pl(); \ + break; \ + \ + case 0x6: \ + generate_condition_vs(); \ + break; \ + \ + case 0x7: \ + generate_condition_vc(); \ + break; \ + \ + case 0x8: \ + generate_condition_hi(); \ + break; \ + \ + case 0x9: \ + generate_condition_ls(); \ + break; \ + \ + case 0xA: \ + generate_condition_ge(); \ + break; \ + \ + case 0xB: \ + generate_condition_lt(); \ + break; \ + \ + case 0xC: \ + generate_condition_gt(); \ + break; \ + \ + case 0xD: \ + generate_condition_le(); \ + break; \ + \ + case 0xE: \ + break; \ + \ + case 
0xF: \ + break; \ + } \ + +#define generate_branch() \ +{ \ + if(condition == 0x0E) \ + { \ + generate_branch_cycle_update( \ + block_exits[block_exit_position].branch_source, \ + block_exits[block_exit_position].branch_target); \ + } \ + else \ + { \ + generate_branch_no_cycle_update( \ + block_exits[block_exit_position].branch_source, \ + block_exits[block_exit_position].branch_target); \ + } \ + block_exit_position++; \ +} + +// Flag generation, using the native CPU ALU flags +#define generate_op_logic_flags(_reg) \ + if(check_generate_n_flag) \ + { \ + aa64_emit_lsr(reg_n_cache, _reg, 31); \ + } \ + if(check_generate_z_flag) \ + { \ + aa64_emit_cmpi(_reg, 0); \ + aa64_emit_cset(reg_z_cache, ccode_eq); \ + } \ + +#define generate_op_arith_flags() \ + /* Assumes that state is in the flags */ \ + if(check_generate_c_flag) { \ + aa64_emit_cset(reg_c_cache, ccode_hs); \ + } \ + if(check_generate_v_flag) { \ + aa64_emit_cset(reg_v_cache, ccode_vs); \ + } \ + if(check_generate_n_flag) { \ + aa64_emit_cset(reg_n_cache, ccode_mi); \ + } \ + if(check_generate_z_flag) { \ + aa64_emit_cset(reg_z_cache, ccode_eq); \ + } \ + +#define load_c_flag() \ + aa64_emit_movne(reg_temp, 0); \ + aa64_emit_adds(reg_temp, reg_temp, reg_c_cache); \ + +// Muls instruction +#define generate_op_muls_reg(_rd, _rn, _rm) \ + aa64_emit_mul(_rd, _rn, _rm); \ + generate_op_logic_flags(_rd) \ + +// Immediate logical operations. Use native Z and N flag and CSET instruction. +#define generate_op_and_imm(_rd, _rn) \ + generate_logical_imm(and, _rd, _rn, imm) \ + +#define generate_op_orr_imm(_rd, _rn) \ + generate_logical_imm(orr, _rd, _rn, imm) \ + +#define generate_op_eor_imm(_rd, _rn) \ + generate_logical_imm(xor, _rd, _rn, imm) \ + +#define generate_op_bic_imm(_rd, _rn) \ + generate_logical_imm(bic, _rd, _rn, imm) \ + +#define generate_op_ands_imm(_rd, _rn) \ + generate_op_and_imm(_rd, _rn); \ + generate_op_logic_flags(_rd) \ + +#define generate_op_orrs_imm(_rd, _rn) \ + generate_op_orr_imm(_rd, _rn); \ + generate_op_logic_flags(_rd) \ + +#define generate_op_eors_imm(_rd, _rn) \ + generate_op_eor_imm(_rd, _rn); \ + generate_op_logic_flags(_rd) \ + +#define generate_op_bics_imm(_rd, _rn) \ + generate_op_bic_imm(_rd, _rn); \ + generate_op_logic_flags(_rd) \ + +// Register logical operations. Uses also native flags. +#define generate_op_and_reg(_rd, _rn, _rm) \ + aa64_emit_and(_rd, _rn, _rm) \ + +#define generate_op_orr_reg(_rd, _rn, _rm) \ + aa64_emit_orr(_rd, _rn, _rm) \ + +#define generate_op_eor_reg(_rd, _rn, _rm) \ + aa64_emit_xor(_rd, _rn, _rm) \ + +#define generate_op_bic_reg(_rd, _rn, _rm) \ + aa64_emit_bic(_rd, _rn, _rm) \ + +#define generate_op_ands_reg(_rd, _rn, _rm) \ + aa64_emit_and(_rd, _rn, _rm); \ + generate_op_logic_flags(_rd) \ + +#define generate_op_orrs_reg(_rd, _rn, _rm) \ + aa64_emit_orr(_rd, _rn, _rm); \ + generate_op_logic_flags(_rd) \ + +#define generate_op_eors_reg(_rd, _rn, _rm) \ + aa64_emit_xor(_rd, _rn, _rm); \ + generate_op_logic_flags(_rd) \ + +#define generate_op_bics_reg(_rd, _rn, _rm) \ + aa64_emit_bic(_rd, _rn, _rm); \ + generate_op_logic_flags(_rd) \ + +// Arithmetic reg-reg operations. 
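+// Illustrative sketch (assuming all four flags are live, i.e. every
+// check_generate_*_flag is true): an ARM "adds r0, r1, r2" goes through
+// generate_op_adds_reg below and emits roughly:
+//   adds w6, w7, w8      // ARM r0/r1/r2 live in x6/x7/x8
+//   cset w22, hs         // reg_c_cache
+//   cset w23, vs         // reg_v_cache
+//   cset w25, mi         // reg_n_cache
+//   cset w24, eq         // reg_z_cache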
+ +#define generate_op_add_reg(_rd, _rn, _rm) \ + aa64_emit_add(_rd, _rn, _rm) \ + +#define generate_op_sub_reg(_rd, _rn, _rm) \ + aa64_emit_sub(_rd, _rn, _rm) \ + +#define generate_op_rsb_reg(_rd, _rn, _rm) \ + aa64_emit_sub(_rd, _rm, _rn) \ + +#define generate_op_adds_reg(_rd, _rn, _rm) \ + aa64_emit_adds(_rd, _rn, _rm) \ + generate_op_arith_flags() \ + +#define generate_op_subs_reg(_rd, _rn, _rm) \ + aa64_emit_subs(_rd, _rn, _rm) \ + generate_op_arith_flags() \ + +#define generate_op_rsbs_reg(_rd, _rn, _rm) \ + aa64_emit_subs(_rd, _rm, _rn) \ + generate_op_arith_flags() \ + +#define generate_op_adc_reg(_rd, _rn, _rm) \ + aa64_emit_add(_rd, _rn, _rm); /* Two adds is faster */ \ + aa64_emit_add(_rd, _rd, reg_c_cache); \ + +#define generate_op_sbc_reg(_rd, _rn, _rm) \ + /* Rd = Rn - Rm + (C - 1) */ \ + aa64_emit_sub(_rd, _rn, _rm); \ + aa64_emit_subi(reg_temp, reg_c_cache, 1); \ + aa64_emit_add(_rd, _rd, reg_temp); \ + +#define generate_op_rsc_reg(_rd, _rn, _rm) \ + aa64_emit_sub(_rd, _rm, _rn); \ + aa64_emit_subi(reg_temp, reg_c_cache, 1); \ + aa64_emit_add(_rd, _rd, reg_temp); \ + +/* Must use native instruction to accurately calculate C/V flags */ +#define generate_op_adcs_reg(_rd, _rn, _rm) \ + load_c_flag(); \ + aa64_emit_adcs(_rd, _rn, _rm); \ + generate_op_arith_flags() \ + +#define generate_op_sbcs_reg(_rd, _rn, _rm) \ + load_c_flag(); \ + aa64_emit_sbcs(_rd, _rn, _rm); \ + generate_op_arith_flags() \ + +#define generate_op_rscs_reg(_rd, _rn, _rm) \ + load_c_flag(); \ + aa64_emit_sbcs(_rd, _rm, _rn); \ + generate_op_arith_flags() \ + + +#define generate_op_neg_reg(_rd, _rn, _rm) \ + generate_op_subs_reg(_rd, reg_zero, _rm) \ + +// Arithmetic immediate operations. Use native flags when needed (input). + +#define generate_op_add_imm(_rd, _rn) \ + generate_alu_imm(addi, add, _rd, _rn, imm) \ + +#define generate_op_adds_imm(_rd, _rn) \ + generate_alu_imm(addis, adds, _rd, _rn, imm) \ + generate_op_arith_flags(); \ + +#define generate_op_sub_imm(_rd, _rn) \ + generate_alu_imm(subi, sub, _rd, _rn, imm) \ + +#define generate_op_subs_imm(_rd, _rn) \ + generate_alu_imm(subis, subs, _rd, _rn, imm) \ + generate_op_arith_flags(); \ + +#define generate_op_rsb_imm(_rd, _rn) \ + if (imm) { \ + generate_load_imm(reg_temp, imm); \ + aa64_emit_sub(_rd, reg_temp, _rn) \ + } else { \ + aa64_emit_sub(_rd, reg_zero, _rn) \ + } \ + +#define generate_op_rsbs_imm(_rd, _rn) \ + if (imm) { \ + generate_load_imm(reg_temp, imm); \ + aa64_emit_subs(_rd, reg_temp, _rn) \ + } else { \ + aa64_emit_subs(_rd, reg_zero, _rn) \ + } \ + generate_op_arith_flags(); \ + + +#define generate_op_adc_imm(_rd, _rn) \ + if (imm) { \ + generate_alu_imm(addi, add, _rd, _rn, imm); \ + aa64_emit_add(_rd, _rd, reg_c_cache); \ + } else { \ + aa64_emit_add(_rd, _rn, reg_c_cache); \ + } \ + +#define generate_op_sbc_imm(_rd, _rn) \ + /* Rd = Rn - Imm - 1 + C = Rn - (Imm + 1) + C */ \ + generate_alu_imm(subi, sub, _rd, _rn, ((imm) + 1)); \ + aa64_emit_add(_rd, _rd, reg_c_cache); \ + +#define generate_op_rsc_imm(_rd, _rn) \ + /* Rd = Imm - Rn - 1 + C = (Imm - 1) - Rn + C */ \ + generate_load_imm(reg_temp, ((imm)-1)); \ + aa64_emit_sub(_rd, reg_temp, _rn) \ + aa64_emit_add(_rd, _rd, reg_c_cache); \ + +/* Uses native instructions when needed, for C/V accuracy */ +#define generate_op_adcs_imm(_rd, _rn) \ + if (imm) { \ + load_c_flag(); \ + generate_load_imm(reg_temp, (imm)); \ + aa64_emit_adcs(_rd, _rn, reg_temp); \ + } else { \ + aa64_emit_adds(_rd, _rn, reg_c_cache); \ + } \ + generate_op_arith_flags(); \ + +#define 
generate_op_sbcs_imm(_rd, _rn) \ + load_c_flag(); \ + if (imm) { \ + generate_load_imm(reg_temp, (imm)); \ + aa64_emit_sbcs(_rd, _rn, reg_temp); \ + } else { \ + aa64_emit_sbcs(_rd, _rn, reg_zero); \ + } \ + generate_op_arith_flags(); \ + +#define generate_op_rscs_imm(_rd, _rn) \ + load_c_flag(); \ + if (imm) { \ + generate_load_imm(reg_temp, (imm)); \ + aa64_emit_sbcs(_rd, reg_temp, _rn); \ + } else { \ + aa64_emit_sbcs(_rd, reg_zero, _rn); \ + } \ + generate_op_arith_flags(); \ + + +// Move operations, only logical flags +#define generate_op_mov_imm(_rd, _rn) \ + generate_load_imm(_rd, imm) \ + +#define generate_op_movs_imm(_rd, _rn) \ + generate_load_imm(_rd, imm) \ + aa64_emit_movlo(reg_n_cache, (imm) >> 31); \ + aa64_emit_movlo(reg_z_cache, (imm) ? 0 : 1); \ + +#define generate_op_mvn_imm(_rd, _rn) \ + generate_load_imm(_rd, (~imm)) \ + +#define generate_op_mvns_imm(_rd, _rn) \ + generate_load_imm(_rd, (~imm)); \ + aa64_emit_movlo(reg_n_cache, (~(imm)) >> 31); \ + aa64_emit_movlo(reg_z_cache, (~(imm)) ? 1 : 0); \ + +#define generate_op_mov_reg(_rd, _rn, _rm) \ + aa64_emit_mov(_rd, _rm) \ + +#define generate_op_movs_reg(_rd, _rn, _rm) \ + aa64_emit_addi(_rd, _rm, 0); \ + generate_op_logic_flags(_rd) \ + +#define generate_op_mvn_reg(_rd, _rn, _rm) \ + aa64_emit_orn(_rd, reg_zero, _rm) \ + +#define generate_op_mvns_reg(_rd, _rn, _rm) \ + aa64_emit_orn(_rd, reg_zero, _rm) \ + generate_op_logic_flags(_rd) \ + +// Testing/Comparison functions +#define generate_op_cmp_reg(_rd, _rn, _rm) \ + generate_op_subs_reg(reg_temp2, _rn, _rm) \ + +#define generate_op_cmn_reg(_rd, _rn, _rm) \ + generate_op_adds_reg(reg_temp2, _rn, _rm) \ + +#define generate_op_tst_reg(_rd, _rn, _rm) \ + generate_op_ands_reg(reg_temp2, _rn, _rm) \ + +#define generate_op_teq_reg(_rd, _rn, _rm) \ + generate_op_eors_reg(reg_temp2, _rn, _rm) \ + +#define generate_op_cmp_imm(_rd, _rn) \ + generate_op_subs_imm(reg_temp2, _rn) \ + +#define generate_op_cmn_imm(_rd, _rn) \ + generate_op_adds_imm(reg_temp2, _rn) \ + +#define generate_op_tst_imm(_rd, _rn) \ + generate_op_ands_imm(reg_temp2, _rn) \ + +#define generate_op_teq_imm(_rd, _rn) \ + generate_op_eors_imm(reg_temp2, _rn) \ + + +#define arm_generate_op_load_yes() \ + generate_load_reg_pc(reg_a1, rn, 8) \ + +#define arm_generate_op_load_no() \ + +#define arm_op_check_yes() \ + check_load_reg_pc(arm_reg_a1, rn, 8) \ + +#define arm_op_check_no() \ + +#define arm_generate_op_reg_flags(name, load_op) \ + arm_decode_data_proc_reg(opcode); \ + if(check_generate_c_flag) \ + { \ + generate_load_rm_sh(flags); \ + } \ + else \ + { \ + generate_load_rm_sh(no_flags); \ + } \ + \ + arm_op_check_##load_op(); \ + generate_op_##name##_reg(arm_to_a64_reg[rd], arm_to_a64_reg[rn], \ + arm_to_a64_reg[rm]) \ + +#define arm_generate_op_reg(name, load_op) \ + arm_decode_data_proc_reg(opcode); \ + generate_load_rm_sh(no_flags); \ + arm_op_check_##load_op(); \ + generate_op_##name##_reg(arm_to_a64_reg[rd], arm_to_a64_reg[rn], \ + arm_to_a64_reg[rm]) \ + +#define arm_generate_op_imm(name, load_op) \ + arm_decode_data_proc_imm(opcode); \ + ror(imm, imm, imm_ror); \ + arm_op_check_##load_op(); \ + generate_op_##name##_imm(arm_to_a64_reg[rd], arm_to_a64_reg[rn]) \ + +#define arm_generate_op_imm_flags(name, load_op) \ + arm_generate_op_imm(name, load_op) \ + +#define arm_data_proc(name, type, flags_op) \ +{ \ + arm_generate_op_##type(name, yes); \ + check_store_reg_pc_##flags_op(rd); \ +} \ + +#define arm_data_proc_test(name, type) \ +{ \ + arm_generate_op_##type(name, yes); \ +} \ + +#define 
arm_data_proc_unary(name, type, flags_op) \ +{ \ + arm_generate_op_##type(name, no); \ + check_store_reg_pc_##flags_op(rd); \ +} \ + +// 32 bit multiplication + +#define arm_multiply_flags_yes(_rd) \ + generate_op_logic_flags(_rd) \ + +#define arm_multiply_flags_no(_rd) \ + +#define arm_multiply_add_no() \ + aa64_emit_mul(arm_to_a64_reg[rd], arm_to_a64_reg[rm], arm_to_a64_reg[rs]); \ + +#define arm_multiply_add_yes() \ + aa64_emit_madd(arm_to_a64_reg[rd], arm_to_a64_reg[rn], \ + arm_to_a64_reg[rm], arm_to_a64_reg[rs]); \ + +#define arm_multiply(add_op, flags) \ +{ \ + arm_decode_multiply(); \ + arm_multiply_add_##add_op(); \ + arm_multiply_flags_##flags(arm_to_a64_reg[rd]); \ +} \ + +// 32x32 -> 64 multiplication (long mul/muladd) + +#define generate_multiply_s64() \ + aa64_emit_smaddl(reg_temp, reg_zero, arm_to_a64_reg[rm], arm_to_a64_reg[rs]) + +#define generate_multiply_u64() \ + aa64_emit_umaddl(reg_temp, reg_zero, arm_to_a64_reg[rm], arm_to_a64_reg[rs]) + +#define generate_multiply_s64_add() \ + aa64_emit_smaddl(reg_temp, reg_temp, arm_to_a64_reg[rm], arm_to_a64_reg[rs]) + +#define generate_multiply_u64_add() \ + aa64_emit_umaddl(reg_temp, reg_temp, arm_to_a64_reg[rm], arm_to_a64_reg[rs]) + +#define arm_multiply_long_flags_yes(_rdlo, _rdhi) \ + aa64_emit_orr(reg_z_cache, _rdlo, _rdhi); \ + aa64_emit_cmpi(reg_z_cache, 0); \ + aa64_emit_cset(reg_z_cache, ccode_eq); \ + aa64_emit_lsr(reg_n_cache, _rdhi, 31); \ + +#define arm_multiply_long_flags_no(_rdlo, _rdhi) \ + +#define arm_multiply_long_add_yes(name) \ + aa64_emit_merge_regs(reg_temp, arm_to_a64_reg[rdhi], arm_to_a64_reg[rdlo]); \ + generate_multiply_##name() \ + +#define arm_multiply_long_add_no(name) \ + generate_multiply_##name() \ + +#define arm_multiply_long(name, add_op, flags) \ +{ \ + arm_decode_multiply_long(); \ + arm_multiply_long_add_##add_op(name); \ + aa64_emit_andi64(arm_to_a64_reg[rdlo], reg_temp, 0, 31); \ + aa64_emit_lsr64(arm_to_a64_reg[rdhi], reg_temp, 32); \ + arm_multiply_long_flags_##flags(arm_to_a64_reg[rdlo], arm_to_a64_reg[rdhi]);\ +} \ + +#define arm_psr_read(op_type, psr_reg) \ + generate_function_call(execute_read_##psr_reg); \ + generate_store_reg(reg_res, rd) \ + +u32 execute_store_cpsr_body(u32 _cpsr, u32 store_mask, u32 address) +{ + reg[REG_CPSR] = _cpsr; + if(store_mask & 0xFF) + { + set_cpu_mode(cpu_modes[_cpsr & 0x1F]); + if((io_registers[REG_IE] & io_registers[REG_IF]) && + io_registers[REG_IME] && ((_cpsr & 0x80) == 0)) + { + reg_mode[MODE_IRQ][6] = address + 4; + spsr[MODE_IRQ] = _cpsr; + reg[REG_CPSR] = 0xD2; + set_cpu_mode(MODE_IRQ); + return 0x00000018; + } + } + + return 0; +} + +#define arm_psr_load_new_reg() \ + generate_load_reg(reg_a0, rm) \ + +#define arm_psr_load_new_imm() \ + generate_load_imm(reg_a0, imm) \ + +#define arm_psr_store(op_type, psr_reg) \ + arm_psr_load_new_##op_type(); \ + generate_load_imm(reg_a1, psr_masks[psr_field]); \ + generate_load_pc(reg_a2, (pc)); \ + generate_function_call(execute_store_##psr_reg) \ + +#define arm_psr(op_type, transfer_type, psr_reg) \ +{ \ + arm_decode_psr_##op_type(opcode); \ + arm_psr_##transfer_type(op_type, psr_reg); \ +} \ + +#define thumb_load_pc_pool_const(rd, value) \ + generate_load_imm(arm_to_a64_reg[rd], (value)); \ + +#define arm_access_memory_load(mem_type) \ + cycle_count += 2; \ + generate_load_pc(reg_a1, (pc + 8)); \ + generate_function_call(execute_load_##mem_type); \ + generate_store_reg(reg_res, rd); \ + check_store_reg_pc_no_flags(rd) \ + +#define arm_access_memory_store(mem_type) \ + cycle_count++; \ + 
generate_load_pc(reg_a2, (pc + 4)); \ + generate_load_reg_pc(reg_a1, rd, 12); \ + generate_function_call(execute_store_##mem_type) \ + +#define arm_access_memory_reg_pre_up() \ + aa64_emit_add(reg_a0, arm_to_a64_reg[rn], arm_to_a64_reg[rm]) \ + +#define arm_access_memory_reg_pre_down() \ + aa64_emit_sub(reg_a0, arm_to_a64_reg[rn], arm_to_a64_reg[rm]) \ + +#define arm_access_memory_reg_pre(adjust_dir) \ + check_load_reg_pc(arm_reg_a0, rn, 8); \ + arm_access_memory_reg_pre_##adjust_dir() \ + +#define arm_access_memory_reg_pre_wb(adjust_dir) \ + arm_access_memory_reg_pre(adjust_dir); \ + generate_store_reg(reg_a0, rn) \ + +#define arm_access_memory_reg_post_up() \ + aa64_emit_add(arm_to_a64_reg[rn], arm_to_a64_reg[rn], arm_to_a64_reg[rm]) \ + +#define arm_access_memory_reg_post_down() \ + aa64_emit_sub(arm_to_a64_reg[rn], arm_to_a64_reg[rn], arm_to_a64_reg[rm]) \ + +#define arm_access_memory_reg_post(adjust_dir) \ + generate_load_reg(reg_a0, rn); \ + arm_access_memory_reg_post_##adjust_dir() \ + +#define arm_access_memory_imm_pre_up() \ + aa64_emit_addi(reg_a0, arm_to_a64_reg[rn], offset) \ + +#define arm_access_memory_imm_pre_down() \ + aa64_emit_subi(reg_a0, arm_to_a64_reg[rn], offset) \ + +#define arm_access_memory_imm_pre(adjust_dir) \ + check_load_reg_pc(arm_reg_a0, rn, 8); \ + arm_access_memory_imm_pre_##adjust_dir() \ + +#define arm_access_memory_imm_pre_wb(adjust_dir) \ + arm_access_memory_imm_pre(adjust_dir); \ + generate_store_reg(reg_a0, rn) \ + +#define arm_access_memory_imm_post_up() \ + aa64_emit_addi(arm_to_a64_reg[rn], arm_to_a64_reg[rn], offset) \ + +#define arm_access_memory_imm_post_down() \ + aa64_emit_subi(arm_to_a64_reg[rn], arm_to_a64_reg[rn], offset) \ + +#define arm_access_memory_imm_post(adjust_dir) \ + generate_load_reg(reg_a0, rn); \ + arm_access_memory_imm_post_##adjust_dir() \ + +#define arm_data_trans_reg(adjust_op, adjust_dir) \ + arm_decode_data_trans_reg(); \ + generate_load_offset_sh(); \ + arm_access_memory_reg_##adjust_op(adjust_dir) \ + +#define arm_data_trans_imm(adjust_op, adjust_dir) \ + arm_decode_data_trans_imm(); \ + arm_access_memory_imm_##adjust_op(adjust_dir) \ + +#define arm_data_trans_half_reg(adjust_op, adjust_dir) \ + arm_decode_half_trans_r(); \ + arm_access_memory_reg_##adjust_op(adjust_dir) \ + +#define arm_data_trans_half_imm(adjust_op, adjust_dir) \ + arm_decode_half_trans_of(); \ + arm_access_memory_imm_##adjust_op(adjust_dir) \ + +#define arm_access_memory(access_type, direction, adjust_op, mem_type, \ + offset_type) \ +{ \ + arm_data_trans_##offset_type(adjust_op, direction); \ + arm_access_memory_##access_type(mem_type); \ +} \ + +#define word_bit_count(word) \ + (bit_count[word >> 8] + bit_count[word & 0xFF]) \ + +#define arm_block_memory_load() \ + generate_function_call(execute_aligned_load32); \ + generate_store_reg(reg_res, i) \ + +#define arm_block_memory_store() \ + generate_load_reg_pc(reg_a1, i, 8); \ + generate_function_call(execute_aligned_store32) \ + +#define arm_block_memory_final_load() \ + arm_block_memory_load() \ + +#define arm_block_memory_final_store() \ + generate_load_pc(reg_a2, (pc + 4)); \ + generate_load_reg(reg_a1, i) \ + generate_function_call(execute_store_u32); \ + +#define arm_block_memory_adjust_pc_store() \ + +#define arm_block_memory_adjust_pc_load() \ + if(reg_list & 0x8000) \ + { \ + generate_indirect_branch_arm(); \ + } \ + +#define arm_block_memory_offset_down_a() \ + aa64_emit_subi(reg_save0, base_reg, ((word_bit_count(reg_list)-1) * 4)) \ + +#define arm_block_memory_offset_down_b() \ + 
aa64_emit_subi(reg_save0, base_reg, (word_bit_count(reg_list) * 4)) \ + +#define arm_block_memory_offset_no() \ + aa64_emit_addi(reg_save0, base_reg, 0) \ + +#define arm_block_memory_offset_up() \ + aa64_emit_addi(reg_save0, base_reg, 4) \ + +#define arm_block_memory_writeback_down() \ + aa64_emit_subi(base_reg, base_reg, (word_bit_count(reg_list) * 4)) \ + +#define arm_block_memory_writeback_up() \ + aa64_emit_addi(base_reg, base_reg, (word_bit_count(reg_list) * 4)) \ + +#define arm_block_memory_writeback_no() + +// Only emit writeback if the register is not in the list + +#define arm_block_memory_writeback_load(writeback_type) \ + if(!((reg_list >> rn) & 0x01)) \ + { \ + arm_block_memory_writeback_##writeback_type(); \ + } \ + +#define arm_block_memory_writeback_store(writeback_type) \ + arm_block_memory_writeback_##writeback_type() \ + +#define arm_block_memory(access_type, offset_type, writeback_type, s_bit) \ +{ \ + arm_decode_block_trans(); \ + u32 i; \ + u32 offset = 0; \ + u32 base_reg = arm_to_a64_reg[rn]; \ + \ + arm_block_memory_offset_##offset_type(); \ + arm_block_memory_writeback_##access_type(writeback_type); \ + \ + { \ + aa64_emit_andi(reg_save0, reg_save0, 30, 29); /* clear 2 LSB */ \ + \ + for(i = 0; i < 16; i++) \ + { \ + if((reg_list >> i) & 0x01) \ + { \ + cycle_count++; \ + aa64_emit_addi(reg_a0, reg_save0, offset); \ + if(reg_list & ~((2 << i) - 1)) \ + { \ + arm_block_memory_##access_type(); \ + offset += 4; \ + } \ + else \ + { \ + arm_block_memory_final_##access_type(); \ + break; \ + } \ + } \ + } \ + \ + arm_block_memory_adjust_pc_##access_type(); \ + } \ +} \ + + +// ARM: rn *must* be different from rm and rd. rm *can* be the same as rd. + +#define arm_swap(type) \ +{ \ + arm_decode_swap(); \ + cycle_count += 3; \ + generate_load_reg(reg_a0, rn); \ + generate_function_call(execute_load_##type); \ + generate_load_reg(reg_a1, rm); \ + generate_store_reg(reg_res, rd); \ + generate_load_reg(reg_a0, rn); \ + generate_function_call(execute_store_##type); \ +} \ + +#define thumb_generate_op_load_yes(_rs) \ + generate_load_reg(reg_a1, _rs) \ + +#define thumb_generate_op_load_no(_rs) \ + +#define thumb_generate_op_reg(name, _rd, _rs, _rn) \ + generate_op_##name##_reg(arm_to_a64_reg[_rd], \ + arm_to_a64_reg[_rs], arm_to_a64_reg[_rn]) \ + +#define thumb_generate_op_imm(name, _rd, _rs, _rn) \ + generate_op_##name##_imm(arm_to_a64_reg[_rd], arm_to_a64_reg[_rs]) \ + +// Types: add_sub, add_sub_imm, alu_op, imm +// Affects N/Z/C/V flags + +#define thumb_data_proc(type, name, rn_type, _rd, _rs, _rn) \ +{ \ + thumb_decode_##type(); \ + thumb_generate_op_##rn_type(name, _rd, _rs, _rn); \ +} \ + +#define thumb_data_proc_test(type, name, rn_type, _rs, _rn) \ +{ \ + thumb_decode_##type(); \ + thumb_generate_op_##rn_type(name, 0, _rs, _rn); \ +} \ + +#define thumb_data_proc_unary(type, name, rn_type, _rd, _rn) \ +{ \ + thumb_decode_##type(); \ + thumb_generate_op_##rn_type(name, _rd, 0, _rn); \ +} \ + +#define check_store_reg_pc_thumb(_rd) \ + if(_rd == REG_PC) \ + { \ + generate_indirect_branch_cycle_update(thumb); \ + } \ + +#define thumb_data_proc_hi(name) \ +{ \ + thumb_decode_hireg_op(); \ + u32 dest_rd = rd; \ + check_load_reg_pc(arm_reg_a0, rs, 4); \ + check_load_reg_pc(arm_reg_a1, rd, 4); \ + generate_op_##name##_reg(arm_to_a64_reg[dest_rd], arm_to_a64_reg[rd], \ + arm_to_a64_reg[rs]); \ + check_store_reg_pc_thumb(dest_rd); \ +} \ + +#define thumb_data_proc_test_hi(name) \ +{ \ + thumb_decode_hireg_op(); \ + check_load_reg_pc(arm_reg_a0, rs, 4); \ + 
check_load_reg_pc(arm_reg_a1, rd, 4); \ + generate_op_##name##_reg(reg_temp, arm_to_a64_reg[rd], \ + arm_to_a64_reg[rs]); \ +} \ + +#define thumb_data_proc_mov_hi() \ +{ \ + thumb_decode_hireg_op(); \ + check_load_reg_pc(arm_reg_a0, rs, 4); \ + generate_mov(rd, rs); \ + check_store_reg_pc_thumb(rd); \ +} \ + +#define thumb_load_pc(_rd) \ +{ \ + thumb_decode_imm(); \ + generate_load_pc(arm_to_a64_reg[_rd], (((pc & ~2) + 4) + (imm * 4))); \ +} \ + +#define thumb_load_sp(_rd) \ +{ \ + thumb_decode_imm(); \ + aa64_emit_addi(arm_to_a64_reg[_rd], reg_r13, (imm * 4)); \ +} \ + +#define thumb_adjust_sp_up() \ + aa64_emit_addi(reg_r13, reg_r13, (imm * 4)); \ + +#define thumb_adjust_sp_down() \ + aa64_emit_subi(reg_r13, reg_r13, (imm * 4)); \ + +#define thumb_adjust_sp(direction) \ +{ \ + thumb_decode_add_sp(); \ + thumb_adjust_sp_##direction(); \ +} \ + +// Decode types: shift, alu_op +// Operation types: lsl, lsr, asr, ror +// Affects N/Z/C flags + +#define thumb_generate_shift_imm(name) \ + if(check_generate_c_flag) \ + { \ + generate_shift_imm_##name##_flags(rd, rs, imm); \ + } \ + else \ + { \ + generate_shift_imm_##name##_no_flags(rd, rs, imm); \ + } \ + if(rs != rd) \ + { \ + generate_mov(rd, rs); \ + } \ + +#define thumb_generate_shift_reg(name) \ +{ \ + u32 original_rd = rd; \ + if(check_generate_c_flag) \ + { \ + generate_shift_reg_##name##_flags(rd, rs); \ + } \ + else \ + { \ + generate_shift_reg_##name##_no_flags(rd, rs); \ + } \ + aa64_emit_addi(arm_to_a64_reg[original_rd], reg_a0, 0); \ +} \ + +#define thumb_shift(decode_type, op_type, value_type) \ +{ \ + thumb_decode_##decode_type(); \ + thumb_generate_shift_##value_type(op_type); \ + generate_op_logic_flags(arm_to_a64_reg[rd]); \ +} \ + +// Operation types: imm, mem_reg, mem_imm + +#define thumb_access_memory_load(mem_type, reg_rd) \ + cycle_count += 2; \ + generate_load_pc(reg_a1, (pc + 4)); \ + generate_function_call(execute_load_##mem_type); \ + generate_store_reg(reg_res, reg_rd) \ + +#define thumb_access_memory_store(mem_type, reg_rd) \ + cycle_count++; \ + generate_load_reg(reg_a1, reg_rd) \ + generate_load_pc(reg_a2, (pc + 2)); \ + generate_function_call(execute_store_##mem_type); \ + +#define thumb_access_memory_generate_address_pc_relative(offset, reg_rb, \ + reg_ro) \ + generate_load_pc(reg_a0, (offset)) \ + +#define thumb_access_memory_generate_address_reg_imm(offset, reg_rb, reg_ro) \ + aa64_emit_addi(reg_a0, arm_to_a64_reg[reg_rb], (offset)) \ + +#define thumb_access_memory_generate_address_reg_imm_sp(offset, reg_rb, reg_ro) \ + aa64_emit_addi(reg_a0, arm_to_a64_reg[reg_rb], (offset * 4)) \ + +#define thumb_access_memory_generate_address_reg_reg(offset, reg_rb, reg_ro) \ + aa64_emit_add(reg_a0, arm_to_a64_reg[reg_rb], arm_to_a64_reg[reg_ro]) \ + +#define thumb_access_memory(access_type, op_type, reg_rd, reg_rb, reg_ro, \ + address_type, offset, mem_type) \ +{ \ + thumb_decode_##op_type(); \ + thumb_access_memory_generate_address_##address_type(offset, reg_rb, \ + reg_ro); \ + thumb_access_memory_##access_type(mem_type, reg_rd); \ +} \ + + +#define thumb_block_address_preadjust_no(base_reg) \ + aa64_emit_addi(reg_save0, base_reg, 0) \ + +#define thumb_block_address_preadjust_down(base_reg) \ + aa64_emit_subi(reg_save0, base_reg, (bit_count[reg_list] * 4)); \ + aa64_emit_addi(base_reg, reg_save0, 0); \ + +#define thumb_block_address_preadjust_push_lr(base_reg) \ + aa64_emit_subi(reg_save0, base_reg, ((bit_count[reg_list] + 1) * 4)); \ + aa64_emit_addi(base_reg, reg_save0, 0); \ + +#define 
thumb_block_address_postadjust_no(base_reg) \ + +#define thumb_block_address_postadjust_up(base_reg) \ + aa64_emit_addi(base_reg, reg_save0, (bit_count[reg_list] * 4)); \ + +#define thumb_block_address_postadjust_pop_pc(base_reg) \ + aa64_emit_addi(base_reg, reg_save0, ((bit_count[reg_list] + 1) * 4)); \ + +#define thumb_block_address_postadjust_push_lr(base_reg) \ + +#define thumb_block_memory_load() \ + generate_function_call(execute_aligned_load32); \ + generate_store_reg(reg_res, i) \ + +#define thumb_block_memory_store() \ + generate_load_reg(reg_a1, i) \ + generate_function_call(execute_aligned_store32); \ + +#define thumb_block_memory_final_load() \ + thumb_block_memory_load() \ + +#define thumb_block_memory_final_store() \ + generate_load_pc(reg_a2, (pc + 2)); \ + generate_load_reg(reg_a1, i) \ + generate_function_call(execute_store_u32); \ + +#define thumb_block_memory_final_no(access_type) \ + thumb_block_memory_final_##access_type() \ + +#define thumb_block_memory_final_up(access_type) \ + thumb_block_memory_final_##access_type() \ + +#define thumb_block_memory_final_down(access_type) \ + thumb_block_memory_final_##access_type() \ + +#define thumb_block_memory_final_push_lr(access_type) \ + thumb_block_memory_##access_type() \ + +#define thumb_block_memory_final_pop_pc(access_type) \ + thumb_block_memory_##access_type() \ + +#define thumb_block_memory_extra_no() \ + +#define thumb_block_memory_extra_up() \ + +#define thumb_block_memory_extra_down() \ + +#define thumb_block_memory_extra_push_lr() \ + aa64_emit_addi(reg_a0, reg_save0, (bit_count[reg_list] * 4)); \ + generate_load_reg(reg_a1, REG_LR) \ + generate_function_call(execute_aligned_store32); \ + +#define thumb_block_memory_extra_pop_pc() \ + aa64_emit_addi(reg_a0, reg_save0, (bit_count[reg_list] * 4)); \ + generate_function_call(execute_aligned_load32); \ + generate_indirect_branch_cycle_update(thumb) \ + +#define thumb_block_memory(access_type, pre_op, post_op, arm_base_reg) \ +{ \ + thumb_decode_rlist(); \ + u32 i; \ + u32 offset = 0; \ + u32 base_reg = arm_to_a64_reg[arm_base_reg]; \ + \ + thumb_block_address_preadjust_##pre_op(base_reg); \ + thumb_block_address_postadjust_##post_op(base_reg); \ + \ + { \ + aa64_emit_andi(reg_save0, reg_save0, 30, 29); /* clear 2 LSB */ \ + \ + for(i = 0; i < 8; i++) \ + { \ + if((reg_list >> i) & 0x01) \ + { \ + cycle_count++; \ + aa64_emit_addi(reg_a0, reg_save0, offset); \ + if(reg_list & ~((2 << i) - 1)) \ + { \ + thumb_block_memory_##access_type(); \ + offset += 4; \ + } \ + else \ + { \ + thumb_block_memory_final_##post_op(access_type); \ + break; \ + } \ + } \ + } \ + \ + thumb_block_memory_extra_##post_op(); \ + } \ +} + +#define generate_branch_filler(condition_code, writeback_location) \ + (writeback_location) = translation_ptr; \ + aa64_emit_brcond(condition_code, 0); \ + + +#define thumb_conditional_branch(condition) \ +{ \ + generate_cycle_update(); \ + generate_condition_##condition(); \ + generate_branch_no_cycle_update( \ + block_exits[block_exit_position].branch_source, \ + block_exits[block_exit_position].branch_target); \ + generate_branch_patch_conditional(backpatch_address, translation_ptr); \ + block_exit_position++; \ +} \ + +#define arm_conditional_block_header() \ + generate_cycle_update(); \ + generate_condition(); \ + +#define arm_b() \ + generate_branch() \ + +#define arm_bl() \ + generate_load_pc(reg_r14, (pc + 4)); \ + generate_branch() \ + +#define arm_bx() \ + arm_decode_branchx(opcode); \ + generate_load_reg(reg_a0, rn); \ + 
generate_indirect_branch_dual() \ + +#define arm_swi() \ + generate_load_pc(reg_a0, (pc + 4)); \ + generate_function_call(execute_swi); \ + generate_branch() \ + +#define thumb_b() \ + generate_branch_cycle_update( \ + block_exits[block_exit_position].branch_source, \ + block_exits[block_exit_position].branch_target); \ + block_exit_position++ \ + +#define thumb_bl() \ + generate_load_pc(reg_r14, ((pc + 2) | 0x01)); \ + generate_branch_cycle_update( \ + block_exits[block_exit_position].branch_source, \ + block_exits[block_exit_position].branch_target); \ + block_exit_position++ \ + +#define thumb_blh() \ +{ \ + thumb_decode_branch(); \ + generate_alu_imm(addi, add, reg_a0, reg_r14, (offset * 2)); \ + generate_load_pc(reg_r14, ((pc + 2) | 0x01)); \ + generate_indirect_branch_cycle_update(dual); \ + break; \ +} \ + +#define thumb_bx() \ +{ \ + thumb_decode_hireg_op(); \ + generate_load_reg_pc(reg_a0, rs, 4); \ + generate_indirect_branch_cycle_update(dual); \ +} \ + +#define thumb_process_cheats() \ + generate_function_call(a64_cheat_hook); + +#define arm_process_cheats() \ + generate_function_call(a64_cheat_hook); + +#ifdef TRACE_INSTRUCTIONS + void trace_instruction(u32 pc, u32 mode) + { + if (mode) + printf("Executed arm %x\n", pc); + else + printf("Executed thumb %x\n", pc); + #ifdef TRACE_REGISTERS + print_regs(); + #endif + } + + #define emit_trace_instruction(pc, mode) \ + emit_save_regs(); \ + generate_load_imm(reg_a0, pc); \ + generate_load_imm(reg_a1, mode); \ + generate_function_call(trace_instruction); \ + emit_restore_regs() + #define emit_trace_thumb_instruction(pc) emit_trace_instruction(pc, 0) + #define emit_trace_arm_instruction(pc) emit_trace_instruction(pc, 1) +#else + #define emit_trace_thumb_instruction(pc) + #define emit_trace_arm_instruction(pc) +#endif + +#define thumb_swi() \ + generate_load_pc(reg_a0, (pc + 2)); \ + generate_function_call(execute_swi); \ + generate_branch_cycle_update( \ + block_exits[block_exit_position].branch_source, \ + block_exits[block_exit_position].branch_target); \ + block_exit_position++ \ + +#define arm_hle_div(cpu_mode) \ + aa64_emit_sdiv(reg_r3, reg_r0, reg_r1); \ + aa64_emit_msub(reg_r1, reg_r0, reg_r1, reg_r3); \ + aa64_emit_mov(reg_r0, reg_r3); \ + aa64_emit_cmpi(reg_r3, 0); \ + aa64_emit_csneg(reg_r3, reg_r3, reg_r3, ccode_ge); \ + +#define arm_hle_div_arm(cpu_mode) \ + aa64_emit_sdiv(reg_r3, reg_r1, reg_r0); \ + aa64_emit_msub(reg_r1, reg_r1, reg_r0, reg_r3); \ + aa64_emit_mov(reg_r0, reg_r3); \ + aa64_emit_cmpi(reg_r3, 0); \ + aa64_emit_csneg(reg_r3, reg_r3, reg_r3, ccode_ge); \ + +#define generate_translation_gate(type) \ + generate_load_pc(reg_a0, pc); \ + generate_indirect_branch_no_cycle_update(type) \ + + +extern void* ldst_handler_functions[16*4 + 17*6]; +extern void* ldst_lookup_tables[16*4 + 17*6]; + + +void init_emitter() { + rom_cache_watermark = 0; + init_bios_hooks(); + + // Generate handler table + memcpy(ldst_lookup_tables, ldst_handler_functions, sizeof(ldst_lookup_tables)); +} + +u32 execute_arm_translate_internal(u32 cycles, void *regptr); + +u32 execute_arm_translate(u32 cycles) { + return execute_arm_translate_internal(cycles, ®[0]); +} + +#endif + + diff --git a/arm/arm64_stub.S b/arm/arm64_stub.S new file mode 100644 index 0000000..66a87f8 --- /dev/null +++ b/arm/arm64_stub.S @@ -0,0 +1,705 @@ +# gameplaySP +# +# Copyright (C) 2021 David Guillen Fandos +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free 
Software Foundation; either version 2 of +# the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + + +#include "../gpsp_config.h" + +#define defsymbl(symbol) \ +.align 2; \ +.type symbol, %function ;\ +.global symbol ; \ +.global _##symbol ; \ +symbol: \ +_##symbol: + +.text +.align 2 + +#define REG_R0 (0 * 4) +#define REG_R1 (1 * 4) +#define REG_R2 (2 * 4) +#define REG_R3 (3 * 4) +#define REG_R4 (4 * 4) +#define REG_R5 (5 * 4) +#define REG_R6 (6 * 4) +#define REG_R7 (7 * 4) +#define REG_R8 (8 * 4) +#define REG_R9 (9 * 4) +#define REG_R10 (10 * 4) +#define REG_R11 (11 * 4) +#define REG_R12 (12 * 4) +#define REG_R13 (13 * 4) +#define REG_R14 (14 * 4) +#define REG_SP (13 * 4) +#define REG_LR (14 * 4) +#define REG_PC (15 * 4) +#define REG_CPSR (16 * 4) +#define CPU_MODE (17 * 4) +#define CPU_HALT_STATE (18 * 4) + +#define REG_N_FLAG (20 * 4) +#define REG_Z_FLAG (21 * 4) +#define REG_C_FLAG (22 * 4) +#define REG_V_FLAG (23 * 4) +#define CHANGED_PC_STATUS (24 * 4) +#define COMPLETED_FRAME (25 * 4) +#define OAM_UPDATED (26 * 4) +#define REG_SAVE (27 * 4) +#define REG_SAVE2 (28 * 4) +#define REG_SAVE3 (29 * 4) +#define REG_SAVE4 (30 * 4) +#define REG_SAVE5 (31 * 4) + +#define reg_base x20 +#define reg_cycles w21 + +#define reg_c_flag w22 +#define reg_v_flag w23 +#define reg_z_flag w24 +#define reg_n_flag w25 + + +// Memory offsets from reg_base to the different buffers +#define RDMAP_OFF -0xB9000 // 8K pointers (64KB) +#define IWRAM_OFF -0xA9000 // 32KB (double for shadow) +#define VRAM_OFF -0x99000 // 96KB +#define EWRAM_OFF -0x81000 // 256KB (double for shadow) +#define MEM_TBL_OFF -0x1000 // Some space for the tables +#define SPSR_RAM_OFF 0x100 +#define REGMODE_RAM_OFF 0x118 +#define OAM_RAM_OFF 0x200 +#define PAL_RAM_OFF 0x600 +#define IOREG_OFF 0xA00 +#define PALCNV_RAM_OFF 0xE00 + +// Used for SWI handling +#define MODE_SUPERVISOR 3 +#define SUPERVISOR_SPSR (SPSR_RAM_OFF + 3*4) // spsr[3] +#define SUPERVISOR_LR (REGMODE_RAM_OFF + (3 * (7 * 4)) + (6 * 4)) // reg_mode[3][6] + + +// Stores and restores registers to their register storage in RAM + +#define load_registers() ;\ + ldp w6, w7, [reg_base, #0] ;\ + ldp w8, w9, [reg_base, #8] ;\ + ldp w10, w11, [reg_base, #16] ;\ + ldp w12, w13, [reg_base, #24] ;\ + ldp w14, w15, [reg_base, #32] ;\ + ldp w16, w17, [reg_base, #40] ;\ + ldp w26, w27, [reg_base, #48] ;\ + ldr w28, [reg_base, #56] ;\ + +#define store_registers() ;\ + stp w6, w7, [reg_base, #0] ;\ + stp w8, w9, [reg_base, #8] ;\ + stp w10, w11, [reg_base, #16] ;\ + stp w12, w13, [reg_base, #24] ;\ + stp w14, w15, [reg_base, #32] ;\ + stp w16, w17, [reg_base, #40] ;\ + stp w26, w27, [reg_base, #48] ;\ + str w28, [reg_base, #56] ;\ + + +// Extracts flags from CPSR into the cache flag registers + +#define extract_flags_reg(tmpreg) ;\ + ubfx reg_n_flag, tmpreg, #31, #1 ;\ + ubfx reg_z_flag, tmpreg, #30, #1 ;\ + ubfx reg_c_flag, tmpreg, #29, #1 ;\ + ubfx reg_v_flag, tmpreg, #28, #1 ;\ + +#define extract_flags(tmpreg) ;\ + ldr tmpreg, [reg_base, #REG_CPSR] ;\ + extract_flags_reg(tmpreg) ;\ + +// Collects cache flag bits and consolidates them 
to the CPSR reg + +#define consolidate_flags(tmpreg) ;\ + ldr tmpreg, [reg_base, #REG_CPSR] ;\ + bfi tmpreg, reg_n_flag, #31, #1 ;\ + bfi tmpreg, reg_z_flag, #30, #1 ;\ + bfi tmpreg, reg_c_flag, #29, #1 ;\ + bfi tmpreg, reg_v_flag, #28, #1 ;\ + str tmpreg, [reg_base, #REG_CPSR] ;\ + + +// Update the GBA hardware (video, sound, input, etc) +// w0: current PC + +defsymbl(a64_update_gba) + str w0, [reg_base, #REG_PC] // update the PC value + str lr, [reg_base, #REG_SAVE] // Save LR for later if needed + + consolidate_flags(w0) // update the CPSR + store_registers() // save out registers + + bl update_gba // update GBA state + + ldr w1, [reg_base, #COMPLETED_FRAME] // return to main if new frame + cbnz w1, return_to_main + + // Resume execution (perhaps from a new PC) + mov reg_cycles, w0 // load new cycle count + extract_flags(w2) // reload flag cache bits + + ldr w0, [reg_base, #CHANGED_PC_STATUS] // see if PC has change + cbnz w0, 1f // go start from new PC + + ldr lr, [reg_base, #REG_SAVE] // Restore return point + load_registers() // reload registers + ret // resume execution, no PC change + +1: // Resume from new PC + ldr w0, [reg_base, #REG_PC] // load new PC + tbnz w2, #5, 2f // CPSR.T means in thumb mode + + bl block_lookup_address_arm + load_registers() // reload registers + br x0 // jump to new ARM block +2: + bl block_lookup_address_thumb + load_registers() // reload registers + br x0 // jump to new Thumb block +.size a64_update_gba, .-a64_update_gba + + +// Cheat hooks for master function +// This is called whenever PC == cheats-master-function +// Just calls the C function to process cheats + +defsymbl(a64_cheat_hook) + store_registers() + str lr, [reg_base, #REG_SAVE] + bl process_cheats + ldr lr, [reg_base, #REG_SAVE] + load_registers() + ret + + +// These are b stubs for performing indirect branches. They are not +// linked to and don't return, instead they link elsewhere. + +// Input: +// r0: PC to branch to + +defsymbl(a64_indirect_branch_arm) + store_registers() + bl block_lookup_address_arm + load_registers() + br x0 + +defsymbl(a64_indirect_branch_thumb) + store_registers() + bl block_lookup_address_thumb + load_registers() + br x0 + +defsymbl(a64_indirect_branch_dual) + store_registers() + bl block_lookup_address_dual + load_registers() + br x0 + + +// Read CPSR and SPSR values + +defsymbl(execute_read_cpsr) + consolidate_flags(w0) // Consolidate on ret value + ret + +defsymbl(execute_read_spsr) + ldr w1, [reg_base, #CPU_MODE] // read cpu mode to w1 + add x0, reg_base, #SPSR_RAM_OFF // ptr to spsr table + ldr w0, [x0, x1, lsl #2] // Read actual value from trable + ret + + +// Update the cpsr. 
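+// Only the bits selected by the mask in w1 are rewritten; the rest of the
+// CPSR is preserved, and the cached NZCV registers are refreshed from the
+// merged value before the remaining work is done in C.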
+
+// Input:
+// w0: new cpsr value
+// w1: bitmask of which bits in cpsr to update
+// w2: current PC
+
+defsymbl(execute_store_cpsr)
+  ldr w4, [reg_base, #REG_CPSR]          // read current CPSR
+  and w3, w0, w1                         // reg_flags = new_cpsr & store_mask
+  bic w4, w4, w1                         // current_cpsr & ~store_mask
+  orr w0, w3, w4                         // w0 = final CPSR value
+  extract_flags_reg(w0)                  // Update cached flags too
+
+  str lr, [reg_base, #REG_SAVE]
+  store_registers()
+  bl execute_store_cpsr_body             // Do the remaining work in C code
+
+  cbnz w0, 1f                            // If the PC changed due to this write
+
+  ldr lr, [reg_base, #REG_SAVE]          // Resume execution where we left off
+  load_registers()
+  ret
+
+1:
+  // Returned value contains the PC, resume execution there
+  bl block_lookup_address_arm
+  load_registers()
+  br x0                                  // Resume in the returned block
+.size execute_store_cpsr, .-execute_store_cpsr
+
+
+// Write to SPSR
+// w0: new SPSR value
+// w1: store mask
+
+defsymbl(execute_store_spsr)
+  ldr w2, [reg_base, #CPU_MODE]          // read cpu mode into w2
+  add x2, reg_base, x2, lsl #2           // calculate table offset
+  ldr w3, [x2, #SPSR_RAM_OFF]            // Read actual value from table
+
+  and w0, w0, w1                         // new-spsr & mask
+  bic w3, w3, w1                         // old-spsr & ~mask
+  orr w0, w0, w3                         // final spsr value
+
+  str w0, [x2, #SPSR_RAM_OFF]            // Store new SPSR
+  ret
+.size execute_store_spsr, .-execute_store_spsr
+
+// Restore the cpsr from the current mode's spsr and perform the mode switch.
+
+// Input:
+// r0: current pc
+
+defsymbl(execute_spsr_restore)
+  ldr w1, [reg_base, #CPU_MODE]          // w1 = cpu_mode
+  cbz w1, 1f                             // Ignore if in user mode
+
+  lsl w2, w1, #2                         // We access 32-bit words
+  add w2, w2, #SPSR_RAM_OFF
+  ldr w3, [reg_base, x2]                 // w3 = spsr[cpu_mode]
+  str w3, [reg_base, #REG_CPSR]          // update CPSR with SPSR value
+  extract_flags_reg(w3)                  // update cached flag values
+
+  // This function call will pass r0 (address) and return it.
+  str lr, [reg_base, #REG_SAVE]
+  store_registers()                      // save ARM registers
+  bl execute_spsr_restore_body
+  ldr lr, [reg_base, #REG_SAVE]
+  load_registers()
+
+1:
+  ret
+.size execute_spsr_restore, .-execute_spsr_restore
+
+
+// Setup the mode transition work for calling an SWI.
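+// The PC passed in w0 becomes the supervisor banked LR (the SWI return
+// address), the consolidated CPSR becomes the supervisor SPSR, and the CPU
+// is switched to supervisor mode (0x13) with IRQs masked (0x80) before
+// set_cpu_mode is called.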
+ +// Input: +// r0: current pc + +defsymbl(execute_swi) + str lr, [reg_base, #REG_SAVE] + str w0, [reg_base, #SUPERVISOR_LR] // Store next PC into supervisor LR + consolidate_flags(w1) // Calculate current CPSR flags + str w1, [reg_base, #SUPERVISOR_SPSR] // Store them in the SPSR + bic w1, w1, #0x3F // Clear mode bits + mov w2, #(0x13 | 0x80) // Set supervisor mode bits + orr w1, w1, w2 + str w1, [reg_base, #REG_CPSR] // Update CPSR with new value + store_registers() + mov w0, #MODE_SUPERVISOR + bl set_cpu_mode // Set supervisor mode + ldr lr, [reg_base, #REG_SAVE] + load_registers() + ret +.size execute_swi, .-execute_swi + +defsymbl(execute_arm_translate_internal) + // save registers that will be clobbered + sub sp, sp, #96 + stp x19, x20, [sp, #0] + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + + mov reg_cycles, w0 // load cycle counter + mov reg_base, x1 // init base_reg + + // Check whether the CPU is sleeping already, we should just wait for IRQs + ldr w1, [reg_base, #CPU_HALT_STATE] + cmp w1, #0 + bne alert_loop + + ldr w0, [reg_base, #REG_PC] // r0 = current pc + ldr w1, [reg_base, #REG_CPSR] // r1 = flags + tst w1, #0x20 // see if Thumb bit is set + extract_flags(w2) // load flags + + bne 1f // if so lookup thumb + + bl block_lookup_address_arm + load_registers() + br x0 // jump to first ARM block +1: + bl block_lookup_address_thumb + load_registers() + br x0 // jump to first Thumb block + + +// Epilogue to return to the main thread (whatever called execute_arm_translate) + +return_to_main: + // restore the saved regs and return + ldp x19, x20, [sp, #0] + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp x25, x26, [sp, #48] + ldp x27, x28, [sp, #64] + ldp x29, x30, [sp, #80] + add sp, sp, #96 + ret + + +// Memory read stub routines + +#define execute_load_builder(load_type, ldop, ldmask, tblidx, ldfn) ;\ + ;\ +defsymbl(execute_load_##load_type) ;\ + tst w0, #(0xf0000000 | ldmask) ;\ + lsr w3, w0, #24 ;\ + csinc w3, wzr, w3, ne ;\ + add x4, reg_base, (MEM_TBL_OFF + tblidx*136) ;\ + ldr x3, [x4, x3, lsl #3] ;\ + br x3 ;\ + ;\ +ld_bios_##load_type: /* BIOS area, need to verify PC */;\ + lsr w3, w1, #24 /* Are we running the BIOS */;\ + cbnz w3, ld_slow_##load_type ;\ + and w0, w0, #(0x7fff) /* BIOS only 16 KB */;\ + add x3, reg_base, #(RDMAP_OFF) ;\ + ldr x3, [x3] /* x3 = bios mem buffer */;\ + ldop w0, [x3, x0] /* load actual value */;\ + ret ;\ + ;\ +ld_ewram_##load_type: /* EWRAM area */;\ + and w0, w0, #(0x3ffff) ;\ + add x3, reg_base, #EWRAM_OFF ;\ + ldop w0, [x3, x0] ;\ + ret ;\ + ;\ +ld_iwram_##load_type: /* IWRAM area */;\ + and w0, w0, #(0x7fff) ;\ + add x3, reg_base, #(IWRAM_OFF+0x8000) ;\ + ldop w0, [x3, x0] ;\ + ret ;\ + ;\ +ld_ioram_##load_type: /* I/O RAM area */;\ + and w0, w0, #(0x3ff) ;\ + add x3, reg_base, #(IOREG_OFF) ;\ + ldop w0, [x3, x0] ;\ + ret ;\ + ;\ +ld_palram_##load_type: /* PAL RAM area */;\ + and w0, w0, #(0x3ff) ;\ + add x3, reg_base, #(PAL_RAM_OFF) ;\ + ldop w0, [x3, x0] ;\ + ret ;\ + ;\ +ld_oamram_##load_type: /* OAM RAM area */;\ + and w0, w0, #(0x3ff) ;\ + add x3, reg_base, #(OAM_RAM_OFF) ;\ + ldop w0, [x3, x0] ;\ + ret ;\ + ;\ +ld_rdmap_##load_type: ;\ + lsr w4, w0, #15 /* Each block is 32KB */;\ + add x3, reg_base, #(RDMAP_OFF) ;\ + ldr x4, [x3, x4, lsl #3] /* x4 = table pointer */;\ + and w0, w0, #(0x7fff) /* 32KB pages */;\ + ldop w0, [x4, x0] /* load actual value */;\ + ret ;\ + ;\ +ld_slow_##load_type: /* Slow C path */;\ + str w1, [reg_base, #REG_PC] 
/* write out PC */;\ + str lr, [reg_base, #REG_SAVE] /* Save LR */;\ + store_registers() ;\ + bl ldfn ;\ + ldr lr, [reg_base, #REG_SAVE] ;\ + load_registers() ;\ + ret ;\ +.size execute_load_##load_type, .-execute_load_##load_type + +#define load_lookup_table(load_type, aload_type) ;\ + .quad ld_slow_##aload_type /* -1: Unaligned/Bad access */;\ + .quad ld_bios_##aload_type /* 0x00: BIOS */;\ + .quad ld_slow_##aload_type /* 0x01: Open bus */;\ + .quad ld_ewram_##load_type /* 0x02: ewram */;\ + .quad ld_iwram_##load_type /* 0x03: iwram */;\ + .quad ld_ioram_##load_type /* 0x04: I/O regs */;\ + .quad ld_palram_##load_type /* 0x05: palette RAM */;\ + .quad ld_rdmap_##load_type /* 0x06: vram */;\ + .quad ld_oamram_##load_type /* 0x07: oam ram */;\ + .quad ld_rdmap_##load_type /* 0x08: gamepak: ignore */;\ + .quad ld_rdmap_##load_type /* 0x09: gamepak: ignore */;\ + .quad ld_rdmap_##load_type /* 0x0A: gamepak: ignore */;\ + .quad ld_rdmap_##load_type /* 0x0B: gamepak: ignore */;\ + .quad ld_rdmap_##load_type /* 0x0C: gamepak: ignore */;\ + .quad ld_slow_##aload_type /* 0x0D: EEPROM */;\ + .quad ld_slow_##aload_type /* 0x0E: backup */;\ + .quad ld_slow_##aload_type /* 0x0F: ignore */;\ + +// Aligned load is a bit special +defsymbl(execute_aligned_load32) + tst w0, #(0xf0000000) + lsr w3, w0, #24 + csinc w3, wzr, w3, ne + add x4, reg_base, (MEM_TBL_OFF + 5*136) + ldr x3, [x4, x3, lsl #3] + br x3 +ld_slow_aligned_u32: // Slow C path for multiple loads + str lr, [reg_base, #REG_SAVE] // Save LR + store_registers() + bl read_memory32 + ldr lr, [reg_base, #REG_SAVE] + load_registers() + ret +ld_bios_aligned_u32: + and w0, w0, #(0x7fff) // Do not verify PC on purpose + add x3, reg_base, #(RDMAP_OFF) + ldr x3, [x3] + ldr w0, [x3, x0] + ret + + +execute_load_builder( u8, ldrb, 0, 0, read_memory8) +execute_load_builder( s8, ldrsb, 0, 1, read_memory8s) +execute_load_builder(u16, ldrh, 1, 2, read_memory16) +execute_load_builder(s16, ldrsh, 1, 3, read_memory16s) +execute_load_builder(u32, ldr, 3, 4, read_memory32) + + +// Prepares for a external store (calls C code) +#define store_align_8() and w1, w1, #0xff +#define store_align_16() and w1, w1, #0xffff; bic w0, w0, #1 +#define store_align_32() bic w0, w0, #3 + +// Write out to memory. 
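+// The region nibble (address bits 24-27) selects a handler from a per-width
+// dispatch table, while out-of-map addresses fall straight through to the C
+// fallback. The IWRAM/EWRAM handlers also probe the SMC shadow area and
+// divert to the SMC handler, which flushes the translation cache, when the
+// target address holds previously translated code.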
+ +// Input: +// w0: address +// w1: value +// w2: PC value + +#define execute_store_builder(store_type, str_op, str_op16, load_op, \ + stmask, stmask16, tblidx) ;\ + ;\ +defsymbl(execute_store_u##store_type) ;\ + lsr w4, w0, #28 ;\ + lsr w3, w0, #24 ;\ + cbnz w4, ext_store_u##store_type ;\ + add x4, reg_base, (MEM_TBL_OFF + 816 + tblidx*128) ;\ + ldr x3, [x4, x3, lsl #3] ;\ + br x3 ;\ + ;\ +ext_store_u##store_type: ;\ +ext_store_u##store_type##_safe: ;\ + str w2, [reg_base, #REG_PC] /* write out PC */;\ + str lr, [reg_base, #REG_SAVE] /* Preserve LR */;\ + store_align_##store_type() ;\ + store_registers() ;\ + bl write_memory##store_type ;\ + cbnz w0, write_epilogue /* handle additional write stuff */;\ + ldr lr, [reg_base, #REG_SAVE] ;\ + load_registers() ;\ + ret /* resume if no side effects */;\ + ;\ +ext_store_iwram_u##store_type: ;\ + and w0, w0, #(0x7fff & ~stmask) /* Mask to mirror memory (+align)*/;\ + add x3, reg_base, #(IWRAM_OFF+0x8000) /* x3 = iwram base */;\ + str_op w1, [x0, x3] /* store data */;\ + sub x3, x3, #0x8000 /* x3 = iwram smc base */;\ + load_op w1, [x0, x3] /* w1 = SMC sentinel */;\ + cbnz w1, 3f /* Check value, should be zero */;\ + ret /* return */;\ + ;\ +ext_store_ewram_u##store_type: ;\ + and w0, w0, #(0x3ffff & ~stmask) /* Mask to mirror memory (+align)*/;\ + add x3, reg_base, #EWRAM_OFF /* x3 = ewram base */;\ + str_op w1, [x0, x3] /* store data */;\ + add x3, x3, #0x40000 /* x3 = ewram smc base */;\ + load_op w1, [x0, x3] /* w1 = SMC sentinel */;\ + cbnz w1, 3f /* Check value, should be zero */;\ + ret /* return */;\ + ;\ +ext_store_vram_u##store_type: ;\ +ext_store_vram_u##store_type##_safe: ;\ + and w0, w0, #(0x1ffff & ~stmask16) /* Mask to mirror memory (+align)*/;\ + sub w3, w0, #0x8000 /* Mirrored addr for last bank */;\ + cmp w0, #0x18000 /* Check if exceeds 96KB */;\ + csel w0, w3, w0, cs /* If it does, pick the mirror */;\ + add x3, reg_base, #VRAM_OFF /* x3 = ewram base */;\ + str_op16 w1, [x0, x3] /* store data */;\ + ret /* return */;\ + ;\ +ext_store_oam_ram_u##store_type: ;\ +ext_store_oam_ram_u##store_type##_safe: ;\ + and w0, w0, #(0x3ff & ~stmask16) /* Mask to mirror memory (+align)*/;\ + add x3, reg_base, #OAM_RAM_OFF /* x3 = oam ram base */;\ + str_op16 w1, [x0, x3] /* store data */;\ + str w29, [reg_base, #OAM_UPDATED] /* write non zero to signal */;\ + ret /* return */;\ + ;\ +3: ;\ + str w2, [reg_base, #REG_PC] /* write out PC */;\ + store_registers() /* store registers */;\ + consolidate_flags(w1) ;\ + b smc_write /* perform smc write */;\ +.size execute_store_u##store_type, .-execute_store_u##store_type + +// for ignored areas, just return +ext_store_ignore: + ret // return + +#define store_lookup_table(store_type) ;\ + .quad ext_store_ignore /* 0x00: BIOS, ignore */;\ + .quad ext_store_ignore /* 0x01: ignore */;\ + .quad ext_store_ewram_u##store_type /* 0x02: ewram */;\ + .quad ext_store_iwram_u##store_type /* 0x03: iwram */;\ + .quad ext_store_u##store_type /* 0x04: I/O regs */;\ + .quad ext_store_u##store_type /* 0x05: palette RAM */;\ + .quad ext_store_vram_u##store_type /* 0x06: vram */;\ + .quad ext_store_oam_ram_u##store_type /* 0x07: oam ram */;\ + .quad ext_store_u##store_type /* 0x08: gamepak: ignore */;\ + .quad ext_store_u##store_type /* 0x09: gamepak: ignore */;\ + .quad ext_store_u##store_type /* 0x0A: gamepak: ignore */;\ + .quad ext_store_u##store_type /* 0x0B: gamepak: ignore */;\ + .quad ext_store_u##store_type /* 0x0C: gamepak: ignore */;\ + .quad ext_store_u##store_type /* 0x0D: EEPROM */;\ + .quad 
ext_store_u##store_type /* 0x0E: backup */;\ + .quad ext_store_ignore /* 0x0F: ignore */;\ + +execute_store_builder(8, strb, strh, ldrb, 0, 1, 0) +execute_store_builder(16, strh, strh, ldrh, 1, 1, 1) +execute_store_builder(32, str, str, ldr, 3, 3, 2) + +// This is a store that is executed in a strm case (so no SMC checks in-between) + +defsymbl(execute_aligned_store32) + lsr w4, w0, #28 + lsr w3, w0, #24 + cbnz w4, ext_store_u32 + add x4, reg_base, MEM_TBL_OFF + 816 + 3*128 + ldr x3, [x4, x3, lsl #3] + br x3 +ext_store_iwram_u32_safe: + and w0, w0, #(0x7fff) // Mask to mirror memory (no need to align!) + add x3, reg_base, #(IWRAM_OFF+0x8000) // x3 = iwram base + str w1, [x0, x3] // store data + ret // Return +ext_store_ewram_u32_safe: + and w0, w0, #(0x3ffff) // Mask to mirror memory (no need to align!) + add x3, reg_base, #(EWRAM_OFF) // x3 = ewram base + str w1, [x0, x3] // store data + ret // Return +.size execute_aligned_store32, .-execute_aligned_store32 + +// This is called whenever an external store with side effects was performed +write_epilogue: + consolidate_flags(w1) // update the CPSR before update + + cmp w0, #2 // see if the alert is due to SMC + beq smc_write // if so, goto SMC handler + +alert_loop: + bl update_gba // update GBA until CPU isn't halted + + ldr w1, [reg_base, #COMPLETED_FRAME] // Check whether a frame was completed + cbnz w1, return_to_main // and return to caller function. + + ldr w1, [reg_base, #CPU_HALT_STATE] // Check whether the CPU is halted + cbnz w1, alert_loop // and keep looping until it is + + mov reg_cycles, w0 // load new cycle count + ldr w0, [reg_base, #REG_PC] // load new PC + b lookup_pc // Resume execution at that PC + + +smc_write: + bl flush_translation_cache_ram + ldr w0, [reg_base, #REG_PC] // load "current new" PC + +// Resume execution at PC (at w0) +lookup_pc: + ldr w1, [reg_base, #REG_CPSR] // w1 = flags + extract_flags_reg(w1) + tbnz w1, #5, 2f // see if Thumb bit is set + + // Lookup and jump to the right mode block + bl block_lookup_address_arm + load_registers() + br x0 +2: + bl block_lookup_address_thumb + load_registers() + br x0 + +.data +.align 4 +defsymbl(ldst_handler_functions) + load_lookup_table(u8, u8) + load_lookup_table(s8, s8) + load_lookup_table(u16, u16) + load_lookup_table(s16, s16) + load_lookup_table(u32, u32) + load_lookup_table(u32, aligned_u32) + store_lookup_table(8) + store_lookup_table(16) + store_lookup_table(32) + store_lookup_table(32_safe) + +.bss +.align 4 + +defsymbl(memory_map_read) + .space 0x10000 +defsymbl(iwram) + .space 0x10000 +defsymbl(vram) + .space 0x18000 +defsymbl(ewram) + .space 0x80000 +defsymbl(ldst_lookup_tables) + .space 4096 +defsymbl(reg) + .space 0x100 +defsymbl(spsr) + .space 24 +defsymbl(reg_mode) + .space 196 + .space 36 // Padding +defsymbl(oam_ram) + .space 0x400 +defsymbl(palette_ram) + .space 0x400 +defsymbl(io_registers) + .space 0x400 +defsymbl(palette_ram_converted) + .space 0x400 + + diff --git a/cpu_threaded.c b/cpu_threaded.c index 0d3a989..7988493 100644 --- a/cpu_threaded.c +++ b/cpu_threaded.c @@ -218,6 +218,8 @@ extern u8 bit_count[256]; #include "mips/mips_emit.h" #elif defined(ARM_ARCH) #include "arm/arm_emit.h" +#elif defined(ARM64_ARCH) + #include "arm/arm64_emit.h" #else #include "x86/x86_emit.h" #endif @@ -243,7 +245,7 @@ extern u8 bit_count[256]; void platform_cache_sync(void *baseaddr, void *endptr) { ctr_flush_invalidate_cache(); } -#elif defined(ARM_ARCH) +#elif defined(ARM_ARCH) || defined(ARM64_ARCH) void platform_cache_sync(void *baseaddr, void 
*endptr) { __clear_cache(baseaddr, endptr); } diff --git a/jni/Android.mk b/jni/Android.mk index 1ece93b..f582726 100644 --- a/jni/Android.mk +++ b/jni/Android.mk @@ -12,6 +12,10 @@ ifeq ($(TARGET_ARCH),arm) COREFLAGS += -DARM_ARCH -DMMAP_JIT_CACHE CPU_ARCH := arm HAVE_DYNAREC := 1 +else ifeq ($(TARGET_ARCH),arm64) + COREFLAGS += -DARM64_ARCH -DMMAP_JIT_CACHE + CPU_ARCH := arm64 + HAVE_DYNAREC := 1 else ifeq ($(TARGET_ARCH),x86) COREFLAGS += -DMMAP_JIT_CACHE CPU_ARCH := x86_32 diff --git a/tests/Makefile b/tests/Makefile new file mode 100644 index 0000000..20a92d9 --- /dev/null +++ b/tests/Makefile @@ -0,0 +1,12 @@ + +ARMV8PFX=/opt/buildroot-armv8el-uclibc/bin/aarch64-buildroot-linux-uclibc + +all: + gcc -o arm64gen arm64gen.c -ggdb -I../arm/ + ./arm64gen > bytecode.bin + $(ARMV8PFX)-as -o bytecoderef.o arm64gen.S + $(ARMV8PFX)-objcopy -O binary bytecoderef.o bytecoderef.bin + @ cmp bytecoderef.bin bytecode.bin || echo "Bytecode mismatch" + @ cmp bytecoderef.bin bytecode.bin && echo "Test passed!" + + diff --git a/tests/arm64gen.S b/tests/arm64gen.S new file mode 100644 index 0000000..127463f --- /dev/null +++ b/tests/arm64gen.S @@ -0,0 +1,208 @@ + +b 16*4 +bl 16*4 + +b.eq 16*4 +b.ne 16*4 +b.hs 16*4 +b.lo 16*4 +b.mi 16*4 +b.pl 16*4 +b.vs 16*4 +b.vc 16*4 +b.hi 16*4 +b.ls 16*4 +b.ge 16*4 +b.lt 16*4 +b.gt 16*4 +b.le 16*4 +b.al 16*4 +b.nv 16*4 + +ldr w1, [x2, #64] +ldr w29, [x30, #64] +str w1, [x2, #64] +str w29, [x30, #64] + +mov w0, #0x1234 +mov w12, #0x5656 +mov w12, #0xFFFF + +movk w13, #0x9876, lsl #16 +movk w13, #0xFFFF, lsl #16 + +movz w13, #0xabcd, lsl #16 + +mov w14, #0xffff5555 + +add w11, w12, w13, lsl #0 +add w11, w12, w13, lsl #19 +add w11, w12, w13, lsl #31 + +add w1, w29, #0x123 +add w1, w29, #0xFFF +sub w1, w29, #0x123 +sub w1, w29, #0xFFF + +add w3, w30, #0x123000 +add w3, w30, #0xFFF000 +sub w3, w30, #0x123000 +sub w3, w30, #0xFFF000 + +adds w29, w30, #0x123 +adds w29, w30, #0xFFF +subs w29, w30, #0x123 +subs w29, w30, #0xFFF + +madd w2, w3, w4, w5 +madd w25, w26, w27, w28 +msub w2, w3, w4, w5 +msub w25, w26, w27, w28 + +smaddl x2, w3, w4, x5 +smaddl x25, w26, w27, x28 +umaddl x2, w3, w4, x5 +umaddl x25, w26, w27, x28 + +mul w1, w2, w3 +mul w27, w28, w29 + +ror w1, w2, #1 +ror w1, w2, #31 +ror w30, w29, #1 +ror w30, w29, #31 + +lsr w1, w2, #1 +lsr w1, w2, #31 +lsr w30, w29, #1 +lsr w30, w29, #31 + +lsl w1, w2, #1 +lsl w1, w2, #31 +lsl w30, w29, #1 +lsl w30, w29, #31 + +asr w1, w2, #1 +asr w1, w2, #31 +asr w30, w29, #1 +asr w30, w29, #31 + +lsr x1, x2, #1 +lsr x1, x2, #2 +lsr x1, x2, #62 +lsr x1, x2, #63 +lsr x30, x29, #1 +lsr x30, x29, #62 + +eor w3, w4, #1 +eor w3, w4, #(~1) +orr w3, w4, #1 +orr w3, w4, #(~1) +and w3, w4, #1 +and w3, w4, #(~3) + +and x3, x4, #0xffffffff +and x3, x4, #0x1 +and x1, x2, #1 +and x1, x2, #(~1) +and x1, x2, #0xffffffff + +mov w1, w2 +mov w30, wzr + +orr w1, w2, w3 +orr w29, w30, wzr +eor w1, w2, w3 +eor w29, w30, wzr +orn w1, w2, w3 +orn w29, w30, wzr +and w1, w2, w3 +and w29, w30, wzr +bic w1, w2, w3 +bic w29, w30, wzr +ands w1, w2, w3 +ands w29, w30, wzr + +tst w1, w2 +tst w25, wzr +cmp w1, #0 +cmp w30, #0 +cmp w1, #32 +cmp w30, #32 +cmp w1, #200 +cmp w30, #200 + +add w1, w2, w3 +add w29, w30, w28 +sub w1, w2, w3 +sub w29, w30, w28 +adc w1, w2, w3 +adc w29, w30, w28 +sbc w1, w2, w3 +sbc w29, w30, w28 +adds w1, w2, w3 +adds w29, w30, w28 +subs w1, w2, w3 +subs w29, w30, w28 +adcs w1, w2, w3 +adcs w29, w30, w28 +sbcs w1, w2, w3 +sbcs w29, w30, w28 + +tbz w20, #1, 63*4 +tbnz w20, #1, 63*4 +tbz w20, #0, 2*4 +tbnz w20, #7, 2*4 + +cbz w20, 63*4 +cbnz 
w20, 63*4 +cbz w20, 2*4 +cbnz w20, 2*4 + +csel w20, w24, w25, ne +csel w1, w2, w3, eq +csel w1, w20, wzr, lt +csel w1, wzr, wzr, gt + +csinc w20, w24, w25, ne +csinc w1, w2, w3, eq +csinc w1, w20, wzr, lt +csinc w1, wzr, wzr, gt + +csinv w20, w24, w25, ne +csinv w1, w2, w3, eq +csinv w1, w20, wzr, lt +csinv w1, wzr, wzr, gt + +csneg w20, w24, w25, ne +csneg w1, w2, w3, eq +csneg w1, w20, wzr, lt +csneg w1, wzr, wzr, gt + +cset w1, eq +cset w1, hs +cset w20, lo +csetm w1, hs +csetm w20, lo + +ubfx w1, w2, #8, #8 +ubfx w1, w2, #16, #16 +ubfx w1, wzr, #8, #24 +ubfx w1, wzr, #16, #16 + +rorv w1, w2, w3 +rorv w28, w29, w30 +lslv w1, w2, w3 +lslv w28, w29, w30 +lsrv w1, w2, w3 +lsrv w28, w29, w30 +asrv w1, w2, w3 +asrv w28, w29, w30 + +orr x1, x2, x3, lsl #32 +orr x25, x26, x27, lsl #32 + +sdiv w1, w2, w3 +sdiv w28, w29, w30 + + diff --git a/tests/arm64gen.c b/tests/arm64gen.c new file mode 100644 index 0000000..55b57d9 --- /dev/null +++ b/tests/arm64gen.c @@ -0,0 +1,223 @@ + +#define u32 uint32_t +#define u8 uint8_t + +#include +#include +#include "arm64_codegen.h" + +int main() { + u32 buffer[1024]; + u8 *translation_ptr = (u8*)&buffer[0]; + + aa64_emit_branch(16); + aa64_emit_brlink(16); + + aa64_emit_brcond(ccode_eq, 16); + aa64_emit_brcond(ccode_ne, 16); + aa64_emit_brcond(ccode_hs, 16); + aa64_emit_brcond(ccode_lo, 16); + aa64_emit_brcond(ccode_mi, 16); + aa64_emit_brcond(ccode_pl, 16); + aa64_emit_brcond(ccode_vs, 16); + aa64_emit_brcond(ccode_vc, 16); + aa64_emit_brcond(ccode_hi, 16); + aa64_emit_brcond(ccode_ls, 16); + aa64_emit_brcond(ccode_ge, 16); + aa64_emit_brcond(ccode_lt, 16); + aa64_emit_brcond(ccode_gt, 16); + aa64_emit_brcond(ccode_le, 16); + aa64_emit_brcond(ccode_al, 16); + aa64_emit_brcond(ccode_nv, 16); + + aa64_emit_ldr(1, 2, 16); + aa64_emit_ldr(29, 30, 16); + aa64_emit_str(1, 2, 16); + aa64_emit_str(29, 30, 16); + + aa64_emit_movlo(0, 0x1234); + aa64_emit_movlo(12, 0x5656); + aa64_emit_movlo(12, ~0); + + aa64_emit_movhi(13, 0x9876); + aa64_emit_movhi(13, ~0); + + aa64_emit_movhiz(13, 0xabcd); + + aa64_emit_movne(14, 0xAAAA); + + aa64_emit_add_lsl(11, 12, 13, 0); + aa64_emit_add_lsl(11, 12, 13, 19); + aa64_emit_add_lsl(11, 12, 13, 31); + + aa64_emit_addi(1, 29, 0x123); + aa64_emit_addi(1, 29, 0xFFF); + aa64_emit_subi(1, 29, 0x123); + aa64_emit_subi(1, 29, 0xFFF); + + aa64_emit_addi12(3, 30, 0x123); + aa64_emit_addi12(3, 30, 0xFFF); + aa64_emit_subi12(3, 30, 0x123); + aa64_emit_subi12(3, 30, 0xFFF); + + aa64_emit_addis(29, 30, 0x123); + aa64_emit_addis(29, 30, 0xFFF); + aa64_emit_subis(29, 30, 0x123); + aa64_emit_subis(29, 30, 0xFFF); + + aa64_emit_madd(2, 5, 3, 4); + aa64_emit_madd(25, 28, 26, 27); + aa64_emit_msub(2, 5, 3, 4); + aa64_emit_msub(25, 28, 26, 27); + + aa64_emit_smaddl(2, 5, 3, 4); + aa64_emit_smaddl(25, 28, 26, 27); + aa64_emit_umaddl(2, 5, 3, 4); + aa64_emit_umaddl(25, 28, 26, 27); + + aa64_emit_mul(1, 2, 3); + aa64_emit_mul(27, 28, 29); + + aa64_emit_ror(1, 2, 1); + aa64_emit_ror(1, 2, 31); + aa64_emit_ror(30, 29, 1); + aa64_emit_ror(30, 29, 31); + + aa64_emit_lsr(1, 2, 1); + aa64_emit_lsr(1, 2, 31); + aa64_emit_lsr(30, 29, 1); + aa64_emit_lsr(30, 29, 31); + + aa64_emit_lsl(1, 2, 1); + aa64_emit_lsl(1, 2, 31); + aa64_emit_lsl(30, 29, 1); + aa64_emit_lsl(30, 29, 31); + + aa64_emit_asr(1, 2, 1); + aa64_emit_asr(1, 2, 31); + aa64_emit_asr(30, 29, 1); + aa64_emit_asr(30, 29, 31); + + aa64_emit_lsr64(1, 2, 1); + aa64_emit_lsr64(1, 2, 2); + aa64_emit_lsr64(1, 2, 62); + aa64_emit_lsr64(1, 2, 63); + aa64_emit_lsr64(30, 29, 1); + aa64_emit_lsr64(30, 29, 62); + + 
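+  /* The logical-immediate emitters below take the hardware (immr, imms)
+     encoding of the bitmask rather than the mask value itself; the adjacent
+     comments give the intended mask. */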
aa64_emit_eori(3, 4, 0, 0); + aa64_emit_eori(3, 4, 31, 30); /* ~1 */ + aa64_emit_orri(3, 4, 0, 0); + aa64_emit_orri(3, 4, 31, 30); + aa64_emit_andi(3, 4, 0, 0); + aa64_emit_andi(3, 4, 30, 29); /* ~3 */ + + aa64_emit_andi64(3, 4, 0, 31); + aa64_emit_andi64(3, 4, 0, 0); + aa64_emit_andi64(1, 2, 0, 0); /* & 1 */ + aa64_emit_andi64(1, 2, 63, 62); /* & ~1 */ + aa64_emit_andi64(1, 2, 0, 31); /* & 0xffffffff */ + + aa64_emit_mov(1, 2); + aa64_emit_mov(30, 31); + + aa64_emit_orr(1, 2, 3); + aa64_emit_orr(29, 30, 31); + aa64_emit_xor(1, 2, 3); + aa64_emit_xor(29, 30, 31); + aa64_emit_orn(1, 2, 3); + aa64_emit_orn(29, 30, 31); + aa64_emit_and(1, 2, 3); + aa64_emit_and(29, 30, 31); + aa64_emit_bic(1, 2, 3); + aa64_emit_bic(29, 30, 31); + aa64_emit_ands(1, 2, 3); + aa64_emit_ands(29, 30, 31); + + aa64_emit_tst(1, 2); + aa64_emit_tst(25, 31); + + aa64_emit_cmpi(1, 0); + aa64_emit_cmpi(30, 0); + aa64_emit_cmpi(1, 32); + aa64_emit_cmpi(30, 32); + aa64_emit_cmpi(1, 200); + aa64_emit_cmpi(30, 200); + + aa64_emit_add(1, 2, 3); + aa64_emit_add(29, 30, 28); + aa64_emit_sub(1, 2, 3); + aa64_emit_sub(29, 30, 28); + aa64_emit_adc(1, 2, 3); + aa64_emit_adc(29, 30, 28); + aa64_emit_sbc(1, 2, 3); + aa64_emit_sbc(29, 30, 28); + aa64_emit_adds(1, 2, 3); + aa64_emit_adds(29, 30, 28); + aa64_emit_subs(1, 2, 3); + aa64_emit_subs(29, 30, 28); + aa64_emit_adcs(1, 2, 3); + aa64_emit_adcs(29, 30, 28); + aa64_emit_sbcs(1, 2, 3); + aa64_emit_sbcs(29, 30, 28); + + aa64_emit_tbz(20, 1, 63); + aa64_emit_tbnz(20, 1, 63); + aa64_emit_tbz(20, 0, 2); + aa64_emit_tbnz(20, 7, 2); + + aa64_emit_cbz(20, 63); + aa64_emit_cbnz(20, 63); + aa64_emit_cbz(20, 2); + aa64_emit_cbnz(20, 2); + + aa64_emit_csel(20, 24, 25, ccode_ne); + aa64_emit_csel(1, 2, 3, ccode_eq); + aa64_emit_csel(1, 20, 31, ccode_lt); + aa64_emit_csel(1, 31, 31, ccode_gt); + + aa64_emit_csinc(20, 24, 25, ccode_ne); + aa64_emit_csinc(1, 2, 3, ccode_eq); + aa64_emit_csinc(1, 20, 31, ccode_lt); + aa64_emit_csinc(1, 31, 31, ccode_gt); + + aa64_emit_csinv(20, 24, 25, ccode_ne); + aa64_emit_csinv(1, 2, 3, ccode_eq); + aa64_emit_csinv(1, 20, 31, ccode_lt); + aa64_emit_csinv(1, 31, 31, ccode_gt); + + aa64_emit_csneg(20, 24, 25, ccode_ne); + aa64_emit_csneg(1, 2, 3, ccode_eq); + aa64_emit_csneg(1, 20, 31, ccode_lt); + aa64_emit_csneg(1, 31, 31, ccode_gt); + + aa64_emit_cset(1, ccode_eq); + aa64_emit_cset(1, ccode_hs); + aa64_emit_cset(20, ccode_lo); + aa64_emit_csetm(1, ccode_hs); + aa64_emit_csetm(20, ccode_lo); + + aa64_emit_ubfx(1, 2, 8, 8); + aa64_emit_ubfx(1, 2, 16, 16); + aa64_emit_ubfx(1, 31, 8, 24); + aa64_emit_ubfx(1, 31, 16, 16); + + aa64_emit_rorv(1, 2, 3); + aa64_emit_rorv(28, 29, 30); + aa64_emit_lslv(1, 2, 3); + aa64_emit_lslv(28, 29, 30); + aa64_emit_lsrv(1, 2, 3); + aa64_emit_lsrv(28, 29, 30); + aa64_emit_asrv(1, 2, 3); + aa64_emit_asrv(28, 29, 30); + + aa64_emit_merge_regs(1, 3, 2); /* hi, lo */ + aa64_emit_merge_regs(25, 27, 26); + + aa64_emit_sdiv(1, 2, 3); + aa64_emit_sdiv(28, 29, 30); + + fwrite(buffer, 1, translation_ptr-(u8*)buffer, stdout); +} + +
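As a small usage sketch (not part of the patch itself): the encoder macros can be exercised standalone, in the same way tests/arm64gen.c does, by defining u32/u8 and including arm64_codegen.h. The file name check_addi.c, the build line, and the hand-computed constant below are illustrative assumptions rather than something the patch provides; the expected word is the A64 encoding of "add w1, w2, #0x123".

    /* check_addi.c -- build from tests/ with: gcc -I../arm -o check_addi check_addi.c */
    #include <stdint.h>
    #include <stdio.h>

    #define u32 uint32_t
    #define u8  uint8_t

    #include "arm64_codegen.h"

    int main(void)
    {
      u32 buffer[4];
      u8 *translation_ptr = (u8 *)buffer;  /* the emit macros write here and advance it */

      aa64_emit_addi(1, 2, 0x123);         /* add w1, w2, #0x123 */

      /* ADD (immediate, 32-bit): 0x11 in the top byte, imm12 at bit 10,
         Rn at bit 5, Rd at bit 0 */
      u32 expected = 0x11000000u | (0x123u << 10) | (2u << 5) | 1u;

      printf("emitted %08x, expected %08x: %s\n", buffer[0], expected,
             buffer[0] == expected ? "OK" : "MISMATCH");
      return buffer[0] != expected;
    }

The same approach, scaled up to the full instruction list, is what tests/arm64gen.c and tests/Makefile do: the generated bytecode is compared byte for byte against the output of the GNU assembler for tests/arm64gen.S.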