1 files changed, 9508 insertions, 0 deletions
diff --git a/fluent-bit/lib/wasm-micro-runtime-WAMR-1.2.2/core/iwasm/fast-jit/cg/x86-64/jit_codegen_x86_64.cpp b/fluent-bit/lib/wasm-micro-runtime-WAMR-1.2.2/core/iwasm/fast-jit/cg/x86-64/jit_codegen_x86_64.cpp
new file mode 100644
index 000000000..e28acf98a
--- /dev/null
+++ b/fluent-bit/lib/wasm-micro-runtime-WAMR-1.2.2/core/iwasm/fast-jit/cg/x86-64/jit_codegen_x86_64.cpp
@@ -0,0 +1,9508 @@
+/*
+ * Copyright (C) 2021 Intel Corporation.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "jit_codegen.h"
+#include "jit_codecache.h"
+#include "jit_compiler.h"
+#include "jit_frontend.h"
+#include "jit_dump.h"
+
+#include <asmjit/core.h>
+#include <asmjit/x86.h>
+#if WASM_ENABLE_FAST_JIT_DUMP != 0
+#include <Zydis/Zydis.h>
+#endif
+
+#define CODEGEN_CHECK_ARGS 1 /* 0 turns the CHECK_* macros into no-ops */
+#define CODEGEN_DUMP 0       /* non-zero: GOTO_FAIL logs the line first */
+
+using namespace asmjit;
+
+static char *code_block_switch_to_jitted_from_interp = NULL; /* glue callee */
+static char *code_block_return_to_interp_from_jitted = NULL;
+#if WASM_ENABLE_LAZY_JIT != 0
+static char *code_block_compile_fast_jit_and_then_call = NULL;
+#endif /* end of WASM_ENABLE_LAZY_JIT != 0 */
+
+typedef enum {
+    REG_BPL_IDX = 0, /* values follow the element order of regs_i8[] */
+    REG_AXL_IDX,
+    REG_BXL_IDX,
+    REG_CXL_IDX,
+    REG_DXL_IDX,
+    REG_DIL_IDX,
+    REG_SIL_IDX,
+    REG_I8_FREE_IDX = REG_SIL_IDX /* alias of the SIL index */
+} RegIndexI8;
+
+typedef enum {
+    REG_BP_IDX = 0, /* values follow the element order of regs_i16[] */
+    REG_AX_IDX,
+    REG_BX_IDX,
+    REG_CX_IDX,
+    REG_DX_IDX,
+    REG_DI_IDX,
+    REG_SI_IDX,
+    REG_I16_FREE_IDX = REG_SI_IDX /* alias of the SI index */
+} RegIndexI16;
+
+typedef enum {
+    REG_EBP_IDX = 0, /* values follow the element order of regs_i32[] */
+    REG_EAX_IDX,
+    REG_EBX_IDX,
+    REG_ECX_IDX,
+    REG_EDX_IDX,
+    REG_EDI_IDX,
+    REG_ESI_IDX,
+    REG_I32_FREE_IDX = REG_ESI_IDX /* alias of the ESI index */
+} RegIndexI32;
+
+typedef enum {
+    REG_RBP_IDX = 0, /* values follow the element order of regs_i64[] */
+    REG_RAX_IDX,
+    REG_RBX_IDX,
+    REG_RCX_IDX,
+    REG_RDX_IDX,
+    REG_RDI_IDX,
+    REG_RSI_IDX,
+    REG_RSP_IDX,
+    REG_R8_IDX,
+    REG_R9_IDX,
+    REG_R10_IDX,
+    REG_R11_IDX,
+    REG_R12_IDX,
+    REG_R13_IDX,
+    REG_R14_IDX,
+    REG_R15_IDX,
+    REG_I64_FREE_IDX = REG_RSI_IDX /* RSI, the scratch reg of mov_imm_to_m */
+} RegIndexI64;
+
+/* clang-format off */
+x86::Gp regs_i8[] = { /* 8-bit views, same slot order as regs_i64[] */
+    x86::bpl,  x86::al, x86::bl, x86::cl,
+    x86::dl,   x86::dil,  x86::sil,  x86::spl,
+    x86::r8b,  x86::r9b,  x86::r10b, x86::r11b,
+    x86::r12b, x86::r13b, x86::r14b, x86::r15b
+};
+
+x86::Gp regs_i16[] = { /* 16-bit views, same slot order as regs_i64[] */
+    x86::bp,   x86::ax,   x86::bx,   x86::cx,
+    x86::dx,   x86::di,   x86::si,   x86::sp,
+    x86::r8w,  x86::r9w,  x86::r10w, x86::r11w,
+    x86::r12w, x86::r13w, x86::r14w, x86::r15w
+};
+
+x86::Gp regs_i32[] = { /* 32-bit views, same slot order as regs_i64[] */
+    x86::ebp,  x86::eax,  x86::ebx,  x86::ecx,
+    x86::edx,  x86::edi,  x86::esi,  x86::esp,
+    x86::r8d,  x86::r9d,  x86::r10d, x86::r11d,
+    x86::r12d, x86::r13d, x86::r14d, x86::r15d
+};
+
+x86::Gp regs_i64[] = { /* 64-bit registers, indexed by RegIndexI64 */
+    x86::rbp, x86::rax, x86::rbx, x86::rcx,
+    x86::rdx, x86::rdi, x86::rsi, x86::rsp,
+    x86::r8,  x86::r9,  x86::r10, x86::r11,
+    x86::r12, x86::r13, x86::r14, x86::r15,
+};
+
+#define REG_F32_FREE_IDX 15 /* slot of xmm15 in regs_float[] */
+#define REG_F64_FREE_IDX 15 /* slot of xmm15 in regs_float[] */
+
+x86::Xmm regs_float[] = { /* used for both the F32 and the F64 kind */
+    x86::xmm0,
+    x86::xmm1,
+    x86::xmm2,
+    x86::xmm3,
+    x86::xmm4,
+    x86::xmm5,
+    x86::xmm6,
+    x86::xmm7,
+    x86::xmm8,
+    x86::xmm9,
+    x86::xmm10,
+    x86::xmm11,
+    x86::xmm12,
+    x86::xmm13,
+    x86::xmm14,
+    x86::xmm15,
+};
+/* clang-format on */
+
+int
+jit_codegen_interp_jitted_glue(void *exec_env, JitInterpSwitchInfo *info,
+                               uint32 func_idx, void *target)
+{
+    /* Call signature of the code in code_block_switch_to_jitted_from_interp */
+    typedef int32 (*SwitchFunc)(const void *exec_env, void *info,
+                                uint32 func_idx, const void *target);
+    union {
+        void *addr;
+        SwitchFunc call;
+    } entry;
+    entry.addr = code_block_switch_to_jitted_from_interp;
+    return entry.call(exec_env, info, func_idx, target);
+}
+
+#define PRINT_LINE() LOG_VERBOSE("<Line:%d>\n", __LINE__) /* use-site line */
+
+#if CODEGEN_DUMP != 0
+#define GOTO_FAIL     \
+    do {              \
+        PRINT_LINE(); \
+        goto fail;    \
+    } while (0)
+#else /* CODEGEN_DUMP == 0 */
+#define GOTO_FAIL goto fail
+#endif /* end of CODEGEN_DUMP != 0 */
+
+#if CODEGEN_CHECK_ARGS == 0 /* checks disabled: every CHECK_* is a no-op */
+
+#define CHECK_EQKIND(reg0, reg1) (void)0
+#define CHECK_CONST(reg0) (void)0
+#define CHECK_NCONST(reg0) (void)0
+#define CHECK_KIND(reg0, type) (void)0
+#define CHECK_REG_NO(no, kind) (void)0
+#else /* CODEGEN_CHECK_ARGS != 0 */
+
+/* Check that two registers have the same kind; dump both and fail if not */
+#define CHECK_EQKIND(reg0, reg1)                        \
+    do {                                                \
+        if (jit_reg_kind(reg0) != jit_reg_kind(reg1)) { \
+            PRINT_LINE();                               \
+            LOG_VERBOSE("reg type not equal:\n");       \
+            jit_dump_reg(cc, reg0);                     \
+            jit_dump_reg(cc, reg1);                     \
+            GOTO_FAIL;                                  \
+        }                                               \
+    } while (0)
+
+/* Check that a register is a const; dump it and fail otherwise */
+#define CHECK_CONST(reg0)                       \
+    do {                                        \
+        if (!jit_reg_is_const(reg0)) {          \
+            PRINT_LINE();                       \
+            LOG_VERBOSE("reg is not const:\n"); \
+            jit_dump_reg(cc, reg0);             \
+            GOTO_FAIL;                          \
+        }                                       \
+    } while (0)
+
+/* Check that a register is not a const; dump it and fail otherwise */
+#define CHECK_NCONST(reg0)                  \
+    do {                                    \
+        if (jit_reg_is_const(reg0)) {       \
+            PRINT_LINE();                   \
+            LOG_VERBOSE("reg is const:\n"); \
+            jit_dump_reg(cc, reg0);         \
+            GOTO_FAIL;                      \
+        }                                   \
+    } while (0)
+
+/* Check that a register has the expected kind; dump it and fail otherwise */
+#define CHECK_KIND(reg0, type)                                  \
+    do {                                                        \
+        if (jit_reg_kind(reg0) != (type)) {                     \
+            PRINT_LINE();                                       \
+            LOG_VERBOSE("invalid reg type %d, expected is: %d", \
+                        jit_reg_kind(reg0), (type));            \
+            jit_dump_reg(cc, reg0);                             \
+            GOTO_FAIL;                                          \
+        }                                                       \
+    } while (0)
+
+#define CHECK_I32_REG_NO(no)                                        \
+    do {                                                            \
+        if ((uint32)(no) >= sizeof(regs_i32) / sizeof(regs_i32[0])) \
+            GOTO_FAIL;                                              \
+    } while (0)
+
+#define CHECK_I64_REG_NO(no)                                        \
+    do {                                                            \
+        if ((uint32)(no) >= sizeof(regs_i64) / sizeof(regs_i64[0])) \
+            GOTO_FAIL;                                              \
+    } while (0)
+
+#define CHECK_F32_REG_NO(no)                                            \
+    do {                                                                \
+        if ((uint32)(no) >= sizeof(regs_float) / sizeof(regs_float[0])) \
+            GOTO_FAIL;                                                  \
+    } while (0)
+
+#define CHECK_F64_REG_NO(no)                                            \
+    do {                                                                \
+        if ((uint32)(no) >= sizeof(regs_float) / sizeof(regs_float[0])) \
+            GOTO_FAIL;                                                  \
+    } while (0)
+
+/* Check that a register number is valid for its kind, fail otherwise */
+#define CHECK_REG_NO(no, kind)                                               \
+    do {                                                                     \
+        if ((kind) == JIT_REG_KIND_I32 || (kind) == JIT_REG_KIND_I64) {      \
+            CHECK_I32_REG_NO(no);                                            \
+            CHECK_I64_REG_NO(no);                                            \
+        }                                                                    \
+        else if ((kind) == JIT_REG_KIND_F32 || (kind) == JIT_REG_KIND_F64) { \
+            CHECK_F32_REG_NO(no);                                            \
+            CHECK_F64_REG_NO(no);                                            \
+        }                                                                    \
+        else                                                                 \
+            GOTO_FAIL;                                                       \
+    } while (0)
+
+#endif /* end of CODEGEN_CHECK_ARGS == 0 */
+
+/* Load operand 0 of insn into r0, no check is made */
+#define LOAD_1ARG() r0 = *jit_insn_opnd(insn, 0)
+
+/* Load two operands from insn and check that r0 is non-const */
+#define LOAD_2ARGS()              \
+    r0 = *jit_insn_opnd(insn, 0); \
+    r1 = *jit_insn_opnd(insn, 1); \
+    CHECK_NCONST(r0)
+
+/* Load three operands from insn and check that r0 is non-const */
+#define LOAD_3ARGS()              \
+    r0 = *jit_insn_opnd(insn, 0); \
+    r1 = *jit_insn_opnd(insn, 1); \
+    r2 = *jit_insn_opnd(insn, 2); \
+    CHECK_NCONST(r0)
+
+/* Load three operands from insn, no check is made */
+#define LOAD_3ARGS_NO_ASSIGN()    \
+    r0 = *jit_insn_opnd(insn, 0); \
+    r1 = *jit_insn_opnd(insn, 1); \
+    r2 = *jit_insn_opnd(insn, 2);
+
+/* Load four operands from insn and check that r0 is non-const */
+#define LOAD_4ARGS()              \
+    r0 = *jit_insn_opnd(insn, 0); \
+    r1 = *jit_insn_opnd(insn, 1); \
+    r2 = *jit_insn_opnd(insn, 2); \
+    r3 = *jit_insn_opnd(insn, 3); \
+    CHECK_NCONST(r0)
+
+/* Load four operands from insn, no check is made */
+#define LOAD_4ARGS_NO_ASSIGN()    \
+    r0 = *jit_insn_opnd(insn, 0); \
+    r1 = *jit_insn_opnd(insn, 1); \
+    r2 = *jit_insn_opnd(insn, 2); \
+    r3 = *jit_insn_opnd(insn, 3);
+
+class JitErrorHandler : public ErrorHandler
+{
+  public:
+    /* Code most recently passed to handleError, kErrorOk if none was */
+    Error err;
+
+    JitErrorHandler()
+    {
+        err = kErrorOk;
+    }
+    /* Record the error code only; message and emitter are not used */
+    void handleError(Error code, const char *, BaseEmitter *) override
+    {
+        err = code;
+    }
+};
+
+/* ALU opcodes */
+typedef enum { ADD, SUB, MUL, DIV_S, REM_S, DIV_U, REM_U, MIN, MAX } ALU_OP;
+/* Bit opcodes */
+typedef enum { OR, XOR, AND } BIT_OP;
+/* Shift opcodes */
+typedef enum { SHL, SHRS, SHRU, ROTL, ROTR } SHIFT_OP;
+/* Bitcount opcodes */
+typedef enum { CLZ, CTZ, POPCNT } BITCOUNT_OP;
+/* Condition opcodes, mapped to conditional jumps by cmp_r_and_jmp_label */
+typedef enum { EQ, NE, GTS, GES, LTS, LES, GTU, GEU, LTU, LEU } COND_OP;
+
+typedef union _cast_float_to_integer {
+    float f;  /* the value viewed as a float */
+    uint32 i; /* the same storage viewed as a 32-bit integer */
+} cast_float_to_integer;
+
+typedef union _cast_double_to_integer {
+    double d; /* the value viewed as a double */
+    uint64 i; /* the same storage viewed as a 64-bit integer */
+} cast_double_to_integer;
+
+static uint32
+local_log2(uint32 data)
+{
+    /* floor(log2(data)) for data > 0; yields 0 when data is 0 or 1 */
+    uint32 shifts = 0;
+    for (data >>= 1; data != 0; data >>= 1)
+        shifts++;
+    return shifts;
+}
+
+static uint64
+local_log2l(uint64 data)
+{
+    /* 64-bit variant: floor(log2(data)) for data > 0; 0 for data 0 or 1 */
+    uint64 shifts = 0;
+    for (data >>= 1; data != 0; data >>= 1)
+        shifts++;
+    return shifts;
+}
+
+/* Jmp type, stored in the type field of JmpInfo */
+typedef enum JmpType {
+    JMP_DST_LABEL_REL,     /* jmp to dst label with relative addr */
+    JMP_DST_LABEL_ABS,     /* jmp to dst label with absolute addr */
+    JMP_END_OF_CALLBC,     /* jmp to end of CALLBC */
+    JMP_LOOKUPSWITCH_BASE, /* LookupSwitch table base addr */
+} JmpType;
+
+/**
+ * Jmp info: the info is saved on the first encoding pass, and the
+ * recorded offset is replaced with the exact offset once the code
+ * cache has actually been allocated.
+ */
+typedef struct JmpInfo {
+    bh_list_link link; /* list linkage; nodes are added with bh_list_insert */
+    JmpType type;
+    uint32 label_src; /* index of the src label */
+    uint32 offset;    /* section 0 buffer size when recorded, plus 2 */
+    union {
+        uint32 label_dst; /* index of the dst label */
+    } dst_info;
+} JmpInfo;
+
+static bool
+label_is_neighboring(JitCompContext *cc, int32 label_prev, int32 label_succ)
+{
+    if ((label_prev == 0 && label_succ == 2)
+        || (label_prev >= 2 && label_succ == label_prev + 1))
+        return true; /* 0 is followed by 2, and k by k + 1 for k >= 2 */
+    return label_prev == (int32)jit_cc_label_num(cc) - 1 && label_succ == 1;
+}
+
+static bool
+label_is_ahead(JitCompContext *cc, int32 label_dst, int32 label_src)
+{
+    return (label_src != 0 && label_dst == 0) /* label 0 precedes the rest */
+           || (label_src == 1 && label_dst != 1) /* label 1 comes last */
+           || (label_dst >= 2 && label_src > label_dst
+               && (int32)jit_cc_label_num(cc) - 1 >= label_src);
+}
+
+/**
+ * Emit an unconditional jmp with a placeholder immediate and record
+ * a JmpInfo node describing it in the jmp info list
+ *
+ * @param a the assembler to emit the code
+ * @param jmp_info_list the list receiving the new jmp record
+ * @param label_dst the index of dst label
+ * @param label_src the index of src label
+ * @return true if success, false if the record cannot be allocated
+ */
+static bool
+jmp_from_label_to_label(x86::Assembler &a, bh_list *jmp_info_list,
+                        int32 label_dst, int32 label_src)
+{
+    JmpInfo *record = (JmpInfo *)jit_calloc(sizeof(JmpInfo));
+
+    if (record == NULL)
+        return false;
+
+    /* Sampled before the jmp is emitted: current size of the
+       section 0 buffer, plus 2 */
+    record->offset = a.code()->sectionById(0)->buffer().size() + 2;
+    record->dst_info.label_dst = label_dst;
+    record->label_src = label_src;
+    record->type = JMP_DST_LABEL_REL;
+    bh_list_insert(jmp_info_list, record);
+
+    a.jmp(Imm(INT32_MAX));
+    return true;
+}
+
+/**
+ * Encode detecting compare result register according to condition code
+ * and then jumping to suitable label when the condition is met
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param jmp_info_list the jmp info list
+ * @param label_src the index of src label
+ * @param op the opcode of condition operation
+ * @param r1 the label info when condition is met
+ * @param r2 the label info when condition is unmet, do nothing if VOID
+ * @param is_last_insn if current insn is the last insn of current block
+ *
+ * @return true if success, false if failed
+ */
+static bool
+cmp_r_and_jmp_label(JitCompContext *cc, x86::Assembler &a,
+                    bh_list *jmp_info_list, int32 label_src, COND_OP op,
+                    JitReg r1, JitReg r2, bool is_last_insn)
+{
+    Imm imm(INT32_MAX); /* placeholder immediate for the jump */
+    JmpInfo *node;
+
+    node = (JmpInfo *)jit_malloc(sizeof(JmpInfo));
+    if (!node)
+        return false;
+
+    node->type = JMP_DST_LABEL_REL;
+    node->label_src = label_src;
+    node->dst_info.label_dst = jit_reg_no(r1);
+    node->offset = a.code()->sectionById(0)->buffer().size() + 2;
+    bh_list_insert(jmp_info_list, node);
+
+    bool fp_cmp = cc->last_cmp_on_fp; /* FP compares use ja/jae below */
+
+    bh_assert(!fp_cmp || (fp_cmp && (op == GTS || op == GES)));
+
+    switch (op) {
+        case EQ:
+        {
+            a.je(imm);
+            break;
+        }
+        case NE:
+        {
+            a.jne(imm);
+            break;
+        }
+        case GTS:
+        {
+            if (fp_cmp)
+                a.ja(imm);
+            else
+                a.jg(imm);
+            break;
+        }
+        case LES:
+        {
+            a.jng(imm);
+            break;
+        }
+        case GES:
+        {
+            if (fp_cmp)
+                a.jae(imm);
+            else
+                a.jnl(imm);
+            break;
+        }
+        case LTS:
+        {
+            a.jl(imm);
+            break;
+        }
+        case GTU:
+        {
+            a.ja(imm);
+            break;
+        }
+        case LEU:
+        {
+            a.jna(imm);
+            break;
+        }
+        case GEU:
+        {
+            a.jnb(imm);
+            break;
+        }
+        case LTU:
+        {
+            a.jb(imm);
+            break;
+        }
+        default:
+        {
+            bh_assert(0);
+            break;
+        }
+    }
+
+    if (r2) { /* a label is given for the unmet case as well */
+        int32 label_dst = jit_reg_no(r2);
+        if (!(is_last_insn && label_is_neighboring(cc, label_src, label_dst)))
+            if (!jmp_from_label_to_label(a, jmp_info_list, label_dst,
+                                         label_src))
+                return false;
+    }
+
+    return true;
+}
+
+#if WASM_ENABLE_FAST_JIT_DUMP != 0
+static void
+dump_native(char *data, uint32 length)
+{
+    /* Initialize decoder context for 64-bit long mode */
+    ZydisDecoder decoder;
+    ZydisDecoderInit(&decoder, ZYDIS_MACHINE_MODE_LONG_64,
+                     ZYDIS_STACK_WIDTH_64);
+
+    /* Initialize formatter with the Intel syntax style */
+    ZydisFormatter formatter;
+    ZydisFormatterInit(&formatter, ZYDIS_FORMATTER_STYLE_INTEL);
+
+    /* Decode and print instructions until the buffer ends or decoding fails */
+    ZyanU64 runtime_address = (ZyanU64)(uintptr_t)data;
+    ZyanUSize offset = 0;
+    ZydisDecodedInstruction instruction;
+    ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT_VISIBLE];
+
+    while (ZYAN_SUCCESS(ZydisDecoderDecodeFull(
+        &decoder, data + offset, length - offset, &instruction, operands,
+        ZYDIS_MAX_OPERAND_COUNT_VISIBLE, ZYDIS_DFLAG_VISIBLE_OPERANDS_ONLY))) {
+        /* Print the address of the current instruction */
+        os_printf("%012" PRIX64 "  ", runtime_address);
+
+        /* Format & print the binary instruction structure to
+           human readable format */
+        char buffer[256];
+        ZydisFormatterFormatInstruction(&formatter, &instruction, operands,
+                                        instruction.operand_count_visible,
+                                        buffer, sizeof(buffer),
+                                        runtime_address);
+        puts(buffer);
+
+        offset += instruction.length;
+        runtime_address += instruction.length;
+    }
+}
+#endif /* end of WASM_ENABLE_FAST_JIT_DUMP != 0 */
+
+/**
+ * Encode extending an 8-bit register into a 32-bit register
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register, an index into regs_i32
+ * @param reg_no_src the no of src register, an index into regs_i8
+ * @param is_signed true to sign-extend (movsx), false to zero-extend (movzx)
+ *
+ * @return always true
+ */
+static bool
+extend_r8_to_r32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src,
+                 bool is_signed)
+{
+    const x86::Gp &dst = regs_i32[reg_no_dst];
+    const x86::Gp &src = regs_i8[reg_no_src];
+    if (!is_signed)
+        a.movzx(dst, src);
+    else
+        a.movsx(dst, src);
+    return true;
+}
+/**
+ * Encode extending a 16-bit register into a 32-bit register
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register, an index into regs_i32
+ * @param reg_no_src the no of src register, an index into regs_i16
+ * @param is_signed true to sign-extend (movsx), false to zero-extend (movzx)
+ *
+ * @return always true
+ */
+static bool
+extend_r16_to_r32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src,
+                  bool is_signed)
+{
+    const x86::Gp &dst = regs_i32[reg_no_dst];
+    const x86::Gp &src = regs_i16[reg_no_src];
+    if (!is_signed)
+        a.movzx(dst, src);
+    else
+        a.movsx(dst, src);
+    return true;
+}
+
+/**
+ * Encode extending an 8-bit register into a 64-bit register
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register, an index into regs_i64
+ * @param reg_no_src the no of src register, an index into regs_i8
+ * @param is_signed true to sign-extend (movsx), false to zero-extend (movzx)
+ *
+ * @return always true
+ */
+static bool
+extend_r8_to_r64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src,
+                 bool is_signed)
+{
+    const x86::Gp &dst = regs_i64[reg_no_dst];
+    const x86::Gp &src = regs_i8[reg_no_src];
+    if (!is_signed)
+        a.movzx(dst, src);
+    else
+        a.movsx(dst, src);
+    return true;
+}
+
+/**
+ * Encode extending a 16-bit register into a 64-bit register
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register, an index into regs_i64
+ * @param reg_no_src the no of src register, an index into regs_i16
+ * @param is_signed true to sign-extend (movsx), false to zero-extend (movzx)
+ *
+ * @return always true
+ */
+static bool
+extend_r16_to_r64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src,
+                  bool is_signed)
+{
+    const x86::Gp &dst = regs_i64[reg_no_dst];
+    const x86::Gp &src = regs_i16[reg_no_src];
+    if (!is_signed)
+        a.movzx(dst, src);
+    else
+        a.movsx(dst, src);
+    return true;
+}
+
+/**
+ * Encode extending a 32-bit register into a 64-bit register
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register
+ * @param reg_no_src the no of src register, an index into regs_i32
+ * @param is_signed true to sign-extend (movsxd), false to zero-extend
+ *
+ * @return always true
+ */
+static bool
+extend_r32_to_r64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src,
+                  bool is_signed)
+{
+    const x86::Gp &src = regs_i32[reg_no_src];
+    if (is_signed) {
+        a.movsxd(regs_i64[reg_no_dst], src);
+        return true;
+    }
+    /*
+     * Zero extension needs no dedicated instruction: per the Intel
+     * document, 3.4.1.1 General-Purpose Registers, 32-bit operands
+     * generate a 32-bit result that is zero-extended to a 64-bit
+     * result in the destination register, so a 32-bit mov is enough
+     */
+    a.mov(regs_i32[reg_no_dst], src);
+    return true;
+}
+
+static bool
+mov_r_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src);
+
+static bool
+mov_r_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src);
+
+static void
+mov_r_to_r(x86::Assembler &a, uint32 kind_dst, int32 reg_no_dst,
+           int32 reg_no_src)
+{
+    if (kind_dst == JIT_REG_KIND_I32) {
+        mov_r_to_r_i32(a, reg_no_dst, reg_no_src);
+        return;
+    }
+
+    if (kind_dst == JIT_REG_KIND_I64) {
+        mov_r_to_r_i64(a, reg_no_dst, reg_no_src);
+        return;
+    }
+
+    /*
+     * TODO: F32 and F64 moves are not implemented here; they trip the
+     * same assertion as every other, invalid, kind
+     */
+    bh_assert(0);
+}
+
+/**
+ * Encode moving memory to a register
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data: 1(byte), 2(short) or
+ *        4(int32) for the I32 kind, additionally 8(int64) for the I64
+ *        kind; not examined for F32 and F64
+ * @param kind_dst the kind of data to move, could be I32, I64, F32 or F64
+ * @param is_signed whether narrower data is sign- or zero-extended
+ * @param reg_no_dst the index of dest register
+ * @param m_src the memory operand which contains the source data
+ *
+ * @return true if success, false if bytes_dst is invalid for the kind
+ */
+static bool
+mov_m_to_r(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst, bool is_signed,
+           int32 reg_no_dst, x86::Mem &m_src)
+{
+    if (kind_dst == JIT_REG_KIND_I32) {
+        switch (bytes_dst) {
+            case 1:
+            case 2:
+                if (is_signed)
+                    a.movsx(regs_i32[reg_no_dst], m_src);
+                else
+                    a.movzx(regs_i32[reg_no_dst], m_src);
+                break;
+            case 4:
+                a.mov(regs_i32[reg_no_dst], m_src);
+                break;
+            default:
+                bh_assert(0);
+                return false;
+        }
+    }
+    else if (kind_dst == JIT_REG_KIND_I64) {
+        switch (bytes_dst) {
+            case 1:
+            case 2:
+                if (is_signed)
+                    a.movsx(regs_i64[reg_no_dst], m_src);
+                else
+                    a.movzx(regs_i64[reg_no_dst], m_src);
+                break;
+            case 4:
+                if (is_signed)
+                    a.movsxd(regs_i64[reg_no_dst], m_src);
+                else
+                    /*
+                     * The upper 32-bit will be zero-extended, ref to Intel
+                     * document, 3.4.1.1 General-Purpose Registers: 32-bit
+                     * operands generate a 32-bit result, zero-extended to
+                     * a 64-bit result in the destination general-purpose
+                     * register
+                     */
+                    a.mov(regs_i32[reg_no_dst], m_src);
+                break;
+            case 8:
+                a.mov(regs_i64[reg_no_dst], m_src);
+                break;
+            default:
+                bh_assert(0);
+                return false;
+        }
+    }
+    else if (kind_dst == JIT_REG_KIND_F32) {
+        a.movss(regs_float[reg_no_dst], m_src);
+    }
+    else if (kind_dst == JIT_REG_KIND_F64) {
+        a.movsd(regs_float[reg_no_dst], m_src);
+    }
+    return true; /* any other kind emits nothing and still succeeds */
+}
+
+/**
+ * Encode moving register to memory
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data,
+ *        could be 1(byte), 2(short), 4(int32), 8(int64, I64 kind only),
+ *        not examined for float and double
+ * @param kind_dst the kind of data to move, could be I32, I64, F32 or F64
+ * @param m_dst the dest memory operand
+ * @param reg_no_src the index of src register; asserted to be below 16
+ *        for the integer kinds
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+mov_r_to_m(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst,
+           x86::Mem &m_dst, int32 reg_no_src)
+{
+    if (kind_dst == JIT_REG_KIND_I32) {
+        bh_assert(reg_no_src < 16);
+        switch (bytes_dst) {
+            case 1:
+                a.mov(m_dst, regs_i8[reg_no_src]);
+                break;
+            case 2:
+                a.mov(m_dst, regs_i16[reg_no_src]);
+                break;
+            case 4:
+                a.mov(m_dst, regs_i32[reg_no_src]);
+                break;
+            default:
+                bh_assert(0);
+                return false;
+        }
+    }
+    else if (kind_dst == JIT_REG_KIND_I64) {
+        bh_assert(reg_no_src < 16);
+        switch (bytes_dst) {
+            case 1:
+                a.mov(m_dst, regs_i8[reg_no_src]);
+                break;
+            case 2:
+                a.mov(m_dst, regs_i16[reg_no_src]);
+                break;
+            case 4:
+                a.mov(m_dst, regs_i32[reg_no_src]);
+                break;
+            case 8:
+                a.mov(m_dst, regs_i64[reg_no_src]);
+                break;
+            default:
+                bh_assert(0);
+                return false;
+        }
+    }
+    else if (kind_dst == JIT_REG_KIND_F32) {
+        a.movss(m_dst, regs_float[reg_no_src]);
+    }
+    else if (kind_dst == JIT_REG_KIND_F64) {
+        a.movsd(m_dst, regs_float[reg_no_src]);
+    }
+    return true; /* any other kind emits nothing and still succeeds */
+}
+
+/**
+ * Encode moving immediate data to memory
+ * @param a the assembler to emit the code
+ * @param m_dst the dest memory operand
+ * @param imm_src the src immediate data
+ * @param bytes_dst the bytes number of the data; 8 gets special handling
+ * @return always true
+ */
+static bool
+mov_imm_to_m(x86::Assembler &a, x86::Mem &m_dst, Imm imm_src, uint32 bytes_dst)
+{
+    if (bytes_dst != 8) {
+        a.mov(m_dst, imm_src);
+        return true;
+    }
+    const int64 wide = imm_src.value();
+    if (wide < INT32_MIN || wide > INT32_MAX) {
+        /* There is no instruction `MOV m64, imm64`, so the immediate
+           goes through the free register first */
+        const x86::Gp &scratch = regs_i64[REG_I64_FREE_IDX];
+        a.mov(scratch, imm_src);
+        a.mov(m_dst, scratch);
+        return true;
+    }
+    imm_src.setValue((int32)wide);
+    a.mov(m_dst, imm_src);
+    return true;
+}
+
+#if WASM_ENABLE_SHARED_MEMORY != 0
+/**
+ * Encode exchanging a register with memory
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data: 1(byte), 2(short),
+ *        4(int32) or 8(int64); 8 is asserted to come with the I64 kind
+ * @param kind_dst the kind of data to exchange, could only be I32 or I64
+ * @param m_dst the dest memory operand
+ * @param reg_no_src the index of src register, asserted to be below 16
+ *
+ * @return true if success, false if bytes_dst is not 1, 2, 4 or 8
+ */
+static bool
+xchg_r_to_m(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst,
+            x86::Mem &m_dst, int32 reg_no_src)
+{
+    bh_assert((kind_dst == JIT_REG_KIND_I32 && bytes_dst <= 4)
+              || kind_dst == JIT_REG_KIND_I64);
+    bh_assert(reg_no_src < 16);
+
+    /*
+     * Pick the register table whose width matches bytes_dst; any
+     * other width is rejected
+     */
+    if (bytes_dst == 1)
+        a.xchg(m_dst, regs_i8[reg_no_src]);
+    else if (bytes_dst == 2)
+        a.xchg(m_dst, regs_i16[reg_no_src]);
+    else if (bytes_dst == 4)
+        a.xchg(m_dst, regs_i32[reg_no_src]);
+    else if (bytes_dst == 8)
+        a.xchg(m_dst, regs_i64[reg_no_src]);
+    else {
+        bh_assert(0);
+        return false;
+    }
+    return true;
+}
+#endif
+/**
+ * Encode loading register data from the memory address base + offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data, also used as the size
+ *        of the memory operand: 1, 2, 4 or 8 for the integer kinds
+ * @param kind_dst the kind of data to move, could be I32, I64, F32 or F64
+ * @param is_signed the data is signed or unsigned
+ * @param reg_no_dst the index of dest register
+ * @param base the base address of the memory
+ * @param offset the offset added to the base address
+ *
+ * @return the result of mov_m_to_r
+ */
+static bool
+ld_r_from_base_imm_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                              uint32 kind_dst, bool is_signed, int32 reg_no_dst,
+                              int32 base, int32 offset)
+{
+    const uintptr_t addr = (uintptr_t)(base + offset);
+    x86::Mem m_src(addr, bytes_dst);
+    return mov_m_to_r(a, bytes_dst, kind_dst, is_signed, reg_no_dst, m_src);
+}
+
+/**
+ * Encode loading register data from memory with imm base and register offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data, also used as the size
+ *        of the memory operand: 1, 2, 4 or 8 for the integer kinds
+ * @param kind_dst the kind of data to move, could be I32, I64, F32 or F64
+ * @param is_signed the data is signed or unsigned
+ * @param reg_no_dst the index of dest register
+ * @param base the base address, used as the displacement of the operand
+ * @param reg_no_offset the no of the 64-bit register holding the offset
+ *
+ * @return the result of mov_m_to_r
+ */
+static bool
+ld_r_from_base_imm_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                            uint32 kind_dst, bool is_signed, int32 reg_no_dst,
+                            int32 base, int32 reg_no_offset)
+{
+    const x86::Gp &offset_reg = regs_i64[reg_no_offset];
+    x86::Mem m_src(offset_reg, base, bytes_dst);
+    return mov_m_to_r(a, bytes_dst, kind_dst, is_signed, reg_no_dst, m_src);
+}
+
+/**
+ * Encode loading register data from memory with register base and imm offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data, also used as the size
+ *        of the memory operand: 1, 2, 4 or 8 for the integer kinds
+ * @param kind_dst the kind of data to move, could be I32, I64, F32 or F64
+ * @param is_signed the data is signed or unsigned
+ * @param reg_no_dst the index of dest register
+ * @param reg_no_base the no of the 64-bit register holding the base
+ * @param offset the offset, used as the displacement of the operand
+ *
+ * @return the result of mov_m_to_r
+ */
+static bool
+ld_r_from_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                            uint32 kind_dst, bool is_signed, int32 reg_no_dst,
+                            int32 reg_no_base, int32 offset)
+{
+    const x86::Gp &base_reg = regs_i64[reg_no_base];
+    x86::Mem m_src(base_reg, offset, bytes_dst);
+    return mov_m_to_r(a, bytes_dst, kind_dst, is_signed, reg_no_dst, m_src);
+}
+
+/**
+ * Encode loading register data from memory with register base and
+ * register offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data, also the operand size
+ * @param kind_dst the kind of data to move, could be I32, I64, F32 or F64
+ * @param is_signed the data is signed or unsigned
+ * @param reg_no_dst the index of dest register
+ * @param reg_no_base the no of the 64-bit register holding the base
+ * @param reg_no_offset the no of the 64-bit register holding the offset
+ *
+ * @return the result of mov_m_to_r
+ */
+static bool
+ld_r_from_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst,
+                          bool is_signed, int32 reg_no_dst, int32 reg_no_base,
+                          int32 reg_no_offset)
+{
+    const x86::Gp &base_reg = regs_i64[reg_no_base];
+    const x86::Gp &offset_reg = regs_i64[reg_no_offset];
+    x86::Mem m_src(base_reg, offset_reg, 0, 0, bytes_dst);
+    return mov_m_to_r(a, bytes_dst, kind_dst, is_signed, reg_no_dst, m_src);
+}
+
+/**
+ * Emit a store of a register to the absolute address (base + offset)
+ *
+ * @param a the assembler used to emit the code
+ * @param bytes_dst the width of the stored data in bytes: 1, 2, 4 or 8
+ *        (documented upstream as skipped for float/double)
+ * @param kind_dst the register kind of the source (I32, I64, F32 or F64)
+ * @param reg_no_src the index of the source register
+ * @param base the immediate base address of the destination memory
+ * @param offset the immediate offset added to the base
+ * @param atomic if true and WASM_ENABLE_SHARED_MEMORY is enabled, the store
+ *        is emitted with xchg_r_to_m() instead of mov_r_to_m()
+ *
+ * @return the result of the emit helper that was chosen
+ */
+static bool
+st_r_to_base_imm_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                            uint32 kind_dst, int32 reg_no_src, int32 base,
+                            int32 offset, bool atomic)
+{
+    x86::Mem mem_dst((uintptr_t)(base + offset), bytes_dst);
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    return atomic
+               ? xchg_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src)
+               : mov_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src);
+#else
+    return mov_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src);
+#endif
+}
+
+/**
+ * Emit a store of a register to memory addressed by an immediate base
+ * plus an offset held in a register
+ *
+ * @param a the assembler used to emit the code
+ * @param bytes_dst the width of the stored data in bytes: 1, 2, 4 or 8
+ *        (documented upstream as skipped for float/double)
+ * @param kind_dst the register kind of the source (I32, I64, F32 or F64)
+ * @param reg_no_src the index of the source register
+ * @param base the immediate base, encoded as the operand's displacement
+ * @param reg_no_offset the index of the 64-bit register holding the offset
+ * @param atomic if true and WASM_ENABLE_SHARED_MEMORY is enabled, the store
+ *        is emitted with xchg_r_to_m() instead of mov_r_to_m()
+ *
+ * @return the result of the emit helper that was chosen
+ */
+static bool
+st_r_to_base_imm_offset_r(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst,
+                          int32 reg_no_src, int32 base, int32 reg_no_offset,
+                          bool atomic)
+{
+    x86::Mem mem_dst(regs_i64[reg_no_offset], base, bytes_dst);
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    return atomic
+               ? xchg_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src)
+               : mov_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src);
+#else
+    return mov_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src);
+#endif
+}
+
+/**
+ * Emit a store of a register to the memory operand
+ * [base register + immediate offset]
+ *
+ * @param a the assembler used to emit the code
+ * @param bytes_dst the width of the stored data in bytes: 1, 2, 4 or 8
+ *        (documented upstream as skipped for float/double)
+ * @param kind_dst the register kind of the source (I32, I64, F32 or F64)
+ * @param reg_no_src the index of the source register
+ * @param reg_no_base the index of the 64-bit register holding the base
+ * @param offset the immediate displacement added to the base
+ * @param atomic if true and WASM_ENABLE_SHARED_MEMORY is enabled, the store
+ *        is emitted with xchg_r_to_m() instead of mov_r_to_m()
+ *
+ * @return the result of the emit helper that was chosen
+ */
+static bool
+st_r_to_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst,
+                          int32 reg_no_src, int32 reg_no_base, int32 offset,
+                          bool atomic)
+{
+    x86::Mem mem_dst(regs_i64[reg_no_base], offset, bytes_dst);
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    return atomic
+               ? xchg_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src)
+               : mov_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src);
+#else
+    return mov_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src);
+#endif
+}
+
+/**
+ * Emit a store of a register to the memory operand
+ * [base register + offset register]
+ *
+ * @param a the assembler used to emit the code
+ * @param bytes_dst the width of the stored data in bytes: 1, 2, 4 or 8
+ *        (documented upstream as skipped for float/double)
+ * @param kind_dst the register kind of the source (I32, I64, F32 or F64)
+ * @param reg_no_src the index of the source register
+ * @param reg_no_base the index of the 64-bit register holding the base
+ * @param reg_no_offset the index of the 64-bit register holding the offset
+ * @param atomic if true and WASM_ENABLE_SHARED_MEMORY is enabled, the store
+ *        is emitted with xchg_r_to_m() instead of mov_r_to_m()
+ *
+ * @return the result of the emit helper that was chosen
+ */
+static bool
+st_r_to_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst,
+                        int32 reg_no_src, int32 reg_no_base,
+                        int32 reg_no_offset, bool atomic)
+{
+    /* base + index operand, built with shift 0 and displacement 0 */
+    x86::Mem mem_dst(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                     bytes_dst);
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    return atomic
+               ? xchg_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src)
+               : mov_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src);
+#else
+    return mov_r_to_m(a, bytes_dst, kind_dst, mem_dst, reg_no_src);
+#endif
+}
+
+/**
+ * Fill an immediate operand with an unsigned value read from memory
+ *
+ * @param imm the immediate operand to update
+ * @param data pointer to the value; `bytes` bytes are read through it
+ * @param bytes the width of the value in bytes: 1, 2, 4 or 8; any other
+ *        width hits bh_assert(0) and leaves imm unchanged
+ */
+static void
+imm_set_value(Imm &imm, void *data, uint32 bytes)
+{
+    if (bytes == 1) {
+        imm.setValue(*(uint8 *)data);
+    }
+    else if (bytes == 2) {
+        imm.setValue(*(uint16 *)data);
+    }
+    else if (bytes == 4) {
+        imm.setValue(*(uint32 *)data);
+    }
+    else if (bytes == 8) {
+        imm.setValue(*(uint64 *)data);
+    }
+    else {
+        bh_assert(0);
+    }
+}
+
+#if WASM_ENABLE_SHARED_MEMORY != 0
+/**
+ * Emit a move of an immediate into the free (scratch) register that matches
+ * the given width, so the value can be used as a register operand
+ *
+ * @param a the assembler used to emit the code
+ * @param imm the immediate to load
+ * @param bytes the width of the value in bytes: 1, 2, 4 or 8
+ *
+ * @return the index of the free register that received the immediate; for
+ *         an unsupported width nothing is emitted, bh_assert(0) fires and
+ *         REG_I64_FREE_IDX is returned
+ */
+static uint32
+mov_imm_to_free_reg(x86::Assembler &a, Imm &imm, uint32 bytes)
+{
+    /* Pre-set the result so that the default path never returns an
+       indeterminate value when bh_assert() is compiled out */
+    uint32 reg_no = REG_I64_FREE_IDX;
+
+    switch (bytes) {
+        case 1:
+            reg_no = REG_I8_FREE_IDX;
+            a.mov(regs_i8[reg_no], imm);
+            break;
+        case 2:
+            reg_no = REG_I16_FREE_IDX;
+            a.mov(regs_i16[reg_no], imm);
+            break;
+        case 4:
+            reg_no = REG_I32_FREE_IDX;
+            a.mov(regs_i32[reg_no], imm);
+            break;
+        case 8:
+            reg_no = REG_I64_FREE_IDX;
+            a.mov(regs_i64[reg_no], imm);
+            break;
+        default:
+            bh_assert(0);
+            break;
+    }
+
+    return reg_no;
+}
+#endif
+
+/**
+ * Emit a store of an immediate to the absolute address (base + offset)
+ *
+ * @param a the assembler used to emit the code
+ * @param bytes_dst the width of the stored data in bytes: 1, 2, 4 or 8
+ * @param data_src pointer to the immediate data; bytes_dst bytes are read
+ * @param base the immediate base address of the destination memory
+ * @param offset the immediate offset added to the base
+ * @param atomic if true and WASM_ENABLE_SHARED_MEMORY is enabled, the
+ *        immediate is staged in the free register and stored with
+ *        xchg_r_to_m(); otherwise mov_imm_to_m() is used
+ *
+ * @return the result of the emit helper that was chosen
+ */
+static bool
+st_imm_to_base_imm_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                              void *data_src, int32 base, int32 offset,
+                              bool atomic)
+{
+    x86::Mem m((uintptr_t)(base + offset), bytes_dst);
+    Imm imm;
+    imm_set_value(imm, data_src, bytes_dst);
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic) {
+        /* Only the xchg path consumes the register copy, so do not emit
+           the staging move for plain stores */
+        uint32 reg_no_src = mov_imm_to_free_reg(a, imm, bytes_dst);
+        return xchg_r_to_m(a, bytes_dst, JIT_REG_KIND_I64, m, reg_no_src);
+    }
+#endif
+    return mov_imm_to_m(a, m, imm, bytes_dst);
+}
+
+/**
+ * Emit a store of an immediate to memory addressed by an immediate base
+ * plus an offset held in a register
+ *
+ * @param a the assembler used to emit the code
+ * @param bytes_dst the width of the stored data in bytes: 1, 2, 4 or 8
+ * @param data_src pointer to the immediate data; bytes_dst bytes are read
+ * @param base the immediate base, encoded as the operand's displacement
+ * @param reg_no_offset the index of the 64-bit register holding the offset
+ * @param atomic if true and WASM_ENABLE_SHARED_MEMORY is enabled, the
+ *        immediate is staged in the free register and stored with
+ *        xchg_r_to_m(); otherwise mov_imm_to_m() is used
+ *
+ * @return the result of the emit helper that was chosen
+ */
+static bool
+st_imm_to_base_imm_offset_r(x86::Assembler &a, uint32 bytes_dst, void *data_src,
+                            int32 base, int32 reg_no_offset, bool atomic)
+{
+    x86::Mem m(regs_i64[reg_no_offset], base, bytes_dst);
+    Imm imm;
+    imm_set_value(imm, data_src, bytes_dst);
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic) {
+        /* Only the xchg path consumes the register copy, so do not emit
+           the staging move for plain stores */
+        uint32 reg_no_src = mov_imm_to_free_reg(a, imm, bytes_dst);
+        return xchg_r_to_m(a, bytes_dst, JIT_REG_KIND_I64, m, reg_no_src);
+    }
+#endif
+    return mov_imm_to_m(a, m, imm, bytes_dst);
+}
+
+/**
+ * Emit a store of an immediate to the memory operand
+ * [base register + immediate offset]
+ *
+ * @param a the assembler used to emit the code
+ * @param bytes_dst the width of the stored data in bytes: 1, 2, 4 or 8
+ * @param data_src pointer to the immediate data; bytes_dst bytes are read
+ * @param reg_no_base the index of the 64-bit register holding the base
+ * @param offset the immediate displacement added to the base
+ * @param atomic if true and WASM_ENABLE_SHARED_MEMORY is enabled, the
+ *        immediate is staged in the free register and stored with
+ *        xchg_r_to_m(); otherwise mov_imm_to_m() is used
+ *
+ * @return the result of the emit helper that was chosen
+ */
+static bool
+st_imm_to_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst, void *data_src,
+                            int32 reg_no_base, int32 offset, bool atomic)
+{
+    x86::Mem m(regs_i64[reg_no_base], offset, bytes_dst);
+    Imm imm;
+    imm_set_value(imm, data_src, bytes_dst);
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic) {
+        /* Only the xchg path consumes the register copy, so do not emit
+           the staging move for plain stores */
+        uint32 reg_no_src = mov_imm_to_free_reg(a, imm, bytes_dst);
+        return xchg_r_to_m(a, bytes_dst, JIT_REG_KIND_I64, m, reg_no_src);
+    }
+#endif
+    return mov_imm_to_m(a, m, imm, bytes_dst);
+}
+
+/**
+ * Emit a store of an immediate to the memory operand
+ * [base register + offset register]
+ *
+ * @param a the assembler used to emit the code
+ * @param bytes_dst the width of the stored data in bytes: 1, 2, 4 or 8
+ * @param data_src pointer to the immediate data; bytes_dst bytes are read
+ * @param reg_no_base the index of the 64-bit register holding the base
+ * @param reg_no_offset the index of the 64-bit register holding the offset
+ * @param atomic if true and WASM_ENABLE_SHARED_MEMORY is enabled, the
+ *        immediate is staged in the free register and stored with
+ *        xchg_r_to_m(); otherwise mov_imm_to_m() is used
+ *
+ * @return the result of the emit helper that was chosen
+ */
+static bool
+st_imm_to_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst, void *data_src,
+                          int32 reg_no_base, int32 reg_no_offset, bool atomic)
+{
+    x86::Mem m(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0, bytes_dst);
+    Imm imm;
+    imm_set_value(imm, data_src, bytes_dst);
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic) {
+        /* Only the xchg path consumes the register copy, so do not emit
+           the staging move for plain stores */
+        uint32 reg_no_src = mov_imm_to_free_reg(a, imm, bytes_dst);
+        return xchg_r_to_m(a, bytes_dst, JIT_REG_KIND_I64, m, reg_no_src);
+    }
+#endif
+    return mov_imm_to_m(a, m, imm, bytes_dst);
+}
+
+/**
+ * Emit a move of a 32-bit immediate into a 32-bit general-purpose register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (into regs_i32)
+ * @param data the immediate value to load
+ *
+ * @return always true; the status of the emitted mov is not checked
+ */
+static bool
+mov_imm_to_r_i32(x86::Assembler &a, int32 reg_no, int32 data)
+{
+    a.mov(regs_i32[reg_no], Imm(data));
+    return true;
+}
+
+/**
+ * Emit a 32-bit register-to-register move, unless both indexes are equal
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register (into regs_i32)
+ * @param reg_no_src the index of the source register (into regs_i32)
+ *
+ * @return always true; the status of the emitted mov is not checked
+ */
+static bool
+mov_r_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* Same register on both sides: emit nothing */
+    if (reg_no_dst == reg_no_src)
+        return true;
+
+    a.mov(regs_i32[reg_no_dst], regs_i32[reg_no_src]);
+    return true;
+}
+
+/**
+ * Emit a move of a 64-bit immediate into a 64-bit general-purpose register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (into regs_i64)
+ * @param data the immediate value to load
+ *
+ * @return always true; the status of the emitted mov is not checked
+ */
+static bool
+mov_imm_to_r_i64(x86::Assembler &a, int32 reg_no, int64 data)
+{
+    a.mov(regs_i64[reg_no], Imm(data));
+    return true;
+}
+
+/**
+ * Emit a 64-bit register-to-register move, unless both indexes are equal
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register (into regs_i64)
+ * @param reg_no_src the index of the source register (into regs_i64)
+ *
+ * @return always true; the status of the emitted mov is not checked
+ */
+static bool
+mov_r_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* Same register on both sides: emit nothing */
+    if (reg_no_dst == reg_no_src)
+        return true;
+
+    a.mov(regs_i64[reg_no_dst], regs_i64[reg_no_src]);
+    return true;
+}
+
+/**
+ * Emit the load of a float immediate into a float register
+ *
+ * The bit pattern of the float is first moved into the free 32-bit
+ * general-purpose register and then transferred with movd, so the free
+ * register is overwritten.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (into regs_float)
+ * @param data the float value to load
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+mov_imm_to_r_f32(x86::Assembler &a, int32 reg_no, float data)
+{
+    cast_float_to_integer bits;
+    bits.f = data;
+
+    /* step 1: imm -> gp */
+    a.mov(regs_i32[REG_I32_FREE_IDX], Imm(bits.i));
+    /* step 2: gp -> xmm */
+    a.movd(regs_float[reg_no], regs_i32[REG_I32_FREE_IDX]);
+    return true;
+}
+
+/**
+ * Emit a float register-to-register move (movss), unless both indexes
+ * are equal
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register (into regs_float)
+ * @param reg_no_src the index of the source register (into regs_float)
+ *
+ * @return always true; the status of the emitted movss is not checked
+ */
+static bool
+mov_r_to_r_f32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* Same register on both sides: emit nothing */
+    if (reg_no_dst == reg_no_src)
+        return true;
+
+    a.movss(regs_float[reg_no_dst], regs_float[reg_no_src]);
+    return true;
+}
+
+/**
+ * Emit the load of a double immediate into a float register
+ *
+ * The bit pattern of the double is first moved into the free 64-bit
+ * general-purpose register and then transferred with movq, so the free
+ * register is overwritten.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (into regs_float)
+ * @param data the double value to load
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+mov_imm_to_r_f64(x86::Assembler &a, int32 reg_no, double data)
+{
+    /* imm -> gp -> xmm; both steps name the scratch register through
+       REG_I64_FREE_IDX so they cannot drift apart */
+    cast_double_to_integer v = { .d = data };
+    Imm imm(v.i);
+    a.mov(regs_i64[REG_I64_FREE_IDX], imm);
+    a.movq(regs_float[reg_no], regs_i64[REG_I64_FREE_IDX]);
+    return true;
+}
+
+/**
+ * Emit a double register-to-register move (movsd), unless both indexes
+ * are equal
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register (into regs_float)
+ * @param reg_no_src the index of the source register (into regs_float)
+ *
+ * @return always true; the status of the emitted movsd is not checked
+ */
+static bool
+mov_r_to_r_f64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* Same register on both sides: emit nothing */
+    if (reg_no_dst == reg_no_src)
+        return true;
+
+    a.movsd(regs_float[reg_no_dst], regs_float[reg_no_src]);
+    return true;
+}
+
+/* Let the compiler do the conversion job as much as possible */
+
+/**
+ * Emit the load of an int8 immediate, widened to int32, into a register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination int32 register
+ * @param data the int8 immediate; it is widened by the host compiler
+ *
+ * @return the result of mov_imm_to_r_i32()
+ */
+static bool
+convert_imm_i8_to_r_i32(x86::Assembler &a, int32 reg_no, int8 data)
+{
+    return mov_imm_to_r_i32(a, reg_no, static_cast<int32>(data));
+}
+
+/**
+ * Emit the conversion of an int8 register value to an int32 register value
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register
+ * @param reg_no_src the index of the source register
+ *
+ * @return the result of extend_r8_to_r32()
+ */
+static bool
+convert_r_i8_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* int8 is a signed source, hence the signed flavour of the helper */
+    const bool is_signed = true;
+    return extend_r8_to_r32(a, reg_no_dst, reg_no_src, is_signed);
+}
+
+/**
+ * Emit the load of an int8 immediate, widened to int64, into a register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination int64 register
+ * @param data the int8 immediate; it is widened by the host compiler
+ *
+ * @return the result of mov_imm_to_r_i64()
+ */
+static bool
+convert_imm_i8_to_r_i64(x86::Assembler &a, int32 reg_no, int8 data)
+{
+    return mov_imm_to_r_i64(a, reg_no, static_cast<int64>(data));
+}
+
+/**
+ * Emit the conversion of an int8 register value to an int64 register value
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register
+ * @param reg_no_src the index of the source register
+ *
+ * @return the result of extend_r8_to_r64()
+ */
+static bool
+convert_r_i8_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* int8 is a signed source, hence the signed flavour of the helper */
+    const bool is_signed = true;
+    return extend_r8_to_r64(a, reg_no_dst, reg_no_src, is_signed);
+}
+
+/**
+ * Emit the load of an int16 immediate, widened to int32, into a register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination int32 register
+ * @param data the int16 immediate; it is widened by the host compiler
+ *
+ * @return the result of mov_imm_to_r_i32()
+ */
+static bool
+convert_imm_i16_to_r_i32(x86::Assembler &a, int32 reg_no, int16 data)
+{
+    return mov_imm_to_r_i32(a, reg_no, static_cast<int32>(data));
+}
+
+/**
+ * Emit the conversion of an int16 register value to an int32 register value
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register
+ * @param reg_no_src the index of the source register
+ *
+ * @return the result of extend_r16_to_r32()
+ */
+static bool
+convert_r_i16_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* int16 is a signed source, hence the signed flavour of the helper */
+    const bool is_signed = true;
+    return extend_r16_to_r32(a, reg_no_dst, reg_no_src, is_signed);
+}
+
+/**
+ * Emit the load of an int16 immediate, widened to int64, into a register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination int64 register
+ * @param data the int16 immediate; it is widened by the host compiler
+ *
+ * @return the result of mov_imm_to_r_i64()
+ */
+static bool
+convert_imm_i16_to_r_i64(x86::Assembler &a, int32 reg_no, int16 data)
+{
+    return mov_imm_to_r_i64(a, reg_no, static_cast<int64>(data));
+}
+
+/**
+ * Emit the conversion of an int16 register value to an int64 register value
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register
+ * @param reg_no_src the index of the source register
+ *
+ * @return the result of extend_r16_to_r64()
+ */
+static bool
+convert_r_i16_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* int16 is a signed source, hence the signed flavour of the helper */
+    const bool is_signed = true;
+    return extend_r16_to_r64(a, reg_no_dst, reg_no_src, is_signed);
+}
+
+/**
+ * Emit the load of the low 8 bits of an int32 immediate into a register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (written as 32-bit)
+ * @param data the int32 immediate to narrow
+ *
+ * @return the result of mov_imm_to_r_i32()
+ */
+static bool
+convert_imm_i32_to_r_i8(x86::Assembler &a, int32 reg_no, int32 data)
+{
+    /* Mask instead of casting: (int32)(int8)data would sign-extend, and
+       (int32)(uint32)(int8)data is just a longer spelling */
+    const int32 low_byte = data & 0xFF;
+    return mov_imm_to_r_i32(a, reg_no, low_byte);
+}
+
+/**
+ * Emit the narrowing of an int32 register value to its low 8 bits
+ *
+ * The source is copied to the destination (when they differ) and the
+ * destination is then masked with 0xFF.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register (into regs_i32)
+ * @param reg_no_src the index of the source register (into regs_i32)
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_i32_to_r_i8(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    (void)mov_r_to_r_i32(a, reg_no_dst, reg_no_src);
+    a.and_(regs_i32[reg_no_dst], 0xFF);
+    return true;
+}
+
+/**
+ * Emit the load of an int32 immediate, truncated to uint8, into a register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (written as 32-bit)
+ * @param data the int32 immediate to narrow
+ *
+ * @return the result of mov_imm_to_r_i32()
+ */
+static bool
+convert_imm_i32_to_r_u8(x86::Assembler &a, int32 reg_no, int32 data)
+{
+    return mov_imm_to_r_i32(a, reg_no, static_cast<uint8>(data));
+}
+
+/**
+ * Emit the narrowing of an int32 register value to uint8
+ *
+ * Forwards to convert_r_i32_to_r_i8(), so the same code is emitted for
+ * the signed and the unsigned 8-bit target.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register
+ * @param reg_no_src the index of the source register
+ *
+ * @return the result of convert_r_i32_to_r_i8()
+ */
+static bool
+convert_r_i32_to_r_u8(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    return convert_r_i32_to_r_i8(a, reg_no_dst, reg_no_src);
+}
+
+/**
+ * Emit the load of the low 16 bits of an int32 immediate into a register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (written as 32-bit)
+ * @param data the int32 immediate to narrow
+ *
+ * @return the result of mov_imm_to_r_i32()
+ */
+static bool
+convert_imm_i32_to_r_i16(x86::Assembler &a, int32 reg_no, int32 data)
+{
+    /* Mask instead of casting: (int32)(int16)data would sign-extend, and
+       (int32)(uint32)(int16)data is just a longer spelling */
+    const int32 low_half = data & 0xFFFF;
+    return mov_imm_to_r_i32(a, reg_no, low_half);
+}
+
+/**
+ * Emit the narrowing of an int32 register value to its low 16 bits
+ *
+ * The source is copied to the destination (when they differ) and the
+ * destination is then masked with 0xFFFF.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register (into regs_i32)
+ * @param reg_no_src the index of the source register (into regs_i32)
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_i32_to_r_i16(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    (void)mov_r_to_r_i32(a, reg_no_dst, reg_no_src);
+    a.and_(regs_i32[reg_no_dst], 0xFFFF);
+    return true;
+}
+
+/**
+ * Emit the load of an int32 immediate, truncated to uint16, into a register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (written as 32-bit)
+ * @param data the int32 immediate to narrow
+ *
+ * @return the result of mov_imm_to_r_i32()
+ */
+static bool
+convert_imm_i32_to_r_u16(x86::Assembler &a, int32 reg_no, int32 data)
+{
+    return mov_imm_to_r_i32(a, reg_no, static_cast<uint16>(data));
+}
+
+/**
+ * Emit the narrowing of an int32 register value to uint16
+ *
+ * Forwards to convert_r_i32_to_r_i16(), so the same code is emitted for
+ * the signed and the unsigned 16-bit target.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register
+ * @param reg_no_src the index of the source register
+ *
+ * @return the result of convert_r_i32_to_r_i16()
+ */
+static bool
+convert_r_i32_to_r_u16(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    return convert_r_i32_to_r_i16(a, reg_no_dst, reg_no_src);
+}
+
+/**
+ * Emit the load of an int32 immediate, widened to int64, into a register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination int64 register
+ * @param data the int32 immediate; it is widened by the host compiler
+ *
+ * @return the result of mov_imm_to_r_i64()
+ */
+static bool
+convert_imm_i32_to_r_i64(x86::Assembler &a, int32 reg_no, int32 data)
+{
+    return mov_imm_to_r_i64(a, reg_no, static_cast<int64>(data));
+}
+
+/**
+ * Emit the conversion of an int32 register value to an int64 register
+ * value with signed extension
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination int64 register
+ * @param reg_no_src the index of the source int32 register
+ *
+ * @return the result of extend_r32_to_r64()
+ */
+static bool
+convert_r_i32_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const bool is_signed = true;
+    return extend_r32_to_r64(a, reg_no_dst, reg_no_src, is_signed);
+}
+
+/**
+ * Emit the conversion of an int32 register value to a float register
+ * value (cvtsi2ss with a 32-bit source)
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination float register
+ * @param reg_no_src the index of the source int32 register
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_i32_to_r_f32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_xmm = regs_float[reg_no_dst];
+    const auto &src_gp = regs_i32[reg_no_src];
+
+    a.cvtsi2ss(dst_xmm, src_gp);
+    return true;
+}
+
+/**
+ * Emit the conversion of an int32 immediate to a float register value
+ *
+ * The immediate is first loaded into the free 32-bit register, which is
+ * therefore overwritten, and converted from there.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination float register
+ * @param data the int32 immediate to convert
+ *
+ * @return the result of convert_r_i32_to_r_f32()
+ */
+static bool
+convert_imm_i32_to_r_f32(x86::Assembler &a, int32 reg_no, int32 data)
+{
+    const int32 scratch = REG_I32_FREE_IDX;
+
+    mov_imm_to_r_i32(a, scratch, data);
+    return convert_r_i32_to_r_f32(a, reg_no, scratch);
+}
+
+/**
+ * Emit the conversion of an int32 register value to a double register
+ * value (cvtsi2sd with a 32-bit source)
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination double register
+ * @param reg_no_src the index of the source int32 register
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_i32_to_r_f64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_xmm = regs_float[reg_no_dst];
+    const auto &src_gp = regs_i32[reg_no_src];
+
+    a.cvtsi2sd(dst_xmm, src_gp);
+    return true;
+}
+
+/**
+ * Emit the conversion of an int32 immediate to a double register value
+ *
+ * The immediate is first loaded into the free 32-bit register, which is
+ * therefore overwritten, and converted from there.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination double register
+ * @param data the int32 immediate to convert
+ *
+ * @return the result of convert_r_i32_to_r_f64()
+ */
+static bool
+convert_imm_i32_to_r_f64(x86::Assembler &a, int32 reg_no, int32 data)
+{
+    const int32 scratch = REG_I32_FREE_IDX;
+
+    mov_imm_to_r_i32(a, scratch, data);
+    return convert_r_i32_to_r_f64(a, reg_no, scratch);
+}
+
+/**
+ * Emit the load of the low 32 bits of an int64 immediate into a register
+ *
+ * The register is written through its 64-bit name with the upper 32 bits
+ * cleared, which is the same value convert_r_i64_to_r_i32() produces with
+ * its `and` mask.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (into regs_i64)
+ * @param data the int64 immediate to narrow
+ *
+ * @return the result of mov_imm_to_r_i64()
+ */
+static bool
+convert_imm_i64_to_r_i32(x86::Assembler &a, int32 reg_no, int64 data)
+{
+    /* Mask rather than cast: (int32)data is sign-extended again when it
+       is passed on as int64, which would set the upper 32 bits for
+       values whose bit 31 is set */
+    return mov_imm_to_r_i64(a, reg_no, data & 0x00000000FFFFFFFFLL);
+}
+
+/**
+ * Emit the narrowing of an int64 register value to its low 32 bits
+ *
+ * The source is copied to the destination (when they differ) and the
+ * destination is then masked with 0xFFFFFFFF.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register (into regs_i64)
+ * @param reg_no_src the index of the source register (into regs_i64)
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_i64_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    (void)mov_r_to_r_i64(a, reg_no_dst, reg_no_src);
+    a.and_(regs_i64[reg_no_dst], 0xFFFFFFFFLL);
+    return true;
+}
+
+/**
+ * Emit the load of the low 8 bits of an int64 immediate into a register
+ *
+ * The register is written through its 64-bit name with all upper bits
+ * cleared, which is the same value convert_r_i64_to_r_i8() produces with
+ * its `and` mask.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (into regs_i64)
+ * @param data the int64 immediate to narrow
+ *
+ * @return the result of mov_imm_to_r_i64()
+ */
+static bool
+convert_imm_i64_to_r_i8(x86::Assembler &a, int32 reg_no, int64 data)
+{
+    /* Mask rather than cast: (int8)data would be sign-extended when it is
+       widened back to int64, unlike the register variant */
+    return mov_imm_to_r_i64(a, reg_no, data & 0x00000000000000FFLL);
+}
+
+/**
+ * Emit the narrowing of an int64 register value to its low 8 bits
+ *
+ * The source is copied to the destination (when they differ) and the
+ * destination is then masked with 0xFF.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register (into regs_i64)
+ * @param reg_no_src the index of the source register (into regs_i64)
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_i64_to_r_i8(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    (void)mov_r_to_r_i64(a, reg_no_dst, reg_no_src);
+    a.and_(regs_i64[reg_no_dst], 0xFFLL);
+    return true;
+}
+
+/**
+ * Emit the load of the low 16 bits of an int64 immediate into a register
+ *
+ * The register is written through its 64-bit name with all upper bits
+ * cleared, which is the same value convert_r_i64_to_r_i16() produces with
+ * its `and` mask.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination register (into regs_i64)
+ * @param data the int64 immediate to narrow
+ *
+ * @return the result of mov_imm_to_r_i64()
+ */
+static bool
+convert_imm_i64_to_r_i16(x86::Assembler &a, int32 reg_no, int64 data)
+{
+    /* Mask rather than cast: (int16)data would be sign-extended when it
+       is widened back to int64, unlike the register variant */
+    return mov_imm_to_r_i64(a, reg_no, data & 0x000000000000FFFFLL);
+}
+
+/**
+ * Emit the narrowing of an int64 register value to its low 16 bits
+ *
+ * The source is copied to the destination (when they differ) and the
+ * destination is then masked with 0xFFFF.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination register (into regs_i64)
+ * @param reg_no_src the index of the source register (into regs_i64)
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_i64_to_r_i16(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    (void)mov_r_to_r_i64(a, reg_no_dst, reg_no_src);
+    a.and_(regs_i64[reg_no_dst], 0xFFFFLL);
+    return true;
+}
+
+/**
+ * Emit the load of a uint32 immediate, zero-extended to 64 bits, into a
+ * register
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination int64 register
+ * @param data the uint32 immediate; it is widened by the host compiler
+ *
+ * @return the result of mov_imm_to_r_i64()
+ */
+static bool
+convert_imm_u32_to_r_i64(x86::Assembler &a, int32 reg_no, uint32 data)
+{
+    const uint64 zero_extended = data;
+    return mov_imm_to_r_i64(a, reg_no, (int64)zero_extended);
+}
+
+/**
+ * Emit the conversion of a uint32 register value to an int64 register
+ * value
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination int64 register
+ * @param reg_no_src the index of the source uint32 register
+ *
+ * @return the result of extend_r32_to_r64()
+ */
+static bool
+convert_r_u32_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* uint32 is an unsigned source, hence the unsigned flavour */
+    const bool is_signed = false;
+    return extend_r32_to_r64(a, reg_no_dst, reg_no_src, is_signed);
+}
+
+/**
+ * Emit the conversion of a uint32 immediate to a float register value
+ *
+ * The immediate is zero-extended to 64 bits, loaded into the free 64-bit
+ * register (which is therefore overwritten) and converted with the
+ * 64-bit source form of cvtsi2ss.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination float register
+ * @param data the uint32 immediate to convert
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_imm_u32_to_r_f32(x86::Assembler &a, int32 reg_no, uint32 data)
+{
+    const uint64 zero_extended = data;
+
+    mov_imm_to_r_i64(a, REG_I64_FREE_IDX, (int64)zero_extended);
+    a.cvtsi2ss(regs_float[reg_no], regs_i64[REG_I64_FREE_IDX]);
+    return true;
+}
+
+/**
+ * Emit the conversion of a uint32 register value to a float register value
+ *
+ * The source is first extended into the free 64-bit register (which is
+ * therefore overwritten) and converted with the 64-bit source form of
+ * cvtsi2ss.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination float register
+ * @param reg_no_src the index of the source uint32 register
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_u32_to_r_f32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* uint32 is an unsigned source, hence the unsigned flavour */
+    const bool is_signed = false;
+
+    extend_r32_to_r64(a, REG_I64_FREE_IDX, reg_no_src, is_signed);
+    a.cvtsi2ss(regs_float[reg_no_dst], regs_i64[REG_I64_FREE_IDX]);
+    return true;
+}
+
+/**
+ * Emit the conversion of a uint32 immediate to a double register value
+ *
+ * The immediate is zero-extended to 64 bits, loaded into the free 64-bit
+ * register (which is therefore overwritten) and converted with the
+ * 64-bit source form of cvtsi2sd.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination double register
+ * @param data the uint32 immediate to convert
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_imm_u32_to_r_f64(x86::Assembler &a, int32 reg_no, uint32 data)
+{
+    const uint64 zero_extended = data;
+
+    mov_imm_to_r_i64(a, REG_I64_FREE_IDX, (int64)zero_extended);
+    a.cvtsi2sd(regs_float[reg_no], regs_i64[REG_I64_FREE_IDX]);
+    return true;
+}
+
+/**
+ * Emit the conversion of a uint32 register value to a double register
+ * value
+ *
+ * The source is first extended into the free 64-bit register (which is
+ * therefore overwritten) and converted with the 64-bit source form of
+ * cvtsi2sd.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination double register
+ * @param reg_no_src the index of the source uint32 register
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_u32_to_r_f64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* uint32 is an unsigned source, hence the unsigned flavour */
+    const bool is_signed = false;
+
+    extend_r32_to_r64(a, REG_I64_FREE_IDX, reg_no_src, is_signed);
+    a.cvtsi2sd(regs_float[reg_no_dst], regs_i64[REG_I64_FREE_IDX]);
+    return true;
+}
+
+/**
+ * Emit the conversion of an int64 register value to a float register
+ * value (cvtsi2ss with a 64-bit source)
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination float register
+ * @param reg_no_src the index of the source int64 register
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_i64_to_r_f32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_xmm = regs_float[reg_no_dst];
+    const auto &src_gp = regs_i64[reg_no_src];
+
+    a.cvtsi2ss(dst_xmm, src_gp);
+    return true;
+}
+
+/**
+ * Emit the conversion of an int64 immediate to a float register value
+ *
+ * The immediate is first loaded into the free 64-bit register, which is
+ * therefore overwritten, and converted from there.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination float register
+ * @param data the int64 immediate to convert
+ *
+ * @return the result of convert_r_i64_to_r_f32()
+ */
+static bool
+convert_imm_i64_to_r_f32(x86::Assembler &a, int32 reg_no, int64 data)
+{
+    const int32 scratch = REG_I64_FREE_IDX;
+
+    mov_imm_to_r_i64(a, scratch, data);
+    return convert_r_i64_to_r_f32(a, reg_no, scratch);
+}
+
+/**
+ * Emit the conversion of an int64 register value to a double register
+ * value (cvtsi2sd with a 64-bit source)
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination double register
+ * @param reg_no_src the index of the source int64 register
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_i64_to_r_f64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_xmm = regs_float[reg_no_dst];
+    const auto &src_gp = regs_i64[reg_no_src];
+
+    a.cvtsi2sd(dst_xmm, src_gp);
+    return true;
+}
+
+/**
+ * Emit the conversion of an int64 immediate to a double register value
+ *
+ * The immediate is first loaded into the free 64-bit register, which is
+ * therefore overwritten, and converted from there.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination double register
+ * @param data the int64 immediate to convert
+ *
+ * @return the result of convert_r_i64_to_r_f64()
+ */
+static bool
+convert_imm_i64_to_r_f64(x86::Assembler &a, int32 reg_no, int64 data)
+{
+    const int32 scratch = REG_I64_FREE_IDX;
+
+    mov_imm_to_r_i64(a, scratch, data);
+    return convert_r_i64_to_r_f64(a, reg_no, scratch);
+}
+
+/**
+ * Emit the load of a float immediate, converted to int32, into a register
+ *
+ * The float-to-int32 conversion is done by the host compiler's cast at
+ * code generation time; only the resulting integer is emitted.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination int32 register
+ * @param data the float immediate to convert
+ *
+ * @return the result of mov_imm_to_r_i32()
+ */
+static bool
+convert_imm_f32_to_r_i32(x86::Assembler &a, int32 reg_no, float data)
+{
+    return mov_imm_to_r_i32(a, reg_no, static_cast<int32>(data));
+}
+
+/**
+ * Emit the conversion of a float register value to an int32 register
+ * value (cvttss2si with a 32-bit destination)
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no_dst the index of the destination int32 register
+ * @param reg_no_src the index of the source float register
+ *
+ * @return always true; the status of the emitted code is not checked
+ */
+static bool
+convert_r_f32_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_gp = regs_i32[reg_no_dst];
+    const auto &src_xmm = regs_float[reg_no_src];
+
+    a.cvttss2si(dst_gp, src_xmm);
+    return true;
+}
+
+/**
+ * Emit the load of a float immediate, converted to uint32, into a register
+ *
+ * The float-to-uint32 conversion is done by the host compiler's cast at
+ * code generation time; only the resulting integer is emitted.
+ *
+ * @param a the assembler used to emit the code
+ * @param reg_no the index of the destination uint32 register
+ * @param data the float immediate to convert
+ *
+ * @return the result of mov_imm_to_r_i32()
+ */
+static bool
+convert_imm_f32_to_r_u32(x86::Assembler &a, int32 reg_no, float data)
+{
+    return mov_imm_to_r_i32(a, reg_no, static_cast<uint32>(data));
+}
+
+/**
+ * Encode converting float register data to uint32 register data
+ *
+ * The conversion is emitted with the 64-bit view (regs_i64) of the
+ * destination register.
+ * NOTE(review): presumably so that values above INT32_MAX are still
+ * converted - confirm.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register
+ * @param reg_no_src the no of src float register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_r_f32_to_r_u32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_wide = regs_i64[reg_no_dst];
+    const auto &src_f32 = regs_float[reg_no_src];
+
+    a.cvttss2si(dst_wide, src_f32);
+    return true;
+}
+
+/**
+ * Encode converting float immediate data to int64 register data
+ *
+ * The conversion is done with a C cast when the code is generated,
+ * only a move of the converted value is emitted.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst int64 register
+ * @param data the src immediate float data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_imm_f32_to_r_i64(x86::Assembler &a, int32 reg_no, float data)
+{
+    const int64 converted = (int64)data;
+
+    return mov_imm_to_r_i64(a, reg_no, converted);
+}
+
+/**
+ * Encode converting float register data to int64 register data
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst int64 register
+ * @param reg_no_src the no of src float register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_r_f32_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_i64 = regs_i64[reg_no_dst];
+    const auto &src_f32 = regs_float[reg_no_src];
+
+    /* cvttss2si into a 64-bit destination register */
+    a.cvttss2si(dst_i64, src_f32);
+    return true;
+}
+
+/**
+ * Encode converting float immediate data to double register data
+ *
+ * The immediate is widened to double when the code is generated,
+ * only a move of the widened value is emitted.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst double register
+ * @param data the src immediate float data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_imm_f32_to_r_f64(x86::Assembler &a, int32 reg_no, float data)
+{
+    const double widened = (double)data;
+
+    return mov_imm_to_r_f64(a, reg_no, widened);
+}
+
+/**
+ * Encode converting float register data to double register data
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst double register
+ * @param reg_no_src the no of src float register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_r_f32_to_r_f64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_f64 = regs_float[reg_no_dst];
+    const auto &src_f32 = regs_float[reg_no_src];
+
+    a.cvtss2sd(dst_f64, src_f32);
+    return true;
+}
+
+/**
+ * Encode converting double immediate data to int32 register data
+ *
+ * The conversion is done with a C cast when the code is generated,
+ * only a move of the converted value is emitted.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst int32 register
+ * @param data the src immediate double data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_imm_f64_to_r_i32(x86::Assembler &a, int32 reg_no, double data)
+{
+    const int32 converted = (int32)data;
+
+    return mov_imm_to_r_i32(a, reg_no, converted);
+}
+
+/**
+ * Encode converting double register data to int32 register data
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst int32 register
+ * @param reg_no_src the no of src double register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_r_f64_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_i32 = regs_i32[reg_no_dst];
+    const auto &src_f64 = regs_float[reg_no_src];
+
+    /* cvttsd2si into a 32-bit destination register */
+    a.cvttsd2si(dst_i32, src_f64);
+    return true;
+}
+
+/**
+ * Encode converting double immediate data to int64 register data
+ *
+ * The conversion is done with a C cast when the code is generated,
+ * only a move of the converted value is emitted.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst int64 register
+ * @param data the src immediate double data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_imm_f64_to_r_i64(x86::Assembler &a, int32 reg_no, double data)
+{
+    const int64 converted = (int64)data;
+
+    return mov_imm_to_r_i64(a, reg_no, converted);
+}
+
+/**
+ * Encode converting double register data to int64 register data
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst int64 register
+ * @param reg_no_src the no of src double register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_r_f64_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_i64 = regs_i64[reg_no_dst];
+    const auto &src_f64 = regs_float[reg_no_src];
+
+    /* cvttsd2si into a 64-bit destination register */
+    a.cvttsd2si(dst_i64, src_f64);
+    return true;
+}
+
+/**
+ * Encode converting double immediate data to float register data
+ *
+ * The immediate is narrowed to float when the code is generated,
+ * only a move of the narrowed value is emitted.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst float register
+ * @param data the src immediate double data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_imm_f64_to_r_f32(x86::Assembler &a, int32 reg_no, double data)
+{
+    const float narrowed = (float)data;
+
+    return mov_imm_to_r_f32(a, reg_no, narrowed);
+}
+
+/**
+ * Encode converting double register data to float register data
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst float register
+ * @param reg_no_src the no of src double register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_r_f64_to_r_f32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_f32 = regs_float[reg_no_dst];
+    const auto &src_f64 = regs_float[reg_no_src];
+
+    a.cvtsd2ss(dst_f32, src_f64);
+    return true;
+}
+
+/**
+ * Encode converting double immediate data to uint32 register data
+ *
+ * The conversion is done with a C cast to uint32 when the code is
+ * generated, only a move of the converted value is emitted.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst int32 register, which holds the uint32 value
+ * @param data the src immediate double data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_imm_f64_to_r_u32(x86::Assembler &a, int32 reg_no, double data)
+{
+    const uint32 converted = (uint32)data;
+
+    return mov_imm_to_r_i32(a, reg_no, converted);
+}
+
+/**
+ * Encode converting double register data to uint32 register data
+ *
+ * The conversion is emitted with the 64-bit view (regs_i64) of the
+ * destination register.
+ * NOTE(review): presumably so that values above INT32_MAX are still
+ * converted - confirm.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register
+ * @param reg_no_src the no of src double register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+convert_r_f64_to_r_u32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst_wide = regs_i64[reg_no_dst];
+    const auto &src_f64 = regs_float[reg_no_src];
+
+    a.cvttsd2si(dst_wide, src_f64);
+    return true;
+}
+
+/**
+ * Encode making negative from int32 immediate data to int32 register
+ *
+ * The negation is evaluated when the code is generated and only a move
+ * of the negated value is emitted.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst register
+ * @param data the src int32 immediate data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+neg_imm_to_r_i32(x86::Assembler &a, int32 reg_no, int32 data)
+{
+    /* Negate in unsigned arithmetic: `-data` is undefined behavior for
+       the minimum int32 value, while the unsigned form wraps around */
+    Imm imm((int32)((uint32)0 - (uint32)data));
+    a.mov(regs_i32[reg_no], imm);
+    return true;
+}
+
+/**
+ * Encode making negative from int32 register to int32 register
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register
+ * @param reg_no_src the no of src register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+neg_r_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst = regs_i32[reg_no_dst];
+
+    /* copy src into dst, then negate dst in place */
+    mov_r_to_r_i32(a, reg_no_dst, reg_no_src);
+    a.neg(dst);
+    return true;
+}
+
+/**
+ * Encode making negative from int64 immediate data to int64 register
+ *
+ * The negation is evaluated when the code is generated and only a move
+ * of the negated value is emitted.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst register
+ * @param data the src int64 immediate data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+neg_imm_to_r_i64(x86::Assembler &a, int32 reg_no, int64 data)
+{
+    /* Negate in unsigned arithmetic: `-data` is undefined behavior for
+       the minimum int64 value, while the unsigned form wraps around */
+    Imm imm((int64)((uint64)0 - (uint64)data));
+    a.mov(regs_i64[reg_no], imm);
+    return true;
+}
+
+/**
+ * Encode making negative from int64 register to int64 register
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register
+ * @param reg_no_src the no of src register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+neg_r_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    const auto &dst = regs_i64[reg_no_dst];
+
+    /* copy src into dst, then negate dst in place */
+    mov_r_to_r_i64(a, reg_no_dst, reg_no_src);
+    a.neg(dst);
+    return true;
+}
+
+/**
+ * Encode making negative from float immediate data to float register
+ *
+ * Not implemented: no code is emitted, the function asserts and
+ * reports failure.
+ *
+ * @param a the assembler to emit the code (unused)
+ * @param reg_no the no of dst float register (unused)
+ * @param data the src float immediate data (unused)
+ *
+ * @return always false
+ */
+static bool
+neg_imm_to_r_f32(x86::Assembler &a, int32 reg_no, float data)
+{
+    /* mark the parameters as intentionally unused */
+    (void)data;
+    (void)reg_no;
+    (void)a;
+
+    bh_assert(0);
+    return false;
+}
+
+/**
+ * Encode making negative from float register to float register
+ *
+ * Not implemented: no code is emitted, the function asserts and
+ * reports failure.
+ *
+ * @param a the assembler to emit the code (unused)
+ * @param reg_no_dst the no of dst register (unused)
+ * @param reg_no_src the no of src register (unused)
+ *
+ * @return always false
+ */
+static bool
+neg_r_to_r_f32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* mark the parameters as intentionally unused */
+    (void)reg_no_src;
+    (void)reg_no_dst;
+    (void)a;
+
+    bh_assert(0);
+    return false;
+}
+
+/**
+ * Encode making negative from double immediate data to double register
+ *
+ * Not implemented: no code is emitted, the function asserts and
+ * reports failure.
+ *
+ * @param a the assembler to emit the code (unused)
+ * @param reg_no the no of dst double register (unused)
+ * @param data the src double immediate data (unused)
+ *
+ * @return always false
+ */
+static bool
+neg_imm_to_r_f64(x86::Assembler &a, int32 reg_no, double data)
+{
+    /* mark the parameters as intentionally unused */
+    (void)data;
+    (void)reg_no;
+    (void)a;
+
+    bh_assert(0);
+    return false;
+}
+
+/**
+ * Encode making negative from double register to double register
+ *
+ * Not implemented: no code is emitted, the function asserts and
+ * reports failure.
+ *
+ * @param a the assembler to emit the code (unused)
+ * @param reg_no_dst the no of dst double register (unused)
+ * @param reg_no_src the no of src double register (unused)
+ *
+ * @return always false
+ */
+static bool
+neg_r_to_r_f64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    /* mark the parameters as intentionally unused */
+    (void)reg_no_src;
+    (void)reg_no_dst;
+    (void)a;
+
+    bh_assert(0);
+    return false;
+}
+
+/**
+ * Look up the counterpart of a condition opcode in a fixed table
+ * indexed by the opcode value
+ *
+ * NOTE(review): the table pairs each opcode with its negation only if
+ * COND_OP is declared in the order EQ, NE, GTS, GES, LTS, LES, GTU,
+ * GEU, LTU, LEU - confirm against the enum declaration.
+ *
+ * @param op the condition opcode, must not be greater than LEU
+ *
+ * @return the table entry for op
+ */
+static COND_OP
+not_cond(COND_OP op)
+{
+    static const COND_OP negated_ops[] = { NE,  EQ,  LES, LTS, GES,
+                                           GTS, LEU, LTU, GEU, GTU };
+
+    bh_assert(op <= LEU);
+    return negated_ops[op];
+}
+
+/**
+ * Encode int32 alu operation of reg and data, and save result to reg
+ *
+ * ADD/SUB use inc/dec for +1/-1 and emit nothing beyond the register
+ * move for 0. MUL uses xor for 0, neg for -1, a plain move for 1 and
+ * a left shift for positive powers of two. DIV and REM load the
+ * immediate into the register REG_I32_FREE_IDX and divide edx:eax
+ * by it.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register to save the result, must be
+ *        eax for DIV_S/DIV_U and edx for REM_S/REM_U
+ * @param reg_no_src the no of register, as the first operand, must be
+ *        eax for DIV_S/REM_S/DIV_U/REM_U
+ * @param data the immediate data, as the second operand
+ *
+ * @return true if success, false if the opcode is not supported
+ */
+static bool
+alu_r_r_imm_i32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                int32 reg_no_src, int32 data)
+{
+    Imm imm(data);
+
+    switch (op) {
+        case ADD:
+            mov_r_to_r(a, JIT_REG_KIND_I32, reg_no_dst, reg_no_src);
+            if (data == 1)
+                a.inc(regs_i32[reg_no_dst]);
+            else if (data == -1)
+                a.dec(regs_i32[reg_no_dst]);
+            else if (data != 0)
+                a.add(regs_i32[reg_no_dst], imm);
+            break;
+        case SUB:
+            mov_r_to_r(a, JIT_REG_KIND_I32, reg_no_dst, reg_no_src);
+            if (data == -1)
+                a.inc(regs_i32[reg_no_dst]);
+            else if (data == 1)
+                a.dec(regs_i32[reg_no_dst]);
+            else if (data != 0)
+                a.sub(regs_i32[reg_no_dst], imm);
+            break;
+        case MUL:
+            if (data == 0)
+                a.xor_(regs_i32[reg_no_dst], regs_i32[reg_no_dst]);
+            else if (data == -1) {
+                mov_r_to_r(a, JIT_REG_KIND_I32, reg_no_dst, reg_no_src);
+                a.neg(regs_i32[reg_no_dst]);
+            }
+            else if (data == 1) {
+                mov_r_to_r(a, JIT_REG_KIND_I32, reg_no_dst, reg_no_src);
+            }
+            else if (data > 0 && (data & (data - 1)) == 0x0) {
+                /* positive power of two: multiply by shifting left */
+                mov_r_to_r(a, JIT_REG_KIND_I32, reg_no_dst, reg_no_src);
+                data = (int32)local_log2(data);
+                imm.setValue(data);
+                a.shl(regs_i32[reg_no_dst], imm);
+            }
+            else {
+                a.imul(regs_i32[reg_no_dst], regs_i32[reg_no_src], imm);
+            }
+            break;
+        case DIV_S:
+        case REM_S:
+            bh_assert(reg_no_src == REG_EAX_IDX);
+            if (op == DIV_S) {
+                bh_assert(reg_no_dst == REG_EAX_IDX);
+            }
+            else {
+                bh_assert(reg_no_dst == REG_EDX_IDX);
+            }
+            a.mov(regs_i32[REG_I32_FREE_IDX], imm);
+            /* signed extend eax to edx:eax */
+            a.cdq();
+            a.idiv(regs_i32[REG_I32_FREE_IDX]);
+            break;
+        case DIV_U:
+        case REM_U:
+            bh_assert(reg_no_src == REG_EAX_IDX);
+            if (op == DIV_U) {
+                bh_assert(reg_no_dst == REG_EAX_IDX);
+            }
+            else {
+                bh_assert(reg_no_dst == REG_EDX_IDX);
+            }
+            a.mov(regs_i32[REG_I32_FREE_IDX], imm);
+            /* unsigned extend eax to edx:eax */
+            a.xor_(regs_i32[REG_EDX_IDX], regs_i32[REG_EDX_IDX]);
+            a.div(regs_i32[REG_I32_FREE_IDX]);
+            break;
+        default:
+            /* unsupported opcode: report the failure to the caller
+               instead of claiming success with no code emitted */
+            bh_assert(0);
+            return false;
+    }
+
+    return true;
+}
+
+/**
+ * Encode int32 alu operation of reg and reg, and save result to reg
+ *
+ * For ADD/SUB/MUL the result is computed in place in the second source
+ * register when it is also the destination. For DIV and REM the
+ * dividend is taken from eax and extended into edx:eax, so a divisor
+ * living in edx is first copied to the register REG_I32_FREE_IDX.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register to save the result, must be
+ *        eax for DIV_S/DIV_U and edx for REM_S/REM_U
+ * @param reg_no1_src the no of register, as the first operand, must
+ *        be eax for DIV_S/REM_S/DIV_U/REM_U
+ * @param reg_no2_src the no of register, as the second operand
+ *
+ * @return true if success, false if the opcode is not supported
+ */
+static bool
+alu_r_r_r_i32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst, int32 reg_no1_src,
+              int32 reg_no2_src)
+{
+    switch (op) {
+        case ADD:
+            if (reg_no_dst != reg_no2_src) {
+                mov_r_to_r(a, JIT_REG_KIND_I32, reg_no_dst, reg_no1_src);
+                a.add(regs_i32[reg_no_dst], regs_i32[reg_no2_src]);
+            }
+            else
+                a.add(regs_i32[reg_no2_src], regs_i32[reg_no1_src]);
+            break;
+        case SUB:
+            if (reg_no_dst != reg_no2_src) {
+                mov_r_to_r(a, JIT_REG_KIND_I32, reg_no_dst, reg_no1_src);
+                a.sub(regs_i32[reg_no_dst], regs_i32[reg_no2_src]);
+            }
+            else {
+                /* dst aliases src2: compute src2 - src1, then negate */
+                a.sub(regs_i32[reg_no2_src], regs_i32[reg_no1_src]);
+                a.neg(regs_i32[reg_no2_src]);
+            }
+            break;
+        case MUL:
+            if (reg_no_dst != reg_no2_src) {
+                mov_r_to_r(a, JIT_REG_KIND_I32, reg_no_dst, reg_no1_src);
+                a.imul(regs_i32[reg_no_dst], regs_i32[reg_no2_src]);
+            }
+            else
+                a.imul(regs_i32[reg_no2_src], regs_i32[reg_no1_src]);
+            break;
+        case DIV_S:
+        case REM_S:
+            bh_assert(reg_no1_src == REG_EAX_IDX);
+            if (op == DIV_S) {
+                bh_assert(reg_no_dst == REG_EAX_IDX);
+            }
+            else {
+                bh_assert(reg_no_dst == REG_EDX_IDX);
+            }
+            if (reg_no2_src == REG_EDX_IDX) {
+                /* convert `DIV_S/REM_S dst, eax, edx` into
+                   `mov esi, edx` and `DIV_S/REM_S dst, eax, esi` to
+                   avoid overwriting the divisor when a.cdq() */
+                a.mov(regs_i32[REG_I32_FREE_IDX], regs_i32[REG_EDX_IDX]);
+                reg_no2_src = REG_I32_FREE_IDX;
+            }
+            /* signed extend eax to edx:eax */
+            a.cdq();
+            a.idiv(regs_i32[reg_no2_src]);
+            break;
+        case DIV_U:
+        case REM_U:
+            bh_assert(reg_no1_src == REG_EAX_IDX);
+            if (op == DIV_U) {
+                bh_assert(reg_no_dst == REG_EAX_IDX);
+            }
+            else {
+                bh_assert(reg_no_dst == REG_EDX_IDX);
+            }
+            if (reg_no2_src == REG_EDX_IDX) {
+                /* convert `DIV_U/REM_U dst, eax, edx` into
+                   `mov esi, edx` and `DIV_U/REM_U dst, eax, esi` to
+                   avoid overwriting the divisor when unsigned extend
+                   eax to edx:eax */
+                a.mov(regs_i32[REG_I32_FREE_IDX], regs_i32[REG_EDX_IDX]);
+                reg_no2_src = REG_I32_FREE_IDX;
+            }
+            /* unsigned extend eax to edx:eax */
+            a.xor_(regs_i32[REG_EDX_IDX], regs_i32[REG_EDX_IDX]);
+            a.div(regs_i32[reg_no2_src]);
+            break;
+        default:
+            bh_assert(0);
+            return false;
+    }
+
+    return true;
+}
+
+/**
+ * Encode int32 alu operation of imm and imm, and save result to reg
+ *
+ * The operation is evaluated when the code is generated and only a
+ * move of the result is emitted. ADD/SUB/MUL are evaluated in unsigned
+ * arithmetic so that they wrap around instead of overflowing.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param data1_src the first src immediate data
+ * @param data2_src the second src immediate data
+ *
+ * @return true if success, false if the opcode is not supported or
+ *         the division cannot be evaluated (zero divisor, or
+ *         INT32_MIN / -1)
+ */
+static bool
+alu_imm_imm_to_r_i32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                     int32 data1_src, int32 data2_src)
+{
+    const uint32 lhs = (uint32)data1_src;
+    const uint32 rhs = (uint32)data2_src;
+    int32 data = 0;
+
+    switch (op) {
+        case ADD:
+            data = (int32)(lhs + rhs);
+            break;
+        case SUB:
+            data = (int32)(lhs - rhs);
+            break;
+        case MUL:
+            data = (int32)(lhs * rhs);
+            break;
+        case DIV_S:
+            /* both cases would fault in the compiler itself */
+            if (data2_src == 0
+                || (data1_src == INT32_MIN && data2_src == -1))
+                return false;
+            data = data1_src / data2_src;
+            break;
+        case REM_S:
+            if (data2_src == 0)
+                return false;
+            /* x % -1 is always 0, evaluating it directly overflows
+               when x is INT32_MIN */
+            data = (data2_src == -1) ? 0 : data1_src % data2_src;
+            break;
+        case DIV_U:
+            if (rhs == 0)
+                return false;
+            data = (int32)(lhs / rhs);
+            break;
+        case REM_U:
+            if (rhs == 0)
+                return false;
+            data = (int32)(lhs % rhs);
+            break;
+        default:
+            bh_assert(0);
+            return false;
+    }
+
+    Imm imm(data);
+    a.mov(regs_i32[reg_no_dst], imm);
+    return true;
+}
+
+/**
+ * Encode int32 alu operation of imm and reg, and save result to reg
+ *
+ * ADD and MUL swap the operands and use the reg-imm encoder. SUB
+ * encodes `reg - imm` and negates the result. For the other opcodes
+ * (division and remainder) the immediate dividend is loaded into eax,
+ * which is where the reg-reg encoder requires the first operand.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param data1_src the first src immediate data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+alu_imm_r_to_r_i32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                   int32 data1_src, int32 reg_no2_src)
+{
+    if (op == ADD || op == MUL)
+        return alu_r_r_imm_i32(a, op, reg_no_dst, reg_no2_src, data1_src);
+
+    if (op == SUB) {
+        /* imm - reg == -(reg - imm) */
+        if (!alu_r_r_imm_i32(a, op, reg_no_dst, reg_no2_src, data1_src))
+            return false;
+        a.neg(regs_i32[reg_no_dst]);
+        return true;
+    }
+
+    /* The dividend goes to eax and is then extended into edx:eax, so a
+       divisor living in eax or edx must be moved out of the way first */
+    if (reg_no2_src == REG_EAX_IDX || reg_no2_src == REG_EDX_IDX) {
+        a.mov(regs_i32[REG_I32_FREE_IDX], regs_i32[reg_no2_src]);
+        reg_no2_src = REG_I32_FREE_IDX;
+    }
+
+    return mov_imm_to_r_i32(a, REG_EAX_IDX, data1_src)
+           && alu_r_r_r_i32(a, op, reg_no_dst, REG_EAX_IDX, reg_no2_src);
+}
+
+/**
+ * Encode int32 alu operation of reg and imm, and save result to reg
+ *
+ * Forwards to alu_r_r_imm_i32 with the operands in the same order.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param reg_no1_src the reg no of first src register data
+ * @param data2_src the second src immediate data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+alu_r_imm_to_r_i32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                   int32 reg_no1_src, int32 data2_src)
+{
+    const bool encoded =
+        alu_r_r_imm_i32(a, op, reg_no_dst, reg_no1_src, data2_src);
+
+    return encoded;
+}
+
+/**
+ * Encode int32 alu operation of reg and reg, and save result to reg
+ *
+ * Forwards to alu_r_r_r_i32 with the operands in the same order.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param reg_no1_src the reg no of first src register data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+alu_r_r_to_r_i32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                 int32 reg_no1_src, int32 reg_no2_src)
+{
+    const bool encoded =
+        alu_r_r_r_i32(a, op, reg_no_dst, reg_no1_src, reg_no2_src);
+
+    return encoded;
+}
+
+/**
+ * Encode int64 alu operation of reg and reg, and save result to reg
+ *
+ * For ADD/SUB/MUL the result is computed in place in the second source
+ * register when it is also the destination. For DIV and REM the
+ * dividend is taken from rax and extended into rdx:rax, so a divisor
+ * living in rdx is first copied to the register REG_I64_FREE_IDX.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register to save the result, must be
+ *        rax for DIV_S/DIV_U and rdx for REM_S/REM_U
+ * @param reg_no1_src the no of register, as the first operand, must
+ *        be rax for DIV_S/REM_S/DIV_U/REM_U
+ * @param reg_no2_src the no of register, as the second operand
+ *
+ * @return true if success, false if the opcode is not supported
+ */
+static bool
+alu_r_r_r_i64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst, int32 reg_no1_src,
+              int32 reg_no2_src)
+{
+    switch (op) {
+        case ADD:
+            if (reg_no_dst != reg_no2_src) {
+                mov_r_to_r(a, JIT_REG_KIND_I64, reg_no_dst, reg_no1_src);
+                a.add(regs_i64[reg_no_dst], regs_i64[reg_no2_src]);
+            }
+            else
+                a.add(regs_i64[reg_no2_src], regs_i64[reg_no1_src]);
+            break;
+        case SUB:
+            if (reg_no_dst != reg_no2_src) {
+                mov_r_to_r(a, JIT_REG_KIND_I64, reg_no_dst, reg_no1_src);
+                a.sub(regs_i64[reg_no_dst], regs_i64[reg_no2_src]);
+            }
+            else {
+                /* dst aliases src2: compute src2 - src1, then negate */
+                a.sub(regs_i64[reg_no2_src], regs_i64[reg_no1_src]);
+                a.neg(regs_i64[reg_no2_src]);
+            }
+            break;
+        case MUL:
+            if (reg_no_dst != reg_no2_src) {
+                mov_r_to_r(a, JIT_REG_KIND_I64, reg_no_dst, reg_no1_src);
+                a.imul(regs_i64[reg_no_dst], regs_i64[reg_no2_src]);
+            }
+            else
+                a.imul(regs_i64[reg_no2_src], regs_i64[reg_no1_src]);
+            break;
+        case DIV_S:
+        case REM_S:
+            bh_assert(reg_no1_src == REG_RAX_IDX);
+            if (op == DIV_S) {
+                bh_assert(reg_no_dst == REG_RAX_IDX);
+            }
+            else {
+                bh_assert(reg_no_dst == REG_RDX_IDX);
+            }
+            if (reg_no2_src == REG_RDX_IDX) {
+                /* a.cqo() below overwrites rdx, move the divisor to
+                   the free register first */
+                a.mov(regs_i64[REG_I64_FREE_IDX], regs_i64[REG_RDX_IDX]);
+                reg_no2_src = REG_I64_FREE_IDX;
+            }
+            /* signed extend rax to rdx:rax */
+            a.cqo();
+            a.idiv(regs_i64[reg_no2_src]);
+            break;
+        case DIV_U:
+        case REM_U:
+            bh_assert(reg_no1_src == REG_RAX_IDX);
+            if (op == DIV_U) {
+                bh_assert(reg_no_dst == REG_RAX_IDX);
+            }
+            else {
+                bh_assert(reg_no_dst == REG_RDX_IDX);
+            }
+            if (reg_no2_src == REG_RDX_IDX) {
+                /* rdx is cleared below, move the divisor to the free
+                   register first */
+                a.mov(regs_i64[REG_I64_FREE_IDX], regs_i64[REG_RDX_IDX]);
+                reg_no2_src = REG_I64_FREE_IDX;
+            }
+            /* unsigned extend rax to rdx:rax */
+            a.xor_(regs_i64[REG_RDX_IDX], regs_i64[REG_RDX_IDX]);
+            a.div(regs_i64[reg_no2_src]);
+            break;
+        default:
+            /* unsupported opcode: report the failure to the caller
+               instead of claiming success with no code emitted */
+            bh_assert(0);
+            return false;
+    }
+
+    return true;
+}
+
+/**
+ * Encode int64 alu operation of reg and data, and save result to reg
+ *
+ * Immediates outside the int32 range are first loaded into a register
+ * for ADD/SUB/MUL. ADD/SUB use inc/dec for +1/-1 and emit nothing
+ * beyond the register move for 0. MUL uses xor for 0, neg for -1, a
+ * plain move for 1 and a left shift for positive powers of two. DIV
+ * and REM load the immediate into the register REG_I64_FREE_IDX and
+ * divide rdx:rax by it.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register to save the result, must be
+ *        rax for DIV_S/DIV_U and rdx for REM_S/REM_U
+ * @param reg_no_src the no of register, as the first operand, must be
+ *        rax for DIV_S/REM_S/DIV_U/REM_U
+ * @param data the immediate data, as the second operand
+ *
+ * @return true if success, false if the opcode is not supported or a
+ *         helper encoder fails
+ */
+static bool
+alu_r_r_imm_i64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                int32 reg_no_src, int64 data)
+{
+    Imm imm(data);
+
+    switch (op) {
+        case ADD:
+            mov_r_to_r(a, JIT_REG_KIND_I64, reg_no_dst, reg_no_src);
+            if (data == 1)
+                a.inc(regs_i64[reg_no_dst]);
+            else if (data == -1)
+                a.dec(regs_i64[reg_no_dst]);
+            else if (data != 0) {
+                if (data >= INT32_MIN && data <= INT32_MAX) {
+                    imm.setValue((int32)data);
+                    a.add(regs_i64[reg_no_dst], imm);
+                }
+                else {
+                    a.mov(regs_i64[REG_I64_FREE_IDX], imm);
+                    a.add(regs_i64[reg_no_dst], regs_i64[REG_I64_FREE_IDX]);
+                }
+            }
+            break;
+        case SUB:
+            mov_r_to_r(a, JIT_REG_KIND_I64, reg_no_dst, reg_no_src);
+            if (data == -1)
+                a.inc(regs_i64[reg_no_dst]);
+            else if (data == 1)
+                a.dec(regs_i64[reg_no_dst]);
+            else if (data != 0) {
+                if (data >= INT32_MIN && data <= INT32_MAX) {
+                    imm.setValue((int32)data);
+                    a.sub(regs_i64[reg_no_dst], imm);
+                }
+                else {
+                    a.mov(regs_i64[REG_I64_FREE_IDX], imm);
+                    a.sub(regs_i64[reg_no_dst], regs_i64[REG_I64_FREE_IDX]);
+                }
+            }
+            break;
+        case MUL:
+            if (data == 0)
+                a.xor_(regs_i64[reg_no_dst], regs_i64[reg_no_dst]);
+            else if (data == -1) {
+                mov_r_to_r(a, JIT_REG_KIND_I64, reg_no_dst, reg_no_src);
+                a.neg(regs_i64[reg_no_dst]);
+            }
+            else if (data == 1) {
+                mov_r_to_r(a, JIT_REG_KIND_I64, reg_no_dst, reg_no_src);
+            }
+            else if (data > 0 && (data & (data - 1)) == 0x0) {
+                /* positive power of two: multiply by shifting left */
+                mov_r_to_r(a, JIT_REG_KIND_I64, reg_no_dst, reg_no_src);
+                data = (int64)local_log2l(data);
+                imm.setValue(data);
+                a.shl(regs_i64[reg_no_dst], imm);
+            }
+            else if (INT32_MIN <= data && data <= INT32_MAX) {
+                a.imul(regs_i64[reg_no_dst], regs_i64[reg_no_src], imm);
+            }
+            else {
+                /* the multiplier doesn't fit in int32: load it into a
+                   register (the free one when dst is also src) and
+                   use the reg-reg encoder */
+                int32 reg_no_imm =
+                    reg_no_dst == reg_no_src ? REG_I64_FREE_IDX : reg_no_dst;
+
+                if (!mov_imm_to_r_i64(a, reg_no_imm, data)
+                    || !alu_r_r_r_i64(a, op, reg_no_dst, reg_no_imm,
+                                      reg_no_src))
+                    return false;
+            }
+            break;
+        case DIV_S:
+        case REM_S:
+            bh_assert(reg_no_src == REG_RAX_IDX);
+            if (op == DIV_S) {
+                bh_assert(reg_no_dst == REG_RAX_IDX);
+            }
+            else {
+                bh_assert(reg_no_dst == REG_RDX_IDX);
+            }
+            a.mov(regs_i64[REG_I64_FREE_IDX], imm);
+            /* signed extend rax to rdx:rax */
+            a.cqo();
+            a.idiv(regs_i64[REG_I64_FREE_IDX]);
+            break;
+        case DIV_U:
+        case REM_U:
+            bh_assert(reg_no_src == REG_RAX_IDX);
+            if (op == DIV_U) {
+                bh_assert(reg_no_dst == REG_RAX_IDX);
+            }
+            else {
+                bh_assert(reg_no_dst == REG_RDX_IDX);
+            }
+            a.mov(regs_i64[REG_I64_FREE_IDX], imm);
+            /* unsigned extend rax to rdx:rax */
+            a.xor_(regs_i64[REG_RDX_IDX], regs_i64[REG_RDX_IDX]);
+            a.div(regs_i64[REG_I64_FREE_IDX]);
+            break;
+        default:
+            /* unsupported opcode: report the failure to the caller
+               instead of claiming success with no code emitted */
+            bh_assert(0);
+            return false;
+    }
+
+    return true;
+}
+
+/**
+ * Encode int64 alu operation of imm and imm, and save result to reg
+ *
+ * The operation is evaluated when the code is generated and only a
+ * move of the result is emitted. ADD/SUB/MUL are evaluated in unsigned
+ * arithmetic so that they wrap around instead of overflowing.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param data1_src the first src immediate data
+ * @param data2_src the second src immediate data
+ *
+ * @return true if success, false if the opcode is not supported or
+ *         the division cannot be evaluated (zero divisor, or the
+ *         minimum int64 value divided by -1)
+ */
+static bool
+alu_imm_imm_to_r_i64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                     int64 data1_src, int64 data2_src)
+{
+    const uint64 lhs = (uint64)data1_src;
+    const uint64 rhs = (uint64)data2_src;
+    /* bit pattern of the minimum int64 value */
+    const uint64 min_i64_bits = (uint64)1 << 63;
+    int64 data = 0;
+
+    switch (op) {
+        case ADD:
+            data = (int64)(lhs + rhs);
+            break;
+        case SUB:
+            data = (int64)(lhs - rhs);
+            break;
+        case MUL:
+            data = (int64)(lhs * rhs);
+            break;
+        case DIV_S:
+            /* both cases would fault in the compiler itself */
+            if (data2_src == 0 || (lhs == min_i64_bits && data2_src == -1))
+                return false;
+            data = data1_src / data2_src;
+            break;
+        case REM_S:
+            if (data2_src == 0)
+                return false;
+            /* x % -1 is always 0, evaluating it directly overflows
+               when x is the minimum int64 value */
+            data = (data2_src == -1) ? 0 : data1_src % data2_src;
+            break;
+        case DIV_U:
+            if (rhs == 0)
+                return false;
+            data = (int64)(lhs / rhs);
+            break;
+        case REM_U:
+            if (rhs == 0)
+                return false;
+            data = (int64)(lhs % rhs);
+            break;
+        default:
+            /* unsupported opcode: don't emit a bogus move */
+            bh_assert(0);
+            return false;
+    }
+
+    Imm imm(data);
+    a.mov(regs_i64[reg_no_dst], imm);
+    return true;
+}
+
+/**
+ * Encode int64 alu operation of imm and reg, and save result to reg
+ *
+ * ADD and MUL swap the operands and use the reg-imm encoder. SUB
+ * encodes `reg - imm` and negates the result. For the other opcodes
+ * (division and remainder) the immediate dividend is loaded into rax,
+ * which is where the reg-reg encoder requires the first operand.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param data1_src the first src immediate data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+alu_imm_r_to_r_i64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                   int64 data1_src, int32 reg_no2_src)
+{
+    if (op == ADD || op == MUL)
+        return alu_r_r_imm_i64(a, op, reg_no_dst, reg_no2_src, data1_src);
+
+    if (op == SUB) {
+        /* imm - reg == -(reg - imm) */
+        if (!alu_r_r_imm_i64(a, op, reg_no_dst, reg_no2_src, data1_src))
+            return false;
+        a.neg(regs_i64[reg_no_dst]);
+        return true;
+    }
+
+    /* The dividend goes to rax and is then extended into rdx:rax, so a
+       divisor living in rax or rdx must be moved out of the way first */
+    if (reg_no2_src == REG_RAX_IDX || reg_no2_src == REG_RDX_IDX) {
+        a.mov(regs_i64[REG_I64_FREE_IDX], regs_i64[reg_no2_src]);
+        reg_no2_src = REG_I64_FREE_IDX;
+    }
+
+    return mov_imm_to_r_i64(a, REG_RAX_IDX, data1_src)
+           && alu_r_r_r_i64(a, op, reg_no_dst, REG_RAX_IDX, reg_no2_src);
+}
+
+/**
+ * Encode int64 alu operation of reg and imm, and save result to reg
+ *
+ * Forwards to alu_r_r_imm_i64 with the operands in the same order.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param reg_no1_src the reg no of first src register data
+ * @param data2_src the second src immediate data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+alu_r_imm_to_r_i64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                   int32 reg_no1_src, int64 data2_src)
+{
+    const bool encoded =
+        alu_r_r_imm_i64(a, op, reg_no_dst, reg_no1_src, data2_src);
+
+    return encoded;
+}
+
+/**
+ * Encode int64 alu operation of reg and reg, and save result to reg
+ *
+ * Forwards to alu_r_r_r_i64 with the operands in the same order.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param reg_no1_src the reg no of first src register data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+alu_r_r_to_r_i64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                 int32 reg_no1_src, int32 reg_no2_src)
+{
+    const bool encoded =
+        alu_r_r_r_i64(a, op, reg_no_dst, reg_no1_src, reg_no2_src);
+
+    return encoded;
+}
+
+/**
+ * Encode float alu operation of imm and imm, and save result to reg
+ *
+ * The operation is evaluated when the code is generated and only a
+ * move of the result is emitted. DIV_S is used as the opcode of float
+ * division.
+ * NOTE(review): fmaxf/fminf return the other operand when exactly one
+ * operand is NaN - confirm that this is what callers expect from
+ * MAX/MIN.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param data1_src the first src immediate data
+ * @param data2_src the second src immediate data
+ *
+ * @return true if success, false if the opcode is not supported or
+ *         moving the result fails
+ */
+static bool
+alu_imm_imm_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                     float data1_src, float data2_src)
+{
+    float data = 0;
+
+    switch (op) {
+        case ADD:
+        {
+            data = data1_src + data2_src;
+            break;
+        }
+        case SUB:
+        {
+            data = data1_src - data2_src;
+            break;
+        }
+        case MUL:
+        {
+            data = data1_src * data2_src;
+            break;
+        }
+        case DIV_S:
+        {
+            data = data1_src / data2_src;
+            break;
+        }
+        case MAX:
+        {
+            data = fmaxf(data1_src, data2_src);
+            break;
+        }
+        case MIN:
+        {
+            data = fminf(data1_src, data2_src);
+            break;
+        }
+        default:
+        {
+            bh_assert(0);
+            return false;
+        }
+    }
+
+    return mov_imm_to_r_f32(a, reg_no_dst, data);
+}
+
+/**
+ * Emit a scalar floating-point ALU instruction whose first operand (and
+ * destination) is a float register and whose second operand is in memory
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation (ADD, SUB, MUL, DIV_S, MAX, MIN)
+ * @param reg_no the index into regs_float of the first operand, which
+ *        also receives the result
+ * @param m the memory operand used as the second operand
+ * @param is_f32 true to emit the single-precision (ss) instruction,
+ *        false to emit the double-precision (sd) instruction
+ *
+ * @return true if success, false if op is not supported
+ */
+static bool
+alu_r_m_float(x86::Assembler &a, ALU_OP op, int32 reg_no, x86::Mem &m,
+              bool is_f32)
+{
+    if (is_f32) {
+        switch (op) {
+            case ADD:
+                a.addss(regs_float[reg_no], m);
+                return true;
+            case SUB:
+                a.subss(regs_float[reg_no], m);
+                return true;
+            case MUL:
+                a.mulss(regs_float[reg_no], m);
+                return true;
+            case DIV_S:
+                a.divss(regs_float[reg_no], m);
+                return true;
+            case MAX:
+                a.maxss(regs_float[reg_no], m);
+                return true;
+            case MIN:
+                a.minss(regs_float[reg_no], m);
+                return true;
+            default:
+                break;
+        }
+    }
+    else {
+        switch (op) {
+            case ADD:
+                a.addsd(regs_float[reg_no], m);
+                return true;
+            case SUB:
+                a.subsd(regs_float[reg_no], m);
+                return true;
+            case MUL:
+                a.mulsd(regs_float[reg_no], m);
+                return true;
+            case DIV_S:
+                a.divsd(regs_float[reg_no], m);
+                return true;
+            case MAX:
+                a.maxsd(regs_float[reg_no], m);
+                return true;
+            case MIN:
+                a.minsd(regs_float[reg_no], m);
+                return true;
+            default:
+                break;
+        }
+    }
+
+    /* Only reached for an opcode that none of the cases above handles */
+    bh_assert(0);
+    return false;
+}
+
+/**
+ * Encode float alu operation of imm and reg, and save result to reg
+ *
+ * The register operand is first stored to the jit_cache field of
+ * WASMExecEnv, the immediate is then loaded into the destination register,
+ * and finally the operation is emitted with the cached value as the memory
+ * operand. Because the store happens before the destination is written,
+ * the second operand survives even when reg_no_dst == reg_no2_src.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param data1_src the first src immediate data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+alu_imm_r_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                   float data1_src, int32 reg_no2_src)
+{
+    const JitHardRegInfo *hreg_info = jit_codegen_get_hreg_info();
+    /* xmm -> m128 */
+    x86::Mem cache = x86::xmmword_ptr(regs_i64[hreg_info->exec_env_hreg_index],
+                                      offsetof(WASMExecEnv, jit_cache));
+    a.movups(cache, regs_float[reg_no2_src]);
+
+    /* imm -> gp -> xmm; do not emit the ALU instruction if this fails */
+    if (!mov_imm_to_r_f32(a, reg_no_dst, data1_src))
+        return false;
+
+    return alu_r_m_float(a, op, reg_no_dst, cache, true);
+}
+
+/**
+ * Emit a float ALU operation whose first operand is a register and whose
+ * second operand is an immediate, leaving the result in a register
+ *
+ * The bit pattern of the immediate is written to the jit_cache field of
+ * WASMExecEnv (addressed through the exec_env hard register), the first
+ * operand is copied into the destination register, and the operation is
+ * then emitted with the cached value as its memory operand.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the first source operand
+ * @param data2_src the immediate used as the second operand
+ *
+ * @return the value returned by alu_r_m_float()
+ */
+static bool
+alu_r_imm_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                   int32 reg_no1_src, float data2_src)
+{
+    /* Reinterpret the float as its raw integer bits */
+    cast_float_to_integer bits = { .f = data2_src };
+    Imm bits_imm(bits.i);
+    const JitHardRegInfo *info = jit_codegen_get_hreg_info();
+    x86::Mem scratch = x86::dword_ptr(regs_i64[info->exec_env_hreg_index],
+                                      offsetof(WASMExecEnv, jit_cache));
+
+    /* imm -> m32 */
+    mov_imm_to_m(a, scratch, bits_imm, 4);
+
+    /* src1 -> dst, then dst = dst op [scratch] */
+    mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);
+
+    const bool emitted = alu_r_m_float(a, op, reg_no_dst, scratch, true);
+    return emitted;
+}
+
+/**
+ * Emit a float ALU operation whose two operands are registers, leaving
+ * the result in a register
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation (ADD, SUB, MUL, DIV_S, MAX, MIN)
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the first source operand
+ * @param reg_no2_src the reg no of the second source operand
+ *
+ * @return true if success, false if op is not supported
+ */
+static bool
+alu_r_r_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                 int32 reg_no1_src, int32 reg_no2_src)
+{
+    /*
+     * The emitted instructions compute "first = first op second", so the
+     * first source is copied into a working register beforehand:
+     *  - destination differs from the second source: the destination
+     *    itself is the working register and receives the result directly
+     *  - destination is the second source: copying the first source into
+     *    it would overwrite the second one, so the free register is the
+     *    working register and its content is moved to the destination
+     *    afterwards
+     */
+    const bool dst_is_src2 = (reg_no_dst == reg_no2_src);
+    const int32 work_reg = dst_is_src2 ? REG_F32_FREE_IDX : reg_no_dst;
+
+    mov_r_to_r_f32(a, work_reg, reg_no1_src);
+
+    switch (op) {
+        case ADD:
+            a.addss(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        case SUB:
+            a.subss(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        case MUL:
+            a.mulss(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        case DIV_S:
+            a.divss(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        case MAX:
+            a.maxss(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        case MIN:
+            a.minss(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        default:
+            bh_assert(0);
+            return false;
+    }
+
+    if (dst_is_src2)
+        mov_r_to_r_f32(a, reg_no2_src, REG_F32_FREE_IDX);
+
+    return true;
+}
+
+/**
+ * Fold a double ALU operation of two immediates at code generation time,
+ * and load the folded constant into a register
+ *
+ * The arithmetic is performed on the host; the only code emitted is the
+ * constant move done by mov_imm_to_r_f64().
+ *
+ * NOTE(review): MAX/MIN are folded with fmax()/fmin(), which return the
+ * non-NaN operand when exactly one operand is NaN - confirm this is the
+ * behavior the callers expect.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation (ADD, SUB, MUL, DIV_S, MAX, MIN)
+ * @param reg_no_dst the no of the destination register
+ * @param data1_src the first src immediate data
+ * @param data2_src the second src immediate data
+ *
+ * @return the result of mov_imm_to_r_f64(), or false if op is not one of
+ *         the operations listed above
+ */
+static bool
+alu_imm_imm_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                     double data1_src, double data2_src)
+{
+    double data = 0;
+
+    switch (op) {
+        case ADD:
+            data = data1_src + data2_src;
+            break;
+        case SUB:
+            data = data1_src - data2_src;
+            break;
+        case MUL:
+            data = data1_src * data2_src;
+            break;
+        case DIV_S:
+            data = data1_src / data2_src;
+            break;
+        case MAX:
+            data = fmax(data1_src, data2_src);
+            break;
+        case MIN:
+            data = fmin(data1_src, data2_src);
+            break;
+        default:
+            /* Unknown opcode: nothing is emitted */
+            bh_assert(0);
+            return false;
+    }
+
+    return mov_imm_to_r_f64(a, reg_no_dst, data);
+}
+
+/**
+ * Encode double alu operation of imm and reg, and save result to reg
+ *
+ * The register operand is first stored to the jit_cache field of
+ * WASMExecEnv, the immediate is then loaded into the destination register,
+ * and finally the operation is emitted with the cached value as the memory
+ * operand. Because the store happens before the destination is written,
+ * the second operand survives even when reg_no_dst == reg_no2_src.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of register
+ * @param data1_src the first src immediate data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+alu_imm_r_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                   double data1_src, int32 reg_no2_src)
+{
+    const JitHardRegInfo *hreg_info = jit_codegen_get_hreg_info();
+    /*
+     * xmm -> memory. The operand is declared as a qword because that is
+     * the width the scalar instruction reads back below.
+     * NOTE(review): movupd stores the whole 128-bit register, so this
+     * assumes jit_cache provides at least 16 bytes - confirm.
+     */
+    x86::Mem cache = x86::qword_ptr(regs_i64[hreg_info->exec_env_hreg_index],
+                                    offsetof(WASMExecEnv, jit_cache));
+    a.movupd(cache, regs_float[reg_no2_src]);
+
+    /* imm -> gp -> xmm; do not emit the ALU instruction if this fails */
+    if (!mov_imm_to_r_f64(a, reg_no_dst, data1_src))
+        return false;
+
+    return alu_r_m_float(a, op, reg_no_dst, cache, false);
+}
+
+/**
+ * Emit a double ALU operation whose first operand is a register and whose
+ * second operand is an immediate, leaving the result in a register
+ *
+ * The bit pattern of the immediate is written to the jit_cache field of
+ * WASMExecEnv (addressed through the exec_env hard register), the first
+ * operand is copied into the destination register, and the operation is
+ * then emitted with the cached value as its memory operand.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the first source operand
+ * @param data2_src the immediate used as the second operand
+ *
+ * @return the value returned by alu_r_m_float()
+ */
+static bool
+alu_r_imm_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                   int32 reg_no1_src, double data2_src)
+{
+    /* Reinterpret the double as its raw integer bits */
+    cast_double_to_integer bits = { .d = data2_src };
+    Imm bits_imm(bits.i);
+    const JitHardRegInfo *info = jit_codegen_get_hreg_info();
+    x86::Mem scratch = x86::qword_ptr(regs_i64[info->exec_env_hreg_index],
+                                      offsetof(WASMExecEnv, jit_cache));
+
+    /* imm -> m64 */
+    mov_imm_to_m(a, scratch, bits_imm, 8);
+
+    /* src1 -> dst, then dst = dst op [scratch] */
+    mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);
+
+    const bool emitted = alu_r_m_float(a, op, reg_no_dst, scratch, false);
+    return emitted;
+}
+
+/**
+ * Emit a double ALU operation whose two operands are registers, leaving
+ * the result in a register
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of ALU operation (ADD, SUB, MUL, DIV_S, MAX, MIN)
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the first source operand
+ * @param reg_no2_src the reg no of the second source operand
+ *
+ * @return true if success, false if op is not supported
+ */
+static bool
+alu_r_r_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
+                 int32 reg_no1_src, int32 reg_no2_src)
+{
+    /*
+     * The emitted instructions compute "first = first op second", so the
+     * first source is copied into a working register beforehand:
+     *  - destination differs from the second source: the destination
+     *    itself is the working register and receives the result directly
+     *  - destination is the second source: copying the first source into
+     *    it would overwrite the second one, so the free register is the
+     *    working register and its content is moved to the destination
+     *    afterwards
+     */
+    const bool dst_is_src2 = (reg_no_dst == reg_no2_src);
+    const int32 work_reg = dst_is_src2 ? REG_F64_FREE_IDX : reg_no_dst;
+
+    mov_r_to_r_f64(a, work_reg, reg_no1_src);
+
+    switch (op) {
+        case ADD:
+            a.addsd(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        case SUB:
+            a.subsd(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        case MUL:
+            a.mulsd(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        case DIV_S:
+            a.divsd(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        case MAX:
+            a.maxsd(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        case MIN:
+            a.minsd(regs_float[work_reg], regs_float[reg_no2_src]);
+            break;
+        default:
+            bh_assert(0);
+            return false;
+    }
+
+    if (dst_is_src2)
+        mov_r_to_r_f64(a, reg_no2_src, REG_F64_FREE_IDX);
+
+    return true;
+}
+
+/**
+ * Emit an in-place 32-bit bitwise operation between a register and an
+ * immediate
+ *
+ * No instruction is emitted when the immediate is the identity of the
+ * operation (0 for OR/XOR, -1 for AND), and XOR with -1 is emitted as NOT.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation (OR, XOR, AND)
+ * @param reg_no the no of register, as first operand, and save result
+ * @param data the immediate data, as the second operand
+ *
+ * @return always true; an unsupported op only triggers bh_assert
+ */
+static bool
+bit_r_imm_i32(x86::Assembler &a, BIT_OP op, int32 reg_no, int32 data)
+{
+    Imm operand(data);
+
+    if (op == OR) {
+        /* x | 0 == x */
+        if (data != 0)
+            a.or_(regs_i32[reg_no], operand);
+    }
+    else if (op == XOR) {
+        if (data == -1) {
+            /* x ^ -1 == ~x */
+            a.not_(regs_i32[reg_no]);
+        }
+        else if (data != 0) {
+            a.xor_(regs_i32[reg_no], operand);
+        }
+    }
+    else if (op == AND) {
+        /* x & -1 == x */
+        if (data != -1)
+            a.and_(regs_i32[reg_no], operand);
+    }
+    else {
+        bh_assert(0);
+    }
+
+    return true;
+}
+
+/**
+ * Emit an in-place 32-bit bitwise operation between two registers
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation (OR, XOR, AND)
+ * @param reg_no_dst the no of register, as first operand, and save result
+ * @param reg_no_src the no of register, as second operand
+ *
+ * @return always true; an unsupported op only triggers bh_assert
+ */
+static bool
+bit_r_r_i32(x86::Assembler &a, BIT_OP op, int32 reg_no_dst, int32 reg_no_src)
+{
+    if (op == OR) {
+        a.or_(regs_i32[reg_no_dst], regs_i32[reg_no_src]);
+    }
+    else if (op == XOR) {
+        a.xor_(regs_i32[reg_no_dst], regs_i32[reg_no_src]);
+    }
+    else if (op == AND) {
+        a.and_(regs_i32[reg_no_dst], regs_i32[reg_no_src]);
+    }
+    else {
+        bh_assert(0);
+    }
+
+    return true;
+}
+
+/**
+ * Fold a 32-bit bitwise operation of two immediates at code generation
+ * time, and load the folded constant into a register
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation (OR, XOR, AND)
+ * @param reg_no_dst the no of the destination register
+ * @param data1_src the first src immediate data
+ * @param data2_src the second src immediate data
+ *
+ * @return always true; an unsupported op triggers bh_assert and loads 0
+ */
+static bool
+bit_imm_imm_to_r_i32(x86::Assembler &a, BIT_OP op, int32 reg_no_dst,
+                     int32 data1_src, int32 data2_src)
+{
+    /* Stays 0 when op is not recognized */
+    int32 folded = 0;
+
+    switch (op) {
+        case OR:
+            folded = data1_src | data2_src;
+            break;
+        case XOR:
+            folded = data1_src ^ data2_src;
+            break;
+        case AND:
+            folded = data1_src & data2_src;
+            break;
+        default:
+            bh_assert(0);
+            break;
+    }
+
+    Imm folded_imm(folded);
+    a.mov(regs_i32[reg_no_dst], folded_imm);
+    return true;
+}
+
+/**
+ * Emit a 32-bit bitwise operation whose first operand is an immediate and
+ * whose second operand is a register, leaving the result in a register
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation
+ * @param reg_no_dst the no of the destination register
+ * @param data1_src the immediate used as the first operand
+ * @param reg_no2_src the reg no of the second source operand
+ *
+ * @return true for the two shortcut cases, otherwise the value returned
+ *         by bit_r_imm_i32()
+ */
+static bool
+bit_imm_r_to_r_i32(x86::Assembler &a, BIT_OP op, int32 reg_no_dst,
+                   int32 data1_src, int32 reg_no2_src)
+{
+    /* 0 & x == 0: clear the destination, the source is not needed */
+    if (op == AND && data1_src == 0) {
+        a.xor_(regs_i32[reg_no_dst], regs_i32[reg_no_dst]);
+        return true;
+    }
+
+    /* -1 | x == -1: load all ones, the source is not needed */
+    if (op == OR && data1_src == -1) {
+        Imm all_ones(-1);
+        a.mov(regs_i32[reg_no_dst], all_ones);
+        return true;
+    }
+
+    /* General case: copy the register operand, then apply the immediate */
+    mov_r_to_r_i32(a, reg_no_dst, reg_no2_src);
+    return bit_r_imm_i32(a, op, reg_no_dst, data1_src);
+}
+
+/**
+ * Emit a 32-bit bitwise operation whose first operand is a register and
+ * whose second operand is an immediate, leaving the result in a register
+ *
+ * The call is forwarded to bit_imm_r_to_r_i32() with the two operands
+ * swapped.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the first source operand
+ * @param data2_src the immediate used as the second operand
+ *
+ * @return the value returned by bit_imm_r_to_r_i32()
+ */
+static bool
+bit_r_imm_to_r_i32(x86::Assembler &a, BIT_OP op, int32 reg_no_dst,
+                   int32 reg_no1_src, int32 data2_src)
+{
+    const bool emitted =
+        bit_imm_r_to_r_i32(a, op, reg_no_dst, data2_src, reg_no1_src);
+
+    return emitted;
+}
+
+/**
+ * Emit a 32-bit bitwise operation whose two operands are registers,
+ * leaving the result in a register
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the first source operand
+ * @param reg_no2_src the reg no of the second source operand
+ *
+ * @return the value returned by bit_r_r_i32()
+ */
+static bool
+bit_r_r_to_r_i32(x86::Assembler &a, BIT_OP op, int32 reg_no_dst,
+                 int32 reg_no1_src, int32 reg_no2_src)
+{
+    /* The destination already holds the second operand: combine it with
+       the first one in place instead of copying over it */
+    if (reg_no_dst == reg_no2_src)
+        return bit_r_r_i32(a, op, reg_no_dst, reg_no1_src);
+
+    /* Otherwise copy the first operand and combine with the second */
+    mov_r_to_r_i32(a, reg_no_dst, reg_no1_src);
+    return bit_r_r_i32(a, op, reg_no_dst, reg_no2_src);
+}
+
+/**
+ * Emit an in-place 64-bit bitwise operation between a register and an
+ * immediate
+ *
+ * No instruction is emitted when the immediate is the identity of the
+ * operation (0 for OR/XOR, -1 for AND), and XOR with -1 is emitted as NOT.
+ * An immediate within the int32 range is used directly as the instruction
+ * operand; any other value is first moved into the free i64 register.
+ *
+ * NOTE(review): the free i64 register is overwritten in the latter case,
+ * so presumably reg_no is never REG_I64_FREE_IDX - confirm.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation (OR, XOR, AND)
+ * @param reg_no the no of register, as first operand, and save result
+ * @param data the immediate data, as the second operand
+ *
+ * @return always true; an unsupported op only triggers bh_assert
+ */
+static bool
+bit_r_imm_i64(x86::Assembler &a, BIT_OP op, int32 reg_no, int64 data)
+{
+    bool is_identity;
+
+    switch (op) {
+        case OR:
+        case XOR:
+            is_identity = (data == 0);
+            break;
+        case AND:
+            is_identity = (data == -1LL);
+            break;
+        default:
+            bh_assert(0);
+            return true;
+    }
+
+    if (is_identity)
+        return true;
+
+    /* x ^ -1 == ~x */
+    if (op == XOR && data == -1LL) {
+        a.not_(regs_i64[reg_no]);
+        return true;
+    }
+
+    if (data >= INT32_MIN && data <= INT32_MAX) {
+        Imm imm32((int32)data);
+
+        if (op == OR)
+            a.or_(regs_i64[reg_no], imm32);
+        else if (op == XOR)
+            a.xor_(regs_i64[reg_no], imm32);
+        else
+            a.and_(regs_i64[reg_no], imm32);
+    }
+    else {
+        Imm imm64(data);
+
+        a.mov(regs_i64[REG_I64_FREE_IDX], imm64);
+        if (op == OR)
+            a.or_(regs_i64[reg_no], regs_i64[REG_I64_FREE_IDX]);
+        else if (op == XOR)
+            a.xor_(regs_i64[reg_no], regs_i64[REG_I64_FREE_IDX]);
+        else
+            a.and_(regs_i64[reg_no], regs_i64[REG_I64_FREE_IDX]);
+    }
+
+    return true;
+}
+
+/**
+ * Emit an in-place 64-bit bitwise operation between two registers
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation (OR, XOR, AND)
+ * @param reg_no_dst the no of register, as first operand, and save result
+ * @param reg_no_src the no of register, as second operand
+ *
+ * @return always true; an unsupported op only triggers bh_assert
+ */
+static bool
+bit_r_r_i64(x86::Assembler &a, BIT_OP op, int32 reg_no_dst, int32 reg_no_src)
+{
+    if (op == OR) {
+        a.or_(regs_i64[reg_no_dst], regs_i64[reg_no_src]);
+    }
+    else if (op == XOR) {
+        a.xor_(regs_i64[reg_no_dst], regs_i64[reg_no_src]);
+    }
+    else if (op == AND) {
+        a.and_(regs_i64[reg_no_dst], regs_i64[reg_no_src]);
+    }
+    else {
+        bh_assert(0);
+    }
+
+    return true;
+}
+
+/**
+ * Fold a 64-bit bitwise operation of two immediates at code generation
+ * time, and load the folded constant into a register
+ *
+ * Both immediates are taken as full 64-bit values so that the upper half
+ * of the first operand takes part in the operation.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation (OR, XOR, AND)
+ * @param reg_no_dst the no of the destination register
+ * @param data1_src the first src immediate data
+ * @param data2_src the second src immediate data
+ *
+ * @return always true; an unsupported op triggers bh_assert and loads 0
+ */
+static bool
+bit_imm_imm_to_r_i64(x86::Assembler &a, BIT_OP op, int32 reg_no_dst,
+                     int64 data1_src, int64 data2_src)
+{
+    Imm imm;
+
+    switch (op) {
+        case OR:
+            imm.setValue(data1_src | data2_src);
+            break;
+        case XOR:
+            imm.setValue(data1_src ^ data2_src);
+            break;
+        case AND:
+            imm.setValue(data1_src & data2_src);
+            break;
+        default:
+            /* imm keeps its default value */
+            bh_assert(0);
+            break;
+    }
+
+    a.mov(regs_i64[reg_no_dst], imm);
+    return true;
+}
+
+/**
+ * Emit a 64-bit bitwise operation whose first operand is an immediate and
+ * whose second operand is a register, leaving the result in a register
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation
+ * @param reg_no_dst the no of the destination register
+ * @param data1_src the immediate used as the first operand
+ * @param reg_no2_src the reg no of the second source operand
+ *
+ * @return true for the two shortcut cases, otherwise the value returned
+ *         by bit_r_imm_i64()
+ */
+static bool
+bit_imm_r_to_r_i64(x86::Assembler &a, BIT_OP op, int32 reg_no_dst,
+                   int64 data1_src, int32 reg_no2_src)
+{
+    /* 0 & x == 0: clear the destination, the source is not needed */
+    if (op == AND && data1_src == 0) {
+        a.xor_(regs_i64[reg_no_dst], regs_i64[reg_no_dst]);
+        return true;
+    }
+
+    /* -1 | x == -1: load all ones, the source is not needed */
+    if (op == OR && data1_src == -1LL) {
+        Imm all_ones(-1LL);
+        a.mov(regs_i64[reg_no_dst], all_ones);
+        return true;
+    }
+
+    /* General case: copy the register operand, then apply the immediate */
+    mov_r_to_r_i64(a, reg_no_dst, reg_no2_src);
+    return bit_r_imm_i64(a, op, reg_no_dst, data1_src);
+}
+
+/**
+ * Emit a 64-bit bitwise operation whose first operand is a register and
+ * whose second operand is an immediate, leaving the result in a register
+ *
+ * The call is forwarded to bit_imm_r_to_r_i64() with the two operands
+ * swapped.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the first source operand
+ * @param data2_src the immediate used as the second operand
+ *
+ * @return the value returned by bit_imm_r_to_r_i64()
+ */
+static bool
+bit_r_imm_to_r_i64(x86::Assembler &a, BIT_OP op, int32 reg_no_dst,
+                   int32 reg_no1_src, int64 data2_src)
+{
+    const bool emitted =
+        bit_imm_r_to_r_i64(a, op, reg_no_dst, data2_src, reg_no1_src);
+
+    return emitted;
+}
+
+/**
+ * Emit a 64-bit bitwise operation whose two operands are registers,
+ * leaving the result in a register
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BIT operation
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the first source operand
+ * @param reg_no2_src the reg no of the second source operand
+ *
+ * @return the value returned by bit_r_r_i64()
+ */
+static bool
+bit_r_r_to_r_i64(x86::Assembler &a, BIT_OP op, int32 reg_no_dst,
+                 int32 reg_no1_src, int32 reg_no2_src)
+{
+    /* The destination already holds the second operand: combine it with
+       the first one in place instead of copying over it */
+    if (reg_no_dst == reg_no2_src)
+        return bit_r_r_i64(a, op, reg_no_dst, reg_no1_src);
+
+    /* Otherwise copy the first operand and combine with the second */
+    mov_r_to_r_i64(a, reg_no_dst, reg_no1_src);
+    return bit_r_r_i64(a, op, reg_no_dst, reg_no2_src);
+}
+
+/**
+ * Fold a 32-bit shift or rotate of two immediates at code generation
+ * time, and load the folded constant into a register
+ *
+ * Only the low 5 bits of the count are used, which is also how the x86
+ * shift and rotate instructions treat the count of a 32-bit operand. The
+ * folding is done on unsigned values so that no host-side shift is
+ * undefined (negative value shifted left, count outside 0..31, or a
+ * rotate by 0 shifting by the full width).
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of SHIFT operation (SHL, SHRS, SHRU, ROTL, ROTR)
+ * @param reg_no_dst the no of the destination register
+ * @param data1_src the value to shift
+ * @param data2_src the shift count
+ *
+ * @return the result of mov_imm_to_r_i32(), or false if op is not one of
+ *         the operations listed above
+ */
+static bool
+shift_imm_imm_to_r_i32(x86::Assembler &a, SHIFT_OP op, int32 reg_no_dst,
+                       int32 data1_src, int32 data2_src)
+{
+    const uint32 count = (uint32)data2_src & 31;
+    /* Complementary count for rotates; masked so that count 0 maps to 0 */
+    const uint32 rcount = (32 - count) & 31;
+    const uint32 value = (uint32)data1_src;
+    int32 data;
+
+    switch (op) {
+        case SHL:
+            data = (int32)(value << count);
+            break;
+        case SHRS:
+            /* Signed right shift keeps the sign bit */
+            data = data1_src >> count;
+            break;
+        case SHRU:
+            data = (int32)(value >> count);
+            break;
+        case ROTL:
+            data = (int32)((value << count) | (value >> rcount));
+            break;
+        case ROTR:
+            data = (int32)((value >> count) | (value << rcount));
+            break;
+        default:
+            bh_assert(0);
+            return false;
+    }
+
+    return mov_imm_to_r_i32(a, reg_no_dst, data);
+}
+
+/**
+ * Placeholder for a 32-bit shift whose value is an immediate and whose
+ * count is in a register
+ *
+ * No code is emitted. This operand combination is expected to have been
+ * rewritten by the earlier lowering step, so reaching this function
+ * triggers bh_assert.
+ *
+ * @param a the assembler to emit the code (unused)
+ * @param op the opcode of SHIFT operation (unused)
+ * @param reg_no_dst the no of the destination register (unused)
+ * @param data1_src the first src immediate data (unused)
+ * @param reg_no2_src the reg no of second src register data (unused)
+ *
+ * @return always false
+ */
+static bool
+shift_imm_r_to_r_i32(x86::Assembler &a, SHIFT_OP op, int32 reg_no_dst,
+                     int32 data1_src, int32 reg_no2_src)
+{
+    /* Silence unused-parameter warnings */
+    (void)reg_no2_src;
+    (void)data1_src;
+    (void)reg_no_dst;
+    (void)op;
+    (void)a;
+
+    /* Should have been optimized by previous lower */
+    bh_assert(0);
+    return false;
+}
+
+/**
+ * Emit a 32-bit shift or rotate of a register by an immediate count,
+ * leaving the result in a register
+ *
+ * The value is first copied into the destination register, which is then
+ * shifted in place. The count is truncated to 8 bits before being used as
+ * the instruction's immediate operand.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of SHIFT operation (SHL, SHRS, SHRU, ROTL, ROTR)
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the value to shift
+ * @param data2_src the shift count
+ *
+ * @return true if success, false if op is not supported
+ */
+static bool
+shift_r_imm_to_r_i32(x86::Assembler &a, SHIFT_OP op, int32 reg_no_dst,
+                     int32 reg_no1_src, int32 data2_src)
+{
+    /* SHL/SAR/SHR/ROL/ROR r/m32, imm8 */
+    Imm count((uint8)data2_src);
+
+    mov_r_to_r_i32(a, reg_no_dst, reg_no1_src);
+
+    if (op == SHL) {
+        a.shl(regs_i32[reg_no_dst], count);
+    }
+    else if (op == SHRS) {
+        a.sar(regs_i32[reg_no_dst], count);
+    }
+    else if (op == SHRU) {
+        a.shr(regs_i32[reg_no_dst], count);
+    }
+    else if (op == ROTL) {
+        a.rol(regs_i32[reg_no_dst], count);
+    }
+    else if (op == ROTR) {
+        a.ror(regs_i32[reg_no_dst], count);
+    }
+    else {
+        bh_assert(0);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Emit a 32-bit shift or rotate of a register by a count held in CL,
+ * leaving the result in a register
+ *
+ * The value is first copied into the destination register, which is then
+ * shifted in place by CL.
+ *
+ * NOTE(review): the copy is emitted before the shift, so a destination of
+ * ECX with a different value register would overwrite the count;
+ * presumably callers never request that combination - confirm.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of shift operation (SHL, SHRS, SHRU, ROTL, ROTR)
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the value to shift
+ * @param reg_no2_src the reg no of the shift count, must be REG_ECX_IDX
+ *
+ * @return true if success, false if the count is not in ECX or op is not
+ *         supported
+ */
+static bool
+shift_r_r_to_r_i32(x86::Assembler &a, SHIFT_OP op, int32 reg_no_dst,
+                   int32 reg_no1_src, int32 reg_no2_src)
+{
+    /* The emitted instructions take their count from CL only */
+    const bool count_in_cl = (reg_no2_src == REG_ECX_IDX);
+
+    if (!count_in_cl)
+        return false;
+
+    mov_r_to_r_i32(a, reg_no_dst, reg_no1_src);
+
+    if (op == SHL) {
+        a.shl(regs_i32[reg_no_dst], x86::cl);
+    }
+    else if (op == SHRS) {
+        a.sar(regs_i32[reg_no_dst], x86::cl);
+    }
+    else if (op == SHRU) {
+        a.shr(regs_i32[reg_no_dst], x86::cl);
+    }
+    else if (op == ROTL) {
+        a.rol(regs_i32[reg_no_dst], x86::cl);
+    }
+    else if (op == ROTR) {
+        a.ror(regs_i32[reg_no_dst], x86::cl);
+    }
+    else {
+        bh_assert(0);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Fold a 64-bit shift or rotate of two immediates at code generation
+ * time, and load the folded constant into a register
+ *
+ * Only the low 6 bits of the count are used, which is also how the x86
+ * shift and rotate instructions treat the count of a 64-bit operand. The
+ * folding is done on unsigned values so that no host-side shift is
+ * undefined (negative value shifted left, count outside 0..63, or a
+ * rotate by 0 shifting by the full width).
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of SHIFT operation (SHL, SHRS, SHRU, ROTL, ROTR)
+ * @param reg_no_dst the no of the destination register
+ * @param data1_src the value to shift
+ * @param data2_src the shift count
+ *
+ * @return the result of mov_imm_to_r_i64(), or false if op is not one of
+ *         the operations listed above
+ */
+static bool
+shift_imm_imm_to_r_i64(x86::Assembler &a, SHIFT_OP op, int32 reg_no_dst,
+                       int64 data1_src, int64 data2_src)
+{
+    const uint32 count = (uint32)((uint64)data2_src & 63);
+    /* Complementary count for rotates; masked so that count 0 maps to 0 */
+    const uint32 rcount = (64 - count) & 63;
+    const uint64 value = (uint64)data1_src;
+    int64 data;
+
+    switch (op) {
+        case SHL:
+            data = (int64)(value << count);
+            break;
+        case SHRS:
+            /* Signed right shift keeps the sign bit */
+            data = data1_src >> count;
+            break;
+        case SHRU:
+            data = (int64)(value >> count);
+            break;
+        case ROTL:
+            data = (int64)((value << count) | (value >> rcount));
+            break;
+        case ROTR:
+            data = (int64)((value >> count) | (value << rcount));
+            break;
+        default:
+            bh_assert(0);
+            return false;
+    }
+
+    return mov_imm_to_r_i64(a, reg_no_dst, data);
+}
+
+/**
+ * Placeholder for a 64-bit shift whose value is an immediate and whose
+ * count is in a register
+ *
+ * No code is emitted. This operand combination is expected to have been
+ * rewritten by the earlier lowering step, so reaching this function
+ * triggers bh_assert.
+ *
+ * @param a the assembler to emit the code (unused)
+ * @param op the opcode of SHIFT operation (unused)
+ * @param reg_no_dst the no of the destination register (unused)
+ * @param data1_src the first src immediate data (unused)
+ * @param reg_no2_src the reg no of second src register data (unused)
+ *
+ * @return always false
+ */
+static bool
+shift_imm_r_to_r_i64(x86::Assembler &a, SHIFT_OP op, int32 reg_no_dst,
+                     int64 data1_src, int32 reg_no2_src)
+{
+    /* Silence unused-parameter warnings */
+    (void)reg_no2_src;
+    (void)data1_src;
+    (void)reg_no_dst;
+    (void)op;
+    (void)a;
+
+    /* Should have been optimized by previous lower */
+    bh_assert(0);
+    return false;
+}
+
+/**
+ * Emit a 64-bit shift or rotate of a register by an immediate count,
+ * leaving the result in a register
+ *
+ * The value is first copied into the destination register, which is then
+ * shifted in place. The count is truncated to 8 bits before being used as
+ * the instruction's immediate operand.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of SHIFT operation (SHL, SHRS, SHRU, ROTL, ROTR)
+ * @param reg_no_dst the no of the destination register
+ * @param reg_no1_src the reg no of the value to shift
+ * @param data2_src the shift count
+ *
+ * @return true if success, false if op is not supported
+ */
+static bool
+shift_r_imm_to_r_i64(x86::Assembler &a, SHIFT_OP op, int32 reg_no_dst,
+                     int32 reg_no1_src, int64 data2_src)
+{
+    /* SHL/SAR/SHR/ROL/ROR r/m64, imm8 */
+    Imm count((uint8)data2_src);
+
+    mov_r_to_r_i64(a, reg_no_dst, reg_no1_src);
+
+    if (op == SHL) {
+        a.shl(regs_i64[reg_no_dst], count);
+    }
+    else if (op == SHRS) {
+        a.sar(regs_i64[reg_no_dst], count);
+    }
+    else if (op == SHRU) {
+        a.shr(regs_i64[reg_no_dst], count);
+    }
+    else if (op == ROTL) {
+        a.rol(regs_i64[reg_no_dst], count);
+    }
+    else if (op == ROTR) {
+        a.ror(regs_i64[reg_no_dst], count);
+    }
+    else {
+        bh_assert(0);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Encode int64 shift operation of reg and reg, and save result to reg
+ *
+ * The value is first copied into the destination register, which is then
+ * shifted in place by CL.
+ *
+ * NOTE(review): the copy is emitted before the shift, so a destination of
+ * RCX with a different value register would overwrite the count;
+ * presumably callers never request that combination - confirm.
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of shift operation (SHL, SHRS, SHRU, ROTL, ROTR)
+ * @param reg_no_dst the no of register
+ * @param reg_no1_src the reg no of first src register data
+ * @param reg_no2_src the reg no of second src register data, must be
+ *        REG_RCX_IDX
+ *
+ * @return true if success, false if the count is not in RCX or op is not
+ *         supported
+ */
+static bool
+shift_r_r_to_r_i64(x86::Assembler &a, SHIFT_OP op, int32 reg_no_dst,
+                   int32 reg_no1_src, int32 reg_no2_src)
+{
+    /* should be CL; i64 register numbers are indexes of the i64 table,
+       so compare against the i64 enumerator */
+    if (reg_no2_src != REG_RCX_IDX)
+        return false;
+
+    mov_r_to_r_i64(a, reg_no_dst, reg_no1_src);
+
+    switch (op) {
+        case SHL:
+            a.shl(regs_i64[reg_no_dst], x86::cl);
+            break;
+        case SHRS:
+            a.sar(regs_i64[reg_no_dst], x86::cl);
+            break;
+        case SHRU:
+            a.shr(regs_i64[reg_no_dst], x86::cl);
+            break;
+        case ROTL:
+            a.rol(regs_i64[reg_no_dst], x86::cl);
+            break;
+        case ROTR:
+            a.ror(regs_i64[reg_no_dst], x86::cl);
+            break;
+        default:
+            bh_assert(0);
+            return false;
+    }
+
+    return true;
+}
+
+/**
+ * Emit a 32-bit integer comparison of two immediates
+ *
+ * The first immediate is loaded into the free i32 register, which is then
+ * compared with the second immediate by a cmp instruction. The destination
+ * register is not written.
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of the destination register (unused)
+ * @param data1_src the first src immediate data
+ * @param data2_src the second src immediate data
+ *
+ * @return always true
+ */
+static bool
+cmp_imm_imm_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 data1_src,
+                     int32 data2_src)
+{
+    Imm lhs(data1_src);
+    Imm rhs(data2_src);
+
+    (void)reg_no_dst;
+
+    /* cmp needs a register as its first operand */
+    a.mov(regs_i32[REG_I32_FREE_IDX], lhs);
+    a.cmp(regs_i32[REG_I32_FREE_IDX], rhs);
+
+    return true;
+}
+
+/**
+ * Encode int32 cmp operation of imm and reg, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param data1_src the first src immediate data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cmp_imm_r_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 data1_src,
+                   int32 reg_no2_src)
+{
+    /* the immediate is the left operand, load it into the free reg first */
+    const auto &reg_free = regs_i32[REG_I32_FREE_IDX];
+    (void)reg_no_dst;
+    a.mov(reg_free, Imm(data1_src));
+    a.cmp(reg_free, regs_i32[reg_no2_src]);
+    return true;
+}
+
+/**
+ * Encode int32 cmp operation of reg and imm, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param reg_no1_src the reg no of first src register data
+ * @param data2_src the second src immediate data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cmp_r_imm_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no1_src,
+                   int32 data2_src)
+{
+    /* only the comparison is emitted, reg_no_dst is left untouched */
+    (void)reg_no_dst;
+
+    a.cmp(regs_i32[reg_no1_src], Imm(data2_src));
+    return true;
+}
+
+/**
+ * Encode int32 cmp operation of reg and reg, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param reg_no1_src the reg no of first src register data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cmp_r_r_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no1_src,
+                 int32 reg_no2_src)
+{
+    /* only the comparison is emitted, reg_no_dst is left untouched */
+    (void)reg_no_dst;
+    a.cmp(regs_i32[reg_no1_src], regs_i32[reg_no2_src]);
+    return true;
+}
+
+/**
+ * Encode int64 cmp operation of imm and imm, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param data1_src the first src immediate data (int32)
+ * @param data2_src the second src immediate data (int32)
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cmp_imm_imm_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 data1_src,
+                     int32 data2_src)
+{
+    /* NOTE(review): both immediates are int32 here while the other i64 cmp
+       encoders take int64, confirm that callers never pass wider values */
+    const auto &reg_free = regs_i64[REG_I64_FREE_IDX];
+    (void)reg_no_dst;
+    a.mov(reg_free, Imm(data1_src));
+    a.cmp(reg_free, Imm(data2_src));
+    return true;
+}
+
+/**
+ * Encode int64 cmp operation of imm and reg, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param data1_src the first src immediate data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cmp_imm_r_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int64 data1_src,
+                   int32 reg_no2_src)
+{
+    /* the immediate is the left operand, load it into the free reg first */
+    const auto &reg_free = regs_i64[REG_I64_FREE_IDX];
+    (void)reg_no_dst;
+    a.mov(reg_free, Imm(data1_src));
+    a.cmp(reg_free, regs_i64[reg_no2_src]);
+    return true;
+}
+
+/**
+ * Encode int64 cmp operation of reg and imm, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param reg_no1_src the reg no of first src register data
+ * @param data2_src the second src immediate data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cmp_r_imm_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no1_src,
+                   int64 data2_src)
+{
+    const auto &reg_src = regs_i64[reg_no1_src];
+
+    (void)reg_no_dst;
+    if (data2_src < INT32_MIN || data2_src > INT32_MAX) {
+        /* out of the int32 range, pass the immediate through the free reg */
+        const auto &reg_free = regs_i64[REG_I64_FREE_IDX];
+        a.mov(reg_free, Imm(data2_src));
+        a.cmp(reg_src, reg_free);
+        return true;
+    }
+
+    a.cmp(reg_src, Imm((int32)data2_src));
+    return true;
+}
+
+/**
+ * Encode int64 cmp operation of reg and reg, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param reg_no1_src the reg no of first src register data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cmp_r_r_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no1_src,
+                 int32 reg_no2_src)
+{
+    /* only the comparison is emitted, reg_no_dst is left untouched */
+    (void)reg_no_dst;
+    a.cmp(regs_i64[reg_no1_src], regs_i64[reg_no2_src]);
+    return true;
+}
+
+/**
+ * Encode float cmp operation of reg and reg, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param reg_no1_src the reg no of first src register data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cmp_r_r_to_r_f32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no1_src,
+                 int32 reg_no2_src)
+{
+    /* only the comiss comparison is emitted, reg_no_dst is left untouched */
+    (void)reg_no_dst;
+    a.comiss(regs_float[reg_no1_src], regs_float[reg_no2_src]);
+    return true;
+}
+
+/**
+ * Encode float cmp operation of imm and imm, which is not supported here
+ *
+ * @param a the assembler to emit the code (unused)
+ * @param reg_no_dst the no of dst register (unused)
+ * @param data1_src the first src immediate data (unused)
+ * @param data2_src the second src immediate data (unused)
+ *
+ * @return always false
+ */
+static bool
+cmp_imm_imm_to_r_f32(x86::Assembler &a, int32 reg_no_dst, float data1_src,
+                     float data2_src)
+{
+    (void)a;
+    (void)reg_no_dst;
+    (void)data1_src;
+    (void)data2_src;
+
+    /* should have been optimized in the frontend */
+    bh_assert(0);
+    return false;
+}
+
+/**
+ * Encode float cmp operation of imm and reg, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param data1_src the first src immediate data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false if the immediate cannot be loaded
+ */
+static bool
+cmp_imm_r_to_r_f32(x86::Assembler &a, int32 reg_no_dst, float data1_src,
+                   int32 reg_no2_src)
+{
+    (void)reg_no_dst;
+    if (!mov_imm_to_r_f32(a, REG_F32_FREE_IDX, data1_src))
+        return false;
+    a.comiss(regs_float[REG_F32_FREE_IDX], regs_float[reg_no2_src]);
+    return true;
+}
+
+/**
+ * Encode float cmp operation of reg and imm, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param reg_no1_src the reg no of first src register data
+ * @param data2_src the second src immediate data
+ *
+ * @return true if success, false if the immediate cannot be loaded
+ */
+static bool
+cmp_r_imm_to_r_f32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no1_src,
+                   float data2_src)
+{
+    (void)reg_no_dst;
+    if (!mov_imm_to_r_f32(a, REG_F32_FREE_IDX, data2_src))
+        return false;
+    a.comiss(regs_float[reg_no1_src], regs_float[REG_F32_FREE_IDX]);
+    return true;
+}
+
+/**
+ * Encode double cmp operation of reg and reg, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param reg_no1_src the reg no of first src register data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cmp_r_r_to_r_f64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no1_src,
+                 int32 reg_no2_src)
+{
+    /* only the comisd comparison is emitted, reg_no_dst is left untouched */
+    (void)reg_no_dst;
+    a.comisd(regs_float[reg_no1_src], regs_float[reg_no2_src]);
+    return true;
+}
+
+/**
+ * Encode double cmp operation of imm and imm, which is not supported here
+ *
+ * @param a the assembler to emit the code (unused)
+ * @param reg_no_dst the no of dst register (unused)
+ * @param data1_src the first src immediate data (unused)
+ * @param data2_src the second src immediate data (unused)
+ *
+ * @return always false
+ */
+static bool
+cmp_imm_imm_to_r_f64(x86::Assembler &a, int32 reg_no_dst, double data1_src,
+                     double data2_src)
+{
+    (void)a;
+    (void)reg_no_dst;
+    (void)data1_src;
+    (void)data2_src;
+
+    /* should have been optimized in the frontend */
+    bh_assert(0);
+    return false;
+}
+
+/**
+ * Encode double cmp operation of imm and reg, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param data1_src the first src immediate data
+ * @param reg_no2_src the reg no of second src register data
+ *
+ * @return true if success, false if the immediate cannot be loaded
+ */
+static bool
+cmp_imm_r_to_r_f64(x86::Assembler &a, int32 reg_no_dst, double data1_src,
+                   int32 reg_no2_src)
+{
+    (void)reg_no_dst;
+    if (!mov_imm_to_r_f64(a, REG_F64_FREE_IDX, data1_src))
+        return false;
+    a.comisd(regs_float[REG_F64_FREE_IDX], regs_float[reg_no2_src]);
+    return true;
+}
+
+/**
+ * Encode double cmp operation of reg and imm, reg_no_dst is not written
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst register (unused)
+ * @param reg_no1_src the reg no of first src register data
+ * @param data2_src the second src immediate data
+ *
+ * @return true if success, false if the immediate cannot be loaded
+ */
+static bool
+cmp_r_imm_to_r_f64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no1_src,
+                   double data2_src)
+{
+    (void)reg_no_dst;
+    if (!mov_imm_to_r_f64(a, REG_F64_FREE_IDX, data2_src))
+        return false;
+    a.comisd(regs_float[reg_no1_src], regs_float[REG_F64_FREE_IDX]);
+    return true;
+}
+
+/**
+ * Encode insn ld: LD_type r0, r1, r2 (r0: dst reg, r1: base, r2: offset)
+ * @param kind the data kind, such as I32, I64, F32 and F64
+ * @param bytes_dst the byte number of dst data
+ * @param is_signed the data is signed or unsigned
+ */
+#define LD_R_R_R(kind, bytes_dst, is_signed)                                  \
+    do {                                                                      \
+        int32 reg_no_dst = 0, reg_no_base = 0, reg_no_offset = 0;             \
+        int32 base = 0, offset = 0;                                           \
+        bool _ret = false;                                                    \
+        /* base (r1) and offset (r2): I32 expected if const, else I64 */      \
+        if (jit_reg_is_const(r1)) {                                           \
+            CHECK_KIND(r1, JIT_REG_KIND_I32);                                 \
+        }                                                                     \
+        else {                                                                \
+            CHECK_KIND(r1, JIT_REG_KIND_I64);                                 \
+        }                                                                     \
+        if (jit_reg_is_const(r2)) {                                           \
+            CHECK_KIND(r2, JIT_REG_KIND_I32);                                 \
+        }                                                                     \
+        else {                                                                \
+            CHECK_KIND(r2, JIT_REG_KIND_I64);                                 \
+        }                                                                     \
+        /* collect the register numbers or the constant values */             \
+        reg_no_dst = jit_reg_no(r0);                                          \
+        CHECK_REG_NO(reg_no_dst, jit_reg_kind(r0));                           \
+        if (jit_reg_is_const(r1))                                             \
+            base = jit_cc_get_const_I32(cc, r1);                              \
+        else {                                                                \
+            reg_no_base = jit_reg_no(r1);                                     \
+            CHECK_REG_NO(reg_no_base, jit_reg_kind(r1));                      \
+        }                                                                     \
+        if (jit_reg_is_const(r2))                                             \
+            offset = jit_cc_get_const_I32(cc, r2);                            \
+        else {                                                                \
+            reg_no_offset = jit_reg_no(r2);                                   \
+            CHECK_REG_NO(reg_no_offset, jit_reg_kind(r2));                    \
+        }                                                                     \
+        /* dispatch on which of base and offset are constants */              \
+        if (jit_reg_is_const(r1)) {                                           \
+            if (jit_reg_is_const(r2))                                         \
+                _ret = ld_r_from_base_imm_offset_imm(                         \
+                    a, bytes_dst, JIT_REG_KIND_##kind, is_signed, reg_no_dst, \
+                    base, offset);                                            \
+            else                                                              \
+                _ret = ld_r_from_base_imm_offset_r(                           \
+                    a, bytes_dst, JIT_REG_KIND_##kind, is_signed, reg_no_dst, \
+                    base, reg_no_offset);                                     \
+        }                                                                     \
+        else if (jit_reg_is_const(r2))                                        \
+            _ret = ld_r_from_base_r_offset_imm(                               \
+                a, bytes_dst, JIT_REG_KIND_##kind, is_signed, reg_no_dst,     \
+                reg_no_base, offset);                                         \
+        else                                                                  \
+            _ret = ld_r_from_base_r_offset_r(                                 \
+                a, bytes_dst, JIT_REG_KIND_##kind, is_signed, reg_no_dst,     \
+                reg_no_base, reg_no_offset);                                  \
+        if (!_ret)                                                            \
+            GOTO_FAIL;                                                        \
+    } while (0)
+
+/**
+ * Encode insn st: ST_type r0, r1, r2
+ * @param kind, type the data kind (such as I32, I64, F32 and F64) and C type
+ * @param bytes_dst the byte number of dst data
+ * @param atomic whether it's atomic store
+ */
+#define ST_R_R_R(kind, type, bytes_dst, atomic)                                \
+    do {                                                                       \
+        type data_src = 0;                                                     \
+        int32 reg_no_src = 0, reg_no_base = 0, reg_no_offset = 0;              \
+        int32 base = 0, offset = 0;                                            \
+        bool _ret = false;                                                     \
+        /* base (r1) and offset (r2): I32 expected if const, else I64 */       \
+        if (jit_reg_is_const(r1)) {                                            \
+            CHECK_KIND(r1, JIT_REG_KIND_I32);                                  \
+        }                                                                      \
+        else {                                                                 \
+            CHECK_KIND(r1, JIT_REG_KIND_I64);                                  \
+        }                                                                      \
+        if (jit_reg_is_const(r2)) {                                            \
+            CHECK_KIND(r2, JIT_REG_KIND_I32);                                  \
+        }                                                                      \
+        else {                                                                 \
+            CHECK_KIND(r2, JIT_REG_KIND_I64);                                  \
+        }                                                                      \
+        /* r0 is the data to store: a const of the given kind or a register */ \
+        if (jit_reg_is_const(r0))                                              \
+            data_src = jit_cc_get_const_##kind(cc, r0);                        \
+        else {                                                                 \
+            reg_no_src = jit_reg_no(r0);                                       \
+            CHECK_REG_NO(reg_no_src, jit_reg_kind(r0));                        \
+        }                                                                      \
+        if (jit_reg_is_const(r1))                                              \
+            base = jit_cc_get_const_I32(cc, r1);                               \
+        else {                                                                 \
+            reg_no_base = jit_reg_no(r1);                                      \
+            CHECK_REG_NO(reg_no_base, jit_reg_kind(r1));                       \
+        }                                                                      \
+        if (jit_reg_is_const(r2))                                              \
+            offset = jit_cc_get_const_I32(cc, r2);                             \
+        else {                                                                 \
+            reg_no_offset = jit_reg_no(r2);                                    \
+            CHECK_REG_NO(reg_no_offset, jit_reg_kind(r2));                     \
+        }                                                                      \
+        /* dispatch on which of src, base and offset are constants */          \
+        if (jit_reg_is_const(r0)) {                                            \
+            if (jit_reg_is_const(r1)) {                                        \
+                if (jit_reg_is_const(r2))                                      \
+                    _ret = st_imm_to_base_imm_offset_imm(                      \
+                        a, bytes_dst, &data_src, base, offset, atomic);        \
+                else                                                           \
+                    _ret = st_imm_to_base_imm_offset_r(                        \
+                        a, bytes_dst, &data_src, base, reg_no_offset, atomic); \
+            }                                                                  \
+            else if (jit_reg_is_const(r2))                                     \
+                _ret = st_imm_to_base_r_offset_imm(                            \
+                    a, bytes_dst, &data_src, reg_no_base, offset, atomic);     \
+            else                                                               \
+                _ret = st_imm_to_base_r_offset_r(a, bytes_dst, &data_src,      \
+                                                 reg_no_base, reg_no_offset,   \
+                                                 atomic);                      \
+        }                                                                      \
+        else if (jit_reg_is_const(r1)) {                                       \
+            if (jit_reg_is_const(r2))                                          \
+                _ret = st_r_to_base_imm_offset_imm(                            \
+                    a, bytes_dst, JIT_REG_KIND_##kind, reg_no_src, base,       \
+                    offset, atomic);                                           \
+            else                                                               \
+                _ret = st_r_to_base_imm_offset_r(                              \
+                    a, bytes_dst, JIT_REG_KIND_##kind, reg_no_src, base,       \
+                    reg_no_offset, atomic);                                    \
+        }                                                                      \
+        else if (jit_reg_is_const(r2))                                         \
+            _ret = st_r_to_base_r_offset_imm(a, bytes_dst,                     \
+                                             JIT_REG_KIND_##kind, reg_no_src,  \
+                                             reg_no_base, offset, atomic);     \
+        else                                                                   \
+            _ret = st_r_to_base_r_offset_r(a, bytes_dst, JIT_REG_KIND_##kind,  \
+                                           reg_no_src, reg_no_base,            \
+                                           reg_no_offset, atomic);             \
+        if (!_ret)                                                             \
+            GOTO_FAIL;                                                         \
+    } while (0)
+
+/**
+ * Encode insn mov: MOV r0, r1, with cc, a, r0 and r1 of the enclosing scope
+ * (r0: dst register, r1: src, either a const or a register)
+ * @param kind the data kind, such as I32, I64, F32 and F64
+ * @param Type the data type, such as int32, int64, float32, and float64
+ * @param type the abbreviation of data type, such as i32, i64, f32, and f64
+ */
+#define MOV_R_R(kind, Type, type)                                \
+    do {                                                         \
+        bool _ret = false;                                       \
+        int32 reg_no_dst = 0, reg_no_src = 0;                    \
+        CHECK_EQKIND(r0, r1);                                    \
+                                                                 \
+        CHECK_NCONST(r0);                                        \
+        reg_no_dst = jit_reg_no(r0);                             \
+        CHECK_REG_NO(reg_no_dst, jit_reg_kind(r0));              \
+                                                                 \
+        if (jit_reg_is_const(r1)) {                              \
+            Type data = jit_cc_get_const_##kind(cc, r1);         \
+            _ret = mov_imm_to_r_##type(a, reg_no_dst, data);     \
+        }                                                        \
+        else {                                                   \
+            reg_no_src = jit_reg_no(r1);                         \
+            CHECK_REG_NO(reg_no_src, jit_reg_kind(r1));          \
+            _ret = mov_r_to_r_##type(a, reg_no_dst, reg_no_src); \
+        }                                                        \
+        if (!_ret)                                               \
+            GOTO_FAIL;                                           \
+    } while (0)
+
+/**
+ * Lower the MOV r0, r1 insn according to the kind of the dst register
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param r0 the dst jit register
+ * @param r1 the src jit register, it can be a const or a register
+ * @return true if success, false if the kind is invalid or encoding failed
+ */
+static bool
+lower_mov(JitCompContext *cc, x86::Assembler &a, JitReg r0, JitReg r1)
+{
+    const auto dst_kind = jit_reg_kind(r0);
+    switch (dst_kind) {
+        case JIT_REG_KIND_I32:
+            MOV_R_R(I32, int32, i32);
+            break;
+        case JIT_REG_KIND_I64:
+            MOV_R_R(I64, int64, i64);
+            break;
+        case JIT_REG_KIND_F32:
+            MOV_R_R(F32, float32, f32);
+            break;
+        case JIT_REG_KIND_F64:
+            MOV_R_R(F64, float64, f64);
+            break;
+        default:
+            LOG_VERBOSE("Invalid reg type of mov: %d\n", dst_kind);
+            GOTO_FAIL;
+    }
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode insn neg: NEG r0, r1 (r0: dst register, r1: src const or register)
+ * @param kind the data kind, such as I32, I64, F32 and F64
+ * @param Type the data type, such as int32, int64, float32, and float64
+ * @param type the abbreviation of data type, such as i32, i64, f32, and f64
+ */
+#define NEG_R_R(kind, Type, type)                                \
+    do {                                                         \
+        bool _ret = false;                                       \
+        int32 reg_no_dst = 0, reg_no_src = 0;                    \
+        CHECK_EQKIND(r0, r1);                                    \
+                                                                 \
+        CHECK_NCONST(r0);                                        \
+        reg_no_dst = jit_reg_no(r0);                             \
+        CHECK_REG_NO(reg_no_dst, jit_reg_kind(r0));              \
+                                                                 \
+        if (jit_reg_is_const(r1)) {                              \
+            Type data = jit_cc_get_const_##kind(cc, r1);         \
+            _ret = neg_imm_to_r_##type(a, reg_no_dst, data);     \
+        }                                                        \
+        else {                                                   \
+            reg_no_src = jit_reg_no(r1);                         \
+            CHECK_REG_NO(reg_no_src, jit_reg_kind(r1));          \
+            _ret = neg_r_to_r_##type(a, reg_no_dst, reg_no_src); \
+        }                                                        \
+        if (!_ret)                                               \
+            GOTO_FAIL;                                           \
+    } while (0)
+
+/**
+ * Lower the NEG r0, r1 insn according to the kind of the dst register
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param r0 the dst jit register
+ * @param r1 the src jit register, it can be a const or a register
+ * @return true if success, false if the kind is invalid or encoding failed
+ */
+static bool
+lower_neg(JitCompContext *cc, x86::Assembler &a, JitReg r0, JitReg r1)
+{
+    const auto dst_kind = jit_reg_kind(r0);
+    switch (dst_kind) {
+        case JIT_REG_KIND_I32:
+            NEG_R_R(I32, int32, i32);
+            break;
+        case JIT_REG_KIND_I64:
+            NEG_R_R(I64, int64, i64);
+            break;
+        case JIT_REG_KIND_F32:
+            NEG_R_R(F32, float32, f32);
+            break;
+        case JIT_REG_KIND_F64:
+            NEG_R_R(F64, float64, f64);
+            break;
+        default:
+            LOG_VERBOSE("Invalid reg type of neg: %d\n", dst_kind);
+            GOTO_FAIL;
+    }
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode insn convert: I32TOI8 r0, r1, or I32TOI16, I32TOF32, F32TOF64, etc.
+ * @param kind0 the dst JIT_REG_KIND, such as I32, I64, F32 and F64
+ * @param kind1 the src JIT_REG_KIND, such as I32, I64, F32 and F64
+ * @param type0 the dst data type, such as i8, u8, i16, u16, i32, i64, f32, f64
+ * @param type1 the src data type, such as i8, u8, i16, u16, i32, i64, f32, f64
+ * @param Type1 the C type that holds the src data when r1 is a const, it is
+ * assigned from jit_cc_get_const_##kind1
+ */
+#define CONVERT_R_R(kind0, kind1, type0, type1, Type1)                       \
+    do {                                                                     \
+        bool _ret = false;                                                   \
+        int32 reg_no_dst = 0, reg_no_src = 0;                                \
+        CHECK_KIND(r0, JIT_REG_KIND_##kind0);                                \
+        CHECK_KIND(r1, JIT_REG_KIND_##kind1);                                \
+                                                                             \
+        CHECK_NCONST(r0);                                                    \
+        reg_no_dst = jit_reg_no(r0);                                         \
+        CHECK_REG_NO(reg_no_dst, jit_reg_kind(r0));                          \
+                                                                             \
+        if (jit_reg_is_const(r1)) {                                          \
+            Type1 data = jit_cc_get_const_##kind1(cc, r1);                   \
+            _ret = convert_imm_##type1##_to_r_##type0(a, reg_no_dst, data);  \
+        }                                                                    \
+        else {                                                               \
+            reg_no_src = jit_reg_no(r1);                                     \
+            CHECK_REG_NO(reg_no_src, jit_reg_kind(r1));                      \
+            _ret =                                                           \
+                convert_r_##type1##_to_r_##type0(a, reg_no_dst, reg_no_src); \
+        }                                                                    \
+        if (!_ret)                                                           \
+            GOTO_FAIL;                                                       \
+    } while (0)
+
+/**
+ * Encode insn alu: ADD/SUB/MUL/DIV/REM r0, r1, r2 (r0: dst, r1/r2: srcs)
+ * @param kind the data kind, such as I32, I64, F32 and F64
+ * @param Type the data type, such as int32, int64, float32, and float64
+ * @param type the abbreviation of data type, such as i32, i64, f32, and f64
+ * @param op the opcode of alu
+ */
+#define ALU_R_R_R(kind, Type, type, op)                                       \
+    do {                                                                      \
+        Type data1, data2;                                                    \
+        int32 reg_no_dst = 0, reg_no_src1 = 0, reg_no_src2 = 0;               \
+        bool _ret = false;                                                    \
+        /* r1 and r2 are checked against the kind of r0 (CHECK_EQKIND) */      \
+        CHECK_EQKIND(r0, r1);                                                 \
+        CHECK_EQKIND(r0, r2);                                                 \
+        memset(&data1, 0, sizeof(Type));                                      \
+        memset(&data2, 0, sizeof(Type));                                      \
+        /* collect the register numbers or the constant values */              \
+        reg_no_dst = jit_reg_no(r0);                                          \
+        CHECK_REG_NO(reg_no_dst, jit_reg_kind(r0));                           \
+        if (jit_reg_is_const(r1))                                             \
+            data1 = jit_cc_get_const_##kind(cc, r1);                          \
+        else {                                                                \
+            reg_no_src1 = jit_reg_no(r1);                                     \
+            CHECK_REG_NO(reg_no_src1, jit_reg_kind(r1));                      \
+        }                                                                     \
+        if (jit_reg_is_const(r2))                                             \
+            data2 = jit_cc_get_const_##kind(cc, r2);                          \
+        else {                                                                \
+            reg_no_src2 = jit_reg_no(r2);                                     \
+            CHECK_REG_NO(reg_no_src2, jit_reg_kind(r2));                      \
+        }                                                                     \
+        /* dispatch on which of the two srcs are constants */                  \
+        if (jit_reg_is_const(r1)) {                                           \
+            if (jit_reg_is_const(r2))                                         \
+                _ret =                                                        \
+                    alu_imm_imm_to_r_##type(a, op, reg_no_dst, data1, data2); \
+            else                                                              \
+                _ret = alu_imm_r_to_r_##type(a, op, reg_no_dst, data1,        \
+                                             reg_no_src2);                    \
+        }                                                                     \
+        else if (jit_reg_is_const(r2))                                        \
+            _ret =                                                            \
+                alu_r_imm_to_r_##type(a, op, reg_no_dst, reg_no_src1, data2); \
+        else                                                                  \
+            _ret = alu_r_r_to_r_##type(a, op, reg_no_dst, reg_no_src1,        \
+                                       reg_no_src2);                          \
+        if (!_ret)                                                            \
+            GOTO_FAIL;                                                        \
+    } while (0)
+
+/**
+ * Lower an alu insn, ADD/SUB/MUL/DIV/REM r0, r1, r2, by dispatching
+ * on the register kind of the dst operand
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param op the opcode of alu operations
+ * @param r0 dst jit register that receives the result
+ * @param r1 first src operand, a jit register or a constant
+ * @param r2 second src operand, a jit register or a constant
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_alu(JitCompContext *cc, x86::Assembler &a, ALU_OP op, JitReg r0,
+          JitReg r1, JitReg r2)
+{
+    if (jit_reg_kind(r0) == JIT_REG_KIND_I32) {
+        ALU_R_R_R(I32, int32, i32, op);
+    }
+    else if (jit_reg_kind(r0) == JIT_REG_KIND_I64) {
+        ALU_R_R_R(I64, int64, i64, op);
+    }
+    else if (jit_reg_kind(r0) == JIT_REG_KIND_F32) {
+        ALU_R_R_R(F32, float32, f32, op);
+    }
+    else if (jit_reg_kind(r0) == JIT_REG_KIND_F64) {
+        ALU_R_R_R(F64, float64, f64, op);
+    }
+    else {
+        /* No alu lowering exists for any other register kind */
+        LOG_VERBOSE("Invalid reg type of alu: %d\n", jit_reg_kind(r0));
+        GOTO_FAIL;
+    }
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode insn bit: AND/OR/XOR r0, r1, r2
+ *
+ * Uses cc, a, r0, r1 and r2 of the expansion site. Each src operand is
+ * either a constant or a register, and the matching
+ * bit_{imm,r}_{imm,r}_to_r_<type> emitter is called. Bails out through
+ * GOTO_FAIL when a check or the emitter fails.
+ *
+ * @param kind the data kind, such as I32, I64
+ * @param Type the data type, such as int32, int64
+ * @param type the abbreviation of data type, such as i32, i64
+ * @param op the opcode of bit operation
+ */
+#define BIT_R_R_R(kind, Type, type, op)                                       \
+    do {                                                                      \
+        Type imm1, imm2;                                                      \
+        int32 no_dst = 0, no_lhs = 0, no_rhs = 0;                             \
+        bool lhs_is_imm, rhs_is_imm, done = false;                            \
+                                                                              \
+        CHECK_EQKIND(r0, r1);                                                 \
+        CHECK_EQKIND(r0, r2);                                                 \
+        memset(&imm1, 0, sizeof(Type));                                       \
+        memset(&imm2, 0, sizeof(Type));                                       \
+                                                                              \
+        no_dst = jit_reg_no(r0);                                              \
+        CHECK_REG_NO(no_dst, jit_reg_kind(r0));                               \
+                                                                              \
+        /* First operand: immediate value or register no */                  \
+        lhs_is_imm = jit_reg_is_const(r1);                                    \
+        if (!lhs_is_imm) {                                                    \
+            no_lhs = jit_reg_no(r1);                                          \
+            CHECK_REG_NO(no_lhs, jit_reg_kind(r1));                           \
+        }                                                                     \
+        else                                                                  \
+            imm1 = jit_cc_get_const_##kind(cc, r1);                           \
+                                                                              \
+        /* Second operand: immediate value or register no */                 \
+        rhs_is_imm = jit_reg_is_const(r2);                                    \
+        if (!rhs_is_imm) {                                                    \
+            no_rhs = jit_reg_no(r2);                                          \
+            CHECK_REG_NO(no_rhs, jit_reg_kind(r2));                           \
+        }                                                                     \
+        else                                                                  \
+            imm2 = jit_cc_get_const_##kind(cc, r2);                           \
+                                                                              \
+        /* Pick the emitter matching the operand forms */                    \
+        if (lhs_is_imm && rhs_is_imm)                                         \
+            done = bit_imm_imm_to_r_##type(a, op, no_dst, imm1, imm2);        \
+        else if (lhs_is_imm)                                                  \
+            done = bit_imm_r_to_r_##type(a, op, no_dst, imm1, no_rhs);        \
+        else if (rhs_is_imm)                                                  \
+            done = bit_r_imm_to_r_##type(a, op, no_dst, no_lhs, imm2);        \
+        else                                                                  \
+            done = bit_r_r_to_r_##type(a, op, no_dst, no_lhs, no_rhs);        \
+        if (!done)                                                            \
+            GOTO_FAIL;                                                        \
+    } while (0)
+
+/**
+ * Lower a bit insn, AND/OR/XOR r0, r1, r2, by dispatching on the
+ * register kind of the dst operand (only I32 and I64 are accepted)
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param op the opcode of bit operations
+ * @param r0 dst jit register that receives the result
+ * @param r1 first src operand, a jit register or a constant
+ * @param r2 second src operand, a jit register or a constant
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_bit(JitCompContext *cc, x86::Assembler &a, BIT_OP op, JitReg r0,
+          JitReg r1, JitReg r2)
+{
+    if (jit_reg_kind(r0) == JIT_REG_KIND_I32) {
+        BIT_R_R_R(I32, int32, i32, op);
+    }
+    else if (jit_reg_kind(r0) == JIT_REG_KIND_I64) {
+        BIT_R_R_R(I64, int64, i64, op);
+    }
+    else {
+        /* Bitwise operations are only lowered for integer kinds */
+        LOG_VERBOSE("Invalid reg type of bit: %d\n", jit_reg_kind(r0));
+        GOTO_FAIL;
+    }
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode insn shift: SHL/SHRS/SHRU r0, r1, r2
+ *
+ * Uses cc, a, r0, r1 and r2 of the expansion site. r1 must have the
+ * same kind as r0 and r2 must be of the given kind. Each src operand
+ * is either a constant or a register, and the matching
+ * shift_{imm,r}_{imm,r}_to_r_<type> emitter is called. Bails out
+ * through GOTO_FAIL when a check or the emitter fails.
+ *
+ * @param kind the data kind, such as I32, I64
+ * @param Type the data type, such as int32, int64
+ * @param type the abbreviation of data type, such as i32, i64
+ * @param op the opcode of shift operation
+ */
+#define SHIFT_R_R_R(kind, Type, type, op)                                     \
+    do {                                                                      \
+        Type imm1, imm2;                                                      \
+        int32 no_dst = 0, no_lhs = 0, no_rhs = 0;                             \
+        bool lhs_is_imm, rhs_is_imm, done = false;                            \
+                                                                              \
+        CHECK_EQKIND(r0, r1);                                                 \
+        CHECK_KIND(r2, JIT_REG_KIND_##kind);                                  \
+        memset(&imm1, 0, sizeof(Type));                                       \
+        memset(&imm2, 0, sizeof(Type));                                       \
+                                                                              \
+        no_dst = jit_reg_no(r0);                                              \
+        CHECK_REG_NO(no_dst, jit_reg_kind(r0));                               \
+                                                                              \
+        /* First operand: immediate value or register no */                  \
+        lhs_is_imm = jit_reg_is_const(r1);                                    \
+        if (!lhs_is_imm) {                                                    \
+            no_lhs = jit_reg_no(r1);                                          \
+            CHECK_REG_NO(no_lhs, jit_reg_kind(r1));                           \
+        }                                                                     \
+        else                                                                  \
+            imm1 = jit_cc_get_const_##kind(cc, r1);                           \
+                                                                              \
+        /* Second operand: immediate value or register no */                 \
+        rhs_is_imm = jit_reg_is_const(r2);                                    \
+        if (!rhs_is_imm) {                                                    \
+            no_rhs = jit_reg_no(r2);                                          \
+            CHECK_REG_NO(no_rhs, jit_reg_kind(r2));                           \
+        }                                                                     \
+        else                                                                  \
+            imm2 = jit_cc_get_const_##kind(cc, r2);                           \
+                                                                              \
+        /* Pick the emitter matching the operand forms */                    \
+        if (lhs_is_imm && rhs_is_imm)                                         \
+            done = shift_imm_imm_to_r_##type(a, op, no_dst, imm1, imm2);      \
+        else if (lhs_is_imm)                                                  \
+            done = shift_imm_r_to_r_##type(a, op, no_dst, imm1, no_rhs);      \
+        else if (rhs_is_imm)                                                  \
+            done = shift_r_imm_to_r_##type(a, op, no_dst, no_lhs, imm2);      \
+        else                                                                  \
+            done = shift_r_r_to_r_##type(a, op, no_dst, no_lhs, no_rhs);      \
+        if (!done)                                                            \
+            GOTO_FAIL;                                                        \
+    } while (0)
+
+/**
+ * Lower a shift insn, SHL/SHRS/SHRU r0, r1, r2, by dispatching on the
+ * register kind of the dst operand (only I32 and I64 are accepted)
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param op the opcode of shift operations
+ * @param r0 dst jit register that receives the result
+ * @param r1 first src operand, a jit register or a constant
+ * @param r2 second src operand, a jit register or a constant
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_shift(JitCompContext *cc, x86::Assembler &a, SHIFT_OP op, JitReg r0,
+            JitReg r1, JitReg r2)
+{
+    if (jit_reg_kind(r0) == JIT_REG_KIND_I32) {
+        SHIFT_R_R_R(I32, int32, i32, op);
+    }
+    else if (jit_reg_kind(r0) == JIT_REG_KIND_I64) {
+        SHIFT_R_R_R(I64, int64, i64, op);
+    }
+    else {
+        /* Shift operations are only lowered for integer kinds */
+        LOG_VERBOSE("Invalid reg type of shift: %d\n", jit_reg_kind(r0));
+        GOTO_FAIL;
+    }
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Emit a 32-bit bit counting instruction (lzcnt/tzcnt/popcnt) that
+ * reads the src register and writes the count to the dst register
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BITCOUNT operation: CLZ, CTZ or POPCNT
+ * @param reg_no_dst the no of dst register
+ * @param reg_no_src the no of src register
+ *
+ * @return true if success, false if op is not a bitcount opcode
+ */
+static bool
+bitcount_r_to_r_i32(x86::Assembler &a, BITCOUNT_OP op, int32 reg_no_dst,
+                    int32 reg_no_src)
+{
+    if (op == CLZ) {
+        a.lzcnt(regs_i32[reg_no_dst], regs_i32[reg_no_src]);
+    }
+    else if (op == CTZ) {
+        a.tzcnt(regs_i32[reg_no_dst], regs_i32[reg_no_src]);
+    }
+    else if (op == POPCNT) {
+        a.popcnt(regs_i32[reg_no_dst], regs_i32[reg_no_src]);
+    }
+    else {
+        /* Unknown opcode, nothing is emitted */
+        bh_assert(0);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Emit a 64-bit bit counting instruction (lzcnt/tzcnt/popcnt) that
+ * reads the src register and writes the count to the dst register
+ *
+ * @param a the assembler to emit the code
+ * @param op the opcode of BITCOUNT operation: CLZ, CTZ or POPCNT
+ * @param reg_no_dst the no of dst register
+ * @param reg_no_src the no of src register
+ *
+ * @return true if success, false if op is not a bitcount opcode
+ */
+static bool
+bitcount_r_to_r_i64(x86::Assembler &a, BITCOUNT_OP op, int32 reg_no_dst,
+                    int32 reg_no_src)
+{
+    if (op == CLZ) {
+        a.lzcnt(regs_i64[reg_no_dst], regs_i64[reg_no_src]);
+    }
+    else if (op == CTZ) {
+        a.tzcnt(regs_i64[reg_no_dst], regs_i64[reg_no_src]);
+    }
+    else if (op == POPCNT) {
+        a.popcnt(regs_i64[reg_no_dst], regs_i64[reg_no_src]);
+    }
+    else {
+        /* Unknown opcode, nothing is emitted */
+        bh_assert(0);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Encode insn bitcount: CLZ/CTZ/POPCNT r0, r1
+ *
+ * Uses a, r0 and r1 of the expansion site. Both operands must be
+ * non-const registers of the same kind. Bails out through GOTO_FAIL
+ * when a check or bitcount_r_to_r_<type> fails.
+ *
+ * @param kind the data kind, such as I32, I64 (not used by the body)
+ * @param Type the data type, such as int32, int64 (not used by the body)
+ * @param type the abbreviation of data type, such as i32, i64
+ * @param op the opcode of bitcount operation
+ */
+#define BITCOUNT_R_R(kind, Type, type, op)                        \
+    do {                                                          \
+        int32 no_dst = 0, no_src = 0;                             \
+        bool done = false;                                        \
+                                                                  \
+        CHECK_EQKIND(r0, r1);                                     \
+        CHECK_NCONST(r0);                                         \
+        CHECK_NCONST(r1);                                         \
+                                                                  \
+        no_dst = jit_reg_no(r0);                                  \
+        CHECK_REG_NO(no_dst, jit_reg_kind(r0));                   \
+        no_src = jit_reg_no(r1);                                  \
+        CHECK_REG_NO(no_src, jit_reg_kind(r1));                   \
+                                                                  \
+        done = bitcount_r_to_r_##type(a, op, no_dst, no_src);     \
+        if (!done)                                                \
+            GOTO_FAIL;                                            \
+    } while (0)
+
+/**
+ * Encode bitcount insn, CLZ/CTZ/POPCNT r0, r1
+ *
+ * Dispatches on the register kind of the dst operand; only I32 and I64
+ * are accepted, and both operands must be non-const registers.
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param op the opcode of bitcount operations
+ * @param r0 dst jit register that contains the dst operand info
+ * @param r1 src jit register that contains the src operand info
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_bitcount(JitCompContext *cc, x86::Assembler &a, BITCOUNT_OP op, JitReg r0,
+               JitReg r1)
+{
+    switch (jit_reg_kind(r0)) {
+        case JIT_REG_KIND_I32:
+            BITCOUNT_R_R(I32, int32, i32, op);
+            break;
+        case JIT_REG_KIND_I64:
+            BITCOUNT_R_R(I64, int64, i64, op);
+            break;
+        default:
+            /* Name the bitcount lowering explicitly so that the log is
+               not mistaken for a failure of the AND/OR/XOR lowering */
+            LOG_VERBOSE("Invalid reg type of bitcount: %d\n",
+                        jit_reg_kind(r0));
+            GOTO_FAIL;
+    }
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode insn cmp: CMP r0, r1, r2
+ *
+ * Uses cc, a, r0, r1 and r2 of the expansion site. r0 must be an I32
+ * register, r1 and r2 must both be of the given kind. Each src operand
+ * is either a constant or a register, and the matching
+ * cmp_{imm,r}_{imm,r}_to_r_<type> emitter is called. Bails out through
+ * GOTO_FAIL when a check or the emitter fails.
+ *
+ * @param kind the data kind, such as I32, I64, F32 and F64
+ * @param Type the data type, such as int32, int64, float32, and float64
+ * @param type the abbreviation of data type, such as i32, i64, f32, and f64
+ */
+#define CMP_R_R_R(kind, Type, type)                                           \
+    do {                                                                      \
+        Type imm1, imm2;                                                      \
+        int32 no_dst = 0, no_lhs = 0, no_rhs = 0;                             \
+        bool lhs_is_imm, rhs_is_imm, done = false;                            \
+                                                                              \
+        CHECK_KIND(r0, JIT_REG_KIND_I32);                                     \
+        CHECK_KIND(r1, JIT_REG_KIND_##kind);                                  \
+        CHECK_EQKIND(r1, r2);                                                 \
+        memset(&imm1, 0, sizeof(Type));                                       \
+        memset(&imm2, 0, sizeof(Type));                                       \
+                                                                              \
+        no_dst = jit_reg_no(r0);                                              \
+        CHECK_REG_NO(no_dst, jit_reg_kind(r0));                               \
+                                                                              \
+        /* First operand: immediate value or register no */                  \
+        lhs_is_imm = jit_reg_is_const(r1);                                    \
+        if (!lhs_is_imm) {                                                    \
+            no_lhs = jit_reg_no(r1);                                          \
+            CHECK_REG_NO(no_lhs, jit_reg_kind(r1));                           \
+        }                                                                     \
+        else                                                                  \
+            imm1 = jit_cc_get_const_##kind(cc, r1);                           \
+                                                                              \
+        /* Second operand: immediate value or register no */                 \
+        rhs_is_imm = jit_reg_is_const(r2);                                    \
+        if (!rhs_is_imm) {                                                    \
+            no_rhs = jit_reg_no(r2);                                          \
+            CHECK_REG_NO(no_rhs, jit_reg_kind(r2));                           \
+        }                                                                     \
+        else                                                                  \
+            imm2 = jit_cc_get_const_##kind(cc, r2);                           \
+                                                                              \
+        /* Pick the emitter matching the operand forms */                    \
+        if (lhs_is_imm && rhs_is_imm)                                         \
+            done = cmp_imm_imm_to_r_##type(a, no_dst, imm1, imm2);            \
+        else if (lhs_is_imm)                                                  \
+            done = cmp_imm_r_to_r_##type(a, no_dst, imm1, no_rhs);            \
+        else if (rhs_is_imm)                                                  \
+            done = cmp_r_imm_to_r_##type(a, no_dst, no_lhs, imm2);            \
+        else                                                                  \
+            done = cmp_r_r_to_r_##type(a, no_dst, no_lhs, no_rhs);            \
+        if (!done)                                                            \
+            GOTO_FAIL;                                                        \
+    } while (0)
+
+/**
+ * Encode cmp insn, CMP r0, r1, r2
+ *
+ * The emitter is chosen from the register kind of r1. On success,
+ * cc->last_cmp_on_fp records whether the operands were floating point
+ * (F32/F64) or integer (I32/I64); it is reset to false when the kind
+ * of r1 is not supported.
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param r0 dst jit register, must be of I32 kind
+ * @param r1 first src operand, a jit register or a constant
+ * @param r2 second src operand, of the same kind as r1
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_cmp(JitCompContext *cc, x86::Assembler &a, JitReg r0, JitReg r1,
+          JitReg r2)
+{
+    /* Written to cc only once the compare has been emitted successfully */
+    bool cmp_on_fp = false;
+
+    switch (jit_reg_kind(r1)) {
+        case JIT_REG_KIND_I32:
+            CMP_R_R_R(I32, int32, i32);
+            break;
+        case JIT_REG_KIND_I64:
+            CMP_R_R_R(I64, int64, i64);
+            break;
+        case JIT_REG_KIND_F32:
+            CMP_R_R_R(F32, float32, f32);
+            cmp_on_fp = true;
+            break;
+        case JIT_REG_KIND_F64:
+            CMP_R_R_R(F64, float64, f64);
+            cmp_on_fp = true;
+            break;
+        default:
+            cc->last_cmp_on_fp = false;
+            LOG_VERBOSE("Invalid reg type of cmp: %d\n", jit_reg_kind(r1));
+            GOTO_FAIL;
+    }
+
+    cc->last_cmp_on_fp = cmp_on_fp;
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode a conditional jump on the current cmp flags to a relative
+ * address, according to the condition opcode
+ *
+ * The jump is first emitted with a placeholder target and its 32-bit
+ * displacement is then overwritten with offset.
+ *
+ * @param cc the compiler context, cc->last_cmp_on_fp tells whether the
+ *        last compare was a floating point one
+ * @param a the assembler to emit the code
+ * @param op the condition opcode to jmp
+ * @param offset the relative offset to jmp when the condition is met
+ *
+ * @return true if the jump was handed to the assembler, false if op is
+ *         not a known condition opcode (nothing is emitted or patched
+ *         in that case)
+ */
+static bool
+cmp_r_and_jmp_relative(JitCompContext *cc, x86::Assembler &a, COND_OP op,
+                       int32 offset)
+{
+    Imm target(INT32_MAX);
+    char *stream;
+    bool fp_cmp = cc->last_cmp_on_fp;
+
+    /* After a floating point compare only GTS and GES are expected */
+    bh_assert(!fp_cmp || op == GTS || op == GES);
+
+    switch (op) {
+        case EQ:
+            a.je(target);
+            break;
+        case NE:
+            a.jne(target);
+            break;
+        case GTS:
+            /* Floating point compares use the "above" form */
+            if (fp_cmp)
+                a.ja(target);
+            else
+                a.jg(target);
+            break;
+        case LES:
+            a.jng(target);
+            break;
+        case GES:
+            /* Floating point compares use the "above or equal" form */
+            if (fp_cmp)
+                a.jae(target);
+            else
+                a.jnl(target);
+            break;
+        case LTS:
+            a.jl(target);
+            break;
+        case GTU:
+            a.ja(target);
+            break;
+        case LEU:
+            a.jna(target);
+            break;
+        case GEU:
+            a.jae(target);
+            break;
+        case LTU:
+            a.jb(target);
+            break;
+        default:
+            /* No jump has been emitted: patching the tail of the buffer
+               would overwrite the previous instruction, so give up */
+            bh_assert(0);
+            return false;
+    }
+
+    JitErrorHandler *err_handler = (JitErrorHandler *)a.code()->errorHandler();
+
+    if (!err_handler->err) {
+        /* The offset written by asmjit is always 0, we patch it again:
+           the jump is the last 6 bytes of the buffer and its 32-bit
+           displacement starts 2 bytes into it */
+        stream = (char *)a.code()->sectionById(0)->buffer().data()
+                 + a.code()->sectionById(0)->buffer().size() - 6;
+        *(int32 *)(stream + 2) = offset;
+    }
+    return true;
+}
+
+/**
+ * Encode select insn, SELECT r0, r1, r2, r3
+ *
+ * The two candidate moves (r0 <- r2 and r0 <- r3) are first assembled
+ * into two scratch code holders, then their raw bytes are embedded
+ * into the main assembler: the first move is emitted unconditionally
+ * (skipped when r0 == r2), followed by a conditional jump on op that
+ * skips over the second move, and the second move itself (both skipped
+ * when r3 is 0 or r0 == r3).
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param op the condition opcode; when it holds the second move is
+ *        jumped over, so r0 keeps the value moved from r2
+ * @param r0 dst jit register that contains the dst operand info
+ * @param r1 jit register that is only checked to be a non-const I32
+ *        register here (presumably the result register of the
+ *        preceding CMP insn - TODO confirm)
+ * @param r2 src operand selected when the condition holds
+ * @param r3 src operand selected when the condition does not hold
+ *
+ * @return true if success, false if failed
+ */
+/* TODO: optimize with setcc */
+static bool
+lower_select(JitCompContext *cc, x86::Assembler &a, COND_OP op, JitReg r0,
+             JitReg r1, JitReg r2, JitReg r3)
+{
+    JitErrorHandler err_handler;
+    Environment env(Arch::kX64);
+    CodeHolder code1, code2; /* scratch holders for the two movs */
+    char *stream_mov1, *stream_mov2;
+    uint32 size_mov1, size_mov2;
+
+    code1.init(env);
+    code1.setErrorHandler(&err_handler);
+    x86::Assembler a1(&code1);
+
+    code2.init(env);
+    code2.setErrorHandler(&err_handler);
+    x86::Assembler a2(&code2);
+
+    CHECK_NCONST(r0);
+    CHECK_NCONST(r1);
+    CHECK_KIND(r1, JIT_REG_KIND_I32);
+
+    if (r0 == r3 && r0 != r2 && !cc->last_cmp_on_fp) {
+        JitReg r_tmp;
+
+        /* For i32/i64, exchange r2 and r3 to make r0 equal to r2,
+           so as to decrease possible execution instructions.
+           For f32/f64 comparison, should not change the order as
+           the result of comparison with NaN may be different.
+           The condition is inverted with not_cond() to compensate
+           for the exchange. */
+        r_tmp = r2;
+        r2 = r3;
+        r3 = r_tmp;
+        op = not_cond(op);
+    }
+
+    if (!lower_mov(cc, a1, r0, r2))
+        GOTO_FAIL;
+
+    if (!lower_mov(cc, a2, r0, r3))
+        GOTO_FAIL;
+
+    stream_mov1 = (char *)a1.code()->sectionById(0)->buffer().data();
+    size_mov1 = a1.code()->sectionById(0)->buffer().size();
+    stream_mov2 = (char *)a2.code()->sectionById(0)->buffer().data();
+    size_mov2 = a2.code()->sectionById(0)->buffer().size();
+
+    if (r0 != r2) { /* r0 <- r2, left out when they are the same reg */
+        a.embedDataArray(TypeId::kInt8, stream_mov1, size_mov1);
+    }
+
+    if (r3 && r0 != r3) { /* r0 <- r3, jumped over when op holds */
+        if (!cmp_r_and_jmp_relative(cc, a, op, (int32)size_mov2))
+            return false;
+        a.embedDataArray(TypeId::kInt8, stream_mov2, size_mov2);
+    }
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Emit a jump from block label_src to block label_dst
+ *
+ * Uses cc, a, label_offsets and jmp_info_list of the expansion site.
+ * When label_is_ahead() reports true, the jmp is emitted with a
+ * placeholder target and its displacement is patched at once from
+ * label_offsets[label_dst]; otherwise the jump is delegated to
+ * jmp_from_label_to_label(), bailing out through GOTO_FAIL on failure.
+ */
+#define JMP_TO_LABEL(label_dst, label_src)                                    \
+    do {                                                                      \
+        if (!label_is_ahead(cc, label_dst, label_src)) {                      \
+            /* Let jmp_from_label_to_label handle this case */                \
+            if (!jmp_from_label_to_label(a, jmp_info_list, label_dst,         \
+                                         label_src))                          \
+                GOTO_FAIL;                                                    \
+        }                                                                     \
+        else {                                                                \
+            JitErrorHandler *_handler =                                       \
+                (JitErrorHandler *)a.code()->errorHandler();                  \
+            Imm _target(INT32_MAX);                                           \
+            char *_jmp_end;                                                   \
+            int32 _rel;                                                       \
+                                                                              \
+            a.jmp(_target);                                                   \
+            if (!_handler->err) {                                             \
+                /* The offset written by asmjit is always 0, patch the        \
+                   last 4 bytes of the 6-byte jmp instruction */              \
+                _jmp_end = (char *)a.code()->sectionById(0)->buffer().data()  \
+                           + a.code()->sectionById(0)->buffer().size();       \
+                _rel = label_offsets[label_dst]                               \
+                       - a.code()->sectionById(0)->buffer().size();           \
+                *(int32 *)(_jmp_end - 4) = _rel;                              \
+            }                                                                 \
+        }                                                                     \
+    } while (0)
+
+/**
+ * Encode branch insn, BEQ/BNE/../BLTU r0, r1, r2
+ *
+ * When this is the last insn of the block, the label in r1 is the
+ * neighboring label of label_src (and not the last label), and the
+ * last compare was not a floating point one, r1 and r2 are exchanged
+ * and the condition is inverted before the jump is encoded.
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param jmp_info_list the jmp info list
+ * @param label_src the index of src label
+ * @param op the condition opcode of the branch
+ * @param r0 non-const I32 jit register (presumably the result of the
+ *        preceding CMP insn - TODO confirm)
+ * @param r1 jit register of L32 kind, passed as the first target to
+ *        cmp_r_and_jmp_label
+ * @param r2 jit register passed as the second target to
+ *        cmp_r_and_jmp_label
+ * @param is_last_insn if current insn is the last insn of current block
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_branch(JitCompContext *cc, x86::Assembler &a, bh_list *jmp_info_list,
+             int32 label_src, COND_OP op, JitReg r0, JitReg r1, JitReg r2,
+             bool is_last_insn)
+{
+    int32 label_first;
+    bool invert;
+
+    CHECK_NCONST(r0);
+    CHECK_KIND(r0, JIT_REG_KIND_I32);
+    CHECK_KIND(r1, JIT_REG_KIND_L32);
+
+    CHECK_REG_NO(jit_reg_no(r0), jit_reg_kind(r0));
+
+    label_first = jit_reg_no(r1);
+    invert = label_first < (int32)jit_cc_label_num(cc) - 1 && is_last_insn
+             && label_is_neighboring(cc, label_src, label_first)
+             && !cc->last_cmp_on_fp;
+
+    if (invert) {
+        /* Exchange both targets and negate the condition accordingly */
+        JitReg r_other = r2;
+
+        r2 = r1;
+        r1 = r_other;
+        op = not_cond(op);
+    }
+
+    if (!cmp_r_and_jmp_label(cc, a, jmp_info_list, label_src, op, r1, r2,
+                             is_last_insn))
+        GOTO_FAIL;
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode lookupswitch with key of immediate data
+ *
+ * As the key is known when compiling, a single jump is emitted: to the
+ * target of the first match pair whose value equals the key, or to the
+ * default target if there is one and no pair matches. No jump is
+ * emitted when this is the last insn of the block and the destination
+ * is the neighboring label of label_src.
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param jmp_info_list the jmp info list
+ * @param label_offsets the offsets of each label
+ * @param label_src the index of src label
+ * @param key the entry key
+ * @param opnd the lookup switch operand
+ * @param is_last_insn if current insn is the last insn of current block
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lookupswitch_imm(JitCompContext *cc, x86::Assembler &a, bh_list *jmp_info_list,
+                 uint32 *label_offsets, int32 label_src, int32 key,
+                 const JitOpndLookupSwitch *opnd, bool is_last_insn)
+{
+    uint32 idx;
+    int32 label_dst;
+
+    /* Look for the first pair whose value equals the key */
+    for (idx = 0; idx < opnd->match_pairs_num; idx++) {
+        if (opnd->match_pairs[idx].value != key)
+            continue;
+
+        label_dst = jit_reg_no(opnd->match_pairs[idx].target);
+        if (!is_last_insn
+            || !label_is_neighboring(cc, label_src, label_dst)) {
+            JMP_TO_LABEL(label_dst, label_src);
+        }
+        return true;
+    }
+
+    /* No pair matched: use the default target when there is one */
+    if (!opnd->default_target)
+        return true;
+
+    label_dst = jit_reg_no(opnd->default_target);
+    if (!is_last_insn || !label_is_neighboring(cc, label_src, label_dst)) {
+        JMP_TO_LABEL(label_dst, label_src);
+    }
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode detecting lookupswitch entry register and jumping to matched label
+ *
+ * Two strategies are used, depending on the number of match pairs:
+ * - fewer than 10 pairs: a "cmp reg, value; je target" sequence per
+ *   pair, followed by a jump to the default target
+ * - otherwise: a bounds check of the key against the pair count, a
+ *   jump to the default target for out of range keys, and an indirect
+ *   jump through a table of 8-byte label addresses embedded right
+ *   after the jump instruction
+ *
+ * Jump displacements, the table base address and the table entries are
+ * emitted as placeholders and recorded in jmp_info_list (presumably
+ * patched once all label offsets are known - confirm against the
+ * consumer of the list).
+ *
+ * NOTE(review): the table is indexed with the key itself, so the
+ * indirect jump presumably relies on the match values being
+ * 0..match_pairs_num-1 - confirm against the frontend.
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param jmp_info_list the jmp info list
+ * @param label_offsets the offsets of each label
+ * @param label_src the index of src label
+ * @param reg_no the no of entry register
+ * @param opnd the lookup switch operand
+ * @param is_last_insn if current insn is the last insn of current block
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lookupswitch_r(JitCompContext *cc, x86::Assembler &a, bh_list *jmp_info_list,
+               uint32 *label_offsets, int32 label_src, int32 reg_no,
+               const JitOpndLookupSwitch *opnd, bool is_last_insn)
+{
+    JmpInfo *node;
+    Imm imm;
+    x86::Mem m;
+    uint32 i;
+    int32 label_dst = 0;
+    char *stream;
+
+    if (opnd->match_pairs_num < 10) {
+        /* For small count of branches, it is better to compare
+           the key with branch value and jump one by one */
+        for (i = 0; i < opnd->match_pairs_num; i++) {
+            imm.setValue(opnd->match_pairs[i].value);
+            a.cmp(regs_i32[reg_no], imm);
+
+            node = (JmpInfo *)jit_malloc(sizeof(JmpInfo));
+            if (!node)
+                GOTO_FAIL;
+
+            node->type = JMP_DST_LABEL_REL;
+            node->label_src = label_src;
+            node->dst_info.label_dst = jit_reg_no(opnd->match_pairs[i].target);
+            /* The displacement starts 2 bytes into the je emitted below */
+            node->offset = a.code()->sectionById(0)->buffer().size() + 2;
+            bh_list_insert(jmp_info_list, node);
+
+            imm.setValue(INT32_MAX);
+            a.je(imm);
+        }
+
+        if (opnd->default_target) {
+            label_dst = jit_reg_no(opnd->default_target);
+            if (!(is_last_insn
+                  && label_is_neighboring(cc, label_src, label_dst)))
+                JMP_TO_LABEL(label_dst, label_src);
+        }
+    }
+    else {
+        /* For bigger count of branches, use indirect jump */
+        /* unsigned extend to rsi */
+        a.mov(regs_i32[REG_I32_FREE_IDX], regs_i32[reg_no]);
+        imm.setValue(opnd->match_pairs_num);
+        a.cmp(regs_i64[REG_I64_FREE_IDX], imm);
+
+        /* If rsi < br_count, jump over the 6-byte jmp to the default
+           label emitted right below */
+        imm.setValue(INT32_MAX);
+        a.jb(imm);
+        {
+            JitErrorHandler *err_handler =
+                (JitErrorHandler *)a.code()->errorHandler();
+
+            if (!err_handler->err) {
+                /* Locate the jb only after it has been emitted: emitting
+                   may grow (and thus move) the code buffer, so a pointer
+                   taken before the emit could be stale. The jb is the
+                   last 6 bytes of the buffer and its displacement starts
+                   2 bytes into it. */
+                stream = (char *)a.code()->sectionById(0)->buffer().data()
+                         + a.code()->sectionById(0)->buffer().size() - 6;
+                *(uint32 *)(stream + 2) = 6;
+            }
+        }
+
+        node = (JmpInfo *)jit_calloc(sizeof(JmpInfo));
+        if (!node)
+            goto fail;
+
+        node->type = JMP_DST_LABEL_REL;
+        node->label_src = label_src;
+        node->dst_info.label_dst = jit_reg_no(opnd->default_target);
+        node->offset = a.code()->sectionById(0)->buffer().size() + 2;
+        bh_list_insert(jmp_info_list, node);
+
+        /* Jump to default label, reached when rsi >= br_count */
+        imm.setValue(INT32_MAX);
+        a.jmp(imm);
+
+        node = (JmpInfo *)jit_malloc(sizeof(JmpInfo));
+        if (!node)
+            GOTO_FAIL;
+
+        node->type = JMP_LOOKUPSWITCH_BASE;
+        /* The 64-bit immediate starts 2 bytes into the mov emitted below */
+        node->offset = a.code()->sectionById(0)->buffer().size() + 2;
+        bh_list_insert(jmp_info_list, node);
+
+        /* LookupSwitch table base addr */
+        imm.setValue(INT64_MAX);
+        a.mov(regs_i64[reg_no], imm);
+
+        /* jmp *(base_addr + rsi * 8) */
+        m = x86::ptr(regs_i64[reg_no], regs_i64[REG_I64_FREE_IDX], 3);
+        a.jmp(m);
+
+        /* Store each dst label absolute address */
+        for (i = 0; i < opnd->match_pairs_num; i++) {
+            node = (JmpInfo *)jit_malloc(sizeof(JmpInfo));
+            if (!node)
+                GOTO_FAIL;
+
+            node->type = JMP_DST_LABEL_ABS;
+            node->dst_info.label_dst = jit_reg_no(opnd->match_pairs[i].target);
+            node->offset = a.code()->sectionById(0)->buffer().size();
+            bh_list_insert(jmp_info_list, node);
+
+            /* 8-byte placeholder for the address of the label */
+            a.embedUInt64(UINT64_MAX);
+        }
+    }
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Lower a LOOKUPSWITCH insn: dispatch to the immediate or the register
+ * encoder depending on whether the switch value is a constant
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param jmp_info_list the jmp info list
+ * @param label_offsets the offsets of each label
+ * @param label_src the index of src label
+ * @param opnd the lookup switch operand
+ * @param is_last_insn if current insn is the last insn of current block
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_lookupswitch(JitCompContext *cc, x86::Assembler &a,
+                   bh_list *jmp_info_list, uint32 *label_offsets,
+                   int32 label_src, const JitOpndLookupSwitch *opnd,
+                   bool is_last_insn)
+{
+    JitReg value_reg = opnd->value;
+    bool encoded;
+
+    /* The switch value must be an i32 and the default target a label */
+    CHECK_KIND(value_reg, JIT_REG_KIND_I32);
+    CHECK_KIND(opnd->default_target, JIT_REG_KIND_L32);
+
+    if (!jit_reg_is_const(value_reg)) {
+        /* The key is held in an i32 register */
+        int32 value_reg_no = jit_reg_no(value_reg);
+
+        CHECK_I32_REG_NO(value_reg_no);
+        encoded = lookupswitch_r(cc, a, jmp_info_list, label_offsets,
+                                 label_src, value_reg_no, opnd, is_last_insn);
+    }
+    else {
+        /* The key is a constant */
+        int32 const_key = jit_cc_get_const_I32(cc, value_reg);
+
+        encoded = lookupswitch_imm(cc, a, jmp_info_list, label_offsets,
+                                   label_src, const_key, opnd, is_last_insn);
+    }
+
+    if (!encoded)
+        GOTO_FAIL;
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode callnative insn, CALLNATIVE r0, r1, ...
+ *
+ * Operand 0 is the register that receives the return value (0 if there
+ * is none), operand 1 is the constant I64 address of the native function
+ * and the remaining operands are the call arguments. I32/I64 arguments
+ * are passed in rdi, rsi, rdx, rcx, r8 and r9 in that order (I32 values
+ * are extended to 64 bits), F32/F64 arguments are passed in the float
+ * registers numbered 0, 1, ... in that order. The function address is
+ * loaded into rax and called through it.
+ *
+ * Passing arguments on the stack is not implemented: the encoding fails
+ * when there are more integer arguments than argument registers.
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param insn current insn info
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_callnative(JitCompContext *cc, x86::Assembler &a, JitInsn *insn)
+{
+    void (*func_ptr)(void);
+    JitReg ret_reg, func_reg, arg_reg;
+    /* the index in regs_i64 of the registers used to pass the integer
+       arguments, in argument order */
+    uint8 regs_arg_idx[] = { REG_RDI_IDX, REG_RSI_IDX, REG_RDX_IDX,
+                             REG_RCX_IDX, REG_R8_IDX,  REG_R9_IDX };
+    const int32 max_integer_args =
+        (int32)(sizeof(regs_arg_idx) / sizeof(regs_arg_idx[0]));
+    Imm imm;
+    uint32 i, opnd_num;
+    int32 integer_reg_index = 0, floatpoint_reg_index = 0;
+
+    ret_reg = *(jit_insn_opndv(insn, 0));
+    func_reg = *(jit_insn_opndv(insn, 1));
+    CHECK_KIND(func_reg, JIT_REG_KIND_I64);
+    CHECK_CONST(func_reg);
+
+    func_ptr = (void (*)(void))jit_cc_get_const_I64(cc, func_reg);
+
+    /* The first two operands are the return register and the function */
+    opnd_num = jit_insn_opndv_num(insn);
+    for (i = 0; i < opnd_num - 2; i++) {
+        /*TODO: if arguments number is greater than 6 */
+        bh_assert(integer_reg_index < 6);
+        bh_assert(floatpoint_reg_index < 6);
+
+        arg_reg = *(jit_insn_opndv(insn, i + 2));
+        switch (jit_reg_kind(arg_reg)) {
+            case JIT_REG_KIND_I32:
+            {
+                int32 reg_no;
+
+                /* Never index past regs_arg_idx, even when the
+                   assertions above are disabled */
+                if (integer_reg_index >= max_integer_args)
+                    GOTO_FAIL;
+
+                reg_no = regs_arg_idx[integer_reg_index++];
+                CHECK_I64_REG_NO(reg_no);
+                if (jit_reg_is_const(arg_reg)) {
+                    mov_imm_to_r_i64(a, reg_no,
+                                     (int64)jit_cc_get_const_I32(cc, arg_reg));
+                }
+                else {
+                    int32 arg_reg_no = jit_reg_no(arg_reg);
+                    CHECK_I32_REG_NO(arg_reg_no);
+                    extend_r32_to_r64(a, reg_no, arg_reg_no, true);
+                }
+                break;
+            }
+            case JIT_REG_KIND_I64:
+            {
+                int32 reg_no;
+
+                /* Never index past regs_arg_idx, even when the
+                   assertions above are disabled */
+                if (integer_reg_index >= max_integer_args)
+                    GOTO_FAIL;
+
+                reg_no = regs_arg_idx[integer_reg_index++];
+                CHECK_I64_REG_NO(reg_no);
+                if (jit_reg_is_const(arg_reg)) {
+                    mov_imm_to_r_i64(a, reg_no,
+                                     jit_cc_get_const_I64(cc, arg_reg));
+                }
+                else {
+                    int32 arg_reg_no = jit_reg_no(arg_reg);
+                    CHECK_I64_REG_NO(arg_reg_no);
+                    mov_r_to_r_i64(a, reg_no, arg_reg_no);
+                }
+                break;
+            }
+            case JIT_REG_KIND_F32:
+            {
+                CHECK_F32_REG_NO((int32)floatpoint_reg_index);
+                if (jit_reg_is_const(arg_reg)) {
+                    mov_imm_to_r_f32(a, floatpoint_reg_index,
+                                     jit_cc_get_const_F32(cc, arg_reg));
+                }
+                else {
+                    int32 arg_reg_no = jit_reg_no(arg_reg);
+                    CHECK_F32_REG_NO(arg_reg_no);
+                    mov_r_to_r_f32(a, floatpoint_reg_index, arg_reg_no);
+                }
+                floatpoint_reg_index++;
+                break;
+            }
+            case JIT_REG_KIND_F64:
+            {
+                CHECK_F64_REG_NO((int32)floatpoint_reg_index);
+                if (jit_reg_is_const(arg_reg)) {
+                    mov_imm_to_r_f64(a, floatpoint_reg_index,
+                                     jit_cc_get_const_F64(cc, arg_reg));
+                }
+                else {
+                    int32 arg_reg_no = jit_reg_no(arg_reg);
+                    CHECK_F64_REG_NO(arg_reg_no);
+                    mov_r_to_r_f64(a, floatpoint_reg_index, arg_reg_no);
+                }
+                floatpoint_reg_index++;
+                break;
+            }
+            default:
+            {
+                /* Unsupported argument kind */
+                bh_assert(0);
+                goto fail;
+            }
+        }
+    }
+
+    /* mov rax, func_ptr; call rax */
+    imm.setValue((uint64)func_ptr);
+    a.mov(regs_i64[REG_RAX_IDX], imm);
+    a.call(regs_i64[REG_RAX_IDX]);
+
+    if (ret_reg) {
+        uint32 ret_reg_no = jit_reg_no(ret_reg);
+        if (jit_reg_kind(ret_reg) == JIT_REG_KIND_I64) {
+            CHECK_I64_REG_NO(ret_reg_no);
+            /* mov res, rax */
+            mov_r_to_r_i64(a, ret_reg_no, REG_RAX_IDX);
+        }
+        else if (jit_reg_kind(ret_reg) == JIT_REG_KIND_F64) {
+            CHECK_F64_REG_NO(ret_reg_no);
+            /* mov res, xmm0_f64 */
+            mov_r_to_r_f64(a, ret_reg_no, 0);
+        }
+        else {
+            /* I32/F32 results are expected to be bound to eax/float
+               register 0 already, so no move is emitted for them */
+            bh_assert((jit_reg_kind(ret_reg) == JIT_REG_KIND_I32
+                       && ret_reg_no == REG_EAX_IDX)
+                      || (jit_reg_kind(ret_reg) == JIT_REG_KIND_F32
+                          && ret_reg_no == 0));
+        }
+    }
+
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode callbc insn, CALLBC r0, r1, r2
+ *
+ * The emitted code puts the function index (operand 3) into rdx, stores
+ * the address of the code following the jump into the slot at
+ * [rbp + cc->jitted_return_address_offset], and jumps to the address held
+ * in the function register (operand 2). The code emitted after the jump
+ * moves the callee's result from edx/rdx/float register 0 into the
+ * return register (operand 0), if there is one.
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param jmp_info_list the jmp info list
+ * @param label_src the index of src label
+ * @param insn current insn info
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_callbc(JitCompContext *cc, x86::Assembler &a, bh_list *jmp_info_list,
+             int32 label_src, JitInsn *insn)
+{
+    JmpInfo *node;
+    Imm imm;
+    JitReg edx_hreg = jit_reg_new(JIT_REG_KIND_I32, REG_EDX_IDX);
+    JitReg rdx_hreg = jit_reg_new(JIT_REG_KIND_I64, REG_RDX_IDX);
+    JitReg xmm0_f32_hreg = jit_reg_new(JIT_REG_KIND_F32, 0);
+    JitReg xmm0_f64_hreg = jit_reg_new(JIT_REG_KIND_F64, 0);
+    JitReg ret_reg = *(jit_insn_opnd(insn, 0));
+    JitReg func_reg = *(jit_insn_opnd(insn, 2));
+    JitReg func_idx = *(jit_insn_opnd(insn, 3));
+    JitReg src_reg;
+    int32 func_reg_no;
+
+    /* The rbp-relative slot that receives the jitted return address */
+    x86::Mem m(x86::rbp, cc->jitted_return_address_offset);
+
+    CHECK_KIND(func_reg, JIT_REG_KIND_I64);
+    func_reg_no = jit_reg_no(func_reg);
+    CHECK_I64_REG_NO(func_reg_no);
+
+    /* rdx = func_idx */
+    CHECK_KIND(func_idx, JIT_REG_KIND_I32);
+    if (jit_reg_is_const(func_idx)) {
+        imm.setValue(jit_cc_get_const_I32(cc, func_idx));
+        a.mov(regs_i64[REG_RDX_IDX], imm);
+    }
+    else {
+        int32 func_idx_reg_no = jit_reg_no(func_idx);
+
+        CHECK_I32_REG_NO(func_idx_reg_no);
+        /* x86 has no movzx form with a 32-bit source. Writing a 32-bit
+           register clears the upper 32 bits of the 64-bit register, so
+           a plain 32-bit mov into edx zero-extends the index into rdx */
+        a.mov(regs_i32[REG_EDX_IDX], regs_i32[func_idx_reg_no]);
+    }
+
+    node = (JmpInfo *)jit_malloc(sizeof(JmpInfo));
+    if (!node)
+        GOTO_FAIL;
+
+    /* Record where the 8-byte immediate of the mov below is located,
+       which is 2 bytes after the start of the instruction */
+    node->type = JMP_END_OF_CALLBC;
+    node->label_src = label_src;
+    node->offset = a.code()->sectionById(0)->buffer().size() + 2;
+    bh_list_insert(jmp_info_list, node);
+
+    /* Set next jited addr to glue_ret_jited_addr, the INT64_MAX
+       placeholder will be replaced with the actual address after the
+       actual code cache is allocated (see patch_jmp_info_list) */
+    imm.setValue(INT64_MAX);
+    a.mov(regs_i64[REG_I64_FREE_IDX], imm);
+    a.mov(m, regs_i64[REG_I64_FREE_IDX]);
+    a.jmp(regs_i64[func_reg_no]);
+
+    /* Code emitted from here on fetches the result of the callee */
+    if (ret_reg) {
+        switch (jit_reg_kind(ret_reg)) {
+            case JIT_REG_KIND_I32:
+                src_reg = edx_hreg;
+                break;
+            case JIT_REG_KIND_I64:
+                src_reg = rdx_hreg;
+                break;
+            case JIT_REG_KIND_F32:
+                src_reg = xmm0_f32_hreg;
+                break;
+            case JIT_REG_KIND_F64:
+                src_reg = xmm0_f64_hreg;
+                break;
+            default:
+                bh_assert(0);
+                return false;
+        }
+
+        if (!lower_mov(cc, a, ret_reg, src_reg))
+            return false;
+    }
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode returnbc insn: move the optional return value into the hard
+ * register used for its kind (edx, rdx or float register 0), load the
+ * action into eax and jump to the address stored in the slot at
+ * [rbp + cc->jitted_return_address_offset]
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param insn current insn info: operand 0 is the constant I32 action,
+ *        operand 1 is the return value register (0 if there is none)
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_returnbc(JitCompContext *cc, x86::Assembler &a, JitInsn *insn)
+{
+    JitReg act_reg = *(jit_insn_opnd(insn, 0));
+    JitReg ret_reg = *(jit_insn_opnd(insn, 1));
+    JitReg dst_hreg;
+    int32 act;
+
+    CHECK_CONST(act_reg);
+    CHECK_KIND(act_reg, JIT_REG_KIND_I32);
+
+    act = jit_cc_get_const_I32(cc, act_reg);
+
+    if (ret_reg) {
+        /* Select the hard register that carries a value of this kind */
+        switch (jit_reg_kind(ret_reg)) {
+            case JIT_REG_KIND_I32:
+                dst_hreg = jit_reg_new(JIT_REG_KIND_I32, REG_EDX_IDX);
+                break;
+            case JIT_REG_KIND_I64:
+                dst_hreg = jit_reg_new(JIT_REG_KIND_I64, REG_RDX_IDX);
+                break;
+            case JIT_REG_KIND_F32:
+                dst_hreg = jit_reg_new(JIT_REG_KIND_F32, 0);
+                break;
+            case JIT_REG_KIND_F64:
+                dst_hreg = jit_reg_new(JIT_REG_KIND_F64, 0);
+                break;
+            default:
+                bh_assert(0);
+                return false;
+        }
+
+        if (!lower_mov(cc, a, dst_hreg, ret_reg))
+            return false;
+    }
+
+    {
+        Imm act_imm(act);
+        x86::Mem ret_addr_slot(x86::rbp, cc->jitted_return_address_offset);
+
+        /* eax = act */
+        a.mov(x86::eax, act_imm);
+        /* jmp *(rbp + jitted_return_address_offset) */
+        a.jmp(ret_addr_slot);
+    }
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Encode return insn: load the action into eax and jump to
+ * code_block_return_to_interp_from_jitted through the free i64 register
+ *
+ * @param cc the compiler context
+ * @param a the assembler to emit the code
+ * @param insn current insn info: operand 0 is the constant I32 action
+ *
+ * @return true if success, false if failed
+ */
+static bool
+lower_return(JitCompContext *cc, x86::Assembler &a, JitInsn *insn)
+{
+    JitReg action_reg = *(jit_insn_opnd(insn, 0));
+    int32 action;
+
+    CHECK_CONST(action_reg);
+    CHECK_KIND(action_reg, JIT_REG_KIND_I32);
+
+    action = jit_cc_get_const_I32(cc, action_reg);
+    {
+        Imm imm(action);
+        auto &free_reg = regs_i64[REG_I64_FREE_IDX];
+
+        /* eax = action */
+        a.mov(x86::eax, imm);
+
+        /* free reg = address of the return glue, then jump to it */
+        imm.setValue((uintptr_t)code_block_return_to_interp_from_jitted);
+        a.mov(free_reg, imm);
+        a.jmp(free_reg);
+    }
+    return true;
+fail:
+    return false;
+}
+
+/**
+ * Replace all the jmp addresses that were pre-saved as placeholders
+ * while the code cache hadn't been allocated with the actual addresses
+ * once the code cache is allocated
+ *
+ * @param cc compiler context containing the allocated code cache info
+ * @param jmp_info_list the jmp info list
+ */
+static void
+patch_jmp_info_list(JitCompContext *cc, bh_list *jmp_info_list)
+{
+    JmpInfo *cur, *next;
+
+    for (cur = (JmpInfo *)bh_list_first_elem(jmp_info_list); cur;
+         cur = next) {
+        /* The location inside the jitted code that is to be patched */
+        char *patch_addr = (char *)cc->jitted_addr_begin + cur->offset;
+
+        next = (JmpInfo *)bh_list_elem_next(cur);
+
+        switch (cur->type) {
+            case JMP_DST_LABEL_REL:
+            {
+                /* Jmp with relative address: the displacement is
+                   relative to the end of the 4-byte field */
+                JitReg label =
+                    jit_reg_new(JIT_REG_KIND_L32, cur->dst_info.label_dst);
+
+                *(int32 *)patch_addr =
+                    (int32)((uintptr_t)*jit_annl_jitted_addr(cc, label)
+                            - (uintptr_t)patch_addr)
+                    - 4;
+                break;
+            }
+            case JMP_DST_LABEL_ABS:
+            {
+                /* Jmp with absolute address */
+                JitReg label =
+                    jit_reg_new(JIT_REG_KIND_L32, cur->dst_info.label_dst);
+
+                *(uintptr_t *)patch_addr =
+                    (uintptr_t)*jit_annl_jitted_addr(cc, label);
+                break;
+            }
+            case JMP_END_OF_CALLBC:
+                /* 7 is the size of mov and jmp instruction */
+                *(uintptr_t *)patch_addr =
+                    (uintptr_t)patch_addr + sizeof(uintptr_t) + 7;
+                break;
+            case JMP_LOOKUPSWITCH_BASE:
+                /* 11 is the size of 8-byte addr and 3-byte jmp
+                   instruction */
+                *(uintptr_t *)patch_addr = (uintptr_t)patch_addr + 11;
+                break;
+            default:
+                /* Other types need no patching */
+                break;
+        }
+    }
+}
+
+/* Unlink every node from the jmp info list and free it */
+static void
+free_jmp_info_list(bh_list *jmp_info_list)
+{
+    void *node, *following;
+
+    for (node = bh_list_first_elem(jmp_info_list); node; node = following) {
+        /* Fetch the successor before the node is released */
+        following = bh_list_elem_next(node);
+
+        bh_list_remove(jmp_info_list, node);
+        jit_free(node);
+    }
+}
+
+/**
+ * Encode moving the bits of an int32 immediate into a float register.
+ * The immediate is first loaded into the free i32 register, then copied
+ * to the float register with movd
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst float register
+ * @param data the src immediate data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cast_imm_i32_to_r_f32(x86::Assembler &a, int32 reg_no, int32 data)
+{
+    Imm bits(data);
+    auto &free_reg = regs_i32[REG_I32_FREE_IDX];
+
+    a.mov(free_reg, bits);
+    a.movd(regs_float[reg_no], free_reg);
+    return true;
+}
+
+/**
+ * Encode moving the bits of an int32 register into a float register
+ * with movd
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst float register
+ * @param reg_no_src the no of src int32 register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cast_r_i32_to_r_f32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    auto &dst_freg = regs_float[reg_no_dst];
+    auto &src_ireg = regs_i32[reg_no_src];
+
+    a.movd(dst_freg, src_ireg);
+    return true;
+}
+
+/**
+ * Encode moving the bits of an int64 immediate into a double register.
+ * The immediate is first loaded into the free i64 register, then copied
+ * to the double register with movq
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst double register
+ * @param data the src immediate data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cast_imm_i64_to_r_f64(x86::Assembler &a, int32 reg_no, int64 data)
+{
+    Imm bits(data);
+    auto &free_reg = regs_i64[REG_I64_FREE_IDX];
+
+    a.mov(free_reg, bits);
+    a.movq(regs_float[reg_no], free_reg);
+    return true;
+}
+
+/**
+ * Encode moving the bits of an int64 register into a double register
+ * with movq
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst double register
+ * @param reg_no_src the no of src int64 register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cast_r_i64_to_r_f64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    auto &dst_freg = regs_float[reg_no_dst];
+    auto &src_ireg = regs_i64[reg_no_src];
+
+    a.movq(dst_freg, src_ireg);
+    return true;
+}
+
+/**
+ * Encode moving the bits of a float immediate into an int32 register.
+ * The float is reinterpreted as an integer through
+ * cast_float_to_integer and loaded as an int32 immediate
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst int32 register
+ * @param data the src immediate data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cast_imm_f32_to_r_i32(x86::Assembler &a, int32 reg_no, float data)
+{
+    cast_float_to_integer bits = { .f = data };
+    bool encoded = mov_imm_to_r_i32(a, reg_no, bits.i);
+
+    return encoded;
+}
+
+/**
+ * Encode moving the bits of a float register into an int32 register
+ * with movd
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst int32 register
+ * @param reg_no_src the no of src float register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cast_r_f32_to_r_i32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    auto &dst_ireg = regs_i32[reg_no_dst];
+    auto &src_freg = regs_float[reg_no_src];
+
+    a.movd(dst_ireg, src_freg);
+    return true;
+}
+
+/**
+ * Encode moving the bits of a double immediate into an int64 register.
+ * The double is reinterpreted as an integer through
+ * cast_double_to_integer and loaded as an int64 immediate
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no the no of dst int64 register
+ * @param data the src immediate data
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cast_imm_f64_to_r_i64(x86::Assembler &a, int32 reg_no, double data)
+{
+    cast_double_to_integer bits = { .d = data };
+    bool encoded = mov_imm_to_r_i64(a, reg_no, bits.i);
+
+    return encoded;
+}
+
+/**
+ * Encode moving the bits of a double register into an int64 register
+ * with movq
+ *
+ * @param a the assembler to emit the code
+ * @param reg_no_dst the no of dst int64 register
+ * @param reg_no_src the no of src double register
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+cast_r_f64_to_r_i64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no_src)
+{
+    auto &dst_ireg = regs_i64[reg_no_dst];
+    auto &src_freg = regs_float[reg_no_src];
+
+    a.movq(dst_ireg, src_freg);
+    return true;
+}
+
+/**
+ * Encode insn cast, such as F32CASTI32: cast the src operand r1 to the
+ * dst register r0 by calling cast_imm_<type1>_to_r_<type0> when r1 is a
+ * constant, or cast_r_<type1>_to_r_<type0> otherwise.
+ *
+ * The expansion uses r0, r1, cc and a from the enclosing scope and
+ * executes GOTO_FAIL when the selected helper returns false, so it can
+ * only be used where those names and the fail target are available.
+ *
+ * @param kind0 the dst JIT_REG_KIND, such as I32, I64, F32 and F64
+ * @param kind1 the src JIT_REG_KIND, such as I32, I64, F32 and F64
+ * @param type0 the dst data type used to form the helper name, such as
+ * i32, i64, f32 and f64
+ * @param type1 the src data type used to form the helper name, such as
+ * i32, i64, f32 and f64
+ * @param Type1 the C type of the variable that holds the value returned
+ * by jit_cc_get_const_<kind1> when r1 is a constant
+ */
+#define CAST_R_R(kind0, kind1, type0, type1, Type1)                          \
+    do {                                                                     \
+        bool _ret = false;                                                   \
+        int32 reg_no_dst = 0, reg_no_src = 0;                                \
+        CHECK_KIND(r0, JIT_REG_KIND_##kind0);                                \
+        CHECK_KIND(r1, JIT_REG_KIND_##kind1);                                \
+                                                                             \
+        reg_no_dst = jit_reg_no(r0);                                         \
+        CHECK_REG_NO(reg_no_dst, JIT_REG_KIND_##kind0);                      \
+        if (jit_reg_is_const(r1)) {                                          \
+            Type1 data = jit_cc_get_const_##kind1(cc, r1);                   \
+            _ret = cast_imm_##type1##_to_r_##type0(a, reg_no_dst, data);     \
+        }                                                                    \
+        else {                                                               \
+            reg_no_src = jit_reg_no(r1);                                     \
+            CHECK_REG_NO(reg_no_src, JIT_REG_KIND_##kind1);                  \
+            _ret = cast_r_##type1##_to_r_##type0(a, reg_no_dst, reg_no_src); \
+        }                                                                    \
+        if (!_ret)                                                           \
+            GOTO_FAIL;                                                       \
+    } while (0)
+
+#if WASM_ENABLE_SHARED_MEMORY != 0
+
+/**
+ * Encode extend certain bytes in the src register to a I32 or I64 kind
+ * value in dst register. When the data already has the full width of
+ * the kind (4 bytes for I32, 8 bytes for I64) a plain register move is
+ * emitted instead.
+ *
+ * The extend helpers are called with their last argument set to false
+ * (presumably selecting unsigned extension, to be confirmed against
+ * the helpers).
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data,
+ *        could be 1(byte), 2(short), 4(int32), 8(int64, I64 kind only)
+ * @param kind_dst the kind of data to extend to, could be I32, I64
+ * @param reg_no_src the index of register hold src value
+ * @param reg_no_dst the index of register that receives the result
+ *
+ * @return true if success, false if bytes_dst or kind_dst is not
+ *         supported
+ */
+static bool
+extend_r_to_r(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst,
+              int32 reg_no_src, int32 reg_no_dst)
+{
+    if (kind_dst == JIT_REG_KIND_I32) {
+        bh_assert(reg_no_src < 16 && reg_no_dst < 16);
+        switch (bytes_dst) {
+            case 1:
+                extend_r8_to_r32(a, reg_no_dst, reg_no_src, false);
+                break;
+            case 2:
+                extend_r16_to_r32(a, reg_no_dst, reg_no_src, false);
+                break;
+            case 4:
+                mov_r_to_r_i32(a, reg_no_dst, reg_no_src);
+                break;
+            default:
+                bh_assert(0);
+                return false;
+        }
+    }
+    else if (kind_dst == JIT_REG_KIND_I64) {
+        bh_assert(reg_no_src < 16 && reg_no_dst < 16);
+        switch (bytes_dst) {
+            case 1:
+                extend_r8_to_r64(a, reg_no_dst, reg_no_src, false);
+                break;
+            case 2:
+                extend_r16_to_r64(a, reg_no_dst, reg_no_src, false);
+                break;
+            case 4:
+                extend_r32_to_r64(a, reg_no_dst, reg_no_src, false);
+                break;
+            case 8:
+                mov_r_to_r_i64(a, reg_no_dst, reg_no_src);
+                break;
+            default:
+                bh_assert(0);
+                return false;
+        }
+    }
+    else {
+        /* Unsupported kind: report the failure like the unsupported
+           byte counts above do, instead of returning success without
+           having emitted anything when the assertion is disabled */
+        bh_assert(0);
+        return false;
+    }
+    return true;
+}
+
+/**
+ * Encode a lock-prefixed cmpxchg on a memory operand. When calling this
+ * function, the value for comparison should already be in register
+ * al/ax/eax/rax
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data,
+ *        could be 1(byte), 2(short), 4(int32), 8(int64); it selects
+ *        the width of the exchange register
+ * @param kind_dst the kind of data to move, could be I32, I64
+ * @param reg_no_xchg the index of register hold exchange value
+ * @param m_dst the dest memory operand
+ *
+ * @return true if success, false otherwise
+ */
+static bool
+at_cmpxchg(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst,
+           int32 reg_no_xchg, x86::Mem &m_dst)
+{
+    bh_assert((kind_dst == JIT_REG_KIND_I32 && bytes_dst <= 4)
+              || kind_dst == JIT_REG_KIND_I64);
+    bh_assert(reg_no_xchg < 16);
+
+    if (bytes_dst == 1) {
+        a.lock().cmpxchg(m_dst, regs_i8[reg_no_xchg]);
+    }
+    else if (bytes_dst == 2) {
+        a.lock().cmpxchg(m_dst, regs_i16[reg_no_xchg]);
+    }
+    else if (bytes_dst == 4) {
+        a.lock().cmpxchg(m_dst, regs_i32[reg_no_xchg]);
+    }
+    else if (bytes_dst == 8) {
+        a.lock().cmpxchg(m_dst, regs_i64[reg_no_xchg]);
+    }
+    else {
+        /* Unsupported data size */
+        bh_assert(0);
+        return false;
+    }
+    return true;
+}
+
+/**
+ * Encode atomic compare and exchange on memory addressed by a base
+ * register plus an offset register, with the replacement value held in
+ * a register. The cmpxchg is emitted through at_cmpxchg, then the
+ * al/ax/eax/rax register is extended in place to the I32 or I64 kind
+ * through extend_r_to_r
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data to actual operated on(load,
+ * compare, replacement) could be 1(byte), 2(short), 4(int32), 8(int64)
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_xchg the no of register that stores the conditionally
+ * replacement value
+ * @param reg_no_base the no of register that stores the base address
+ *        of src&dst memory
+ * @param reg_no_offset the no of register that stores the offset address
+ *        of src&dst memory
+ * @return true if success, false otherwise
+ */
+static bool
+at_cmpxchg_r_ra_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                                uint32 kind_dst, int32 reg_no_xchg,
+                                int32 reg_no_base, int32 reg_no_offset)
+{
+    x86::Mem mem(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                 bytes_dst);
+
+    if (!at_cmpxchg(a, bytes_dst, kind_dst, reg_no_xchg, mem))
+        return false;
+
+    /* Extend the value left in the a register to the full kind */
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, REG_RAX_IDX);
+}
+
+/**
+ * Encode atomic compare and exchange on memory addressed by a base
+ * register plus an immediate offset, with the replacement value held in
+ * a register. The cmpxchg is emitted through at_cmpxchg, then the
+ * al/ax/eax/rax register is extended in place to the I32 or I64 kind
+ * through extend_r_to_r
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data to actual operated on(load,
+ * compare, replacement) could be 1(byte), 2(short), 4(int32), 8(int64)
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_xchg the no of register that stores the conditionally
+ * replacement value
+ * @param reg_no_base the no of register that stores the base address
+ *        of src&dst memory
+ * @param offset the offset address of the memory
+ * @return true if success, false otherwise
+ */
+static bool
+at_cmpxchg_r_ra_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                                  uint32 kind_dst, int32 reg_no_xchg,
+                                  int32 reg_no_base, int32 offset)
+{
+    x86::Mem mem(regs_i64[reg_no_base], offset, bytes_dst);
+
+    if (!at_cmpxchg(a, bytes_dst, kind_dst, reg_no_xchg, mem))
+        return false;
+
+    /* Extend the value left in the a register to the full kind */
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, REG_RAX_IDX);
+}
+
+/**
+ * Encode atomic compare and exchange on memory addressed by a base
+ * register plus an offset register, with an immediate replacement
+ * value. The immediate is first moved into a register through
+ * mov_imm_to_free_reg, the cmpxchg is emitted through at_cmpxchg, then
+ * the al/ax/eax/rax register is extended in place to the I32 or I64
+ * kind through extend_r_to_r
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data to actual operated on(load,
+ * compare, replacement) could be 1(byte), 2(short), 4(int32), 8(int64)
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param data_xchg pointer to the immediate data for exchange
+ * (conditionally replacement value)
+ * @param reg_no_base the no of register that stores the base address
+ *        of src&dst memory
+ * @param reg_no_offset the no of register that stores the offset address
+ *        of src&dst memory
+ * @return true if success, false otherwise
+ */
+static bool
+at_cmpxchg_imm_ra_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                                  uint32 kind_dst, void *data_xchg,
+                                  int32 reg_no_base, int32 reg_no_offset)
+{
+    x86::Mem mem(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                 bytes_dst);
+    Imm xchg_imm;
+    uint32 xchg_reg_no;
+
+    /* Materialize the replacement value in a register */
+    imm_set_value(xchg_imm, data_xchg, bytes_dst);
+    xchg_reg_no = mov_imm_to_free_reg(a, xchg_imm, bytes_dst);
+
+    if (!at_cmpxchg(a, bytes_dst, kind_dst, xchg_reg_no, mem))
+        return false;
+
+    /* Extend the value left in the a register to the full kind */
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, REG_RAX_IDX);
+}
+
+/**
+ * Encode atomic compare and exchange on memory addressed by a base
+ * register plus an immediate offset, with an immediate replacement
+ * value. The immediate is first moved into a register through
+ * mov_imm_to_free_reg, the cmpxchg is emitted through at_cmpxchg, then
+ * the al/ax/eax/rax register is extended in place to the I32 or I64
+ * kind through extend_r_to_r
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data to actual operated on(load,
+ * compare, replacement) could be 1(byte), 2(short), 4(int32), 8(int64)
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param data_xchg pointer to the immediate data for exchange
+ * (conditionally replacement value)
+ * @param reg_no_base the no of register that stores the base address
+ *        of src&dst memory
+ * @param offset the offset address of the memory
+ * @return true if success, false otherwise
+ */
+static bool
+at_cmpxchg_imm_ra_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                                    uint32 kind_dst, void *data_xchg,
+                                    int32 reg_no_base, int32 offset)
+{
+    x86::Mem mem(regs_i64[reg_no_base], offset, bytes_dst);
+    Imm xchg_imm;
+    uint32 xchg_reg_no;
+
+    /* Materialize the replacement value in a register */
+    imm_set_value(xchg_imm, data_xchg, bytes_dst);
+    xchg_reg_no = mov_imm_to_free_reg(a, xchg_imm, bytes_dst);
+
+    if (!at_cmpxchg(a, bytes_dst, kind_dst, xchg_reg_no, mem))
+        return false;
+
+    /* Extend the value left in the a register to the full kind */
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, REG_RAX_IDX);
+}
+
+/**
+ * Encode insn cmpxchg: CMPXCHG_type r0, r1, r2, r3, r4
+ *
+ * The expansion uses r0, r1, r2, r3, cc and a from the enclosing scope:
+ * r0 is the replacement value (register or constant), r1 is the
+ * expected value (a non-const register whose no must be that of
+ * eax/rax), r2 is the memory base address (a non-const register) and r3
+ * is the offset (an I32 constant or an I64 register). It executes
+ * GOTO_FAIL when the selected at_cmpxchg_xxx helper returns false.
+ *
+ * @param kind the data kind, can only be I32 or I64
+ * @param type the C type of the variable that holds the replacement
+ * value when r0 is a constant
+ * @param bytes_dst the byte number of dst data
+ */
+#define CMPXCHG_R_R_R_R_R(kind, type, bytes_dst)                           \
+    do {                                                                   \
+        type data_xchg = 0;                                                \
+        int32 reg_no_xchg = 0, reg_no_cmp = 0, reg_no_base = 0,            \
+              reg_no_offset = 0;                                           \
+        int32 offset = 0;                                                  \
+        bool _ret = false;                                                 \
+        if (jit_reg_is_const(r3)) {                                        \
+            CHECK_KIND(r3, JIT_REG_KIND_I32);                              \
+        }                                                                  \
+        else {                                                             \
+            CHECK_KIND(r3, JIT_REG_KIND_I64);                              \
+        }                                                                  \
+        /* r1: expected value(it must in register a)                       \
+         * r2: memory base addr can't be const */                          \
+        CHECK_NCONST(r1);                                                  \
+        reg_no_cmp = jit_reg_no(r1);                                       \
+        bh_assert(reg_no_cmp == REG_EAX_IDX || reg_no_cmp == REG_RAX_IDX); \
+        CHECK_REG_NO(reg_no_cmp, jit_reg_kind(r1));                        \
+        CHECK_NCONST(r2);                                                  \
+        reg_no_base = jit_reg_no(r2);                                      \
+        CHECK_REG_NO(reg_no_base, jit_reg_kind(r2));                       \
+        /* r0: replacement value r3: offset can be const */                \
+        if (jit_reg_is_const(r0))                                          \
+            data_xchg = jit_cc_get_const_##kind(cc, r0);                   \
+        else {                                                             \
+            reg_no_xchg = jit_reg_no(r0);                                  \
+            CHECK_REG_NO(reg_no_xchg, jit_reg_kind(r0));                   \
+        }                                                                  \
+        if (jit_reg_is_const(r3))                                          \
+            offset = jit_cc_get_const_I32(cc, r3);                         \
+        else {                                                             \
+            reg_no_offset = jit_reg_no(r3);                                \
+            CHECK_REG_NO(reg_no_offset, jit_reg_kind(r3));                 \
+        }                                                                  \
+                                                                           \
+        if (jit_reg_is_const(r0)) {                                        \
+            if (jit_reg_is_const(r3))                                      \
+                _ret = at_cmpxchg_imm_ra_base_r_offset_imm(                \
+                    a, bytes_dst, JIT_REG_KIND_##kind, &data_xchg,         \
+                    reg_no_base, offset);                                  \
+            else                                                           \
+                _ret = at_cmpxchg_imm_ra_base_r_offset_r(                  \
+                    a, bytes_dst, JIT_REG_KIND_##kind, &data_xchg,         \
+                    reg_no_base, reg_no_offset);                           \
+        }                                                                  \
+        else {                                                             \
+            if (jit_reg_is_const(r3))                                      \
+                _ret = at_cmpxchg_r_ra_base_r_offset_imm(                  \
+                    a, bytes_dst, JIT_REG_KIND_##kind, reg_no_xchg,        \
+                    reg_no_base, offset);                                  \
+            else                                                           \
+                _ret = at_cmpxchg_r_ra_base_r_offset_r(                    \
+                    a, bytes_dst, JIT_REG_KIND_##kind, reg_no_xchg,        \
+                    reg_no_base, reg_no_offset);                           \
+        }                                                                  \
+        if (!_ret)                                                         \
+            GOTO_FAIL;                                                     \
+    } while (0)
+
+/**
+ * Encode an in-place negation (x86 neg) of a general purpose register
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst width in bytes of the register view to negate,
+ *        must be 1(byte), 2(short), 4(int32) or 8(int64)
+ * @param kind_dst the kind of the data, I32 (bytes_dst <= 4) or I64
+ * @param reg_no_src the index of the register that is negated in place
+ *
+ * @return true if bytes_dst is a supported width, false otherwise
+ */
+static bool
+neg_r(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst, int32 reg_no_src)
+{
+    bh_assert((kind_dst == JIT_REG_KIND_I32 && bytes_dst <= 4)
+              || kind_dst == JIT_REG_KIND_I64);
+    bh_assert(reg_no_src < 16);
+
+    if (bytes_dst == 1) {
+        a.neg(regs_i8[reg_no_src]);
+    }
+    else if (bytes_dst == 2) {
+        a.neg(regs_i16[reg_no_src]);
+    }
+    else if (bytes_dst == 4) {
+        a.neg(regs_i32[reg_no_src]);
+    }
+    else if (bytes_dst == 8) {
+        a.neg(regs_i64[reg_no_src]);
+    }
+    else {
+        bh_assert(0);
+        return false;
+    }
+    return true;
+}
+
+/**
+ * Encode atomic exchange and add (lock xadd) between a register and memory
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst width in bytes of the operands,
+ *        must be 1(byte), 2(short), 4(int32) or 8(int64)
+ * @param kind_dst the kind of the data, I32 (bytes_dst <= 4) or I64
+ * @param reg_no_src the index of the register operand of the xadd
+ * @param m_dst the memory operand of the xadd
+ *
+ * @return true if bytes_dst is a supported width, false otherwise
+ */
+static bool
+at_xadd(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst, int32 reg_no_src,
+        x86::Mem &m_dst)
+{
+    bh_assert((kind_dst == JIT_REG_KIND_I32 && bytes_dst <= 4)
+              || kind_dst == JIT_REG_KIND_I64);
+    bh_assert(reg_no_src < 16);
+
+    if (bytes_dst == 1) {
+        a.lock().xadd(m_dst, regs_i8[reg_no_src]);
+    }
+    else if (bytes_dst == 2) {
+        a.lock().xadd(m_dst, regs_i16[reg_no_src]);
+    }
+    else if (bytes_dst == 4) {
+        a.lock().xadd(m_dst, regs_i32[reg_no_src]);
+    }
+    else if (bytes_dst == 8) {
+        a.lock().xadd(m_dst, regs_i64[reg_no_src]);
+    }
+    else {
+        bh_assert(0);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Encode atomic rmw add of an immediate to memory at reg base + imm offset:
+ * the immediate goes through mov_imm_to_free_reg(), then at_xadd() is used
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data to add
+ * @param reg_no_base the no of register that stores the base address
+ * @param offset the constant offset from the base address
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_add_imm_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                                 uint32 kind_dst, int32 reg_no_dst,
+                                 void *data_src, int32 reg_no_base,
+                                 int32 offset)
+{
+    Imm addend;
+    imm_set_value(addend, data_src, bytes_dst);
+    uint32 reg_no_tmp = mov_imm_to_free_reg(a, addend, bytes_dst);
+    x86::Mem target(regs_i64[reg_no_base], offset, bytes_dst);
+    if (!at_xadd(a, bytes_dst, kind_dst, reg_no_tmp, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_tmp, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw add of an immediate to memory at reg base + reg offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data to add
+ * @param reg_no_base the no of register that stores the base address
+ * @param reg_no_offset the no of register that stores the memory offset
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_add_imm_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                               uint32 kind_dst, int32 reg_no_dst,
+                               void *data_src, int32 reg_no_base,
+                               int32 reg_no_offset)
+{
+    Imm addend;
+    imm_set_value(addend, data_src, bytes_dst);
+    uint32 reg_no_tmp = mov_imm_to_free_reg(a, addend, bytes_dst);
+    x86::Mem target(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                    bytes_dst);
+    if (!at_xadd(a, bytes_dst, kind_dst, reg_no_tmp, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_tmp, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw add of a register value to memory at reg base + imm
+ * offset: at_xadd() on the memory, then extend_r_to_r() into reg_no_dst
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the value to add
+ * @param reg_no_base the no of register that stores the base address
+ * @param offset the constant offset from the base address
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_add_r_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                               uint32 kind_dst, int32 reg_no_dst,
+                               int32 reg_no_src, int32 reg_no_base,
+                               int32 offset)
+{
+    x86::Mem target(regs_i64[reg_no_base], offset, bytes_dst);
+    if (!at_xadd(a, bytes_dst, kind_dst, reg_no_src, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_src, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw add of a register value to memory at reg base + reg offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the value to add
+ * @param reg_no_base the no of register that stores the base address
+ * @param reg_no_offset the no of register that stores the memory offset
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_add_r_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                             uint32 kind_dst, int32 reg_no_dst,
+                             int32 reg_no_src, int32 reg_no_base,
+                             int32 reg_no_offset)
+{
+    x86::Mem target(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                    bytes_dst);
+    if (!at_xadd(a, bytes_dst, kind_dst, reg_no_src, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_src, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw sub of an immediate from memory at reg base + imm offset:
+ * the immediate is moved to a register, negated by neg_r(), then at_xadd()
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data to subtract
+ * @param reg_no_base the no of register that stores the base address
+ * @param offset the constant offset from the base address
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_sub_imm_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                                 uint32 kind_dst, int32 reg_no_dst,
+                                 void *data_src, int32 reg_no_base,
+                                 int32 offset)
+{
+    Imm subtrahend;
+    imm_set_value(subtrahend, data_src, bytes_dst);
+    uint32 reg_no_tmp = mov_imm_to_free_reg(a, subtrahend, bytes_dst);
+    x86::Mem target(regs_i64[reg_no_base], offset, bytes_dst);
+    if (!neg_r(a, bytes_dst, kind_dst, reg_no_tmp)
+        || !at_xadd(a, bytes_dst, kind_dst, reg_no_tmp, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_tmp, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw sub (neg_r + at_xadd) of an imm, reg base + reg offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data to subtract
+ * @param reg_no_base the no of register that stores the base address
+ * @param reg_no_offset the no of register that stores the memory offset
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_sub_imm_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                               uint32 kind_dst, int32 reg_no_dst,
+                               void *data_src, int32 reg_no_base,
+                               int32 reg_no_offset)
+{
+    Imm subtrahend;
+    imm_set_value(subtrahend, data_src, bytes_dst);
+    uint32 reg_no_tmp = mov_imm_to_free_reg(a, subtrahend, bytes_dst);
+    x86::Mem target(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                    bytes_dst);
+    if (!neg_r(a, bytes_dst, kind_dst, reg_no_tmp)
+        || !at_xadd(a, bytes_dst, kind_dst, reg_no_tmp, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_tmp, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw sub of a register value from memory at reg base + imm
+ * offset: reg_no_src is negated in place by neg_r(), then at_xadd() is used
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the value to subtract
+ * @param reg_no_base the no of register that stores the base address
+ * @param offset the constant offset from the base address
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_sub_r_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                               uint32 kind_dst, int32 reg_no_dst,
+                               int32 reg_no_src, int32 reg_no_base,
+                               int32 offset)
+{
+    x86::Mem target(regs_i64[reg_no_base], offset, bytes_dst);
+    if (!neg_r(a, bytes_dst, kind_dst, reg_no_src)
+        || !at_xadd(a, bytes_dst, kind_dst, reg_no_src, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_src, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw sub (neg_r + at_xadd) of a reg, reg base + reg offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register to subtract, negated in place
+ * @param reg_no_base the no of register that stores the base address
+ * @param reg_no_offset the no of register that stores the memory offset
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_sub_r_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                             uint32 kind_dst, int32 reg_no_dst,
+                             int32 reg_no_src, int32 reg_no_base,
+                             int32 reg_no_offset)
+{
+    x86::Mem target(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                    bytes_dst);
+    if (!neg_r(a, bytes_dst, kind_dst, reg_no_src)
+        || !at_xadd(a, bytes_dst, kind_dst, reg_no_src, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_src, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw xchg of an immediate with memory at reg base + imm
+ * offset: mov_imm_to_free_reg() on the immediate, then xchg_r_to_m()
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data to exchange with memory
+ * @param reg_no_base the no of register that stores the base address
+ * @param offset the constant offset from the base address
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_xchg_imm_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                                  uint32 kind_dst, int32 reg_no_dst,
+                                  void *data_src, int32 reg_no_base,
+                                  int32 offset)
+{
+    Imm new_value;
+    imm_set_value(new_value, data_src, bytes_dst);
+    uint32 reg_no_tmp = mov_imm_to_free_reg(a, new_value, bytes_dst);
+    x86::Mem target(regs_i64[reg_no_base], offset, bytes_dst);
+    if (!xchg_r_to_m(a, bytes_dst, kind_dst, target, reg_no_tmp))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_tmp, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw xchg of an immediate with memory at reg base + reg offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data to exchange with memory
+ * @param reg_no_base the no of register that stores the base address
+ * @param reg_no_offset the no of register that stores the memory offset
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_xchg_imm_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                                uint32 kind_dst, int32 reg_no_dst,
+                                void *data_src, int32 reg_no_base,
+                                int32 reg_no_offset)
+{
+    Imm new_value;
+    imm_set_value(new_value, data_src, bytes_dst);
+    uint32 reg_no_tmp = mov_imm_to_free_reg(a, new_value, bytes_dst);
+    x86::Mem target(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                    bytes_dst);
+    if (!xchg_r_to_m(a, bytes_dst, kind_dst, target, reg_no_tmp))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_tmp, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw xchg of a register value with memory at reg base + imm
+ * offset: xchg_r_to_m() on the memory, then extend_r_to_r() into reg_no_dst
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the value to exchange
+ * @param reg_no_base the no of register that stores the base address
+ * @param offset the constant offset from the base address
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_xchg_r_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                                uint32 kind_dst, int32 reg_no_dst,
+                                int32 reg_no_src, int32 reg_no_base,
+                                int32 offset)
+{
+    x86::Mem target(regs_i64[reg_no_base], offset, bytes_dst);
+    if (!xchg_r_to_m(a, bytes_dst, kind_dst, target, reg_no_src))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_src, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw xchg of a register value with memory at reg base + reg
+ * offset
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the value to exchange
+ * @param reg_no_base the no of register that stores the base address
+ * @param reg_no_offset the no of register that stores the memory offset
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_xchg_r_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                              uint32 kind_dst, int32 reg_no_dst,
+                              int32 reg_no_src, int32 reg_no_base,
+                              int32 reg_no_offset)
+{
+    x86::Mem target(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                    bytes_dst);
+    if (!xchg_r_to_m(a, bytes_dst, kind_dst, target, reg_no_src))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, reg_no_src, reg_no_dst);
+}
+
+/**
+ * Encode insn rmw logical operation: a cmpxchg retry loop using RAX and RBX
+ * @param bin_op the operation, can be and/or/xor
+ * @param kind the data kind, I32 or I64 (unused: body reads kind_dst)
+ * @param bytes_dst the byte number of dst data, could be 1, 2, 4, 8
+ */
+#define AT_RMW_LOGICAL_LOOP(bin_op, kind, bytes_dst)                           \
+    do {                                                                       \
+        bh_assert((kind_dst == JIT_REG_KIND_I32 && bytes_dst <= 4)             \
+                  || kind_dst == JIT_REG_KIND_I64);                            \
+        bh_assert(reg_no_src < 16 && reg_no_dst < 16);                         \
+        /* read original value in memory(operand 1) to rax(expected) */        \
+        mov_m_to_r(a, bytes_dst, kind_dst, false, REG_RAX_IDX, m_dst);         \
+        Label loop = a.newLabel();                                             \
+        /* check whether loop is valid, and bind the loop label                \
+         * to the current position in the code. */                             \
+        if (!loop.isValid() || a.bind(loop) != kErrorOk)                       \
+            return false;                                                      \
+        /* move operand 1 to temp reg rbx */                                   \
+        mov_r_to_r(a, kind_dst, REG_RBX_IDX, REG_RAX_IDX);                     \
+        /* actual logical operation with operand 2, result save to rbx */      \
+        switch (bytes_dst) {                                                   \
+            case 1:                                                            \
+                a.bin_op##_(regs_i8[REG_RBX_IDX], regs_i8[reg_no_src]);        \
+                break;                                                         \
+            case 2:                                                            \
+                a.bin_op##_(regs_i16[REG_RBX_IDX], regs_i16[reg_no_src]);      \
+                break;                                                         \
+            case 4:                                                            \
+                a.bin_op##_(regs_i32[REG_RBX_IDX], regs_i32[reg_no_src]);      \
+                break;                                                         \
+            case 8:                                                            \
+                a.bin_op##_(regs_i64[REG_RBX_IDX], regs_i64[reg_no_src]);      \
+                break;                                                         \
+            default:                                                           \
+                bh_assert(0);                                                  \
+                return false;                                                  \
+        }                                                                      \
+        /* cmp with read value in RAX, try to change with result value in RBX  \
+         * REG, if change successfully, mem data is changed and exit loop(ZF   \
+         * is set) if not, loop again(ZF is clear) and tries to do logical ops \
+         * atomically */                                                       \
+        at_cmpxchg(a, bytes_dst, kind_dst, REG_RBX_IDX, m_dst);                \
+        a.jne(loop);                                                           \
+        return true;                                                           \
+    } while (0)
+
+/**
+ * Encode atomic logical binary operation: and, via AT_RMW_LOGICAL_LOOP which
+ * loads the memory value into RAX and uses RBX as the temp result register
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data: 1, 2, 4 or 8
+ * @param kind_dst the kind of data to move, could be I32, I64
+ * @param reg_no_dst the index of dest register (only checked by an assert)
+ * @param reg_no_src the index of register holding the operand of the and
+ * @param m_dst the dest memory operand
+ *
+ * @return false if the loop label fails or bytes_dst is invalid, else true
+ */
+static bool
+at_and(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst, int32 reg_no_dst,
+       int32 reg_no_src, x86::Mem &m_dst)
+{
+    AT_RMW_LOGICAL_LOOP(and, kind_dst, bytes_dst);
+}
+
+/**
+ * Encode atomic logical binary operation: or, via AT_RMW_LOGICAL_LOOP which
+ * loads the memory value into RAX and uses RBX as the temp result register
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data: 1, 2, 4 or 8
+ * @param kind_dst the kind of data to move, could be I32, I64
+ * @param reg_no_dst the index of dest register (only checked by an assert)
+ * @param reg_no_src the index of register holding the operand of the or
+ * @param m_dst the dest memory operand
+ *
+ * @return false if the loop label fails or bytes_dst is invalid, else true
+ */
+static bool
+at_or(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst, int32 reg_no_dst,
+      int32 reg_no_src, x86::Mem &m_dst)
+{
+    AT_RMW_LOGICAL_LOOP(or, kind_dst, bytes_dst);
+}
+/**
+ * Encode atomic logical binary operation: xor, via AT_RMW_LOGICAL_LOOP which
+ * loads the memory value into RAX and uses RBX as the temp result register
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data: 1, 2, 4 or 8
+ * @param kind_dst the kind of data to move, could be I32, I64
+ * @param reg_no_dst the index of dest register (only checked by an assert)
+ * @param reg_no_src the index of register holding the operand of the xor
+ * @param m_dst the dest memory operand
+ *
+ * @return false if the loop label fails or bytes_dst is invalid, else true
+ */
+static bool
+at_xor(x86::Assembler &a, uint32 bytes_dst, uint32 kind_dst, int32 reg_no_dst,
+       int32 reg_no_src, x86::Mem &m_dst)
+{
+    AT_RMW_LOGICAL_LOOP(xor, kind_dst, bytes_dst);
+}
+
+/**
+ * Encode atomic rmw and of an immediate with memory at reg base + imm offset:
+ * at_and() on the materialized immediate, result extended from RAX
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data used as and operand
+ * @param reg_no_base the no of register that stores the base address
+ * @param offset the constant offset from the base address
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_and_imm_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                                 uint32 kind_dst, int32 reg_no_dst,
+                                 void *data_src, int32 reg_no_base,
+                                 int32 offset)
+{
+    Imm mask;
+    imm_set_value(mask, data_src, bytes_dst);
+    uint32 reg_no_tmp = mov_imm_to_free_reg(a, mask, bytes_dst);
+    x86::Mem target(regs_i64[reg_no_base], offset, bytes_dst);
+    if (!at_and(a, bytes_dst, kind_dst, reg_no_dst, reg_no_tmp, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw and of an immediate with memory at reg base + reg offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data used as and operand
+ * @param reg_no_base the no of register that stores the base address
+ * @param reg_no_offset the no of register that stores the memory offset
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_and_imm_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                               uint32 kind_dst, int32 reg_no_dst,
+                               void *data_src, int32 reg_no_base,
+                               int32 reg_no_offset)
+{
+    Imm mask;
+    imm_set_value(mask, data_src, bytes_dst);
+    uint32 reg_no_tmp = mov_imm_to_free_reg(a, mask, bytes_dst);
+    x86::Mem target(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                    bytes_dst);
+    if (!at_and(a, bytes_dst, kind_dst, reg_no_dst, reg_no_tmp, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw and of a register value with memory at reg base + imm
+ * offset: at_and() on the memory, result extended from RAX to reg_no_dst
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the and operand
+ * @param reg_no_base the no of register that stores the base address
+ * @param offset the constant offset from the base address
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_and_r_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                               uint32 kind_dst, int32 reg_no_dst,
+                               int32 reg_no_src, int32 reg_no_base,
+                               int32 offset)
+{
+    x86::Mem target(regs_i64[reg_no_base], offset, bytes_dst);
+    if (!at_and(a, bytes_dst, kind_dst, reg_no_dst, reg_no_src, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw and of a register value with memory at reg base + reg
+ * offset
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the and operand
+ * @param reg_no_base the no of register that stores the base address
+ * @param reg_no_offset the no of register that stores the memory offset
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_and_r_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                             uint32 kind_dst, int32 reg_no_dst,
+                             int32 reg_no_src, int32 reg_no_base,
+                             int32 reg_no_offset)
+{
+    x86::Mem target(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                    bytes_dst);
+    if (!at_and(a, bytes_dst, kind_dst, reg_no_dst, reg_no_src, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw or of an immediate with memory at reg base + imm offset:
+ * at_or() on the materialized immediate, result extended from RAX
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data used as or operand
+ * @param reg_no_base the no of register that stores the base address
+ * @param offset the constant offset from the base address
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_or_imm_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                                uint32 kind_dst, int32 reg_no_dst,
+                                void *data_src, int32 reg_no_base, int32 offset)
+{
+    Imm mask;
+    imm_set_value(mask, data_src, bytes_dst);
+    uint32 reg_no_tmp = mov_imm_to_free_reg(a, mask, bytes_dst);
+    x86::Mem target(regs_i64[reg_no_base], offset, bytes_dst);
+    if (!at_or(a, bytes_dst, kind_dst, reg_no_dst, reg_no_tmp, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw or of an immediate with memory at reg base + reg offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the bytes number of the data operated on: 1, 2, 4 or 8
+ * @param kind_dst the kind of the data, could be I32, I64
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data used as or operand
+ * @param reg_no_base the no of register that stores the base address
+ * @param reg_no_offset the no of register that stores the memory offset
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_or_imm_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                              uint32 kind_dst, int32 reg_no_dst, void *data_src,
+                              int32 reg_no_base, int32 reg_no_offset)
+{
+    Imm mask;
+    imm_set_value(mask, data_src, bytes_dst);
+    uint32 reg_no_tmp = mov_imm_to_free_reg(a, mask, bytes_dst);
+    x86::Mem target(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                    bytes_dst);
+    if (!at_or(a, bytes_dst, kind_dst, reg_no_dst, reg_no_tmp, target))
+        return false;
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw or with a register source operand and a memory
+ * operand addressed by base register + imm offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the size in bytes of the data operated on,
+ *        could be 1(byte), 2(short), 4(int32), 8(int64)
+ * @param kind_dst the kind of the dst register, forwarded to at_or()
+ *        and extend_r_to_r()
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the src data
+ * @param reg_no_base the no of register that stores the base address
+ *        of the memory operand
+ * @param offset the offset address of the memory
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_or_r_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                              uint32 kind_dst, int32 reg_no_dst,
+                              int32 reg_no_src, int32 reg_no_base, int32 offset)
+{
+    /* memory operand [base + offset] with operand size bytes_dst */
+    x86::Mem mem_opnd(regs_i64[reg_no_base], offset, bytes_dst);
+
+    if (!at_or(a, bytes_dst, kind_dst, reg_no_dst, reg_no_src, mem_opnd))
+        return false;
+
+    /* presumably at_or() leaves the loaded value in RAX and this extends
+       it into the dst register - confirm against at_or() */
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw or with a register source operand and a memory
+ * operand addressed by base register + offset register
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the size in bytes of the data operated on,
+ *        could be 1(byte), 2(short), 4(int32), 8(int64)
+ * @param kind_dst the kind of the dst register, forwarded to at_or()
+ *        and extend_r_to_r()
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the src data
+ * @param reg_no_base the no of register that stores the base address
+ *        of the memory operand
+ * @param reg_no_offset the no of register that stores the offset of the memory
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_or_r_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                            uint32 kind_dst, int32 reg_no_dst, int32 reg_no_src,
+                            int32 reg_no_base, int32 reg_no_offset)
+{
+    /* memory operand [base + index], no shift, no displacement */
+    x86::Mem mem_opnd(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                      bytes_dst);
+
+    if (!at_or(a, bytes_dst, kind_dst, reg_no_dst, reg_no_src, mem_opnd))
+        return false;
+
+    /* presumably at_or() leaves the loaded value in RAX and this extends
+       it into the dst register - confirm against at_or() */
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw xor with an immediate source operand and a memory
+ * operand addressed by base register + imm offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the size in bytes of the data operated on,
+ *        could be 1(byte), 2(short), 4(int32), 8(int64)
+ * @param kind_dst the kind of the dst register, forwarded to at_xor()
+ *        and extend_r_to_r()
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data(source operand)
+ * @param reg_no_base the no of register that stores the base address
+ *        of the memory operand
+ * @param offset the offset address of the memory
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_xor_imm_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                                 uint32 kind_dst, int32 reg_no_dst,
+                                 void *data_src, int32 reg_no_base,
+                                 int32 offset)
+{
+    /* memory operand [base + offset] with operand size bytes_dst */
+    x86::Mem mem_opnd(regs_i64[reg_no_base], offset, bytes_dst);
+    Imm src_imm;
+    uint32 reg_no_tmp;
+
+    /* at_xor() takes its source as a register no, so the immediate is
+       moved into a free register first */
+    imm_set_value(src_imm, data_src, bytes_dst);
+    reg_no_tmp = mov_imm_to_free_reg(a, src_imm, bytes_dst);
+
+    if (!at_xor(a, bytes_dst, kind_dst, reg_no_dst, reg_no_tmp, mem_opnd))
+        return false;
+
+    /* presumably at_xor() leaves the loaded value in RAX and this extends
+       it into the dst register - confirm against at_xor() */
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw xor with an immediate source operand and a memory
+ * operand addressed by base register + offset register
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the size in bytes of the data operated on,
+ *        could be 1(byte), 2(short), 4(int32), 8(int64)
+ * @param kind_dst the kind of the dst register, forwarded to at_xor()
+ *        and extend_r_to_r()
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param data_src pointer to the immediate data(source operand)
+ * @param reg_no_base the no of register that stores the base address
+ *        of the memory operand
+ * @param reg_no_offset the no of register that stores the offset of the memory
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_xor_imm_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                               uint32 kind_dst, int32 reg_no_dst,
+                               void *data_src, int32 reg_no_base,
+                               int32 reg_no_offset)
+{
+    /* memory operand [base + index], no shift, no displacement */
+    x86::Mem mem_opnd(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                      bytes_dst);
+    Imm src_imm;
+    uint32 reg_no_tmp;
+
+    /* at_xor() takes its source as a register no, so the immediate is
+       moved into a free register first */
+    imm_set_value(src_imm, data_src, bytes_dst);
+    reg_no_tmp = mov_imm_to_free_reg(a, src_imm, bytes_dst);
+
+    if (!at_xor(a, bytes_dst, kind_dst, reg_no_dst, reg_no_tmp, mem_opnd))
+        return false;
+
+    /* presumably at_xor() leaves the loaded value in RAX and this extends
+       it into the dst register - confirm against at_xor() */
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw xor with a register source operand and a memory
+ * operand addressed by base register + imm offset
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the size in bytes of the data operated on,
+ *        could be 1(byte), 2(short), 4(int32), 8(int64)
+ * @param kind_dst the kind of the dst register, forwarded to at_xor()
+ *        and extend_r_to_r()
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the src data
+ * @param reg_no_base the no of register that stores the base address
+ *        of the memory operand
+ * @param offset the offset address of the memory
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_xor_r_base_r_offset_imm(x86::Assembler &a, uint32 bytes_dst,
+                               uint32 kind_dst, int32 reg_no_dst,
+                               int32 reg_no_src, int32 reg_no_base,
+                               int32 offset)
+{
+    /* memory operand [base + offset] with operand size bytes_dst */
+    x86::Mem mem_opnd(regs_i64[reg_no_base], offset, bytes_dst);
+
+    if (!at_xor(a, bytes_dst, kind_dst, reg_no_dst, reg_no_src, mem_opnd))
+        return false;
+
+    /* presumably at_xor() leaves the loaded value in RAX and this extends
+       it into the dst register - confirm against at_xor() */
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode atomic rmw xor with a register source operand and a memory
+ * operand addressed by base register + offset register
+ *
+ * @param a the assembler to emit the code
+ * @param bytes_dst the size in bytes of the data operated on,
+ *        could be 1(byte), 2(short), 4(int32), 8(int64)
+ * @param kind_dst the kind of the dst register, forwarded to at_xor()
+ *        and extend_r_to_r()
+ * @param reg_no_dst the no of register that stores the returned value
+ * @param reg_no_src the no of register that stores the src data
+ * @param reg_no_base the no of register that stores the base address
+ *        of the memory operand
+ * @param reg_no_offset the no of register that stores the offset of the memory
+ * @return true if success, false otherwise
+ */
+static bool
+at_rmw_xor_r_base_r_offset_r(x86::Assembler &a, uint32 bytes_dst,
+                             uint32 kind_dst, int32 reg_no_dst,
+                             int32 reg_no_src, int32 reg_no_base,
+                             int32 reg_no_offset)
+{
+    /* memory operand [base + index], no shift, no displacement */
+    x86::Mem mem_opnd(regs_i64[reg_no_base], regs_i64[reg_no_offset], 0, 0,
+                      bytes_dst);
+
+    if (!at_xor(a, bytes_dst, kind_dst, reg_no_dst, reg_no_src, mem_opnd))
+        return false;
+
+    /* presumably at_xor() leaves the loaded value in RAX and this extends
+       it into the dst register - confirm against at_xor() */
+    return extend_r_to_r(a, bytes_dst, kind_dst, REG_RAX_IDX, reg_no_dst);
+}
+
+/**
+ * Encode insn rmw RMW_type r0, r1, r2, r3
+ *
+ * r0 is the returned value, r1 the source operand, r2 the memory base
+ * address and r3 the offset. Expects r0..r3, cc and the assembler a to be
+ * in scope where the macro is expanded, and invokes GOTO_FAIL when the
+ * selected encoder returns false.
+ *
+ * @param bin_op the operation, can be add/sub/xchg/and/or/xor
+ * @param kind the data kind, can only be I32 or I64
+ * @param type the C type used to hold the value of r1 when r1 is a const
+ * @param bytes_dst the byte number of dst data
+ */
+#define AT_RMW_R_R_R_R(bin_op, kind, type, bytes_dst)                          \
+    do {                                                                       \
+        const bool _src_is_imm = jit_reg_is_const(r1);                         \
+        const bool _off_is_imm = jit_reg_is_const(r3);                         \
+        type _imm_val = 0;                                                     \
+        int32 _dst_no = 0, _src_no = 0, _base_no = 0, _off_no = 0;             \
+        int32 _disp = 0;                                                       \
+        bool _ok = false;                                                      \
+        /* r3 (offset): an I32 const or an I64 register */                     \
+        if (_off_is_imm) {                                                     \
+            CHECK_KIND(r3, JIT_REG_KIND_I32);                                  \
+        }                                                                      \
+        else {                                                                 \
+            CHECK_KIND(r3, JIT_REG_KIND_I64);                                  \
+        }                                                                      \
+        /* r0 (returned value) */                                              \
+        _dst_no = jit_reg_no(r0);                                              \
+        CHECK_REG_NO(_dst_no, jit_reg_kind(r0));                               \
+        /* r2 (memory base address) has to be non-const */                     \
+        CHECK_NCONST(r2);                                                      \
+        _base_no = jit_reg_no(r2);                                             \
+        CHECK_REG_NO(_base_no, jit_reg_kind(r2));                              \
+        /* r1 (source operand): const or register */                           \
+        if (_src_is_imm) {                                                     \
+            _imm_val = jit_cc_get_const_##kind(cc, r1);                        \
+        }                                                                      \
+        else {                                                                 \
+            _src_no = jit_reg_no(r1);                                          \
+            CHECK_REG_NO(_src_no, jit_reg_kind(r1));                           \
+        }                                                                      \
+        if (_off_is_imm) {                                                     \
+            _disp = jit_cc_get_const_I32(cc, r3);                              \
+        }                                                                      \
+        else {                                                                 \
+            _off_no = jit_reg_no(r3);                                          \
+            CHECK_REG_NO(_off_no, jit_reg_kind(r3));                           \
+        }                                                                      \
+        /* pick the encoder matching the const-ness of r1 and r3 */            \
+        if (_src_is_imm && _off_is_imm)                                        \
+            _ok = at_rmw_##bin_op##_imm_base_r_offset_imm(                     \
+                a, bytes_dst, JIT_REG_KIND_##kind, _dst_no, &_imm_val,         \
+                _base_no, _disp);                                              \
+        else if (_src_is_imm)                                                  \
+            _ok = at_rmw_##bin_op##_imm_base_r_offset_r(                       \
+                a, bytes_dst, JIT_REG_KIND_##kind, _dst_no, &_imm_val,         \
+                _base_no, _off_no);                                            \
+        else if (_off_is_imm)                                                  \
+            _ok = at_rmw_##bin_op##_r_base_r_offset_imm(                       \
+                a, bytes_dst, JIT_REG_KIND_##kind, _dst_no, _src_no,           \
+                _base_no, _disp);                                              \
+        else                                                                   \
+            _ok = at_rmw_##bin_op##_r_base_r_offset_r(                         \
+                a, bytes_dst, JIT_REG_KIND_##kind, _dst_no, _src_no,           \
+                _base_no, _off_no);                                            \
+        if (!_ok)                                                              \
+            GOTO_FAIL;                                                         \
+    } while (0)
+
+/**
+ * Encode a memory fence by emitting a single mfence instruction
+ *
+ * @param a the assembler to emit the code
+ */
+static void
+fence(x86::Assembler &a)
+{
+    /* any status returned by the emitter is not inspected here */
+    (void)a.mfence();
+}
+
+/**
+ * Encode insn fence: expands to fence(a), so an assembler named a
+ * has to be in scope where the macro is used
+ */
+#define FENCE() fence(a)
+
+#endif
+
+bool
+jit_codegen_gen_native(JitCompContext *cc)
+{
+    bool atomic;
+    JitBasicBlock *block;
+    JitInsn *insn;
+    JitReg r0, r1, r2, r3, r4;
+    JmpInfo jmp_info_head;
+    bh_list *jmp_info_list = (bh_list *)&jmp_info_head;
+    uint32 label_index, label_num, i;
+    uint32 *label_offsets = NULL, code_size;
+#if CODEGEN_DUMP != 0
+    uint32 code_offset = 0;
+#endif
+    bool return_value = false, is_last_insn;
+    void **jitted_addr;
+    char *code_buf, *stream;
+
+    JitErrorHandler err_handler;
+    Environment env(Arch::kX64);
+    CodeHolder code;
+    code.init(env);
+    code.setErrorHandler(&err_handler);
+    x86::Assembler a(&code);
+
+    if (BH_LIST_SUCCESS != bh_list_init(jmp_info_list)) {
+        jit_set_last_error(cc, "init jmp info list failed");
+        return false;
+    }
+
+    label_num = jit_cc_label_num(cc);
+
+    if (!(label_offsets =
+              (uint32 *)jit_calloc(((uint32)sizeof(uint32)) * label_num))) {
+        jit_set_last_error(cc, "allocate memory failed");
+        goto fail;
+    }
+
+    for (i = 0; i < label_num; i++) {
+        if (i == 0)
+            label_index = 0;
+        else if (i == label_num - 1)
+            label_index = 1;
+        else
+            label_index = i + 1;
+
+        label_offsets[label_index] = code.sectionById(0)->buffer().size();
+
+        block = *jit_annl_basic_block(
+            cc, jit_reg_new(JIT_REG_KIND_L32, label_index));
+
+#if CODEGEN_DUMP != 0
+        os_printf("\nL%d:\n\n", label_index);
+#endif
+
+        JIT_FOREACH_INSN(block, insn)
+        {
+            is_last_insn = (insn->next == block) ? true : false;
+
+#if CODEGEN_DUMP != 0
+            os_printf("\n");
+            jit_dump_insn(cc, insn);
+#endif
+            switch (insn->opcode) {
+                case JIT_OP_MOV:
+                    LOAD_2ARGS();
+                    if (!lower_mov(cc, a, r0, r1))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_I8TOI32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, I32, i32, i8, int8);
+                    break;
+
+                case JIT_OP_I8TOI64:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I64, I32, i64, i8, int8);
+                    break;
+
+                case JIT_OP_I16TOI32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, I32, i32, i16, int16);
+                    break;
+
+                case JIT_OP_I16TOI64:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I64, I32, i64, i16, int16);
+                    break;
+
+                case JIT_OP_I32TOI8:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, I32, i8, i32, int32);
+                    break;
+
+                case JIT_OP_I32TOU8:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, I32, u8, i32, int32);
+                    break;
+
+                case JIT_OP_I32TOI16:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, I32, i16, i32, int32);
+                    break;
+
+                case JIT_OP_I32TOU16:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, I32, u16, i32, int32);
+                    break;
+
+                case JIT_OP_I32TOI64:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I64, I32, i64, i32, int32);
+                    break;
+
+                case JIT_OP_U32TOI64:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I64, I32, i64, u32, int32);
+                    break;
+
+                case JIT_OP_I32TOF32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(F32, I32, f32, i32, int32);
+                    break;
+
+                case JIT_OP_U32TOF32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(F32, I32, f32, u32, uint32);
+                    break;
+
+                case JIT_OP_I32TOF64:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(F64, I32, f64, i32, int32);
+                    break;
+
+                case JIT_OP_U32TOF64:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(F64, I32, f64, u32, uint32);
+                    break;
+
+                case JIT_OP_I64TOI8:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, I64, i8, i64, int64);
+                    break;
+
+                case JIT_OP_I64TOI16:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, I64, i16, i64, int64);
+                    break;
+
+                case JIT_OP_I64TOI32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, I64, i32, i64, int64);
+                    break;
+
+                case JIT_OP_I64TOF32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(F32, I64, f32, i64, int64);
+                    break;
+
+                case JIT_OP_I64TOF64:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(F64, I64, f64, i64, int64);
+                    break;
+
+                case JIT_OP_F32TOI32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, F32, i32, f32, float32);
+                    break;
+
+                case JIT_OP_F32TOI64:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I64, F32, i64, f32, float32);
+                    break;
+
+                case JIT_OP_F32TOF64:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(F64, F32, f64, f32, float32);
+                    break;
+
+                case JIT_OP_F32TOU32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, F32, u32, f32, float32);
+                    break;
+
+                case JIT_OP_F64TOI32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, F64, i32, f64, float64);
+                    break;
+
+                case JIT_OP_F64TOI64:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I64, F64, i64, f64, float64);
+                    break;
+
+                case JIT_OP_F64TOF32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(F32, F64, f32, f64, float64);
+                    break;
+
+                case JIT_OP_F64TOU32:
+                    LOAD_2ARGS();
+                    CONVERT_R_R(I32, F64, u32, f64, float64);
+                    break;
+
+                case JIT_OP_NEG:
+                    LOAD_2ARGS();
+                    if (!lower_neg(cc, a, r0, r1))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_ADD:
+                case JIT_OP_SUB:
+                case JIT_OP_MUL:
+                case JIT_OP_DIV_S:
+                case JIT_OP_REM_S:
+                case JIT_OP_DIV_U:
+                case JIT_OP_REM_U:
+                    LOAD_3ARGS();
+                    if (!lower_alu(cc, a,
+                                   (ALU_OP)(ADD + (insn->opcode - JIT_OP_ADD)),
+                                   r0, r1, r2))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_SHL:
+                case JIT_OP_SHRS:
+                case JIT_OP_SHRU:
+                case JIT_OP_ROTL:
+                case JIT_OP_ROTR:
+                    LOAD_3ARGS();
+                    if (!lower_shift(
+                            cc, a,
+                            (SHIFT_OP)(SHL + (insn->opcode - JIT_OP_SHL)), r0,
+                            r1, r2))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_OR:
+                case JIT_OP_XOR:
+                case JIT_OP_AND:
+                    LOAD_3ARGS();
+                    if (!lower_bit(cc, a,
+                                   (BIT_OP)(OR + (insn->opcode - JIT_OP_OR)),
+                                   r0, r1, r2))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_CLZ:
+                case JIT_OP_CTZ:
+                case JIT_OP_POPCNT:
+                    LOAD_2ARGS();
+                    if (!lower_bitcount(
+                            cc, a,
+                            (BITCOUNT_OP)(CLZ + (insn->opcode - JIT_OP_CLZ)),
+                            r0, r1))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_CMP:
+                    LOAD_3ARGS();
+                    if (!lower_cmp(cc, a, r0, r1, r2))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_SELECTEQ:
+                case JIT_OP_SELECTNE:
+                case JIT_OP_SELECTGTS:
+                case JIT_OP_SELECTGES:
+                case JIT_OP_SELECTLTS:
+                case JIT_OP_SELECTLES:
+                case JIT_OP_SELECTGTU:
+                case JIT_OP_SELECTGEU:
+                case JIT_OP_SELECTLTU:
+                case JIT_OP_SELECTLEU:
+                    LOAD_4ARGS();
+                    if (!lower_select(
+                            cc, a,
+                            (COND_OP)(EQ + (insn->opcode - JIT_OP_SELECTEQ)),
+                            r0, r1, r2, r3))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_LDEXECENV:
+                    LOAD_1ARG();
+                    CHECK_KIND(r0, JIT_REG_KIND_I32);
+                    /* TODO */
+                    break;
+
+                case JIT_OP_LDJITINFO:
+                    LOAD_1ARG();
+                    CHECK_KIND(r0, JIT_REG_KIND_I32);
+                    /* TODO */
+                    break;
+
+                case JIT_OP_LDI8:
+                    LOAD_3ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        LD_R_R_R(I32, 1, true);
+                    else
+                        LD_R_R_R(I64, 1, true);
+                    break;
+
+                case JIT_OP_LDU8:
+                    LOAD_3ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        LD_R_R_R(I32, 1, false);
+                    else
+                        LD_R_R_R(I64, 1, false);
+                    break;
+
+                case JIT_OP_LDI16:
+                    LOAD_3ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        LD_R_R_R(I32, 2, true);
+                    else
+                        LD_R_R_R(I64, 2, true);
+                    break;
+
+                case JIT_OP_LDU16:
+                    LOAD_3ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        LD_R_R_R(I32, 2, false);
+                    else
+                        LD_R_R_R(I64, 2, false);
+                    break;
+
+                case JIT_OP_LDI32:
+                    LOAD_3ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        LD_R_R_R(I32, 4, true);
+                    else
+                        LD_R_R_R(I64, 4, true);
+                    break;
+
+                case JIT_OP_LDU32:
+                    LOAD_3ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        LD_R_R_R(I32, 4, false);
+                    else
+                        LD_R_R_R(I64, 4, false);
+                    break;
+
+                case JIT_OP_LDI64:
+                case JIT_OP_LDU64:
+                case JIT_OP_LDPTR:
+                    LOAD_3ARGS();
+                    LD_R_R_R(I64, 8, false);
+                    break;
+
+                case JIT_OP_LDF32:
+                    LOAD_3ARGS();
+                    LD_R_R_R(F32, 4, false);
+                    break;
+
+                case JIT_OP_LDF64:
+                    LOAD_3ARGS();
+                    LD_R_R_R(F64, 8, false);
+                    break;
+
+                case JIT_OP_STI8:
+                    LOAD_3ARGS_NO_ASSIGN();
+                    atomic = insn->flags_u8 & 0x1;
+                    ST_R_R_R(I32, int32, 1, atomic);
+                    break;
+
+                case JIT_OP_STI16:
+                    LOAD_3ARGS_NO_ASSIGN();
+                    atomic = insn->flags_u8 & 0x1;
+                    ST_R_R_R(I32, int32, 2, atomic);
+                    break;
+
+                case JIT_OP_STI32:
+                    LOAD_3ARGS_NO_ASSIGN();
+                    atomic = insn->flags_u8 & 0x1;
+                    ST_R_R_R(I32, int32, 4, atomic);
+                    break;
+
+                case JIT_OP_STI64:
+                    LOAD_3ARGS_NO_ASSIGN();
+                    atomic = insn->flags_u8 & 0x1;
+                    ST_R_R_R(I64, int64, 8, atomic);
+                    break;
+
+                case JIT_OP_STPTR:
+                    LOAD_3ARGS_NO_ASSIGN();
+                    ST_R_R_R(I64, int64, 8, false);
+                    break;
+
+                case JIT_OP_STF32:
+                    LOAD_3ARGS_NO_ASSIGN();
+                    ST_R_R_R(F32, float32, 4, false);
+                    break;
+
+                case JIT_OP_STF64:
+                    LOAD_3ARGS_NO_ASSIGN();
+                    ST_R_R_R(F64, float64, 8, false);
+                    break;
+
+                case JIT_OP_JMP:
+                    LOAD_1ARG();
+                    CHECK_KIND(r0, JIT_REG_KIND_L32);
+                    if (!(is_last_insn
+                          && label_is_neighboring(cc, label_index,
+                                                  jit_reg_no(r0))))
+                        JMP_TO_LABEL(jit_reg_no(r0), label_index);
+                    break;
+
+                case JIT_OP_BEQ:
+                case JIT_OP_BNE:
+                case JIT_OP_BGTS:
+                case JIT_OP_BGES:
+                case JIT_OP_BLTS:
+                case JIT_OP_BLES:
+                case JIT_OP_BGTU:
+                case JIT_OP_BGEU:
+                case JIT_OP_BLTU:
+                case JIT_OP_BLEU:
+                    LOAD_3ARGS();
+                    if (!lower_branch(
+                            cc, a, jmp_info_list, label_index,
+                            (COND_OP)(EQ + (insn->opcode - JIT_OP_BEQ)), r0, r1,
+                            r2, is_last_insn))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_LOOKUPSWITCH:
+                {
+                    JitOpndLookupSwitch *opnd = jit_insn_opndls(insn);
+                    if (!lower_lookupswitch(cc, a, jmp_info_list, label_offsets,
+                                            label_index, opnd, is_last_insn))
+                        GOTO_FAIL;
+                    break;
+                }
+
+                case JIT_OP_CALLNATIVE:
+                    if (!lower_callnative(cc, a, insn))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_CALLBC:
+                    if (!lower_callbc(cc, a, jmp_info_list, label_index, insn))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_RETURNBC:
+                    if (!lower_returnbc(cc, a, insn))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_RETURN:
+                    if (!lower_return(cc, a, insn))
+                        GOTO_FAIL;
+                    break;
+
+                case JIT_OP_I32CASTF32:
+                    LOAD_2ARGS();
+                    CAST_R_R(F32, I32, f32, i32, int32);
+                    break;
+
+                case JIT_OP_I64CASTF64:
+                    LOAD_2ARGS();
+                    CAST_R_R(F64, I64, f64, i64, int64);
+                    break;
+
+                case JIT_OP_F32CASTI32:
+                    LOAD_2ARGS();
+                    CAST_R_R(I32, F32, i32, f32, float);
+                    break;
+
+                case JIT_OP_F64CASTI64:
+                    LOAD_2ARGS();
+                    CAST_R_R(I64, F64, i64, f64, double);
+                    break;
+
+#if WASM_ENABLE_SHARED_MEMORY != 0
+                case JIT_OP_AT_CMPXCHGU8:
+                    LOAD_4ARGS_NO_ASSIGN();
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        CMPXCHG_R_R_R_R_R(I32, int32, 1);
+                    else
+                        CMPXCHG_R_R_R_R_R(I64, int64, 1);
+                    break;
+
+                case JIT_OP_AT_CMPXCHGU16:
+                    LOAD_4ARGS_NO_ASSIGN();
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        CMPXCHG_R_R_R_R_R(I32, int32, 2);
+                    else
+                        CMPXCHG_R_R_R_R_R(I64, int64, 2);
+                    break;
+
+                case JIT_OP_AT_CMPXCHGI32:
+                    LOAD_4ARGS_NO_ASSIGN();
+                    CMPXCHG_R_R_R_R_R(I32, int32, 4);
+                    break;
+
+                case JIT_OP_AT_CMPXCHGU32:
+                    LOAD_4ARGS_NO_ASSIGN();
+                    CMPXCHG_R_R_R_R_R(I64, int32, 4);
+                    break;
+
+                case JIT_OP_AT_CMPXCHGI64:
+                    LOAD_4ARGS_NO_ASSIGN();
+                    CMPXCHG_R_R_R_R_R(I64, int64, 8);
+                    break;
+
+                case JIT_OP_AT_ADDU8:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(add, I32, int32, 1);
+                    else
+                        AT_RMW_R_R_R_R(add, I64, int64, 1);
+                    break;
+
+                case JIT_OP_AT_ADDU16:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(add, I32, int32, 2);
+                    else
+                        AT_RMW_R_R_R_R(add, I64, int64, 2);
+                    break;
+
+                case JIT_OP_AT_ADDI32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(add, I32, int32, 4);
+                    break;
+
+                case JIT_OP_AT_ADDU32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(add, I64, int64, 4);
+                    break;
+
+                case JIT_OP_AT_ADDI64:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(add, I64, int64, 8);
+                    break;
+
+                case JIT_OP_AT_SUBU8:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(sub, I32, int32, 1);
+                    else
+                        AT_RMW_R_R_R_R(sub, I64, int64, 1);
+                    break;
+
+                case JIT_OP_AT_SUBU16:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(sub, I32, int32, 2);
+                    else
+                        AT_RMW_R_R_R_R(sub, I64, int64, 2);
+                    break;
+
+                case JIT_OP_AT_SUBI32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(sub, I32, int32, 4);
+                    break;
+
+                case JIT_OP_AT_SUBU32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(sub, I64, int64, 4);
+                    break;
+
+                case JIT_OP_AT_SUBI64:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(sub, I64, int64, 8);
+                    break;
+
+                case JIT_OP_AT_XCHGU8:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(xchg, I32, int32, 1);
+                    else
+                        AT_RMW_R_R_R_R(xchg, I64, int64, 1);
+                    break;
+
+                case JIT_OP_AT_XCHGU16:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(xchg, I32, int32, 2);
+                    else
+                        AT_RMW_R_R_R_R(xchg, I64, int64, 2);
+                    break;
+
+                case JIT_OP_AT_XCHGI32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(xchg, I32, int32, 4);
+                    break;
+
+                case JIT_OP_AT_XCHGU32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(xchg, I64, int64, 4);
+                    break;
+
+                case JIT_OP_AT_XCHGI64:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(xchg, I64, int64, 8);
+                    break;
+
+                case JIT_OP_AT_ANDU8:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(and, I32, int32, 1);
+                    else
+                        AT_RMW_R_R_R_R(and, I64, int64, 1);
+                    break;
+
+                case JIT_OP_AT_ANDU16:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(and, I32, int32, 2);
+                    else
+                        AT_RMW_R_R_R_R(and, I64, int64, 2);
+                    break;
+
+                case JIT_OP_AT_ANDI32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(and, I32, int32, 4);
+                    break;
+
+                case JIT_OP_AT_ANDU32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(and, I64, int64, 4);
+                    break;
+
+                case JIT_OP_AT_ANDI64:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(and, I64, int64, 8);
+                    break;
+
+                case JIT_OP_AT_ORU8:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(or, I32, int32, 1);
+                    else
+                        AT_RMW_R_R_R_R(or, I64, int64, 1);
+                    break;
+
+                case JIT_OP_AT_ORU16:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(or, I32, int32, 2);
+                    else
+                        AT_RMW_R_R_R_R(or, I64, int64, 2);
+                    break;
+
+                case JIT_OP_AT_ORI32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(or, I32, int32, 4);
+                    break;
+
+                case JIT_OP_AT_ORU32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(or, I64, int64, 4);
+                    break;
+
+                case JIT_OP_AT_ORI64:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(or, I64, int64, 8);
+                    break;
+
+                case JIT_OP_AT_XORU8:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(xor, I32, int32, 1);
+                    else
+                        AT_RMW_R_R_R_R(xor, I64, int64, 1);
+                    break;
+
+                case JIT_OP_AT_XORU16:
+                    LOAD_4ARGS();
+                    bh_assert(jit_reg_kind(r0) == JIT_REG_KIND_I32
+                              || jit_reg_kind(r0) == JIT_REG_KIND_I64);
+                    if (jit_reg_kind(r0) == JIT_REG_KIND_I32)
+                        AT_RMW_R_R_R_R(xor, I32, int32, 2);
+                    else
+                        AT_RMW_R_R_R_R(xor, I64, int64, 2);
+                    break;
+
+                case JIT_OP_AT_XORI32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(xor, I32, int32, 4);
+                    break;
+
+                case JIT_OP_AT_XORU32:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(xor, I64, int64, 4);
+                    break;
+
+                case JIT_OP_AT_XORI64:
+                    LOAD_4ARGS();
+                    AT_RMW_R_R_R_R(xor, I64, int64, 8);
+                    break;
+
+                case JIT_OP_FENCE:
+                    FENCE();
+                    break;
+
+#endif
+
+                default:
+                    jit_set_last_error_v(cc, "unsupported JIT opcode 0x%2x",
+                                         insn->opcode);
+                    GOTO_FAIL;
+            }
+
+            if (err_handler.err) {
+                jit_set_last_error_v(cc,
+                                     "failed to generate native code for JIT "
+                                     "opcode 0x%02x, ErrorCode is %u",
+                                     insn->opcode, err_handler.err);
+                GOTO_FAIL;
+            }
+
+#if CODEGEN_DUMP != 0
+            dump_native((char *)code.sectionById(0)->buffer().data()
+                            + code_offset,
+                        code.sectionById(0)->buffer().size() - code_offset);
+            code_offset = code.sectionById(0)->buffer().size();
+#endif
+        }
+    }
+
+    code_buf = (char *)code.sectionById(0)->buffer().data();
+    code_size = code.sectionById(0)->buffer().size();
+    if (!(stream = (char *)jit_code_cache_alloc(code_size))) {
+        jit_set_last_error(cc, "allocate memory failed");
+        goto fail;
+    }
+
+    bh_memcpy_s(stream, code_size, code_buf, code_size);
+    cc->jitted_addr_begin = stream;
+    cc->jitted_addr_end = stream + code_size;
+
+    for (i = 0; i < label_num; i++) {
+        if (i == 0)
+            label_index = 0;
+        else if (i == label_num - 1)
+            label_index = 1;
+        else
+            label_index = i + 1;
+
+        jitted_addr = jit_annl_jitted_addr(
+            cc, jit_reg_new(JIT_REG_KIND_L32, label_index));
+        *jitted_addr = stream + label_offsets[label_index];
+    }
+
+    patch_jmp_info_list(cc, jmp_info_list);
+    return_value = true;
+
+fail:
+
+    jit_free(label_offsets);
+    free_jmp_info_list(jmp_info_list);
+    return return_value;
+}
+
+#if WASM_ENABLE_LAZY_JIT != 0 && WASM_ENABLE_JIT != 0
+
+/* Number of integer registers used to pass arguments; the
+   reg_idx_of_int_args tables below list exactly this many registers
+   (rdi, rsi, rdx, rcx, r8, r9) */
+#define MAX_REG_INTS 6
+/* Number of float registers (entries of regs_float) used to pass
+   f32/f64 arguments before falling back to the stack */
+#define MAX_REG_FLOATS 8
+
+/**
+ * Generate a native code stub used by fast-jit code to call an LLVM JIT
+ * compiled function whose signature is @func_type.
+ *
+ * The stub emitted here:
+ *   - loads the callee from exec_env->module_inst->func_ptrs, indexed
+ *     by the value that rdx holds when the stub is entered (the function
+ *     index),
+ *   - reserves stack space for the arguments which don't fit into
+ *     registers, keeping the reserved size a multiple of 16 bytes,
+ *   - passes exec_env as the first integer argument, then copies the
+ *     wasm arguments from the outs area (exec_env->wasm_stack.s.top plus
+ *     the offset of WASMInterpFrame::lp) into the integer argument
+ *     registers (rdi, rsi, rdx, rcx, r8, r9), into regs_float[], or
+ *     into the reserved stack space,
+ *   - passes one pointer per extra result (every result but the first
+ *     one), pointing into the operand stack of the current frame,
+ *   - calls the function and releases the reserved stack space,
+ *   - if module_inst->cur_exception[0] isn't 0, jumps to
+ *     code_block_return_to_interp_from_jitted with eax set to
+ *     JIT_INTERP_ACTION_THROWN,
+ *   - otherwise copies an integer first result from rax to rdx, advances
+ *     cur_frame->sp over the results and jumps to the jitted return
+ *     address stored in the frame with eax set to 0.
+ *
+ * @param func_type the type of the function to call
+ *
+ * @return the stub allocated from the jit code cache, NULL if failed to
+ *         generate the native code or to allocate the memory
+ */
+void *
+jit_codegen_compile_call_to_llvm_jit(const WASMType *func_type)
+{
+    const JitHardRegInfo *hreg_info = jit_codegen_get_hreg_info();
+    x86::Gp reg_lp = x86::r10, reg_res = x86::r12;
+    x86::Gp reg_tmp_i64 = x86::r11, reg_tmp_i32 = x86::r11d;
+    /* the index of integer argument registers */
+    uint8 reg_idx_of_int_args[] = { REG_RDI_IDX, REG_RSI_IDX, REG_RDX_IDX,
+                                    REG_RCX_IDX, REG_R8_IDX,  REG_R9_IDX };
+    uint32 n_ints = 0, n_fps = 0, n_stacks = 0, n_pushed;
+    uint32 int_reg_idx = 0, fp_reg_idx = 0, stack_arg_idx = 0;
+    uint32 off_to_lp = 0, off_to_res = 0, code_size, i;
+    uint32 param_count = func_type->param_count;
+    uint32 result_count = func_type->result_count;
+    uint32 ext_result_count;
+    char *code_buf, *stream;
+    Imm imm;
+
+    JitErrorHandler err_handler;
+    Environment env(Arch::kX64);
+    CodeHolder code;
+    code.init(env);
+    code.setErrorHandler(&err_handler);
+    x86::Assembler a(&code);
+
+    /* Load the llvm jit function pointer */
+    {
+        /* r11 = exec_env->module_inst */
+        x86::Mem m1(regs_i64[hreg_info->exec_env_hreg_index],
+                    (uint32)offsetof(WASMExecEnv, module_inst));
+        a.mov(reg_tmp_i64, m1);
+        /* r11 = module_inst->func_ptrs */
+        x86::Mem m2(reg_tmp_i64,
+                    (uint32)offsetof(WASMModuleInstance, func_ptrs));
+        a.mov(reg_tmp_i64, m2);
+        /* rax = func_ptrs[func_idx], the index is taken from rdx and
+           scaled by 8 (shift 3) */
+        x86::Mem m3(reg_tmp_i64, x86::rdx, 3, 0);
+        a.mov(x86::rax, m3);
+    }
+
+    /* First pass: count how many arguments go to int registers,
+       float registers and the stack */
+
+    n_ints++; /* exec_env */
+
+    for (i = 0; i < param_count; i++) {
+        switch (func_type->types[i]) {
+            case VALUE_TYPE_I32:
+            case VALUE_TYPE_I64:
+#if WASM_ENABLE_REF_TYPES != 0
+            case VALUE_TYPE_FUNCREF:
+            case VALUE_TYPE_EXTERNREF:
+#endif
+                if (n_ints < MAX_REG_INTS)
+                    n_ints++;
+                else
+                    n_stacks++;
+                break;
+            case VALUE_TYPE_F32:
+            case VALUE_TYPE_F64:
+                if (n_fps < MAX_REG_FLOATS)
+                    n_fps++;
+                else
+                    n_stacks++;
+                break;
+        }
+    }
+
+    /* The first result is returned by register, each of the other
+       results is returned through a pointer passed as an extra
+       integer argument */
+    ext_result_count = result_count > 1 ? result_count - 1 : 0;
+
+    if (ext_result_count > 0) {
+        if (n_ints + ext_result_count <= MAX_REG_INTS) {
+            /* extra result pointers can be stored into int registers */
+            n_ints += ext_result_count;
+        }
+        else {
+            /* part or all extra result pointers must be stored into stack */
+            n_stacks += n_ints + ext_result_count - MAX_REG_INTS;
+            n_ints = MAX_REG_INTS;
+        }
+    }
+
+    n_pushed = n_stacks;
+    if (n_stacks & 1) {
+        /* Align stack on 16 bytes */
+        n_pushed++;
+    }
+    if (n_pushed > 0) {
+        imm.setValue(n_pushed * 8);
+        a.sub(x86::rsp, imm);
+    }
+
+    /* r10 = outs_area->lp */
+    {
+        x86::Mem m(regs_i64[hreg_info->exec_env_hreg_index],
+                   (uint32)offsetof(WASMExecEnv, wasm_stack.s.top));
+        a.mov(reg_lp, m);
+        a.add(reg_lp, (uint32)offsetof(WASMInterpFrame, lp));
+    }
+
+    /* Second pass: emit the code to load the arguments */
+
+    /* rdi = exec_env */
+    a.mov(regs_i64[reg_idx_of_int_args[int_reg_idx++]],
+          regs_i64[hreg_info->exec_env_hreg_index]);
+
+    for (i = 0; i < param_count; i++) {
+        x86::Mem m_src(reg_lp, off_to_lp);
+
+        switch (func_type->types[i]) {
+            case VALUE_TYPE_I32:
+#if WASM_ENABLE_REF_TYPES != 0
+            case VALUE_TYPE_FUNCREF:
+            case VALUE_TYPE_EXTERNREF:
+#endif
+            {
+                if (int_reg_idx < MAX_REG_INTS) {
+                    a.mov(regs_i32[reg_idx_of_int_args[int_reg_idx]], m_src);
+                    int_reg_idx++;
+                }
+                else {
+                    /* each stack argument takes an 8-byte slot */
+                    a.mov(reg_tmp_i32, m_src);
+                    x86::Mem m_dst(x86::rsp, stack_arg_idx * 8);
+                    a.mov(m_dst, reg_tmp_i32);
+                    stack_arg_idx++;
+                }
+                off_to_lp += 4;
+                break;
+            }
+            case VALUE_TYPE_I64:
+            {
+                if (int_reg_idx < MAX_REG_INTS) {
+                    a.mov(regs_i64[reg_idx_of_int_args[int_reg_idx]], m_src);
+                    int_reg_idx++;
+                }
+                else {
+                    a.mov(reg_tmp_i64, m_src);
+                    x86::Mem m_dst(x86::rsp, stack_arg_idx * 8);
+                    a.mov(m_dst, reg_tmp_i64);
+                    stack_arg_idx++;
+                }
+                off_to_lp += 8;
+                break;
+            }
+            case VALUE_TYPE_F32:
+            {
+                if (fp_reg_idx < MAX_REG_FLOATS) {
+                    a.movss(regs_float[fp_reg_idx], m_src);
+                    fp_reg_idx++;
+                }
+                else {
+                    /* copy the raw 32 bits through the int temp register */
+                    a.mov(reg_tmp_i32, m_src);
+                    x86::Mem m_dst(x86::rsp, stack_arg_idx * 8);
+                    a.mov(m_dst, reg_tmp_i32);
+                    stack_arg_idx++;
+                }
+                off_to_lp += 4;
+                break;
+            }
+            case VALUE_TYPE_F64:
+            {
+                if (fp_reg_idx < MAX_REG_FLOATS) {
+                    a.movsd(regs_float[fp_reg_idx], m_src);
+                    fp_reg_idx++;
+                }
+                else {
+                    /* copy the raw 64 bits through the int temp register */
+                    a.mov(reg_tmp_i64, m_src);
+                    x86::Mem m_dst(x86::rsp, stack_arg_idx * 8);
+                    a.mov(m_dst, reg_tmp_i64);
+                    stack_arg_idx++;
+                }
+                off_to_lp += 8;
+                break;
+            }
+        }
+    }
+
+    if (result_count > 0) {
+        /* Size of the first result, the extra results are stored
+           after it */
+        switch (func_type->types[param_count]) {
+            case VALUE_TYPE_I32:
+#if WASM_ENABLE_REF_TYPES != 0
+            case VALUE_TYPE_FUNCREF:
+            case VALUE_TYPE_EXTERNREF:
+#endif
+            case VALUE_TYPE_F32:
+                off_to_res = 4;
+                break;
+            case VALUE_TYPE_I64:
+            case VALUE_TYPE_F64:
+                off_to_res = 8;
+                break;
+        }
+
+        /* r12 = cur_frame->sp */
+        x86::Mem m_frame_sp(x86::rbp, (uint32)offsetof(WASMInterpFrame, sp));
+        a.mov(reg_res, m_frame_sp);
+
+        for (i = 0; i < ext_result_count; i++) {
+            /* address at which the callee stores the extra result i */
+            x86::Mem m_res(reg_res, off_to_res);
+
+            if (int_reg_idx < MAX_REG_INTS) {
+                a.lea(regs_i64[reg_idx_of_int_args[int_reg_idx]], m_res);
+                int_reg_idx++;
+            }
+            else {
+                a.lea(reg_tmp_i64, m_res);
+                x86::Mem m_dst(x86::rsp, stack_arg_idx * 8);
+                a.mov(m_dst, reg_tmp_i64);
+                stack_arg_idx++;
+            }
+
+            switch (func_type->types[param_count + 1 + i]) {
+                case VALUE_TYPE_I32:
+#if WASM_ENABLE_REF_TYPES != 0
+                case VALUE_TYPE_FUNCREF:
+                case VALUE_TYPE_EXTERNREF:
+#endif
+                case VALUE_TYPE_F32:
+                    off_to_res += 4;
+                    break;
+                case VALUE_TYPE_I64:
+                case VALUE_TYPE_F64:
+                    off_to_res += 8;
+                    break;
+            }
+        }
+    }
+
+    /* The second pass must agree with the counts of the first pass */
+    bh_assert(int_reg_idx == n_ints);
+    bh_assert(fp_reg_idx == n_fps);
+    bh_assert(stack_arg_idx == n_stacks);
+
+    /* Call the llvm jit function */
+    a.call(x86::rax);
+
+    /* Release the stack argument area right after the call, so that
+       both the exception path and the normal path below leave this
+       stub with rsp restored to the value it had on entry: the
+       exception path jumps away and would otherwise never execute the
+       `add rsp`. The add only modifies rsp and the flags, and the
+       flags are set again by the `cmp` below. */
+    if (n_pushed > 0) {
+        imm.setValue(n_pushed * 8);
+        a.add(x86::rsp, imm);
+    }
+
+    /* Check if there was exception thrown */
+    {
+        /* r11 = exec_env->module_inst */
+        x86::Mem m1(regs_i64[hreg_info->exec_env_hreg_index],
+                    (uint32)offsetof(WASMExecEnv, module_inst));
+        a.mov(reg_tmp_i64, m1);
+        /* module_inst->cur_exception */
+        x86::Mem m2(reg_tmp_i64,
+                    (uint32)offsetof(WASMModuleInstance, cur_exception));
+        /* bl = module_inst->cur_exception[0] */
+        a.mov(x86::bl, m2);
+
+        /* cur_exception[0] == 0 ? */
+        Imm imm_zero((uint8)0);
+        a.cmp(x86::bl, imm_zero);
+        /* If yes, jump to `Get function result and return`,
+           the placeholder offset is patched below */
+        imm.setValue(INT32_MAX);
+        a.je(imm);
+
+        /* Offset of the end of the je instruction. Keep offsets but not
+           pointers into the code buffer, since the buffer may be
+           reallocated when more code is emitted. */
+        uint32 je_end_offset = (uint32)code.sectionById(0)->buffer().size();
+
+        /* If no, set eax to JIT_INTERP_ACTION_THROWN, and
+           jump to code_block_return_to_interp_from_jitted to
+           return to interpreter */
+        imm.setValue(JIT_INTERP_ACTION_THROWN);
+        a.mov(x86::eax, imm);
+        imm.setValue(code_block_return_to_interp_from_jitted);
+        a.mov(x86::rsi, imm);
+        a.jmp(x86::rsi);
+
+        uint32 je_target_offset =
+            (uint32)code.sectionById(0)->buffer().size();
+
+        /* Don't patch a buffer which may be incomplete or not allocated
+           at all if the assembler has reported an error */
+        if (err_handler.err)
+            return NULL;
+
+        /* Patch the 32-bit relative offset of the je instruction, which
+           is the last 4 bytes of the instruction */
+        char *patch_base = (char *)code.sectionById(0)->buffer().data();
+        *(int32 *)(patch_base + je_end_offset - 4) =
+            (int32)(je_target_offset - je_end_offset);
+    }
+
+    /* Get function result and return */
+
+    /* An integer first result is moved from rax to rdx, a float first
+       result isn't touched here */
+    if (result_count > 0 && func_type->types[param_count] != VALUE_TYPE_F32
+        && func_type->types[param_count] != VALUE_TYPE_F64) {
+        a.mov(x86::rdx, x86::rax);
+    }
+
+    if (off_to_res > 0) {
+        /* r12 += total size of the results */
+        imm.setValue(off_to_res);
+        a.add(reg_res, imm);
+        /* cur_frame->sp = r12 */
+        x86::Mem m_frame_sp(x86::rbp, (uint32)offsetof(WASMInterpFrame, sp));
+        a.mov(m_frame_sp, reg_res);
+    }
+
+    /* Return to the caller */
+    {
+        /* eax = action = JIT_INTERP_ACTION_NORMAL */
+        Imm imm_action(0);
+        a.mov(x86::eax, imm_action);
+
+        uint32 jitted_return_addr_offset =
+            jit_frontend_get_jitted_return_addr_offset();
+        x86::Mem m_ret_addr(x86::rbp, jitted_return_addr_offset);
+        a.jmp(m_ret_addr);
+    }
+
+    if (err_handler.err)
+        return NULL;
+
+    /* Copy the generated code into the jit code cache */
+    code_buf = (char *)code.sectionById(0)->buffer().data();
+    code_size = code.sectionById(0)->buffer().size();
+    stream = (char *)jit_code_cache_alloc(code_size);
+    if (!stream)
+        return NULL;
+
+    bh_memcpy_s(stream, code_size, code_buf, code_size);
+
+#if 0
+    dump_native(stream, code_size);
+#endif
+
+    return stream;
+}
+
+/**
+ * Allocate the implied frame which receives the results of the fast jit
+ * function to call, and make it the current frame of @exec_env.
+ *
+ * Only the implied frame is taken from the wasm stack here, but the
+ * space for the frame of the jit function itself is checked too.
+ *
+ * @param exec_env the execution environment
+ * @param param_cell_num the cell num used to size the jit function's frame
+ * @param ret_cell_num the cell num used to size the implied frame
+ *
+ * @return the frame allocated, NULL if the wasm stack overflows, in which
+ *         case the exception is set to the module instance
+ */
+static WASMInterpFrame *
+fast_jit_alloc_frame(WASMExecEnv *exec_env, uint32 param_cell_num,
+                     uint32 ret_cell_num)
+{
+    /* Size of the implied frame which stores the function results */
+    uint32 ret_frame_size = wasm_interp_interp_frame_size(ret_cell_num);
+    /* Size of the frame for the jit function */
+    uint32 callee_frame_size = wasm_interp_interp_frame_size(param_cell_num);
+    uint8 *stack_top = (uint8 *)exec_env->wasm_stack.s.top;
+    WASMInterpFrame *new_frame;
+
+    /* Both frames must fit below the stack boundary */
+    if (stack_top + ret_frame_size + callee_frame_size
+        > exec_env->wasm_stack.s.top_boundary) {
+        wasm_set_exception((WASMModuleInstance *)exec_env->module_inst,
+                           "wasm operand stack overflow");
+        return NULL;
+    }
+
+    /* Take the implied frame from the current stack top */
+    new_frame = (WASMInterpFrame *)stack_top;
+    exec_env->wasm_stack.s.top += ret_frame_size;
+
+    /* Link the frame to the previous one and let the jitted code
+       return to the interpreter through it */
+    new_frame->prev_frame = wasm_exec_env_get_cur_frame(exec_env);
+    new_frame->jitted_return_addr =
+        (uint8 *)code_block_return_to_interp_from_jitted;
+    new_frame->function = NULL;
+    new_frame->ip = NULL;
+    /* The operand stack is empty: sp starts at lp */
+    new_frame->sp = new_frame->lp;
+
+    wasm_exec_env_set_cur_frame(exec_env, new_frame);
+
+    return new_frame;
+}
+
+void *
+jit_codegen_compile_call_to_fast_jit(const WASMModule *module, uint32 func_idx)
+{
+    uint32 func_idx_non_import = func_idx - module->import_function_count;
+    WASMType *func_type = module->functions[func_idx_non_import]->func_type;
+    /* the index of integer argument registers */
+    uint8 reg_idx_of_int_args[] = { REG_RDI_IDX, REG_RSI_IDX, REG_RDX_IDX,
+                                    REG_RCX_IDX, REG_R8_IDX,  REG_R9_IDX };
+    uint32 int_reg_idx, fp_reg_idx, stack_arg_idx;
+    uint32 switch_info_offset, exec_env_offset, stack_arg_offset;
+    uint32 int_reg_offset, frame_lp_offset;
+    uint32 switch_info_size, code_size, i;
+    uint32 param_count = func_type->param_count;
+    uint32 result_count = func_type->result_count;
+    uint32 ext_result_count = result_count > 1 ? result_count - 1 : 0;
+    uint32 param_cell_num = func_type->param_cell_num;
+    uint32 ret_cell_num =
+        func_type->ret_cell_num > 2 ? func_type->ret_cell_num : 2;
+    char *code_buf, *stream;
+    Imm imm;
+
+    JitErrorHandler err_handler;
+    Environment env(Arch::kX64);
+    CodeHolder code;
+    code.init(env);
+    code.setErrorHandler(&err_handler);
+    x86::Assembler a(&code);
+
+    /**
+     * Push JitInterpSwitchInfo and make stack 16-byte aligned:
+     *   the size pushed must be odd multiples of 8, as the stack pointer
+     *   %rsp must be aligned to a 16-byte boundary before making a call,
+     *   and when a function (including this llvm jit function) gets
+     *   control, the %rsp is not 16-byte aligned (call instruction will
+     *   push the ret address to stack).
+     */
+    switch_info_size = align_uint((uint32)sizeof(JitInterpSwitchInfo), 16) + 8;
+    imm.setValue((uint64)switch_info_size);
+    a.sub(x86::rsp, imm);
+
+    /* Push all integer argument registers since we will use them as
+       temporary registers to load/store data */
+    for (i = 0; i < MAX_REG_INTS; i++) {
+        a.push(regs_i64[reg_idx_of_int_args[MAX_REG_INTS - 1 - i]]);
+    }
+
+    /* We don't push float/double register since we don't use them here */
+
+    /**
+     * Layout of the stack now:
+     *   stack arguments
+     *   ret address of the caller
+     *   switch info
+     *   int registers: r9, r8, rcx, rdx, rsi
+     *   exec_env: rdi
+     */
+
+    /* offset of the first stack argument to the stack pointer,
+       add 8 to skip the ret address of the caller */
+    stack_arg_offset = switch_info_size + 8 * MAX_REG_INTS + 8;
+    /* offset of jit interp switch info to the stack pointer */
+    switch_info_offset = 8 * MAX_REG_INTS;
+    /* offset of the first int register to the stack pointer */
+    int_reg_offset = 8;
+    /* offset of exec_env to the stack pointer */
+    exec_env_offset = 0;
+
+    /* Call fast_jit_alloc_frame to allocate the stack frame to
+       receive the results of the fast jit function to call */
+
+    /* rdi = exec_env, has been already set as exec_env is
+       the first argument of LLVM JIT function */
+    /* rsi = param_cell_num */
+    imm.setValue(param_cell_num);
+    a.mov(x86::rsi, imm);
+    /* rdx = ret_cell_num */
+    imm.setValue(ret_cell_num);
+    a.mov(x86::rdx, imm);
+    /* call fast_jit_alloc_frame */
+    imm.setValue((uint64)(uintptr_t)fast_jit_alloc_frame);
+    a.mov(x86::rax, imm);
+    a.call(x86::rax);
+
+    /* Check the return value, note now rax is the allocated frame */
+    {
+        /* Did fast_jit_alloc_frame return NULL? */
+        Imm imm((uint64)0);
+        a.cmp(x86::rax, imm);
+        /* If no, jump to `Copy arguments to frame lp area` */
+        imm.setValue(INT32_MAX);
+        a.jne(imm);
+
+        char *stream = (char *)a.code()->sectionById(0)->buffer().data()
+                       + a.code()->sectionById(0)->buffer().size();
+
+        /* If yes, set eax to 0, return to caller */
+
+        /* Pop all integer argument registers */
+        for (i = 0; i < MAX_REG_INTS; i++) {
+            a.pop(regs_i64[reg_idx_of_int_args[i]]);
+        }
+        /* Pop jit interp switch info */
+        imm.setValue((uint64)switch_info_size);
+        a.add(x86::rsp, imm);
+
+        /* Return to the caller, don't use leave as we didn't
+           `push rbp` and `mov rbp, rsp` */
+        a.ret();
+
+        /* Patch the offset of jne instruction */
+        char *stream_new = (char *)a.code()->sectionById(0)->buffer().data()
+                           + a.code()->sectionById(0)->buffer().size();
+        *(int32 *)(stream - 4) = (int32)(stream_new - stream);
+    }
+
+    int_reg_idx = 1; /* skip exec_env */
+    fp_reg_idx = 0;
+    stack_arg_idx = 0;
+
+    /* Offset of the dest arguments to outs area */
+    frame_lp_offset = wasm_interp_interp_frame_size(ret_cell_num)
+                      + (uint32)offsetof(WASMInterpFrame, lp);
+
+    /* Copy arguments to frame lp area */
+    for (i = 0; i < func_type->param_count; i++) {
+        x86::Mem m_dst(x86::rax, frame_lp_offset);
+        switch (func_type->types[i]) {
+            case VALUE_TYPE_I32:
+#if WASM_ENABLE_REF_TYPES != 0
+            case VALUE_TYPE_FUNCREF:
+            case VALUE_TYPE_EXTERNREF:
+#endif
+                if (int_reg_idx < MAX_REG_INTS) {
+                    /* Copy i32 argument from int register */
+                    x86::Mem m_src(x86::rsp, int_reg_offset);
+                    a.mov(x86::esi, m_src);
+                    a.mov(m_dst, x86::esi);
+                    int_reg_offset += 8;
+                    int_reg_idx++;
+                }
+                else {
+                    /* Copy i32 argument from stack */
+                    x86::Mem m_src(x86::rsp, stack_arg_offset);
+                    a.mov(x86::esi, m_src);
+                    a.mov(m_dst, x86::esi);
+                    stack_arg_offset += 8;
+                    stack_arg_idx++;
+                }
+                frame_lp_offset += 4;
+                break;
+            case VALUE_TYPE_I64:
+                if (int_reg_idx < MAX_REG_INTS) {
+                    /* Copy i64 argument from int register */
+                    x86::Mem m_src(x86::rsp, int_reg_offset);
+                    a.mov(x86::rsi, m_src);
+                    a.mov(m_dst, x86::rsi);
+                    int_reg_offset += 8;
+                    int_reg_idx++;
+                }
+                else {
+                    /* Copy i64 argument from stack */
+                    x86::Mem m_src(x86::rsp, stack_arg_offset);
+                    a.mov(x86::rsi, m_src);
+                    a.mov(m_dst, x86::rsi);
+                    stack_arg_offset += 8;
+                    stack_arg_idx++;
+                }
+                frame_lp_offset += 8;
+                break;
+            case VALUE_TYPE_F32:
+                if (fp_reg_idx < MAX_REG_FLOATS) {
+                    /* Copy f32 argument from fp register */
+                    a.movss(m_dst, regs_float[fp_reg_idx++]);
+                }
+                else {
+                    /* Copy f32 argument from stack */
+                    x86::Mem m_src(x86::rsp, stack_arg_offset);
+                    a.mov(x86::esi, m_src);
+                    a.mov(m_dst, x86::esi);
+                    stack_arg_offset += 8;
+                    stack_arg_idx++;
+                }
+                frame_lp_offset += 4;
+                break;
+            case VALUE_TYPE_F64:
+                if (fp_reg_idx < MAX_REG_FLOATS) {
+                    /* Copy f64 argument from fp register */
+                    a.movsd(m_dst, regs_float[fp_reg_idx++]);
+                }
+                else {
+                    /* Copy f64 argument from stack */
+                    x86::Mem m_src(x86::rsp, stack_arg_offset);
+                    a.mov(x86::rsi, m_src);
+                    a.mov(m_dst, x86::rsi);
+                    stack_arg_offset += 8;
+                    stack_arg_idx++;
+                }
+                frame_lp_offset += 8;
+                break;
+            default:
+                bh_assert(0);
+        }
+    }
+
+    /* Call the fast jit function */
+    {
+        /* info = rsp + switch_info_offset */
+        a.lea(x86::rsi, x86::ptr(x86::rsp, switch_info_offset));
+        /* info.frame = frame = rax, or return of fast_jit_alloc_frame */
+        x86::Mem m1(x86::rsi, (uint32)offsetof(JitInterpSwitchInfo, frame));
+        a.mov(m1, x86::rax);
+
+        /* Call code_block_switch_to_jitted_from_interp
+           with argument (exec_env, info, func_idx, pc) */
+        /* rdi = exec_env */
+        a.mov(x86::rdi, x86::ptr(x86::rsp, exec_env_offset));
+        /* rsi = info, has been set */
+        /* rdx = func_idx */
+        imm.setValue(func_idx);
+        a.mov(x86::rdx, imm);
+        /* module_inst = exec_env->module_inst */
+        a.mov(x86::rcx,
+              x86::ptr(x86::rdi, (uint32)offsetof(WASMExecEnv, module_inst)));
+        /* fast_jit_func_ptrs = module_inst->fast_jit_func_ptrs */
+        a.mov(x86::rcx,
+              x86::ptr(x86::rcx, (uint32)offsetof(WASMModuleInstance,
+                                                  fast_jit_func_ptrs)));
+        imm.setValue(func_idx_non_import);
+        a.mov(x86::rax, imm);
+        x86::Mem m3(x86::rcx, x86::rax, 3, 0);
+        /* rcx = module_inst->fast_jit_func_ptrs[func_idx_non_import] */
+        a.mov(x86::rcx, m3);
+
+        imm.setValue(
+            (uint64)(uintptr_t)code_block_switch_to_jitted_from_interp);
+        a.mov(x86::rax, imm);
+        a.call(x86::rax);
+    }
+
+    /* No need to check exception thrown here as it will be checked
+       in the caller */
+
+    /* Copy function results */
+    if (result_count > 0) {
+        frame_lp_offset = offsetof(WASMInterpFrame, lp);
+
+        switch (func_type->types[param_count]) {
+            case VALUE_TYPE_I32:
+#if WASM_ENABLE_REF_TYPES != 0
+            case VALUE_TYPE_FUNCREF:
+            case VALUE_TYPE_EXTERNREF:
+#endif
+                a.mov(x86::eax, x86::edx);
+                frame_lp_offset += 4;
+                break;
+            case VALUE_TYPE_I64:
+                a.mov(x86::rax, x86::rdx);
+                frame_lp_offset += 8;
+                break;
+            case VALUE_TYPE_F32:
+                /* The first result has been put to xmm0 */
+                frame_lp_offset += 4;
+                break;
+            case VALUE_TYPE_F64:
+                /* The first result has been put to xmm0 */
+                frame_lp_offset += 8;
+                break;
+            default:
+                bh_assert(0);
+        }
+
+        /* Copy extra results from exec_env->cur_frame */
+        if (ext_result_count > 0) {
+            /* rdi = exec_env */
+            a.mov(x86::rdi, x86::ptr(x86::rsp, exec_env_offset));
+            /* rsi = exec_env->cur_frame */
+            a.mov(x86::rsi,
+                  x86::ptr(x86::rdi, (uint32)offsetof(WASMExecEnv, cur_frame)));
+
+            for (i = 0; i < ext_result_count; i++) {
+                switch (func_type->types[param_count + 1 + i]) {
+                    case VALUE_TYPE_I32:
+#if WASM_ENABLE_REF_TYPES != 0
+                    case VALUE_TYPE_FUNCREF:
+                    case VALUE_TYPE_EXTERNREF:
+#endif
+                    case VALUE_TYPE_F32:
+                    {
+                        /* Copy 32-bit result */
+                        a.mov(x86::ecx, x86::ptr(x86::rsi, frame_lp_offset));
+                        if (int_reg_idx < MAX_REG_INTS) {
+                            x86::Mem m1(x86::rsp,
+                                        exec_env_offset + int_reg_idx * 8);
+                            a.mov(x86::rdx, m1);
+                            x86::Mem m2(x86::rdx, 0);
+                            a.mov(m2, x86::ecx);
+                            int_reg_idx++;
+                        }
+                        else {
+                            x86::Mem m1(x86::rsp, stack_arg_offset);
+                            a.mov(x86::rdx, m1);
+                            x86::Mem m2(x86::rdx, 0);
+                            a.mov(m2, x86::ecx);
+                            stack_arg_offset += 8;
+                            stack_arg_idx++;
+                        }
+                        frame_lp_offset += 4;
+                        break;
+                    }
+                    case VALUE_TYPE_I64:
+                    case VALUE_TYPE_F64:
+                    {
+                        /* Copy 64-bit result */
+                        a.mov(x86::rcx, x86::ptr(x86::rsi, frame_lp_offset));
+                        if (int_reg_idx < MAX_REG_INTS) {
+                            x86::Mem m1(x86::rsp,
+                                        exec_env_offset + int_reg_idx * 8);
+                            a.mov(x86::rdx, m1);
+                            x86::Mem m2(x86::rdx, 0);
+                            a.mov(m2, x86::rcx);
+                            int_reg_idx++;
+                        }
+                        else {
+                            x86::Mem m1(x86::rsp, stack_arg_offset);
+                            a.mov(x86::rdx, m1);
+                            x86::Mem m2(x86::rdx, 0);
+                            a.mov(m2, x86::rcx);
+                            stack_arg_offset += 8;
+                            stack_arg_idx++;
+                        }
+                        frame_lp_offset += 8;
+                        break;
+                    }
+                    default:
+                        bh_assert(0);
+                }
+            }
+        }
+    }
+
+    /* Free the frame allocated */
+
+    /* rdi = exec_env */
+    a.mov(x86::rdi, x86::ptr(x86::rsp, exec_env_offset));
+    /* rsi = exec_env->cur_frame */
+    a.mov(x86::rsi,
+          x86::ptr(x86::rdi, (uint32)offsetof(WASMExecEnv, cur_frame)));
+    /* rdx = exec_env->cur_frame->prev_frame */
+    a.mov(x86::rdx,
+          x86::ptr(x86::rsi, (uint32)offsetof(WASMInterpFrame, prev_frame)));
+    /* exec_env->wasm_stack.s.top = cur_frame */
+    {
+        x86::Mem m(x86::rdi, offsetof(WASMExecEnv, wasm_stack.s.top));
+        a.mov(m, x86::rsi);
+    }
+    /* exec_env->cur_frame = prev_frame */
+    {
+        x86::Mem m(x86::rdi, offsetof(WASMExecEnv, cur_frame));
+        a.mov(m, x86::rdx);
+    }
+
+    /* Pop all integer argument registers */
+    for (i = 0; i < MAX_REG_INTS; i++) {
+        a.pop(regs_i64[reg_idx_of_int_args[i]]);
+    }
+    /* Pop jit interp switch info */
+    imm.setValue((uint64)switch_info_size);
+    a.add(x86::rsp, imm);
+
+    /* Return to the caller, don't use leave as we didn't
+       `push rbp` and `mov rbp, rsp` */
+    a.ret();
+
+    if (err_handler.err) {
+        return NULL;
+    }
+
+    code_buf = (char *)code.sectionById(0)->buffer().data();
+    code_size = code.sectionById(0)->buffer().size();
+    stream = (char *)jit_code_cache_alloc(code_size);
+    if (!stream)
+        return NULL;
+
+    bh_memcpy_s(stream, code_size, code_buf, code_size);
+
+#if 0
+    printf("Code of call to fast jit of func %u:\n", func_idx);
+    dump_native(stream, code_size);
+    printf("\n");
+#endif
+
+    return stream;
+}
+
+#endif /* end of WASM_ENABLE_LAZY_JIT != 0 && WASM_ENABLE_JIT != 0 */
+
+/**
+ * Lowering hook of the code generator.
+ *
+ * This backend performs no work here: the compilation context is
+ * left untouched and success is always reported.
+ *
+ * @param cc the compilation context (unused)
+ *
+ * @return always true
+ */
+bool
+jit_codegen_lower(JitCompContext *cc)
+{
+    /* Nothing to do, only silence the unused parameter warning */
+    static_cast<void>(cc);
+
+    return true;
+}
+
+/**
+ * Hook for releasing native code of a compilation context.
+ *
+ * This backend has nothing to release here, so the function is
+ * an intentional no-op.
+ *
+ * @param cc the compilation context (unused)
+ */
+void
+jit_codegen_free_native(JitCompContext *cc)
+{
+    /* Nothing to do, only silence the unused parameter warning */
+    static_cast<void>(cc);
+}
+
+/**
+ * Dump the native code in the range [begin_addr, end_addr),
+ * surrounded by blank lines.
+ *
+ * The dump is only compiled in when WASM_ENABLE_FAST_JIT_DUMP is
+ * non-zero, otherwise the function does nothing.
+ *
+ * @param begin_addr address of the first byte of the native code
+ * @param end_addr address right after the last byte of the native code
+ */
+void
+jit_codegen_dump_native(void *begin_addr, void *end_addr)
+{
+#if WASM_ENABLE_FAST_JIT_DUMP == 0
+    /* Dumping is disabled in this build */
+    (void)begin_addr;
+    (void)end_addr;
+#else
+    char *code_start = (char *)begin_addr;
+    char *code_end = (char *)end_addr;
+
+    os_printf("\n");
+    dump_native(code_start, code_end - code_start);
+    os_printf("\n");
+#endif
+}
+
+/**
+ * Generate the glue code blocks shared by all jitted functions and
+ * publish them through the file-level code_block_* pointers and the
+ * JIT globals:
+ *   - code_block_switch_to_jitted_from_interp: saves the callee-save
+ *     registers and the switch info, sets up the exec_env and frame
+ *     registers and jumps to the jitted code whose address is in rcx
+ *   - code_block_return_to_interp_from_jitted: stores the frame and
+ *     the return values into the switch info, restores the callee-save
+ *     registers and returns
+ *   - code_block_compile_fast_jit_and_then_call (only when
+ *     WASM_ENABLE_LAZY_JIT != 0): calls jit_compiler_compile for the
+ *     function whose index is in rdx, then jumps to the compiled code,
+ *     or sets an exception and returns to the interpreter on failure
+ *
+ * Each block is assembled at offset 0 of one shared asmjit code
+ * buffer and then copied into memory from the JIT code cache.
+ *
+ * @return true if all the code blocks were generated, false otherwise;
+ *         on failure the blocks already allocated are freed and the
+ *         pointers to them are reset to NULL
+ */
+bool
+jit_codegen_init()
+{
+    const JitHardRegInfo *hreg_info = jit_codegen_get_hreg_info();
+    JitGlobals *jit_globals = jit_compiler_get_jit_globals();
+    char *code_buf, *stream;
+    uint32 code_size;
+
+    JitErrorHandler err_handler;
+    Environment env(Arch::kX64);
+    CodeHolder code;
+    code.init(env);
+    code.setErrorHandler(&err_handler);
+    x86::Assembler a(&code);
+
+    /* Initialize code_block_switch_to_jitted_from_interp */
+
+    /* push callee-save registers */
+    a.push(x86::rbp);
+    a.push(x86::rbx);
+    a.push(x86::r12);
+    a.push(x86::r13);
+    a.push(x86::r14);
+    a.push(x86::r15);
+    /* push info */
+    a.push(x86::rsi);
+
+    /* Note: the number of register pushed must be odd, as the stack pointer
+       %rsp must be aligned to a 16-byte boundary before making a call, so
+       when a function (including this function) gets control, %rsp is not
+       aligned. We push odd number registers here to make %rsp happy before
+       calling native functions. */
+
+    /* exec_env_reg = exec_env */
+    a.mov(regs_i64[hreg_info->exec_env_hreg_index], x86::rdi);
+    /* fp_reg = info->frame */
+    a.mov(x86::rbp, x86::ptr(x86::rsi, offsetof(JitInterpSwitchInfo, frame)));
+    /* rdx = func_idx, is already set in the func_idx argument of
+       jit_codegen_interp_jitted_glue  */
+    /* jmp target, rcx = pc */
+    a.jmp(x86::rcx);
+
+    if (err_handler.err)
+        return false;
+
+    code_buf = (char *)code.sectionById(0)->buffer().data();
+    code_size = code.sectionById(0)->buffer().size();
+    stream = (char *)jit_code_cache_alloc(code_size);
+    if (!stream)
+        return false;
+
+    bh_memcpy_s(stream, code_size, code_buf, code_size);
+    code_block_switch_to_jitted_from_interp = stream;
+
+#if 0
+    dump_native(stream, code_size);
+#endif
+
+    /* Initialize code_block_return_to_interp_from_jitted */
+
+    /* Reuse the code buffer: assemble the next block from offset 0 */
+    a.setOffset(0);
+
+    /* pop info */
+    a.pop(x86::rsi);
+    /* info->frame = fp_reg */
+    {
+        x86::Mem m(x86::rsi, offsetof(JitInterpSwitchInfo, frame));
+        a.mov(m, x86::rbp);
+    }
+    /* info->out.ret.ival[0, 1] = rdx */
+    {
+        x86::Mem m(x86::rsi, offsetof(JitInterpSwitchInfo, out.ret.ival));
+        a.mov(m, x86::rdx);
+    }
+    /* info->out.ret.fval[0, 1] = xmm0 */
+    {
+        x86::Mem m(x86::rsi, offsetof(JitInterpSwitchInfo, out.ret.fval));
+        a.movsd(m, x86::xmm0);
+    }
+
+    /* pop callee-save registers */
+    a.pop(x86::r15);
+    a.pop(x86::r14);
+    a.pop(x86::r13);
+    a.pop(x86::r12);
+    a.pop(x86::rbx);
+    a.pop(x86::rbp);
+    a.ret();
+
+    if (err_handler.err)
+        goto fail1;
+
+    code_buf = (char *)code.sectionById(0)->buffer().data();
+    code_size = code.sectionById(0)->buffer().size();
+    stream = (char *)jit_code_cache_alloc(code_size);
+    if (!stream)
+        goto fail1;
+
+    bh_memcpy_s(stream, code_size, code_buf, code_size);
+    code_block_return_to_interp_from_jitted =
+        jit_globals->return_to_interp_from_jitted = stream;
+
+#if 0
+    dump_native(stream, code_size);
+#endif
+
+#if WASM_ENABLE_LAZY_JIT != 0
+    /* Initialize code_block_compile_fast_jit_and_then_call */
+
+    /* Reuse the code buffer: assemble the next block from offset 0 */
+    a.setOffset(0);
+
+    /* Use rbx, r12, r13 to save func_idx, module_inst and module,
+       as they are callee-save registers */
+
+    /* Backup func_idx: rbx = rdx = func_idx, note that rdx has
+       been prepared in the caller:
+         callbc or code_block_switch_to_jitted_from_interp */
+    a.mov(x86::rbx, x86::rdx);
+    /* r12 = module_inst = exec_env->module_inst */
+    {
+        x86::Mem m(regs_i64[hreg_info->exec_env_hreg_index],
+                   (uint32)offsetof(WASMExecEnv, module_inst));
+        a.mov(x86::r12, m);
+    }
+    /* rdi = r13 = module_inst->module */
+    {
+        x86::Mem m(x86::r12, (uint32)offsetof(WASMModuleInstance, module));
+        a.mov(x86::rdi, m);
+        a.mov(x86::r13, x86::rdi);
+    }
+    /* rsi = rdx = func_idx */
+    a.mov(x86::rsi, x86::rdx);
+    /* Call jit_compiler_compile(module, func_idx) */
+    {
+        Imm imm((uint64)(uintptr_t)jit_compiler_compile);
+        a.mov(x86::rax, imm);
+        a.call(x86::rax);
+    }
+
+    /* Check if failed to compile the jit function */
+    {
+        /* Did jit_compiler_compile return false? */
+        Imm imm((uint8)0);
+        a.cmp(x86::al, imm);
+        /* If no, jump to `Load compiled func ptr and call it`,
+           the INT32_MAX placeholder forces a 32-bit displacement
+           which is patched below once the target is known */
+        imm.setValue(INT32_MAX);
+        a.jne(imm);
+
+        /* Remember where the jne instruction ends as an offset but
+           not as a pointer into the code buffer: the buffer may be
+           reallocated while the instructions below are emitted,
+           which would leave a raw pointer dangling */
+        size_t jne_end_offset = a.code()->sectionById(0)->buffer().size();
+
+        /* If yes, call jit_set_exception_with_id to throw exception,
+           and then set eax to JIT_INTERP_ACTION_THROWN, and jump to
+           code_block_return_to_interp_from_jitted to return */
+
+        /* rdi = module_inst */
+        a.mov(x86::rdi, x86::r12);
+        /* rsi = EXCE_FAILED_TO_COMPILE_FAST_JIT_FUNC */
+        imm.setValue(EXCE_FAILED_TO_COMPILE_FAST_JIT_FUNC);
+        a.mov(x86::rsi, imm);
+        /* Call jit_set_exception_with_id */
+        imm.setValue((uint64)(uintptr_t)jit_set_exception_with_id);
+        a.mov(x86::rax, imm);
+        a.call(x86::rax);
+        /* Return to the caller */
+        imm.setValue(JIT_INTERP_ACTION_THROWN);
+        a.mov(x86::eax, imm);
+        imm.setValue(code_block_return_to_interp_from_jitted);
+        a.mov(x86::rsi, imm);
+        a.jmp(x86::rsi);
+
+        /* Patch the offset of jne instruction: the 32-bit displacement
+           occupies the last 4 bytes of the instruction and is relative
+           to the end of the instruction. Fetch the buffer data pointer
+           only now, after all the code above has been emitted */
+        char *buf_data = (char *)a.code()->sectionById(0)->buffer().data();
+        size_t jne_target_offset =
+            a.code()->sectionById(0)->buffer().size();
+        *(int32 *)(buf_data + jne_end_offset - 4) =
+            (int32)(jne_target_offset - jne_end_offset);
+    }
+
+    /* Load compiled func ptr and call it */
+    {
+        /* rsi = module->import_function_count, loaded with a 32-bit
+           mov which zero-extends into rsi: movzx only accepts 8-bit
+           and 16-bit source operands, so it cannot load the whole
+           field (declared as uint32 in WASMModule, to be confirmed
+           against wasm.h) */
+        x86::Mem m1(x86::r13,
+                    (uint32)offsetof(WASMModule, import_function_count));
+        a.mov(x86::esi, m1);
+        /* rbx = rbx - module->import_function_count */
+        a.sub(x86::rbx, x86::rsi);
+        /* rax = module->fast_jit_func_ptrs */
+        x86::Mem m2(x86::r13, (uint32)offsetof(WASMModule, fast_jit_func_ptrs));
+        a.mov(x86::rax, m2);
+        /* rax = fast_jit_func_ptrs[rbx] */
+        x86::Mem m3(x86::rax, x86::rbx, 3, 0);
+        a.mov(x86::rax, m3);
+        a.jmp(x86::rax);
+    }
+
+    if (err_handler.err)
+        goto fail2;
+
+    code_buf = (char *)code.sectionById(0)->buffer().data();
+    code_size = code.sectionById(0)->buffer().size();
+    stream = (char *)jit_code_cache_alloc(code_size);
+    if (!stream)
+        goto fail2;
+
+    bh_memcpy_s(stream, code_size, code_buf, code_size);
+    code_block_compile_fast_jit_and_then_call =
+        jit_globals->compile_fast_jit_and_then_call = stream;
+
+#if 0
+    dump_native(stream, code_size);
+#endif
+#endif /* end of WASM_ENABLE_LAZY_JIT != 0 */
+
+    return true;
+
+#if WASM_ENABLE_LAZY_JIT != 0
+fail2:
+    jit_code_cache_free(code_block_return_to_interp_from_jitted);
+    /* Don't leave pointers to the freed block behind */
+    code_block_return_to_interp_from_jitted = NULL;
+    jit_globals->return_to_interp_from_jitted = NULL;
+#endif
+fail1:
+    jit_code_cache_free(code_block_switch_to_jitted_from_interp);
+    /* Don't leave a pointer to the freed block behind */
+    code_block_switch_to_jitted_from_interp = NULL;
+    return false;
+}
+
+/**
+ * Release the glue code blocks created by jit_codegen_init, in the
+ * reverse order of their creation.
+ */
+void
+jit_codegen_destroy()
+{
+    /* Code blocks to free, listed in the order they are freed */
+    char *const blocks_to_free[] = {
+#if WASM_ENABLE_LAZY_JIT != 0
+        code_block_compile_fast_jit_and_then_call,
+#endif
+        code_block_return_to_interp_from_jitted,
+        code_block_switch_to_jitted_from_interp,
+    };
+    const size_t block_count =
+        sizeof(blocks_to_free) / sizeof(blocks_to_free[0]);
+
+    for (size_t idx = 0; idx < block_count; idx++) {
+        jit_code_cache_free(blocks_to_free[idx]);
+    }
+}
+
+/* clang-format off */
+/* Attribute table of the 32-bit integer hard registers: one column
+   per register in the order listed below, one row per attribute.
+   The rows are referenced by the I32 entry of `hreg_info` in the
+   order: fixed, caller_saved_native, caller_saved_jitted.
+   Presumably a 1 marks the register as having the attribute of
+   that row - confirm against the users of JitHardRegInfo. */
+static const uint8 hreg_info_I32[3][7] = {
+    /* ebp, eax, ebx, ecx, edx, edi, esi */
+    { 1, 0, 0, 0, 0, 0, 1 }, /* fixed, esi is freely used */
+    { 0, 1, 0, 1, 1, 1, 0 }, /* caller_saved_native */
+    { 0, 1, 1, 1, 1, 1, 0 }  /* caller_saved_jitted */
+};
+
+/* Attribute table of the 64-bit integer hard registers: one column
+   per register in the order listed below, one row per attribute.
+   The rows are referenced by the I64 entry of `hreg_info` in the
+   order: fixed, caller_saved_native, caller_saved_jitted.
+   Presumably a 1 marks the register as having the attribute of
+   that row - confirm against the users of JitHardRegInfo. */
+static const uint8 hreg_info_I64[3][16] = {
+    /* rbp, rax, rbx, rcx, rdx, rdi, rsi, rsp,
+       r8,  r9,  r10, r11, r12, r13, r14, r15 */
+    { 1, 1, 1, 1, 1, 1, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 1 }, /* fixed, rsi is freely used */
+    { 0, 1, 0, 1, 1, 1, 0, 0,
+      1, 1, 1, 1, 0, 0, 0, 0 }, /* caller_saved_native */
+    { 0, 1, 1, 1, 1, 1, 0, 0,
+      1, 1, 1, 1, 1, 1, 1, 0 }, /* caller_saved_jitted */
+};
+
+/* System V AMD64 ABI Calling Convention. [XYZ]MM0-7 */
+/* Attribute table of the xmm registers when used as F32 registers:
+   one column per register, one row per attribute. The rows are
+   referenced by the F32 entry of `hreg_info` in the order:
+   fixed, caller_saved_native, caller_saved_jitted.
+   Note that the first row sets xmm8 ~ xmm15 while the first row of
+   hreg_info_F64 sets xmm0 ~ xmm7 and xmm15. */
+static uint8 hreg_info_F32[3][16] = {
+    /* xmm0 ~ xmm15 */
+    { 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 1, 1, 1, 1, 1, 1, 1 }, /* fixed */
+    { 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 0 }, /* caller_saved_native */
+    { 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 0 }, /* caller_saved_jitted */
+};
+
+/* System V AMD64 ABI Calling Convention. [XYZ]MM0-7 */
+/* Attribute table of the xmm registers when used as F64 registers:
+   one column per register, one row per attribute. The rows are
+   referenced by the F64 entry of `hreg_info` in the order:
+   fixed, caller_saved_native, caller_saved_jitted.
+   Note that the first row sets xmm0 ~ xmm7 and xmm15 while the
+   first row of hreg_info_F32 sets xmm8 ~ xmm15. */
+static uint8 hreg_info_F64[3][16] = {
+    /* xmm0 ~ xmm15 */
+    { 1, 1, 1, 1, 1, 1, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 1 }, /* fixed */
+    { 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 0 }, /* caller_saved_native */
+    { 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 0 }, /* caller_saved_jitted */
+};
+
+/* Hard register description of this backend, returned to callers by
+   jit_codegen_get_hreg_info. The first member holds one entry per
+   register kind, each made of the number of registers followed by
+   the fixed, caller_saved_native and caller_saved_jitted rows of the
+   matching attribute table; kinds without hard registers get an
+   empty entry. The remaining members are indexes of registers with
+   a dedicated role. */
+static const JitHardRegInfo hreg_info = {
+    {
+        { 0, NULL, NULL, NULL }, /* VOID */
+
+        /* The register count of each kind is the length of one row
+           of its attribute table (the rows are arrays of uint8) */
+        { sizeof(hreg_info_I32[0]), /* I32 */
+          hreg_info_I32[0],
+          hreg_info_I32[1],
+          hreg_info_I32[2] },
+
+        { sizeof(hreg_info_I64[0]), /* I64 */
+          hreg_info_I64[0],
+          hreg_info_I64[1],
+          hreg_info_I64[2] },
+
+        { sizeof(hreg_info_F32[0]), /* F32 */
+          hreg_info_F32[0],
+          hreg_info_F32[1],
+          hreg_info_F32[2] },
+
+        { sizeof(hreg_info_F64[0]), /* F64 */
+          hreg_info_F64[0],
+          hreg_info_F64[1],
+          hreg_info_F64[2] },
+
+        { 0, NULL, NULL, NULL }, /* V8 */
+        { 0, NULL, NULL, NULL }, /* V16 */
+        { 0, NULL, NULL, NULL }  /* V32 */
+    },
+    /* frame pointer hreg index: rbp */
+    0,
+    /* exec_env hreg index: r15 */
+    15,
+    /* cmp hreg index: esi */
+    6
+};
+/* clang-format on */
+
+/**
+ * Get the hard register description of this backend.
+ *
+ * @return pointer to the file-level, read-only `hreg_info` object
+ */
+const JitHardRegInfo *
+jit_codegen_get_hreg_info()
+{
+    const JitHardRegInfo *info = &hreg_info;
+
+    return info;
+}
+
+/* Names of the 32-bit integer registers, searched by
+   jit_codegen_get_hreg_by_name: the position of a name is used as
+   the register number of the returned I32 register.
+   NOTE(review): "esp" sits at index 7 while hreg_info_I32 only
+   describes 7 registers (indexes 0 ~ 6) - confirm this is intended. */
+static const char *reg_names_i32[] = {
+    "ebp", "eax", "ebx", "ecx", "edx", "edi", "esi", "esp",
+};
+
+/* Names of the 64-bit integer registers, the position of a name is
+   used as the register number of the returned I64 register */
+static const char *reg_names_i64[] = {
+    "rbp", "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "rsp",
+    "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
+};
+
+/* Names of the xmm registers when referred to as F32 registers, the
+   position of a name is used as the register number */
+static const char *reg_names_f32[] = { "xmm0",  "xmm1",  "xmm2",  "xmm3",
+                                       "xmm4",  "xmm5",  "xmm6",  "xmm7",
+                                       "xmm8",  "xmm9",  "xmm10", "xmm11",
+                                       "xmm12", "xmm13", "xmm14", "xmm15" };
+
+/* Names of the xmm registers when referred to as F64 registers: the
+   "_f64" suffix is what jit_codegen_get_hreg_by_name uses to tell
+   them apart from the F32 names */
+static const char *reg_names_f64[] = {
+    "xmm0_f64",  "xmm1_f64",  "xmm2_f64",  "xmm3_f64", "xmm4_f64",  "xmm5_f64",
+    "xmm6_f64",  "xmm7_f64",  "xmm8_f64",  "xmm9_f64", "xmm10_f64", "xmm11_f64",
+    "xmm12_f64", "xmm13_f64", "xmm14_f64", "xmm15_f64"
+};
+
+/* Return the position of `name` in the first `count` entries of
+   `names`, or -1 if it isn't found */
+static int
+find_hreg_name_index(const char *const *names, size_t count,
+                     const char *name)
+{
+    size_t i;
+
+    for (i = 0; i < count; i++) {
+        if (!strcmp(names[i], name))
+            return (int)i;
+    }
+    return -1;
+}
+
+/**
+ * Get the hard register with the given name.
+ *
+ * The name tables are selected by the form of the name: names
+ * starting with 'e' are searched in the I32 names, names starting
+ * with 'r' in the I64 names, and names starting with "xmm" in the
+ * F64 names if they contain "_f64" or in the F32 names otherwise.
+ * The position of the name in its table is the register number.
+ *
+ * @param name the register name, e.g. "eax", "r15", "xmm0" or
+ *        "xmm0_f64"; NULL is accepted and treated as unknown
+ *
+ * @return the register of the matching kind and number, or 0 if
+ *         the name is NULL or unknown
+ */
+JitReg
+jit_codegen_get_hreg_by_name(const char *name)
+{
+    int reg_no;
+
+    /* Don't dereference a NULL name, report it as unknown */
+    if (!name)
+        return 0;
+
+    if (name[0] == 'e') {
+        reg_no = find_hreg_name_index(
+            reg_names_i32, sizeof(reg_names_i32) / sizeof(char *), name);
+        if (reg_no >= 0)
+            return jit_reg_new(JIT_REG_KIND_I32, reg_no);
+    }
+    else if (name[0] == 'r') {
+        reg_no = find_hreg_name_index(
+            reg_names_i64, sizeof(reg_names_i64) / sizeof(char *), name);
+        if (reg_no >= 0)
+            return jit_reg_new(JIT_REG_KIND_I64, reg_no);
+    }
+    else if (!strncmp(name, "xmm", 3)) {
+        if (!strstr(name, "_f64")) {
+            reg_no = find_hreg_name_index(
+                reg_names_f32, sizeof(reg_names_f32) / sizeof(char *), name);
+            if (reg_no >= 0)
+                return jit_reg_new(JIT_REG_KIND_F32, reg_no);
+        }
+        else {
+            reg_no = find_hreg_name_index(
+                reg_names_f64, sizeof(reg_names_f64) / sizeof(char *), name);
+            if (reg_no >= 0)
+                return jit_reg_new(JIT_REG_KIND_F64, reg_no);
+        }
+    }
+
+    /* Unknown register name */
+    return 0;
+}