cachepc-qemu

Fork of AMDESE/qemu with changes for the cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

tcg-target.c.inc (124959B)


      1/*
      2 * Tiny Code Generator for QEMU
      3 *
      4 * Copyright (c) 2008 Fabrice Bellard
      5 *
      6 * Permission is hereby granted, free of charge, to any person obtaining a copy
      7 * of this software and associated documentation files (the "Software"), to deal
      8 * in the Software without restriction, including without limitation the rights
      9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10 * copies of the Software, and to permit persons to whom the Software is
     11 * furnished to do so, subject to the following conditions:
     12 *
     13 * The above copyright notice and this permission notice shall be included in
     14 * all copies or substantial portions of the Software.
     15 *
     16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22 * THE SOFTWARE.
     23 */
     24
     25#include "../tcg-pool.c.inc"
     26
     27#ifdef CONFIG_DEBUG_TCG
     28static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
     29#if TCG_TARGET_REG_BITS == 64
     30    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
     31#else
     32    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
     33#endif
     34    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
     35    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
     36#if TCG_TARGET_REG_BITS == 64
     37    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
     38    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
     39#endif
     40};
     41#endif
     42
     43static const int tcg_target_reg_alloc_order[] = {
     44#if TCG_TARGET_REG_BITS == 64
     45    TCG_REG_RBP,
     46    TCG_REG_RBX,
     47    TCG_REG_R12,
     48    TCG_REG_R13,
     49    TCG_REG_R14,
     50    TCG_REG_R15,
     51    TCG_REG_R10,
     52    TCG_REG_R11,
     53    TCG_REG_R9,
     54    TCG_REG_R8,
     55    TCG_REG_RCX,
     56    TCG_REG_RDX,
     57    TCG_REG_RSI,
     58    TCG_REG_RDI,
     59    TCG_REG_RAX,
     60#else
     61    TCG_REG_EBX,
     62    TCG_REG_ESI,
     63    TCG_REG_EDI,
     64    TCG_REG_EBP,
     65    TCG_REG_ECX,
     66    TCG_REG_EDX,
     67    TCG_REG_EAX,
     68#endif
     69    TCG_REG_XMM0,
     70    TCG_REG_XMM1,
     71    TCG_REG_XMM2,
     72    TCG_REG_XMM3,
     73    TCG_REG_XMM4,
     74    TCG_REG_XMM5,
     75#ifndef _WIN64
     76    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
     77       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
     78    TCG_REG_XMM6,
     79    TCG_REG_XMM7,
     80#if TCG_TARGET_REG_BITS == 64
     81    TCG_REG_XMM8,
     82    TCG_REG_XMM9,
     83    TCG_REG_XMM10,
     84    TCG_REG_XMM11,
     85    TCG_REG_XMM12,
     86    TCG_REG_XMM13,
     87    TCG_REG_XMM14,
     88    TCG_REG_XMM15,
     89#endif
     90#endif
     91};
     92
     93static const int tcg_target_call_iarg_regs[] = {
     94#if TCG_TARGET_REG_BITS == 64
     95#if defined(_WIN64)
     96    TCG_REG_RCX,
     97    TCG_REG_RDX,
     98#else
     99    TCG_REG_RDI,
    100    TCG_REG_RSI,
    101    TCG_REG_RDX,
    102    TCG_REG_RCX,
    103#endif
    104    TCG_REG_R8,
    105    TCG_REG_R9,
    106#else
    107    /* 32 bit mode uses stack based calling convention (GCC default). */
    108#endif
    109};
    110
    111static const int tcg_target_call_oarg_regs[] = {
    112    TCG_REG_EAX,
    113#if TCG_TARGET_REG_BITS == 32
    114    TCG_REG_EDX
    115#endif
    116};
    117
    118/* Constants we accept.  */
    119#define TCG_CT_CONST_S32 0x100
    120#define TCG_CT_CONST_U32 0x200
    121#define TCG_CT_CONST_I32 0x400
    122#define TCG_CT_CONST_WSZ 0x800
    123
    124/* Registers used with the L constraint, which are the first argument
    125   registers on x86_64, and two random call-clobbered registers on
    126   i386. */
    127#if TCG_TARGET_REG_BITS == 64
    128# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
    129# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
    130#else
    131# define TCG_REG_L0 TCG_REG_EAX
    132# define TCG_REG_L1 TCG_REG_EDX
    133#endif
    134
    135#define ALL_BYTEH_REGS         0x0000000fu
    136#if TCG_TARGET_REG_BITS == 64
    137# define ALL_GENERAL_REGS      0x0000ffffu
    138# define ALL_VECTOR_REGS       0xffff0000u
    139# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
    140#else
    141# define ALL_GENERAL_REGS      0x000000ffu
    142# define ALL_VECTOR_REGS       0x00ff0000u
    143# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
    144#endif
    145#ifdef CONFIG_SOFTMMU
    146# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
    147#else
    148# define SOFTMMU_RESERVE_REGS  0
    149#endif
    150
    151/* The host compiler should supply <cpuid.h> to enable runtime feature
    152   detection, as we're not going to go so far as to write our own inline
    153   assembly.  If not available, default values will be assumed.  */
    154#if defined(CONFIG_CPUID_H)
    155#include "qemu/cpuid.h"
    156#endif
    157
    158/* For 64-bit, we always know that CMOV is available.  */
    159#if TCG_TARGET_REG_BITS == 64
    160# define have_cmov 1
    161#elif defined(CONFIG_CPUID_H)
    162static bool have_cmov;
    163#else
    164# define have_cmov 0
    165#endif
    166
    167/* We need these symbols in tcg-target.h, and we can't properly conditionalize
    168   it there.  Therefore we always define the variables.  */
    169bool have_bmi1;
    170bool have_popcnt;
    171bool have_avx1;
    172bool have_avx2;
    173bool have_movbe;
    174
    175#ifdef CONFIG_CPUID_H
    176static bool have_bmi2;
    177static bool have_lzcnt;
    178#else
    179# define have_bmi2 0
    180# define have_lzcnt 0
    181#endif
    182
    183static const tcg_insn_unit *tb_ret_addr;
    184
    185static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
    186                        intptr_t value, intptr_t addend)
    187{
    188    value += addend;
    189    switch(type) {
    190    case R_386_PC32:
    191        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
    192        if (value != (int32_t)value) {
    193            return false;
    194        }
    195        /* FALLTHRU */
    196    case R_386_32:
    197        tcg_patch32(code_ptr, value);
    198        break;
    199    case R_386_PC8:
    200        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
    201        if (value != (int8_t)value) {
    202            return false;
    203        }
    204        tcg_patch8(code_ptr, value);
    205        break;
    206    default:
    207        tcg_abort();
    208    }
    209    return true;
    210}
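       /*
        * Both pc-relative cases subtract the address of the field being
        * patched, so callers pass an addend of -4 (R_386_PC32) or -1
        * (R_386_PC8) to make the displacement relative to the end of the
        * instruction; see tcg_out_jxx() and the constant-pool users below.
        */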
    211
    212/* test if a constant matches the constraint */
    213static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
    214{
    215    if (ct & TCG_CT_CONST) {
    216        return 1;
    217    }
    218    if (type == TCG_TYPE_I32) {
    219        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
    220            return 1;
    221        }
    222    } else {
    223        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
    224            return 1;
    225        }
    226        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
    227            return 1;
    228        }
    229        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
    230            return 1;
    231        }
    232    }
    233    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
    234        return 1;
    235    }
    236    return 0;
    237}
    238
    239# define LOWREGMASK(x)	((x) & 7)
    240
    241#define P_EXT		0x100		/* 0x0f opcode prefix */
    242#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
    243#define P_DATA16        0x400           /* 0x66 opcode prefix */
    244#define P_VEXW          0x1000          /* Set VEX.W = 1 */
    245#if TCG_TARGET_REG_BITS == 64
    246# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
    247# define P_REXB_R       0x2000          /* REG field as byte register */
    248# define P_REXB_RM      0x4000          /* R/M field as byte register */
    249# define P_GS           0x8000          /* gs segment override */
    250#else
    251# define P_REXW		0
    252# define P_REXB_R	0
    253# define P_REXB_RM	0
    254# define P_GS           0
    255#endif
    256#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
    257#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
    258#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
    259#define P_VEXL          0x80000         /* Set VEX.L = 1 */
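       /*
        * For example, OPC_MOVZWL below is (0xb7 | P_EXT) and is rendered by
        * tcg_out_opc() as the bytes 0f b7 (movzwl), while OPC_PADDD is
        * (0xfe | P_EXT | P_DATA16) and becomes 66 0f fe (paddd).  The P_*
        * flags only select prefix bytes; the low byte is always the final
        * opcode byte.
        */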
    260
    261#define OPC_ARITH_EvIz	(0x81)
    262#define OPC_ARITH_EvIb	(0x83)
    263#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
    264#define OPC_ANDN        (0xf2 | P_EXT38)
    265#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
    266#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
    267#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
    268#define OPC_BSF         (0xbc | P_EXT)
    269#define OPC_BSR         (0xbd | P_EXT)
    270#define OPC_BSWAP	(0xc8 | P_EXT)
    271#define OPC_CALL_Jz	(0xe8)
    272#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
    273#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
    274#define OPC_DEC_r32	(0x48)
    275#define OPC_IMUL_GvEv	(0xaf | P_EXT)
    276#define OPC_IMUL_GvEvIb	(0x6b)
    277#define OPC_IMUL_GvEvIz	(0x69)
    278#define OPC_INC_r32	(0x40)
    279#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
    280#define OPC_JCC_short	(0x70)		/* ... plus condition code */
    281#define OPC_JMP_long	(0xe9)
    282#define OPC_JMP_short	(0xeb)
    283#define OPC_LEA         (0x8d)
    284#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
    285#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
    286#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
    287#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
    288#define OPC_MOVB_EvIz   (0xc6)
    289#define OPC_MOVL_EvIz	(0xc7)
    290#define OPC_MOVL_Iv     (0xb8)
    291#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
    292#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
    293#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
    294#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
    295#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
    296#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
    297#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
    298#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
    299#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
    300#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
    301#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
    302#define OPC_MOVSBL	(0xbe | P_EXT)
    303#define OPC_MOVSWL	(0xbf | P_EXT)
    304#define OPC_MOVSLQ	(0x63 | P_REXW)
    305#define OPC_MOVZBL	(0xb6 | P_EXT)
    306#define OPC_MOVZWL	(0xb7 | P_EXT)
    307#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
    308#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
    309#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
    310#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
    311#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
    312#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
    313#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
    314#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
    315#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
    316#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
    317#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
    318#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
    319#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
    320#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
    321#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
    322#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
    323#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
    324#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
    325#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
    326#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
    327#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
    328#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
    329#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
    330#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
    331#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
    332#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
    333#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
    334#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
    335#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
    336#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
    337#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
    338#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
    339#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
    340#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
    341#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
    342#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
    343#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
    344#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
    345#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
    346#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
    347#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
    348#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
    349#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
    350#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
    351#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
    352#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
    353#define OPC_POR         (0xeb | P_EXT | P_DATA16)
    354#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
    355#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
    356#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
    357#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
    358#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
    359#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
    360#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
    361#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
    362#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
    363#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
    364#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
    365#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
    366#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
    367#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
    368#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
    369#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
    370#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
    371#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
    372#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
    373#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
    374#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
    375#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
    376#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
    377#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
    378#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
    379#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
    380#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
    381#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
    382#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
    383#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
    384#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
    385#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
    386#define OPC_POP_r32	(0x58)
    387#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
    388#define OPC_PUSH_r32	(0x50)
    389#define OPC_PUSH_Iv	(0x68)
    390#define OPC_PUSH_Ib	(0x6a)
    391#define OPC_RET		(0xc3)
    392#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
    393#define OPC_SHIFT_1	(0xd1)
    394#define OPC_SHIFT_Ib	(0xc1)
    395#define OPC_SHIFT_cl	(0xd3)
    396#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
    397#define OPC_SHUFPS      (0xc6 | P_EXT)
    398#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
    399#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
    400#define OPC_SHRD_Ib     (0xac | P_EXT)
    401#define OPC_TESTL	(0x85)
    402#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
    403#define OPC_UD2         (0x0b | P_EXT)
    404#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
    405#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
    406#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
    407#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
    408#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
    409#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
    410#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
    411#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
    412#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
    413#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
    414#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
    415#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
    416#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
    417#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
    418#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
    419#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
    420#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
    421#define OPC_VZEROUPPER  (0x77 | P_EXT)
    422#define OPC_XCHG_ax_r32	(0x90)
    423
    424#define OPC_GRP3_Ev	(0xf7)
    425#define OPC_GRP5	(0xff)
    426#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
    427
    428/* Group 1 opcode extensions for 0x80-0x83.
    429   These are also used as modifiers for OPC_ARITH.  */
    430#define ARITH_ADD 0
    431#define ARITH_OR  1
    432#define ARITH_ADC 2
    433#define ARITH_SBB 3
    434#define ARITH_AND 4
    435#define ARITH_SUB 5
    436#define ARITH_XOR 6
    437#define ARITH_CMP 7
    438
    439/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
    440#define SHIFT_ROL 0
    441#define SHIFT_ROR 1
    442#define SHIFT_SHL 4
    443#define SHIFT_SHR 5
    444#define SHIFT_SAR 7
    445
    446/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
    447#define EXT3_NOT   2
    448#define EXT3_NEG   3
    449#define EXT3_MUL   4
    450#define EXT3_IMUL  5
    451#define EXT3_DIV   6
    452#define EXT3_IDIV  7
    453
    454/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
    455#define EXT5_INC_Ev	0
    456#define EXT5_DEC_Ev	1
    457#define EXT5_CALLN_Ev	2
    458#define EXT5_JMPN_Ev	4
    459
    460/* Condition codes to be added to OPC_JCC_{long,short}.  */
    461#define JCC_JMP (-1)
    462#define JCC_JO  0x0
    463#define JCC_JNO 0x1
    464#define JCC_JB  0x2
    465#define JCC_JAE 0x3
    466#define JCC_JE  0x4
    467#define JCC_JNE 0x5
    468#define JCC_JBE 0x6
    469#define JCC_JA  0x7
    470#define JCC_JS  0x8
    471#define JCC_JNS 0x9
    472#define JCC_JP  0xa
    473#define JCC_JNP 0xb
    474#define JCC_JL  0xc
    475#define JCC_JGE 0xd
    476#define JCC_JLE 0xe
    477#define JCC_JG  0xf
    478
    479static const uint8_t tcg_cond_to_jcc[] = {
    480    [TCG_COND_EQ] = JCC_JE,
    481    [TCG_COND_NE] = JCC_JNE,
    482    [TCG_COND_LT] = JCC_JL,
    483    [TCG_COND_GE] = JCC_JGE,
    484    [TCG_COND_LE] = JCC_JLE,
    485    [TCG_COND_GT] = JCC_JG,
    486    [TCG_COND_LTU] = JCC_JB,
    487    [TCG_COND_GEU] = JCC_JAE,
    488    [TCG_COND_LEU] = JCC_JBE,
    489    [TCG_COND_GTU] = JCC_JA,
    490};
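       /*
        * The unsigned conditions map to the CF-based below/above family and
        * the signed ones to the SF/OF-based less/greater family, so a single
        * CMP sets every flag that any of these jcc encodings will test.
        */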
    491
    492#if TCG_TARGET_REG_BITS == 64
    493static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
    494{
    495    int rex;
    496
    497    if (opc & P_GS) {
    498        tcg_out8(s, 0x65);
    499    }
    500    if (opc & P_DATA16) {
    501        /* We should never be asking for both 16 and 64-bit operation.  */
    502        tcg_debug_assert((opc & P_REXW) == 0);
    503        tcg_out8(s, 0x66);
    504    }
    505    if (opc & P_SIMDF3) {
    506        tcg_out8(s, 0xf3);
    507    } else if (opc & P_SIMDF2) {
    508        tcg_out8(s, 0xf2);
    509    }
    510
    511    rex = 0;
    512    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    513    rex |= (r & 8) >> 1;                /* REX.R */
    514    rex |= (x & 8) >> 2;                /* REX.X */
    515    rex |= (rm & 8) >> 3;               /* REX.B */
    516
    517    /* P_REXB_{R,RM} indicates that the given register is the low byte.
    518       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
    519       as otherwise the encoding indicates %[abcd]h.  Note that the values
    520       that are ORed in merely indicate that the REX byte must be present;
    521       those bits get discarded in output.  */
    522    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    523    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
    524
    525    if (rex) {
    526        tcg_out8(s, (uint8_t)(rex | 0x40));
    527    }
    528
    529    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
    530        tcg_out8(s, 0x0f);
    531        if (opc & P_EXT38) {
    532            tcg_out8(s, 0x38);
    533        } else if (opc & P_EXT3A) {
    534            tcg_out8(s, 0x3a);
    535        }
    536    }
    537
    538    tcg_out8(s, opc);
    539}
    540#else
    541static void tcg_out_opc(TCGContext *s, int opc)
    542{
    543    if (opc & P_DATA16) {
    544        tcg_out8(s, 0x66);
    545    }
    546    if (opc & P_SIMDF3) {
    547        tcg_out8(s, 0xf3);
    548    } else if (opc & P_SIMDF2) {
    549        tcg_out8(s, 0xf2);
    550    }
    551    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
    552        tcg_out8(s, 0x0f);
    553        if (opc & P_EXT38) {
    554            tcg_out8(s, 0x38);
    555        } else if (opc & P_EXT3A) {
    556            tcg_out8(s, 0x3a);
    557        }
    558    }
    559    tcg_out8(s, opc);
    560}
    561/* Discard the register arguments to tcg_out_opc early, so as not to penalize
    562   the 32-bit compilation paths.  This method works with all versions of gcc,
    563   whereas relying on optimization may not be able to exclude them.  */
    564#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
    565#endif
    566
    567static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
    568{
    569    tcg_out_opc(s, opc, r, rm, 0);
    570    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    571}
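       /*
        * A worked example: tcg_out_modrm(s, OPC_MOVL_GvEv + P_REXW,
        * TCG_REG_RAX, TCG_REG_RBX) emits 48 8b c3, i.e. "movq %rbx, %rax":
        * a REX.W prefix from P_REXW, the 0x8b opcode byte, and ModRM 0xc3
        * selecting the register-direct form with reg=RAX, r/m=RBX.
        */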
    572
    573static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
    574                            int rm, int index)
    575{
    576    int tmp;
    577
    578    /* Use the two byte form if possible, which cannot encode
    579       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    580    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
    581        && ((rm | index) & 8) == 0) {
    582        /* Two byte VEX prefix.  */
    583        tcg_out8(s, 0xc5);
    584
    585        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    586    } else {
    587        /* Three byte VEX prefix.  */
    588        tcg_out8(s, 0xc4);
    589
    590        /* VEX.m-mmmm */
    591        if (opc & P_EXT3A) {
    592            tmp = 3;
    593        } else if (opc & P_EXT38) {
    594            tmp = 2;
    595        } else if (opc & P_EXT) {
    596            tmp = 1;
    597        } else {
    598            g_assert_not_reached();
    599        }
    600        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
    601        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
    602        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
    603        tcg_out8(s, tmp);
    604
    605        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    606    }
    607
    608    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
    609    /* VEX.pp */
    610    if (opc & P_DATA16) {
    611        tmp |= 1;                          /* 0x66 */
    612    } else if (opc & P_SIMDF3) {
    613        tmp |= 2;                          /* 0xf3 */
    614    } else if (opc & P_SIMDF2) {
    615        tmp |= 3;                          /* 0xf2 */
    616    }
    617    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
    618    tcg_out8(s, tmp);
    619    tcg_out8(s, opc);
    620}
    621
    622static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
    623{
    624    tcg_out_vex_opc(s, opc, r, v, rm, 0);
    625    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    626}
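       /*
        * For instance, tcg_out_vex_modrm(s, OPC_PXOR, TCG_REG_XMM0,
        * TCG_REG_XMM0, TCG_REG_XMM0) takes the two-byte VEX path and emits
        * c5 f9 ef c0, i.e. "vpxor %xmm0, %xmm0, %xmm0": 0xf9 packs the
        * inverted R and vvvv bits plus pp=1 (the 0x66 prefix), followed by
        * the 0xef opcode and ModRM 0xc0.
        */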
    627
    628/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
    629   We handle either RM or INDEX missing with a negative value.  In 64-bit
    630   mode for absolute addresses, ~RM is the size of the immediate operand
    631   that will follow the instruction.  */
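       /*
        * Concretely, in the function below: a zero offset uses the shortest
        * mod=00 form unless the base encodes as 101 (%ebp/%r13, which would
        * mean absolute or rip-relative), offsets in [-128,127] use a disp8,
        * and anything else a disp32; a base encoding of 100 (%esp/%r12)
        * always requires the SIB escape, and %esp can never be an index.
        */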
    632
    633static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
    634                               int shift, intptr_t offset)
    635{
    636    int mod, len;
    637
    638    if (index < 0 && rm < 0) {
    639        if (TCG_TARGET_REG_BITS == 64) {
    640            /* Try for a rip-relative addressing mode.  This has replaced
    641               the 32-bit-mode absolute addressing encoding.  */
    642            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
    643            intptr_t disp = offset - pc;
    644            if (disp == (int32_t)disp) {
    645                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
    646                tcg_out32(s, disp);
    647                return;
    648            }
    649
    650            /* Try for an absolute address encoding.  This requires the
    651               use of the MODRM+SIB encoding and is therefore larger than
    652               rip-relative addressing.  */
    653            if (offset == (int32_t)offset) {
    654                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
    655                tcg_out8(s, (4 << 3) | 5);
    656                tcg_out32(s, offset);
    657                return;
    658            }
    659
    660            /* ??? The memory isn't directly addressable.  */
    661            g_assert_not_reached();
    662        } else {
    663            /* Absolute address.  */
    664            tcg_out8(s, (r << 3) | 5);
    665            tcg_out32(s, offset);
    666            return;
    667        }
    668    }
    669
    670    /* Find the length of the immediate addend.  Note that the encoding
    671       that would be used for (%ebp) indicates absolute addressing.  */
    672    if (rm < 0) {
    673        mod = 0, len = 4, rm = 5;
    674    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
    675        mod = 0, len = 0;
    676    } else if (offset == (int8_t)offset) {
    677        mod = 0x40, len = 1;
    678    } else {
    679        mod = 0x80, len = 4;
    680    }
    681
    682    /* Use a single byte MODRM format if possible.  Note that the encoding
    683       that would be used for %esp is the escape to the two byte form.  */
    684    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
    685        /* Single byte MODRM format.  */
    686        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    687    } else {
    688        /* Two byte MODRM+SIB format.  */
    689
    690        /* Note that the encoding that would place %esp into the index
    691           field indicates no index register.  In 64-bit mode, the REX.X
    692           bit counts, so %r12 can be used as the index.  */
    693        if (index < 0) {
    694            index = 4;
    695        } else {
    696            tcg_debug_assert(index != TCG_REG_ESP);
    697        }
    698
    699        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
    700        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    701    }
    702
    703    if (len == 1) {
    704        tcg_out8(s, offset);
    705    } else if (len == 4) {
    706        tcg_out32(s, offset);
    707    }
    708}
    709
    710static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
    711                                     int index, int shift, intptr_t offset)
    712{
    713    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    714    tcg_out_sib_offset(s, r, rm, index, shift, offset);
    715}
    716
    717static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
    718                                         int rm, int index, int shift,
    719                                         intptr_t offset)
    720{
    721    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    722    tcg_out_sib_offset(s, r, rm, index, shift, offset);
    723}
    724
    725/* A simplification of the above with no index or shift.  */
    726static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
    727                                        int rm, intptr_t offset)
    728{
    729    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
    730}
    731
    732static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
    733                                            int v, int rm, intptr_t offset)
    734{
    735    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
    736}
    737
    738/* Output an opcode with an expected reference to the constant pool.  */
    739static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
    740{
    741    tcg_out_opc(s, opc, r, 0, 0);
    742    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    743    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    744    tcg_out32(s, 0);
    745}
    746
    747/* Output an opcode with an expected reference to the constant pool.  */
    748static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
    749{
    750    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    751    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    752    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    753    tcg_out32(s, 0);
    754}
    755
    756/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
    757static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
    758{
    759    /* Propagate an opcode prefix, such as P_REXW.  */
    760    int ext = subop & ~0x7;
    761    subop &= 0x7;
    762
    763    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
    764}
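       /*
        * E.g. tgen_arithr(s, ARITH_SUB + P_REXW, a, b) emits the GvEv form
        * 0x2b (0x03 + (ARITH_SUB << 3)) with a REX.W prefix, i.e.
        * "subq %b, %a".
        */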
    765
    766static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
    767{
    768    int rexw = 0;
    769
    770    if (arg == ret) {
    771        return true;
    772    }
    773    switch (type) {
    774    case TCG_TYPE_I64:
    775        rexw = P_REXW;
    776        /* fallthru */
    777    case TCG_TYPE_I32:
    778        if (ret < 16) {
    779            if (arg < 16) {
    780                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
    781            } else {
    782                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
    783            }
    784        } else {
    785            if (arg < 16) {
    786                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
    787            } else {
    788                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
    789            }
    790        }
    791        break;
    792
    793    case TCG_TYPE_V64:
    794        tcg_debug_assert(ret >= 16 && arg >= 16);
    795        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
    796        break;
    797    case TCG_TYPE_V128:
    798        tcg_debug_assert(ret >= 16 && arg >= 16);
    799        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
    800        break;
    801    case TCG_TYPE_V256:
    802        tcg_debug_assert(ret >= 16 && arg >= 16);
    803        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
    804        break;
    805
    806    default:
    807        g_assert_not_reached();
    808    }
    809    return true;
    810}
    811
    812static const int avx2_dup_insn[4] = {
    813    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    814    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
    815};
    816
    817static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
    818                            TCGReg r, TCGReg a)
    819{
    820    if (have_avx2) {
    821        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
    822        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    823    } else {
    824        switch (vece) {
    825        case MO_8:
    826            /* ??? With zero in a register, use PSHUFB.  */
    827            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
    828            a = r;
    829            /* FALLTHRU */
    830        case MO_16:
    831            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
    832            a = r;
    833            /* FALLTHRU */
    834        case MO_32:
    835            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
    836            /* imm8 operand: all output lanes selected from input lane 0.  */
    837            tcg_out8(s, 0);
    838            break;
    839        case MO_64:
    840            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
    841            break;
    842        default:
    843            g_assert_not_reached();
    844        }
    845    }
    846    return true;
    847}
    848
    849static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
    850                             TCGReg r, TCGReg base, intptr_t offset)
    851{
    852    if (have_avx2) {
    853        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
    854        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
    855                                 r, 0, base, offset);
    856    } else {
    857        switch (vece) {
    858        case MO_64:
    859            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
    860            break;
    861        case MO_32:
    862            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
    863            break;
    864        case MO_16:
    865            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
    866            tcg_out8(s, 0); /* imm8 */
    867            tcg_out_dup_vec(s, type, vece, r, r);
    868            break;
    869        case MO_8:
    870            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
    871            tcg_out8(s, 0); /* imm8 */
    872            tcg_out_dup_vec(s, type, vece, r, r);
    873            break;
    874        default:
    875            g_assert_not_reached();
    876        }
    877    }
    878    return true;
    879}
    880
    881static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
    882                             TCGReg ret, int64_t arg)
    883{
    884    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
    885
    886    if (arg == 0) {
    887        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
    888        return;
    889    }
    890    if (arg == -1) {
    891        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
    892        return;
    893    }
    894
    895    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
    896        if (have_avx2) {
    897            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
    898        } else {
    899            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
    900        }
    901        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    902    } else {
    903        if (type == TCG_TYPE_V64) {
    904            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
    905        } else if (have_avx2) {
    906            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
    907        } else {
    908            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
    909        }
    910        if (TCG_TARGET_REG_BITS == 64) {
    911            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    912        } else {
    913            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
    914        }
    915    }
    916}
    917
    918static void tcg_out_movi_vec(TCGContext *s, TCGType type,
    919                             TCGReg ret, tcg_target_long arg)
    920{
    921    if (arg == 0) {
    922        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
    923        return;
    924    }
    925    if (arg == -1) {
    926        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
    927        return;
    928    }
    929
    930    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    931    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    932    if (TCG_TARGET_REG_BITS == 64) {
    933        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    934    } else {
    935        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    936    }
    937}
    938
    939static void tcg_out_movi_int(TCGContext *s, TCGType type,
    940                             TCGReg ret, tcg_target_long arg)
    941{
    942    tcg_target_long diff;
    943
    944    if (arg == 0) {
    945        tgen_arithr(s, ARITH_XOR, ret, ret);
    946        return;
    947    }
    948    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
    949        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
    950        tcg_out32(s, arg);
    951        return;
    952    }
    953    if (arg == (int32_t)arg) {
    954        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
    955        tcg_out32(s, arg);
    956        return;
    957    }
    958
    959    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    960    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    961    if (diff == (int32_t)diff) {
    962        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
    963        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
    964        tcg_out32(s, diff);
    965        return;
    966    }
    967
    968    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    969    tcg_out64(s, arg);
    970}
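       /*
        * The cases above are ordered by code size: xor reg,reg (2-3 bytes)
        * for zero, movl $imm32 (5-6 bytes) when the value zero-extends,
        * movq $simm32 (7 bytes) when it sign-extends, the 7-byte rip-relative
        * lea when the value is close to the generated code, and finally the
        * full 10-byte movabs.
        */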
    971
    972static void tcg_out_movi(TCGContext *s, TCGType type,
    973                         TCGReg ret, tcg_target_long arg)
    974{
    975    switch (type) {
    976    case TCG_TYPE_I32:
    977#if TCG_TARGET_REG_BITS == 64
    978    case TCG_TYPE_I64:
    979#endif
    980        if (ret < 16) {
    981            tcg_out_movi_int(s, type, ret, arg);
    982        } else {
    983            tcg_out_movi_vec(s, type, ret, arg);
    984        }
    985        break;
    986    default:
    987        g_assert_not_reached();
    988    }
    989}
    990
    991static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
    992{
    993    if (val == (int8_t)val) {
    994        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
    995        tcg_out8(s, val);
    996    } else if (val == (int32_t)val) {
    997        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
    998        tcg_out32(s, val);
    999    } else {
   1000        tcg_abort();
   1001    }
   1002}
   1003
   1004static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
   1005{
   1006    /* Given the strength of x86 memory ordering, we only need to care
   1007       about store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
   1008       faster than "mfence", so don't bother with the SSE insn.  */
   1009    if (a0 & TCG_MO_ST_LD) {
   1010        tcg_out8(s, 0xf0);
   1011        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
   1012        tcg_out8(s, 0);
   1013    }
   1014}
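       /*
        * With a0 = TCG_MO_ST_LD this emits f0 83 0c 24 00, i.e.
        * "lock orl $0, (%esp)": a LOCK prefix, the 0x83 group-1 opcode with
        * /1 (OR), a SIB-encoded (%esp) operand and a one-byte immediate.
        */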
   1015
   1016static inline void tcg_out_push(TCGContext *s, int reg)
   1017{
   1018    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
   1019}
   1020
   1021static inline void tcg_out_pop(TCGContext *s, int reg)
   1022{
   1023    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
   1024}
   1025
   1026static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
   1027                       TCGReg arg1, intptr_t arg2)
   1028{
   1029    switch (type) {
   1030    case TCG_TYPE_I32:
   1031        if (ret < 16) {
   1032            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
   1033        } else {
   1034            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
   1035        }
   1036        break;
   1037    case TCG_TYPE_I64:
   1038        if (ret < 16) {
   1039            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
   1040            break;
   1041        }
   1042        /* FALLTHRU */
   1043    case TCG_TYPE_V64:
   1044        /* There is no instruction that can validate 8-byte alignment.  */
   1045        tcg_debug_assert(ret >= 16);
   1046        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
   1047        break;
   1048    case TCG_TYPE_V128:
   1049        /*
   1050         * The gvec infrastructure asserts that v128 vector loads
   1051         * and stores use a 16-byte aligned offset.  Validate that the
   1052         * final pointer is aligned by using an insn that will SIGSEGV.
   1053         */
   1054        tcg_debug_assert(ret >= 16);
   1055        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
   1056        break;
   1057    case TCG_TYPE_V256:
   1058        /*
   1059         * The gvec infrastructure only requires 16-byte alignment,
   1060         * so here we must use an unaligned load.
   1061         */
   1062        tcg_debug_assert(ret >= 16);
   1063        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
   1064                                 ret, 0, arg1, arg2);
   1065        break;
   1066    default:
   1067        g_assert_not_reached();
   1068    }
   1069}
   1070
   1071static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
   1072                       TCGReg arg1, intptr_t arg2)
   1073{
   1074    switch (type) {
   1075    case TCG_TYPE_I32:
   1076        if (arg < 16) {
   1077            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
   1078        } else {
   1079            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
   1080        }
   1081        break;
   1082    case TCG_TYPE_I64:
   1083        if (arg < 16) {
   1084            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
   1085            break;
   1086        }
   1087        /* FALLTHRU */
   1088    case TCG_TYPE_V64:
   1089        /* There is no instruction that can validate 8-byte alignment.  */
   1090        tcg_debug_assert(arg >= 16);
   1091        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
   1092        break;
   1093    case TCG_TYPE_V128:
   1094        /*
   1095         * The gvec infrastructure asserts that v128 vector loads
   1096         * and stores use a 16-byte aligned offset.  Validate that the
   1097         * final pointer is aligned by using an insn that will SIGSEGV.
   1098         */
   1099        tcg_debug_assert(arg >= 16);
   1100        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
   1101        break;
   1102    case TCG_TYPE_V256:
   1103        /*
   1104         * The gvec infrastructure only requires 16-byte alignment,
   1105         * so here we must use an unaligned store.
   1106         */
   1107        tcg_debug_assert(arg >= 16);
   1108        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
   1109                                 arg, 0, arg1, arg2);
   1110        break;
   1111    default:
   1112        g_assert_not_reached();
   1113    }
   1114}
   1115
   1116static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
   1117                        TCGReg base, intptr_t ofs)
   1118{
   1119    int rexw = 0;
   1120    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
   1121        if (val != (int32_t)val) {
   1122            return false;
   1123        }
   1124        rexw = P_REXW;
   1125    } else if (type != TCG_TYPE_I32) {
   1126        return false;
   1127    }
   1128    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
   1129    tcg_out32(s, val);
   1130    return true;
   1131}
   1132
   1133static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
   1134{
   1135    /* Propagate an opcode prefix, such as P_DATA16.  */
   1136    int ext = subopc & ~0x7;
   1137    subopc &= 0x7;
   1138
   1139    if (count == 1) {
   1140        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
   1141    } else {
   1142        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
   1143        tcg_out8(s, count);
   1144    }
   1145}
   1146
   1147static inline void tcg_out_bswap32(TCGContext *s, int reg)
   1148{
   1149    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
   1150}
   1151
   1152static inline void tcg_out_rolw_8(TCGContext *s, int reg)
   1153{
   1154    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
   1155}
   1156
   1157static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
   1158{
   1159    /* movzbl */
   1160    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
   1161    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
   1162}
   1163
   1164static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
   1165{
   1166    /* movsbl */
   1167    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
   1168    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
   1169}
   1170
   1171static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
   1172{
   1173    /* movzwl */
   1174    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
   1175}
   1176
   1177static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
   1178{
   1179    /* movsw[lq] */
   1180    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
   1181}
   1182
   1183static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
   1184{
   1185    /* 32-bit mov zero extends.  */
   1186    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
   1187}
   1188
   1189static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
   1190{
   1191    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
   1192}
   1193
   1194static inline void tcg_out_bswap64(TCGContext *s, int reg)
   1195{
   1196    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
   1197}
   1198
   1199static void tgen_arithi(TCGContext *s, int c, int r0,
   1200                        tcg_target_long val, int cf)
   1201{
   1202    int rexw = 0;
   1203
   1204    if (TCG_TARGET_REG_BITS == 64) {
   1205        rexw = c & -8;
   1206        c &= 7;
   1207    }
   1208
   1209    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
   1210       partial flags update stalls on Pentium4 and are not recommended
   1211       by current Intel optimization manuals.  */
   1212    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
   1213        int is_inc = (c == ARITH_ADD) ^ (val < 0);
   1214        if (TCG_TARGET_REG_BITS == 64) {
   1215            /* The single-byte increment encodings are re-tasked as the
   1216               REX prefixes.  Use the MODRM encoding.  */
   1217            tcg_out_modrm(s, OPC_GRP5 + rexw,
   1218                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
   1219        } else {
   1220            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
   1221        }
   1222        return;
   1223    }
   1224
   1225    if (c == ARITH_AND) {
   1226        if (TCG_TARGET_REG_BITS == 64) {
   1227            if (val == 0xffffffffu) {
   1228                tcg_out_ext32u(s, r0, r0);
   1229                return;
   1230            }
   1231            if (val == (uint32_t)val) {
   1232                /* AND with no high bits set can use a 32-bit operation.  */
   1233                rexw = 0;
   1234            }
   1235        }
   1236        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
   1237            tcg_out_ext8u(s, r0, r0);
   1238            return;
   1239        }
   1240        if (val == 0xffffu) {
   1241            tcg_out_ext16u(s, r0, r0);
   1242            return;
   1243        }
   1244    }
   1245
   1246    if (val == (int8_t)val) {
   1247        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
   1248        tcg_out8(s, val);
   1249        return;
   1250    }
   1251    if (rexw == 0 || val == (int32_t)val) {
   1252        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
   1253        tcg_out32(s, val);
   1254        return;
   1255    }
   1256
   1257    tcg_abort();
   1258}
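       /*
        * E.g. tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RAX, 8, 0) takes
        * the sign-extended-imm8 path and emits 48 83 c0 08, i.e.
        * "addq $8, %rax"; an AND with 0xff or 0xffff instead degenerates
        * into the movz extensions handled above.
        */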
   1259
   1260static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
   1261{
   1262    if (val != 0) {
   1263        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
   1264    }
   1265}
   1266
   1267/* Use SMALL != 0 to force a short forward branch.  */
   1268static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
   1269{
   1270    int32_t val, val1;
   1271
   1272    if (l->has_value) {
   1273        val = tcg_pcrel_diff(s, l->u.value_ptr);
   1274        val1 = val - 2;
   1275        if ((int8_t)val1 == val1) {
   1276            if (opc == -1) {
   1277                tcg_out8(s, OPC_JMP_short);
   1278            } else {
   1279                tcg_out8(s, OPC_JCC_short + opc);
   1280            }
   1281            tcg_out8(s, val1);
   1282        } else {
   1283            if (small) {
   1284                tcg_abort();
   1285            }
   1286            if (opc == -1) {
   1287                tcg_out8(s, OPC_JMP_long);
   1288                tcg_out32(s, val - 5);
   1289            } else {
   1290                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
   1291                tcg_out32(s, val - 6);
   1292            }
   1293        }
   1294    } else if (small) {
   1295        if (opc == -1) {
   1296            tcg_out8(s, OPC_JMP_short);
   1297        } else {
   1298            tcg_out8(s, OPC_JCC_short + opc);
   1299        }
   1300        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
   1301        s->code_ptr += 1;
   1302    } else {
   1303        if (opc == -1) {
   1304            tcg_out8(s, OPC_JMP_long);
   1305        } else {
   1306            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
   1307        }
   1308        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
   1309        s->code_ptr += 4;
   1310    }
   1311}
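       /*
        * The -2, -5 and -6 adjustments above are the lengths of the branch
        * encodings themselves: a short jmp/jcc is 2 bytes (eb/7x rel8), a
        * long jmp is 5 bytes (e9 rel32), and a long jcc is 6 bytes
        * (0f 8x rel32); the rel operand is relative to the end of the
        * instruction.
        */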
   1312
   1313static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
   1314                        int const_arg2, int rexw)
   1315{
   1316    if (const_arg2) {
   1317        if (arg2 == 0) {
   1318            /* test r, r */
   1319            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
   1320        } else {
   1321            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
   1322        }
   1323    } else {
   1324        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
   1325    }
   1326}
   1327
   1328static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
   1329                             TCGArg arg1, TCGArg arg2, int const_arg2,
   1330                             TCGLabel *label, int small)
   1331{
   1332    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
   1333    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
   1334}
   1335
   1336#if TCG_TARGET_REG_BITS == 64
   1337static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
   1338                             TCGArg arg1, TCGArg arg2, int const_arg2,
   1339                             TCGLabel *label, int small)
   1340{
   1341    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
   1342    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
   1343}
   1344#else
   1345/* XXX: we implement it at the target level to avoid having to
   1346   handle temporaries that cross basic blocks.  */
   1347static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
   1348                            const int *const_args, int small)
   1349{
   1350    TCGLabel *label_next = gen_new_label();
   1351    TCGLabel *label_this = arg_label(args[5]);
   1352
   1353    switch(args[4]) {
   1354    case TCG_COND_EQ:
   1355        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
   1356                         label_next, 1);
   1357        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
   1358                         label_this, small);
   1359        break;
   1360    case TCG_COND_NE:
   1361        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
   1362                         label_this, small);
   1363        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
   1364                         label_this, small);
   1365        break;
   1366    case TCG_COND_LT:
   1367        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
   1368                         label_this, small);
   1369        tcg_out_jxx(s, JCC_JNE, label_next, 1);
   1370        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
   1371                         label_this, small);
   1372        break;
   1373    case TCG_COND_LE:
   1374        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
   1375                         label_this, small);
   1376        tcg_out_jxx(s, JCC_JNE, label_next, 1);
   1377        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
   1378                         label_this, small);
   1379        break;
   1380    case TCG_COND_GT:
   1381        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
   1382                         label_this, small);
   1383        tcg_out_jxx(s, JCC_JNE, label_next, 1);
   1384        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
   1385                         label_this, small);
   1386        break;
   1387    case TCG_COND_GE:
   1388        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
   1389                         label_this, small);
   1390        tcg_out_jxx(s, JCC_JNE, label_next, 1);
   1391        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
   1392                         label_this, small);
   1393        break;
   1394    case TCG_COND_LTU:
   1395        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
   1396                         label_this, small);
   1397        tcg_out_jxx(s, JCC_JNE, label_next, 1);
   1398        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
   1399                         label_this, small);
   1400        break;
   1401    case TCG_COND_LEU:
   1402        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
   1403                         label_this, small);
   1404        tcg_out_jxx(s, JCC_JNE, label_next, 1);
   1405        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
   1406                         label_this, small);
   1407        break;
   1408    case TCG_COND_GTU:
   1409        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
   1410                         label_this, small);
   1411        tcg_out_jxx(s, JCC_JNE, label_next, 1);
   1412        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
   1413                         label_this, small);
   1414        break;
   1415    case TCG_COND_GEU:
   1416        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
   1417                         label_this, small);
   1418        tcg_out_jxx(s, JCC_JNE, label_next, 1);
   1419        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
   1420                         label_this, small);
   1421        break;
   1422    default:
   1423        tcg_abort();
   1424    }
   1425    tcg_out_label(s, label_next);
   1426}
   1427#endif
   1428
   1429static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
   1430                              TCGArg arg1, TCGArg arg2, int const_arg2)
   1431{
   1432    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
   1433    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
   1434    tcg_out_ext8u(s, dest, dest);
   1435}
   1436
   1437#if TCG_TARGET_REG_BITS == 64
   1438static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
   1439                              TCGArg arg1, TCGArg arg2, int const_arg2)
   1440{
   1441    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
   1442    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
   1443    tcg_out_ext8u(s, dest, dest);
   1444}
   1445#else
   1446static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
   1447                             const int *const_args)
   1448{
   1449    TCGArg new_args[6];
   1450    TCGLabel *label_true, *label_over;
   1451
   1452    memcpy(new_args, args+1, 5*sizeof(TCGArg));
   1453
   1454    if (args[0] == args[1] || args[0] == args[2]
   1455        || (!const_args[3] && args[0] == args[3])
   1456        || (!const_args[4] && args[0] == args[4])) {
   1457        /* When the destination overlaps with one of the argument
   1458           registers, don't do anything tricky.  */
   1459        label_true = gen_new_label();
   1460        label_over = gen_new_label();
   1461
   1462        new_args[5] = label_arg(label_true);
   1463        tcg_out_brcond2(s, new_args, const_args+1, 1);
   1464
   1465        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
   1466        tcg_out_jxx(s, JCC_JMP, label_over, 1);
   1467        tcg_out_label(s, label_true);
   1468
   1469        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
   1470        tcg_out_label(s, label_over);
   1471    } else {
   1472        /* When the destination does not overlap one of the arguments,
   1473           clear the destination first, jump if cond false, and emit an
   1474           increment in the true case.  This results in smaller code.  */
   1475
   1476        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
   1477
   1478        label_over = gen_new_label();
   1479        new_args[4] = tcg_invert_cond(new_args[4]);
   1480        new_args[5] = label_arg(label_over);
   1481        tcg_out_brcond2(s, new_args, const_args+1, 1);
   1482
   1483        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
   1484        tcg_out_label(s, label_over);
   1485    }
   1486}
   1487#endif
   1488
   1489static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
   1490                         TCGReg dest, TCGReg v1)
   1491{
   1492    if (have_cmov) {
   1493        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
   1494    } else {
   1495        TCGLabel *over = gen_new_label();
   1496        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
   1497        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
   1498        tcg_out_label(s, over);
   1499    }
   1500}
   1501
   1502static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
   1503                              TCGReg c1, TCGArg c2, int const_c2,
   1504                              TCGReg v1)
   1505{
   1506    tcg_out_cmp(s, c1, c2, const_c2, 0);
   1507    tcg_out_cmov(s, cond, 0, dest, v1);
   1508}
   1509
   1510#if TCG_TARGET_REG_BITS == 64
   1511static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
   1512                              TCGReg c1, TCGArg c2, int const_c2,
   1513                              TCGReg v1)
   1514{
   1515    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
   1516    tcg_out_cmov(s, cond, P_REXW, dest, v1);
   1517}
   1518#endif
   1519
   1520static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
   1521                        TCGArg arg2, bool const_a2)
   1522{
   1523    if (have_bmi1) {
   1524        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
   1525        if (const_a2) {
   1526            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
   1527        } else {
   1528            tcg_debug_assert(dest != arg2);
   1529            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
   1530        }
   1531    } else {
   1532        tcg_debug_assert(dest != arg2);
   1533        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
   1534        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
   1535    }
   1536}
   1537
   1538static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
   1539                        TCGArg arg2, bool const_a2)
   1540{
   1541    if (have_lzcnt) {
   1542        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
   1543        if (const_a2) {
   1544            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
   1545        } else {
   1546            tcg_debug_assert(dest != arg2);
   1547            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
   1548        }
   1549    } else {
   1550        tcg_debug_assert(!const_a2);
   1551        tcg_debug_assert(dest != arg1);
   1552        tcg_debug_assert(dest != arg2);
   1553
    1554        /* Recall that the output of BSR is the index, not the count.  */
   1555        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
   1556        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
   1557
   1558        /* Since we have destroyed the flags from BSR, we have to re-test.  */
   1559        tcg_out_cmp(s, arg1, 0, 1, rexw);
   1560        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
   1561    }
   1562}
   1563
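/*
 * Illustrative sketch (not part of this file's build): the BSR fallback in
 * tcg_out_clz() above relies on the identity clz32(x) == 31 ^ bsr(x) for
 * x != 0, because BSR yields the bit index of the most significant set bit
 * rather than the leading-zero count.  The helper name below is hypothetical
 * and a compiler builtin merely stands in for the BSR result.
 */
#if 0
static inline uint32_t model_clz32(uint32_t x, uint32_t val_if_zero)
{
    if (x == 0) {
        /* Matches the re-test of the input and the CMOV on EQ above. */
        return val_if_zero;
    }
    /* bsr(x): index of the highest set bit, e.g. bsr(0x10000000) == 28. */
    uint32_t idx = 31 - __builtin_clz(x);
    /* 31 ^ idx equals 31 - idx for idx in [0, 31], i.e. the clz count. */
    return 31 ^ idx;
}
#endif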
   1564static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
   1565{
   1566    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
   1567
   1568    if (disp == (int32_t)disp) {
   1569        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
   1570        tcg_out32(s, disp);
   1571    } else {
   1572        /* rip-relative addressing into the constant pool.
    1573           This is 6 + 8 = 14 bytes, as compared to using an
    1574           immediate load of 10 + 6 = 16 bytes, plus we may
   1575           be able to re-use the pool constant for more calls.  */
   1576        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
   1577        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
   1578        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
   1579        tcg_out32(s, 0);
   1580    }
   1581}
   1582
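/*
 * Illustrative sketch (not part of this file's build): the direct case in
 * tcg_out_branch() above emits E8 (call rel32) or E9 (jmp rel32), where rel32
 * is measured from the end of the 5-byte instruction; hence the "- 5" before
 * the range check.  The helper name below is hypothetical.
 */
#if 0
static inline bool model_direct_branch_reachable(uintptr_t src, uintptr_t dst)
{
    /* Displacement relative to the end of the 5-byte E8/E9 instruction. */
    intptr_t disp = (intptr_t)dst - (intptr_t)(src + 5);
    return disp == (int32_t)disp;
}
#endif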
   1583static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
   1584{
   1585    tcg_out_branch(s, 1, dest);
   1586}
   1587
   1588static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
   1589{
   1590    tcg_out_branch(s, 0, dest);
   1591}
   1592
   1593static void tcg_out_nopn(TCGContext *s, int n)
   1594{
   1595    int i;
   1596    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
   1597     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
   1598     * duplicate prefix, and all of the interesting recent cores can
   1599     * decode and discard the duplicates in a single cycle.
   1600     */
   1601    tcg_debug_assert(n >= 1);
   1602    for (i = 1; i < n; ++i) {
   1603        tcg_out8(s, 0x66);
   1604    }
   1605    tcg_out8(s, 0x90);
   1606}
   1607
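/*
 * For reference (derived from the loop above, not additional code): the
 * sequences emitted by tcg_out_nopn() are
 *   n == 1:  90           nop
 *   n == 2:  66 90        xchg %ax,%ax
 *   n == 3:  66 66 90     xchg %ax,%ax with a redundant prefix
 * i.e. n - 1 operand-size prefixes followed by the one-byte nop.
 */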
   1608#if defined(CONFIG_SOFTMMU)
   1609#include "../tcg-ldst.c.inc"
   1610
    1611/* helper signature: helper_ret_ld_mmu(CPUArchState *env, target_ulong addr,
    1612 *                                     MemOpIdx oi, uintptr_t ra)
   1613 */
   1614static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
   1615    [MO_UB]   = helper_ret_ldub_mmu,
   1616    [MO_LEUW] = helper_le_lduw_mmu,
   1617    [MO_LEUL] = helper_le_ldul_mmu,
   1618    [MO_LEQ]  = helper_le_ldq_mmu,
   1619    [MO_BEUW] = helper_be_lduw_mmu,
   1620    [MO_BEUL] = helper_be_ldul_mmu,
   1621    [MO_BEQ]  = helper_be_ldq_mmu,
   1622};
   1623
    1624/* helper signature: helper_ret_st_mmu(CPUArchState *env, target_ulong addr,
    1625 *                                     uintxx_t val, MemOpIdx oi, uintptr_t ra)
   1626 */
   1627static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
   1628    [MO_UB]   = helper_ret_stb_mmu,
   1629    [MO_LEUW] = helper_le_stw_mmu,
   1630    [MO_LEUL] = helper_le_stl_mmu,
   1631    [MO_LEQ]  = helper_le_stq_mmu,
   1632    [MO_BEUW] = helper_be_stw_mmu,
   1633    [MO_BEUL] = helper_be_stl_mmu,
   1634    [MO_BEQ]  = helper_be_stq_mmu,
   1635};
   1636
   1637/* Perform the TLB load and compare.
   1638
   1639   Inputs:
   1640   ADDRLO and ADDRHI contain the low and high part of the address.
   1641
    1642   MEM_INDEX and OPC are the memory context and MemOp of the access.
   1643
   1644   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
   1645   This should be offsetof addr_read or addr_write.
   1646
   1647   Outputs:
   1648   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
   1649   positions of the displacements of forward jumps to the TLB miss case.
   1650
   1651   Second argument register is loaded with the low part of the address.
   1652   In the TLB hit case, it has been adjusted as indicated by the TLB
   1653   and so is a host address.  In the TLB miss case, it continues to
   1654   hold a guest address.
   1655
   1656   First argument register is clobbered.  */
   1657
   1658static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
   1659                                    int mem_index, MemOp opc,
   1660                                    tcg_insn_unit **label_ptr, int which)
   1661{
   1662    const TCGReg r0 = TCG_REG_L0;
   1663    const TCGReg r1 = TCG_REG_L1;
   1664    TCGType ttype = TCG_TYPE_I32;
   1665    TCGType tlbtype = TCG_TYPE_I32;
   1666    int trexw = 0, hrexw = 0, tlbrexw = 0;
   1667    unsigned a_bits = get_alignment_bits(opc);
   1668    unsigned s_bits = opc & MO_SIZE;
   1669    unsigned a_mask = (1 << a_bits) - 1;
   1670    unsigned s_mask = (1 << s_bits) - 1;
   1671    target_ulong tlb_mask;
   1672
   1673    if (TCG_TARGET_REG_BITS == 64) {
   1674        if (TARGET_LONG_BITS == 64) {
   1675            ttype = TCG_TYPE_I64;
   1676            trexw = P_REXW;
   1677        }
   1678        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
   1679            hrexw = P_REXW;
   1680            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
   1681                tlbtype = TCG_TYPE_I64;
   1682                tlbrexw = P_REXW;
   1683            }
   1684        }
   1685    }
   1686
   1687    tcg_out_mov(s, tlbtype, r0, addrlo);
   1688    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
   1689                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
   1690
   1691    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
   1692                         TLB_MASK_TABLE_OFS(mem_index) +
   1693                         offsetof(CPUTLBDescFast, mask));
   1694
   1695    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
   1696                         TLB_MASK_TABLE_OFS(mem_index) +
   1697                         offsetof(CPUTLBDescFast, table));
   1698
   1699    /* If the required alignment is at least as large as the access, simply
   1700       copy the address and mask.  For lesser alignments, check that we don't
   1701       cross pages for the complete access.  */
   1702    if (a_bits >= s_bits) {
   1703        tcg_out_mov(s, ttype, r1, addrlo);
   1704    } else {
   1705        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
   1706    }
   1707    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
   1708    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
   1709
   1710    /* cmp 0(r0), r1 */
   1711    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
   1712
   1713    /* Prepare for both the fast path add of the tlb addend, and the slow
   1714       path function argument setup.  */
   1715    tcg_out_mov(s, ttype, r1, addrlo);
   1716
   1717    /* jne slow_path */
   1718    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
   1719    label_ptr[0] = s->code_ptr;
   1720    s->code_ptr += 4;
   1721
   1722    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
   1723        /* cmp 4(r0), addrhi */
   1724        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
   1725
   1726        /* jne slow_path */
   1727        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
   1728        label_ptr[1] = s->code_ptr;
   1729        s->code_ptr += 4;
   1730    }
   1731
   1732    /* TLB Hit.  */
   1733
   1734    /* add addend(r0), r1 */
   1735    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
   1736                         offsetof(CPUTLBEntry, addend));
   1737}
   1738
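/*
 * Worked example (illustrative only, assuming TARGET_PAGE_BITS == 12): for a
 * 4-byte access with no alignment requirement (s_mask == 3, a_mask == 0),
 * tcg_out_tlb_load() above compares (addr + 3) & TARGET_PAGE_MASK against the
 * TLB comparator, so an access starting in the last three bytes of a page
 * lands in the next page, the masked compare fails, and the slow path is
 * taken.  With a required 4-byte alignment (a_mask == 3), the address is
 * compared directly with TARGET_PAGE_MASK | 3 as the mask, so any set low
 * bits survive the mask, mismatch the page-aligned comparator, and again
 * force the slow path.
 */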
   1739/*
    1740 * Record the context of a call to the out-of-line helper code for the slow path
    1741 * of a load or store, so that we can later generate the correct helper code.
   1742 */
   1743static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
   1744                                MemOpIdx oi,
   1745                                TCGReg datalo, TCGReg datahi,
   1746                                TCGReg addrlo, TCGReg addrhi,
   1747                                tcg_insn_unit *raddr,
   1748                                tcg_insn_unit **label_ptr)
   1749{
   1750    TCGLabelQemuLdst *label = new_ldst_label(s);
   1751
   1752    label->is_ld = is_ld;
   1753    label->oi = oi;
   1754    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
   1755    label->datalo_reg = datalo;
   1756    label->datahi_reg = datahi;
   1757    label->addrlo_reg = addrlo;
   1758    label->addrhi_reg = addrhi;
   1759    label->raddr = tcg_splitwx_to_rx(raddr);
   1760    label->label_ptr[0] = label_ptr[0];
   1761    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
   1762        label->label_ptr[1] = label_ptr[1];
   1763    }
   1764}
   1765
   1766/*
   1767 * Generate code for the slow path for a load at the end of block
   1768 */
   1769static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
   1770{
   1771    MemOpIdx oi = l->oi;
   1772    MemOp opc = get_memop(oi);
   1773    TCGReg data_reg;
   1774    tcg_insn_unit **label_ptr = &l->label_ptr[0];
   1775    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
   1776
   1777    /* resolve label address */
   1778    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
   1779    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
   1780        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
   1781    }
   1782
   1783    if (TCG_TARGET_REG_BITS == 32) {
   1784        int ofs = 0;
   1785
   1786        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
   1787        ofs += 4;
   1788
   1789        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
   1790        ofs += 4;
   1791
   1792        if (TARGET_LONG_BITS == 64) {
   1793            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
   1794            ofs += 4;
   1795        }
   1796
   1797        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
   1798        ofs += 4;
   1799
   1800        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
   1801    } else {
   1802        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
   1803        /* The second argument is already loaded with addrlo.  */
   1804        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
   1805        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
   1806                     (uintptr_t)l->raddr);
   1807    }
   1808
   1809    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
   1810
   1811    data_reg = l->datalo_reg;
   1812    switch (opc & MO_SSIZE) {
   1813    case MO_SB:
   1814        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
   1815        break;
   1816    case MO_SW:
   1817        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
   1818        break;
   1819#if TCG_TARGET_REG_BITS == 64
   1820    case MO_SL:
   1821        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
   1822        break;
   1823#endif
   1824    case MO_UB:
   1825    case MO_UW:
   1826        /* Note that the helpers have zero-extended to tcg_target_long.  */
   1827    case MO_UL:
   1828        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
   1829        break;
   1830    case MO_Q:
   1831        if (TCG_TARGET_REG_BITS == 64) {
   1832            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
   1833        } else if (data_reg == TCG_REG_EDX) {
   1834            /* xchg %edx, %eax */
   1835            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
   1836            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
   1837        } else {
   1838            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
   1839            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
   1840        }
   1841        break;
   1842    default:
   1843        tcg_abort();
   1844    }
   1845
    1846    /* Jump to the code corresponding to the next IR of qemu_ld.  */
   1847    tcg_out_jmp(s, l->raddr);
   1848    return true;
   1849}
   1850
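/*
 * Worked example (illustrative only): for a 32-bit host and a 64-bit guest
 * address, the outgoing argument area built above for the load helper is
 *
 *    0(%esp)  env      (TCG_AREG0)
 *    4(%esp)  addrlo
 *    8(%esp)  addrhi
 *   12(%esp)  oi       (MemOpIdx)
 *   16(%esp)  raddr    (return address used for unwinding)
 *
 * matching the helper signature noted before qemu_ld_helpers[].
 */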
   1851/*
   1852 * Generate code for the slow path for a store at the end of block
   1853 */
   1854static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
   1855{
   1856    MemOpIdx oi = l->oi;
   1857    MemOp opc = get_memop(oi);
   1858    MemOp s_bits = opc & MO_SIZE;
   1859    tcg_insn_unit **label_ptr = &l->label_ptr[0];
   1860    TCGReg retaddr;
   1861
   1862    /* resolve label address */
   1863    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
   1864    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
   1865        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
   1866    }
   1867
   1868    if (TCG_TARGET_REG_BITS == 32) {
   1869        int ofs = 0;
   1870
   1871        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
   1872        ofs += 4;
   1873
   1874        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
   1875        ofs += 4;
   1876
   1877        if (TARGET_LONG_BITS == 64) {
   1878            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
   1879            ofs += 4;
   1880        }
   1881
   1882        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
   1883        ofs += 4;
   1884
   1885        if (s_bits == MO_64) {
   1886            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
   1887            ofs += 4;
   1888        }
   1889
   1890        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
   1891        ofs += 4;
   1892
   1893        retaddr = TCG_REG_EAX;
   1894        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
   1895        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
   1896    } else {
   1897        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
   1898        /* The second argument is already loaded with addrlo.  */
   1899        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
   1900                    tcg_target_call_iarg_regs[2], l->datalo_reg);
   1901        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
   1902
   1903        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
   1904            retaddr = tcg_target_call_iarg_regs[4];
   1905            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
   1906        } else {
   1907            retaddr = TCG_REG_RAX;
   1908            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
   1909            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
   1910                       TCG_TARGET_CALL_STACK_OFFSET);
   1911        }
   1912    }
   1913
   1914    /* "Tail call" to the helper, with the return address back inline.  */
   1915    tcg_out_push(s, retaddr);
   1916    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
   1917    return true;
   1918}
   1919#elif TCG_TARGET_REG_BITS == 32
   1920# define x86_guest_base_seg     0
   1921# define x86_guest_base_index   -1
   1922# define x86_guest_base_offset  guest_base
   1923#else
   1924static int x86_guest_base_seg;
   1925static int x86_guest_base_index = -1;
   1926static int32_t x86_guest_base_offset;
   1927# if defined(__x86_64__) && defined(__linux__)
   1928#  include <asm/prctl.h>
   1929#  include <sys/prctl.h>
   1930int arch_prctl(int code, unsigned long addr);
   1931static inline int setup_guest_base_seg(void)
   1932{
   1933    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
   1934        return P_GS;
   1935    }
   1936    return 0;
   1937}
   1938# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
   1939#  include <machine/sysarch.h>
   1940static inline int setup_guest_base_seg(void)
   1941{
   1942    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
   1943        return P_GS;
   1944    }
   1945    return 0;
   1946}
   1947# else
   1948static inline int setup_guest_base_seg(void)
   1949{
   1950    return 0;
   1951}
   1952# endif
   1953#endif /* SOFTMMU */
   1954
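/*
 * Note (illustrative, derived from the code above and from the segment
 * argument used by the direct load/store emitters below): when
 * setup_guest_base_seg() succeeds, guest_base becomes the GS segment base and
 * the returned P_GS prefix makes the emitted accesses %gs-relative, so the
 * guest address can be used directly without materialising guest_base as an
 * offset or an index register.
 */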
   1955static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
   1956                                   TCGReg base, int index, intptr_t ofs,
   1957                                   int seg, bool is64, MemOp memop)
   1958{
   1959    bool use_movbe = false;
   1960    int rexw = is64 * P_REXW;
   1961    int movop = OPC_MOVL_GvEv;
   1962
   1963    /* Do big-endian loads with movbe.  */
   1964    if (memop & MO_BSWAP) {
   1965        tcg_debug_assert(have_movbe);
   1966        use_movbe = true;
   1967        movop = OPC_MOVBE_GyMy;
   1968    }
   1969
   1970    switch (memop & MO_SSIZE) {
   1971    case MO_UB:
   1972        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
   1973                                 base, index, 0, ofs);
   1974        break;
   1975    case MO_SB:
   1976        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
   1977                                 base, index, 0, ofs);
   1978        break;
   1979    case MO_UW:
   1980        if (use_movbe) {
    1981            /* There is no extending movbe; only the low 16 bits are modified.  */
   1982            if (datalo != base && datalo != index) {
   1983                /* XOR breaks dependency chains.  */
   1984                tgen_arithr(s, ARITH_XOR, datalo, datalo);
   1985                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
   1986                                         datalo, base, index, 0, ofs);
   1987            } else {
   1988                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
   1989                                         datalo, base, index, 0, ofs);
   1990                tcg_out_ext16u(s, datalo, datalo);
   1991            }
   1992        } else {
   1993            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
   1994                                     base, index, 0, ofs);
   1995        }
   1996        break;
   1997    case MO_SW:
   1998        if (use_movbe) {
   1999            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
   2000                                     datalo, base, index, 0, ofs);
   2001            tcg_out_ext16s(s, datalo, datalo, rexw);
   2002        } else {
   2003            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
   2004                                     datalo, base, index, 0, ofs);
   2005        }
   2006        break;
   2007    case MO_UL:
   2008        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
   2009        break;
   2010#if TCG_TARGET_REG_BITS == 64
   2011    case MO_SL:
   2012        if (use_movbe) {
   2013            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
   2014                                     base, index, 0, ofs);
   2015            tcg_out_ext32s(s, datalo, datalo);
   2016        } else {
   2017            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
   2018                                     base, index, 0, ofs);
   2019        }
   2020        break;
   2021#endif
   2022    case MO_Q:
   2023        if (TCG_TARGET_REG_BITS == 64) {
   2024            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
   2025                                     base, index, 0, ofs);
   2026        } else {
   2027            if (use_movbe) {
   2028                TCGReg t = datalo;
   2029                datalo = datahi;
   2030                datahi = t;
   2031            }
   2032            if (base != datalo) {
   2033                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
   2034                                         base, index, 0, ofs);
   2035                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
   2036                                         base, index, 0, ofs + 4);
   2037            } else {
   2038                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
   2039                                         base, index, 0, ofs + 4);
   2040                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
   2041                                         base, index, 0, ofs);
   2042            }
   2043        }
   2044        break;
   2045    default:
   2046        g_assert_not_reached();
   2047    }
   2048}
   2049
   2050/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
    2051   EAX. It will be useful once fixed-register globals are less
   2052   common. */
   2053static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
   2054{
   2055    TCGReg datalo, datahi, addrlo;
   2056    TCGReg addrhi __attribute__((unused));
   2057    MemOpIdx oi;
   2058    MemOp opc;
   2059#if defined(CONFIG_SOFTMMU)
   2060    int mem_index;
   2061    tcg_insn_unit *label_ptr[2];
   2062#endif
   2063
   2064    datalo = *args++;
   2065    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
   2066    addrlo = *args++;
   2067    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
   2068    oi = *args++;
   2069    opc = get_memop(oi);
   2070
   2071#if defined(CONFIG_SOFTMMU)
   2072    mem_index = get_mmuidx(oi);
   2073
   2074    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
   2075                     label_ptr, offsetof(CPUTLBEntry, addr_read));
   2076
   2077    /* TLB Hit.  */
   2078    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
   2079
    2080    /* Record the current context of a load into the ldst label.  */
   2081    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
   2082                        s->code_ptr, label_ptr);
   2083#else
   2084    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
   2085                           x86_guest_base_offset, x86_guest_base_seg,
   2086                           is64, opc);
   2087#endif
   2088}
   2089
   2090static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
   2091                                   TCGReg base, int index, intptr_t ofs,
   2092                                   int seg, MemOp memop)
   2093{
   2094    bool use_movbe = false;
   2095    int movop = OPC_MOVL_EvGv;
   2096
   2097    /*
   2098     * Do big-endian stores with movbe or softmmu.
   2099     * User-only without movbe will have its swapping done generically.
   2100     */
   2101    if (memop & MO_BSWAP) {
   2102        tcg_debug_assert(have_movbe);
   2103        use_movbe = true;
   2104        movop = OPC_MOVBE_MyGy;
   2105    }
   2106
   2107    switch (memop & MO_SIZE) {
   2108    case MO_8:
   2109        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
   2110        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
   2111        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
   2112                                 datalo, base, index, 0, ofs);
   2113        break;
   2114    case MO_16:
   2115        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
   2116                                 base, index, 0, ofs);
   2117        break;
   2118    case MO_32:
   2119        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
   2120        break;
   2121    case MO_64:
   2122        if (TCG_TARGET_REG_BITS == 64) {
   2123            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
   2124                                     base, index, 0, ofs);
   2125        } else {
   2126            if (use_movbe) {
   2127                TCGReg t = datalo;
   2128                datalo = datahi;
   2129                datahi = t;
   2130            }
   2131            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
   2132                                     base, index, 0, ofs);
   2133            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
   2134                                     base, index, 0, ofs + 4);
   2135        }
   2136        break;
   2137    default:
   2138        g_assert_not_reached();
   2139    }
   2140}
   2141
   2142static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
   2143{
   2144    TCGReg datalo, datahi, addrlo;
   2145    TCGReg addrhi __attribute__((unused));
   2146    MemOpIdx oi;
   2147    MemOp opc;
   2148#if defined(CONFIG_SOFTMMU)
   2149    int mem_index;
   2150    tcg_insn_unit *label_ptr[2];
   2151#endif
   2152
   2153    datalo = *args++;
   2154    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
   2155    addrlo = *args++;
   2156    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
   2157    oi = *args++;
   2158    opc = get_memop(oi);
   2159
   2160#if defined(CONFIG_SOFTMMU)
   2161    mem_index = get_mmuidx(oi);
   2162
   2163    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
   2164                     label_ptr, offsetof(CPUTLBEntry, addr_write));
   2165
   2166    /* TLB Hit.  */
   2167    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
   2168
    2169    /* Record the current context of a store into the ldst label.  */
   2170    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
   2171                        s->code_ptr, label_ptr);
   2172#else
   2173    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
   2174                           x86_guest_base_offset, x86_guest_base_seg, opc);
   2175#endif
   2176}
   2177
   2178static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
   2179                              const TCGArg args[TCG_MAX_OP_ARGS],
   2180                              const int const_args[TCG_MAX_OP_ARGS])
   2181{
   2182    TCGArg a0, a1, a2;
   2183    int c, const_a2, vexop, rexw = 0;
   2184
   2185#if TCG_TARGET_REG_BITS == 64
   2186# define OP_32_64(x) \
   2187        case glue(glue(INDEX_op_, x), _i64): \
   2188            rexw = P_REXW; /* FALLTHRU */    \
   2189        case glue(glue(INDEX_op_, x), _i32)
   2190#else
   2191# define OP_32_64(x) \
   2192        case glue(glue(INDEX_op_, x), _i32)
   2193#endif
   2194
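    /*
     * For reference (macro expansion, not additional code): on a 64-bit host
     * OP_32_64(add) expands to
     *
     *     case INDEX_op_add_i64:
     *         rexw = P_REXW;        (fall through)
     *     case INDEX_op_add_i32:
     *
     * so a single switch arm handles both operand widths, with REX.W
     * selecting the 64-bit form of the emitted instruction.
     */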
   2195    /* Hoist the loads of the most common arguments.  */
   2196    a0 = args[0];
   2197    a1 = args[1];
   2198    a2 = args[2];
   2199    const_a2 = const_args[2];
   2200
   2201    switch (opc) {
   2202    case INDEX_op_exit_tb:
   2203        /* Reuse the zeroing that exists for goto_ptr.  */
   2204        if (a0 == 0) {
   2205            tcg_out_jmp(s, tcg_code_gen_epilogue);
   2206        } else {
   2207            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
   2208            tcg_out_jmp(s, tb_ret_addr);
   2209        }
   2210        break;
   2211    case INDEX_op_goto_tb:
   2212        if (s->tb_jmp_insn_offset) {
   2213            /* direct jump method */
   2214            int gap;
   2215            /* jump displacement must be aligned for atomic patching;
   2216             * see if we need to add extra nops before jump
   2217             */
   2218            gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
   2219            if (gap != 1) {
   2220                tcg_out_nopn(s, gap - 1);
   2221            }
   2222            tcg_out8(s, OPC_JMP_long); /* jmp im */
   2223            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
   2224            tcg_out32(s, 0);
   2225        } else {
   2226            /* indirect jump method */
   2227            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
   2228                                 (intptr_t)(s->tb_jmp_target_addr + a0));
   2229        }
   2230        set_jmp_reset_offset(s, a0);
   2231        break;
   2232    case INDEX_op_goto_ptr:
   2233        /* jmp to the given host address (could be epilogue) */
   2234        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
   2235        break;
   2236    case INDEX_op_br:
   2237        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
   2238        break;
   2239    OP_32_64(ld8u):
   2240        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
   2241        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
   2242        break;
   2243    OP_32_64(ld8s):
   2244        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
   2245        break;
   2246    OP_32_64(ld16u):
   2247        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
   2248        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
   2249        break;
   2250    OP_32_64(ld16s):
   2251        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
   2252        break;
   2253#if TCG_TARGET_REG_BITS == 64
   2254    case INDEX_op_ld32u_i64:
   2255#endif
   2256    case INDEX_op_ld_i32:
   2257        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
   2258        break;
   2259
   2260    OP_32_64(st8):
   2261        if (const_args[0]) {
   2262            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
   2263            tcg_out8(s, a0);
   2264        } else {
   2265            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
   2266        }
   2267        break;
   2268    OP_32_64(st16):
   2269        if (const_args[0]) {
   2270            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
   2271            tcg_out16(s, a0);
   2272        } else {
   2273            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
   2274        }
   2275        break;
   2276#if TCG_TARGET_REG_BITS == 64
   2277    case INDEX_op_st32_i64:
   2278#endif
   2279    case INDEX_op_st_i32:
   2280        if (const_args[0]) {
   2281            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
   2282            tcg_out32(s, a0);
   2283        } else {
   2284            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
   2285        }
   2286        break;
   2287
   2288    OP_32_64(add):
   2289        /* For 3-operand addition, use LEA.  */
   2290        if (a0 != a1) {
   2291            TCGArg c3 = 0;
   2292            if (const_a2) {
   2293                c3 = a2, a2 = -1;
   2294            } else if (a0 == a2) {
   2295                /* Watch out for dest = src + dest, since we've removed
   2296                   the matching constraint on the add.  */
   2297                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
   2298                break;
   2299            }
   2300
   2301            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
   2302            break;
   2303        }
   2304        c = ARITH_ADD;
   2305        goto gen_arith;
   2306    OP_32_64(sub):
   2307        c = ARITH_SUB;
   2308        goto gen_arith;
   2309    OP_32_64(and):
   2310        c = ARITH_AND;
   2311        goto gen_arith;
   2312    OP_32_64(or):
   2313        c = ARITH_OR;
   2314        goto gen_arith;
   2315    OP_32_64(xor):
   2316        c = ARITH_XOR;
   2317        goto gen_arith;
   2318    gen_arith:
   2319        if (const_a2) {
   2320            tgen_arithi(s, c + rexw, a0, a2, 0);
   2321        } else {
   2322            tgen_arithr(s, c + rexw, a0, a2);
   2323        }
   2324        break;
   2325
   2326    OP_32_64(andc):
   2327        if (const_a2) {
   2328            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
   2329            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
   2330        } else {
   2331            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
   2332        }
   2333        break;
   2334
   2335    OP_32_64(mul):
   2336        if (const_a2) {
   2337            int32_t val;
   2338            val = a2;
   2339            if (val == (int8_t)val) {
   2340                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
   2341                tcg_out8(s, val);
   2342            } else {
   2343                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
   2344                tcg_out32(s, val);
   2345            }
   2346        } else {
   2347            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
   2348        }
   2349        break;
   2350
   2351    OP_32_64(div2):
   2352        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
   2353        break;
   2354    OP_32_64(divu2):
   2355        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
   2356        break;
   2357
   2358    OP_32_64(shl):
   2359        /* For small constant 3-operand shift, use LEA.  */
   2360        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
   2361            if (a2 - 1 == 0) {
   2362                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
   2363                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
   2364            } else {
   2365                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
   2366                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
   2367            }
   2368            break;
   2369        }
   2370        c = SHIFT_SHL;
   2371        vexop = OPC_SHLX;
   2372        goto gen_shift_maybe_vex;
   2373    OP_32_64(shr):
   2374        c = SHIFT_SHR;
   2375        vexop = OPC_SHRX;
   2376        goto gen_shift_maybe_vex;
   2377    OP_32_64(sar):
   2378        c = SHIFT_SAR;
   2379        vexop = OPC_SARX;
   2380        goto gen_shift_maybe_vex;
   2381    OP_32_64(rotl):
   2382        c = SHIFT_ROL;
   2383        goto gen_shift;
   2384    OP_32_64(rotr):
   2385        c = SHIFT_ROR;
   2386        goto gen_shift;
   2387    gen_shift_maybe_vex:
   2388        if (have_bmi2) {
   2389            if (!const_a2) {
   2390                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
   2391                break;
   2392            }
   2393            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
   2394        }
   2395        /* FALLTHRU */
   2396    gen_shift:
   2397        if (const_a2) {
   2398            tcg_out_shifti(s, c + rexw, a0, a2);
   2399        } else {
   2400            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
   2401        }
   2402        break;
   2403
   2404    OP_32_64(ctz):
   2405        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
   2406        break;
   2407    OP_32_64(clz):
   2408        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
   2409        break;
   2410    OP_32_64(ctpop):
   2411        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
   2412        break;
   2413
   2414    case INDEX_op_brcond_i32:
   2415        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
   2416        break;
   2417    case INDEX_op_setcond_i32:
   2418        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
   2419        break;
   2420    case INDEX_op_movcond_i32:
   2421        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
   2422        break;
   2423
   2424    OP_32_64(bswap16):
   2425        if (a2 & TCG_BSWAP_OS) {
   2426            /* Output must be sign-extended. */
   2427            if (rexw) {
   2428                tcg_out_bswap64(s, a0);
   2429                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
   2430            } else {
   2431                tcg_out_bswap32(s, a0);
   2432                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
   2433            }
   2434        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
   2435            /* Output must be zero-extended, but input isn't. */
   2436            tcg_out_bswap32(s, a0);
   2437            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
   2438        } else {
   2439            tcg_out_rolw_8(s, a0);
   2440        }
   2441        break;
   2442    OP_32_64(bswap32):
   2443        tcg_out_bswap32(s, a0);
   2444        if (rexw && (a2 & TCG_BSWAP_OS)) {
   2445            tcg_out_ext32s(s, a0, a0);
   2446        }
   2447        break;
   2448
   2449    OP_32_64(neg):
   2450        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
   2451        break;
   2452    OP_32_64(not):
   2453        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
   2454        break;
   2455
   2456    OP_32_64(ext8s):
   2457        tcg_out_ext8s(s, a0, a1, rexw);
   2458        break;
   2459    OP_32_64(ext16s):
   2460        tcg_out_ext16s(s, a0, a1, rexw);
   2461        break;
   2462    OP_32_64(ext8u):
   2463        tcg_out_ext8u(s, a0, a1);
   2464        break;
   2465    OP_32_64(ext16u):
   2466        tcg_out_ext16u(s, a0, a1);
   2467        break;
   2468
   2469    case INDEX_op_qemu_ld_i32:
   2470        tcg_out_qemu_ld(s, args, 0);
   2471        break;
   2472    case INDEX_op_qemu_ld_i64:
   2473        tcg_out_qemu_ld(s, args, 1);
   2474        break;
   2475    case INDEX_op_qemu_st_i32:
   2476    case INDEX_op_qemu_st8_i32:
   2477        tcg_out_qemu_st(s, args, 0);
   2478        break;
   2479    case INDEX_op_qemu_st_i64:
   2480        tcg_out_qemu_st(s, args, 1);
   2481        break;
   2482
   2483    OP_32_64(mulu2):
   2484        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
   2485        break;
   2486    OP_32_64(muls2):
   2487        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
   2488        break;
   2489    OP_32_64(add2):
   2490        if (const_args[4]) {
   2491            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
   2492        } else {
   2493            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
   2494        }
   2495        if (const_args[5]) {
   2496            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
   2497        } else {
   2498            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
   2499        }
   2500        break;
   2501    OP_32_64(sub2):
   2502        if (const_args[4]) {
   2503            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
   2504        } else {
   2505            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
   2506        }
   2507        if (const_args[5]) {
   2508            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
   2509        } else {
   2510            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
   2511        }
   2512        break;
   2513
   2514#if TCG_TARGET_REG_BITS == 32
   2515    case INDEX_op_brcond2_i32:
   2516        tcg_out_brcond2(s, args, const_args, 0);
   2517        break;
   2518    case INDEX_op_setcond2_i32:
   2519        tcg_out_setcond2(s, args, const_args);
   2520        break;
   2521#else /* TCG_TARGET_REG_BITS == 64 */
   2522    case INDEX_op_ld32s_i64:
   2523        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
   2524        break;
   2525    case INDEX_op_ld_i64:
   2526        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
   2527        break;
   2528    case INDEX_op_st_i64:
   2529        if (const_args[0]) {
   2530            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
   2531            tcg_out32(s, a0);
   2532        } else {
   2533            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
   2534        }
   2535        break;
   2536
   2537    case INDEX_op_brcond_i64:
   2538        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
   2539        break;
   2540    case INDEX_op_setcond_i64:
   2541        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
   2542        break;
   2543    case INDEX_op_movcond_i64:
   2544        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
   2545        break;
   2546
   2547    case INDEX_op_bswap64_i64:
   2548        tcg_out_bswap64(s, a0);
   2549        break;
   2550    case INDEX_op_extu_i32_i64:
   2551    case INDEX_op_ext32u_i64:
   2552    case INDEX_op_extrl_i64_i32:
   2553        tcg_out_ext32u(s, a0, a1);
   2554        break;
   2555    case INDEX_op_ext_i32_i64:
   2556    case INDEX_op_ext32s_i64:
   2557        tcg_out_ext32s(s, a0, a1);
   2558        break;
   2559    case INDEX_op_extrh_i64_i32:
   2560        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
   2561        break;
   2562#endif
   2563
   2564    OP_32_64(deposit):
   2565        if (args[3] == 0 && args[4] == 8) {
   2566            /* load bits 0..7 */
   2567            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
   2568        } else if (args[3] == 8 && args[4] == 8) {
   2569            /* load bits 8..15 */
   2570            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
   2571        } else if (args[3] == 0 && args[4] == 16) {
   2572            /* load bits 0..15 */
   2573            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
   2574        } else {
   2575            tcg_abort();
   2576        }
   2577        break;
   2578
   2579    case INDEX_op_extract_i64:
   2580        if (a2 + args[3] == 32) {
   2581            /* This is a 32-bit zero-extending right shift.  */
   2582            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
   2583            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
   2584            break;
   2585        }
   2586        /* FALLTHRU */
   2587    case INDEX_op_extract_i32:
   2588        /* On the off-chance that we can use the high-byte registers.
   2589           Otherwise we emit the same ext16 + shift pattern that we
   2590           would have gotten from the normal tcg-op.c expansion.  */
   2591        tcg_debug_assert(a2 == 8 && args[3] == 8);
   2592        if (a1 < 4 && a0 < 8) {
   2593            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
   2594        } else {
   2595            tcg_out_ext16u(s, a0, a1);
   2596            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
   2597        }
   2598        break;
   2599
   2600    case INDEX_op_sextract_i32:
   2601        /* We don't implement sextract_i64, as we cannot sign-extend to
   2602           64-bits without using the REX prefix that explicitly excludes
   2603           access to the high-byte registers.  */
   2604        tcg_debug_assert(a2 == 8 && args[3] == 8);
   2605        if (a1 < 4 && a0 < 8) {
   2606            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
   2607        } else {
   2608            tcg_out_ext16s(s, a0, a1, 0);
   2609            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
   2610        }
   2611        break;
   2612
   2613    OP_32_64(extract2):
   2614        /* Note that SHRD outputs to the r/m operand.  */
   2615        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
   2616        tcg_out8(s, args[3]);
   2617        break;
   2618
   2619    case INDEX_op_mb:
   2620        tcg_out_mb(s, a0);
   2621        break;
   2622    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
   2623    case INDEX_op_mov_i64:
   2624    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
   2625    default:
   2626        tcg_abort();
   2627    }
   2628
   2629#undef OP_32_64
   2630}
   2631
   2632static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
   2633                           unsigned vecl, unsigned vece,
   2634                           const TCGArg args[TCG_MAX_OP_ARGS],
   2635                           const int const_args[TCG_MAX_OP_ARGS])
   2636{
   2637    static int const add_insn[4] = {
   2638        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
   2639    };
   2640    static int const ssadd_insn[4] = {
   2641        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
   2642    };
   2643    static int const usadd_insn[4] = {
   2644        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
   2645    };
   2646    static int const sub_insn[4] = {
   2647        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
   2648    };
   2649    static int const sssub_insn[4] = {
   2650        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
   2651    };
   2652    static int const ussub_insn[4] = {
   2653        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
   2654    };
   2655    static int const mul_insn[4] = {
   2656        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
   2657    };
   2658    static int const shift_imm_insn[4] = {
   2659        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
   2660    };
   2661    static int const cmpeq_insn[4] = {
   2662        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
   2663    };
   2664    static int const cmpgt_insn[4] = {
   2665        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
   2666    };
   2667    static int const punpckl_insn[4] = {
   2668        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
   2669    };
   2670    static int const punpckh_insn[4] = {
   2671        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
   2672    };
   2673    static int const packss_insn[4] = {
   2674        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
   2675    };
   2676    static int const packus_insn[4] = {
   2677        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
   2678    };
   2679    static int const smin_insn[4] = {
   2680        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
   2681    };
   2682    static int const smax_insn[4] = {
   2683        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
   2684    };
   2685    static int const umin_insn[4] = {
   2686        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
   2687    };
   2688    static int const umax_insn[4] = {
   2689        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
   2690    };
   2691    static int const shlv_insn[4] = {
   2692        /* TODO: AVX512 adds support for MO_16.  */
   2693        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
   2694    };
   2695    static int const shrv_insn[4] = {
   2696        /* TODO: AVX512 adds support for MO_16.  */
   2697        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
   2698    };
   2699    static int const sarv_insn[4] = {
   2700        /* TODO: AVX512 adds support for MO_16, MO_64.  */
   2701        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
   2702    };
   2703    static int const shls_insn[4] = {
   2704        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
   2705    };
   2706    static int const shrs_insn[4] = {
   2707        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
   2708    };
   2709    static int const sars_insn[4] = {
   2710        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
   2711    };
   2712    static int const abs_insn[4] = {
   2713        /* TODO: AVX512 adds support for MO_64.  */
   2714        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
   2715    };
   2716
   2717    TCGType type = vecl + TCG_TYPE_V64;
   2718    int insn, sub;
   2719    TCGArg a0, a1, a2;
   2720
   2721    a0 = args[0];
   2722    a1 = args[1];
   2723    a2 = args[2];
   2724
   2725    switch (opc) {
   2726    case INDEX_op_add_vec:
   2727        insn = add_insn[vece];
   2728        goto gen_simd;
   2729    case INDEX_op_ssadd_vec:
   2730        insn = ssadd_insn[vece];
   2731        goto gen_simd;
   2732    case INDEX_op_usadd_vec:
   2733        insn = usadd_insn[vece];
   2734        goto gen_simd;
   2735    case INDEX_op_sub_vec:
   2736        insn = sub_insn[vece];
   2737        goto gen_simd;
   2738    case INDEX_op_sssub_vec:
   2739        insn = sssub_insn[vece];
   2740        goto gen_simd;
   2741    case INDEX_op_ussub_vec:
   2742        insn = ussub_insn[vece];
   2743        goto gen_simd;
   2744    case INDEX_op_mul_vec:
   2745        insn = mul_insn[vece];
   2746        goto gen_simd;
   2747    case INDEX_op_and_vec:
   2748        insn = OPC_PAND;
   2749        goto gen_simd;
   2750    case INDEX_op_or_vec:
   2751        insn = OPC_POR;
   2752        goto gen_simd;
   2753    case INDEX_op_xor_vec:
   2754        insn = OPC_PXOR;
   2755        goto gen_simd;
   2756    case INDEX_op_smin_vec:
   2757        insn = smin_insn[vece];
   2758        goto gen_simd;
   2759    case INDEX_op_umin_vec:
   2760        insn = umin_insn[vece];
   2761        goto gen_simd;
   2762    case INDEX_op_smax_vec:
   2763        insn = smax_insn[vece];
   2764        goto gen_simd;
   2765    case INDEX_op_umax_vec:
   2766        insn = umax_insn[vece];
   2767        goto gen_simd;
   2768    case INDEX_op_shlv_vec:
   2769        insn = shlv_insn[vece];
   2770        goto gen_simd;
   2771    case INDEX_op_shrv_vec:
   2772        insn = shrv_insn[vece];
   2773        goto gen_simd;
   2774    case INDEX_op_sarv_vec:
   2775        insn = sarv_insn[vece];
   2776        goto gen_simd;
   2777    case INDEX_op_shls_vec:
   2778        insn = shls_insn[vece];
   2779        goto gen_simd;
   2780    case INDEX_op_shrs_vec:
   2781        insn = shrs_insn[vece];
   2782        goto gen_simd;
   2783    case INDEX_op_sars_vec:
   2784        insn = sars_insn[vece];
   2785        goto gen_simd;
   2786    case INDEX_op_x86_punpckl_vec:
   2787        insn = punpckl_insn[vece];
   2788        goto gen_simd;
   2789    case INDEX_op_x86_punpckh_vec:
   2790        insn = punpckh_insn[vece];
   2791        goto gen_simd;
   2792    case INDEX_op_x86_packss_vec:
   2793        insn = packss_insn[vece];
   2794        goto gen_simd;
   2795    case INDEX_op_x86_packus_vec:
   2796        insn = packus_insn[vece];
   2797        goto gen_simd;
   2798#if TCG_TARGET_REG_BITS == 32
   2799    case INDEX_op_dup2_vec:
   2800        /* First merge the two 32-bit inputs to a single 64-bit element. */
   2801        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
   2802        /* Then replicate the 64-bit elements across the rest of the vector. */
   2803        if (type != TCG_TYPE_V64) {
   2804            tcg_out_dup_vec(s, type, MO_64, a0, a0);
   2805        }
   2806        break;
   2807#endif
   2808    case INDEX_op_abs_vec:
   2809        insn = abs_insn[vece];
   2810        a2 = a1;
   2811        a1 = 0;
   2812        goto gen_simd;
   2813    gen_simd:
   2814        tcg_debug_assert(insn != OPC_UD2);
   2815        if (type == TCG_TYPE_V256) {
   2816            insn |= P_VEXL;
   2817        }
   2818        tcg_out_vex_modrm(s, insn, a0, a1, a2);
   2819        break;
   2820
   2821    case INDEX_op_cmp_vec:
   2822        sub = args[3];
   2823        if (sub == TCG_COND_EQ) {
   2824            insn = cmpeq_insn[vece];
   2825        } else if (sub == TCG_COND_GT) {
   2826            insn = cmpgt_insn[vece];
   2827        } else {
   2828            g_assert_not_reached();
   2829        }
   2830        goto gen_simd;
   2831
   2832    case INDEX_op_andc_vec:
   2833        insn = OPC_PANDN;
   2834        if (type == TCG_TYPE_V256) {
   2835            insn |= P_VEXL;
   2836        }
   2837        tcg_out_vex_modrm(s, insn, a0, a2, a1);
   2838        break;
   2839
   2840    case INDEX_op_shli_vec:
   2841        sub = 6;
   2842        goto gen_shift;
   2843    case INDEX_op_shri_vec:
   2844        sub = 2;
   2845        goto gen_shift;
   2846    case INDEX_op_sari_vec:
   2847        tcg_debug_assert(vece != MO_64);
   2848        sub = 4;
   2849    gen_shift:
   2850        tcg_debug_assert(vece != MO_8);
   2851        insn = shift_imm_insn[vece];
   2852        if (type == TCG_TYPE_V256) {
   2853            insn |= P_VEXL;
   2854        }
   2855        tcg_out_vex_modrm(s, insn, sub, a0, a1);
   2856        tcg_out8(s, a2);
   2857        break;
   2858
   2859    case INDEX_op_ld_vec:
   2860        tcg_out_ld(s, type, a0, a1, a2);
   2861        break;
   2862    case INDEX_op_st_vec:
   2863        tcg_out_st(s, type, a0, a1, a2);
   2864        break;
   2865    case INDEX_op_dupm_vec:
   2866        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
   2867        break;
   2868
   2869    case INDEX_op_x86_shufps_vec:
   2870        insn = OPC_SHUFPS;
   2871        sub = args[3];
   2872        goto gen_simd_imm8;
   2873    case INDEX_op_x86_blend_vec:
   2874        if (vece == MO_16) {
   2875            insn = OPC_PBLENDW;
   2876        } else if (vece == MO_32) {
   2877            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
   2878        } else {
   2879            g_assert_not_reached();
   2880        }
   2881        sub = args[3];
   2882        goto gen_simd_imm8;
   2883    case INDEX_op_x86_vperm2i128_vec:
   2884        insn = OPC_VPERM2I128;
   2885        sub = args[3];
   2886        goto gen_simd_imm8;
   2887    gen_simd_imm8:
   2888        if (type == TCG_TYPE_V256) {
   2889            insn |= P_VEXL;
   2890        }
   2891        tcg_out_vex_modrm(s, insn, a0, a1, a2);
   2892        tcg_out8(s, sub);
   2893        break;
   2894
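           /*
            * VPBLENDVB takes its mask from a fourth register operand; the
            * VEX /is4 encoding stores that register in the high four bits
            * of a trailing immediate byte, hence "args[3] << 4" below.
            */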
   2895    case INDEX_op_x86_vpblendvb_vec:
   2896        insn = OPC_VPBLENDVB;
   2897        if (type == TCG_TYPE_V256) {
   2898            insn |= P_VEXL;
   2899        }
   2900        tcg_out_vex_modrm(s, insn, a0, a1, a2);
   2901        tcg_out8(s, args[3] << 4);
   2902        break;
   2903
   2904    case INDEX_op_x86_psrldq_vec:
   2905        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
   2906        tcg_out8(s, a2);
   2907        break;
   2908
   2909    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
   2910    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
   2911    default:
   2912        g_assert_not_reached();
   2913    }
   2914}
   2915
   2916static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
   2917{
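           /*
            * Constraint letters are defined in tcg-target-con-str.h for this
            * backend; roughly: 'r' is any general register, 'q' a
            * byte-addressable register, 'x' a vector register, 'a'/'d' the
            * fixed EAX/EDX pair, 'L' a general register usable by the
            * qemu_ld/st slow path, 'e' a sign-extended 32-bit immediate and
            * 'i' any immediate.  A digit such as "0" or "1" ties an input to
            * the same register as that output.
            */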
   2918    switch (op) {
   2919    case INDEX_op_goto_ptr:
   2920        return C_O0_I1(r);
   2921
   2922    case INDEX_op_ld8u_i32:
   2923    case INDEX_op_ld8u_i64:
   2924    case INDEX_op_ld8s_i32:
   2925    case INDEX_op_ld8s_i64:
   2926    case INDEX_op_ld16u_i32:
   2927    case INDEX_op_ld16u_i64:
   2928    case INDEX_op_ld16s_i32:
   2929    case INDEX_op_ld16s_i64:
   2930    case INDEX_op_ld_i32:
   2931    case INDEX_op_ld32u_i64:
   2932    case INDEX_op_ld32s_i64:
   2933    case INDEX_op_ld_i64:
   2934        return C_O1_I1(r, r);
   2935
   2936    case INDEX_op_st8_i32:
   2937    case INDEX_op_st8_i64:
   2938        return C_O0_I2(qi, r);
   2939
   2940    case INDEX_op_st16_i32:
   2941    case INDEX_op_st16_i64:
   2942    case INDEX_op_st_i32:
   2943    case INDEX_op_st32_i64:
   2944        return C_O0_I2(ri, r);
   2945
   2946    case INDEX_op_st_i64:
   2947        return C_O0_I2(re, r);
   2948
   2949    case INDEX_op_add_i32:
   2950    case INDEX_op_add_i64:
   2951        return C_O1_I2(r, r, re);
   2952
   2953    case INDEX_op_sub_i32:
   2954    case INDEX_op_sub_i64:
   2955    case INDEX_op_mul_i32:
   2956    case INDEX_op_mul_i64:
   2957    case INDEX_op_or_i32:
   2958    case INDEX_op_or_i64:
   2959    case INDEX_op_xor_i32:
   2960    case INDEX_op_xor_i64:
   2961        return C_O1_I2(r, 0, re);
   2962
   2963    case INDEX_op_and_i32:
   2964    case INDEX_op_and_i64:
   2965        return C_O1_I2(r, 0, reZ);
   2966
   2967    case INDEX_op_andc_i32:
   2968    case INDEX_op_andc_i64:
   2969        return C_O1_I2(r, r, rI);
   2970
   2971    case INDEX_op_shl_i32:
   2972    case INDEX_op_shl_i64:
   2973    case INDEX_op_shr_i32:
   2974    case INDEX_op_shr_i64:
   2975    case INDEX_op_sar_i32:
   2976    case INDEX_op_sar_i64:
   2977        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
   2978
   2979    case INDEX_op_rotl_i32:
   2980    case INDEX_op_rotl_i64:
   2981    case INDEX_op_rotr_i32:
   2982    case INDEX_op_rotr_i64:
   2983        return C_O1_I2(r, 0, ci);
   2984
   2985    case INDEX_op_brcond_i32:
   2986    case INDEX_op_brcond_i64:
   2987        return C_O0_I2(r, re);
   2988
   2989    case INDEX_op_bswap16_i32:
   2990    case INDEX_op_bswap16_i64:
   2991    case INDEX_op_bswap32_i32:
   2992    case INDEX_op_bswap32_i64:
   2993    case INDEX_op_bswap64_i64:
   2994    case INDEX_op_neg_i32:
   2995    case INDEX_op_neg_i64:
   2996    case INDEX_op_not_i32:
   2997    case INDEX_op_not_i64:
   2998    case INDEX_op_extrh_i64_i32:
   2999        return C_O1_I1(r, 0);
   3000
   3001    case INDEX_op_ext8s_i32:
   3002    case INDEX_op_ext8s_i64:
   3003    case INDEX_op_ext8u_i32:
   3004    case INDEX_op_ext8u_i64:
   3005        return C_O1_I1(r, q);
   3006
   3007    case INDEX_op_ext16s_i32:
   3008    case INDEX_op_ext16s_i64:
   3009    case INDEX_op_ext16u_i32:
   3010    case INDEX_op_ext16u_i64:
   3011    case INDEX_op_ext32s_i64:
   3012    case INDEX_op_ext32u_i64:
   3013    case INDEX_op_ext_i32_i64:
   3014    case INDEX_op_extu_i32_i64:
   3015    case INDEX_op_extrl_i64_i32:
   3016    case INDEX_op_extract_i32:
   3017    case INDEX_op_extract_i64:
   3018    case INDEX_op_sextract_i32:
   3019    case INDEX_op_ctpop_i32:
   3020    case INDEX_op_ctpop_i64:
   3021        return C_O1_I1(r, r);
   3022
   3023    case INDEX_op_extract2_i32:
   3024    case INDEX_op_extract2_i64:
   3025        return C_O1_I2(r, 0, r);
   3026
   3027    case INDEX_op_deposit_i32:
   3028    case INDEX_op_deposit_i64:
   3029        return C_O1_I2(Q, 0, Q);
   3030
   3031    case INDEX_op_setcond_i32:
   3032    case INDEX_op_setcond_i64:
   3033        return C_O1_I2(q, r, re);
   3034
   3035    case INDEX_op_movcond_i32:
   3036    case INDEX_op_movcond_i64:
   3037        return C_O1_I4(r, r, re, r, 0);
   3038
   3039    case INDEX_op_div2_i32:
   3040    case INDEX_op_div2_i64:
   3041    case INDEX_op_divu2_i32:
   3042    case INDEX_op_divu2_i64:
   3043        return C_O2_I3(a, d, 0, 1, r);
   3044
   3045    case INDEX_op_mulu2_i32:
   3046    case INDEX_op_mulu2_i64:
   3047    case INDEX_op_muls2_i32:
   3048    case INDEX_op_muls2_i64:
   3049        return C_O2_I2(a, d, a, r);
   3050
   3051    case INDEX_op_add2_i32:
   3052    case INDEX_op_add2_i64:
   3053    case INDEX_op_sub2_i32:
   3054    case INDEX_op_sub2_i64:
   3055        return C_O2_I4(r, r, 0, 1, re, re);
   3056
   3057    case INDEX_op_ctz_i32:
   3058    case INDEX_op_ctz_i64:
   3059        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
   3060
   3061    case INDEX_op_clz_i32:
   3062    case INDEX_op_clz_i64:
   3063        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
   3064
   3065    case INDEX_op_qemu_ld_i32:
   3066        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
   3067                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
   3068
   3069    case INDEX_op_qemu_st_i32:
   3070        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
   3071                ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
   3072    case INDEX_op_qemu_st8_i32:
   3073        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
   3074                ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
   3075
   3076    case INDEX_op_qemu_ld_i64:
   3077        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
   3078                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
   3079                : C_O2_I2(r, r, L, L));
   3080
   3081    case INDEX_op_qemu_st_i64:
   3082        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
   3083                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
   3084                : C_O0_I4(L, L, L, L));
   3085
   3086    case INDEX_op_brcond2_i32:
   3087        return C_O0_I4(r, r, ri, ri);
   3088
   3089    case INDEX_op_setcond2_i32:
   3090        return C_O1_I4(r, r, r, ri, ri);
   3091
   3092    case INDEX_op_ld_vec:
   3093    case INDEX_op_dupm_vec:
   3094        return C_O1_I1(x, r);
   3095
   3096    case INDEX_op_st_vec:
   3097        return C_O0_I2(x, r);
   3098
   3099    case INDEX_op_add_vec:
   3100    case INDEX_op_sub_vec:
   3101    case INDEX_op_mul_vec:
   3102    case INDEX_op_and_vec:
   3103    case INDEX_op_or_vec:
   3104    case INDEX_op_xor_vec:
   3105    case INDEX_op_andc_vec:
   3106    case INDEX_op_ssadd_vec:
   3107    case INDEX_op_usadd_vec:
   3108    case INDEX_op_sssub_vec:
   3109    case INDEX_op_ussub_vec:
   3110    case INDEX_op_smin_vec:
   3111    case INDEX_op_umin_vec:
   3112    case INDEX_op_smax_vec:
   3113    case INDEX_op_umax_vec:
   3114    case INDEX_op_shlv_vec:
   3115    case INDEX_op_shrv_vec:
   3116    case INDEX_op_sarv_vec:
   3117    case INDEX_op_shls_vec:
   3118    case INDEX_op_shrs_vec:
   3119    case INDEX_op_sars_vec:
   3120    case INDEX_op_rotls_vec:
   3121    case INDEX_op_cmp_vec:
   3122    case INDEX_op_x86_shufps_vec:
   3123    case INDEX_op_x86_blend_vec:
   3124    case INDEX_op_x86_packss_vec:
   3125    case INDEX_op_x86_packus_vec:
   3126    case INDEX_op_x86_vperm2i128_vec:
   3127    case INDEX_op_x86_punpckl_vec:
   3128    case INDEX_op_x86_punpckh_vec:
   3129#if TCG_TARGET_REG_BITS == 32
   3130    case INDEX_op_dup2_vec:
   3131#endif
   3132        return C_O1_I2(x, x, x);
   3133
   3134    case INDEX_op_abs_vec:
   3135    case INDEX_op_dup_vec:
   3136    case INDEX_op_shli_vec:
   3137    case INDEX_op_shri_vec:
   3138    case INDEX_op_sari_vec:
   3139    case INDEX_op_x86_psrldq_vec:
   3140        return C_O1_I1(x, x);
   3141
   3142    case INDEX_op_x86_vpblendvb_vec:
   3143        return C_O1_I3(x, x, x, x);
   3144
   3145    default:
   3146        g_assert_not_reached();
   3147    }
   3148}
   3149
   3150int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
   3151{
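           /*
            * Return value convention: 1 means the backend handles the op
            * directly, 0 means it cannot be used at all, and -1 means the
            * op is accepted but must be expanded via tcg_expand_vec_op()
            * below.
            */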
   3152    switch (opc) {
   3153    case INDEX_op_add_vec:
   3154    case INDEX_op_sub_vec:
   3155    case INDEX_op_and_vec:
   3156    case INDEX_op_or_vec:
   3157    case INDEX_op_xor_vec:
   3158    case INDEX_op_andc_vec:
   3159        return 1;
   3160    case INDEX_op_rotli_vec:
   3161    case INDEX_op_cmp_vec:
   3162    case INDEX_op_cmpsel_vec:
   3163        return -1;
   3164
   3165    case INDEX_op_shli_vec:
   3166    case INDEX_op_shri_vec:
   3167        /* We must expand the operation for MO_8.  */
   3168        return vece == MO_8 ? -1 : 1;
   3169
   3170    case INDEX_op_sari_vec:
   3171        /* We must expand the operation for MO_8.  */
   3172        if (vece == MO_8) {
   3173            return -1;
   3174        }
   3175        /* We can emulate this for MO_64, but it does not pay off
   3176           unless we're producing at least 4 values.  */
   3177        if (vece == MO_64) {
   3178            return type >= TCG_TYPE_V256 ? -1 : 0;
   3179        }
   3180        return 1;
   3181
   3182    case INDEX_op_shls_vec:
   3183    case INDEX_op_shrs_vec:
   3184        return vece >= MO_16;
   3185    case INDEX_op_sars_vec:
   3186        return vece >= MO_16 && vece <= MO_32;
   3187    case INDEX_op_rotls_vec:
   3188        return vece >= MO_16 ? -1 : 0;
   3189
   3190    case INDEX_op_shlv_vec:
   3191    case INDEX_op_shrv_vec:
   3192        return have_avx2 && vece >= MO_32;
   3193    case INDEX_op_sarv_vec:
   3194        return have_avx2 && vece == MO_32;
   3195    case INDEX_op_rotlv_vec:
   3196    case INDEX_op_rotrv_vec:
   3197        return have_avx2 && vece >= MO_32 ? -1 : 0;
   3198
   3199    case INDEX_op_mul_vec:
   3200        if (vece == MO_8) {
   3201            /* We can expand the operation for MO_8.  */
   3202            return -1;
   3203        }
   3204        if (vece == MO_64) {
   3205            return 0;
   3206        }
   3207        return 1;
   3208
   3209    case INDEX_op_ssadd_vec:
   3210    case INDEX_op_usadd_vec:
   3211    case INDEX_op_sssub_vec:
   3212    case INDEX_op_ussub_vec:
   3213        return vece <= MO_16;
   3214    case INDEX_op_smin_vec:
   3215    case INDEX_op_smax_vec:
   3216    case INDEX_op_umin_vec:
   3217    case INDEX_op_umax_vec:
   3218    case INDEX_op_abs_vec:
   3219        return vece <= MO_32;
   3220
   3221    default:
   3222        return 0;
   3223    }
   3224}
   3225
   3226static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
   3227                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
   3228{
   3229    TCGv_vec t1, t2;
   3230
   3231    tcg_debug_assert(vece == MO_8);
   3232
   3233    t1 = tcg_temp_new_vec(type);
   3234    t2 = tcg_temp_new_vec(type);
   3235
   3236    /*
   3237     * Unpack to W, shift, and repack.  Tricky bits:
   3238     * (1) Use punpck*bw x,x to produce DDCCBBAA,
   3239     *     i.e. duplicate in other half of the 16-bit lane.
   3240     * (2) For right-shift, add 8 so that the high half of the lane
   3241     *     becomes zero.  For left-shift, and left-rotate, we must
   3242     *     shift up and down again.
   3243     * (3) Step 2 leaves high half zero such that PACKUSWB
   3244     *     (pack with unsigned saturation) does not modify
   3245     *     the quantity.
   3246     */
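           /*
            * Worked example, left shift of byte A by 3: the unpack gives the
            * 16-bit lane (A << 8) | A; shifting left by 3 + 8 and then right
            * by 8 leaves (A << 3) & 0xff in the low byte with the high byte
            * clear, which PACKUSWB repacks unchanged.  For a rotate the 8 is
            * not added, so the low byte of the result becomes
            * ((A << 3) | (A >> 5)) & 0xff, i.e. the rotation.
            */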
   3247    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
   3248              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
   3249    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
   3250              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
   3251
   3252    if (opc != INDEX_op_rotli_vec) {
   3253        imm += 8;
   3254    }
   3255    if (opc == INDEX_op_shri_vec) {
   3256        tcg_gen_shri_vec(MO_16, t1, t1, imm);
   3257        tcg_gen_shri_vec(MO_16, t2, t2, imm);
   3258    } else {
   3259        tcg_gen_shli_vec(MO_16, t1, t1, imm);
   3260        tcg_gen_shli_vec(MO_16, t2, t2, imm);
   3261        tcg_gen_shri_vec(MO_16, t1, t1, 8);
   3262        tcg_gen_shri_vec(MO_16, t2, t2, 8);
   3263    }
   3264
   3265    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
   3266              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
   3267    tcg_temp_free_vec(t1);
   3268    tcg_temp_free_vec(t2);
   3269}
   3270
   3271static void expand_vec_sari(TCGType type, unsigned vece,
   3272                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
   3273{
   3274    TCGv_vec t1, t2;
   3275
   3276    switch (vece) {
   3277    case MO_8:
   3278        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
   3279        t1 = tcg_temp_new_vec(type);
   3280        t2 = tcg_temp_new_vec(type);
   3281        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
   3282                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
   3283        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
   3284                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
   3285        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
   3286        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
   3287        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
   3288                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
   3289        tcg_temp_free_vec(t1);
   3290        tcg_temp_free_vec(t2);
   3291        break;
   3292
   3293    case MO_64:
   3294        if (imm <= 32) {
   3295            /*
   3296             * We can emulate a small sign extend by performing an arithmetic
   3297             * 32-bit shift and overwriting the high half of a 64-bit logical
   3298             * shift.  Note that the ISA says shift of 32 is valid, but TCG
   3299             * does not, so we have to bound the smaller shift -- we get the
   3300             * same result in the high half either way.
   3301             */
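                   /*
                    * The 0xaa blend immediate (binary 10101010) selects the
                    * odd 32-bit elements, i.e. the high half of each 64-bit
                    * lane, from the arithmetic-shift result in t1, while the
                    * low halves keep the logical-shift result already in v0.
                    */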
   3302            t1 = tcg_temp_new_vec(type);
   3303            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
   3304            tcg_gen_shri_vec(MO_64, v0, v1, imm);
   3305            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
   3306                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
   3307                      tcgv_vec_arg(t1), 0xaa);
   3308            tcg_temp_free_vec(t1);
   3309        } else {
   3310            /* Otherwise we will need to use a compare vs 0 to produce
   3311             * the sign-extend, shift and merge.
   3312             */
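                   /*
                    * cmpgt(0, v1) yields all-ones in every lane where v1 is
                    * negative; shifting that mask left by 64 - imm recreates
                    * the sign bits that the logical right shift cleared, and
                    * the final OR merges them into the result.
                    */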
   3313            t1 = tcg_const_zeros_vec(type);
   3314            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
   3315            tcg_gen_shri_vec(MO_64, v0, v1, imm);
   3316            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
   3317            tcg_gen_or_vec(MO_64, v0, v0, t1);
   3318            tcg_temp_free_vec(t1);
   3319        }
   3320        break;
   3321
   3322    default:
   3323        g_assert_not_reached();
   3324    }
   3325}
   3326
   3327static void expand_vec_rotli(TCGType type, unsigned vece,
   3328                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
   3329{
   3330    TCGv_vec t;
   3331
   3332    if (vece == MO_8) {
   3333        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
   3334        return;
   3335    }
   3336
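           /*
            * For wider elements use the plain identity
            * rotl(v1, imm) = (v1 << imm) | (v1 >> (width - imm)),
            * where width is 8 << vece bits.
            */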
   3337    t = tcg_temp_new_vec(type);
   3338    tcg_gen_shli_vec(vece, t, v1, imm);
   3339    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
   3340    tcg_gen_or_vec(vece, v0, v0, t);
   3341    tcg_temp_free_vec(t);
   3342}
   3343
   3344static void expand_vec_rotls(TCGType type, unsigned vece,
   3345                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
   3346{
   3347    TCGv_i32 rsh;
   3348    TCGv_vec t;
   3349
   3350    tcg_debug_assert(vece != MO_8);
   3351
   3352    t = tcg_temp_new_vec(type);
   3353    rsh = tcg_temp_new_i32();
   3354
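           /*
            * rotl(v1, lsh) = (v1 << lsh) | (v1 >> (width - lsh)); negating
            * lsh modulo the element width yields that complementary count
            * while keeping it in the 0 .. width-1 range valid for TCG
            * shifts (lsh == 0 maps back to 0 rather than to width).
            */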
   3355    tcg_gen_neg_i32(rsh, lsh);
   3356    tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
   3357    tcg_gen_shls_vec(vece, t, v1, lsh);
   3358    tcg_gen_shrs_vec(vece, v0, v1, rsh);
   3359    tcg_gen_or_vec(vece, v0, v0, t);
   3360    tcg_temp_free_vec(t);
   3361    tcg_temp_free_i32(rsh);
   3362}
   3363
   3364static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
   3365                            TCGv_vec v1, TCGv_vec sh, bool right)
   3366{
   3367    TCGv_vec t = tcg_temp_new_vec(type);
   3368
   3369    tcg_gen_dupi_vec(vece, t, 8 << vece);
   3370    tcg_gen_sub_vec(vece, t, t, sh);
   3371    if (right) {
   3372        tcg_gen_shlv_vec(vece, t, v1, t);
   3373        tcg_gen_shrv_vec(vece, v0, v1, sh);
   3374    } else {
   3375        tcg_gen_shrv_vec(vece, t, v1, t);
   3376        tcg_gen_shlv_vec(vece, v0, v1, sh);
   3377    }
   3378    tcg_gen_or_vec(vece, v0, v0, t);
   3379    tcg_temp_free_vec(t);
   3380}
   3381
   3382static void expand_vec_mul(TCGType type, unsigned vece,
   3383                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
   3384{
   3385    TCGv_vec t1, t2, t3, t4, zero;
   3386
   3387    tcg_debug_assert(vece == MO_8);
   3388
   3389    /*
   3390     * Unpack v1 bytes to words, 0 | x.
   3391     * Unpack v2 bytes to words, y | 0.
   3392     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
   3393     * Shift logical right by 8 bits to clear the high 8 bits before
   3394     * using an unsigned saturated pack.
   3395     *
   3396     * The difference between the V64, V128 and V256 cases is merely how
   3397     * we distribute the expansion between temporaries.
   3398     */
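           /*
            * E.g. x = 0x80, y = 0x03: the word product 0x0080 * 0x0300 is
            * 0x18000, truncated to 0x8000 in the 16-bit lane; shifting
            * right by 8 gives 0x0080, the low byte of x * y, and the
            * unsigned saturating pack stores it unchanged because the
            * high byte is now zero.
            */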
   3399    switch (type) {
   3400    case TCG_TYPE_V64:
   3401        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
   3402        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
   3403        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
   3404        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
   3405                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
   3406        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
   3407                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
   3408        tcg_gen_mul_vec(MO_16, t1, t1, t2);
   3409        tcg_gen_shri_vec(MO_16, t1, t1, 8);
   3410        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
   3411                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
   3412        tcg_temp_free_vec(t1);
   3413        tcg_temp_free_vec(t2);
   3414        break;
   3415
   3416    case TCG_TYPE_V128:
   3417    case TCG_TYPE_V256:
   3418        t1 = tcg_temp_new_vec(type);
   3419        t2 = tcg_temp_new_vec(type);
   3420        t3 = tcg_temp_new_vec(type);
   3421        t4 = tcg_temp_new_vec(type);
   3422        zero = tcg_constant_vec(type, MO_8, 0);
   3423        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
   3424                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
   3425        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
   3426                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
   3427        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
   3428                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
   3429        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
   3430                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
   3431        tcg_gen_mul_vec(MO_16, t1, t1, t2);
   3432        tcg_gen_mul_vec(MO_16, t3, t3, t4);
   3433        tcg_gen_shri_vec(MO_16, t1, t1, 8);
   3434        tcg_gen_shri_vec(MO_16, t3, t3, 8);
   3435        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
   3436                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
   3437        tcg_temp_free_vec(t1);
   3438        tcg_temp_free_vec(t2);
   3439        tcg_temp_free_vec(t3);
   3440        tcg_temp_free_vec(t4);
   3441        break;
   3442
   3443    default:
   3444        g_assert_not_reached();
   3445    }
   3446}
   3447
   3448static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
   3449                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
   3450{
   3451    enum {
   3452        NEED_INV  = 1,
   3453        NEED_SWAP = 2,
   3454        NEED_BIAS = 4,
   3455        NEED_UMIN = 8,
   3456        NEED_UMAX = 16,
   3457    };
   3458    TCGv_vec t1, t2, t3;
   3459    uint8_t fixup;
   3460
   3461    switch (cond) {
   3462    case TCG_COND_EQ:
   3463    case TCG_COND_GT:
   3464        fixup = 0;
   3465        break;
   3466    case TCG_COND_NE:
   3467    case TCG_COND_LE:
   3468        fixup = NEED_INV;
   3469        break;
   3470    case TCG_COND_LT:
   3471        fixup = NEED_SWAP;
   3472        break;
   3473    case TCG_COND_GE:
   3474        fixup = NEED_SWAP | NEED_INV;
   3475        break;
   3476    case TCG_COND_LEU:
   3477        if (vece <= MO_32) {
   3478            fixup = NEED_UMIN;
   3479        } else {
   3480            fixup = NEED_BIAS | NEED_INV;
   3481        }
   3482        break;
   3483    case TCG_COND_GTU:
   3484        if (vece <= MO_32) {
   3485            fixup = NEED_UMIN | NEED_INV;
   3486        } else {
   3487            fixup = NEED_BIAS;
   3488        }
   3489        break;
   3490    case TCG_COND_GEU:
   3491        if (vece <= MO_32) {
   3492            fixup = NEED_UMAX;
   3493        } else {
   3494            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
   3495        }
   3496        break;
   3497    case TCG_COND_LTU:
   3498        if (vece <= MO_32) {
   3499            fixup = NEED_UMAX | NEED_INV;
   3500        } else {
   3501            fixup = NEED_BIAS | NEED_SWAP;
   3502        }
   3503        break;
   3504    default:
   3505        g_assert_not_reached();
   3506    }
   3507
   3508    if (fixup & NEED_INV) {
   3509        cond = tcg_invert_cond(cond);
   3510    }
   3511    if (fixup & NEED_SWAP) {
   3512        t1 = v1, v1 = v2, v2 = t1;
   3513        cond = tcg_swap_cond(cond);
   3514    }
   3515
   3516    t1 = t2 = NULL;
   3517    if (fixup & (NEED_UMIN | NEED_UMAX)) {
   3518        t1 = tcg_temp_new_vec(type);
   3519        if (fixup & NEED_UMIN) {
   3520            tcg_gen_umin_vec(vece, t1, v1, v2);
   3521        } else {
   3522            tcg_gen_umax_vec(vece, t1, v1, v2);
   3523        }
   3524        v2 = t1;
   3525        cond = TCG_COND_EQ;
   3526    } else if (fixup & NEED_BIAS) {
   3527        t1 = tcg_temp_new_vec(type);
   3528        t2 = tcg_temp_new_vec(type);
   3529        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
   3530        tcg_gen_sub_vec(vece, t1, v1, t3);
   3531        tcg_gen_sub_vec(vece, t2, v2, t3);
   3532        v1 = t1;
   3533        v2 = t2;
   3534        cond = tcg_signed_cond(cond);
   3535    }
   3536
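           /*
            * The bias above subtracts 0x80...00 from both operands, which
            * maps the unsigned range monotonically onto the signed range
            * (for MO_8, 0x00..0xff becomes -0x80..0x7f), so the unsigned
            * comparison can be done with the signed PCMPGT the ISA offers.
            */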
   3537    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
   3538    /* Expand directly; do not recurse.  */
   3539    vec_gen_4(INDEX_op_cmp_vec, type, vece,
   3540              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
   3541
   3542    if (t1) {
   3543        tcg_temp_free_vec(t1);
   3544        if (t2) {
   3545            tcg_temp_free_vec(t2);
   3546        }
   3547    }
   3548    return fixup & NEED_INV;
   3549}
   3550
   3551static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
   3552                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
   3553{
   3554    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
   3555        tcg_gen_not_vec(vece, v0, v0);
   3556    }
   3557}
   3558
   3559static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
   3560                              TCGv_vec c1, TCGv_vec c2,
   3561                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
   3562{
   3563    TCGv_vec t = tcg_temp_new_vec(type);
   3564
   3565    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
   3566        /* Invert the sense of the compare by swapping arguments.  */
   3567        TCGv_vec x;
   3568        x = v3, v3 = v4, v4 = x;
   3569    }
   3570    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
   3571              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
   3572              tcgv_vec_arg(v3), tcgv_vec_arg(t));
   3573    tcg_temp_free_vec(t);
   3574}
   3575
   3576void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
   3577                       TCGArg a0, ...)
   3578{
   3579    va_list va;
   3580    TCGArg a2;
   3581    TCGv_vec v0, v1, v2, v3, v4;
   3582
   3583    va_start(va, a0);
   3584    v0 = temp_tcgv_vec(arg_temp(a0));
   3585    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
   3586    a2 = va_arg(va, TCGArg);
   3587
   3588    switch (opc) {
   3589    case INDEX_op_shli_vec:
   3590    case INDEX_op_shri_vec:
   3591        expand_vec_shi(type, vece, opc, v0, v1, a2);
   3592        break;
   3593
   3594    case INDEX_op_sari_vec:
   3595        expand_vec_sari(type, vece, v0, v1, a2);
   3596        break;
   3597
   3598    case INDEX_op_rotli_vec:
   3599        expand_vec_rotli(type, vece, v0, v1, a2);
   3600        break;
   3601
   3602    case INDEX_op_rotls_vec:
   3603        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
   3604        break;
   3605
   3606    case INDEX_op_rotlv_vec:
   3607        v2 = temp_tcgv_vec(arg_temp(a2));
   3608        expand_vec_rotv(type, vece, v0, v1, v2, false);
   3609        break;
   3610    case INDEX_op_rotrv_vec:
   3611        v2 = temp_tcgv_vec(arg_temp(a2));
   3612        expand_vec_rotv(type, vece, v0, v1, v2, true);
   3613        break;
   3614
   3615    case INDEX_op_mul_vec:
   3616        v2 = temp_tcgv_vec(arg_temp(a2));
   3617        expand_vec_mul(type, vece, v0, v1, v2);
   3618        break;
   3619
   3620    case INDEX_op_cmp_vec:
   3621        v2 = temp_tcgv_vec(arg_temp(a2));
   3622        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
   3623        break;
   3624
   3625    case INDEX_op_cmpsel_vec:
   3626        v2 = temp_tcgv_vec(arg_temp(a2));
   3627        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
   3628        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
   3629        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
   3630        break;
   3631
   3632    default:
   3633        break;
   3634    }
   3635
   3636    va_end(va);
   3637}
   3638
   3639static const int tcg_target_callee_save_regs[] = {
   3640#if TCG_TARGET_REG_BITS == 64
   3641    TCG_REG_RBP,
   3642    TCG_REG_RBX,
   3643#if defined(_WIN64)
   3644    TCG_REG_RDI,
   3645    TCG_REG_RSI,
   3646#endif
   3647    TCG_REG_R12,
   3648    TCG_REG_R13,
   3649    TCG_REG_R14, /* Currently used for the global env. */
   3650    TCG_REG_R15,
   3651#else
   3652    TCG_REG_EBP, /* Currently used for the global env. */
   3653    TCG_REG_EBX,
   3654    TCG_REG_ESI,
   3655    TCG_REG_EDI,
   3656#endif
   3657};
   3658
   3659/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   3660   and tcg_register_jit.  */
   3661
   3662#define PUSH_SIZE \
   3663    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
   3664     * (TCG_TARGET_REG_BITS / 8))
   3665
   3666#define FRAME_SIZE \
   3667    ((PUSH_SIZE \
   3668      + TCG_STATIC_CALL_ARGS_SIZE \
   3669      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
   3670      + TCG_TARGET_STACK_ALIGN - 1) \
   3671     & ~(TCG_TARGET_STACK_ALIGN - 1))
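       /*
        * The "+ TCG_TARGET_STACK_ALIGN - 1 ... & ~(TCG_TARGET_STACK_ALIGN - 1)"
        * idiom rounds the total up to the next multiple of the stack alignment;
        * e.g. with 16-byte alignment a raw total of 61 bytes becomes
        * (61 + 15) & ~15 = 64.
        */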
   3672
   3673/* Generate global QEMU prologue and epilogue code */
   3674static void tcg_target_qemu_prologue(TCGContext *s)
   3675{
   3676    int i, stack_addend;
   3677
   3678    /* TB prologue */
   3679
   3680    /* Reserve some stack space, also for TCG temps.  */
   3681    stack_addend = FRAME_SIZE - PUSH_SIZE;
   3682    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
   3683                  CPU_TEMP_BUF_NLONGS * sizeof(long));
   3684
   3685    /* Save all callee saved registers.  */
   3686    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
   3687        tcg_out_push(s, tcg_target_callee_save_regs[i]);
   3688    }
   3689
   3690#if TCG_TARGET_REG_BITS == 32
   3691    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
   3692               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
   3693    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
   3694    /* jmp *tb.  */
   3695    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
   3696                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
   3697                         + stack_addend);
   3698#else
   3699# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
   3700    if (guest_base) {
   3701        int seg = setup_guest_base_seg();
   3702        if (seg != 0) {
   3703            x86_guest_base_seg = seg;
   3704        } else if (guest_base == (int32_t)guest_base) {
   3705            x86_guest_base_offset = guest_base;
   3706        } else {
   3707            /* Choose R12 because, as a base, it requires a SIB byte. */
   3708            x86_guest_base_index = TCG_REG_R12;
   3709            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
   3710            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
   3711        }
   3712    }
   3713# endif
   3714    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
   3715    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
   3716    /* jmp *tb.  */
   3717    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
   3718#endif
   3719
   3720    /*
   3721     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
   3722     * and fall through to the rest of the epilogue.
   3723     */
   3724    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
   3725    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
   3726
   3727    /* TB epilogue */
   3728    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
   3729
   3730    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
   3731
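           /*
            * VZEROUPPER clears the upper halves of the YMM registers so that
            * returning to the caller does not leave dirty AVX upper state,
            * which would incur SSE/AVX transition penalties in legacy SSE
            * code.
            */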
   3732    if (have_avx2) {
   3733        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
   3734    }
   3735    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
   3736        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
   3737    }
   3738    tcg_out_opc(s, OPC_RET, 0, 0, 0);
   3739}
   3740
   3741static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
   3742{
   3743    memset(p, 0x90, count);
   3744}
   3745
   3746static void tcg_target_init(TCGContext *s)
   3747{
   3748#ifdef CONFIG_CPUID_H
   3749    unsigned a, b, c, d, b7 = 0;
   3750    int max = __get_cpuid_max(0, 0);
   3751
   3752    if (max >= 7) {
   3753        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
   3754        __cpuid_count(7, 0, a, b7, c, d);
   3755        have_bmi1 = (b7 & bit_BMI) != 0;
   3756        have_bmi2 = (b7 & bit_BMI2) != 0;
   3757    }
   3758
   3759    if (max >= 1) {
   3760        __cpuid(1, a, b, c, d);
   3761#ifndef have_cmov
   3762        /* For 32-bit, 99% certainty that we're running on hardware that
   3763           supports cmov, but we still need to check.  In case cmov is not
   3764           available, we'll use a small forward branch.  */
   3765        have_cmov = (d & bit_CMOV) != 0;
   3766#endif
   3767
   3768        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
   3769           need to probe for it.  */
   3770        have_movbe = (c & bit_MOVBE) != 0;
   3771        have_popcnt = (c & bit_POPCNT) != 0;
   3772
   3773        /* There are a number of things we must check before we can be
   3774           sure of not hitting invalid opcode.  */
   3775        if (c & bit_OSXSAVE) {
   3776            unsigned xcrl, xcrh;
   3777            /* The xgetbv instruction is not available to older versions of
   3778             * the assembler, so we encode the instruction manually.
   3779             */
   3780            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
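                   /*
                    * XCR0 bit 1 covers SSE state and bit 2 covers AVX state;
                    * both must be enabled by the OS (mask 6) before AVX
                    * instructions may be executed.
                    */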
   3781            if ((xcrl & 6) == 6) {
   3782                have_avx1 = (c & bit_AVX) != 0;
   3783                have_avx2 = (b7 & bit_AVX2) != 0;
   3784            }
   3785        }
   3786    }
   3787
   3788    max = __get_cpuid_max(0x8000000, 0);
   3789    if (max >= 1) {
   3790        __cpuid(0x80000001, a, b, c, d);
   3791        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
   3792        have_lzcnt = (c & bit_LZCNT) != 0;
   3793    }
   3794#endif /* CONFIG_CPUID_H */
   3795
   3796    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
   3797    if (TCG_TARGET_REG_BITS == 64) {
   3798        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
   3799    }
   3800    if (have_avx1) {
   3801        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
   3802        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
   3803    }
   3804    if (have_avx2) {
   3805        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
   3806    }
   3807
   3808    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
   3809    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
   3810    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
   3811    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
   3812    if (TCG_TARGET_REG_BITS == 64) {
   3813#if !defined(_WIN64)
   3814        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
   3815        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
   3816#endif
   3817        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
   3818        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
   3819        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
   3820        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
   3821    }
   3822
   3823    s->reserved_regs = 0;
   3824    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
   3825}
   3826
   3827typedef struct {
   3828    DebugFrameHeader h;
   3829    uint8_t fde_def_cfa[4];
   3830    uint8_t fde_reg_ofs[14];
   3831} DebugFrame;
   3832
   3833/* We're expecting a 2 byte uleb128 encoded value.  */
   3834QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
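       /*
        * Two uleb128 bytes carry 7 payload bits each, hence the 1 << 14
        * bound; fde_def_cfa below stores FRAME_SIZE's low 7 bits with the
        * continuation bit set, followed by the remaining bits.
        */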
   3835
   3836#if !defined(__ELF__)
   3837    /* Host machine without ELF. */
   3838#elif TCG_TARGET_REG_BITS == 64
   3839#define ELF_HOST_MACHINE EM_X86_64
   3840static const DebugFrame debug_frame = {
   3841    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
   3842    .h.cie.id = -1,
   3843    .h.cie.version = 1,
   3844    .h.cie.code_align = 1,
   3845    .h.cie.data_align = 0x78,             /* sleb128 -8 */
   3846    .h.cie.return_column = 16,
   3847
   3848    /* Total FDE size does not include the "len" member.  */
   3849    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
   3850
   3851    .fde_def_cfa = {
   3852        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
   3853        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
   3854        (FRAME_SIZE >> 7)
   3855    },
   3856    .fde_reg_ofs = {
   3857        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
   3858        /* The following ordering must match tcg_target_callee_save_regs.  */
   3859        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
   3860        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
   3861        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
   3862        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
   3863        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
   3864        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
   3865    }
   3866};
   3867#else
   3868#define ELF_HOST_MACHINE EM_386
   3869static const DebugFrame debug_frame = {
   3870    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
   3871    .h.cie.id = -1,
   3872    .h.cie.version = 1,
   3873    .h.cie.code_align = 1,
   3874    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
   3875    .h.cie.return_column = 8,
   3876
   3877    /* Total FDE size does not include the "len" member.  */
   3878    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
   3879
   3880    .fde_def_cfa = {
   3881        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
   3882        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
   3883        (FRAME_SIZE >> 7)
   3884    },
   3885    .fde_reg_ofs = {
   3886        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
   3887        /* The following ordering must match tcg_target_callee_save_regs.  */
   3888        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
   3889        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
   3890        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
   3891        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
   3892    }
   3893};
   3894#endif
   3895
   3896#if defined(ELF_HOST_MACHINE)
   3897void tcg_register_jit(const void *buf, size_t buf_size)
   3898{
   3899    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
   3900}
   3901#endif