cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

mpy.idef (54669B)


      1/*
      2 *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
      3 *
      4 *  This program is free software; you can redistribute it and/or modify
      5 *  it under the terms of the GNU General Public License as published by
      6 *  the Free Software Foundation; either version 2 of the License, or
      7 *  (at your option) any later version.
      8 *
      9 *  This program is distributed in the hope that it will be useful,
     10 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     11 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12 *  GNU General Public License for more details.
     13 *
     14 *  You should have received a copy of the GNU General Public License
     15 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
     16 */
     17
     18/*
     19 * Multiply Instructions
     20 */
     21
     22
       /*
        * STD_SP_MODES expands to eight Q6INSN definitions tagged
        * M2_<TAG>_{hh,hl,lh,ll}_{s0,s1}.  The hh/hl/lh/ll part selects which
        * halfword of RsV and RtV is read with fGETHALF (index 1 for "H",
        * index 0 for "L"); the _s1 forms append ":<<1" to the syntax string and
        * wrap the product in fSCALE(1,...), the _s0 forms use the product as is.
        *
        *   TAG    - middle part of the generated instruction tag
        *   OPER   - start of the assembly syntax string (e.g. "Rd32=mpy")
        *   ATR    - attribute argument forwarded to Q6INSN (may be empty)
        *   DST    - destination of the generated assignment
        *   ACCSEM - text placed in front of the product (e.g. RxV+), may be empty
        *   SEM    - multiply helper applied to the two selected halfwords
        *   OSEM   - option suffix appended to the syntax string, may be empty
        *   SATSEM - helper applied to the final value
        *   RNDSEM - helper applied just inside SATSEM
        *
        * Every generated Q6INSN gets an empty description string.
        */
     23#define STD_SP_MODES(TAG,OPER,ATR,DST,ACCSEM,SEM,OSEM,SATSEM,RNDSEM)\
     24Q6INSN(M2_##TAG##_hh_s0, OPER"(Rs.H32,Rt.H32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETHALF(1,RsV),fGETHALF(1,RtV))));})\
     25Q6INSN(M2_##TAG##_hh_s1, OPER"(Rs.H32,Rt.H32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETHALF(1,RsV),fGETHALF(1,RtV)))));})\
     26Q6INSN(M2_##TAG##_hl_s0, OPER"(Rs.H32,Rt.L32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETHALF(1,RsV),fGETHALF(0,RtV))));})\
     27Q6INSN(M2_##TAG##_hl_s1, OPER"(Rs.H32,Rt.L32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETHALF(1,RsV),fGETHALF(0,RtV)))));})\
     28Q6INSN(M2_##TAG##_lh_s0, OPER"(Rs.L32,Rt.H32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETHALF(0,RsV),fGETHALF(1,RtV))));})\
     29Q6INSN(M2_##TAG##_lh_s1, OPER"(Rs.L32,Rt.H32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETHALF(0,RsV),fGETHALF(1,RtV)))));})\
     30Q6INSN(M2_##TAG##_ll_s0, OPER"(Rs.L32,Rt.L32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETHALF(0,RsV),fGETHALF(0,RtV))));})\
     31Q6INSN(M2_##TAG##_ll_s1, OPER"(Rs.L32,Rt.L32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETHALF(0,RsV),fGETHALF(0,RtV)))));})
     32
     33/*****************************************************/
     34/* multiply 16x16->32 signed instructions            */
     35/*****************************************************/
       /*
        * Each STD_SP_MODES line below produces the eight hh/hl/lh/ll x s0/s1
        * variants of one instruction, all using fMPY16SS as the multiply helper.
        * The _acc/_nac lines pass "RxV+"/"RxV-" (or the Rxx forms) as the
        * accumulate prefix; ":sat" lines pass fSAT, ":rnd" lines pass fROUND and
        * every other slot gets fPASS.  The ATR argument is left empty throughout.
        * The mpyd* lines use the Rdd32/Rxx32 operands (presumably 64-bit register
        * pairs - confirm against the operand naming rules used by the generator).
        */
     36STD_SP_MODES(mpy_acc,    "Rx32+=mpy", ,RxV,RxV+    ,fMPY16SS,          ,fPASS,fPASS)
     37STD_SP_MODES(mpy_nac,    "Rx32-=mpy", ,RxV,RxV-    ,fMPY16SS,          ,fPASS,fPASS)
     38STD_SP_MODES(mpy_acc_sat,"Rx32+=mpy", ,RxV,RxV+    ,fMPY16SS,":sat"    ,fSAT, fPASS)
     39STD_SP_MODES(mpy_nac_sat,"Rx32-=mpy", ,RxV,RxV-    ,fMPY16SS,":sat"    ,fSAT, fPASS)
     40STD_SP_MODES(mpy,        "Rd32=mpy",  ,RdV,        ,fMPY16SS,          ,fPASS,fPASS)
     41STD_SP_MODES(mpy_sat,    "Rd32=mpy",  ,RdV,        ,fMPY16SS,":sat"    ,fSAT, fPASS)
     42STD_SP_MODES(mpy_rnd,    "Rd32=mpy",  ,RdV,        ,fMPY16SS,":rnd"    ,fPASS,fROUND)
     43STD_SP_MODES(mpy_sat_rnd,"Rd32=mpy",  ,RdV,        ,fMPY16SS,":rnd:sat",fSAT, fROUND)
     44STD_SP_MODES(mpyd_acc,   "Rxx32+=mpy",,RxxV,RxxV+  ,fMPY16SS,          ,fPASS,fPASS)
     45STD_SP_MODES(mpyd_nac,   "Rxx32-=mpy",,RxxV,RxxV-  ,fMPY16SS,          ,fPASS,fPASS)
     46STD_SP_MODES(mpyd,       "Rdd32=mpy", ,RddV,       ,fMPY16SS,          ,fPASS,fPASS)
     47STD_SP_MODES(mpyd_rnd,   "Rdd32=mpy", ,RddV,       ,fMPY16SS,":rnd"    ,fPASS,fROUND)
     48
     49
     50/*****************************************************/
     51/* multiply 16x16->32 unsigned instructions          */
     52/*****************************************************/
       /*
        * STD_USP_MODES expands to eight Q6INSN definitions tagged
        * M2_<TAG>_{hh,hl,lh,ll}_{s0,s1}, reading the selected halfwords of RsV and
        * RtV with fGETUHALF (index 1 for "H", index 0 for "L").  The _s1 forms
        * append ":<<1" to the syntax string and wrap the product in fSCALE(1,...).
        *
        *   TAG/OPER/ATR - tag fragment, syntax prefix, attribute argument
        *   DST/ACCSEM   - assignment destination and optional accumulate prefix
        *   SEM          - multiply helper applied to the two halfwords
        *   OSEM         - option suffix appended to the syntax string
        *   SATSEM/RNDSEM- outer and inner helpers wrapped around the value
        *
        * Every generated Q6INSN gets an empty description string.
        */
     53#define STD_USP_MODES(TAG,OPER,ATR,DST,ACCSEM,SEM,OSEM,SATSEM,RNDSEM)\
     54Q6INSN(M2_##TAG##_hh_s0, OPER"(Rs.H32,Rt.H32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETUHALF(1,RsV),fGETUHALF(1,RtV))));})\
     55Q6INSN(M2_##TAG##_hh_s1, OPER"(Rs.H32,Rt.H32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETUHALF(1,RsV),fGETUHALF(1,RtV)))));})\
     56Q6INSN(M2_##TAG##_hl_s0, OPER"(Rs.H32,Rt.L32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETUHALF(1,RsV),fGETUHALF(0,RtV))));})\
     57Q6INSN(M2_##TAG##_hl_s1, OPER"(Rs.H32,Rt.L32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETUHALF(1,RsV),fGETUHALF(0,RtV)))));})\
     58Q6INSN(M2_##TAG##_lh_s0, OPER"(Rs.L32,Rt.H32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETUHALF(0,RsV),fGETUHALF(1,RtV))));})\
     59Q6INSN(M2_##TAG##_lh_s1, OPER"(Rs.L32,Rt.H32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETUHALF(0,RsV),fGETUHALF(1,RtV)))));})\
     60Q6INSN(M2_##TAG##_ll_s0, OPER"(Rs.L32,Rt.L32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETUHALF(0,RsV),fGETUHALF(0,RtV))));})\
     61Q6INSN(M2_##TAG##_ll_s1, OPER"(Rs.L32,Rt.L32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETUHALF(0,RsV),fGETUHALF(0,RtV)))));})
     62
       /*
        * Unsigned mpyu family: every line uses fMPY16UU with fPASS for both the
        * saturate and round slots and an empty option suffix.
        * NOTE(review): mpyu and mpyud pass ATTRIBS() as the attribute argument
        * while the _acc/_nac lines leave it empty; confirm Q6INSN treats the two
        * spellings the same.
        */
     63STD_USP_MODES(mpyu_acc,    "Rx32+=mpyu", ,RxV,RxV+  ,fMPY16UU,          ,fPASS,fPASS)
     64STD_USP_MODES(mpyu_nac,    "Rx32-=mpyu", ,RxV,RxV-  ,fMPY16UU,          ,fPASS,fPASS)
     65STD_USP_MODES(mpyu,        "Rd32=mpyu",  ATTRIBS() ,RdV,  ,fMPY16UU, ,fPASS,fPASS)
     66STD_USP_MODES(mpyud_acc,   "Rxx32+=mpyu",,RxxV,RxxV+,fMPY16UU,          ,fPASS,fPASS)
     67STD_USP_MODES(mpyud_nac,   "Rxx32-=mpyu",,RxxV,RxxV-,fMPY16UU,          ,fPASS,fPASS)
     68STD_USP_MODES(mpyud,       "Rdd32=mpyu", ATTRIBS() ,RddV, ,fMPY16UU, ,fPASS,fPASS)
     69
     70/**********************************************/
     71/* mpyi 32x#u8->32 (multiply by immediate)    */
     72/**********************************************/
     73
       /*
        * Rs32 multiplied by the unsigned 8-bit immediate uiV, written to Rd32
        * (mpysip/mpysin) or added to / subtracted from Rx32 (macsip/macsin).
        * NOTE(review): M2_mpysin is the only one of the four that does not call
        * fIMMEXT(uiV); presumably the negated form takes no immediate extender -
        * confirm against the encoding tables.
        */
     74Q6INSN(M2_mpysip,"Rd32=+mpyi(Rs32,#u8)",ATTRIBS(A_ARCHV2),
     75"32-bit Multiply by unsigned immediate",
     76{ fIMMEXT(uiV); RdV=RsV*uiV; })
     77
     78Q6INSN(M2_mpysin,"Rd32=-mpyi(Rs32,#u8)",ATTRIBS(A_ARCHV2),
     79"32-bit Multiply by unsigned immediate, negate result",
     80{ RdV=RsV*-uiV; })
     81
     82Q6INSN(M2_macsip,"Rx32+=mpyi(Rs32,#u8)",ATTRIBS(A_ARCHV2),
     83"32-bit Multiply-Add by unsigned immediate",
     84{ fIMMEXT(uiV); RxV=RxV + (RsV*uiV);})
     85
     86Q6INSN(M2_macsin,"Rx32-=mpyi(Rs32,#u8)",ATTRIBS(A_ARCHV2),
     87"32-bit Multiply-Subtract by unsigned immediate",
     88{ fIMMEXT(uiV); RxV=RxV - (RsV*uiV);})
     89
     90
     91/**********************************************/
     92/* multiply/mac  32x32->64 instructions       */
     93/**********************************************/
       /*
        * Full-width products of Rs32 and Rt32 written to, added to, or subtracted
        * from the Rdd32/Rxx32 destination.  The signed forms use fMPY32SS on the
        * raw operands; the unsigned (mpyu) forms pass both operands through
        * fCAST4u before fMPY32UU.  None of these forms saturates or rounds.
        */
     94Q6INSN(M2_dpmpyss_s0,    "Rdd32=mpy(Rs32,Rt32)", ATTRIBS(),"Multiply 32x32",{RddV=fMPY32SS(RsV,RtV);})
     95Q6INSN(M2_dpmpyss_acc_s0,"Rxx32+=mpy(Rs32,Rt32)",ATTRIBS(),"Multiply 32x32",{RxxV= RxxV + fMPY32SS(RsV,RtV);})
     96Q6INSN(M2_dpmpyss_nac_s0,"Rxx32-=mpy(Rs32,Rt32)",ATTRIBS(),"Multiply 32x32",{RxxV= RxxV - fMPY32SS(RsV,RtV);})
     97
     98Q6INSN(M2_dpmpyuu_s0,    "Rdd32=mpyu(Rs32,Rt32)", ATTRIBS(),"Multiply 32x32",{RddV=fMPY32UU(fCAST4u(RsV),fCAST4u(RtV));})
     99Q6INSN(M2_dpmpyuu_acc_s0,"Rxx32+=mpyu(Rs32,Rt32)",ATTRIBS(),"Multiply 32x32",{RxxV= RxxV + fMPY32UU(fCAST4u(RsV),fCAST4u(RtV));})
    100Q6INSN(M2_dpmpyuu_nac_s0,"Rxx32-=mpyu(Rs32,Rt32)",ATTRIBS(),"Multiply 32x32",{RxxV= RxxV - fMPY32UU(fCAST4u(RsV),fCAST4u(RtV));})
    101
    102
    103/******************************************************/
    104/* multiply/mac  32x32->32 (upper) instructions       */
    105/******************************************************/
       /*
        * These forms shift the 32x32 product right before storing it in a 32-bit
        * destination: by 32 for the plain forms and by 31 for the ":<<1" forms.
        * The ":rnd" form adds fCONSTLL(0x80000000) to the product before the
        * shift.  The ":sat" forms wrap the result in fSAT; the M4 mac/nac forms
        * add the shifted product to, or subtract it from, fSE32_64(RxV) inside
        * that fSAT.  mpysu multiplies RsV by fCAST4u(RtV) using fMPY32SU.
        */
    106Q6INSN(M2_mpy_up,        "Rd32=mpy(Rs32,Rt32)", ATTRIBS(),"Multiply 32x32",{RdV=fMPY32SS(RsV,RtV)>>32;})
    107Q6INSN(M2_mpy_up_s1,     "Rd32=mpy(Rs32,Rt32):<<1", ATTRIBS(),"Multiply 32x32",{RdV=fMPY32SS(RsV,RtV)>>31;})
    108Q6INSN(M2_mpy_up_s1_sat, "Rd32=mpy(Rs32,Rt32):<<1:sat", ATTRIBS(),"Multiply 32x32",{RdV=fSAT(fMPY32SS(RsV,RtV)>>31);})
    109Q6INSN(M2_mpyu_up,       "Rd32=mpyu(Rs32,Rt32)", ATTRIBS(),"Multiply 32x32",{RdV=fMPY32UU(fCAST4u(RsV),fCAST4u(RtV))>>32;})
    110Q6INSN(M2_mpysu_up,      "Rd32=mpysu(Rs32,Rt32)", ATTRIBS(),"Multiply 32x32",{RdV=fMPY32SU(RsV,fCAST4u(RtV))>>32;})
    111Q6INSN(M2_dpmpyss_rnd_s0,"Rd32=mpy(Rs32,Rt32):rnd", ATTRIBS(),"Multiply 32x32",{RdV=(fMPY32SS(RsV,RtV)+fCONSTLL(0x80000000))>>32;})
    112
    113Q6INSN(M4_mac_up_s1_sat, "Rx32+=mpy(Rs32,Rt32):<<1:sat", ATTRIBS(),"Multiply 32x32",{RxV=fSAT(  (fSE32_64(RxV)) + (fMPY32SS(RsV,RtV)>>31));})
    114Q6INSN(M4_nac_up_s1_sat, "Rx32-=mpy(Rs32,Rt32):<<1:sat", ATTRIBS(),"Multiply 32x32",{RxV=fSAT(  (fSE32_64(RxV)) - (fMPY32SS(RsV,RtV)>>31));})
    115
    116
    117/**********************************************/
    118/* 32x32->32 multiply (lower)                 */
    119/**********************************************/
    120
       /*
        * Plain C multiplication of RsV by RtV, assigned to Rd32 (mpyi), added to
        * Rx32 (maci) or subtracted from Rx32 (mnaci).  No helper macro is applied
        * to the product, so the result is whatever the destination width keeps.
        */
    121Q6INSN(M2_mpyi,"Rd32=mpyi(Rs32,Rt32)",ATTRIBS(),
    122"Multiply Integer",
    123{ RdV=RsV*RtV;})
    124
    125Q6INSN(M2_maci,"Rx32+=mpyi(Rs32,Rt32)",ATTRIBS(A_ARCHV2),
    126"Multiply-Accumulate Integer",
    127{ RxV=RxV + RsV*RtV;})
    128
    129Q6INSN(M2_mnaci,"Rx32-=mpyi(Rs32,Rt32)",ATTRIBS(A_ARCHV2),
    130"Multiply-Neg-Accumulate Integer",
    131{ RxV=RxV - RsV*RtV;})
    132
    133/****** WHY ARE THESE IN MPY.IDEF? **********/
       /*
        * Accumulating add/subtract forms that involve no multiplication: Rx32 is
        * updated with the sum of Rs32 and Rt32 (or the signed immediate siV), or
        * with Rt32 minus Rs32 for subacc.  Only the immediate forms call
        * fIMMEXT(siV).
        */
    134
    135Q6INSN(M2_acci,"Rx32+=add(Rs32,Rt32)",ATTRIBS(A_ARCHV2),
    136"Add with accumulate",
    137{ RxV=RxV + RsV + RtV;})
    138
    139Q6INSN(M2_accii,"Rx32+=add(Rs32,#s8)",ATTRIBS(A_ARCHV2),
    140"Add with accumulate",
    141{ fIMMEXT(siV); RxV=RxV + RsV + siV;})
    142
    143Q6INSN(M2_nacci,"Rx32-=add(Rs32,Rt32)",ATTRIBS(A_ARCHV2),
    144"Add with neg accumulate",
    145{ RxV=RxV - (RsV + RtV);})
    146
    147Q6INSN(M2_naccii,"Rx32-=add(Rs32,#s8)",ATTRIBS(A_ARCHV2),
    148"Add with neg accumulate",
    149{ fIMMEXT(siV); RxV=RxV - (RsV + siV);})
    150
    151Q6INSN(M2_subacc,"Rx32+=sub(Rt32,Rs32)",ATTRIBS(A_ARCHV2),
    152"Sub with accumulate",
    153{ RxV=RxV + RtV - RsV;})
    154
    155
    156
    157
       /*
        * Multiply-then-add combinations.  The multiplier is either a register or
        * an unsigned immediate and the addend is either a register (Ru32) or an
        * unsigned immediate; each description string names the combination its
        * syntax actually encodes.  M4_mpyrr_addr reuses Ry32 as both a multiply
        * operand and the destination.  M4_mpyri_addr_u2 (#u6:2) is the only
        * immediate form here that does not call fIMMEXT.
        */
    158Q6INSN(M4_mpyrr_addr,"Ry32=add(Ru32,mpyi(Ry32,Rs32))",ATTRIBS(),
    159"Mpy by reg and add reg",
    160{ RyV = RuV + RsV*RyV;})
    161
    162Q6INSN(M4_mpyri_addr_u2,"Rd32=add(Ru32,mpyi(#u6:2,Rs32))",ATTRIBS(),
    163"Mpy by immed and add reg",
    164{ RdV = RuV + RsV*uiV;})
    165
    166Q6INSN(M4_mpyri_addr,"Rd32=add(Ru32,mpyi(Rs32,#u6))",ATTRIBS(),
    167"Mpy by immed and add reg",
    168{ fIMMEXT(uiV); RdV = RuV + RsV*uiV;})
    169
    170
    171
    172Q6INSN(M4_mpyri_addi,"Rd32=add(#u6,mpyi(Rs32,#U6))",ATTRIBS(),
    173"Mpy by immed and add immed",
    174{ fIMMEXT(uiV); RdV = uiV + RsV*UiV;})
    175
    176
    177
    178Q6INSN(M4_mpyrr_addi,"Rd32=add(#u6,mpyi(Rs32,Rt32))",ATTRIBS(),
    179"Mpy by reg and add immed",
    180{ fIMMEXT(uiV); RdV = uiV + RsV*RtV;})
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198/**********************************************/
    199/* vector mac  2x[16x16 -> 32]                */
    200/**********************************************/
    201
    202#undef vmac_sema
       /*
        * vmac_sema(N) for Rdd32=vmpyh: word i of RddV (i = 0,1) receives
        * fSAT(fSCALE(N, fMPY16SS(halfword i of RsV, halfword i of RtV))).
        * N is 0 for the plain form and 1 for the ":<<1" form.
        */
    203#define vmac_sema(N)\
    204{ fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV)))));\
    205  fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV)))));\
    206}
    207Q6INSN(M2_vmpy2s_s0,"Rdd32=vmpyh(Rs32,Rt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
    208Q6INSN(M2_vmpy2s_s1,"Rdd32=vmpyh(Rs32,Rt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
    209
    210
    211#undef vmac_sema
       /*
        * vmac_sema(N) for Rxx32+=vmpyh: word i of RxxV (i = 0,1) becomes
        * fSAT(old word i + fSCALE(N, fMPY16SS(halfword i of RsV, halfword i of RtV))).
        */
    212#define vmac_sema(N)\
    213{ fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV)))));\
    214  fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV)))));\
    215}
    216Q6INSN(M2_vmac2s_s0,"Rxx32+=vmpyh(Rs32,Rt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
    217Q6INSN(M2_vmac2s_s1,"Rxx32+=vmpyh(Rs32,Rt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
    218
    219#undef vmac_sema
       /*
        * vmac_sema(N) for Rdd32=vmpyhsu: like the vmpyh form but the RtV halfword
        * is read with fGETUHALF and the product uses fMPY16SU; the RsV halfword
        * is still read with fGETHALF.
        */
    220#define vmac_sema(N)\
    221{ fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SU(fGETHALF(0,RsV),fGETUHALF(0,RtV)))));\
    222  fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SU(fGETHALF(1,RsV),fGETUHALF(1,RtV)))));\
    223}
    224Q6INSN(M2_vmpy2su_s0,"Rdd32=vmpyhsu(Rs32,Rt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
    225Q6INSN(M2_vmpy2su_s1,"Rdd32=vmpyhsu(Rs32,Rt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
    226
    227
    228#undef vmac_sema
       /*
        * vmac_sema(N) for Rxx32+=vmpyhsu: word i of RxxV becomes
        * fSAT(old word i + fSCALE(N, fMPY16SU(fGETHALF(i,RsV), fGETUHALF(i,RtV)))).
        */
    229#define vmac_sema(N)\
    230{ fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SU(fGETHALF(0,RsV),fGETUHALF(0,RtV)))));\
    231  fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SU(fGETHALF(1,RsV),fGETUHALF(1,RtV)))));\
    232}
    233Q6INSN(M2_vmac2su_s0,"Rxx32+=vmpyhsu(Rs32,Rt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
    234Q6INSN(M2_vmac2su_s1,"Rxx32+=vmpyhsu(Rs32,Rt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
    235
    236
    237
    238#undef vmac_sema
       /*
        * vmac_sema(N) for the packing Rd32=vmpyh:rnd:sat form: for each halfword
        * index i, the scaled product of halfword i of RsV and RtV has 0x8000
        * added inside fSAT, and halfword 1 of that value is stored into halfword
        * i of the 32-bit RdV.
        */
    239#define vmac_sema(N)\
    240{ fSETHALF(1,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV))) + 0x8000))));\
    241  fSETHALF(0,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV))) + 0x8000))));\
    242}
    243Q6INSN(M2_vmpy2s_s0pack,"Rd32=vmpyh(Rs32,Rt32):rnd:sat",ATTRIBS(A_ARCHV2),"Vector Multiply",vmac_sema(0))
    244Q6INSN(M2_vmpy2s_s1pack,"Rd32=vmpyh(Rs32,Rt32):<<1:rnd:sat",ATTRIBS(A_ARCHV2),"Vector Multiply",vmac_sema(1))
    245
    246
    247#undef vmac_sema
       /*
        * vmac_sema(N) for the non-saturating Rxx32+=vmpyh form: word i of RxxV
        * gets old word i plus fMPY16SS(halfword i of RsV, halfword i of RtV).
        * The N parameter is accepted but not used in this variant.
        */
    248#define vmac_sema(N)\
    249{ fSETWORD(0,RxxV,fGETWORD(0,RxxV) + fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV)));\
    250  fSETWORD(1,RxxV,fGETWORD(1,RxxV) + fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV)));\
    251}
    252Q6INSN(M2_vmac2,"Rxx32+=vmpyh(Rs32,Rt32)",ATTRIBS(A_ARCHV2),"Vector Multiply",vmac_sema(0))
    253
    254#undef vmac_sema
       /*
        * vmac_sema(N) for Rdd32=vmpyeh: multiplies the even-indexed halfwords of
        * RssV and RttV - halfword 0 feeds word 0 of RddV and halfword 2 feeds
        * word 1 - each scaled by N and wrapped in fSAT.
        */
    255#define vmac_sema(N)\
    256{ fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)))));\
    257  fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)))));\
    258}
    259Q6INSN(M2_vmpy2es_s0,"Rdd32=vmpyeh(Rss32,Rtt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
    260Q6INSN(M2_vmpy2es_s1,"Rdd32=vmpyeh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
    261
    262#undef vmac_sema
       /*
        * vmac_sema(N) for Rxx32+=vmpyeh:sat: word 0 / word 1 of RxxV accumulate
        * the scaled product of halfword 0 / halfword 2 of RssV and RttV, with
        * fSAT applied to each sum.
        */
    263#define vmac_sema(N)\
    264{ fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)))));\
    265  fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)))));\
    266}
    267Q6INSN(M2_vmac2es_s0,"Rxx32+=vmpyeh(Rss32,Rtt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
    268Q6INSN(M2_vmac2es_s1,"Rxx32+=vmpyeh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
    269
    270#undef vmac_sema
       /*
        * vmac_sema(N) for the non-saturating Rxx32+=vmpyeh form: word 0 / word 1
        * of RxxV accumulate fMPY16SS of halfword 0 / halfword 2 of RssV and RttV.
        * The N parameter is accepted but not used in this variant.
        */
    271#define vmac_sema(N)\
    272{ fSETWORD(0,RxxV,fGETWORD(0,RxxV) + fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)));\
    273  fSETWORD(1,RxxV,fGETWORD(1,RxxV) + fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)));\
    274}
    275Q6INSN(M2_vmac2es,"Rxx32+=vmpyeh(Rss32,Rtt32)",ATTRIBS(A_ARCHV2),"Vector Multiply",vmac_sema(0))
    276
    277
    278
    279
    280/********************************************************/
    281/* vrmpyh, aka Big Mac, aka Mac Daddy, aka Mac-ac-ac-ac */
    282/* vector mac  4x[16x16] + 64 ->64                      */
    283/********************************************************/
    284
    285
    286#undef vmac_sema
       /*
        * vmac_sema(N) for Rxx32+=vrmpyh: adds to RxxV the four products of the
        * matching halfwords 0..3 of RssV and RttV.  No scaling or saturation is
        * applied and the N parameter is not used.
        */
    287#define vmac_sema(N)\
    288{ RxxV = RxxV + fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV))\
    289              + fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV))\
    290              + fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV))\
    291              + fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
    292}
    293Q6INSN(M2_vrmac_s0,"Rxx32+=vrmpyh(Rss32,Rtt32)",ATTRIBS(),"Vector Multiply",vmac_sema(0))
    294
    295#undef vmac_sema
       /*
        * vmac_sema(N) for Rdd32=vrmpyh: RddV is the sum of the four products of
        * the matching halfwords 0..3 of RssV and RttV.  The N parameter is not
        * used.
        */
    296#define vmac_sema(N)\
    297{ RddV = fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV))\
    298       + fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV))\
    299       + fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV))\
    300       + fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
    301}
    302Q6INSN(M2_vrmpy_s0,"Rdd32=vrmpyh(Rss32,Rtt32)",ATTRIBS(),"Vector Multiply",vmac_sema(0))
    303
    304
    305
    306/******************************************************/
    307/* vector dual macs. just like complex                */
    308/******************************************************/
    309
    310
    311/* With round&pack */
    312#undef dmpy_sema
       /*
        * dmpy_sema(N) for Rd32=vdmpy:rnd:sat: halfword 0 of RdV comes from the
        * products of halfwords 0 and 1 of RssV/RttV, halfword 1 of RdV from the
        * products of halfwords 2 and 3.  Each pair of scaled products is summed
        * with 0x8000 inside fSAT and halfword 1 of that value is kept.
        */
    313#define dmpy_sema(N)\
    314{ fSETHALF(0,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV))) + \
    315                                  fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV))) + 0x8000))));\
    316  fSETHALF(1,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV))) + \
    317                                  fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV))) + 0x8000))));\
    318}
    319Q6INSN(M2_vdmpyrs_s0,"Rd32=vdmpy(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "vector dual mac w/ round&pack",dmpy_sema(0))
    320Q6INSN(M2_vdmpyrs_s1,"Rd32=vdmpy(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"vector dual mac w/ round&pack",dmpy_sema(1))
    321
    322
    323
    324
    325
    326/******************************************************/
    327/* vector byte multiplies                             */
    328/******************************************************/
    329
    330
       /*
        * Rdd32=vrmpybu: word 0 of RddV is the sum of the four products of bytes
        * 0..3 of RssV and RttV, word 1 the sum for bytes 4..7.  Both operands are
        * read with fGETUBYTE and multiplied with fMPY16SS.
        * NOTE(review): the description says "dual" although each word sums four
        * products.
        */
    331Q6INSN(M5_vrmpybuu,"Rdd32=vrmpybu(Rss32,Rtt32)",ATTRIBS(),
    332 "vector dual mpy bytes",
    333{
    334  fSETWORD(0,RddV,(fMPY16SS(fGETUBYTE(0,RssV),fGETUBYTE(0,RttV)) +
    335                   fMPY16SS(fGETUBYTE(1,RssV),fGETUBYTE(1,RttV)) +
    336                   fMPY16SS(fGETUBYTE(2,RssV),fGETUBYTE(2,RttV)) +
    337                   fMPY16SS(fGETUBYTE(3,RssV),fGETUBYTE(3,RttV))));
    338  fSETWORD(1,RddV,(fMPY16SS(fGETUBYTE(4,RssV),fGETUBYTE(4,RttV)) +
    339                   fMPY16SS(fGETUBYTE(5,RssV),fGETUBYTE(5,RttV)) +
    340                   fMPY16SS(fGETUBYTE(6,RssV),fGETUBYTE(6,RttV)) +
    341                   fMPY16SS(fGETUBYTE(7,RssV),fGETUBYTE(7,RttV))));
    342 })
    343
       /*
        * Rxx32+=vrmpybu: word 0 of RxxV accumulates the four products of bytes
        * 0..3 of RssV and RttV, word 1 those of bytes 4..7.  Both operands are
        * read with fGETUBYTE; no saturation helper is applied.
        */
    344Q6INSN(M5_vrmacbuu,"Rxx32+=vrmpybu(Rss32,Rtt32)",ATTRIBS(),
    345 "vector dual mac bytes",
    346{
    347  fSETWORD(0,RxxV,(fGETWORD(0,RxxV) +
    348                   fMPY16SS(fGETUBYTE(0,RssV),fGETUBYTE(0,RttV)) +
    349                   fMPY16SS(fGETUBYTE(1,RssV),fGETUBYTE(1,RttV)) +
    350                   fMPY16SS(fGETUBYTE(2,RssV),fGETUBYTE(2,RttV)) +
    351                   fMPY16SS(fGETUBYTE(3,RssV),fGETUBYTE(3,RttV))));
    352  fSETWORD(1,RxxV,(fGETWORD(1,RxxV) +
    353                   fMPY16SS(fGETUBYTE(4,RssV),fGETUBYTE(4,RttV)) +
    354                   fMPY16SS(fGETUBYTE(5,RssV),fGETUBYTE(5,RttV)) +
    355                   fMPY16SS(fGETUBYTE(6,RssV),fGETUBYTE(6,RttV)) +
    356                   fMPY16SS(fGETUBYTE(7,RssV),fGETUBYTE(7,RttV))));
    357 })
    358
    359
       /*
        * Rdd32=vrmpybsu: same reduction as vrmpybu (bytes 0..3 into word 0,
        * bytes 4..7 into word 1) but the RssV bytes are read with fGETBYTE while
        * the RttV bytes are read with fGETUBYTE.
        */
    360Q6INSN(M5_vrmpybsu,"Rdd32=vrmpybsu(Rss32,Rtt32)",ATTRIBS(),
    361 "vector dual mpy bytes",
    362{
    363  fSETWORD(0,RddV,(fMPY16SS(fGETBYTE(0,RssV),fGETUBYTE(0,RttV)) +
    364                   fMPY16SS(fGETBYTE(1,RssV),fGETUBYTE(1,RttV)) +
    365                   fMPY16SS(fGETBYTE(2,RssV),fGETUBYTE(2,RttV)) +
    366                   fMPY16SS(fGETBYTE(3,RssV),fGETUBYTE(3,RttV))));
    367  fSETWORD(1,RddV,(fMPY16SS(fGETBYTE(4,RssV),fGETUBYTE(4,RttV)) +
    368                   fMPY16SS(fGETBYTE(5,RssV),fGETUBYTE(5,RttV)) +
    369                   fMPY16SS(fGETBYTE(6,RssV),fGETUBYTE(6,RttV)) +
    370                   fMPY16SS(fGETBYTE(7,RssV),fGETUBYTE(7,RttV))));
    371 })
    372
       /*
        * Rxx32+=vrmpybsu: accumulating form - word 0 / word 1 of RxxV gain the
        * sum of products of bytes 0..3 / 4..7, with RssV bytes read via fGETBYTE
        * and RttV bytes via fGETUBYTE.  No saturation helper is applied.
        */
    373Q6INSN(M5_vrmacbsu,"Rxx32+=vrmpybsu(Rss32,Rtt32)",ATTRIBS(),
    374 "vector dual mac bytes",
    375{
    376  fSETWORD(0,RxxV,(fGETWORD(0,RxxV) +
    377                   fMPY16SS(fGETBYTE(0,RssV),fGETUBYTE(0,RttV)) +
    378                   fMPY16SS(fGETBYTE(1,RssV),fGETUBYTE(1,RttV)) +
    379                   fMPY16SS(fGETBYTE(2,RssV),fGETUBYTE(2,RttV)) +
    380                   fMPY16SS(fGETBYTE(3,RssV),fGETUBYTE(3,RttV))));
    381  fSETWORD(1,RxxV,(fGETWORD(1,RxxV) +
    382                   fMPY16SS(fGETBYTE(4,RssV),fGETUBYTE(4,RttV)) +
    383                   fMPY16SS(fGETBYTE(5,RssV),fGETUBYTE(5,RttV)) +
    384                   fMPY16SS(fGETBYTE(6,RssV),fGETUBYTE(6,RttV)) +
    385                   fMPY16SS(fGETBYTE(7,RssV),fGETUBYTE(7,RttV))));
    386 })
    387
    388
       /*
        * Rdd32=vmpybu: halfword i of RddV (i = 0..3) is the product of byte i of
        * RsV and byte i of RtV, both read with fGETUBYTE.
        */
    389Q6INSN(M5_vmpybuu,"Rdd32=vmpybu(Rs32,Rt32)",ATTRIBS(),
    390 "vector mpy bytes",
    391{
    392  fSETHALF(0,RddV,(fMPY16SS(fGETUBYTE(0,RsV),fGETUBYTE(0,RtV))));
    393  fSETHALF(1,RddV,(fMPY16SS(fGETUBYTE(1,RsV),fGETUBYTE(1,RtV))));
    394  fSETHALF(2,RddV,(fMPY16SS(fGETUBYTE(2,RsV),fGETUBYTE(2,RtV))));
    395  fSETHALF(3,RddV,(fMPY16SS(fGETUBYTE(3,RsV),fGETUBYTE(3,RtV))));
    396 })
    397
       /*
        * Rdd32=vmpybsu: halfword i of RddV (i = 0..3) is the product of byte i of
        * RsV (read with fGETBYTE) and byte i of RtV (read with fGETUBYTE).
        */
    398Q6INSN(M5_vmpybsu,"Rdd32=vmpybsu(Rs32,Rt32)",ATTRIBS(),
    399 "vector mpy bytes",
    400{
    401  fSETHALF(0,RddV,(fMPY16SS(fGETBYTE(0,RsV),fGETUBYTE(0,RtV))));
    402  fSETHALF(1,RddV,(fMPY16SS(fGETBYTE(1,RsV),fGETUBYTE(1,RtV))));
    403  fSETHALF(2,RddV,(fMPY16SS(fGETBYTE(2,RsV),fGETUBYTE(2,RtV))));
    404  fSETHALF(3,RddV,(fMPY16SS(fGETBYTE(3,RsV),fGETUBYTE(3,RtV))));
    405 })
    406
    407
       /*
        * Rxx32+=vmpybu: halfword i of RxxV (i = 0..3) gains the product of byte i
        * of RsV and byte i of RtV, both read with fGETUBYTE.  The sum is stored
        * back with fSETHALF and no saturation helper is applied.
        */
    408Q6INSN(M5_vmacbuu,"Rxx32+=vmpybu(Rs32,Rt32)",ATTRIBS(),
    409 "vector mac bytes",
    410{
    411  fSETHALF(0,RxxV,(fGETHALF(0,RxxV)+fMPY16SS(fGETUBYTE(0,RsV),fGETUBYTE(0,RtV))));
    412  fSETHALF(1,RxxV,(fGETHALF(1,RxxV)+fMPY16SS(fGETUBYTE(1,RsV),fGETUBYTE(1,RtV))));
    413  fSETHALF(2,RxxV,(fGETHALF(2,RxxV)+fMPY16SS(fGETUBYTE(2,RsV),fGETUBYTE(2,RtV))));
    414  fSETHALF(3,RxxV,(fGETHALF(3,RxxV)+fMPY16SS(fGETUBYTE(3,RsV),fGETUBYTE(3,RtV))));
    415 })
    416
       /*
        * Rxx32+=vmpybsu: halfword i of RxxV (i = 0..3) gains the product of byte
        * i of RsV (fGETBYTE) and byte i of RtV (fGETUBYTE).  No saturation helper
        * is applied.
        */
    417Q6INSN(M5_vmacbsu,"Rxx32+=vmpybsu(Rs32,Rt32)",ATTRIBS(),
    418 "vector mac bytes",
    419{
    420  fSETHALF(0,RxxV,(fGETHALF(0,RxxV)+fMPY16SS(fGETBYTE(0,RsV),fGETUBYTE(0,RtV))));
    421  fSETHALF(1,RxxV,(fGETHALF(1,RxxV)+fMPY16SS(fGETBYTE(1,RsV),fGETUBYTE(1,RtV))));
    422  fSETHALF(2,RxxV,(fGETHALF(2,RxxV)+fMPY16SS(fGETBYTE(2,RsV),fGETUBYTE(2,RtV))));
    423  fSETHALF(3,RxxV,(fGETHALF(3,RxxV)+fMPY16SS(fGETBYTE(3,RsV),fGETUBYTE(3,RtV))));
    424 })
    425
    426
    427
       /*
        * Rdd32=vdmpybsu:sat: halfword i of RddV (i = 0..3) is fSATN(16, ...) of
        * the sum of two products, for bytes 2i and 2i+1 of RssV (fGETBYTE) and
        * RttV (fGETUBYTE).
        * NOTE(review): the description says "quad" although each halfword sums
        * two products.
        */
    428Q6INSN(M5_vdmpybsu,"Rdd32=vdmpybsu(Rss32,Rtt32):sat",ATTRIBS(),
    429 "vector quad mpy bytes",
    430{
    431  fSETHALF(0,RddV,fSATN(16,(fMPY16SS(fGETBYTE(0,RssV),fGETUBYTE(0,RttV)) +
    432                            fMPY16SS(fGETBYTE(1,RssV),fGETUBYTE(1,RttV)))));
    433  fSETHALF(1,RddV,fSATN(16,(fMPY16SS(fGETBYTE(2,RssV),fGETUBYTE(2,RttV)) +
    434                            fMPY16SS(fGETBYTE(3,RssV),fGETUBYTE(3,RttV)))));
    435  fSETHALF(2,RddV,fSATN(16,(fMPY16SS(fGETBYTE(4,RssV),fGETUBYTE(4,RttV)) +
    436                            fMPY16SS(fGETBYTE(5,RssV),fGETUBYTE(5,RttV)))));
    437  fSETHALF(3,RddV,fSATN(16,(fMPY16SS(fGETBYTE(6,RssV),fGETUBYTE(6,RttV)) +
    438                            fMPY16SS(fGETBYTE(7,RssV),fGETUBYTE(7,RttV)))));
    439 })
    440
    441
       /*
        * Rxx32+=vdmpybsu:sat: halfword i of RxxV (i = 0..3) becomes fSATN(16, ...)
        * of its old value plus the two products for bytes 2i and 2i+1 of RssV
        * (fGETBYTE) and RttV (fGETUBYTE).
        */
    442Q6INSN(M5_vdmacbsu,"Rxx32+=vdmpybsu(Rss32,Rtt32):sat",ATTRIBS(),
    443 "vector quad mac bytes",
    444{
    445  fSETHALF(0,RxxV,fSATN(16,(fGETHALF(0,RxxV) +
    446                   fMPY16SS(fGETBYTE(0,RssV),fGETUBYTE(0,RttV)) +
    447                   fMPY16SS(fGETBYTE(1,RssV),fGETUBYTE(1,RttV)))));
    448  fSETHALF(1,RxxV,fSATN(16,(fGETHALF(1,RxxV) +
    449                   fMPY16SS(fGETBYTE(2,RssV),fGETUBYTE(2,RttV)) +
    450                   fMPY16SS(fGETBYTE(3,RssV),fGETUBYTE(3,RttV)))));
    451  fSETHALF(2,RxxV,fSATN(16,(fGETHALF(2,RxxV) +
    452                   fMPY16SS(fGETBYTE(4,RssV),fGETUBYTE(4,RttV)) +
    453                   fMPY16SS(fGETBYTE(5,RssV),fGETUBYTE(5,RttV)))));
    454  fSETHALF(3,RxxV,fSATN(16,(fGETHALF(3,RxxV) +
    455                   fMPY16SS(fGETBYTE(6,RssV),fGETUBYTE(6,RttV)) +
    456                   fMPY16SS(fGETBYTE(7,RssV),fGETUBYTE(7,RttV)))));
    457 })
    458
    459
    460
    461/* Full version */
    462#undef dmpy_sema
       /*
        * dmpy_sema(N) for Rxx32+=vdmpy:sat: word 0 of RxxV accumulates the scaled
        * products of halfwords 0 and 1 of RssV/RttV, word 1 those of halfwords 2
        * and 3; each sum is wrapped in fSAT.  The description strings of the two
        * instructions below are empty.
        */
    463#define dmpy_sema(N)\
    464{ fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV))) + \
    465                     fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)))));\
    466  fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV))) + \
    467                     fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV)))));\
    468}
    469Q6INSN(M2_vdmacs_s0,"Rxx32+=vdmpy(Rss32,Rtt32):sat",ATTRIBS(),    "",dmpy_sema(0))
    470Q6INSN(M2_vdmacs_s1,"Rxx32+=vdmpy(Rss32,Rtt32):<<1:sat",ATTRIBS(),"",dmpy_sema(1))
    471
    472#undef dmpy_sema
       /*
        * dmpy_sema(N) for Rdd32=vdmpy:sat: word 0 of RddV is fSAT of the sum of
        * the scaled products of halfwords 0 and 1 of RssV/RttV, word 1 the same
        * for halfwords 2 and 3.  The description strings below are empty.
        */
    473#define dmpy_sema(N)\
    474{ fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV))) + \
    475              fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)))));\
    476  fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV))) + \
    477              fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV)))));\
    478}
    479
    480Q6INSN(M2_vdmpys_s0,"Rdd32=vdmpy(Rss32,Rtt32):sat",ATTRIBS(),    "",dmpy_sema(0))
    481Q6INSN(M2_vdmpys_s1,"Rdd32=vdmpy(Rss32,Rtt32):<<1:sat",ATTRIBS(),"",dmpy_sema(1))
    482
    483
    484
    485/******************************************************/
    486/* complex multiply/mac with                          */
    487/* real&imag are packed together and always saturated */
    488/* to protect against overflow.                       */
    489/******************************************************/
    490
    491#undef cmpy_sema
       /*
        * cmpy_sema(N,CONJMINUS,CONJPLUS) for the rounding, packing Rd32=cmpy form.
        * Halfword 1 of RdV combines Rs.h1*Rt.h0 and Rs.h0*Rt.h1 with the
        * CONJMINUS operator; halfword 0 combines Rs.h0*Rt.h0 and Rs.h1*Rt.h1 with
        * the CONJPLUS operator.  Each combination has 0x8000 added inside fSAT and
        * halfword 1 of that value is kept.
        * The operator arguments are named for the conjugate ("Rt32*") case: the
        * plain forms pass (+,-) and the conjugate forms pass (-,+).
        */
    492#define cmpy_sema(N,CONJMINUS,CONJPLUS)\
    493{ fSETHALF(1,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV))) CONJMINUS \
    494                                  fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV))) + 0x8000))));\
    495  fSETHALF(0,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV))) CONJPLUS \
    496                                  fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV))) + 0x8000))));\
    497}
    498Q6INSN(M2_cmpyrs_s0,"Rd32=cmpy(Rs32,Rt32):rnd:sat",ATTRIBS(),    "Complex Multiply",cmpy_sema(0,+,-))
    499Q6INSN(M2_cmpyrs_s1,"Rd32=cmpy(Rs32,Rt32):<<1:rnd:sat",ATTRIBS(),"Complex Multiply",cmpy_sema(1,+,-))
    500
    501
    502Q6INSN(M2_cmpyrsc_s0,"Rd32=cmpy(Rs32,Rt32*):rnd:sat",ATTRIBS(A_ARCHV2),    "Complex Multiply",cmpy_sema(0,-,+))
    503Q6INSN(M2_cmpyrsc_s1,"Rd32=cmpy(Rs32,Rt32*):<<1:rnd:sat",ATTRIBS(A_ARCHV2),"Complex Multiply",cmpy_sema(1,-,+))
    504
    505
    506#undef cmpy_sema
       /*
        * cmpy_sema(N,CONJMINUS,CONJPLUS) for Rxx32+=cmpy:sat.  Word 1 of RxxV
        * accumulates Rs.h1*Rt.h0 CONJMINUS Rs.h0*Rt.h1; word 0 accumulates
        * Rs.h0*Rt.h0 CONJPLUS Rs.h1*Rt.h1.  Products are scaled by N and each
        * sum is wrapped in fSAT.  Plain forms pass (+,-), conjugate forms (-,+).
        */
    507#define cmpy_sema(N,CONJMINUS,CONJPLUS)\
    508{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV))) CONJMINUS \
    509                                          fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV)))));\
    510  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV))) CONJPLUS \
    511                                          fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV)))));\
    512}
    513Q6INSN(M2_cmacs_s0,"Rxx32+=cmpy(Rs32,Rt32):sat",ATTRIBS(),    "Complex Multiply",cmpy_sema(0,+,-))
    514Q6INSN(M2_cmacs_s1,"Rxx32+=cmpy(Rs32,Rt32):<<1:sat",ATTRIBS(),"Complex Multiply",cmpy_sema(1,+,-))
    515
    516/* EJP: Need mac versions w/ CONJ T?  (the two definitions below provide them) */
    517Q6INSN(M2_cmacsc_s0,"Rxx32+=cmpy(Rs32,Rt32*):sat",ATTRIBS(A_ARCHV2),    "Complex Multiply",cmpy_sema(0,-,+))
    518Q6INSN(M2_cmacsc_s1,"Rxx32+=cmpy(Rs32,Rt32*):<<1:sat",ATTRIBS(A_ARCHV2),"Complex Multiply",cmpy_sema(1,-,+))
    519
    520
    521#undef cmpy_sema
       /*
        * cmpy_sema(N,CONJMINUS,CONJPLUS) for Rdd32=cmpy:sat.  Word 1 of RddV is
        * fSAT(Rs.h1*Rt.h0 CONJMINUS Rs.h0*Rt.h1) and word 0 is
        * fSAT(Rs.h0*Rt.h0 CONJPLUS Rs.h1*Rt.h1), with each product scaled by N.
        * Plain forms pass (+,-), conjugate ("Rt32*") forms pass (-,+).
        */
    522#define cmpy_sema(N,CONJMINUS,CONJPLUS)\
    523{ fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV))) CONJMINUS \
    524                       fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV)))));\
    525  fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV))) CONJPLUS \
    526                       fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV)))));\
    527}
    528
    529Q6INSN(M2_cmpys_s0,"Rdd32=cmpy(Rs32,Rt32):sat",ATTRIBS(),    "Complex Multiply",cmpy_sema(0,+,-))
    530Q6INSN(M2_cmpys_s1,"Rdd32=cmpy(Rs32,Rt32):<<1:sat",ATTRIBS(),"Complex Multiply",cmpy_sema(1,+,-))
    531
    532Q6INSN(M2_cmpysc_s0,"Rdd32=cmpy(Rs32,Rt32*):sat",ATTRIBS(A_ARCHV2),    "Complex Multiply",cmpy_sema(0,-,+))
    533Q6INSN(M2_cmpysc_s1,"Rdd32=cmpy(Rs32,Rt32*):<<1:sat",ATTRIBS(A_ARCHV2),"Complex Multiply",cmpy_sema(1,-,+))
    534
    535
    536
    537#undef cmpy_sema
       /*
        * cmpy_sema(N,CONJMINUS,CONJPLUS) for Rxx32-=cmpy:sat.  The parenthesised
        * combination (Rs.h1*Rt.h0 CONJMINUS Rs.h0*Rt.h1) is subtracted from word
        * 1 of RxxV and (Rs.h0*Rt.h0 CONJPLUS Rs.h1*Rt.h1) from word 0; each
        * result is wrapped in fSAT.  Plain forms pass (+,-), conjugate (-,+).
        */
    538#define cmpy_sema(N,CONJMINUS,CONJPLUS)\
    539{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) - (fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV))) CONJMINUS \
    540                                           fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV))))));\
    541  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) - (fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV))) CONJPLUS \
    542                                           fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV))))));\
    543}
    544Q6INSN(M2_cnacs_s0,"Rxx32-=cmpy(Rs32,Rt32):sat",ATTRIBS(A_ARCHV2),    "Complex Multiply",cmpy_sema(0,+,-))
    545Q6INSN(M2_cnacs_s1,"Rxx32-=cmpy(Rs32,Rt32):<<1:sat",ATTRIBS(A_ARCHV2),"Complex Multiply",cmpy_sema(1,+,-))
    546
    547/* EJP: need CONJ versions?  (the two definitions below provide them) */
    548Q6INSN(M2_cnacsc_s0,"Rxx32-=cmpy(Rs32,Rt32*):sat",ATTRIBS(A_ARCHV2),    "Complex Multiply",cmpy_sema(0,-,+))
    549Q6INSN(M2_cnacsc_s1,"Rxx32-=cmpy(Rs32,Rt32*):<<1:sat",ATTRIBS(A_ARCHV2),"Complex Multiply",cmpy_sema(1,-,+))
    550
    551
    552/******************************************************/
    553/* complex interpolation                              */
    554/* Given a pair of complex values, scale by a,b, sum  */
    555/* Saturate/shift1 and round/pack                     */
    556/******************************************************/
    557
    558#undef vrcmpys_sema
       /*
        * vrcmpys_sema(N,INWORD) for Rdd32=vrcmpys.  INWORD supplies two scalar
        * halfwords: halfword 0 multiplies halfwords 0 and 1 of RssV, halfword 1
        * multiplies halfwords 2 and 3.  Word 0 of RddV is
        * fSAT(Rss.h0*IN.h0 + Rss.h2*IN.h1) and word 1 is
        * fSAT(Rss.h1*IN.h0 + Rss.h3*IN.h1), each product scaled by N.
        * The ":hi" form passes word 1 of RttV as INWORD, the ":lo" form word 0;
        * both visible uses pass N = 1.
        */
    559#define vrcmpys_sema(N,INWORD) \
    560{ fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,INWORD))) + \
    561                       fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(1,INWORD)))));\
    562  fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,INWORD))) + \
    563                       fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(1,INWORD)))));\
    564}
    565
    566
    567
    568Q6INSN(M2_vrcmpys_s1_h,"Rdd32=vrcmpys(Rss32,Rtt32):<<1:sat:raw:hi",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(1,RttV)))
    569Q6INSN(M2_vrcmpys_s1_l,"Rdd32=vrcmpys(Rss32,Rtt32):<<1:sat:raw:lo",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(0,RttV)))
    570
    571#undef vrcmpys_sema
       /*
        * vrcmpys_sema(N,INWORD) for Rxx32+=vrcmpys.  Word 0 of RxxV accumulates
        * Rss.h0*IN.h0 + Rss.h2*IN.h1 and word 1 accumulates
        * Rss.h1*IN.h0 + Rss.h3*IN.h1, each product scaled by N and each sum
        * wrapped in fSAT.  ":hi" passes word 1 of RttV as INWORD, ":lo" word 0.
        */
    572#define vrcmpys_sema(N,INWORD) \
    573{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,INWORD))) + \
    574                       fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(1,INWORD)))));\
    575  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,INWORD))) + \
    576                       fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(1,INWORD)))));\
    577}
    578
    579
    580
    581Q6INSN(M2_vrcmpys_acc_s1_h,"Rxx32+=vrcmpys(Rss32,Rtt32):<<1:sat:raw:hi",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(1,RttV)))
    582Q6INSN(M2_vrcmpys_acc_s1_l,"Rxx32+=vrcmpys(Rss32,Rtt32):<<1:sat:raw:lo",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(0,RttV)))
    583
       /*
        * vrcmpys_sema, round-and-pack form: each sum of the two scaled
        * products has 0x8000 added before fSAT, and only halfword 1 of the
        * saturated value is kept.  Halfword 1 of Rd comes from Rss.h1/h3,
        * halfword 0 from Rss.h0/h2.  INWORD is word 1 (:hi) or word 0 (:lo)
        * of Rtt; N is always 1 here.
        */
    584#undef vrcmpys_sema
    585#define vrcmpys_sema(N,INWORD) \
    586{ fSETHALF(1,RdV,fGETHALF(1,fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,INWORD))) + \
    587                       fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(1,INWORD))) + 0x8000)));\
    588  fSETHALF(0,RdV,fGETHALF(1,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,INWORD))) + \
    589                       fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(1,INWORD))) + 0x8000)));\
    590}
    591
    592Q6INSN(M2_vrcmpys_s1rp_h,"Rd32=vrcmpys(Rss32,Rtt32):<<1:rnd:sat:raw:hi",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(1,RttV)))
    593Q6INSN(M2_vrcmpys_s1rp_l,"Rd32=vrcmpys(Rss32,Rtt32):<<1:rnd:sat:raw:lo",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(0,RttV)))
    594
    595/**************************************************************/
    596/* mixed mode 32x16 vector dual multiplies                    */
    597/*                                                            */
    598/**************************************************************/
    599
    600/* SIGNED 32 x SIGNED 16 */
    601
    602
       /*
        * Rxx += vmpyweh: each word of Rss is multiplied (fMPY3216SS) by the
        * even halfword of the matching Rtt word (halfwords 2 and 0), scaled by
        * N, shifted right by 16, added to the same word of Rxx and passed
        * through fSAT.  N = 0 for the plain form, 1 for the ":<<1" form.
        */
    603#undef mixmpy_sema
    604#define mixmpy_sema(N)\
    605{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(2,RttV))))>>16)) ); \
    606  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RttV))))>>16)) ); \
    607}
    608Q6INSN(M2_mmacls_s0,"Rxx32+=vmpyweh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    609Q6INSN(M2_mmacls_s1,"Rxx32+=vmpyweh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    610
       /*
        * Rxx += vmpywoh: as the vmpyweh accumulate form, but each Rss word is
        * multiplied by the odd halfword of the matching Rtt word
        * (halfwords 3 and 1).
        */
    611#undef mixmpy_sema
    612#define mixmpy_sema(N)\
    613{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(3,RttV))))>>16) )); \
    614  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RttV))))>>16 ))); \
    615}
    616Q6INSN(M2_mmachs_s0,"Rxx32+=vmpywoh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    617Q6INSN(M2_mmachs_s1,"Rxx32+=vmpywoh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    618
       /*
        * Rdd = vmpyweh: non-accumulating form.  Each word of Rdd is
        * fSAT((fSCALE(N, Rss word x even Rtt halfword)) >> 16), using Rtt
        * halfwords 2 and 0 for words 1 and 0 respectively.
        */
    619#undef mixmpy_sema
    620#define mixmpy_sema(N)\
    621{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(2,RttV))))>>16)); \
    622  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RttV))))>>16)); \
    623}
    624Q6INSN(M2_mmpyl_s0,"Rdd32=vmpyweh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    625Q6INSN(M2_mmpyl_s1,"Rdd32=vmpyweh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    626
       /*
        * Rdd = vmpywoh: non-accumulating form using the odd Rtt halfwords
        * (3 for word 1, 1 for word 0).
        */
    627#undef mixmpy_sema
    628#define mixmpy_sema(N)\
    629{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(3,RttV))))>>16)); \
    630  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RttV))))>>16)); \
    631}
    632Q6INSN(M2_mmpyh_s0,"Rdd32=vmpywoh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    633Q6INSN(M2_mmpyh_s1,"Rdd32=vmpywoh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    634
    635
    636/* With rounding */
    637
       /*
        * Rxx += vmpyweh with rounding: identical to the non-rounding
        * accumulate form except that 0x8000 is added to the scaled product
        * before the >>16.  Uses the even Rtt halfwords (2 and 0).
        */
    638#undef mixmpy_sema
    639#define mixmpy_sema(N)\
    640{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(2,RttV)))+0x8000)>>16)) ); \
    641  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RttV)))+0x8000)>>16)) ); \
    642}
    643Q6INSN(M2_mmacls_rs0,"Rxx32+=vmpyweh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    644Q6INSN(M2_mmacls_rs1,"Rxx32+=vmpyweh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    645
       /*
        * Rxx += vmpywoh with rounding: 0x8000 is added to the scaled product
        * before the >>16.  Uses the odd Rtt halfwords (3 and 1).
        */
    646#undef mixmpy_sema
    647#define mixmpy_sema(N)\
    648{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(3,RttV)))+0x8000)>>16) )); \
    649  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RttV)))+0x8000)>>16 ))); \
    650}
    651Q6INSN(M2_mmachs_rs0,"Rxx32+=vmpywoh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    652Q6INSN(M2_mmachs_rs1,"Rxx32+=vmpywoh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    653
       /*
        * Rdd = vmpyweh with rounding: each word is
        * fSAT((fSCALE(N, Rss word x even Rtt halfword) + 0x8000) >> 16).
        */
    654#undef mixmpy_sema
    655#define mixmpy_sema(N)\
    656{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(2,RttV)))+0x8000)>>16)); \
    657  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RttV)))+0x8000)>>16)); \
    658}
    659Q6INSN(M2_mmpyl_rs0,"Rdd32=vmpyweh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    660Q6INSN(M2_mmpyl_rs1,"Rdd32=vmpyweh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    661
       /*
        * Rdd = vmpywoh with rounding: each word is
        * fSAT((fSCALE(N, Rss word x odd Rtt halfword) + 0x8000) >> 16).
        */
    662#undef mixmpy_sema
    663#define mixmpy_sema(N)\
    664{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(3,RttV)))+0x8000)>>16)); \
    665  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RttV)))+0x8000)>>16)); \
    666}
    667Q6INSN(M2_mmpyh_rs0,"Rdd32=vmpywoh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    668Q6INSN(M2_mmpyh_rs1,"Rdd32=vmpywoh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    669
    670
       /*
        * vrmpyweh reduction: DEST EQUALS the sum of the two scaled 32x16
        * products (Rss.w1 x Rtt.h2) + (Rss.w0 x Rtt.h0).  No fSAT and no >>16
        * is applied.  DEST/EQUALS are "RddV,=" for the plain forms and
        * "RxxV,+=" for the accumulating forms; N selects the ":<<1" variant.
        */
    671#undef mixmpy_sema
    672#define mixmpy_sema(DEST,EQUALS,N)\
    673{ DEST EQUALS fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(2,RttV))) + fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RttV)));}
    674
    675Q6INSN(M4_vrmpyeh_s0,"Rdd32=vrmpyweh(Rss32,Rtt32)",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(RddV,=,0))
    676Q6INSN(M4_vrmpyeh_s1,"Rdd32=vrmpyweh(Rss32,Rtt32):<<1",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(RddV,=,1))
    677Q6INSN(M4_vrmpyeh_acc_s0,"Rxx32+=vrmpyweh(Rss32,Rtt32)",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(RxxV,+=,0))
    678Q6INSN(M4_vrmpyeh_acc_s1,"Rxx32+=vrmpyweh(Rss32,Rtt32):<<1",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(RxxV,+=,1))
    679
       /*
        * vrmpywoh reduction: same shape as vrmpyweh but with the odd Rtt
        * halfwords: (Rss.w1 x Rtt.h3) + (Rss.w0 x Rtt.h1).
        */
    680#undef mixmpy_sema
    681#define mixmpy_sema(DEST,EQUALS,N)\
    682{ DEST EQUALS fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(3,RttV))) + fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RttV)));}
    683
    684Q6INSN(M4_vrmpyoh_s0,"Rdd32=vrmpywoh(Rss32,Rtt32)",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(RddV,=,0))
    685Q6INSN(M4_vrmpyoh_s1,"Rdd32=vrmpywoh(Rss32,Rtt32):<<1",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(RddV,=,1))
    686Q6INSN(M4_vrmpyoh_acc_s0,"Rxx32+=vrmpywoh(Rss32,Rtt32)",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(RxxV,+=,0))
    687Q6INSN(M4_vrmpyoh_acc_s1,"Rxx32+=vrmpywoh(Rss32,Rtt32):<<1",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(RxxV,+=,1))
    688
    689
    690
    691
    692
    693
       /*
        * Scalar 32x16 multiply: Rd = fSAT((fSCALE(N, Rs x Rt halfword H) RND) >> 16).
        * H selects Rt.L (0) or Rt.H (1).  RND is pasted textually after the
        * scaled product: "+0x8000" for the :rnd forms, empty otherwise.
        * All four instructions use N = 1.
        */
    694#undef mixmpy_sema
    695#define mixmpy_sema(N,H,RND)\
    696{  RdV = fSAT((fSCALE(N,fMPY3216SS(RsV,fGETHALF(H,RtV)))RND)>>16); \
    697}
    698Q6INSN(M2_hmmpyl_rs1,"Rd32=mpy(Rs32,Rt.L32):<<1:rnd:sat",ATTRIBS(A_ARCHV2),"Mixed Precision Multiply",mixmpy_sema(1,0,+0x8000))
    699Q6INSN(M2_hmmpyh_rs1,"Rd32=mpy(Rs32,Rt.H32):<<1:rnd:sat",ATTRIBS(A_ARCHV2),"Mixed Precision Multiply",mixmpy_sema(1,1,+0x8000))
    700Q6INSN(M2_hmmpyl_s1,"Rd32=mpy(Rs32,Rt.L32):<<1:sat",ATTRIBS(A_ARCHV2),"Mixed Precision Multiply",mixmpy_sema(1,0,))
    701Q6INSN(M2_hmmpyh_s1,"Rd32=mpy(Rs32,Rt.H32):<<1:sat",ATTRIBS(A_ARCHV2),"Mixed Precision Multiply",mixmpy_sema(1,1,))
    702
    703
    704
    705
    706
    707
    708
    709
    710
    711/* SIGNED 32 x UNSIGNED 16 */
    712
       /*
        * Rxx += vmpyweuh: signed-word by unsigned-halfword variant of the
        * vmpyweh accumulate form.  Products use fMPY3216SU with the halfword
        * read through fGETUHALF (even halfwords 2 and 0 of Rtt).
        */
    713#undef mixmpy_sema
    714#define mixmpy_sema(N)\
    715{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(2,RttV))))>>16)) ); \
    716  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(0,RttV))))>>16)) ); \
    717}
    718Q6INSN(M2_mmaculs_s0,"Rxx32+=vmpyweuh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    719Q6INSN(M2_mmaculs_s1,"Rxx32+=vmpyweuh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    720
       /*
        * Rxx += vmpywouh: accumulate form using fMPY3216SU with the odd
        * unsigned halfwords of Rtt (3 and 1).
        */
    721#undef mixmpy_sema
    722#define mixmpy_sema(N)\
    723{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(3,RttV))))>>16) )); \
    724  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(1,RttV))))>>16 ))); \
    725}
    726Q6INSN(M2_mmacuhs_s0,"Rxx32+=vmpywouh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    727Q6INSN(M2_mmacuhs_s1,"Rxx32+=vmpywouh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    728
       /*
        * Rdd = vmpyweuh: non-accumulating form; each word is
        * fSAT((fSCALE(N, Rss word x even unsigned Rtt halfword)) >> 16).
        */
    729#undef mixmpy_sema
    730#define mixmpy_sema(N)\
    731{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(2,RttV))))>>16)); \
    732  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(0,RttV))))>>16)); \
    733}
    734Q6INSN(M2_mmpyul_s0,"Rdd32=vmpyweuh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    735Q6INSN(M2_mmpyul_s1,"Rdd32=vmpyweuh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    736
       /*
        * Rdd = vmpywouh: non-accumulating form using the odd unsigned Rtt
        * halfwords (3 and 1).
        */
    737#undef mixmpy_sema
    738#define mixmpy_sema(N)\
    739{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(3,RttV))))>>16)); \
    740  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(1,RttV))))>>16)); \
    741}
    742Q6INSN(M2_mmpyuh_s0,"Rdd32=vmpywouh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    743Q6INSN(M2_mmpyuh_s1,"Rdd32=vmpywouh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    744
    745
    746/* With rounding */
    747
       /*
        * Rxx += vmpyweuh with rounding: 0x8000 is added to the scaled
        * fMPY3216SU product before the >>16.  Even unsigned Rtt halfwords.
        */
    748#undef mixmpy_sema
    749#define mixmpy_sema(N)\
    750{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(2,RttV)))+0x8000)>>16)) ); \
    751  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(0,RttV)))+0x8000)>>16)) ); \
    752}
    753Q6INSN(M2_mmaculs_rs0,"Rxx32+=vmpyweuh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    754Q6INSN(M2_mmaculs_rs1,"Rxx32+=vmpyweuh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    755
       /*
        * Rxx += vmpywouh with rounding: 0x8000 is added to the scaled
        * fMPY3216SU product before the >>16.  Odd unsigned Rtt halfwords.
        */
    756#undef mixmpy_sema
    757#define mixmpy_sema(N)\
    758{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(3,RttV)))+0x8000)>>16) )); \
    759  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(1,RttV)))+0x8000)>>16 ))); \
    760}
    761Q6INSN(M2_mmacuhs_rs0,"Rxx32+=vmpywouh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    762Q6INSN(M2_mmacuhs_rs1,"Rxx32+=vmpywouh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    763
       /*
        * Rdd = vmpyweuh with rounding: each word is
        * fSAT((fSCALE(N, Rss word x even unsigned Rtt halfword) + 0x8000) >> 16).
        */
    764#undef mixmpy_sema
    765#define mixmpy_sema(N)\
    766{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(2,RttV)))+0x8000)>>16)); \
    767  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(0,RttV)))+0x8000)>>16)); \
    768}
    769Q6INSN(M2_mmpyul_rs0,"Rdd32=vmpyweuh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    770Q6INSN(M2_mmpyul_rs1,"Rdd32=vmpyweuh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    771
       /*
        * Rdd = vmpywouh with rounding: each word is
        * fSAT((fSCALE(N, Rss word x odd unsigned Rtt halfword) + 0x8000) >> 16).
        */
    772#undef mixmpy_sema
    773#define mixmpy_sema(N)\
    774{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(3,RttV)))+0x8000)>>16)); \
    775  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(1,RttV)))+0x8000)>>16)); \
    776}
    777Q6INSN(M2_mmpyuh_rs0,"Rdd32=vmpywouh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
    778Q6INSN(M2_mmpyuh_rs1,"Rdd32=vmpywouh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
    779
    780
    781/**************************************************************/
    782/* complex mac with full 64-bit accum - no sat, no shift      */
    783/* either do real or imag, never both                         */
    784/**************************************************************/
    785
       /*
        * Rxx += imaginary parts of two complex products, summed:
        * (Rss.h1 x Rtt.h0 + Rss.h0 x Rtt.h1) for the lower pair plus
        * (Rss.h3 x Rtt.h2 + Rss.h2 x Rtt.h3) for the upper pair.
        * No fSAT or fSCALE is applied.
        */
    786Q6INSN(M2_vrcmaci_s0,"Rxx32+=vrcmpyi(Rss32,Rtt32)",ATTRIBS(),"Vector Complex Mac Imaginary",
    787{
    788RxxV = RxxV + fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,RttV)) + \
    789              fMPY16SS(fGETHALF(0,RssV),fGETHALF(1,RttV)) + \
    790              fMPY16SS(fGETHALF(3,RssV),fGETHALF(2,RttV)) + \
    791              fMPY16SS(fGETHALF(2,RssV),fGETHALF(3,RttV));\
    792})
    793
       /*
        * Rxx += real parts of two complex products, summed:
        * (Rss.h0 x Rtt.h0 - Rss.h1 x Rtt.h1) + (Rss.h2 x Rtt.h2 - Rss.h3 x Rtt.h3).
        * No fSAT or fSCALE is applied.
        */
    794Q6INSN(M2_vrcmacr_s0,"Rxx32+=vrcmpyr(Rss32,Rtt32)",ATTRIBS(),"Vector Complex Mac Real",
    795{ RxxV = RxxV + fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)) - \
    796                fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)) + \
    797                fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)) - \
    798                fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
    799})
    800
       /*
        * Conjugate ("Rtt32*") form of the imaginary MAC: the terms that use
        * the odd Rtt halfwords (h1, h3) are subtracted instead of added.
        */
    801Q6INSN(M2_vrcmaci_s0c,"Rxx32+=vrcmpyi(Rss32,Rtt32*)",ATTRIBS(A_ARCHV2),"Vector Complex Mac Imaginary",
    802{
    803RxxV = RxxV + fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,RttV)) - \
    804              fMPY16SS(fGETHALF(0,RssV),fGETHALF(1,RttV)) + \
    805              fMPY16SS(fGETHALF(3,RssV),fGETHALF(2,RttV)) - \
    806              fMPY16SS(fGETHALF(2,RssV),fGETHALF(3,RttV));\
    807})
    808
       /*
        * Conjugate ("Rtt32*") form of the real MAC: all four same-index
        * halfword products (h0..h3 of Rss with h0..h3 of Rtt) are added.
        */
    809Q6INSN(M2_vrcmacr_s0c,"Rxx32+=vrcmpyr(Rss32,Rtt32*)",ATTRIBS(A_ARCHV2),"Vector Complex Mac Real",
    810{ RxxV = RxxV + fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)) + \
    811                fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)) + \
    812                fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)) + \
    813                fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
    814})
    815
       /*
        * Single complex MAC, imaginary part, on 32-bit sources:
        * Rxx += Rs.h1 x Rt.h0 + Rs.h0 x Rt.h1.
        */
    816Q6INSN(M2_cmaci_s0,"Rxx32+=cmpyi(Rs32,Rt32)",ATTRIBS(),"Vector Complex Mac Imaginary",
    817{
    818RxxV = RxxV + fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV)) + \
    819              fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV));
    820})
    821
       /*
        * Single complex MAC, real part, on 32-bit sources:
        * Rxx += Rs.h0 x Rt.h0 - Rs.h1 x Rt.h1.
        */
    822Q6INSN(M2_cmacr_s0,"Rxx32+=cmpyr(Rs32,Rt32)",ATTRIBS(),"Vector Complex Mac Real",
    823{ RxxV = RxxV + fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV)) - \
    824                fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV));
    825})
    826
    827
       /*
        * Non-accumulating counterpart of M2_vrcmaci_s0: Rdd is set to the sum
        * of the four cross products (h1 x h0, h0 x h1, h3 x h2, h2 x h3).
        */
    828Q6INSN(M2_vrcmpyi_s0,"Rdd32=vrcmpyi(Rss32,Rtt32)",ATTRIBS(),"Vector Complex Mpy Imaginary",
    829{
    830RddV = fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,RttV)) + \
    831       fMPY16SS(fGETHALF(0,RssV),fGETHALF(1,RttV)) + \
    832       fMPY16SS(fGETHALF(3,RssV),fGETHALF(2,RttV)) + \
    833       fMPY16SS(fGETHALF(2,RssV),fGETHALF(3,RttV));\
    834})
    835
       /*
        * Non-accumulating counterpart of M2_vrcmacr_s0:
        * Rdd = (h0 x h0 - h1 x h1) + (h2 x h2 - h3 x h3), Rss halfword times
        * the same-index Rtt halfword.
        */
    836Q6INSN(M2_vrcmpyr_s0,"Rdd32=vrcmpyr(Rss32,Rtt32)",ATTRIBS(),"Vector Complex Mpy Real",
    837{ RddV = fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)) - \
    838         fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)) + \
    839         fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)) - \
    840         fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
    841})
    842
       /*
        * Conjugate ("Rtt32*") imaginary multiply: the terms using the odd Rtt
        * halfwords (h1, h3) are subtracted.
        */
    843Q6INSN(M2_vrcmpyi_s0c,"Rdd32=vrcmpyi(Rss32,Rtt32*)",ATTRIBS(A_ARCHV2),"Vector Complex Mpy Imaginary",
    844{
    845RddV = fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,RttV)) - \
    846       fMPY16SS(fGETHALF(0,RssV),fGETHALF(1,RttV)) + \
    847       fMPY16SS(fGETHALF(3,RssV),fGETHALF(2,RttV)) - \
    848       fMPY16SS(fGETHALF(2,RssV),fGETHALF(3,RttV));\
    849})
    850
       /*
        * Conjugate ("Rtt32*") real multiply: Rdd is the sum of all four
        * same-index halfword products.
        */
    851Q6INSN(M2_vrcmpyr_s0c,"Rdd32=vrcmpyr(Rss32,Rtt32*)",ATTRIBS(A_ARCHV2),"Vector Complex Mpy Real",
    852{ RddV = fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)) + \
    853         fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)) + \
    854         fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)) + \
    855         fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
    856})
    857
       /*
        * Single complex multiply, imaginary part:
        * Rdd = Rs.h1 x Rt.h0 + Rs.h0 x Rt.h1.
        */
    858Q6INSN(M2_cmpyi_s0,"Rdd32=cmpyi(Rs32,Rt32)",ATTRIBS(),"Vector Complex Mpy Imaginary",
    859{
    860RddV = fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV)) + \
    861       fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV));
    862})
    863
       /*
        * Single complex multiply, real part:
        * Rdd = Rs.h0 x Rt.h0 - Rs.h1 x Rt.h1.
        */
    864Q6INSN(M2_cmpyr_s0,"Rdd32=cmpyr(Rs32,Rt32)",ATTRIBS(),"Vector Complex Mpy Real",
    865{ RddV = fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV)) - \
    866         fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV));
    867})
    868
    869
    870/**************************************************************/
    871/* Complex mpy/mac with 2x32 bit accum, sat, shift            */
    872/* 32x16 real or imag                                         */
    873/**************************************************************/
    874
    875#if 1
    876
       /*
        * Imaginary part of a 32-bit complex (Rss words) times a 16-bit complex
        * (Rt halfwords): Rd = fSAT((Rss.w0 x Rt.h1 + Rss.w1 x Rt.h0 + 0x4000) >> 15).
        * NOTE(review): the +0x4000 / >>15 pair presumably folds the ":<<1:rnd"
        * of the syntax string into one rounded downshift - confirm against
        * the architecture manual.
        */
    877Q6INSN(M4_cmpyi_wh,"Rd32=cmpyiwh(Rss32,Rt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Complex Multiply",
    878{
    879 RdV = fSAT(  (  fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RtV))
    880               + fMPY3216SS(fGETWORD(1,RssV),fGETHALF(0,RtV))
    881               + 0x4000)>>15);
    882})
    883
    884
       /*
        * Real part of the 32x16 complex multiply:
        * Rd = fSAT((Rss.w0 x Rt.h0 - Rss.w1 x Rt.h1 + 0x4000) >> 15).
        */
    885Q6INSN(M4_cmpyr_wh,"Rd32=cmpyrwh(Rss32,Rt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Complex Multiply",
    886{
    887 RdV = fSAT(  (  fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RtV))
    888               - fMPY3216SS(fGETWORD(1,RssV),fGETHALF(1,RtV))
    889               + 0x4000)>>15);
    890})
    891
       /*
        * Conjugate ("Rt32*") imaginary part of the 32x16 complex multiply:
        * Rd = fSAT((Rss.w1 x Rt.h0 - Rss.w0 x Rt.h1 + 0x4000) >> 15).
        */
    892Q6INSN(M4_cmpyi_whc,"Rd32=cmpyiwh(Rss32,Rt32*):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Complex Multiply",
    893{
    894 RdV = fSAT(  (  fMPY3216SS(fGETWORD(1,RssV),fGETHALF(0,RtV))
    895               - fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RtV))
    896               + 0x4000)>>15);
    897})
    898
    899
       /*
        * Conjugate ("Rt32*") real part of the 32x16 complex multiply:
        * Rd = fSAT((Rss.w0 x Rt.h0 + Rss.w1 x Rt.h1 + 0x4000) >> 15).
        */
    900Q6INSN(M4_cmpyr_whc,"Rd32=cmpyrwh(Rss32,Rt32*):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Complex Multiply",
    901{
    902 RdV = fSAT(  (  fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RtV))
    903               + fMPY3216SS(fGETWORD(1,RssV),fGETHALF(1,RtV))
    904               + 0x4000)>>15);
    905})
    906
    907
    908#endif
    909
    910/**************************************************************/
    911/* Vector mpy/mac with 2x32 bit accum, sat, shift             */
    912/* either do real or imag,  never both                        */
    913/**************************************************************/
    914
       /*
        * VCMPYSEMI: imaginary-part semantics for vcmpyi.
        *   DST        destination register pair (RddV or RxxV)
        *   ACC0/ACC1  text pasted in front of the scaled product for word 0 /
        *              word 1; empty for plain multiplies, "fGETWORD(n,RxxV) + "
        *              for accumulating forms
        *   SHIFT      forwarded to fSCALE
        *   SAT        saturation macro applied to each word (fSAT below)
        * Word 0 uses Rss.h1 x Rtt.h0 + Rss.h0 x Rtt.h1; word 1 uses
        * Rss.h3 x Rtt.h2 + Rss.h2 x Rtt.h3.  The definition ends with a
        * trailing line continuation.
        */
    915#undef VCMPYSEMI
    916#define VCMPYSEMI(DST,ACC0,ACC1,SHIFT,SAT) \
    917    fSETWORD(0,DST,SAT(ACC0 fSCALE(SHIFT,fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,RttV)) + \
    918        fMPY16SS(fGETHALF(0,RssV),fGETHALF(1,RttV))))); \
    919    fSETWORD(1,DST,SAT(ACC1 fSCALE(SHIFT,fMPY16SS(fGETHALF(3,RssV),fGETHALF(2,RttV)) + \
    920        fMPY16SS(fGETHALF(2,RssV),fGETHALF(3,RttV))))); \
    921
       /*
        * VCMPYSEMR: real-part semantics for vcmpyr; same parameters as
        * VCMPYSEMI.  Word 0 uses Rss.h0 x Rtt.h0 - Rss.h1 x Rtt.h1; word 1
        * uses Rss.h2 x Rtt.h2 - Rss.h3 x Rtt.h3.  The difference is scaled by
        * SHIFT, prefixed with ACC0/ACC1 and wrapped in SAT.
        */
    922#undef VCMPYSEMR
    923#define VCMPYSEMR(DST,ACC0,ACC1,SHIFT,SAT) \
    924    fSETWORD(0,DST,SAT(ACC0 fSCALE(SHIFT,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)) - \
    925        fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV))))); \
    926    fSETWORD(1,DST,SAT(ACC1 fSCALE(SHIFT,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)) - \
    927        fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV))))); \
    928
    929
       /*
        * VCMPYIR: emit the imaginary (tag suffix "i", vcmpyi) and real (tag
        * suffix "r", vcmpyr) instruction for one flavour of vector complex
        * multiply.
        *   TAGBASE            tag stem; instructions are M2_<TAGBASE>i / M2_<TAGBASE>r
        *   DSTSYN, DSTVAL     destination as syntax text and as semantic operand
        *   ACCSEM             "=" or "+=" in the syntax string
        *   ACCVAL0, ACCVAL1   accumulator prefix text for word 0 / word 1
        *                      (empty for the non-accumulating forms)
        *   SHIFTSYN, SHIFTVAL shift as syntax text and as fSCALE argument
        *   SATSYN, SATVAL     saturation as syntax text and as semantic macro
        * The description of the "r" instruction reads "Real"; it used to repeat
        * the "Imaginary" text of the "i" instruction.
        */
    930#undef VCMPYIR
    931#define VCMPYIR(TAGBASE,DSTSYN,DSTVAL,ACCSEM,ACCVAL0,ACCVAL1,SHIFTSYN,SHIFTVAL,SATSYN,SATVAL) \
    932Q6INSN(M2_##TAGBASE##i,DSTSYN ACCSEM "vcmpyi(Rss32,Rtt32)" SHIFTSYN SATSYN,ATTRIBS(A_ARCHV2), \
    933    "Vector Complex Multiply Imaginary", { VCMPYSEMI(DSTVAL,ACCVAL0,ACCVAL1,SHIFTVAL,SATVAL); }) \
    934Q6INSN(M2_##TAGBASE##r,DSTSYN ACCSEM "vcmpyr(Rss32,Rtt32)" SHIFTSYN SATSYN,ATTRIBS(A_ARCHV2), \
    935    "Vector Complex Multiply Real", { VCMPYSEMR(DSTVAL,ACCVAL0,ACCVAL1,SHIFTVAL,SATVAL); })
    936
    937
    938VCMPYIR(vcmpy_s0_sat_,"Rdd32",RddV,"=",,,"",0,":sat",fSAT)
    939VCMPYIR(vcmpy_s1_sat_,"Rdd32",RddV,"=",,,":<<1",1,":sat",fSAT)
    940VCMPYIR(vcmac_s0_sat_,"Rxx32",RxxV,"+=",fGETWORD(0,RxxV) + ,fGETWORD(1,RxxV) + ,"",0,":sat",fSAT)
    941
    942
    943/**********************************************************************
    944 *  Rotation  -- by 0, 90, 180, or 270 means mult by 1, J, -1, -J     *
    945 *********************************************************************/
    946
       /*
        * Rotates the two complex values in Rss (halfwords 0/1 and 2/3)
        * independently.  The selector for the lower pair is
        * fEXTRACTU_RANGE(RtV,1,0) and for the upper pair
        * fEXTRACTU_RANGE(RtV,3,2).  Selector 0 copies, 1 and 2 swap the two
        * halfwords and negate one of them, 3 negates both; every negated
        * halfword goes through fSATH.  The hidden "tmp != 3" checks guard the
        * final else branch - presumably unreachable for a 2-bit field.
        */
    947Q6INSN(S2_vcrotate,"Rdd32=vcrotate(Rss32,Rt32)",ATTRIBS(A_ARCHV2),"Rotate complex value by multiple of PI/2",
    948{
    949    fHIDE(size1u_t tmp;)
    950    tmp = fEXTRACTU_RANGE(RtV,1,0);
    951    if (tmp == 0) { /* No rotation */
    952        fSETHALF(0,RddV,fGETHALF(0,RssV));
    953        fSETHALF(1,RddV,fGETHALF(1,RssV));
    954    } else if (tmp == 1) { /* Multiply by -J */
    955        fSETHALF(0,RddV,fGETHALF(1,RssV));
    956        fSETHALF(1,RddV,fSATH(-fGETHALF(0,RssV)));
    957    } else if (tmp == 2) { /* Multiply by J */
    958        fSETHALF(0,RddV,fSATH(-fGETHALF(1,RssV)));
    959        fSETHALF(1,RddV,fGETHALF(0,RssV));
    960    } else { /* Multiply by -1 */
    961        fHIDE(if (tmp != 3) fatal("C is broken");)
    962        fSETHALF(0,RddV,fSATH(-fGETHALF(0,RssV)));
    963        fSETHALF(1,RddV,fSATH(-fGETHALF(1,RssV)));
    964    }
    965    tmp = fEXTRACTU_RANGE(RtV,3,2);
    966    if (tmp == 0) { /* No rotation */
    967        fSETHALF(2,RddV,fGETHALF(2,RssV));
    968        fSETHALF(3,RddV,fGETHALF(3,RssV));
    969    } else if (tmp == 1) { /* Multiply by -J */
    970        fSETHALF(2,RddV,fGETHALF(3,RssV));
    971        fSETHALF(3,RddV,fSATH(-fGETHALF(2,RssV)));
    972    } else if (tmp == 2) { /* Multiply by J */
    973        fSETHALF(2,RddV,fSATH(-fGETHALF(3,RssV)));
    974        fSETHALF(3,RddV,fGETHALF(2,RssV));
    975    } else { /* Multiply by -1 */
    976        fHIDE(if (tmp != 3) fatal("C is broken");)
    977        fSETHALF(2,RddV,fSATH(-fGETHALF(2,RssV)));
    978        fSETHALF(3,RddV,fSATH(-fGETHALF(3,RssV)));
    979    }
    980})
    981
    982
       /*
        * Accumulating rotate-and-reduce over the eight bytes of Rss, treated
        * as four (real, imag) pairs at byte indices (i, i+1), i = 0,2,4,6.
        * control is the byte of Rt selected by the #u2 immediate; its low two
        * bits pick the rotation for the current pair and it is shifted right
        * by two after each pair.  The rotated parts are summed into
        * sumr/sumi, which are then added to words 0 and 1 of Rxx.
        * No saturation is applied.
        */
    983Q6INSN(S4_vrcrotate_acc,"Rxx32+=vrcrotate(Rss32,Rt32,#u2)",ATTRIBS(),"Rotate and Reduce Bytes",
    984{
    985    fHIDE(int i; int tmpr; int tmpi; unsigned int control;)
    986    fHIDE(int sumr; int sumi;)
    987    sumr = 0;
    988    sumi = 0;
    989    control = fGETUBYTE(uiV,RtV);
    990    for (i = 0; i < 8; i += 2) {
    991        tmpr = fGETBYTE(i  ,RssV);
    992        tmpi = fGETBYTE(i+1,RssV);
    993        switch (control & 3) {
    994        case 0: /* No Rotation */
    995            sumr += tmpr;
    996            sumi += tmpi;
    997            break;
    998        case 1: /* Multiply by -J */
    999            sumr += tmpi;
   1000            sumi -= tmpr;
   1001            break;
   1002        case 2: /* Multiply by J */
   1003            sumr -= tmpi;
   1004            sumi += tmpr;
   1005            break;
   1006        case 3: /* Multiply by -1 */
   1007            sumr -= tmpr;
   1008            sumi -= tmpi;
   1009            break;
   1010        fHIDE(default: fatal("C is broken!");)
   1011        }
   1012        control = control >> 2;
   1013    }
   1014    fSETWORD(0,RxxV,fGETWORD(0,RxxV) + sumr);
   1015    fSETWORD(1,RxxV,fGETWORD(1,RxxV) + sumi);
   1016})
   1017
       /*
        * Non-accumulating rotate-and-reduce: same loop as the accumulating
        * form (four byte pairs of Rss, two control bits per pair taken from
        * the Rt byte selected by #u2), but the sums are written directly to
        * word 0 (sumr) and word 1 (sumi) of Rdd.
        */
   1018Q6INSN(S4_vrcrotate,"Rdd32=vrcrotate(Rss32,Rt32,#u2)",ATTRIBS(),"Rotate and Reduce Bytes",
   1019{
   1020    fHIDE(int i; int tmpr; int tmpi; unsigned int control;)
   1021    fHIDE(int sumr; int sumi;)
   1022    sumr = 0;
   1023    sumi = 0;
   1024    control = fGETUBYTE(uiV,RtV);
   1025    for (i = 0; i < 8; i += 2) {
   1026        tmpr = fGETBYTE(i  ,RssV);
   1027        tmpi = fGETBYTE(i+1,RssV);
   1028        switch (control & 3) {
   1029        case 0: /* No Rotation */
   1030            sumr += tmpr;
   1031            sumi += tmpi;
   1032            break;
   1033        case 1: /* Multiply by -J */
   1034            sumr += tmpi;
   1035            sumi -= tmpr;
   1036            break;
   1037        case 2: /* Multiply by J */
   1038            sumr -= tmpi;
   1039            sumi += tmpr;
   1040            break;
   1041        case 3: /* Multiply by -1 */
   1042            sumr -= tmpr;
   1043            sumi -= tmpi;
   1044            break;
   1045        fHIDE(default: fatal("C is broken!");)
   1046        }
   1047        control = control >> 2;
   1048    }
   1049    fSETWORD(0,RddV,sumr);
   1050    fSETWORD(1,RddV,sumi);
   1051})
   1052
   1053
       /*
        * For each of the four halfwords i of Rss: if bit i of Rt is set, the
        * halfword is negated and passed through fSATH, otherwise it is copied
        * unchanged into halfword i of Rdd.
        */
   1054Q6INSN(S2_vcnegh,"Rdd32=vcnegh(Rss32,Rt32)",ATTRIBS(),"Conditional Negate halfwords",
   1055{
   1056    fHIDE(int i;)
   1057    for (i = 0; i < 4; i++) {
   1058        if (fGETBIT(i,RtV)) {
   1059            fSETHALF(i,RddV,fSATH(-fGETHALF(i,RssV)));
   1060        } else {
   1061            fSETHALF(i,RddV,fGETHALF(i,RssV));
   1062        }
   1063    }
   1064})
   1065
       /*
        * Reducing variant: each of the four halfwords of Rss is added to Rxx,
        * negated first when bit i of Rt is set.  The whole of Rxx is the
        * accumulator and no saturation is applied.
        */
   1066Q6INSN(S2_vrcnegh,"Rxx32+=vrcnegh(Rss32,Rt32)",ATTRIBS(),"Vector Reduce Conditional Negate halfwords",
   1067{
   1068    fHIDE(int i;)
   1069    for (i = 0; i < 4; i++) {
   1070        if (fGETBIT(i,RtV)) {
   1071            RxxV += -fGETHALF(i,RssV);
   1072        } else {
   1073            RxxV += fGETHALF(i,RssV);
   1074        }
   1075    }
   1076})
   1077
   1078
   1079/**********************************************************************
   1080 *  Finite-field multiplies.  Written by David Hoyle                  *
   1081 *********************************************************************/
   1082
       /*
        * Shift-and-XOR polynomial multiply of Rs by Rt: for every set bit i
        * of y (Rt), x (Rs) shifted left by i is XORed into prod.  x and prod
        * are declared unsigned long long, so the shifted terms are not
        * truncated to 32 bits.  The result is written to Rdd.
        */
   1083Q6INSN(M4_pmpyw,"Rdd32=pmpyw(Rs32,Rt32)",ATTRIBS(),"Polynomial 32bit Multiplication with Addition in GF(2)",
   1084{
   1085        fHIDE(int i; unsigned int y;)
   1086        fHIDE(unsigned long long x; unsigned long long prod;)
   1087        x = fGETUWORD(0, RsV);
   1088        y = fGETUWORD(0, RtV);
   1089
   1090        prod = 0;
   1091        for(i=0; i < 32; i++) {
   1092            if((y >> i) & 1) prod ^= (x << i);
   1093        }
   1094        RddV = prod;
   1095})
   1096
       /*
        * Two independent 16-bit shift-and-XOR polynomial multiplies:
        * prod0 = Rs.h0 (x) Rt.h0 and prod1 = Rs.h1 (x) Rt.h1.
        * The results are interleaved into Rdd: halfwords 0 and 2 receive the
        * low and high halves of prod0, halfwords 1 and 3 those of prod1.
        */
   1097Q6INSN(M4_vpmpyh,"Rdd32=vpmpyh(Rs32,Rt32)",ATTRIBS(),"Dual Polynomial 16bit Multiplication with Addition in GF(2)",
   1098{
   1099        fHIDE(int i; unsigned int x0; unsigned int x1;)
   1100        fHIDE(unsigned int y0; unsigned int y1;)
   1101        fHIDE(unsigned int prod0; unsigned int prod1;)
   1102
   1103        x0 = fGETUHALF(0, RsV);
   1104        x1 = fGETUHALF(1, RsV);
   1105        y0 = fGETUHALF(0, RtV);
   1106        y1 = fGETUHALF(1, RtV);
   1107
   1108        prod0 = prod1 = 0;
   1109        for(i=0; i < 16; i++) {
   1110            if((y0 >> i) & 1) prod0 ^= (x0 << i);
   1111            if((y1 >> i) & 1) prod1 ^= (x1 << i);
   1112        }
   1113        fSETHALF(0,RddV,fGETUHALF(0,prod0));
   1114        fSETHALF(1,RddV,fGETUHALF(0,prod1));
   1115        fSETHALF(2,RddV,fGETUHALF(1,prod0));
   1116        fSETHALF(3,RddV,fGETUHALF(1,prod1));
   1117})
   1118
       /*
        * Accumulating polynomial multiply: computes the same shift-and-XOR
        * product of Rs and Rt as M4_pmpyw and XORs it into Rxx.
        */
   1119Q6INSN(M4_pmpyw_acc,"Rxx32^=pmpyw(Rs32,Rt32)",ATTRIBS(),"Polynomial 32bit Multiplication with Addition in GF(2)",
   1120{
   1121        fHIDE(int i; unsigned int y;)
   1122        fHIDE(unsigned long long x; unsigned long long prod;)
   1123        x = fGETUWORD(0, RsV);
   1124        y = fGETUWORD(0, RtV);
   1125
   1126        prod = 0;
   1127        for(i=0; i < 32; i++) {
   1128            if((y >> i) & 1) prod ^= (x << i);
   1129        }
   1130        RxxV ^= prod;
   1131})
   1132
       /*
        * Accumulating dual 16-bit polynomial multiply: computes prod0/prod1
        * as in M4_vpmpyh and XORs them into Rxx halfword by halfword, with
        * the same interleaving (prod0 -> halfwords 0 and 2, prod1 ->
        * halfwords 1 and 3).
        */
   1133Q6INSN(M4_vpmpyh_acc,"Rxx32^=vpmpyh(Rs32,Rt32)",ATTRIBS(),"Dual Polynomial 16bit Multiplication with Addition in GF(2)",
   1134{
   1135        fHIDE(int i; unsigned int x0; unsigned int x1;)
   1136        fHIDE(unsigned int y0; unsigned int y1;)
   1137        fHIDE(unsigned int prod0; unsigned int prod1;)
   1138
   1139        x0 = fGETUHALF(0, RsV);
   1140        x1 = fGETUHALF(1, RsV);
   1141        y0 = fGETUHALF(0, RtV);
   1142        y1 = fGETUHALF(1, RtV);
   1143
   1144        prod0 = prod1 = 0;
   1145        for(i=0; i < 16; i++) {
   1146            if((y0 >> i) & 1) prod0 ^= (x0 << i);
   1147            if((y1 >> i) & 1) prod1 ^= (x1 << i);
   1148        }
   1149        fSETHALF(0,RxxV,fGETUHALF(0,RxxV) ^ fGETUHALF(0,prod0));
   1150        fSETHALF(1,RxxV,fGETUHALF(1,RxxV) ^ fGETUHALF(0,prod1));
   1151        fSETHALF(2,RxxV,fGETUHALF(2,RxxV) ^ fGETUHALF(1,prod0));
   1152        fSETHALF(3,RxxV,fGETUHALF(3,RxxV) ^ fGETUHALF(1,prod1));
   1153})
   1154
   1155
   1156/* V70: TINY CORE */
   1157
       /*
        * CMPY64: emit M7_<TAG> (Rdd = ...) and M7_<TAG>_acc (Rxx += ...).
        * The value is fMPY32SS(Rss.w<W0>, Rtt.w<W1>) OP fMPY32SS(Rss.w<W2>, Rtt.w<W3>).
        *   NAME      mnemonic used in the syntax string
        *   DESC      suffix appended to the description ("Real" / "Imag")
        *   OPERAND1  second operand as syntax text ("Rtt32" or "Rtt32*")
        *   OP        + or - joining the two products
        * Both instructions carry A_RESTRICT_SLOT3ONLY.
        */
   1158#define CMPY64(TAG,NAME,DESC,OPERAND1,OP,W0,W1,W2,W3) \
   1159Q6INSN(M7_##TAG,"Rdd32=" NAME "(Rss32," OPERAND1 ")",ATTRIBS(A_RESTRICT_SLOT3ONLY),"Complex Multiply 64-bit " DESC,    { RddV  = (fMPY32SS(fGETWORD(W0, RssV), fGETWORD(W1, RttV)) OP fMPY32SS(fGETWORD(W2, RssV), fGETWORD(W3, RttV)));})\
   1160Q6INSN(M7_##TAG##_acc,"Rxx32+=" NAME "(Rss32,"OPERAND1")",ATTRIBS(A_RESTRICT_SLOT3ONLY),"Complex Multiply-Accumulate 64-bit " DESC, { RxxV += (fMPY32SS(fGETWORD(W0, RssV), fGETWORD(W1, RttV)) OP fMPY32SS(fGETWORD(W2, RssV), fGETWORD(W3, RttV)));})
   1161
   1162CMPY64(dcmpyrw, "cmpyrw","Real","Rtt32" ,-,0,0,1,1)
   1163CMPY64(dcmpyrwc,"cmpyrw","Real","Rtt32*",+,0,0,1,1)
   1164CMPY64(dcmpyiw, "cmpyiw","Imag","Rtt32" ,+,0,1,1,0)
   1165CMPY64(dcmpyiwc,"cmpyiw","Imag","Rtt32*",-,1,0,0,1)
   1166
       /*
        * CMPY128: emit M7_<TAG>, a 32-bit-result complex multiply with
        * ":<<1:sat".  The two 32x32 products are widened with fCAST8S_16S,
        * combined as OP(first, second) (fADD128 or fSUB128), shifted right by
        * 31 with fSHIFTR128, narrowed with fCAST16S_8S and passed through
        * fSATW into Rd.  WORD0..WORD3 select the Rss/Rtt words of the two
        * products.
        * NOTE(review): the description string says "result real" for every
        * instantiation, including the cmpyiw ones.
        */
   1167#define CMPY128(TAG, NAME, OPERAND1, WORD0, WORD1, WORD2, WORD3, OP) \
   1168Q6INSN(M7_##TAG,"Rd32=" NAME "(Rss32,"OPERAND1"):<<1:sat",ATTRIBS(A_RESTRICT_SLOT3ONLY),"Complex Multiply 32-bit result real",  \
   1169{ \
   1170fHIDE(size16s_t acc128;)\
   1171fHIDE(size16s_t tmp128;)\
   1172fHIDE(size8s_t acc64;)\
   1173tmp128 = fCAST8S_16S(fMPY32SS(fGETWORD(WORD0, RssV), fGETWORD(WORD1, RttV)));\
   1174acc128 = fCAST8S_16S(fMPY32SS(fGETWORD(WORD2, RssV), fGETWORD(WORD3, RttV)));\
   1175acc128 = OP(tmp128,acc128);\
   1176acc128 = fSHIFTR128(acc128, 31);\
   1177acc64 =  fCAST16S_8S(acc128);\
   1178RdV = fSATW(acc64);\
   1179})
   1180
   1181
   1182CMPY128(wcmpyrw, "cmpyrw", "Rtt32", 0, 0, 1, 1, fSUB128)
   1183CMPY128(wcmpyrwc, "cmpyrw", "Rtt32*", 0, 0, 1, 1, fADD128)
   1184CMPY128(wcmpyiw, "cmpyiw", "Rtt32", 0, 1, 1, 0, fADD128)
   1185CMPY128(wcmpyiwc, "cmpyiw", "Rtt32*", 1, 0, 0, 1, fSUB128)
   1186
   1187
       /*
        * CMPY128RND: rounding variant of CMPY128, emitted as M7_<TAG>_rnd
        * with ":<<1:rnd:sat".  Identical data flow, except that the constant
        * 0x40000000 (widened to 128 bits) is added with fADD128 after the
        * products are combined and before the right shift by 31.
        * NOTE(review): the description string says "result real" for every
        * instantiation, including the cmpyiw ones.
        */
   1188#define CMPY128RND(TAG, NAME, OPERAND1, WORD0, WORD1, WORD2, WORD3, OP) \
   1189Q6INSN(M7_##TAG##_rnd,"Rd32=" NAME "(Rss32,"OPERAND1"):<<1:rnd:sat",ATTRIBS(A_RESTRICT_SLOT3ONLY),"Complex Multiply 32-bit result real",  \
   1190{ \
   1191fHIDE(size16s_t acc128;)\
   1192fHIDE(size16s_t tmp128;)\
   1193fHIDE(size16s_t const128;)\
   1194fHIDE(size8s_t acc64;)\
   1195tmp128 = fCAST8S_16S(fMPY32SS(fGETWORD(WORD0, RssV), fGETWORD(WORD1, RttV)));\
   1196acc128 = fCAST8S_16S(fMPY32SS(fGETWORD(WORD2, RssV), fGETWORD(WORD3, RttV)));\
   1197const128 = fCAST8S_16S(fCONSTLL(0x40000000));\
   1198acc128 = OP(tmp128,acc128);\
   1199acc128 = fADD128(acc128,const128);\
   1200acc128 = fSHIFTR128(acc128, 31);\
   1201acc64 =  fCAST16S_8S(acc128);\
   1202RdV = fSATW(acc64);\
   1203})
   1204
   1205CMPY128RND(wcmpyrw, "cmpyrw", "Rtt32", 0, 0, 1, 1, fSUB128)
   1206CMPY128RND(wcmpyrwc, "cmpyrw", "Rtt32*", 0, 0, 1, 1, fADD128)
   1207CMPY128RND(wcmpyiw, "cmpyiw", "Rtt32", 0, 1, 1, 0, fADD128)
   1208CMPY128RND(wcmpyiwc, "cmpyiw", "Rtt32*", 1, 0, 0, 1, fSUB128)