/* Dot product of the two pairs of signed 16-bit values in src1 and src2
 * (the two products are added). The result is written as a signed 32-bit int
 * (_dotp2) or sign-extended into the wider result (_ldotp2). The original note
 * says "64 bits"; the return type declared here is __int40_t. */
int _dotp2 (int , int);
__int40_t _ldotp2 (int, int);
/* Dot product of the signed 16-bit values in src1 and src2 in which the two
 * products are subtracted instead of added.
 * NOTE(review): presumably high-half product minus low-half product, as noted
 * for _dotpnrsu2 -- confirm. */
int _dotpn2 (int, int);
/* Product of the high 16-bit halves of src1 and src2 MINUS the product of the
 * low 16-bit halves. The halves of src1 are treated as signed, the halves of
 * src2 as unsigned. 2^15 is then added and the result is shifted right by
 * 16 bits with sign extension. */
int _dotpnrsu2 (int src1, unsigned src2);
/* Product of the high 16-bit halves of src1 and src2 PLUS the product of the
 * low 16-bit halves. The halves of src1 are treated as signed, the halves of
 * src2 as unsigned. 2^15 is then added and the result is shifted right by
 * 16 bits with sign extension. */
int _dotprsu2 (int, unsigned);
/* Multiplies the four pairs of 8-bit values in src1 and src2 and sums the
 * products. Each 8-bit value of src1 is treated as signed, each 8-bit value of
 * src2 as unsigned. */
int _dotpsu4 (int, unsigned);
unsigned _dotpu4 (unsigned, unsigned);//Same as above, but all values are treated as unsigned
/* Galois-field multiplication of the four pairs of unsigned 8-bit values in
 * src1 and src2. */
int _gmpy4 (int, int);
/* Compares the two pairs of signed 16-bit values in src1 and src2 and takes
 * the larger value of each pair. */
int _max2 (int, int);
/* Compares the four pairs of unsigned 8-bit values in src1 and src2 and takes
 * the larger value of each pair. */
unsigned _maxu4 (unsigned, unsigned);
/* Compares the two pairs of signed 16-bit values in src1 and src2 and takes
 * the smaller value of each pair. */
int _min2 (int, int);
/* Compares the four pairs of unsigned 8-bit values in src1 and src2 and takes
 * the smaller value of each pair. */
unsigned _minu4 (unsigned, unsigned);
/* Multiplies the two pairs of signed 16-bit values in src1 and src2
 * pair by pair; the two 32-bit products are written into the long long. */
long long _mpy2ll (int, int);
/* Multiplies the HIGH 16 bits of src1, taken as one signed 16-bit value, by
 * the signed 32-bit value src2. The result is written to the low 48 bits of
 * the long long. */
long long _mpyhill (int src1, int src2);
/* Multiplies the LOW 16 bits of src1, taken as one signed 16-bit value, by
 * the signed 32-bit value src2. The result is written to the low 48 bits of
 * the long long. */
long long _mpylill (int, int);
/* Multiplies the HIGH 16 bits of src1, taken as one signed 16-bit value, by
 * the signed 32-bit value src2. The product is reduced to 32 bits in round
 * mode: 2^14 is added and the sum is then shifted right by 15 bits. */
int _mpyhir (int, int);
/* Multiplies the LOW 16 bits of src1, taken as one signed 16-bit value, by
 * the signed 32-bit value src2. The product is reduced to 32 bits in round
 * mode: 2^14 is added and the sum is then shifted right by 15 bits. */
int _mpylir (int, int);
/* Multiplies the four signed 8-bit values of src1 by the four unsigned 8-bit
 * values of src2, giving four signed 16-bit products packed into 64 bits. */
long long _mpysu4ll (int src1, unsigned src2);
long long _mpyu4ll (unsigned, unsigned);//Same as above, but all values are unsigned
/* Performs two operations in parallel:
 *   1. src1 + src2 -> dst_o
 *   2. src1 - src2 -> dst_e
 * dst_o denotes the odd register and dst_e the even register. */
long long _addsub (int src1, int src2);
/* Same parallel add/subtract scheme as _addsub, but the operands are split
 * into two pairs of signed 16-bit values and ADD2 and SUB2 are performed. */
long long _addsub2 (unsigned, unsigned);
/* *
* The saturated sum of the cross products (low 16 bits of src1 times high
* 16 bits of src2, plus high 16 bits of src1 times low 16 bits of src2) is
* assigned to dst_e:
* sat((lsb16(src1) × msb16(src2)) + (msb16(src1) × lsb16(src2))) → dst_e
* With the halves taken as signed 16-bit values, the product of the high
* 16 bits of src1 and src2 minus the product of their low 16 bits is
* assigned to dst_o:
* (msb16(src1) × msb16(src2)) - (lsb16(src1) × lsb16(src2)) → dst_o
* */
long long _cmpy (unsigned src1, unsigned src2);
/* *
* Both saturated 32-bit intermediates are rounded by adding 0x8000 and only
* their upper 16 bits are kept, packed into a single 32-bit result:
* sat((lsb16(src1) × msb16(src2)) + (msb16(src1) × lsb16(src2))) → tmp_e
* msb16(sat(tmp_e + 00008000h)) → lsb16(dst)
* sat((msb16(src1) × msb16(src2)) - (lsb16(src1) × lsb16(src2))) → tmp_o
* msb16(sat(tmp_o + 00008000h)) → msb16(dst)
* */
unsigned _cmpyr (unsigned, unsigned);
/* *
* Both saturated 32-bit intermediates are rounded by adding 0x4000, shifted
* left by one bit with saturation, and only their upper 16 bits are kept,
* packed into a single 32-bit result:
* sat((lsb16(src1) × msb16(src2)) + (msb16(src1) × lsb16(src2))) → tmp_e
* msb16(sat((tmp_e + 00004000h) << 1)) → lsb16(dst)
* sat((msb16(src1) × msb16(src2)) - (lsb16(src1) × lsb16(src2))) → tmp_o
* msb16(sat((tmp_o + 00004000h) << 1)) → msb16(dst)
* NOTE(review): the original note repeated tmp_e on the last formula line,
* leaving tmp_o unused; tmp_o is assumed here by analogy with the _cmpyr
* formulas -- confirm against the TI instruction reference.
* */
unsigned _cmpyr1 (unsigned, unsigned);
/* Dot-product-and-sum operations; see the diagram on page 35 of the
 * TMS320C6000 intrinsics (inline assembly) document for the data flow.
 * NOTE(review): the original note does not say how the individual variants
 * differ -- check that document before relying on any one of them. */
long long _ddotph2 (long long, unsigned);
unsigned _ddotph2r (long long, unsigned);
long long _ddotpl2 (long long, unsigned);
unsigned _ddotpl2r (long long, unsigned);
long long _ddotp4 (unsigned src1, unsigned src2);
/* Data packing; see the diagram on page 37 (of the same TMS320C6000
 * intrinsics document referenced elsewhere in these notes). */
long long _dpack2 (unsigned src1, unsigned src2);
long long _dpackx2 (unsigned, unsigned);
/* Moves two registers into one (wider) register in a single operation. */
long long _dmv (unsigned, unsigned);
double _fdmv (float, float);
/* Multiplication over a Galois field. */
unsigned _gmpy (unsigned, unsigned);
/* 32-bit by 32-bit multiply, both operands signed. The full 64-bit result is
 * written to dst. */
long long _mpy32ll (int, int);
/* 32-bit by 32-bit multiply, both operands signed. Only the low 32 bits of
 * the 64-bit result are written to dst. */
int _mpy32 (int, int);
/* Signed 32-bit src1 multiplied by unsigned 32-bit src2, giving a signed
 * 64-bit result. */
long long _mpy32su (int, unsigned);
long long _mpy32us (unsigned, int); //Same as above, but unsigned times signed
long long _mpy32u (unsigned, unsigned);//Same again, but both operands unsigned
/* *
 * Two 16-bit by 32-bit multiplies. The high and the low 16 bits of src1 are
 * each treated as a signed 16-bit value; src2 is treated as a signed 32-bit
 * value. Each product is rounded to 32 bits by adding 2^14 and then shifting
 * right by 15 bits. The low 32 bits of the two results are written to
 * dst_o:dst_e.
 * */
long long _mpy2ir (unsigned src1, int src2);
/* Packs the high 16 bits of src1 and of src2 into one 32-bit result after a
 * one-bit shift with saturation has been applied to each source.
 * NOTE(review): the original note describes the shift as a RIGHT shift by 1;
 * the TI instruction reference for RPACK2 describes a saturating LEFT shift
 * by 1 followed by taking the upper 16 bits -- confirm before relying on it. */
unsigned _rpack2 (unsigned src1, unsigned src2);
/* Performs two saturating operations in parallel:
 *   1. sat(src1 + src2) -> dst_o
 *   2. sat(src1 - src2) -> dst_e */
long long _saddsub (int, int);
long long _saddsub2 (unsigned, unsigned);//Performs the SADD2 and SSUB2 instructions in parallel
/* The original author did not know what this is used for; see page 39 (of the
 * TMS320C6000 intrinsics document referenced elsewhere in these notes). */
long long _shfl3 (unsigned, unsigned);
/* Subtracts the two signed 16-bit values in src2 from the two signed 16-bit
 * values in src1.
 * NOTE(review): the leading "s" in the name suggests the subtraction
 * saturates; the original note does not say so -- confirm. */
int _ssub2 (int src1, int src2);
/* Galois multiplication. */
unsigned _xormpy (unsigned, unsigned);
/* NOTE(review): the declarations below carry no description in this file.
 * Going by their names they look like wider / conjugate variants of the
 * rounded complex multiply (_dcmpyr1, _dccmpyr1, _cmpy32r1, _ccmpy32r1) and
 * an unsigned 16-bit pairwise multiply (_mpyu2) -- confirm each against the
 * TI C6000 compiler intrinsics reference. */
long long _dcmpyr1 (long long, long long);
long long _dccmpyr1 (long long, long long);
long long _cmpy32r1 (long long, long long);
long long _ccmpy32r1 (long long, long long);
long long _mpyu2 (unsigned, unsigned);
/* Dot product (products are summed) of four pairs of signed values.
 * NOTE(review): the original note says the values are 8-bit, but both
 * operands are 64-bit long long and the note on _dotpsu4h speaks of 16-bit
 * values -- presumably these are four pairs of signed 16-bit values; confirm.
 * _dotp4hll is presumably the same operation with a 64-bit result -- confirm. */
int _dotp4h (long long, long long);
long long _dotp4hll (long long, long long);
/* Dot product of the four signed 16-bit values in src1 with the four unsigned
 * 16-bit values in src2, giving a 32-bit sum of products. */
int _dotpsu4h (long long, long long);
/* Dot product of the four signed 16-bit values in src1 with the four unsigned
 * 16-bit values in src2, giving a 64-bit sum of products. */
long long _dotpsu4hll (long long, long long);
/* Adds the two signed 32-bit values of src1 to the two signed 32-bit values
 * of src2.
 * NOTE(review): _dadd_c and _dsadd are not described in the original notes;
 * by name they are presumably a constant-operand form and a saturating form
 * of the same addition -- confirm. */
long long _dadd (long long src1, long long src2);
long long _dadd_c (int, long long);
long long _dsadd (long long, long long);
/* Adds four pairs of signed 16-bit values.
 * NOTE(review): _dsadd2 is not described separately in the original notes;
 * presumably the saturating form -- confirm. */
long long _dadd2 (long long, long long);
long long _dsadd2 (long long, long long);
/* NOTE(review): none of the declarations below are described in this file.
 * By analogy with the names documented elsewhere in this header they are
 * presumably: packed subtracts (_dsub, _dssub, _dssub2), _dapys2, packed
 * shifts (_dshr, _dshru, _dshl, _dshr2, _dshru2, _shl2, _dshl2), bit-to-lane
 * expansion (_dxpnd4, _dxpnd2) and complex rotations (_crot90, _dcrot90,
 * _crot270, _dcrot270). Confirm each against the TI C6000 compiler intrinsics
 * reference before relying on it. */
long long _dsub (long long, long long);
long long _dssub (long long, long long);
long long _dssub2 (long long, long long);
long long _dapys2 (long long, long long);
long long _dshr (long long, unsigned);
long long _dshru (long long, unsigned);
long long _dshl (long long, unsigned);
long long _dshr2 (long long, unsigned);
long long _dshru2 (long long, unsigned);
unsigned _shl2 (unsigned , unsigned);
long long _dshl2 (long long, unsigned);
long long _dxpnd4 (unsigned);
long long _dxpnd2 (unsigned);
int _crot90 (int);
long long _dcrot90 (long long);
int _crot270 (int);
long long _dcrot270 (long long);
/* Compares the four pairs of signed 16-bit values in src1 and src2 and places
 * the larger value of each pair in dst.
 * NOTE(review): _dmin2 has no note of its own; presumably the counterpart that
 * keeps the smaller value -- confirm. */
long long _dmax2 (long long, long long);
long long _dmin2 (long long, long long);
/* Compares the eight pairs of 8-bit values in src1 and src2 and places the
 * larger value of each pair in dst.
 * NOTE(review): the original note calls the values signed, but the "u" in the
 * name and the note on _maxu4 indicate unsigned 8-bit values -- confirm.
 * _dminu4 has no note of its own; presumably the counterpart that keeps the
 * smaller value -- confirm. */
long long _dmaxu4 (long long, long long);
long long _dminu4 (long long, long long);
/* Compares four pairs of 16-bit values: 1 when equal, 0 when not equal.
 * NOTE(review): presumably one result bit per pair -- confirm. */
unsigned _dcmpeq2 (long long, long long);
/* Compares eight pairs of 8-bit values: 1 when equal, 0 when not equal. */
unsigned _dcmpeq4 (long long, long long);
/* Compares four pairs of 16-bit values: 1 when greater, 0 when not greater. */
unsigned _dcmpgt2 (long long, long long);
/* Compares eight pairs of 8-bit values: 1 when greater, 0 when not greater.
 * NOTE(review): the "u" in the name suggests an unsigned comparison -- confirm. */
unsigned _dcmpgtu4 (long long, long long);
/*4对16位有符号数求4个平均"(a+b+1)/2"*/
long long _davg2 (long long, long long);
/*8对8位无符号数求8个平均*/
long long _davgu4 (long long, long long);
/*有符号16位,无round模式,4个平均"(a+b)/2"*/
long long _davgnr2 (long long, long long);
/*无符号8位,无round模式,8个平均*/
long long _davgnru4 (long long, long long);
/* NOTE(review): not described in the original notes. By name these presumably
 * unpack the packed sub-words of a 32-bit operand into wider lanes of a
 * 64-bit result (bytes unsigned, halfwords signed, halfwords unsigned) --
 * confirm against the TI C6000 compiler intrinsics reference. */
long long _unpkbu4 (unsigned);
long long _unpkh2 (unsigned);
long long _unpkhu2 (unsigned);
/* Performs two PACKL2 operations in parallel. */
long long _dpackl2 (long long, long long);
/* Performs two PACKH2 operations in parallel. */
long long _dpackh2 (long long, long long);
/* NOTE(review): no description in the original notes -- confirm. */
long long _dpackhl2 (long long, long long);
/* Performs PACKH4 and PACKL4 in parallel. */
long long _dpacklh4 (unsigned, unsigned);
/* NOTE(review): the three declarations below have no description in the
 * original notes; by name they are presumably doubled PACKL4 / PACKH4 /
 * SPACKU4 operations -- confirm. */
long long _dpackl4 (long long, long long);
long long _dpackh4 (long long, long long);
long long _dspacku4 (long long, long long);
/* NOTE(review): not described in the original notes. By name these are
 * presumably logical AND, logical AND-NOT and logical OR -- confirm. */
int _land (int, int);
int _landn (int, int);
int _lor (int, int);
/* Moves two registers into one (wider) register, performing both moves at
 * once. Useful when handling many double words; reduces register pressure. */
long long _dmvd (int, int);
double _fdmvd (float, float);
double _complex_mpysp (double, double); /* CMPYSP then DADDSP */
double _complex_conjugate_mpysp (double, double); /* CMPYSP then DSUBSP */
/* NOTE(review): not described in the original notes; by name presumably an
 * XOR of a 64-bit value with a constant operand -- confirm. */
long long _xorll_c (int, long long);
/* NOTE(review): the declarations below carry no description in this file.
 * By name they presumably cover double complex multiplies (_dcmpy, _dccmpy),
 * complex matrix multiplies with and without rounding (_cmatmpyr1,
 * _ccmatmpyr1, _cmatmpy, _ccmatmpy), quad 32-bit multiplies (_qsmpy32r1,
 * _qmpy32) and a saturating packed 16-bit multiply (_dsmpy2) -- confirm each
 * against the TI C6000 compiler intrinsics reference. */
__x128_t __BUILTIN _dcmpy (long long, long long);
__x128_t __BUILTIN _dccmpy (long long, long long);
long long __BUILTIN _cmatmpyr1 (long long, __x128_t);
long long __BUILTIN _ccmatmpyr1 (long long, __x128_t);
__x128_t __BUILTIN _cmatmpy (long long, __x128_t);
__x128_t __BUILTIN _ccmatmpy (long long, __x128_t);
__x128_t __BUILTIN _qsmpy32r1 (__x128_t, __x128_t);
__x128_t __BUILTIN _qmpy32 (__x128_t, __x128_t);
__x128_t __BUILTIN _dsmpy2 (long long, long long);
/* Multiplies four pairs of signed 16-bit values, giving signed 32-bit
 * products that are placed in a 128-bit register. */
__x128_t __BUILTIN _dmpy2 (long long, long long);
/* Multiplies four pairs of 16-bit values, giving 32-bit products that are
 * placed in a 128-bit register.
 * NOTE(review): the original note is a verbatim copy of the _dmpy2 note and
 * calls the values signed; the "u" in the name suggests the operands and
 * products are unsigned -- confirm. */
__x128_t __BUILTIN _dmpyu2 (long long, long long);
/* Multiplies the eight signed 8-bit values in src1 by the eight unsigned
 * 8-bit values in src2, giving eight signed 16-bit products. */
__x128_t __BUILTIN _dmpysu4 (long long src1, long long src2);
__x128_t __BUILTIN _dmpyu4 (long long, long long);//Same as above, but all values are unsigned
/* NOTE(review): not described in the original notes. By name presumably a
 * single-precision complex multiply (_cmpysp) and a four-lane
 * single-precision multiply (_qmpysp) -- confirm. */
__x128_t __BUILTIN _cmpysp (__float2_t, __float2_t);
__x128_t __BUILTIN _qmpysp (__x128_t, __x128_t);
/* Performs two dotp4h operations; all values are signed. */
long long __BUILTIN _ddotp4h (__x128_t, __x128_t);
/* Performs two dotpsu4h operations; one operand is signed, the other
 * unsigned. */
long long __BUILTIN _ddotpsu4h (__x128_t, __x128_t);
/* NOTE(review): not described in the original notes. By name these presumably
 * extract the upper (_hi128, _hid128) or lower (_lo128, _lod128) 64 bits of a
 * 128-bit value, as long long or as double -- confirm. */
long long __BUILTIN _hi128 (__x128_t);
double __BUILTIN _hid128 (__x128_t);
long long __BUILTIN _lo128 (__x128_t);
double __BUILTIN _lod128 (__x128_t);