ETISS 0.8.0
Extendable Translating Instruction Set Simulator (version 0.8.0)
mmintrin.h
/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in the MMX intrinsics, which
   works well for the _si64 and some _pi32 operations.

   For the _pi16 and _pi8 operations, it's better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register move
   instructions, which help make such implementations more efficient.

   It's the user's responsibility to determine whether the results of such a
   port are acceptable or further changes are needed. Please note that much
   code using Intel intrinsics CAN BE REWRITTEN in more portable and
   efficient standard C or GNU C extensions with 64-bit scalar operations,
   or 128-bit SSE/Altivec operations, which is the recommended approach. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
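
/* Editorial note (not part of the original header): __m64_union is the
   scalar fallback used throughout this file. One 64-bit value can be viewed
   as packed 8-, 16-, or 32-bit lanes, with element 0 the least significant:

     __m64_union u;
     u.as_m64 = _mm_set_pi16(4, 3, 2, 1);
     // u.as_short[0] == 1, u.as_short[3] == 4 on this implementation.
*/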

/* Empty the multimedia state. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC. */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}
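
/* Editorial example (not part of the original header): zero extension means
   the sign bit of the int argument does not propagate into the upper half:

     __m64 v = _mm_cvtsi32_si64(-1);   // v == 0x00000000FFFFFFFFULL
     long long x = _mm_cvtm64_si64(v); // x == 4294967295
*/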

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}

/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}
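
/* Editorial example (not part of the original header): values outside the
   signed 8-bit range saturate instead of wrapping:

     __m64 r = _mm_packs_pi16(_mm_set_pi16(-300, 300, -5, 5),
                              _mm_setzero_si64());
     // The low four bytes of r are 5, -5, 127, -128 (least significant
     // first); the upper four, taken from the second operand, are all 0.
*/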

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(vm1, __zero);
  r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
  __vector __bool char packsel = vec_pack(__select, __select);
  r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
  return (__m64)((__vector long long)r)[0];
}
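
/* Editorial note (not part of the original header): vec_packs on the
   operands reinterpreted as unsigned would saturate negative inputs to 0xFF
   (they look like large unsigned values), so the vec_cmplt/vec_sel sequence
   above forces lanes that were negative back to 0, matching x86 packuswb:

     _mm_packs_pu16(_mm_set_pi16(-1, 400, 128, 7), _mm_setzero_si64());
     // low four bytes: 7, 128, 255 (saturated), 0 (negative clamps to 0).
*/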

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}
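
/* Editorial example (not part of the original header):

     _mm_unpackhi_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                      _mm_set_pi8(17, 16, 15, 14, 13, 12, 11, 10));
     // yields bytes 4, 14, 5, 15, 6, 16, 7, 17 (least significant first).
*/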

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}
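
/* Editorial note (not part of the original header): the _mm_add_ and
   _mm_sub_ families wrap modulo the lane width; compare the saturating
   _mm_adds_ and _mm_subs_ families further below. For example:

     _mm_sub_pi8(_mm_set1_pi8(-128), _mm_set1_pi8(1));
     // each lane wraps to 127; _mm_subs_pi8 would saturate at -128.
*/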

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}
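
/* Editorial note (not part of the original header): __m64 is an unsigned
   type here, so `>>` is a logical shift and zeros are shifted in, as the
   comment above states. Shift counts >= 64 are undefined behavior in C,
   whereas the hardware MMX psllq/psrlq produce 0 for out-of-range counts;
   ported code that relies on the latter may need adjusting. */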

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}
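
/* Editorial note (not part of the original header): on 64-bit Power6 and
   newer this compiles to a single cmpb instruction, which compares two GPRs
   byte by byte and sets each matching byte of the result to 0xFF, exactly
   the pcmpeqb result pattern. */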

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = (__vector signed char)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}
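
/* Editorial example (not part of the original header): signed saturation
   clamps instead of wrapping:

     _mm_adds_pi8(_mm_set1_pi8(100), _mm_set1_pi8(100));
     // each lane == 127 (the signed 8-bit maximum);
     // _mm_add_pi8 would give (char)200 == -56 in each lane.
*/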

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_vmsumshm(a, b, zero);
  return (__m64)((__vector long long)c)[0];
}
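
/* Editorial worked example (not part of the original header): with
   a = [1, 2, 3, 4] and b = [10, 20, 30, 40] (element 0 first), the four
   32-bit products 10, 40, 90, 160 are summed in pairs:

     _mm_madd_pi16(_mm_setr_pi16(1, 2, 3, 4), _mm_setr_pi16(10, 20, 30, 40));
     // == the two 32-bit values [50, 250].
*/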

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);

  w0 = vec_vmulesh(a, b);
  w1 = vec_vmulosh(a, b);
  c = (__vector signed short)vec_perm(w0, w1, xform1);

  return (__m64)((__vector long long)c)[0];
}
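
/* Editorial example (not part of the original header): each lane keeps the
   high 16 bits of the full 32-bit product, e.g. 0x4000 * 0x0008 ==
   0x00020000, whose high half is 0x0002:

     _mm_mulhi_pi16(_mm_set1_pi16(0x4000), _mm_set1_pi16(0x0008));
     // each lane == 0x0002; _mm_mullo_pi16 would keep the low half, 0x0000.
*/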

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = a * b;
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sl(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}
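
/* Editorial note (not part of the original header): MMX defines per-element
   shifts by counts larger than the element width to produce 0, hence the
   explicit `__count <= 15` guard here; vec_sl alone would use the count
   modulo 16. The same pattern appears in the sra/srl variants below. */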

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16. */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}

/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32. */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sra(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16. */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32. */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector unsigned short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sr(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16. */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32. */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
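
/* Editorial note (not part of the original header): the _mm_set_* forms take
   arguments most-significant-first, the _mm_setr_* forms least-significant-
   first, so:

     _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0) ==
         _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7)
*/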

/* Creates a vector of two 32-bit values, both elements containing I. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}

#else
#include_next <mmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* _MMINTRIN_H_INCLUDED */
__device__ __2f16 b
__device__ int
__device__ __2f16 float c
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:9543
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition: altivec.h:1625
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
Definition: altivec.h:9633
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition: altivec.h:13710
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition: altivec.h:4804
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
Definition: altivec.h:11232
static __inline__ vector float vector float __b
Definition: altivec.h:520
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
Definition: altivec.h:560
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:7320
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:7834
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2196
static __inline__ vector signed char __ATTRS_o_ai vec_pack(vector signed short __a, vector signed short __b)
Definition: altivec.h:6747
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:8088
static __inline__ vector signed char __ATTRS_o_ai vec_add(vector signed char __a, vector signed char __b)
Definition: altivec.h:198
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition: altivec.h:1964
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
Definition: altivec.h:7073
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition: altivec.h:10963
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2625
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2179
#define _m_empty
Definition: mmintrin.h:1499
#define _m_pcmpeqd
Definition: mmintrin.h:1552
#define _m_pand
Definition: mmintrin.h:1546
#define _m_pslld
Definition: mmintrin.h:1532
#define _m_pcmpgtd
Definition: mmintrin.h:1555
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:237
#define _m_pcmpgtb
Definition: mmintrin.h:1553
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi16(short __w)
Constructs a 64-bit integer vector of [4 x i16], with each of the 16-bit integer vector elements set ...
Definition: mmintrin.h:1397
#define _m_psrlwi
Definition: mmintrin.h:1541
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi32(__m64 __m1, __m64 __m2)
Adds each 32-bit integer element of the first 64-bit integer vector of [2 x i32] to the corresponding...
Definition: mmintrin.h:392
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_pi16(__m64 __m, __m64 __count)
Right-shifts each 16-bit integer element of the first parameter, which is a 64-bit integer vector of ...
Definition: mmintrin.h:954
#define _m_psllq
Definition: mmintrin.h:1534
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srai_pi32(__m64 __m, int __count)
Right-shifts each 32-bit integer element of a 64-bit integer vector of [2 x i32] by the number of bit...
Definition: mmintrin.h:931
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
Constructs a 64-bit integer vector, initialized in reverse order with the specified 16-bit integer va...
Definition: mmintrin.h:1459
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] and interleaves them into a 64-...
Definition: mmintrin.h:214
#define _m_packuswb
Definition: mmintrin.h:1506
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7)
Constructs a 64-bit integer vector, initialized in reverse order with the specified 8-bit integer val...
Definition: mmintrin.h:1490
#define _m_psllwi
Definition: mmintrin.h:1531
#define _m_packsswb
Definition: mmintrin.h:1504
#define _m_to_int64
Definition: mmintrin.h:1503
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_madd_pi16(__m64 __m1, __m64 __m2)
Multiplies each 16-bit signed integer element of the first 64-bit integer vector of [4 x i16] by the ...
Definition: mmintrin.h:665
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_si64(__m64 __m, __m64 __count)
Right-shifts the first 64-bit integer parameter by the number of bits specified by the second 64-bit ...
Definition: mmintrin.h:1041
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pu16(__m64 __m1, __m64 __m2)
Subtracts each 16-bit unsigned integer element of the second 64-bit integer vector of [4 x i16] from ...
Definition: mmintrin.h:638
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pi8(__m64 __m1, __m64 __m2)
Adds each 8-bit signed integer element of the first 64-bit integer vector of [8 x i8] to the correspo...
Definition: mmintrin.h:414
#define _m_paddb
Definition: mmintrin.h:1513
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] and interleaves them into a 64-...
Definition: mmintrin.h:285
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pi16(__m64 __m1, __m64 __m2)
Subtracts each 16-bit signed integer element of the second 64-bit integer vector of [4 x i16] from th...
Definition: mmintrin.h:590
#define _m_paddusw
Definition: mmintrin.h:1519
long long __m64 __attribute__((__vector_size__(8), __aligned__(8)))
Definition: mmintrin.h:13
#define _m_psubusb
Definition: mmintrin.h:1525
#define _m_to_int
Definition: mmintrin.h:1502
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
Compares the 16-bit integer elements of two 64-bit integer vectors of [4 x i16] to determine if the e...
Definition: mmintrin.h:1247
#define _m_punpckhdq
Definition: mmintrin.h:1509
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_si64(__m64 __m, __m64 __count)
Left-shifts the first 64-bit integer parameter by the number of bits specified by the second 64-bit i...
Definition: mmintrin.h:817
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_pi8(__m64 __m1, __m64 __m2)
Subtracts each 8-bit integer element of the second 64-bit integer vector of [8 x i8] from the corresp...
Definition: mmintrin.h:502
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtm64_si64(__m64 __m)
Casts a 64-bit integer vector into a 64-bit signed integer value.
Definition: mmintrin.h:97
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pi8(__m64 __m1, __m64 __m2)
Subtracts each 8-bit signed integer element of the second 64-bit integer vector of [8 x i8] from the ...
Definition: mmintrin.h:567
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pi16(__m64 __m1, __m64 __m2)
Adds each 16-bit signed integer element of the first 64-bit integer vector of [4 x i16] to the corres...
Definition: mmintrin.h:437
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
Compares the 32-bit integer elements of two 64-bit integer vectors of [2 x i32] to determine if the e...
Definition: mmintrin.h:1269
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
Definition: mmintrin.h:1225
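The comparison intrinsics return per-lane masks rather than booleans; a small hypothetical helper to show the pattern:
#include <mmintrin.h>
/* 0xFF in each i8 lane of a that exceeds threshold t, 0x00 elsewhere. */
static __m64 lanes_above(__m64 a, char t) {
  return _mm_cmpgt_pi8(a, _mm_set1_pi8(t));
}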
#define _m_pcmpeqw
Definition: mmintrin.h:1551
#define _m_psllw
Definition: mmintrin.h:1530
#define _m_por
Definition: mmintrin.h:1548
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pu16(__m64 __m1, __m64 __m2)
Converts 16-bit signed integers from both 64-bit integer vector parameters of [4 x i16] into 8-bit unsigned integers with saturation, and packs the results into a 64-bit integer vector of [8 x i8].
Definition: mmintrin.h:187
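As a sketch (function name and values hypothetical), unsigned packing clamps out-of-range words to the byte range:
#include <mmintrin.h>
void pack_demo(void) {
  __m64 w = _mm_set_pi16(300, 255, 1, -7);  /* lanes, low to high: -7 1 255 300 */
  __m64 b = _mm_packs_pu16(w, w);
  /* low four result bytes, low to high: 0 (from -7), 1, 255, 255 (300 clamped) */
  (void)b;
}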
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtsi32_si64(int __i)
Constructs a 64-bit integer vector, setting the lower 32 bits to the value of the 32-bit integer parameter and the upper 32 bits to 0.
Definition: mmintrin.h:48
#define _m_punpckhwd
Definition: mmintrin.h:1508
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi32(int __i)
Constructs a 64-bit integer vector of [2 x i32], with each of the 32-bit integer vector elements set to the value of the 32-bit integer parameter.
Definition: mmintrin.h:1378
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_pi16(__m64 __m, int __count)
Left-shifts each 16-bit signed integer element of a 64-bit integer vector of [4 x i16] by the number of bits specified by the 32-bit integer parameter, shifting in zero bits.
Definition: mmintrin.h:752
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi8(char __b)
Constructs a 64-bit integer vector of [8 x i8], with each of the 8-bit integer vector elements set to the value of the 8-bit integer parameter.
Definition: mmintrin.h:1415
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi32(__m64 __m1, __m64 __m2)
Converts 32-bit signed integers from both 64-bit integer vector parameters of [2 x i32] into 16-bit signed integers with saturation, and packs the results into a 64-bit integer vector of [4 x i16].
Definition: mmintrin.h:157
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi16(__m64 __m1, __m64 __m2)
Adds each 16-bit integer element of the first 64-bit integer vector of [4 x i16] to the corresponding 16-bit integer element of the second 64-bit integer vector of [4 x i16].
Definition: mmintrin.h:371
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pu8(__m64 __m1, __m64 __m2)
Subtracts each 8-bit unsigned integer element of the second 64-bit integer vector of [8 x i8] from the corresponding 8-bit unsigned integer element of the first 64-bit integer vector of [8 x i8], saturating results that underflow to 0.
Definition: mmintrin.h:614
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pu8(__m64 __m1, __m64 __m2)
Adds each 8-bit unsigned integer element of the first 64-bit integer vector of [8 x i8] to the corresponding 8-bit unsigned integer element of the second 64-bit integer vector of [8 x i8], saturating the results to the unsigned 8-bit range.
Definition: mmintrin.h:459
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_or_si64(__m64 __m1, __m64 __m2)
Performs a bitwise OR of two 64-bit integer vectors.
Definition: mmintrin.h:1119
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_si64(__m64 __m, int __count)
Left-shifts the first parameter, which is a 64-bit integer, by the number of bits specified by the second parameter, which is a 32-bit integer, shifting in zero bits.
Definition: mmintrin.h:837
#define _m_psrlqi
Definition: mmintrin.h:1545
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sra_pi32(__m64 __m, __m64 __count)
Right-shifts each 32-bit integer element of the first parameter, which is a 64-bit integer vector of [2 x i32], by the number of bits specified by the second parameter, shifting in the sign bit of each element.
Definition: mmintrin.h:908
#define _m_punpckhbw
Definition: mmintrin.h:1507
#define _m_paddsb
Definition: mmintrin.h:1516
#define _m_psllqi
Definition: mmintrin.h:1535
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_pi32(__m64 __m, __m64 __count)
Right-shifts each 32-bit integer element of the first parameter, which is a 64-bit integer vector of [2 x i32], by the number of bits specified by the second parameter, shifting in zero bits.
Definition: mmintrin.h:999
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
Definition: mmintrin.h:308
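A hypothetical sketch of the interleaving order:
#include <mmintrin.h>
void zip_demo(void) {
  __m64 a = _mm_set_pi16(13, 12, 11, 10);  /* lanes, low to high: 10 11 12 13 */
  __m64 b = _mm_set_pi16(23, 22, 21, 20);  /* lanes, low to high: 20 21 22 23 */
  __m64 r = _mm_unpacklo_pi16(a, b);       /* lanes, low to high: 10 20 11 21 */
  (void)r;
}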
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
Compares the 16-bit integer elements of two 64-bit integer vectors of [4 x i16] to determine if the element of the first vector is equal to the corresponding element of the second vector, yielding 0 for false and 0xFFFF for true in each element.
Definition: mmintrin.h:1181
#define _m_pslldi
Definition: mmintrin.h:1533
#define _m_pmullw
Definition: mmintrin.h:1529
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srli_pi16(__m64 __m, int __count)
Right-shifts each 16-bit integer element of a 64-bit integer vector of [4 x i16] by the number of bits specified by the 32-bit integer parameter, shifting in zero bits.
Definition: mmintrin.h:976
#define _m_psubsb
Definition: mmintrin.h:1523
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srli_si64(__m64 __m, int __count)
Right-shifts the first parameter, which is a 64-bit integer, by the number of bits specified by the second parameter, which is a 32-bit integer, shifting in zero bits.
Definition: mmintrin.h:1062
#define _m_pcmpgtw
Definition: mmintrin.h:1554
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
Constructs a 64-bit integer vector initialized with the specified 16-bit integer values.
Definition: mmintrin.h:1326
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
Compares the 32-bit integer elements of two 64-bit integer vectors of [2 x i32] to determine if the element of the first vector is equal to the corresponding element of the second vector, yielding 0 for false and 0xFFFFFFFF for true in each element.
Definition: mmintrin.h:1203
#define _m_pcmpeqb
Definition: mmintrin.h:1550
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_pi32(__m64 __m, __m64 __count)
Left-shifts each 32-bit signed integer element of the first parameter, which is a 64-bit integer vector of [2 x i32], by the number of bits specified by the second parameter, shifting in zero bits.
Definition: mmintrin.h:775
#define _m_psrldi
Definition: mmintrin.h:1543
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setr_pi32(int __i0, int __i1)
Constructs a 64-bit integer vector, initialized in reverse order with the specified 32-bit integer values.
Definition: mmintrin.h:1436
#define _m_from_int
Definition: mmintrin.h:1500
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi8(__m64 __m1, __m64 __m2)
Adds each 8-bit integer element of the first 64-bit integer vector of [8 x i8] to the corresponding 8-bit integer element of the second 64-bit integer vector of [8 x i8].
Definition: mmintrin.h:350
#define _m_paddd
Definition: mmintrin.h:1515
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srai_pi16(__m64 __m, int __count)
Right-shifts each 16-bit integer element of a 64-bit integer vector of [4 x i16] by the number of bits specified by the 32-bit integer parameter, shifting in the sign bit of each element.
Definition: mmintrin.h:884
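A sketch (hypothetical values) contrasting arithmetic and logical right shifts on negative lanes:
#include <mmintrin.h>
void shift_demo(void) {
  __m64 v = _mm_set1_pi16(-16);   /* each lane 0xFFF0 */
  __m64 a = _mm_srai_pi16(v, 2);  /* sign-filled: each lane -4 (0xFFFC) */
  __m64 l = _mm_srli_pi16(v, 2);  /* zero-filled: each lane 0x3FFC */
  (void)a; (void)l;
}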
#define _m_psubw
Definition: mmintrin.h:1521
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set_pi32(int __i1, int __i0)
Constructs a 64-bit integer vector initialized with the specified 32-bit integer values.
Definition: mmintrin.h:1303
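_mm_set_* takes its arguments from the highest element down, while _mm_setr_* takes them from the lowest up, so the two calls in this hypothetical check build the same vector:
#include <mmintrin.h>
int same_vector(void) {
  __m64 a = _mm_set_pi32(2, 1);   /* lane 0 = 1, lane 1 = 2 */
  __m64 b = _mm_setr_pi32(1, 2);  /* lane 0 = 1, lane 1 = 2 */
  return _mm_cvtm64_si64(a) == _mm_cvtm64_si64(b);  /* 1: bit-identical */
}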
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pu16(__m64 __m1, __m64 __m2)
Adds each 16-bit unsigned integer element of the first 64-bit integer vector of [4 x i16] to the corresponding 16-bit unsigned integer element of the second 64-bit integer vector of [4 x i16], saturating the results to the unsigned 16-bit range.
Definition: mmintrin.h:481
#define _m_psrawi
Definition: mmintrin.h:1537
#define _m_psubb
Definition: mmintrin.h:1520
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_xor_si64(__m64 __m1, __m64 __m2)
Performs a bitwise exclusive OR of two 64-bit integer vectors.
Definition: mmintrin.h:1137
#define _m_from_int64
Definition: mmintrin.h:1501
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtsi64_m64(long long __i)
Casts a 64-bit signed integer value into a 64-bit integer vector.
Definition: mmintrin.h:81
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
Definition: mmintrin.h:258
#define _m_psubsw
Definition: mmintrin.h:1524
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mullo_pi16(__m64 __m1, __m64 __m2)
Multiplies each 16-bit signed integer element of the first 64-bit integer vector of [4 x i16] by the corresponding element of the second vector and packs the lower 16 bits of each 32-bit product into a 64-bit integer vector of [4 x i16].
Definition: mmintrin.h:707
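Paired with _mm_mulhi_pi16, the low halves can be widened back into full 32-bit products; a hypothetical sketch for the two low lanes:
#include <mmintrin.h>
/* Full 32-bit signed products of lanes 0 and 1, as [2 x i32]. */
static __m64 full_products_lo(__m64 a, __m64 b) {
  __m64 lo = _mm_mullo_pi16(a, b);   /* low 16 bits of each product */
  __m64 hi = _mm_mulhi_pi16(a, b);   /* high 16 bits of each product */
  return _mm_unpacklo_pi16(lo, hi);  /* lo0,hi0,lo1,hi1 = two i32 products */
}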
#define _m_punpcklwd
Definition: mmintrin.h:1511
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_pi32(__m64 __m, int __count)
Left-shifts each 32-bit signed integer element of a 64-bit integer vector of [2 x i32] by the number of bits specified by the 32-bit integer parameter, shifting in zero bits.
Definition: mmintrin.h:797
#define _m_pxor
Definition: mmintrin.h:1549
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
Definition: mmintrin.h:329
#define _m_packssdw
Definition: mmintrin.h:1505
#define _m_pmulhw
Definition: mmintrin.h:1528
#define _m_psrld
Definition: mmintrin.h:1542
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition: mmintrin.h:1282
#define _m_paddw
Definition: mmintrin.h:1514
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_pi16(__m64 __m, __m64 __count)
Left-shifts each 16-bit signed integer element of the first parameter, which is a 64-bit integer vector of [4 x i16], by the number of bits specified by the second parameter, shifting in zero bits.
Definition: mmintrin.h:730
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sra_pi16(__m64 __m, __m64 __count)
Right-shifts each 16-bit integer element of the first parameter, which is a 64-bit integer vector of [4 x i16], by the number of bits specified by the second parameter, shifting in the sign bit of each element.
Definition: mmintrin.h:861
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_pi32(__m64 __m1, __m64 __m2)
Subtracts each 32-bit integer element of the second 64-bit integer vector of [2 x i32] from the corresponding 32-bit integer element of the first 64-bit integer vector of [2 x i32].
Definition: mmintrin.h:544
#define _m_psraw
Definition: mmintrin.h:1536
#define _m_psubd
Definition: mmintrin.h:1522
#define _m_paddsw
Definition: mmintrin.h:1517
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi64_si32(__m64 __m)
Returns the lower 32 bits of a 64-bit integer vector as a 32-bit signed integer.
Definition: mmintrin.h:65
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the element of the first vector is equal to the corresponding element of the second vector, yielding 0 for false and 0xFF for true in each element.
Definition: mmintrin.h:1159
#define _m_psrlq
Definition: mmintrin.h:1544
#define _m_psubusw
Definition: mmintrin.h:1526
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Constructs a 64-bit integer vector initialized with the specified 8-bit integer values.
Definition: mmintrin.h:1357
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts 16-bit signed integers from both 64-bit integer vector parameters of [4 x i16] into 8-bit signed integers with saturation, and packs the results into a 64-bit integer vector of [8 x i8].
Definition: mmintrin.h:127
#define _m_pandn
Definition: mmintrin.h:1547
#define _m_psradi
Definition: mmintrin.h:1539
#define _m_paddusb
Definition: mmintrin.h:1518
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srli_pi32(__m64 __m, int __count)
Right-shifts each 32-bit integer element of a 64-bit integer vector of [2 x i32] by the number of bits specified by the 32-bit integer parameter, shifting in zero bits.
Definition: mmintrin.h:1021
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
Multiplies each 16-bit signed integer element of the first 64-bit integer vector of [4 x i16] by the corresponding element of the second vector and packs the upper 16 bits of each 32-bit product into a 64-bit integer vector of [4 x i16].
Definition: mmintrin.h:686
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_andnot_si64(__m64 __m1, __m64 __m2)
Performs a bitwise NOT of the first 64-bit integer vector, and then performs a bitwise AND of the intermediate result and the second 64-bit integer vector.
Definition: mmintrin.h:1101
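Together with _mm_and_si64 and _mm_or_si64 this gives the classic branch-free select; a hypothetical helper:
#include <mmintrin.h>
/* Per-lane select: a where mask lanes are all ones, b where all zeros. */
static __m64 select_si64(__m64 mask, __m64 a, __m64 b) {
  return _mm_or_si64(_mm_and_si64(mask, a), _mm_andnot_si64(mask, b));
}
/* e.g. per-lane signed max of [8 x i8]: select_si64(_mm_cmpgt_pi8(a, b), a, b) */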
#define _m_pmaddwd
Definition: mmintrin.h:1527
#define _m_psrad
Definition: mmintrin.h:1538
#define _m_punpcklbw
Definition: mmintrin.h:1510
#define _m_punpckldq
Definition: mmintrin.h:1512
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_pi16(__m64 __m1, __m64 __m2)
Subtracts each 16-bit integer element of the second 64-bit integer vector of [4 x i16] from the corresponding 16-bit integer element of the first 64-bit integer vector of [4 x i16].
Definition: mmintrin.h:523
#define _m_psrlw
Definition: mmintrin.h:1540
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_and_si64(__m64 __m1, __m64 __m2)
Performs a bitwise AND of two 64-bit integer vectors.
Definition: mmintrin.h:1080