ETISS 0.8.0
Extendable Translating Instruction Set Simulator (version 0.8.0)
emmintrin.h
1/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11 User Guide and Reference, version 9.0. */
12
13#ifndef NO_WARN_X86_INTRINSICS
14/* This header file is to help porting code using Intel intrinsics
15 explicitly from x86_64 to powerpc64/powerpc64le.
16
17 Since the X86 SSE2 intrinsics mainly handle __m128i and __m128d types,
18 the PowerPC VMX/VSX ISA is a good match for vector double SIMD operations.
19 However, scalar double operations in vector (XMM) registers require
20 the POWER8 VSX ISA (2.07) level. There are differences in data
21 format and placement of double scalars in the vector register, which
22 require extra steps to match SSE2 scalar double semantics on POWER.
23
24 It should be noted that there are significant differences between X86_64's
25 MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use the
26 portable <fenv.h> interface instead of accessing the MXCSR directly.
27
28 Most SSE2 scalar double intrinsic operations can be performed more
29 efficiently as C language double scalar operations or optimized to
30 use vector SIMD operations. We recommend this for new applications.
31*/
32#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
33#endif
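/* Illustrative sketch (not part of the original header) of the
   portable <fenv.h> usage recommended above, in place of direct
   MXCSR access; everything below is standard C99 <fenv.h>:

     #include <fenv.h>
     #pragma STDC FENV_ACCESS ON

     int saved = fegetround ();       // save the current rounding mode
     fesetround (FE_TOWARDZERO);      // e.g. for truncating conversions
     feclearexcept (FE_ALL_EXCEPT);   // clear sticky exception flags
     // ... scalar/vector floating-point code ...
     fesetround (saved);              // restore the saved mode
*/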
34
35#ifndef EMMINTRIN_H_
36#define EMMINTRIN_H_
37
38#if defined(__linux__) && defined(__ppc64__)
39
40#include <altivec.h>
41
42/* We need definitions from the SSE header files. */
43#include <xmmintrin.h>
44
45/* SSE2 */
46typedef __vector double __v2df;
47typedef __vector long long __v2di;
48typedef __vector unsigned long long __v2du;
49typedef __vector int __v4si;
50typedef __vector unsigned int __v4su;
51typedef __vector short __v8hi;
52typedef __vector unsigned short __v8hu;
53typedef __vector signed char __v16qi;
54typedef __vector unsigned char __v16qu;
55
56/* The Intel API is flexible enough that we must allow aliasing with other
57 vector types, and their scalar components. */
58typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
59typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
60
61/* Unaligned version of the same types. */
62typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
63typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
64
65/* Define two value permute mask. */
66#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
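/* For example, _MM_SHUFFLE2 (1, 0) == 0x2; used with _mm_shuffle_pd
   (defined below) it selects a[0] for the low result element and
   b[1] for the high one:
     __m128d r = _mm_shuffle_pd (a, b, _MM_SHUFFLE2 (1, 0));
   gives r == { a[0], b[1] }. */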
67
68/* Create a vector with element 0 as F and the rest zero. */
69extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70_mm_set_sd (double __F)
71{
72 return __extension__ (__m128d){ __F, 0.0 };
73}
74
75/* Create a vector with both elements equal to F. */
76extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
77_mm_set1_pd (double __F)
78{
79 return __extension__ (__m128d){ __F, __F };
80}
81
82extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83_mm_set_pd1 (double __F)
84{
85 return _mm_set1_pd (__F);
86}
87
88/* Create a vector with the lower value X and upper value W. */
89extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90_mm_set_pd (double __W, double __X)
91{
92 return __extension__ (__m128d){ __X, __W };
93}
94
95/* Create a vector with the lower value W and upper value X. */
96extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97_mm_setr_pd (double __W, double __X)
98{
99 return __extension__ (__m128d){ __W, __X };
100}
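/* Note the argument order: _mm_set_pd lists elements high to low,
   _mm_setr_pd low to high, so _mm_set_pd (2.0, 1.0) and
   _mm_setr_pd (1.0, 2.0) both build { 1.0, 2.0 }, with element [0]
   holding the low (scalar) double. */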
101
102/* Create an undefined vector. */
103extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
104_mm_undefined_pd (void)
105{
106 __m128d __Y = __Y;
107 return __Y;
108}
109
110/* Create a vector of zeros. */
111extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
112_mm_setzero_pd (void)
113{
114 return (__m128d) vec_splats (0);
115}
116
117/* Sets the low DPFP value of A from the low value of B. */
118extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119_mm_move_sd (__m128d __A, __m128d __B)
120{
121 __v2df result = (__v2df) __A;
122 result [0] = ((__v2df) __B)[0];
123 return (__m128d) result;
124}
125
126/* Load two DPFP values from P. The address must be 16-byte aligned. */
127extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128_mm_load_pd (double const *__P)
129{
130 return ((__m128d)vec_ld(0, (__v16qu*)__P));
131}
132
133/* Load two DPFP values from P. The address need not be 16-byte aligned. */
134extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
135_mm_loadu_pd (double const *__P)
136{
137 return (vec_vsx_ld(0, __P));
138}
139
140/* Create a vector with both elements equal to *P. */
141extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142_mm_load1_pd (double const *__P)
143{
144 return (vec_splats (*__P));
145}
146
147/* Create a vector with element 0 as *P and the rest zero. */
148extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
149_mm_load_sd (double const *__P)
150{
151 return _mm_set_sd (*__P);
152}
153
154extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155_mm_load_pd1 (double const *__P)
156{
157 return _mm_load1_pd (__P);
158}
159
160/* Load two DPFP values in reverse order. The address must be aligned. */
161extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
162_mm_loadr_pd (double const *__P)
163{
164 __v2df __tmp = _mm_load_pd (__P);
165 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
166}
167
168/* Store two DPFP values. The address must be 16-byte aligned. */
169extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
170_mm_store_pd (double *__P, __m128d __A)
171{
172 vec_st((__v16qu)__A, 0, (__v16qu*)__P);
173}
174
175/* Store two DPFP values. The address need not be 16-byte aligned. */
176extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177_mm_storeu_pd (double *__P, __m128d __A)
178{
179 *(__m128d_u *)__P = __A;
180}
181
182/* Stores the lower DPFP value. */
183extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184_mm_store_sd (double *__P, __m128d __A)
185{
186 *__P = ((__v2df)__A)[0];
187}
188
189extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190_mm_cvtsd_f64 (__m128d __A)
191{
192 return ((__v2df)__A)[0];
193}
194
195extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196_mm_storel_pd (double *__P, __m128d __A)
197{
198 _mm_store_sd (__P, __A);
199}
200
201/* Stores the upper DPFP value. */
202extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
203_mm_storeh_pd (double *__P, __m128d __A)
204{
205 *__P = ((__v2df)__A)[1];
206}
207/* Store the lower DPFP value to both elements of the vector.
208 The address must be 16-byte aligned. */
209extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
210_mm_store1_pd (double *__P, __m128d __A)
211{
212 _mm_store_pd (__P, vec_splat (__A, 0));
213}
214
215extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216_mm_store_pd1 (double *__P, __m128d __A)
217{
218 _mm_store1_pd (__P, __A);
219}
220
221/* Store two DPFP values in reverse order. The address must be aligned. */
222extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
223_mm_storer_pd (double *__P, __m128d __A)
224{
225 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
226}
227
228/* Intel intrinsic. */
229extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
230_mm_cvtsi128_si64 (__m128i __A)
231{
232 return ((__v2di)__A)[0];
233}
234
235/* Microsoft intrinsic. */
236extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
237_mm_cvtsi128_si64x (__m128i __A)
238{
239 return ((__v2di)__A)[0];
240}
241
242extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243_mm_add_pd (__m128d __A, __m128d __B)
244{
245 return (__m128d) ((__v2df)__A + (__v2df)__B);
246}
247
248/* Add the lower double-precision (64-bit) floating-point element in
249 a and b, store the result in the lower element of dst, and copy
250 the upper element from a to the upper element of dst. */
251extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
252_mm_add_sd (__m128d __A, __m128d __B)
253{
254 __A[0] = __A[0] + __B[0];
255 return (__A);
256}
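/* For example, _mm_add_sd ({1.0, 10.0}, {2.0, 20.0}) returns
   {3.0, 10.0}: only the low doubles are added; the upper double is
   carried over from the first operand. */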
257
258extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
259_mm_sub_pd (__m128d __A, __m128d __B)
260{
261 return (__m128d) ((__v2df)__A - (__v2df)__B);
262}
263
264extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265_mm_sub_sd (__m128d __A, __m128d __B)
266{
267 __A[0] = __A[0] - __B[0];
268 return (__A);
269}
270
271extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
272_mm_mul_pd (__m128d __A, __m128d __B)
273{
274 return (__m128d) ((__v2df)__A * (__v2df)__B);
275}
276
277extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
278_mm_mul_sd (__m128d __A, __m128d __B)
279{
280 __A[0] = __A[0] * __B[0];
281 return (__A);
282}
283
284extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
285_mm_div_pd (__m128d __A, __m128d __B)
286{
287 return (__m128d) ((__v2df)__A / (__v2df)__B);
288}
289
290extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
291_mm_div_sd (__m128d __A, __m128d __B)
292{
293 __A[0] = __A[0] / __B[0];
294 return (__A);
295}
296
297extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
298_mm_sqrt_pd (__m128d __A)
299{
300 return (vec_sqrt (__A));
301}
302
303/* Return pair {sqrt (B[0]), A[1]}. */
304extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
305_mm_sqrt_sd (__m128d __A, __m128d __B)
306{
307 __v2df c;
308 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
309 return (__m128d) _mm_setr_pd (c[0], __A[1]);
310}
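/* For example, _mm_sqrt_sd ({5.0, 7.0}, {4.0, 9.0}) returns
   {2.0, 7.0}: the square root of B's low element, with A's upper
   element carried through. */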
311
312extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
313_mm_min_pd (__m128d __A, __m128d __B)
314{
315 return (vec_min (__A, __B));
316}
317
318extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
319_mm_min_sd (__m128d __A, __m128d __B)
320{
321 __v2df a, b, c;
322 a = vec_splats (__A[0]);
323 b = vec_splats (__B[0]);
324 c = vec_min (a, b);
325 return (__m128d) _mm_setr_pd (c[0], __A[1]);
326}
327
328extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329_mm_max_pd (__m128d __A, __m128d __B)
330{
331 return (vec_max (__A, __B));
332}
333
334extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335_mm_max_sd (__m128d __A, __m128d __B)
336{
337 __v2df a, b, c;
338 a = vec_splats (__A[0]);
339 b = vec_splats (__B[0]);
340 c = vec_max (a, b);
341 return (__m128d) _mm_setr_pd (c[0], __A[1]);
342}
343
344extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345_mm_cmpeq_pd (__m128d __A, __m128d __B)
346{
347 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
348}
349
350extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351_mm_cmplt_pd (__m128d __A, __m128d __B)
352{
353 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
354}
355
356extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357_mm_cmple_pd (__m128d __A, __m128d __B)
358{
359 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
360}
361
362extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363_mm_cmpgt_pd (__m128d __A, __m128d __B)
364{
365 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
366}
367
368extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369_mm_cmpge_pd (__m128d __A, __m128d __B)
370{
371 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
372}
373
374extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
375_mm_cmpneq_pd (__m128d __A, __m128d __B)
376{
377 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
378 return ((__m128d)vec_nor (temp, temp));
379}
380
381extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
382_mm_cmpnlt_pd (__m128d __A, __m128d __B)
383{
384 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
385}
386
387extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388_mm_cmpnle_pd (__m128d __A, __m128d __B)
389{
390 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
391}
392
393extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394_mm_cmpngt_pd (__m128d __A, __m128d __B)
395{
396 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
397}
398
399extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400_mm_cmpnge_pd (__m128d __A, __m128d __B)
401{
402 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
403}
404
405extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406_mm_cmpord_pd (__m128d __A, __m128d __B)
407{
408#if _ARCH_PWR8
409 __v2du c, d;
410 /* Compare against self will return false (0's) if NAN. */
411 c = (__v2du)vec_cmpeq (__A, __A);
412 d = (__v2du)vec_cmpeq (__B, __B);
413#else
414 __v2du a, b;
415 __v2du c, d;
416 const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
417 a = (__v2du)vec_abs ((__v2df)__A);
418 b = (__v2du)vec_abs ((__v2df)__B);
419 c = (__v2du)vec_cmpgt (double_exp_mask, a);
420 d = (__v2du)vec_cmpgt (double_exp_mask, b);
421#endif
422 /* A != NAN and B != NAN. */
423 return ((__m128d)vec_and(c, d));
424}
425
426extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
427_mm_cmpunord_pd (__m128d __A, __m128d __B)
428{
429#if _ARCH_PWR8
430 __v2du c, d;
431 /* Compare against self will return false (0's) if NAN. */
432 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
433 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
434 /* A == NAN OR B == NAN converts to:
435 NOT(A != NAN) OR NOT(B != NAN). */
436 c = vec_nor (c, c);
437 return ((__m128d)vec_orc(c, d));
438#else
439 __v2du c, d;
440 /* Compare against self will return false (0's) if NAN. */
441 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
442 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
443 /* Invert so that true ('1's) marks a NAN. */
444 c = vec_nor (c, c);
445 d = vec_nor (d, d);
446 return ((__m128d)vec_or(c, d));
447#endif
448}
449
450extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
451_mm_cmpeq_sd(__m128d __A, __m128d __B)
452{
453 __v2df a, b, c;
454 /* PowerISA VSX does not allow partial (for just lower double)
455 results. So to ensure we don't generate spurious exceptions
456 (from the upper double values) we splat the lower double
457 before we do the operation. */
458 a = vec_splats (__A[0]);
459 b = vec_splats (__B[0]);
460 c = (__v2df) vec_cmpeq(a, b);
461 /* Then we merge the lower double result with the original upper
462 double from __A. */
463 return (__m128d) _mm_setr_pd (c[0], __A[1]);
464}
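/* For example, with __A = {1.0, NAN} and __B = {1.0, 3.0}, the
   splats keep the upper NaN out of the compare, so no spurious
   invalid-operation exception is raised; the result is the low
   all-ones mask paired with __A's original upper double. */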
465
466extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
467_mm_cmplt_sd (__m128d __A, __m128d __B)
468{
469 __v2df a, b, c;
470 a = vec_splats (__A[0]);
471 b = vec_splats (__B[0]);
472 c = (__v2df) vec_cmplt(a, b);
473 return (__m128d) _mm_setr_pd (c[0], __A[1]);
474}
475
476extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
477_mm_cmple_sd (__m128d __A, __m128d __B)
478{
479 __v2df a, b, c;
480 a = vec_splats (__A[0]);
481 b = vec_splats (__B[0]);
482 c = (__v2df) vec_cmple(a, b);
483 return (__m128d) _mm_setr_pd (c[0], __A[1]);
484}
485
486extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
487_mm_cmpgt_sd (__m128d __A, __m128d __B)
488{
489 __v2df a, b, c;
490 a = vec_splats (__A[0]);
491 b = vec_splats (__B[0]);
492 c = (__v2df) vec_cmpgt(a, b);
493 return (__m128d) _mm_setr_pd (c[0], __A[1]);
494}
495
496extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
497_mm_cmpge_sd (__m128d __A, __m128d __B)
498{
499 __v2df a, b, c;
500 a = vec_splats (__A[0]);
501 b = vec_splats (__B[0]);
502 c = (__v2df) vec_cmpge(a, b);
503 return (__m128d) _mm_setr_pd (c[0], __A[1]);
504}
505
506extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507_mm_cmpneq_sd (__m128d __A, __m128d __B)
508{
509 __v2df a, b, c;
510 a = vec_splats (__A[0]);
511 b = vec_splats (__B[0]);
512 c = (__v2df) vec_cmpeq(a, b);
513 c = vec_nor (c, c);
514 return (__m128d) _mm_setr_pd (c[0], __A[1]);
515}
516
517extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
518_mm_cmpnlt_sd (__m128d __A, __m128d __B)
519{
520 __v2df a, b, c;
521 a = vec_splats (__A[0]);
522 b = vec_splats (__B[0]);
523 /* Not less than is just greater than or equal. */
524 c = (__v2df) vec_cmpge(a, b);
525 return (__m128d) _mm_setr_pd (c[0], __A[1]);
526}
527
528extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529_mm_cmpnle_sd (__m128d __A, __m128d __B)
530{
531 __v2df a, b, c;
532 a = vec_splats (__A[0]);
533 b = vec_splats (__B[0]);
534 /* Not less than or equal is just greater than. */
535 c = (__v2df) vec_cmpgt(a, b);
536 return (__m128d) _mm_setr_pd (c[0], __A[1]);
537}
538
539extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
540_mm_cmpngt_sd (__m128d __A, __m128d __B)
541{
542 __v2df a, b, c;
543 a = vec_splats (__A[0]);
544 b = vec_splats (__B[0]);
545 /* Not greater than is just less than or equal. */
546 c = (__v2df) vec_cmple(a, b);
547 return (__m128d) _mm_setr_pd (c[0], __A[1]);
548}
549
550extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
551_mm_cmpnge_sd (__m128d __A, __m128d __B)
552{
553 __v2df a, b, c;
554 a = vec_splats (__A[0]);
555 b = vec_splats (__B[0]);
556 /* Not greater than or equal is just less than. */
557 c = (__v2df) vec_cmplt(a, b);
558 return (__m128d) _mm_setr_pd (c[0], __A[1]);
559}
560
561extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
562_mm_cmpord_sd (__m128d __A, __m128d __B)
563{
564 __v2df r;
565 r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
566 return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
567}
568
569extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
570_mm_cmpunord_sd (__m128d __A, __m128d __B)
571{
572 __v2df r;
573 r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
574 return (__m128d) _mm_setr_pd (r[0], __A[1]);
575}
576
577/* FIXME
578 The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
579 exactly the same because GCC for PowerPC only generates unordered
580 compares (scalar and vector).
581 Technically _mm_comieq_sd et al. should be using the ordered
582 compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
583 be OK. */
584extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
585_mm_comieq_sd (__m128d __A, __m128d __B)
586{
587 return (__A[0] == __B[0]);
588}
589
590extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591_mm_comilt_sd (__m128d __A, __m128d __B)
592{
593 return (__A[0] < __B[0]);
594}
595
596extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
597_mm_comile_sd (__m128d __A, __m128d __B)
598{
599 return (__A[0] <= __B[0]);
600}
601
602extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
603_mm_comigt_sd (__m128d __A, __m128d __B)
604{
605 return (__A[0] > __B[0]);
606}
607
608extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
609_mm_comige_sd (__m128d __A, __m128d __B)
610{
611 return (__A[0] >= __B[0]);
612}
613
614extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
615_mm_comineq_sd (__m128d __A, __m128d __B)
616{
617 return (__A[0] != __B[0]);
618}
619
620extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
621_mm_ucomieq_sd (__m128d __A, __m128d __B)
622{
623 return (__A[0] == __B[0]);
624}
625
626extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
627_mm_ucomilt_sd (__m128d __A, __m128d __B)
628{
629 return (__A[0] < __B[0]);
630}
631
632extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
633_mm_ucomile_sd (__m128d __A, __m128d __B)
634{
635 return (__A[0] <= __B[0]);
636}
637
638extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
639_mm_ucomigt_sd (__m128d __A, __m128d __B)
640{
641 return (__A[0] > __B[0]);
642}
643
644extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
645_mm_ucomige_sd (__m128d __A, __m128d __B)
646{
647 return (__A[0] >= __B[0]);
648}
649
650extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
651_mm_ucomineq_sd (__m128d __A, __m128d __B)
652{
653 return (__A[0] != __B[0]);
654}
655
656/* Create a vector of Qi, where i is the element number. */
657extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
658_mm_set_epi64x (long long __q1, long long __q0)
659{
660 return __extension__ (__m128i)(__v2di){ __q0, __q1 };
661}
662
663extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664_mm_set_epi64 (__m64 __q1, __m64 __q0)
665{
666 return _mm_set_epi64x ((long long)__q1, (long long)__q0);
667}
668
669extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
670_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
671{
672 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
673}
674
675extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
676_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
677 short __q3, short __q2, short __q1, short __q0)
678{
679 return __extension__ (__m128i)(__v8hi){
680 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
681}
682
683extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
684_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
685 char __q11, char __q10, char __q09, char __q08,
686 char __q07, char __q06, char __q05, char __q04,
687 char __q03, char __q02, char __q01, char __q00)
688{
689 return __extension__ (__m128i)(__v16qi){
690 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
691 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
692 };
693}
694
695/* Set all of the elements of the vector to A. */
696extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697_mm_set1_epi64x (long long __A)
698{
699 return _mm_set_epi64x (__A, __A);
700}
701
702extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
703_mm_set1_epi64 (__m64 __A)
704{
705 return _mm_set_epi64 (__A, __A);
706}
707
708extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709_mm_set1_epi32 (int __A)
710{
711 return _mm_set_epi32 (__A, __A, __A, __A);
712}
713
714extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
715_mm_set1_epi16 (short __A)
716{
717 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
718}
719
720extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
721_mm_set1_epi8 (char __A)
722{
723 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
724 __A, __A, __A, __A, __A, __A, __A, __A);
725}
726
727/* Create a vector of Qi, where i is the element number.
728 The parameter order is reversed from the _mm_set_epi* functions. */
729extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
730_mm_setr_epi64 (__m64 __q0, __m64 __q1)
731{
732 return _mm_set_epi64 (__q1, __q0);
733}
734
735extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
736_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
737{
738 return _mm_set_epi32 (__q3, __q2, __q1, __q0);
739}
740
741extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
742_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
743 short __q4, short __q5, short __q6, short __q7)
744{
745 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
746}
747
748extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
749_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
750 char __q04, char __q05, char __q06, char __q07,
751 char __q08, char __q09, char __q10, char __q11,
752 char __q12, char __q13, char __q14, char __q15)
753{
754 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
755 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
756}
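/* For example, _mm_setr_epi32 (0, 1, 2, 3) and _mm_set_epi32 (3, 2,
   1, 0) build the same vector, with 0 in element [0]. */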
757
758/* Load 128 bits of integer data. The address must be 16-byte aligned. */
759extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
760_mm_load_si128 (__m128i const *__P)
761{
762 return *__P;
763}
764
765extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
766_mm_loadu_si128 (__m128i_u const *__P)
767{
768 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
769}
770
771extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772_mm_loadl_epi64 (__m128i_u const *__P)
773{
774 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
775}
776
777extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778_mm_store_si128 (__m128i *__P, __m128i __B)
779{
780 vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
781}
782
783extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
784_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
785{
786 *__P = __B;
787}
788
789extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
790_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
791{
792 *(long long *)__P = ((__v2di)__B)[0];
793}
794
795extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
796_mm_movepi64_pi64 (__m128i_u __B)
797{
798 return (__m64) ((__v2di)__B)[0];
799}
800
801extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
802_mm_movpi64_epi64 (__m64 __A)
803{
804 return _mm_set_epi64 ((__m64)0LL, __A);
805}
806
807extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
808_mm_move_epi64 (__m128i __A)
809{
810 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
811}
812
813/* Create an undefined vector. */
814extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815_mm_undefined_si128 (void)
816{
817 __m128i __Y = __Y;
818 return __Y;
819}
820
821/* Create a vector of zeros. */
822extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
823_mm_setzero_si128 (void)
824{
825 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
826}
827
828#ifdef _ARCH_PWR8
829extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
830_mm_cvtepi32_pd (__m128i __A)
831{
832 __v2di val;
833 /* For LE we need Vector Unpack Low Signed Word, which
834 vec_unpackh generates on little-endian targets. */
835 val = (__v2di)vec_unpackh ((__v4si)__A);
836
837 return (__m128d)vec_ctf (val, 0);
838}
839#endif
840
841extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842_mm_cvtepi32_ps (__m128i __A)
843{
844 return ((__m128)vec_ctf((__v4si)__A, 0));
845}
846
847extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
848_mm_cvtpd_epi32 (__m128d __A)
849{
850 __v2df rounded = vec_rint (__A);
851 __v4si result, temp;
852 const __v4si vzero =
853 { 0, 0, 0, 0 };
854
855 /* VSX Vector truncate Double-Precision to integer and Convert to
856 Signed Integer Word format with Saturate. */
857 __asm__(
858 "xvcvdpsxws %x0,%x1"
859 : "=wa" (temp)
860 : "wa" (rounded)
861 : );
862
863#ifdef _ARCH_PWR8
864 temp = vec_mergeo (temp, temp);
865 result = (__v4si) vec_vpkudum ((__vector long long) temp,
866 (__vector long long) vzero);
867#else
868 {
869 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
870 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
871 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
872 }
873#endif
874 return (__m128i) result;
875}
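/* Note: vec_rint rounds using the current rounding mode (round to
   nearest even by default), so for example {2.5, -2.5} converts to
   {2, -2, 0, 0}, with the upper two words zeroed. */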
876
877extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
878_mm_cvtpd_pi32 (__m128d __A)
879{
880 __m128i result = _mm_cvtpd_epi32(__A);
881
882 return (__m64) result[0];
883}
884
885extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
886_mm_cvtpd_ps (__m128d __A)
887{
888 __v4sf result;
889 __v4si temp;
890 const __v4si vzero = { 0, 0, 0, 0 };
891
892 __asm__(
893 "xvcvdpsp %x0,%x1"
894 : "=wa" (temp)
895 : "wa" (__A)
896 : );
897
898#ifdef _ARCH_PWR8
899 temp = vec_mergeo (temp, temp);
900 result = (__v4sf) vec_vpkudum ((__vector long long) temp,
901 (__vector long long) vzero);
902#else
903 {
904 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
905 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
906 result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
907 }
908#endif
909 return ((__m128)result);
910}
911
912extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
913_mm_cvttpd_epi32 (__m128d __A)
914{
915 __v4si result;
916 __v4si temp;
917 const __v4si vzero = { 0, 0, 0, 0 };
918
919 /* VSX Vector truncate Double-Precision to integer and Convert to
920 Signed Integer Word format with Saturate. */
921 __asm__(
922 "xvcvdpsxws %x0,%x1"
923 : "=wa" (temp)
924 : "wa" (__A)
925 : );
926
927#ifdef _ARCH_PWR8
928 temp = vec_mergeo (temp, temp);
929 result = (__v4si) vec_vpkudum ((__vector long long) temp,
930 (__vector long long) vzero);
931#else
932 {
933 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
934 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
935 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
936 }
937#endif
938
939 return ((__m128i) result);
940}
941
942extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
943_mm_cvttpd_pi32 (__m128d __A)
944{
945 __m128i result = _mm_cvttpd_epi32 (__A);
946
947 return (__m64) result[0];
948}
949
950extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
951_mm_cvtsi128_si32 (__m128i __A)
952{
953 return ((__v4si)__A)[0];
954}
955
956#ifdef _ARCH_PWR8
957extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958_mm_cvtpi32_pd (__m64 __A)
959{
960 __v4si temp;
961 __v2di tmp2;
962 __v2df result;
963
964 temp = (__v4si)vec_splats (__A);
965 tmp2 = (__v2di)vec_unpackl (temp);
966 result = vec_ctf ((__vector signed long long) tmp2, 0);
967 return (__m128d)result;
968}
969#endif
970
971extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
972_mm_cvtps_epi32 (__m128 __A)
973{
974 __v4sf rounded;
975 __v4si result;
976
977 rounded = vec_rint((__v4sf) __A);
978 result = vec_cts (rounded, 0);
979 return (__m128i) result;
980}
981
982extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983_mm_cvttps_epi32 (__m128 __A)
984{
985 __v4si result;
986
987 result = vec_cts ((__v4sf) __A, 0);
988 return (__m128i) result;
989}
990
991extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992_mm_cvtps_pd (__m128 __A)
993{
994 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
995#ifdef vec_doubleh
996 return (__m128d) vec_doubleh ((__v4sf)__A);
997#else
998 /* Otherwise the compiler is older, so we need to generate the
999 equivalent code. */
1000 __v4sf a = (__v4sf)__A;
1001 __v4sf temp;
1002 __v2df result;
1003#ifdef __LITTLE_ENDIAN__
1004 /* The input float values are in elements {[0], [1]} but the convert
1005 instruction needs them in elements {[1], [3]}, so we use two
1006 shift left double vector word immediates to get the elements
1007 lined up. */
1008 temp = __builtin_vsx_xxsldwi (a, a, 3);
1009 temp = __builtin_vsx_xxsldwi (a, temp, 2);
1010#else
1011 /* The input float values are in elements {[0], [1]} but the convert
1012 instruction needs them in elements {[0], [2]}, so we use a
1013 merge-high word operation to get the elements
1014 lined up. */
1015 temp = vec_vmrghw (a, a);
1016#endif
1017 __asm__(
1018 " xvcvspdp %x0,%x1"
1019 : "=wa" (result)
1020 : "wa" (temp)
1021 : );
1022 return (__m128d) result;
1023#endif
1024}
1025
1026extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1027_mm_cvtsd_si32 (__m128d __A)
1028{
1029 __v2df rounded = vec_rint((__v2df) __A);
1030 int result = ((__v2df)rounded)[0];
1031
1032 return result;
1033}
1034/* Intel intrinsic. */
1035extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036_mm_cvtsd_si64 (__m128d __A)
1037{
1038 __v2df rounded = vec_rint ((__v2df) __A );
1039 long long result = ((__v2df) rounded)[0];
1040
1041 return result;
1042}
1043
1044/* Microsoft intrinsic. */
1045extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1046_mm_cvtsd_si64x (__m128d __A)
1047{
1048 return _mm_cvtsd_si64 ((__v2df)__A);
1049}
1050
1051extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1052_mm_cvttsd_si32 (__m128d __A)
1053{
1054 int result = ((__v2df)__A)[0];
1055
1056 return result;
1057}
1058
1059/* Intel intrinsic. */
1060extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1061_mm_cvttsd_si64 (__m128d __A)
1062{
1063 long long result = ((__v2df)__A)[0];
1064
1065 return result;
1066}
1067
1068/* Microsoft intrinsic. */
1069extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1070_mm_cvttsd_si64x (__m128d __A)
1071{
1072 return _mm_cvttsd_si64 (__A);
1073}
1074
1075extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1076_mm_cvtsd_ss (__m128 __A, __m128d __B)
1077{
1078 __v4sf result = (__v4sf)__A;
1079
1080#ifdef __LITTLE_ENDIAN__
1081 __v4sf temp_s;
1082 /* Copy double element[0] to element [1] for conversion. */
1083 __v2df temp_b = vec_splat((__v2df)__B, 0);
1084
1085 /* Pre-rotate __A left 3 (logically right 1) elements. */
1086 result = __builtin_vsx_xxsldwi (result, result, 3);
1087 /* Convert double to single float scalar in a vector. */
1088 __asm__(
1089 "xscvdpsp %x0,%x1"
1090 : "=wa" (temp_s)
1091 : "wa" (temp_b)
1092 : );
1093 /* Shift the resulting scalar into vector element [0]. */
1094 result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1095#else
1096 result [0] = ((__v2df)__B)[0];
1097#endif
1098 return (__m128) result;
1099}
1100
1101extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102_mm_cvtsi32_sd (__m128d __A, int __B)
1103{
1104 __v2df result = (__v2df)__A;
1105 double db = __B;
1106 result [0] = db;
1107 return (__m128d)result;
1108}
1109
1110/* Intel intrinsic. */
1111extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1112_mm_cvtsi64_sd (__m128d __A, long long __B)
1113{
1114 __v2df result = (__v2df)__A;
1115 double db = __B;
1116 result [0] = db;
1117 return (__m128d)result;
1118}
1119
1120/* Microsoft intrinsic. */
1121extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1122_mm_cvtsi64x_sd (__m128d __A, long long __B)
1123{
1124 return _mm_cvtsi64_sd (__A, __B);
1125}
1126
1127extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1128_mm_cvtss_sd (__m128d __A, __m128 __B)
1129{
1130#ifdef __LITTLE_ENDIAN__
1131 /* Use splat to move element [0] into position for the convert. */
1132 __v4sf temp = vec_splat ((__v4sf)__B, 0);
1133 __v2df res;
1134 /* Convert single float scalar to double in a vector. */
1135 __asm__(
1136 "xscvspdp %x0,%x1"
1137 : "=wa" (res)
1138 : "wa" (temp)
1139 : );
1140 return (__m128d) vec_mergel (res, (__v2df)__A);
1141#else
1142 __v2df res = (__v2df)__A;
1143 res [0] = ((__v4sf)__B) [0];
1144 return (__m128d) res;
1145#endif
1146}
1147
1148extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1149_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1150{
1151 __vector double result;
1152 const int litmsk = __mask & 0x3;
1153
1154 if (litmsk == 0)
1155 result = vec_mergeh (__A, __B);
1156#if __GNUC__ < 6
1157 else if (litmsk == 1)
1158 result = vec_xxpermdi (__B, __A, 2);
1159 else if (litmsk == 2)
1160 result = vec_xxpermdi (__B, __A, 1);
1161#else
1162 else if (litmsk == 1)
1163 result = vec_xxpermdi (__A, __B, 2);
1164 else if (litmsk == 2)
1165 result = vec_xxpermdi (__A, __B, 1);
1166#endif
1167 else
1168 result = vec_mergel (__A, __B);
1169
1170 return result;
1171}
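/* For example, with __A = {a0, a1} and __B = {b0, b1}:
     _mm_shuffle_pd (__A, __B, 0) -> {a0, b0}   (merge high)
     _mm_shuffle_pd (__A, __B, 1) -> {a1, b0}
     _mm_shuffle_pd (__A, __B, 2) -> {a0, b1}
     _mm_shuffle_pd (__A, __B, 3) -> {a1, b1}   (merge low) */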
1172
1173extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1174_mm_unpackhi_pd (__m128d __A, __m128d __B)
1175{
1176 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1177}
1178
1179extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1180_mm_unpacklo_pd (__m128d __A, __m128d __B)
1181{
1182 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1183}
1184
1185extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1186_mm_loadh_pd (__m128d __A, double const *__B)
1187{
1188 __v2df result = (__v2df)__A;
1189 result [1] = *__B;
1190 return (__m128d)result;
1191}
1192
1193extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1194_mm_loadl_pd (__m128d __A, double const *__B)
1195{
1196 __v2df result = (__v2df)__A;
1197 result [0] = *__B;
1198 return (__m128d)result;
1199}
1200
1201#ifdef _ARCH_PWR8
1202/* Intrinsic functions that require PowerISA 2.07 minimum. */
1203
1204/* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1205extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206_mm_movemask_pd (__m128d __A)
1207{
1208 __vector unsigned long long result;
1209 static const __vector unsigned int perm_mask =
1210 {
1211#ifdef __LITTLE_ENDIAN__
1212 0x80800040, 0x80808080, 0x80808080, 0x80808080
1213#else
1214 0x80808080, 0x80808080, 0x80808080, 0x80804000
1215#endif
1216 };
1217
1218 result = ((__vector unsigned long long)
1219 vec_vbpermq ((__vector unsigned char) __A,
1220 (__vector unsigned char) perm_mask));
1221
1222#ifdef __LITTLE_ENDIAN__
1223 return result[1];
1224#else
1225 return result[0];
1226#endif
1227}
1228#endif /* _ARCH_PWR8 */
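/* For example, _mm_movemask_pd (_mm_set_pd (-1.0, 1.0)) returns 0x2:
   bit 0 holds the sign bit of element [0], bit 1 that of element [1]. */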
1229
1230extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1231_mm_packs_epi16 (__m128i __A, __m128i __B)
1232{
1233 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1234}
1235
1236extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237_mm_packs_epi32 (__m128i __A, __m128i __B)
1238{
1239 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1240}
1241
1242extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1243_mm_packus_epi16 (__m128i __A, __m128i __B)
1244{
1245 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1246}
1247
1248extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1249_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1250{
1251 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1252}
1253
1254extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1256{
1257 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1258}
1259
1260extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1261_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1262{
1263 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1264}
1265
1266extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1267_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1268{
1269 return (__m128i) vec_mergel ((__vector long long) __A,
1270 (__vector long long) __B);
1271}
1272
1273extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1275{
1276 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1277}
1278
1279extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1281{
1282 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1283}
1284
1285extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1287{
1288 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1289}
1290
1291extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1292_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1293{
1294 return (__m128i) vec_mergeh ((__vector long long) __A,
1295 (__vector long long) __B);
1296}
1297
1298extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299_mm_add_epi8 (__m128i __A, __m128i __B)
1300{
1301 return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1302}
1303
1304extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305_mm_add_epi16 (__m128i __A, __m128i __B)
1306{
1307 return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1308}
1309
1310extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311_mm_add_epi32 (__m128i __A, __m128i __B)
1312{
1313 return (__m128i) ((__v4su)__A + (__v4su)__B);
1314}
1315
1316extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317_mm_add_epi64 (__m128i __A, __m128i __B)
1318{
1319 return (__m128i) ((__v2du)__A + (__v2du)__B);
1320}
1321
1322extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323_mm_adds_epi8 (__m128i __A, __m128i __B)
1324{
1325 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1326}
1327
1328extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329_mm_adds_epi16 (__m128i __A, __m128i __B)
1330{
1331 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1332}
1333
1334extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335_mm_adds_epu8 (__m128i __A, __m128i __B)
1336{
1337 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1338}
1339
1340extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341_mm_adds_epu16 (__m128i __A, __m128i __B)
1342{
1343 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1344}
1345
1346extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1347_mm_sub_epi8 (__m128i __A, __m128i __B)
1348{
1349 return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1350}
1351
1352extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353_mm_sub_epi16 (__m128i __A, __m128i __B)
1354{
1355 return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1356}
1357
1358extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359_mm_sub_epi32 (__m128i __A, __m128i __B)
1360{
1361 return (__m128i) ((__v4su)__A - (__v4su)__B);
1362}
1363
1364extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1365_mm_sub_epi64 (__m128i __A, __m128i __B)
1366{
1367 return (__m128i) ((__v2du)__A - (__v2du)__B);
1368}
1369
1370extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371_mm_subs_epi8 (__m128i __A, __m128i __B)
1372{
1373 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1374}
1375
1376extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1377_mm_subs_epi16 (__m128i __A, __m128i __B)
1378{
1379 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1380}
1381
1382extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383_mm_subs_epu8 (__m128i __A, __m128i __B)
1384{
1385 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1386}
1387
1388extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1389_mm_subs_epu16 (__m128i __A, __m128i __B)
1390{
1391 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1392}
1393
1394extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1395_mm_madd_epi16 (__m128i __A, __m128i __B)
1396{
1397 __vector signed int zero = {0, 0, 0, 0};
1398
1399 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1400}
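/* Each 32-bit result lane is the dot product of a pair of adjacent
   16-bit lanes; for example, with __A == __B == {1, 2, 3, 4, ...},
   lane 0 is 1*1 + 2*2 = 5 and lane 1 is 3*3 + 4*4 = 25. */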
1401
1402extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403_mm_mulhi_epi16 (__m128i __A, __m128i __B)
1404{
1405 __vector signed int w0, w1;
1406
1407 __vector unsigned char xform1 = {
1408#ifdef __LITTLE_ENDIAN__
1409 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1410 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1411#else
1412 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1413 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1414#endif
1415 };
1416
1417 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1418 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1419 return (__m128i) vec_perm (w0, w1, xform1);
1420}
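/* For example, _mm_mulhi_epi16 (_mm_set1_epi16 (0x4000),
   _mm_set1_epi16 (4)) yields 1 in every lane: 0x4000 * 4 = 0x10000,
   and only the high 16 bits of each product are kept. */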
1421
1422extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1423_mm_mullo_epi16 (__m128i __A, __m128i __B)
1424{
1425 return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1426}
1427
1428extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1429_mm_mul_su32 (__m64 __A, __m64 __B)
1430{
1431 unsigned int a = __A;
1432 unsigned int b = __B;
1433
1434 return ((__m64)a * (__m64)b);
1435}
1436
1437extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1438_mm_mul_epu32 (__m128i __A, __m128i __B)
1439{
1440#if __GNUC__ < 8
1441 __v2du result;
1442
1443#ifdef __LITTLE_ENDIAN__
1444 /* VMX Vector Multiply Odd Unsigned Word. */
1445 __asm__(
1446 "vmulouw %0,%1,%2"
1447 : "=v" (result)
1448 : "v" (__A), "v" (__B)
1449 : );
1450#else
1451 /* VMX Vector Multiply Even Unsigned Word. */
1452 __asm__(
1453 "vmuleuw %0,%1,%2"
1454 : "=v" (result)
1455 : "v" (__A), "v" (__B)
1456 : );
1457#endif
1458 return (__m128i) result;
1459#else
1460 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1461#endif
1462}
1463
1464extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1465_mm_slli_epi16 (__m128i __A, int __B)
1466{
1467 __v8hu lshift;
1468 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1469
1470 if (__B >= 0 && __B < 16)
1471 {
1472 if (__builtin_constant_p(__B))
1473 lshift = (__v8hu) vec_splat_s16(__B);
1474 else
1475 lshift = vec_splats ((unsigned short) __B);
1476
1477 result = vec_sl ((__v8hi) __A, lshift);
1478 }
1479
1480 return (__m128i) result;
1481}
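/* As with the SSE2 intrinsic, a shift count outside 0..15 produces
   zero rather than a modulo shift; e.g. _mm_slli_epi16 (x, 16) is
   all zeros. */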
1482
1483extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1484_mm_slli_epi32 (__m128i __A, int __B)
1485{
1486 __v4su lshift;
1487 __v4si result = { 0, 0, 0, 0 };
1488
1489 if (__B >= 0 && __B < 32)
1490 {
1491 if (__builtin_constant_p(__B) && __B < 16)
1492 lshift = (__v4su) vec_splat_s32(__B);
1493 else
1494 lshift = vec_splats ((unsigned int) __B);
1495
1496 result = vec_sl ((__v4si) __A, lshift);
1497 }
1498
1499 return (__m128i) result;
1500}
1501
1502#ifdef _ARCH_PWR8
1503extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1504_mm_slli_epi64 (__m128i __A, int __B)
1505{
1506 __v2du lshift;
1507 __v2di result = { 0, 0 };
1508
1509 if (__B >= 0 && __B < 64)
1510 {
1511 if (__builtin_constant_p(__B) && __B < 16)
1512 lshift = (__v2du) vec_splat_s32(__B);
1513 else
1514 lshift = (__v2du) vec_splats ((unsigned int) __B);
1515
1516 result = vec_sl ((__v2di) __A, lshift);
1517 }
1518
1519 return (__m128i) result;
1520}
1521#endif
1522
1523extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1524_mm_srai_epi16 (__m128i __A, int __B)
1525{
1526 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1527 __v8hi result;
1528
1529 if (__B < 16)
1530 {
1531 if (__builtin_constant_p(__B))
1532 rshift = (__v8hu) vec_splat_s16(__B);
1533 else
1534 rshift = vec_splats ((unsigned short) __B);
1535 }
1536 result = vec_sra ((__v8hi) __A, rshift);
1537
1538 return (__m128i) result;
1539}
1540
1541extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1542_mm_srai_epi32 (__m128i __A, int __B)
1543{
1544 __v4su rshift = { 31, 31, 31, 31 };
1545 __v4si result;
1546
1547 if (__B < 32)
1548 {
1549 if (__builtin_constant_p(__B))
1550 {
1551 if (__B < 16)
1552 rshift = (__v4su) vec_splat_s32(__B);
1553 else
1554 rshift = (__v4su) vec_splats((unsigned int)__B);
1555 }
1556 else
1557 rshift = vec_splats ((unsigned int) __B);
1558 }
1559 result = vec_sra ((__v4si) __A, rshift);
1560
1561 return (__m128i) result;
1562}
1563
1564extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1565_mm_bslli_si128 (__m128i __A, const int __N)
1566{
1567 __v16qu result;
1568 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1569
1570 if (__N < 16)
1571 result = vec_sld ((__v16qu) __A, zeros, __N);
1572 else
1573 result = zeros;
1574
1575 return (__m128i) result;
1576}
1577
1578extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1579_mm_bsrli_si128 (__m128i __A, const int __N)
1580{
1581 __v16qu result;
1582 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1583
1584 if (__N < 16)
1585#ifdef __LITTLE_ENDIAN__
1586 if (__builtin_constant_p(__N))
1587 /* Would like to use Vector Shift Left Double by Octet
1588 Immediate here to use the immediate form and avoid
1589 load of __N * 8 value into a separate VR. */
1590 result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1591 else
1592#endif
1593 {
1594 __v16qu shift = vec_splats((unsigned char)(__N*8));
1595#ifdef __LITTLE_ENDIAN__
1596 result = vec_sro ((__v16qu)__A, shift);
1597#else
1598 result = vec_slo ((__v16qu)__A, shift);
1599#endif
1600 }
1601 else
1602 result = zeros;
1603
1604 return (__m128i) result;
1605}
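/* These are whole-byte shifts of the full 128-bit value; for example
   _mm_bsrli_si128 (x, 8) moves the high quadword of x into the low
   quadword and zero-fills the rest. */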
1606
1607extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1608_mm_srli_si128 (__m128i __A, const int __N)
1609{
1610 return _mm_bsrli_si128 (__A, __N);
1611}
1612
1613extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1614_mm_slli_si128 (__m128i __A, const int _imm5)
1615{
1616 __v16qu result;
1617 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1618
1619 if (_imm5 < 16)
1620#ifdef __LITTLE_ENDIAN__
1621 result = vec_sld ((__v16qu) __A, zeros, _imm5);
1622#else
1623 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1624#endif
1625 else
1626 result = zeros;
1627
1628 return (__m128i) result;
1629}
1630
1631extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1633_mm_srli_epi16 (__m128i __A, int __B)
1634{
1635 __v8hu rshift;
1636 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1637
1638 if (__B < 16)
1639 {
1640 if (__builtin_constant_p(__B))
1641 rshift = (__v8hu) vec_splat_s16(__B);
1642 else
1643 rshift = vec_splats ((unsigned short) __B);
1644
1645 result = vec_sr ((__v8hi) __A, rshift);
1646 }
1647
1648 return (__m128i) result;
1649}
1650
1651extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1652_mm_srli_epi32 (__m128i __A, int __B)
1653{
1654 __v4su rshift;
1655 __v4si result = { 0, 0, 0, 0 };
1656
1657 if (__B < 32)
1658 {
1659 if (__builtin_constant_p(__B))
1660 {
1661 if (__B < 16)
1662 rshift = (__v4su) vec_splat_s32(__B);
1663 else
1664 rshift = (__v4su) vec_splats((unsigned int)__B);
1665 }
1666 else
1667 rshift = vec_splats ((unsigned int) __B);
1668
1669 result = vec_sr ((__v4si) __A, rshift);
1670 }
1671
1672 return (__m128i) result;
1673}
1674
1675#ifdef _ARCH_PWR8
1676extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1677_mm_srli_epi64 (__m128i __A, int __B)
1678{
1679 __v2du rshift;
1680 __v2di result = { 0, 0 };
1681
1682 if (__B < 64)
1683 {
1684 if (__builtin_constant_p(__B))
1685 {
1686 if (__B < 16)
1687 rshift = (__v2du) vec_splat_s32(__B);
1688 else
1689 rshift = (__v2du) vec_splats((unsigned long long)__B);
1690 }
1691 else
1692 rshift = (__v2du) vec_splats ((unsigned int) __B);
1693
1694 result = vec_sr ((__v2di) __A, rshift);
1695 }
1696
1697 return (__m128i) result;
1698}
1699#endif
1700
1701extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1702_mm_sll_epi16 (__m128i __A, __m128i __B)
1703{
1704 __v8hu lshift;
1705 __vector __bool short shmask;
1706 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1707 __v8hu result;
1708
1709#ifdef __LITTLE_ENDIAN__
1710 lshift = vec_splat ((__v8hu) __B, 0);
1711#else
1712 lshift = vec_splat ((__v8hu) __B, 3);
1713#endif
1714 shmask = vec_cmple (lshift, shmax);
1715 result = vec_sl ((__v8hu) __A, lshift);
1716 result = vec_sel ((__v8hu) shmask, result, shmask);
1717
1718 return (__m128i) result;
1719}
1720
1721extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1722_mm_sll_epi32 (__m128i __A, __m128i __B)
1723{
1724 __v4su lshift;
1725 __vector __bool int shmask;
1726 const __v4su shmax = { 32, 32, 32, 32 };
1727 __v4su result;
1728#ifdef __LITTLE_ENDIAN__
1729 lshift = vec_splat ((__v4su) __B, 0);
1730#else
1731 lshift = vec_splat ((__v4su) __B, 1);
1732#endif
1733 shmask = vec_cmplt (lshift, shmax);
1734 result = vec_sl ((__v4su) __A, lshift);
1735 result = vec_sel ((__v4su) shmask, result, shmask);
1736
1737 return (__m128i) result;
1738}
1739
1740#ifdef _ARCH_PWR8
1741extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1742_mm_sll_epi64 (__m128i __A, __m128i __B)
1743{
1744 __v2du lshift;
1745 __vector __bool long long shmask;
1746 const __v2du shmax = { 64, 64 };
1747 __v2du result;
1748
1749 lshift = vec_splat ((__v2du) __B, 0);
1750 shmask = vec_cmplt (lshift, shmax);
1751 result = vec_sl ((__v2du) __A, lshift);
1752 result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
1753
1754 return (__m128i) result;
1755}
1756#endif
1757
1758extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1759_mm_sra_epi16 (__m128i __A, __m128i __B)
1760{
1761 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1762 __v8hu rshift;
1763 __v8hi result;
1764
1765#ifdef __LITTLE_ENDIAN__
1766 rshift = vec_splat ((__v8hu)__B, 0);
1767#else
1768 rshift = vec_splat ((__v8hu)__B, 3);
1769#endif
1770 rshift = vec_min (rshift, rshmax);
1771 result = vec_sra ((__v8hi) __A, rshift);
1772
1773 return (__m128i) result;
1774}
1775
1776extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1777_mm_sra_epi32 (__m128i __A, __m128i __B)
1778{
1779 const __v4su rshmax = { 31, 31, 31, 31 };
1780 __v4su rshift;
1781 __v4si result;
1782
1783#ifdef __LITTLE_ENDIAN__
1784 rshift = vec_splat ((__v4su)__B, 0);
1785#else
1786 rshift = vec_splat ((__v4su)__B, 1);
1787#endif
1788 rshift = vec_min (rshift, rshmax);
1789 result = vec_sra ((__v4si) __A, rshift);
1790
1791 return (__m128i) result;
1792}
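Editor's note (illustrative): unlike the logical shifts, the arithmetic shifts never zero the result; SSE2 clamps the count to 15 or 31, so an oversize count fills each lane with copies of its sign bit, and vec_min against rshmax implements exactly that clamp. For example, with a hypothetical helper:

  #include <emmintrin.h>

  static __m128i sra_demo (void)
  {
    __m128i v = _mm_set1_epi32 (-16);
    /* Count 99 is clamped to 31: every lane becomes 0xFFFFFFFF (-1).  */
    return _mm_sra_epi32 (v, _mm_cvtsi32_si128 (99));
  }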
1793
1794extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1795_mm_srl_epi16 (__m128i __A, __m128i __B)
1796{
1797 __v8hu rshift;
1798 __vector __bool short shmask;
1799 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1800 __v8hu result;
1801
1802#ifdef __LITTLE_ENDIAN__
1803 rshift = vec_splat ((__v8hu) __B, 0);
1804#else
1805 rshift = vec_splat ((__v8hu) __B, 3);
1806#endif
1807 shmask = vec_cmple (rshift, shmax);
1808 result = vec_sr ((__v8hu) __A, rshift);
1809 result = vec_sel ((__v8hu) shmask, result, shmask);
1810
1811 return (__m128i) result;
1812}
1813
1814extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1815_mm_srl_epi32 (__m128i __A, __m128i __B)
1816{
1817 __v4su rshift;
1818 __vector __bool int shmask;
1819 const __v4su shmax = { 32, 32, 32, 32 };
1820 __v4su result;
1821
1822#ifdef __LITTLE_ENDIAN__
1823 rshift = vec_splat ((__v4su) __B, 0);
1824#else
1825 rshift = vec_splat ((__v4su) __B, 1);
1826#endif
1827 shmask = vec_cmplt (rshift, shmax);
1828 result = vec_sr ((__v4su) __A, rshift);
1829 result = vec_sel ((__v4su) shmask, result, shmask);
1830
1831 return (__m128i) result;
1832}
1833
1834#ifdef _ARCH_PWR8
1835extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1836_mm_srl_epi64 (__m128i __A, __m128i __B)
1837{
1838 __v2du rshift;
1839 __vector __bool long long shmask;
1840 const __v2du shmax = { 64, 64 };
1841 __v2du result;
1842
1843 rshift = vec_splat ((__v2du) __B, 0);
1844 shmask = vec_cmplt (rshift, shmax);
1845 result = vec_sr ((__v2du) __A, rshift);
1846 result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
1847
1848 return (__m128i) result;
1849}
1850#endif
1851
1852extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1853_mm_and_pd (__m128d __A, __m128d __B)
1854{
1855 return (vec_and ((__v2df) __A, (__v2df) __B));
1856}
1857
1858extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1859_mm_andnot_pd (__m128d __A, __m128d __B)
1860{
1861 return (vec_andc ((__v2df) __B, (__v2df) __A));
1862}
1863
1864extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1865_mm_or_pd (__m128d __A, __m128d __B)
1866{
1867 return (vec_or ((__v2df) __A, (__v2df) __B));
1868}
1869
1870extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1871_mm_xor_pd (__m128d __A, __m128d __B)
1872{
1873 return (vec_xor ((__v2df) __A, (__v2df) __B));
1874}
1875
1876extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1877_mm_and_si128 (__m128i __A, __m128i __B)
1878{
1879 return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
1880}
1881
1882extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1883_mm_andnot_si128 (__m128i __A, __m128i __B)
1884{
1885 return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
1886}
1887
1888extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1889_mm_or_si128 (__m128i __A, __m128i __B)
1890{
1891 return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
1892}
1893
1894extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1895_mm_xor_si128 (__m128i __A, __m128i __B)
1896{
1897 return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
1898}
1899
1900extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1901_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1902{
1903 return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
1904}
1905
1906extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1907_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1908{
1909 return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
1910}
1911
1912extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1913_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1914{
1915 return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
1916}
1917
1918extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1919_mm_cmplt_epi8 (__m128i __A, __m128i __B)
1920{
1921 return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
1922}
1923
1924extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1925_mm_cmplt_epi16 (__m128i __A, __m128i __B)
1926{
1927 return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
1928}
1929
1930extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1931_mm_cmplt_epi32 (__m128i __A, __m128i __B)
1932{
1933 return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
1934}
1935
1936extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1937_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1938{
1939 return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
1940}
1941
1942extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1943_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1944{
1945 return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
1946}
1947
1948extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1949_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1950{
1951 return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
1952}
1953
1954extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1955_mm_extract_epi16 (__m128i const __A, int const __N)
1956{
1957 return (unsigned short) ((__v8hi)__A)[__N & 7];
1958}
1959
1960extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1961_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
1962{
1963 __v8hi result = (__v8hi)__A;
1964
1965 result [(__N & 7)] = __D;
1966
1967 return (__m128i) result;
1968}
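Editor's note (illustrative): on x86 the selector of these two intrinsics must be a compile-time immediate; this port accepts a runtime int and reduces it modulo 8 (__N & 7), which agrees with x86 for in-range selectors. Usage sketch with a hypothetical helper:

  #include <emmintrin.h>

  static int extract_insert_demo (void)
  {
    __m128i v = _mm_set_epi16 (7, 6, 5, 4, 3, 2, 1, 0);  /* lane i holds i */
    int w = _mm_extract_epi16 (v, 3);                    /* 3 */
    __m128i u = _mm_insert_epi16 (v, 42, 3);             /* lane 3 := 42 */
    return w + _mm_extract_epi16 (u, 3);                 /* 3 + 42 = 45 */
  }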
1969
1970extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1971_mm_max_epi16 (__m128i __A, __m128i __B)
1972{
1973 return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
1974}
1975
1976extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1977_mm_max_epu8 (__m128i __A, __m128i __B)
1978{
1979 return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
1980}
1981
1982extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1983_mm_min_epi16 (__m128i __A, __m128i __B)
1984{
1985 return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
1986}
1987
1988extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1989_mm_min_epu8 (__m128i __A, __m128i __B)
1990{
1991 return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
1992}
1993
1994
1995#ifdef _ARCH_PWR8
1996/* Intrinsic functions that require PowerISA 2.07 minimum. */
1997
1998/* Creates a 16-bit mask from the most significant bits of the 16 8-bit elements. */
1999extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2000_mm_movemask_epi8 (__m128i __A)
2001{
2002 __vector unsigned long long result;
2003 static const __vector unsigned char perm_mask =
2004 {
2005 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
2006 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
2007 };
2008
2009 result = ((__vector unsigned long long)
2010 vec_vbpermq ((__vector unsigned char) __A,
2011 (__vector unsigned char) perm_mask));
2012
2013#ifdef __LITTLE_ENDIAN__
2014 return result[1];
2015#else
2016 return result[0];
2017#endif
2018}
2019#endif /* _ARCH_PWR8 */
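Editor's note (illustrative): vec_vbpermq gathers one bit of the 128-bit source per selector byte, with bit positions counted from the most significant bit of the quadword. The selectors 0x78, 0x70, ..., 0x00 step by 8 and therefore pick the top (sign) bit of each byte, assembling the x86-style 16-bit byte mask in one halfword of the result; the endian test then returns that halfword. For instance, with a hypothetical helper:

  #include <emmintrin.h>

  static int movemask_demo (void)
  {
    /* Sign bit set only in byte 0, so the mask is 0x0001.  */
    __m128i v = _mm_set_epi8 (0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, (char) 0x80);
    return _mm_movemask_epi8 (v);
  }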
2020
2021extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2022_mm_mulhi_epu16 (__m128i __A, __m128i __B)
2023{
2024 __v4su w0, w1;
2025 __v16qu xform1 = {
2026#ifdef __LITTLE_ENDIAN__
2027 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
2028 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
2029#else
2030 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
2031 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
2032#endif
2033 };
2034
2035 w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
2036 w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
2037 return (__m128i) vec_perm (w0, w1, xform1);
2038}
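Editor's note (illustrative): VMX has no instruction returning the high half of an unsigned 16x16 multiply, so the code forms full 32-bit products of the even lanes (vec_vmuleuh) and odd lanes (vec_vmulouh), then uses vec_perm with the endian-specific xform1 map to gather the high 16 bits of each product back into source-lane order. Per lane the result matches this scalar model:

  static unsigned short mulhi_u16_model (unsigned short a, unsigned short b)
  {
    return (unsigned short) (((unsigned int) a * b) >> 16);
  }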
2039
2040extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2041_mm_shufflehi_epi16 (__m128i __A, const int __mask)
2042{
2043 unsigned long element_selector_98 = __mask & 0x03;
2044 unsigned long element_selector_BA = (__mask >> 2) & 0x03;
2045 unsigned long element_selector_DC = (__mask >> 4) & 0x03;
2046 unsigned long element_selector_FE = (__mask >> 6) & 0x03;
2047 static const unsigned short permute_selectors[4] =
2048 {
2049#ifdef __LITTLE_ENDIAN__
2050 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2051#else
2052 0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2053#endif
2054 };
2055 __v2du pmask =
2056#ifdef __LITTLE_ENDIAN__
2057 { 0x1716151413121110UL, 0UL};
2058#else
2059 { 0x1011121314151617UL, 0UL};
2060#endif
2061 __m64_union t;
2062 __v2du a, r;
2063
2064 t.as_short[0] = permute_selectors[element_selector_98];
2065 t.as_short[1] = permute_selectors[element_selector_BA];
2066 t.as_short[2] = permute_selectors[element_selector_DC];
2067 t.as_short[3] = permute_selectors[element_selector_FE];
2068 pmask[1] = t.as_m64;
2069 a = (__v2du)__A;
2070 r = vec_perm (a, a, (__vector unsigned char)pmask);
2071 return (__m128i) r;
2072}
2073
2074extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2075_mm_shufflelo_epi16 (__m128i __A, const int __mask)
2076{
2077 unsigned long element_selector_10 = __mask & 0x03;
2078 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2079 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2080 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2081 static const unsigned short permute_selectors[4] =
2082 {
2083#ifdef __LITTLE_ENDIAN__
2084 0x0100, 0x0302, 0x0504, 0x0706
2085#else
2086 0x0001, 0x0203, 0x0405, 0x0607
2087#endif
2088 };
2089 __v2du pmask =
2090#ifdef __LITTLE_ENDIAN__
2091 { 0UL, 0x1f1e1d1c1b1a1918UL};
2092#else
2093 { 0UL, 0x18191a1b1c1d1e1fUL};
2094#endif
2095 __m64_union t;
2096 __v2du a, r;
2097 t.as_short[0] = permute_selectors[element_selector_10];
2098 t.as_short[1] = permute_selectors[element_selector_32];
2099 t.as_short[2] = permute_selectors[element_selector_54];
2100 t.as_short[3] = permute_selectors[element_selector_76];
2101 pmask[0] = t.as_m64;
2102 a = (__v2du)__A;
2103 r = vec_perm (a, a, (__vector unsigned char)pmask);
2104 return (__m128i) r;
2105}
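Editor's note (illustrative): _mm_shufflehi_epi16 and _mm_shufflelo_epi16 decode the four 2-bit fields of __mask into byte-pair selectors from permute_selectors, build a vec_perm control vector whose other half is an identity byte map (the 0x10...0x17 / 0x18...0x1f constants), and permute the vector against itself, so only the addressed half moves while the other half passes through unchanged. For example:

  #include <emmintrin.h>

  static __m128i reverse_low_words (__m128i v)
  {
    /* 0x1B selects lanes 3,2,1,0: reverses the low four 16-bit lanes;
       the high four lanes are unchanged.  */
    return _mm_shufflelo_epi16 (v, 0x1B);
  }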
2106
2107extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2108_mm_shuffle_epi32 (__m128i __A, const int __mask)
2109{
2110 unsigned long element_selector_10 = __mask & 0x03;
2111 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2112 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2113 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2114 static const unsigned int permute_selectors[4] =
2115 {
2116#ifdef __LITTLE_ENDIAN__
2117 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2118#else
2119 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2120#endif
2121 };
2122 __v4su t;
2123
2124 t[0] = permute_selectors[element_selector_10];
2125 t[1] = permute_selectors[element_selector_32];
2126 t[2] = permute_selectors[element_selector_54] + 0x10101010;
2127 t[3] = permute_selectors[element_selector_76] + 0x10101010;
2128 return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
2129}
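Editor's note (illustrative): the same decoding applies here with 32-bit granularity: each permute_selectors entry is a 4-byte run, and the 0x10101010 bias on t[2] and t[3] merely redirects those byte selections to vec_perm's second operand, which is the same vector here, so the selected values are unaffected. For example:

  #include <emmintrin.h>

  static __m128i broadcast_word0 (__m128i v)
  {
    /* Selector 0x00 replicates 32-bit lane 0 into all four lanes.  */
    return _mm_shuffle_epi32 (v, 0x00);
  }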
2130
2131extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2132_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
2133{
2134 __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2135 __v16qu mask, tmp;
2136 __m128i_u *p = (__m128i_u*)__C;
2137
2138 tmp = (__v16qu)_mm_loadu_si128(p);
2139 mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
2140 tmp = vec_sel (tmp, (__v16qu)__A, mask);
2141 _mm_storeu_si128 (p, (__m128i)tmp);
2142}
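Editor's note (illustrative): x86 MASKMOVDQU stores only the bytes whose mask sign bit is set; PowerISA has no byte-masked store, so this port emulates it with a read-modify-write of all 16 bytes (load, vec_sel, store). The logical effect matches x86, but unselected bytes are physically rewritten with the values just loaded, which is not safe against concurrent writers of the same 16 bytes. Usage sketch with a hypothetical helper:

  #include <emmintrin.h>

  static void maskmove_demo (char *dst)  /* at least 16 writable bytes */
  {
    __m128i data = _mm_set1_epi8 (0x55);
    /* Sign bit set in mask bytes 0 and 15 only.  */
    __m128i mask = _mm_set_epi8 ((char) 0x80, 0, 0, 0, 0, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, (char) 0x80);
    _mm_maskmoveu_si128 (data, mask, dst);  /* updates dst[0] and dst[15] */
  }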
2143
2144extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2145_mm_avg_epu8 (__m128i __A, __m128i __B)
2146{
2147 return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
2148}
2149
2150extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2151_mm_avg_epu16 (__m128i __A, __m128i __B)
2152{
2153 return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
2154}
2155
2156
2157extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2158_mm_sad_epu8 (__m128i __A, __m128i __B)
2159{
2160 __v16qu a, b;
2161 __v16qu vmin, vmax, vabsdiff;
2162 __v4si vsum;
2163 const __v4su zero = { 0, 0, 0, 0 };
2164 __v4si result;
2165
2166 a = (__v16qu) __A;
2167 b = (__v16qu) __B;
2168 vmin = vec_min (a, b);
2169 vmax = vec_max (a, b);
2170 vabsdiff = vec_sub (vmax, vmin);
2171 /* Sum four groups of bytes into integers. */
2172 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
2173 /* Sum across four integers with two integer results. */
2174 result = vec_sum2s (vsum, (__vector signed int) zero);
2175 /* Rotate the sums into the correct position. */
2176#ifdef __LITTLE_ENDIAN__
2177 result = vec_sld (result, result, 4);
2178#else
2179 result = vec_sld (result, result, 6);
2180#endif
2181
2182 return (__m128i) result;
2183}
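Editor's note (illustrative): the sequence computes |a - b| per byte as max - min, vec_sum4s adds each group of four byte differences into a 32-bit lane, vec_sum2s folds those into two 32-bit sums, and the final vec_sld rotates the sums into the low bits of each 64-bit lane, where _mm_sad_epu8 is defined to leave them. Worked example:

  #include <emmintrin.h>

  static __m128i sad_demo (void)
  {
    __m128i a = _mm_set1_epi8 ((char) 0xFF);
    __m128i b = _mm_setzero_si128 ();
    /* Each 8-byte group sums to 8 * 255 = 2040 in its 64-bit lane.  */
    return _mm_sad_epu8 (a, b);
  }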
2184
2185extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2186_mm_stream_si32 (int *__A, int __B)
2187{
2188 /* Use the data cache block touch for store transient. */
2189 __asm__ (
2190 "dcbtstt 0,%0"
2191 :
2192 : "b" (__A)
2193 : "memory"
2194 );
2195 *__A = __B;
2196}
2197
2198extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2199_mm_stream_si64 (long long int *__A, long long int __B)
2200{
2201 /* Use the data cache block touch for store transient. */
2202 __asm__ (
2203 " dcbtstt 0,%0"
2204 :
2205 : "b" (__A)
2206 : "memory"
2207 );
2208 *__A = __B;
2209}
2210
2211extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2212_mm_stream_si128 (__m128i *__A, __m128i __B)
2213{
2214 /* Use the data cache block touch for store transient. */
2215 __asm__ (
2216 "dcbtstt 0,%0"
2217 :
2218 : "b" (__A)
2219 : "memory"
2220 );
2221 *__A = __B;
2222}
2223
2224extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2225_mm_stream_pd (double *__A, __m128d __B)
2226{
2227 /* Use the data cache block touch for store transient. */
2228 __asm__ (
2229 "dcbtstt 0,%0"
2230 :
2231 : "b" (__A)
2232 : "memory"
2233 );
2234 *(__m128d*)__A = __B;
2235}
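Editor's note (illustrative): x86 non-temporal stores bypass the cache; the closest analogue used here is dcbtstt, a "store transient" touch hint on the destination cache block, followed by an ordinary store. The hint never changes semantics, and the 128-bit variants keep their 16-byte alignment requirement. Usage sketch:

  #include <emmintrin.h>

  static void stream_demo (__m128i *dst16 /* 16-byte aligned */, int *dst4)
  {
    _mm_stream_si128 (dst16, _mm_set1_epi32 (1));
    _mm_stream_si32 (dst4, 2);
  }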
2236
2237extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2238_mm_clflush (void const *__A)
2239{
2240 /* Use the data cache block flush. */
2241 __asm__ (
2242 "dcbf 0,%0"
2243 :
2244 : "b" (__A)
2245 : "memory"
2246 );
2247}
2248
2249extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2250_mm_lfence (void)
2251{
2252 /* Use a lightweight sync (lwsync) for load-to-load ordering. */
2253 __atomic_thread_fence (__ATOMIC_RELEASE);
2254}
2255
2256extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2257_mm_mfence (void)
2258{
2259 /* Use a heavyweight sync (hwsync) for any-to-any ordering. */
2260 __atomic_thread_fence (__ATOMIC_SEQ_CST);
2261}
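Editor's note (illustrative): rather than emitting sync instructions directly, the fences are written with the portable __atomic_thread_fence builtins, so _mm_lfence becomes a release fence (lwsync on POWER) and _mm_mfence a sequentially consistent fence (hwsync). The same effect in plain C11, as a hypothetical sketch:

  #include <stdatomic.h>

  static void fence_demo (void)
  {
    atomic_thread_fence (memory_order_release);  /* ~ _mm_lfence here */
    atomic_thread_fence (memory_order_seq_cst);  /* ~ _mm_mfence here */
  }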
2262
2263extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2264_mm_cvtsi32_si128 (int __A)
2265{
2266 return _mm_set_epi32 (0, 0, 0, __A);
2267}
2268
2269extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2270_mm_cvtsi64_si128 (long long __A)
2271{
2272 return __extension__ (__m128i)(__v2di){ __A, 0LL };
2273}
2274
2275/* Microsoft intrinsic. */
2276extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2277_mm_cvtsi64x_si128 (long long __A)
2278{
2279 return __extension__ (__m128i)(__v2di){ __A, 0LL };
2280}
2281
2282/* Casts between the various SP, DP, and INT vector types. Note that these
2283 perform no conversion of values; they just change the type. */
2284extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2285_mm_castpd_ps(__m128d __A)
2286{
2287 return (__m128) __A;
2288}
2289
2290extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2291_mm_castpd_si128(__m128d __A)
2292{
2293 return (__m128i) __A;
2294}
2295
2296extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2297_mm_castps_pd(__m128 __A)
2298{
2299 return (__m128d) __A;
2300}
2301
2302extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2303_mm_castps_si128(__m128 __A)
2304{
2305 return (__m128i) __A;
2306}
2307
2308extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2309_mm_castsi128_ps(__m128i __A)
2310{
2311 return (__m128) __A;
2312}
2313
2314extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2315_mm_castsi128_pd(__m128i __A)
2316{
2317 return (__m128d) __A;
2318}
2319
2320#else
2321#include_next <emmintrin.h>
2322#endif /* defined(__linux__) && defined(__ppc64__) */
2323
2324#endif /* EMMINTRIN_H_ */
__device__ __2f16 b
__device__ __2f16 float c
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
Definition altivec.h:9543
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition altivec.h:1625
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
Definition altivec.h:9633
static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b)
Definition altivec.h:4769
static __inline__ vector signed char __ATTRS_o_ai vec_sro(vector signed char __a, vector signed char __b)
Definition altivec.h:10073
static __inline__ vector signed char __ATTRS_o_ai vec_ld(int __a, const vector signed char *__b)
Definition altivec.h:3504
#define vec_ctf(__a, __b)
Definition altivec.h:2950
static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a, vector signed char __b)
Definition altivec.h:5696
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition altivec.h:13710
static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, int __b, vector signed char *__c)
Definition altivec.h:10278
static __inline__ vector signed char __ATTRS_o_ai vec_andc(vector signed char __a, vector signed char __b)
Definition altivec.h:1163
static __inline__ vector signed int __ATTRS_o_ai vec_sld(vector signed int, vector signed int, unsigned const int __c)
Definition altivec.h:8309
static __inline__ vector short __ATTRS_o_ai vec_unpackl(vector signed char __a)
Definition altivec.h:11807
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, vector int __b)
Definition altivec.h:11531
static __inline__ vector signed char __ATTRS_o_ai vec_and(vector signed char __a, vector signed char __b)
Definition altivec.h:810
static __inline__ vector signed char __ATTRS_o_ai vec_avg(vector signed char __a, vector signed char __b)
Definition altivec.h:1514
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition altivec.h:4804
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
Definition altivec.h:11232
static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a)
Definition altivec.h:9503
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
Definition altivec.h:560
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition altivec.h:7320
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition altivec.h:7834
static __inline__ vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a, vector signed char __b)
Definition altivec.h:4534
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition altivec.h:2196
static __inline__ vector signed char __ATTRS_o_ai vec_max(vector signed char __a, vector signed char __b)
Definition altivec.h:4281
static __inline__ vector signed char __ATTRS_o_ai vec_slo(vector signed char __a, vector signed char __b)
Definition altivec.h:9034
static __inline__ vector signed char __ATTRS_o_ai vec_nor(vector signed char __a, vector signed char __b)
Definition altivec.h:6098
static __inline__ vector bool char __ATTRS_o_ai vec_cmpge(vector signed char __a, vector signed char __b)
Definition altivec.h:2024
static __inline__ vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a, vector short __b)
Definition altivec.h:7202
static __inline__ vector signed char __ATTRS_o_ai vec_min(vector signed char __a, vector signed char __b)
Definition altivec.h:5185
#define vec_cts
Definition altivec.h:2981
static __inline__ vector signed char __ATTRS_o_ai vec_splat(vector signed char __a, unsigned const int __b)
Definition altivec.h:9240
static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a, vector signed char __b)
Definition altivec.h:6234
static __inline__ vector short __ATTRS_o_ai vec_unpackh(vector signed char __a)
Definition altivec.h:11668
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
Definition altivec.h:8088
static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a)
Definition altivec.h:9487
static __inline__ vector signed char __ATTRS_o_ai vec_abs(vector signed char __a)
Definition altivec.h:115
static __inline__ vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b)
Definition altivec.h:12223
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition altivec.h:1964
static __inline__ vector bool char __ATTRS_o_ai vec_cmple(vector signed char __a, vector signed char __b)
Definition altivec.h:2140
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
Definition altivec.h:7073
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition altivec.h:10963
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition emmintrin.h:1508
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition emmintrin.h:3862
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1044
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition emmintrin.h:4659
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition emmintrin.h:4737
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition emmintrin.h:2013
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition emmintrin.h:3699
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1018
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition emmintrin.h:1855
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit unsigned integer...
Definition emmintrin.h:4331
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition emmintrin.h:2432
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:577
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition emmintrin.h:56
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition emmintrin.h:201
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition emmintrin.h:117
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition emmintrin.h:4879
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition emmintrin.h:389
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition emmintrin.h:4399
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2897
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition emmintrin.h:2743
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:813
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1198
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition emmintrin.h:1641
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition emmintrin.h:2643
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition emmintrin.h:4104
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1172
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition emmintrin.h:3659
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1224
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition emmintrin.h:2222
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition emmintrin.h:1579
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition emmintrin.h:1338
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition emmintrin.h:3171
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:3095
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition emmintrin.h:1837
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition emmintrin.h:3315
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition emmintrin.h:1875
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition emmintrin.h:2588
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:734
#define _mm_slli_si128(a, imm)
Left-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition emmintrin.h:2820
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition emmintrin.h:181
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition emmintrin.h:4123
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition emmintrin.h:3336
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition emmintrin.h:2780
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:509
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition emmintrin.h:288
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition emmintrin.h:1724
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition emmintrin.h:4776
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:759
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition emmintrin.h:2763
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition emmintrin.h:3229
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:3114
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition emmintrin.h:3190
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition emmintrin.h:2492
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1146
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2955
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition emmintrin.h:407
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition emmintrin.h:2392
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition emmintrin.h:2326
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition emmintrin.h:1992
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition emmintrin.h:4947
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:3076
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition emmintrin.h:4631
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:785
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition emmintrin.h:3252
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:967
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition emmintrin.h:4797
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit signed integer values in the input and returns the differences in the c...
Definition emmintrin.h:2664
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:709
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:661
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition emmintrin.h:4703
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2995
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition emmintrin.h:4816
#define _mm_shuffle_pd(a, b, i)
Constructs a 128-bit floating-point vector of [2 x double] from two 128-bit vector parameters of [2 x...
Definition emmintrin.h:4846
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition emmintrin.h:4165
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition emmintrin.h:3209
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition emmintrin.h:139
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit unsigned integer values in the input and returns the differences in th...
Definition emmintrin.h:2725
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition emmintrin.h:4755
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:467
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition emmintrin.h:4596
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value,...
Definition emmintrin.h:1491
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition emmintrin.h:3412
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition emmintrin.h:4682
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition emmintrin.h:3273
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition emmintrin.h:1449
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition emmintrin.h:244
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition emmintrin.h:1932
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition emmintrin.h:4552
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition emmintrin.h:1528
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition emmintrin.h:1358
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition emmintrin.h:3587
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition emmintrin.h:4913
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1250
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition emmintrin.h:2352
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition emmintrin.h:2286
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition emmintrin.h:3461
#define _mm_load_pd1(dp)
Definition emmintrin.h:1606
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:3133
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2840
#define _mm_insert_epi16(a, b, imm)
Constructs a 128-bit integer vector by first making a copy of the 128-bit integer vector parameter,...
Definition emmintrin.h:4382
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:598
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition emmintrin.h:1400
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition emmintrin.h:1375
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition emmintrin.h:98
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition emmintrin.h:1778
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition emmintrin.h:2265
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition emmintrin.h:2244
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit signed integer values in the input and returns the differences in the ...
Definition emmintrin.h:2685
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition emmintrin.h:2570
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition emmintrin.h:1799
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition emmintrin.h:4575
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition emmintrin.h:1295
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition emmintrin.h:3963
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition emmintrin.h:2452
#define _mm_extract_epi16(a, imm)
Extracts 16 bits from a 128-bit integer vector of [8 x i16], using the immediate-value parameter as a...
Definition emmintrin.h:4358
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1096
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:556
#define _mm_shufflelo_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four lower 16-bit elements of a 128-bit integer vect...
Definition emmintrin.h:4462
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition emmintrin.h:2201
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition emmintrin.h:2472
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition emmintrin.h:371
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:992
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition emmintrin.h:2372
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition emmintrin.h:4862
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2935
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit signed integers,...
Definition emmintrin.h:4275
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition emmintrin.h:3804
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2975
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition emmintrin.h:1623
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1070
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:640
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition emmintrin.h:4896
#define _mm_bsrli_si128(a, imm)
Definition emmintrin.h:3040
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition emmintrin.h:2072
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2878
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition emmintrin.h:2552
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:941
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:841
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition emmintrin.h:2092
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition emmintrin.h:1972
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition emmintrin.h:2161
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition emmintrin.h:3548
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition emmintrin.h:3842
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers...
Definition emmintrin.h:4303
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2859
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition emmintrin.h:2117
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:916
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:488
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition emmintrin.h:3532
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:684
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition emmintrin.h:3294
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition emmintrin.h:4930
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition emmintrin.h:1911
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition emmintrin.h:350
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition emmintrin.h:3609
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition emmintrin.h:1423
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition emmintrin.h:2530
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:891
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:3152
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:866
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact...
Definition emmintrin.h:3445
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition emmintrin.h:3916
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition emmintrin.h:1751
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values.
Definition emmintrin.h:3747
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition emmintrin.h:3785
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it in the lower 64 bits of the result; the upper 64 bits are copied from the first operand.
Definition emmintrin.h:226
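Only the low lane is computed; the high lane is passed through from the first operand. A minimal sketch:

#include <emmintrin.h>

int main(void)
{
  __m128d a = _mm_set_pd(7.0, 0.0); /* upper lane = 7.0 */
  __m128d b = _mm_set_sd(9.0);      /* lower lane = 9.0 */
  __m128d r = _mm_sqrt_sd(a, b);    /* { sqrt(9.0) = 3.0, 7.0 } */
  double lo = _mm_cvtsd_f64(r);     /* 3.0 */
  (void)lo;
  return 0;
}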
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the specified double-precision floating-point values.
Definition emmintrin.h:1896
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer values.
Definition emmintrin.h:3631
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of values in the lower 64 bits of the result.
Definition emmintrin.h:268
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2916
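Since the shift count here is a plain int rather than an immediate, it can be computed at run time. A sketch using it as a cheap multiply (times8 is a hypothetical helper name):

#include <emmintrin.h>

static __m128i times8(__m128i v)
{
  return _mm_slli_epi64(v, 3); /* multiply both 64-bit lanes by 8 */
}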
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition emmintrin.h:75
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a)
Stores a 128-bit floating point vector of [2 x double] to a 128-bit aligned memory location.
Definition emmintrin.h:4146
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition emmintrin.h:1950
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to both vector elements of a 128-bit vector of [2 x double].
Definition emmintrin.h:1597
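A common pattern is broadcasting one scalar to both lanes and scaling pairs of doubles; a sketch (scale2 is a hypothetical helper, pointers assumed valid):

#include <emmintrin.h>

static void scale2(double *dst, const double *src, const double *k)
{
  __m128d factor = _mm_load1_pd(k); /* { *k, *k } */
  __m128d v = _mm_loadu_pd(src);    /* unaligned pair load */
  _mm_storeu_pd(dst, _mm_mul_pd(v, factor));
}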
#define _mm_shufflehi_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four upper 16-bit elements of a 128-bit integer vector parameter, using the immediate-value parameter as a specifier.
Definition emmintrin.h:4492
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vector containing the greater of each pair of values.
Definition emmintrin.h:332
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value from each comparison in the corresponding element of a 128-bit result vector of [8 x i16].
Definition emmintrin.h:2412
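Paired with _mm_max_epi16, this gives a branchless clamp of eight signed 16-bit samples; a sketch (clamp_epi16 is a hypothetical helper name):

#include <emmintrin.h>

static __m128i clamp_epi16(__m128i v, short lo, short hi)
{
  v = _mm_max_epi16(v, _mm_set1_epi16(lo)); /* raise lanes below lo */
  v = _mm_min_epi16(v, _mm_set1_epi16(hi)); /* lower lanes above hi */
  return v;
}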
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition emmintrin.h:4720
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition emmintrin.h:3995
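The destination must be 16-byte aligned; for arbitrary addresses, _mm_storeu_si128 (listed further down) is the safe choice. A sketch using a C11 _Alignas buffer (buf and store_zero are hypothetical names):

#include <emmintrin.h>

static _Alignas(16) long long buf[2];

static void store_zero(void)
{
  _mm_store_si128((__m128i *)buf, _mm_setzero_si128());
}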
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition emmintrin.h:158
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for equality.
Definition emmintrin.h:426
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition emmintrin.h:1819
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-precision floating-point values, returned in a 128-bit vector of [2 x double].
Definition emmintrin.h:1545
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair of values in the lower 64 bits of the result.
Definition emmintrin.h:312
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-point value.
Definition emmintrin.h:1562
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand, and the upper element is zero.
Definition emmintrin.h:3569
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition emmintrin.h:3823
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition emmintrin.h:3766
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition emmintrin.h:3496
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition emmintrin.h:2606
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving each result in the corresponding element of a 128-bit result vector of [16 x i8].
Definition emmintrin.h:2306
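The average is rounded up, i.e. (a + b + 1) >> 1 per lane, computed without overflow. A small sketch (out is a hypothetical buffer name):

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
  __m128i a = _mm_set1_epi8(1);
  __m128i b = _mm_set1_epi8(2);
  unsigned char out[16];
  /* (1 + 2 + 1) >> 1 = 2 in every lane. */
  _mm_storeu_si128((__m128i_u *)out, _mm_avg_epu8(a, b));
  printf("%u\n", out[0]); /* 2 */
  return 0;
}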
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vectors and returns the 64-bit product.
Definition emmintrin.h:2511
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-byte aligned memory location.
Definition emmintrin.h:2054
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them into a 128-bit result vector of [16 x i8].
Definition emmintrin.h:4525
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:3057
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition emmintrin.h:2031
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is unequal to the corresponding value in the second parameter.
Definition emmintrin.h:1276
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each sum in the corresponding element of a 128-bit result vector of [8 x i16].
Definition emmintrin.h:2139
#define _mm_bslli_si128(a, imm)
Left-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition emmintrin.h:2823
#define _mm_srli_si128(a, imm)
Right-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition emmintrin.h:3037
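This shifts by whole bytes, not bits, and the count must be a compile-time constant; shifting by 8 moves the high 64 bits into the low half. A sketch (high_to_low is a hypothetical helper name):

#include <emmintrin.h>

static __m128i high_to_low(__m128i v)
{
  return _mm_srli_si128(v, 8); /* upper 8 bytes -> lower 8; upper half zeroed */
}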
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit unsigned integer values in the input and returns the differences in the corresponding bytes in the destination; differences less than 0x00 are saturated to 0x00.
Definition emmintrin.h:2705
#define _mm_shuffle_epi32(a, imm)
Constructs a 128-bit integer vector by shuffling four 32-bit elements of a 128-bit integer vector parameter, using the immediate-value parameter as a specifier.
Definition emmintrin.h:4432
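The immediate selector is usually built with _MM_SHUFFLE from xmmintrin.h; _MM_SHUFFLE(0, 1, 2, 3) reverses the four elements. A sketch (reverse_epi32 is a hypothetical helper name):

#include <emmintrin.h>

static __m128i reverse_epi32(__m128i v)
{
  /* Result element 0 takes source element 3, element 1 takes element 2, ... */
  return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
}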
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] into two double-precision floating-point values, returned in a 128-bit vector of [2 x double].
Definition emmintrin.h:1315
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3977
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is unequal to the corresponding value in the second parameter.
Definition emmintrin.h:1122
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are ordered with respect to those in the second operand (neither value is NaN).
Definition emmintrin.h:532
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are less than those in the second operand.
Definition emmintrin.h:446
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition emmintrin.h:4011
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition emmintrin.h:15
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:3015
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral values.
Definition emmintrin.h:3885
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into two signed truncated (rounded toward zero) 32-bit integer values, returned in the lower 64 bits of a 128-bit vector of [4 x i32].
Definition emmintrin.h:1473
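Only two results fit, so they land in the low 64 bits and the upper two integer elements come back as zero. A sketch:

#include <emmintrin.h>

int main(void)
{
  __m128d d = _mm_set_pd(-2.7, 1.7); /* element 1 = -2.7, element 0 = 1.7 */
  __m128i i = _mm_cvttpd_epi32(d);   /* { 1, -2, 0, 0 } after truncation */
  (void)i;
  return 0;
}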
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition emmintrin.h:3428
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition emmintrin.h:2798
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
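One use is evicting a just-written line so the store reaches memory sooner; a sketch (write_and_flush is a hypothetical helper, and a fence such as _mm_mfence may still be needed for ordering):

#include <emmintrin.h>

static void write_and_flush(int *p, int v)
{
  *p = v;         /* write the value */
  _mm_clflush(p); /* flush and invalidate its cache line everywhere */
}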
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are not less than or equal to those in the second operand.
Definition emmintrin.h:619