ETISS 0.8.0
Extendable Translating Instruction Set Simulator (version 0.8.0)
xmmintrin.h
1/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11 User Guide and Reference, version 9.0. */
12
13#ifndef NO_WARN_X86_INTRINSICS
14/* This header file helps port code that explicitly uses Intel
15 intrinsics from x86_64 to powerpc64/powerpc64le.
16
17 Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
18 VMX/VSX ISA is a good match for vector float SIMD operations.
19 However, scalar float operations in vector (XMM) registers require
20 the POWER8 VSX ISA (2.07) level. There are differences in the data
21 format and placement of float scalars in the vector register, which
22 require extra steps to match SSE scalar float semantics on POWER.
23
24 Note that there are significant differences between the X86_64 MXCSR
25 and the PowerISA FPSCR/VSCR registers. It is recommended to use the
26 portable <fenv.h> interface instead of accessing the MXCSR directly.
27
28 Most SSE scalar float intrinsic operations can be performed more
29 efficiently as C language float scalar operations or optimized to
30 use vector SIMD operations. We recommend this for new applications. */
31#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
32#endif
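/* A portable alternative (illustrative sketch): where ported code would
   otherwise adjust the MXCSR rounding control, the C99 <fenv.h> interface
   can be used instead, e.g.
       #include <fenv.h>
       int saved_mode = fegetround ();
       fesetround (FE_TOWARDZERO);
       ...
       fesetround (saved_mode);
   (saved_mode is just an illustrative local name.) */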
33
34#ifndef _XMMINTRIN_H_INCLUDED
35#define _XMMINTRIN_H_INCLUDED
36
37#if defined(__linux__) && defined(__ppc64__)
38
39/* Define a four-value permute mask. */
40#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
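/* For example, _MM_SHUFFLE(3,2,1,0) == 0xE4 (keep elements in their
   original order) and _MM_SHUFFLE(0,1,2,3) == 0x1B (reverse them). */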
41
42#include <altivec.h>
43
44/* Avoid collisions between altivec.h and strict adherence to C++ and
45 C11 standards. This should eventually be done inside altivec.h itself,
46 but only after testing a full distro build. */
47#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
48 (defined(__STDC_VERSION__) && \
49 __STDC_VERSION__ >= 201112L))
50#undef vector
51#undef pixel
52#undef bool
53#endif
54
55/* We need type definitions from the MMX header file. */
56#include <mmintrin.h>
57
58/* Get _mm_malloc () and _mm_free (). */
59#if __STDC_HOSTED__
60#include <mm_malloc.h>
61#endif
62
63/* The Intel API is flexible enough that we must allow aliasing with other
64 vector types, and their scalar components. */
65typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
66
67/* Unaligned version of the same type. */
68typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
69 __aligned__ (1)));
70
71/* Internal data types for implementing the intrinsics. */
72typedef float __v4sf __attribute__ ((__vector_size__ (16)));
73
74/* Create an undefined vector. */
75extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76_mm_undefined_ps (void)
77{
78 __m128 __Y = __Y;
79 return __Y;
80}
81
82/* Create a vector of zeros. */
83extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
84_mm_setzero_ps (void)
85{
86 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
87}
88
89/* Load four SPFP values from P. The address must be 16-byte aligned. */
90extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91_mm_load_ps (float const *__P)
92{
93 return ((__m128)vec_ld(0, (__v4sf*)__P));
94}
95
96/* Load four SPFP values from P. The address need not be 16-byte aligned. */
97extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98_mm_loadu_ps (float const *__P)
99{
100 return (vec_vsx_ld(0, __P));
101}
102
103/* Load four SPFP values in reverse order. The address must be aligned. */
104extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105_mm_loadr_ps (float const *__P)
106{
107 __v4sf __tmp;
108 __m128 result;
109 static const __vector unsigned char permute_vector =
110 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
111 0x17, 0x10, 0x11, 0x12, 0x13 };
112
113 __tmp = vec_ld (0, (__v4sf *) __P);
114 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
115 return result;
116}
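/* For example, loading an aligned array {1.0f, 2.0f, 3.0f, 4.0f} with
   _mm_loadr_ps yields the vector {4.0f, 3.0f, 2.0f, 1.0f}. */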
117
118/* Create a vector with all four elements equal to F. */
119extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120_mm_set1_ps (float __F)
121{
122 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
123}
124
125extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126_mm_set_ps1 (float __F)
127{
128 return _mm_set1_ps (__F);
129}
130
131/* Create the vector [Z Y X W]. */
132extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
134{
135 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
136}
137
138/* Create the vector [W X Y Z]. */
139extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140_mm_setr_ps (float __Z, float __Y, float __X, float __W)
141{
142 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
143}
144
145/* Store four SPFP values. The address must be 16-byte aligned. */
146extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147_mm_store_ps (float *__P, __m128 __A)
148{
149 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
150}
151
152/* Store four SPFP values. The address need not be 16-byte aligned. */
153extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
154_mm_storeu_ps (float *__P, __m128 __A)
155{
156 *(__m128_u *)__P = __A;
157}
158
159/* Store four SPFP values in reverse order. The address must be aligned. */
160extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161_mm_storer_ps (float *__P, __m128 __A)
162{
163 __v4sf __tmp;
164 static const __vector unsigned char permute_vector =
165 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
166 0x17, 0x10, 0x11, 0x12, 0x13 };
167
168 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
169
170 _mm_store_ps (__P, __tmp);
171}
172
173/* Store the lower SPFP value across four words. */
174extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175_mm_store1_ps (float *__P, __m128 __A)
176{
177 __v4sf __va = vec_splat((__v4sf)__A, 0);
178 _mm_store_ps (__P, __va);
179}
180
181extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182_mm_store_ps1 (float *__P, __m128 __A)
183{
184 _mm_store1_ps (__P, __A);
185}
186
187/* Create a vector with element 0 as F and the rest zero. */
188extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
189_mm_set_ss (float __F)
190{
191 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
192}
193
194/* Sets the low SPFP value of A from the low value of B. */
195extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196_mm_move_ss (__m128 __A, __m128 __B)
197{
198 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
199
200 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
201}
202
203/* Create a vector with element 0 as *P and the rest zero. */
204extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205_mm_load_ss (float const *__P)
206{
207 return _mm_set_ss (*__P);
208}
209
210/* Stores the lower SPFP value. */
211extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212_mm_store_ss (float *__P, __m128 __A)
213{
214 *__P = ((__v4sf)__A)[0];
215}
216
217/* Perform the respective operation on the lower SPFP (single-precision
218 floating-point) values of A and B; the upper three SPFP values are
219 passed through from A. */
220
221extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222_mm_add_ss (__m128 __A, __m128 __B)
223{
224#ifdef _ARCH_PWR7
225 __m128 a, b, c;
226 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
227 /* PowerISA VSX does not allow partial (for just the lower float)
228 results. So to ensure we don't generate spurious exceptions
229 (from the upper float values) we splat the lower float
230 before we do the operation. */
231 a = vec_splat (__A, 0);
232 b = vec_splat (__B, 0);
233 c = a + b;
234 /* Then we merge the lower float result with the original upper
235 float elements from __A. */
236 return (vec_sel (__A, c, mask));
237#else
238 __A[0] = __A[0] + __B[0];
239 return (__A);
240#endif
241}
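/* For example, _mm_add_ss ({1,2,3,4}, {10,20,30,40}) yields {11,2,3,4}:
   only element 0 is added; elements 1-3 are copied from __A. */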
242
243extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
244_mm_sub_ss (__m128 __A, __m128 __B)
245{
246#ifdef _ARCH_PWR7
247 __m128 a, b, c;
248 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
249 /* PowerISA VSX does not allow partial (for just the lower float)
250 results. So to ensure we don't generate spurious exceptions
251 (from the upper float values) we splat the lower float
252 before we do the operation. */
253 a = vec_splat (__A, 0);
254 b = vec_splat (__B, 0);
255 c = a - b;
256 /* Then we merge the lower float result with the original upper
257 float elements from __A. */
258 return (vec_sel (__A, c, mask));
259#else
260 __A[0] = __A[0] - __B[0];
261 return (__A);
262#endif
263}
264
265extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
266_mm_mul_ss (__m128 __A, __m128 __B)
267{
268#ifdef _ARCH_PWR7
269 __m128 a, b, c;
270 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
271 /* PowerISA VSX does not allow partial (for just the lower float)
272 results. So to ensure we don't generate spurious exceptions
273 (from the upper float values) we splat the lower float
274 before we do the operation. */
275 a = vec_splat (__A, 0);
276 b = vec_splat (__B, 0);
277 c = a * b;
278 /* Then we merge the lower float result with the original upper
279 float elements from __A. */
280 return (vec_sel (__A, c, mask));
281#else
282 __A[0] = __A[0] * __B[0];
283 return (__A);
284#endif
285}
286
287extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288_mm_div_ss (__m128 __A, __m128 __B)
289{
290#ifdef _ARCH_PWR7
291 __m128 a, b, c;
292 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
293 /* PowerISA VSX does not allow partial (for just the lower float)
294 results. So to ensure we don't generate spurious exceptions
295 (from the upper float values) we splat the lower float
296 before we do the operation. */
297 a = vec_splat (__A, 0);
298 b = vec_splat (__B, 0);
299 c = a / b;
300 /* Then we merge the lower float result with the original upper
301 float elements from __A. */
302 return (vec_sel (__A, c, mask));
303#else
304 __A[0] = __A[0] / __B[0];
305 return (__A);
306#endif
307}
308
309extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
310_mm_sqrt_ss (__m128 __A)
311{
312 __m128 a, c;
313 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
314 /* PowerISA VSX does not allow partial (for just the lower float)
315 * results. So to ensure we don't generate spurious exceptions
316 * (from the upper float values) we splat the lower float
317 * before we do the operation. */
318 a = vec_splat (__A, 0);
319 c = vec_sqrt (a);
320 /* Then we merge the lower float result with the original upper
321 * float elements from __A. */
322 return (vec_sel (__A, c, mask));
323}
324
325/* Perform the respective operation on the four SPFP values in A and B. */
326extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327_mm_add_ps (__m128 __A, __m128 __B)
328{
329 return (__m128) ((__v4sf)__A + (__v4sf)__B);
330}
331
332extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333_mm_sub_ps (__m128 __A, __m128 __B)
334{
335 return (__m128) ((__v4sf)__A - (__v4sf)__B);
336}
337
338extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
339_mm_mul_ps (__m128 __A, __m128 __B)
340{
341 return (__m128) ((__v4sf)__A * (__v4sf)__B);
342}
343
344extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345_mm_div_ps (__m128 __A, __m128 __B)
346{
347 return (__m128) ((__v4sf)__A / (__v4sf)__B);
348}
349
350extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351_mm_sqrt_ps (__m128 __A)
352{
353 return (vec_sqrt ((__v4sf)__A));
354}
355
356extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357_mm_rcp_ps (__m128 __A)
358{
359 return (vec_re ((__v4sf)__A));
360}
361
362extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363_mm_rsqrt_ps (__m128 __A)
364{
365 return (vec_rsqrte (__A));
366}
367
368extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369_mm_rcp_ss (__m128 __A)
370{
371 __m128 a, c;
372 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
373 /* PowerISA VSX does not allow partial (for just the lower float)
374 * results. So to ensure we don't generate spurious exceptions
375 * (from the upper float values) we splat the lower float
376 * before we do the operation. */
377 a = vec_splat (__A, 0);
378 c = _mm_rcp_ps (a);
379 /* Then we merge the lower float result with the original upper
380 * float elements from __A. */
381 return (vec_sel (__A, c, mask));
382}
383
384extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385_mm_rsqrt_ss (__m128 __A)
386{
387 __m128 a, c;
388 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
389 /* PowerISA VSX does not allow partial (for just the lower float)
390 * results. So to ensure we don't generate spurious exceptions
391 * (from the upper float values) we splat the lower float
392 * before we do the operation. */
393 a = vec_splat (__A, 0);
394 c = vec_rsqrte (a);
395 /* Then we merge the lower float result with the original upper
396 * float elements from __A. */
397 return (vec_sel (__A, c, mask));
398}
399
400extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
401_mm_min_ss (__m128 __A, __m128 __B)
402{
403 __v4sf a, b, c;
404 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
405 /* PowerISA VSX does not allow partial (for just the lower float)
406 * results. So to ensure we don't generate spurious exceptions
407 * (from the upper float values) we splat the lower float
408 * before we do the operation. */
409 a = vec_splat ((__v4sf)__A, 0);
410 b = vec_splat ((__v4sf)__B, 0);
411 c = vec_min (a, b);
412 /* Then we merge the lower float result with the original upper
413 * float elements from __A. */
414 return (vec_sel ((__v4sf)__A, c, mask));
415}
416
417extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
418_mm_max_ss (__m128 __A, __m128 __B)
419{
420 __v4sf a, b, c;
421 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
422 /* PowerISA VSX does not allow partial (for just the lower float)
423 * results. So to ensure we don't generate spurious exceptions
424 * (from the upper float values) we splat the lower float
425 * before we do the operation. */
426 a = vec_splat (__A, 0);
427 b = vec_splat (__B, 0);
428 c = vec_max (a, b);
429 /* Then we merge the lower float result with the original upper
430 * float elements from __A. */
431 return (vec_sel ((__v4sf)__A, c, mask));
432}
433
434extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
435_mm_min_ps (__m128 __A, __m128 __B)
436{
437 __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
438 return vec_sel (__B, __A, m);
439}
440
441extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
442_mm_max_ps (__m128 __A, __m128 __B)
443{
444 __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
445 return vec_sel (__B, __A, m);
446}
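/* Note: using vec_cmpgt + vec_sel here (rather than vec_min/vec_max)
   follows the SSE convention that when either input is a NaN the second
   operand (__B) is returned. */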
447
448/* Perform logical bit-wise operations on 128-bit values. */
449extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
450_mm_and_ps (__m128 __A, __m128 __B)
451{
452 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
453// return __builtin_ia32_andps (__A, __B);
454}
455
456extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
457_mm_andnot_ps (__m128 __A, __m128 __B)
458{
459 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
460}
461
462extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463_mm_or_ps (__m128 __A, __m128 __B)
464{
465 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
466}
467
468extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
469_mm_xor_ps (__m128 __A, __m128 __B)
470{
471 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
472}
473
474/* Perform a comparison on the four SPFP values of A and B. For each
475 element, if the comparison is true, place a mask of all ones in the
476 result, otherwise a mask of zeros. */
477extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
478_mm_cmpeq_ps (__m128 __A, __m128 __B)
479{
480 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
481}
482
483extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
484_mm_cmplt_ps (__m128 __A, __m128 __B)
485{
486 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
487}
488
489extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
490_mm_cmple_ps (__m128 __A, __m128 __B)
491{
492 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
493}
494
495extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
496_mm_cmpgt_ps (__m128 __A, __m128 __B)
497{
498 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
499}
500
501extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
502_mm_cmpge_ps (__m128 __A, __m128 __B)
503{
504 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
505}
506
507extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
508_mm_cmpneq_ps (__m128 __A, __m128 __B)
509{
510 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
511 return ((__m128)vec_nor (temp, temp));
512}
513
514extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
515_mm_cmpnlt_ps (__m128 __A, __m128 __B)
516{
517 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
518}
519
520extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
521_mm_cmpnle_ps (__m128 __A, __m128 __B)
522{
523 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
524}
525
526extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527_mm_cmpngt_ps (__m128 __A, __m128 __B)
528{
529 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
530}
531
532extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
533_mm_cmpnge_ps (__m128 __A, __m128 __B)
534{
535 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
536}
537
538extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
539_mm_cmpord_ps (__m128 __A, __m128 __B)
540{
541 __vector unsigned int a, b;
542 __vector unsigned int c, d;
543 static const __vector unsigned int float_exp_mask =
544 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
545
546 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
547 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
548 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
549 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
550 return ((__m128 ) vec_and (c, d));
551}
552
553extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
554_mm_cmpunord_ps (__m128 __A, __m128 __B)
555{
556 __vector unsigned int a, b;
557 __vector unsigned int c, d;
558 static const __vector unsigned int float_exp_mask =
559 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
560
561 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
562 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
563 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
564 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
565 return ((__m128 ) vec_or (c, d));
566}
567
568/* Perform a comparison on the lower SPFP values of A and B. If the
569 comparison is true, place a mask of all ones in the result, otherwise a
570 mask of zeros. The upper three SPFP values are passed through from A. */
571extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
572_mm_cmpeq_ss (__m128 __A, __m128 __B)
573{
574 static const __vector unsigned int mask =
575 { 0xffffffff, 0, 0, 0 };
576 __v4sf a, b, c;
577 /* PowerISA VMX does not allow partial (for just element 0)
578 * results. So to ensure we don't generate spurious exceptions
579 * (from the upper elements) we splat the lower float
580 * before we do the operation. */
581 a = vec_splat ((__v4sf) __A, 0);
582 b = vec_splat ((__v4sf) __B, 0);
583 c = (__v4sf) vec_cmpeq(a, b);
584 /* Then we merge the lower float result with the original upper
585 * float elements from __A. */
586 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
587}
588
589extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
590_mm_cmplt_ss (__m128 __A, __m128 __B)
591{
592 static const __vector unsigned int mask =
593 { 0xffffffff, 0, 0, 0 };
594 __v4sf a, b, c;
595 /* PowerISA VMX does not allow partial (for just element 0)
596 * results. So to ensure we don't generate spurious exceptions
597 * (from the upper elements) we splat the lower float
598 * before we do the operation. */
599 a = vec_splat ((__v4sf) __A, 0);
600 b = vec_splat ((__v4sf) __B, 0);
601 c = (__v4sf) vec_cmplt(a, b);
602 /* Then we merge the lower float result with the original upper
603 * float elements from __A. */
604 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
605}
606
607extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608_mm_cmple_ss (__m128 __A, __m128 __B)
609{
610 static const __vector unsigned int mask =
611 { 0xffffffff, 0, 0, 0 };
612 __v4sf a, b, c;
613 /* PowerISA VMX does not allow partial (for just element 0)
614 * results. So to ensure we don't generate spurious exceptions
615 * (from the upper elements) we splat the lower float
616 * before we do the operation. */
617 a = vec_splat ((__v4sf) __A, 0);
618 b = vec_splat ((__v4sf) __B, 0);
619 c = (__v4sf) vec_cmple(a, b);
620 /* Then we merge the lower float result with the original upper
621 * float elements from __A. */
622 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
623}
624
625extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
626_mm_cmpgt_ss (__m128 __A, __m128 __B)
627{
628 static const __vector unsigned int mask =
629 { 0xffffffff, 0, 0, 0 };
630 __v4sf a, b, c;
631 /* PowerISA VMX does not allow partial (for just element 0)
632 * results. So to ensure we don't generate spurious exceptions
633 * (from the upper elements) we splat the lower float
634 * before we do the operation. */
635 a = vec_splat ((__v4sf) __A, 0);
636 b = vec_splat ((__v4sf) __B, 0);
637 c = (__v4sf) vec_cmpgt(a, b);
638 /* Then we merge the lower float result with the original upper
639 * float elements from __A. */
640 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
641}
642
643extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
644_mm_cmpge_ss (__m128 __A, __m128 __B)
645{
646 static const __vector unsigned int mask =
647 { 0xffffffff, 0, 0, 0 };
648 __v4sf a, b, c;
649 /* PowerISA VMX does not allow partial (for just element 0)
650 * results. So to ensure we don't generate spurious exceptions
651 * (from the upper elements) we splat the lower float
652 * before we do the operation. */
653 a = vec_splat ((__v4sf) __A, 0);
654 b = vec_splat ((__v4sf) __B, 0);
655 c = (__v4sf) vec_cmpge(a, b);
656 /* Then we merge the lower float result with the original upper
657 * float elements from __A. */
658 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
659}
660
661extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662_mm_cmpneq_ss (__m128 __A, __m128 __B)
663{
664 static const __vector unsigned int mask =
665 { 0xffffffff, 0, 0, 0 };
666 __v4sf a, b, c;
667 /* PowerISA VMX does not allow partial (for just element 0)
668 * results. So to ensure we don't generate spurious exceptions
669 * (from the upper elements) we splat the lower float
670 * before we do the operation. */
671 a = vec_splat ((__v4sf) __A, 0);
672 b = vec_splat ((__v4sf) __B, 0);
673 c = (__v4sf) vec_cmpeq(a, b);
674 c = vec_nor (c, c);
675 /* Then we merge the lower float result with the original upper
676 * float elements from __A. */
677 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
678}
679
680extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
681_mm_cmpnlt_ss (__m128 __A, __m128 __B)
682{
683 static const __vector unsigned int mask =
684 { 0xffffffff, 0, 0, 0 };
685 __v4sf a, b, c;
686 /* PowerISA VMX does not allow partial (for just element 0)
687 * results. So to ensure we don't generate spurious exceptions
688 * (from the upper elements) we splat the lower float
689 * before we do the operation. */
690 a = vec_splat ((__v4sf) __A, 0);
691 b = vec_splat ((__v4sf) __B, 0);
692 c = (__v4sf) vec_cmpge(a, b);
693 /* Then we merge the lower float result with the original upper
694 * float elements from __A. */
695 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
696}
697
698extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
699_mm_cmpnle_ss (__m128 __A, __m128 __B)
700{
701 static const __vector unsigned int mask =
702 { 0xffffffff, 0, 0, 0 };
703 __v4sf a, b, c;
704 /* PowerISA VMX does not allow partial (for just element 0)
705 * results. So to ensure we don't generate spurious exceptions
706 * (from the upper elements) we splat the lower float
707 * before we do the operation. */
708 a = vec_splat ((__v4sf) __A, 0);
709 b = vec_splat ((__v4sf) __B, 0);
710 c = (__v4sf) vec_cmpgt(a, b);
711 /* Then we merge the lower float result with the original upper
712 * float elements from __A. */
713 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
714}
715
716extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
717_mm_cmpngt_ss (__m128 __A, __m128 __B)
718{
719 static const __vector unsigned int mask =
720 { 0xffffffff, 0, 0, 0 };
721 __v4sf a, b, c;
722 /* PowerISA VMX does not allow partial (for just element 0)
723 * results. So to ensure we don't generate spurious exceptions
724 * (from the upper elements) we splat the lower float
725 * before we do the operation. */
726 a = vec_splat ((__v4sf) __A, 0);
727 b = vec_splat ((__v4sf) __B, 0);
728 c = (__v4sf) vec_cmple(a, b);
729 /* Then we merge the lower float result with the original upper
730 * float elements from __A. */
731 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
732}
733
734extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
735_mm_cmpnge_ss (__m128 __A, __m128 __B)
736{
737 static const __vector unsigned int mask =
738 { 0xffffffff, 0, 0, 0 };
739 __v4sf a, b, c;
740 /* PowerISA VMX does not allow partial (for just element 0)
741 * results. So to ensure we don't generate spurious exceptions
742 * (from the upper elements) we splat the lower float
743 * before we do the operation. */
744 a = vec_splat ((__v4sf) __A, 0);
745 b = vec_splat ((__v4sf) __B, 0);
746 c = (__v4sf) vec_cmplt(a, b);
747 /* Then we merge the lower float result with the original upper
748 * float elements from __A. */
749 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
750}
751
752extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
753_mm_cmpord_ss (__m128 __A, __m128 __B)
754{
755 __vector unsigned int a, b;
756 __vector unsigned int c, d;
757 static const __vector unsigned int float_exp_mask =
758 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
759 static const __vector unsigned int mask =
760 { 0xffffffff, 0, 0, 0 };
761
762 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
763 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
764 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
765 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
766 c = vec_and (c, d);
767 /* Then we merge the lower float result with the original upper
768 * float elements from __A. */
769 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
770}
771
772extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773_mm_cmpunord_ss (__m128 __A, __m128 __B)
774{
775 __vector unsigned int a, b;
776 __vector unsigned int c, d;
777 static const __vector unsigned int float_exp_mask =
778 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
779 static const __vector unsigned int mask =
780 { 0xffffffff, 0, 0, 0 };
781
782 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
783 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
784 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
785 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
786 c = vec_or (c, d);
787 /* Then we merge the lower float result with the original upper
788 * float elements from __A. */
789 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
790}
791
792/* Compare the lower SPFP values of A and B and return 1 if true
793 and 0 if false. */
794extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
795_mm_comieq_ss (__m128 __A, __m128 __B)
796{
797 return (__A[0] == __B[0]);
798}
799
800extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
801_mm_comilt_ss (__m128 __A, __m128 __B)
802{
803 return (__A[0] < __B[0]);
804}
805
806extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
807_mm_comile_ss (__m128 __A, __m128 __B)
808{
809 return (__A[0] <= __B[0]);
810}
811
812extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813_mm_comigt_ss (__m128 __A, __m128 __B)
814{
815 return (__A[0] > __B[0]);
816}
817
818extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
819_mm_comige_ss (__m128 __A, __m128 __B)
820{
821 return (__A[0] >= __B[0]);
822}
823
824extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825_mm_comineq_ss (__m128 __A, __m128 __B)
826{
827 return (__A[0] != __B[0]);
828}
829
830/* FIXME
831 * The _mm_ucomi??_ss implementations below are exactly the same as
832 * _mm_comi??_ss because GCC for PowerPC only generates unordered
833 * compares (scalar and vector).
834 * Technically _mm_comieq_ss et al. should be using the ordered
835 * compare and signal for QNaNs.
836 * The _mm_ucomieq_ss et al. should be OK as is.
837 */
838extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
839_mm_ucomieq_ss (__m128 __A, __m128 __B)
840{
841 return (__A[0] == __B[0]);
842}
843
844extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
845_mm_ucomilt_ss (__m128 __A, __m128 __B)
846{
847 return (__A[0] < __B[0]);
848}
849
850extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
851_mm_ucomile_ss (__m128 __A, __m128 __B)
852{
853 return (__A[0] <= __B[0]);
854}
855
856extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
857_mm_ucomigt_ss (__m128 __A, __m128 __B)
858{
859 return (__A[0] > __B[0]);
860}
861
862extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
863_mm_ucomige_ss (__m128 __A, __m128 __B)
864{
865 return (__A[0] >= __B[0]);
866}
867
868extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
869_mm_ucomineq_ss (__m128 __A, __m128 __B)
870{
871 return (__A[0] != __B[0]);
872}
873
874extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
875_mm_cvtss_f32 (__m128 __A)
876{
877 return ((__v4sf)__A)[0];
878}
879
880/* Convert the lower SPFP value to a 32-bit integer according to the current
881 rounding mode. */
882extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
883_mm_cvtss_si32 (__m128 __A)
884{
885 __m64 res = 0;
886#ifdef _ARCH_PWR8
887 double dtmp;
888 __asm__(
889#ifdef __LITTLE_ENDIAN__
890 "xxsldwi %x0,%x0,%x0,3;\n"
891#endif
892 "xscvspdp %x2,%x0;\n"
893 "fctiw %2,%2;\n"
894 "mfvsrd %1,%x2;\n"
895 : "+wa" (__A),
896 "=r" (res),
897 "=f" (dtmp)
898 : );
899#else
900 res = __builtin_rint(__A[0]);
901#endif
902 return (res);
903}
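/* For example, under the default round-to-nearest-even mode
   _mm_cvtss_si32 (_mm_set_ss (1.5f)) returns 2, while the truncating
   _mm_cvttss_si32 further below returns 1 for the same input. */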
904
905extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906_mm_cvt_ss2si (__m128 __A)
907{
908 return _mm_cvtss_si32 (__A);
909}
910
911/* Convert the lower SPFP value to a 32-bit integer according to the
912 current rounding mode. */
913
914/* Intel intrinsic. */
915extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916_mm_cvtss_si64 (__m128 __A)
917{
918 __m64 res = 0;
919#ifdef _ARCH_PWR8
920 double dtmp;
921 __asm__(
922#ifdef __LITTLE_ENDIAN__
923 "xxsldwi %x0,%x0,%x0,3;\n"
924#endif
925 "xscvspdp %x2,%x0;\n"
926 "fctid %2,%2;\n"
927 "mfvsrd %1,%x2;\n"
928 : "+wa" (__A),
929 "=r" (res),
930 "=f" (dtmp)
931 : );
932#else
933 res = __builtin_llrint(__A[0]);
934#endif
935 return (res);
936}
937
938/* Microsoft intrinsic. */
939extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
940_mm_cvtss_si64x (__m128 __A)
941{
942 return _mm_cvtss_si64 ((__v4sf) __A);
943}
944
945/* Constants for use with _mm_prefetch. */
946enum _mm_hint
947{
948 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
949 _MM_HINT_ET0 = 7,
950 _MM_HINT_ET1 = 6,
951 _MM_HINT_T0 = 3,
952 _MM_HINT_T1 = 2,
953 _MM_HINT_T2 = 1,
954 _MM_HINT_NTA = 0
955};
956
957/* Loads one cache line from address P to a location "closer" to the
958 processor. The selector I specifies the type of prefetch operation. */
959extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
960_mm_prefetch (const void *__P, enum _mm_hint __I)
961{
962 /* Current PowerPC hardware ignores the hint parameter. */
963 __builtin_prefetch (__P);
964}
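/* Usage sketch (illustrative): _mm_prefetch ((const char *) p + 128,
   _MM_HINT_T0); since the hint is ignored here, this is equivalent to a
   plain __builtin_prefetch of that address. */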
965
966/* Convert the two lower SPFP values to 32-bit integers according to the
967 current rounding mode. Return the integers in packed form. */
968extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969_mm_cvtps_pi32 (__m128 __A)
970{
971 /* Temporaries for the rounded values and the converted result. */
972 __v4sf temp, rounded;
973 __vector unsigned long long result;
974
975 /* Splat two lower SPFP values to both halves. */
976 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
977 rounded = vec_rint(temp);
978 result = (__vector unsigned long long) vec_cts (rounded, 0);
979
980 return (__m64) ((__vector long long) result)[0];
981}
982
983extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
984_mm_cvt_ps2pi (__m128 __A)
985{
986 return _mm_cvtps_pi32 (__A);
987}
988
989/* Truncate the lower SPFP value to a 32-bit integer. */
990extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
991_mm_cvttss_si32 (__m128 __A)
992{
993 /* Extract the lower float element. */
994 float temp = __A[0];
995 /* truncate to 32-bit integer and return. */
996 return temp;
997}
998
999extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1000_mm_cvtt_ss2si (__m128 __A)
1001{
1002 return _mm_cvttss_si32 (__A);
1003}
1004
1005/* Intel intrinsic. */
1006extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007_mm_cvttss_si64 (__m128 __A)
1008{
1009 /* Extract the lower float element. */
1010 float temp = __A[0];
1011 /* truncate to 64-bit integer and return. */
1012 return temp;
1013}
1014
1015/* Microsoft intrinsic. */
1016extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017_mm_cvttss_si64x (__m128 __A)
1018{
1019 /* Extract the lower float element. */
1020 float temp = __A[0];
1021 /* truncate to 64-bit integer and return. */
1022 return temp;
1023}
1024
1025/* Truncate the two lower SPFP values to 32-bit integers. Return the
1026 integers in packed form. */
1027extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1028_mm_cvttps_pi32 (__m128 __A)
1029{
1030 __v4sf temp;
1031 __vector unsigned long long result;
1032
1033 /* Splat two lower SPFP values to both halves. */
1034 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1035 result = (__vector unsigned long long) vec_cts (temp, 0);
1036
1037 return (__m64) ((__vector long long) result)[0];
1038}
1039
1040extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1041_mm_cvtt_ps2pi (__m128 __A)
1042{
1043 return _mm_cvttps_pi32 (__A);
1044}
1045
1046/* Convert B to a SPFP value and insert it as element zero in A. */
1047extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1048_mm_cvtsi32_ss (__m128 __A, int __B)
1049{
1050 float temp = __B;
1051 __A[0] = temp;
1052
1053 return __A;
1054}
1055
1056extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1057_mm_cvt_si2ss (__m128 __A, int __B)
1058{
1059 return _mm_cvtsi32_ss (__A, __B);
1060}
1061
1062/* Convert B to a SPFP value and insert it as element zero in A. */
1063/* Intel intrinsic. */
1064extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065_mm_cvtsi64_ss (__m128 __A, long long __B)
1066{
1067 float temp = __B;
1068 __A[0] = temp;
1069
1070 return __A;
1071}
1072
1073/* Microsoft intrinsic. */
1074extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1075_mm_cvtsi64x_ss (__m128 __A, long long __B)
1076{
1077 return _mm_cvtsi64_ss (__A, __B);
1078}
1079
1080/* Convert the two 32-bit values in B to SPFP form and insert them
1081 as the two lower elements in A. */
1082extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1083_mm_cvtpi32_ps (__m128 __A, __m64 __B)
1084{
1085 __vector signed int vm1;
1086 __vector float vf1;
1087
1088 vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1089 vf1 = (__vector float) vec_ctf (vm1, 0);
1090
1091 return ((__m128) (__vector unsigned long long)
1092 { ((__vector unsigned long long)vf1) [0],
1093 ((__vector unsigned long long)__A) [1]});
1094}
1095
1096extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1097_mm_cvt_pi2ps (__m128 __A, __m64 __B)
1098{
1099 return _mm_cvtpi32_ps (__A, __B);
1100}
1101
1102/* Convert the four signed 16-bit values in A to SPFP form. */
1103extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104_mm_cvtpi16_ps (__m64 __A)
1105{
1106 __vector signed short vs8;
1107 __vector signed int vi4;
1108 __vector float vf1;
1109
1110 vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1111 vi4 = vec_vupklsh (vs8);
1112 vf1 = (__vector float) vec_ctf (vi4, 0);
1113
1114 return (__m128) vf1;
1115}
1116
1117/* Convert the four unsigned 16-bit values in A to SPFP form. */
1118extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1119_mm_cvtpu16_ps (__m64 __A)
1120{
1121 const __vector unsigned short zero =
1122 { 0, 0, 0, 0, 0, 0, 0, 0 };
1123 __vector unsigned short vs8;
1124 __vector unsigned int vi4;
1125 __vector float vf1;
1126
1127 vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1128 vi4 = (__vector unsigned int) vec_mergel
1129#ifdef __LITTLE_ENDIAN__
1130 (vs8, zero);
1131#else
1132 (zero, vs8);
1133#endif
1134 vf1 = (__vector float) vec_ctf (vi4, 0);
1135
1136 return (__m128) vf1;
1137}
1138
1139/* Convert the low four signed 8-bit values in A to SPFP form. */
1140extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1141_mm_cvtpi8_ps (__m64 __A)
1142{
1143 __vector signed char vc16;
1144 __vector signed short vs8;
1145 __vector signed int vi4;
1146 __vector float vf1;
1147
1148 vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1149 vs8 = vec_vupkhsb (vc16);
1150 vi4 = vec_vupkhsh (vs8);
1151 vf1 = (__vector float) vec_ctf (vi4, 0);
1152
1153 return (__m128) vf1;
1154}
1155
1156/* Convert the low four unsigned 8-bit values in A to SPFP form. */
1157extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1158
1159_mm_cvtpu8_ps (__m64 __A)
1160{
1161 const __vector unsigned char zero =
1162 { 0, 0, 0, 0, 0, 0, 0, 0 };
1163 __vector unsigned char vc16;
1164 __vector unsigned short vs8;
1165 __vector unsigned int vi4;
1166 __vector float vf1;
1167
1168 vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1169#ifdef __LITTLE_ENDIAN__
1170 vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
1171 vi4 = (__vector unsigned int) vec_mergeh (vs8,
1172 (__vector unsigned short) zero);
1173#else
1174 vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
1175 vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
1176 vs8);
1177#endif
1178 vf1 = (__vector float) vec_ctf (vi4, 0);
1179
1180 return (__m128) vf1;
1181}
1182
1183/* Convert the four signed 32-bit values in A and B to SPFP form. */
1184extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1185_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1186{
1187 __vector signed int vi4;
1188 __vector float vf4;
1189
1190 vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
1191 vf4 = (__vector float) vec_ctf (vi4, 0);
1192 return (__m128) vf4;
1193}
1194
1195/* Convert the four SPFP values in A to four signed 16-bit integers. */
1196extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1197_mm_cvtps_pi16 (__m128 __A)
1198{
1199 __v4sf rounded;
1200 __vector signed int temp;
1201 __vector unsigned long long result;
1202
1203 rounded = vec_rint(__A);
1204 temp = vec_cts (rounded, 0);
1205 result = (__vector unsigned long long) vec_pack (temp, temp);
1206
1207 return (__m64) ((__vector long long) result)[0];
1208}
1209
1210/* Convert the four SPFP values in A to four signed 8-bit integers. */
1211extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212_mm_cvtps_pi8 (__m128 __A)
1213{
1214 __v4sf rounded;
1215 __vector signed int tmp_i;
1216 static const __vector signed int zero = {0, 0, 0, 0};
1217 __vector signed short tmp_s;
1218 __vector signed char res_v;
1219
1220 rounded = vec_rint(__A);
1221 tmp_i = vec_cts (rounded, 0);
1222 tmp_s = vec_pack (tmp_i, zero);
1223 res_v = vec_pack (tmp_s, tmp_s);
1224 return (__m64) ((__vector long long) res_v)[0];
1225}
1226
1227/* Selects four specific SPFP values from A and B based on MASK. */
1228extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1229
1230_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1231{
1232 unsigned long element_selector_10 = __mask & 0x03;
1233 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1234 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1235 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1236 static const unsigned int permute_selectors[4] =
1237 {
1238#ifdef __LITTLE_ENDIAN__
1239 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1240#else
1241 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1242#endif
1243 };
1244 __vector unsigned int t;
1245
1246 t[0] = permute_selectors[element_selector_10];
1247 t[1] = permute_selectors[element_selector_32];
1248 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1249 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1250 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1251}
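/* For example, _mm_shuffle_ps (A, B, _MM_SHUFFLE (1, 0, 3, 2)) yields
   { A[2], A[3], B[0], B[1] }: the two low selector fields index into __A
   and the two high selector fields index into __B. */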
1252
1253/* Selects and interleaves the upper two SPFP values from A and B. */
1254extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255_mm_unpackhi_ps (__m128 __A, __m128 __B)
1256{
1257 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1258}
1259
1260/* Selects and interleaves the lower two SPFP values from A and B. */
1261extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262_mm_unpacklo_ps (__m128 __A, __m128 __B)
1263{
1264 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1265}
1266
1267/* Sets the upper two SPFP values with 64-bits of data loaded from P;
1268 the lower two values are passed through from A. */
1269extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1270_mm_loadh_pi (__m128 __A, __m64 const *__P)
1271{
1272 __vector unsigned long long __a = (__vector unsigned long long)__A;
1273 __vector unsigned long long __p = vec_splats(*__P);
1274 __a [1] = __p [1];
1275
1276 return (__m128)__a;
1277}
1278
1279/* Stores the upper two SPFP values of A into P. */
1280extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281_mm_storeh_pi (__m64 *__P, __m128 __A)
1282{
1283 __vector unsigned long long __a = (__vector unsigned long long) __A;
1284
1285 *__P = __a[1];
1286}
1287
1288/* Moves the upper two values of B into the lower two values of A. */
1289extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290_mm_movehl_ps (__m128 __A, __m128 __B)
1291{
1292 return (__m128) vec_mergel ((__vector unsigned long long)__B,
1293 (__vector unsigned long long)__A);
1294}
1295
1296/* Moves the lower two values of B into the upper two values of A. */
1297extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298_mm_movelh_ps (__m128 __A, __m128 __B)
1299{
1300 return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1301 (__vector unsigned long long)__B);
1302}
1303
1304/* Sets the lower two SPFP values with 64-bits of data loaded from P;
1305 the upper two values are passed through from A. */
1306extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307_mm_loadl_pi (__m128 __A, __m64 const *__P)
1308{
1309 __vector unsigned long long __a = (__vector unsigned long long)__A;
1310 __vector unsigned long long __p = vec_splats(*__P);
1311 __a [0] = __p [0];
1312
1313 return (__m128)__a;
1314}
1315
1316/* Stores the lower two SPFP values of A into P. */
1317extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318_mm_storel_pi (__m64 *__P, __m128 __A)
1319{
1320 __vector unsigned long long __a = (__vector unsigned long long) __A;
1321
1322 *__P = __a[0];
1323}
1324
1325#ifdef _ARCH_PWR8
1326/* Intrinsic functions that require PowerISA 2.07 minimum. */
1327
1328/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1329extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1330_mm_movemask_ps (__m128 __A)
1331{
1332 __vector unsigned long long result;
1333 static const __vector unsigned int perm_mask =
1334 {
1335#ifdef __LITTLE_ENDIAN__
1336 0x00204060, 0x80808080, 0x80808080, 0x80808080
1337#else
1338 0x80808080, 0x80808080, 0x80808080, 0x00204060
1339#endif
1340 };
1341
1342 result = ((__vector unsigned long long)
1343 vec_vbpermq ((__vector unsigned char) __A,
1344 (__vector unsigned char) perm_mask));
1345
1346#ifdef __LITTLE_ENDIAN__
1347 return result[1];
1348#else
1349 return result[0];
1350#endif
1351}
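/* For example, _mm_movemask_ps of { -1.0f, 2.0f, -3.0f, 4.0f } returns
   0x5; bit i of the result is the sign bit of element i. */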
1352#endif /* _ARCH_PWR8 */
1353
1354/* Create a vector with all four elements equal to *P. */
1355extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1356_mm_load1_ps (float const *__P)
1357{
1358 return _mm_set1_ps (*__P);
1359}
1360
1361extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1362_mm_load_ps1 (float const *__P)
1363{
1364 return _mm_load1_ps (__P);
1365}
1366
1367/* Extracts one of the four words of A. The selector N must be immediate. */
1368extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369_mm_extract_pi16 (__m64 const __A, int const __N)
1370{
1371 unsigned int shiftr = __N & 3;
1372#ifdef __BIG_ENDIAN__
1373 shiftr = 3 - shiftr;
1374#endif
1375
1376 return ((__A >> (shiftr * 16)) & 0xffff);
1377}
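/* For example, if the four 16-bit elements of __A are {10, 20, 30, 40},
   _mm_extract_pi16 (__A, 2) returns 30 (zero-extended to int). */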
1378
1379extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1380_m_pextrw (__m64 const __A, int const __N)
1381{
1382 return _mm_extract_pi16 (__A, __N);
1383}
1384
1385/* Inserts word D into one of four words of A. The selector N must be
1386 immediate. */
1387extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1389{
1390 const int shiftl = (__N & 3) * 16;
1391 const __m64 shiftD = (const __m64) __D << shiftl;
1392 const __m64 mask = 0xffffUL << shiftl;
1393 __m64 result = (__A & (~mask)) | (shiftD & mask);
1394
1395 return (result);
1396}
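/* For example, _mm_insert_pi16 (__A, 99, 1) replaces 16-bit element 1 of
   __A with 99 and leaves the other three elements unchanged. */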
1397
1398extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1399_m_pinsrw (__m64 const __A, int const __D, int const __N)
1400{
1401 return _mm_insert_pi16 (__A, __D, __N);
1402}
1403
1404/* Compute the element-wise maximum of signed 16-bit values. */
1405extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1406
1407_mm_max_pi16 (__m64 __A, __m64 __B)
1408{
1409#if _ARCH_PWR8
1410 __vector signed short a, b, r;
1411 __vector __bool short c;
1412
1413 a = (__vector signed short)vec_splats (__A);
1414 b = (__vector signed short)vec_splats (__B);
1415 c = (__vector __bool short)vec_cmpgt (a, b);
1416 r = vec_sel (b, a, c);
1417 return (__m64) ((__vector long long) r)[0];
1418#else
1419 __m64_union m1, m2, res;
1420
1421 m1.as_m64 = __A;
1422 m2.as_m64 = __B;
1423
1424 res.as_short[0] =
1425 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1426 res.as_short[1] =
1427 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1428 res.as_short[2] =
1429 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1430 res.as_short[3] =
1431 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1432
1433 return (__m64) res.as_m64;
1434#endif
1435}
1436
1437extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1438_m_pmaxsw (__m64 __A, __m64 __B)
1439{
1440 return _mm_max_pi16 (__A, __B);
1441}
1442
1443/* Compute the element-wise maximum of unsigned 8-bit values. */
1444extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1445_mm_max_pu8 (__m64 __A, __m64 __B)
1446{
1447#if _ARCH_PWR8
1448 __vector unsigned char a, b, r;
1449 __vector __bool char c;
1450
1451 a = (__vector unsigned char)vec_splats (__A);
1452 b = (__vector unsigned char)vec_splats (__B);
1453 c = (__vector __bool char)vec_cmpgt (a, b);
1454 r = vec_sel (b, a, c);
1455 return (__m64) ((__vector long long) r)[0];
1456#else
1457 __m64_union m1, m2, res;
1458 long i;
1459
1460 m1.as_m64 = __A;
1461 m2.as_m64 = __B;
1462
1463
1464 for (i = 0; i < 8; i++)
1465 res.as_char[i] =
1466 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1467 m1.as_char[i] : m2.as_char[i];
1468
1469 return (__m64) res.as_m64;
1470#endif
1471}
1472
1473extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1474_m_pmaxub (__m64 __A, __m64 __B)
1475{
1476 return _mm_max_pu8 (__A, __B);
1477}
1478
1479/* Compute the element-wise minimum of signed 16-bit values. */
1480extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1481_mm_min_pi16 (__m64 __A, __m64 __B)
1482{
1483#if _ARCH_PWR8
1484 __vector signed short a, b, r;
1485 __vector __bool short c;
1486
1487 a = (__vector signed short)vec_splats (__A);
1488 b = (__vector signed short)vec_splats (__B);
1489 c = (__vector __bool short)vec_cmplt (a, b);
1490 r = vec_sel (b, a, c);
1491 return (__m64) ((__vector long long) r)[0];
1492#else
1493 __m64_union m1, m2, res;
1494
1495 m1.as_m64 = __A;
1496 m2.as_m64 = __B;
1497
1498 res.as_short[0] =
1499 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1500 res.as_short[1] =
1501 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1502 res.as_short[2] =
1503 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1504 res.as_short[3] =
1505 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1506
1507 return (__m64) res.as_m64;
1508#endif
1509}
1510
1511extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1512_m_pminsw (__m64 __A, __m64 __B)
1513{
1514 return _mm_min_pi16 (__A, __B);
1515}
1516
1517/* Compute the element-wise minimum of unsigned 8-bit values. */
1518extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1519_mm_min_pu8 (__m64 __A, __m64 __B)
1520{
1521#if _ARCH_PWR8
1522 __vector unsigned char a, b, r;
1523 __vector __bool char c;
1524
1525 a = (__vector unsigned char)vec_splats (__A);
1526 b = (__vector unsigned char)vec_splats (__B);
1527 c = (__vector __bool char)vec_cmplt (a, b);
1528 r = vec_sel (b, a, c);
1529 return (__m64) ((__vector long long) r)[0];
1530#else
1531 __m64_union m1, m2, res;
1532 long i;
1533
1534 m1.as_m64 = __A;
1535 m2.as_m64 = __B;
1536
1537
1538 for (i = 0; i < 8; i++)
1539 res.as_char[i] =
1540 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1541 m1.as_char[i] : m2.as_char[i];
1542
1543 return (__m64) res.as_m64;
1544#endif
1545}
1546
1547extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548_m_pminub (__m64 __A, __m64 __B)
1549{
1550 return _mm_min_pu8 (__A, __B);
1551}
1552
1553/* Create an 8-bit mask of the signs of 8-bit values. */
1554extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1555_mm_movemask_pi8 (__m64 __A)
1556{
1557 unsigned long long p =
1558#ifdef __LITTLE_ENDIAN__
1559 0x0008101820283038UL; // permute control for sign bits
1560#else
1561 0x3830282018100800UL; // permute control for sign bits
1562#endif
1563 return __builtin_bpermd (p, __A);
1564}
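/* For example, for an __m64 whose byte elements 0..7 are
   { 0x80, 0x01, 0xFF, 0x00, 0x7F, 0x80, 0x00, 0x00 } the result is 0x25;
   bit i of the result is the most significant bit of byte element i. */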
1565
1566extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1567_m_pmovmskb (__m64 __A)
1568{
1569 return _mm_movemask_pi8 (__A);
1570}
1571
1572/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1573 in B and produce the high 16 bits of the 32-bit results. */
1574extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1575_mm_mulhi_pu16 (__m64 __A, __m64 __B)
1576{
1577 __vector unsigned short a, b;
1578 __vector unsigned short c;
1579 __vector unsigned int w0, w1;
1580 __vector unsigned char xform1 = {
1581#ifdef __LITTLE_ENDIAN__
1582 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1583 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1584#else
1585 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1586 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1587#endif
1588 };
1589
1590 a = (__vector unsigned short)vec_splats (__A);
1591 b = (__vector unsigned short)vec_splats (__B);
1592
1593 w0 = vec_vmuleuh (a, b);
1594 w1 = vec_vmulouh (a, b);
1595 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1596
1597 return (__m64) ((__vector long long) c)[0];
1598}
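/* For example, for 16-bit elements 0xFFFF and 0x0002 the full product is
   0x0001FFFE, so _mm_mulhi_pu16 produces 0x0001 in that result element. */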
1599
1600extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1601_m_pmulhuw (__m64 __A, __m64 __B)
1602{
1603 return _mm_mulhi_pu16 (__A, __B);
1604}
1605
1606/* Return a combination of the four 16-bit values in A. The selector
1607 must be an immediate. */
1608extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1609_mm_shuffle_pi16 (__m64 __A, int const __N)
1610{
1611 unsigned long element_selector_10 = __N & 0x03;
1612 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1613 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1614 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1615 static const unsigned short permute_selectors[4] =
1616 {
1617#ifdef __LITTLE_ENDIAN__
1618 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1619#else
1620 0x0607, 0x0405, 0x0203, 0x0001
1621#endif
1622 };
1623 __m64_union t;
1624 __vector unsigned long long a, p, r;
1625
1626#ifdef __LITTLE_ENDIAN__
1627 t.as_short[0] = permute_selectors[element_selector_10];
1628 t.as_short[1] = permute_selectors[element_selector_32];
1629 t.as_short[2] = permute_selectors[element_selector_54];
1630 t.as_short[3] = permute_selectors[element_selector_76];
1631#else
1632 t.as_short[3] = permute_selectors[element_selector_10];
1633 t.as_short[2] = permute_selectors[element_selector_32];
1634 t.as_short[1] = permute_selectors[element_selector_54];
1635 t.as_short[0] = permute_selectors[element_selector_76];
1636#endif
1637 p = vec_splats (t.as_m64);
1638 a = vec_splats (__A);
1639 r = vec_perm (a, a, (__vector unsigned char)p);
1640 return (__m64) ((__vector long long) r)[0];
1641}
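/* Editor's usage sketch, not part of the upstream header: a hypothetical
   helper using the _MM_SHUFFLE macro defined at the top of this file.
   _MM_SHUFFLE(3,2,1,0) is the identity; _MM_SHUFFLE(0,1,2,3) reverses the
   four 16-bit elements.  */
static __inline __m64
example_shuffle_pi16_usage (void)
{
  __m64 a = _mm_set_pi16 (4, 3, 2, 1);  /* elements 0..3 hold 1, 2, 3, 4.  */
  /* Reversed: result elements 0..3 hold 4, 3, 2, 1.  */
  return _mm_shuffle_pi16 (a, _MM_SHUFFLE (0, 1, 2, 3));
}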
1642
1643extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1644_m_pshufw (__m64 __A, int const __N)
1645{
1646 return _mm_shuffle_pi16 (__A, __N);
1647}
1648
1649/* Conditionally store byte elements of A into P. The high bit of each
1650 byte in the selector N determines whether the corresponding byte from
1651 A is stored. */
1652extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1653_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1654{
1655 __m64 hibit = 0x8080808080808080UL;
1656 __m64 mask, tmp;
1657 __m64 *p = (__m64*)__P;
1658
1659 tmp = *p;
1660 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1661 tmp = (tmp & (~mask)) | (__A & mask);
1662 *p = tmp;
1663}
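/* Editor's usage sketch, not part of the upstream header: only elements
   whose mask byte has its high bit set are written back; the other byte
   elements of the destination are preserved.  The buffer argument is
   hypothetical and must cover at least 8 writable bytes.  */
static __inline void
example_maskmove_si64_usage (char *buf8)
{
  __m64 data = _mm_set1_pi8 (0x5A);
  /* High bit set in the four low-order mask bytes only, so just the four
     low-order byte elements of the destination are overwritten.  */
  __m64 mask = _mm_set_pi8 (0, 0, 0, 0, (char) 0x80, (char) 0x80,
                            (char) 0x80, (char) 0x80);
  _mm_maskmove_si64 (data, mask, buf8);
}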
1664
1665extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1666_m_maskmovq (__m64 __A, __m64 __N, char *__P)
1667{
1668 _mm_maskmove_si64 (__A, __N, __P);
1669}
1670
1671/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1672extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1673_mm_avg_pu8 (__m64 __A, __m64 __B)
1674{
1675 __vector unsigned char a, b, c;
1676
1677 a = (__vector unsigned char)vec_splats (__A);
1678 b = (__vector unsigned char)vec_splats (__B);
1679 c = vec_avg (a, b);
1680 return (__m64) ((__vector long long) c)[0];
1681}
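/* Editor's usage sketch, not part of the upstream header: the average is
   computed as (a + b + 1) >> 1 per byte, so ties round up: avg(1, 2) == 2
   and avg(0xFF, 0x01) == 0x80.  The helper is hypothetical.  */
static __inline __m64
example_avg_pu8_usage (void)
{
  __m64 a = _mm_set1_pi8 (1);
  __m64 b = _mm_set1_pi8 (2);
  /* Every result byte is 2.  */
  return _mm_avg_pu8 (a, b);
}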
1682
1683extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684_m_pavgb (__m64 __A, __m64 __B)
1685{
1686 return _mm_avg_pu8 (__A, __B);
1687}
1688
1689/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1690extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1691_mm_avg_pu16 (__m64 __A, __m64 __B)
1692{
1693 __vector unsigned short a, b, c;
1694
1695 a = (__vector unsigned short)vec_splats (__A);
1696 b = (__vector unsigned short)vec_splats (__B);
1697 c = vec_avg (a, b);
1698 return (__m64) ((__vector long long) c)[0];
1699}
1700
1701extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1702_m_pavgw (__m64 __A, __m64 __B)
1703{
1704 return _mm_avg_pu16 (__A, __B);
1705}
1706
1707/* Compute the sum of the absolute differences of the unsigned 8-bit
1708 values in A and B. Return the value in the lower 16-bit word; the
1709 upper words are cleared. */
1710extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1711_mm_sad_pu8 (__m64 __A, __m64 __B)
1712{
1713 __vector unsigned char a, b;
1714 __vector unsigned char vmin, vmax, vabsdiff;
1715 __vector signed int vsum;
1716 const __vector unsigned int zero =
1717 { 0, 0, 0, 0 };
1718 __m64_union result = {0};
1719
1720 a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1721 b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1722 vmin = vec_min (a, b);
1723 vmax = vec_max (a, b);
1724 vabsdiff = vec_sub (vmax, vmin);
1725 /* Sum four groups of bytes into integers. */
1726 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1727 /* Sum across four integers with integer result. */
1728 vsum = vec_sums (vsum, (__vector signed int) zero);
1729 /* The sum is in the rightmost 32 bits of the vector result.
1730 Transfer to a GPR and truncate to 16 bits. */
1731 result.as_short[0] = vsum[3];
1732 return result.as_m64;
1733}
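/* Editor's usage sketch, not part of the upstream header: the sum of the
   eight absolute byte differences ends up in the low 16-bit element; the
   remaining elements are zero.  The helper is hypothetical.  */
static __inline __m64
example_sad_pu8_usage (void)
{
  __m64 a = _mm_set_pi8 (10, 20, 30, 40, 50, 60, 70, 80);
  __m64 b = _mm_set_pi8 (11, 18, 33, 36, 55, 54, 77, 72);
  /* 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 = 36 in the low halfword.  */
  return _mm_sad_pu8 (a, b);
}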
1734
1735extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1736_m_psadbw (__m64 __A, __m64 __B)
1737{
1738 return _mm_sad_pu8 (__A, __B);
1739}
1740
1741/* Stores the data in A to the address P without polluting the caches. */
1742extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1743_mm_stream_pi (__m64 *__P, __m64 __A)
1744{
1745 /* Use the data cache block touch for store transient. */
1746 __asm__ (
1747 " dcbtstt 0,%0"
1748 :
1749 : "b" (__P)
1750 : "memory"
1751 );
1752 *__P = __A;
1753}
1754
1755/* Likewise. The address must be 16-byte aligned. */
1756extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757_mm_stream_ps (float *__P, __m128 __A)
1758{
1759 /* Use the data cache block touch for store transient. */
1760 __asm__ (
1761 " dcbtstt 0,%0"
1762 :
1763 : "b" (__P)
1764 : "memory"
1765 );
1766 _mm_store_ps (__P, __A);
1767}
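/* Editor's usage sketch, not part of the upstream header: the destination
   of _mm_stream_ps must be 16-byte aligned.  The static buffer below is
   hypothetical.  */
static __inline void
example_stream_ps_usage (void)
{
  static float out[4] __attribute__ ((__aligned__ (16)));
  _mm_stream_ps (out, _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f));
}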
1768
1769/* Guarantees that every preceding store is globally visible before
1770 any subsequent store. */
1771extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1772_mm_sfence (void)
1773{
1774 /* Generate a light weight sync. */
1775 __atomic_thread_fence (__ATOMIC_RELEASE);
1776}
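/* Editor's usage sketch, not part of the upstream header: a release fence
   is typically placed between writing a payload and setting a ready flag,
   so that a consumer which observes the flag also observes the payload.
   Both pointers are hypothetical.  */
static __inline void
example_sfence_usage (int *payload, volatile int *ready)
{
  *payload = 42;   /* 1. produce the data.  */
  _mm_sfence ();   /* 2. order the payload store before the flag store.  */
  *ready = 1;      /* 3. publish.  */
}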
1777
1778/* The execution of the next instruction is delayed by an implementation
1779 specific amount of time. The instruction does not modify the
1780 architectural state. This is after the pop_options pragma because
1781 it does not require SSE support in the processor--the encoding is a
1782 nop on processors that do not support it. */
1783extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1784_mm_pause (void)
1785{
1786 /* There is no exact match with this construct, but the following is
1787 close to the desired effect. */
1788#if _ARCH_PWR8
1789 /* On power8 and later processors we can depend on Program Priority
1790 (PRI) and associated "very low" PPI setting. Since we don't know
1791 what PPI this thread is running at we: 1) save the current PRI
1792 from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1793 via the special or 31,31,31 encoding. 3) issue an "isync" to
1794 ensure the PRI change takes effect before we execute any more
1795 instructions.
1796 Now we can execute a lwsync (release barrier) while we execute
1797 this thread at "very low" PRI. Finally we restore the original
1798 PRI and continue execution. */
1799 unsigned long __PPR;
1800
1801 __asm__ volatile (
1802 " mfppr %0;"
1803 " or 31,31,31;"
1804 " isync;"
1805 " lwsync;"
1806 " isync;"
1807 " mtppr %0;"
1808 : "=r" (__PPR)
1809 :
1810 : "memory"
1811 );
1812#else
1813 /* For older processors, where we may not even have Program Priority
1814 controls, we can only depend on Heavy Weight Sync. */
1815 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1816#endif
1817}
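/* Editor's usage sketch, not part of the upstream header: _mm_pause is
   normally called inside a spin-wait loop to lower the cost of busy
   waiting.  The flag pointer is hypothetical.  */
static __inline void
example_spin_wait_usage (volatile int *flag)
{
  while (*flag == 0)
    _mm_pause ();
}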
1818
1819/* Transpose the 4x4 matrix composed of row[0-3]. */
1820#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1821do { \
1822 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1823 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1824 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1825 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1826 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1827 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1828 (__vector long long)__t1); \
1829 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1830 (__vector long long)__t1); \
1831 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1832 (__vector long long)__t3); \
1833 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1834 (__vector long long)__t3); \
1835} while (0)
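/* Editor's usage sketch, not part of the upstream header: transposes a 4x4
   matrix of floats in place.  The helper and its argument are hypothetical;
   the array is assumed to be 16-byte aligned so each row can be loaded and
   stored with the aligned forms.  */
static __inline void
example_transpose_usage (float m[4][4])
{
  __m128 r0 = _mm_load_ps (m[0]);
  __m128 r1 = _mm_load_ps (m[1]);
  __m128 r2 = _mm_load_ps (m[2]);
  __m128 r3 = _mm_load_ps (m[3]);
  _MM_TRANSPOSE4_PS (r0, r1, r2, r3);  /* r0 now holds the original column 0.  */
  _mm_store_ps (m[0], r0);
  _mm_store_ps (m[1], r1);
  _mm_store_ps (m[2], r2);
  _mm_store_ps (m[3], r3);
}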
1836
1837/* For backward source compatibility. */
1838//# include <emmintrin.h>
1839
1840#else
1841#include_next <xmmintrin.h>
1842#endif /* defined(__linux__) && defined(__ppc64__) */
1843
1844#endif /* _XMMINTRIN_H_INCLUDED */