ETISS 0.8.0
Extendable Translating Instruction Set Simulator (version 0.8.0)
xmmintrin.h
1/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11 User Guide and Reference, version 9.0. */
12
13#ifndef NO_WARN_X86_INTRINSICS
14/* This header file helps port code that explicitly uses Intel
15 intrinsics from x86_64 to powerpc64/powerpc64le.
16
17 Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
18 VMX/VSX ISA is a good match for vector float SIMD operations.
19 However, scalar float operations in vector (XMM) registers require
20 the POWER8 VSX ISA (2.07) level. There are differences in the data
21 format and placement of float scalars in the vector register, which
22 require extra steps to match SSE scalar float semantics on POWER.
23
24 Note that there are significant differences between the X86_64 MXCSR
25 and the PowerISA FPSCR/VSCR registers. It is recommended to use the
26 portable <fenv.h> interface instead of accessing the MXCSR directly.
27
28 Most SSE scalar float intrinsic operations can be performed more
29 efficiently as C language float scalar operations or optimized to
30 use vector SIMD operations. We recommend this for new applications. */
31#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
32#endif
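/* A portable alternative (illustrative sketch): where ported code would
   otherwise adjust the MXCSR rounding control, the C99 <fenv.h> interface
   can be used instead, e.g.
       #include <fenv.h>
       int saved_mode = fegetround ();
       fesetround (FE_TOWARDZERO);
       ...
       fesetround (saved_mode);
   (saved_mode is just an illustrative local name.) */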
33
34#ifndef _XMMINTRIN_H_INCLUDED
35#define _XMMINTRIN_H_INCLUDED
36
37#if defined(__linux__) && defined(__ppc64__)
38
39/* Define a four-value permute mask. */
40#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
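/* For example, _MM_SHUFFLE(3,2,1,0) == 0xE4 (keep elements in their
   original order) and _MM_SHUFFLE(0,1,2,3) == 0x1B (reverse them). */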
41
42#include <altivec.h>
43
44/* Avoid collisions between altivec.h and strict adherence to C++ and
45 C11 standards. This should eventually be done inside altivec.h itself,
46 but only after testing a full distro build. */
47#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
48 (defined(__STDC_VERSION__) && \
49 __STDC_VERSION__ >= 201112L))
50#undef vector
51#undef pixel
52#undef bool
53#endif
54
55/* We need type definitions from the MMX header file. */
56#include <mmintrin.h>
57
58/* Get _mm_malloc () and _mm_free (). */
59#if __STDC_HOSTED__
60#include <mm_malloc.h>
61#endif
62
63/* The Intel API is flexible enough that we must allow aliasing with other
64 vector types, and their scalar components. */
65typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
66
67/* Unaligned version of the same type. */
68typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
69 __aligned__ (1)));
70
71/* Internal data types for implementing the intrinsics. */
72typedef float __v4sf __attribute__ ((__vector_size__ (16)));
73
74/* Create an undefined vector. */
75extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76_mm_undefined_ps (void)
77{
78 __m128 __Y = __Y;
79 return __Y;
80}
81
82/* Create a vector of zeros. */
83extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
84_mm_setzero_ps (void)
85{
86 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
87}
88
89/* Load four SPFP values from P. The address must be 16-byte aligned. */
90extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91_mm_load_ps (float const *__P)
92{
93 return ((__m128)vec_ld(0, (__v4sf*)__P));
94}
95
96/* Load four SPFP values from P. The address need not be 16-byte aligned. */
97extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98_mm_loadu_ps (float const *__P)
99{
100 return (vec_vsx_ld(0, __P));
101}
102
103/* Load four SPFP values in reverse order. The address must be aligned. */
104extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105_mm_loadr_ps (float const *__P)
106{
107 __v4sf __tmp;
108 __m128 result;
109 static const __vector unsigned char permute_vector =
110 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
111 0x17, 0x10, 0x11, 0x12, 0x13 };
112
113 __tmp = vec_ld (0, (__v4sf *) __P);
114 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
115 return result;
116}
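/* For example, loading an aligned array {1.0f, 2.0f, 3.0f, 4.0f} with
   _mm_loadr_ps yields the vector {4.0f, 3.0f, 2.0f, 1.0f}. */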
117
118/* Create a vector with all four elements equal to F. */
119extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120_mm_set1_ps (float __F)
121{
122 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
123}
124
125extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126_mm_set_ps1 (float __F)
127{
128 return _mm_set1_ps (__F);
129}
130
131/* Create the vector [Z Y X W]. */
132extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
134{
135 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
136}
137
138/* Create the vector [W X Y Z]. */
139extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140_mm_setr_ps (float __Z, float __Y, float __X, float __W)
141{
142 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
143}
144
145/* Store four SPFP values. The address must be 16-byte aligned. */
146extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147_mm_store_ps (float *__P, __m128 __A)
148{
149 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
150}
151
152/* Store four SPFP values. The address need not be 16-byte aligned. */
153extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
154_mm_storeu_ps (float *__P, __m128 __A)
155{
156 *(__m128_u *)__P = __A;
157}
158
159/* Store four SPFP values in reverse order. The address must be aligned. */
160extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161_mm_storer_ps (float *__P, __m128 __A)
162{
163 __v4sf __tmp;
164 static const __vector unsigned char permute_vector =
165 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
166 0x17, 0x10, 0x11, 0x12, 0x13 };
167
168 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
169
170 _mm_store_ps (__P, __tmp);
171}
172
173/* Store the lower SPFP value across four words. */
174extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175_mm_store1_ps (float *__P, __m128 __A)
176{
177 __v4sf __va = vec_splat((__v4sf)__A, 0);
178 _mm_store_ps (__P, __va);
179}
180
181extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182_mm_store_ps1 (float *__P, __m128 __A)
183{
184 _mm_store1_ps (__P, __A);
185}
186
187/* Create a vector with element 0 as F and the rest zero. */
188extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
189_mm_set_ss (float __F)
190{
191 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
192}
193
194/* Sets the low SPFP value of A from the low value of B. */
195extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196_mm_move_ss (__m128 __A, __m128 __B)
197{
198 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
199
200 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
201}
202
203/* Create a vector with element 0 as *P and the rest zero. */
204extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205_mm_load_ss (float const *__P)
206{
207 return _mm_set_ss (*__P);
208}
209
210/* Stores the lower SPFP value. */
211extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212_mm_store_ss (float *__P, __m128 __A)
213{
214 *__P = ((__v4sf)__A)[0];
215}
216
217/* Perform the respective operation on the lower SPFP (single-precision
218 floating-point) values of A and B; the upper three SPFP values are
219 passed through from A. */
220
221extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222_mm_add_ss (__m128 __A, __m128 __B)
223{
224#ifdef _ARCH_PWR7
225 __m128 a, b, c;
226 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
227 /* PowerISA VSX does not allow partial (for just the lower float)
228 results. So to ensure we don't generate spurious exceptions
229 (from the upper float values) we splat the lower float
230 before we do the operation. */
231 a = vec_splat (__A, 0);
232 b = vec_splat (__B, 0);
233 c = a + b;
234 /* Then we merge the lower float result with the original upper
235 float elements from __A. */
236 return (vec_sel (__A, c, mask));
237#else
238 __A[0] = __A[0] + __B[0];
239 return (__A);
240#endif
241}
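/* For example, _mm_add_ss ({1,2,3,4}, {10,20,30,40}) yields {11,2,3,4}:
   only element 0 is added; elements 1-3 are copied from __A. */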
242
243extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
244_mm_sub_ss (__m128 __A, __m128 __B)
245{
246#ifdef _ARCH_PWR7
247 __m128 a, b, c;
248 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
249 /* PowerISA VSX does not allow partial (for just the lower float)
250 results. So to ensure we don't generate spurious exceptions
251 (from the upper float values) we splat the lower float
252 before we do the operation. */
253 a = vec_splat (__A, 0);
254 b = vec_splat (__B, 0);
255 c = a - b;
256 /* Then we merge the lower float result with the original upper
257 float elements from __A. */
258 return (vec_sel (__A, c, mask));
259#else
260 __A[0] = __A[0] - __B[0];
261 return (__A);
262#endif
263}
264
265extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
266_mm_mul_ss (__m128 __A, __m128 __B)
267{
268#ifdef _ARCH_PWR7
269 __m128 a, b, c;
270 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
271 /* PowerISA VSX does not allow partial (for just the lower float)
272 results. So to ensure we don't generate spurious exceptions
273 (from the upper float values) we splat the lower float
274 before we do the operation. */
275 a = vec_splat (__A, 0);
276 b = vec_splat (__B, 0);
277 c = a * b;
278 /* Then we merge the lower float result with the original upper
279 float elements from __A. */
280 return (vec_sel (__A, c, mask));
281#else
282 __A[0] = __A[0] * __B[0];
283 return (__A);
284#endif
285}
286
287extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288_mm_div_ss (__m128 __A, __m128 __B)
289{
290#ifdef _ARCH_PWR7
291 __m128 a, b, c;
292 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
293 /* PowerISA VSX does not allow partial (for just the lower float)
294 results. So to ensure we don't generate spurious exceptions
295 (from the upper float values) we splat the lower float
296 before we do the operation. */
297 a = vec_splat (__A, 0);
298 b = vec_splat (__B, 0);
299 c = a / b;
300 /* Then we merge the lower float result with the original upper
301 float elements from __A. */
302 return (vec_sel (__A, c, mask));
303#else
304 __A[0] = __A[0] / __B[0];
305 return (__A);
306#endif
307}
308
309extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
310_mm_sqrt_ss (__m128 __A)
311{
312 __m128 a, c;
313 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
314 /* PowerISA VSX does not allow partial (for just the lower float)
315 * results. So to ensure we don't generate spurious exceptions
316 * (from the upper float values) we splat the lower float
317 * before we do the operation. */
318 a = vec_splat (__A, 0);
319 c = vec_sqrt (a);
320 /* Then we merge the lower float result with the original upper
321 * float elements from __A. */
322 return (vec_sel (__A, c, mask));
323}
324
325/* Perform the respective operation on the four SPFP values in A and B. */
326extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327_mm_add_ps (__m128 __A, __m128 __B)
328{
329 return (__m128) ((__v4sf)__A + (__v4sf)__B);
330}
331
332extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333_mm_sub_ps (__m128 __A, __m128 __B)
334{
335 return (__m128) ((__v4sf)__A - (__v4sf)__B);
336}
337
338extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
339_mm_mul_ps (__m128 __A, __m128 __B)
340{
341 return (__m128) ((__v4sf)__A * (__v4sf)__B);
342}
343
344extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345_mm_div_ps (__m128 __A, __m128 __B)
346{
347 return (__m128) ((__v4sf)__A / (__v4sf)__B);
348}
349
350extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351_mm_sqrt_ps (__m128 __A)
352{
353 return (vec_sqrt ((__v4sf)__A));
354}
355
356extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357_mm_rcp_ps (__m128 __A)
358{
359 return (vec_re ((__v4sf)__A));
360}
361
362extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363_mm_rsqrt_ps (__m128 __A)
364{
365 return (vec_rsqrte (__A));
366}
367
368extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369_mm_rcp_ss (__m128 __A)
370{
371 __m128 a, c;
372 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
373 /* PowerISA VSX does not allow partial (for just the lower float)
374 * results. So to ensure we don't generate spurious exceptions
375 * (from the upper float values) we splat the lower float
376 * before we do the operation. */
377 a = vec_splat (__A, 0);
378 c = _mm_rcp_ps (a);
379 /* Then we merge the lower float result with the original upper
380 * float elements from __A. */
381 return (vec_sel (__A, c, mask));
382}
383
384extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385_mm_rsqrt_ss (__m128 __A)
386{
387 __m128 a, c;
388 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
389 /* PowerISA VSX does not allow partial (for just the lower float)
390 * results. So to ensure we don't generate spurious exceptions
391 * (from the upper float values) we splat the lower float
392 * before we do the operation. */
393 a = vec_splat (__A, 0);
394 c = vec_rsqrte (a);
395 /* Then we merge the lower float result with the original upper
396 * float elements from __A. */
397 return (vec_sel (__A, c, mask));
398}
399
400extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
401_mm_min_ss (__m128 __A, __m128 __B)
402{
403 __v4sf a, b, c;
404 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
405 /* PowerISA VSX does not allow partial (for just the lower float)
406 * results. So to ensure we don't generate spurious exceptions
407 * (from the upper float values) we splat the lower float
408 * before we do the operation. */
409 a = vec_splat ((__v4sf)__A, 0);
410 b = vec_splat ((__v4sf)__B, 0);
411 c = vec_min (a, b);
412 /* Then we merge the lower float result with the original upper
413 * float elements from __A. */
414 return (vec_sel ((__v4sf)__A, c, mask));
415}
416
417extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
418_mm_max_ss (__m128 __A, __m128 __B)
419{
420 __v4sf a, b, c;
421 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
422 /* PowerISA VSX does not allow partial (for just the lower float)
423 * results. So to ensure we don't generate spurious exceptions
424 * (from the upper float values) we splat the lower float
425 * before we do the operation. */
426 a = vec_splat (__A, 0);
427 b = vec_splat (__B, 0);
428 c = vec_max (a, b);
429 /* Then we merge the lower float result with the original upper
430 * float elements from __A. */
431 return (vec_sel ((__v4sf)__A, c, mask));
432}
433
434extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
435_mm_min_ps (__m128 __A, __m128 __B)
436{
437 __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
438 return vec_sel (__B, __A, m);
439}
440
441extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
442_mm_max_ps (__m128 __A, __m128 __B)
443{
444 __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
445 return vec_sel (__B, __A, m);
446}
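/* Note: using vec_cmpgt + vec_sel here (rather than vec_min/vec_max)
   follows the SSE convention that when either input is a NaN the second
   operand (__B) is returned. */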
447
448/* Perform logical bit-wise operations on 128-bit values. */
449extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
450_mm_and_ps (__m128 __A, __m128 __B)
451{
452 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
453// return __builtin_ia32_andps (__A, __B);
454}
455
456extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
457_mm_andnot_ps (__m128 __A, __m128 __B)
458{
459 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
460}
461
462extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463_mm_or_ps (__m128 __A, __m128 __B)
464{
465 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
466}
467
468extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
469_mm_xor_ps (__m128 __A, __m128 __B)
470{
471 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
472}
473
474/* Perform a comparison on the four SPFP values of A and B. For each
475 element, if the comparison is true, place a mask of all ones in the
476 result, otherwise a mask of zeros. */
477extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
478_mm_cmpeq_ps (__m128 __A, __m128 __B)
479{
480 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
481}
482
483extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
484_mm_cmplt_ps (__m128 __A, __m128 __B)
485{
486 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
487}
488
489extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
490_mm_cmple_ps (__m128 __A, __m128 __B)
491{
492 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
493}
494
495extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
496_mm_cmpgt_ps (__m128 __A, __m128 __B)
497{
498 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
499}
500
501extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
502_mm_cmpge_ps (__m128 __A, __m128 __B)
503{
504 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
505}
506
507extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
508_mm_cmpneq_ps (__m128 __A, __m128 __B)
509{
510 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
511 return ((__m128)vec_nor (temp, temp));
512}
513
514extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
515_mm_cmpnlt_ps (__m128 __A, __m128 __B)
516{
517 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
518}
519
520extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
521_mm_cmpnle_ps (__m128 __A, __m128 __B)
522{
523 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
524}
525
526extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527_mm_cmpngt_ps (__m128 __A, __m128 __B)
528{
529 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
530}
531
532extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
533_mm_cmpnge_ps (__m128 __A, __m128 __B)
534{
535 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
536}
537
538extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
539_mm_cmpord_ps (__m128 __A, __m128 __B)
540{
541 __vector unsigned int a, b;
542 __vector unsigned int c, d;
543 static const __vector unsigned int float_exp_mask =
544 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
545
546 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
547 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
548 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
549 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
550 return ((__m128 ) vec_and (c, d));
551}
552
553extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
554_mm_cmpunord_ps (__m128 __A, __m128 __B)
555{
556 __vector unsigned int a, b;
557 __vector unsigned int c, d;
558 static const __vector unsigned int float_exp_mask =
559 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
560
561 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
562 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
563 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
564 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
565 return ((__m128 ) vec_or (c, d));
566}
567
568/* Perform a comparison on the lower SPFP values of A and B. If the
569 comparison is true, place a mask of all ones in the result, otherwise a
570 mask of zeros. The upper three SPFP values are passed through from A. */
571extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
572_mm_cmpeq_ss (__m128 __A, __m128 __B)
573{
574 static const __vector unsigned int mask =
575 { 0xffffffff, 0, 0, 0 };
576 __v4sf a, b, c;
577 /* PowerISA VMX does not allow partial (for just element 0)
578 * results. So to ensure we don't generate spurious exceptions
579 * (from the upper elements) we splat the lower float
580 * before we do the operation. */
581 a = vec_splat ((__v4sf) __A, 0);
582 b = vec_splat ((__v4sf) __B, 0);
583 c = (__v4sf) vec_cmpeq(a, b);
584 /* Then we merge the lower float result with the original upper
585 * float elements from __A. */
586 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
587}
588
589extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
590_mm_cmplt_ss (__m128 __A, __m128 __B)
591{
592 static const __vector unsigned int mask =
593 { 0xffffffff, 0, 0, 0 };
594 __v4sf a, b, c;
595 /* PowerISA VMX does not allow partial (for just element 0)
596 * results. So to ensure we don't generate spurious exceptions
597 * (from the upper elements) we splat the lower float
598 * before we do the operation. */
599 a = vec_splat ((__v4sf) __A, 0);
600 b = vec_splat ((__v4sf) __B, 0);
601 c = (__v4sf) vec_cmplt(a, b);
602 /* Then we merge the lower float result with the original upper
603 * float elements from __A. */
604 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
605}
606
607extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608_mm_cmple_ss (__m128 __A, __m128 __B)
609{
610 static const __vector unsigned int mask =
611 { 0xffffffff, 0, 0, 0 };
612 __v4sf a, b, c;
613 /* PowerISA VMX does not allow partial (for just element 0)
614 * results. So to ensure we don't generate spurious exceptions
615 * (from the upper elements) we splat the lower float
616 * before we do the operation. */
617 a = vec_splat ((__v4sf) __A, 0);
618 b = vec_splat ((__v4sf) __B, 0);
619 c = (__v4sf) vec_cmple(a, b);
620 /* Then we merge the lower float result with the original upper
621 * float elements from __A. */
622 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
623}
624
625extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
626_mm_cmpgt_ss (__m128 __A, __m128 __B)
627{
628 static const __vector unsigned int mask =
629 { 0xffffffff, 0, 0, 0 };
630 __v4sf a, b, c;
631 /* PowerISA VMX does not allow partial (for just element 0)
632 * results. So to ensure we don't generate spurious exceptions
633 * (from the upper elements) we splat the lower float
634 * before we do the operation. */
635 a = vec_splat ((__v4sf) __A, 0);
636 b = vec_splat ((__v4sf) __B, 0);
637 c = (__v4sf) vec_cmpgt(a, b);
638 /* Then we merge the lower float result with the original upper
639 * float elements from __A. */
640 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
641}
642
643extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
644_mm_cmpge_ss (__m128 __A, __m128 __B)
645{
646 static const __vector unsigned int mask =
647 { 0xffffffff, 0, 0, 0 };
648 __v4sf a, b, c;
649 /* PowerISA VMX does not allow partial (for just element 0)
650 * results. So to ensure we don't generate spurious exceptions
651 * (from the upper elements) we splat the lower float
652 * before we do the operation. */
653 a = vec_splat ((__v4sf) __A, 0);
654 b = vec_splat ((__v4sf) __B, 0);
655 c = (__v4sf) vec_cmpge(a, b);
656 /* Then we merge the lower float result with the original upper
657 * float elements from __A. */
658 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
659}
660
661extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662_mm_cmpneq_ss (__m128 __A, __m128 __B)
663{
664 static const __vector unsigned int mask =
665 { 0xffffffff, 0, 0, 0 };
666 __v4sf a, b, c;
667 /* PowerISA VMX does not allow partial (for just element 0)
668 * results. So to ensure we don't generate spurious exceptions
669 * (from the upper elements) we splat the lower float
670 * before we do the operation. */
671 a = vec_splat ((__v4sf) __A, 0);
672 b = vec_splat ((__v4sf) __B, 0);
673 c = (__v4sf) vec_cmpeq(a, b);
674 c = vec_nor (c, c);
675 /* Then we merge the lower float result with the original upper
676 * float elements from __A. */
677 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
678}
679
680extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
681_mm_cmpnlt_ss (__m128 __A, __m128 __B)
682{
683 static const __vector unsigned int mask =
684 { 0xffffffff, 0, 0, 0 };
685 __v4sf a, b, c;
686 /* PowerISA VMX does not allow partial (for just element 0)
687 * results. So to ensure we don't generate spurious exceptions
688 * (from the upper elements) we splat the lower float
689 * before we do the operation. */
690 a = vec_splat ((__v4sf) __A, 0);
691 b = vec_splat ((__v4sf) __B, 0);
692 c = (__v4sf) vec_cmpge(a, b);
693 /* Then we merge the lower float result with the original upper
694 * float elements from __A. */
695 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
696}
697
698extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
699_mm_cmpnle_ss (__m128 __A, __m128 __B)
700{
701 static const __vector unsigned int mask =
702 { 0xffffffff, 0, 0, 0 };
703 __v4sf a, b, c;
704 /* PowerISA VMX does not allow partial (for just element 0)
705 * results. So to ensure we don't generate spurious exceptions
706 * (from the upper elements) we splat the lower float
707 * before we do the operation. */
708 a = vec_splat ((__v4sf) __A, 0);
709 b = vec_splat ((__v4sf) __B, 0);
710 c = (__v4sf) vec_cmpgt(a, b);
711 /* Then we merge the lower float result with the original upper
712 * float elements from __A. */
713 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
714}
715
716extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
717_mm_cmpngt_ss (__m128 __A, __m128 __B)
718{
719 static const __vector unsigned int mask =
720 { 0xffffffff, 0, 0, 0 };
721 __v4sf a, b, c;
722 /* PowerISA VMX does not allow partial (for just element 0)
723 * results. So to ensure we don't generate spurious exceptions
724 * (from the upper elements) we splat the lower float
725 * before we do the operation. */
726 a = vec_splat ((__v4sf) __A, 0);
727 b = vec_splat ((__v4sf) __B, 0);
728 c = (__v4sf) vec_cmple(a, b);
729 /* Then we merge the lower float result with the original upper
730 * float elements from __A. */
731 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
732}
733
734extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
735_mm_cmpnge_ss (__m128 __A, __m128 __B)
736{
737 static const __vector unsigned int mask =
738 { 0xffffffff, 0, 0, 0 };
739 __v4sf a, b, c;
740 /* PowerISA VMX does not allow partial (for just element 0)
741 * results. So to ensure we don't generate spurious exceptions
742 * (from the upper elements) we splat the lower float
743 * before we do the operation. */
744 a = vec_splat ((__v4sf) __A, 0);
745 b = vec_splat ((__v4sf) __B, 0);
746 c = (__v4sf) vec_cmplt(a, b);
747 /* Then we merge the lower float result with the original upper
748 * float elements from __A. */
749 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
750}
751
752extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
753_mm_cmpord_ss (__m128 __A, __m128 __B)
754{
755 __vector unsigned int a, b;
756 __vector unsigned int c, d;
757 static const __vector unsigned int float_exp_mask =
758 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
759 static const __vector unsigned int mask =
760 { 0xffffffff, 0, 0, 0 };
761
762 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
763 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
764 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
765 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
766 c = vec_and (c, d);
767 /* Then we merge the lower float result with the original upper
768 * float elements from __A. */
769 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
770}
771
772extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773_mm_cmpunord_ss (__m128 __A, __m128 __B)
774{
775 __vector unsigned int a, b;
776 __vector unsigned int c, d;
777 static const __vector unsigned int float_exp_mask =
778 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
779 static const __vector unsigned int mask =
780 { 0xffffffff, 0, 0, 0 };
781
782 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
783 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
784 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
785 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
786 c = vec_or (c, d);
787 /* Then we merge the lower float result with the original upper
788 * float elements from __A. */
789 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
790}
791
792/* Compare the lower SPFP values of A and B and return 1 if true
793 and 0 if false. */
794extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
795_mm_comieq_ss (__m128 __A, __m128 __B)
796{
797 return (__A[0] == __B[0]);
798}
799
800extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
801_mm_comilt_ss (__m128 __A, __m128 __B)
802{
803 return (__A[0] < __B[0]);
804}
805
806extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
807_mm_comile_ss (__m128 __A, __m128 __B)
808{
809 return (__A[0] <= __B[0]);
810}
811
812extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813_mm_comigt_ss (__m128 __A, __m128 __B)
814{
815 return (__A[0] > __B[0]);
816}
817
818extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
819_mm_comige_ss (__m128 __A, __m128 __B)
820{
821 return (__A[0] >= __B[0]);
822}
823
824extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825_mm_comineq_ss (__m128 __A, __m128 __B)
826{
827 return (__A[0] != __B[0]);
828}
829
830/* FIXME
831 * The _mm_ucomi??_ss implementations below are exactly the same as
832 * _mm_comi??_ss because GCC for PowerPC only generates unordered
833 * compares (scalar and vector).
834 * Technically _mm_comieq_ss et al. should be using the ordered
835 * compare and signal for QNaNs.
836 * The _mm_ucomieq_ss et al. should be OK as is.
837 */
838extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
839_mm_ucomieq_ss (__m128 __A, __m128 __B)
840{
841 return (__A[0] == __B[0]);
842}
843
844extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
845_mm_ucomilt_ss (__m128 __A, __m128 __B)
846{
847 return (__A[0] < __B[0]);
848}
849
850extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
851_mm_ucomile_ss (__m128 __A, __m128 __B)
852{
853 return (__A[0] <= __B[0]);
854}
855
856extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
857_mm_ucomigt_ss (__m128 __A, __m128 __B)
858{
859 return (__A[0] > __B[0]);
860}
861
862extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
863_mm_ucomige_ss (__m128 __A, __m128 __B)
864{
865 return (__A[0] >= __B[0]);
866}
867
868extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
869_mm_ucomineq_ss (__m128 __A, __m128 __B)
870{
871 return (__A[0] != __B[0]);
872}
873
874extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
875_mm_cvtss_f32 (__m128 __A)
876{
877 return ((__v4sf)__A)[0];
878}
879
880/* Convert the lower SPFP value to a 32-bit integer according to the current
881 rounding mode. */
882extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
883_mm_cvtss_si32 (__m128 __A)
884{
885 __m64 res = 0;
886#ifdef _ARCH_PWR8
887 double dtmp;
888 __asm__(
889#ifdef __LITTLE_ENDIAN__
890 "xxsldwi %x0,%x0,%x0,3;\n"
891#endif
892 "xscvspdp %x2,%x0;\n"
893 "fctiw %2,%2;\n"
894 "mfvsrd %1,%x2;\n"
895 : "+wa" (__A),
896 "=r" (res),
897 "=f" (dtmp)
898 : );
899#else
900 res = __builtin_rint(__A[0]);
901#endif
902 return (res);
903}
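/* For example, under the default round-to-nearest-even mode
   _mm_cvtss_si32 (_mm_set_ss (1.5f)) returns 2, while the truncating
   _mm_cvttss_si32 further below returns 1 for the same input. */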
904
905extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906_mm_cvt_ss2si (__m128 __A)
907{
908 return _mm_cvtss_si32 (__A);
909}
910
911/* Convert the lower SPFP value to a 32-bit integer according to the
912 current rounding mode. */
913
914/* Intel intrinsic. */
915extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916_mm_cvtss_si64 (__m128 __A)
917{
918 __m64 res = 0;
919#ifdef _ARCH_PWR8
920 double dtmp;
921 __asm__(
922#ifdef __LITTLE_ENDIAN__
923 "xxsldwi %x0,%x0,%x0,3;\n"
924#endif
925 "xscvspdp %x2,%x0;\n"
926 "fctid %2,%2;\n"
927 "mfvsrd %1,%x2;\n"
928 : "+wa" (__A),
929 "=r" (res),
930 "=f" (dtmp)
931 : );
932#else
933 res = __builtin_llrint(__A[0]);
934#endif
935 return (res);
936}
937
938/* Microsoft intrinsic. */
939extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
940_mm_cvtss_si64x (__m128 __A)
941{
942 return _mm_cvtss_si64 ((__v4sf) __A);
943}
944
945/* Constants for use with _mm_prefetch. */
946enum _mm_hint
947{
948 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
949 _MM_HINT_ET0 = 7,
950 _MM_HINT_ET1 = 6,
951 _MM_HINT_T0 = 3,
952 _MM_HINT_T1 = 2,
953 _MM_HINT_T2 = 1,
954 _MM_HINT_NTA = 0
955};
956
957/* Loads one cache line from address P to a location "closer" to the
958 processor. The selector I specifies the type of prefetch operation. */
959extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
960_mm_prefetch (const void *__P, enum _mm_hint __I)
961{
962 /* Current PowerPC hardware ignores the hint parameter. */
963 __builtin_prefetch (__P);
964}
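/* Usage sketch (illustrative): _mm_prefetch ((const char *) p + 128,
   _MM_HINT_T0); since the hint is ignored here, this is equivalent to a
   plain __builtin_prefetch of that address. */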
965
966/* Convert the two lower SPFP values to 32-bit integers according to the
967 current rounding mode. Return the integers in packed form. */
968extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969_mm_cvtps_pi32 (__m128 __A)
970{
971 /* Temporaries for the rounded values and the converted result. */
972 __v4sf temp, rounded;
973 __vector unsigned long long result;
974
975 /* Splat two lower SPFP values to both halves. */
976 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
977 rounded = vec_rint(temp);
978 result = (__vector unsigned long long) vec_cts (rounded, 0);
979
980 return (__m64) ((__vector long long) result)[0];
981}
982
983extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
984_mm_cvt_ps2pi (__m128 __A)
985{
986 return _mm_cvtps_pi32 (__A);
987}
988
989/* Truncate the lower SPFP value to a 32-bit integer. */
990extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
991_mm_cvttss_si32 (__m128 __A)
992{
993 /* Extract the lower float element. */
994 float temp = __A[0];
995 /* truncate to 32-bit integer and return. */
996 return temp;
997}
998
999extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1000_mm_cvtt_ss2si (__m128 __A)
1001{
1002 return _mm_cvttss_si32 (__A);
1003}
1004
1005/* Intel intrinsic. */
1006extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007_mm_cvttss_si64 (__m128 __A)
1008{
1009 /* Extract the lower float element. */
1010 float temp = __A[0];
1011 /* truncate to 64-bit integer and return. */
1012 return temp;
1013}
1014
1015/* Microsoft intrinsic. */
1016extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017_mm_cvttss_si64x (__m128 __A)
1018{
1019 /* Extract the lower float element. */
1020 float temp = __A[0];
1021 /* truncate to 64-bit integer and return. */
1022 return temp;
1023}
1024
1025/* Truncate the two lower SPFP values to 32-bit integers. Return the
1026 integers in packed form. */
1027extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1028_mm_cvttps_pi32 (__m128 __A)
1029{
1030 __v4sf temp;
1031 __vector unsigned long long result;
1032
1033 /* Splat two lower SPFP values to both halves. */
1034 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1035 result = (__vector unsigned long long) vec_cts (temp, 0);
1036
1037 return (__m64) ((__vector long long) result)[0];
1038}
1039
1040extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1041_mm_cvtt_ps2pi (__m128 __A)
1042{
1043 return _mm_cvttps_pi32 (__A);
1044}
1045
1046/* Convert B to a SPFP value and insert it as element zero in A. */
1047extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1048_mm_cvtsi32_ss (__m128 __A, int __B)
1049{
1050 float temp = __B;
1051 __A[0] = temp;
1052
1053 return __A;
1054}
1055
1056extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1057_mm_cvt_si2ss (__m128 __A, int __B)
1058{
1059 return _mm_cvtsi32_ss (__A, __B);
1060}
1061
1062/* Convert B to a SPFP value and insert it as element zero in A. */
1063/* Intel intrinsic. */
1064extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065_mm_cvtsi64_ss (__m128 __A, long long __B)
1066{
1067 float temp = __B;
1068 __A[0] = temp;
1069
1070 return __A;
1071}
1072
1073/* Microsoft intrinsic. */
1074extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1075_mm_cvtsi64x_ss (__m128 __A, long long __B)
1076{
1077 return _mm_cvtsi64_ss (__A, __B);
1078}
1079
1080/* Convert the two 32-bit values in B to SPFP form and insert them
1081 as the two lower elements in A. */
1082extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1083_mm_cvtpi32_ps (__m128 __A, __m64 __B)
1084{
1085 __vector signed int vm1;
1086 __vector float vf1;
1087
1088 vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1089 vf1 = (__vector float) vec_ctf (vm1, 0);
1090
1091 return ((__m128) (__vector unsigned long long)
1092 { ((__vector unsigned long long)vf1) [0],
1093 ((__vector unsigned long long)__A) [1]});
1094}
1095
1096extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1097_mm_cvt_pi2ps (__m128 __A, __m64 __B)
1098{
1099 return _mm_cvtpi32_ps (__A, __B);
1100}
1101
1102/* Convert the four signed 16-bit values in A to SPFP form. */
1103extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104_mm_cvtpi16_ps (__m64 __A)
1105{
1106 __vector signed short vs8;
1107 __vector signed int vi4;
1108 __vector float vf1;
1109
1110 vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1111 vi4 = vec_vupklsh (vs8);
1112 vf1 = (__vector float) vec_ctf (vi4, 0);
1113
1114 return (__m128) vf1;
1115}
1116
1117/* Convert the four unsigned 16-bit values in A to SPFP form. */
1118extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1119_mm_cvtpu16_ps (__m64 __A)
1120{
1121 const __vector unsigned short zero =
1122 { 0, 0, 0, 0, 0, 0, 0, 0 };
1123 __vector unsigned short vs8;
1124 __vector unsigned int vi4;
1125 __vector float vf1;
1126
1127 vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1128 vi4 = (__vector unsigned int) vec_mergel
1129#ifdef __LITTLE_ENDIAN__
1130 (vs8, zero);
1131#else
1132 (zero, vs8);
1133#endif
1134 vf1 = (__vector float) vec_ctf (vi4, 0);
1135
1136 return (__m128) vf1;
1137}
1138
1139/* Convert the low four signed 8-bit values in A to SPFP form. */
1140extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1141_mm_cvtpi8_ps (__m64 __A)
1142{
1143 __vector signed char vc16;
1144 __vector signed short vs8;
1145 __vector signed int vi4;
1146 __vector float vf1;
1147
1148 vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1149 vs8 = vec_vupkhsb (vc16);
1150 vi4 = vec_vupkhsh (vs8);
1151 vf1 = (__vector float) vec_ctf (vi4, 0);
1152
1153 return (__m128) vf1;
1154}
1155
1156/* Convert the low four unsigned 8-bit values in A to SPFP form. */
1157extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1158
1159_mm_cvtpu8_ps (__m64 __A)
1160{
1161 const __vector unsigned char zero =
1162 { 0, 0, 0, 0, 0, 0, 0, 0 };
1163 __vector unsigned char vc16;
1164 __vector unsigned short vs8;
1165 __vector unsigned int vi4;
1166 __vector float vf1;
1167
1168 vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1169#ifdef __LITTLE_ENDIAN__
1170 vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
1171 vi4 = (__vector unsigned int) vec_mergeh (vs8,
1172 (__vector unsigned short) zero);
1173#else
1174 vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
1175 vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
1176 vs8);
1177#endif
1178 vf1 = (__vector float) vec_ctf (vi4, 0);
1179
1180 return (__m128) vf1;
1181}
1182
1183/* Convert the four signed 32-bit values in A and B to SPFP form. */
1184extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1185_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1186{
1187 __vector signed int vi4;
1188 __vector float vf4;
1189
1190 vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
1191 vf4 = (__vector float) vec_ctf (vi4, 0);
1192 return (__m128) vf4;
1193}
1194
1195/* Convert the four SPFP values in A to four signed 16-bit integers. */
1196extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1197_mm_cvtps_pi16 (__m128 __A)
1198{
1199 __v4sf rounded;
1200 __vector signed int temp;
1201 __vector unsigned long long result;
1202
1203 rounded = vec_rint(__A);
1204 temp = vec_cts (rounded, 0);
1205 result = (__vector unsigned long long) vec_pack (temp, temp);
1206
1207 return (__m64) ((__vector long long) result)[0];
1208}
1209
1210/* Convert the four SPFP values in A to four signed 8-bit integers. */
1211extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212_mm_cvtps_pi8 (__m128 __A)
1213{
1214 __v4sf rounded;
1215 __vector signed int tmp_i;
1216 static const __vector signed int zero = {0, 0, 0, 0};
1217 __vector signed short tmp_s;
1218 __vector signed char res_v;
1219
1220 rounded = vec_rint(__A);
1221 tmp_i = vec_cts (rounded, 0);
1222 tmp_s = vec_pack (tmp_i, zero);
1223 res_v = vec_pack (tmp_s, tmp_s);
1224 return (__m64) ((__vector long long) res_v)[0];
1225}
1226
1227/* Selects four specific SPFP values from A and B based on MASK. */
1228extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1229
1230_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1231{
1232 unsigned long element_selector_10 = __mask & 0x03;
1233 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1234 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1235 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1236 static const unsigned int permute_selectors[4] =
1237 {
1238#ifdef __LITTLE_ENDIAN__
1239 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1240#else
1241 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1242#endif
1243 };
1244 __vector unsigned int t;
1245
1246 t[0] = permute_selectors[element_selector_10];
1247 t[1] = permute_selectors[element_selector_32];
1248 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1249 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1250 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1251}
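/* For example, _mm_shuffle_ps (A, B, _MM_SHUFFLE (1, 0, 3, 2)) yields
   { A[2], A[3], B[0], B[1] }: the two low selector fields index into __A
   and the two high selector fields index into __B. */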
1252
1253/* Selects and interleaves the upper two SPFP values from A and B. */
1254extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255_mm_unpackhi_ps (__m128 __A, __m128 __B)
1256{
1257 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1258}
1259
1260/* Selects and interleaves the lower two SPFP values from A and B. */
1261extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262_mm_unpacklo_ps (__m128 __A, __m128 __B)
1263{
1264 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1265}
1266
1267/* Sets the upper two SPFP values with 64-bits of data loaded from P;
1268 the lower two values are passed through from A. */
1269extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1270_mm_loadh_pi (__m128 __A, __m64 const *__P)
1271{
1272 __vector unsigned long long __a = (__vector unsigned long long)__A;
1273 __vector unsigned long long __p = vec_splats(*__P);
1274 __a [1] = __p [1];
1275
1276 return (__m128)__a;
1277}
1278
1279/* Stores the upper two SPFP values of A into P. */
1280extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281_mm_storeh_pi (__m64 *__P, __m128 __A)
1282{
1283 __vector unsigned long long __a = (__vector unsigned long long) __A;
1284
1285 *__P = __a[1];
1286}
1287
1288/* Moves the upper two values of B into the lower two values of A. */
1289extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290_mm_movehl_ps (__m128 __A, __m128 __B)
1291{
1292 return (__m128) vec_mergel ((__vector unsigned long long)__B,
1293 (__vector unsigned long long)__A);
1294}
1295
1296/* Moves the lower two values of B into the upper two values of A. */
1297extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298_mm_movelh_ps (__m128 __A, __m128 __B)
1299{
1300 return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1301 (__vector unsigned long long)__B);
1302}
1303
1304/* Sets the lower two SPFP values with 64-bits of data loaded from P;
1305 the upper two values are passed through from A. */
1306extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307_mm_loadl_pi (__m128 __A, __m64 const *__P)
1308{
1309 __vector unsigned long long __a = (__vector unsigned long long)__A;
1310 __vector unsigned long long __p = vec_splats(*__P);
1311 __a [0] = __p [0];
1312
1313 return (__m128)__a;
1314}
1315
1316/* Stores the lower two SPFP values of A into P. */
1317extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318_mm_storel_pi (__m64 *__P, __m128 __A)
1319{
1320 __vector unsigned long long __a = (__vector unsigned long long) __A;
1321
1322 *__P = __a[0];
1323}
1324
1325#ifdef _ARCH_PWR8
1326/* Intrinsic functions that require PowerISA 2.07 minimum. */
1327
1328/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1329extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1330_mm_movemask_ps (__m128 __A)
1331{
1332 __vector unsigned long long result;
1333 static const __vector unsigned int perm_mask =
1334 {
1335#ifdef __LITTLE_ENDIAN__
1336 0x00204060, 0x80808080, 0x80808080, 0x80808080
1337#else
1338 0x80808080, 0x80808080, 0x80808080, 0x00204060
1339#endif
1340 };
1341
1342 result = ((__vector unsigned long long)
1343 vec_vbpermq ((__vector unsigned char) __A,
1344 (__vector unsigned char) perm_mask));
1345
1346#ifdef __LITTLE_ENDIAN__
1347 return result[1];
1348#else
1349 return result[0];
1350#endif
1351}
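/* For example, _mm_movemask_ps of { -1.0f, 2.0f, -3.0f, 4.0f } returns
   0x5; bit i of the result is the sign bit of element i. */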
1352#endif /* _ARCH_PWR8 */
1353
1354/* Create a vector with all four elements equal to *P. */
1355extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1356_mm_load1_ps (float const *__P)
1357{
1358 return _mm_set1_ps (*__P);
1359}
1360
1361extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1362_mm_load_ps1 (float const *__P)
1363{
1364 return _mm_load1_ps (__P);
1365}
1366
1367/* Extracts one of the four words of A. The selector N must be immediate. */
1368extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369_mm_extract_pi16 (__m64 const __A, int const __N)
1370{
1371 unsigned int shiftr = __N & 3;
1372#ifdef __BIG_ENDIAN__
1373 shiftr = 3 - shiftr;
1374#endif
1375
1376 return ((__A >> (shiftr * 16)) & 0xffff);
1377}
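/* For example, if the four 16-bit elements of __A are {10, 20, 30, 40},
   _mm_extract_pi16 (__A, 2) returns 30 (zero-extended to int). */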
1378
1379extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1380_m_pextrw (__m64 const __A, int const __N)
1381{
1382 return _mm_extract_pi16 (__A, __N);
1383}
1384
1385/* Inserts word D into one of four words of A. The selector N must be
1386 immediate. */
1387extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1389{
1390 const int shiftl = (__N & 3) * 16;
1391 const __m64 shiftD = (const __m64) __D << shiftl;
1392 const __m64 mask = 0xffffUL << shiftl;
1393 __m64 result = (__A & (~mask)) | (shiftD & mask);
1394
1395 return (result);
1396}
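/* For example, _mm_insert_pi16 (__A, 99, 1) replaces 16-bit element 1 of
   __A with 99 and leaves the other three elements unchanged. */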
1397
1398extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1399_m_pinsrw (__m64 const __A, int const __D, int const __N)
1400{
1401 return _mm_insert_pi16 (__A, __D, __N);
1402}
1403
1404/* Compute the element-wise maximum of signed 16-bit values. */
1405extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1406
1407_mm_max_pi16 (__m64 __A, __m64 __B)
1408{
1409#if _ARCH_PWR8
1410 __vector signed short a, b, r;
1411 __vector __bool short c;
1412
1413 a = (__vector signed short)vec_splats (__A);
1414 b = (__vector signed short)vec_splats (__B);
1415 c = (__vector __bool short)vec_cmpgt (a, b);
1416 r = vec_sel (b, a, c);
1417 return (__m64) ((__vector long long) r)[0];
1418#else
1419 __m64_union m1, m2, res;
1420
1421 m1.as_m64 = __A;
1422 m2.as_m64 = __B;
1423
1424 res.as_short[0] =
1425 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1426 res.as_short[1] =
1427 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1428 res.as_short[2] =
1429 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1430 res.as_short[3] =
1431 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1432
1433 return (__m64) res.as_m64;
1434#endif
1435}
1436
1437extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1438_m_pmaxsw (__m64 __A, __m64 __B)
1439{
1440 return _mm_max_pi16 (__A, __B);
1441}
1442
1443/* Compute the element-wise maximum of unsigned 8-bit values. */
1444extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1445_mm_max_pu8 (__m64 __A, __m64 __B)
1446{
1447#if _ARCH_PWR8
1448 __vector unsigned char a, b, r;
1449 __vector __bool char c;
1450
1451 a = (__vector unsigned char)vec_splats (__A);
1452 b = (__vector unsigned char)vec_splats (__B);
1453 c = (__vector __bool char)vec_cmpgt (a, b);
1454 r = vec_sel (b, a, c);
1455 return (__m64) ((__vector long long) r)[0];
1456#else
1457 __m64_union m1, m2, res;
1458 long i;
1459
1460 m1.as_m64 = __A;
1461 m2.as_m64 = __B;
1462
1463
1464 for (i = 0; i < 8; i++)
1465 res.as_char[i] =
1466 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1467 m1.as_char[i] : m2.as_char[i];
1468
1469 return (__m64) res.as_m64;
1470#endif
1471}
1472
1473extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1474_m_pmaxub (__m64 __A, __m64 __B)
1475{
1476 return _mm_max_pu8 (__A, __B);
1477}
1478
1479/* Compute the element-wise minimum of signed 16-bit values. */
1480extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1481_mm_min_pi16 (__m64 __A, __m64 __B)
1482{
1483#if _ARCH_PWR8
1484 __vector signed short a, b, r;
1485 __vector __bool short c;
1486
1487 a = (__vector signed short)vec_splats (__A);
1488 b = (__vector signed short)vec_splats (__B);
1489 c = (__vector __bool short)vec_cmplt (a, b);
1490 r = vec_sel (b, a, c);
1491 return (__m64) ((__vector long long) r)[0];
1492#else
1493 __m64_union m1, m2, res;
1494
1495 m1.as_m64 = __A;
1496 m2.as_m64 = __B;
1497
1498 res.as_short[0] =
1499 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1500 res.as_short[1] =
1501 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1502 res.as_short[2] =
1503 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1504 res.as_short[3] =
1505 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1506
1507 return (__m64) res.as_m64;
1508#endif
1509}
1510
1511extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1512_m_pminsw (__m64 __A, __m64 __B)
1513{
1514 return _mm_min_pi16 (__A, __B);
1515}
1516
1517/* Compute the element-wise minimum of unsigned 8-bit values. */
1518extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1519_mm_min_pu8 (__m64 __A, __m64 __B)
1520{
1521#if _ARCH_PWR8
1522 __vector unsigned char a, b, r;
1523 __vector __bool char c;
1524
1525 a = (__vector unsigned char)vec_splats (__A);
1526 b = (__vector unsigned char)vec_splats (__B);
1527 c = (__vector __bool char)vec_cmplt (a, b);
1528 r = vec_sel (b, a, c);
1529 return (__m64) ((__vector long long) r)[0];
1530#else
1531 __m64_union m1, m2, res;
1532 long i;
1533
1534 m1.as_m64 = __A;
1535 m2.as_m64 = __B;
1536
1537
1538 for (i = 0; i < 8; i++)
1539 res.as_char[i] =
1540 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1541 m1.as_char[i] : m2.as_char[i];
1542
1543 return (__m64) res.as_m64;
1544#endif
1545}
1546
1547extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548_m_pminub (__m64 __A, __m64 __B)
1549{
1550 return _mm_min_pu8 (__A, __B);
1551}
1552
1553/* Create an 8-bit mask of the signs of 8-bit values. */
1554extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1555_mm_movemask_pi8 (__m64 __A)
1556{
1557 unsigned long long p =
1558#ifdef __LITTLE_ENDIAN__
1559 0x0008101820283038UL; // permute control for sign bits
1560#else
1561 0x3830282018100800UL; // permute control for sign bits
1562#endif
1563 return __builtin_bpermd (p, __A);
1564}
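/* For example, for an __m64 whose byte elements 0..7 are
   { 0x80, 0x01, 0xFF, 0x00, 0x7F, 0x80, 0x00, 0x00 } the result is 0x25;
   bit i of the result is the most significant bit of byte element i. */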
1565
1566extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1567_m_pmovmskb (__m64 __A)
1568{
1569 return _mm_movemask_pi8 (__A);
1570}
1571
1572/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1573 in B and produce the high 16 bits of the 32-bit results. */
1574extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1575_mm_mulhi_pu16 (__m64 __A, __m64 __B)
1576{
1577 __vector unsigned short a, b;
1578 __vector unsigned short c;
1579 __vector unsigned int w0, w1;
1580 __vector unsigned char xform1 = {
1581#ifdef __LITTLE_ENDIAN__
1582 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1583 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1584#else
1585 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1586 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1587#endif
1588 };
1589
1590 a = (__vector unsigned short)vec_splats (__A);
1591 b = (__vector unsigned short)vec_splats (__B);
1592
1593 w0 = vec_vmuleuh (a, b);
1594 w1 = vec_vmulouh (a, b);
1595 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1596
1597 return (__m64) ((__vector long long) c)[0];
1598}
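/* For example, for 16-bit elements 0xFFFF and 0x0002 the full product is
   0x0001FFFE, so _mm_mulhi_pu16 produces 0x0001 in that result element. */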
1599
1600extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1601_m_pmulhuw (__m64 __A, __m64 __B)
1602{
1603 return _mm_mulhi_pu16 (__A, __B);
1604}
1605
1606/* Return a combination of the four 16-bit values in A. The selector
1607 must be an immediate. */
1608extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1609_mm_shuffle_pi16 (__m64 __A, int const __N)
1610{
1611 unsigned long element_selector_10 = __N & 0x03;
1612 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1613 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1614 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1615 static const unsigned short permute_selectors[4] =
1616 {
1617#ifdef __LITTLE_ENDIAN__
1618 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1619#else
1620 0x0607, 0x0405, 0x0203, 0x0001
1621#endif
1622 };
1623 __m64_union t;
1624 __vector unsigned long long a, p, r;
1625
1626#ifdef __LITTLE_ENDIAN__
1627 t.as_short[0] = permute_selectors[element_selector_10];
1628 t.as_short[1] = permute_selectors[element_selector_32];
1629 t.as_short[2] = permute_selectors[element_selector_54];
1630 t.as_short[3] = permute_selectors[element_selector_76];
1631#else
1632 t.as_short[3] = permute_selectors[element_selector_10];
1633 t.as_short[2] = permute_selectors[element_selector_32];
1634 t.as_short[1] = permute_selectors[element_selector_54];
1635 t.as_short[0] = permute_selectors[element_selector_76];
1636#endif
1637 p = vec_splats (t.as_m64);
1638 a = vec_splats (__A);
1639 r = vec_perm (a, a, (__vector unsigned char)p);
1640 return (__m64) ((__vector long long) r)[0];
1641}
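/* Editor's usage sketch, not part of the upstream header: a hypothetical
   helper using the _MM_SHUFFLE macro defined at the top of this file.
   _MM_SHUFFLE(3,2,1,0) is the identity; _MM_SHUFFLE(0,1,2,3) reverses the
   four 16-bit elements.  */
static __inline __m64
example_shuffle_pi16_usage (void)
{
  __m64 a = _mm_set_pi16 (4, 3, 2, 1);  /* elements 0..3 hold 1, 2, 3, 4.  */
  /* Reversed: result elements 0..3 hold 4, 3, 2, 1.  */
  return _mm_shuffle_pi16 (a, _MM_SHUFFLE (0, 1, 2, 3));
}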
1642
1643extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1644_m_pshufw (__m64 __A, int const __N)
1645{
1646 return _mm_shuffle_pi16 (__A, __N);
1647}
1648
1649/* Conditionally store byte elements of A into P. The high bit of each
1650 byte in the selector N determines whether the corresponding byte from
1651 A is stored. */
1652extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1653_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1654{
1655 __m64 hibit = 0x8080808080808080UL;
1656 __m64 mask, tmp;
1657 __m64 *p = (__m64*)__P;
1658
1659 tmp = *p;
1660 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1661 tmp = (tmp & (~mask)) | (__A & mask);
1662 *p = tmp;
1663}
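/* Editor's usage sketch, not part of the upstream header: only elements
   whose mask byte has its high bit set are written back; the other byte
   elements of the destination are preserved.  The buffer argument is
   hypothetical and must cover at least 8 writable bytes.  */
static __inline void
example_maskmove_si64_usage (char *buf8)
{
  __m64 data = _mm_set1_pi8 (0x5A);
  /* High bit set in the four low-order mask bytes only, so just the four
     low-order byte elements of the destination are overwritten.  */
  __m64 mask = _mm_set_pi8 (0, 0, 0, 0, (char) 0x80, (char) 0x80,
                            (char) 0x80, (char) 0x80);
  _mm_maskmove_si64 (data, mask, buf8);
}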
1664
1665extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1666_m_maskmovq (__m64 __A, __m64 __N, char *__P)
1667{
1668 _mm_maskmove_si64 (__A, __N, __P);
1669}
1670
1671/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1672extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1673_mm_avg_pu8 (__m64 __A, __m64 __B)
1674{
1675 __vector unsigned char a, b, c;
1676
1677 a = (__vector unsigned char)vec_splats (__A);
1678 b = (__vector unsigned char)vec_splats (__B);
1679 c = vec_avg (a, b);
1680 return (__m64) ((__vector long long) c)[0];
1681}
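/* Editor's usage sketch, not part of the upstream header: the average is
   computed as (a + b + 1) >> 1 per byte, so ties round up: avg(1, 2) == 2
   and avg(0xFF, 0x01) == 0x80.  The helper is hypothetical.  */
static __inline __m64
example_avg_pu8_usage (void)
{
  __m64 a = _mm_set1_pi8 (1);
  __m64 b = _mm_set1_pi8 (2);
  /* Every result byte is 2.  */
  return _mm_avg_pu8 (a, b);
}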
1682
1683extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684_m_pavgb (__m64 __A, __m64 __B)
1685{
1686 return _mm_avg_pu8 (__A, __B);
1687}
1688
1689/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1690extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1691_mm_avg_pu16 (__m64 __A, __m64 __B)
1692{
1693 __vector unsigned short a, b, c;
1694
1695 a = (__vector unsigned short)vec_splats (__A);
1696 b = (__vector unsigned short)vec_splats (__B);
1697 c = vec_avg (a, b);
1698 return (__m64) ((__vector long long) c)[0];
1699}
1700
1701extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1702_m_pavgw (__m64 __A, __m64 __B)
1703{
1704 return _mm_avg_pu16 (__A, __B);
1705}
1706
1707/* Compute the sum of the absolute differences of the unsigned 8-bit
1708 values in A and B. Return the value in the lower 16-bit word; the
1709 upper words are cleared. */
1710extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1711_mm_sad_pu8 (__m64 __A, __m64 __B)
1712{
1713 __vector unsigned char a, b;
1714 __vector unsigned char vmin, vmax, vabsdiff;
1715 __vector signed int vsum;
1716 const __vector unsigned int zero =
1717 { 0, 0, 0, 0 };
1718 __m64_union result = {0};
1719
1720 a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1721 b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1722 vmin = vec_min (a, b);
1723 vmax = vec_max (a, b);
1724 vabsdiff = vec_sub (vmax, vmin);
1725 /* Sum four groups of bytes into integers. */
1726 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1727 /* Sum across four integers with integer result. */
1728 vsum = vec_sums (vsum, (__vector signed int) zero);
1729 /* The sum is in the rightmost 32 bits of the vector result.
1730 Transfer to a GPR and truncate to 16 bits. */
1731 result.as_short[0] = vsum[3];
1732 return result.as_m64;
1733}
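/* Editor's usage sketch, not part of the upstream header: the sum of the
   eight absolute byte differences ends up in the low 16-bit element; the
   remaining elements are zero.  The helper is hypothetical.  */
static __inline __m64
example_sad_pu8_usage (void)
{
  __m64 a = _mm_set_pi8 (10, 20, 30, 40, 50, 60, 70, 80);
  __m64 b = _mm_set_pi8 (11, 18, 33, 36, 55, 54, 77, 72);
  /* 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 = 36 in the low halfword.  */
  return _mm_sad_pu8 (a, b);
}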
1734
1735extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1736_m_psadbw (__m64 __A, __m64 __B)
1737{
1738 return _mm_sad_pu8 (__A, __B);
1739}
1740
1741/* Stores the data in A to the address P without polluting the caches. */
1742extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1743_mm_stream_pi (__m64 *__P, __m64 __A)
1744{
1745 /* Use the data cache block touch for store transient. */
1746 __asm__ (
1747 " dcbtstt 0,%0"
1748 :
1749 : "b" (__P)
1750 : "memory"
1751 );
1752 *__P = __A;
1753}
1754
1755/* Likewise. The address must be 16-byte aligned. */
1756extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757_mm_stream_ps (float *__P, __m128 __A)
1758{
1759 /* Use the data cache block touch for store transient. */
1760 __asm__ (
1761 " dcbtstt 0,%0"
1762 :
1763 : "b" (__P)
1764 : "memory"
1765 );
1766 _mm_store_ps (__P, __A);
1767}
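/* Editor's usage sketch, not part of the upstream header: the destination
   of _mm_stream_ps must be 16-byte aligned.  The static buffer below is
   hypothetical.  */
static __inline void
example_stream_ps_usage (void)
{
  static float out[4] __attribute__ ((__aligned__ (16)));
  _mm_stream_ps (out, _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f));
}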
1768
1769/* Guarantees that every preceding store is globally visible before
1770 any subsequent store. */
1771extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1772_mm_sfence (void)
1773{
1774 /* Generate a light weight sync. */
1775 __atomic_thread_fence (__ATOMIC_RELEASE);
1776}
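/* Editor's usage sketch, not part of the upstream header: a release fence
   is typically placed between writing a payload and setting a ready flag,
   so that a consumer which observes the flag also observes the payload.
   Both pointers are hypothetical.  */
static __inline void
example_sfence_usage (int *payload, volatile int *ready)
{
  *payload = 42;   /* 1. produce the data.  */
  _mm_sfence ();   /* 2. order the payload store before the flag store.  */
  *ready = 1;      /* 3. publish.  */
}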
1777
1778/* The execution of the next instruction is delayed by an implementation
1779 specific amount of time. The instruction does not modify the
1780 architectural state. This is after the pop_options pragma because
1781 it does not require SSE support in the processor--the encoding is a
1782 nop on processors that do not support it. */
1783extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1784_mm_pause (void)
1785{
1786 /* There is no exact match with this construct, but the following is
1787 close to the desired effect. */
1788#if _ARCH_PWR8
1789 /* On power8 and later processors we can depend on Program Priority
1790 (PRI) and associated "very low" PPI setting. Since we don't know
1791 what PPI this thread is running at we: 1) save the current PRI
1792 from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1793 via the special or 31,31,31 encoding. 3) issue an "isync" to
1794 ensure the PRI change takes effect before we execute any more
1795 instructions.
1796 Now we can execute a lwsync (release barrier) while we execute
1797 this thread at "very low" PRI. Finally we restore the original
1798 PRI and continue execution. */
1799 unsigned long __PPR;
1800
1801 __asm__ volatile (
1802 " mfppr %0;"
1803 " or 31,31,31;"
1804 " isync;"
1805 " lwsync;"
1806 " isync;"
1807 " mtppr %0;"
1808 : "=r" (__PPR)
1809 :
1810 : "memory"
1811 );
1812#else
1813 /* For older processors, where we may not even have Program Priority
1814 controls, we can only depend on Heavy Weight Sync. */
1815 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1816#endif
1817}
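/* Editor's usage sketch, not part of the upstream header: _mm_pause is
   normally called inside a spin-wait loop to lower the cost of busy
   waiting.  The flag pointer is hypothetical.  */
static __inline void
example_spin_wait_usage (volatile int *flag)
{
  while (*flag == 0)
    _mm_pause ();
}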
1818
1819/* Transpose the 4x4 matrix composed of row[0-3]. */
1820#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1821do { \
1822 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1823 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1824 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1825 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1826 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1827 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1828 (__vector long long)__t1); \
1829 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1830 (__vector long long)__t1); \
1831 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1832 (__vector long long)__t3); \
1833 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1834 (__vector long long)__t3); \
1835} while (0)
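/* Editor's usage sketch, not part of the upstream header: transposes a 4x4
   matrix of floats in place.  The helper and its argument are hypothetical;
   the array is assumed to be 16-byte aligned so each row can be loaded and
   stored with the aligned forms.  */
static __inline void
example_transpose_usage (float m[4][4])
{
  __m128 r0 = _mm_load_ps (m[0]);
  __m128 r1 = _mm_load_ps (m[1]);
  __m128 r2 = _mm_load_ps (m[2]);
  __m128 r3 = _mm_load_ps (m[3]);
  _MM_TRANSPOSE4_PS (r0, r1, r2, r3);  /* r0 now holds the original column 0.  */
  _mm_store_ps (m[0], r0);
  _mm_store_ps (m[1], r1);
  _mm_store_ps (m[2], r2);
  _mm_store_ps (m[3], r3);
}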
1836
1837/* For backward source compatibility. */
1838//# include <emmintrin.h>
1839
1840#else
1841#include_next <xmmintrin.h>
1842#endif /* defined(__linux__) && defined(__ppc64__) */
1843
1844#endif /* _XMMINTRIN_H_INCLUDED */