10 #ifndef EIGEN_PACKET_MATH_AVX512_H
11 #define EIGEN_PACKET_MATH_AVX512_H
17 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
18 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
21 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
22 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
25 #ifdef EIGEN_VECTORIZE_FMA
26 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
27 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
95 template<>
struct packet_traits<float> : default_packet_traits
112 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
132 template<>
struct packet_traits<double> : default_packet_traits
141 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
198 return _mm512_set1_ps(from);
202 return _mm512_set1_pd(from);
206 return _mm512_set1_epi32(from);
211 return _mm512_castsi512_ps(_mm512_set1_epi32(from));
216 return _mm512_castsi512_pd(_mm512_set1_epi64(from));
224 return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
225 0, -1, 0, -1, 0, -1, 0, -1));
228 return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
229 0, -1, 0, -1, 0, -1, 0, -1);
232 return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
233 0, 0, -1, -1, 0, 0, -1, -1));
238 return _mm512_broadcastss_ps(_mm_load_ps1(from));
242 return _mm512_set1_pd(*from);
247 return _mm512_add_ps(
249 _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
250 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
254 return _mm512_add_pd(_mm512_set1_pd(
a),
255 _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
261 return _mm512_add_ps(
a, b);
266 return _mm512_add_pd(
a, b);
271 return _mm512_add_epi32(
a, b);
277 return _mm512_sub_ps(
a, b);
282 return _mm512_sub_pd(
a, b);
287 return _mm512_sub_epi32(
a, b);
292 return _mm512_sub_ps(_mm512_set1_ps(0.0),
a);
296 return _mm512_sub_pd(_mm512_set1_pd(0.0),
a);
315 return _mm512_mul_ps(
a, b);
320 return _mm512_mul_pd(
a, b);
325 return _mm512_mullo_epi32(
a, b);
331 return _mm512_div_ps(
a, b);
336 return _mm512_div_pd(
a, b);
339 #ifdef EIGEN_VECTORIZE_FMA
343 return _mm512_fmadd_ps(
a, b, c);
348 return _mm512_fmadd_pd(
a, b, c);
356 __mmask16 mask16 = _mm512_cmp_epi32_mask(
357 _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
358 return _mm512_mask_blend_ps(mask16,
a, b);
365 __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask),
366 _mm512_setzero_epi32(), _MM_CMPINT_EQ);
367 return _mm512_mask_blend_pd(mask8,
a, b);
374 return _mm512_min_ps(b,
a);
380 return _mm512_min_pd(b,
a);
387 return _mm512_max_ps(b,
a);
393 return _mm512_max_pd(b,
a);
431 #ifdef EIGEN_VECTORIZE_AVX512DQ
438 return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
443 return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
447 return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(
a)),
448 _mm256_castps_si256(b),1));
462 __m256i lo = _mm256_castps_si256(extract256<0>(rf));
463 __m256i hi = _mm256_castps_si256(extract256<1>(rf));
464 __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
465 _mm256_extractf128_si256(lo, 1));
466 __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
467 _mm256_extractf128_si256(hi, 1));
468 return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
473 __mmask16 mask = _mm512_cmp_ps_mask(
a, b, _CMP_EQ_OQ);
474 return _mm512_castsi512_ps(
475 _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
478 __mmask16 mask = _mm512_cmp_ps_mask(
a, b, _CMP_LE_OQ);
479 return _mm512_castsi512_ps(
480 _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
484 __mmask16 mask = _mm512_cmp_ps_mask(
a, b, _CMP_LT_OQ);
485 return _mm512_castsi512_ps(
486 _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
490 __mmask16 mask = _mm512_cmp_ps_mask(
a, b, _CMP_NGE_UQ);
491 return _mm512_castsi512_ps(
492 _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
496 __mmask16 mask = _mm512_cmp_epi32_mask(
a, b, _CMP_EQ_OQ);
497 return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
503 __mmask8 mask = _mm512_cmp_pd_mask(
a, b, _CMP_EQ_OQ);
504 return _mm512_castsi512_pd(
505 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
509 __mmask8 mask = _mm512_cmp_pd_mask(
a, b, _CMP_LE_OQ);
510 return _mm512_castsi512_pd(
511 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
515 __mmask8 mask = _mm512_cmp_pd_mask(
a, b, _CMP_LT_OQ);
516 return _mm512_castsi512_pd(
517 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
521 __mmask8 mask = _mm512_cmp_pd_mask(
a, b, _CMP_NGE_UQ);
522 return _mm512_castsi512_pd(
523 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
537 return _mm512_set1_epi32(0xffffffffu);
553 return _mm512_and_si512(
a,b);
559 #ifdef EIGEN_VECTORIZE_AVX512DQ
560 return _mm512_and_ps(
a, b);
562 return _mm512_castsi512_ps(
pand(_mm512_castps_si512(
a),_mm512_castps_si512(b)));
568 #ifdef EIGEN_VECTORIZE_AVX512DQ
569 return _mm512_and_pd(
a, b);
571 Packet8d res = _mm512_undefined_pd();
572 Packet4d lane0_a = _mm512_extractf64x4_pd(
a, 0);
573 Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
574 res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);
576 Packet4d lane1_a = _mm512_extractf64x4_pd(
a, 1);
577 Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
578 return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
584 return _mm512_or_si512(
a, b);
589 #ifdef EIGEN_VECTORIZE_AVX512DQ
590 return _mm512_or_ps(
a, b);
592 return _mm512_castsi512_ps(
por(_mm512_castps_si512(
a),_mm512_castps_si512(b)));
599 #ifdef EIGEN_VECTORIZE_AVX512DQ
600 return _mm512_or_pd(
a, b);
602 return _mm512_castsi512_pd(
por(_mm512_castpd_si512(
a),_mm512_castpd_si512(b)));
608 return _mm512_xor_si512(
a, b);
613 #ifdef EIGEN_VECTORIZE_AVX512DQ
614 return _mm512_xor_ps(
a, b);
616 return _mm512_castsi512_ps(
pxor(_mm512_castps_si512(
a),_mm512_castps_si512(b)));
622 #ifdef EIGEN_VECTORIZE_AVX512DQ
623 return _mm512_xor_pd(
a, b);
625 return _mm512_castsi512_pd(
pxor(_mm512_castpd_si512(
a),_mm512_castpd_si512(b)));
631 return _mm512_andnot_si512(b,
a);
636 #ifdef EIGEN_VECTORIZE_AVX512DQ
637 return _mm512_andnot_ps(b,
a);
639 return _mm512_castsi512_ps(
pandnot(_mm512_castps_si512(
a),_mm512_castps_si512(b)));
644 #ifdef EIGEN_VECTORIZE_AVX512DQ
645 return _mm512_andnot_pd(b,
a);
647 return _mm512_castsi512_pd(
pandnot(_mm512_castpd_si512(
a),_mm512_castpd_si512(b)));
656 return _mm512_roundscale_ps(
padd(
por(
pand(
a, mask), prev0dot5),
a), _MM_FROUND_TO_ZERO);
663 return _mm512_roundscale_pd(
padd(
por(
pand(
a, mask), prev0dot5),
a), _MM_FROUND_TO_ZERO);
667 return _mm512_srai_epi32(
a,
N);
671 return _mm512_srli_epi32(
a,
N);
675 return _mm512_slli_epi32(
a,
N);
689 reinterpret_cast<const __m512i*
>(from));
703 reinterpret_cast<const __m512i*
>(from));
708 __mmask16 mask =
static_cast<__mmask16
>(umask);
718 __m256i low_half = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(from));
719 __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
720 __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
724 #ifdef EIGEN_VECTORIZE_AVX512DQ
730 __m512d x = _mm512_setzero_pd();
731 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
732 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
733 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
734 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
740 __m512d x = _mm512_setzero_pd();
741 x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
742 x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
743 x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
744 x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
754 const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
755 return _mm512_permutexvar_ps(scatter_mask, tmp);
762 __m256d lane0 = _mm256_set1_pd(*from);
763 __m256d lane1 = _mm256_set1_pd(*(from+1));
764 __m512d tmp = _mm512_undefined_pd();
765 tmp = _mm512_insertf64x4(tmp, lane0, 0);
766 return _mm512_insertf64x4(tmp, lane1, 1);
794 reinterpret_cast<__m512i*
>(to), from);
798 __mmask16 mask =
static_cast<__mmask16
>(umask);
805 Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
807 _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
808 Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
810 return _mm512_i32gather_ps(indices, from, 4);
815 Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
816 Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
817 Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
819 return _mm512_i32gather_pd(indices, from, 8);
826 Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
828 _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
829 Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
830 _mm512_i32scatter_ps(to, indices, from, 4);
836 Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
837 Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
838 Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
839 _mm512_i32scatter_pd(to, indices, from, 8);
864 return _mm_cvtss_f32(_mm512_extractf32x4_ps(
a, 0));
868 return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(
a, 0), 0));
872 return _mm_extract_epi32(_mm512_extracti32x4_epi32(
a, 0), 0);
877 return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
a);
882 return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7),
a);
888 return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(
a), _mm512_set1_epi32(0x7fffffff)));
893 return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(
a),
894 _mm512_set1_epi64(0x7fffffffffffffff)));
907 #ifdef EIGEN_VECTORIZE_AVX512DQ
908 return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(
pand(
a, cst_exp_mask)), 52));
910 return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(
pand(
a, cst_exp_mask)), 52)));
930 Packet8i b = parithmetic_shift_right<2>(e);
933 const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
934 Packet8i hi = _mm256_permutevar8x32_epi32(
padd(b, bias), permute_idx);
935 Packet8i lo = _mm256_slli_epi64(hi, 52);
936 hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
937 Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
942 hi = _mm256_permutevar8x32_epi32(
padd(b, bias), permute_idx);
943 lo = _mm256_slli_epi64(hi, 52);
944 hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
945 c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
950 #ifdef EIGEN_VECTORIZE_AVX512DQ
// Variant following the EIGEN_VECTORIZE_AVX512DQ guard.
// Declares two __m256 locals named OUTPUT_0 and OUTPUT_1 (names formed by
// token pasting) and initialises them with _mm512_extractf32x8_ps(INPUT, 0)
// and _mm512_extractf32x8_ps(INPUT, 1) respectively.
// The expansion does not end with a semicolon; the call site supplies it.
// INPUT is expanded twice, so it should be an expression without side effects.
952 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
953 __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
954 __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
// Variant that only uses 128-bit extracts (_mm512_extractf32x4_ps).
// NOTE(review): presumably the fallback branch for builds without AVX512DQ;
// the selecting #else is not visible in this excerpt - confirm.
// Declares two __m256 locals named OUTPUT_0 and OUTPUT_1:
//   OUTPUT_0 = 128-bit pieces 0 and 1 of INPUT, joined with _mm256_insertf128_ps
//   OUTPUT_1 = 128-bit pieces 2 and 3 of INPUT, joined the same way
// INPUT is expanded four times, so it should have no side effects.
// This expansion ends with a semicolon, so a call site that adds its own ';'
// produces an extra empty statement.
956 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
957 __m256 OUTPUT##_0 = _mm256_insertf128_ps( \
958 _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
959 _mm512_extractf32x4_ps(INPUT, 1), 1); \
960 __m256 OUTPUT##_1 = _mm256_insertf128_ps( \
961 _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
962 _mm512_extractf32x4_ps(INPUT, 3), 1);
965 #ifdef EIGEN_VECTORIZE_AVX512DQ
// Variant following the EIGEN_VECTORIZE_AVX512DQ guard.
// Assigns to OUTPUT the result of widening INPUTA with _mm512_castps256_ps512
// and then inserting INPUTB at position 1 with _mm512_insertf32x8.
// The expansion is a single assignment statement and already ends with ';'.
966 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
967 OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
// Variant that only uses 128-bit inserts (_mm512_insertf32x4).
// NOTE(review): presumably the fallback branch for builds without AVX512DQ;
// the selecting #else is not visible in this excerpt - confirm.
// OUTPUT is first set to _mm512_undefined_ps() and then filled in four steps:
//   position 0 <- 128-bit half 0 of INPUTA
//   position 1 <- 128-bit half 1 of INPUTA
//   position 2 <- 128-bit half 0 of INPUTB
//   position 3 <- 128-bit half 1 of INPUTB
// The expansion is five separate statements with no enclosing braces, so it
// must not be used as the sole body of an unbraced if/for/while.
// INPUTA and INPUTB are each expanded twice and OUTPUT many times; all three
// should be side-effect-free expressions.
969 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
970 OUTPUT = _mm512_undefined_ps(); \
971 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
972 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
973 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
974 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
979 #ifdef EIGEN_VECTORIZE_AVX512DQ
980 __m256 lane0 = _mm512_extractf32x8_ps(
a, 0);
981 __m256 lane1 = _mm512_extractf32x8_ps(
a, 1);
982 Packet8f x = _mm256_add_ps(lane0, lane1);
985 __m128 lane0 = _mm512_extractf32x4_ps(
a, 0);
986 __m128 lane1 = _mm512_extractf32x4_ps(
a, 1);
987 __m128 lane2 = _mm512_extractf32x4_ps(
a, 2);
988 __m128 lane3 = _mm512_extractf32x4_ps(
a, 3);
989 __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
990 sum = _mm_hadd_ps(sum, sum);
991 sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
992 return _mm_cvtss_f32(sum);
997 __m256d lane0 = _mm512_extractf64x4_pd(
a, 0);
998 __m256d lane1 = _mm512_extractf64x4_pd(
a, 1);
999 __m256d sum = _mm256_add_pd(lane0, lane1);
1000 __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
1001 return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
1006 #ifdef EIGEN_VECTORIZE_AVX512DQ
1007 __m256 lane0 = _mm512_extractf32x8_ps(
a, 0);
1008 __m256 lane1 = _mm512_extractf32x8_ps(
a, 1);
1009 return _mm256_add_ps(lane0, lane1);
1011 __m128 lane0 = _mm512_extractf32x4_ps(
a, 0);
1012 __m128 lane1 = _mm512_extractf32x4_ps(
a, 1);
1013 __m128 lane2 = _mm512_extractf32x4_ps(
a, 2);
1014 __m128 lane3 = _mm512_extractf32x4_ps(
a, 3);
1015 __m128 sum0 = _mm_add_ps(lane0, lane2);
1016 __m128 sum1 = _mm_add_ps(lane1, lane3);
1017 return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
1022 __m256d lane0 = _mm512_extractf64x4_pd(
a, 0);
1023 __m256d lane1 = _mm512_extractf64x4_pd(
a, 1);
1024 return _mm256_add_pd(lane0, lane1);
1031 Packet8f lane0 = _mm512_extractf32x8_ps(
a, 0);
1032 Packet8f lane1 = _mm512_extractf32x8_ps(
a, 1);
1034 res =
pmul(res, _mm256_permute2f128_ps(res, res, 1));
1035 res =
pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1036 return pfirst(
pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1038 __m128 lane0 = _mm512_extractf32x4_ps(
a, 0);
1039 __m128 lane1 = _mm512_extractf32x4_ps(
a, 1);
1040 __m128 lane2 = _mm512_extractf32x4_ps(
a, 2);
1041 __m128 lane3 = _mm512_extractf32x4_ps(
a, 3);
1042 __m128 res =
pmul(
pmul(lane0, lane1),
pmul(lane2, lane3));
1043 res =
pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1044 return pfirst(
pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1049 __m256d lane0 = _mm512_extractf64x4_pd(
a, 0);
1050 __m256d lane1 = _mm512_extractf64x4_pd(
a, 1);
1051 __m256d res =
pmul(lane0, lane1);
1052 res =
pmul(res, _mm256_permute2f128_pd(res, res, 1));
1053 return pfirst(
pmul(res, _mm256_shuffle_pd(res, res, 1)));
1058 __m128 lane0 = _mm512_extractf32x4_ps(
a, 0);
1059 __m128 lane1 = _mm512_extractf32x4_ps(
a, 1);
1060 __m128 lane2 = _mm512_extractf32x4_ps(
a, 2);
1061 __m128 lane3 = _mm512_extractf32x4_ps(
a, 3);
1062 __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
1063 res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1064 return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1068 __m256d lane0 = _mm512_extractf64x4_pd(
a, 0);
1069 __m256d lane1 = _mm512_extractf64x4_pd(
a, 1);
1070 __m256d res = _mm256_min_pd(lane0, lane1);
1071 res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
1072 return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
1077 __m128 lane0 = _mm512_extractf32x4_ps(
a, 0);
1078 __m128 lane1 = _mm512_extractf32x4_ps(
a, 1);
1079 __m128 lane2 = _mm512_extractf32x4_ps(
a, 2);
1080 __m128 lane3 = _mm512_extractf32x4_ps(
a, 3);
1081 __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
1082 res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1083 return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1088 __m256d lane0 = _mm512_extractf64x4_pd(
a, 0);
1089 __m256d lane1 = _mm512_extractf64x4_pd(
a, 1);
1090 __m256d res = _mm256_max_pd(lane0, lane1);
1091 res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
1092 return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
1098 __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
1099 return !_mm512_kortestz(tmp,tmp);
// Fills OUTPUT[INDEX] by handing INPUT[INDEX] and INPUT[INDEX + STRIDE] to
// EIGEN_INSERT_8f_INTO_16f, in that order.
// The expansion appends its own ';' after the inner macro call.
1104 #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
1105 EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
1108 __m512 T0 = _mm512_unpacklo_ps(kernel.
packet[0], kernel.
packet[1]);
1109 __m512 T1 = _mm512_unpackhi_ps(kernel.
packet[0], kernel.
packet[1]);
1110 __m512 T2 = _mm512_unpacklo_ps(kernel.
packet[2], kernel.
packet[3]);
1111 __m512 T3 = _mm512_unpackhi_ps(kernel.
packet[2], kernel.
packet[3]);
1112 __m512 T4 = _mm512_unpacklo_ps(kernel.
packet[4], kernel.
packet[5]);
1113 __m512 T5 = _mm512_unpackhi_ps(kernel.
packet[4], kernel.
packet[5]);
1114 __m512 T6 = _mm512_unpacklo_ps(kernel.
packet[6], kernel.
packet[7]);
1115 __m512 T7 = _mm512_unpackhi_ps(kernel.
packet[6], kernel.
packet[7]);
1116 __m512 T8 = _mm512_unpacklo_ps(kernel.
packet[8], kernel.
packet[9]);
1117 __m512 T9 = _mm512_unpackhi_ps(kernel.
packet[8], kernel.
packet[9]);
1118 __m512 T10 = _mm512_unpacklo_ps(kernel.
packet[10], kernel.
packet[11]);
1119 __m512 T11 = _mm512_unpackhi_ps(kernel.
packet[10], kernel.
packet[11]);
1120 __m512 T12 = _mm512_unpacklo_ps(kernel.
packet[12], kernel.
packet[13]);
1121 __m512 T13 = _mm512_unpackhi_ps(kernel.
packet[12], kernel.
packet[13]);
1122 __m512 T14 = _mm512_unpacklo_ps(kernel.
packet[14], kernel.
packet[15]);
1123 __m512 T15 = _mm512_unpackhi_ps(kernel.
packet[14], kernel.
packet[15]);
1124 __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1125 __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1126 __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1127 __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1128 __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
1129 __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
1130 __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
1131 __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
1132 __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
1133 __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
1134 __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
1135 __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
1136 __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
1137 __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
1138 __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
1139 __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
1160 tmp.
packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
1161 tmp.
packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
1162 tmp.
packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
1163 tmp.
packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
1164 tmp.
packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
1165 tmp.
packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
1166 tmp.
packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
1167 tmp.
packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);
1169 tmp.
packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
1170 tmp.
packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
1171 tmp.
packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
1172 tmp.
packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
1173 tmp.
packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
1174 tmp.
packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
1175 tmp.
packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
1176 tmp.
packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);
1179 tmp.
packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
1180 tmp.
packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
1181 tmp.
packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
1182 tmp.
packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
1183 tmp.
packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
1184 tmp.
packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
1185 tmp.
packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
1186 tmp.
packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);
1188 tmp.
packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
1189 tmp.
packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
1190 tmp.
packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
1191 tmp.
packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
1192 tmp.
packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
1193 tmp.
packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
1194 tmp.
packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
1195 tmp.
packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);
// Like a strided pack, but the source index is doubled: fills OUTPUT[INDEX] by
// handing INPUT[2 * INDEX] and INPUT[2 * INDEX + STRIDE] to
// EIGEN_INSERT_8f_INTO_16f, in that order.
// INDEX is pasted unparenthesised next to '2 *', so pass a simple index
// (a literal or a variable), not a compound expression such as 'i + 1'.
1218 #define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
1219 EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
1220 INPUT[2 * INDEX + STRIDE]);
1223 __m512 T0 = _mm512_unpacklo_ps(kernel.
packet[0], kernel.
packet[1]);
1224 __m512 T1 = _mm512_unpackhi_ps(kernel.
packet[0], kernel.
packet[1]);
1225 __m512 T2 = _mm512_unpacklo_ps(kernel.
packet[2], kernel.
packet[3]);
1226 __m512 T3 = _mm512_unpackhi_ps(kernel.
packet[2], kernel.
packet[3]);
1228 __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1229 __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1230 __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1231 __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1240 tmp.
packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
1241 tmp.
packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
1242 tmp.
packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
1243 tmp.
packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);
1245 tmp.
packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
1246 tmp.
packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
1247 tmp.
packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
1248 tmp.
packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);
// Updates OUTPUT[INDEX] in two steps using _mm512_insertf64x4:
//   position 0 <- INPUT[INDEX]
//   position 1 <- INPUT[INDEX + STRIDE]
// OUTPUT[INDEX] is read as the base value of the first insert, so it is used
// before this macro has written it; both positions are overwritten afterwards.
// The expansion is two separate statements with no enclosing braces, so it
// must not be used as the sole body of an unbraced if/for/while.
1256 #define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE) \
1257 OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
1258 OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);
1260 #define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \
1261 OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
1263 _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
1266 __m512d T0 = _mm512_shuffle_pd(kernel.
packet[0], kernel.
packet[1], 0);
1267 __m512d T1 = _mm512_shuffle_pd(kernel.
packet[0], kernel.
packet[1], 0xff);
1268 __m512d T2 = _mm512_shuffle_pd(kernel.
packet[2], kernel.
packet[3], 0);
1269 __m512d T3 = _mm512_shuffle_pd(kernel.
packet[2], kernel.
packet[3], 0xff);
1273 tmp.
packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1274 _mm512_extractf64x4_pd(T2, 0), 0x20);
1275 tmp.
packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1276 _mm512_extractf64x4_pd(T3, 0), 0x20);
1277 tmp.
packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1278 _mm512_extractf64x4_pd(T2, 0), 0x31);
1279 tmp.
packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1280 _mm512_extractf64x4_pd(T3, 0), 0x31);
1282 tmp.
packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1283 _mm512_extractf64x4_pd(T2, 1), 0x20);
1284 tmp.
packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1285 _mm512_extractf64x4_pd(T3, 1), 0x20);
1286 tmp.
packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1287 _mm512_extractf64x4_pd(T2, 1), 0x31);
1288 tmp.
packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1289 _mm512_extractf64x4_pd(T3, 1), 0x31);
1298 __m512d T0 = _mm512_unpacklo_pd(kernel.
packet[0], kernel.
packet[1]);
1299 __m512d T1 = _mm512_unpackhi_pd(kernel.
packet[0], kernel.
packet[1]);
1300 __m512d T2 = _mm512_unpacklo_pd(kernel.
packet[2], kernel.
packet[3]);
1301 __m512d T3 = _mm512_unpackhi_pd(kernel.
packet[2], kernel.
packet[3]);
1302 __m512d T4 = _mm512_unpacklo_pd(kernel.
packet[4], kernel.
packet[5]);
1303 __m512d T5 = _mm512_unpackhi_pd(kernel.
packet[4], kernel.
packet[5]);
1304 __m512d T6 = _mm512_unpacklo_pd(kernel.
packet[6], kernel.
packet[7]);
1305 __m512d T7 = _mm512_unpackhi_pd(kernel.
packet[6], kernel.
packet[7]);
1309 tmp.
packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1310 _mm512_extractf64x4_pd(T2, 0), 0x20);
1311 tmp.
packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1312 _mm512_extractf64x4_pd(T3, 0), 0x20);
1313 tmp.
packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1314 _mm512_extractf64x4_pd(T2, 0), 0x31);
1315 tmp.
packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1316 _mm512_extractf64x4_pd(T3, 0), 0x31);
1318 tmp.
packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1319 _mm512_extractf64x4_pd(T2, 1), 0x20);
1320 tmp.
packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1321 _mm512_extractf64x4_pd(T3, 1), 0x20);
1322 tmp.
packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1323 _mm512_extractf64x4_pd(T2, 1), 0x31);
1324 tmp.
packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1325 _mm512_extractf64x4_pd(T3, 1), 0x31);
1327 tmp.
packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
1328 _mm512_extractf64x4_pd(T6, 0), 0x20);
1329 tmp.
packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
1330 _mm512_extractf64x4_pd(T7, 0), 0x20);
1331 tmp.
packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
1332 _mm512_extractf64x4_pd(T6, 0), 0x31);
1333 tmp.
packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
1334 _mm512_extractf64x4_pd(T7, 0), 0x31);
1336 tmp.
packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
1337 _mm512_extractf64x4_pd(T6, 1), 0x20);
1338 tmp.
packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
1339 _mm512_extractf64x4_pd(T7, 1), 0x20);
1340 tmp.
packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
1341 _mm512_extractf64x4_pd(T6, 1), 0x31);
1342 tmp.
packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
1343 _mm512_extractf64x4_pd(T7, 1), 0x31);
1359 assert(
false &&
"To be implemented");
1366 __mmask8 m = (ifPacket.
select[0] )
1367 | (ifPacket.
select[1]<<1)
1368 | (ifPacket.
select[2]<<2)
1369 | (ifPacket.
select[3]<<3)
1370 | (ifPacket.
select[4]<<4)
1371 | (ifPacket.
select[5]<<5)
1372 | (ifPacket.
select[6]<<6)
1373 | (ifPacket.
select[7]<<7);
1374 return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
1379 return _mm256_set1_epi16(from.
x);
1387 return _mm256_load_si256(
reinterpret_cast<const __m256i*
>(from));
1391 return _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(from));
1397 _mm256_store_si256((__m256i*)(
void*)to, from);
1403 _mm256_storeu_si256((__m256i*)(
void*)to, from);
1408 unsigned short a = from[0].
x;
1409 unsigned short b = from[1].
x;
1410 unsigned short c = from[2].
x;
1411 unsigned short d = from[3].
x;
1412 unsigned short e = from[4].
x;
1413 unsigned short f = from[5].
x;
1414 unsigned short g = from[6].
x;
1415 unsigned short h = from[7].
x;
1416 return _mm256_set_epi16(h, h, g, g, f, f, e, e,
d,
d, c, c, b, b,
a,
a);
1421 unsigned short a = from[0].
x;
1422 unsigned short b = from[1].
x;
1423 unsigned short c = from[2].
x;
1424 unsigned short d = from[3].
x;
1425 return _mm256_set_epi16(
d,
d,
d,
d, c, c, c, c, b, b, b, b,
a,
a,
a,
a);
1429 #ifdef EIGEN_HAS_FP16_C
1430 return _mm512_cvtph_ps(
a);
1451 return _mm512_set_ps(
1452 ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
1457 #ifdef EIGEN_HAS_FP16_C
1458 return _mm512_cvtps_ph(
a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
1479 return _mm256_set_epi16(
1480 hf.
x, he.
x, hd.
x, hc.
x, hb.
x, ha.
x, h9.
x, h8.
x,
1481 h7.
x, h6.
x, h5.
x, h4.
x, h3.
x, h2.
x, h1.
x, h0.
x);
1491 const __m256i sign_mask = _mm256_set1_epi16(
static_cast<numext::uint16_t>(0x8000));
1492 return _mm256_andnot_si256(sign_mask,
a);
1528 return _mm256_blendv_epi8(b,
a, mask);
1568 Packet16h sign_mask = _mm256_set1_epi16(
static_cast<unsigned short>(0x8000));
1569 return _mm256_xor_si256(
a, sign_mask);
1607 Packet8h lane0 = _mm256_extractf128_si256(
a, 0);
1608 Packet8h lane1 = _mm256_extractf128_si256(
a, 1);
1631 __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
1632 return _mm256_insertf128_si256(
1633 _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(
a,1),m)),
1634 _mm_shuffle_epi8(_mm256_extractf128_si256(
a,0),m), 1);
1639 return _mm256_set_epi16(
1640 from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
1641 from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
1642 from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
1643 from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
1650 to[stride*0] = aux[0];
1651 to[stride*1] = aux[1];
1652 to[stride*2] = aux[2];
1653 to[stride*3] = aux[3];
1654 to[stride*4] = aux[4];
1655 to[stride*5] = aux[5];
1656 to[stride*6] = aux[6];
1657 to[stride*7] = aux[7];
1658 to[stride*8] = aux[8];
1659 to[stride*9] = aux[9];
1660 to[stride*10] = aux[10];
1661 to[stride*11] = aux[11];
1662 to[stride*12] = aux[12];
1663 to[stride*13] = aux[13];
1664 to[stride*14] = aux[14];
1665 to[stride*15] = aux[15];
1671 __m256i b = kernel.
packet[1];
1672 __m256i c = kernel.
packet[2];
1674 __m256i e = kernel.
packet[4];
1675 __m256i f = kernel.
packet[5];
1676 __m256i g = kernel.
packet[6];
1677 __m256i h = kernel.
packet[7];
1678 __m256i i = kernel.
packet[8];
1679 __m256i j = kernel.
packet[9];
1680 __m256i k = kernel.
packet[10];
1681 __m256i l = kernel.
packet[11];
1682 __m256i m = kernel.
packet[12];
1683 __m256i n = kernel.
packet[13];
1684 __m256i o = kernel.
packet[14];
1685 __m256i p = kernel.
packet[15];
1687 __m256i ab_07 = _mm256_unpacklo_epi16(
a, b);
1688 __m256i cd_07 = _mm256_unpacklo_epi16(c,
d);
1689 __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
1690 __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
1691 __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
1692 __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
1693 __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
1694 __m256i op_07 = _mm256_unpacklo_epi16(o, p);
1696 __m256i ab_8f = _mm256_unpackhi_epi16(
a, b);
1697 __m256i cd_8f = _mm256_unpackhi_epi16(c,
d);
1698 __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
1699 __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
1700 __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
1701 __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
1702 __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
1703 __m256i op_8f = _mm256_unpackhi_epi16(o, p);
1705 __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
1706 __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
1707 __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
1708 __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
1709 __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
1710 __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
1711 __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
1712 __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
1714 __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
1715 __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
1716 __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
1717 __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
1718 __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
1719 __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
1720 __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
1721 __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
1723 __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
1724 __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
1725 __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
1726 __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
1727 __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
1728 __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
1729 __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
1730 __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
1731 __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
1732 __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
1733 __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
1734 __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
1735 __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
1736 __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
1737 __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
1738 __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
1741 __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
1742 __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
1743 __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
1744 __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
1745 __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
1746 __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
1747 __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
1748 __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
1749 __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
1750 __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
1751 __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
1752 __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
1753 __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
1754 __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
1755 __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
1756 __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
1758 kernel.
packet[0] = a_p_0;
1759 kernel.
packet[1] = a_p_1;
1760 kernel.
packet[2] = a_p_2;
1761 kernel.
packet[3] = a_p_3;
1762 kernel.
packet[4] = a_p_4;
1763 kernel.
packet[5] = a_p_5;
1764 kernel.
packet[6] = a_p_6;
1765 kernel.
packet[7] = a_p_7;
1766 kernel.
packet[8] = a_p_8;
1767 kernel.
packet[9] = a_p_9;
1768 kernel.
packet[10] = a_p_a;
1769 kernel.
packet[11] = a_p_b;
1770 kernel.
packet[12] = a_p_c;
1771 kernel.
packet[13] = a_p_d;
1772 kernel.
packet[14] = a_p_e;
1773 kernel.
packet[15] = a_p_f;
1790 for (
int i = 0; i < 8; ++i) {
1791 for (
int j = 0; j < 8; ++j) {
1792 out[i][j] = in[j][2*i];
1794 for (
int j = 0; j < 8; ++j) {
1795 out[i][j+8] = in[j][2*i+1];
1819 for (
int i = 0; i < 4; ++i) {
1820 for (
int j = 0; j < 4; ++j) {
1821 out[i][j] = in[j][4*i];
1823 for (
int j = 0; j < 4; ++j) {
1824 out[i][j+4] = in[j][4*i+1];
1826 for (
int j = 0; j < 4; ++j) {
1827 out[i][j+8] = in[j][4*i+2];
1829 for (
int j = 0; j < 4; ++j) {
1830 out[i][j+12] = in[j][4*i+3];
1855 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
1856 #ifdef EIGEN_VECTORIZE_AVX512DQ
1884 return _mm256_set1_epi16(from.
value);
1890 t.
value =
static_cast<unsigned short>(_mm256_extract_epi16(from, 0));
1896 return _mm256_load_si256(
reinterpret_cast<const __m256i*
>(from));
1901 return _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(from));
1907 _mm256_store_si256(
reinterpret_cast<__m256i*
>(to), from);
1913 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(to), from);
1919 unsigned short a = from[0].
value;
1920 unsigned short b = from[1].
value;
1921 unsigned short c = from[2].
value;
1922 unsigned short d = from[3].
value;
1923 unsigned short e = from[4].
value;
1924 unsigned short f = from[5].
value;
1925 unsigned short g = from[6].
value;
1926 unsigned short h = from[7].
value;
1927 return _mm256_set_epi16(h, h, g, g, f, f, e, e,
d,
d, c, c, b, b,
a,
a);
1933 unsigned short a = from[0].
value;
1934 unsigned short b = from[1].
value;
1935 unsigned short c = from[2].
value;
1936 unsigned short d = from[3].
value;
1937 return _mm256_set_epi16(
d,
d,
d,
d, c, c, c, c, b, b, b, b,
a,
a,
a,
a);
1941 return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(
a), 16));
1948 #if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1)
1952 r = (__m256i)(_mm512_cvtneps_pbh(
a));
1956 __m512i input = _mm512_castps_si512(
a);
1957 __m512i nan = _mm512_set1_epi32(0x7fc0);
1960 t = _mm512_and_si512(_mm512_srli_epi32(input, 16), _mm512_set1_epi32(1));
1962 t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff));
1964 t = _mm512_add_epi32(t, input);
1966 t = _mm512_srli_epi32(t, 16);
1969 __mmask16 mask = _mm512_cmp_ps_mask(
a,
a, _CMP_ORD_Q);
1971 t = _mm512_mask_blend_epi32(mask, nan, t);
1973 r = _mm512_cvtepi32_epi16(t);
2011 return _mm256_blendv_epi8(b,
a, mask);
2057 Packet16bf sign_mask = _mm256_set1_epi16(
static_cast<unsigned short>(0x8000));
2058 return _mm256_xor_si256(
a, sign_mask);
2068 const __m256i sign_mask = _mm256_set1_epi16(
static_cast<numext::uint16_t>(0x8000));
2069 return _mm256_andnot_si256(sign_mask,
a);
2115 Packet8bf lane0 = _mm256_extractf128_si256(
a, 0);
2116 Packet8bf lane1 = _mm256_extractf128_si256(
a, 1);
2142 __m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,
2143 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
2147 res = _mm256_permute2x128_si256(
a,
a, 1);
2149 return _mm256_shuffle_epi8(res, m);
2155 return _mm256_set_epi16(
2168 to[stride*0] = aux[0];
2169 to[stride*1] = aux[1];
2170 to[stride*2] = aux[2];
2171 to[stride*3] = aux[3];
2172 to[stride*4] = aux[4];
2173 to[stride*5] = aux[5];
2174 to[stride*6] = aux[6];
2175 to[stride*7] = aux[7];
2176 to[stride*8] = aux[8];
2177 to[stride*9] = aux[9];
2178 to[stride*10] = aux[10];
2179 to[stride*11] = aux[11];
2180 to[stride*12] = aux[12];
2181 to[stride*13] = aux[13];
2182 to[stride*14] = aux[14];
2183 to[stride*15] = aux[15];
2188 __m256i b = kernel.
packet[1];
2189 __m256i c = kernel.
packet[2];
2191 __m256i e = kernel.
packet[4];
2192 __m256i f = kernel.
packet[5];
2193 __m256i g = kernel.
packet[6];
2194 __m256i h = kernel.
packet[7];
2195 __m256i i = kernel.
packet[8];
2196 __m256i j = kernel.
packet[9];
2197 __m256i k = kernel.
packet[10];
2198 __m256i l = kernel.
packet[11];
2199 __m256i m = kernel.
packet[12];
2200 __m256i n = kernel.
packet[13];
2201 __m256i o = kernel.
packet[14];
2202 __m256i p = kernel.
packet[15];
2204 __m256i ab_07 = _mm256_unpacklo_epi16(
a, b);
2205 __m256i cd_07 = _mm256_unpacklo_epi16(c,
d);
2206 __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
2207 __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
2208 __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
2209 __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
2210 __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
2211 __m256i op_07 = _mm256_unpacklo_epi16(o, p);
2213 __m256i ab_8f = _mm256_unpackhi_epi16(
a, b);
2214 __m256i cd_8f = _mm256_unpackhi_epi16(c,
d);
2215 __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
2216 __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
2217 __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
2218 __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
2219 __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
2220 __m256i op_8f = _mm256_unpackhi_epi16(o, p);
2222 __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
2223 __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
2224 __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
2225 __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
2226 __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
2227 __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
2228 __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
2229 __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
2231 __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
2232 __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
2233 __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
2234 __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
2235 __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
2236 __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
2237 __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
2238 __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
2240 __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
2241 __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
2242 __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
2243 __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
2244 __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
2245 __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
2246 __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
2247 __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
2248 __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
2249 __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
2250 __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
2251 __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
2252 __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
2253 __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
2254 __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
2255 __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
2258 kernel.
packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
2259 kernel.
packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
2260 kernel.
packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
2261 kernel.
packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
2262 kernel.
packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
2263 kernel.
packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
2264 kernel.
packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
2265 kernel.
packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
2266 kernel.
packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
2267 kernel.
packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
2268 kernel.
packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
2269 kernel.
packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
2270 kernel.
packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
2271 kernel.
packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
2272 kernel.
packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
2273 kernel.
packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
2278 __m256i b = kernel.
packet[1];
2279 __m256i c = kernel.
packet[2];
2282 __m256i ab_07 = _mm256_unpacklo_epi16(
a, b);
2283 __m256i cd_07 = _mm256_unpacklo_epi16(c,
d);
2284 __m256i ab_8f = _mm256_unpackhi_epi16(
a, b);
2285 __m256i cd_8f = _mm256_unpackhi_epi16(c,
d);
2287 __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
2288 __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
2289 __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
2290 __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
2293 kernel.
packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20);
2294 kernel.
packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20);
2295 kernel.
packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31);
2296 kernel.
packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)
Definition: PacketMath.h:1218
#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)
Definition: PacketMath.h:1260
#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)
Definition: PacketMath.h:1256
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)
Definition: PacketMath.h:956
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE)
Definition: PacketMath.h:1104
#define EIGEN_DEBUG_ALIGNED_STORE
Definition: GenericPacketMath.h:35
#define EIGEN_DEBUG_ALIGNED_LOAD
Definition: GenericPacketMath.h:27
#define EIGEN_DEBUG_UNALIGNED_STORE
Definition: GenericPacketMath.h:39
#define EIGEN_DEBUG_UNALIGNED_LOAD
Definition: GenericPacketMath.h:31
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
#define EIGEN_FAST_MATH
Definition: Macros.h:49
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
@ Aligned64
Definition: Constants.h:237
@ Aligned32
Definition: Constants.h:236
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x)
Definition: Half.h:495
EIGEN_STRONG_INLINE Packet8d pandnot< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:643
EIGEN_STRONG_INLINE Packet16i ptrue< Packet16i >(const Packet16i &)
Definition: PacketMath.h:536
v2f64 Packet2d
Definition: PacketMath.h:820
EIGEN_STRONG_INLINE void pstoreu< double >(double *to, const Packet4d &from)
Definition: PacketMath.h:627
EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet &a, const Packet &b, Op op)
Definition: PacketMath.h:546
EIGEN_STRONG_INLINE Packet16bf ploaddup< Packet16bf >(const bfloat16 *from)
Definition: PacketMath.h:1917
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
Definition: Complex.h:167
EIGEN_STRONG_INLINE Packet8d pload< Packet8d >(const double *from)
Definition: PacketMath.h:683
EIGEN_STRONG_INLINE Packet8d ploadquad< Packet8d >(const double *from)
Definition: PacketMath.h:761
EIGEN_STRONG_INLINE Packet8d psub< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:280
EIGEN_STRONG_INLINE Packet8d ploadu< Packet8d >(const double *from)
Definition: PacketMath.h:697
EIGEN_STRONG_INLINE Packet8d pset1frombits< Packet8d >(const numext::uint64_t from)
Definition: PacketMath.h:215
EIGEN_STRONG_INLINE Packet16f pceil< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:529
EIGEN_STRONG_INLINE Packet8d plset< Packet8d >(const double &a)
Definition: PacketMath.h:253
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:215
EIGEN_STRONG_INLINE bfloat16 predux_max< Packet16bf >(const Packet16bf &from)
Definition: PacketMath.h:2136
EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f &a)
Definition: PacketMath.h:1007
EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf &a)
Definition: PacketMath.h:1260
EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f &)
Definition: PacketMath.h:247
EIGEN_STRONG_INLINE Packet8d pmul< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:318
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:1005
EIGEN_STRONG_INLINE Packet8d pmin< PropagateNaN, Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:418
EIGEN_STRONG_INLINE Packet16bf pmax< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition: PacketMath.h:2103
EIGEN_STRONG_INLINE Packet16f pfrexp< Packet16f >(const Packet16f &a, Packet16f &exponent)
Definition: PacketMath.h:898
EIGEN_STRONG_INLINE Packet16f padd< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:259
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b)
Definition: PacketMath.h:446
EIGEN_STRONG_INLINE Packet16bf pmul< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition: PacketMath.h:2085
EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x)
Definition: PacketMath.h:442
EIGEN_STRONG_INLINE Packet16bf pset1< Packet16bf >(const bfloat16 &from)
Definition: PacketMath.h:1883
EIGEN_STRONG_INLINE Packet8i pset1< Packet8i >(const int &from)
Definition: PacketMath.h:242
EIGEN_STRONG_INLINE Packet16i por< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition: PacketMath.h:583
EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half *from)
Definition: PacketMath.h:1420
EIGEN_STRONG_INLINE Packet16f ploaddup< Packet16f >(const float *from)
Definition: PacketMath.h:715
EIGEN_STRONG_INLINE Packet16f pxor< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:612
EIGEN_STRONG_INLINE Packet16f por< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:588
EIGEN_STRONG_INLINE float predux_max< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:1076
EIGEN_STRONG_INLINE Packet16h pround< Packet16h >(const Packet16h &a)
Definition: PacketMath.h:1531
EIGEN_STRONG_INLINE Eigen::half pfirst< Packet16h >(const Packet16h &from)
Definition: PacketMath.h:1382
EIGEN_STRONG_INLINE Packet4d predux_half_dowto4< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:1021
EIGEN_STRONG_INLINE Packet8d print< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:527
EIGEN_STRONG_INLINE Packet16i pset1< Packet16i >(const int &from)
Definition: PacketMath.h:205
EIGEN_STRONG_INLINE Packet16h pfloor< Packet16h >(const Packet16h &a)
Definition: PacketMath.h:1543
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Definition: Complex.h:224
EIGEN_STRONG_INLINE Packet16f print< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:526
EIGEN_STRONG_INLINE Packet16h ploadu< Packet16h >(const Eigen::half *from)
Definition: PacketMath.h:1390
__m512d Packet8d
Definition: PacketMath.h:33
EIGEN_STRONG_INLINE bool predux_any(const Packet4f &x)
Definition: PacketMath.h:1765
EIGEN_STRONG_INLINE Packet8i por< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: PacketMath.h:501
EIGEN_STRONG_INLINE Packet16f ploadu< Packet16f >(const float *from)
Definition: PacketMath.h:693
EIGEN_STRONG_INLINE Packet16bf pceil< Packet16bf >(const Packet16bf &a)
Definition: PacketMath.h:2023
EIGEN_DEVICE_FUNC void pscatter< float, Packet16f >(float *to, const Packet16f &from, Index stride)
Definition: PacketMath.h:823
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:244
EIGEN_STRONG_INLINE void pstore< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Definition: PacketMath.h:511
EIGEN_STRONG_INLINE Packet16f psub< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:275
EIGEN_STRONG_INLINE int pfirst< Packet16i >(const Packet16i &a)
Definition: PacketMath.h:871
EIGEN_STRONG_INLINE Packet16h pceil< Packet16h >(const Packet16h &a)
Definition: PacketMath.h:1539
EIGEN_STRONG_INLINE Packet16h pload< Packet16h >(const Eigen::half *from)
Definition: PacketMath.h:1386
EIGEN_STRONG_INLINE Packet16i pandnot< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition: PacketMath.h:630
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i &a)
Definition: PacketMath.h:1191
EIGEN_STRONG_INLINE Eigen::half predux_min< Packet16h >(const Packet16h &a)
Definition: PacketMath.h:1618
EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x)
Definition: PacketMath.h:437
EIGEN_STRONG_INLINE Packet16i pmul< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition: PacketMath.h:323
EIGEN_STRONG_INLINE Packet16bf pgather< bfloat16, Packet16bf >(const bfloat16 *from, Index stride)
Definition: PacketMath.h:2153
EIGEN_STRONG_INLINE Packet16f plset< Packet16f >(const float &a)
Definition: PacketMath.h:246
EIGEN_STRONG_INLINE Packet16i pxor< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition: PacketMath.h:607
EIGEN_STRONG_INLINE Packet16f pmax< PropagateNumbers, Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:406
EIGEN_DEVICE_FUNC Packet pmax(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:524
EIGEN_STRONG_INLINE Packet16i psub< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition: PacketMath.h:285
EIGEN_STRONG_INLINE Packet4i pblend(const Selector< 4 > &ifPacket, const Packet4i &thenPacket, const Packet4i &elsePacket)
Definition: PacketMath.h:2107
EIGEN_STRONG_INLINE Packet8bf padd< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Definition: PacketMath.h:1324
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
Definition: PacketMath.h:867
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i &a)
Definition: PacketMath.h:1189
EIGEN_STRONG_INLINE Packet16f pset1frombits< Packet16f >(unsigned int from)
Definition: PacketMath.h:210
EIGEN_STRONG_INLINE Packet8d ptrue< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:546
EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet &a, const Packet &b, Op op)
Definition: PacketMath.h:555
EIGEN_STRONG_INLINE half predux_mul< Packet16h >(const Packet16h &from)
Definition: PacketMath.h:1624
EIGEN_STRONG_INLINE Packet16bf plset< Packet16bf >(const bfloat16 &a)
Definition: PacketMath.h:2109
eigen_packet_wrapper< __m256i, 2 > Packet16bf
Definition: PacketMath.h:35
EIGEN_STRONG_INLINE void pstore1< Packet16f >(float *to, const float &a)
Definition: PacketMath.h:843
EIGEN_STRONG_INLINE void pstore< int >(int *to, const Packet4i &from)
Definition: PacketMath.h:496
EIGEN_STRONG_INLINE Packet8h por(const Packet8h &a, const Packet8h &b)
Definition: PacketMath.h:1042
EIGEN_STRONG_INLINE Packet8d pfloor< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:533
EIGEN_STRONG_INLINE float predux_mul< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:1028
EIGEN_STRONG_INLINE Packet16i pand< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition: PacketMath.h:551
EIGEN_STRONG_INLINE double predux< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:996
EIGEN_STRONG_INLINE Packet8d pload1< Packet8d >(const double *from)
Definition: PacketMath.h:241
EIGEN_STRONG_INLINE Packet16i padd< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition: PacketMath.h:269
EIGEN_STRONG_INLINE float predux< Packet8f >(const Packet8f &a)
Definition: PacketMath.h:798
EIGEN_STRONG_INLINE Packet16i ploadu< Packet16i >(const int *from)
Definition: PacketMath.h:701
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf &a)
Definition: Complex.h:184
EIGEN_STRONG_INLINE Packet16f pround< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:651
EIGEN_STRONG_INLINE Packet8d por< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:597
EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h &a)
Definition: PacketMath.h:988
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
Definition: PacketMath.h:623
EIGEN_STRONG_INLINE float predux< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:978
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: PacketMath.h:827
EIGEN_STRONG_INLINE Packet16f pset1< Packet16f >(const float &from)
Definition: PacketMath.h:197
EIGEN_STRONG_INLINE Packet8d ploaddup< Packet8d >(const double *from)
Definition: PacketMath.h:739
EIGEN_STRONG_INLINE Packet16f pmax< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:384
EIGEN_DEVICE_FUNC Packet pmul(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:237
EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h &a)
Definition: PacketMath.h:978
EIGEN_DEVICE_FUNC Packet pmin(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:512
EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h &a, const Packet8h &b)
Definition: PacketMath.h:1053
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf &a)
Definition: Complex.h:166
EIGEN_STRONG_INLINE Packet8h predux_half_dowto4< Packet16h >(const Packet16h &a)
Definition: PacketMath.h:1606
EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f &a, const Packet4f &b)
Definition: PacketMath.h:868
EIGEN_STRONG_INLINE void prefetch< float >(const float *addr)
Definition: PacketMath.h:1117
EIGEN_STRONG_INLINE void pstoreu< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Definition: PacketMath.h:1104
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i &a)
Definition: PacketMath.h:1187
EIGEN_STRONG_INLINE Packet8d pmax< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:390
EIGEN_STRONG_INLINE Packet16f pmul< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:313
EIGEN_STRONG_INLINE Packet4d pfrexp_generic_get_biased_exponent(const Packet4d &a)
Definition: PacketMath.h:743
EIGEN_STRONG_INLINE void pscatter< half, Packet16h >(half *to, const Packet16h &from, Index stride)
Definition: PacketMath.h:1646
EIGEN_STRONG_INLINE void pstoreu< half >(Eigen::half *to, const Packet16h &from)
Definition: PacketMath.h:1400
EIGEN_STRONG_INLINE Packet16bf ploadu< Packet16bf >(const bfloat16 *from)
Definition: PacketMath.h:1900
EIGEN_STRONG_INLINE Packet8i pxor< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: PacketMath.h:511
EIGEN_STRONG_INLINE double predux_max< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:1087
EIGEN_STRONG_INLINE Packet16f ptrue< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:541
EIGEN_STRONG_INLINE Packet8d pmax< PropagateNumbers, Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:410
EIGEN_STRONG_INLINE double predux_min< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:1067
EIGEN_STRONG_INLINE bfloat16 predux_min< Packet16bf >(const Packet16bf &from)
Definition: PacketMath.h:2131
EIGEN_STRONG_INLINE Packet16f pmin< PropagateNumbers, Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:398
EIGEN_STRONG_INLINE Packet16f pload1< Packet16f >(const float *from)
Definition: PacketMath.h:237
EIGEN_STRONG_INLINE Packet16bf psub< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition: PacketMath.h:2079
EIGEN_STRONG_INLINE Packet16f pandnot< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:635
EIGEN_STRONG_INLINE Packet8d pdiv< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:334
EIGEN_STRONG_INLINE Packet16f pdiv< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:329
EIGEN_STRONG_INLINE Packet16f pload< Packet16f >(const float *from)
Definition: PacketMath.h:679
EIGEN_STRONG_INLINE Packet16f pldexp< Packet16f >(const Packet16f &a, const Packet16f &exponent)
Definition: PacketMath.h:919
EIGEN_STRONG_INLINE Packet8d pldexp< Packet8d >(const Packet8d &a, const Packet8d &exponent)
Definition: PacketMath.h:923
EIGEN_STRONG_INLINE Packet16bf pfloor< Packet16bf >(const Packet16bf &a)
Definition: PacketMath.h:2027
const char * SsePrefetchPtrType
Definition: PacketMath.h:864
EIGEN_STRONG_INLINE void pstore< float >(float *to, const Packet4f &from)
Definition: PacketMath.h:491
__m512i Packet16i
Definition: PacketMath.h:32
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f &a)
Definition: PacketMath.h:1176
EIGEN_STRONG_INLINE void pstore1< Packet8d >(double *to, const double &a)
Definition: PacketMath.h:848
EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4< Packet16bf >(const Packet16bf &a)
Definition: PacketMath.h:2114
EIGEN_STRONG_INLINE Packet16h padd< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition: PacketMath.h:1572
EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f &)
Definition: PacketMath.h:252
EIGEN_STRONG_INLINE Packet8i pandnot< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: PacketMath.h:521
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf &a)
Definition: PacketMath.h:1429
EIGEN_STRONG_INLINE Packet16f pand< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:557
__m256i Packet8i
Definition: PacketMath.h:32
EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf)
Definition: PacketMath.h:454
EIGEN_DEVICE_FUNC void pstore(Scalar *to, const Packet &from)
Definition: GenericPacketMath.h:696
EIGEN_STRONG_INLINE Packet16h plset< Packet16h >(const half &a)
Definition: PacketMath.h:1508
EIGEN_STRONG_INLINE Packet16h pset1< Packet16h >(const Eigen::half &from)
Definition: PacketMath.h:1378
EIGEN_STRONG_INLINE double predux_mul< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:1048
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux(const Packet &a)
Definition: GenericPacketMath.h:875
EIGEN_DEVICE_FUNC void pscatter< double, Packet8d >(double *to, const Packet8d &from, Index stride)
Definition: PacketMath.h:833
EIGEN_STRONG_INLINE Packet16bf print< Packet16bf >(const Packet16bf &a)
Definition: PacketMath.h:2019
EIGEN_DEVICE_FUNC Packet8d pgather< double, Packet8d >(const double *from, Index stride)
Definition: PacketMath.h:813
EIGEN_STRONG_INLINE Packet8d pfrexp< Packet8d >(const Packet8d &a, Packet8d &exponent)
Definition: PacketMath.h:915
EIGEN_STRONG_INLINE Eigen::half predux_max< Packet16h >(const Packet16h &a)
Definition: PacketMath.h:1612
EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
Definition: Complex.h:231
EIGEN_STRONG_INLINE bfloat16 pfirst< Packet16bf >(const Packet16bf &from)
Definition: PacketMath.h:1888
EIGEN_STRONG_INLINE bfloat16 predux< Packet16bf >(const Packet16bf &p)
Definition: PacketMath.h:2121
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet &a, const Packet &exponent)
Definition: GenericPacketMathFunctions.h:85
EIGEN_STRONG_INLINE void pstoreu< int >(int *to, const Packet4i &from)
Definition: PacketMath.h:1092
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h &a, const Packet8h &b)
Definition: PacketMath.h:1050
EIGEN_STRONG_INLINE Packet16bf pround< Packet16bf >(const Packet16bf &a)
Definition: PacketMath.h:2014
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h &a, const Packet8h &b)
Definition: PacketMath.h:1047
EIGEN_DEVICE_FUNC Packet16f pgather< float, Packet16f >(const float *from, Index stride)
Definition: PacketMath.h:803
EIGEN_STRONG_INLINE half predux< Packet16h >(const Packet16h &from)
Definition: PacketMath.h:1600
EIGEN_STRONG_INLINE Packet4f ploadu< Packet4f >(const float *from)
Definition: PacketMath.h:968
eigen_packet_wrapper< __m256i, 1 > Packet16h
Definition: PacketMath.h:34
EIGEN_STRONG_INLINE Packet16h print< Packet16h >(const Packet16h &a)
Definition: PacketMath.h:1535
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
Definition: PacketMath.h:917
EIGEN_STRONG_INLINE void pscatter< bfloat16, Packet16bf >(bfloat16 *to, const Packet16bf &from, Index stride)
Definition: PacketMath.h:2163
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet &a, Packet &exponent)
Definition: GenericPacketMathFunctions.h:40
EIGEN_DEVICE_FUNC Packet psub(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:222
EIGEN_STRONG_INLINE Packet16i pload< Packet16i >(const int *from)
Definition: PacketMath.h:687
EIGEN_STRONG_INLINE Packet16bf pload< Packet16bf >(const bfloat16 *from)
Definition: PacketMath.h:1895
EIGEN_STRONG_INLINE Packet16f pmin< PropagateNaN, Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:414
EIGEN_STRONG_INLINE double pfirst< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:867
EIGEN_STRONG_INLINE Packet8d pand< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:566
EIGEN_STRONG_INLINE void pstore< half >(Eigen::half *to, const Packet16h &from)
Definition: PacketMath.h:1394
EIGEN_STRONG_INLINE Packet16f pfloor< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:532
EIGEN_STRONG_INLINE Packet8i ptrue< Packet8i >(const Packet8i &a)
Definition: PacketMath.h:459
EIGEN_STRONG_INLINE Packet8d pset1< Packet8d >(const double &from)
Definition: PacketMath.h:201
EIGEN_STRONG_INLINE float predux_min< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:1057
EIGEN_STRONG_INLINE Packet16h pmul< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition: PacketMath.h:1586
EIGEN_STRONG_INLINE Packet16bf pdiv< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition: PacketMath.h:2091
EIGEN_STRONG_INLINE Packet16f ploadquad< Packet16f >(const float *from)
Definition: PacketMath.h:752
EIGEN_STRONG_INLINE Packet8d pround< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:658
EIGEN_STRONG_INLINE Packet16f pmin< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:371
EIGEN_STRONG_INLINE Packet8h padd< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: PacketMath.h:1100
EIGEN_STRONG_INLINE Packet16h pdiv< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition: PacketMath.h:1593
__m256 Packet8f
Definition: PacketMath.h:31
EIGEN_STRONG_INLINE Packet16h pmax< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition: PacketMath.h:1502
EIGEN_STRONG_INLINE Packet8d pxor< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:621
EIGEN_STRONG_INLINE Packet8d pceil< Packet8d >(const Packet8d &a)
Definition: PacketMath.h:530
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f)
Definition: PacketMath.h:1252
EIGEN_STRONG_INLINE Packet8d pmin< PropagateNumbers, Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:402
EIGEN_STRONG_INLINE void pstoreu< float >(float *to, const Packet4f &from)
Definition: PacketMath.h:1088
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f &a, const Packet4f &b)
Definition: PacketMath.h:870
EIGEN_STRONG_INLINE Packet8i pand< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: PacketMath.h:491
EIGEN_STRONG_INLINE Packet16f pmax< PropagateNaN, Packet16f >(const Packet16f &a, const Packet16f &b)
Definition: PacketMath.h:422
EIGEN_STRONG_INLINE Packet16bf padd< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition: PacketMath.h:2073
EIGEN_STRONG_INLINE void prefetch< int >(const int *addr)
Definition: PacketMath.h:1118
__m256d Packet4d
Definition: PacketMath.h:33
EIGEN_STRONG_INLINE Packet16h ploaddup< Packet16h >(const Eigen::half *from)
Definition: PacketMath.h:1407
EIGEN_STRONG_INLINE Packet16bf pmin< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition: PacketMath.h:2097
EIGEN_STRONG_INLINE bfloat16 predux_mul< Packet16bf >(const Packet16bf &from)
Definition: PacketMath.h:2126
EIGEN_STRONG_INLINE Packet8d pmin< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:377
__m512 Packet16f
Definition: PacketMath.h:31
EIGEN_STRONG_INLINE Packet8d pmax< PropagateNaN, Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:426
EIGEN_STRONG_INLINE Packet16h psub< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition: PacketMath.h:1579
EIGEN_STRONG_INLINE void pstore1< Packet16i >(int *to, const int &a)
Definition: PacketMath.h:853
EIGEN_STRONG_INLINE float pfirst< Packet16f >(const Packet16f &a)
Definition: PacketMath.h:863
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux_mul(const Packet &a)
Definition: GenericPacketMath.h:882
EIGEN_STRONG_INLINE void prefetch< double >(const double *addr)
Definition: PacketMath.h:692
EIGEN_STRONG_INLINE Packet16h pmin< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition: PacketMath.h:1496
EIGEN_STRONG_INLINE Packet8d padd< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition: PacketMath.h:264
::uint64_t uint64_t
Definition: Meta.h:58
::uint16_t uint16_t
Definition: Meta.h:54
::uint32_t uint32_t
Definition: Meta.h:56
Namespace containing all symbols from the Eigen library.
Definition: LDLT.h:16
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
Definition: document.h:416
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1282
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition: pointer.h:1181
unsigned short uint16_t
Definition: stdint.h:125
unsigned __int64 uint64_t
Definition: stdint.h:136
unsigned short value
Definition: BFloat16.h:36
Definition: BFloat16.h:58
numext::uint16_t x
Definition: Half.h:104
Definition: GenericPacketMath.h:1014
Packet packet[N]
Definition: GenericPacketMath.h:1015
Definition: GenericPacketMath.h:1027
bool select[N]
Definition: GenericPacketMath.h:1028
Definition: GenericPacketMath.h:43
@ HasRsqrt
Definition: GenericPacketMath.h:67
@ HasSin
Definition: GenericPacketMath.h:75
@ HasBlend
Definition: GenericPacketMath.h:60
@ HasNdtri
Definition: GenericPacketMath.h:90
@ HasCos
Definition: GenericPacketMath.h:76
@ HasCmp
Definition: GenericPacketMath.h:63
@ HasLog1p
Definition: GenericPacketMath.h:71
@ HasCeil
Definition: GenericPacketMath.h:101
@ HasExp
Definition: GenericPacketMath.h:68
@ HasRound
Definition: GenericPacketMath.h:98
@ HasRint
Definition: GenericPacketMath.h:99
@ HasSqrt
Definition: GenericPacketMath.h:66
@ HasErf
Definition: GenericPacketMath.h:88
@ HasBessel
Definition: GenericPacketMath.h:91
@ HasExpm1
Definition: GenericPacketMath.h:69
@ HasLog
Definition: GenericPacketMath.h:70
@ HasTanh
Definition: GenericPacketMath.h:83
@ HasFloor
Definition: GenericPacketMath.h:100
@ HasDiv
Definition: GenericPacketMath.h:65
Definition: GenericPacketMath.h:160
@ value
Definition: Meta.h:133
Packet8bf half
Definition: PacketMath.h:1845
Packet16bf type
Definition: PacketMath.h:1844
Packet4d half
Definition: PacketMath.h:135
Packet8d type
Definition: PacketMath.h:134
Packet8f half
Definition: PacketMath.h:98
Packet16f type
Definition: PacketMath.h:97
Packet16h half
Definition: PacketMath.h:56
Packet16h type
Definition: PacketMath.h:54
Definition: GenericPacketMath.h:107
@ HasSub
Definition: GenericPacketMath.h:118
@ HasMax
Definition: GenericPacketMath.h:124
@ HasNegate
Definition: GenericPacketMath.h:120
@ HasMul
Definition: GenericPacketMath.h:119
@ HasAdd
Definition: GenericPacketMath.h:117
@ HasSetLinear
Definition: GenericPacketMath.h:126
@ HasMin
Definition: GenericPacketMath.h:123
@ HasConj
Definition: GenericPacketMath.h:125
@ HasAbs2
Definition: GenericPacketMath.h:122
@ HasAbs
Definition: GenericPacketMath.h:121
@ HasHalfPacket
Definition: GenericPacketMath.h:114
@ size
Definition: GenericPacketMath.h:112
@ AlignedOnScalar
Definition: GenericPacketMath.h:113
@ Vectorizable
Definition: GenericPacketMath.h:111
Packet8bf half
Definition: PacketMath.h:1879
bfloat16 type
Definition: PacketMath.h:1877
uint16_t mask_t
Definition: PacketMath.h:173
float type
Definition: PacketMath.h:170
Packet8f half
Definition: PacketMath.h:171
Packet16i integer_packet
Definition: PacketMath.h:172
Packet8h half
Definition: PacketMath.h:192
Eigen::half type
Definition: PacketMath.h:191
Packet8i half
Definition: PacketMath.h:185
int type
Definition: PacketMath.h:184
Packet4d half
Definition: PacketMath.h:179
double type
Definition: PacketMath.h:178
Definition: GenericPacketMath.h:133
@ masked_load_available
Definition: GenericPacketMath.h:141
@ size
Definition: GenericPacketMath.h:138
@ masked_store_available
Definition: GenericPacketMath.h:142
@ vectorizable
Definition: GenericPacketMath.h:140
@ alignment
Definition: GenericPacketMath.h:139