10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11 #define EIGEN_GENERAL_BLOCK_PANEL_H
24 template<
typename _LhsScalar,
typename _RhsScalar,
bool _ConjLhs=false,
bool _ConjRhs=false,
int Arch=Architecture::Target,
int _PacketSize=GEBPPacketFull>
// Helper macros used to pick the default size of each cache level. For every level, the first
// definition (guarded by the user-overridable EIGEN_DEFAULT_Lx_CACHE_SIZE macro) ignores its
// argument and expands to that macro; the second definition expands to the argument `val`.
// NOTE(review): the second definition is presumably the #else branch - the #else/#endif lines
// are not visible here.
34 #if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
35 #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
37 #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
40 #if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
41 #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
43 #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
46 #if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
47 #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
49 #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
52 #if EIGEN_ARCH_i386_OR_x86_64
// The three helper macros are undefined again here.
66 #undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
67 #undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
68 #undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
// Store the values pointed to by l1, l2 and l3 into the m_cacheSizes record.
94 m_cacheSizes.
m_l1 = *l1;
95 m_cacheSizes.
m_l2 = *l2;
96 m_cacheSizes.
m_l3 = *l3;
// Copy the values currently held in m_cacheSizes out through the l1, l2 and l3 pointers.
101 *l1 = m_cacheSizes.
m_l1;
102 *l2 = m_cacheSizes.
m_l2;
103 *l3 = m_cacheSizes.
m_l3;
123 template<
typename LhsScalar,
typename RhsScalar,
int KcFactor,
typename Index>
133 std::ptrdiff_t l1, l2, l3;
135 #ifdef EIGEN_VECTORIZE_AVX512
// Blocking path taken when more than one thread is used.
146 if (num_threads > 1) {
147 typedef typename Traits::ResScalar ResScalar;
// kdiv: KcFactor times the byte size of mr LHS scalars plus nr RHS scalars.
149 kdiv = KcFactor * (Traits::mr *
sizeof(LhsScalar) + Traits::nr *
sizeof(RhsScalar)),
// ksub: byte size of an mr x nr tile of result scalars.
150 ksub = Traits::mr * Traits::nr *
sizeof(ResScalar),
// k_cache is (l1-ksub)/kdiv capped at 320, but never smaller than kr.
160 const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
// k becomes k_cache rounded down to a multiple of kr.
162 k = k_cache - (k_cache % kr);
// n_cache: (l2-l1) divided by the byte size of nr RHS scalars times k.
166 const Index n_cache = (l2-l1) / (nr *
sizeof(RhsScalar) * k);
168 if (n_cache <= n_per_thread) {
// n becomes n_cache rounded down to a multiple of nr.
171 n = n_cache - (n_cache % nr);
// Cap n at n_per_thread rounded up to a multiple of nr.
// NOTE(review): presumably the else branch of the test above - the enclosing lines are not shown.
174 n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
// m_cache: (l3-l2) divided by the byte size of k LHS scalars times the number of threads.
179 const Index m_cache = (l3-l2) / (
sizeof(LhsScalar) * k * num_threads);
// m_cache is only used when it is below the per-thread row count and at least mr.
181 if(m_cache < m_per_thread && m_cache >=
static_cast<Index>(mr)) {
// m becomes m_cache rounded down to a multiple of mr.
182 m = m_cache - (m_cache % mr);
// Cap m at m_per_thread rounded up to a multiple of mr.
// NOTE(review): presumably the else branch of the test above - the enclosing lines are not shown.
185 m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
192 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
205 typedef typename Traits::ResScalar ResScalar;
// k_div: KcFactor times the byte size of mr LHS scalars plus nr RHS scalars.
208 k_div = KcFactor * (Traits::mr *
sizeof(LhsScalar) + Traits::nr *
sizeof(RhsScalar)),
// k_sub: byte size of an mr x nr tile of result scalars.
209 k_sub = Traits::mr * Traits::nr *
sizeof(ResScalar)
// max_kc: (l1-k_sub)/k_div with its low bits masked off by ~(k_peeling-1), and at least 1.
// The mask is a round-down to a multiple of k_peeling only if k_peeling is a power of two
// (NOTE(review): confirm against the definition of k_peeling, which is not shown here).
219 const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
220 const Index old_k = k;
// New k: max_kc itself when max_kc divides k evenly, otherwise max_kc reduced by a multiple of
// k_peeling.
226 k = (k%max_kc)==0 ? max_kc
227 : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
// Sanity check: old_k/k must equal old_k/max_kc.
229 eigen_internal_assert(((old_k/k) == (old_k/max_kc)) &&
"the number of sweeps has to remain the same");
// actual_l2 is taken from l3 when EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS is defined; the second
// definition hard-codes 1572864 (NOTE(review): presumably the #else branch, not shown here).
238 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
239 const Index actual_l2 = l3;
241 const Index actual_l2 = 1572864;
// Byte size of an m x k block of LHS scalars, and what is left of l1 once that block and k_sub
// have been subtracted.
251 const Index lhs_bytes = m * k *
sizeof(LhsScalar);
252 const Index remaining_l1 = l1- k_sub - lhs_bytes;
// If the remainder can hold at least nr RHS scalars for each of the k steps...
253 if(remaining_l1 >=
Index(Traits::nr*
sizeof(RhsScalar))*k)
// ...max_nc is the number of k-long runs of RHS scalars that fit in that remainder.
256 max_nc = remaining_l1 / (k*
sizeof(RhsScalar));
// Other bound for max_nc, computed from actual_l2 and max_kc.
// NOTE(review): presumably the else branch of the test above - not visible here.
261 max_nc = (3*actual_l2)/(2*2*max_kc*
sizeof(RhsScalar));
// nc: the smaller of actual_l2/(2*k*sizeof(RhsScalar)) and max_nc, masked with ~(nr-1).
264 Index nc = numext::mini<Index>(actual_l2/(2*k*
sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
272 : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
// problem_size: k*n times the byte size of one LHS scalar.
279 Index problem_size = k*n*
sizeof(LhsScalar);
280 Index actual_lm = actual_l2;
// Branches selected on problem_size thresholds of 1024 and 32768 (the latter only when l3 is
// non-zero); their bodies are not visible here.
282 if(problem_size<=1024)
288 else if(l3!=0 && problem_size<=32768)
// max_mc is capped at 576.
293 max_mc = (numext::mini<Index>)(576,max_mc);
// mc: the smaller of actual_lm/(3*k*sizeof(LhsScalar)) and max_mc.
295 Index mc = (numext::mini<Index>)(actual_lm/(3*k*
sizeof(LhsScalar)), max_mc);
// Round mc down to a multiple of mr when it exceeds mr; return early when mc is zero.
296 if (mc > Traits::mr) mc -= mc % Traits::mr;
297 else if (mc==0)
return;
299 : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
304 template <
typename Index>
// Test hook: when EIGEN_TEST_SPECIFIC_BLOCKING_SIZES is defined and evaluates to true, k, m and n
// are each clamped to at most the matching EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_{K,M,N} value.
307 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
308 if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
309 k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
310 m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
311 n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
338 template<
typename LhsScalar,
typename RhsScalar,
int KcFactor,
typename Index>
342 evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
346 template<
typename LhsScalar,
typename RhsScalar,
typename Index>
349 computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
352 template <
typename RhsPacket,
typename RhsPacketx4,
int registers_taken>
360 template <
typename Packet>
370 template <
int N,
typename T1,
typename T2,
typename T3>
373 template <
typename T1,
typename T2,
typename T3>
376 template <
typename T1,
typename T2,
typename T3>
// Typedef helpers built on packet_conditional<packet_size, A, B, C>. The three candidate types
// are packet_traits<...>::type, its ::half, and the ::half (through unpacket_traits) of that half
// packet. The PACKET_DECL_COND* forms look the packet up from `name ## Scalar`, the
// PACKET_DECL_COND_SCALAR* forms from `Scalar`; the *_PREFIX forms name the resulting typedef
// `prefix ## name ## Packet` and `prefix ## ScalarPacket` respectively.
379 #define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
380 typedef typename packet_conditional<packet_size, \
381 typename packet_traits<name ## Scalar>::type, \
382 typename packet_traits<name ## Scalar>::half, \
383 typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
384 prefix ## name ## Packet
386 #define PACKET_DECL_COND(name, packet_size) \
387 typedef typename packet_conditional<packet_size, \
388 typename packet_traits<name ## Scalar>::type, \
389 typename packet_traits<name ## Scalar>::half, \
390 typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
393 #define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
394 typedef typename packet_conditional<packet_size, \
395 typename packet_traits<Scalar>::type, \
396 typename packet_traits<Scalar>::half, \
397 typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
398 prefix ## ScalarPacket
400 #define PACKET_DECL_COND_SCALAR(packet_size) \
401 typedef typename packet_conditional<packet_size, \
402 typename packet_traits<Scalar>::type, \
403 typename packet_traits<Scalar>::half, \
404 typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
417 template<
typename _LhsScalar,
typename _RhsScalar,
bool _ConjLhs,
bool _ConjRhs,
int Arch,
int _PacketSize>
473 template<
typename RhsPacketType>
476 dest = pset1<RhsPacketType>(*b);
484 template<
typename RhsPacketType>
496 dest = ploadquad<RhsPacket>(b);
499 template<
typename LhsPacketType>
502 dest = pload<LhsPacketType>(
a);
505 template<
typename LhsPacketType>
508 dest = ploadu<LhsPacketType>(
a);
511 template<
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType,
typename LaneIdType>
512 EIGEN_STRONG_INLINE void madd(
const LhsPacketType&
a,
const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
const LaneIdType&)
const
519 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
523 tmp = b; tmp = cj.
pmul(
a,tmp); c =
padd(c,tmp);
527 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
535 r =
pmadd(c,alpha,r);
538 template<
typename ResPacketHalf>
541 r =
pmadd(c,alpha,r);
546 template<
typename RealScalar,
bool _ConjLhs,
int Arch,
int _PacketSize>
547 class gebp_traits<
std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
568 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
593 template<
typename RhsPacketType>
596 dest = pset1<RhsPacketType>(*b);
604 template<
typename RhsPacketType>
622 RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
623 dest = ploadquad<RhsPacket>(tmp);
629 dest = pset1<RhsPacket>(*b);
634 dest = pload<LhsPacket>(
a);
637 template<
typename LhsPacketType>
640 dest = ploadu<LhsPacketType>(
a);
643 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType,
typename LaneIdType>
644 EIGEN_STRONG_INLINE void madd(
const LhsPacketType&
a,
const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
const LaneIdType&)
const
649 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType>
652 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
656 tmp = b; tmp =
pmul(
a.v,tmp); c.v =
padd(c.v,tmp);
665 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
671 template <
typename ResPacketType,
typename AccPacketType>
675 r = cj.
pmadd(c,alpha,r);
681 template<
typename Packet>
688 template<
typename Packet>
701 template<
typename Packet>
702 const DoublePacket<Packet>&
709 template<
typename Packet>
710 DoublePacket<typename unpacket_traits<Packet>::half>
724 template<
typename Scalar,
typename RealPacket>
732 template<
typename Scalar,
typename RealPacket>
740 dest.
first = ploadquad<RealPacket>(r);
741 dest.
second = ploadquad<RealPacket>(i);
757 template<
typename RealScalar,
bool _ConjLhs,
bool _ConjRhs,
int Arch,
int _PacketSize>
758 class gebp_traits<
std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
805 p.
first = pset1<RealPacket>(RealScalar(0));
806 p.
second = pset1<RealPacket>(RealScalar(0));
812 dest = pset1<ScalarPacket>(*b);
816 template<
typename RealPacketType>
838 template<
typename RealPacketType>
861 template<
typename LhsPacketType>
867 template<
typename LhsPacketType,
typename RhsPacketType,
typename ResPacketType,
typename TmpType,
typename LaneIdType>
876 template<
typename LaneIdType>
882 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
890 template<
typename RealPacketType,
typename ResPacketType>
916 r =
pmadd(tmp,alpha,r);
923 template<
typename RealScalar,
bool _ConjRhs,
int Arch,
int _PacketSize>
924 class gebp_traits<RealScalar,
std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
938 #undef PACKET_DECL_COND_SCALAR_PREFIX
939 #undef PACKET_DECL_COND_PREFIX
940 #undef PACKET_DECL_COND_SCALAR
941 #undef PACKET_DECL_COND
973 template<
typename RhsPacketType>
976 dest = pset1<RhsPacketType>(*b);
984 template<
typename RhsPacketType>
995 dest = ploaddup<LhsPacket>(
a);
1000 dest = ploadquad<RhsPacket>(b);
1003 template<
typename LhsPacketType>
1006 dest = ploaddup<LhsPacketType>(
a);
1009 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType,
typename LaneIdType>
1010 EIGEN_STRONG_INLINE void madd(
const LhsPacketType&
a,
const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
const LaneIdType&)
const
1015 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType>
1018 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1022 tmp = b; tmp.v =
pmul(
a,tmp.v); c =
padd(c,tmp);
1032 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
1035 madd(
a, b.
get(lane), c, tmp, lane);
1038 template <
typename ResPacketType,
typename AccPacketType>
1042 r = cj.
pmadd(alpha,c,r);
1056 template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
1104 void operator()(
const DataMapper& res,
const LhsScalar* blockA,
const RhsScalar* blockB,
1109 template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs,
1140 template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
1160 SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
1161 SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
1163 if (depth - endk > 0)
1169 for (
Index kk = endk; kk < depth; kk++)
1171 SLhsPacketQuarter a0;
1172 SRhsPacketQuarter b0;
1175 straits.
madd(a0,b0,c0,b0, fix<0>);
1179 straits.
acc(c0, alphav, R);
1185 res.scatterPacket(i, j2, R);
1189 template<
int nr, Index LhsProgress, Index RhsProgress,
typename LhsScalar,
typename RhsScalar,
typename ResScalar,
typename AccPacket,
typename LhsPacket,
typename RhsPacket,
typename ResPacket,
typename GEBPTraits,
typename LinearMapper,
typename DataMapper>
// Step number K of the peeled depth loop for a single LHS packet and four result columns.
// Loads one LHS packet from blA at element offset K*LhsProgress into *A0 and the RHS panel from
// blB at element offset 4*K*RhsProgress into *rhs_panel, then issues four madd calls with *A0
// and *rhs_panel - one per accumulator *C0..*C3, with lane ids fix<0>..fix<3> - passing *T0 as
// the temporary.
1194 EIGEN_STRONG_INLINE void peeled_kc_onestep(
Index K,
const LhsScalar* blA,
const RhsScalar* blB, GEBPTraits
traits, LhsPacket *A0,
RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1198 traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
1199 traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
1200 traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1201 traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1202 traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1203 traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
// With GCC >= 6 and SSE enabled: an empty asm statement that lists *A0 as a read-write operand
// (register or memory). NOTE(review): looks like a compiler register-allocation/spilling
// workaround - confirm against the upstream comment, which is not visible here.
1204 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1205 __asm__ (
"" :
"+x,m" (*A0));
1211 const DataMapper& res,
const LhsScalar* blockA,
const RhsScalar* blockB, ResScalar alpha,
1219 for(
Index i=peelStart; i<peelEnd; i+=LhsProgress)
1222 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1227 const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1231 AccPacket C0, C1, C2, C3;
1241 AccPacket D0, D1, D2, D3;
1247 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1248 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1249 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1250 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1252 r0.prefetch(prefetch_res_offset);
1253 r1.prefetch(prefetch_res_offset);
1254 r2.prefetch(prefetch_res_offset);
1255 r3.prefetch(prefetch_res_offset);
1258 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1262 for(
Index k=0; k<peeled_kc; k+=pk)
1269 peeled_kc_onestep(0, blA, blB,
traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1270 peeled_kc_onestep(1, blA, blB,
traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1271 peeled_kc_onestep(2, blA, blB,
traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1272 peeled_kc_onestep(3, blA, blB,
traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1274 peeled_kc_onestep(4, blA, blB,
traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1275 peeled_kc_onestep(5, blA, blB,
traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1276 peeled_kc_onestep(6, blA, blB,
traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1277 peeled_kc_onestep(7, blA, blB,
traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1279 blB += pk*4*RhsProgress;
1280 blA += pk*LhsProgress;
1290 for(
Index k=peeled_kc; k<depth; k++)
1294 peeled_kc_onestep(0, blA, blB,
traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1295 blB += 4*RhsProgress;
// Write-back of the four accumulators: alpha is broadcast into a packet, then for each result
// mapper r0..r3 the packet at offset 0 is loaded, passed to traits.acc together with the matching
// accumulator (C0..C3) and alphav, and stored back at offset 0. R0/R1 are reused for the second
// pair of mappers.
1300 ResPacket alphav = pset1<ResPacket>(alpha);
1302 R0 = r0.template loadPacket<ResPacket>(0);
1303 R1 = r1.template loadPacket<ResPacket>(0);
1304 traits.acc(C0, alphav, R0);
1305 traits.acc(C1, alphav, R1);
1306 r0.storePacket(0, R0);
1307 r1.storePacket(0, R1);
1309 R0 = r2.template loadPacket<ResPacket>(0);
1310 R1 = r3.template loadPacket<ResPacket>(0);
1311 traits.acc(C2, alphav, R0);
1312 traits.acc(C3, alphav, R1);
1313 r2.storePacket(0, R0);
1314 r3.storePacket(0, R1);
1318 for(
Index j2=packet_cols4; j2<cols; j2++)
1321 const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1328 LinearMapper r0 = res.getLinearMapper(i, j2);
1331 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1334 for(
Index k= 0; k<peeled_kc; k+=pk)
1339 #define EIGEN_GEBGP_ONESTEP(K) \
1341 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1342 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1344 traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
1345 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1346 traits.madd(A0, B_0, C0, B_0, fix<0>); \
1347 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1359 blB += pk*RhsProgress;
1360 blA += pk*LhsProgress;
1366 for(
Index k=peeled_kc; k<depth; k++)
1373 #undef EIGEN_GEBGP_ONESTEP
1375 ResPacket alphav = pset1<ResPacket>(alpha);
1376 R0 = r0.template loadPacket<ResPacket>(0);
1377 traits.acc(C0, alphav, R0);
1378 r0.storePacket(0, R0);
1384 template<
int nr, Index LhsProgress, Index RhsProgress,
typename LhsScalar,
typename RhsScalar,
typename ResScalar,
typename AccPacket,
typename LhsPacket,
typename RhsPacket,
typename ResPacket,
typename GEBPTraits,
typename LinearMapper,
typename DataMapper>
1385 struct lhs_process_fraction_of_packet :
lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
1388 EIGEN_STRONG_INLINE void peeled_kc_onestep(
Index K,
const LhsScalar* blA,
const RhsScalar* blB, GEBPTraits
traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1392 traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
1393 traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
1394 traits.madd(*A0, *B_0, *C0, *B_0);
1395 traits.madd(*A0, *B1, *C1, *B1);
1396 traits.madd(*A0, *B2, *C2, *B2);
1397 traits.madd(*A0, *B3, *C3, *B3);
1402 template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
1405 ::operator()(
const DataMapper& res,
const LhsScalar* blockA,
const RhsScalar* blockB,
// A stride of -1 is replaced by depth.
1412 if(strideA==-1) strideA = depth;
1413 if(strideB==-1) strideB = depth;
// Number of columns handled in groups of four: cols rounded down to a multiple of 4, or 0 when
// nr < 4.
1415 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
// Row boundaries for the successive peeling stages. Each bound extends the previous one by as
// many whole steps as fit in the remaining rows, with step sizes 3*LhsProgress, 2*LhsProgress,
// LhsProgress, LhsProgressHalf and LhsProgressQuarter; a bound is 0 when mr is smaller than the
// step of its stage.
1416 const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
1417 const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
1418 const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
1419 const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
1420 const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
// depth with its low bits masked off by ~(pk-1).
1422 const Index peeled_kc = depth & ~(pk-1);
// Prefetch offset expressed in result scalars: 32 divided by the byte size of one ResScalar.
1423 const int prefetch_res_offset = 32/
sizeof(
ResScalar);
1429 if(mr>=3*Traits::LhsProgress)
// Panel height in rows, a multiple of 3*LhsProgress and at least one such step: l1 minus the byte
// size of an mr x nr result tile and of depth*nr RHS scalars, divided by the byte size of
// depth x 3*LhsProgress LHS scalars.
1440 const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 -
sizeof(
ResScalar)*mr*nr - depth*nr*
sizeof(RhsScalar)) / (depth *
sizeof(LhsScalar) * 3*LhsProgress) ));
1441 for(
Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
1443 const Index actual_panel_end = (
std::min)(i1+actual_panel_rows, peeled_mc3);
1444 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1446 for(
Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1452 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
1474 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1478 for(
Index k=0; k<peeled_kc; k+=pk)
1485 #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
1489 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1491 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1493 #define EIGEN_GEBP_ONESTEP(K) \
1495 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1496 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1497 internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1498 if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1499 internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1501 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1502 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1503 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1504 EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1505 traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1506 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1507 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1508 traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1509 traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1510 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1511 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1512 traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1513 traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1514 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1515 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1516 traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1517 traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1518 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1519 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1520 traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1521 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1534 blB += pk*4*RhsProgress;
1535 blA += pk*3*Traits::LhsProgress;
1540 for(
Index k=peeled_kc; k<depth; k++)
1546 blB += 4*RhsProgress;
1547 blA += 3*Traits::LhsProgress;
1550 #undef EIGEN_GEBP_ONESTEP
1553 ResPacket alphav = pset1<ResPacket>(alpha);
1555 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1556 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1557 R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1558 traits.acc(C0, alphav, R0);
1559 traits.acc(C4, alphav, R1);
1560 traits.acc(C8, alphav, R2);
1561 r0.storePacket(0 * Traits::ResPacketSize, R0);
1562 r0.storePacket(1 * Traits::ResPacketSize, R1);
1563 r0.storePacket(2 * Traits::ResPacketSize, R2);
1565 R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1566 R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1567 R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1568 traits.acc(C1, alphav, R0);
1569 traits.acc(C5, alphav, R1);
1570 traits.acc(C9, alphav, R2);
1571 r1.storePacket(0 * Traits::ResPacketSize, R0);
1572 r1.storePacket(1 * Traits::ResPacketSize, R1);
1573 r1.storePacket(2 * Traits::ResPacketSize, R2);
1575 R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1576 R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1577 R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1578 traits.acc(C2, alphav, R0);
1579 traits.acc(C6, alphav, R1);
1580 traits.acc(C10, alphav, R2);
1581 r2.storePacket(0 * Traits::ResPacketSize, R0);
1582 r2.storePacket(1 * Traits::ResPacketSize, R1);
1583 r2.storePacket(2 * Traits::ResPacketSize, R2);
1585 R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1586 R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1587 R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1588 traits.acc(C3, alphav, R0);
1589 traits.acc(C7, alphav, R1);
1590 traits.acc(C11, alphav, R2);
1591 r3.storePacket(0 * Traits::ResPacketSize, R0);
1592 r3.storePacket(1 * Traits::ResPacketSize, R1);
1593 r3.storePacket(2 * Traits::ResPacketSize, R2);
1598 for(
Index j2=packet_cols4; j2<cols; j2++)
1600 for(
Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1603 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1616 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1619 for(
Index k=0; k<peeled_kc; k+=pk)
1623 #define EIGEN_GEBGP_ONESTEP(K) \
1625 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1626 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1627 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1628 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1629 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1630 traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1631 traits.madd(A0, B_0, C0, B_0, fix<0>); \
1632 traits.madd(A1, B_0, C4, B_0, fix<0>); \
1633 traits.madd(A2, B_0, C8, B_0, fix<0>); \
1634 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1646 blB += int(pk) * int(RhsProgress);
1647 blA += int(pk) * 3 * int(Traits::LhsProgress);
1653 for(
Index k=peeled_kc; k<depth; k++)
1658 blA += 3*Traits::LhsProgress;
1660 #undef EIGEN_GEBGP_ONESTEP
1662 ResPacket alphav = pset1<ResPacket>(alpha);
1664 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1665 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1666 R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1667 traits.acc(C0, alphav, R0);
1668 traits.acc(C4, alphav, R1);
1669 traits.acc(C8, alphav, R2);
1670 r0.storePacket(0 * Traits::ResPacketSize, R0);
1671 r0.storePacket(1 * Traits::ResPacketSize, R1);
1672 r0.storePacket(2 * Traits::ResPacketSize, R2);
1679 if(mr>=2*Traits::LhsProgress)
1685 Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 -
sizeof(
ResScalar)*mr*nr - depth*nr*
sizeof(RhsScalar)) / (depth *
sizeof(LhsScalar) * 2*LhsProgress) ));
1687 for(
Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1689 Index actual_panel_end = (
std::min)(i1+actual_panel_rows, peeled_mc2);
1690 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1692 for(
Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1698 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1712 r0.prefetch(prefetch_res_offset);
1713 r1.prefetch(prefetch_res_offset);
1714 r2.prefetch(prefetch_res_offset);
1715 r3.prefetch(prefetch_res_offset);
1718 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1722 for(
Index k=0; k<peeled_kc; k+=pk)
1730 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1731 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1733 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1735 #define EIGEN_GEBGP_ONESTEP(K) \
1737 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1738 traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1739 traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1740 traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
1741 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1742 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1743 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1744 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1745 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1746 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1747 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1748 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1749 EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1750 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1764 blB += pk*4*RhsProgress;
1765 blA += pk*(2*Traits::LhsProgress);
1770 for(
Index k=peeled_kc; k<depth; k++)
1775 blB += 4*RhsProgress;
1776 blA += 2*Traits::LhsProgress;
1778 #undef EIGEN_GEBGP_ONESTEP
1781 ResPacket alphav = pset1<ResPacket>(alpha);
1783 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1784 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1785 R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1786 R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1787 traits.acc(C0, alphav, R0);
1788 traits.acc(C4, alphav, R1);
1789 traits.acc(C1, alphav, R2);
1790 traits.acc(C5, alphav, R3);
1791 r0.storePacket(0 * Traits::ResPacketSize, R0);
1792 r0.storePacket(1 * Traits::ResPacketSize, R1);
1793 r1.storePacket(0 * Traits::ResPacketSize, R2);
1794 r1.storePacket(1 * Traits::ResPacketSize, R3);
1796 R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1797 R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1798 R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1799 R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1800 traits.acc(C2, alphav, R0);
1801 traits.acc(C6, alphav, R1);
1802 traits.acc(C3, alphav, R2);
1803 traits.acc(C7, alphav, R3);
1804 r2.storePacket(0 * Traits::ResPacketSize, R0);
1805 r2.storePacket(1 * Traits::ResPacketSize, R1);
1806 r3.storePacket(0 * Traits::ResPacketSize, R2);
1807 r3.storePacket(1 * Traits::ResPacketSize, R3);
1812 for(
Index j2=packet_cols4; j2<cols; j2++)
1814 for(
Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1817 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1826 r0.prefetch(prefetch_res_offset);
1829 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1832 for(
Index k=0; k<peeled_kc; k+=pk)
1837 #define EIGEN_GEBGP_ONESTEP(K) \
1839 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1840 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1841 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1842 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1843 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1844 traits.madd(A0, B_0, C0, B1, fix<0>); \
1845 traits.madd(A1, B_0, C4, B_0, fix<0>); \
1846 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1858 blB += int(pk) * int(RhsProgress);
1859 blA += int(pk) * 2 * int(Traits::LhsProgress);
1865 for(
Index k=peeled_kc; k<depth; k++)
1870 blA += 2*Traits::LhsProgress;
1872 #undef EIGEN_GEBGP_ONESTEP
1874 ResPacket alphav = pset1<ResPacket>(alpha);
1876 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1877 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1878 traits.acc(C0, alphav, R0);
1879 traits.acc(C4, alphav, R1);
1880 r0.storePacket(0 * Traits::ResPacketSize, R0);
1881 r0.storePacket(1 * Traits::ResPacketSize, R1);
// Full-packet stage: a lhs_process_one_packet functor is invoked with the bounds peeled_mc2 and
// peeled_mc1 (NOTE(review): presumably its peelStart/peelEnd row range - the functor's full
// parameter list is not visible here).
1887 if(mr>=1*Traits::LhsProgress)
1889 lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
1890 p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
// Half-packet stage: only when LhsProgressHalf is strictly smaller than LhsProgress; uses the
// half-packet types and the bounds peeled_mc1 and peeled_mc_half.
1893 if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
1895 lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
1896 p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
// Quarter-packet stage: only when LhsProgressQuarter is strictly smaller than LhsProgressHalf;
// uses the quarter-packet types and the bounds peeled_mc_half and peeled_mc_quarter.
1899 if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
1901 lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
1902 p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1905 if(peeled_mc_quarter<rows)
1908 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1911 for(
Index i=peeled_mc_quarter; i<rows; i+=1)
1913 const LhsScalar* blA = &blockA[i*strideA+offsetA];
1915 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1922 if ((SwappedTraits::LhsProgress % 4) == 0 &&
1923 (SwappedTraits::LhsProgress<=16) &&
1924 (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
1925 (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
1933 const Index spk = (
std::max)(1,SwappedTraits::LhsProgress/4);
1934 const Index endk = (depth/spk)*spk;
1935 const Index endk4 = (depth/(spk*4))*(spk*4);
1938 for(; k<endk4; k+=4*spk)
1948 straits.
madd(A0,B_0,C0,B_0, fix<0>);
1949 straits.
madd(A1,B_1,C1,B_1, fix<0>);
1955 straits.
madd(A0,B_0,C2,B_0, fix<0>);
1956 straits.
madd(A1,B_1,C3,B_1, fix<0>);
1958 blB += 4*SwappedTraits::LhsProgress;
1962 for(; k<endk; k+=spk)
1969 straits.
madd(A0,B_0,C0,B_0, fix<0>);
1971 blB += SwappedTraits::LhsProgress;
1974 if(SwappedTraits::LhsProgress==8)
1982 SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1983 SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1993 straits.
madd(a0,b0,c0,b0, fix<0>);
1994 straits.
acc(c0, alphav, R);
2000 res.scatterPacket(i, j2, R);
2002 else if (SwappedTraits::LhsProgress==16)
2009 p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2013 SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2014 SResPacket alphav = pset1<SResPacket>(alpha);
2015 straits.
acc(C0, alphav, R);
2016 res.scatterPacket(i, j2, R);
2024 for(
Index k=0; k<depth; k++)
2033 C0 = cj.
pmadd(A0,B_0,C0);
2034 C1 = cj.
pmadd(A0,B_1,C1);
2038 C2 = cj.
pmadd(A0,B_0,C2);
2039 C3 = cj.
pmadd(A0,B_1,C3);
2043 res(i, j2 + 0) += alpha * C0;
2044 res(i, j2 + 1) += alpha * C1;
2045 res(i, j2 + 2) += alpha * C2;
2046 res(i, j2 + 3) += alpha * C3;
// Scalar remainder: every column from packet_cols4 up to cols, and within it every row from
// peeled_mc_quarter up to rows, is handled one entry at a time.
2051 for(
Index j2=packet_cols4; j2<cols; j2++)
2054 for(
Index i=peeled_mc_quarter; i<rows; i+=1)
2056 const LhsScalar* blA = &blockA[i*strideA+offsetA];
2060 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
// Accumulate over the whole depth: C0 = cj.pmadd(blA[k], blB[k], C0).
2061 for(
Index k=0; k<depth; k++)
2063 LhsScalar A0 = blA[k];
2064 RhsScalar B_0 = blB[k];
2065 C0 = cj.
pmadd(A0, B_0, C0);
// Add alpha times the accumulated value to the result entry (i, j2).
2067 res(i, j2) += alpha * C0;
2088 template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2095 template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2104 HasHalf = (int)HalfPacketSize < (
int)PacketSize,
2105 HasQuarter = (int)QuarterPacketSize < (
int)HalfPacketSize};
2110 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2111 eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
2115 const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
2116 const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
2117 const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2118 const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
2119 const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
2120 const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2121 const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
2122 : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
2127 if(Pack1>=3*PacketSize)
2129 for(; i<peeled_mc3; i+=3*PacketSize)
2131 if(PanelMode) count += (3*PacketSize) * offset;
2133 for(
Index k=0; k<depth; k++)
2136 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2137 B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2138 C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
2139 pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2140 pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2141 pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
2143 if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
2147 if(Pack1>=2*PacketSize)
2149 for(; i<peeled_mc2; i+=2*PacketSize)
2151 if(PanelMode) count += (2*PacketSize) * offset;
2153 for(
Index k=0; k<depth; k++)
2156 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2157 B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2158 pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2159 pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2161 if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
2165 if(Pack1>=1*PacketSize)
2167 for(; i<peeled_mc1; i+=1*PacketSize)
2169 if(PanelMode) count += (1*PacketSize) * offset;
2171 for(
Index k=0; k<depth; k++)
2174 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2175 pstore(blockA+count, cj.pconj(A));
2178 if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
2182 if(HasHalf && Pack1>=HalfPacketSize)
2184 for(; i<peeled_mc_half; i+=HalfPacketSize)
2186 if(PanelMode) count += (HalfPacketSize) * offset;
2188 for(
Index k=0; k<depth; k++)
2191 A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2192 pstoreu(blockA+count, cj.pconj(A));
2193 count+=HalfPacketSize;
2195 if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
2199 if(HasQuarter && Pack1>=QuarterPacketSize)
2201 for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2203 if(PanelMode) count += (QuarterPacketSize) * offset;
2205 for(
Index k=0; k<depth; k++)
2208 A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2209 pstoreu(blockA+count, cj.pconj(A));
2210 count+=QuarterPacketSize;
2212 if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
2221 if(Pack2<PacketSize && Pack2>1)
2223 for(; i<peeled_mc0; i+=last_lhs_progress)
2225 if(PanelMode) count += last_lhs_progress * offset;
2227 for(
Index k=0; k<depth; k++)
2228 for(
Index w=0; w<last_lhs_progress; w++)
2229 blockA[count++] = cj(lhs(i+w, k));
2231 if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
2237 if(PanelMode) count += offset;
2238 for(
Index k=0; k<depth; k++)
2239 blockA[count++] = cj(lhs(i, k));
2240 if(PanelMode) count += (stride-offset-depth);
2244 template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2251 template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2260 HasHalf = (int)HalfPacketSize < (
int)PacketSize,
2261 HasQuarter = (int)QuarterPacketSize < (
int)HalfPacketSize};
2266 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2269 bool gone_half =
false, gone_quarter =
false, gone_last =
false;
2273 int psize = PacketSize;
2276 Index remaining_rows = rows-i;
2277 Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
2278 Index starting_pos = i;
2279 for(; i<peeled_mc; i+=pack)
2281 if(PanelMode) count += pack * offset;
2284 if(pack>=psize && psize >= QuarterPacketSize)
2286 const Index peeled_k = (depth/psize)*psize;
2287 for(; k<peeled_k; k+=psize)
2289 for (
Index m = 0; m < pack; m += psize)
2291 if (psize == PacketSize) {
2293 for (
int p = 0; p < psize; ++p) kernel.
packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
2295 for (
int p = 0; p < psize; ++p)
pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.
packet[p]));
2296 }
else if (HasHalf && psize == HalfPacketSize) {
2299 for (
int p = 0; p < psize; ++p) kernel_half.
packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
2301 for (
int p = 0; p < psize; ++p)
pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.
packet[p]));
2302 }
else if (HasQuarter && psize == QuarterPacketSize) {
2303 gone_quarter =
true;
2305 for (
int p = 0; p < psize; ++p) kernel_quarter.
packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
2307 for (
int p = 0; p < psize; ++p)
pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.
packet[p]));
2310 count += psize*pack;
2317 for(; w<pack-3; w+=4)
2319 Scalar
a(cj(lhs(i+w+0, k))),
2320 b(cj(lhs(i+w+1, k))),
2321 c(cj(lhs(i+w+2, k))),
2322 d(cj(lhs(i+w+3, k)));
2323 blockA[count++] =
a;
2324 blockA[count++] = b;
2325 blockA[count++] = c;
2326 blockA[count++] =
d;
2330 blockA[count++] = cj(lhs(i+w, k));
2333 if(PanelMode) count += pack * (stride-offset-depth);
2337 Index left = rows - i;
2340 (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2341 ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2342 (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2353 if (Pack2 < PacketSize && !gone_last) {
2355 psize = pack = left & ~1;
2362 if(PanelMode) count += offset;
2363 for(
Index k=0; k<depth; k++)
2364 blockA[count++] = cj(lhs(i, k));
2365 if(PanelMode) count += (stride-offset-depth);
2376 template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2385 template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2392 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2394 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2395 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2397 const Index peeled_k = (depth/PacketSize)*PacketSize;
2446 for(
Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2449 if(PanelMode) count += 4 * offset;
2450 const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2451 const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2452 const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2453 const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2456 if((PacketSize%4)==0)
2458 for(; k<peeled_k; k+=PacketSize) {
2460 kernel.
packet[0 ] = dm0.template loadPacket<Packet>(k);
2461 kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
2462 kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
2463 kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
2465 pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
2466 pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
2467 pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
2468 pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
2469 count+=4*PacketSize;
2474 blockB[count+0] = cj(dm0(k));
2475 blockB[count+1] = cj(dm1(k));
2476 blockB[count+2] = cj(dm2(k));
2477 blockB[count+3] = cj(dm3(k));
2481 if(PanelMode) count += 4 * (stride-offset-depth);
2486 for(
Index j2=packet_cols4; j2<cols; ++j2)
2488 if(PanelMode) count += offset;
2490 for(
Index k=0; k<depth; k++)
2492 blockB[count] = cj(dm0(k));
2495 if(PanelMode) count += (stride-offset-depth);
2500 template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2515 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2516 const bool HasHalf = (int)HalfPacketSize < (
int)PacketSize;
2517 const bool HasQuarter = (int)QuarterPacketSize < (
int)HalfPacketSize;
2519 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2520 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2558 for(
Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2561 if(PanelMode) count += 4 * offset;
2562 for(
Index k=0; k<depth; k++)
2564 if (PacketSize==4) {
2565 Packet A = rhs.template loadPacket<Packet>(k, j2);
2566 pstoreu(blockB+count, cj.pconj(A));
2567 count += PacketSize;
2568 }
else if (HasHalf && HalfPacketSize==4) {
2569 HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
2570 pstoreu(blockB+count, cj.pconj(A));
2571 count += HalfPacketSize;
2572 }
else if (HasQuarter && QuarterPacketSize==4) {
2573 QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
2574 pstoreu(blockB+count, cj.pconj(A));
2575 count += QuarterPacketSize;
2578 blockB[count+0] = cj(dm0(0));
2579 blockB[count+1] = cj(dm0(1));
2580 blockB[count+2] = cj(dm0(2));
2581 blockB[count+3] = cj(dm0(3));
2586 if(PanelMode) count += 4 * (stride-offset-depth);
2590 for(
Index j2=packet_cols4; j2<cols; ++j2)
2592 if(PanelMode) count += offset;
2593 for(
Index k=0; k<depth; k++)
2595 blockB[count] = cj(rhs(k, j2));
2598 if(PanelMode) count += stride-offset-depth;
2609 std::ptrdiff_t l1, l2, l3;
2618 std::ptrdiff_t l1, l2, l3;
2628 std::ptrdiff_t l1, l2, l3;
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
Definition: PacketMath.h:22
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
Definition: PacketMath.h:27
EIGEN_DEVICE_FUNC RealReturnType real() const
Definition: CommonCwiseUnaryOps.h:100
EIGEN_DEVICE_FUNC const ImagReturnType imag() const
Definition: CommonCwiseUnaryOps.h:109
internal::enable_if< internal::valid_indexed_view_overload< RowIndices, ColIndices >::value &&internal::traits< typename EIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::ReturnAsIndexedView, typename EIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::type operator()(const RowIndices &rowIndices, const ColIndices &colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
Definition: IndexedViewMethods.h:73
boost::random::uniform_int_distribution action(0, 100)
#define EIGEN_ASM_COMMENT(X)
Definition: Macros.h:1082
#define EIGEN_COMP_MSVC
Definition: Macros.h:114
#define EIGEN_PLAIN_ENUM_MIN(a, b)
Definition: Macros.h:1288
#define eigen_internal_assert(x)
Definition: Macros.h:1043
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:1076
#define EIGEN_DONT_INLINE
Definition: Macros.h:940
#define eigen_assert(x)
Definition: Macros.h:1037
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
Handle force and torque contact information.
Definition: ContactList.h:19
Definition: ForwardDeclarations.h:87
Definition: IntegralConstant.h:52
LhsPacket LhsPacket4Packing
Definition: GeneralBlockPanelKernel.h:964
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize)
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar *a, LhsPacket &dest) const
Definition: GeneralBlockPanelKernel.h:993
PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize)
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize)
QuadPacket< RhsPacket > RhsPacketx4
Definition: GeneralBlockPanelKernel.h:965
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType &a, const RhsPacketType &b, AccPacketType &c, RhsPacketType &tmp, const true_type &) const
Definition: GeneralBlockPanelKernel.h:1016
EIGEN_STRONG_INLINE void madd(const LhsPacketType &a, const RhsPacketx4 &b, AccPacketType &c, RhsPacket &tmp, const LaneIdType &lane) const
Definition: GeneralBlockPanelKernel.h:1033
PACKET_DECL_COND_PREFIX(_, Real, _PacketSize)
EIGEN_STRONG_INLINE void madd(const LhsPacketType &a, const RhsPacketType &b, AccPacketType &c, RhsPacketType &tmp, const LaneIdType &) const
Definition: GeneralBlockPanelKernel.h:1010
EIGEN_STRONG_INLINE void acc(const AccPacketType &c, const ResPacketType &alpha, ResPacketType &r) const
Definition: GeneralBlockPanelKernel.h:1039
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:974
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
Definition: GeneralBlockPanelKernel.h:962
Scalar ResScalar
Definition: GeneralBlockPanelKernel.h:930
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize)
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar *a, LhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:1004
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *, RhsPacketx4 &) const
Definition: GeneralBlockPanelKernel.h:990
ResPacket AccPacket
Definition: GeneralBlockPanelKernel.h:966
RealScalar LhsScalar
Definition: GeneralBlockPanelKernel.h:928
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *b, RhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:985
EIGEN_STRONG_INLINE void initAcc(AccPacket &p)
Definition: GeneralBlockPanelKernel.h:968
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar &a, const RhsScalar &b, ResScalar &c, RhsScalar &, const false_type &) const
Definition: GeneralBlockPanelKernel.h:1027
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
Definition: GeneralBlockPanelKernel.h:963
std::complex< RealScalar > Scalar
Definition: GeneralBlockPanelKernel.h:927
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar *b, RhsPacket &dest) const
Definition: GeneralBlockPanelKernel.h:998
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
Definition: GeneralBlockPanelKernel.h:961
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacketx4 &dest) const
Definition: GeneralBlockPanelKernel.h:979
Scalar RhsScalar
Definition: GeneralBlockPanelKernel.h:929
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *b, RhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:605
LhsPacket LhsPacket4Packing
Definition: GeneralBlockPanelKernel.h:582
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar *b, RhsPacket &dest) const
Definition: GeneralBlockPanelKernel.h:613
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *, RhsPacketx4 &) const
Definition: GeneralBlockPanelKernel.h:610
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:594
QuadPacket< RhsPacket > RhsPacketx4
Definition: GeneralBlockPanelKernel.h:584
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar &a, const RhsScalar &b, ResScalar &c, RhsScalar &, const false_type &) const
Definition: GeneralBlockPanelKernel.h:660
std::complex< RealScalar > LhsScalar
Definition: GeneralBlockPanelKernel.h:550
EIGEN_STRONG_INLINE void madd(const LhsPacketType &a, const RhsPacketx4 &b, AccPacketType &c, RhsPacket &tmp, const LaneIdType &lane) const
Definition: GeneralBlockPanelKernel.h:666
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
Definition: GeneralBlockPanelKernel.h:581
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
Definition: GeneralBlockPanelKernel.h:579
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacketx4 &dest) const
Definition: GeneralBlockPanelKernel.h:599
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar *b, RhsPacket &dest, const false_type &) const
Definition: GeneralBlockPanelKernel.h:626
RealScalar RhsScalar
Definition: GeneralBlockPanelKernel.h:551
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize)
EIGEN_STRONG_INLINE void madd(const LhsPacketType &a, const RhsPacketType &b, AccPacketType &c, RhsPacketType &tmp, const LaneIdType &) const
Definition: GeneralBlockPanelKernel.h:644
EIGEN_STRONG_INLINE void initAcc(AccPacket &p)
Definition: GeneralBlockPanelKernel.h:588
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar *b, RhsPacket &dest, const true_type &) const
Definition: GeneralBlockPanelKernel.h:618
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar *a, LhsPacket &dest) const
Definition: GeneralBlockPanelKernel.h:632
ScalarBinaryOpTraits< LhsScalar, RhsScalar >::ReturnType ResScalar
Definition: GeneralBlockPanelKernel.h:552
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize)
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType &a, const RhsPacketType &b, AccPacketType &c, RhsPacketType &tmp, const true_type &) const
Definition: GeneralBlockPanelKernel.h:650
EIGEN_STRONG_INLINE void acc(const AccPacketType &c, const ResPacketType &alpha, ResPacketType &r) const
Definition: GeneralBlockPanelKernel.h:672
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar *a, LhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:638
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize)
ResPacket AccPacket
Definition: GeneralBlockPanelKernel.h:586
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
Definition: GeneralBlockPanelKernel.h:580
EIGEN_STRONG_INLINE enable_if<!is_same< RhsPacketType, RhsPacketx4 >::value >::type madd(const LhsPacketType &a, const RhsPacketType &b, DoublePacket< ResPacketType > &c, TmpType &, const LaneIdType &) const
Definition: GeneralBlockPanelKernel.h:870
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar *b, ResPacket &dest) const
Definition: GeneralBlockPanelKernel.h:846
conditional< Vectorizable, ScalarPacket, Scalar >::type LhsPacket4Packing
Definition: GeneralBlockPanelKernel.h:792
PACKET_DECL_COND_SCALAR(_PacketSize)
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar *b, DoublePacketType &dest) const
Definition: GeneralBlockPanelKernel.h:850
PACKET_DECL_COND(Real, _PacketSize)
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize)
conditional< Vectorizable, ScalarPacket, Scalar >::type ResPacket
Definition: GeneralBlockPanelKernel.h:795
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *, RhsPacketx4 &) const
Definition: GeneralBlockPanelKernel.h:844
EIGEN_STRONG_INLINE void initAcc(Scalar &p)
Definition: GeneralBlockPanelKernel.h:801
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize)
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, ScalarPacket &dest) const
Definition: GeneralBlockPanelKernel.h:810
conditional< Vectorizable, RealPacket, Scalar >::type LhsPacket
Definition: GeneralBlockPanelKernel.h:793
EIGEN_STRONG_INLINE void acc(const DoublePacket< RealPacketType > &c, const ResPacketType &alpha, ResPacketType &r) const
Definition: GeneralBlockPanelKernel.h:891
std::complex< RealScalar > ResScalar
Definition: GeneralBlockPanelKernel.h:764
conditional< Vectorizable, DoublePacketType, Scalar >::type RhsPacket
Definition: GeneralBlockPanelKernel.h:794
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar *a, LhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:862
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacketx4 &dest) const
Definition: GeneralBlockPanelKernel.h:823
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *b, DoublePacket< RealPacketType > &dest) const
Definition: GeneralBlockPanelKernel.h:839
conditional< Vectorizable, DoublePacketType, Scalar >::type AccPacket
Definition: GeneralBlockPanelKernel.h:796
std::complex< RealScalar > LhsScalar
Definition: GeneralBlockPanelKernel.h:762
EIGEN_STRONG_INLINE void madd(const LhsPacketType &a, const RhsPacketx4 &b, AccPacketType &c, RhsPacket &tmp, const LaneIdType &lane) const
Definition: GeneralBlockPanelKernel.h:883
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize)
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar *a, LhsPacket &dest) const
Definition: GeneralBlockPanelKernel.h:856
std::complex< RealScalar > RhsScalar
Definition: GeneralBlockPanelKernel.h:763
std::complex< RealScalar > Scalar
Definition: GeneralBlockPanelKernel.h:761
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *b, ScalarPacket &dest) const
Definition: GeneralBlockPanelKernel.h:832
EIGEN_STRONG_INLINE void madd(const LhsPacket &a, const RhsPacket &b, ResPacket &c, RhsPacket &, const LaneIdType &) const
Definition: GeneralBlockPanelKernel.h:877
conj_helper< LhsScalar, RhsScalar, ConjLhs, ConjRhs > cj
Definition: GeneralBlockPanelKernel.h:920
EIGEN_STRONG_INLINE void acc(const Scalar &c, const Scalar &alpha, Scalar &r) const
Definition: GeneralBlockPanelKernel.h:888
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, DoublePacket< RealPacketType > &dest) const
Definition: GeneralBlockPanelKernel.h:817
EIGEN_STRONG_INLINE void initAcc(DoublePacketType &p)
Definition: GeneralBlockPanelKernel.h:803
QuadPacket< RhsPacket > RhsPacketx4
Definition: GeneralBlockPanelKernel.h:799
DoublePacket< RealPacket > DoublePacketType
Definition: GeneralBlockPanelKernel.h:790
Definition: GeneralBlockPanelKernel.h:419
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize)
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar *b, RhsPacket &dest) const
Definition: GeneralBlockPanelKernel.h:494
_RhsScalar RhsScalar
Definition: GeneralBlockPanelKernel.h:422
ScalarBinaryOpTraits< LhsScalar, RhsScalar >::ReturnType ResScalar
Definition: GeneralBlockPanelKernel.h:423
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
Definition: GeneralBlockPanelKernel.h:461
ResPacket AccPacket
Definition: GeneralBlockPanelKernel.h:466
_LhsScalar LhsScalar
Definition: GeneralBlockPanelKernel.h:421
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:474
EIGEN_STRONG_INLINE void madd(const LhsPacketType &a, const RhsPacketType &b, AccPacketType &c, RhsPacketType &tmp, const LaneIdType &) const
Definition: GeneralBlockPanelKernel.h:512
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
Definition: GeneralBlockPanelKernel.h:460
EIGEN_STRONG_INLINE void madd(const LhsPacketType &a, const RhsPacketx4 &b, AccPacketType &c, RhsPacket &tmp, const LaneIdType &lane) const
Definition: GeneralBlockPanelKernel.h:528
EIGEN_STRONG_INLINE void acc(const ResPacketHalf &c, const ResPacketHalf &alpha, ResPacketHalf &r) const
Definition: GeneralBlockPanelKernel.h:539
EIGEN_STRONG_INLINE void acc(const AccPacket &c, const ResPacket &alpha, ResPacket &r) const
Definition: GeneralBlockPanelKernel.h:533
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar *a, LhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:500
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *b, RhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:485
QuadPacket< RhsPacket > RhsPacketx4
Definition: GeneralBlockPanelKernel.h:465
LhsPacket LhsPacket4Packing
Definition: GeneralBlockPanelKernel.h:463
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize)
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize)
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar *a, LhsPacketType &dest) const
Definition: GeneralBlockPanelKernel.h:506
@ ConjRhs
Definition: GeneralBlockPanelKernel.h:431
@ nr
Definition: GeneralBlockPanelKernel.h:440
@ mr
Definition: GeneralBlockPanelKernel.h:452
@ LhsProgress
Definition: GeneralBlockPanelKernel.h:455
@ NumberOfRegisters
Definition: GeneralBlockPanelKernel.h:437
@ LhsPacketSize
Definition: GeneralBlockPanelKernel.h:433
@ RhsProgress
Definition: GeneralBlockPanelKernel.h:456
@ ResPacketSize
Definition: GeneralBlockPanelKernel.h:435
@ Vectorizable
Definition: GeneralBlockPanelKernel.h:432
@ default_mr
Definition: GeneralBlockPanelKernel.h:443
@ ConjLhs
Definition: GeneralBlockPanelKernel.h:430
@ RhsPacketSize
Definition: GeneralBlockPanelKernel.h:434
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
Definition: GeneralBlockPanelKernel.h:462
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *, RhsPacketx4 &) const
Definition: GeneralBlockPanelKernel.h:490
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacketx4 &dest) const
Definition: GeneralBlockPanelKernel.h:479
EIGEN_STRONG_INLINE void initAcc(AccPacket &p)
Definition: GeneralBlockPanelKernel.h:468
@ ColMajor
Definition: Constants.h:319
@ RowMajor
Definition: Constants.h:321
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16() max(const bfloat16 &a, const bfloat16 &b)
Definition: BFloat16.h:576
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16() min(const bfloat16 &a, const bfloat16 &b)
Definition: BFloat16.h:571
const std::ptrdiff_t defaultL2CacheSize
Definition: GeneralBlockPanelKernel.h:62
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
Definition: Complex.h:167
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:215
EIGEN_DEVICE_FUNC void pbroadcast4(const typename unpacket_traits< Packet >::type *a, Packet &a0, Packet &a1, Packet &a2, Packet &a3)
Definition: GenericPacketMath.h:653
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Definition: Complex.h:224
void queryCacheSizes(int &l1, int &l2, int &l3)
Definition: Memory.h:1106
void evaluateProductBlockingSizesHeuristic(Index &k, Index &m, Index &n, Index num_threads=1)
Definition: GeneralBlockPanelKernel.h:124
EIGEN_DEVICE_FUNC void prefetch(const Scalar *addr)
Definition: GenericPacketMath.h:719
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: PacketMath.h:827
EIGEN_DEVICE_FUNC Packet pmul(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:237
const std::ptrdiff_t defaultL3CacheSize
Definition: GeneralBlockPanelKernel.h:63
EIGEN_STRONG_INLINE Packet1cd pcplxflip(const Packet1cd &x)
Definition: Complex.h:620
GEBPPacketSizeType
Definition: GeneralBlockPanelKernel.h:18
@ GEBPPacketHalf
Definition: GeneralBlockPanelKernel.h:20
@ GEBPPacketQuarter
Definition: GeneralBlockPanelKernel.h:21
@ GEBPPacketFull
Definition: GeneralBlockPanelKernel.h:19
void computeProductBlockingSizes(Index &k, Index &m, Index &n, Index num_threads=1)
Computes the blocking parameters for a m x k times k x n matrix product.
Definition: GeneralBlockPanelKernel.h:339
void manage_caching_sizes(Action action, std::ptrdiff_t *l1, std::ptrdiff_t *l2, std::ptrdiff_t *l3)
Definition: GeneralBlockPanelKernel.h:86
EIGEN_CONSTEXPR Index size(const T &x)
Definition: Meta.h:479
EIGEN_DEVICE_FUNC void pstore(Scalar *to, const Packet &from)
Definition: GenericPacketMath.h:696
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c &a)
Definition: PacketMath.h:2478
bool useSpecificBlockingSizes(Index &k, Index &m, Index &n)
Definition: GeneralBlockPanelKernel.h:305
EIGEN_DEVICE_FUNC void pstoreu(Scalar *to, const Packet &from)
Definition: GenericPacketMath.h:700
std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
Definition: GeneralBlockPanelKernel.h:29
const std::ptrdiff_t defaultL1CacheSize
Definition: GeneralBlockPanelKernel.h:61
EIGEN_DEVICE_FUNC Packet psub(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:222
void loadQuadToDoublePacket(const Scalar *b, DoublePacket< RealPacket > &dest, typename enable_if< unpacket_traits< RealPacket >::size<=8 >::type *=0)
Definition: GeneralBlockPanelKernel.h:725
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
Definition: MathFunctions.h:1091
EIGEN_DEVICE_FUNC T div_ceil(const T &a, const T &b)
Definition: Meta.h:779
Namespace containing all symbols from the Eigen library.
Definition: LDLT.h:16
std::ptrdiff_t l1CacheSize()
Definition: GeneralBlockPanelKernel.h:2607
std::ptrdiff_t l2CacheSize()
Definition: GeneralBlockPanelKernel.h:2616
@ GetAction
Definition: Constants.h:504
@ SetAction
Definition: Constants.h:504
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
std::ptrdiff_t l3CacheSize()
Definition: GeneralBlockPanelKernel.h:2626
void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
Definition: GeneralBlockPanelKernel.h:2638
type
The type the bitset is encoded with.
Definition: bitset.hpp:44
Definition: document.h:416
Definition: json.hpp:5678
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition: pointer.h:1181
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val)
Definition: GeneralBlockPanelKernel.h:37
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val)
Definition: GeneralBlockPanelKernel.h:49
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val)
Definition: GeneralBlockPanelKernel.h:43
#define EIGEN_GEBGP_ONESTEP(K)
#define EIGEN_GEBP_ONESTEP(K)
Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
Definition: NumTraits.h:233
Determines whether the given binary operation of two numeric types is allowed and what the scalar ret...
Definition: XprHelper.h:806
Definition: GeneralBlockPanelKernel.h:71
std::ptrdiff_t m_l1
Definition: GeneralBlockPanelKernel.h:80
CacheSizes()
Definition: GeneralBlockPanelKernel.h:72
std::ptrdiff_t m_l2
Definition: GeneralBlockPanelKernel.h:81
std::ptrdiff_t m_l3
Definition: GeneralBlockPanelKernel.h:82
Definition: GeneralBlockPanelKernel.h:683
Packet second
Definition: GeneralBlockPanelKernel.h:685
Packet first
Definition: GeneralBlockPanelKernel.h:684
Definition: GenericPacketMath.h:1014
Packet packet[N]
Definition: GenericPacketMath.h:1015
Definition: GeneralBlockPanelKernel.h:362
const Packet & get(const FixedInt< 0 > &) const
Definition: GeneralBlockPanelKernel.h:364
Packet B2
Definition: GeneralBlockPanelKernel.h:363
Packet B_0
Definition: GeneralBlockPanelKernel.h:363
const Packet & get(const FixedInt< 2 > &) const
Definition: GeneralBlockPanelKernel.h:366
const Packet & get(const FixedInt< 3 > &) const
Definition: GeneralBlockPanelKernel.h:367
Packet B1
Definition: GeneralBlockPanelKernel.h:363
const Packet & get(const FixedInt< 1 > &) const
Definition: GeneralBlockPanelKernel.h:365
Packet B3
Definition: GeneralBlockPanelKernel.h:363
Definition: GeneralBlockPanelKernel.h:353
RhsPacket ::type type
Definition: GeneralBlockPanelKernel.h:357
static const int remaining_registers
Definition: GeneralBlockPanelKernel.h:355
typedef RhsPacketx4
Definition: GeneralBlockPanelKernel.h:357
Definition: ConjHelper.h:63
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsType &x, const RhsType &y, const ResultType &c) const
Definition: ConjHelper.h:67
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType &x, const RhsType &y) const
Definition: ConjHelper.h:71
Definition: ConjHelper.h:44
Definition: GeneralBlockPanelKernel.h:1058
DataMapper::LinearMapper LinearMapper
Definition: GeneralBlockPanelKernel.h:1090
Traits::RhsPacket RhsPacket
Definition: GeneralBlockPanelKernel.h:1065
SwappedTraits::ResPacket SResPacket
Definition: GeneralBlockPanelKernel.h:1077
gebp_traits< RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target > SwappedTraits
Definition: GeneralBlockPanelKernel.h:1072
QuarterTraits::ResPacket ResPacketQuarter
Definition: GeneralBlockPanelKernel.h:1087
SwappedTraits::AccPacket SAccPacket
Definition: GeneralBlockPanelKernel.h:1078
QuarterTraits::RhsPacket RhsPacketQuarter
Definition: GeneralBlockPanelKernel.h:1086
HalfTraits::ResPacket ResPacketHalf
Definition: GeneralBlockPanelKernel.h:1082
HalfTraits::RhsPacket RhsPacketHalf
Definition: GeneralBlockPanelKernel.h:1081
@ ResPacketSize
Definition: GeneralBlockPanelKernel.h:1100
@ RhsProgressQuarter
Definition: GeneralBlockPanelKernel.h:1099
@ Vectorizable
Definition: GeneralBlockPanelKernel.h:1093
@ LhsProgressQuarter
Definition: GeneralBlockPanelKernel.h:1096
@ RhsProgress
Definition: GeneralBlockPanelKernel.h:1097
@ LhsProgressHalf
Definition: GeneralBlockPanelKernel.h:1095
@ RhsProgressHalf
Definition: GeneralBlockPanelKernel.h:1098
@ LhsProgress
Definition: GeneralBlockPanelKernel.h:1094
Traits::RhsPacketx4 RhsPacketx4
Definition: GeneralBlockPanelKernel.h:1068
SwappedTraits::RhsPacket SRhsPacket
Definition: GeneralBlockPanelKernel.h:1076
QuarterTraits::LhsPacket LhsPacketQuarter
Definition: GeneralBlockPanelKernel.h:1085
HalfTraits::LhsPacket LhsPacketHalf
Definition: GeneralBlockPanelKernel.h:1080
QuarterTraits::AccPacket AccPacketQuarter
Definition: GeneralBlockPanelKernel.h:1088
SwappedTraits::ResScalar SResScalar
Definition: GeneralBlockPanelKernel.h:1074
RhsPanelHelper< RhsPacket, RhsPacketx4, 15 >::type RhsPanel15
Definition: GeneralBlockPanelKernel.h:1070
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketHalf > HalfTraits
Definition: GeneralBlockPanelKernel.h:1060
Traits::LhsPacket LhsPacket
Definition: GeneralBlockPanelKernel.h:1064
Traits::ResScalar ResScalar
Definition: GeneralBlockPanelKernel.h:1063
SwappedTraits::LhsPacket SLhsPacket
Definition: GeneralBlockPanelKernel.h:1075
Traits::ResPacket ResPacket
Definition: GeneralBlockPanelKernel.h:1066
Traits::AccPacket AccPacket
Definition: GeneralBlockPanelKernel.h:1067
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target > Traits
Definition: GeneralBlockPanelKernel.h:1059
HalfTraits::AccPacket AccPacketHalf
Definition: GeneralBlockPanelKernel.h:1083
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketQuarter > QuarterTraits
Definition: GeneralBlockPanelKernel.h:1061
EIGEN_DONT_INLINE void operator()(const DataMapper &res, const LhsScalar *blockA, const RhsScalar *blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0)
Definition: GeneralBlockPanelKernel.h:1405
DataMapper::LinearMapper LinearMapper
Definition: GeneralBlockPanelKernel.h:2247
DataMapper::LinearMapper LinearMapper
Definition: GeneralBlockPanelKernel.h:2091
Definition: BlasUtil.h:28
DataMapper::LinearMapper LinearMapper
Definition: GeneralBlockPanelKernel.h:2380
packet_traits< Scalar >::type Packet
Definition: GeneralBlockPanelKernel.h:2379
DataMapper::LinearMapper LinearMapper
Definition: GeneralBlockPanelKernel.h:2506
EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride=0, Index offset=0)
Definition: GeneralBlockPanelKernel.h:2510
unpacket_traits< Packet >::half HalfPacket
Definition: GeneralBlockPanelKernel.h:2504
unpacket_traits< typename unpacket_traits< Packet >::half >::half QuarterPacket
Definition: GeneralBlockPanelKernel.h:2505
packet_traits< Scalar >::type Packet
Definition: GeneralBlockPanelKernel.h:2503
Definition: BlasUtil.h:25
SwappedTraits::LhsPacket SLhsPacket
Definition: GeneralBlockPanelKernel.h:1146
gebp_traits< RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target > SwappedTraits
Definition: GeneralBlockPanelKernel.h:1143
SwappedTraits::ResPacket SResPacket
Definition: GeneralBlockPanelKernel.h:1148
EIGEN_STRONG_INLINE void operator()(const DataMapper &res, SwappedTraits &straits, const LhsScalar *blA, const RhsScalar *blB, Index depth, const Index endk, Index i, Index j2, ResScalar alpha, SAccPacket &C0)
Definition: GeneralBlockPanelKernel.h:1151
SwappedTraits::AccPacket SAccPacket
Definition: GeneralBlockPanelKernel.h:1149
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target > Traits
Definition: GeneralBlockPanelKernel.h:1142
SwappedTraits::RhsPacket SRhsPacket
Definition: GeneralBlockPanelKernel.h:1147
Traits::ResScalar ResScalar
Definition: GeneralBlockPanelKernel.h:1145
Definition: GeneralBlockPanelKernel.h:1112
SwappedTraits::AccPacket SAccPacket
Definition: GeneralBlockPanelKernel.h:1120
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target > Traits
Definition: GeneralBlockPanelKernel.h:1113
SwappedTraits::ResPacket SResPacket
Definition: GeneralBlockPanelKernel.h:1119
Traits::ResScalar ResScalar
Definition: GeneralBlockPanelKernel.h:1116
SwappedTraits::LhsPacket SLhsPacket
Definition: GeneralBlockPanelKernel.h:1117
gebp_traits< RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target > SwappedTraits
Definition: GeneralBlockPanelKernel.h:1114
EIGEN_STRONG_INLINE void operator()(const DataMapper &res, SwappedTraits &straits, const LhsScalar *blA, const RhsScalar *blB, Index depth, const Index endk, Index i, Index j2, ResScalar alpha, SAccPacket &C0)
Definition: GeneralBlockPanelKernel.h:1122
SwappedTraits::RhsPacket SRhsPacket
Definition: GeneralBlockPanelKernel.h:1118
Definition: GeneralBlockPanelKernel.h:1386
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar *blA, const RhsScalar *blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
Definition: GeneralBlockPanelKernel.h:1388
Definition: GeneralBlockPanelKernel.h:1191
EIGEN_STRONG_INLINE void operator()(const DataMapper &res, const LhsScalar *blockA, const RhsScalar *blockB, ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
Definition: GeneralBlockPanelKernel.h:1210
GEBPTraits::RhsPacketx4 RhsPacketx4
Definition: GeneralBlockPanelKernel.h:1192
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar *blA, const RhsScalar *blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
Definition: GeneralBlockPanelKernel.h:1194
T1 type
Definition: GeneralBlockPanelKernel.h:374
T2 type
Definition: GeneralBlockPanelKernel.h:377
Definition: GeneralBlockPanelKernel.h:371
T3 type
Definition: GeneralBlockPanelKernel.h:371
Definition: GenericPacketMath.h:107
T type
Definition: GenericPacketMath.h:108
Definition: ForwardDeclarations.h:17
DoublePacket< typename unpacket_traits< Packet >::half > half
Definition: GeneralBlockPanelKernel.h:746
Definition: GenericPacketMath.h:133
T half
Definition: GenericPacketMath.h:135
Definition: PacketMath.h:47