i386, expand: Optimize also 256-bit and 512-bit permutations as vpmovzx if possible [PR95905]
The following patch implements what I've talked about, i.e. to no longer force the operands of vec_perm_const into registers in the generic code, but to let each of the (currently 8) targets force them into registers individually, giving the targets better control over whether and when that happens and allowing them to do something special with some particular operands. It then adds define_insn_and_split patterns that turn the 256-bit and 512-bit permutations into vpmovzx* (only the bw, wd and dq cases; in theory we could add define_insn_and_split patterns also for the bd, bq and wq cases). 2021-01-13 Jakub Jelinek <jakub@redhat.com> PR target/95905 * optabs.c (expand_vec_perm_const): Don't force v0 and v1 into registers before calling targetm.vectorize.vec_perm_const, only after that. * config/i386/i386-expand.c (ix86_vectorize_vec_perm_const): Handle two argument permutation when one operand is zero vector and only after that force operands into registers. * config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_1): New define_insn_and_split pattern. (*avx512bw_zero_extendv32qiv32hi2_1): Likewise. (*avx512f_zero_extendv16hiv16si2_1): Likewise. (*avx2_zero_extendv8hiv8si2_1): Likewise. (*avx512f_zero_extendv8siv8di2_1): Likewise. (*avx2_zero_extendv4siv4di2_1): Likewise. * config/mips/mips.c (mips_vectorize_vec_perm_const): Force operands into registers. * config/arm/arm.c (arm_vectorize_vec_perm_const): Likewise. * config/sparc/sparc.c (sparc_vectorize_vec_perm_const): Likewise. * config/ia64/ia64.c (ia64_vectorize_vec_perm_const): Likewise. * config/aarch64/aarch64.c (aarch64_vectorize_vec_perm_const): Likewise. * config/rs6000/rs6000.c (rs6000_vectorize_vec_perm_const): Likewise. * config/gcn/gcn.c (gcn_vectorize_vec_perm_const): Likewise. Use std::swap. * gcc.target/i386/pr95905-2.c: Use scan-assembler-times instead of scan-assembler. Add tests with zero vector as first __builtin_shuffle operand. * gcc.target/i386/pr95905-3.c: New test. * gcc.target/i386/pr95905-4.c: New test.
This commit is contained in:
parent
7875e8dc83
commit
b1d1e2b54c
@ -21084,8 +21084,11 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
|
||||
d.vmode = vmode;
|
||||
d.vec_flags = aarch64_classify_vector_mode (d.vmode);
|
||||
d.target = target;
|
||||
d.op0 = op0;
|
||||
d.op1 = op1;
|
||||
d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
|
||||
if (op0 == op1)
|
||||
d.op1 = d.op0;
|
||||
else
|
||||
d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
|
||||
d.testing_p = !target;
|
||||
|
||||
if (!d.testing_p)
|
||||
|
@ -31482,6 +31482,15 @@ arm_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1,
|
||||
return false;
|
||||
|
||||
d.target = target;
|
||||
if (op0)
|
||||
{
|
||||
rtx nop0 = force_reg (vmode, op0);
|
||||
if (op0 == op1)
|
||||
op1 = nop0;
|
||||
op0 = nop0;
|
||||
}
|
||||
if (op1)
|
||||
op1 = force_reg (vmode, op1);
|
||||
d.op0 = op0;
|
||||
d.op1 = op1;
|
||||
|
||||
|
@ -3982,13 +3982,14 @@ gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
|
||||
for (unsigned int i = 0; i < nelt; ++i)
|
||||
perm[i] = sel[i] & (2 * nelt - 1);
|
||||
|
||||
src0 = force_reg (vmode, src0);
|
||||
src1 = force_reg (vmode, src1);
|
||||
|
||||
/* Make life a bit easier by swapping operands if necessary so that
|
||||
the first element always comes from src0. */
|
||||
if (perm[0] >= nelt)
|
||||
{
|
||||
rtx temp = src0;
|
||||
src0 = src1;
|
||||
src1 = temp;
|
||||
std::swap (src0, src1);
|
||||
|
||||
for (unsigned int i = 0; i < nelt; ++i)
|
||||
if (perm[i] < nelt)
|
||||
|
@ -19929,6 +19929,32 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
|
||||
|
||||
two_args = canonicalize_perm (&d);
|
||||
|
||||
/* If one of the operands is a zero vector, try to match pmovzx. */
|
||||
if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
|
||||
{
|
||||
struct expand_vec_perm_d dzero = d;
|
||||
if (d.op0 == CONST0_RTX (vmode))
|
||||
{
|
||||
d.op1 = dzero.op1 = force_reg (vmode, d.op1);
|
||||
std::swap (dzero.op0, dzero.op1);
|
||||
for (i = 0; i < nelt; ++i)
|
||||
dzero.perm[i] ^= nelt;
|
||||
}
|
||||
else
|
||||
d.op0 = dzero.op0 = force_reg (vmode, d.op0);
|
||||
|
||||
if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
|
||||
dzero.perm, nelt, dzero.testing_p))
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Force operands into registers. */
|
||||
rtx nop0 = force_reg (vmode, d.op0);
|
||||
if (d.op0 == d.op1)
|
||||
d.op1 = nop0;
|
||||
d.op0 = nop0;
|
||||
d.op1 = force_reg (vmode, d.op1);
|
||||
|
||||
if (ix86_expand_vec_perm_const_1 (&d))
|
||||
return true;
|
||||
|
||||
|
@ -17611,6 +17611,23 @@
|
||||
(set_attr "prefix" "maybe_evex")
|
||||
(set_attr "mode" "OI")])
|
||||
|
||||
(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_1"
|
||||
[(set (match_operand:V32QI 0 "register_operand" "=v")
|
||||
(vec_select:V32QI
|
||||
(vec_concat:V64QI
|
||||
(match_operand:V32QI 1 "nonimmediate_operand" "vm")
|
||||
(match_operand:V32QI 2 "const0_operand" "C"))
|
||||
(match_parallel 3 "pmovzx_parallel"
|
||||
[(match_operand 4 "const_int_operand" "n")])))]
|
||||
"TARGET_AVX2"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
|
||||
{
|
||||
operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
|
||||
operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
|
||||
})
|
||||
|
||||
(define_expand "<insn>v16qiv16hi2"
|
||||
[(set (match_operand:V16HI 0 "register_operand")
|
||||
(any_extend:V16HI
|
||||
@ -17628,6 +17645,23 @@
|
||||
(set_attr "prefix" "evex")
|
||||
(set_attr "mode" "XI")])
|
||||
|
||||
(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_1"
|
||||
[(set (match_operand:V64QI 0 "register_operand" "=v")
|
||||
(vec_select:V64QI
|
||||
(vec_concat:V128QI
|
||||
(match_operand:V64QI 1 "nonimmediate_operand" "vm")
|
||||
(match_operand:V64QI 2 "const0_operand" "C"))
|
||||
(match_parallel 3 "pmovzx_parallel"
|
||||
[(match_operand 4 "const_int_operand" "n")])))]
|
||||
"TARGET_AVX512BW"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
|
||||
{
|
||||
operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
|
||||
operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
|
||||
})
|
||||
|
||||
(define_expand "<insn>v32qiv32hi2"
|
||||
[(set (match_operand:V32HI 0 "register_operand")
|
||||
(any_extend:V32HI
|
||||
@ -17883,6 +17917,23 @@
|
||||
(match_operand:V16HI 1 "nonimmediate_operand")))]
|
||||
"TARGET_AVX512F")
|
||||
|
||||
(define_insn_and_split "avx512f_zero_extendv16hiv16si2_1"
|
||||
[(set (match_operand:V32HI 0 "register_operand" "=v")
|
||||
(vec_select:V32HI
|
||||
(vec_concat:V64HI
|
||||
(match_operand:V32HI 1 "nonimmediate_operand" "vm")
|
||||
(match_operand:V32HI 2 "const0_operand" "C"))
|
||||
(match_parallel 3 "pmovzx_parallel"
|
||||
[(match_operand 4 "const_int_operand" "n")])))]
|
||||
"TARGET_AVX512F"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))]
|
||||
{
|
||||
operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode);
|
||||
operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode);
|
||||
})
|
||||
|
||||
(define_insn "avx2_<code>v8hiv8si2<mask_name>"
|
||||
[(set (match_operand:V8SI 0 "register_operand" "=v")
|
||||
(any_extend:V8SI
|
||||
@ -17900,6 +17951,23 @@
|
||||
(match_operand:V8HI 1 "nonimmediate_operand")))]
|
||||
"TARGET_AVX2")
|
||||
|
||||
(define_insn_and_split "avx2_zero_extendv8hiv8si2_1"
|
||||
[(set (match_operand:V16HI 0 "register_operand" "=v")
|
||||
(vec_select:V16HI
|
||||
(vec_concat:V32HI
|
||||
(match_operand:V16HI 1 "nonimmediate_operand" "vm")
|
||||
(match_operand:V16HI 2 "const0_operand" "C"))
|
||||
(match_parallel 3 "pmovzx_parallel"
|
||||
[(match_operand 4 "const_int_operand" "n")])))]
|
||||
"TARGET_AVX2"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))]
|
||||
{
|
||||
operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode);
|
||||
operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode);
|
||||
})
|
||||
|
||||
(define_insn "sse4_1_<code>v4hiv4si2<mask_name>"
|
||||
[(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
|
||||
(any_extend:V4SI
|
||||
@ -18275,6 +18343,23 @@
|
||||
(set_attr "prefix" "evex")
|
||||
(set_attr "mode" "XI")])
|
||||
|
||||
(define_insn_and_split "*avx512f_zero_extendv8siv8di2_1"
|
||||
[(set (match_operand:V16SI 0 "register_operand" "=v")
|
||||
(vec_select:V16SI
|
||||
(vec_concat:V32SI
|
||||
(match_operand:V16SI 1 "nonimmediate_operand" "vm")
|
||||
(match_operand:V16SI 2 "const0_operand" "C"))
|
||||
(match_parallel 3 "pmovzx_parallel"
|
||||
[(match_operand 4 "const_int_operand" "n")])))]
|
||||
"TARGET_AVX512F"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))]
|
||||
{
|
||||
operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode);
|
||||
operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode);
|
||||
})
|
||||
|
||||
(define_expand "<insn>v8siv8di2"
|
||||
[(set (match_operand:V8DI 0 "register_operand" "=v")
|
||||
(any_extend:V8DI
|
||||
@ -18292,6 +18377,23 @@
|
||||
(set_attr "prefix_extra" "1")
|
||||
(set_attr "mode" "OI")])
|
||||
|
||||
(define_insn_and_split "*avx2_zero_extendv4siv4di2_1"
|
||||
[(set (match_operand:V8SI 0 "register_operand" "=v")
|
||||
(vec_select:V8SI
|
||||
(vec_concat:V16SI
|
||||
(match_operand:V8SI 1 "nonimmediate_operand" "vm")
|
||||
(match_operand:V8SI 2 "const0_operand" "C"))
|
||||
(match_parallel 3 "pmovzx_parallel"
|
||||
[(match_operand 4 "const_int_operand" "n")])))]
|
||||
"TARGET_AVX2"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))]
|
||||
{
|
||||
operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode);
|
||||
operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode);
|
||||
})
|
||||
|
||||
(define_expand "<insn>v4siv4di2"
|
||||
[(set (match_operand:V4DI 0 "register_operand" "=v")
|
||||
(any_extend:V4DI
|
||||
|
@ -11759,6 +11759,15 @@ ia64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
|
||||
unsigned int i, nelt, which;
|
||||
|
||||
d.target = target;
|
||||
if (op0)
|
||||
{
|
||||
rtx nop0 = force_reg (vmode, op0);
|
||||
if (op0 == op1)
|
||||
op1 = nop0;
|
||||
op0 = nop0;
|
||||
}
|
||||
if (op1)
|
||||
op1 = force_reg (vmode, op1);
|
||||
d.op0 = op0;
|
||||
d.op1 = op1;
|
||||
|
||||
|
@ -21624,6 +21624,15 @@ mips_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
|
||||
bool ok;
|
||||
|
||||
d.target = target;
|
||||
if (op0)
|
||||
{
|
||||
rtx nop0 = force_reg (vmode, op0);
|
||||
if (op0 == op1)
|
||||
op1 = nop0;
|
||||
op0 = nop0;
|
||||
}
|
||||
if (op1)
|
||||
op1 = force_reg (vmode, op1);
|
||||
d.op0 = op0;
|
||||
d.op1 = op1;
|
||||
|
||||
|
@ -22946,6 +22946,16 @@ rs6000_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
|
||||
if (TARGET_ALTIVEC && testing_p)
|
||||
return true;
|
||||
|
||||
if (op0)
|
||||
{
|
||||
rtx nop0 = force_reg (vmode, op0);
|
||||
if (op0 == op1)
|
||||
op1 = nop0;
|
||||
op0 = nop0;
|
||||
}
|
||||
if (op1)
|
||||
op1 = force_reg (vmode, op1);
|
||||
|
||||
/* Check for ps_merge* or xxpermdi insns. */
|
||||
if ((vmode == V2DFmode || vmode == V2DImode) && VECTOR_MEM_VSX_P (vmode))
|
||||
{
|
||||
|
@ -12942,6 +12942,12 @@ sparc_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
|
||||
if (vmode != V8QImode)
|
||||
return false;
|
||||
|
||||
rtx nop0 = force_reg (vmode, op0);
|
||||
if (op0 == op1)
|
||||
op1 = nop0;
|
||||
op0 = nop0;
|
||||
op1 = force_reg (vmode, op1);
|
||||
|
||||
unsigned int i, mask;
|
||||
for (i = mask = 0; i < 8; ++i)
|
||||
mask |= (sel[i] & 0xf) << (28 - i*4);
|
||||
|
@ -6070,11 +6070,8 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
|
||||
|
||||
if (targetm.vectorize.vec_perm_const != NULL)
|
||||
{
|
||||
v0 = force_reg (mode, v0);
|
||||
if (single_arg_p)
|
||||
v1 = v0;
|
||||
else
|
||||
v1 = force_reg (mode, v1);
|
||||
|
||||
if (targetm.vectorize.vec_perm_const (mode, target, v0, v1, indices))
|
||||
return target;
|
||||
@ -6095,6 +6092,11 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
|
||||
return gen_lowpart (mode, target_qi);
|
||||
}
|
||||
|
||||
v0 = force_reg (mode, v0);
|
||||
if (single_arg_p)
|
||||
v1 = v0;
|
||||
v1 = force_reg (mode, v1);
|
||||
|
||||
/* Otherwise expand as a fully variable permutation. */
|
||||
|
||||
/* The optabs are only defined for selectors with the same width
|
||||
|
@ -1,9 +1,9 @@
|
||||
/* PR target/95905 */
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -msse4.1" } */
|
||||
/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */
|
||||
/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */
|
||||
/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */
|
||||
/* { dg-final { scan-assembler-times "\tv?pmovzxbw\t" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "\tv?pmovzxwd\t" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "\tv?pmovzxdq\t" 4 } } */
|
||||
|
||||
typedef unsigned char V1 __attribute__((vector_size (16)));
|
||||
typedef unsigned short V2 __attribute__((vector_size (16)));
|
||||
@ -44,3 +44,39 @@ f6 (V3 *x)
|
||||
{
|
||||
return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 });
|
||||
}
|
||||
|
||||
V1
|
||||
f7 (V1 x)
|
||||
{
|
||||
return __builtin_shuffle ((V1) {}, x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
|
||||
}
|
||||
|
||||
V2
|
||||
f8 (V2 x)
|
||||
{
|
||||
return __builtin_shuffle ((V2) {}, x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
|
||||
}
|
||||
|
||||
V3
|
||||
f9 (V3 x)
|
||||
{
|
||||
return __builtin_shuffle ((V3) {}, x, (V3) { 4, 0, 5, 1 });
|
||||
}
|
||||
|
||||
V1
|
||||
f10 (V1 *x)
|
||||
{
|
||||
return __builtin_shuffle ((V1) {}, *x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
|
||||
}
|
||||
|
||||
V2
|
||||
f11 (V2 *x)
|
||||
{
|
||||
return __builtin_shuffle ((V2) {}, *x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
|
||||
}
|
||||
|
||||
V3
|
||||
f12 (V3 *x)
|
||||
{
|
||||
return __builtin_shuffle ((V3) {}, *x, (V3) { 4, 0, 5, 1 });
|
||||
}
|
||||
|
82
gcc/testsuite/gcc.target/i386/pr95905-3.c
Normal file
82
gcc/testsuite/gcc.target/i386/pr95905-3.c
Normal file
@ -0,0 +1,82 @@
|
||||
/* PR target/95905 */
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mavx2" } */
|
||||
/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
|
||||
|
||||
typedef unsigned char V1 __attribute__((vector_size (32)));
|
||||
typedef unsigned short V2 __attribute__((vector_size (32)));
|
||||
typedef unsigned int V3 __attribute__((vector_size (32)));
|
||||
|
||||
V1
|
||||
f1 (V1 x)
|
||||
{
|
||||
return __builtin_shuffle (x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
|
||||
}
|
||||
|
||||
V2
|
||||
f2 (V2 x)
|
||||
{
|
||||
return __builtin_shuffle (x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
|
||||
}
|
||||
|
||||
V3
|
||||
f3 (V3 x)
|
||||
{
|
||||
return __builtin_shuffle (x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
|
||||
}
|
||||
|
||||
V1
|
||||
f4 (V1 *x)
|
||||
{
|
||||
return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
|
||||
}
|
||||
|
||||
V2
|
||||
f5 (V2 *x)
|
||||
{
|
||||
return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
|
||||
}
|
||||
|
||||
V3
|
||||
f6 (V3 *x)
|
||||
{
|
||||
return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
|
||||
}
|
||||
|
||||
V1
|
||||
f7 (V1 x)
|
||||
{
|
||||
return __builtin_shuffle ((V1) {}, x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
|
||||
}
|
||||
|
||||
V2
|
||||
f8 (V2 x)
|
||||
{
|
||||
return __builtin_shuffle ((V2) {}, x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
|
||||
}
|
||||
|
||||
V3
|
||||
f9 (V3 x)
|
||||
{
|
||||
return __builtin_shuffle ((V3) {}, x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
|
||||
}
|
||||
|
||||
V1
|
||||
f10 (V1 *x)
|
||||
{
|
||||
return __builtin_shuffle ((V1) {}, *x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
|
||||
}
|
||||
|
||||
V2
|
||||
f11 (V2 *x)
|
||||
{
|
||||
return __builtin_shuffle ((V2) {}, *x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
|
||||
}
|
||||
|
||||
V3
|
||||
f12 (V3 *x)
|
||||
{
|
||||
return __builtin_shuffle ((V3) {}, *x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
|
||||
}
|
82
gcc/testsuite/gcc.target/i386/pr95905-4.c
Normal file
82
gcc/testsuite/gcc.target/i386/pr95905-4.c
Normal file
@ -0,0 +1,82 @@
|
||||
/* PR target/95905 */
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mavx512bw" } */
|
||||
/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
|
||||
|
||||
typedef unsigned char V1 __attribute__((vector_size (64)));
|
||||
typedef unsigned short V2 __attribute__((vector_size (64)));
|
||||
typedef unsigned int V3 __attribute__((vector_size (64)));
|
||||
|
||||
V1
|
||||
f1 (V1 x)
|
||||
{
|
||||
return __builtin_shuffle (x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
|
||||
}
|
||||
|
||||
V2
|
||||
f2 (V2 x)
|
||||
{
|
||||
return __builtin_shuffle (x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
|
||||
}
|
||||
|
||||
V3
|
||||
f3 (V3 x)
|
||||
{
|
||||
return __builtin_shuffle (x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
|
||||
}
|
||||
|
||||
V1
|
||||
f4 (V1 *x)
|
||||
{
|
||||
return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
|
||||
}
|
||||
|
||||
V2
|
||||
f5 (V2 *x)
|
||||
{
|
||||
return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
|
||||
}
|
||||
|
||||
V3
|
||||
f6 (V3 *x)
|
||||
{
|
||||
return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
|
||||
}
|
||||
|
||||
V1
|
||||
f7 (V1 x)
|
||||
{
|
||||
return __builtin_shuffle ((V1) {}, x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
|
||||
}
|
||||
|
||||
V2
|
||||
f8 (V2 x)
|
||||
{
|
||||
return __builtin_shuffle ((V2) {}, x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
|
||||
}
|
||||
|
||||
V3
|
||||
f9 (V3 x)
|
||||
{
|
||||
return __builtin_shuffle ((V3) {}, x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
|
||||
}
|
||||
|
||||
V1
|
||||
f10 (V1 *x)
|
||||
{
|
||||
return __builtin_shuffle ((V1) {}, *x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
|
||||
}
|
||||
|
||||
V2
|
||||
f11 (V2 *x)
|
||||
{
|
||||
return __builtin_shuffle ((V2) {}, *x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
|
||||
}
|
||||
|
||||
V3
|
||||
f12 (V3 *x)
|
||||
{
|
||||
return __builtin_shuffle ((V3) {}, *x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
|
||||
}
|
Loading…
Reference in New Issue
Block a user