Skip to content

Vector256.Shuffle does not produce optimal codegen #72793

@adamsitnik

Description

@adamsitnik

Detected in #72788

BenchmarkDotNet=v0.13.1, OS=Windows 10.0.22000
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=7.0.100-preview.6.22352.1
  [Host]   : .NET 7.0.0 (7.0.22.32404), X64 RyuJIT
| Method                | Mean      | Ratio | Code Size |
|-----------------------|-----------|-------|-----------|
| Vector256ShuffleConst | 384.42 ns | 27.29 | 230 B     |
| Vector256ShuffleLocal | 382.85 ns | 27.18 | 230 B     |
| AvxShuffleLocal       | 14.09 ns  | 1.00  | 127 B     |

Repro

Details
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using System;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace ShufflePerf
{
    /// <summary>
    /// Console entry point that hands the <see cref="Repro"/> benchmarks to BenchmarkDotNet.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Runs every benchmark declared on <see cref="Repro"/> using the default configuration
        /// extended with a ShortRun job and a disassembly diagnoser.
        /// </summary>
        static void Main()
        {
            // Start from the default configuration and request the ShortRun job.
            var shortRunConfig = DefaultConfig.Instance.AddJob(Job.ShortRun);

            // Attach a disassembly diagnoser built from a default-constructed diagnoser configuration.
            var disassembler = new DisassemblyDiagnoser(new DisassemblyDiagnoserConfig());
            var benchmarkConfig = shortRunConfig.AddDiagnoser(disassembler);

            BenchmarkRunner.Run<Repro>(benchmarkConfig);
        }
    }

    /// <summary>
    /// Benchmarks three variants of one loop. Each iteration reads a single element of <c>m_array</c>,
    /// broadcasts it across a 256-bit vector, shuffles its bytes, masks one bit per byte lane and clamps
    /// the result to 0 or 1 before storing 32 bytes into <c>boolArray</c>.
    /// The variants differ only in how the byte shuffle is expressed:
    /// <see cref="Vector256ShuffleConst"/> (mask created inline at the <c>Vector256.Shuffle</c> call),
    /// <see cref="Vector256ShuffleLocal"/> (mask hoisted into a local, <c>Vector256.Shuffle</c>) and
    /// <see cref="AvxShuffleLocal"/> (mask hoisted into a local, <c>Avx2.Shuffle</c>; the baseline).
    /// </summary>
    public class Repro
    {
        // Loop stride and the amount added to the counter in the loop condition; each iteration
        // stores one Vector256<byte>, i.e. 32 bytes.
        private const uint Vector256ByteCount = 32;
        // Divisor that converts the loop counter into an index into m_array.
        private const int BitsPerInt32 = 32;
        // Source data: the integers 0..511.
        private int[] m_array = Enumerable.Range(0, 512).ToArray();
        // Destination buffer with 512 * 32 elements.
        private bool[] boolArray = new bool[512 * 32];
        // Upper bound for the loop counter; with a stride of 32 every benchmark runs 16 iterations.
        private int m_length = 512;

        /// <summary>
        /// Variant that calls <c>Vector256.Shuffle</c> with the shuffle indices created inline
        /// as an argument inside the loop body.
        /// </summary>
        [Benchmark]
        public void Vector256ShuffleConst()
        {
            // Reinterpreted as bytes (little-endian): 01 02 04 08 10 20 40 80, repeated four times,
            // so each byte lane keeps exactly one bit of the byte that was shuffled into it.
            Vector256<byte> bitMask = Vector256.Create(0x80402010_08040201).AsByte();
            Vector256<byte> ones = Vector256.Create((byte)1);

            // Treat the bool[] storage as raw bytes so whole vectors can be stored into it.
            ref byte destination = ref Unsafe.As<bool, byte>(ref MemoryMarshal.GetArrayDataReference<bool>(boolArray));

            for (uint i = 0; (i + Vector256ByteCount) <= (uint)m_length; i += Vector256ByteCount)
            {
                int bits = m_array[i / (uint)BitsPerInt32];
                // Broadcast the value so every 32-bit element of the vector holds the same four bytes.
                Vector256<int> scalar = Vector256.Create(bits);
                // Indices are 0 for result bytes 0-7, 1 for 8-15, 2 for 16-23 and 3 for 24-31:
                // each source byte of the value is replicated into eight consecutive result bytes.
                Vector256<byte> shuffled = Vector256.Shuffle(scalar.AsByte(), Vector256.Create(0, 0x01010101_01010101, 0x02020202_02020202, 0x03030303_03030303).AsByte());

                // Isolate one bit per lane, then clamp any non-zero lane down to 1.
                Vector256<byte> extracted = shuffled & bitMask;
                Vector256<byte> normalized = Vector256.Min(extracted, ones);
                // Store the 32 result bytes at element offset i of the destination.
                normalized.StoreUnsafe(ref destination, new UIntPtr(i));
            }
        }

        /// <summary>
        /// Variant that calls <c>Vector256.Shuffle</c> with the shuffle indices built once,
        /// before the loop, from two <c>Vector128</c> halves and kept in a local.
        /// </summary>
        [Benchmark]
        public void Vector256ShuffleLocal()
        {
            // Lower half selects source bytes 0 and 1, upper half selects source bytes 2 and 3
            // (eight result bytes per source byte).
            Vector128<byte> lowerShuffleMask_CopyToBoolArray = Vector128.Create(0, 0x01010101_01010101).AsByte();
            Vector128<byte> upperShuffleMask_CopyToBoolArray = Vector128.Create(0x02020202_02020202, 0x03030303_03030303).AsByte();
            Vector256<byte> shuffleMask = Vector256.Create(lowerShuffleMask_CopyToBoolArray, upperShuffleMask_CopyToBoolArray);

            // Reinterpreted as bytes (little-endian): 01 02 04 08 10 20 40 80, repeated four times.
            Vector256<byte> bitMask = Vector256.Create(0x80402010_08040201).AsByte();
            Vector256<byte> ones = Vector256.Create((byte)1);

            // Treat the bool[] storage as raw bytes so whole vectors can be stored into it.
            ref byte destination = ref Unsafe.As<bool, byte>(ref MemoryMarshal.GetArrayDataReference<bool>(boolArray));

            for (uint i = 0; (i + Vector256ByteCount) <= (uint)m_length; i += Vector256ByteCount)
            {
                int bits = m_array[i / (uint)BitsPerInt32];
                // Broadcast the value so every 32-bit element of the vector holds the same four bytes.
                Vector256<int> scalar = Vector256.Create(bits);
                Vector256<byte> shuffled = Vector256.Shuffle(scalar.AsByte(), shuffleMask);

                // Isolate one bit per lane, then clamp any non-zero lane down to 1.
                Vector256<byte> extracted = shuffled & bitMask;
                Vector256<byte> normalized = Vector256.Min(extracted, ones);
                // Store the 32 result bytes at element offset i of the destination.
                normalized.StoreUnsafe(ref destination, new UIntPtr(i));
            }
        }

        /// <summary>
        /// Baseline variant: identical to <see cref="Vector256ShuffleLocal"/> except that the shuffle
        /// is performed with the hardware-specific <c>Avx2.Shuffle</c> intrinsic.
        /// </summary>
        // NOTE(review): there is no Avx2.IsSupported guard here, so this benchmark presumably
        // requires AVX2-capable hardware - confirm before running elsewhere.
        [Benchmark(Baseline = true)]
        public void AvxShuffleLocal()
        {
            // Same shuffle indices as Vector256ShuffleLocal: source bytes 0 and 1 in the lower half,
            // source bytes 2 and 3 in the upper half (eight result bytes per source byte).
            Vector128<byte> lowerShuffleMask_CopyToBoolArray = Vector128.Create(0, 0x01010101_01010101).AsByte();
            Vector128<byte> upperShuffleMask_CopyToBoolArray = Vector128.Create(0x02020202_02020202, 0x03030303_03030303).AsByte();
            Vector256<byte> shuffleMask = Vector256.Create(lowerShuffleMask_CopyToBoolArray, upperShuffleMask_CopyToBoolArray);

            // Reinterpreted as bytes (little-endian): 01 02 04 08 10 20 40 80, repeated four times.
            Vector256<byte> bitMask = Vector256.Create(0x80402010_08040201).AsByte();
            Vector256<byte> ones = Vector256.Create((byte)1);

            // Treat the bool[] storage as raw bytes so whole vectors can be stored into it.
            ref byte destination = ref Unsafe.As<bool, byte>(ref MemoryMarshal.GetArrayDataReference<bool>(boolArray));

            for (uint i = 0; (i + Vector256ByteCount) <= (uint)m_length; i += Vector256ByteCount)
            {
                int bits = m_array[i / (uint)BitsPerInt32];
                // Broadcast the value so every 32-bit element of the vector holds the same four bytes.
                Vector256<int> scalar = Vector256.Create(bits);
                // Avx2.Shuffle selects bytes within each 128-bit half independently. Because every
                // 32-bit element of the source is identical and all indices are 0-3, the result is
                // the same as the Vector256.Shuffle variants produce.
                Vector256<byte> shuffled = Avx2.Shuffle(scalar.AsByte(), shuffleMask);

                // Isolate one bit per lane, then clamp any non-zero lane down to 1.
                Vector256<byte> extracted = shuffled & bitMask;
                Vector256<byte> normalized = Vector256.Min(extracted, ones);
                // Store the 32 result bytes at element offset i of the destination.
                normalized.StoreUnsafe(ref destination, new UIntPtr(i));
            }
        }
    }
}
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Build the repro as an executable targeting .NET 7. -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net7.0</TargetFramework>
  </PropertyGroup>

  <!-- BenchmarkDotNet is the only package dependency of the repro. -->
  <ItemGroup>
    <PackageReference Include="BenchmarkDotNet" Version="0.13.1" />
  </ItemGroup>

</Project>

Disassembly

Details

.NET 7.0.0 (7.0.22.32404), X64 RyuJIT

; ShufflePerf.Repro.Vector256ShuffleConst()
; Immediates are hexadecimal (20 = 32). The shuffle is not emitted as a vector instruction here:
; M00_L01..M00_L02 is a 32-iteration byte-at-a-time loop that goes through stack temporaries.
       ; Reserve 0x98 bytes of stack and zero the 32-byte temporary at [rsp+60] that receives the shuffle result.
       sub       rsp,98
       vzeroupper
       vxorps    xmm4,xmm4,xmm4
       vmovdqa   xmmword ptr [rsp+60],xmm4
       vmovdqa   xmmword ptr [rsp+70],xmm4
       ; Load the array at [rcx+10] (appears to be boolArray), null-check it and step past its header to the data.
       mov       rax,[rcx+10]
       cmp       [rax],al
       add       rax,10
       ; edx is the loop counter i; skip the loop when the field at [rcx+18] (appears to be m_length) is below 32.
       xor       edx,edx
       cmp       dword ptr [rcx+18],20
       jb        near ptr M00_L03
       ; Load a 32-byte constant, used below as the shuffle indices.
       vmovupd   ymm0,[7FF9BDA43AE0]
M00_L00:
       ; Outer loop: bounds-checked load of element (i >> 5) from the array at [rcx+8] (appears to be m_array),
       ; broadcast into ymm1.
       mov       r8,[rcx+8]
       mov       r9d,edx
       shr       r9d,5
       cmp       r9d,[r8+8]
       jae       near ptr M00_L04
       mov       r9d,r9d
       vpbroadcastd ymm1,dword ptr [r8+r9*4+10]
       ; Spill the source vector to [rsp+20] and the index vector to [rsp+40]; r8d counts result bytes.
       vmovupd   [rsp+20],ymm1
       vmovupd   [rsp+40],ymm0
       xor       r8d,r8d
       ; Padding before the inner loop label (presumably for alignment).
       nop       dword ptr [rax]
M00_L01:
       ; Inner loop: read index byte r8 from [rsp+40]; the result byte defaults to 0.
       lea       r9,[rsp+40]
       movsxd    r10,r8d
       movzx     r9d,byte ptr [r9+r10]
       xor       r11d,r11d
       ; Indices of 32 or more keep the 0; otherwise fetch the selected source byte from [rsp+20].
       cmp       r9d,20
       jge       short M00_L02
       lea       r11,[rsp+20]
       mov       r9d,r9d
       movzx     r11d,byte ptr [r11+r9]
M00_L02:
       ; Store the result byte into [rsp+60] and repeat for all 32 bytes.
       lea       r9,[rsp+60]
       mov       [r9+r10],r11b
       inc       r8d
       cmp       r8d,20
       jl        short M00_L01
       ; Reload the shuffled vector, AND it with one constant, take the unsigned byte minimum with another,
       ; and store 32 bytes at data + i.
       vmovupd   ymm1,[rsp+60]
       mov       r8d,edx
       vpand     ymm1,ymm1,[7FF9BDA43B00]
       vpminub   ymm1,ymm1,[7FF9BDA43B20]
       vmovdqu   ymmword ptr [rax+r8],ymm1
       ; i += 32; continue while i + 32 <= the value at [rcx+18] (unsigned compare).
       add       edx,20
       lea       r8d,[rdx+20]
       cmp       r8d,[rcx+18]
       jbe       near ptr M00_L00
M00_L03:
       ; Epilogue.
       vzeroupper
       add       rsp,98
       ret
M00_L04:
       ; Array index out of range.
       call      CORINFO_HELP_RNGCHKFAIL
       int       3
; Total bytes of code 230

.NET 7.0.0 (7.0.22.32404), X64 RyuJIT

; ShufflePerf.Repro.Vector256ShuffleLocal()
; Immediates are hexadecimal (20 = 32). As in Vector256ShuffleConst, the shuffle is a 32-iteration
; byte-at-a-time loop (M00_L01..M00_L02) through stack temporaries rather than a vector instruction.
       ; Reserve 0x98 bytes of stack and zero the 32-byte temporary at [rsp+60] that receives the shuffle result.
       sub       rsp,98
       vzeroupper
       vxorps    xmm4,xmm4,xmm4
       vmovdqa   xmmword ptr [rsp+60],xmm4
       vmovdqa   xmmword ptr [rsp+70],xmm4
       ; Assemble the 32-byte index vector in ymm0 from two 16-byte constants (lower half, then upper half).
       vmovupd   xmm0,[7FF9BDA53B40]
       vinserti128 ymm0,ymm0,xmmword ptr [7FF9BDA53B50],1
       ; Load the array at [rcx+10] (appears to be boolArray), null-check it and step past its header to the data.
       mov       rax,[rcx+10]
       cmp       [rax],al
       add       rax,10
       ; edx is the loop counter i; skip the loop when the field at [rcx+18] (appears to be m_length) is below 32.
       xor       edx,edx
       cmp       dword ptr [rcx+18],20
       jb        near ptr M00_L03
M00_L00:
       ; Outer loop: bounds-checked load of element (i >> 5) from the array at [rcx+8] (appears to be m_array),
       ; broadcast into ymm1.
       mov       r8,[rcx+8]
       mov       r9d,edx
       shr       r9d,5
       cmp       r9d,[r8+8]
       jae       near ptr M00_L04
       mov       r9d,r9d
       vpbroadcastd ymm1,dword ptr [r8+r9*4+10]
       ; Spill the source vector to [rsp+20] and the index vector to [rsp+40]; r8d counts result bytes.
       vmovupd   [rsp+20],ymm1
       vmovupd   [rsp+40],ymm0
       xor       r8d,r8d
M00_L01:
       ; Inner loop: read index byte r8 from [rsp+40]; the result byte defaults to 0.
       lea       r9,[rsp+40]
       movsxd    r10,r8d
       movzx     r9d,byte ptr [r9+r10]
       xor       r11d,r11d
       ; Indices of 32 or more keep the 0; otherwise fetch the selected source byte from [rsp+20].
       cmp       r9d,20
       jge       short M00_L02
       lea       r11,[rsp+20]
       mov       r9d,r9d
       movzx     r11d,byte ptr [r11+r9]
M00_L02:
       ; Store the result byte into [rsp+60] and repeat for all 32 bytes.
       lea       r9,[rsp+60]
       mov       [r9+r10],r11b
       inc       r8d
       cmp       r8d,20
       jl        short M00_L01
       ; Reload the shuffled vector, AND it with one constant, take the unsigned byte minimum with another,
       ; and store 32 bytes at data + i.
       vmovupd   ymm1,[rsp+60]
       mov       r8d,edx
       vpand     ymm1,ymm1,[7FF9BDA53B60]
       vpminub   ymm1,ymm1,[7FF9BDA53B80]
       vmovdqu   ymmword ptr [rax+r8],ymm1
       ; i += 32; continue while i + 32 <= the value at [rcx+18] (unsigned compare).
       add       edx,20
       lea       r8d,[rdx+20]
       cmp       r8d,[rcx+18]
       jbe       near ptr M00_L00
M00_L03:
       ; Epilogue.
       vzeroupper
       add       rsp,98
       ret
M00_L04:
       ; Array index out of range.
       call      CORINFO_HELP_RNGCHKFAIL
       int       3
; Total bytes of code 230

.NET 7.0.0 (7.0.22.32404), X64 RyuJIT

; ShufflePerf.Repro.AvxShuffleLocal()
; Immediates are hexadecimal (20 = 32). Here the shuffle is a single vpshufb and no stack temporaries
; are used for it.
       sub       rsp,28
       vzeroupper
       ; Assemble the 32-byte index vector in ymm0 from two 16-byte constants (lower half, then upper half).
       vmovupd   xmm0,[7FF9BDA53BA0]
       vinserti128 ymm0,ymm0,xmmword ptr [7FF9BDA53BB0],1
       ; Load the array at [rcx+10] (appears to be boolArray), null-check it and step past its header to the data.
       mov       rax,[rcx+10]
       cmp       [rax],al
       add       rax,10
       ; edx is the loop counter i; skip the loop when the field at [rcx+18] (appears to be m_length) is below 32.
       xor       edx,edx
       cmp       dword ptr [rcx+18],20
       jb        short M00_L01
M00_L00:
       ; Loop: bounds-checked load of element (i >> 5) from the array at [rcx+8] (appears to be m_array),
       ; broadcast into ymm1.
       mov       r8,[rcx+8]
       mov       r9d,edx
       shr       r9d,5
       cmp       r9d,[r8+8]
       jae       short M00_L02
       mov       r9d,r9d
       vpbroadcastd ymm1,dword ptr [r8+r9*4+10]
       ; Shuffle bytes with the index vector, AND with one constant, take the unsigned byte minimum with another.
       vpshufb   ymm1,ymm1,ymm0
       vpand     ymm1,ymm1,[7FF9BDA53BC0]
       vpminub   ymm1,ymm1,[7FF9BDA53BE0]
       ; Store 32 bytes at data + i.
       mov       r8d,edx
       vmovdqu   ymmword ptr [rax+r8],ymm1
       ; i += 32; continue while i + 32 <= the value at [rcx+18] (unsigned compare).
       add       edx,20
       lea       r8d,[rdx+20]
       cmp       r8d,[rcx+18]
       jbe       short M00_L00
M00_L01:
       ; Epilogue.
       vzeroupper
       add       rsp,28
       ret
M00_L02:
       ; Array index out of range.
       call      CORINFO_HELP_RNGCHKFAIL
       int       3
; Total bytes of code 127

cc @tannergooding @EgorBo

category:cq
theme:vector-codegen
skill-level:intermediate
cost:medium
impact:small

Metadata

Metadata

Assignees

Labels

area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

Type

No type

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions