-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Closed
Labels
area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Milestone
Description
Detected in #72788
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.22000
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=7.0.100-preview.6.22352.1
[Host] : .NET 7.0.0 (7.0.22.32404), X64 RyuJIT

| Method | Mean | Ratio | Code Size |
|---|---|---|---|
| Vector256ShuffleConst | 384.42 ns | 27.29 | 230 B |
| Vector256ShuffleLocal | 382.85 ns | 27.18 | 230 B |
| AvxShuffleLocal | 14.09 ns | 1.00 | 127 B |
Repro
Details
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using System;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace ShufflePerf
{
internal class Program
{
    /// <summary>
    /// Entry point. Runs every benchmark declared on <see cref="Repro"/> using the
    /// default BenchmarkDotNet configuration extended with the ShortRun job and a
    /// disassembly diagnoser.
    /// </summary>
    static void Main()
    {
        // Start from the stock configuration and add the job first, then the diagnoser.
        var withShortRun = DefaultConfig.Instance.AddJob(Job.ShortRun);

        // The diagnoser is created from a DisassemblyDiagnoserConfig built with no arguments.
        var diagnoserSettings = new DisassemblyDiagnoserConfig();
        var withDisassembly = withShortRun.AddDiagnoser(new DisassemblyDiagnoser(diagnoserSettings));

        BenchmarkRunner.Run<Repro>(withDisassembly);
    }
}
/// <summary>
/// Benchmark repro comparing three ways of performing the same byte shuffle inside a loop
/// that expands the bits of 32-bit integers into one byte per bit:
/// <see cref="Vector256ShuffleConst"/> (Vector256.Shuffle with an inline mask expression),
/// <see cref="Vector256ShuffleLocal"/> (Vector256.Shuffle with a mask held in a local), and
/// <see cref="AvxShuffleLocal"/> (Avx2.Shuffle with a mask held in a local, the baseline).
/// All three methods are otherwise identical; only the shuffle call differs.
/// </summary>
public class Repro
{
// Number of bytes in one Vector256<byte>; also the loop stride and store width.
private const uint Vector256ByteCount = 32;
// Divisor used to turn the loop counter into an index into m_array.
private const int BitsPerInt32 = 32;
// Source integers: the values 0..511.
private int[] m_array = Enumerable.Range(0, 512).ToArray();
// Destination buffer written through a byte reference (one byte per bool).
private bool[] boolArray = new bool[512 * 32];
// Loop upper bound: each loop runs while i + 32 <= m_length, i.e. i = 0, 32, ..., 480.
private int m_length = 512;
/// <summary>
/// Variant that passes the shuffle mask to Vector256.Shuffle as an inline
/// Vector256.Create(...) expression inside the loop body.
/// </summary>
[Benchmark]
public void Vector256ShuffleConst()
{
// Every byte of the repeated 8-byte pattern has exactly one distinct bit set.
Vector256<byte> bitMask = Vector256.Create(0x80402010_08040201).AsByte();
Vector256<byte> ones = Vector256.Create((byte)1);
// Reinterpret the bool array's first element as a byte so vectors can be stored into it.
ref byte destination = ref Unsafe.As<bool, byte>(ref MemoryMarshal.GetArrayDataReference<bool>(boolArray));
for (uint i = 0; (i + Vector256ByteCount) <= (uint)m_length; i += Vector256ByteCount)
{
// One source int per iteration (index i / 32), broadcast to every int lane.
int bits = m_array[i / (uint)BitsPerInt32];
Vector256<int> scalar = Vector256.Create(bits);
// Mask bytes are 0 (x8), 1 (x8), 2 (x8), 3 (x8): result lanes 0-7 take source byte 0,
// lanes 8-15 source byte 1, lanes 16-23 source byte 2, lanes 24-31 source byte 3.
Vector256<byte> shuffled = Vector256.Shuffle(scalar.AsByte(), Vector256.Create(0, 0x01010101_01010101, 0x02020202_02020202, 0x03030303_03030303).AsByte());
// Keep a single bit in each byte lane.
Vector256<byte> extracted = shuffled & bitMask;
// Clamp each lane to at most 1, so a lane is 0 or 1.
Vector256<byte> normalized = Vector256.Min(extracted, ones);
// Write 32 bytes starting at element offset i of the destination.
normalized.StoreUnsafe(ref destination, new UIntPtr(i));
}
}
/// <summary>
/// Variant that builds the shuffle mask once, before the loop, from two Vector128 halves
/// and passes that local to Vector256.Shuffle.
/// </summary>
[Benchmark]
public void Vector256ShuffleLocal()
{
// Lower half selects source bytes 0 (x8) and 1 (x8); upper half selects 2 (x8) and 3 (x8).
Vector128<byte> lowerShuffleMask_CopyToBoolArray = Vector128.Create(0, 0x01010101_01010101).AsByte();
Vector128<byte> upperShuffleMask_CopyToBoolArray = Vector128.Create(0x02020202_02020202, 0x03030303_03030303).AsByte();
Vector256<byte> shuffleMask = Vector256.Create(lowerShuffleMask_CopyToBoolArray, upperShuffleMask_CopyToBoolArray);
// Every byte of the repeated 8-byte pattern has exactly one distinct bit set.
Vector256<byte> bitMask = Vector256.Create(0x80402010_08040201).AsByte();
Vector256<byte> ones = Vector256.Create((byte)1);
// Reinterpret the bool array's first element as a byte so vectors can be stored into it.
ref byte destination = ref Unsafe.As<bool, byte>(ref MemoryMarshal.GetArrayDataReference<bool>(boolArray));
for (uint i = 0; (i + Vector256ByteCount) <= (uint)m_length; i += Vector256ByteCount)
{
// One source int per iteration (index i / 32), broadcast to every int lane.
int bits = m_array[i / (uint)BitsPerInt32];
Vector256<int> scalar = Vector256.Create(bits);
Vector256<byte> shuffled = Vector256.Shuffle(scalar.AsByte(), shuffleMask);
// Keep a single bit in each byte lane, then clamp each lane to at most 1.
Vector256<byte> extracted = shuffled & bitMask;
Vector256<byte> normalized = Vector256.Min(extracted, ones);
// Write 32 bytes starting at element offset i of the destination.
normalized.StoreUnsafe(ref destination, new UIntPtr(i));
}
}
/// <summary>
/// Baseline variant: same setup and loop as <see cref="Vector256ShuffleLocal"/>, but the
/// shuffle is performed with Avx2.Shuffle instead of Vector256.Shuffle.
/// </summary>
// NOTE(review): no Avx2.IsSupported check is visible in this method — confirm the
// benchmark is only run on hardware that supports AVX2.
[Benchmark(Baseline = true)]
public void AvxShuffleLocal()
{
// Lower half selects source bytes 0 (x8) and 1 (x8); upper half selects 2 (x8) and 3 (x8).
Vector128<byte> lowerShuffleMask_CopyToBoolArray = Vector128.Create(0, 0x01010101_01010101).AsByte();
Vector128<byte> upperShuffleMask_CopyToBoolArray = Vector128.Create(0x02020202_02020202, 0x03030303_03030303).AsByte();
Vector256<byte> shuffleMask = Vector256.Create(lowerShuffleMask_CopyToBoolArray, upperShuffleMask_CopyToBoolArray);
// Every byte of the repeated 8-byte pattern has exactly one distinct bit set.
Vector256<byte> bitMask = Vector256.Create(0x80402010_08040201).AsByte();
Vector256<byte> ones = Vector256.Create((byte)1);
// Reinterpret the bool array's first element as a byte so vectors can be stored into it.
ref byte destination = ref Unsafe.As<bool, byte>(ref MemoryMarshal.GetArrayDataReference<bool>(boolArray));
for (uint i = 0; (i + Vector256ByteCount) <= (uint)m_length; i += Vector256ByteCount)
{
// One source int per iteration (index i / 32), broadcast to every int lane.
int bits = m_array[i / (uint)BitsPerInt32];
Vector256<int> scalar = Vector256.Create(bits);
// The only line that differs from Vector256ShuffleLocal.
Vector256<byte> shuffled = Avx2.Shuffle(scalar.AsByte(), shuffleMask);
// Keep a single bit in each byte lane, then clamp each lane to at most 1.
Vector256<byte> extracted = shuffled & bitMask;
Vector256<byte> normalized = Vector256.Min(extracted, ones);
// Write 32 bytes starting at element offset i of the destination.
normalized.StoreUnsafe(ref destination, new UIntPtr(i));
}
}
}
}
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net7.0</TargetFramework>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.13.1" />
</ItemGroup>
</Project>
Disassembly
Details
.NET 7.0.0 (7.0.22.32404), X64 RyuJIT
; ShufflePerf.Repro.Vector256ShuffleConst()
sub rsp,98
vzeroupper
vxorps xmm4,xmm4,xmm4
vmovdqa xmmword ptr [rsp+60],xmm4
vmovdqa xmmword ptr [rsp+70],xmm4
mov rax,[rcx+10]
cmp [rax],al
add rax,10
xor edx,edx
cmp dword ptr [rcx+18],20
jb near ptr M00_L03
vmovupd ymm0,[7FF9BDA43AE0]
M00_L00:
mov r8,[rcx+8]
mov r9d,edx
shr r9d,5
cmp r9d,[r8+8]
jae near ptr M00_L04
mov r9d,r9d
vpbroadcastd ymm1,dword ptr [r8+r9*4+10]
vmovupd [rsp+20],ymm1
vmovupd [rsp+40],ymm0
xor r8d,r8d
nop dword ptr [rax]
M00_L01:
lea r9,[rsp+40]
movsxd r10,r8d
movzx r9d,byte ptr [r9+r10]
xor r11d,r11d
cmp r9d,20
jge short M00_L02
lea r11,[rsp+20]
mov r9d,r9d
movzx r11d,byte ptr [r11+r9]
M00_L02:
lea r9,[rsp+60]
mov [r9+r10],r11b
inc r8d
cmp r8d,20
jl short M00_L01
vmovupd ymm1,[rsp+60]
mov r8d,edx
vpand ymm1,ymm1,[7FF9BDA43B00]
vpminub ymm1,ymm1,[7FF9BDA43B20]
vmovdqu ymmword ptr [rax+r8],ymm1
add edx,20
lea r8d,[rdx+20]
cmp r8d,[rcx+18]
jbe near ptr M00_L00
M00_L03:
vzeroupper
add rsp,98
ret
M00_L04:
call CORINFO_HELP_RNGCHKFAIL
int 3
; Total bytes of code 230
.NET 7.0.0 (7.0.22.32404), X64 RyuJIT
; ShufflePerf.Repro.Vector256ShuffleLocal()
sub rsp,98
vzeroupper
vxorps xmm4,xmm4,xmm4
vmovdqa xmmword ptr [rsp+60],xmm4
vmovdqa xmmword ptr [rsp+70],xmm4
vmovupd xmm0,[7FF9BDA53B40]
vinserti128 ymm0,ymm0,xmmword ptr [7FF9BDA53B50],1
mov rax,[rcx+10]
cmp [rax],al
add rax,10
xor edx,edx
cmp dword ptr [rcx+18],20
jb near ptr M00_L03
M00_L00:
mov r8,[rcx+8]
mov r9d,edx
shr r9d,5
cmp r9d,[r8+8]
jae near ptr M00_L04
mov r9d,r9d
vpbroadcastd ymm1,dword ptr [r8+r9*4+10]
vmovupd [rsp+20],ymm1
vmovupd [rsp+40],ymm0
xor r8d,r8d
M00_L01:
lea r9,[rsp+40]
movsxd r10,r8d
movzx r9d,byte ptr [r9+r10]
xor r11d,r11d
cmp r9d,20
jge short M00_L02
lea r11,[rsp+20]
mov r9d,r9d
movzx r11d,byte ptr [r11+r9]
M00_L02:
lea r9,[rsp+60]
mov [r9+r10],r11b
inc r8d
cmp r8d,20
jl short M00_L01
vmovupd ymm1,[rsp+60]
mov r8d,edx
vpand ymm1,ymm1,[7FF9BDA53B60]
vpminub ymm1,ymm1,[7FF9BDA53B80]
vmovdqu ymmword ptr [rax+r8],ymm1
add edx,20
lea r8d,[rdx+20]
cmp r8d,[rcx+18]
jbe near ptr M00_L00
M00_L03:
vzeroupper
add rsp,98
ret
M00_L04:
call CORINFO_HELP_RNGCHKFAIL
int 3
; Total bytes of code 230
.NET 7.0.0 (7.0.22.32404), X64 RyuJIT
; ShufflePerf.Repro.AvxShuffleLocal()
sub rsp,28
vzeroupper
vmovupd xmm0,[7FF9BDA53BA0]
vinserti128 ymm0,ymm0,xmmword ptr [7FF9BDA53BB0],1
mov rax,[rcx+10]
cmp [rax],al
add rax,10
xor edx,edx
cmp dword ptr [rcx+18],20
jb short M00_L01
M00_L00:
mov r8,[rcx+8]
mov r9d,edx
shr r9d,5
cmp r9d,[r8+8]
jae short M00_L02
mov r9d,r9d
vpbroadcastd ymm1,dword ptr [r8+r9*4+10]
vpshufb ymm1,ymm1,ymm0
vpand ymm1,ymm1,[7FF9BDA53BC0]
vpminub ymm1,ymm1,[7FF9BDA53BE0]
mov r8d,edx
vmovdqu ymmword ptr [rax+r8],ymm1
add edx,20
lea r8d,[rdx+20]
cmp r8d,[rcx+18]
jbe short M00_L00
M00_L01:
vzeroupper
add rsp,28
ret
M00_L02:
call CORINFO_HELP_RNGCHKFAIL
int 3
; Total bytes of code 127
category:cq
theme:vector-codegen
skill-level:intermediate
cost:medium
impact:small
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI