-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathProgram.cs
More file actions
130 lines (106 loc) · 3.72 KB
/
Program.cs
File metadata and controls
130 lines (106 loc) · 3.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// See https://aka.ms/new-console-template for more information
using System.Numerics;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
BenchmarkRunner.Run<Benchmarks>();
[RankColumn]
public class Benchmarks
{
private static readonly int[] Values = Enumerable.Range(0, 50_000).ToArray();
[Benchmark(Baseline = true)]
public int SumForLoop()
{
var acc = 0;
for (var i = 0; i < Values.Length; i++)
acc += i;
return acc;
}
[Benchmark]
public int SumLinq() => Values.Sum();
[Benchmark]
public int SumPLinq() => Values.AsParallel().Sum();
[Benchmark]
public int SumLinqSimdNaive()
{
// The performant way of getting an array of vectors rather than doing it by hand
var vectors = MemoryMarshal.Cast<int, Vector<int>>(Values);
var accVector = Vector<int>.Zero;
foreach (var vector in vectors)
{
accVector += vector;
}
var remainder = 0;
for (var i = Values.Length - (Values.Length % Vector<int>.Count); i < Values.Length; i++)
{
remainder += Values[i];
}
// Combine vector sum and remainder
return Vector.Sum(accVector) + remainder;
}
[Benchmark]
public int SumLinqSimdBetter()
{
/*
* In comparison to the naive version, this one is a bit more complex.
*
* Instead of only using "one" Vector<int> to sum the elements,
* we use two vectors at a time and put the result into the acc vector.
* This enables ILP (Instruction Level Parallelism) and
* allows the CPU to execute multiple instructions
*/
var spanAsVectors = MemoryMarshal.Cast<int, Vector<int>>(Values);
var remainingElements = Values.Length % Vector<int>.Count;
var accVector = Vector<int>.Zero;
for (var i = 0; i < spanAsVectors.Length - 1; i += 2)
{
accVector += spanAsVectors[i] + spanAsVectors[i + 1];
}
if (spanAsVectors.Length % 2 == 1)
{
accVector += spanAsVectors[^1];
}
if (remainingElements > 0)
{
var startingLastElements = Values.Length - remainingElements;
var remainingElementsSlice = Values[startingLastElements..];
accVector += new Vector<int>(remainingElementsSlice);
}
return Vector.Sum(accVector);
}
[Benchmark]
public int SumLinqSimdUnrolled4()
{
var vectors = MemoryMarshal.Cast<int, Vector<int>>(Values);
var simdCount = vectors.Length;
// Four accumulators to enable EVEN MORE ILP
var acc1 = Vector<int>.Zero;
var acc2 = Vector<int>.Zero;
var acc3 = Vector<int>.Zero;
var acc4 = Vector<int>.Zero;
var i = 0;
for (; i <= simdCount - 4; i += 4)
{
acc1 += vectors[i];
acc2 += vectors[i + 1];
acc3 += vectors[i + 2];
acc4 += vectors[i + 3];
}
// Combine the accumulators
var accVector = acc1 + acc2 + acc3 + acc4;
// Handle remaining vectors if length % 4 != 0
for (; i < simdCount; i++)
{
accVector += vectors[i];
}
// Handle remaining elements that didn't fit into a vector
var remainingElements = Values.Length % Vector<int>.Count;
if (remainingElements > 0)
{
Span<int> lastVectorElements = stackalloc int[Vector<int>.Count];
Values[^remainingElements..].CopyTo(lastVectorElements);
accVector += new Vector<int>(lastVectorElements);
}
return Vector.Sum(accVector);
}
}